From 54d13032ab62d67303b33e7989425ceb42fd1d9e Mon Sep 17 00:00:00 2001
From: gald
Date: Tue, 29 Jul 2025 18:24:01 +0200
Subject: [PATCH] Refactor JSON file retrieval to use regex for improved accuracy in matching file links and streamline file name extraction.

---
 main.py       | 19 ++++++++++---------
 test_regex.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 9 deletions(-)
 create mode 100644 test_regex.py

diff --git a/main.py b/main.py
index c8a8d43..132d417 100644
--- a/main.py
+++ b/main.py
@@ -7,7 +7,7 @@
 import sys
 import galdPl
 import requests
-from bs4 import BeautifulSoup
+import re
 import os
 
 def get_json_files_from_folder(folder):
@@ -15,15 +15,16 @@ def get_json_files_from_folder(folder):
     url = base_url + folder
     r = requests.get(url, timeout=10)
     r.raise_for_status()
-    soup = BeautifulSoup(r.text, "html.parser")
+
+    # Hledáme JSON soubory pomocí regex
+    json_pattern = r'href="(/gald/galdistream/src/branch/main/resources/[^"]*\.json)"'
+    matches = re.findall(json_pattern, r.text)
+
     files = []
-    # Hledáme odkazy s .json v href
-    for a in soup.find_all("a", href=lambda x: x and x.endswith('.json')):
-        href = a.get("href", "")
-        if href.startswith("/gald/galdistream/src/branch/main/resources/"):
-            # Extrahujeme pouze název souboru
-            file_name = href.split("/")[-1]
-            files.append(file_name)
+    for match in matches:
+        # Extrahujeme pouze název souboru
+        file_name = match.split("/")[-1]
+        files.append(file_name)
     return files
 
 def update_json_db():
diff --git a/test_regex.py b/test_regex.py
new file mode 100644
index 0000000..b73929a
--- /dev/null
+++ b/test_regex.py
@@ -0,0 +1,49 @@
+import requests
+import re
+import os
+
+def get_json_files_from_folder(folder):
+    base_url = "https://git.gald.site/gald/galdistream/src/branch/main/resources/"
+    url = base_url + folder
+    r = requests.get(url, timeout=10)
+    r.raise_for_status()
+
+    # Hledáme JSON soubory pomocí regex
+    json_pattern = r'href="(/gald/galdistream/src/branch/main/resources/[^"]*\.json)"'
+    matches = re.findall(json_pattern, r.text)
+
+    files = []
+    for match in matches:
+        # Extrahujeme pouze název souboru
+        file_name = match.split("/")[-1]
+        files.append(file_name)
+    return files
+
+def update_json_db():
+    base_url_raw = "https://git.gald.site/gald/galdistream/raw/branch/main/resources/"
+    folders = ["movies", "series"]
+    all_files = []
+    for folder in folders:
+        try:
+            files = get_json_files_from_folder(folder)
+            print(f"Nalezené soubory v {folder}: {files}")
+            all_files += [f"{folder}/{file}" for file in files]
+        except Exception as e:
+            print(f"Chyba při získávání souborů ze složky {folder}: {e}")
+
+    print(f"Celkem souborů ke stažení: {len(all_files)}")
+    for file in all_files:
+        url = base_url_raw + file
+        local_path = "resources/" + file
+        try:
+            r = requests.get(url, timeout=10)
+            r.raise_for_status()
+            os.makedirs(os.path.dirname(local_path), exist_ok=True)
+            with open(local_path, "wb") as f:
+                f.write(r.content)
+            print(f"Staženo: {file}")
+        except Exception as e:
+            print(f"Chyba při stahování {file}: {e}")
+
+if __name__ == '__main__':
+    update_json_db()
\ No newline at end of file