Refactor JSON file retrieval to use regex for improved accuracy in matching file links and streamline file name extraction.

2025-07-29 18:24:01 +02:00
parent 7a614fd824
commit 54d13032ab
2 changed files with 59 additions and 9 deletions
--- a/main.py
+++ b/main.py
@@ -7,7 +7,7 @@
 import sys
 import galdPl
 import requests
-from bs4 import BeautifulSoup
+import re
 import os
 def get_json_files_from_folder(folder):
@@ -15,15 +15,16 @@ def get_json_files_from_folder(folder):
    url = base_url + folder
    r = requests.get(url, timeout=10)
    r.raise_for_status()
-    soup = BeautifulSoup(r.text, "html.parser")
+    
    # Hledáme JSON soubory pomocí regex
    json_pattern = r'href="(/gald/galdistream/src/branch/main/resources/[^"]*\.json)"'
    matches = re.findall(json_pattern, r.text)
    files = []
-    # Hledáme odkazy s .json v href
+    for match in matches:
-    for a in soup.find_all("a", href=lambda x: x and x.endswith('.json')):
+        # Extrahujeme pouze název souboru
-        href = a.get("href", "")
+        file_name = match.split("/")[-1]
-        if href.startswith("/gald/galdistream/src/branch/main/resources/"):
+        files.append(file_name)
            # Extrahujeme pouze název souboru
            file_name = href.split("/")[-1]
            files.append(file_name)
    return files
 def update_json_db():
--- a/test_regex.py
+++ b/test_regex.py
@@ -0,0 +1,49 @@
 import requests
 import re
 import os
 def get_json_files_from_folder(folder):
    """Return the names of the JSON files linked from a repository folder page.

    Downloads the HTML listing of ``folder`` below the repository's
    ``resources/`` tree and collects every ``href`` that points at a
    ``.json`` file under that tree.

    Args:
        folder: Folder name appended to the resources listing URL.

    Returns:
        A list holding the last path segment (the file name) of each
        matching link, in the order ``re.findall`` reports them.

    Raises:
        Any exception raised by ``requests.get`` or by
        ``raise_for_status`` (e.g. on an HTTP error status) propagates.
    """
    listing_url = "https://git.gald.site/gald/galdistream/src/branch/main/resources/" + folder
    response = requests.get(listing_url, timeout=10)
    response.raise_for_status()
    # Find links to JSON files; the capture group is the href path itself.
    link_paths = re.findall(
        r'href="(/gald/galdistream/src/branch/main/resources/[^"]*\.json)"',
        response.text,
    )
    # Keep only the file name, i.e. whatever follows the final slash.
    return [link_path.rsplit("/", 1)[-1] for link_path in link_paths]
 def update_json_db():
    """Mirror the remote JSON files of each known folder into ``resources/``.

    For the ``movies`` and ``series`` folders, asks
    ``get_json_files_from_folder`` for the file names, then downloads each
    file from the raw-content URL and writes it to
    ``resources/<folder>/<file>``. Progress and errors are printed; a
    failure for one folder or one file does not stop the remaining work.
    """
    raw_root = "https://git.gald.site/gald/galdistream/raw/branch/main/resources/"
    pending = []
    for folder in ("movies", "series"):
        try:
            names = get_json_files_from_folder(folder)
            print(f"Nalezené soubory v {folder}: {names}")
            pending.extend([f"{folder}/{name}" for name in names])
        except Exception as exc:
            # Best effort: report the failing folder and move on to the next.
            print(f"Chyba při získávání souborů ze složky {folder}: {exc}")
    print(f"Celkem souborů ke stažení: {len(pending)}")
    for rel_path in pending:
        target = "resources/" + rel_path
        try:
            response = requests.get(raw_root + rel_path, timeout=10)
            response.raise_for_status()
            # Create the destination folder on first use.
            os.makedirs(os.path.dirname(target), exist_ok=True)
            with open(target, "wb") as out:
                out.write(response.content)
            print(f"Staženo: {rel_path}")
        except Exception as exc:
            # Best effort: a single failed download must not abort the rest.
            print(f"Chyba při stahování {rel_path}: {exc}")
 # Run the update only when executed as a script, not when imported.
 if __name__ == "__main__":
    update_json_db()