From 54d13032ab62d67303b33e7989425ceb42fd1d9e Mon Sep 17 00:00:00 2001
From: gald <pavelsuha95@gmail.com>
Date: Tue, 29 Jul 2025 18:24:01 +0200
Subject: [PATCH] Refactor JSON file retrieval to use regex for improved
 accuracy in matching file links and streamline file name extraction.

---
 main.py       | 19 ++++++++++---------
 test_regex.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 9 deletions(-)
 create mode 100644 test_regex.py

diff --git a/main.py b/main.py
index c8a8d43..132d417 100644
--- a/main.py
+++ b/main.py
@@ -7,7 +7,7 @@
 import sys
 import galdPl
 import requests
-from bs4 import BeautifulSoup
+import re
 import os
 
 def get_json_files_from_folder(folder):
@@ -15,15 +15,16 @@ def get_json_files_from_folder(folder):
     url = base_url + folder
     r = requests.get(url, timeout=10)
     r.raise_for_status()
-    soup = BeautifulSoup(r.text, "html.parser")
+    
+    # Find JSON file links in the page with a regex
+    json_pattern = r'href="(/gald/galdistream/src/branch/main/resources/[^"]*\.json)"'
+    matches = re.findall(json_pattern, r.text)
+    
     files = []
-    # Hledáme odkazy s .json v href
-    for a in soup.find_all("a", href=lambda x: x and x.endswith('.json')):
-        href = a.get("href", "")
-        if href.startswith("/gald/galdistream/src/branch/main/resources/"):
-            # Extrahujeme pouze název souboru
-            file_name = href.split("/")[-1]
-            files.append(file_name)
+    for match in matches:
+        # Keep only the file name (last path segment)
+        file_name = match.split("/")[-1]
+        files.append(file_name)
     return files
 
 def update_json_db():
diff --git a/test_regex.py b/test_regex.py
new file mode 100644
index 0000000..b73929a
--- /dev/null
+++ b/test_regex.py
@@ -0,0 +1,49 @@
+import requests
+import re
+import os
+
+def get_json_files_from_folder(folder):  # returns the .json file names linked from the folder page
+    base_url = "https://git.gald.site/gald/galdistream/src/branch/main/resources/"
+    url = base_url + folder
+    r = requests.get(url, timeout=10)
+    r.raise_for_status()  # HTTP error statuses propagate to the caller as exceptions
+    
+    # Find JSON file links in the page with a regex
+    json_pattern = r'href="(/gald/galdistream/src/branch/main/resources/[^"]*\.json)"'
+    matches = re.findall(json_pattern, r.text)
+    
+    files = []
+    for match in matches:
+        # Keep only the file name (last path segment)
+        file_name = match.split("/")[-1]
+        files.append(file_name)
+    return files
+
+def update_json_db():  # downloads every listed .json file into resources/<folder>/
+    base_url_raw = "https://git.gald.site/gald/galdistream/raw/branch/main/resources/"
+    folders = ["movies", "series"]
+    all_files = []
+    for folder in folders:
+        try:
+            files = get_json_files_from_folder(folder)
+            print(f"Nalezené soubory v {folder}: {files}")
+            all_files += [f"{folder}/{file}" for file in files]
+        except Exception as e:  # best effort: a failing folder is reported and skipped
+            print(f"Chyba při získávání souborů ze složky {folder}: {e}")
+    
+    print(f"Celkem souborů ke stažení: {len(all_files)}")
+    for file in all_files:
+        url = base_url_raw + file
+        local_path = "resources/" + file  # relative path, resolved against the current working directory
+        try:
+            r = requests.get(url, timeout=10)
+            r.raise_for_status()
+            os.makedirs(os.path.dirname(local_path), exist_ok=True)
+            with open(local_path, "wb") as f:
+                f.write(r.content)
+            print(f"Staženo: {file}")
+        except Exception as e:  # best effort: one failed download does not stop the rest
+            print(f"Chyba při stahování {file}: {e}")
+
+if __name__ == '__main__':  # run the download only when executed as a script
+    update_json_db() 
\ No newline at end of file