Refactor JSON file retrieval to use regex for improved accuracy in matching file links and streamline file name extraction.

This commit is contained in:
2025-07-29 18:24:01 +02:00
parent 7a614fd824
commit 54d13032ab
2 changed files with 59 additions and 9 deletions

19
main.py
View File

@@ -7,7 +7,7 @@
import sys import sys
import galdPl import galdPl
import requests import requests
from bs4 import BeautifulSoup import re
import os import os
def get_json_files_from_folder(folder): def get_json_files_from_folder(folder):
@@ -15,15 +15,16 @@ def get_json_files_from_folder(folder):
url = base_url + folder url = base_url + folder
r = requests.get(url, timeout=10) r = requests.get(url, timeout=10)
r.raise_for_status() r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")
# Hledáme JSON soubory pomocí regex
json_pattern = r'href="(/gald/galdistream/src/branch/main/resources/[^"]*\.json)"'
matches = re.findall(json_pattern, r.text)
files = [] files = []
# Hledáme odkazy s .json v href for match in matches:
for a in soup.find_all("a", href=lambda x: x and x.endswith('.json')): # Extrahujeme pouze název souboru
href = a.get("href", "") file_name = match.split("/")[-1]
if href.startswith("/gald/galdistream/src/branch/main/resources/"): files.append(file_name)
# Extrahujeme pouze název souboru
file_name = href.split("/")[-1]
files.append(file_name)
return files return files
def update_json_db(): def update_json_db():

49
test_regex.py Normal file
View File

@@ -0,0 +1,49 @@
import requests
import re
import os
def get_json_files_from_folder(folder):
    """Return the names of the JSON files linked from a remote resources folder.

    Downloads the repository web page for ``folder`` and scans its raw HTML
    for ``href`` attributes that point at ``.json`` files below the
    repository's ``resources/`` path.

    Args:
        folder: Path fragment appended to the resources base URL
            (the caller in this file passes ``"movies"`` and ``"series"``).

    Returns:
        A list holding the last path segment of every matching link, in the
        order the links occur in the page.

    Raises:
        requests.HTTPError: If the server responds with an error status
            (raised by ``raise_for_status``).
    """
    listing_url = (
        "https://git.gald.site/gald/galdistream/src/branch/main/resources/" + folder
    )
    response = requests.get(listing_url, timeout=10)
    response.raise_for_status()
    # Locate the JSON file links with a regex instead of parsing the HTML.
    link_regex = re.compile(
        r'href="(/gald/galdistream/src/branch/main/resources/[^"]*\.json)"'
    )
    # Keep only the file name, i.e. whatever follows the final slash.
    return [href.rsplit("/", 1)[-1] for href in link_regex.findall(response.text)]
def update_json_db():
    """Download every remote JSON resource into the local ``resources/`` tree.

    For each of the ``movies`` and ``series`` folders the remote listing is
    queried via ``get_json_files_from_folder``; every file found is then
    fetched from the raw-content URL and written to
    ``resources/<folder>/<file>``. Failures are reported on stdout and do not
    stop the remaining folders or files from being processed.
    """
    raw_prefix = "https://git.gald.site/gald/galdistream/raw/branch/main/resources/"
    pending = []

    # Phase 1: collect "<folder>/<file>" entries for everything to download.
    for folder in ("movies", "series"):
        try:
            names = get_json_files_from_folder(folder)
            print(f"Nalezené soubory v {folder}: {names}")
            pending.extend([f"{folder}/{name}" for name in names])
        except Exception as err:
            # Best effort: a folder that cannot be listed is reported and skipped.
            print(f"Chyba při získávání souborů ze složky {folder}: {err}")

    print(f"Celkem souborů ke stažení: {len(pending)}")

    # Phase 2: fetch each file and store it under the matching local path.
    for relative_path in pending:
        try:
            response = requests.get(raw_prefix + relative_path, timeout=10)
            response.raise_for_status()
            target = "resources/" + relative_path
            # Create the destination folder on first use.
            os.makedirs(os.path.dirname(target), exist_ok=True)
            with open(target, "wb") as out:
                out.write(response.content)
            print(f"Staženo: {relative_path}")
        except Exception as err:
            # Best effort: one failed file must not abort the others.
            print(f"Chyba při stahování {relative_path}: {err}")
# Run the download only when this file is executed as a script, not on import.
if __name__ == "__main__":
    update_json_db()