Refactor JSON file retrieval to use regex for improved accuracy in matching file links and streamline file name extraction.
This commit is contained in:
15
main.py
15
main.py
@@ -7,7 +7,7 @@
|
|||||||
import sys
|
import sys
|
||||||
import galdPl
|
import galdPl
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
import re
|
||||||
import os
|
import os
|
||||||
|
|
||||||
def get_json_files_from_folder(folder):
|
def get_json_files_from_folder(folder):
|
||||||
@@ -15,14 +15,15 @@ def get_json_files_from_folder(folder):
|
|||||||
url = base_url + folder
|
url = base_url + folder
|
||||||
r = requests.get(url, timeout=10)
|
r = requests.get(url, timeout=10)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
soup = BeautifulSoup(r.text, "html.parser")
|
|
||||||
|
# Hledáme JSON soubory pomocí regex
|
||||||
|
json_pattern = r'href="(/gald/galdistream/src/branch/main/resources/[^"]*\.json)"'
|
||||||
|
matches = re.findall(json_pattern, r.text)
|
||||||
|
|
||||||
files = []
|
files = []
|
||||||
# Hledáme odkazy s .json v href
|
for match in matches:
|
||||||
for a in soup.find_all("a", href=lambda x: x and x.endswith('.json')):
|
|
||||||
href = a.get("href", "")
|
|
||||||
if href.startswith("/gald/galdistream/src/branch/main/resources/"):
|
|
||||||
# Extrahujeme pouze název souboru
|
# Extrahujeme pouze název souboru
|
||||||
file_name = href.split("/")[-1]
|
file_name = match.split("/")[-1]
|
||||||
files.append(file_name)
|
files.append(file_name)
|
||||||
return files
|
return files
|
||||||
|
|
||||||
|
|||||||
49
test_regex.py
Normal file
49
test_regex.py
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
import requests
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
|
||||||
|
def get_json_files_from_folder(folder):
    """Return the names of JSON files linked from a repository folder page.

    Downloads the HTML listing for ``folder`` under the repository's
    ``resources/`` tree and scans the raw markup for ``href`` attributes
    that point at ``.json`` files.

    Args:
        folder: Name of the sub-folder, appended verbatim to the listing URL.

    Returns:
        A list with the last path segment (the file name) of every matching
        link, in the order the links appear in the page. Duplicates are kept.

    Raises:
        Whatever ``requests.get`` raises on connection problems or timeout,
        and whatever ``raise_for_status()`` raises on an error response.
    """
    listing_url = (
        "https://git.gald.site/gald/galdistream/src/branch/main/resources/"
        + folder
    )
    response = requests.get(listing_url, timeout=10)
    response.raise_for_status()

    # Find JSON files with a regex over the raw HTML.
    # NOTE: the pattern accepts any link below resources/ that ends in
    # ".json"; it does not check that the link belongs to ``folder``.
    link_pattern = r'href="(/gald/galdistream/src/branch/main/resources/[^"]*\.json)"'

    # Keep only the file name, i.e. the text after the final slash.
    return [
        href.rsplit("/", 1)[-1]
        for href in re.findall(link_pattern, response.text)
    ]
|
||||||
|
|
||||||
|
def update_json_db():
    """Download every JSON file of the remote folders into ``resources/``.

    For each of the folders ``movies`` and ``series`` the file names are
    collected with ``get_json_files_from_folder``; every file is then fetched
    from the repository's raw endpoint and written to
    ``resources/<folder>/<file>`` relative to the current working directory.

    The routine is best-effort: a failure while listing a folder or while
    downloading a single file is printed and processing continues with the
    next item. Nothing is returned.
    """
    raw_root = "https://git.gald.site/gald/galdistream/raw/branch/main/resources/"
    pending = []

    # Phase 1: gather "<folder>/<file>" entries for everything to download.
    for folder in ("movies", "series"):
        try:
            names = get_json_files_from_folder(folder)
            print(f"Nalezené soubory v {folder}: {names}")
            pending.extend([f"{folder}/{name}" for name in names])
        except Exception as e:
            # A folder that cannot be listed is reported and skipped.
            print(f"Chyba při získávání souborů ze složky {folder}: {e}")

    print(f"Celkem souborů ke stažení: {len(pending)}")

    # Phase 2: fetch each file and mirror it under the local resources/ tree.
    for relative in pending:
        source_url = raw_root + relative
        target = "resources/" + relative
        try:
            response = requests.get(source_url, timeout=10)
            response.raise_for_status()
            # Create the destination folder on first use.
            os.makedirs(os.path.dirname(target), exist_ok=True)
            with open(target, "wb") as out:
                out.write(response.content)
            print(f"Staženo: {relative}")
        except Exception as e:
            # One failed download must not stop the remaining ones.
            print(f"Chyba při stahování {relative}: {e}")
|
||||||
|
|
||||||
|
# Run the update only when this file is executed as a script, not on import.
if __name__ == "__main__":
    update_json_db()
|
||||||
Reference in New Issue
Block a user