Spaces:
Build error
Build error
Commit
·
b8f7a78
1
Parent(s):
9b708eb
update
Browse files
- hf_scrapper.py +13 -30
- requirements.txt +2 -1
hf_scrapper.py
CHANGED
|
@@ -3,7 +3,7 @@ import requests
|
|
| 3 |
import json
|
| 4 |
import urllib.request
|
| 5 |
from requests.exceptions import RequestException
|
| 6 |
-
from
|
| 7 |
|
| 8 |
def get_system_proxies():
|
| 9 |
try:
|
|
@@ -19,21 +19,20 @@ def get_system_proxies():
|
|
| 19 |
|
| 20 |
def download_and_cache_file(file_url, token, cache_path, proxies=None):
|
| 21 |
print(f"Downloading file from URL: {file_url} to {cache_path} with proxies: {proxies}")
|
| 22 |
-
|
| 23 |
-
# Create a requests session for better performance
|
| 24 |
-
session = requests.Session()
|
| 25 |
-
session.headers.update({'Authorization': f'Bearer {token}'})
|
| 26 |
-
session.proxies.update(proxies)
|
| 27 |
-
|
| 28 |
try:
|
| 29 |
-
response =
|
| 30 |
response.raise_for_status()
|
| 31 |
-
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
with open(cache_path, 'wb') as f:
|
| 34 |
-
|
|
|
|
| 35 |
if chunk:
|
| 36 |
f.write(chunk)
|
|
|
|
| 37 |
print(f'File cached to {cache_path} successfully.')
|
| 38 |
return True
|
| 39 |
except RequestException as e:
|
|
@@ -46,7 +45,6 @@ def get_file_structure(repo, token, path="", proxies=None):
|
|
| 46 |
api_url = f"https://huggingface.co/api/models/{repo}/tree/main/{path}"
|
| 47 |
headers = {'Authorization': f'Bearer {token}'}
|
| 48 |
print(f"Fetching file structure from URL: {api_url} with proxies: {proxies}")
|
| 49 |
-
|
| 50 |
try:
|
| 51 |
response = requests.get(api_url, headers=headers, proxies=proxies)
|
| 52 |
response.raise_for_status()
|
|
@@ -63,24 +61,9 @@ def write_file_structure_to_json(file_structure, file_path):
|
|
| 63 |
except IOError as e:
|
| 64 |
print(f"Error writing file structure to JSON: {e}")
|
| 65 |
|
| 66 |
-
# Function to download files in parallel
|
| 67 |
-
def parallel_downloads(file_urls, token, cache_dir, proxies=None):
|
| 68 |
-
with ThreadPoolExecutor() as executor:
|
| 69 |
-
futures = []
|
| 70 |
-
for file_url in file_urls:
|
| 71 |
-
filename = file_url.split("/")[-1]
|
| 72 |
-
cache_path = os.path.join(cache_dir, filename)
|
| 73 |
-
futures.append(executor.submit(download_and_cache_file, file_url, token, cache_path, proxies))
|
| 74 |
-
# Wait for all futures to complete
|
| 75 |
-
for future in futures:
|
| 76 |
-
future.result()
|
| 77 |
-
|
| 78 |
if __name__ == "__main__":
|
| 79 |
-
|
| 80 |
-
"https://huggingface.co/Unicone-Studio/jellyfin_media/resolve/main/films/Funky%20Monkey%202004/Funky%20Monkey%20(2004)%20Web-dl%201080p.mp4"
|
| 81 |
-
]
|
| 82 |
token = os.getenv("TOKEN")
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
parallel_downloads(file_urls, token, cache_dir, proxies)
|
|
|
|
| 3 |
import json
|
| 4 |
import urllib.request
|
| 5 |
from requests.exceptions import RequestException
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
|
| 8 |
def get_system_proxies():
|
| 9 |
try:
|
|
|
|
| 19 |
|
| 20 |
def download_and_cache_file(file_url, token, cache_path, proxies=None):
|
| 21 |
print(f"Downloading file from URL: {file_url} to {cache_path} with proxies: {proxies}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
try:
|
| 23 |
+
response = requests.get(file_url, headers={'Authorization': f'Bearer {token}'}, proxies=proxies, stream=True)
|
| 24 |
response.raise_for_status()
|
|
|
|
| 25 |
|
| 26 |
+
# Get the total file size from the headers
|
| 27 |
+
total_size = int(response.headers.get('content-length', 0))
|
| 28 |
+
|
| 29 |
+
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
|
| 30 |
with open(cache_path, 'wb') as f:
|
| 31 |
+
# Use tqdm to show download progress
|
| 32 |
+
for chunk in tqdm(response.iter_content(chunk_size=8192), total=total_size//8192, unit='KB', unit_scale=True, unit_divisor=1024):
|
| 33 |
if chunk:
|
| 34 |
f.write(chunk)
|
| 35 |
+
|
| 36 |
print(f'File cached to {cache_path} successfully.')
|
| 37 |
return True
|
| 38 |
except RequestException as e:
|
|
|
|
| 45 |
api_url = f"https://huggingface.co/api/models/{repo}/tree/main/{path}"
|
| 46 |
headers = {'Authorization': f'Bearer {token}'}
|
| 47 |
print(f"Fetching file structure from URL: {api_url} with proxies: {proxies}")
|
|
|
|
| 48 |
try:
|
| 49 |
response = requests.get(api_url, headers=headers, proxies=proxies)
|
| 50 |
response.raise_for_status()
|
|
|
|
| 61 |
except IOError as e:
|
| 62 |
print(f"Error writing file structure to JSON: {e}")
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
if __name__ == "__main__":
|
| 65 |
+
file_url = "https://huggingface.co/Unicone-Studio/jellyfin_media/resolve/main/films/Funky%20Monkey%202004/Funky%20Monkey%20(2004)%20Web-dl%201080p.mp4"
|
|
|
|
|
|
|
| 66 |
token = os.getenv("TOKEN")
|
| 67 |
+
cache_path = "tmp/cache/films/Funky%20Monkey%202004/Funky%20Monkey%20(2004)%20Web-dl%201080p.mp4"
|
| 68 |
+
proxy = get_system_proxies()
|
| 69 |
+
download_and_cache_file(file_url, token, cache_path, proxies=proxy)
|
|
|
requirements.txt
CHANGED
|
@@ -3,4 +3,5 @@ Flask-Cors
|
|
| 3 |
requests
|
| 4 |
python-dotenv
|
| 5 |
ffmpy
|
| 6 |
-
ffmpeg-python
|
|
|
|
|
|
| 3 |
requests
|
| 4 |
python-dotenv
|
| 5 |
ffmpy
|
| 6 |
+
ffmpeg-python
|
| 7 |
+
tqdm
|