MarneMorgan committed on
Commit
ea19c61
·
verified ·
1 Parent(s): 3fa1ad0

Create downloader.py

Browse files
Files changed (1) hide show
  1. downloader.py +94 -0
downloader.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import time
4
+ import requests
5
+ from pathlib import Path
6
+ from urllib.parse import urlparse
7
+
8
# Upper bound on the number of body bytes accepted for a single download.
# Overridable through the MAX_DOWNLOAD_BYTES environment variable; a variable
# that is set but blank (common in container configs) is treated as unset
# instead of crashing the import with ``int("")``. Non-numeric values still
# raise ValueError so a misconfiguration is not silently ignored.
MAX_BYTES = int((os.getenv("MAX_DOWNLOAD_BYTES") or "").strip() or 250 * 1024 * 1024)

# Value passed as ``timeout=`` to requests.get, in seconds. Overridable through
# DOWNLOAD_TIMEOUT with the same blank-means-default handling as above.
TIMEOUT = int((os.getenv("DOWNLOAD_TIMEOUT") or "").strip() or 60)

# User-Agent header sent with every download request.
UA = "Mozilla/5.0 (compatible; ai-ffmpeg-render/1.0; +https://huggingface.co/spaces)"
12
+
13
+ def _safe_filename(name: str, fallback: str) -> str:
14
+ name = (name or "").strip() or fallback
15
+ name = name.replace("\\", "_").replace("/", "_").replace("..", "_")
16
+ name = re.sub(r"[^a-zA-Z0-9._-]+", "_", name)
17
+ return name[:120]
18
+
19
+ def _guess_ext(headers: dict) -> str:
20
+ ctype = (headers.get("content-type") or "").lower()
21
+ if "video/mp4" in ctype: return ".mp4"
22
+ if "video/webm" in ctype: return ".webm"
23
+ if "video/quicktime" in ctype: return ".mov"
24
+ if "audio/mpeg" in ctype: return ".mp3"
25
+ if "audio/mp4" in ctype: return ".m4a"
26
+ if "audio/wav" in ctype or "audio/x-wav" in ctype: return ".wav"
27
+ if "image/png" in ctype: return ".png"
28
+ if "image/jpeg" in ctype: return ".jpg"
29
+ if "image/webp" in ctype: return ".webp"
30
+ if "text/vtt" in ctype: return ".vtt"
31
+ return ""
32
+
33
class _PermanentDownloadError(ValueError):
    """Body validation failure that repeating the same request cannot fix."""


def _download_once(url: str, dest_dir: Path, filename: str, headers: dict) -> Path:
    """Perform one streamed GET of *url* and write the body under *dest_dir*.

    The output file is removed again if the attempt does not complete, so a
    failed attempt never leaves a partial or invalid file behind.
    """
    with requests.get(
        url,
        stream=True,
        timeout=TIMEOUT,
        allow_redirects=True,
        headers=headers,
    ) as r:
        r.raise_for_status()

        # Infer an extension from the response when the URL did not carry a
        # meaningful one (no dot at all, or the ".bin" fallback name).
        if "." not in filename or filename.endswith(".bin"):
            ext = _guess_ext(r.headers)
            if ext and not filename.lower().endswith(ext):
                if filename.endswith(".bin"):
                    filename = filename[:-4] + ext
                else:
                    filename = filename + ext

        out = dest_dir / filename
        total = 0
        complete = False
        # Opened outside the try block: if open() itself fails there is
        # nothing of ours to clean up.
        sink = open(out, "wb")
        try:
            with sink:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if not chunk:
                        continue
                    if total == 0:
                        head = chunk[:200].lstrip().lower()
                        # Many CDNs return HTML/XML when expired/denied
                        if head.startswith((b"<html", b"<!doctype html", b"<?xml")):
                            raise _PermanentDownloadError(
                                "URL returned HTML/XML (often expired/not a direct file)."
                            )
                    total += len(chunk)
                    if total > MAX_BYTES:
                        raise _PermanentDownloadError(
                            f"File too large (>{MAX_BYTES} bytes)."
                        )
                    f_write = sink.write
                    f_write(chunk)

            if total < 2048:
                raise _PermanentDownloadError(
                    "Downloaded file is too small (likely invalid/expired)."
                )

            complete = True
            return out
        finally:
            if not complete:
                try:
                    out.unlink()
                except OSError:
                    pass


def download_to(url: str, dest_dir: Path, index: int) -> Path:
    """Download *url* into *dest_dir* and return the path of the saved file.

    The file name is the sanitized last path component of the URL, or
    ``input{index}.bin`` when the URL has none; a missing extension is inferred
    from the response's Content-Type when it is recognized.

    Args:
        url: Source URL; only ``http`` and ``https`` schemes are accepted.
        dest_dir: Directory the file is written into.
        index: Number used only to build the fallback file name.

    Returns:
        Path of the downloaded file inside *dest_dir*.

    Raises:
        ValueError: If the scheme is not http/https, if the body fails
            validation (HTML/XML body, larger than ``MAX_BYTES``, smaller than
            2048 bytes), or if both attempts fail for any other reason.
    """
    # NOTE(review): two URLs with the same final path component map to the same
    # output name in dest_dir and would overwrite each other; confirm callers
    # use distinct directories or names.
    # NOTE(review): redirects are followed and the host is not restricted; if
    # `url` is untrusted this can reach internal addresses.
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise ValueError("Only http/https URLs are allowed.")

    filename = _safe_filename(Path(parsed.path).name, f"input{index}.bin")
    request_headers = {"User-Agent": UA, "Accept": "*/*"}

    max_attempts = 2
    last_err = None
    for attempt in range(1, max_attempts + 1):
        try:
            return _download_once(url, dest_dir, filename, request_headers)
        except _PermanentDownloadError as exc:
            # Retrying cannot change the outcome and could re-download up to
            # MAX_BYTES again. The message format matches the retry-exhausted
            # path so callers see the same text as before.
            raise ValueError(f"Download failed after retries: {exc}") from exc
        except Exception as exc:
            last_err = exc
            # Small linear backoff, skipped after the final attempt where it
            # would only delay the error.
            if attempt < max_attempts:
                time.sleep(0.6 * attempt)

    raise ValueError(f"Download failed after retries: {last_err}") from last_err