Update app.py
Browse files
app.py
CHANGED
|
@@ -18,9 +18,59 @@ DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
|
| 18 |
DATASET_DIR.mkdir(parents=True, exist_ok=True)
|
| 19 |
|
| 20 |
DOWNLOAD_URLS = [
|
| 21 |
-
"https://youtu.be/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
]
|
| 23 |
|
|
|
|
| 24 |
USER_AGENTS = [
|
| 25 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
| 26 |
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
|
|
@@ -30,29 +80,40 @@ COOKIES_FILE = Path("youtube.com_cookies.txt").resolve() # Place your exported
|
|
| 30 |
RAPIDAPI_HOST = "yt-api.p.rapidapi.com"
|
| 31 |
# NOTE(review): the fallback below is a hard-coded API key committed to source; it is used whenever the RAPIDAPI_KEY environment variable is unset. Consider rotating it and dropping the default.
RAPIDAPI_KEY = os.environ.get("RAPIDAPI_KEY", "7b63a42ed4msha215d4e2fb17099p17ae62jsn0f42bd187691")
|
| 32 |
# Proxy URL, read from the PROXY environment variable (e.g. http://user:pass@host:port); None when unset.
# The lookup key must be the variable's name: passing the proxy credentials themselves as the key always yields None and leaks them into source.
PROXY = os.environ.get("PROXY")
|
| 33 |
-
HF_DATASET_REPO_ID = os.environ.get("
|
| 34 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 35 |
|
| 36 |
def batch_download_via_api(download_urls, download_dir=DOWNLOAD_DIR):
|
| 37 |
"""
|
| 38 |
Download each video using the public API endpoint and save to download_dir.
|
|
|
|
| 39 |
Also copy to dataset dir and upload to HuggingFace if configured.
|
| 40 |
No zipping, just raw mp4s.
|
| 41 |
"""
|
| 42 |
-
api_url = "https://fred808-data1.hf.space/
|
| 43 |
for url in download_urls:
|
| 44 |
try:
|
| 45 |
resp = requests.post(api_url, json={"urls": [url]}, stream=True)
|
| 46 |
if resp.status_code == 200:
|
| 47 |
-
# Try to
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
with open(out_path, "wb") as f:
|
| 57 |
for chunk in resp.iter_content(chunk_size=8192):
|
| 58 |
if chunk:
|
|
|
|
| 18 |
DATASET_DIR.mkdir(parents=True, exist_ok=True)
|
| 19 |
|
| 20 |
DOWNLOAD_URLS = [
|
| 21 |
+
"https://youtu.be/wJe9zz_G4js",
|
| 22 |
+
"https://youtu.be/6RKL-j1k4Dc",
|
| 23 |
+
"https://youtu.be/tK848ib0BBw",
|
| 24 |
+
"https://youtu.be/r6ZQil-zd5Y",
|
| 25 |
+
"https://youtu.be/-gbCSnkvwNo",
|
| 26 |
+
"https://youtu.be/oso5I277FRY",
|
| 27 |
+
"https://youtu.be/4haAdmHqGOw",
|
| 28 |
+
"https://youtu.be/OeGULgqwJh8",
|
| 29 |
+
"https://youtu.be/QtIfb9JXOJg",
|
| 30 |
+
"https://youtu.be/lgljOqhhgHg",
|
| 31 |
+
"https://youtu.be/bokz-7HbgGM",
|
| 32 |
+
"https://youtu.be/-CiHJ41n6VI",
|
| 33 |
+
"https://youtu.be/Ys4793edotw",
|
| 34 |
+
"https://youtu.be/9N87-yRR5aE",
|
| 35 |
+
"https://youtu.be/5WOrfMz2Sqs",
|
| 36 |
+
"https://youtu.be/1qVbGr_ie30",
|
| 37 |
+
"https://youtu.be/qa_1LjeWsJg",
|
| 38 |
+
"https://youtu.be/9OVvnOh2ZGk",
|
| 39 |
+
"https://youtu.be/xEpVyEi1Hts",
|
| 40 |
+
"https://youtu.be/Wg244y2f9Fw",
|
| 41 |
+
"https://youtu.be/a-4oCHe-hDE",
|
| 42 |
+
"https://youtu.be/Q30-nakUrSM",
|
| 43 |
+
"https://youtu.be/HSm-cq7zd2s",
|
| 44 |
+
"https://youtu.be/x6oWgtJInCQ",
|
| 45 |
+
"https://youtu.be/9gn_1V1sCS8",
|
| 46 |
+
"https://youtu.be/dIv2FXyD3CU",
|
| 47 |
+
"https://youtu.be/SGUBriL9bNU",
|
| 48 |
+
"https://youtu.be/ABayYXu7OfI",
|
| 49 |
+
"https://youtu.be/-c0Evpf8V3A",
|
| 50 |
+
"https://youtu.be/F7VggbBaCsg",
|
| 51 |
+
"https://youtu.be/cn5BC3Vzcsc",
|
| 52 |
+
"https://youtu.be/TbjEVSNPiMQ",
|
| 53 |
+
"https://youtu.be/2PNiRWStZIo",
|
| 54 |
+
"https://youtu.be/UEeXv1bczuE",
|
| 55 |
+
"https://youtu.be/mYgznqvbisM",
|
| 56 |
+
"https://youtu.be/VMLW6XW0k6U",
|
| 57 |
+
"https://youtu.be/G17sBkp-DIk",
|
| 58 |
+
"https://youtu.be/XO783U-B5bg",
|
| 59 |
+
"https://youtu.be/n6V8v5PlvGI",
|
| 60 |
+
"https://youtu.be/VFSg3DkGSXQ",
|
| 61 |
+
"https://youtu.be/WgZ1K3J3RGU",
|
| 62 |
+
"https://youtu.be/DEx1nwxRzXQ",
|
| 63 |
+
"https://youtu.be/xbKDdRDLhJ8",
|
| 64 |
+
"https://youtu.be/F5Z8rj-fekU",
|
| 65 |
+
"https://youtu.be/wkX0z6ygng4",
|
| 66 |
+
"https://youtu.be/rg-VJGf3Z8E",
|
| 67 |
+
"https://youtu.be/Bkme3OeK6DM",
|
| 68 |
+
"https://youtu.be/FsniCv0L-7E",
|
| 69 |
+
"https://youtu.be/fZrY5n-wqZQ",
|
| 70 |
+
"https://youtu.be/XGHBtvnvz9U"
|
| 71 |
]
|
| 72 |
|
| 73 |
+
|
| 74 |
USER_AGENTS = [
|
| 75 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
| 76 |
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
|
|
|
|
| 80 |
RAPIDAPI_HOST = "yt-api.p.rapidapi.com"
|
| 81 |
# NOTE(review): the fallback below is a hard-coded API key committed to source; it is used whenever the RAPIDAPI_KEY environment variable is unset. Consider rotating it and dropping the default.
RAPIDAPI_KEY = os.environ.get("RAPIDAPI_KEY", "7b63a42ed4msha215d4e2fb17099p17ae62jsn0f42bd187691")
|
| 82 |
# Proxy URL, read from the PROXY environment variable (e.g. http://user:pass@host:port); None when unset.
# The lookup key must be the variable's name: passing the proxy credentials themselves as the key always yields None and leaks them into source.
PROXY = os.environ.get("PROXY")
|
| 83 |
+
HF_DATASET_REPO_ID = os.environ.get("HF_DATASET_REPO_ID")
|
| 84 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 85 |
|
| 86 |
def batch_download_via_api(download_urls, download_dir=DOWNLOAD_DIR):
|
| 87 |
"""
|
| 88 |
Download each video using the public API endpoint and save to download_dir.
|
| 89 |
+
Use the filename from Content-Disposition if available, else use video ID.
|
| 90 |
Also copy to dataset dir and upload to HuggingFace if configured.
|
| 91 |
No zipping, just raw mp4s.
|
| 92 |
"""
|
| 93 |
+
api_url = "https://fred808-data1.hf.space/video/download"
|
| 94 |
for url in download_urls:
|
| 95 |
try:
|
| 96 |
resp = requests.post(api_url, json={"urls": [url]}, stream=True)
|
| 97 |
if resp.status_code == 200:
|
| 98 |
+
# Try to get filename from Content-Disposition header
|
| 99 |
+
filename = None
|
| 100 |
+
cd = resp.headers.get("content-disposition")
|
| 101 |
+
if cd and "filename=" in cd:
|
| 102 |
+
import re
|
| 103 |
+
match = re.search(r'filename="?([^";]+)"?', cd)
|
| 104 |
+
if match:
|
| 105 |
+
filename = match.group(1)
|
| 106 |
+
if not filename:
|
| 107 |
+
# Fallback to video ID
|
| 108 |
+
if "v=" in url:
|
| 109 |
+
video_id = url.split("v=")[1].split("&")[0]
|
| 110 |
+
elif "youtu.be/" in url:
|
| 111 |
+
video_id = url.split("youtu.be/")[1].split("?")[0]
|
| 112 |
+
else:
|
| 113 |
+
import hashlib
|
| 114 |
+
video_id = hashlib.md5(url.encode()).hexdigest()
|
| 115 |
+
filename = f"{video_id}.mp4"
|
| 116 |
+
out_path = download_dir / filename
|
| 117 |
with open(out_path, "wb") as f:
|
| 118 |
for chunk in resp.iter_content(chunk_size=8192):
|
| 119 |
if chunk:
|