File size: 7,152 Bytes
3a322ec cd04c97 3a322ec cd04c97 3a322ec cd04c97 1e49a32 3a322ec 1e49a32 cd04c97 1e49a32 3a322ec cd04c97 3a322ec fd6f122 3a322ec cd04c97 3a322ec 1e49a32 3a322ec ba9a039 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse
from pathlib import Path
import os
import threading
import requests
from huggingface_hub import HfApi
import random
import time
# ASGI application object; the route decorators further down register on it.
app = FastAPI()

# Numeric cap on videos. Not referenced anywhere in the visible code.
MAX_VIDEOS = 5000

# Absolute paths of the two working directories, anchored at the process's
# current working directory at import time.
DOWNLOAD_DIR = Path("downloaded").resolve()
DATASET_DIR = Path("dataset").resolve()

# Create both directories up front so later writes and listings never hit a
# missing folder.
for _storage_dir in (DOWNLOAD_DIR, DATASET_DIR):
    _storage_dir.mkdir(parents=True, exist_ok=True)
del _storage_dir
# Short-form YouTube links handed to the batch downloader when the app starts.
DOWNLOAD_URLS = [
    "https://youtu.be/wJe9zz_G4js",
    "https://youtu.be/6RKL-j1k4Dc",
    "https://youtu.be/tK848ib0BBw",
    "https://youtu.be/r6ZQil-zd5Y",
    "https://youtu.be/-gbCSnkvwNo",
    "https://youtu.be/oso5I277FRY",
    "https://youtu.be/4haAdmHqGOw",
    "https://youtu.be/OeGULgqwJh8",
    "https://youtu.be/QtIfb9JXOJg",
    "https://youtu.be/lgljOqhhgHg",
    "https://youtu.be/bokz-7HbgGM",
    "https://youtu.be/-CiHJ41n6VI",
    "https://youtu.be/Ys4793edotw",
    "https://youtu.be/9N87-yRR5aE",
    "https://youtu.be/5WOrfMz2Sqs",
    "https://youtu.be/1qVbGr_ie30",
    "https://youtu.be/qa_1LjeWsJg",
    "https://youtu.be/9OVvnOh2ZGk",
    "https://youtu.be/xEpVyEi1Hts",
    "https://youtu.be/Wg244y2f9Fw",
    "https://youtu.be/a-4oCHe-hDE",
    "https://youtu.be/Q30-nakUrSM",
    "https://youtu.be/HSm-cq7zd2s",
    "https://youtu.be/x6oWgtJInCQ",
    "https://youtu.be/9gn_1V1sCS8",
    "https://youtu.be/dIv2FXyD3CU",
    "https://youtu.be/SGUBriL9bNU",
    "https://youtu.be/ABayYXu7OfI",
    "https://youtu.be/-c0Evpf8V3A",
    "https://youtu.be/F7VggbBaCsg",
    "https://youtu.be/cn5BC3Vzcsc",
    "https://youtu.be/TbjEVSNPiMQ",
    "https://youtu.be/2PNiRWStZIo",
    "https://youtu.be/UEeXv1bczuE",
    "https://youtu.be/mYgznqvbisM",
    "https://youtu.be/VMLW6XW0k6U",
    "https://youtu.be/G17sBkp-DIk",
    "https://youtu.be/XO783U-B5bg",
    "https://youtu.be/n6V8v5PlvGI",
    "https://youtu.be/VFSg3DkGSXQ",
    "https://youtu.be/WgZ1K3J3RGU",
    "https://youtu.be/DEx1nwxRzXQ",
    "https://youtu.be/xbKDdRDLhJ8",
    "https://youtu.be/F5Z8rj-fekU",
    "https://youtu.be/wkX0z6ygng4",
    "https://youtu.be/rg-VJGf3Z8E",
    "https://youtu.be/Bkme3OeK6DM",
    "https://youtu.be/FsniCv0L-7E",
    "https://youtu.be/fZrY5n-wqZQ",
    "https://youtu.be/XGHBtvnvz9U",
]
# Browser User-Agent strings. Not referenced anywhere in the visible code.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
]

# Absolute path (relative to the working directory at import time) where an
# exported YouTube cookies file is expected. Place your exported file here.
COOKIES_FILE = Path("youtube.com_cookies.txt").resolve()

RAPIDAPI_HOST = "yt-api.p.rapidapi.com"

# SECURITY NOTE(review): the fallback below is a real-looking API key committed
# to source control. It is kept only so existing importers see the same value;
# rotate the key and supply it exclusively through the RAPIDAPI_KEY
# environment variable, then delete the default.
RAPIDAPI_KEY = os.environ.get("RAPIDAPI_KEY", "7b63a42ed4msha215d4e2fb17099p17ae62jsn0f42bd187691")

# Optional outbound proxy, e.g. http://user:pass@host:port, read from the
# PROXY environment variable. The previous code passed a literal proxy
# credential string as the variable *name*, so the lookup always returned None
# and the credential was exposed in the source; that credential should be
# treated as leaked and rotated.
PROXY = os.environ.get("PROXY")

# Hugging Face dataset upload is enabled only when both values are set.
HF_DATASET_REPO_ID = os.environ.get("HF_DATASET_REPO_ID")
HF_TOKEN = os.environ.get("HF_TOKEN")
def _pick_download_filename(content_disposition, url):
    """Return a safe, directory-free file name for a downloaded video.

    The name is taken from the ``filename=`` part of *content_disposition*
    when present; otherwise it is ``<video id>.mp4`` with the ID parsed from
    *url* (``v=`` query value or ``youtu.be/`` path), or the MD5 hex digest of
    the URL when neither form is recognised.
    """
    import hashlib
    import re

    candidate = None
    if content_disposition and "filename=" in content_disposition:
        match = re.search(r'filename="?([^";]+)"?', content_disposition)
        if match:
            candidate = match.group(1)
    if not candidate:
        # Fallback to video ID
        if "v=" in url:
            video_id = url.split("v=")[1].split("&")[0]
        elif "youtu.be/" in url:
            video_id = url.split("youtu.be/")[1].split("?")[0]
        else:
            video_id = hashlib.md5(url.encode()).hexdigest()
        candidate = f"{video_id}.mp4"

    # The header is chosen by the remote server, so keep only the last path
    # component; otherwise a value such as "../../x" would be written outside
    # the download directory.
    safe_name = Path(candidate.strip().replace("\\", "/")).name
    if safe_name in {"", ".", ".."}:
        safe_name = f"{hashlib.md5(url.encode()).hexdigest()}.mp4"
    return safe_name


def batch_download_via_api(
    download_urls,
    download_dir=DOWNLOAD_DIR,
    *,
    api_url="https://fred808-data1.hf.space/video/download",
    timeout=(10, 600),
):
    """
    Download each video using the public API endpoint and save to download_dir.
    Use the filename from Content-Disposition if available, else use video ID.
    Also copy to dataset dir and upload to HuggingFace if configured.
    No zipping, just raw mp4s.

    Args:
        download_urls: Iterable of video URLs; each is POSTed as {"url": ...}.
        download_dir: Directory that receives the files (str or Path).
        api_url: Endpoint that returns the video bytes for a posted URL.
        timeout: Value passed to ``requests.post`` as ``timeout``
            ((connect, read) seconds) so a stalled server cannot block the
            worker forever.

    Failures are printed and the loop moves on to the next URL; nothing is
    raised to the caller and nothing is returned.
    """
    import shutil

    download_dir = Path(download_dir)
    for url in download_urls:
        # Data is streamed to "<name>.part" and renamed on success, so a
        # truncated transfer never appears under its final file name.
        staging_path = None
        try:
            # The context manager closes the streamed connection even when
            # the body is not read to the end.
            with requests.post(api_url, json={"url": url}, stream=True, timeout=timeout) as resp:
                if resp.status_code != 200:
                    print(f"Failed to download {url}: {resp.status_code} {resp.text}")
                    continue
                filename = _pick_download_filename(resp.headers.get("content-disposition"), url)
                out_path = download_dir / filename
                staging_path = out_path.with_name(f"{out_path.name}.part")
                with open(staging_path, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            staging_path.replace(out_path)
            staging_path = None
            print(f"Downloaded {url} to {out_path}")
            # Copy to dataset dir and upload to HF if configured
            if HF_DATASET_REPO_ID and HF_TOKEN:
                upload_to_hf_dataset(out_path, HF_DATASET_REPO_ID, HF_TOKEN)
            else:
                shutil.copy2(out_path, DATASET_DIR / out_path.name)
        except Exception as e:
            # Per-URL boundary: one bad video must not stop the whole batch.
            print(f"Error downloading {url}: {e}")
        finally:
            if staging_path is not None:
                try:
                    staging_path.unlink()
                except OSError:
                    # Best-effort cleanup; the partial file may never have
                    # been created.
                    pass
@app.on_event("startup")
def startup_event():
    """Launch the batch download of DOWNLOAD_URLS on a daemon thread at startup."""
    downloader = threading.Thread(
        target=batch_download_via_api,
        args=(DOWNLOAD_URLS,),
        daemon=True,
    )
    downloader.start()
@app.get("/files")
def list_files():
    """Return the names of the regular files directly inside DOWNLOAD_DIR."""
    names = []
    for entry in DOWNLOAD_DIR.glob("*"):
        if entry.is_file():
            names.append(entry.name)
    return {"files": names}
@app.get("/download/{filename}")
def download_file(filename: str):
    """Serve one file from DOWNLOAD_DIR as a download.

    Only bare file names are accepted: a value carrying any directory
    component is answered with 404, so a request cannot reach files outside
    DOWNLOAD_DIR.

    Raises:
        HTTPException: 404 when the name is not a bare file name or no such
            regular file exists.
    """
    # `filename` comes straight from the request URL. Path(...).name drops
    # every directory part, so any difference means the caller tried to
    # address something other than a direct child of DOWNLOAD_DIR.
    if Path(filename).name != filename:
        raise HTTPException(status_code=404, detail="File not found")
    file_path = DOWNLOAD_DIR / filename
    # is_file() is already False for a missing path.
    if not file_path.is_file():
        raise HTTPException(status_code=404, detail="File not found")
    return FileResponse(str(file_path), filename=filename)
@app.get("/")
def root():
    """Return a usage hint together with the file names found in DOWNLOAD_DIR."""
    available = list(
        entry.name for entry in DOWNLOAD_DIR.glob("*") if entry.is_file()
    )
    payload = {
        "message": "Use /download/{filename} to download a file.",
        "available_files": available,
    }
    return payload
def upload_to_hf_dataset(local_path, repo_id, token):
    """Copy *local_path* into DATASET_DIR, then upload it to a HF dataset repo.

    The file is stored in the repo under its own base name. Any exception
    raised by the copy or the upload is caught and printed, so the caller
    never sees it; nothing is returned.

    Args:
        local_path: Path object of the file to publish (its ``.name`` is used).
        repo_id: Target dataset repository identifier.
        token: Access token forwarded to ``HfApi.upload_file``.
    """
    import shutil

    hub = HfApi()
    try:
        # Local copy first, so the file is available from the dataset dir
        # even if the upload step then fails.
        shutil.copy2(local_path, DATASET_DIR / local_path.name)
        upload_kwargs = {
            "path_or_fileobj": str(local_path),
            "path_in_repo": local_path.name,
            "repo_id": repo_id,
            "repo_type": "dataset",
            "token": token,
        }
        hub.upload_file(**upload_kwargs)
        print(f"Uploaded {local_path.name} to {repo_id} and copied to dataset dir")
    except Exception as err:
        print(f"Failed to upload {local_path.name} to {repo_id}: {err}")
@app.get("/dataset/{filename}")
def download_dataset_file(filename: str):
    """Serve one file from DATASET_DIR as a download.

    Only bare file names are accepted: a value carrying any directory
    component is answered with 404, so a request cannot reach files outside
    DATASET_DIR.

    Raises:
        HTTPException: 404 when the name is not a bare file name or no such
            regular file exists.
    """
    # `filename` comes straight from the request URL. Path(...).name drops
    # every directory part, so any difference means the caller tried to
    # address something other than a direct child of DATASET_DIR.
    if Path(filename).name != filename:
        raise HTTPException(status_code=404, detail="File not found in dataset")
    file_path = DATASET_DIR / filename
    # is_file() is already False for a missing path.
    if not file_path.is_file():
        raise HTTPException(status_code=404, detail="File not found in dataset")
    return FileResponse(str(file_path), filename=filename)
@app.get("/dataset")
def list_dataset_files():
    """Return the names of the regular files directly inside DATASET_DIR."""
    names = []
    for entry in DATASET_DIR.glob("*"):
        if entry.is_file():
            names.append(entry.name)
    return {"dataset_files": names}