"""FastAPI service that batch-downloads YouTube videos through an external
download API, serves them over HTTP, and optionally mirrors them into a
HuggingFace dataset repo.

Endpoints:
    GET /                     -> usage message + available files
    GET /files                -> list downloaded files
    GET /download/{filename}  -> stream one downloaded file
    GET /dataset              -> list dataset files
    GET /dataset/{filename}   -> stream one dataset file
"""

import hashlib
import os
import re
import shutil
import threading
from pathlib import Path

import requests
from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse
from huggingface_hub import HfApi

app = FastAPI()

DOWNLOAD_DIR = Path("downloaded").resolve()
DATASET_DIR = Path("dataset").resolve()
MAX_VIDEOS = 5000

DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
DATASET_DIR.mkdir(parents=True, exist_ok=True)

DOWNLOAD_URLS = [
    "https://youtu.be/wJe9zz_G4js",
    "https://youtu.be/6RKL-j1k4Dc",
    "https://youtu.be/tK848ib0BBw",
    "https://youtu.be/r6ZQil-zd5Y",
    "https://youtu.be/-gbCSnkvwNo",
    "https://youtu.be/oso5I277FRY",
    "https://youtu.be/4haAdmHqGOw",
    "https://youtu.be/OeGULgqwJh8",
    "https://youtu.be/QtIfb9JXOJg",
    "https://youtu.be/lgljOqhhgHg",
    "https://youtu.be/bokz-7HbgGM",
    "https://youtu.be/-CiHJ41n6VI",
    "https://youtu.be/Ys4793edotw",
    "https://youtu.be/9N87-yRR5aE",
    "https://youtu.be/5WOrfMz2Sqs",
    "https://youtu.be/1qVbGr_ie30",
    "https://youtu.be/qa_1LjeWsJg",
    "https://youtu.be/9OVvnOh2ZGk",
    "https://youtu.be/xEpVyEi1Hts",
    "https://youtu.be/Wg244y2f9Fw",
    "https://youtu.be/a-4oCHe-hDE",
    "https://youtu.be/Q30-nakUrSM",
    "https://youtu.be/HSm-cq7zd2s",
    "https://youtu.be/x6oWgtJInCQ",
    "https://youtu.be/9gn_1V1sCS8",
    "https://youtu.be/dIv2FXyD3CU",
    "https://youtu.be/SGUBriL9bNU",
    "https://youtu.be/ABayYXu7OfI",
    "https://youtu.be/-c0Evpf8V3A",
    "https://youtu.be/F7VggbBaCsg",
    "https://youtu.be/cn5BC3Vzcsc",
    "https://youtu.be/TbjEVSNPiMQ",
    "https://youtu.be/2PNiRWStZIo",
    "https://youtu.be/UEeXv1bczuE",
    "https://youtu.be/mYgznqvbisM",
    "https://youtu.be/VMLW6XW0k6U",
    "https://youtu.be/G17sBkp-DIk",
    "https://youtu.be/XO783U-B5bg",
    "https://youtu.be/n6V8v5PlvGI",
    "https://youtu.be/VFSg3DkGSXQ",
    "https://youtu.be/WgZ1K3J3RGU",
    "https://youtu.be/DEx1nwxRzXQ",
    "https://youtu.be/xbKDdRDLhJ8",
    "https://youtu.be/F5Z8rj-fekU",
    "https://youtu.be/wkX0z6ygng4",
    "https://youtu.be/rg-VJGf3Z8E",
    "https://youtu.be/Bkme3OeK6DM",
    "https://youtu.be/FsniCv0L-7E",
    "https://youtu.be/fZrY5n-wqZQ",
    "https://youtu.be/XGHBtvnvz9U",
]

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
]

COOKIES_FILE = Path("youtube.com_cookies.txt").resolve()  # Place your exported cookies file here

RAPIDAPI_HOST = "yt-api.p.rapidapi.com"
# SECURITY FIX: the original shipped a live RapidAPI key as the default value.
# Never hard-code secrets in source — the key must now come from the env.
RAPIDAPI_KEY = os.environ.get("RAPIDAPI_KEY", "")
# BUG FIX: the original called os.environ.get("<host:port:user:pass>") — the
# proxy *value* (credentials included) was pasted where the variable *name*
# belongs, so PROXY was always None and the credentials leaked into source.
# Set the PROXY env var instead, e.g. http://user:pass@host:port
PROXY = os.environ.get("PROXY")
HF_DATASET_REPO_ID = os.environ.get("HF_DATASET_REPO_ID")
HF_TOKEN = os.environ.get("HF_TOKEN")

# Pre-compiled Content-Disposition filename extractor (hoisted out of the loop).
_FILENAME_RE = re.compile(r'filename="?([^";]+)"?')

# (connect, read) timeout for the download API; the original had none, so a
# stalled connection would hang the worker thread forever.
_REQUEST_TIMEOUT = (15, 600)


def _video_id_from_url(url: str) -> str:
    """Best-effort YouTube video id for *url*, used as a fallback filename.

    Falls back to an MD5 hex digest of the URL when no id can be parsed.
    """
    if "v=" in url:
        return url.split("v=")[1].split("&")[0]
    if "youtu.be/" in url:
        return url.split("youtu.be/")[1].split("?")[0]
    return hashlib.md5(url.encode()).hexdigest()


def _safe_path(base: Path, filename: str) -> Path:
    """Resolve *filename* under *base*, rejecting path-traversal attempts.

    SECURITY FIX: the original built ``base / filename`` from a client-supplied
    path parameter, allowing ``../`` escapes out of the served directory.

    Raises:
        HTTPException: 400 when the resolved path escapes *base*.
    """
    path = (base / filename).resolve()
    try:
        path.relative_to(base)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid filename")
    return path


def batch_download_via_api(download_urls, download_dir=DOWNLOAD_DIR):
    """Download each video via the public API endpoint into *download_dir*.

    The saved filename comes from the Content-Disposition header when present,
    otherwise from the parsed video id. Each file is then either uploaded to
    the configured HuggingFace dataset repo or copied into DATASET_DIR.
    Errors are logged per-URL and never abort the batch (best-effort).
    """
    api_url = "https://fred808-data1.hf.space/video/download"
    for url in download_urls:
        try:
            # The API expects {"url": ...} (a single URL), not {"urls": [...]}.
            resp = requests.post(
                api_url, json={"url": url}, stream=True, timeout=_REQUEST_TIMEOUT
            )
            if resp.status_code != 200:
                print(f"Failed to download {url}: {resp.status_code} {resp.text}")
                continue

            # Prefer the server-provided filename; fall back to the video id.
            filename = None
            cd = resp.headers.get("content-disposition")
            if cd and "filename=" in cd:
                match = _FILENAME_RE.search(cd)
                if match:
                    filename = match.group(1)
            if not filename:
                filename = f"{_video_id_from_url(url)}.mp4"

            out_path = download_dir / filename
            with open(out_path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            print(f"Downloaded {url} to {out_path}")

            # Mirror into the dataset: upload to HF when configured (which also
            # copies locally), otherwise just copy into DATASET_DIR.
            if HF_DATASET_REPO_ID and HF_TOKEN:
                upload_to_hf_dataset(out_path, HF_DATASET_REPO_ID, HF_TOKEN)
            else:
                shutil.copy2(out_path, DATASET_DIR / out_path.name)
        except Exception as e:
            # Best-effort batch: log and move on to the next URL.
            print(f"Error downloading {url}: {e}")


def upload_to_hf_dataset(local_path, repo_id, token):
    """Upload *local_path* to the HuggingFace dataset *repo_id*.

    Also copies the file into DATASET_DIR for local serving. Failures are
    logged, not raised (best-effort mirroring).
    """
    api = HfApi()
    try:
        # Keep a local copy regardless of whether the remote upload succeeds.
        shutil.copy2(local_path, DATASET_DIR / local_path.name)
        api.upload_file(
            path_or_fileobj=str(local_path),
            path_in_repo=local_path.name,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
        )
        print(f"Uploaded {local_path.name} to {repo_id} and copied to dataset dir")
    except Exception as e:
        print(f"Failed to upload {local_path.name} to {repo_id}: {e}")


@app.on_event("startup")
def startup_event():
    """Kick off the batch download in a daemon thread so startup isn't blocked."""
    threading.Thread(
        target=batch_download_via_api, args=(DOWNLOAD_URLS,), daemon=True
    ).start()


@app.get("/files")
def list_files():
    """List filenames currently available in DOWNLOAD_DIR."""
    files = [f.name for f in DOWNLOAD_DIR.glob("*") if f.is_file()]
    return {"files": files}


# BUG FIX: the original route string had lost its `{filename}` placeholder,
# so the path parameter was never bound and the route could not match.
@app.get("/download/{filename}")
def download_file(filename: str):
    """Stream one downloaded file, 404 if absent, 400 on traversal attempts."""
    file_path = _safe_path(DOWNLOAD_DIR, filename)
    if not file_path.exists() or not file_path.is_file():
        raise HTTPException(status_code=404, detail="File not found")
    return FileResponse(str(file_path), filename=filename)


@app.get("/")
def root():
    """Usage hint plus the list of files available for download."""
    files = [f.name for f in DOWNLOAD_DIR.glob("*") if f.is_file()]
    return {
        "message": "Use /download/{filename} to download a file.",
        "available_files": files,
    }


# BUG FIX: same lost-placeholder problem as /download above.
@app.get("/dataset/{filename}")
def download_dataset_file(filename: str):
    """Stream one dataset file, 404 if absent, 400 on traversal attempts."""
    file_path = _safe_path(DATASET_DIR, filename)
    if not file_path.exists() or not file_path.is_file():
        raise HTTPException(status_code=404, detail="File not found in dataset")
    return FileResponse(str(file_path), filename=filename)


@app.get("/dataset")
def list_dataset_files():
    """List filenames currently available in DATASET_DIR."""
    files = [f.name for f in DATASET_DIR.glob("*") if f.is_file()]
    return {"dataset_files": files}