File size: 7,152 Bytes
3a322ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd04c97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a322ec
 
cd04c97
3a322ec
 
 
 
 
 
 
 
 
cd04c97
1e49a32
3a322ec
 
 
1e49a32
cd04c97
1e49a32
 
3a322ec
cd04c97
3a322ec
 
fd6f122
 
3a322ec
cd04c97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a322ec
 
 
 
 
1e49a32
 
 
 
 
 
3a322ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba9a039
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import hashlib
import os
import random
import re
import shutil
import threading
import time
from pathlib import Path

import requests
from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse
from huggingface_hub import HfApi

app = FastAPI()

# Both directories are resolved to absolute paths once at import time.
DOWNLOAD_DIR = Path("downloaded").resolve()  # raw downloads served via /download
DATASET_DIR = Path("dataset").resolve()      # local mirror of the HF dataset
# NOTE(review): MAX_VIDEOS is not enforced anywhere in this file —
# presumably a cap consumed elsewhere; confirm before relying on it.
MAX_VIDEOS = 5000

# Create both trees up front so request handlers can assume they exist.
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
DATASET_DIR.mkdir(parents=True, exist_ok=True)

# Hard-coded batch of YouTube short links downloaded at startup (see
# startup_event / batch_download_via_api below).
DOWNLOAD_URLS = [
    "https://youtu.be/wJe9zz_G4js",
    "https://youtu.be/6RKL-j1k4Dc",
    "https://youtu.be/tK848ib0BBw",
    "https://youtu.be/r6ZQil-zd5Y",
    "https://youtu.be/-gbCSnkvwNo",
    "https://youtu.be/oso5I277FRY",
    "https://youtu.be/4haAdmHqGOw",
    "https://youtu.be/OeGULgqwJh8",
    "https://youtu.be/QtIfb9JXOJg",
    "https://youtu.be/lgljOqhhgHg",
    "https://youtu.be/bokz-7HbgGM",
    "https://youtu.be/-CiHJ41n6VI",
    "https://youtu.be/Ys4793edotw",
    "https://youtu.be/9N87-yRR5aE",
    "https://youtu.be/5WOrfMz2Sqs",
    "https://youtu.be/1qVbGr_ie30",
    "https://youtu.be/qa_1LjeWsJg",
    "https://youtu.be/9OVvnOh2ZGk",
    "https://youtu.be/xEpVyEi1Hts",
    "https://youtu.be/Wg244y2f9Fw",
    "https://youtu.be/a-4oCHe-hDE",
    "https://youtu.be/Q30-nakUrSM",
    "https://youtu.be/HSm-cq7zd2s",
    "https://youtu.be/x6oWgtJInCQ",
    "https://youtu.be/9gn_1V1sCS8",
    "https://youtu.be/dIv2FXyD3CU",
    "https://youtu.be/SGUBriL9bNU",
    "https://youtu.be/ABayYXu7OfI",
    "https://youtu.be/-c0Evpf8V3A",
    "https://youtu.be/F7VggbBaCsg",
    "https://youtu.be/cn5BC3Vzcsc",
    "https://youtu.be/TbjEVSNPiMQ",
    "https://youtu.be/2PNiRWStZIo",
    "https://youtu.be/UEeXv1bczuE",
    "https://youtu.be/mYgznqvbisM",
    "https://youtu.be/VMLW6XW0k6U",
    "https://youtu.be/G17sBkp-DIk",
    "https://youtu.be/XO783U-B5bg",
    "https://youtu.be/n6V8v5PlvGI",
    "https://youtu.be/VFSg3DkGSXQ",
    "https://youtu.be/WgZ1K3J3RGU",
    "https://youtu.be/DEx1nwxRzXQ",
    "https://youtu.be/xbKDdRDLhJ8",
    "https://youtu.be/F5Z8rj-fekU",
    "https://youtu.be/wkX0z6ygng4",
    "https://youtu.be/rg-VJGf3Z8E",
    "https://youtu.be/Bkme3OeK6DM",
    "https://youtu.be/FsniCv0L-7E",
    "https://youtu.be/fZrY5n-wqZQ",
    "https://youtu.be/XGHBtvnvz9U"
]


# Browser user-agent strings to rotate when scraping directly.
# NOTE(review): not referenced in this file — presumably used elsewhere.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
]

COOKIES_FILE = Path("youtube.com_cookies.txt").resolve()  # Place your exported cookies file here
RAPIDAPI_HOST = "yt-api.p.rapidapi.com"
# SECURITY: an API key is committed here as the fallback default — rotate it
# and set RAPIDAPI_KEY in the environment instead of relying on this literal.
RAPIDAPI_KEY = os.environ.get("RAPIDAPI_KEY", "7b63a42ed4msha215d4e2fb17099p17ae62jsn0f42bd187691")
# FIX: the env-var *name* passed to os.environ.get() was the proxy credential
# string itself, so the lookup always returned None.  Read the documented
# "PROXY" variable instead, e.g. http://user:pass@host:port
PROXY = os.environ.get("PROXY")
HF_DATASET_REPO_ID = os.environ.get("HF_DATASET_REPO_ID")  # target HF dataset repo (optional)
HF_TOKEN = os.environ.get("HF_TOKEN")                      # HF write token (optional)

def batch_download_via_api(download_urls, download_dir=DOWNLOAD_DIR):
    """
    Download each video through the public API endpoint into *download_dir*.

    The output filename comes from the Content-Disposition header when
    available, otherwise from the YouTube video ID (or an MD5 of the URL
    as a last resort).  Each completed file is uploaded to the configured
    HuggingFace dataset, or copied into DATASET_DIR when HF is not set up.
    Errors are printed and never abort the batch (best-effort semantics).
    No zipping, just raw mp4s.
    """
    api_url = "https://fred808-data1.hf.space/video/download"
    for url in download_urls:
        try:
            # FIX: API expects {"url": ...} not {"urls": [...]}
            # Timeout so a stalled server cannot hang this worker thread
            # forever; the context manager closes the streamed connection.
            with requests.post(
                api_url, json={"url": url}, stream=True, timeout=(10, 300)
            ) as resp:
                if resp.status_code != 200:
                    print(f"Failed to download {url}: {resp.status_code} {resp.text}")
                    continue
                out_path = download_dir / _output_filename(
                    url, resp.headers.get("content-disposition")
                )
                with open(out_path, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            print(f"Downloaded {url} to {out_path}")
            # Mirror to HF when configured, otherwise keep a local copy.
            if HF_DATASET_REPO_ID and HF_TOKEN:
                upload_to_hf_dataset(out_path, HF_DATASET_REPO_ID, HF_TOKEN)
            else:
                shutil.copy2(out_path, DATASET_DIR / out_path.name)
        except Exception as e:
            # Best-effort batch: report and continue with the next URL.
            print(f"Error downloading {url}: {e}")


def _output_filename(url, content_disposition):
    """Pick a save name: Content-Disposition > YouTube video ID > URL hash."""
    if content_disposition and "filename=" in content_disposition:
        match = re.search(r'filename="?([^";]+)"?', content_disposition)
        if match:
            # SECURITY: strip any directory components from the server-supplied
            # name so it cannot escape the download directory.
            return Path(match.group(1)).name
    if "v=" in url:
        video_id = url.split("v=")[1].split("&")[0]
    elif "youtu.be/" in url:
        video_id = url.split("youtu.be/")[1].split("?")[0]
    else:
        video_id = hashlib.md5(url.encode()).hexdigest()
    return f"{video_id}.mp4"

@app.on_event("startup")
def startup_event():
    """Launch the batch download in a daemon thread so startup is not blocked."""
    worker = threading.Thread(
        target=batch_download_via_api,
        args=(DOWNLOAD_URLS,),
        daemon=True,
    )
    worker.start()

@app.get("/files")
def list_files():
    """Return the names of all regular files sitting in DOWNLOAD_DIR."""
    entries = filter(Path.is_file, DOWNLOAD_DIR.glob("*"))
    return {"files": [entry.name for entry in entries]}

# FIX: the route was missing the {filename} path parameter, so the handler's
# argument was never bound from the URL path.
@app.get("/download/{filename}")
def download_file(filename: str):
    """
    Serve one previously downloaded file by name.

    Raises HTTP 404 when the file does not exist, and also when the
    (untrusted) name resolves outside DOWNLOAD_DIR via path traversal.
    """
    file_path = (DOWNLOAD_DIR / filename).resolve()
    # SECURITY: reject "../"-style names that escape the download directory.
    try:
        file_path.relative_to(DOWNLOAD_DIR)
    except ValueError:
        raise HTTPException(status_code=404, detail="File not found")
    if not file_path.exists() or not file_path.is_file():
        raise HTTPException(status_code=404, detail="File not found")
    return FileResponse(str(file_path), filename=filename)

@app.get("/")
def root():
    """Landing endpoint: point callers at the download route and list files."""
    files = [f.name for f in DOWNLOAD_DIR.glob("*") if f.is_file()]
    # FIX: the usage hint previously referenced a broken "(unknown)"
    # placeholder instead of the actual {filename} path parameter.
    return {"message": "Use /download/{filename} to download a file.", "available_files": files}

def upload_to_hf_dataset(local_path, repo_id, token):
    """
    Copy *local_path* into DATASET_DIR and upload it to a HuggingFace dataset.

    Best-effort: any failure is printed rather than propagated, matching the
    error handling of the batch downloader that calls this.
    """
    api = HfApi()
    try:
        # Keep a local copy in the dataset dir alongside the remote upload.
        shutil.copy2(local_path, DATASET_DIR / local_path.name)
        api.upload_file(
            path_or_fileobj=str(local_path),
            path_in_repo=local_path.name,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
        )
        print(f"Uploaded {local_path.name} to {repo_id} and copied to dataset dir")
    except Exception as e:
        print(f"Failed to upload {local_path.name} to {repo_id}: {e}")

# FIX: the route was missing the {filename} path parameter, so the handler's
# argument was never bound from the URL path.
@app.get("/dataset/{filename}")
def download_dataset_file(filename: str):
    """
    Serve one file from the local dataset mirror by name.

    Raises HTTP 404 when the file does not exist, and also when the
    (untrusted) name resolves outside DATASET_DIR via path traversal.
    """
    file_path = (DATASET_DIR / filename).resolve()
    # SECURITY: reject "../"-style names that escape the dataset directory.
    try:
        file_path.relative_to(DATASET_DIR)
    except ValueError:
        raise HTTPException(status_code=404, detail="File not found in dataset")
    if not file_path.exists() or not file_path.is_file():
        raise HTTPException(status_code=404, detail="File not found in dataset")
    return FileResponse(str(file_path), filename=filename)

@app.get("/dataset")
def list_dataset_files():
    """Return the names of all regular files currently in DATASET_DIR."""
    entries = filter(Path.is_file, DATASET_DIR.glob("*"))
    return {"dataset_files": [entry.name for entry in entries]}