Fred808 committed on
Commit
cd04c97
·
verified ·
1 Parent(s): 02745ab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -12
app.py CHANGED
@@ -18,9 +18,59 @@ DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
18
  DATASET_DIR.mkdir(parents=True, exist_ok=True)
19
 
20
  DOWNLOAD_URLS = [
21
- "https://youtu.be/ULCkj_Q5NCc?si=P5fVfGeL9dc47tju", "https://youtu.be/WJkI0cds4m4?si=4GlB22ly6RV32q48"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  ]
23
 
 
24
  USER_AGENTS = [
25
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
26
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
@@ -30,29 +80,40 @@ COOKIES_FILE = Path("youtube.com_cookies.txt").resolve() # Place your exported
30
  RAPIDAPI_HOST = "yt-api.p.rapidapi.com"
31
  RAPIDAPI_KEY = os.environ.get("RAPIDAPI_KEY", "7b63a42ed4msha215d4e2fb17099p17ae62jsn0f42bd187691")
32
  PROXY = os.environ.get("198.23.239.134:6540:kknqfmqe:0wyvognccou8") # Set this environment variable to your proxy, e.g. http://user:pass@host:port
33
- HF_DATASET_REPO_ID = os.environ.get("Fred808/BG1")
34
  HF_TOKEN = os.environ.get("HF_TOKEN")
35
 
36
  def batch_download_via_api(download_urls, download_dir=DOWNLOAD_DIR):
37
  """
38
  Download each video using the public API endpoint and save to download_dir.
 
39
  Also copy to dataset dir and upload to HuggingFace if configured.
40
  No zipping, just raw mp4s.
41
  """
42
- api_url = "https://fred808-data1.hf.space/batch/download"
43
  for url in download_urls:
44
  try:
45
  resp = requests.post(api_url, json={"urls": [url]}, stream=True)
46
  if resp.status_code == 200:
47
- # Try to extract a video ID or use a hash for filename
48
- if "v=" in url:
49
- video_id = url.split("v=")[1].split("&")[0]
50
- elif "youtu.be/" in url:
51
- video_id = url.split("youtu.be/")[1].split("?")[0]
52
- else:
53
- import hashlib
54
- video_id = hashlib.md5(url.encode()).hexdigest()
55
- out_path = download_dir / f"{video_id}.mp4"
 
 
 
 
 
 
 
 
 
 
56
  with open(out_path, "wb") as f:
57
  for chunk in resp.iter_content(chunk_size=8192):
58
  if chunk:
 
18
  DATASET_DIR.mkdir(parents=True, exist_ok=True)
19
 
20
  DOWNLOAD_URLS = [
21
+ "https://youtu.be/wJe9zz_G4js",
22
+ "https://youtu.be/6RKL-j1k4Dc",
23
+ "https://youtu.be/tK848ib0BBw",
24
+ "https://youtu.be/r6ZQil-zd5Y",
25
+ "https://youtu.be/-gbCSnkvwNo",
26
+ "https://youtu.be/oso5I277FRY",
27
+ "https://youtu.be/4haAdmHqGOw",
28
+ "https://youtu.be/OeGULgqwJh8",
29
+ "https://youtu.be/QtIfb9JXOJg",
30
+ "https://youtu.be/lgljOqhhgHg",
31
+ "https://youtu.be/bokz-7HbgGM",
32
+ "https://youtu.be/-CiHJ41n6VI",
33
+ "https://youtu.be/Ys4793edotw",
34
+ "https://youtu.be/9N87-yRR5aE",
35
+ "https://youtu.be/5WOrfMz2Sqs",
36
+ "https://youtu.be/1qVbGr_ie30",
37
+ "https://youtu.be/qa_1LjeWsJg",
38
+ "https://youtu.be/9OVvnOh2ZGk",
39
+ "https://youtu.be/xEpVyEi1Hts",
40
+ "https://youtu.be/Wg244y2f9Fw",
41
+ "https://youtu.be/a-4oCHe-hDE",
42
+ "https://youtu.be/Q30-nakUrSM",
43
+ "https://youtu.be/HSm-cq7zd2s",
44
+ "https://youtu.be/x6oWgtJInCQ",
45
+ "https://youtu.be/9gn_1V1sCS8",
46
+ "https://youtu.be/dIv2FXyD3CU",
47
+ "https://youtu.be/SGUBriL9bNU",
48
+ "https://youtu.be/ABayYXu7OfI",
49
+ "https://youtu.be/-c0Evpf8V3A",
50
+ "https://youtu.be/F7VggbBaCsg",
51
+ "https://youtu.be/cn5BC3Vzcsc",
52
+ "https://youtu.be/TbjEVSNPiMQ",
53
+ "https://youtu.be/2PNiRWStZIo",
54
+ "https://youtu.be/UEeXv1bczuE",
55
+ "https://youtu.be/mYgznqvbisM",
56
+ "https://youtu.be/VMLW6XW0k6U",
57
+ "https://youtu.be/G17sBkp-DIk",
58
+ "https://youtu.be/XO783U-B5bg",
59
+ "https://youtu.be/n6V8v5PlvGI",
60
+ "https://youtu.be/VFSg3DkGSXQ",
61
+ "https://youtu.be/WgZ1K3J3RGU",
62
+ "https://youtu.be/DEx1nwxRzXQ",
63
+ "https://youtu.be/xbKDdRDLhJ8",
64
+ "https://youtu.be/F5Z8rj-fekU",
65
+ "https://youtu.be/wkX0z6ygng4",
66
+ "https://youtu.be/rg-VJGf3Z8E",
67
+ "https://youtu.be/Bkme3OeK6DM",
68
+ "https://youtu.be/FsniCv0L-7E",
69
+ "https://youtu.be/fZrY5n-wqZQ",
70
+ "https://youtu.be/XGHBtvnvz9U"
71
  ]
72
 
73
+
74
  USER_AGENTS = [
75
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
76
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
 
80
  RAPIDAPI_HOST = "yt-api.p.rapidapi.com"
81
  RAPIDAPI_KEY = os.environ.get("RAPIDAPI_KEY", "7b63a42ed4msha215d4e2fb17099p17ae62jsn0f42bd187691")
82
  PROXY = os.environ.get("198.23.239.134:6540:kknqfmqe:0wyvognccou8") # Set this environment variable to your proxy, e.g. http://user:pass@host:port
83
+ HF_DATASET_REPO_ID = os.environ.get("HF_DATASET_REPO_ID")
84
  HF_TOKEN = os.environ.get("HF_TOKEN")
85
 
86
  def batch_download_via_api(download_urls, download_dir=DOWNLOAD_DIR):
87
  """
88
  Download each video using the public API endpoint and save to download_dir.
89
+ Use the filename from Content-Disposition if available, else use video ID.
90
  Also copy to dataset dir and upload to HuggingFace if configured.
91
  No zipping, just raw mp4s.
92
  """
93
+ api_url = "https://fred808-data1.hf.space/video/download"
94
  for url in download_urls:
95
  try:
96
  resp = requests.post(api_url, json={"urls": [url]}, stream=True)
97
  if resp.status_code == 200:
98
+ # Try to get filename from Content-Disposition header
99
+ filename = None
100
+ cd = resp.headers.get("content-disposition")
101
+ if cd and "filename=" in cd:
102
+ import re
103
+ match = re.search(r'filename="?([^";]+)"?', cd)
104
+ if match:
105
+ filename = match.group(1)
106
+ if not filename:
107
+ # Fallback to video ID
108
+ if "v=" in url:
109
+ video_id = url.split("v=")[1].split("&")[0]
110
+ elif "youtu.be/" in url:
111
+ video_id = url.split("youtu.be/")[1].split("?")[0]
112
+ else:
113
+ import hashlib
114
+ video_id = hashlib.md5(url.encode()).hexdigest()
115
+ filename = f"{video_id}.mp4"
116
+ out_path = download_dir / filename
117
  with open(out_path, "wb") as f:
118
  for chunk in resp.iter_content(chunk_size=8192):
119
  if chunk: