Fred808 committed on
Commit
c0223a0
·
verified ·
1 Parent(s): a244bae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +142 -88
app.py CHANGED
@@ -1,10 +1,9 @@
1
  import requests
2
  import os
3
  import time
 
4
  from huggingface_hub import upload_file
5
 
6
-
7
-
8
  # === CONFIGURATION ===
9
  HF_TOKEN = os.environ.get("HF_TOKEN")
10
  REPO_ID = "Fred808/BG1"
@@ -18,7 +17,6 @@ FETCH_API_URL = f"{BASE_URL}/batch/download-all/{{}}"
18
 
19
  # Video URLs to download from YouTube
20
  VIDEO_URLS = [
21
- "https://youtu.be/a-4oCHe-hDE",
22
  "https://youtu.be/Q30-nakUrSM",
23
  "https://youtu.be/HSm-cq7zd2s",
24
  "https://youtu.be/x6oWgtJInCQ",
@@ -53,79 +51,106 @@ VIDEO_URLS = [
53
 
54
  # Output directory
55
  OUTPUT_DIR = "batch_downloads"
56
- if os.path.isfile(OUTPUT_DIR):
57
- os.remove(OUTPUT_DIR)
58
- os.makedirs(OUTPUT_DIR, exist_ok=True)
 
 
 
 
59
 
60
  # Polling settings
61
  POLL_DELAY = 5
62
  MAX_WAIT_MINUTES = 5
63
  MAX_RETRIES = int((MAX_WAIT_MINUTES * 60) / POLL_DELAY)
64
 
65
- # === Step 1: Submit batch ===
66
- print(f"[*] Submitting {len(VIDEO_URLS)} URLs to batch API...")
67
- try:
68
- res = requests.post(INIT_API_URL, json={"urls": VIDEO_URLS})
69
- res.raise_for_status()
70
- batch_id = res.json().get("batch_id")
71
- if not batch_id:
72
- raise Exception("No batch_id returned.")
73
- print(f"[+] Batch submitted. ID: {batch_id}")
74
- except Exception as e:
75
- print(f"[!] Submission error: {e}")
76
- exit(1)
77
-
78
- # === Step 2: Wait for processing to complete ===
79
- status_url = STATUS_API_URL.format(batch_id)
80
- print("[*] Waiting for batch to complete...")
81
-
82
- for attempt in range(MAX_RETRIES):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  try:
84
- res = requests.get(status_url)
85
  res.raise_for_status()
86
  data = res.json()
87
-
88
- status = data.get("status")
89
- total = data.get("total_urls", "?")
90
- completed = data.get("completed", 0)
91
- failed = data.get("failed", 0)
92
-
93
- if status == "completed":
94
- print(f"[✓] All {completed}/{total} videos processed successfully.")
95
- break
96
- elif status in ["started", "processing", "in_progress"]:
97
- print(f"[~] Processing... {completed}/{total} done, {failed} failed.")
98
- elif status == "failed":
99
- print("[!] Batch failed on server.")
100
- exit(1)
101
- else:
102
- print(f"[~] Unknown status '{status}', retrying...")
103
-
104
  except Exception as e:
105
- print(f"[!] Error checking status: {e}")
 
106
 
107
- time.sleep(POLL_DELAY)
108
- else:
109
- print("[!] Timeout — batch not finished in time.")
110
- exit(1)
111
-
112
- # === Step 3: Fetch download list ONCE ===
113
- fetch_url = FETCH_API_URL.format(batch_id)
114
- print("[*] Fetching download list...")
115
- try:
116
- res = requests.get(fetch_url)
117
- res.raise_for_status()
118
- data = res.json()
119
- downloads = data.get("downloads", [])
120
- if not downloads:
121
- print("[!] No downloads available. Exiting.")
122
- exit(1)
123
- print(f"[+] Found {len(downloads)} videos to download.")
124
- except Exception as e:
125
- print(f"[!] Failed to fetch download links: {e}")
126
- exit(1)
127
-
128
- # === Step 4: Download & Upload ===
129
  def upload_to_dataset(filepath):
130
  try:
131
  upload_file(
@@ -139,30 +164,59 @@ def upload_to_dataset(filepath):
139
  except Exception as e:
140
  print(f"[!] Upload failed: {filepath} — {e}")
141
 
142
- for video in downloads:
143
- filename = video.get("filename")
144
- url = video.get("url")
145
 
146
- if not filename or not url:
147
- print("[!] Skipping invalid entry.")
148
- continue
 
149
 
150
- # Prepend base domain if needed
151
- if url.startswith("/"):
152
- url = BASE_URL + url
153
 
154
- local_path = os.path.join(OUTPUT_DIR, filename)
155
- try:
156
- print(f"[*] Downloading {filename}...")
157
- with requests.get(url, stream=True) as r:
158
- r.raise_for_status()
159
- with open(local_path, "wb") as f:
160
- for chunk in r.iter_content(chunk_size=8192):
161
- f.write(chunk)
162
- print(f"[✓] Downloaded: {filename}")
163
-
164
- upload_to_dataset(local_path)
165
- os.remove(local_path)
166
 
167
- except Exception as e:
168
- print(f"[!] Error downloading/uploading {filename}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import requests
2
  import os
3
  import time
4
+ import json
5
  from huggingface_hub import upload_file
6
 
 
 
7
  # === CONFIGURATION ===
8
  HF_TOKEN = os.environ.get("HF_TOKEN")
9
  REPO_ID = "Fred808/BG1"
 
17
 
18
  # Video URLs to download from YouTube
19
  VIDEO_URLS = [
 
20
  "https://youtu.be/Q30-nakUrSM",
21
  "https://youtu.be/HSm-cq7zd2s",
22
  "https://youtu.be/x6oWgtJInCQ",
 
51
 
52
  # Output directory
53
  OUTPUT_DIR = "batch_downloads"
54
# Start every run with an empty download directory.
# If a regular file is squatting on the directory name, remove it first:
# os.makedirs(..., exist_ok=True) only tolerates an existing *directory* and
# raises FileExistsError when the path is a plain file.
if os.path.isfile(OUTPUT_DIR):
    os.remove(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Delete leftover files from a previous run; sub-directories are left alone.
# Underscore-prefixed loop names avoid leaking generic names such as
# `filename` into the module namespace.
for _leftover in os.listdir(OUTPUT_DIR):
    _leftover_path = os.path.join(OUTPUT_DIR, _leftover)
    if os.path.isfile(_leftover_path):
        os.remove(_leftover_path)
61
 
62
  # Polling settings
63
  POLL_DELAY = 5
64
  MAX_WAIT_MINUTES = 5
65
  MAX_RETRIES = int((MAX_WAIT_MINUTES * 60) / POLL_DELAY)
66
 
67
+ # Path to the file that stores processed URLs
68
+ PROCESSED_URLS_FILE = "processed_urls.json"
69
+
70
+ # Load the list of processed URLs from a file
71
def load_processed_urls():
    """Return the set of URLs recorded in PROCESSED_URLS_FILE.

    Returns:
        set: The string entries stored in the file. An empty set is returned
        when the file does not exist, does not contain valid JSON, or does
        not hold a JSON array, so a damaged state file restarts tracking
        instead of crashing the script.
    """
    try:
        with open(PROCESSED_URLS_FILE, "r", encoding="utf-8") as f:
            stored = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been processed yet.
        return set()
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"[!] Ignoring unreadable {PROCESSED_URLS_FILE}: {e}")
        return set()

    if not isinstance(stored, list):
        # set() over a dict or string would silently produce keys/characters.
        print(f"[!] Ignoring {PROCESSED_URLS_FILE}: expected a JSON list.")
        return set()

    # Keep only strings so an unhashable stray entry cannot raise TypeError.
    return {entry for entry in stored if isinstance(entry, str)}
77
+
78
+ # Save the list of processed URLs to a file
79
def save_processed_urls(processed_urls):
    """Persist processed_urls to PROCESSED_URLS_FILE as a JSON array.

    The data is written to a sibling temporary file and then swapped into
    place with os.replace, so an interruption mid-write cannot leave a
    truncated state file behind.

    Args:
        processed_urls: Iterable of JSON-serializable entries (the callers
            visible in this file pass a set of URL strings).
    """
    tmp_path = f"{PROCESSED_URLS_FILE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        # Sort for a stable, diff-friendly file; key=str keeps the sort from
        # failing if a non-string entry ever sneaks in.
        json.dump(sorted(processed_urls, key=str), f)
    os.replace(tmp_path, PROCESSED_URLS_FILE)
82
+
83
+ # Function to split URLs into batches of 20
84
def chunk_urls(urls, batch_size=20):
    """Split urls into consecutive slices of at most batch_size items.

    Args:
        urls: A sequence supporting len() and slicing (e.g. a list).
        batch_size: Maximum number of items per slice; must be >= 1.

    Returns:
        A generator yielding slices of urls in order. The last slice may be
        shorter than batch_size. An empty sequence yields nothing.

    Raises:
        ValueError: If batch_size is smaller than 1. The check runs when the
            function is called rather than on first iteration, so a bad
            size cannot silently produce zero batches.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    return (
        urls[start:start + batch_size]
        for start in range(0, len(urls), batch_size)
    )
87
+
88
+ # === Step 1: Submit batch, process each batch ===
89
def process_batch(urls, timeout=30):
    """Submit urls to the batch API and return the batch id it assigns.

    Args:
        urls: List of video URLs, sent as the JSON body {"urls": urls}.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout, requests can block indefinitely on a stalled
            connection.

    Returns:
        The "batch_id" value from the JSON response, or None when the
        request fails, the response is not usable, or no batch id is
        present. Failures are printed, not raised.
    """
    print(f"[*] Submitting {len(urls)} URLs to batch API...")
    try:
        res = requests.post(INIT_API_URL, json={"urls": urls}, timeout=timeout)
        res.raise_for_status()
        batch_id = res.json().get("batch_id")
    except Exception as e:
        # Deliberately broad: any network, HTTP, or decoding failure is
        # reported and turned into None so the caller decides what to do.
        print(f"[!] Submission error: {e}")
        return None

    if not batch_id:
        print("[!] Submission error: No batch_id returned.")
        return None

    print(f"[+] Batch submitted. ID: {batch_id}")
    return batch_id
102
+
103
def check_status(batch_id, timeout=30):
    """Poll the batch status endpoint until the batch finishes or time runs out.

    Polls up to MAX_RETRIES times, sleeping POLL_DELAY seconds after each
    attempt. Errors during a single poll are printed and the loop continues.

    Args:
        batch_id: Identifier substituted into STATUS_API_URL.
        timeout: Seconds to wait for each HTTP request. Without it a single
            stalled request could block forever and defeat the retry bound.

    Returns:
        True when the server reports status "completed"; False when it
        reports "failed" or when all attempts are used up.
    """
    status_url = STATUS_API_URL.format(batch_id)
    print("[*] Waiting for batch to complete...")

    for _ in range(MAX_RETRIES):
        try:
            res = requests.get(status_url, timeout=timeout)
            res.raise_for_status()
            data = res.json()

            status = data.get("status")
            total = data.get("total_urls", "?")
            completed = data.get("completed", 0)
            failed = data.get("failed", 0)

            if status == "completed":
                print(f"[✓] All {completed}/{total} videos processed successfully.")
                return True
            if status == "failed":
                print("[!] Batch failed on server.")
                return False
            if status in ("started", "processing", "in_progress"):
                print(f"[~] Processing... {completed}/{total} done, {failed} failed.")
            else:
                print(f"[~] Unknown status '{status}', retrying...")
        except Exception as e:
            # Best-effort polling: a transient error should not abort the wait.
            print(f"[!] Error checking status: {e}")

        time.sleep(POLL_DELAY)

    # The loop only ends without returning when every attempt was used.
    print("[!] Timeout — batch not finished in time.")
    return False
136
+
137
def fetch_downloads(batch_id, timeout=30):
    """Fetch the list of downloadable entries for a batch.

    Args:
        batch_id: Identifier substituted into FETCH_API_URL.
        timeout: Seconds to wait for the HTTP request, so a stalled
            connection cannot block the script indefinitely.

    Returns:
        The value of the "downloads" key in the JSON response, or an empty
        list when the key is missing or empty or the request fails.
        Failures are printed, not raised.
    """
    fetch_url = FETCH_API_URL.format(batch_id)
    print("[*] Fetching download list...")
    try:
        res = requests.get(fetch_url, timeout=timeout)
        res.raise_for_status()
        data = res.json()
        downloads = data.get("downloads", [])
    except Exception as e:
        # Best-effort: report and let the caller move on to the next batch.
        print(f"[!] Failed to fetch download links: {e}")
        return []

    if not downloads:
        # The function returns instead of exiting, so the message no longer
        # claims the script is exiting.
        print("[!] No downloads available for this batch.")
        return []

    print(f"[+] Found {len(downloads)} videos to download.")
    return downloads
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  def upload_to_dataset(filepath):
155
  try:
156
  upload_file(
 
164
  except Exception as e:
165
  print(f"[!] Upload failed: {filepath} — {e}")
166
 
167
+ # Main function to download and upload videos
168
+ def main():
169
+ processed_urls = load_processed_urls() # Load previously processed URLs
170
 
171
+ # Process each batch
172
+ for url_batch in chunk_urls(VIDEO_URLS, batch_size=20):
173
+ # Filter out already processed URLs
174
+ urls_to_process = [url for url in url_batch if url not in processed_urls]
175
 
176
+ if not urls_to_process:
177
+ print("[*] All URLs in this batch have already been processed. Skipping...")
178
+ continue
179
 
180
+ batch_id = process_batch(urls_to_process)
181
+ if not batch_id:
182
+ break
 
 
 
 
 
 
 
 
 
183
 
184
+ # Wait for the batch to be processed
185
+ if not check_status(batch_id):
186
+ continue
187
+
188
+ # Fetch the download list for the batch
189
+ downloads = fetch_downloads(batch_id)
190
+ for video in downloads:
191
+ filename = video.get("filename")
192
+ url = video.get("url")
193
+
194
+ if not filename or not url:
195
+ print("[!] Skipping invalid entry.")
196
+ continue
197
+
198
+ # Prepend base domain if needed
199
+ if url.startswith("/"):
200
+ url = BASE_URL + url
201
+
202
+ local_path = os.path.join(OUTPUT_DIR, filename)
203
+ try:
204
+ print(f"[*] Downloading {filename}...")
205
+ with requests.get(url, stream=True) as r:
206
+ r.raise_for_status()
207
+ with open(local_path, "wb") as f:
208
+ for chunk in r.iter_content(chunk_size=8192):
209
+ f.write(chunk)
210
+ print(f"[✓] Downloaded: {filename}")
211
+
212
+ # Upload to dataset
213
+ upload_to_dataset(local_path)
214
+
215
+ # Remove the file after upload
216
+ os.remove(local_path)
217
+
218
+ # Mark this URL as processed
219
+ processed_urls.add(url)
220
+
221
+ except Exception as e:
222
+ print(f"[!]