Fred808 committed on
Commit
b9d8908
·
verified ·
1 Parent(s): 8ff1abc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -33
app.py CHANGED
@@ -68,7 +68,8 @@ VIDEO_URLS = [
68
  "https://youtu.be/XGHBtvnvz9U"
69
  ]
70
 
71
- # === Load processed videos ===
 
72
  if os.path.exists(PROCESSED_FILE):
73
  with open(PROCESSED_FILE, "r") as f:
74
  processed_urls = set(json.load(f))
@@ -93,21 +94,24 @@ def upload_to_dataset(filepath):
93
  token=HF_TOKEN
94
  )
95
  print(f"[↑] Uploaded: {filepath}")
 
96
  except Exception as e:
97
  print(f"[!] Upload failed: {filepath} — {e}")
 
98
 
99
- # === Create output folder ===
100
  if os.path.exists(OUTPUT_DIR):
101
- for file in os.listdir(OUTPUT_DIR):
102
- os.remove(os.path.join(OUTPUT_DIR, file))
103
  else:
104
  os.makedirs(OUTPUT_DIR)
105
 
106
- # === Process in batches ===
107
  unprocessed_urls = [url for url in VIDEO_URLS if url not in processed_urls]
108
 
 
109
  for batch in chunk_urls(unprocessed_urls, 20):
110
- print(f"\n[*] Submitting {len(batch)} URLs to batch API...")
111
  try:
112
  res = requests.post(INIT_API_URL, json={"urls": batch})
113
  res.raise_for_status()
@@ -119,39 +123,32 @@ for batch in chunk_urls(unprocessed_urls, 20):
119
  print(f"[!] Submission error: {e}")
120
  continue
121
 
122
- # === Wait for batch to finish ===
123
  status_url = STATUS_API_URL.format(batch_id)
124
- print("[*] Waiting for batch to complete...")
125
- POLL_DELAY = 5
126
- MAX_WAIT_MINUTES = 5
127
- MAX_RETRIES = int((MAX_WAIT_MINUTES * 60) / POLL_DELAY)
128
-
129
- for attempt in range(MAX_RETRIES):
130
  try:
131
  res = requests.get(status_url)
132
  res.raise_for_status()
133
  data = res.json()
134
- status = data.get("status")
135
  total = data.get("total_urls", "?")
136
  completed = data.get("completed", 0)
137
  failed = data.get("failed", 0)
 
138
 
139
- if status == "completed":
140
- print(f"[✓] All {completed}/{total} videos processed.")
141
- break
142
- elif status in ["started", "processing", "in_progress"]:
143
- print(f"[~] Processing... {completed}/{total} done, {failed} failed.")
144
- elif status == "failed":
145
- print("[!] Batch failed on server.")
146
  break
 
147
  except Exception as e:
148
- print(f"[!] Status check failed: {e}")
149
- time.sleep(POLL_DELAY)
150
- else:
151
- print("[!] Timeout — skipping batch.")
152
- continue
153
 
154
- # === Fetch download links ===
 
 
155
  fetch_url = FETCH_API_URL.format(batch_id)
156
  print("[*] Fetching download list...")
157
  try:
@@ -166,7 +163,7 @@ for batch in chunk_urls(unprocessed_urls, 20):
166
  print(f"[!] Failed to fetch download list: {e}")
167
  continue
168
 
169
- # === Download and upload ===
170
  for video, url in zip(downloads, batch):
171
  filename = video.get("filename")
172
  file_url = video.get("url")
@@ -178,6 +175,7 @@ for batch in chunk_urls(unprocessed_urls, 20):
178
  file_url = BASE_URL + file_url
179
 
180
  local_path = os.path.join(OUTPUT_DIR, filename)
 
181
  try:
182
  print(f"[*] Downloading {filename}...")
183
  with requests.get(file_url, stream=True) as r:
@@ -186,14 +184,17 @@ for batch in chunk_urls(unprocessed_urls, 20):
186
  for chunk in r.iter_content(chunk_size=8192):
187
  f.write(chunk)
188
  print(f"[✓] Downloaded: {filename}")
189
- upload_to_dataset(local_path)
 
 
 
 
 
190
  processed_urls.add(url)
191
  save_processed()
192
- os.remove(local_path)
193
- except Exception as e:
194
- print(f"[!] Error downloading/uploading {filename}: {e}")
195
 
196
  print("[⏱] Waiting 30s before next batch...\n")
197
  time.sleep(30)
198
 
199
- print("\n✅ Done. All batches processed.")
 
68
  "https://youtu.be/XGHBtvnvz9U"
69
  ]
70
 
71
+
72
+ # === Load processed ===
73
  if os.path.exists(PROCESSED_FILE):
74
  with open(PROCESSED_FILE, "r") as f:
75
  processed_urls = set(json.load(f))
 
94
  token=HF_TOKEN
95
  )
96
  print(f"[↑] Uploaded: {filepath}")
97
+ return True
98
  except Exception as e:
99
  print(f"[!] Upload failed: {filepath} — {e}")
100
+ return False
101
 
102
+ # === Prepare output folder ===
103
  if os.path.exists(OUTPUT_DIR):
104
+ for f in os.listdir(OUTPUT_DIR):
105
+ os.remove(os.path.join(OUTPUT_DIR, f))
106
  else:
107
  os.makedirs(OUTPUT_DIR)
108
 
109
+ # === Filter unprocessed URLs ===
110
  unprocessed_urls = [url for url in VIDEO_URLS if url not in processed_urls]
111
 
112
+ # === Process in batches ===
113
  for batch in chunk_urls(unprocessed_urls, 20):
114
+ print(f"\n[*] Submitting batch of {len(batch)} URLs...")
115
  try:
116
  res = requests.post(INIT_API_URL, json={"urls": batch})
117
  res.raise_for_status()
 
123
  print(f"[!] Submission error: {e}")
124
  continue
125
 
126
+ # === Wait until all videos in batch are fully processed ===
127
  status_url = STATUS_API_URL.format(batch_id)
128
+ print("[*] Waiting for all batch videos to be processed...")
129
+ while True:
 
 
 
 
130
  try:
131
  res = requests.get(status_url)
132
  res.raise_for_status()
133
  data = res.json()
134
+
135
  total = data.get("total_urls", "?")
136
  completed = data.get("completed", 0)
137
  failed = data.get("failed", 0)
138
+ status = data.get("status")
139
 
140
+ print(f"[~] Status: {status} — {completed}/{total} done, {failed} failed.")
141
+
142
+ if completed + failed >= int(total):
143
+ print(f"[✓] Batch fully processed: {completed} completed, {failed} failed.")
 
 
 
144
  break
145
+
146
  except Exception as e:
147
+ print(f"[!] Error checking status: {e}")
 
 
 
 
148
 
149
+ time.sleep(5)
150
+
151
+ # === Fetch download list ===
152
  fetch_url = FETCH_API_URL.format(batch_id)
153
  print("[*] Fetching download list...")
154
  try:
 
163
  print(f"[!] Failed to fetch download list: {e}")
164
  continue
165
 
166
+ # === Download & upload ===
167
  for video, url in zip(downloads, batch):
168
  filename = video.get("filename")
169
  file_url = video.get("url")
 
175
  file_url = BASE_URL + file_url
176
 
177
  local_path = os.path.join(OUTPUT_DIR, filename)
178
+
179
  try:
180
  print(f"[*] Downloading {filename}...")
181
  with requests.get(file_url, stream=True) as r:
 
184
  for chunk in r.iter_content(chunk_size=8192):
185
  f.write(chunk)
186
  print(f"[✓] Downloaded: {filename}")
187
+ except Exception as e:
188
+ print(f"[!] Download failed: {filename} — {e}")
189
+ continue
190
+
191
+ # Upload
192
+ if upload_to_dataset(local_path):
193
  processed_urls.add(url)
194
  save_processed()
195
+ os.remove(local_path)
 
 
196
 
197
  print("[⏱] Waiting 30s before next batch...\n")
198
  time.sleep(30)
199
 
200
+ print("\n✅ Done. All batches fully processed.")