Update app.py
Browse files
app.py
CHANGED
|
@@ -68,7 +68,8 @@ VIDEO_URLS = [
|
|
| 68 |
"https://youtu.be/XGHBtvnvz9U"
|
| 69 |
]
|
| 70 |
|
| 71 |
-
|
|
|
|
| 72 |
if os.path.exists(PROCESSED_FILE):
|
| 73 |
with open(PROCESSED_FILE, "r") as f:
|
| 74 |
processed_urls = set(json.load(f))
|
|
@@ -93,21 +94,24 @@ def upload_to_dataset(filepath):
|
|
| 93 |
token=HF_TOKEN
|
| 94 |
)
|
| 95 |
print(f"[✓] Uploaded: {filepath}")
|
|
|
|
| 96 |
except Exception as e:
|
| 97 |
print(f"[!] Upload failed: {filepath} β {e}")
|
|
|
|
| 98 |
|
| 99 |
-
# ===
|
| 100 |
if os.path.exists(OUTPUT_DIR):
|
| 101 |
-
for
|
| 102 |
-
os.remove(os.path.join(OUTPUT_DIR,
|
| 103 |
else:
|
| 104 |
os.makedirs(OUTPUT_DIR)
|
| 105 |
|
| 106 |
-
# ===
|
| 107 |
unprocessed_urls = [url for url in VIDEO_URLS if url not in processed_urls]
|
| 108 |
|
|
|
|
| 109 |
for batch in chunk_urls(unprocessed_urls, 20):
|
| 110 |
-
print(f"\n[*] Submitting {len(batch)} URLs
|
| 111 |
try:
|
| 112 |
res = requests.post(INIT_API_URL, json={"urls": batch})
|
| 113 |
res.raise_for_status()
|
|
@@ -119,39 +123,32 @@ for batch in chunk_urls(unprocessed_urls, 20):
|
|
| 119 |
print(f"[!] Submission error: {e}")
|
| 120 |
continue
|
| 121 |
|
| 122 |
-
# === Wait
|
| 123 |
status_url = STATUS_API_URL.format(batch_id)
|
| 124 |
-
print("[*] Waiting for batch to
|
| 125 |
-
|
| 126 |
-
MAX_WAIT_MINUTES = 5
|
| 127 |
-
MAX_RETRIES = int((MAX_WAIT_MINUTES * 60) / POLL_DELAY)
|
| 128 |
-
|
| 129 |
-
for attempt in range(MAX_RETRIES):
|
| 130 |
try:
|
| 131 |
res = requests.get(status_url)
|
| 132 |
res.raise_for_status()
|
| 133 |
data = res.json()
|
| 134 |
-
|
| 135 |
total = data.get("total_urls", "?")
|
| 136 |
completed = data.get("completed", 0)
|
| 137 |
failed = data.get("failed", 0)
|
|
|
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
print(f"[~] Processing... {completed}/{total} done, {failed} failed.")
|
| 144 |
-
elif status == "failed":
|
| 145 |
-
print("[!] Batch failed on server.")
|
| 146 |
break
|
|
|
|
| 147 |
except Exception as e:
|
| 148 |
-
print(f"[!]
|
| 149 |
-
time.sleep(POLL_DELAY)
|
| 150 |
-
else:
|
| 151 |
-
print("[!] Timeout β skipping batch.")
|
| 152 |
-
continue
|
| 153 |
|
| 154 |
-
|
|
|
|
|
|
|
| 155 |
fetch_url = FETCH_API_URL.format(batch_id)
|
| 156 |
print("[*] Fetching download list...")
|
| 157 |
try:
|
|
@@ -166,7 +163,7 @@ for batch in chunk_urls(unprocessed_urls, 20):
|
|
| 166 |
print(f"[!] Failed to fetch download list: {e}")
|
| 167 |
continue
|
| 168 |
|
| 169 |
-
# === Download
|
| 170 |
for video, url in zip(downloads, batch):
|
| 171 |
filename = video.get("filename")
|
| 172 |
file_url = video.get("url")
|
|
@@ -178,6 +175,7 @@ for batch in chunk_urls(unprocessed_urls, 20):
|
|
| 178 |
file_url = BASE_URL + file_url
|
| 179 |
|
| 180 |
local_path = os.path.join(OUTPUT_DIR, filename)
|
|
|
|
| 181 |
try:
|
| 182 |
print(f"[*] Downloading {filename}...")
|
| 183 |
with requests.get(file_url, stream=True) as r:
|
|
@@ -186,14 +184,17 @@ for batch in chunk_urls(unprocessed_urls, 20):
|
|
| 186 |
for chunk in r.iter_content(chunk_size=8192):
|
| 187 |
f.write(chunk)
|
| 188 |
print(f"[✓] Downloaded: {filename}")
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
processed_urls.add(url)
|
| 191 |
save_processed()
|
| 192 |
-
|
| 193 |
-
except Exception as e:
|
| 194 |
-
print(f"[!] Error downloading/uploading {filename}: {e}")
|
| 195 |
|
| 196 |
print("[⏱] Waiting 30s before next batch...\n")
|
| 197 |
time.sleep(30)
|
| 198 |
|
| 199 |
-
print("\n✅ Done. All batches processed.")
|
|
|
|
| 68 |
"https://youtu.be/XGHBtvnvz9U"
|
| 69 |
]
|
| 70 |
|
| 71 |
+
|
| 72 |
+
# === Load processed ===
|
| 73 |
if os.path.exists(PROCESSED_FILE):
|
| 74 |
with open(PROCESSED_FILE, "r") as f:
|
| 75 |
processed_urls = set(json.load(f))
|
|
|
|
| 94 |
token=HF_TOKEN
|
| 95 |
)
|
| 96 |
print(f"[✓] Uploaded: {filepath}")
|
| 97 |
+
return True
|
| 98 |
except Exception as e:
|
| 99 |
print(f"[!] Upload failed: {filepath} β {e}")
|
| 100 |
+
return False
|
| 101 |
|
| 102 |
+
# === Prepare output folder ===
|
| 103 |
if os.path.exists(OUTPUT_DIR):
|
| 104 |
+
for f in os.listdir(OUTPUT_DIR):
|
| 105 |
+
os.remove(os.path.join(OUTPUT_DIR, f))
|
| 106 |
else:
|
| 107 |
os.makedirs(OUTPUT_DIR)
|
| 108 |
|
| 109 |
+
# === Filter unprocessed URLs ===
|
| 110 |
unprocessed_urls = [url for url in VIDEO_URLS if url not in processed_urls]
|
| 111 |
|
| 112 |
+
# === Process in batches ===
|
| 113 |
for batch in chunk_urls(unprocessed_urls, 20):
|
| 114 |
+
print(f"\n[*] Submitting batch of {len(batch)} URLs...")
|
| 115 |
try:
|
| 116 |
res = requests.post(INIT_API_URL, json={"urls": batch})
|
| 117 |
res.raise_for_status()
|
|
|
|
| 123 |
print(f"[!] Submission error: {e}")
|
| 124 |
continue
|
| 125 |
|
| 126 |
+
# === Wait until all videos in batch are fully processed ===
|
| 127 |
status_url = STATUS_API_URL.format(batch_id)
|
| 128 |
+
print("[*] Waiting for all batch videos to be processed...")
|
| 129 |
+
while True:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
try:
|
| 131 |
res = requests.get(status_url)
|
| 132 |
res.raise_for_status()
|
| 133 |
data = res.json()
|
| 134 |
+
|
| 135 |
total = data.get("total_urls", "?")
|
| 136 |
completed = data.get("completed", 0)
|
| 137 |
failed = data.get("failed", 0)
|
| 138 |
+
status = data.get("status")
|
| 139 |
|
| 140 |
+
print(f"[~] Status: {status} β {completed}/{total} done, {failed} failed.")
|
| 141 |
+
|
| 142 |
+
if completed + failed >= int(total):
|
| 143 |
+
print(f"[✓] Batch fully processed: {completed} completed, {failed} failed.")
|
|
|
|
|
|
|
|
|
|
| 144 |
break
|
| 145 |
+
|
| 146 |
except Exception as e:
|
| 147 |
+
print(f"[!] Error checking status: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
+
time.sleep(5)
|
| 150 |
+
|
| 151 |
+
# === Fetch download list ===
|
| 152 |
fetch_url = FETCH_API_URL.format(batch_id)
|
| 153 |
print("[*] Fetching download list...")
|
| 154 |
try:
|
|
|
|
| 163 |
print(f"[!] Failed to fetch download list: {e}")
|
| 164 |
continue
|
| 165 |
|
| 166 |
+
# === Download & upload ===
|
| 167 |
for video, url in zip(downloads, batch):
|
| 168 |
filename = video.get("filename")
|
| 169 |
file_url = video.get("url")
|
|
|
|
| 175 |
file_url = BASE_URL + file_url
|
| 176 |
|
| 177 |
local_path = os.path.join(OUTPUT_DIR, filename)
|
| 178 |
+
|
| 179 |
try:
|
| 180 |
print(f"[*] Downloading {filename}...")
|
| 181 |
with requests.get(file_url, stream=True) as r:
|
|
|
|
| 184 |
for chunk in r.iter_content(chunk_size=8192):
|
| 185 |
f.write(chunk)
|
| 186 |
print(f"[✓] Downloaded: {filename}")
|
| 187 |
+
except Exception as e:
|
| 188 |
+
print(f"[!] Download failed: {filename} β {e}")
|
| 189 |
+
continue
|
| 190 |
+
|
| 191 |
+
# Upload
|
| 192 |
+
if upload_to_dataset(local_path):
|
| 193 |
processed_urls.add(url)
|
| 194 |
save_processed()
|
| 195 |
+
os.remove(local_path)
|
|
|
|
|
|
|
| 196 |
|
| 197 |
print("[⏱] Waiting 30s before next batch...\n")
|
| 198 |
time.sleep(30)
|
| 199 |
|
| 200 |
+
print("\n✅ Done. All batches fully processed.")
|