Spaces:

Fred808
/

dumper

Paused

App Files Files Community

Fred808 committed on Jul 4, 2025

Commit

c0223a0

verified ·

1 Parent(s): a244bae

Update app.py

Browse files

Files changed (1) hide show

app.py +142 -88

app.py CHANGED Viewed

@@ -1,10 +1,9 @@
 import requests
 import os
 import time
 from huggingface_hub import upload_file
 # === CONFIGURATION ===
 HF_TOKEN = os.environ.get("HF_TOKEN")
 REPO_ID = "Fred808/BG1"
@@ -18,7 +17,6 @@ FETCH_API_URL = f"{BASE_URL}/batch/download-all/{{}}"
 # Video URLs to download from YouTube
 VIDEO_URLS = [
-    "https://youtu.be/a-4oCHe-hDE",
     "https://youtu.be/Q30-nakUrSM",
     "https://youtu.be/HSm-cq7zd2s",
     "https://youtu.be/x6oWgtJInCQ",
@@ -53,79 +51,106 @@ VIDEO_URLS = [
 # Output directory
 OUTPUT_DIR = "batch_downloads"
-if os.path.isfile(OUTPUT_DIR):
-    os.remove(OUTPUT_DIR)
-os.makedirs(OUTPUT_DIR, exist_ok=True)
 # Polling settings
 POLL_DELAY = 5
 MAX_WAIT_MINUTES = 5
 MAX_RETRIES = int((MAX_WAIT_MINUTES * 60) / POLL_DELAY)
-# === Step 1: Submit batch ===
-print(f"[*] Submitting {len(VIDEO_URLS)} URLs to batch API...")
-try:
-    res = requests.post(INIT_API_URL, json={"urls": VIDEO_URLS})
-    res.raise_for_status()
-    batch_id = res.json().get("batch_id")
-    if not batch_id:
-        raise Exception("No batch_id returned.")
-    print(f"[+] Batch submitted. ID: {batch_id}")
-except Exception as e:
-    print(f"[!] Submission error: {e}")
-    exit(1)
-# === Step 2: Wait for processing to complete ===
-status_url = STATUS_API_URL.format(batch_id)
-print("[*] Waiting for batch to complete...")
-for attempt in range(MAX_RETRIES):
     try:
-        res = requests.get(status_url)
         res.raise_for_status()
         data = res.json()
-        status = data.get("status")
-        total = data.get("total_urls", "?")
-        completed = data.get("completed", 0)
-        failed = data.get("failed", 0)
-        if status == "completed":
-            print(f"[✓] All {completed}/{total} videos processed successfully.")
-            break
-        elif status in ["started", "processing", "in_progress"]:
-            print(f"[~] Processing... {completed}/{total} done, {failed} failed.")
-        elif status == "failed":
-            print("[!] Batch failed on server.")
-            exit(1)
-        else:
-            print(f"[~] Unknown status '{status}', retrying...")
     except Exception as e:
-        print(f"[!] Error checking status: {e}")
-    time.sleep(POLL_DELAY)
-else:
-    print("[!] Timeout — batch not finished in time.")
-    exit(1)
-# === Step 3: Fetch download list ONCE ===
-fetch_url = FETCH_API_URL.format(batch_id)
-print("[*] Fetching download list...")
-try:
-    res = requests.get(fetch_url)
-    res.raise_for_status()
-    data = res.json()
-    downloads = data.get("downloads", [])
-    if not downloads:
-        print("[!] No downloads available. Exiting.")
-        exit(1)
-    print(f"[+] Found {len(downloads)} videos to download.")
-except Exception as e:
-    print(f"[!] Failed to fetch download links: {e}")
-    exit(1)
-# === Step 4: Download & Upload ===
 def upload_to_dataset(filepath):
     try:
         upload_file(
@@ -139,30 +164,59 @@ def upload_to_dataset(filepath):
     except Exception as e:
         print(f"[!] Upload failed: {filepath} — {e}")
-for video in downloads:
-    filename = video.get("filename")
-    url = video.get("url")
-    if not filename or not url:
-        print("[!] Skipping invalid entry.")
-        continue
-    # Prepend base domain if needed
-    if url.startswith("/"):
-        url = BASE_URL + url
-    local_path = os.path.join(OUTPUT_DIR, filename)
-    try:
-        print(f"[*] Downloading {filename}...")
-        with requests.get(url, stream=True) as r:
-            r.raise_for_status()
-            with open(local_path, "wb") as f:
-                for chunk in r.iter_content(chunk_size=8192):
-                    f.write(chunk)
-        print(f"[✓] Downloaded: {filename}")
-        upload_to_dataset(local_path)
-        os.remove(local_path)
-    except Exception as e:
-        print(f"[!] Error downloading/uploading {filename}: {e}")

 import requests
 import os
 import time
+import json
 from huggingface_hub import upload_file
 # === CONFIGURATION ===
 HF_TOKEN = os.environ.get("HF_TOKEN")
 REPO_ID = "Fred808/BG1"
 # Video URLs to download from YouTube
 VIDEO_URLS = [
     "https://youtu.be/Q30-nakUrSM",
     "https://youtu.be/HSm-cq7zd2s",
     "https://youtu.be/x6oWgtJInCQ",
 # Output directory
 OUTPUT_DIR = "batch_downloads"
+# Start every run with an empty output directory.
+if os.path.isfile(OUTPUT_DIR):
+    # A stray regular file with this name would make os.makedirs() raise
+    # FileExistsError even with exist_ok=True, so remove it first.
+    os.remove(OUTPUT_DIR)
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+# Delete leftover files from an earlier run; subdirectories are left alone.
+for filename in os.listdir(OUTPUT_DIR):
+    file_path = os.path.join(OUTPUT_DIR, filename)
+    if os.path.isfile(file_path):
+        os.remove(file_path)
 # Polling settings
 POLL_DELAY = 5
 MAX_WAIT_MINUTES = 5
 MAX_RETRIES = int((MAX_WAIT_MINUTES * 60) / POLL_DELAY)
+# Path to the file that stores processed URLs
+PROCESSED_URLS_FILE = "processed_urls.json"
+# Load the list of processed URLs from a file
+def load_processed_urls():
+    """Return the set of URLs recorded in PROCESSED_URLS_FILE.
+
+    A missing, unreadable, or malformed file is treated as "nothing processed
+    yet", so a damaged state file cannot stop the script from starting.
+
+    Returns:
+        A set of the string entries found in the file; empty if there are none.
+    """
+    try:
+        with open(PROCESSED_URLS_FILE, "r", encoding="utf-8") as f:
+            stored = json.load(f)
+    except FileNotFoundError:
+        # First run: nothing has been recorded yet.
+        return set()
+    except (OSError, ValueError) as e:
+        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
+        print(f"[!] Could not read {PROCESSED_URLS_FILE}: {e}")
+        return set()
+    if not isinstance(stored, list):
+        print(f"[!] Ignoring {PROCESSED_URLS_FILE}: expected a JSON list.")
+        return set()
+    # Keep only strings so an unhashable entry cannot break set construction.
+    return {entry for entry in stored if isinstance(entry, str)}
+# Save the list of processed URLs to a file
+def save_processed_urls(processed_urls):
+    """Overwrite PROCESSED_URLS_FILE with processed_urls as a JSON array."""
+    with open(PROCESSED_URLS_FILE, "w") as state_file:
+        # Sets are not JSON-serializable, so convert to a list first.
+        url_list = list(processed_urls)
+        json.dump(url_list, state_file)
+# Function to split URLs into batches of 20
+def chunk_urls(urls, batch_size=20):
+    """Yield consecutive slices of urls, each at most batch_size items long."""
+    batch_starts = range(0, len(urls), batch_size)
+    yield from (urls[start:start + batch_size] for start in batch_starts)
+# === Step 1: Submit batch, process each batch ===
+def process_batch(urls, timeout=30):
+    """Submit urls to the batch API and return the new batch ID.
+
+    Args:
+        urls: List of video URLs to submit in a single request.
+        timeout: Seconds to wait for the server. requests waits indefinitely
+            when no timeout is given, which would hang the whole run on a
+            stalled connection.
+
+    Returns:
+        The batch ID reported by the server, or None if the submission
+        failed or the response carried no batch ID.
+    """
+    print(f"[*] Submitting {len(urls)} URLs to batch API...")
+    try:
+        res = requests.post(INIT_API_URL, json={"urls": urls}, timeout=timeout)
+        res.raise_for_status()
+        batch_id = res.json().get("batch_id")
+    except Exception as e:
+        # Best effort: report any network, HTTP, or decoding problem and let
+        # the caller decide what to do with the None result.
+        print(f"[!] Submission error: {e}")
+        return None
+    if not batch_id:
+        print("[!] Submission error: No batch_id returned.")
+        return None
+    print(f"[+] Batch submitted. ID: {batch_id}")
+    return batch_id
+def check_status(batch_id, timeout=30):
+    """Poll the status endpoint until the batch finishes or polling gives up.
+
+    Makes up to MAX_RETRIES requests, sleeping POLL_DELAY seconds after each
+    one. A failed request is logged and counts as one attempt.
+
+    Args:
+        batch_id: ID returned when the batch was submitted.
+        timeout: Seconds to wait for each status request. Without it a single
+            stalled request would block forever and defeat the retry limit.
+
+    Returns:
+        True once the server reports "completed"; False if it reports
+        "failed" or if every attempt is used up first.
+    """
+    status_url = STATUS_API_URL.format(batch_id)
+    print("[*] Waiting for batch to complete...")
+    for _ in range(MAX_RETRIES):
+        try:
+            res = requests.get(status_url, timeout=timeout)
+            res.raise_for_status()
+            data = res.json()
+            status = data.get("status")
+            total = data.get("total_urls", "?")
+            completed = data.get("completed", 0)
+            failed = data.get("failed", 0)
+            if status == "completed":
+                print(f"[✓] All {completed}/{total} videos processed successfully.")
+                return True
+            if status == "failed":
+                print("[!] Batch failed on server.")
+                return False
+            if status in ("started", "processing", "in_progress"):
+                print(f"[~] Processing... {completed}/{total} done, {failed} failed.")
+            else:
+                print(f"[~] Unknown status '{status}', retrying...")
+        except Exception as e:
+            # Transient errors should not abort polling; try again next round.
+            print(f"[!] Error checking status: {e}")
+        time.sleep(POLL_DELAY)
+    # The loop only ends without returning when every attempt was used up.
+    print("[!] Timeout — batch not finished in time.")
+    return False
+def fetch_downloads(batch_id, timeout=30):
+    """Fetch the list of downloadable files for a finished batch.
+
+    Args:
+        batch_id: ID returned when the batch was submitted.
+        timeout: Seconds to wait for the server. requests waits indefinitely
+            when no timeout is given.
+
+    Returns:
+        The value of the "downloads" field of the response, or an empty list
+        if the request failed or the field was missing or empty.
+    """
+    fetch_url = FETCH_API_URL.format(batch_id)
+    print("[*] Fetching download list...")
     try:
+        res = requests.get(fetch_url, timeout=timeout)
         res.raise_for_status()
         data = res.json()
+        downloads = data.get("downloads", [])
+        if not downloads:
+            # Only reports the problem; the caller decides how to proceed.
+            print("[!] No downloads available. Exiting.")
+            return []
+        print(f"[+] Found {len(downloads)} videos to download.")
+        return downloads
     except Exception as e:
+        print(f"[!] Failed to fetch download links: {e}")
+        return []
 def upload_to_dataset(filepath):
     try:
         upload_file(
     except Exception as e:
         print(f"[!] Upload failed: {filepath} — {e}")
+# Main function to download and upload videos
+def main():
+    processed_urls = load_processed_urls()  # Load previously processed URLs
+    # Process each batch
+    for url_batch in chunk_urls(VIDEO_URLS, batch_size=20):
+        # Filter out already processed URLs
+        urls_to_process = [url for url in url_batch if url not in processed_urls]
+        if not urls_to_process:
+            print("[*] All URLs in this batch have already been processed. Skipping...")
+            continue
+        batch_id = process_batch(urls_to_process)
+        if not batch_id:
+            break
+        # Wait for the batch to be processed
+        if not check_status(batch_id):
+            continue
+        # Fetch the download list for the batch
+        downloads = fetch_downloads(batch_id)
+        for video in downloads:
+            filename = video.get("filename")
+            url = video.get("url")
+            if not filename or not url:
+                print("[!] Skipping invalid entry.")
+                continue
+            # Prepend base domain if needed
+            if url.startswith("/"):
+                url = BASE_URL + url
+            local_path = os.path.join(OUTPUT_DIR, filename)
+            try:
+                print(f"[*] Downloading {filename}...")
+                with requests.get(url, stream=True) as r:
+                    r.raise_for_status()
+                    with open(local_path, "wb") as f:
+                        for chunk in r.iter_content(chunk_size=8192):
+                            f.write(chunk)
+                print(f"[✓] Downloaded: {filename}")
+                # Upload to dataset
+                upload_to_dataset(local_path)
+                # Remove the file after upload
+                os.remove(local_path)
+                # Mark this URL as processed
+                processed_urls.add(url)
+            except Exception as e:
+                print(f"[!]