fred1012 committed on
Commit a8d3cb5 · verified · 1 Parent(s): a8e11b4

Upload 4 files

Files changed (4)
  1. Dockerfile +45 -0
  2. download_api.py +267 -0
  3. requirements.txt +11 -0
  4. vision_analyzer.py +675 -0
Dockerfile ADDED
@@ -0,0 +1,45 @@
+ FROM python:3.11-slim-bullseye
+
+ # Install system dependencies
+ RUN sed -i 's/main/main contrib non-free/' /etc/apt/sources.list && \
+     apt-get update && \
+     apt-get install -y --no-install-recommends \
+         unrar \
+         libgl1 \
+         libglib2.0-0 \
+     && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ # Upgrade pip and install core dependencies first
+ RUN pip install --no-cache-dir --upgrade pip setuptools wheel packaging
+
+ # CPU-only PyTorch is expected to be resolved via the CPU wheel index used below
+
+ # Copy requirements and install with special handling for flash_attn
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir \
+     -r requirements.txt \
+     --find-links https://download.pytorch.org/whl/cpu \
+     --extra-index-url https://pypi.org/simple && \
+     # Install remaining packages that might have been skipped
+     pip install --no-cache-dir \
+         accelerate \
+         transformers==4.36.2 \
+         timm==0.9.12 \
+         einops==0.7.0
+
+ # Copy application code
+ COPY . .
+
+ # Create non-root user
+ RUN useradd -m -u 1000 user && \
+     chown -R user:user /app
+
+ USER user
+
+ # Environment variables to suppress warnings
+ ENV HF_HUB_DISABLE_PROGRESS_BARS=1
+ ENV TF_CPP_MIN_LOG_LEVEL=3
+
+ CMD ["uvicorn", "download_api:app", "--host", "0.0.0.0", "--port", "7860"]
download_api.py ADDED
@@ -0,0 +1,267 @@
+ import os
+ import json
+ import time
+ import threading
+ import asyncio
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import JSONResponse, FileResponse
+ from fastapi.staticfiles import StaticFiles
+ import uvicorn
+ from typing import Dict
+ from pathlib import Path
+ import subprocess
+ from datetime import datetime
+
+ import torch
+
+ # Import core functionality
+ from vision_analyzer import (
+     main_processing_loop,
+     processing_status,
+     log_message, FRAMES_OUTPUT_FOLDER,
+ )
+
+ # FastAPI App Definition
+ app = FastAPI(title="Video Analysis API",
+               description="API to access video frame analysis results and extracted images",
+               version="1.0.0")
+
+ # Add CORS middleware to allow cross-origin requests
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],  # Allows all origins
+     allow_credentials=True,
+     allow_methods=["*"],  # Allows all methods
+     allow_headers=["*"],
+ )
+
+ # Global variables for processing and frame tracking
+ processing_thread = None
+ frame_locks = {}  # Dict to track frame locks: {course: {frame: {"locked_by": id, "locked_at": timestamp}}}
+ processed_frames = {}  # Dict to track processed frames: {course: {frame: {"processed_by": id, "processed_at": timestamp}}}
+ LOCK_TIMEOUT = 300  # 5 minutes timeout for locks
+ TRACKING_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "frame_tracking.json")
+
+ def save_tracking_state():
+     """Save frame tracking state to disk"""
+     state = {
+         "frame_locks": frame_locks,
+         "processed_frames": processed_frames
+     }
+     try:
+         with open(TRACKING_FILE, "w") as f:
+             json.dump(state, f, indent=2)
+     except Exception as e:
+         log_message(f"Error saving tracking state: {e}")
+
+ def load_tracking_state():
+     """Load frame tracking state from disk"""
+     global frame_locks, processed_frames
+     try:
+         with open(TRACKING_FILE, "r") as f:
+             state = json.load(f)
+         frame_locks = state.get("frame_locks", {})
+         processed_frames = state.get("processed_frames", {})
+     except FileNotFoundError:
+         log_message("No previous tracking state found")
+     except Exception as e:
+         log_message(f"Error loading tracking state: {e}")
+
+ def check_frame_lock(course: str, frame: str) -> bool:
+     """Check if frame is locked and lock hasn't expired"""
+     if course in frame_locks and frame in frame_locks[course]:
+         lock = frame_locks[course][frame]
+         if time.time() - lock["locked_at"] < LOCK_TIMEOUT:
+             return True
+         # Lock expired, remove it
+         del frame_locks[course][frame]
+         save_tracking_state()
+     return False
+
+ def lock_frame(course: str, frame: str, requester_id: str) -> bool:
+     """Attempt to lock a frame for processing"""
+     if check_frame_lock(course, frame):
+         return False
+
+     if course not in frame_locks:
+         frame_locks[course] = {}
+
+     frame_locks[course][frame] = {
+         "locked_by": requester_id,
+         "locked_at": time.time()
+     }
+     save_tracking_state()
+     return True
+
+ def mark_frame_processed(course: str, frame: str, requester_id: str):
+     """Mark a frame as successfully processed"""
+     if course not in processed_frames:
+         processed_frames[course] = {}
+
+     processed_frames[course][frame] = {
+         "processed_by": requester_id,
+         "processed_at": time.time()
+     }
+
+     # Remove the lock if it exists
+     if course in frame_locks and frame in frame_locks[course]:
+         del frame_locks[course][frame]
+
+     save_tracking_state()
+
+ def log_message(message):
+     """Add a log message with timestamp (local override of the version imported from vision_analyzer)"""
+     timestamp = datetime.now().strftime("%H:%M:%S")
+     log_entry = f"[{timestamp}] {message}"
+     processing_status["logs"].append(log_entry)
+
+     # Keep only the last 100 logs
+     if len(processing_status["logs"]) > 100:
+         processing_status["logs"] = processing_status["logs"][-100:]
+
+     print(log_entry)
+
+ @app.on_event("startup")
+ async def startup_event():
+     """Initialize frame tracking and start processing loop"""
+     # Load frame tracking state
+     load_tracking_state()
+     log_message("✓ Loaded frame tracking state")
+
+     # Start processing thread
+     global processing_thread
+     if not (processing_thread and processing_thread.is_alive()):
+         log_message("🚀 Starting RAR extraction, frame extraction, and vision analysis pipeline in background...")
+         processing_thread = threading.Thread(target=main_processing_loop)
+         processing_thread.daemon = True
+         processing_thread.start()
+
+ @app.get("/")
+ async def root():
+     """Root endpoint that returns basic info"""
+     return {
+         "message": "Video Analysis API",
+         "status": "running",
+         "endpoints": {
+             "/status": "Get processing status",
+             "/courses": "List all available course folders",
+             "/images/{course_folder}": "List images in a course folder",
+             "/images/{course_folder}/{frame_filename}": "Get specific frame image",
+             "/start-processing": "Start processing pipeline",
+             "/stop-processing": "Stop processing pipeline"
+         }
+     }
+
+ @app.get("/status")
+ async def get_status():
+     """Get current processing status"""
+     return {
+         "processing_status": processing_status
+     }
+
+ # ===== NEW IMAGE SERVING ENDPOINTS =====
+
+ @app.post("/middleware/release/frame/{course_folder}/{video}/{frame}")
+ async def release_frame(course_folder: str, video: str, frame: str, requester_id: str):
+     """Release a frame lock"""
+     if course_folder in frame_locks and frame in frame_locks[course_folder]:
+         lock = frame_locks[course_folder][frame]
+         if lock["locked_by"] == requester_id:
+             del frame_locks[course_folder][frame]
+             save_tracking_state()
+             return {"status": "released"}
+     return {"status": "not_found"}
+
+ @app.post("/middleware/release/course/{course_folder}")
+ async def release_course(course_folder: str, requester_id: str):
+     """Release all frame locks for a course"""
+     if course_folder in frame_locks:
+         # Only release frames locked by this requester
+         frames_to_release = [
+             frame for frame, lock in frame_locks[course_folder].items()
+             if lock["locked_by"] == requester_id
+         ]
+         for frame in frames_to_release:
+             del frame_locks[course_folder][frame]
+         save_tracking_state()
+     return {"status": "released"}
+
+ @app.get("/courses")
+ async def list_courses():
+     """List all available course folders with their image counts"""
+     if not os.path.exists(FRAMES_OUTPUT_FOLDER):
+         return {"courses": [], "message": "Frames output folder does not exist yet"}
+
+     courses = []
+     for folder in os.listdir(FRAMES_OUTPUT_FOLDER):
+         folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, folder)
+         if os.path.isdir(folder_path):
+             # Count image files
+             image_count = len([f for f in os.listdir(folder_path)
+                                if f.lower().endswith(('.png', '.jpg', '.jpeg'))])
+             courses.append({
+                 "course_folder": folder,
+                 "image_count": image_count,
+                 "images_url": f"/images/{folder}",
+                 "sample_image_url": f"/images/{folder}/0001.png" if image_count > 0 else None
+             })
+
+     return {
+         "total_courses": len(courses),
+         "courses": courses
+     }
+
+
+ # Signal handlers to prevent accidental shutdown
+ def handle_shutdown(signum, frame):
+     """Prevent shutdown on SIGTERM/SIGINT"""
+     print(f"\n⚠️ Received signal {signum}. Server will continue running.")
+     print("Use Ctrl+Break or kill -9 to force stop.")
+
+ # Setup signal handlers for graceful shutdown prevention
+ import signal
+ signal.signal(signal.SIGINT, handle_shutdown)
+ signal.signal(signal.SIGTERM, handle_shutdown)
+
+ # Server lifecycle events
+ @app.on_event("shutdown")
+ async def shutdown_event():
+     """Save state on shutdown attempt"""
+     save_tracking_state()
+     print("💾 Saved tracking state")
+     print("⚠️ Server shutdown prevented - use Ctrl+Break or kill -9 to force stop")
+     # Prevent shutdown by not returning
+     while True:
+         await asyncio.sleep(1)
+
+ if __name__ == "__main__":
+     # Start the FastAPI server
+     print("🚀 Starting Video Analysis FastAPI Server (Persistent Mode)...")
+     print("API Documentation will be available at: http://localhost:8000/docs")
+     print("API Root endpoint: http://localhost:8000/")
+     print("⚠️ Server will continue running even after processing completes")
+     print("Use Ctrl+Break or kill -9 to force stop")
+
+
+     # Start processing in thread instead of blocking
+     processing_thread = threading.Thread(target=main_processing_loop)
+     processing_thread.daemon = False  # Make non-daemon so it doesn't exit
+     processing_thread.start()
+
+     # Configure uvicorn for persistent running
+     config = uvicorn.Config(
+         app=app,
+         host="0.0.0.0",
+         port=8000,
+         log_level="info",
+         reload=False,
+         workers=1,
+         loop="asyncio",
+         timeout_keep_alive=600,  # Keep connections alive longer
+         access_log=True
+     )
+
+     # Run server with persistent config
+     server = uvicorn.Server(config)
+     server.run()
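
For reference, a minimal client sketch (not part of the uploaded files) of how another worker might poll this API and release a frame lock. It assumes the service is reachable at http://localhost:7860, the port used in the Dockerfile's CMD; the course, video, frame, and requester_id values are placeholders.

import requests

BASE_URL = "http://localhost:7860"  # assumed local deployment; matches the Dockerfile CMD port

# Check the processing pipeline status
status = requests.get(f"{BASE_URL}/status").json()["processing_status"]
print(status["is_running"], status["processed_files"], status["failed_files"])

# List course folders that currently have extracted frames
for course in requests.get(f"{BASE_URL}/courses").json()["courses"]:
    print(course["course_folder"], course["image_count"])

# Release a frame lock held by this worker (names below are placeholders)
resp = requests.post(
    f"{BASE_URL}/middleware/release/frame/example-course/example-video/0001.png",
    params={"requester_id": "worker-1"},
)
print(resp.json())  # {"status": "released"} or {"status": "not_found"}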
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ accelerate
+ fastapi
+ uvicorn
+ opencv-python-headless
+ numpy
+ # pathlib is part of the Python 3 standard library; the PyPI backport is not needed
+ huggingface_hub
+ pillow
+ rarfile
+ python-multipart
+ moviepy
vision_analyzer.py ADDED
@@ -0,0 +1,675 @@
+ import os
+ import json
+ import requests
+ import subprocess
+ import shutil
+ import time
+ import re
+ import threading
+ from typing import Dict, List, Set, Optional, Any
+ from huggingface_hub import HfApi, list_repo_files, CommitOperationAdd, hf_hub_download, hf_hub_url
+
+ import cv2
+ import numpy as np
+ from pathlib import Path
+ import smtplib
+ from email.message import EmailMessage
+ from moviepy import VideoFileClip
+
+ # ==== CONFIGURATION ====
+ HF_TOKEN = os.getenv("HF_TOKEN", "")
+ SOURCE_REPO_ID = os.getenv("SOURCE_REPO", "Fred808/BG1")
+ TARGET_REPO_ID = os.getenv("TARGET_REPO", "Samfredoly/BG_Vid")  # New target repo for uploads
+
+ # Path Configuration
+ DOWNLOAD_FOLDER = "downloads"
+ EXTRACT_FOLDER = "extracted"
+ LOCAL_STATE_FOLDER = ".state"  # Folder to temporarily store the downloaded state file
+ FRAMES_OUTPUT_FOLDER = "frames"  # Folder to store extracted frames
+
+ os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
+ os.makedirs(EXTRACT_FOLDER, exist_ok=True)
+ os.makedirs(LOCAL_STATE_FOLDER, exist_ok=True)
+
+ # State Files
+ FAILED_FILES_LOG = "failed_files.log"
+ HF_STATE_FILE = "processing_state2.json"  # New remote state file name
+
+ # Processing Parameters
+ CHUNK_SIZE = 2
+ PROCESSING_DELAY = 2
+ MAX_RETRIES = 3
+ MIN_FREE_SPACE_GB = 2  # Minimum free space in GB before processing
+
+
+ # Initialize HF API
+ hf_api = HfApi(token=HF_TOKEN)
+
+ # Global State
+ processing_status = {
+     "is_running": False,
+     "current_file": None,
+     "total_files": 0,
+     "processed_files": 0,
+     "failed_files": 0,
+     "extracted_courses": 0,
+     "extracted_videos": 0,
+     "last_update": None,
+     "logs": []
+ }
+
+ def log_message(message: str, level: str = "INFO"):
+     """Log messages with timestamp"""
+     timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
+     log_entry = f"[{timestamp}] {level}: {message}"
+     print(log_entry)
+     processing_status["logs"].append(log_entry)
+     processing_status["last_update"] = timestamp
+     if len(processing_status["logs"]) > 100:
+         processing_status["logs"] = processing_status["logs"][-100:]
+
+ def log_failed_file(filename: str, error: str):
+     """Log failed files to persistent file"""
+     with open(FAILED_FILES_LOG, "a") as f:
+         f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {filename}: {error}\n")
+
+ def get_disk_usage(path: str) -> Dict[str, float]:
+     """Get disk usage statistics in GB"""
+     statvfs = os.statvfs(path)
+     total = statvfs.f_frsize * statvfs.f_blocks / (1024**3)
+     free = statvfs.f_frsize * statvfs.f_bavail / (1024**3)
+     used = total - free
+     return {"total": total, "free": free, "used": used}
+
+ def check_disk_space(path: str = ".") -> bool:
+     """Check if there's enough disk space"""
+     disk_info = get_disk_usage(path)
+     if disk_info["free"] < MIN_FREE_SPACE_GB:
+         log_message(f'⚠️ Low disk space: {disk_info["free"]:.2f}GB free, {disk_info["used"]:.2f}GB used')
+         return False
+     return True
+
+ def cleanup_temp_files():
+     """Clean up temporary files to free space"""
+     log_message("🧹 Cleaning up temporary files...", "INFO")
+
+     # Clean old downloads (keep only current processing file)
+     current_file = processing_status.get("current_file")
+     for file in os.listdir(DOWNLOAD_FOLDER):
+         if file != current_file and file.endswith((".rar", ".zip")):
+             try:
+                 os.remove(os.path.join(DOWNLOAD_FOLDER, file))
+                 log_message(f"🗑️ Removed old download: {file}", "INFO")
+             except OSError:
+                 pass
+
+ def load_json_state(file_path: str, default_value: Dict[str, Any]) -> Dict[str, Any]:
+     """Load state from JSON file with migration logic for new structure."""
+     if os.path.exists(file_path):
+         try:
+             with open(file_path, "r") as f:
+                 data = json.load(f)
+
+             # --- MIGRATION LOGIC ---
+             # 1. Convert old "processed_rars" list to new "file_states" dictionary
+             if "processed_rars" in data and isinstance(data["processed_rars"], list):
+                 log_message("ℹ️ Migrating old 'processed_rars' list to new 'file_states' dictionary.", "INFO")
+                 data["file_states"] = {
+                     filename: "processed" for filename in data.pop("processed_rars")
+                 }
+
+             # 2. Ensure file_states exists and is a dict
+             if "file_states" not in data or not isinstance(data["file_states"], dict):
+                 log_message("ℹ️ Initializing 'file_states' dictionary.", "INFO")
+                 data["file_states"] = {}
+
+             # 3. Ensure next_download_index exists
+             if "next_download_index" not in data:
+                 data["next_download_index"] = 0
+
+             return data
+         except json.JSONDecodeError:
+             log_message(f"⚠️ Corrupted state file: {file_path}", "WARNING")
+     return default_value
+
+ def save_json_state(file_path: str, data: Dict[str, Any]):
+     """Save state to JSON file"""
+     with open(file_path, "w") as f:
+         json.dump(data, f, indent=2)
+
+ def download_hf_state(repo_id: str, filename: str) -> Dict[str, Any]:
+     """Downloads the state file from Hugging Face or returns a default state."""
+     local_path = os.path.join(LOCAL_STATE_FOLDER, filename)
+     # Changed default state to use 'file_states' for the new structure
+     default_state = {"next_download_index": 0, "file_states": {}}
+
+     try:
+         # Check if the file exists in the repo first
+         files = hf_api.list_repo_files(repo_id=repo_id, repo_type="dataset")
+         if filename not in files:
+             log_message(f"ℹ️ State file {filename} not found in {repo_id}. Starting from default state.", "INFO")
+             return default_state
+
+         # Download the file
+         hf_hub_download(
+             repo_id=repo_id,
+             filename=filename,
+             repo_type="dataset",
+             local_dir=LOCAL_STATE_FOLDER,
+             local_dir_use_symlinks=False
+         )
+
+         log_message(f"✅ Successfully downloaded state file from {repo_id}.", "INFO")
+         # Use the modified load_json_state which handles migration
+         return load_json_state(local_path, default_state)
+
+     except Exception as e:
+         log_message(f"⚠️ Failed to download state file from Hugging Face: {str(e)}. Starting from default state.", "WARNING")
+         return default_state
+
+ def upload_hf_state(repo_id: str, filename: str, state: Dict[str, Any]) -> bool:
+     """Uploads the state file to Hugging Face."""
+     local_path = os.path.join(LOCAL_STATE_FOLDER, filename)
+
+     try:
+         # 1. Save the updated state locally
+         save_json_state(local_path, state)
+
+         # 2. Upload the file
+         hf_api.upload_file(
+             path_or_fileobj=local_path,
+             path_in_repo=filename,
+             repo_id=repo_id,
+             repo_type="dataset",
+             commit_message=f"Update processing state: next_index={state['next_download_index']}"
+         )
+         log_message(f"✅ Successfully uploaded updated state file to {repo_id}", "INFO")
+         return True
+     except Exception as e:
+         log_message(f"❌ Failed to upload state file to Hugging Face: {str(e)}", "ERROR")
+         return False
+
+ # --- NEW LOCKING FUNCTIONS ---
+
+ def lock_file_for_processing(rar_filename: str, state: Dict[str, Any]) -> bool:
+     """Marks a file as 'processing' in the state file and uploads the lock."""
+     log_message(f"🔒 Attempting to lock file: {rar_filename} (Marking as 'processing')", "INFO")
+
+     # Update state locally
+     state["file_states"][rar_filename] = "processing"
+
+     # Upload the updated state file immediately to establish the lock
+     if upload_hf_state(TARGET_REPO_ID, HF_STATE_FILE, state):
+         log_message(f"✅ Successfully locked file: {rar_filename}", "INFO")
+         return True
+     else:
+         log_message(f"❌ Failed to upload lock for file: {rar_filename}. Aborting processing.", "ERROR")
+         # Revert local state to avoid confusion if upload failed
+         if rar_filename in state["file_states"]:
+             del state["file_states"][rar_filename]
+         return False
+
+ def unlock_file_as_processed(rar_filename: str, state: Dict[str, Any], next_index: int) -> bool:
+     """Marks a file as 'processed', updates the index, and uploads the state."""
+     log_message(f"🔓 Attempting to unlock file: {rar_filename} (Marking as 'processed')", "INFO")
+
+     # Update state locally
+     state["file_states"][rar_filename] = "processed"
+     state["next_download_index"] = next_index
+
+     # Upload the updated state file
+     if upload_hf_state(TARGET_REPO_ID, HF_STATE_FILE, state):
+         log_message(f"✅ Successfully unlocked and marked as processed: {rar_filename}", "INFO")
+         return True
+     else:
+         log_message(f"❌ Failed to upload final state for file: {rar_filename}. The file is processed locally but state is not updated.", "ERROR")
+         return False
+
+ # --- Original Utility Functions ---
+
+ def download_with_retry(url: str, dest_path: str, max_retries: int = 3) -> bool:
+     """Download file with retry logic and disk space checking"""
+     if not check_disk_space():
+         cleanup_temp_files()
+         if not check_disk_space():
+             log_message("❌ Insufficient disk space even after cleanup", "ERROR")
+             return False
+
+     # NEW FIX: Ensure the directory structure exists before attempting to write the file
+     try:
+         os.makedirs(os.path.dirname(dest_path), exist_ok=True)
+     except Exception as e:
+         log_message(f"❌ Failed to create directory for download path {os.path.dirname(dest_path)}: {str(e)}", "ERROR")
+         return False
+
+     headers = {"Authorization": f"Bearer {HF_TOKEN}"}
+     for attempt in range(max_retries):
+         try:
+             with requests.get(url, headers=headers, stream=True) as r:
+                 r.raise_for_status()
+
+                 # Check content length if available
+                 content_length = r.headers.get("content-length")
+                 if content_length:
+                     size_gb = int(content_length) / (1024**3)
+                     disk_info = get_disk_usage(".")
+                     # Check if there is enough space for the full download
+                     if disk_info["free"] < size_gb + MIN_FREE_SPACE_GB:
+                         log_message(f"⚠️ Not enough space for download ({size_gb:.2f}GB required). Freeing space...", "WARNING")
+                         cleanup_temp_files()
+                         disk_info = get_disk_usage(".")
+                         if disk_info["free"] < size_gb + MIN_FREE_SPACE_GB:
+                             log_message(f"❌ Still not enough space for download. Required: {size_gb + MIN_FREE_SPACE_GB:.2f}GB, Available: {disk_info['free']:.2f}GB", "ERROR")
+                             return False
+
+                 # Download the file chunk by chunk
+                 with open(dest_path, "wb") as f:
+                     for chunk in r.iter_content(chunk_size=8192):
+                         if chunk:  # filter out keep-alive new chunks
+                             f.write(chunk)
+
+             log_message(f"✅ Download successful: {dest_path}", "INFO")
+             return True
+
+         except requests.exceptions.RequestException as e:
+             log_message(f"❌ Download attempt {attempt + 1} failed for {url}: {str(e)}", "WARNING")
+             time.sleep(PROCESSING_DELAY)
+         except Exception as e:
+             log_message(f"❌ An unexpected error occurred during download: {str(e)}", "ERROR")
+             return False
+
+     log_message(f"❌ Failed to download {url} after {max_retries} attempts.", "ERROR")
+     return False
+
+ def extract_rar(rar_path: str, extract_path: str) -> bool:
+     """Extracts a RAR file using unrar (requires unrar to be installed)."""
+     log_message(f"📦 Attempting to extract RAR: {rar_path} to {extract_path}", "INFO")
+
+     # Helper to run a command and return (success, completed_process_or_exception)
+     def _run(cmd):
+         try:
+             proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
+             return True, proc
+         except subprocess.CalledProcessError as e:
+             return False, e
+
+     try:
+         # Create the extraction directory if it doesn't exist
+         os.makedirs(extract_path, exist_ok=True)
+
+         # Aggressive per-file extraction to bypass missing volume dependency check.
+         # 1. List files in the current volume using 'unrar lb'
+         list_cmd = ["unrar", "lb", rar_path]
+         ok_list, list_result = _run(list_cmd)
+
+         if not ok_list:
+             # If listing fails, it's a critical error (e.g., unrar not found, file not a rar, or severe corruption)
+             list_err = (list_result.stderr if isinstance(list_result, subprocess.CalledProcessError) else str(list_result))
+             log_message(f"❌ Failed to list archive contents for {rar_path}. Cannot proceed with extraction. Error: {list_err}", "ERROR")
+             return False
+
+         file_list = [ln.strip() for ln in (list_result.stdout or '').splitlines() if ln.strip()]
+         if not file_list:
+             log_message(f"⚠️ Archive {rar_path} appears empty or listing failed. Cannot extract.", "WARNING")
+             return False
+
+         extracted_any = False
+         # 2. Try to extract each file individually
+         for member in file_list:
+             # Use 'unrar e' (extract to current dir) with '-kb' (keep broken) and '-y' (assume yes)
+             # We use 'e' to avoid path issues and '-kb' to force extraction of the file itself.
+             cmd = ["unrar", "e", "-o+", "-y", "-kb", rar_path, member, extract_path]
+             ok_member, member_result = _run(cmd)
+
+             if ok_member:
+                 extracted_any = True
+                 log_message(f"✅ Extracted member {member} from {rar_path}", "INFO")
+             else:
+                 # Log the failure but continue to the next file
+                 member_err = ''
+                 if isinstance(member_result, subprocess.CalledProcessError):
+                     member_err = (member_result.stderr or '')
+                 log_message(f"⚠️ Could not extract member {member} from {rar_path}. Continuing to next file. Error: {member_err.strip()}", "WARNING")
+
+         if extracted_any:
+             log_message(f"✅ Finished partial extraction from {rar_path}. At least one file was extracted.", "INFO")
+             return True
+         else:
+             log_message(f"❌ Failed to extract any files from {rar_path}.", "ERROR")
+             return False
+
+     except FileNotFoundError:
+         log_message("❌ 'unrar' command not found. Please ensure 'unrar' is installed.", "ERROR")
+         return False
+     except Exception as e:
+         log_message(f"❌ An unexpected error occurred during RAR extraction: {str(e)}", "ERROR")
+         return False
+
+ def extract_audio_from_video(video_path: str, output_wav_path: str) -> bool:
+     """Extracts audio from a video file and saves it as WAV format using moviepy."""
+     log_message(f"🔊 Extracting audio from {video_path} to {output_wav_path}", "INFO")
+
+     try:
+         # Ensure the output directory exists
+         os.makedirs(os.path.dirname(output_wav_path), exist_ok=True)
+
+         # Load the video file
+         video = VideoFileClip(video_path)
+
+         # Check if the video has audio
+         if video.audio is None:
+             log_message(f"⚠️ No audio track found in video: {video_path}", "WARNING")
+             video.close()
+             return False
+
+         # Extract audio and save as WAV
+         video.audio.write_audiofile(output_wav_path, logger=None)
+         video.close()
+
+         if os.path.exists(output_wav_path) and os.path.getsize(output_wav_path) > 0:
+             log_message(f"✅ Successfully extracted audio to WAV: {output_wav_path}", "INFO")
+             return True
+         else:
+             log_message(f"❌ Audio extraction produced empty or missing file: {output_wav_path}", "ERROR")
+             return False
+
+     except Exception as e:
+         log_message(f"❌ An error occurred during audio extraction from {video_path}: {str(e)}", "ERROR")
+         return False
+
+ def upload_folder_to_hf(folder_path: str, repo_id: str, path_in_repo: str, commit_message: str) -> bool:
+     """Uploads an entire folder's contents to a Hugging Face repository."""
+     log_message(f"⬆️ Uploading folder {folder_path} to {repo_id}/{path_in_repo}", "INFO")
+     try:
+         # Collect all files to be uploaded
+         operations = []
+         for root, _, files in os.walk(folder_path):
+             for file in files:
+                 local_path = os.path.join(root, file)
+                 # Calculate the path inside the repository
+                 relative_path = os.path.relpath(local_path, folder_path)
+                 repo_path = os.path.join(path_in_repo, relative_path)
+
+                 operations.append(
+                     CommitOperationAdd(path_in_repo=repo_path, path_or_fileobj=local_path)
+                 )
+
+         if not operations:
+             log_message(f"⚠️ Folder {folder_path} is empty. Skipping upload.", "WARNING")
+             return True  # Consider an empty folder upload successful
+
+         # Perform the upload
+         hf_api.create_commit(
+             repo_id=repo_id,
+             operations=operations,
+             commit_message=commit_message,
+             repo_type="dataset"
+         )
+
+         log_message(f"✅ Successfully uploaded {len(operations)} files from {folder_path}", "INFO")
+         return True
+
+     except Exception as e:
+         log_message(f"❌ Failed to upload folder {folder_path} to Hugging Face: {str(e)}", "ERROR")
+         return False
+
+ def process_rar_file(rar_path: str) -> bool:
+     """
+     Main processing logic for a single RAR file:
+     1. Extract RAR
+     2. Find video files (MP4s)
+     3. Extract audio from each video and convert to WAV
+     4. Upload each WAV file to HF one by one
+     5. Clean up local files
+     """
+     rar_filename = os.path.basename(rar_path)
+     base_name = os.path.splitext(rar_filename)[0]
+
+     # 1. Extract RAR
+     extract_dir = os.path.join(EXTRACT_FOLDER, base_name)
+     if not extract_rar(rar_path, extract_dir):
+         log_failed_file(rar_filename, "RAR extraction failed")
+         return False
+
+     video_files = []
+     # Search for common video extensions recursively
+     for ext in ['*.mp4', '*.mkv', '*.avi', '*.mov', '*.webm']:
+         video_files.extend(Path(extract_dir).rglob(ext))
+
+     if not video_files:
+         log_message(f"⚠️ No video files found in extracted content for {rar_filename}", "WARNING")
+         # Clean up the extracted folder
+         shutil.rmtree(extract_dir, ignore_errors=True)
+         log_message(f"🗑️ Cleaned up extracted folder: {extract_dir}", "INFO")
+         log_failed_file(rar_filename, "No video files found")
+         return False
+
+     success_count = 0
+
+     for video_path_obj in video_files:
+         video_path = str(video_path_obj)
+         video_filename = video_path_obj.name
+         video_base_name = os.path.splitext(video_filename)[0]
+
+         # 3. Extract audio from video and convert to WAV
+         wav_filename = f"{video_base_name}.wav"
+         wav_output_path = os.path.join(EXTRACT_FOLDER, wav_filename)
+
+         if not extract_audio_from_video(video_path, wav_output_path):
+             log_failed_file(rar_filename, f"Failed to extract audio from {video_filename}")
+             continue
+
+         # 4. Upload each WAV file to HF
+         path_in_repo = f"audio/{wav_filename}"
+         commit_message = f"Add audio: {wav_filename} extracted from {video_filename} in archive {rar_filename}"
+
+         try:
+             log_message(f"⬆️ Uploading audio: {wav_filename}", "INFO")
+             hf_api.upload_file(
+                 path_or_fileobj=wav_output_path,
+                 path_in_repo=path_in_repo,
+                 repo_id=TARGET_REPO_ID,
+                 repo_type="dataset",
+                 commit_message=commit_message
+             )
+             log_message(f"✅ Successfully uploaded audio: {wav_filename}", "INFO")
+             success_count += 1
+             processing_status["extracted_videos"] += 1
+
+             # Wait 60 seconds before next upload
+             log_message("⏳ Waiting 60 seconds before next upload...", "INFO")
+             time.sleep(60)
+
+         except Exception as e:
+             log_message(f"❌ Failed to upload audio {wav_filename}: {str(e)}", "ERROR")
+             log_failed_file(rar_filename, f"Failed to upload audio {wav_filename}: {str(e)}")
+
+         finally:
+             # Clean up the WAV file after upload attempt
+             if os.path.exists(wav_output_path):
+                 os.remove(wav_output_path)
+                 log_message(f"🗑️ Cleaned up WAV file: {wav_output_path}", "INFO")
+
+     # 5. Clean up the extracted folder
+     shutil.rmtree(extract_dir, ignore_errors=True)
+     log_message(f"🗑️ Cleaned up extracted folder: {extract_dir}", "INFO")
+
+     if success_count > 0:
+         processing_status["extracted_courses"] += 1  # Assuming one rar is one course
+         return True
+     else:
+         log_message(f"❌ All audio extraction/upload failed for {rar_filename}", "ERROR")
+         return False
+
+ def get_next_file_to_process(repo_id: str, state: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+     """
+     Finds the next file to process from the source repo.
+     Returns: { 'filename': str, 'url': str, 'index': int } or None
+     """
+     log_message(f"🔍 Searching for next file to process in {repo_id}", "INFO")
+
+     try:
+         # 1. List all files in the source repository
+         files_list = hf_api.list_repo_files(repo_id=repo_id, repo_type="dataset")
+
+         # 2. Filter for .rar and .zip files
+         archive_files = sorted([f for f in files_list if f.endswith(('.rar', '.zip'))])
+
+         if not archive_files:
+             log_message("ℹ️ No .rar or .zip files found in the source repository.", "INFO")
+             return None
+
+         processing_status["total_files"] = len(archive_files)
+
+         # 3. Get the next index from the state
+         start_index = state.get("next_download_index", 0)
+
+         # 4. Iterate through files starting from the index
+         for index in range(start_index, len(archive_files)):
+             filename = archive_files[index]
+
+             # Check the state of the file
+             file_state = state["file_states"].get(filename)
+
+             # Only process if the file is not in the state or is marked as 'failed'
+             if file_state is None or file_state == "failed":
+
+                 # Construct the download URL
+                 url = hf_hub_url(repo_id=repo_id, filename=filename, repo_type="dataset", subfolder=None)
+
+                 log_message(f"✅ Found next file: {filename} at index {index}", "INFO")
+                 return {
+                     'filename': filename,
+                     'url': url,
+                     'index': index
+                 }
+
+             elif file_state == "processing":
+                 log_message(f"⚠️ File {filename} is currently marked as 'processing'. Skipping for now.", "WARNING")
+                 # Advance the index if a file is stuck in 'processing' for too long,
+                 # but for now, we'll just skip it and let the loop continue.
+
+             elif file_state == "processed":
+                 log_message(f"ℹ️ File {filename} already processed. Skipping.", "INFO")
+
+         log_message("ℹ️ All files up to the current index have been processed or skipped.", "INFO")
+
+         # If we reach the end of the list, reset the index to 0 to check for new files
+         if start_index >= len(archive_files):
+             log_message("ℹ️ Reached end of file list. Resetting index to 0 for next loop.", "INFO")
+             state["next_download_index"] = 0
+             upload_hf_state(TARGET_REPO_ID, HF_STATE_FILE, state)
+
+         return None
+
+     except Exception as e:
+         log_message(f"❌ Failed to list files from Hugging Face: {str(e)}", "ERROR")
+         return None
+
+ def main_processing_loop():
+     """The main loop that orchestrates the download, processing, and upload cycle."""
+
+     if processing_status["is_running"]:
+         log_message("⚠️ Processing loop is already running.", "WARNING")
+         return
+
+     processing_status["is_running"] = True
+
+     try:
+         log_message("🚀 Starting main processing loop...", "INFO")
+
+         while processing_status["is_running"]:
+
+             # 1. Download the current state
+             current_state = download_hf_state(TARGET_REPO_ID, HF_STATE_FILE)
+
+             # 2. Find the next file to process
+             next_file_info = get_next_file_to_process(SOURCE_REPO_ID, current_state)
+
+             if next_file_info is None:
+                 log_message("💀 No new files to process. Sleeping for a while...", "INFO")
+                 time.sleep(PROCESSING_DELAY * 5)  # Sleep longer if nothing to do
+                 continue
+
+             target_file = next_file_info['filename']
+             rar_url = next_file_info['url']
+             target_index = next_file_info['index']
+
+             processing_status["current_file"] = target_file
+             success = False
+             local_rar_path = os.path.join(DOWNLOAD_FOLDER, target_file)  # defined before the try so the 'finally' cleanup always has a value
+
+             try:
+                 # 3. Lock the file for processing
+                 if not lock_file_for_processing(target_file, current_state):
+                     log_message(f"❌ Failed to lock file {target_file}. Skipping.", "ERROR")
+                     time.sleep(PROCESSING_DELAY)
+                     continue  # Start next iteration
+
+                 # 4. Download the file
+                 log_message(f"⬇️ Downloading file: {target_file}", "INFO")
+
+                 if download_with_retry(rar_url, local_rar_path):
+
+                     # 5. Process the file (extraction, frame processing, zipping, uploading results, etc.)
+                     if process_rar_file(local_rar_path):
+                         success = True
+                         log_message(f"✅ Finished all processing steps for: {target_file}", "INFO")
+                     else:
+                         log_message(f"❌ Processing failed for: {target_file}", "ERROR")
+                 else:
+                     log_message(f"❌ Download failed for: {target_file}", "ERROR")
+
+             except Exception as e:
+                 log_message(f"🔥 An unhandled error occurred while processing {target_file}: {str(e)}", "ERROR")
+                 log_failed_file(target_file, str(e))
+
+             finally:
+                 # 6. Release Lock / Update State
+
+                 # The next index to check will be the one *after* the current file, regardless of success.
+                 next_index_to_save = target_index + 1
+
+                 # Download the latest state again before final upload to ensure we don't overwrite
+                 # changes made by other workers in the meantime (e.g. if they processed a file
+                 # that was before this one in the queue).
+                 current_state = download_hf_state(TARGET_REPO_ID, HF_STATE_FILE)
+
+                 if success:
+                     # Mark as 'processed' and update the next_download_index
+                     unlock_file_as_processed(target_file, current_state, next_index_to_save)
+                     processing_status["processed_files"] += 1
+                 else:
+                     # If processing failed, we still want to release the 'processing' lock,
+                     # but we mark it as 'failed' instead of 'processed' and still advance the index.
+                     log_message(f"⚠️ Processing failed for {target_file}. Marking as 'failed' and advancing index.", "WARNING")
+                     current_state["file_states"][target_file] = "failed"
+                     current_state["next_download_index"] = next_index_to_save
+                     upload_hf_state(TARGET_REPO_ID, HF_STATE_FILE, current_state)
+                     processing_status["failed_files"] += 1
+
+                 # Clean up local files
+                 if os.path.exists(local_rar_path):
+                     os.remove(local_rar_path)
+                     log_message(f"🗑️ Cleaned up local file: {local_rar_path}", "INFO")
+
+                 # Wait a bit before checking for the next file to avoid hammering the HF API
+                 time.sleep(PROCESSING_DELAY)
+
+         log_message("🎉 Processing complete!", "INFO")
+         log_message(f'📊 Final stats: {processing_status["extracted_courses"]} courses extracted, {processing_status["extracted_videos"]} videos processed, frames extracted', "INFO")
+
+     except KeyboardInterrupt:
+         log_message("⏹️ Processing interrupted by user", "WARNING")
+     except Exception as e:
+         log_message(f"❌ Fatal error: {str(e)}", "ERROR")
+     finally:
+         processing_status["is_running"] = False
+         cleanup_temp_files()
+
+ # Expose necessary functions and variables
+ __all__ = [
+     "main_processing_loop",
+     "processing_status",
+     "log_message",
+ ]
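
As a usage note (not part of the commit), the module can also be driven without the FastAPI wrapper, mirroring what download_api.py does at startup: run main_processing_loop in a background thread and poll the shared processing_status dict. The 30-second polling interval is arbitrary, and HF_TOKEN, SOURCE_REPO, and TARGET_REPO must already be set in the environment, since the module reads them at import time.

import threading
import time

# HF_TOKEN, SOURCE_REPO and TARGET_REPO are read from the environment when the module is imported,
# so export them before running this sketch.
from vision_analyzer import main_processing_loop, processing_status

worker = threading.Thread(target=main_processing_loop, daemon=True)
worker.start()

# Poll the shared status dict while the pipeline runs (interval is arbitrary)
while worker.is_alive():
    print(f'{processing_status["processed_files"]} processed, '
          f'{processing_status["failed_files"]} failed, '
          f'current file: {processing_status["current_file"]}')
    time.sleep(30)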