Spaces:

Fred808
/

VS2

Paused

App Files Files Community

Fred808 committed on Oct 21, 2025

Commit

53d888f

verified ·

1 Parent(s): bd8bf7b

Update vision_analyzer.py

Browse files

Files changed (1) hide show

vision_analyzer.py +411 -380

vision_analyzer.py CHANGED Viewed

@@ -1,407 +1,438 @@
 import os
 import json
 import time
 import threading
-import asyncio
-from fastapi import FastAPI, HTTPException, BackgroundTasks
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, FileResponse
-from fastapi.staticfiles import StaticFiles
-import uvicorn
-from typing import Dict
 from pathlib import Path
-import subprocess
-from datetime import datetime
-import torch
-# Import from vision_analyzer (previously cursor_tracker)
-from vision_analyzer import (
-    main_processing_loop,
-    processing_status,
-    log_message,
-    FRAMES_OUTPUT_FOLDER  # Add this import for frames directory
-)
-# FastAPI App Definition
-app = FastAPI(title="Video Analysis API",
-             description="API to access video frame analysis results and extracted images",
-             version="1.0.0")
-# Add CORS middleware to allow cross-origin requests
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],  # Allows all origins
-    allow_credentials=True,
-    allow_methods=["*"],  # Allows all methods
-    allow_headers=["*"],
-)
-# Global variables for processing and frame tracking
-processing_thread = None
-frame_locks = {}  # Dict to track frame locks: {course: {frame: {"locked_by": id, "locked_at": timestamp}}}
-processed_frames = {}  # Dict to track processed frames: {course: {frame: {"processed_by": id, "processed_at": timestamp}}}
-LOCK_TIMEOUT = 300  # 5 minutes timeout for locks
-TRACKING_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "frame_tracking.json")
-def save_tracking_state():
-    """Save frame tracking state to disk"""
-    state = {
-        "frame_locks": frame_locks,
-        "processed_frames": processed_frames
-    }
-    try:
-        with open(TRACKING_FILE, "w") as f:
-            json.dump(state, f, indent=2)
-    except Exception as e:
-        log_message(f"Error saving tracking state: {e}")
-def load_tracking_state():
-    """Load frame tracking state from disk"""
-    global frame_locks, processed_frames
-    try:
-        with open(TRACKING_FILE, "r") as f:
-            state = json.load(f)
-            frame_locks = state.get("frame_locks", {})
-            processed_frames = state.get("processed_frames", {})
-    except FileNotFoundError:
-        log_message("No previous tracking state found")
-    except Exception as e:
-        log_message(f"Error loading tracking state: {e}")
-def check_frame_lock(course: str, frame: str) -> bool:
-    """Check if frame is locked and lock hasn't expired"""
-    if course in frame_locks and frame in frame_locks[course]:
-        lock = frame_locks[course][frame]
-        if time.time() - lock["locked_at"] < LOCK_TIMEOUT:
-            return True
-        # Lock expired, remove it
-        del frame_locks[course][frame]
-        save_tracking_state()
-    return False
-def lock_frame(course: str, frame: str, requester_id: str) -> bool:
-    """Attempt to lock a frame for processing"""
-    if check_frame_lock(course, frame):
-        return False
-    if course not in frame_locks:
-        frame_locks[course] = {}
-    frame_locks[course][frame] = {
-        "locked_by": requester_id,
-        "locked_at": time.time()
-    }
-    save_tracking_state()
-    return True
-def mark_frame_processed(course: str, frame: str, requester_id: str):
-    """Mark a frame as successfully processed"""
-    if course not in processed_frames:
-        processed_frames[course] = {}
-    processed_frames[course][frame] = {
-        "processed_by": requester_id,
-        "processed_at": time.time()
-    }
-    # Remove the lock if it exists
-    if course in frame_locks and frame in frame_locks[course]:
-        del frame_locks[course][frame]
-    save_tracking_state()
-def log_message(message):
-    """Add a log message with timestamp"""
-    timestamp = datetime.now().strftime("%H:%M:%S")
     log_entry = f"[{timestamp}] {message}"
     processing_status["logs"].append(log_entry)
-    # Keep only the last 100 logs
     if len(processing_status["logs"]) > 100:
         processing_status["logs"] = processing_status["logs"][-100:]
-    print(log_entry)
-@app.on_event("startup")
-async def startup_event():
-    """Initialize frame tracking and start processing loop"""
-    # Load frame tracking state
-    load_tracking_state()
-    log_message("✓ Loaded frame tracking state")
-    # Start processing thread
-    global processing_thread
-    if not (processing_thread and processing_thread.is_alive()):
-        log_message("🚀 Starting RAR extraction, frame extraction, and vision analysis pipeline in background...")
-        processing_thread = threading.Thread(target=main_processing_loop)
-        processing_thread.daemon = True
-        processing_thread.start()
-@app.get("/")
-async def root():
-    """Root endpoint that returns basic info"""
-    return {
-        "message": "Video Analysis API",
-        "status": "running",
-        "endpoints": {
-            "/status": "Get processing status",
-            "/courses": "List all available course folders",
-            "/images/{course_folder}": "List images in a course folder",
-            "/images/{course_folder}/{frame_filename}": "Get specific frame image",
-            "/start-processing": "Start processing pipeline",
-            "/stop-processing": "Stop processing pipeline"
-        }
-    }
-@app.get("/status")
-async def get_status():
-    """Get current processing status"""
-    return {
-        "processing_status": processing_status,
-        "frames_folder": FRAMES_OUTPUT_FOLDER,
-        "frames_folder_exists": os.path.exists(FRAMES_OUTPUT_FOLDER)
-    }
-# ===== NEW IMAGE SERVING ENDPOINTS =====
-@app.get("/middleware/next/course")
-async def get_next_course(requester_id: str):
-    """Get next available course for processing"""
-    if not os.path.exists(FRAMES_OUTPUT_FOLDER):
-        raise HTTPException(status_code=404, detail="No courses available")
-    # Load latest state
-    load_tracking_state()
-    # Find a course with unprocessed frames
-    for folder in os.listdir(FRAMES_OUTPUT_FOLDER):
-        folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, folder)
-        if not os.path.isdir(folder_path):
-            continue
-        # Check if course has any unprocessed frames
-        image_files = [f for f in os.listdir(folder_path)
-                      if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
-        for image in image_files:
-            if (folder not in processed_frames or
-                image not in processed_frames[folder]):
-                return {"course": folder}
-    raise HTTPException(status_code=404, detail="No courses with unprocessed frames")
-@app.get("/middleware/next/image/{course_folder}")
-async def get_next_image(course_folder: str, requester_id: str):
-    """Get next available image from a course"""
-    folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, course_folder)
-    if not os.path.exists(folder_path):
-        raise HTTPException(status_code=404, detail=f"Course not found: {course_folder}")
-    # Load latest state
-    load_tracking_state()
-    # Find first unprocessed and unlocked frame
-    for file in sorted(os.listdir(folder_path)):
-        if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
-            continue
-        # Skip if processed
-        if (course_folder in processed_frames and
-            file in processed_frames[course_folder]):
-            continue
-        # Skip if locked by another requester
-        if check_frame_lock(course_folder, file):
-            continue
-        # Try to lock the frame
-        if lock_frame(course_folder, file, requester_id):
-            file_path = os.path.join(folder_path, file)
-            file_stats = os.stat(file_path)
-            return {
-                "file_id": f"frame:{course_folder}/{file}",
-                "frame": file,
-                "video": os.path.splitext(file)[0],
-                "size_bytes": file_stats.st_size,
-                "modified_time": time.ctime(file_stats.st_mtime),
-                "url": f"/images/{course_folder}/{file}"
-            }
-    raise HTTPException(status_code=404, detail="No available frames in course")
-@app.post("/middleware/release/frame/{course_folder}/{video}/{frame}")
-async def release_frame(course_folder: str, video: str, frame: str, requester_id: str):
-    """Release a frame lock"""
-    if course_folder in frame_locks and frame in frame_locks[course_folder]:
-        lock = frame_locks[course_folder][frame]
-        if lock["locked_by"] == requester_id:
-            del frame_locks[course_folder][frame]
-            save_tracking_state()
-            return {"status": "released"}
-    return {"status": "not_found"}
-@app.post("/middleware/release/course/{course_folder}")
-async def release_course(course_folder: str, requester_id: str):
-    """Release all frame locks for a course"""
-    if course_folder in frame_locks:
-        # Only release frames locked by this requester
-        frames_to_release = [
-            frame for frame, lock in frame_locks[course_folder].items()
-            if lock["locked_by"] == requester_id
-        ]
-        for frame in frames_to_release:
-            del frame_locks[course_folder][frame]
-        save_tracking_state()
-    return {"status": "released"}
-@app.get("/images/{course_folder}/{frame_filename}")
-async def get_frame_image(course_folder: str, frame_filename: str, requester_id: str = None):
-    """
-    Serve extracted frame images from course folders with locking
-    Args:
-        course_folder: The course folder name (e.g., "course1_video1_mp4_frames")
-        frame_filename: The frame file name (e.g., "0001.png")
-        requester_id: Optional requester ID for frame locking
-    """
-    # Load latest state
-    load_tracking_state()
-    # Construct the full path to the image
-    image_path = os.path.join(FRAMES_OUTPUT_FOLDER, course_folder, frame_filename)
-    # Check if file exists
-    if not os.path.exists(image_path):
-        raise HTTPException(status_code=404, detail=f"Image not found: {course_folder}/{frame_filename}")
-    # Verify it's an image file
-    if not frame_filename.lower().endswith(('.png', '.jpg', '.jpeg')):
-        raise HTTPException(status_code=400, detail="File must be an image (PNG, JPG, JPEG)")
-    # If requester_id provided, verify frame lock
-    if requester_id:
-        if check_frame_lock(course_folder, frame_filename):
-            lock = frame_locks[course_folder][frame_filename]
-            if lock["locked_by"] != requester_id:
-                raise HTTPException(status_code=423, detail="Frame is locked by another requester")
-    # Return the image file
-    return FileResponse(image_path)
-@app.get("/images/{course_folder}")
-async def list_course_images(course_folder: str):
-    """
-    List all available images in a specific course folder
-    Args:
-        course_folder: The course folder name
-    """
-    folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, course_folder)
-    if not os.path.exists(folder_path):
-        raise HTTPException(status_code=404, detail=f"Course folder not found: {course_folder}")
-    # Get all image files
-    image_files = []
-    for file in os.listdir(folder_path):
-        if file.lower().endswith(('.png', '.jpg', '.jpeg')):
-            file_path = os.path.join(folder_path, file)
-            file_stats = os.stat(file_path)
-            image_files.append({
-                "filename": file,
-                "size_bytes": file_stats.st_size,
-                "modified_time": time.ctime(file_stats.st_mtime),
-                "url": f"/images/{course_folder}/{file}"
-            })
-    return {
-        "course_folder": course_folder,
-        "total_images": len(image_files),
-        "images": image_files
-    }
-@app.get("/courses")
-async def list_all_courses():
-    """
-    List all available course folders with their image counts
-    """
-    if not os.path.exists(FRAMES_OUTPUT_FOLDER):
-        return {"courses": [], "message": "Frames output folder does not exist yet"}
-    courses = []
-    for folder in os.listdir(FRAMES_OUTPUT_FOLDER):
-        folder_path = os.path.join(FRAMES_OUTPUT_FOLDER, folder)
-        if os.path.isdir(folder_path):
-            # Count image files
-            image_count = len([f for f in os.listdir(folder_path)
-                             if f.lower().endswith(('.png', '.jpg', '.jpeg'))])
-            courses.append({
-                "course_folder": folder,
-                "image_count": image_count,
-                "images_url": f"/images/{folder}",
-                "sample_image_url": f"/images/{folder}/0001.png" if image_count > 0 else None
-            })
-    return {
-        "total_courses": len(courses),
-        "courses": courses
-    }
-# Signal handlers to prevent accidental shutdown
-def handle_shutdown(signum, frame):
-    """Prevent shutdown on SIGTERM/SIGINT"""
-    print(f"\n⚠️ Received signal {signum}. Server will continue running.")
-    print("Use Ctrl+Break or kill -9 to force stop.")
-# Setup signal handlers for graceful shutdown prevention
-import signal
-signal.signal(signal.SIGINT, handle_shutdown)
-signal.signal(signal.SIGTERM, handle_shutdown)
-# Server lifecycle events
-@app.on_event("shutdown")
-async def shutdown_event():
-    """Save state on shutdown attempt"""
-    save_tracking_state()
-    print("💾 Saved tracking state")
-    print("⚠️ Server shutdown prevented - use Ctrl+Break or kill -9 to force stop")
-    # Prevent shutdown by not returning
-    while True:
-        await asyncio.sleep(1)
-if __name__ == "__main__":
-    # Start the FastAPI server
-    print("🚀 Starting Video Analysis FastAPI Server (Persistent Mode)...")
-    print("API Documentation will be available at: http://localhost:8000/docs")
-    print("API Root endpoint: http://localhost:8000/")
-    print("⚠️ Server will continue running even after processing completes")
-    print("Use Ctrl+Break or kill -9 to force stop")
-    # Ensure the analysis output folder exists
-    os.makedirs(FRAMES_OUTPUT_FOLDER, exist_ok=True)
-    # Start processing in thread instead of blocking
-    processing_thread = threading.Thread(target=main_processing_loop)
-    processing_thread.daemon = False  # Make non-daemon so it doesn't exit
-    processing_thread.start()
-    # Configure uvicorn for persistent running
-    config = uvicorn.Config(
-        app=app,
-        host="0.0.0.0",
-        port=8000,
-        log_level="info",
-        reload=False,
-        workers=1,
-        loop="asyncio",
-        timeout_keep_alive=600,  # Keep connections alive longer
-        access_log=True
-    )
-    # Run server with persistent config
-    server = uvicorn.Server(config)
-    server.run()

 import os
 import json
+import requests
+import subprocess
+import shutil
 import time
+import re
 import threading
+from typing import Dict, List, Set, Optional
+from huggingface_hub import HfApi, list_repo_files
+import cv2
+import numpy as np
 from pathlib import Path
+import smtplib
+from email.message import EmailMessage
# ==== CONFIGURATION ====
# Both values can be overridden through environment variables.
HF_TOKEN = os.getenv("HF_TOKEN", "")
SOURCE_REPO_ID = os.getenv("SOURCE_REPO", "Fred808/BG1")

# Path Configuration (relative to the current working directory).
DOWNLOAD_FOLDER = "downloads"
EXTRACT_FOLDER = "extracted"
FRAMES_OUTPUT_FOLDER = "extracted_frames"
# The working folders are created at import time so later code can list them.
os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
os.makedirs(EXTRACT_FOLDER, exist_ok=True)
os.makedirs(FRAMES_OUTPUT_FOLDER, exist_ok=True)

# State Files
DOWNLOAD_STATE_FILE = "download_progress.json"
PROCESS_STATE_FILE = "process_progress.json"
FAILED_FILES_LOG = "failed_files.log"

# Processing Parameters
# NOTE(review): CHUNK_SIZE, PROCESSING_DELAY and MAX_RETRIES are not referenced
# by the functions visible in this section of the file - confirm they are used.
CHUNK_SIZE = 1
PROCESSING_DELAY = 2
MAX_RETRIES = 3
MIN_FREE_SPACE_GB = 2  # Minimum free space in GB before processing

# Frame Extraction Parameters
DEFAULT_FPS = 3  # Default frames per second for extraction

# Initialize HF API
# An unset HF_TOKEN yields "", so pass None rather than an empty credential.
# NOTE(review): presumably None lets huggingface_hub use its own token
# resolution / anonymous access - confirm against the HfApi documentation.
hf_api = HfApi(token=HF_TOKEN or None)

# Global State: shared, mutable progress report updated by the pipeline.
processing_status = {
    "is_running": False,
    "current_file": None,
    "total_files": 0,
    "processed_files": 0,
    "failed_files": 0,
    "extracted_courses": 0,
    "extracted_videos": 0,
    "last_update": None,
    "logs": []
}
def log_message(message: str):
    """Print a timestamped log line and record it in processing_status.

    The line is appended to processing_status["logs"], the timestamp is stored
    in processing_status["last_update"], and only the newest 100 lines are kept.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{stamp}] {message}"
    print(line)

    history = processing_status["logs"]
    history.append(line)
    processing_status["last_update"] = stamp

    # Trim the in-memory history once it grows past 100 entries.
    if len(history) > 100:
        processing_status["logs"] = history[-100:]
def log_failed_file(filename: str, error: str):
    """Append one "<timestamp> - <filename>: <error>" line to FAILED_FILES_LOG.

    Args:
        filename: Name of the file that failed.
        error: Human-readable description of the failure.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    # Explicit UTF-8 so non-ASCII file names or error text cannot fail to
    # encode under a restrictive locale default.
    with open(FAILED_FILES_LOG, "a", encoding="utf-8") as f:
        f.write(f"{stamp} - {filename}: {error}\n")
def get_disk_usage(path: str) -> Dict[str, float]:
    """Return disk usage statistics, in GB, for the filesystem holding *path*.

    Uses shutil.disk_usage() instead of os.statvfs() so the function also works
    on platforms where os.statvfs is unavailable.

    Args:
        path: Any path on the filesystem to inspect.

    Returns:
        A dict with float values for the keys "total", "free" and "used".
        "used" is computed as total minus free, as before.
    """
    gib = 1024 ** 3
    usage = shutil.disk_usage(path)
    total = usage.total / gib
    free = usage.free / gib
    used = total - free
    return {"total": total, "free": free, "used": used}
def check_disk_space(path: str = ".") -> bool:
    """Report whether the filesystem holding *path* has enough free space.

    Returns:
        False (after logging a warning) when the free space reported by
        get_disk_usage() is below MIN_FREE_SPACE_GB, True otherwise.
    """
    usage = get_disk_usage(path)
    running_low = usage["free"] < MIN_FREE_SPACE_GB
    if running_low:
        log_message(f'⚠️ Low disk space: {usage["free"]:.2f}GB free, {usage["used"]:.2f}GB used')
    return not running_low
def cleanup_temp_files():
    """Delete stale .rar/.zip archives from DOWNLOAD_FOLDER to free space.

    The archive named by processing_status["current_file"] is kept. Failures
    are logged rather than raised, because this runs as best-effort cleanup
    (including from a ``finally`` block in the main loop).
    """
    log_message("🧹 Cleaning up temporary files...")
    # Clean old downloads (keep only current processing file)
    current_file = processing_status.get("current_file")
    try:
        candidates = os.listdir(DOWNLOAD_FOLDER)
    except OSError as e:
        log_message(f"⚠️ Could not list {DOWNLOAD_FOLDER}: {e}")
        return

    for file in candidates:
        if file == current_file or not file.endswith((".rar", ".zip")):
            continue
        try:
            os.remove(os.path.join(DOWNLOAD_FOLDER, file))
        except OSError as e:
            # Previously swallowed silently; keep going but make it visible.
            log_message(f"⚠️ Could not remove old download {file}: {e}")
        else:
            log_message(f"🗑️ Removed old download: {file}")
def load_json_state(file_path: str, default_value):
    """Load state from a JSON file, falling back to *default_value*.

    Args:
        file_path: Path of the JSON state file.
        default_value: Value returned when the file is missing or unreadable
            as JSON.

    Returns:
        The decoded JSON content, or *default_value* when the file does not
        exist or its content is corrupted (a warning is logged in that case).
    """
    # Open directly instead of checking os.path.exists() first, so a file
    # removed between the check and the open cannot raise.
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return default_value
    except ValueError:
        # Covers json.JSONDecodeError and UnicodeDecodeError (both ValueError).
        log_message(f"⚠️ Corrupted state file: {file_path}")
        return default_value
def save_json_state(file_path: str, data):
    """Save *data* as indented JSON to *file_path* without risking truncation.

    The JSON is written to a sibling temporary file and then moved over the
    target with os.replace(), so a serialization error or a crash mid-write
    leaves the previous state file intact.

    Args:
        file_path: Destination path of the state file.
        data: JSON-serializable object to store.

    Raises:
        TypeError: If *data* is not JSON-serializable (from json.dump).
        OSError: If the file cannot be written or replaced.
    """
    tmp_path = f"{file_path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        os.replace(tmp_path, file_path)
    finally:
        # Only still present when the write or the replace failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def download_with_retry(url: str, dest_path: str, max_retries: int = 3) -> bool:
    """Download *url* to *dest_path* with retries and disk-space checks.

    Args:
        url: Address of the file to download.
        dest_path: Local path the response body is streamed to.
        max_retries: Maximum number of attempts; failed attempts back off
            exponentially (1s, 2s, 4s, ...).

    Returns:
        True when the file was fully written, False when disk space is
        insufficient, the file is too large, or every attempt failed.
    """
    if not check_disk_space():
        cleanup_temp_files()
        if not check_disk_space():
            log_message("❌ Insufficient disk space even after cleanup")
            return False

    # Send the Authorization header only when a token is configured, instead
    # of an empty "Bearer " credential.
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
    # (connect, read) timeouts in seconds, so a stalled connection raises and
    # is retried instead of blocking the worker forever.
    request_timeout = (10, 120)

    for attempt in range(max_retries):
        try:
            with requests.get(url, headers=headers, stream=True, timeout=request_timeout) as r:
                r.raise_for_status()
                # Check content length if available
                content_length = r.headers.get("content-length")
                if content_length:
                    size_gb = int(content_length) / (1024**3)
                    disk_info = get_disk_usage(".")
                    if size_gb > disk_info["free"] - 0.5:  # Leave 0.5GB buffer
                        log_message(f'❌ File too large: {size_gb:.2f}GB, only {disk_info["free"]:.2f}GB free')
                        return False
                with open(dest_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
            return True
        except Exception as e:
            # Drop any partially written file so a truncated archive is never
            # mistaken for a complete download. Best effort: the file may not
            # exist if the request failed before it was opened.
            try:
                os.remove(dest_path)
            except OSError:
                pass
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
                continue
            log_message(f"❌ Download failed after {max_retries} attempts: {e}")
            return False
    return False
def is_multipart_rar(filename: str) -> bool:
    """Check whether *filename* is one volume of a multi-part RAR archive.

    A multi-part volume is recognised by a name ending in ".partN.rar" with
    one or more digits for N, compared case-insensitively. A plain substring
    test would also match names such as "my.party.rar".
    """
    return re.search(r"\.part\d+\.rar$", filename, re.IGNORECASE) is not None
def get_rar_part_base(filename: str) -> str:
    """Get the base name of a RAR archive, without volume and extension suffix.

    For a multi-part volume ("name.partNN.rar", any letter case) the trailing
    ".partNN.rar" is removed. For a plain archive only the final ".rar" is
    removed. Any other name is returned unchanged.
    """
    multipart = re.fullmatch(r"(?P<base>.+)\.part\d+\.rar", filename, re.IGNORECASE | re.DOTALL)
    if multipart:
        return multipart.group("base")
    if filename.lower().endswith(".rar"):
        # Strip just the extension; ".rar" elsewhere in the name is kept.
        return filename[: -len(".rar")]
    return filename
def extract_with_retry(rar_path: str, output_dir: str, max_retries: int = 2) -> bool:
    """Extract a RAR archive with the ``unrar`` CLI, retrying on failure.

    For a multi-part volume the first volume is looked up next to *rar_path*
    and extraction starts from it. Each attempt first tests the archive
    ("unrar t") and then extracts it ("unrar x -o+"); attempts after the
    first add the "-kb" switch.

    Args:
        rar_path: Path to the archive (or to any volume of a multi-part set).
        output_dir: Directory passed to unrar as the extraction destination.
        max_retries: Maximum number of test+extract attempts.

    Returns:
        True when extraction succeeded, False otherwise.
    """
    filename = os.path.basename(rar_path)

    # For multi-part RARs, we need the first part
    multipart = re.fullmatch(r"(?P<base>.+)\.part\d+\.rar", filename, re.IGNORECASE | re.DOTALL)
    if is_multipart_rar(filename) and multipart:
        base_name = multipart.group("base")
        parts_dir = os.path.dirname(rar_path)
        # Volume numbering width varies (part1 / part01 / part001), so search
        # the directory instead of assuming a two-digit "part01".
        first_part_re = re.compile(re.escape(base_name) + r"\.part0*1\.rar", re.IGNORECASE)
        try:
            siblings = sorted(os.listdir(parts_dir or "."))
        except OSError:
            siblings = []
        first_part = next((name for name in siblings if first_part_re.fullmatch(name)), None)
        if first_part is None:
            log_message(f"⚠️ Multi-part RAR detected but first part not found: {base_name}.part01.rar")
            return False
        rar_path = os.path.join(parts_dir, first_part)
        log_message(f"📦 Processing multi-part RAR starting with: {first_part}")

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            # Test RAR first
            test_result = subprocess.run(["unrar", "t", rar_path], capture_output=True, text=True)
            if test_result.returncode != 0:
                log_message(f"⚠️ RAR test failed: {test_result.stderr}")
                if is_last_attempt:
                    return False
                continue

            # Extract RAR
            cmd = ["unrar", "x", "-o+", rar_path, output_dir]
            if attempt > 0:  # Try recovery on subsequent attempts
                cmd.insert(2, "-kb")
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                log_message(f"✅ Successfully extracted: {os.path.basename(rar_path)}")
                return True

            error_msg = result.stderr or result.stdout
            log_message(f"⚠️ Extraction attempt {attempt + 1} failed: {error_msg}")
            lowered = error_msg.lower()
            if "checksum error" in lowered or "crc failed" in lowered:
                log_message(f"⚠️ Data corruption detected, attempt {attempt + 1}")
            elif result.returncode == 10:
                log_message("⚠️ No files to extract (exit code 10)")
                return False
            elif result.returncode == 1:
                log_message("⚠️ Non-fatal error (exit code 1)")
        except FileNotFoundError as e:
            # The unrar executable itself is missing; retrying cannot help.
            log_message(f"❌ Extraction exception: {str(e)}")
            return False
        except Exception as e:
            log_message(f"❌ Extraction exception: {str(e)}")
            if is_last_attempt:
                return False
            time.sleep(1)
    return False
+# --- Frame Extraction Utilities ---
def ensure_dir(path):
    """Create directory *path*, including missing parents, if it is absent."""
    os.makedirs(name=path, exist_ok=True)
def extract_frames(video_path, output_dir, fps=DEFAULT_FPS):
    """Extract frames from a video at roughly *fps* frames per second.

    Every N-th decoded frame is written to *output_dir* as a zero-padded,
    sequentially numbered PNG ("0001.png", "0002.png", ...), where N is the
    video's native frame rate divided by *fps*, rounded, and at least 1.

    Args:
        video_path: Path of the video file to read.
        output_dir: Directory that receives the PNG frames (created if needed).
        fps: Target number of saved frames per second of video.

    Returns:
        The number of frames written. 0 when *fps* is not positive or the
        video cannot be opened.
    """
    if not fps or fps <= 0:
        log_message(f"[ERROR] Invalid target fps: {fps}")
        return 0

    log_message(f"[INFO] Extracting frames from {video_path} to {output_dir} at {fps} fps...")
    ensure_dir(output_dir)
    cap = cv2.VideoCapture(str(video_path))
    saved_count = 0
    try:
        if not cap.isOpened():
            log_message(f"[ERROR] Failed to open video file: {video_path}")
            return 0

        video_fps = cap.get(cv2.CAP_PROP_FPS)
        if not video_fps or not video_fps > 0:
            video_fps = 30  # fallback if FPS is not available
            log_message(f"[WARN] Using fallback FPS: {video_fps}")

        # Clamp to 1: when fps is more than twice the native rate the rounded
        # ratio is 0, which would raise ZeroDivisionError in the modulo below.
        frame_interval = max(1, int(round(video_fps / fps)))

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        log_message(f"[DEBUG] Total frames in video: {total_frames}")

        frame_idx = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_idx % frame_interval == 0:
                frame_name = f"{saved_count + 1:04d}.png"
                # Count a frame only when imwrite reports it was written.
                if cv2.imwrite(str(Path(output_dir) / frame_name), frame):
                    saved_count += 1
                else:
                    log_message(f"[WARN] Failed to write frame {frame_idx} as {frame_name}")
            frame_idx += 1
    finally:
        # Release the capture even if decoding or writing raised.
        cap.release()

    log_message(f"Extracted {saved_count} frames from {video_path} to {output_dir}")
    return saved_count
def process_rar_file(rar_path: str) -> bool:
    """Extract one RAR archive and pull frames out of every video inside it.

    The archive is unpacked into EXTRACT_FOLDER/<course name>, any previous
    directory of that name being removed first. Each video found gets its own
    frame directory under FRAMES_OUTPUT_FOLDER. processing_status is updated
    along the way, and "current_file" is always cleared on exit.

    Returns:
        True on success. False when any step raised; the failure is logged
        and recorded through log_failed_file().
    """
    filename = os.path.basename(rar_path)
    processing_status["current_file"] = filename

    # Handle multi-part RAR naming
    course_name = (
        get_rar_part_base(filename)
        if is_multipart_rar(filename)
        else filename.replace(".rar", "")
    )
    extract_dir = os.path.join(EXTRACT_FOLDER, course_name)
    video_suffixes = (".mp4", ".avi", ".mov", ".mkv")

    try:
        log_message(f"🔄 Processing: {filename}")

        # Start from an empty extraction directory.
        if os.path.exists(extract_dir):
            shutil.rmtree(extract_dir, ignore_errors=True)
        os.makedirs(extract_dir, exist_ok=True)

        extracted_ok = extract_with_retry(rar_path, extract_dir)
        if not extracted_ok:
            raise Exception("RAR extraction failed")

        # Inventory everything that was unpacked, in os.walk order.
        unpacked = [
            (folder, name)
            for folder, _subdirs, names in os.walk(extract_dir)
            for name in names
        ]
        file_count = len(unpacked)
        video_files_found = [
            os.path.join(folder, name)
            for folder, name in unpacked
            if name.lower().endswith(video_suffixes)
        ]

        processing_status["extracted_courses"] += 1
        log_message(f"✅ Successfully extracted '{course_name}' ({file_count} files, {len(video_files_found)} videos)")

        # Process video files for frame extraction
        for video_path in video_files_found:
            video_filename = Path(video_path).name
            # Output directory name is derived from course and video file name.
            frames_dir_name = f"{course_name}_{video_filename.replace('.', '_')}_frames"
            frames_output_dir = os.path.join(FRAMES_OUTPUT_FOLDER, frames_dir_name)
            ensure_dir(frames_output_dir)

            frame_count = extract_frames(video_path, frames_output_dir, fps=DEFAULT_FPS)
            processing_status["extracted_videos"] += 1

            if frame_count != 0:
                log_message(f"✅ {frame_count} frames extracted from {video_filename}")
            else:
                log_message(f"⚠️ No frames extracted from {video_filename}")
        return True
    except Exception as e:
        error_msg = str(e)
        log_message(f"❌ Processing failed: {error_msg}")
        log_failed_file(filename, error_msg)
        return False
    finally:
        processing_status["current_file"] = None
def main_processing_loop(start_index: int = 0):
    """Run one step of the pipeline: download, extract and frame one archive.

    Lists the ``.rar`` files of SOURCE_REPO_ID, picks the one at the resume
    index, downloads it, processes it with process_rar_file(), deletes the
    download and stores the next index in DOWNLOAD_STATE_FILE. Only one
    archive is handled per call.

    Args:
        start_index: Index into the sorted archive list to process. Values
            <= 0 mean "use the index saved in DOWNLOAD_STATE_FILE".
    """
    processing_status["is_running"] = True
    try:
        # Load state
        process_state = load_json_state(PROCESS_STATE_FILE, {"processed_rars": []})
        download_state = load_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": 3})
        # .get() so a state file lacking the expected key falls back to the
        # default instead of aborting the run with KeyError.
        processed_rars = process_state.get("processed_rars", [])
        # NOTE(review): the default resume index is 3, not 0 - confirm intended.
        saved_index = download_state.get("next_download_index", 3)

        # Use start_index if provided, otherwise use the saved state
        next_index = start_index if start_index > 0 else saved_index
        log_message(f"📊 Starting from index {next_index}")
        log_message(f"📊 Previously processed: {len(processed_rars)} files")

        # Get file list
        try:
            files = list(hf_api.list_repo_files(repo_id=SOURCE_REPO_ID, repo_type="dataset"))
            rar_files = sorted([f for f in files if f.endswith(".rar")])
            processing_status["total_files"] = len(rar_files)
            log_message(f"📁 Found {len(rar_files)} RAR files in repository")
            if next_index >= len(rar_files):
                log_message("✅ All files have been processed!")
                return
        except Exception as e:
            log_message(f"❌ Failed to get file list: {str(e)}")
            return

        # Process only one file per run (next_index is in range at this point).
        rar_file = rar_files[next_index]
        filename = os.path.basename(rar_file)

        if filename in processed_rars:
            log_message(f"⏭️ Skipping already processed: {filename}")
            processing_status["processed_files"] += 1
            # Move to next file
            next_index += 1
            save_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": next_index})
            log_message(f"📊 Moving to next file. Progress: {next_index}/{len(rar_files)}")
            return

        log_message(f"📥 Downloading: {filename}")
        dest_path = os.path.join(DOWNLOAD_FOLDER, filename)

        # Download file
        download_url = f"https://huggingface.co/datasets/{SOURCE_REPO_ID}/resolve/main/{rar_file}"
        if download_with_retry(download_url, dest_path):
            # Process file
            if process_rar_file(dest_path):
                processed_rars.append(filename)
                save_json_state(PROCESS_STATE_FILE, {"processed_rars": processed_rars})
                log_message(f"✅ Successfully processed: {filename}")
                processing_status["processed_files"] += 1
            else:
                log_message(f"❌ Failed to process: {filename}")
                processing_status["failed_files"] += 1

            # Clean up downloaded file
            try:
                os.remove(dest_path)
                log_message(f"🗑️ Cleaned up download: {filename}")
            except OSError as e:
                # Best effort, but no longer silent.
                log_message(f"⚠️ Could not remove download {filename}: {e}")
        else:
            log_message(f"❌ Failed to download: {filename}")
            processing_status["failed_files"] += 1

        # Update download state for next run.
        # NOTE(review): the index advances even when download or processing
        # failed, so a failed archive is not retried - confirm intended.
        next_index += 1
        save_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": next_index})

        # Status update
        if next_index < len(rar_files):
            log_message(f"🔄 Run the script again to process the next file: {os.path.basename(rar_files[next_index])}")
        else:
            log_message("🎉 All files have been processed!")

        log_message("🎉 Processing complete!")
        log_message(f'📊 Final stats: {processing_status["extracted_courses"]} courses extracted, {processing_status["extracted_videos"]} videos processed, frames extracted')
    except KeyboardInterrupt:
        log_message("⏹️ Processing interrupted by user")
    except Exception as e:
        log_message(f"❌ Fatal error: {str(e)}")
    finally:
        processing_status["is_running"] = False
        cleanup_temp_files()
# Public names of this module, i.e. what ``from vision_analyzer import *``
# exposes to the API module that drives the pipeline.
# FRAMES_OUTPUT_FOLDER is listed because the API code imports it from here.
__all__ = [
    "main_processing_loop",
    "processing_status",
    "log_message",
    "extract_frames",
    "DEFAULT_FPS",
    "FRAMES_OUTPUT_FOLDER",
    "ensure_dir"
]