Spaces:

Fred808
/

VSSEE

Paused

App Files Files Community

Fred808 committed on Oct 19, 2025

Commit

9fea5e6

verified ·

1 Parent(s): 8d428f0

Upload 3 files

Browse files

Files changed (3) hide show

app.py +575 -558
frame_extractor.py +55 -0
processing_logic.py +398 -373

app.py CHANGED Viewed

@@ -1,559 +1,576 @@
-from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks, Query
-from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
-from fastapi.middleware.cors import CORSMiddleware
-import os
-import json
-import shutil
-import uuid
-import time
-import threading
-import logging
-import aiofiles
-from pathlib import Path
-from typing import Dict, List, Optional
-from dataclasses import dataclass, asdict
-from datetime import datetime
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.FileHandler('api.log'),
-        logging.StreamHandler()
-    ]
-)
-logger = logging.getLogger(__name__)
-# Import error logger
-from error_logger import ErrorLogger
-error_logger = ErrorLogger()
-# Import processing functions and variables
-from processing_logic import (
-    processing_status,
-    uploaded_mp4s,
-    log_message,
-    process_hf_files_background,
-    UPLOAD_DIRECTORY,
-    MP4_OUTPUT_FOLDER,
-    hf_api,
-    DEFAULT_RAR_LIMIT
-)
-# Middleware Configuration
-LOCK_TIMEOUT = 300  # 5 minutes in seconds
-STATE_FILE = "middleware_state.json"
-CHUNK_SIZE = 8192  # 8KB chunks for streaming
-app = FastAPI(title="Unified MP4 Processing & Distribution API")
-# Configure CORS
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-# Define folders
-MP4_UPLOAD_FOLDER = os.path.join(UPLOAD_DIRECTORY, "uploads")
-os.makedirs(MP4_UPLOAD_FOLDER, exist_ok=True)
-os.makedirs(MP4_OUTPUT_FOLDER, exist_ok=True)
-processing_thread = None
-# ==== MIDDLEWARE STATE MANAGEMENT ====
-@dataclass
-class FileState:
-    path: str
-    locked: bool
-    lock_holder: Optional[str] = None
-    lock_time: Optional[float] = None
-    download_count: int = 0
-    last_access: Optional[float] = None
-class MiddlewareState:
-    def __init__(self):
-        self.files: Dict[str, FileState] = {}
-        self.load_state()
-    def load_state(self):
-        """Load state from disk"""
-        if os.path.exists(STATE_FILE):
-            try:
-                with open(STATE_FILE, 'r') as f:
-                    data = json.load(f)
-                    self.files = {
-                        k: FileState(**v) for k, v in data.items()
-                    }
-            except Exception as e:
-                logger.error(f"Error loading state: {e}")
-                self.files = {}
-    def save_state(self):
-        """Save state to disk"""
-        try:
-            with open(STATE_FILE, 'w') as f:
-                json.dump({
-                    k: asdict(v) for k, v in self.files.items()
-                }, f, indent=2)
-        except Exception as e:
-            logger.error(f"Error saving state: {e}")
-    def clean_expired_locks(self):
-        """Remove expired locks"""
-        now = time.time()
-        for file_id, state in self.files.items():
-            if state.locked and (now - state.lock_time) > LOCK_TIMEOUT:
-                state.locked = False
-                state.lock_holder = None
-                state.lock_time = None
-        self.save_state()
-    def get_next_available_file(self, requester_id: str) -> Optional[str]:
-        """Get next unlocked file"""
-        self.clean_expired_locks()
-        # First try to find any file this requester has locked
-        for file_id, state in self.files.items():
-            if state.locked and state.lock_holder == requester_id:
-                return file_id
-        # Then look for any unlocked file
-        for file_id, state in self.files.items():
-            if not state.locked:
-                state.locked = True
-                state.lock_holder = requester_id
-                state.lock_time = time.time()
-                self.save_state()
-                return file_id
-        return None
-    def release_lock(self, file_id: str, requester_id: str) -> bool:
-        """Release a file lock"""
-        if file_id in self.files:
-            state = self.files[file_id]
-            if state.lock_holder == requester_id:
-                state.locked = False
-                state.lock_holder = None
-                state.lock_time = None
-                state.last_access = time.time()
-                state.download_count += 1
-                self.save_state()
-                return True
-        return False
-# Global state
-middleware_state = MiddlewareState()
-# ==== HELPER FUNCTIONS ====
-def save_file(uploaded_file: UploadFile, save_path: str):
-    os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, "wb") as f:
-        shutil.copyfileobj(uploaded_file.file, f)
-def log_request(endpoint: str, params: dict = None):
-    """Log API requests for debugging"""
-    logger.info(f"API Request: {endpoint} - Params: {params}")
-# === ERROR HANDLING ===
-@app.exception_handler(HTTPException)
-async def http_exception_handler(request, exc):
-    """Handle HTTP exceptions with detailed logging"""
-    error_id = error_logger.log_error(
-        exc,
-        request.url.path,
-        request_info={
-            "method": request.method,
-            "url": str(request.url),
-            "headers": dict(request.headers),
-            "query_params": dict(request.query_params),
-        },
-        context={
-            "status_code": exc.status_code,
-            "detail": exc.detail
-        }
-    )
-    return JSONResponse(
-        status_code=exc.status_code,
-        content={
-            "error": exc.detail,
-            "error_id": error_id,
-            "type": "http_error",
-            "status_code": exc.status_code
-        }
-    )
-@app.exception_handler(Exception)
-async def general_exception_handler(request, exc):
-    """Handle all other exceptions with detailed logging"""
-    error_id = error_logger.log_error(
-        exc,
-        request.url.path,
-        request_info={
-            "method": request.method,
-            "url": str(request.url),
-            "headers": dict(request.headers),
-            "query_params": dict(request.query_params),
-        }
-    )
-    return JSONResponse(
-        status_code=500,
-        content={
-            "error": "Internal server error",
-            "error_id": error_id,
-            "type": "server_error",
-            "detail": str(exc) if app.debug else "An unexpected error occurred"
-        }
-    )
-# === ROUTES ===
-@app.get("/")
-async def root():
-    """API root endpoint"""
-    return {
-        "message": "Unified MP4 Processing & Distribution API",
-        "version": "1.0.0",
-        "status": "running",
-        "endpoints": {
-            "processing": {
-                "courses": "GET /courses - List all course folders",
-                "images": "GET /images/{course_folder:path} - List MP4s in course",
-                "download": "GET /download?course={course}&file={file} - Download MP4 file",
-                "debug": "GET /debug/structure - Debug file structure"
-            },
-            "middleware": {
-                "status": "GET /middleware/status - Get middleware status",
-                "register": "POST /middleware/register - Register a new file",
-                "next": "GET /middleware/next - Get next available file",
-                "release": "POST /middleware/release/{file_id} - Release a file lock",
-                "stream": "GET /middleware/stream/{file_id} - Stream a file"
-            }
-        }
-    }
-# ==== PROCESSING ENDPOINTS ====
-@app.get("/courses")
-async def get_courses():
-    """List all top-level course folders."""
-    try:
-        courses = [d.name for d in Path(MP4_OUTPUT_FOLDER).iterdir() if d.is_dir()]
-        return {"courses": courses, "total": len(courses)}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to list courses: {e}")
-@app.get("/images/{course_folder:path}")
-async def get_mp4_list(course_folder: str):
-    """List all MP4 files within a specific course folder."""
-    course_path = Path(MP4_OUTPUT_FOLDER) / course_folder
-    if not course_path.is_dir():
-        raise HTTPException(status_code=404, detail="Course folder not found")
-    try:
-        mp4_files = [f.name for f in course_path.iterdir() if f.is_file() and f.suffix.lower() == ".mp4"]
-        return mp4_files
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to list MP4s: {e}")
-@app.get("/download")
-async def download_mp4(course: str, file: str):
-    """Download a specific MP4 file from a course folder."""
-    file_path = Path(MP4_OUTPUT_FOLDER) / course / file
-    if not file_path.is_file():
-        raise HTTPException(status_code=404, detail="File not found")
-    return FileResponse(path=file_path, media_type="video/mp4", filename=file)
-@app.get("/debug/structure")
-async def debug_structure():
-    """Debug endpoint to inspect the file structure and sizes."""
-    mp4_output_folder_path = Path(MP4_OUTPUT_FOLDER)
-    structure = {}
-    total_size_bytes = 0
-    total_mp4_files = 0
-    if not mp4_output_folder_path.exists():
-        return JSONResponse(content={
-            "mp4_output_folder": str(mp4_output_folder_path),
-            "folder_exists": False,
-            "total_mp4_files": 0,
-            "total_size_bytes": 0,
-            "structure": {}
-        })
-    for root, dirs, files in os.walk(mp4_output_folder_path):
-        current_path = Path(root)
-        relative_path = str(current_path.relative_to(mp4_output_folder_path))
-        if relative_path == ".":
-            relative_path = "/"
-        structure[relative_path] = {
-            "directories": [d for d in dirs],
-            "mp4_files": [],
-            "other_files": []
-        }
-        for file in files:
-            file_full_path = current_path / file
-            file_size = file_full_path.stat().st_size
-            total_size_bytes += file_size
-            if file.lower().endswith(".mp4"):
-                structure[relative_path]["mp4_files"].append({"name": file, "size": file_size})
-                total_mp4_files += 1
-            else:
-                structure[relative_path]["other_files"].append({"name": file, "size": file_size})
-    return {
-        "mp4_output_folder": str(mp4_output_folder_path),
-        "folder_exists": mp4_output_folder_path.exists(),
-        "total_mp4_files": total_mp4_files,
-        "total_size_bytes": total_size_bytes,
-        "structure": structure
-    }
-# ==== ERROR MONITORING ENDPOINTS ====
-@app.get("/errors/recent")
-async def get_recent_errors(limit: int = Query(10, ge=1, le=100)):
-    """Get most recent errors"""
-    return error_logger.get_recent_errors(limit)
-@app.get("/errors/{error_id}")
-async def get_error_details(error_id: str):
-    """Get detailed information about a specific error"""
-    error = error_logger.get_error(error_id)
-    if not error:
-        raise HTTPException(status_code=404, detail="Error ID not found")
-    return error
-@app.get("/errors/summary")
-async def get_error_summary():
-    """Get summary of errors by type"""
-    return error_logger.get_error_summary()
-# ==== MIDDLEWARE ENDPOINTS ====
-@app.get("/middleware/status")
-async def get_middleware_status():
-    """Get middleware status"""
-    courses = sum(1 for f in middleware_state.files.keys() if f.startswith("course:"))
-    images = sum(1 for f in middleware_state.files.keys() if f.startswith("image:"))
-    return {
-        "active_locks": sum(1 for f in middleware_state.files.values() if f.locked),
-        "total_files": len(middleware_state.files),
-        "total_courses": courses,
-        "total_images": images,
-        "downloads_completed": sum(f.download_count for f in middleware_state.files.values())
-    }
-@app.get("/middleware/status/course/{course_id}")
-async def get_course_status(course_id: str):
-    """Get status of a specific course"""
-    file_id = f"course:{course_id}"
-    if file_id not in middleware_state.files:
-        raise HTTPException(status_code=404, detail="Course not found")
-    state = middleware_state.files[file_id]
-    return {
-        "course_id": course_id,
-        "locked": state.locked,
-        "lock_holder": state.lock_holder,
-        "lock_time": state.lock_time,
-        "download_count": state.download_count,
-        "last_access": state.last_access
-    }
-@app.get("/middleware/status/image/{course_folder}/{file_id}")
-async def get_image_status(course_folder: str, file_id: str):
-    """Get status of a specific image"""
-    full_id = f"image:{course_folder}/{file_id}"
-    if full_id not in middleware_state.files:
-        raise HTTPException(status_code=404, detail="Image not found")
-    state = middleware_state.files[full_id]
-    return {
-        "file_id": file_id,
-        "course": course_folder,
-        "locked": state.locked,
-        "lock_holder": state.lock_holder,
-        "lock_time": state.lock_time,
-        "download_count": state.download_count,
-        "last_access": state.last_access
-    }
-@app.post("/middleware/register")
-async def register_file(file_path: str):
-    """Register a new file in the system"""
-    if not os.path.exists(file_path):
-        raise HTTPException(status_code=404, detail="File not found")
-    file_id = os.path.basename(file_path)
-    if file_id not in middleware_state.files:
-        middleware_state.files[file_id] = FileState(
-            path=file_path,
-            locked=False
-        )
-        middleware_state.save_state()
-    return {"file_id": file_id}
-@app.get("/middleware/next/course")
-async def get_next_course(requester_id: str):
-    """Get next available course folder"""
-    try:
-        courses = [d.name for d in Path(MP4_OUTPUT_FOLDER).iterdir() if d.is_dir()]
-        for course in courses:
-            course_id = f"course:{course}"
-            if course_id not in middleware_state.files:
-                middleware_state.files[course_id] = FileState(
-                    path=str(Path(MP4_OUTPUT_FOLDER) / course),
-                    locked=False
-                )
-                middleware_state.save_state()
-            if not middleware_state.files[course_id].locked:
-                middleware_state.files[course_id].locked = True
-                middleware_state.files[course_id].lock_holder = requester_id
-                middleware_state.files[course_id].lock_time = time.time()
-                middleware_state.save_state()
-                return {
-                    "course_id": course,
-                    "path": str(Path(MP4_OUTPUT_FOLDER) / course),
-                    "lock_time": middleware_state.files[course_id].lock_time
-                }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to get next course: {e}")
-    raise HTTPException(status_code=404, detail="No courses available")
-@app.get("/middleware/next/image/{course_folder}")
-async def get_next_image(course_folder: str, requester_id: str):
-    """Get next available image from a specific course"""
-    course_path = Path(MP4_OUTPUT_FOLDER) / course_folder
-    if not course_path.is_dir():
-        raise HTTPException(status_code=404, detail="Course folder not found")
-    try:
-        mp4_files = [f for f in course_path.iterdir() if f.is_file() and f.suffix.lower() == ".mp4"]
-        for mp4_file in mp4_files:
-            file_id = f"image:{course_folder}/{mp4_file.name}"
-            if file_id not in middleware_state.files:
-                middleware_state.files[file_id] = FileState(
-                    path=str(mp4_file),
-                    locked=False
-                )
-                middleware_state.save_state()
-            if not middleware_state.files[file_id].locked:
-                middleware_state.files[file_id].locked = True
-                middleware_state.files[file_id].lock_holder = requester_id
-                middleware_state.files[file_id].lock_time = time.time()
-                middleware_state.save_state()
-                return {
-                    "file_id": mp4_file.name,
-                    "course": course_folder,
-                    "path": str(mp4_file),
-                    "lock_time": middleware_state.files[file_id].lock_time
-                }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to get next image: {e}")
-    raise HTTPException(status_code=404, detail="No images available in this course")
-@app.get("/middleware/next/any")
-async def get_next_any_file(requester_id: str):
-    """Get next available file of any type"""
-    file_id = middleware_state.get_next_available_file(requester_id)
-    if not file_id:
-        raise HTTPException(status_code=404, detail="No files available")
-    file_state = middleware_state.files[file_id]
-    return {
-        "file_id": file_id,
-        "file_path": file_state.path,
-        "lock_time": file_state.lock_time,
-        "type": "course" if file_id.startswith("course:") else "image"
-    }
-@app.post("/middleware/release/course/{course_id}")
-async def release_course(course_id: str, requester_id: str):
-    """Release a course lock"""
-    file_id = f"course:{course_id}"
-    if not middleware_state.release_lock(file_id, requester_id):
-        raise HTTPException(status_code=403, detail="Not lock holder")
-    return {"status": "ok"}
-@app.post("/middleware/release/image/{course_folder}/{file_id}")
-async def release_image(course_folder: str, file_id: str, requester_id: str):
-    """Release an image lock"""
-    full_id = f"image:{course_folder}/{file_id}"
-    if not middleware_state.release_lock(full_id, requester_id):
-        raise HTTPException(status_code=403, detail="Not lock holder")
-    return {"status": "ok"}
-@app.post("/middleware/release/{file_id}")
-async def release_file(file_id: str, requester_id: str):
-    """Release any file lock (backward compatibility)"""
-    if not middleware_state.release_lock(file_id, requester_id):
-        raise HTTPException(status_code=403, detail="Not lock holder")
-    return {"status": "ok"}
-@app.get("/middleware/stream/{file_id}")
-async def stream_file(file_id: str, requester_id: str):
-    """Stream a file to client"""
-    if file_id not in middleware_state.files:
-        raise HTTPException(status_code=404, detail="File not found")
-    file_state = middleware_state.files[file_id]
-    if not file_state.locked or file_state.lock_holder != requester_id:
-        raise HTTPException(status_code=403, detail="Not lock holder")
-    if not os.path.exists(file_state.path):
-        raise HTTPException(status_code=404, detail="File not found on disk")
-    async def file_stream():
-        async with aiofiles.open(file_state.path, 'rb') as f:
-            while chunk := await f.read(CHUNK_SIZE):
-                yield chunk
-        # Auto-release lock after successful transfer
-        middleware_state.release_lock(file_id, requester_id)
-    return StreamingResponse(
-        file_stream(),
-        media_type="application/octet-stream",
-        headers={
-            "Content-Disposition": f"attachment; filename={file_id}"
-        }
-    )
-# ==== STARTUP EVENT ====
-@app.on_event("startup")
-async def startup_event():
-    """Run the processing loop in the background when the API starts"""
-    global processing_thread
-    logger.info("Starting up Unified MP4 Processing & Distribution API...")
-    if not (processing_thread and processing_thread.is_alive()):
-        logger.info("🚀 Starting background processing thread...")
-        processing_thread = threading.Thread(target=process_hf_files_background)
-        processing_thread.daemon = True
-        processing_thread.start()
-if __name__ == "__main__":
-    import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=8000)

+from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks, Query
+from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
+from fastapi.middleware.cors import CORSMiddleware
+import os
+import json
+import shutil
+import uuid
+import time
+import threading
+import logging
+import aiofiles
+from pathlib import Path
+from typing import Dict, List, Optional
+from dataclasses import dataclass, asdict
+from datetime import datetime
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('api.log'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+# Import error logger
+from error_logger import ErrorLogger
+error_logger = ErrorLogger()
+# Import processing functions and variables
+from processing_logic import (
+    processing_status,
+    uploaded_mp4s,
+    log_message,
+    process_hf_files_background,
+    UPLOAD_DIRECTORY,
+    MP4_OUTPUT_FOLDER,
+    hf_api,
+    DEFAULT_RAR_LIMIT
+)
+# Middleware Configuration
+LOCK_TIMEOUT = 300  # 5 minutes in seconds
+STATE_FILE = "middleware_state.json"
+CHUNK_SIZE = 8192  # 8KB chunks for streaming
+app = FastAPI(title="Unified MP4 Processing & Distribution API")
+# Configure CORS
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Define folders
+MP4_UPLOAD_FOLDER = os.path.join(UPLOAD_DIRECTORY, "uploads")
+os.makedirs(MP4_UPLOAD_FOLDER, exist_ok=True)
+os.makedirs(MP4_OUTPUT_FOLDER, exist_ok=True)
+processing_thread = None
+# ==== MIDDLEWARE STATE MANAGEMENT ====
+@dataclass
+class FileState:
+    path: str
+    locked: bool
+    lock_holder: Optional[str] = None
+    lock_time: Optional[float] = None
+    download_count: int = 0
+    last_access: Optional[float] = None
+class MiddlewareState:
+    def __init__(self):
+        self.files: Dict[str, FileState] = {}
+        self.load_state()
+    def load_state(self):
+        """Load state from disk"""
+        if os.path.exists(STATE_FILE):
+            try:
+                with open(STATE_FILE, 'r') as f:
+                    data = json.load(f)
+                    self.files = {
+                        k: FileState(**v) for k, v in data.items()
+                    }
+            except Exception as e:
+                logger.error(f"Error loading state: {e}")
+                self.files = {}
+    def save_state(self):
+        """Save state to disk"""
+        try:
+            with open(STATE_FILE, 'w') as f:
+                json.dump({
+                    k: asdict(v) for k, v in self.files.items()
+                }, f, indent=2)
+        except Exception as e:
+            logger.error(f"Error saving state: {e}")
+    def clean_expired_locks(self):
+        """Remove expired locks"""
+        now = time.time()
+        for file_id, state in self.files.items():
+            if state.locked and (now - state.lock_time) > LOCK_TIMEOUT:
+                state.locked = False
+                state.lock_holder = None
+                state.lock_time = None
+        self.save_state()
+    def get_next_available_file(self, requester_id: str) -> Optional[str]:
+        """Get next unlocked file"""
+        self.clean_expired_locks()
+        # First try to find any file this requester has locked
+        for file_id, state in self.files.items():
+            if state.locked and state.lock_holder == requester_id:
+                return file_id
+        # Then look for any unlocked file
+        for file_id, state in self.files.items():
+            if not state.locked:
+                state.locked = True
+                state.lock_holder = requester_id
+                state.lock_time = time.time()
+                self.save_state()
+                return file_id
+        return None
+    def release_lock(self, file_id: str, requester_id: str) -> bool:
+        """Release a file lock"""
+        if file_id in self.files:
+            state = self.files[file_id]
+            if state.lock_holder == requester_id:
+                state.locked = False
+                state.lock_holder = None
+                state.lock_time = None
+                state.last_access = time.time()
+                state.download_count += 1
+                self.save_state()
+                return True
+        return False
+# Global state
+middleware_state = MiddlewareState()
+# ==== HELPER FUNCTIONS ====
+def save_file(uploaded_file: UploadFile, save_path: str):
+    os.makedirs(os.path.dirname(save_path), exist_ok=True)
+    with open(save_path, "wb") as f:
+        shutil.copyfileobj(uploaded_file.file, f)
+def log_request(endpoint: str, params: dict = None):
+    """Log API requests for debugging"""
+    logger.info(f"API Request: {endpoint} - Params: {params}")
+# === ERROR HANDLING ===
+@app.exception_handler(HTTPException)
+async def http_exception_handler(request, exc):
+    """Handle HTTP exceptions with detailed logging"""
+    error_id = error_logger.log_error(
+        exc,
+        request.url.path,
+        request_info={
+            "method": request.method,
+            "url": str(request.url),
+            "headers": dict(request.headers),
+            "query_params": dict(request.query_params),
+        },
+        context={
+            "status_code": exc.status_code,
+            "detail": exc.detail
+        }
+    )
+    return JSONResponse(
+        status_code=exc.status_code,
+        content={
+            "error": exc.detail,
+            "error_id": error_id,
+            "type": "http_error",
+            "status_code": exc.status_code
+        }
+    )
+@app.exception_handler(Exception)
+async def general_exception_handler(request, exc):
+    """Handle all other exceptions with detailed logging"""
+    error_id = error_logger.log_error(
+        exc,
+        request.url.path,
+        request_info={
+            "method": request.method,
+            "url": str(request.url),
+            "headers": dict(request.headers),
+            "query_params": dict(request.query_params),
+        }
+    )
+    return JSONResponse(
+        status_code=500,
+        content={
+            "error": "Internal server error",
+            "error_id": error_id,
+            "type": "server_error",
+            "detail": str(exc) if app.debug else "An unexpected error occurred"
+        }
+    )
+# === ROUTES ===
+@app.get("/")
+async def root():
+    """API root endpoint"""
+    return {
+        "message": "Unified MP4 Processing & Distribution API",
+        "version": "1.0.0",
+        "status": "running",
+        "endpoints": {
+            "processing": {
+                "courses": "GET /courses - List all course folders",
+                "images": "GET /images/{course_folder:path} - List MP4s in course",
+                "download": "GET /download?course={course}&file={file} - Download MP4 file",
+                "debug": "GET /debug/structure - Debug file structure"
+            },
+            "middleware": {
+                "status": "GET /middleware/status - Get middleware status",
+                "register": "POST /middleware/register - Register a new file",
+                "next": "GET /middleware/next - Get next available file",
+                "release": "POST /middleware/release/{file_id} - Release a file lock",
+                "stream": "GET /middleware/stream/{file_id} - Stream a file"
+            }
+        }
+    }
+# ==== PROCESSING ENDPOINTS ====
+@app.get("/courses")
+async def get_courses():
+    """List all top-level course folders."""
+    try:
+        courses = [d.name for d in Path(MP4_OUTPUT_FOLDER).iterdir() if d.is_dir()]
+        return {"courses": courses, "total": len(courses)}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to list courses: {e}")
+@app.get("/images/{course_folder:path}")
+async def get_mp4_list(course_folder: str):
+    """List all MP4 files within a specific course folder."""
+    course_path = Path(MP4_OUTPUT_FOLDER) / course_folder
+    if not course_path.is_dir():
+        raise HTTPException(status_code=404, detail="Course folder not found")
+    try:
+        mp4_files = [f.name for f in course_path.iterdir() if f.is_file() and f.suffix.lower() == ".mp4"]
+        return mp4_files
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to list MP4s: {e}")
+@app.get("/download")
+async def download_mp4(course: str, file: str):
+    """Download a specific MP4 file from a course folder."""
+    file_path = Path(MP4_OUTPUT_FOLDER) / course / file
+    if not file_path.is_file():
+        raise HTTPException(status_code=404, detail="File not found")
+    return FileResponse(path=file_path, media_type="video/mp4", filename=file)
+@app.get("/debug/structure")
+async def debug_structure():
+    """Debug endpoint to inspect the file structure and sizes."""
+    mp4_output_folder_path = Path(MP4_OUTPUT_FOLDER)
+    structure = {}
+    total_size_bytes = 0
+    total_mp4_files = 0
+    if not mp4_output_folder_path.exists():
+        return JSONResponse(content={
+            "mp4_output_folder": str(mp4_output_folder_path),
+            "folder_exists": False,
+            "total_mp4_files": 0,
+            "total_size_bytes": 0,
+            "structure": {}
+        })
+    for root, dirs, files in os.walk(mp4_output_folder_path):
+        current_path = Path(root)
+        relative_path = str(current_path.relative_to(mp4_output_folder_path))
+        if relative_path == ".":
+            relative_path = "/"
+        structure[relative_path] = {
+            "directories": [d for d in dirs],
+            "mp4_files": [],
+            "other_files": []
+        }
+        for file in files:
+            file_full_path = current_path / file
+            file_size = file_full_path.stat().st_size
+            total_size_bytes += file_size
+            if file.lower().endswith(".mp4"):
+                structure[relative_path]["mp4_files"].append({"name": file, "size": file_size})
+                total_mp4_files += 1
+            else:
+                structure[relative_path]["other_files"].append({"name": file, "size": file_size})
+    return {
+        "mp4_output_folder": str(mp4_output_folder_path),
+        "folder_exists": mp4_output_folder_path.exists(),
+        "total_mp4_files": total_mp4_files,
+        "total_size_bytes": total_size_bytes,
+        "structure": structure
+    }
+# ==== ERROR MONITORING ENDPOINTS ====
+@app.get("/errors/recent")
+async def get_recent_errors(limit: int = Query(10, ge=1, le=100)):
+    """Get most recent errors"""
+    return error_logger.get_recent_errors(limit)
+@app.get("/errors/{error_id}")
+async def get_error_details(error_id: str):
+    """Get detailed information about a specific error"""
+    error = error_logger.get_error(error_id)
+    if not error:
+        raise HTTPException(status_code=404, detail="Error ID not found")
+    return error
+@app.get("/errors/summary")
+async def get_error_summary():
+    """Get summary of errors by type"""
+    return error_logger.get_error_summary()
+# ==== MIDDLEWARE ENDPOINTS ====
+@app.get("/middleware/status")
+async def get_middleware_status():
+    """Summarise the middleware registry: lock count, entry counts and total downloads."""
+    tracked = middleware_state.files
+    states = list(tracked.values())
+    # Entries are classified purely by the prefix of their registry key.
+    # NOTE(review): get_next_image registers frames under "frame:", which the
+    # "image:" count below does not include - confirm whether that is intended.
+    course_total = len([key for key in tracked if key.startswith("course:")])
+    image_total = len([key for key in tracked if key.startswith("image:")])
+    return {
+        "active_locks": len([s for s in states if s.locked]),
+        "total_files": len(tracked),
+        "total_courses": course_total,
+        "total_images": image_total,
+        "downloads_completed": sum(s.download_count for s in states)
+    }
+@app.get("/middleware/status/course/{course_id}")
+async def get_course_status(course_id: str):
+    """Report lock and download bookkeeping for one course.
+
+    The course is looked up under the registry key "course:<course_id>".
+    Raises HTTPException 404 when that key is not registered.
+    """
+    registry_key = f"course:{course_id}"
+    if registry_key in middleware_state.files:
+        entry = middleware_state.files[registry_key]
+        return {
+            "course_id": course_id,
+            "locked": entry.locked,
+            "lock_holder": entry.lock_holder,
+            "lock_time": entry.lock_time,
+            "download_count": entry.download_count,
+            "last_access": entry.last_access
+        }
+    raise HTTPException(status_code=404, detail="Course not found")
+@app.get("/middleware/status/image/{course_folder}/{file_id}")
+async def get_image_status(course_folder: str, file_id: str):
+    """Report lock and download bookkeeping for one image.
+
+    The image is looked up under the registry key
+    "image:<course_folder>/<file_id>". Raises HTTPException 404 when that key
+    is not registered.
+    """
+    registry_key = f"image:{course_folder}/{file_id}"
+    if registry_key in middleware_state.files:
+        entry = middleware_state.files[registry_key]
+        return {
+            "file_id": file_id,
+            "course": course_folder,
+            "locked": entry.locked,
+            "lock_holder": entry.lock_holder,
+            "lock_time": entry.lock_time,
+            "download_count": entry.download_count,
+            "last_access": entry.last_access
+        }
+    raise HTTPException(status_code=404, detail="Image not found")
+@app.post("/middleware/register")
+async def register_file(file_path: str):
+    """Track an existing path in the middleware registry, keyed by its basename.
+
+    Returns {"file_id": <basename>}. A basename that is already registered is
+    left untouched. Raises HTTPException 404 when `file_path` does not exist.
+    """
+    # NOTE(review): file_path comes straight from the request and any existing
+    # path is accepted; two different paths with the same basename also share
+    # one registry entry. Confirm whether paths should be confined to known
+    # directories.
+    if not os.path.exists(file_path):
+        raise HTTPException(status_code=404, detail="File not found")
+    file_id = os.path.basename(file_path)
+    already_known = file_id in middleware_state.files
+    if not already_known:
+        middleware_state.files[file_id] = FileState(path=file_path, locked=False)
+        middleware_state.save_state()
+    return {"file_id": file_id}
+@app.get("/middleware/next/course")
+async def get_next_course(requester_id: str):
+    """Lock and return the first unlocked course directory under MP4_OUTPUT_FOLDER.
+
+    Each sub-directory is tracked under the registry key "course:<name>".
+    Raises HTTPException 404 when no unlocked course is found and 500 when
+    scanning the folder or persisting state fails.
+    """
+    try:
+        output_root = Path(MP4_OUTPUT_FOLDER)
+        course_names = [entry.name for entry in output_root.iterdir() if entry.is_dir()]
+        for course in course_names:
+            registry_key = f"course:{course}"
+            # First sighting of this directory: track it before trying to lock it.
+            if registry_key not in middleware_state.files:
+                middleware_state.files[registry_key] = FileState(
+                    path=str(Path(MP4_OUTPUT_FOLDER) / course),
+                    locked=False
+                )
+                middleware_state.save_state()
+            if middleware_state.files[registry_key].locked:
+                continue
+            # Claim the course for this requester and persist the lock.
+            middleware_state.files[registry_key].locked = True
+            middleware_state.files[registry_key].lock_holder = requester_id
+            middleware_state.files[registry_key].lock_time = time.time()
+            middleware_state.save_state()
+            return {
+                "course_id": course,
+                "path": str(Path(MP4_OUTPUT_FOLDER) / course),
+                "lock_time": middleware_state.files[registry_key].lock_time
+            }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to get next course: {e}")
+    raise HTTPException(status_code=404, detail="No courses available")
+@app.get("/middleware/next/image/{course_folder}")
+async def get_next_image(course_folder: str, requester_id: str):
+    """Lock and return the first unlocked JPEG frame extracted for a course.
+
+    Frames are searched in MP4_OUTPUT_FOLDER/<course_folder>_frames/<video>/ and
+    tracked under the registry key "frame:<course>/<video>/<frame name>".
+    Raises HTTPException 404 when the frames directory is missing or no frame
+    is free, and 500 when scanning or persisting state fails.
+    """
+    frames_path = Path(MP4_OUTPUT_FOLDER) / f"{course_folder}_frames"
+    if not frames_path.is_dir():
+        raise HTTPException(status_code=404, detail="Course frames directory not found")
+    try:
+        # Gather (video directory name, frame path) pairs from every video sub-directory.
+        candidates = []
+        for video_dir in frames_path.iterdir():
+            if not video_dir.is_dir():
+                continue
+            for entry in video_dir.iterdir():
+                if entry.suffix.lower() in ('.jpg', '.jpeg'):
+                    candidates.append((video_dir.name, entry))
+        for video_name, frame_file in candidates:
+            registry_key = f"frame:{course_folder}/{video_name}/{frame_file.name}"
+            # Unknown frame: register it before checking availability.
+            if registry_key not in middleware_state.files:
+                middleware_state.files[registry_key] = FileState(
+                    path=str(frame_file),
+                    locked=False
+                )
+                middleware_state.save_state()
+            if middleware_state.files[registry_key].locked:
+                continue
+            # Claim the frame for this requester and persist the lock.
+            middleware_state.files[registry_key].locked = True
+            middleware_state.files[registry_key].lock_holder = requester_id
+            middleware_state.files[registry_key].lock_time = time.time()
+            middleware_state.save_state()
+            return {
+                "file_id": registry_key,
+                "course": course_folder,
+                "video": video_name,
+                "frame": frame_file.name,
+                "path": str(frame_file),
+                "lock_time": middleware_state.files[registry_key].lock_time
+            }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to get next image: {e}")
+    raise HTTPException(status_code=404, detail="No images available in this course")
+@app.get("/middleware/next/any")
+async def get_next_any_file(requester_id: str):
+    """Hand out whichever entry middleware_state.get_next_available_file selects.
+
+    The reported "type" is "course" for IDs starting with "course:" and
+    "image" for everything else. Raises HTTPException 404 when nothing is
+    available.
+    """
+    next_id = middleware_state.get_next_available_file(requester_id)
+    if next_id:
+        entry = middleware_state.files[next_id]
+        kind = "image"
+        if next_id.startswith("course:"):
+            kind = "course"
+        return {
+            "file_id": next_id,
+            "file_path": entry.path,
+            "lock_time": entry.lock_time,
+            "type": kind
+        }
+    raise HTTPException(status_code=404, detail="No files available")
+@app.post("/middleware/release/course/{course_id}")
+async def release_course(course_id: str, requester_id: str):
+    """Release the lock on "course:<course_id>"; 403 if release_lock refuses."""
+    released = middleware_state.release_lock(f"course:{course_id}", requester_id)
+    if released:
+        return {"status": "ok"}
+    raise HTTPException(status_code=403, detail="Not lock holder")
+@app.post("/middleware/release/frame/{course_folder}/{video_name}/{frame_id}")
+async def release_frame(course_folder: str, video_name: str, frame_id: str, requester_id: str):
+    """Release the lock on "frame:<course>/<video>/<frame>"; 403 if release_lock refuses."""
+    registry_key = f"frame:{course_folder}/{video_name}/{frame_id}"
+    released = middleware_state.release_lock(registry_key, requester_id)
+    if released:
+        return {"status": "ok"}
+    raise HTTPException(status_code=403, detail="Not lock holder")
+@app.post("/middleware/release/{file_id}")
+async def release_file(file_id: str, requester_id: str):
+    """Release a lock by raw registry ID (kept for backward compatibility); 403 if release_lock refuses."""
+    released = middleware_state.release_lock(file_id, requester_id)
+    if released:
+        return {"status": "ok"}
+    raise HTTPException(status_code=403, detail="Not lock holder")
+# The ":path" converter lets file_id contain "/" so that frame IDs such as
+# "frame:<course>/<video>/<frame>.jpg" can be streamed; a plain "{file_id}"
+# segment never matches them.
+@app.get("/middleware/stream/{file_id:path}")
+async def stream_file(file_id: str, requester_id: str):
+    """Stream a registered file to the requester that currently holds its lock.
+
+    The lock is released automatically once the whole file has been sent.
+
+    Raises HTTPException:
+        404 - unknown file_id, or the registered path is not a regular file.
+        403 - the entry is not locked by `requester_id`.
+    """
+    if file_id not in middleware_state.files:
+        raise HTTPException(status_code=404, detail="File not found")
+    file_state = middleware_state.files[file_id]
+    if not file_state.locked or file_state.lock_holder != requester_id:
+        raise HTTPException(status_code=403, detail="Not lock holder")
+    # isfile rather than exists: course entries point at directories, which
+    # would otherwise pass the check and fail after the response has started.
+    if not os.path.isfile(file_state.path):
+        raise HTTPException(status_code=404, detail="File not found on disk")
+    async def file_stream():
+        async with aiofiles.open(file_state.path, 'rb') as f:
+            while chunk := await f.read(CHUNK_SIZE):
+                yield chunk
+        # Auto-release lock after successful transfer
+        middleware_state.release_lock(file_id, requester_id)
+    # Registry IDs may contain ":" and "/", which are not valid in an unquoted
+    # filename parameter; advertise the on-disk basename in quoted form instead.
+    download_name = os.path.basename(file_state.path).replace('"', "'")
+    return StreamingResponse(
+        file_stream(),
+        media_type="application/octet-stream",
+        headers={
+            "Content-Disposition": f'attachment; filename="{download_name}"'
+        }
+    )
+# ==== STARTUP EVENT ====
+@app.on_event("startup")
+async def startup_event():
+    """Launch the background processing thread on API startup unless one is already alive."""
+    global processing_thread
+    logger.info("Starting up Unified MP4 Processing & Distribution API...")
+    if processing_thread and processing_thread.is_alive():
+        return
+    logger.info("🚀 Starting background processing thread...")
+    # Daemon thread: it will not keep the interpreter alive at shutdown.
+    processing_thread = threading.Thread(target=process_hf_files_background, daemon=True)
+    processing_thread.start()
+if __name__ == "__main__":
+    import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=8000)

frame_extractor.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import cv2
+import multiprocessing
+from pathlib import Path
+import numpy as np
+from typing import List, Tuple
+import os
+def extract_frames_from_video(args: Tuple[str, str, int]):
+    """
+    Extract frames from a video file at approximately the requested FPS.
+
+    args: (input_video_path, output_folder, target_fps), packed in one tuple
+          (presumably so the function can be mapped over a worker pool -
+          confirm against the caller).
+
+    Frames are written as frame_NNNNNN.jpg under <output_folder>/<video stem>/.
+
+    Returns the number of frames written, or 0 when the video cannot be
+    opened, target_fps is not positive, or any error occurs.
+    """
+    video_path, output_folder, target_fps = args
+    video_name = Path(video_path).stem
+    # Create output folder
+    frames_dir = Path(output_folder) / video_name
+    frames_dir.mkdir(parents=True, exist_ok=True)
+    cap = None
+    try:
+        if target_fps <= 0:
+            print(f"Error: Invalid target FPS {target_fps} for {video_path}")
+            return 0
+        # Open video file
+        cap = cv2.VideoCapture(str(video_path))
+        if not cap.isOpened():
+            print(f"Error: Could not open video {video_path}")
+            return 0
+        original_fps = cap.get(cv2.CAP_PROP_FPS)
+        # Keep every `interval`-th frame. The ratio truncates to 0 when the
+        # source FPS is unknown (reported as 0) or lower than the target, and a
+        # zero interval would make the modulo below divide by zero - so fall
+        # back to keeping every frame in those cases.
+        if original_fps > 0:
+            interval = max(1, int(original_fps / target_fps))
+        else:
+            interval = 1
+        frame_count = 0
+        saved_count = 0
+        while True:
+            ret, frame = cap.read()
+            if not ret:
+                break
+            if frame_count % interval == 0:
+                frame_path = frames_dir / f"frame_{saved_count:06d}.jpg"
+                cv2.imwrite(str(frame_path), frame)
+                saved_count += 1
+            frame_count += 1
+        return saved_count
+    except Exception as e:
+        print(f"Error processing {video_path}: {str(e)}")
+        return 0
+    finally:
+        # Release the capture on every exit path, including errors.
+        if cap is not None:
+            cap.release()

processing_logic.py CHANGED Viewed

@@ -1,373 +1,398 @@
-import os
-import json
-import requests
-import subprocess
-import shutil
-import time
-import threading
-from typing import Dict, List, Optional
-from pathlib import Path
-from huggingface_hub import HfApi
-import uuid
-# ==== CONFIGURATION ====
-HF_TOKEN = os.getenv("HF_TOKEN", "")
-SOURCE_REPO_ID = os.getenv("SOURCE_REPO", "Fred808/BG1")
-# Directory Configuration
-UPLOAD_DIRECTORY = "./uploads"
-DOWNLOAD_FOLDER = "./downloads"
-EXTRACT_FOLDER = "./extracted"
-MP4_OUTPUT_FOLDER = "./mp4_files"
-# Create directories
-for directory in [UPLOAD_DIRECTORY, DOWNLOAD_FOLDER, EXTRACT_FOLDER, MP4_OUTPUT_FOLDER]:
-    os.makedirs(directory, exist_ok=True)
-# State Files
-DOWNLOAD_STATE_FILE = "download_progress.json"
-PROCESS_STATE_FILE = "process_progress.json"
-FAILED_FILES_LOG = "failed_files.log"
-# Processing Parameters
-MAX_RETRIES = 3
-MIN_FREE_SPACE_GB = 2
-DEFAULT_RAR_LIMIT = 5 # Default number of RAR files to process
-# Initialize HF API
-hf_api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
-# Global State
-processing_status = {
-    "is_running": False,
-    "current_file": None,
-    "total_files": 0,
-    "processed_files": 0,
-    "failed_files": 0,
-    "extracted_courses": 0,
-    "extracted_mp4s": 0,
-    "last_update": None,
-    "logs": []
-}
-# Store for uploaded MP4s with metadata (this will be managed by the API part, but needs to be accessible)
-uploaded_mp4s = {}
-def log_message(message: str):
-    """Log messages with timestamp"""
-    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
-    log_entry = f"[{timestamp}] {message}"
-    print(log_entry)
-    processing_status["logs"].append(log_entry)
-    processing_status["last_update"] = timestamp
-    if len(processing_status["logs"]) > 100:
-        processing_status["logs"] = processing_status["logs"][-100:]
-def log_failed_file(filename: str, error: str):
-    """Log failed files to persistent file"""
-    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
-    with open(FAILED_FILES_LOG, "a") as f:
-        f.write(f"{timestamp} - {filename}: {error}\n")
-def get_disk_usage(path: str) -> Dict[str, float]:
-    """Get disk usage statistics in GB"""
-    statvfs = os.statvfs(path)
-    total = statvfs.f_frsize * statvfs.f_blocks / (1024**3)
-    free = statvfs.f_frsize * statvfs.f_bavail / (1024**3)
-    used = total - free
-    return {"total": total, "free": free, "used": used}
-def check_disk_space(path: str = ".") -> bool:
-    """Check if there\'s enough disk space"""
-    disk_info = get_disk_usage(path)
-    if disk_info["free"] < MIN_FREE_SPACE_GB:
-        log_message(f'⚠️ Low disk space: {disk_info["free"]:.2f}GB free, {disk_info["used"]:.2f}GB used')
-        return False
-    return True
-def cleanup_temp_files():
-    """Clean up temporary files to free space"""
-    log_message("🧹 Cleaning up temporary files...")
-    # Clean old downloads (keep only current processing file)
-    current_file = processing_status.get("current_file")
-    for file in os.listdir(DOWNLOAD_FOLDER):
-        if file != current_file and file.endswith((".rar", ".zip")):
-            try:
-                os.remove(os.path.join(DOWNLOAD_FOLDER, file))
-                log_message(f"🗑️ Removed old download: {file}")
-            except:
-                pass
-def load_json_state(file_path: str, default_value):
-    """Load state from JSON file"""
-    if os.path.exists(file_path):
-        try:
-            with open(file_path, "r") as f:
-                return json.load(f)
-        except json.JSONDecodeError:
-            log_message(f"⚠️ Corrupted state file: {file_path}")
-    return default_value
-def save_json_state(file_path: str, data):
-    """Save state to JSON file"""
-    with open(file_path, "w") as f:
-        json.dump(data, f, indent=2)
-def download_with_retry(url: str, dest_path: str, max_retries: int = 3) -> bool:
-    """Download file with retry logic and disk space checking"""
-    if not check_disk_space():
-        cleanup_temp_files()
-        if not check_disk_space():
-            log_message("❌ Insufficient disk space even after cleanup")
-            return False
-    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
-    for attempt in range(max_retries):
-        try:
-            with requests.get(url, headers=headers, stream=True) as r:
-                r.raise_for_status()
-                # Check content length if available
-                content_length = r.headers.get("content-length")
-                if content_length:
-                    size_gb = int(content_length) / (1024**3)
-                    disk_info = get_disk_usage(".")
-                    if size_gb > disk_info["free"] - 0.5:  # Leave 0.5GB buffer
-                        log_message(f'❌ File too large: {size_gb:.2f}GB, only {disk_info["free"]:.2f}GB free')
-                        return False
-                with open(dest_path, "wb") as f:
-                    for chunk in r.iter_content(chunk_size=8192):
-                        f.write(chunk)
-            return True
-        except Exception as e:
-            if attempt < max_retries - 1:
-                time.sleep(2 ** attempt)
-                continue
-            log_message(f"❌ Download failed after {max_retries} attempts: {e}")
-            return False
-    return False
-def is_multipart_rar(filename: str) -> bool:
-    """Check if this is a multi-part RAR file"""
-    return ".part" in filename.lower() and filename.lower().endswith(".rar")
-def get_rar_part_base(filename: str) -> str:
-    """Get the base name for multi-part RAR files"""
-    if ".part" in filename.lower():
-        return filename.split(".part")[0]
-    return filename.replace(".rar", "")
-def extract_with_retry(rar_path: str, output_dir: str, max_retries: int = 2) -> bool:
-    """Extract RAR with retry and recovery, handling multi-part archives"""
-    filename = os.path.basename(rar_path)
-    # For multi-part RARs, we need the first part
-    if is_multipart_rar(filename):
-        base_name = get_rar_part_base(filename)
-        first_part = f"{base_name}.part01.rar"
-        first_part_path = os.path.join(os.path.dirname(rar_path), first_part)
-        if not os.path.exists(first_part_path):
-            log_message(f"⚠️ Multi-part RAR detected but first part not found: {first_part}")
-            return False
-        rar_path = first_part_path
-        log_message(f"📦 Processing multi-part RAR starting with: {first_part}")
-    for attempt in range(max_retries):
-        try:
-            # Test RAR first
-            test_cmd = ["unrar", "t", rar_path]
-            test_result = subprocess.run(test_cmd, capture_output=True, text=True)
-            if test_result.returncode != 0:
-                log_message(f"⚠️ RAR test failed: {test_result.stderr}")
-                if attempt == max_retries - 1:
-                    return False
-                continue
-            # Extract RAR
-            cmd = ["unrar", "x", "-o+", rar_path, output_dir]
-            if attempt > 0:  # Try recovery on subsequent attempts
-                cmd.insert(2, "-kb")
-            result = subprocess.run(cmd, capture_output=True, text=True)
-            if result.returncode == 0:
-                log_message(f"✅ Successfully extracted: {os.path.basename(rar_path)}")
-                return True
-            else:
-                error_msg = result.stderr or result.stdout
-                log_message(f"⚠️ Extraction attempt {attempt + 1} failed: {error_msg}")
-        except Exception as e:
-            log_message(f"❌ Extraction exception: {str(e)}")
-            if attempt == max_retries - 1:
-                return False
-            time.sleep(1)
-    return False
-def process_rar_file(rar_path: str) -> List[Dict]:
-    """Process a single RAR file - extract and find MP4 files"""
-    filename = os.path.basename(rar_path)
-    processing_status["current_file"] = filename
-    # Handle multi-part RAR naming
-    if is_multipart_rar(filename):
-        course_name = get_rar_part_base(filename)
-    else:
-        course_name = filename.replace(".rar", "")
-    # Create a unique directory for this course's extracted MP4s
-    course_mp4_output_dir = os.path.join(MP4_OUTPUT_FOLDER, course_name)
-    os.makedirs(course_mp4_output_dir, exist_ok=True)
-    extract_dir = os.path.join(EXTRACT_FOLDER, course_name)
-    mp4_files = []
-    try:
-        log_message(f"🔄 Processing: {filename}")
-        # Clean up any existing directory
-        if os.path.exists(extract_dir):
-            shutil.rmtree(extract_dir, ignore_errors=True)
-        # Extract RAR
-        os.makedirs(extract_dir, exist_ok=True)
-        if not extract_with_retry(rar_path, extract_dir):
-            raise Exception("RAR extraction failed")
-        # Find and copy MP4 files
-        for root, dirs, files in os.walk(extract_dir):
-            for file in files:
-                if file.lower().endswith(".mp4"):
-                    source_path = os.path.join(root, file)
-                    # Use original filename for MP4 output within the course directory
-                    dest_path = os.path.join(course_mp4_output_dir, file)
-                    try:
-                        shutil.copy2(source_path, dest_path)
-                        mp4_files.append({
-                            "id": os.path.join(course_name, file), # Store as course_name/filename.mp4
-                            "original_name": file,
-                            "course_name": course_name,
-                            "size": os.path.getsize(dest_path),
-                            "created_at": time.strftime("%Y-%m-%d %H:%M:%S")
-                        })
-                        log_message(f"✅ Extracted MP4: {file} -> {os.path.join(course_name, file)}")
-                    except Exception as e:
-                        log_message(f"❌ Failed to copy MP4 {file}: {e}")
-        processing_status["extracted_courses"] += 1
-        processing_status["extracted_mp4s"] += len(mp4_files)
-        log_message(f"✅ Successfully processed '{course_name}' - found {len(mp4_files)} MP4 files")
-        return mp4_files
-    except Exception as e:
-        error_msg = str(e)
-        log_message(f"❌ Processing failed: {error_msg}")
-        log_failed_file(filename, error_msg)
-        return []
-    finally:
-        processing_status["current_file"] = None
-        # Clean up extracted directory
-        if os.path.exists(extract_dir):
-            shutil.rmtree(extract_dir, ignore_errors=True)
-def process_hf_files_background(start_index: int = 5, limit: int = DEFAULT_RAR_LIMIT):
-    """Background task to process HuggingFace files"""
-    if not hf_api:
-        log_message("❌ HuggingFace API not configured (missing HF_TOKEN)")
-        return
-    processing_status["is_running"] = True
-    try:
-        # Load state
-        processed_rars = load_json_state(PROCESS_STATE_FILE, {"processed_rars": []})["processed_rars"]
-        download_state = load_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": 0})
-        # Use start_index if provided, otherwise use the saved state
-        current_index = start_index if start_index > 0 else download_state["next_download_index"]
-        log_message(f"📊 Starting processing from index {current_index} with a limit of {limit} files.")
-        log_message(f"📊 Previously processed: {len(processed_rars)} files")
-        # Get file list
-        try:
-            files = list(hf_api.list_repo_files(repo_id=SOURCE_REPO_ID, repo_type="dataset"))
-            rar_files = sorted([f for f in files if f.endswith(".rar")])
-            processing_status["total_files"] = len(rar_files)
-            log_message(f"📁 Found {len(rar_files)} RAR files in repository")
-            if current_index >= len(rar_files):
-                log_message("✅ All files have been processed!")
-                return
-        except Exception as e:
-            log_message(f"❌ Failed to get file list: {str(e)}")
-            return
-        processed_count = 0
-        while processed_count < limit and current_index < len(rar_files) and processing_status["is_running"]:
-            rar_file = rar_files[current_index]
-            filename = os.path.basename(rar_file)
-            if filename in processed_rars:
-                log_message(f"⏭️ Skipping already processed: {filename}")
-                processing_status["processed_files"] += 1
-                current_index += 1
-                save_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": current_index})
-                continue
-            log_message(f"📥 Downloading: {filename}")
-            dest_path = os.path.join(DOWNLOAD_FOLDER, filename)
-            # Download file
-            download_url = f"https://huggingface.co/datasets/{SOURCE_REPO_ID}/resolve/main/{rar_file}"
-            if download_with_retry(download_url, dest_path):
-                # Process file
-                mp4_files = process_rar_file(dest_path)
-                if mp4_files:
-                    processed_rars.append(filename)
-                    save_json_state(PROCESS_STATE_FILE, {"processed_rars": processed_rars})
-                    log_message(f"✅ Successfully processed: {filename}")
-                    processing_status["processed_files"] += 1
-                else:
-                    log_message(f"❌ Failed to process: {filename}")
-                    processing_status["failed_files"] += 1
-                # Clean up downloaded file
-                try:
-                    os.remove(dest_path)
-                    log_message(f"🗑️ Cleaned up download: {filename}")
-                except:
-                    pass
-            else:
-                log_message(f"❌ Failed to download: {filename}")
-                processing_status["failed_files"] += 1
-            # Update download state for next run
-            current_index += 1
-            processed_count += 1
-            save_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": current_index})
-        if current_index >= len(rar_files):
-            log_message("🎉 All available RAR files have been processed!")
-        elif not processing_status["is_running"]:
-            log_message("⏹️ Processing stopped by request.")
-        else:
-            log_message(f"✅ Processed {processed_count} RAR files. Next index to process: {current_index}")
-    except Exception as e:
-        log_message(f"❌ Fatal error in background processing: {str(e)}")
-    finally:
-        processing_status["is_running"] = False
-        cleanup_temp_files()

+import os
+import json
+import requests
+import subprocess
+import shutil
+import time
+import threading
+import multiprocessing
+from typing import Dict, List, Optional
+from pathlib import Path
+from huggingface_hub import HfApi
+import uuid
+import frame_extractor  # Our frame extraction module
+# ==== CONFIGURATION ====
+# Hugging Face access token and source dataset repo, both overridable through
+# the environment. An empty token leaves hf_api (below) set to None.
+HF_TOKEN = os.getenv("HF_TOKEN", "")
+SOURCE_REPO_ID = os.getenv("SOURCE_REPO", "Fred808/BG1")
+# Directory Configuration (paths are relative to the working directory)
+UPLOAD_DIRECTORY = "./uploads"
+DOWNLOAD_FOLDER = "./downloads"
+EXTRACT_FOLDER = "./extracted"
+MP4_OUTPUT_FOLDER = "./mp4_files"
+# Create directories at import time so later code can assume they exist
+for directory in [UPLOAD_DIRECTORY, DOWNLOAD_FOLDER, EXTRACT_FOLDER, MP4_OUTPUT_FOLDER]:
+    os.makedirs(directory, exist_ok=True)
+# State Files (JSON progress files and a plain-text failure log)
+DOWNLOAD_STATE_FILE = "download_progress.json"
+PROCESS_STATE_FILE = "process_progress.json"
+FAILED_FILES_LOG = "failed_files.log"
+# Processing Parameters
+MAX_RETRIES = 3
+# Free-space threshold (in GB) below which check_disk_space() reports failure
+MIN_FREE_SPACE_GB = 2
+DEFAULT_RAR_LIMIT = 2 # Default number of RAR files to process
+# Initialize HF API (None when no token is configured)
+hf_api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
+# Global State: a mutable module-level dict updated in place by the processing
+# functions. "logs" is trimmed to the most recent 100 entries by log_message().
+processing_status = {
+    "is_running": False,
+    "current_file": None,
+    "total_files": 0,
+    "processed_files": 0,
+    "failed_files": 0,
+    "extracted_courses": 0,
+    "extracted_mp4s": 0,
+    "last_update": None,
+    "logs": []
+}
+# Store for uploaded MP4s with metadata (this will be managed by the API part, but needs to be accessible)
+uploaded_mp4s = {}
+def log_message(message: str):
+    """Print a timestamped message and record it in processing_status.
+
+    The entry is appended to processing_status["logs"], which is cut back to
+    its most recent 100 entries, and "last_update" is set to the timestamp.
+    """
+    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
+    line = f"[{stamp}] {message}"
+    print(line)
+    history = processing_status["logs"]
+    history.append(line)
+    processing_status["last_update"] = stamp
+    if len(history) > 100:
+        # Rebind to a fresh list holding only the newest 100 entries.
+        processing_status["logs"] = history[-100:]
+def log_failed_file(filename: str, error: str):
+    """Append a timestamped "<filename>: <error>" line to FAILED_FILES_LOG."""
+    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
+    entry = "{} - {}: {}\n".format(stamp, filename, error)
+    with open(FAILED_FILES_LOG, "a") as log_file:
+        log_file.write(entry)
+def get_disk_usage(path: str) -> Dict[str, float]:
+    """Return total, free and used space (in GB) for the filesystem holding `path`.
+
+    "free" is based on the blocks available to unprivileged users (f_bavail);
+    "used" is simply total minus free.
+    """
+    fs = os.statvfs(path)
+    bytes_per_gb = 1024**3
+    total_gb = fs.f_frsize * fs.f_blocks / bytes_per_gb
+    free_gb = fs.f_frsize * fs.f_bavail / bytes_per_gb
+    return {"total": total_gb, "free": free_gb, "used": total_gb - free_gb}
+def check_disk_space(path: str = ".") -> bool:
+    """Return True when at least MIN_FREE_SPACE_GB is free at `path`; otherwise log a warning and return False."""
+    usage = get_disk_usage(path)
+    if usage["free"] >= MIN_FREE_SPACE_GB:
+        return True
+    log_message(f'⚠️ Low disk space: {usage["free"]:.2f}GB free, {usage["used"]:.2f}GB used')
+    return False
+def cleanup_temp_files():
+    """Delete leftover archives from DOWNLOAD_FOLDER to free disk space.
+
+    Every .rar/.zip file except the one named in
+    processing_status["current_file"] is removed. Cleanup is best-effort: a
+    file that cannot be removed is logged and skipped rather than aborting.
+    """
+    log_message("🧹 Cleaning up temporary files...")
+    # Clean old downloads (keep only current processing file)
+    current_file = processing_status.get("current_file")
+    for file in os.listdir(DOWNLOAD_FOLDER):
+        if file == current_file or not file.endswith((".rar", ".zip")):
+            continue
+        try:
+            os.remove(os.path.join(DOWNLOAD_FOLDER, file))
+        except OSError as e:
+            # Only filesystem errors are expected here; report them instead of
+            # hiding every exception behind a bare except.
+            log_message(f"⚠️ Could not remove old download {file}: {e}")
+        else:
+            log_message(f"🗑️ Removed old download: {file}")
+def load_json_state(file_path: str, default_value):
+    """Load state from a JSON file, falling back to `default_value`.
+
+    Returns the parsed JSON content. `default_value` is returned when the file
+    does not exist, or (after logging a warning) when its content cannot be
+    decoded as JSON text.
+    """
+    try:
+        # Open directly instead of checking existence first, so a file that
+        # disappears between the check and the open cannot raise.
+        with open(file_path, "r") as f:
+            return json.load(f)
+    except FileNotFoundError:
+        pass  # no saved state yet
+    except (json.JSONDecodeError, UnicodeDecodeError):
+        # A truncated or garbled file may fail at the text-decoding stage
+        # before the JSON parser ever sees it.
+        log_message(f"⚠️ Corrupted state file: {file_path}")
+    return default_value
+def save_json_state(file_path: str, data):
+    """Save state to a JSON file without risking a half-written result.
+
+    The data is written to "<file_path>.tmp" and then moved over `file_path`
+    with os.replace, so an interrupted or failed write leaves the previous
+    state file intact. Any exception from writing is re-raised after the
+    temporary file has been removed.
+    """
+    tmp_path = f"{file_path}.tmp"
+    try:
+        with open(tmp_path, "w") as f:
+            json.dump(data, f, indent=2)
+        os.replace(tmp_path, file_path)
+    except BaseException:
+        # Do not leave the temporary file behind on failure.
+        try:
+            os.remove(tmp_path)
+        except OSError:
+            pass
+        raise
+def download_with_retry(url: str, dest_path: str, max_retries: int = 3) -> bool:
+    """Download `url` to `dest_path`, retrying with exponential backoff.
+
+    Before downloading, free disk space is checked (with one cleanup attempt).
+    When the server reports a content length, the download is refused if it
+    would leave less than 0.5GB free.
+
+    Returns True on success; False when disk space is insufficient, the file
+    is too large, or every attempt fails. A partially written file from a
+    failed attempt is removed.
+    """
+    if not check_disk_space():
+        cleanup_temp_files()
+        if not check_disk_space():
+            log_message("❌ Insufficient disk space even after cleanup")
+            return False
+    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
+    for attempt in range(max_retries):
+        started_writing = False
+        try:
+            # (connect, read) timeouts: without them a stalled connection
+            # blocks forever and the retry logic never gets a chance to run.
+            with requests.get(url, headers=headers, stream=True, timeout=(10, 120)) as r:
+                r.raise_for_status()
+                # Check content length if available
+                content_length = r.headers.get("content-length")
+                if content_length:
+                    size_gb = int(content_length) / (1024**3)
+                    disk_info = get_disk_usage(".")
+                    if size_gb > disk_info["free"] - 0.5:  # Leave 0.5GB buffer
+                        log_message(f'❌ File too large: {size_gb:.2f}GB, only {disk_info["free"]:.2f}GB free')
+                        return False
+                started_writing = True
+                with open(dest_path, "wb") as f:
+                    # 1 MiB chunks keep the Python-level loop cheap for multi-GB archives.
+                    for chunk in r.iter_content(chunk_size=1024 * 1024):
+                        f.write(chunk)
+            return True
+        except Exception as e:
+            # Remove a truncated file so it is never mistaken for a complete
+            # archive; only done if this attempt actually began writing.
+            if started_writing:
+                try:
+                    os.remove(dest_path)
+                except OSError:
+                    pass
+            if attempt < max_retries - 1:
+                time.sleep(2 ** attempt)
+                continue
+            log_message(f"❌ Download failed after {max_retries} attempts: {e}")
+            return False
+    return False
+def is_multipart_rar(filename: str) -> bool:
+    """Return True when the name ends in ".rar" and contains ".part" (case-insensitive)."""
+    lowered = filename.lower()
+    if not lowered.endswith(".rar"):
+        return False
+    return ".part" in lowered
+def get_rar_part_base(filename: str) -> str:
+    """Get the base name for multi-part RAR files.
+
+    Returns everything before the first ".part" (matched case-insensitively).
+    Names without ".part" are returned with every ".rar" occurrence removed.
+    """
+    import re  # local import: only this helper needs it
+
+    # Detection must use the same case-insensitive rule as the split, otherwise
+    # a name like "Course.PART01.RAR" is recognised but returned unchanged.
+    marker = re.search(r"\.part", filename, flags=re.IGNORECASE)
+    if marker:
+        return filename[:marker.start()]
+    return filename.replace(".rar", "")
+def extract_with_retry(rar_path: str, output_dir: str, max_retries: int = 2) -> bool:
+    """Extract a RAR archive with retries, handling multi-part archives.
+
+    For a multi-part archive the first volume is located next to `rar_path`
+    and extraction starts from it. Each attempt first runs `unrar t`, then
+    `unrar x -o+`; attempts after the first add the `-kb` switch.
+
+    Returns True once an extraction succeeds; False when the first volume is
+    missing or all attempts fail.
+    """
+    filename = os.path.basename(rar_path)
+    # For multi-part RARs, we need the first part
+    if is_multipart_rar(filename):
+        base_name = get_rar_part_base(filename)
+        parent_dir = os.path.dirname(rar_path)
+        # Volume numbers may be zero-padded to different widths (part1,
+        # part01, part001, ...). Probe the two-digit form first, as before,
+        # then the other common widths instead of assuming ".part01.rar".
+        candidates = [f"{base_name}.part{1:0{width}d}.rar" for width in (2, 1, 3, 4)]
+        first_part = next(
+            (name for name in candidates if os.path.exists(os.path.join(parent_dir, name))),
+            None,
+        )
+        if first_part is None:
+            log_message(f"⚠️ Multi-part RAR detected but first part not found: {candidates[0]}")
+            return False
+        rar_path = os.path.join(parent_dir, first_part)
+        log_message(f"📦 Processing multi-part RAR starting with: {first_part}")
+    for attempt in range(max_retries):
+        try:
+            # Test RAR first
+            test_cmd = ["unrar", "t", rar_path]
+            test_result = subprocess.run(test_cmd, capture_output=True, text=True)
+            if test_result.returncode != 0:
+                log_message(f"⚠️ RAR test failed: {test_result.stderr}")
+                if attempt == max_retries - 1:
+                    return False
+                continue
+            # Extract RAR
+            cmd = ["unrar", "x", "-o+", rar_path, output_dir]
+            if attempt > 0:  # Try recovery on subsequent attempts
+                cmd.insert(2, "-kb")
+            result = subprocess.run(cmd, capture_output=True, text=True)
+            if result.returncode == 0:
+                log_message(f"✅ Successfully extracted: {os.path.basename(rar_path)}")
+                return True
+            else:
+                error_msg = result.stderr or result.stdout
+                log_message(f"⚠️ Extraction attempt {attempt + 1} failed: {error_msg}")
+        except Exception as e:
+            log_message(f"❌ Extraction exception: {str(e)}")
+            if attempt == max_retries - 1:
+                return False
+            time.sleep(1)
+    return False
+def process_rar_file(rar_path: str) -> List[Dict]:
+    """Extract one RAR archive, collect its MP4 files and extract frames.
+
+    The archive is unpacked into ``EXTRACT_FOLDER/<course_name>``; every
+    ``.mp4`` found (at any depth) is copied into
+    ``MP4_OUTPUT_FOLDER/<course_name>``. If two MP4s in different subfolders
+    share a file name, the later one is stored under a disambiguated name
+    (prefixed with its relative folder) instead of overwriting the earlier
+    copy. Frame extraction is then run over all copied MP4s in a process
+    pool, writing into ``MP4_OUTPUT_FOLDER/<course_name>_frames``.
+
+    Side effects: updates ``processing_status`` (``current_file``,
+    ``extracted_courses``, ``extracted_mp4s``) and always removes the
+    temporary extraction directory on exit.
+
+    Args:
+        rar_path: Path to the downloaded RAR archive.
+
+    Returns:
+        One dict per copied MP4 with keys ``id``, ``original_name``,
+        ``course_name``, ``size``, ``path`` and ``created_at``; an empty list
+        if extraction or processing failed (the failure is logged via
+        ``log_failed_file``) or if no MP4 was found.
+    """
+    filename = os.path.basename(rar_path)
+    processing_status["current_file"] = filename
+    # Handle multi-part RAR naming
+    if is_multipart_rar(filename):
+        course_name = get_rar_part_base(filename)
+    else:
+        course_name = filename.replace(".rar", "")
+    # Create a unique directory for this course's extracted MP4s
+    course_mp4_output_dir = os.path.join(MP4_OUTPUT_FOLDER, course_name)
+    os.makedirs(course_mp4_output_dir, exist_ok=True)
+    extract_dir = os.path.join(EXTRACT_FOLDER, course_name)
+    mp4_files = []
+    # Destination names already taken in this run, lower-cased so the check
+    # also protects case-insensitive filesystems.
+    used_names = set()
+    try:
+        log_message(f"🔄 Processing: {filename}")
+        # Clean up any existing directory
+        if os.path.exists(extract_dir):
+            shutil.rmtree(extract_dir, ignore_errors=True)
+        # Extract RAR
+        os.makedirs(extract_dir, exist_ok=True)
+        if not extract_with_retry(rar_path, extract_dir):
+            raise Exception("RAR extraction failed")
+        # Find and copy MP4 files
+        for root, _dirs, files in os.walk(extract_dir):
+            for file in files:
+                if not file.lower().endswith(".mp4"):
+                    continue
+                source_path = os.path.join(root, file)
+                # Use original filename for MP4 output within the course
+                # directory unless that name is already taken by an MP4 from
+                # another subfolder.
+                dest_name = file
+                if dest_name.lower() in used_names:
+                    rel_dir = os.path.relpath(root, extract_dir)
+                    if rel_dir != os.curdir:
+                        dest_name = f"{rel_dir.replace(os.sep, '_')}_{file}"
+                    # Last resort: append a counter until the name is free.
+                    stem, ext = os.path.splitext(dest_name)
+                    suffix = 1
+                    while dest_name.lower() in used_names:
+                        dest_name = f"{stem}_{suffix}{ext}"
+                        suffix += 1
+                used_names.add(dest_name.lower())
+                dest_path = os.path.join(course_mp4_output_dir, dest_name)
+                try:
+                    shutil.copy2(source_path, dest_path)
+                    file_info = {
+                        "id": os.path.join(course_name, dest_name),
+                        "original_name": file,
+                        "course_name": course_name,
+                        "size": os.path.getsize(dest_path),
+                        "path": dest_path,
+                        "created_at": time.strftime("%Y-%m-%d %H:%M:%S")
+                    }
+                    mp4_files.append(file_info)
+                    log_message(f"✅ Extracted MP4: {file} -> {os.path.join(course_name, dest_name)}")
+                except Exception as e:
+                    # One bad file must not abort the rest of the course.
+                    log_message(f"❌ Failed to copy MP4 {file}: {e}")
+        # Process frame extraction for all MP4s in parallel
+        if mp4_files:
+            log_message(f"🎞️ Starting frame extraction for {len(mp4_files)} MP4 files...")
+            # Create frames directory for this course
+            frames_dir = os.path.join(MP4_OUTPUT_FOLDER, f"{course_name}_frames")
+            os.makedirs(frames_dir, exist_ok=True)
+            # Prepare arguments for frame extraction; pool.map hands each
+            # tuple to the worker as a single argument.
+            extraction_args = [
+                (mp4["path"], frames_dir, 10)  # 10 FPS
+                for mp4 in mp4_files
+            ]
+            # Use multiprocessing for frame extraction
+            cpu_count = multiprocessing.cpu_count()
+            with multiprocessing.Pool(processes=cpu_count) as pool:
+                results = pool.map(frame_extractor.extract_frames_from_video, extraction_args)
+            # Log frame extraction results (None results are ignored)
+            total_frames = sum(count for count in results if count is not None)
+            log_message(f"🎞️ Extracted {total_frames} frames from {len(mp4_files)} videos using {cpu_count} CPU cores")
+        processing_status["extracted_courses"] += 1
+        processing_status["extracted_mp4s"] += len(mp4_files)
+        log_message(f"✅ Successfully processed '{course_name}' - found {len(mp4_files)} MP4 files")
+        return mp4_files
+    except Exception as e:
+        error_msg = str(e)
+        log_message(f"❌ Processing failed: {error_msg}")
+        log_failed_file(filename, error_msg)
+        return []
+    finally:
+        processing_status["current_file"] = None
+        # Clean up extracted directory
+        if os.path.exists(extract_dir):
+            shutil.rmtree(extract_dir, ignore_errors=True)
+def process_hf_files_background(start_index: int = 5, limit: int = DEFAULT_RAR_LIMIT):
+    """Background task: download and process RAR files from the source repo.
+
+    Lists the ``.rar`` files of the ``SOURCE_REPO_ID`` dataset (sorted by
+    name), then, starting at an index, downloads each archive, runs
+    ``process_rar_file`` on it and deletes the download. Progress is
+    persisted after every file: successfully processed names go to
+    ``PROCESS_STATE_FILE`` and the next index goes to ``DOWNLOAD_STATE_FILE``.
+    Counters in ``processing_status`` are updated throughout, and clearing
+    ``processing_status["is_running"]`` stops the loop after the current file.
+
+    Args:
+        start_index: Index into the sorted RAR list to start from. Values
+            greater than 0 take precedence over the saved index; 0 or less
+            resumes from the saved ``next_download_index``.
+        limit: Maximum number of archives to attempt in this run (already
+            processed archives that are skipped do not count).
+
+    Returns:
+        None. Errors are logged rather than raised.
+    """
+    if not hf_api:
+        log_message("❌ HuggingFace API not configured (missing HF_TOKEN)")
+        return
+    processing_status["is_running"] = True
+    try:
+        # Load state
+        processed_rars = load_json_state(PROCESS_STATE_FILE, {"processed_rars": []})["processed_rars"]
+        download_state = load_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": 0})
+        # Use start_index if provided, otherwise use the saved state
+        current_index = start_index if start_index > 0 else download_state["next_download_index"]
+        log_message(f"📊 Starting processing from index {current_index} with a limit of {limit} files.")
+        log_message(f"📊 Previously processed: {len(processed_rars)} files")
+        # Get file list
+        try:
+            files = list(hf_api.list_repo_files(repo_id=SOURCE_REPO_ID, repo_type="dataset"))
+            rar_files = sorted([f for f in files if f.endswith(".rar")])
+            processing_status["total_files"] = len(rar_files)
+            log_message(f"📁 Found {len(rar_files)} RAR files in repository")
+            if current_index >= len(rar_files):
+                log_message("✅ All files have been processed!")
+                return
+        except Exception as e:
+            log_message(f"❌ Failed to get file list: {str(e)}")
+            return
+        processed_count = 0
+        while processed_count < limit and current_index < len(rar_files) and processing_status["is_running"]:
+            rar_file = rar_files[current_index]
+            filename = os.path.basename(rar_file)
+            if filename in processed_rars:
+                log_message(f"⏭️ Skipping already processed: {filename}")
+                processing_status["processed_files"] += 1
+                current_index += 1
+                save_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": current_index})
+                continue
+            log_message(f"📥 Downloading: {filename}")
+            dest_path = os.path.join(DOWNLOAD_FOLDER, filename)
+            # Download file
+            download_url = f"https://huggingface.co/datasets/{SOURCE_REPO_ID}/resolve/main/{rar_file}"
+            if download_with_retry(download_url, dest_path):
+                # Process file; an empty result is treated as a failure
+                mp4_files = process_rar_file(dest_path)
+                if mp4_files:
+                    processed_rars.append(filename)
+                    save_json_state(PROCESS_STATE_FILE, {"processed_rars": processed_rars})
+                    log_message(f"✅ Successfully processed: {filename}")
+                    processing_status["processed_files"] += 1
+                else:
+                    log_message(f"❌ Failed to process: {filename}")
+                    processing_status["failed_files"] += 1
+            else:
+                log_message(f"❌ Failed to download: {filename}")
+                processing_status["failed_files"] += 1
+            # Clean up downloaded file. This also covers a failed download,
+            # which can leave a partially written file behind.
+            if os.path.exists(dest_path):
+                try:
+                    os.remove(dest_path)
+                    log_message(f"🗑️ Cleaned up download: {filename}")
+                except OSError as e:
+                    # Best effort: report the problem but keep processing.
+                    log_message(f"⚠️ Could not remove download {filename}: {e}")
+            # Update download state for next run
+            current_index += 1
+            processed_count += 1
+            save_json_state(DOWNLOAD_STATE_FILE, {"next_download_index": current_index})
+        if current_index >= len(rar_files):
+            log_message("🎉 All available RAR files have been processed!")
+        elif not processing_status["is_running"]:
+            log_message("⏹️ Processing stopped by request.")
+        else:
+            log_message(f"✅ Processed {processed_count} RAR files. Next index to process: {current_index}")
+    except Exception as e:
+        log_message(f"❌ Fatal error in background processing: {str(e)}")
+    finally:
+        # Runs on every exit path, including the early returns above.
+        processing_status["is_running"] = False
+        cleanup_temp_files()