Update app.py
Browse files
app.py
CHANGED
|
@@ -4,14 +4,14 @@ import time
|
|
| 4 |
import asyncio
|
| 5 |
import aiohttp
|
| 6 |
import zipfile
|
|
|
|
| 7 |
import shutil
|
| 8 |
from typing import Dict, List, Set, Optional, Any
|
| 9 |
from urllib.parse import quote
|
| 10 |
from datetime import datetime
|
| 11 |
from pathlib import Path
|
| 12 |
-
import io
|
| 13 |
|
| 14 |
-
from fastapi import FastAPI, BackgroundTasks, HTTPException, status, Request
|
| 15 |
from fastapi.responses import HTMLResponse
|
| 16 |
from fastapi.templating import Jinja2Templates
|
| 17 |
from pydantic import BaseModel, Field
|
|
@@ -19,21 +19,17 @@ from huggingface_hub import HfApi, hf_hub_download, HfFileSystem
|
|
| 19 |
import uvicorn
|
| 20 |
|
| 21 |
# --- Configuration ---
|
| 22 |
-
# Flow Server ID and Port will be set via environment variables for easy deployment
|
| 23 |
FLOW_ID = os.getenv("FLOW_ID", "flow_default")
|
| 24 |
-
FLOW_PORT = int(os.getenv("FLOW_PORT", 8001))
|
| 25 |
-
|
| 26 |
-
# Manager Server Configuration
|
| 27 |
MANAGER_URL = os.getenv("MANAGER_URL", "https://fred808-fcord.hf.space")
|
| 28 |
MANAGER_COMPLETE_TASK_URL = f"{MANAGER_URL}/task/complete"
|
| 29 |
-
|
| 30 |
-
# Hugging Face Configuration
|
| 31 |
-
HF_TOKEN = os.getenv("HF_TOKEN", "") # User provided token
|
| 32 |
HF_DATASET_ID = os.getenv("HF_DATASET_ID", "Fred808/BG3")
|
| 33 |
-
HF_OUTPUT_DATASET_ID = os.getenv("HF_OUTPUT_DATASET_ID", "fred808/helium")
|
|
|
|
| 34 |
|
| 35 |
-
# Using the full list from the user's original code
|
| 36 |
-
|
| 37 |
"https://fred808-pil-4-1.hf.space/analyze",
|
| 38 |
"https://fred808-pil-4-2.hf.space/analyze",
|
| 39 |
"https://fred808-pil-4-3.hf.space/analyze",
|
|
@@ -78,126 +74,156 @@ MODEL_TYPE = "Florence-2-large"
|
|
| 78 |
TEMP_DIR = Path(f"temp_images_{FLOW_ID}")
|
| 79 |
TEMP_DIR.mkdir(exist_ok=True)
|
| 80 |
|
| 81 |
-
# State persistence file name in the output dataset
|
| 82 |
-
STATE_FILENAME = f"processing_state_{FLOW_ID}.json"
|
| 83 |
-
|
| 84 |
# --- Models ---
|
| 85 |
class ProcessCourseRequest(BaseModel):
|
| 86 |
course_name: Optional[str] = None
|
| 87 |
-
start_index: int = 0 # New field for configurable start index
|
| 88 |
|
| 89 |
-
class CaptionServer
|
| 90 |
-
url:
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
| 95 |
|
| 96 |
@property
|
| 97 |
def fps(self):
|
| 98 |
return self.total_processed / self.total_time if self.total_time > 0 else 0
|
| 99 |
|
| 100 |
-
class
|
| 101 |
-
#
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
#
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
server_index = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
async def load_state_from_hf():
|
| 120 |
-
"""
|
| 121 |
global state
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
print(f"[{FLOW_ID}] State loaded successfully. Processed files: {len(state.processed_files)}")
|
| 142 |
-
return True
|
| 143 |
-
else:
|
| 144 |
-
print(f"[{FLOW_ID}] State file not found. Initializing with default servers.")
|
| 145 |
-
state.servers = [CaptionServer(url=url) for url in INITIAL_CAPTION_SERVERS]
|
| 146 |
-
return False
|
| 147 |
-
except Exception as e:
|
| 148 |
-
print(f"[{FLOW_ID}] Error loading state: {e}. Initializing with default servers.")
|
| 149 |
-
state.servers = [CaptionServer(url=url) for url in INITIAL_CAPTION_SERVERS]
|
| 150 |
-
return False
|
| 151 |
|
| 152 |
async def save_state_to_hf():
|
| 153 |
-
"""Saves the current
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
async with state_lock:
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
try:
|
| 157 |
-
|
| 158 |
-
data_to_save = state.dict()
|
| 159 |
-
data_to_save['processed_files'] = list(state.processed_files) # Convert set to list for JSON
|
| 160 |
-
|
| 161 |
-
json_content = json.dumps(data_to_save, indent=2, ensure_ascii=False).encode('utf-8')
|
| 162 |
-
|
| 163 |
-
api = HfApi(token=HF_TOKEN)
|
| 164 |
api.upload_file(
|
| 165 |
path_or_fileobj=io.BytesIO(json_content),
|
| 166 |
-
path_in_repo=
|
| 167 |
repo_id=HF_OUTPUT_DATASET_ID,
|
| 168 |
repo_type="dataset",
|
| 169 |
-
commit_message=f"[{FLOW_ID}] Update
|
| 170 |
)
|
| 171 |
print(f"[{FLOW_ID}] State saved successfully.")
|
| 172 |
return True
|
| 173 |
except Exception as e:
|
| 174 |
-
print(f"[{FLOW_ID}] Error saving state: {e}")
|
| 175 |
return False
|
| 176 |
|
| 177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
|
| 179 |
async def get_available_server(timeout: float = 300.0) -> CaptionServer:
|
| 180 |
-
"""Round-robin selection of an available caption server
|
| 181 |
global server_index
|
| 182 |
start_time = time.time()
|
| 183 |
-
|
| 184 |
while True:
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
# Round-robin check for an available server
|
| 191 |
-
for _ in range(len(state.servers)):
|
| 192 |
-
server = state.servers[server_index % len(state.servers)]
|
| 193 |
-
server_index = (server_index + 1) % len(state.servers)
|
| 194 |
-
if not server.busy:
|
| 195 |
-
return server
|
| 196 |
|
| 197 |
-
# If all servers are busy, wait for a short period and check again
|
| 198 |
await asyncio.sleep(0.5)
|
| 199 |
|
| 200 |
-
# Check if timeout has been reached
|
| 201 |
if time.time() - start_time > timeout:
|
| 202 |
raise TimeoutError(f"Timeout ({timeout}s) waiting for an available caption server.")
|
| 203 |
|
|
@@ -207,21 +233,13 @@ async def send_image_for_captioning(image_path: Path, course_name: str, progress
|
|
| 207 |
for attempt in range(MAX_RETRIES):
|
| 208 |
server = None
|
| 209 |
try:
|
| 210 |
-
# 1. Get an available server (will wait if all are busy, with a timeout)
|
| 211 |
server = await get_available_server()
|
| 212 |
-
|
| 213 |
-
async with state_lock:
|
| 214 |
-
# Find the server in the global list and mark it busy
|
| 215 |
-
server_in_state = next(s for s in state.servers if s.url == server.url)
|
| 216 |
-
server_in_state.busy = True
|
| 217 |
-
|
| 218 |
start_time = time.time()
|
| 219 |
|
| 220 |
-
# Print a less verbose message only on the first attempt
|
| 221 |
if attempt == 0:
|
| 222 |
print(f"[{FLOW_ID}] Starting attempt on {image_path.name}...")
|
| 223 |
|
| 224 |
-
# 2. Prepare request data
|
| 225 |
form_data = aiohttp.FormData()
|
| 226 |
form_data.add_field('file',
|
| 227 |
image_path.open('rb'),
|
|
@@ -229,24 +247,21 @@ async def send_image_for_captioning(image_path: Path, course_name: str, progress
|
|
| 229 |
content_type='image/jpeg')
|
| 230 |
form_data.add_field('model_choice', MODEL_TYPE)
|
| 231 |
|
| 232 |
-
# 3. Send request
|
| 233 |
async with aiohttp.ClientSession() as session:
|
| 234 |
-
# Increased timeout to 10 minutes (600s)
|
| 235 |
async with session.post(server.url, data=form_data, timeout=600) as resp:
|
| 236 |
if resp.status == 200:
|
| 237 |
result = await resp.json()
|
| 238 |
caption = result.get("caption")
|
| 239 |
|
| 240 |
if caption:
|
| 241 |
-
# Update progress counter
|
| 242 |
progress_tracker['completed'] += 1
|
|
|
|
|
|
|
|
|
|
| 243 |
if progress_tracker['completed'] % 50 == 0:
|
| 244 |
print(f"[{FLOW_ID}] PROGRESS: {progress_tracker['completed']}/{progress_tracker['total']} captions completed.")
|
| 245 |
|
| 246 |
-
# Log success only if it's not a progress report interval
|
| 247 |
-
if progress_tracker['completed'] % 50 != 0:
|
| 248 |
-
print(f"[{FLOW_ID}] Success: {image_path.name} captioned by {server.url}")
|
| 249 |
-
|
| 250 |
return {
|
| 251 |
"course": course_name,
|
| 252 |
"image_path": image_path.name,
|
|
@@ -255,51 +270,76 @@ async def send_image_for_captioning(image_path: Path, course_name: str, progress
|
|
| 255 |
}
|
| 256 |
else:
|
| 257 |
print(f"[{FLOW_ID}] Server {server.url} returned success but no caption for {image_path.name}. Retrying...")
|
| 258 |
-
continue
|
| 259 |
else:
|
| 260 |
error_text = await resp.text()
|
| 261 |
print(f"[{FLOW_ID}] Error from server {server.url} for {image_path.name}: {resp.status} - {error_text}. Retrying...")
|
| 262 |
-
continue
|
| 263 |
|
| 264 |
-
except (aiohttp.ClientError, asyncio.TimeoutError, TimeoutError
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
continue # Retry with a different server
|
| 268 |
except Exception as e:
|
| 269 |
print(f"[{FLOW_ID}] Unexpected error during captioning for {image_path.name}: {e}. Retrying...")
|
| 270 |
-
continue
|
| 271 |
finally:
|
| 272 |
if server:
|
| 273 |
end_time = time.time()
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
server_in_state = next(s for s in state.servers if s.url == server.url)
|
| 278 |
-
server_in_state.busy = False
|
| 279 |
-
server_in_state.total_processed += 1
|
| 280 |
-
server_in_state.total_time += (end_time - start_time)
|
| 281 |
-
except StopIteration:
|
| 282 |
-
# Server might have been removed while processing
|
| 283 |
-
print(f"[{FLOW_ID}] Warning: Completed task on a server that was likely removed: {server.url}")
|
| 284 |
|
| 285 |
print(f"[{FLOW_ID}] FAILED after {MAX_RETRIES} attempts for {image_path.name}.")
|
| 286 |
return None
|
| 287 |
|
| 288 |
-
async def
|
| 289 |
-
"""
|
| 290 |
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
caption_filename = Path(zip_full_name).with_suffix('.json').name
|
| 295 |
|
| 296 |
try:
|
| 297 |
print(f"[{FLOW_ID}] Uploading {len(captions)} captions for {zip_full_name} as {caption_filename} to {HF_OUTPUT_DATASET_ID}...")
|
| 298 |
|
| 299 |
-
# Create JSON content in memory
|
| 300 |
json_content = json.dumps(captions, indent=2, ensure_ascii=False).encode('utf-8')
|
| 301 |
|
| 302 |
-
api =
|
| 303 |
api.upload_file(
|
| 304 |
path_or_fileobj=io.BytesIO(json_content),
|
| 305 |
path_in_repo=caption_filename,
|
|
@@ -315,225 +355,147 @@ async def upload_captions_to_hf(zip_full_name: str, captions: List[Dict]) -> boo
|
|
| 315 |
print(f"[{FLOW_ID}] Error uploading captions for {zip_full_name}: {e}")
|
| 316 |
return False
|
| 317 |
|
| 318 |
-
async def
|
| 319 |
-
"""
|
| 320 |
-
|
| 321 |
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
# List all files in the frames directory
|
| 326 |
-
repo_files = api.list_repo_files(
|
| 327 |
-
repo_id=HF_DATASET_ID,
|
| 328 |
-
repo_type="dataset"
|
| 329 |
-
)
|
| 330 |
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
if f.startswith(f"frames/{course_name}") and f.endswith('.zip')
|
| 335 |
-
]
|
| 336 |
|
| 337 |
-
if not matching_files:
|
| 338 |
-
print(f"[{FLOW_ID}] No zip files found starting with '{course_name}' in frames/ directory.")
|
| 339 |
-
return None, None, None
|
| 340 |
-
|
| 341 |
async with state_lock:
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
|
|
|
|
|
|
|
|
|
| 348 |
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
print(f"[{FLOW_ID}] Downloaded to {zip_path}. Extracting...")
|
| 364 |
-
|
| 365 |
-
# Create a temporary directory for extraction
|
| 366 |
-
extract_dir = TEMP_DIR / course_name
|
| 367 |
-
extract_dir.mkdir(exist_ok=True)
|
| 368 |
-
|
| 369 |
-
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
| 370 |
-
zip_ref.extractall(extract_dir)
|
| 371 |
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
# Return the extraction directory, the full zip file name, and the repo path
|
| 375 |
-
return extract_dir, zip_full_name, repo_file_full_path
|
| 376 |
-
|
| 377 |
-
except Exception as e:
|
| 378 |
-
print(f"[{FLOW_ID}] Error downloading or extracting zip for {course_name}: {e}")
|
| 379 |
-
return None, None, None
|
| 380 |
-
|
| 381 |
-
async def process_course_task(course_name: str, start_index: int = 0):
|
| 382 |
-
"""Main task to process a single course, looping until all files are processed."""
|
| 383 |
-
print(f"[{FLOW_ID}] Starting continuous processing for course: {course_name} with start index {start_index}")
|
| 384 |
-
|
| 385 |
-
global_success = True
|
| 386 |
-
|
| 387 |
-
# Update state before starting the loop
|
| 388 |
-
async with state_lock:
|
| 389 |
-
state.last_processed_course = course_name
|
| 390 |
-
state.last_processed_index = start_index
|
| 391 |
-
await save_state_to_hf()
|
| 392 |
-
|
| 393 |
-
# Loop to continuously check for new files matching the course_name prefix
|
| 394 |
-
while True:
|
| 395 |
extract_dir = None
|
| 396 |
zip_full_name = None
|
| 397 |
-
|
| 398 |
|
| 399 |
try:
|
| 400 |
-
|
| 401 |
-
download_result = await download_and_extract_zip(course_name)
|
| 402 |
|
| 403 |
-
if download_result is None
|
| 404 |
-
|
| 405 |
-
print(f"[{FLOW_ID}] No new files found for {course_name}. Exiting loop.")
|
| 406 |
-
break
|
| 407 |
|
| 408 |
-
extract_dir, zip_full_name
|
| 409 |
-
|
| 410 |
-
# --- Start Processing the single file ---
|
| 411 |
|
| 412 |
-
#
|
| 413 |
image_paths = [p for p in extract_dir.glob("**/*") if p.is_file() and p.suffix.lower() in ['.jpg', '.jpeg', '.png']]
|
| 414 |
-
|
| 415 |
-
# Apply start_index logic
|
| 416 |
-
if start_index > 0:
|
| 417 |
-
original_count = len(image_paths)
|
| 418 |
-
image_paths = image_paths[start_index:]
|
| 419 |
-
print(f"[{FLOW_ID}] Applying start index {start_index}. Processing {len(image_paths)} images from {original_count} in {zip_full_name}.")
|
| 420 |
-
# Reset start_index for subsequent zip files
|
| 421 |
-
start_index = 0
|
| 422 |
-
else:
|
| 423 |
-
print(f"[{FLOW_ID}] Found {len(image_paths)} images to process in {zip_full_name}.")
|
| 424 |
-
|
| 425 |
-
current_file_success = False
|
| 426 |
|
| 427 |
if not image_paths:
|
| 428 |
-
print(f"[{FLOW_ID}] No images
|
| 429 |
-
|
| 430 |
else:
|
| 431 |
# Initialize progress tracker
|
| 432 |
progress_tracker = {
|
| 433 |
'total': len(image_paths),
|
| 434 |
'completed': 0
|
| 435 |
}
|
| 436 |
-
print(f"[{FLOW_ID}] Starting captioning for {progress_tracker['total']} images in {zip_full_name}...")
|
| 437 |
-
|
| 438 |
-
# Create a semaphore to limit concurrent tasks to the number of available servers
|
| 439 |
async with state_lock:
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
|
|
|
|
|
|
| 443 |
async def limited_send_image_for_captioning(image_path, course_name, progress_tracker):
|
| 444 |
async with semaphore:
|
| 445 |
return await send_image_for_captioning(image_path, course_name, progress_tracker)
|
| 446 |
|
| 447 |
-
# Create a list of tasks for parallel captioning
|
| 448 |
caption_tasks = [limited_send_image_for_captioning(p, course_name, progress_tracker) for p in image_paths]
|
| 449 |
-
|
| 450 |
-
# Run all captioning tasks concurrently
|
| 451 |
results = await asyncio.gather(*caption_tasks)
|
| 452 |
-
|
| 453 |
-
# Filter out failed results
|
| 454 |
all_captions = [r for r in results if r is not None]
|
| 455 |
|
| 456 |
-
# Final progress report
|
| 457 |
if len(all_captions) == len(image_paths):
|
| 458 |
print(f"[{FLOW_ID}] FINAL PROGRESS for {zip_full_name}: Successfully completed all {len(all_captions)} captions.")
|
| 459 |
-
|
| 460 |
else:
|
| 461 |
print(f"[{FLOW_ID}] FINAL PROGRESS for {zip_full_name}: Completed with partial result: {len(all_captions)}/{len(image_paths)} captions.")
|
| 462 |
-
|
| 463 |
|
| 464 |
# Upload results
|
| 465 |
if all_captions and zip_full_name:
|
| 466 |
-
# Use the full zip file name for the upload as requested
|
| 467 |
-
print(f"[{FLOW_ID}] Uploading {len(all_captions)} captions for {zip_full_name}...")
|
| 468 |
if await upload_captions_to_hf(zip_full_name, all_captions):
|
| 469 |
print(f"[{FLOW_ID}] Successfully uploaded captions for {zip_full_name}.")
|
| 470 |
-
# If
|
| 471 |
-
|
| 472 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
else:
|
| 474 |
-
print(f"[{FLOW_ID}] Failed to upload captions for {zip_full_name}.")
|
| 475 |
-
|
| 476 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 477 |
else:
|
| 478 |
-
print(f"[{FLOW_ID}] No captions generated or zip_full_name is missing. Skipping upload for {zip_full_name}.")
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
# Mark the file as processed and save state
|
| 485 |
-
if current_file_success:
|
| 486 |
-
async with state_lock:
|
| 487 |
-
state.processed_files.add(repo_file_full_path)
|
| 488 |
-
await save_state_to_hf()
|
| 489 |
|
| 490 |
except Exception as e:
|
| 491 |
error_message = str(e)
|
| 492 |
-
print(f"[{FLOW_ID}] Critical error in
|
| 493 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
|
| 495 |
finally:
|
| 496 |
-
# Cleanup temporary files
|
| 497 |
if extract_dir and extract_dir.exists():
|
| 498 |
print(f"[{FLOW_ID}] Cleaned up temporary directory {extract_dir}.")
|
| 499 |
shutil.rmtree(extract_dir, ignore_errors=True)
|
| 500 |
-
|
| 501 |
-
# If
|
| 502 |
-
if
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
# --- Final Report after the loop is complete ---
|
| 506 |
-
print(f"[{FLOW_ID}] All processing loops complete for {course_name}.")
|
| 507 |
-
|
| 508 |
-
# Report completion to manager
|
| 509 |
-
final_error_message = error_message if not global_success else None
|
| 510 |
-
await report_completion(course_name, global_success, final_error_message)
|
| 511 |
-
|
| 512 |
-
return global_success
|
| 513 |
-
|
| 514 |
-
async def report_completion(course_name: str, success: bool, error_message: Optional[str] = None):
|
| 515 |
-
"""Reports the task result back to the Manager Server."""
|
| 516 |
-
print(f"[{FLOW_ID}] Reporting completion for {course_name} (Success: {success})...")
|
| 517 |
-
|
| 518 |
-
payload = {
|
| 519 |
-
"flow_id": FLOW_ID,
|
| 520 |
-
"course_name": course_name,
|
| 521 |
-
"success": success,
|
| 522 |
-
"error_message": error_message
|
| 523 |
-
}
|
| 524 |
-
|
| 525 |
-
try:
|
| 526 |
-
async with aiohttp.ClientSession() as session:
|
| 527 |
-
async with session.post(MANAGER_COMPLETE_TASK_URL, json=payload) as resp:
|
| 528 |
-
if resp.status != 200:
|
| 529 |
-
print(f"[{FLOW_ID}] ERROR: Manager reported non-200 status: {resp.status} - {await resp.text()}")
|
| 530 |
-
else:
|
| 531 |
-
print(f"[{FLOW_ID}] Successfully reported completion to Manager.")
|
| 532 |
-
|
| 533 |
-
except aiohttp.ClientError as e:
|
| 534 |
-
print(f"[{FLOW_ID}] CRITICAL ERROR: Could not connect to Manager at {MANAGER_COMPLETE_TASK_URL}. Task completion not reported. Error: {e}")
|
| 535 |
-
except Exception as e:
|
| 536 |
-
print(f"[{FLOW_ID}] Unexpected error during reporting: {e}")
|
| 537 |
|
| 538 |
# --- FastAPI App and Endpoints ---
|
| 539 |
|
|
@@ -543,78 +505,167 @@ app = FastAPI(
|
|
| 543 |
version="2.0.0"
|
| 544 |
)
|
| 545 |
|
|
|
|
|
|
|
|
|
|
| 546 |
@app.on_event("startup")
|
| 547 |
async def startup_event():
|
| 548 |
-
print(f"Flow Server {FLOW_ID} starting up...")
|
| 549 |
-
# Load state first before starting the server
|
| 550 |
-
await load_state_from_hf()
|
| 551 |
print(f"Flow Server {FLOW_ID} started on port {FLOW_PORT}. Manager URL: {MANAGER_URL}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 552 |
|
| 553 |
@app.get("/", response_class=HTMLResponse)
|
| 554 |
-
async def
|
| 555 |
-
"""
|
| 556 |
async with state_lock:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
context = {
|
| 558 |
"request": request,
|
| 559 |
"flow_id": FLOW_ID,
|
| 560 |
-
"status":
|
| 561 |
-
"
|
| 562 |
-
"
|
| 563 |
-
"
|
| 564 |
-
"
|
| 565 |
-
"
|
| 566 |
-
"
|
| 567 |
-
"
|
|
|
|
|
|
|
|
|
|
| 568 |
}
|
| 569 |
-
return templates.TemplateResponse("
|
| 570 |
|
| 571 |
-
@app.post("/
|
| 572 |
-
async def
|
| 573 |
-
"""
|
| 574 |
-
|
| 575 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 576 |
|
| 577 |
async with state_lock:
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
|
| 588 |
-
@app.post("/
|
| 589 |
-
async def
|
| 590 |
-
"""
|
| 591 |
-
|
| 592 |
-
initial_count = len(state.servers)
|
| 593 |
-
state.servers = [s for s in state.servers if s.url != server_url]
|
| 594 |
-
if len(state.servers) == initial_count:
|
| 595 |
-
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Server not found.")
|
| 596 |
|
| 597 |
-
await
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
@app.post("/process_course")
|
| 601 |
-
async def process_course_api(request: ProcessCourseRequest, background_tasks: BackgroundTasks):
|
| 602 |
-
"""
|
| 603 |
-
Receives a course name and optional start index and starts processing in the background.
|
| 604 |
-
"""
|
| 605 |
-
course_name = request.course_name
|
| 606 |
-
start_index = request.start_index
|
| 607 |
|
| 608 |
-
|
| 609 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 610 |
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 617 |
|
| 618 |
if __name__ == "__main__":
|
| 619 |
-
# Note: When running in the sandbox, we need to use 0.0.0.0 to expose the port.
|
| 620 |
uvicorn.run(app, host="0.0.0.0", port=FLOW_PORT)
|
|
|
|
| 4 |
import asyncio
|
| 5 |
import aiohttp
|
| 6 |
import zipfile
|
| 7 |
+
import io
|
| 8 |
import shutil
|
| 9 |
from typing import Dict, List, Set, Optional, Any
|
| 10 |
from urllib.parse import quote
|
| 11 |
from datetime import datetime
|
| 12 |
from pathlib import Path
|
|
|
|
| 13 |
|
| 14 |
+
from fastapi import FastAPI, BackgroundTasks, HTTPException, status, Request
|
| 15 |
from fastapi.responses import HTMLResponse
|
| 16 |
from fastapi.templating import Jinja2Templates
|
| 17 |
from pydantic import BaseModel, Field
|
|
|
|
| 19 |
import uvicorn
|
| 20 |
|
| 21 |
# --- Configuration ---
|
|
|
|
| 22 |
FLOW_ID = os.getenv("FLOW_ID", "flow_default")
|
| 23 |
+
FLOW_PORT = int(os.getenv("FLOW_PORT", 8001))
|
|
|
|
|
|
|
| 24 |
MANAGER_URL = os.getenv("MANAGER_URL", "https://fred808-fcord.hf.space")
|
| 25 |
MANAGER_COMPLETE_TASK_URL = f"{MANAGER_URL}/task/complete"
|
| 26 |
+
HF_TOKEN = os.getenv("HF_TOKEN", "")
|
|
|
|
|
|
|
| 27 |
HF_DATASET_ID = os.getenv("HF_DATASET_ID", "Fred808/BG3")
|
| 28 |
+
HF_OUTPUT_DATASET_ID = os.getenv("HF_OUTPUT_DATASET_ID", "fred808/helium")
|
| 29 |
+
STATE_FILE_NAME = f"{FLOW_ID}_state.json"
|
| 30 |
|
| 31 |
+
# Using the full list from the user's original code for actual deployment
|
| 32 |
+
CAPTION_SERVERS = [
|
| 33 |
"https://fred808-pil-4-1.hf.space/analyze",
|
| 34 |
"https://fred808-pil-4-2.hf.space/analyze",
|
| 35 |
"https://fred808-pil-4-3.hf.space/analyze",
|
|
|
|
| 74 |
TEMP_DIR = Path(f"temp_images_{FLOW_ID}")
|
| 75 |
TEMP_DIR.mkdir(exist_ok=True)
|
| 76 |
|
|
|
|
|
|
|
|
|
|
| 77 |
# --- Models ---
|
| 78 |
class ProcessCourseRequest(BaseModel):
|
| 79 |
course_name: Optional[str] = None
|
|
|
|
| 80 |
|
| 81 |
+
class CaptionServer:
|
| 82 |
+
def __init__(self, url):
|
| 83 |
+
self.url = url
|
| 84 |
+
self.busy = False
|
| 85 |
+
self.total_processed = 0
|
| 86 |
+
self.total_time = 0
|
| 87 |
+
self.model = MODEL_TYPE
|
| 88 |
|
| 89 |
@property
|
| 90 |
def fps(self):
|
| 91 |
return self.total_processed / self.total_time if self.total_time > 0 else 0
|
| 92 |
|
| 93 |
+
class ServerState(BaseModel):
|
| 94 |
+
# The list of all zip files in the dataset (frames/ directory)
|
| 95 |
+
all_zip_files: List[str] = Field(default_factory=list)
|
| 96 |
+
# The set of zip files that have been successfully processed and uploaded
|
| 97 |
+
processed_files: Set[str] = Field(default_factory=set)
|
| 98 |
+
# The index in all_zip_files from which the next download should start
|
| 99 |
+
current_index: int = 0
|
| 100 |
+
# Total number of files to process
|
| 101 |
+
total_files: int = 0
|
| 102 |
+
# Status of the current operation
|
| 103 |
+
status: str = "Idle"
|
| 104 |
+
# Name of the file currently being processed
|
| 105 |
+
current_file: Optional[str] = None
|
| 106 |
+
# Progress within the current file
|
| 107 |
+
current_file_progress: str = "0/0"
|
| 108 |
+
# Timestamp of the last update
|
| 109 |
+
last_update: str = datetime.now().isoformat()
|
| 110 |
+
# Flag to control the processing loop
|
| 111 |
+
is_running: bool = False
|
| 112 |
+
|
| 113 |
+
# Global state for caption servers and the overall server state
|
| 114 |
+
servers = [CaptionServer(url) for url in CAPTION_SERVERS]
|
| 115 |
server_index = 0
|
| 116 |
+
state = ServerState()
|
| 117 |
+
# Lock for thread-safe access to the global state
|
| 118 |
+
state_lock = asyncio.Lock()
|
| 119 |
+
|
| 120 |
+
# --- Persistence Functions ---
|
| 121 |
|
| 122 |
+
def get_hf_api():
|
| 123 |
+
"""Helper to get HfApi instance."""
|
| 124 |
+
return HfApi(token=HF_TOKEN)
|
| 125 |
+
|
| 126 |
+
def get_hf_fs():
|
| 127 |
+
"""Helper to get HfFileSystem instance."""
|
| 128 |
+
return HfFileSystem(token=HF_TOKEN)
|
| 129 |
|
| 130 |
async def load_state_from_hf():
|
| 131 |
+
"""Loads the state from the Hugging Face output dataset."""
|
| 132 |
global state
|
| 133 |
+
fs = get_hf_fs()
|
| 134 |
+
state_path = f"{HF_OUTPUT_DATASET_ID}/{STATE_FILE_NAME}"
|
| 135 |
+
|
| 136 |
+
async with state_lock:
|
| 137 |
+
try:
|
| 138 |
+
if fs.exists(state_path):
|
| 139 |
+
print(f"[{FLOW_ID}] Loading state from {state_path}...")
|
| 140 |
+
with fs.open(state_path, 'rb') as f:
|
| 141 |
+
data = json.load(f)
|
| 142 |
+
# Convert list of processed files back to a set
|
| 143 |
+
if 'processed_files' in data and isinstance(data['processed_files'], list):
|
| 144 |
+
data['processed_files'] = set(data['processed_files'])
|
| 145 |
+
state = ServerState.parse_obj(data)
|
| 146 |
+
print(f"[{FLOW_ID}] State loaded successfully. Current index: {state.current_index}")
|
| 147 |
+
else:
|
| 148 |
+
print(f"[{FLOW_ID}] State file {state_path} not found. Starting with default state.")
|
| 149 |
+
except Exception as e:
|
| 150 |
+
print(f"[{FLOW_ID}] Error loading state from HF: {e}. Starting with default state.")
|
| 151 |
+
state = ServerState()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
async def save_state_to_hf():
|
| 154 |
+
"""Saves the current state to the Hugging Face output dataset."""
|
| 155 |
+
global state
|
| 156 |
+
api = get_hf_api()
|
| 157 |
+
state_path = STATE_FILE_NAME
|
| 158 |
+
|
| 159 |
async with state_lock:
|
| 160 |
+
state.last_update = datetime.now().isoformat()
|
| 161 |
+
# Convert set of processed files to a list for JSON serialization
|
| 162 |
+
data_to_save = state.dict()
|
| 163 |
+
data_to_save['processed_files'] = list(state.processed_files)
|
| 164 |
+
|
| 165 |
+
json_content = json.dumps(data_to_save, indent=2, ensure_ascii=False).encode('utf-8')
|
| 166 |
+
|
| 167 |
try:
|
| 168 |
+
print(f"[{FLOW_ID}] Saving state to {state_path} in {HF_OUTPUT_DATASET_ID}...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
api.upload_file(
|
| 170 |
path_or_fileobj=io.BytesIO(json_content),
|
| 171 |
+
path_in_repo=state_path,
|
| 172 |
repo_id=HF_OUTPUT_DATASET_ID,
|
| 173 |
repo_type="dataset",
|
| 174 |
+
commit_message=f"[{FLOW_ID}] Update server state. Index: {state.current_index}"
|
| 175 |
)
|
| 176 |
print(f"[{FLOW_ID}] State saved successfully.")
|
| 177 |
return True
|
| 178 |
except Exception as e:
|
| 179 |
+
print(f"[{FLOW_ID}] Error saving state to HF: {e}")
|
| 180 |
return False
|
| 181 |
|
| 182 |
+
async def update_file_list():
|
| 183 |
+
"""Fetches the list of all zip files from the BG3 dataset."""
|
| 184 |
+
global state
|
| 185 |
+
api = get_hf_api()
|
| 186 |
+
|
| 187 |
+
async with state_lock:
|
| 188 |
+
try:
|
| 189 |
+
state.status = "Updating file list..."
|
| 190 |
+
print(f"[{FLOW_ID}] Fetching file list from {HF_DATASET_ID}...")
|
| 191 |
+
repo_files = api.list_repo_files(
|
| 192 |
+
repo_id=HF_DATASET_ID,
|
| 193 |
+
repo_type="dataset"
|
| 194 |
+
)
|
| 195 |
+
|
| 196 |
+
# Filter for zip files in the 'frames/' directory
|
| 197 |
+
zip_files = sorted([
|
| 198 |
+
f for f in repo_files
|
| 199 |
+
if f.startswith("frames/") and f.endswith('.zip')
|
| 200 |
+
])
|
| 201 |
+
|
| 202 |
+
state.all_zip_files = zip_files
|
| 203 |
+
state.total_files = len(zip_files)
|
| 204 |
+
state.status = "File list updated."
|
| 205 |
+
print(f"[{FLOW_ID}] Found {state.total_files} zip files.")
|
| 206 |
+
except Exception as e:
|
| 207 |
+
state.status = f"Error updating file list: {e}"
|
| 208 |
+
print(f"[{FLOW_ID}] Error updating file list: {e}")
|
| 209 |
+
|
| 210 |
+
await save_state_to_hf()
|
| 211 |
+
|
| 212 |
+
# --- Core Processing Functions (Modified) ---
|
| 213 |
|
| 214 |
async def get_available_server(timeout: float = 300.0) -> CaptionServer:
|
| 215 |
+
"""Round-robin selection of an available caption server."""
|
| 216 |
global server_index
|
| 217 |
start_time = time.time()
|
|
|
|
| 218 |
while True:
|
| 219 |
+
for _ in range(len(servers)):
|
| 220 |
+
server = servers[server_index]
|
| 221 |
+
server_index = (server_index + 1) % len(servers)
|
| 222 |
+
if not server.busy:
|
| 223 |
+
return server
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
|
|
|
| 225 |
await asyncio.sleep(0.5)
|
| 226 |
|
|
|
|
| 227 |
if time.time() - start_time > timeout:
|
| 228 |
raise TimeoutError(f"Timeout ({timeout}s) waiting for an available caption server.")
|
| 229 |
|
|
|
|
| 233 |
for attempt in range(MAX_RETRIES):
|
| 234 |
server = None
|
| 235 |
try:
|
|
|
|
| 236 |
server = await get_available_server()
|
| 237 |
+
server.busy = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
start_time = time.time()
|
| 239 |
|
|
|
|
| 240 |
if attempt == 0:
|
| 241 |
print(f"[{FLOW_ID}] Starting attempt on {image_path.name}...")
|
| 242 |
|
|
|
|
| 243 |
form_data = aiohttp.FormData()
|
| 244 |
form_data.add_field('file',
|
| 245 |
image_path.open('rb'),
|
|
|
|
| 247 |
content_type='image/jpeg')
|
| 248 |
form_data.add_field('model_choice', MODEL_TYPE)
|
| 249 |
|
|
|
|
| 250 |
async with aiohttp.ClientSession() as session:
|
|
|
|
| 251 |
async with session.post(server.url, data=form_data, timeout=600) as resp:
|
| 252 |
if resp.status == 200:
|
| 253 |
result = await resp.json()
|
| 254 |
caption = result.get("caption")
|
| 255 |
|
| 256 |
if caption:
|
| 257 |
+
# Update progress counter and global state
|
| 258 |
progress_tracker['completed'] += 1
|
| 259 |
+
async with state_lock:
|
| 260 |
+
state.current_file_progress = f"{progress_tracker['completed']}/{progress_tracker['total']}"
|
| 261 |
+
|
| 262 |
if progress_tracker['completed'] % 50 == 0:
|
| 263 |
print(f"[{FLOW_ID}] PROGRESS: {progress_tracker['completed']}/{progress_tracker['total']} captions completed.")
|
| 264 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
return {
|
| 266 |
"course": course_name,
|
| 267 |
"image_path": image_path.name,
|
|
|
|
| 270 |
}
|
| 271 |
else:
|
| 272 |
print(f"[{FLOW_ID}] Server {server.url} returned success but no caption for {image_path.name}. Retrying...")
|
| 273 |
+
continue
|
| 274 |
else:
|
| 275 |
error_text = await resp.text()
|
| 276 |
print(f"[{FLOW_ID}] Error from server {server.url} for {image_path.name}: {resp.status} - {error_text}. Retrying...")
|
| 277 |
+
continue
|
| 278 |
|
| 279 |
+
except (aiohttp.ClientError, asyncio.TimeoutError, TimeoutError) as e:
|
| 280 |
+
print(f"[{FLOW_ID}] Connection/Timeout error for {image_path.name} on {server.url if server else 'unknown server'}: {e}. Retrying...")
|
| 281 |
+
continue
|
|
|
|
| 282 |
except Exception as e:
|
| 283 |
print(f"[{FLOW_ID}] Unexpected error during captioning for {image_path.name}: {e}. Retrying...")
|
| 284 |
+
continue
|
| 285 |
finally:
|
| 286 |
if server:
|
| 287 |
end_time = time.time()
|
| 288 |
+
server.busy = False
|
| 289 |
+
server.total_processed += 1
|
| 290 |
+
server.total_time += (end_time - start_time)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
|
| 292 |
print(f"[{FLOW_ID}] FAILED after {MAX_RETRIES} attempts for {image_path.name}.")
|
| 293 |
return None
|
| 294 |
|
| 295 |
+
async def download_and_extract_zip(repo_file_full_path: str) -> Optional[tuple[Path, str]]:
    """Download one zip file from the input dataset and extract its contents.

    Args:
        repo_file_full_path: Full path of the zip file inside the HF input
            dataset repository.

    Returns:
        A ``(extract_dir, zip_full_name)`` tuple on success, or ``None`` when
        the download or extraction failed (the error is logged, not raised).
    """
    zip_full_name = Path(repo_file_full_path).name
    # Course name is assumed to be the prefix before the first underscore.
    course_name = zip_full_name.split('_')[0]

    try:
        print(f"[{FLOW_ID}] Downloading file: {repo_file_full_path}. Full name: {zip_full_name}")

        # hf_hub_download is blocking network/disk I/O; run it in a worker
        # thread so the event loop (caption requests, status endpoints)
        # stays responsive while a large zip downloads.
        zip_path = await asyncio.to_thread(
            hf_hub_download,
            repo_id=HF_DATASET_ID,
            filename=repo_file_full_path,  # full path inside the repo
            repo_type="dataset",
            token=HF_TOKEN,
        )

        print(f"[{FLOW_ID}] Downloaded to {zip_path}. Extracting...")

        # Create a dedicated extraction directory under the temp root.
        extract_dir = TEMP_DIR / course_name / zip_full_name.replace('.', '_')
        extract_dir.mkdir(parents=True, exist_ok=True)

        def _extract() -> None:
            # Blocking unzip, executed off the event loop as well.
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)

        await asyncio.to_thread(_extract)

        print(f"[{FLOW_ID}] Extraction complete to {extract_dir}.")

        # Clean up the downloaded zip file to save disk space.
        # NOTE(review): hf_hub_download returns a path inside the HF cache;
        # deleting it defeats caching but matches the original behaviour.
        os.remove(zip_path)

        # Return the extraction directory and the full zip file name.
        return extract_dir, zip_full_name

    except Exception as e:
        print(f"[{FLOW_ID}] Error downloading or extracting zip for {repo_file_full_path}: {e}")
        return None
|
| 332 |
+
|
| 333 |
+
async def upload_captions_to_hf(zip_full_name: str, captions: List[Dict]) -> bool:
|
| 334 |
+
"""Uploads the final captions JSON file to the output dataset."""
|
| 335 |
caption_filename = Path(zip_full_name).with_suffix('.json').name
|
| 336 |
|
| 337 |
try:
|
| 338 |
print(f"[{FLOW_ID}] Uploading {len(captions)} captions for {zip_full_name} as {caption_filename} to {HF_OUTPUT_DATASET_ID}...")
|
| 339 |
|
|
|
|
| 340 |
json_content = json.dumps(captions, indent=2, ensure_ascii=False).encode('utf-8')
|
| 341 |
|
| 342 |
+
api = get_hf_api()
|
| 343 |
api.upload_file(
|
| 344 |
path_or_fileobj=io.BytesIO(json_content),
|
| 345 |
path_in_repo=caption_filename,
|
|
|
|
| 355 |
print(f"[{FLOW_ID}] Error uploading captions for {zip_full_name}: {e}")
|
| 356 |
return False
|
| 357 |
|
| 358 |
+
async def _mark_current_file_processed(repo_file_full_path: str) -> None:
    """Record the file as done, advance the queue index and persist state."""
    async with state_lock:
        state.processed_files.add(repo_file_full_path)
        state.current_index += 1
        state.current_file = None
        state.current_file_progress = "0/0"
        state.status = "Idle"
        await save_state_to_hf()


async def process_next_file_task():
    """Continuously process zip files from the dataset, one at a time.

    Walks ``state.all_zip_files`` starting at ``state.current_index``: each
    zip is downloaded and extracted, every image inside is captioned via the
    caption servers, the resulting JSON is uploaded to the output dataset,
    and the index advances.  Files that fail to caption or upload are NOT
    advanced past; the loop retries them after a 60s back-off.
    """
    global state

    if not state.is_running:
        print(f"[{FLOW_ID}] Processing loop is not running. Exiting task.")
        return

    while state.is_running:
        repo_file_full_path = None
        current_index = -1

        async with state_lock:
            current_index = state.current_index
            if current_index >= state.total_files:
                state.status = "Finished processing all files."
                state.is_running = False
                print(f"[{FLOW_ID}] Reached end of file list. Stopping processing.")
                await save_state_to_hf()
                break

            repo_file_full_path = state.all_zip_files[current_index]

            if repo_file_full_path in state.processed_files:
                state.current_index += 1
                state.status = f"Skipping processed file: {Path(repo_file_full_path).name}"
                state.current_file = Path(repo_file_full_path).name
                print(f"[{FLOW_ID}] Skipping already processed file: {repo_file_full_path}")
                await save_state_to_hf()
                continue

            # Mark the file as in-progress in the state.
            state.status = f"Processing file {current_index + 1}/{state.total_files}"
            state.current_file = Path(repo_file_full_path).name
            state.current_file_progress = "0/0"
            await save_state_to_hf()

        # --- Start Processing ---
        extract_dir = None
        zip_full_name = None
        global_success = False
        # BUGFIX: always define all_captions.  Previously, a zip with no
        # images left it undefined, so the upload check below raised
        # NameError and the loop never advanced past an empty zip.
        all_captions: List[Dict] = []

        try:
            download_result = await download_and_extract_zip(repo_file_full_path)

            if download_result is None:
                raise Exception("Failed to download or extract zip file.")

            extract_dir, zip_full_name = download_result
            course_name = zip_full_name.split('_')[0]

            # Find images anywhere under the extraction directory.
            image_paths = [p for p in extract_dir.glob("**/*") if p.is_file() and p.suffix.lower() in ['.jpg', '.jpeg', '.png']]
            print(f"[{FLOW_ID}] Found {len(image_paths)} images to process in {zip_full_name}.")

            if not image_paths:
                print(f"[{FLOW_ID}] No images found in {zip_full_name}. Marking as complete.")
                global_success = True
            else:
                # Shared mutable counter the caption coroutines update.
                progress_tracker = {
                    'total': len(image_paths),
                    'completed': 0
                }
                async with state_lock:
                    state.current_file_progress = f"0/{len(image_paths)}"
                    await save_state_to_hf()

                # Bound concurrency to the number of caption servers.
                semaphore = asyncio.Semaphore(len(servers))
                async def limited_send_image_for_captioning(image_path, course_name, progress_tracker):
                    async with semaphore:
                        return await send_image_for_captioning(image_path, course_name, progress_tracker)

                caption_tasks = [limited_send_image_for_captioning(p, course_name, progress_tracker) for p in image_paths]
                results = await asyncio.gather(*caption_tasks)
                all_captions = [r for r in results if r is not None]

                # Final progress report.
                if len(all_captions) == len(image_paths):
                    print(f"[{FLOW_ID}] FINAL PROGRESS for {zip_full_name}: Successfully completed all {len(all_captions)} captions.")
                    global_success = True
                else:
                    print(f"[{FLOW_ID}] FINAL PROGRESS for {zip_full_name}: Completed with partial result: {len(all_captions)}/{len(image_paths)} captions.")
                    global_success = False

            # Upload results.
            if all_captions and zip_full_name:
                if await upload_captions_to_hf(zip_full_name, all_captions):
                    print(f"[{FLOW_ID}] Successfully uploaded captions for {zip_full_name}.")
                    # If upload succeeded we mark the file as processed,
                    # regardless of partial success; the uploaded JSON
                    # reflects the actual number of captions.
                    if global_success:
                        print(f"[{FLOW_ID}] Fully processed and uploaded: {zip_full_name}")
                    else:
                        print(f"[{FLOW_ID}] Partially processed but uploaded: {zip_full_name}. Needs manual review.")

                    # Mark as processed only because upload succeeded.
                    await _mark_current_file_processed(repo_file_full_path)
                else:
                    print(f"[{FLOW_ID}] Failed to upload captions for {zip_full_name}. Will retry this file later.")
                    # Do NOT increment the index or mark as processed, so it
                    # will be retried.
                    async with state_lock:
                        state.status = f"Error uploading captions for {zip_full_name}. Retrying later."
                        await save_state_to_hf()
                    # Wait before retrying to avoid immediate re-attempt on a
                    # transient error.
                    await asyncio.sleep(60)
            elif global_success:
                # BUGFIX: an empty zip has nothing to upload but is complete;
                # previously the index never advanced here, so the loop
                # re-downloaded the same empty zip forever.
                await _mark_current_file_processed(repo_file_full_path)
            else:
                print(f"[{FLOW_ID}] No captions generated or zip_full_name is missing. Skipping upload for {zip_full_name}. Will retry later.")
                # Do NOT increment the index or mark as processed.
                async with state_lock:
                    state.status = f"No captions generated for {zip_full_name}. Retrying later."
                    await save_state_to_hf()
                await asyncio.sleep(60)

        except Exception as e:
            error_message = str(e)
            print(f"[{FLOW_ID}] Critical error in process_next_file_task for {repo_file_full_path}: {error_message}")
            async with state_lock:
                state.status = f"CRITICAL ERROR for {Path(repo_file_full_path).name}. Retrying later. Error: {error_message[:50]}..."
                await save_state_to_hf()
            # Wait before retrying.
            await asyncio.sleep(60)

        finally:
            # Cleanup temporary files.
            if extract_dir and extract_dir.exists():
                print(f"[{FLOW_ID}] Cleaned up temporary directory {extract_dir}.")
                shutil.rmtree(extract_dir, ignore_errors=True)

            # If the loop is still running, wait a short time before checking
            # for the next file.
            if state.is_running:
                await asyncio.sleep(5)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 499 |
|
| 500 |
# --- FastAPI App and Endpoints ---
|
| 501 |
|
|
|
|
| 505 |
version="2.0.0"
|
| 506 |
)
|
| 507 |
|
| 508 |
+
# Setup Jinja2 templates for the UI
|
| 509 |
+
templates = Jinja2Templates(directory="templates")
|
| 510 |
+
|
| 511 |
@app.on_event("startup")
async def startup_event():
    """Restore persisted state on boot and launch the processing loop."""
    print(f"Flow Server {FLOW_ID} started on port {FLOW_PORT}. Manager URL: {MANAGER_URL}")
    # 1. Load state from persistence (HF)
    await load_state_from_hf()
    # 2. Update the list of all files from the dataset
    await update_file_list()
    # 3. Start the continuous processing task if the index is valid
    if state.current_index < state.total_files:
        state.is_running = True
        # BUGFIX: BackgroundTasks().add_task(...) on a freshly constructed
        # object never runs -- FastAPI only executes BackgroundTasks that are
        # attached to a response.  Schedule the loop on the event loop
        # directly instead.
        asyncio.create_task(process_next_file_task())
    else:
        state.is_running = False
        print(f"[{FLOW_ID}] Index {state.current_index} is out of bounds. Starting in Idle mode.")
|
| 525 |
+
|
| 526 |
|
| 527 |
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Render the status-and-controls dashboard page."""
    async with state_lock:
        done = len(state.processed_files)

        # Per-server rows for the dashboard table.
        per_server = []
        for srv in servers:
            per_server.append({
                "url": srv.url,
                "busy": srv.busy,
                "processed": srv.total_processed,
                "fps": f"{srv.fps:.2f}",
            })

        # Aggregate throughput across every caption server.
        images_done = sum(srv.total_processed for srv in servers)
        elapsed = sum(srv.total_time for srv in servers)
        throughput = images_done / elapsed if elapsed > 0 else 0

        context = {
            "request": request,
            "flow_id": FLOW_ID,
            "status": state.status,
            "is_running": state.is_running,
            "total_files": state.total_files,
            "processed_count": done,
            "remaining_count": state.total_files - done,
            "current_index": state.current_index,
            "current_file": state.current_file if state.current_file else "N/A",
            "current_file_progress": state.current_file_progress,
            "last_update": state.last_update,
            "overall_fps": f"{throughput:.2f}",
            "server_stats": per_server,
        }
    return templates.TemplateResponse("index.html", context)
|
| 565 |
|
| 566 |
+
@app.post("/set_index")
async def set_index(request: Request, background_tasks: BackgroundTasks):
    """Manually reposition the processing queue at a given file index."""
    global state

    submitted = await request.form()
    raw_value = submitted.get("start_index")
    try:
        target = int(raw_value)
    except (TypeError, ValueError):
        raise HTTPException(status_code=400, detail="Invalid index value.")

    async with state_lock:
        if 0 <= target < state.total_files:
            state.current_index = target
            state.status = f"Index set to {target}. Restarting processing."

            # Kick the processing loop back into gear if it was idle.
            if not state.is_running:
                state.is_running = True
                background_tasks.add_task(process_next_file_task)

            await save_state_to_hf()
            print(f"[{FLOW_ID}] Index manually set to {target}.")
            return {"status": "success", "message": f"Start index set to {target}. Processing will resume from this point."}

        if target == state.total_files:
            # One past the last file means "everything is done".
            state.current_index = target
            state.is_running = False
            state.status = "Finished processing all files."
            await save_state_to_hf()
            return {"status": "success", "message": "Index set to end of list. Processing stopped."}

        raise HTTPException(status_code=400, detail=f"Index {target} is out of bounds (0 to {state.total_files}).")
|
| 598 |
|
| 599 |
+
@app.post("/control_processing")
async def control_processing(request: Request, background_tasks: BackgroundTasks):
    """Start or stop the processing loop via the "action" form field."""
    global state

    payload = await request.form()
    command = payload.get("action")

    async with state_lock:
        if command == "start":
            startable = (not state.is_running) and state.current_index < state.total_files
            if startable:
                state.is_running = True
                state.status = "Processing started."
                background_tasks.add_task(process_next_file_task)
                await save_state_to_hf()
                return {"status": "success", "message": "Processing loop started."}
            if state.current_index >= state.total_files:
                return {"status": "error", "message": "Cannot start. All files have been processed."}
            return {"status": "info", "message": "Processing is already running."}

        if command == "stop":
            if not state.is_running:
                return {"status": "info", "message": "Processing is already stopped."}
            state.is_running = False
            state.status = "Processing stopped by user."
            await save_state_to_hf()
            return {"status": "success", "message": "Processing loop stopped."}

        raise HTTPException(status_code=400, detail="Invalid action.")
|
| 629 |
+
|
| 630 |
+
@app.get("/status")
async def get_status():
    """Return the current server status as a JSON payload."""
    async with state_lock:
        done = len(state.processed_files)

        stats = [
            {
                "url": srv.url,
                "busy": srv.busy,
                "processed": srv.total_processed,
                "fps": f"{srv.fps:.2f}",
            }
            for srv in servers
        ]

        frames = sum(srv.total_processed for srv in servers)
        seconds = sum(srv.total_time for srv in servers)
        throughput = frames / seconds if seconds > 0 else 0

        return {
            "flow_id": FLOW_ID,
            "status": state.status,
            "is_running": state.is_running,
            "total_files": state.total_files,
            "processed_count": done,
            "remaining_count": state.total_files - done,
            "current_index": state.current_index,
            "current_file": state.current_file,
            "current_file_progress": state.current_file_progress,
            "last_update": state.last_update,
            "overall_fps": f"{throughput:.2f}",
            "server_stats": stats,
        }
|
| 663 |
+
|
| 664 |
+
# The original /process_course endpoint is now obsolete as the server manages its own queue
|
| 665 |
+
# @app.post("/process_course")
|
| 666 |
+
# async def process_course(request: ProcessCourseRequest, background_tasks: BackgroundTasks):
|
| 667 |
+
# return {"status": "obsolete", "message": "The server now manages its own processing queue based on the index."}
|
| 668 |
+
|
| 669 |
|
| 670 |
if __name__ == "__main__":
    # Run the FastAPI app directly; FLOW_PORT comes from the environment
    # (defaults to 8001 per the configuration block at the top of the file).
    uvicorn.run(app, host="0.0.0.0", port=FLOW_PORT)
|