Update app.py
app.py CHANGED

Old version (removed lines are marked with -):

@@ -4,66 +4,35 @@ import time
 import asyncio
 import aiohttp
 import zipfile
-import io
 import shutil
-from typing import Dict, List, Set, Optional,
 from urllib.parse import quote
 from datetime import datetime
 from pathlib import Path

-from fastapi import FastAPI, BackgroundTasks, HTTPException, status
-from fastapi.responses import HTMLResponse
-from fastapi.templating import Jinja2Templates
 from pydantic import BaseModel, Field
-from huggingface_hub import HfApi, hf_hub_download
-import uvicorn

 # --- Configuration ---
 FLOW_ID = os.getenv("FLOW_ID", "flow_default")
 FLOW_PORT = int(os.getenv("FLOW_PORT", 8001))
-MANAGER_URL = os.getenv("MANAGER_URL", "https://fred808-fcord.hf.space")
-MANAGER_COMPLETE_TASK_URL = f"{MANAGER_URL}/task/complete"
 HF_TOKEN = os.getenv("HF_TOKEN", "")
-HF_DATASET_ID = os.getenv("HF_DATASET_ID", "Fred808/BG3")
-HF_OUTPUT_DATASET_ID = os.getenv("HF_OUTPUT_DATASET_ID", "fred808/helium")

 # Using the full list from the user's original code for actual deployment
 CAPTION_SERVERS = [
     "https://fred808-pil-4-1.hf.space/analyze",
     "https://fred808-pil-4-2.hf.space/analyze",
-    "https://fred1012-fred1012-gw0j2h.hf.space/analyze",
-    "https://fred1012-fred1012-wqs6c2.hf.space/analyze",
-    "https://fred1012-fred1012-oncray.hf.space/analyze",
-    "https://fred1012-fred1012-4goge7.hf.space/analyze",
-    "https://fred1012-fred1012-z0eh7m.hf.space/analyze",
-    "https://fred1012-fred1012-u95rte.hf.space/analyze",
-    "https://fred1012-fred1012-igje22.hf.space/analyze",
-    "https://fred1012-fred1012-ibkuf8.hf.space/analyze",
-    "https://fred1012-fred1012-nwqthy.hf.space/analyze",
-    "https://fred1012-fred1012-4ldqj4.hf.space/analyze",
-    "https://fred1012-fred1012-pivlzg.hf.space/analyze",
-    "https://fred1012-fred1012-ptlc5u.hf.space/analyze",
-    "https://fred1012-fred1012-u7lh57.hf.space/analyze",
-    "https://fred1012-fred1012-q8djv1.hf.space/analyze",
-    "https://fredalone-fredalone-ozugrp.hf.space/analyze",
-    "https://fredalone-fredalone-9brxj2.hf.space/analyze",
-    "https://fredalone-fredalone-p8vq9a.hf.space/analyze",
-    "https://fredalone-fredalone-vbli2y.hf.space/analyze",
-    "https://fredalone-fredalone-uggger.hf.space/analyze",
-    "https://fredalone-fredalone-nmi7e8.hf.space/analyze",
-    "https://fredalone-fredalone-d1f26d.hf.space/analyze",
-    "https://fredalone-fredalone-461jp2.hf.space/analyze",
-    "https://fredalone-fredalone-3enfg4.hf.space/analyze",
-    "https://fredalone-fredalone-dqdbpv.hf.space/analyze",
-    "https://fredalone-fredalone-ivtjua.hf.space/analyze",
-    "https://fredalone-fredalone-6bezt2.hf.space/analyze",
-    "https://fredalone-fredalone-e0wfnk.hf.space/analyze",
-    "https://fredalone-fredalone-zu2t7j.hf.space/analyze",
-    "https://fredalone-fredalone-dqtv1o.hf.space/analyze",
-    "https://fredalone-fredalone-wclyog.hf.space/analyze",
-    "https://fredalone-fredalone-t27vig.hf.space/analyze",
     "https://fredalone-fredalone-gahbxh.hf.space/analyze",
     "https://fredalone-fredalone-kw2po4.hf.space/analyze",
     "https://fredalone-fredalone-8h285h.hf.space/analyze"
@@ -75,8 +44,8 @@ TEMP_DIR = Path(f"temp_images_{FLOW_ID}")
 TEMP_DIR.mkdir(exist_ok=True)

 # --- Models ---
-class

 class CaptionServer:
     def __init__(self, url):
@@ -90,116 +59,144 @@ class CaptionServer:
     def fps(self):
         return self.total_processed / self.total_time if self.total_time > 0 else 0

-    all_zip_files: List[str] = Field(default_factory=list)
-    processed_files: Set[str] = Field(default_factory=set)
-    current_index: int = 0
-    total_files: int = 0
-    status: str = "Idle"
-    current_file: Optional[str] = None
-    current_file_progress: str = "0/0"
-    last_update: str = datetime.now().isoformat()
-    is_running: bool = False
-
-# Global state for caption servers and the overall server state
 servers = [CaptionServer(url) for url in CAPTION_SERVERS]
 server_index = 0
-state = ServerState()
-# Lock for thread-safe access to the global state
-state_lock = asyncio.Lock()

-# ---

-def
-    """
-    """Helper to get HfFileSystem instance."""
-    return HfFileSystem(token=HF_TOKEN)

-async def
-    """
-        if fs.exists(state_path):
-            print(f"[{FLOW_ID}] Loading state from {state_path}...")
-            with fs.open(state_path, 'rb') as f:
-                data = json.load(f)
-            # Convert list of processed files back to a set
-            if 'processed_files' in data and isinstance(data['processed_files'], list):
-                data['processed_files'] = set(data['processed_files'])
-            state = ServerState.parse_obj(data)
-            print(f"[{FLOW_ID}] State loaded successfully. Current index: {state.current_index}")
-        else:
-            print(f"[{FLOW_ID}] State file {state_path} not found. Starting with default state.")
-    except Exception as e:
-        print(f"[{FLOW_ID}] Error loading state from HF: {e}. Starting with default state.")
-        state = ServerState()

-    state.last_update = datetime.now().isoformat()
-    # Convert set of processed files to a list for JSON serialization
-    data_to_save = state.dict()
-    data_to_save['processed_files'] = list(state.processed_files)

-async def
-    """
-    global state
-    api = get_hf_api()

 # --- Core Processing Functions (Modified) ---

@@ -208,30 +205,37 @@ async def get_available_server(timeout: float = 300.0) -> CaptionServer:
     global server_index
     start_time = time.time()
     while True:
         for _ in range(len(servers)):
             server = servers[server_index]
             server_index = (server_index + 1) % len(servers)
             if not server.busy:
                 return server

         await asyncio.sleep(0.5)

         if time.time() - start_time > timeout:
             raise TimeoutError(f"Timeout ({timeout}s) waiting for an available caption server.")

 async def send_image_for_captioning(image_path: Path, course_name: str, progress_tracker: Dict) -> Optional[Dict]:
     """Sends a single image to a caption server for processing."""
     MAX_RETRIES = 3
     for attempt in range(MAX_RETRIES):
         server = None
         try:
             server = await get_available_server()
             server.busy = True
             start_time = time.time()

             if attempt == 0:
                 print(f"[{FLOW_ID}] Starting attempt on {image_path.name}...")

             form_data = aiohttp.FormData()
             form_data.add_field('file',
                                 image_path.open('rb'),
@@ -239,21 +243,24 @@ async def send_image_for_captioning(image_path: Path, course_name: str, progress_tracker: Dict) -> Optional[Dict]:
                                 content_type='image/jpeg')
             form_data.add_field('model_choice', MODEL_TYPE)

             async with aiohttp.ClientSession() as session:
                 async with session.post(server.url, data=form_data, timeout=600) as resp:
                     if resp.status == 200:
                         result = await resp.json()
                         caption = result.get("caption")

                         if caption:
-                            # Update progress counter
                             progress_tracker['completed'] += 1
-                            async with state_lock:
-                                state.current_file_progress = f"{progress_tracker['completed']}/{progress_tracker['total']}"
-
                             if progress_tracker['completed'] % 50 == 0:
                                 print(f"[{FLOW_ID}] PROGRESS: {progress_tracker['completed']}/{progress_tracker['total']} captions completed.")

                             return {
                                 "course": course_name,
                                 "image_path": image_path.name,
@@ -262,18 +269,18 @@ async def send_image_for_captioning(image_path: Path, course_name: str, progress_tracker: Dict) -> Optional[Dict]:
                             }
                         else:
                             print(f"[{FLOW_ID}] Server {server.url} returned success but no caption for {image_path.name}. Retrying...")
-                            continue
                     else:
                         error_text = await resp.text()
                         print(f"[{FLOW_ID}] Error from server {server.url} for {image_path.name}: {resp.status} - {error_text}. Retrying...")
-                        continue

         except (aiohttp.ClientError, asyncio.TimeoutError, TimeoutError) as e:
             print(f"[{FLOW_ID}] Connection/Timeout error for {image_path.name} on {server.url if server else 'unknown server'}: {e}. Retrying...")
-            continue
         except Exception as e:
             print(f"[{FLOW_ID}] Unexpected error during captioning for {image_path.name}: {e}. Retrying...")
-            continue
         finally:
             if server:
                 end_time = time.time()
@@ -284,406 +291,185 @@ async def send_image_for_captioning(image_path: Path, course_name: str, progress_tracker: Dict) -> Optional[Dict]:
     print(f"[{FLOW_ID}] FAILED after {MAX_RETRIES} attempts for {image_path.name}.")
     return None

-async def
-    """
-    try:
-        print(f"[{FLOW_ID}] Downloading file: {repo_file_full_path}. Full name: {zip_full_name}")
-
-        # Use hf_hub_download to get the file path
-        zip_path = hf_hub_download(
-            repo_id=HF_DATASET_ID,
-            filename=repo_file_full_path,  # Use the full path in the repo
-            repo_type="dataset",
-            token=HF_TOKEN,
-        )
-
-        print(f"[{FLOW_ID}] Downloaded to {zip_path}. Extracting...")
-
-        # Create a temporary directory for extraction
-        extract_dir = TEMP_DIR / course_name / zip_full_name.replace('.', '_')
-        extract_dir.mkdir(parents=True, exist_ok=True)
-
-        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            zip_ref.extractall(extract_dir)
-
-        print(f"[{FLOW_ID}] Extraction complete to {extract_dir}.")
-
-        # Clean up the downloaded zip file
-        os.remove(zip_path)
-
-        # Return the extraction directory and the full zip file name
-        return extract_dir, zip_full_name
-
-    except Exception as e:
-        print(f"[{FLOW_ID}] Error downloading or extracting zip for {repo_file_full_path}: {e}")
-        return None
-
-async def upload_captions_to_hf(zip_full_name: str, captions: List[Dict]) -> bool:
-    """Uploads the final captions JSON file to the output dataset."""
-    caption_filename = Path(zip_full_name).with_suffix('.json').name

-    print(f"[{FLOW_ID}]
-    json_content = json.dumps(captions, indent=2, ensure_ascii=False).encode('utf-8')
-
-    api = get_hf_api()
-    api.upload_file(
-        path_or_fileobj=io.BytesIO(json_content),
-        path_in_repo=caption_filename,
-        repo_id=HF_OUTPUT_DATASET_ID,
-        repo_type="dataset",
-        commit_message=f"[{FLOW_ID}] Captions for {zip_full_name}"
-    )
-
-    print(f"[{FLOW_ID}] Successfully uploaded captions for {zip_full_name}.")
-    return True
-
-    except Exception as e:
-        print(f"[{FLOW_ID}] Error uploading captions for {zip_full_name}: {e}")
         return False
-
-async def process_next_file_task():
-    """Continuous task to process files based on the current index."""
-    global state

-        if
            continue

-        # Check if we have files to process
-        if current_index >= state.total_files:
-            async with state_lock:
-                state.status = "Finished processing all files."
-                state.is_running = False
-                state.current_file = None
-                state.current_file_progress = "0/0"
-            print(f"[{FLOW_ID}] Reached end of file list. Stopping processing.")
-            await save_state_to_hf()
-            await asyncio.sleep(2)
-            continue
-
-        # Process the current file
-        repo_file_full_path = None
-        async with state_lock:
-            repo_file_full_path = state.all_zip_files[current_index]
-
-            if repo_file_full_path in state.processed_files:
-                state.current_index += 1
-                state.status = f"Skipping processed file: {Path(repo_file_full_path).name}"
-                state.current_file = Path(repo_file_full_path).name
-                print(f"[{FLOW_ID}] Skipping already processed file: {repo_file_full_path}")
-                await save_state_to_hf()
-                continue
-
-        # Mark the file as in-progress
-        state.status = f"Processing file {current_index + 1}/{state.total_files}"
-        state.current_file = Path(repo_file_full_path).name
-        state.current_file_progress = "0/0"
-        await save_state_to_hf()
-
-        # --- Process the file ---
         extract_dir = None

         try:
-            download_result = await download_and_extract_zip(repo_file_full_path)

-            if
                raise Exception("Failed to download or extract zip file.")
-
-            extract_dir, zip_full_name = download_result
-            course_name = zip_full_name.split('_')[0]

-            # Find
             image_paths = [p for p in extract_dir.glob("**/*") if p.is_file() and p.suffix.lower() in ['.jpg', '.jpeg', '.png']]
             print(f"[{FLOW_ID}] Found {len(image_paths)} images to process in {zip_full_name}.")

             if not image_paths:
                 print(f"[{FLOW_ID}] No images found in {zip_full_name}. Marking as complete.")
-
-                async with state_lock:
-                    state.processed_files.add(repo_file_full_path)
-                    state.current_index += 1
-                    state.current_file = None
-                    state.current_file_progress = "0/0"
-                    state.status = "Idle"
-                await save_state_to_hf()
             else:
-                #
                 progress_tracker = {
                     'total': len(image_paths),
                     'completed': 0
                 }
-
-                # Process images
                 semaphore = asyncio.Semaphore(len(servers))
                 async def limited_send_image_for_captioning(image_path, course_name, progress_tracker):
                     async with semaphore:
                         return await send_image_for_captioning(image_path, course_name, progress_tracker)

                 caption_tasks = [limited_send_image_for_captioning(p, course_name, progress_tracker) for p in image_paths]
                 results = await asyncio.gather(*caption_tasks)
                 all_captions = [r for r in results if r is not None]

-                # Final progress report

-                # Upload
-                if all_captions
                     if await upload_captions_to_hf(zip_full_name, all_captions):
-                        print(f"[{FLOW_ID}] Successfully uploaded captions for {zip_full_name}")
-                        #
-                        state.processed_files.add(repo_file_full_path)
-                        state.current_index += 1
-                        state.current_file = None
-                        state.current_file_progress = "0/0"
-                        state.status = "Idle"
-                        await save_state_to_hf()
                     else:
-                        print(f"[{FLOW_ID}] Failed to upload captions for {zip_full_name}.
-
-                        async with state_lock:
-                            state.status = f"Upload failed for {zip_full_name}. Retrying later."
-                        await save_state_to_hf()
-                        await asyncio.sleep(30)  # Wait before retry
                 else:
-                    print(f"[{FLOW_ID}] No captions generated for {zip_full_name}.
-
-                    async with state_lock:
-                        state.status = f"No captions for {zip_full_name}. Retrying later."
-                    await save_state_to_hf()
-                    await asyncio.sleep(30)  # Wait before retry

         except Exception as e:
-            state.status = f"Error processing {Path(repo_file_full_path).name}: {error_message[:100]}..."
-            await save_state_to_hf()
-            await asyncio.sleep(30)  # Wait before retry

         finally:
-            # Cleanup
             if extract_dir and extract_dir.exists():

 # --- FastAPI App and Endpoints ---

 app = FastAPI(
     title=f"Flow Server {FLOW_ID} API",
-    description="
-    version="
 )

-# Setup Jinja2 templates for the UI
-templates = Jinja2Templates(directory="templates")
-
 @app.on_event("startup")
 async def startup_event():
-    print(f"Flow Server {FLOW_ID} started on port {FLOW_PORT}.

-    # Calculate server stats
-    server_stats = [
-        {
-            "url": s.url,
-            "busy": s.busy,
-            "processed": s.total_processed,
-            "fps": f"{s.fps:.2f}"
-        } for s in servers
-    ]
-
-    # Calculate overall FPS
-    total_processed = sum(s.total_processed for s in servers)
-    total_time = sum(s.total_time for s in servers)
-    overall_fps = total_processed / total_time if total_time > 0 else 0
-
-    context = {
-        "request": request,
-        "flow_id": FLOW_ID,
-        "status": state.status,
-        "is_running": state.is_running,
-        "total_files": state.total_files,
-        "processed_count": processed_count,
-        "remaining_count": remaining_count,
-        "current_index": state.current_index,
-        "current_file": state.current_file if state.current_file else "N/A",
-        "current_file_progress": state.current_file_progress,
-        "last_update": state.last_update,
-        "overall_fps": f"{overall_fps:.2f}",
-        "server_stats": server_stats
-    }
-    return templates.TemplateResponse("index.html", context)
-
-@app.post("/set_index")
-async def set_index(request: Request, background_tasks: BackgroundTasks):
-    """Endpoint to manually set the start index."""
-    global state

-    try:
-        new_index = int(form.get("start_index"))
-    except (TypeError, ValueError):
-        raise HTTPException(status_code=400, detail="Invalid index value.")
-
-    async with state_lock:
-        if 0 <= new_index < state.total_files:
-            state.current_index = new_index
-            state.status = f"Index set to {new_index}. Restarting processing."
-
-            # If the loop is not running, start it
-            if not state.is_running:
-                state.is_running = True
-                background_tasks.add_task(process_next_file_task)
-
-            await save_state_to_hf()
-            print(f"[{FLOW_ID}] Index manually set to {new_index}.")
-            return {"status": "success", "message": f"Start index set to {new_index}. Processing will resume from this point."}
-        elif new_index == state.total_files:
-            state.current_index = new_index
-            state.is_running = False
-            state.status = "Finished processing all files."
-            await save_state_to_hf()
-            return {"status": "success", "message": "Index set to end of list. Processing stopped."}
-        else:
-            raise HTTPException(status_code=400, detail=f"Index {new_index} is out of bounds (0 to {state.total_files}).")
-
-@app.post("/control_processing")
-async def control_processing(request: Request, background_tasks: BackgroundTasks):
-    """Endpoint to start/stop the processing loop."""
-    global state

-    if action == "start":
-        if not state.is_running:
-            # Reset state if we're at the end
-            if state.current_index >= state.total_files:
-                state.current_index = 0
-                state.status = "Reset to start and processing..."
-
-            state.is_running = True
-            state.status = "Processing started."
-
-            # Start the processing task
-            background_tasks.add_task(process_next_file_task)
-            await save_state_to_hf()
-
-            print(f"[{FLOW_ID}] Processing manually started from index {state.current_index}")
-            return {"status": "success", "message": "Processing loop started."}
-        else:
-            return {"status": "info", "message": "Processing is already running."}
-
-    elif action == "stop":
-        if state.is_running:
-            state.is_running = False
-            state.status = "Processing stopped by user."
-            await save_state_to_hf()
-
-            print(f"[{FLOW_ID}] Processing manually stopped")
-            return {"status": "success", "message": "Processing loop stopped."}
-        else:
-            return {"status": "info", "message": "Processing is already stopped."}
-    else:
-        raise HTTPException(status_code=400, detail="Invalid action.")
-
-@app.get("/status")
-async def get_status():
-    """API endpoint to get the current server status as JSON."""
-    async with state_lock:
-        processed_count = len(state.processed_files)
-
-    server_stats = [
-        {
-            "url": s.url,
-            "busy": s.busy,
-            "processed": s.total_processed,
-            "fps": f"{s.fps:.2f}"
-        } for s in servers
-    ]
-
-    total_processed = sum(s.total_processed for s in servers)
-    total_time = sum(s.total_time for s in servers)
-    overall_fps = total_processed / total_time if total_time > 0 else 0
-
-    return {
-        "flow_id": FLOW_ID,
-        "status": state.status,
-        "is_running": state.is_running,
-        "total_files": state.total_files,
-        "processed_count": processed_count,
-        "remaining_count": state.total_files - processed_count,
-        "current_index": state.current_index,
-        "current_file": state.current_file,
-        "current_file_progress": state.current_file_progress,
-        "last_update": state.last_update,
-        "overall_fps": f"{overall_fps:.2f}",
-        "server_stats": server_stats
-    }
-
-# The original /process_course endpoint is now obsolete as the server manages its own queue
-# @app.post("/process_course")
-# async def process_course(request: ProcessCourseRequest, background_tasks: BackgroundTasks):
-#     return {"status": "obsolete", "message": "The server now manages its own processing queue based on the index."}

 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=FLOW_PORT)

New version (added lines are marked with +):
 import asyncio
 import aiohttp
 import zipfile
 import shutil
+from typing import Dict, List, Set, Optional, Tuple
 from urllib.parse import quote
 from datetime import datetime
 from pathlib import Path
+import io

+from fastapi import FastAPI, BackgroundTasks, HTTPException, status
 from pydantic import BaseModel, Field
+from huggingface_hub import HfApi, hf_hub_download

 # --- Configuration ---
+AUTO_START_INDEX = 20  # Hardcoded default start index if no progress is found
 FLOW_ID = os.getenv("FLOW_ID", "flow_default")
 FLOW_PORT = int(os.getenv("FLOW_PORT", 8001))
 HF_TOKEN = os.getenv("HF_TOKEN", "")
+HF_DATASET_ID = os.getenv("HF_DATASET_ID", "Fred808/BG3")  # Source dataset for zip files
+HF_OUTPUT_DATASET_ID = os.getenv("HF_OUTPUT_DATASET_ID", "fred808/helium")  # Target dataset for captions
+
+# Progress Tracking File
+PROGRESS_FILE = Path("processing_progress.json")
+# Directory within the HF dataset where the zip files are located
+ZIP_FILE_PREFIX = "frames/"

 # Using the full list from the user's original code for actual deployment
 CAPTION_SERVERS = [
     "https://fred808-pil-4-1.hf.space/analyze",
     "https://fred808-pil-4-2.hf.space/analyze",
+    # ... (rest of the servers)
     "https://fredalone-fredalone-gahbxh.hf.space/analyze",
     "https://fredalone-fredalone-kw2po4.hf.space/analyze",
     "https://fredalone-fredalone-8h285h.hf.space/analyze"

 TEMP_DIR.mkdir(exist_ok=True)

 # --- Models ---
+class ProcessStartRequest(BaseModel):
+    start_index: int = Field(AUTO_START_INDEX, ge=1, description="The index number of the zip file to start processing from (1-indexed).")
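A note on the new request model: Field(AUTO_START_INDEX, ge=1, ...) makes 20 the default and lets Pydantic reject zero or negative indices before the handler ever runs, so FastAPI answers bad input with a 422 instead of starting a task. A minimal sketch of that validation behavior (illustration only, not part of this commit):

# Illustration: the ge=1 bound is enforced at parse time by Pydantic.
from pydantic import ValidationError

ProcessStartRequest()                    # ok, start_index defaults to AUTO_START_INDEX (20)
ProcessStartRequest(start_index=5)       # ok
try:
    ProcessStartRequest(start_index=0)   # fails the ge=1 constraint
except ValidationError as err:
    print(err)                           # FastAPI surfaces this as a 422 response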

 class CaptionServer:
     def __init__(self, url):

     def fps(self):
         return self.total_processed / self.total_time if self.total_time > 0 else 0

+# Global state for caption servers
 servers = [CaptionServer(url) for url in CAPTION_SERVERS]
 server_index = 0

+# --- Progress Tracking Functions ---

+def load_progress() -> Dict:
+    """Loads the processing progress from the JSON file."""
+    if PROGRESS_FILE.exists():
+        try:
+            with PROGRESS_FILE.open('r') as f:
+                return json.load(f)
+        except json.JSONDecodeError:
+            print(f"[{FLOW_ID}] WARNING: Progress file is corrupted. Starting fresh.")
+            # Fall through to return default structure
+
+    # Default structure
+    return {
+        "last_processed_index": 0,
+        "processed_files": {},  # {index: repo_path}
+        "file_list": []  # Full list of all zip files found in the dataset
+    }
+
+def save_progress(progress_data: Dict):
+    """Saves the processing progress to the JSON file."""
+    try:
+        with PROGRESS_FILE.open('w') as f:
+            json.dump(progress_data, f, indent=4)
+    except Exception as e:
+        print(f"[{FLOW_ID}] CRITICAL ERROR: Could not save progress to {PROGRESS_FILE}: {e}")

+# --- Hugging Face Utility Functions ---

+async def get_zip_file_list(progress_data: Dict) -> List[str]:
+    """
+    Fetches the list of all zip files from the dataset, or uses the cached list.
+    Updates the progress_data with the file list if a new list is fetched.
+    """
+    if progress_data['file_list']:
+        print(f"[{FLOW_ID}] Using cached file list with {len(progress_data['file_list'])} files.")
+        return progress_data['file_list']

+    print(f"[{FLOW_ID}] Fetching full list of zip files from {HF_DATASET_ID}...")
+    try:
+        api = HfApi(token=HF_TOKEN)
+        repo_files = api.list_repo_files(
+            repo_id=HF_DATASET_ID,
+            repo_type="dataset"
+        )

+        # Filter for zip files in the specified directory and sort them alphabetically for consistent indexing
+        zip_files = sorted([
+            f for f in repo_files
+            if f.startswith(ZIP_FILE_PREFIX) and f.endswith('.zip')
+        ])

+        if not zip_files:
+            raise FileNotFoundError(f"No zip files found in '{ZIP_FILE_PREFIX}' directory of dataset '{HF_DATASET_ID}'.")
+
+        print(f"[{FLOW_ID}] Found {len(zip_files)} zip files.")
+
+        # Update and save the progress data
+        progress_data['file_list'] = zip_files
+        save_progress(progress_data)
+
+        return zip_files
+
+    except Exception as e:
+        print(f"[{FLOW_ID}] Error fetching file list from Hugging Face: {e}")
+        return []

+async def download_and_extract_zip_by_index(file_index: int, repo_file_full_path: str) -> Optional[Path]:
+    """Downloads the zip file for the given index and extracts its contents."""

+    # Extract the base name for the extraction directory
+    zip_full_name = Path(repo_file_full_path).name
+    course_name = zip_full_name.replace('.zip', '')  # Use the file name as the course/job name
+
+    print(f"[{FLOW_ID}] Processing file #{file_index}: {repo_file_full_path}. Full name: {zip_full_name}")
+
+    try:
+        # Use hf_hub_download to get the file path
+        zip_path = hf_hub_download(
+            repo_id=HF_DATASET_ID,
+            filename=repo_file_full_path,  # Use the full path in the repo
+            repo_type="dataset",
+            token=HF_TOKEN,
+        )
+
+        print(f"[{FLOW_ID}] Downloaded to {zip_path}. Extracting...")
+
+        # Create a temporary directory for extraction
+        extract_dir = TEMP_DIR / course_name
+        # Ensure a clean directory for extraction
+        if extract_dir.exists():
+            shutil.rmtree(extract_dir)
+        extract_dir.mkdir(exist_ok=True)
+
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall(extract_dir)

+        print(f"[{FLOW_ID}] Extraction complete to {extract_dir}.")
+
+        # Clean up the downloaded zip file to save space
+        os.remove(zip_path)
+
+        return extract_dir
+
+    except Exception as e:
+        print(f"[{FLOW_ID}] Error downloading or extracting zip for {repo_file_full_path}: {e}")
+        return None
+
+async def upload_captions_to_hf(zip_full_name: str, captions: List[Dict]) -> bool:
+    """Uploads the final captions JSON file to the output dataset."""
+    # Use the full zip name, replacing the extension with .json
+    caption_filename = Path(zip_full_name).with_suffix('.json').name
+
+    try:
+        print(f"[{FLOW_ID}] Uploading {len(captions)} captions for {zip_full_name} as {caption_filename} to {HF_OUTPUT_DATASET_ID}...")
+
+        # Create JSON content in memory
+        json_content = json.dumps(captions, indent=2, ensure_ascii=False).encode('utf-8')
+
+        api = HfApi(token=HF_TOKEN)
+        api.upload_file(
+            path_or_fileobj=io.BytesIO(json_content),
+            path_in_repo=caption_filename,
+            repo_id=HF_OUTPUT_DATASET_ID,
+            repo_type="dataset",
+            commit_message=f"[{FLOW_ID}] Captions for {zip_full_name}"
+        )
+
+        print(f"[{FLOW_ID}] Successfully uploaded captions for {zip_full_name}.")
+        return True
+
+    except Exception as e:
+        print(f"[{FLOW_ID}] Error uploading captions for {zip_full_name}: {e}")
+        return False

 # --- Core Processing Functions (Modified) ---

     global server_index
     start_time = time.time()
     while True:
+        # Round-robin check for an available server
         for _ in range(len(servers)):
             server = servers[server_index]
             server_index = (server_index + 1) % len(servers)
             if not server.busy:
                 return server

+        # If all servers are busy, wait for a short period and check again
         await asyncio.sleep(0.5)

+        # Check if timeout has been reached
         if time.time() - start_time > timeout:
             raise TimeoutError(f"Timeout ({timeout}s) waiting for an available caption server.")

 async def send_image_for_captioning(image_path: Path, course_name: str, progress_tracker: Dict) -> Optional[Dict]:
     """Sends a single image to a caption server for processing."""
+    # This function now handles server selection and retries internally
     MAX_RETRIES = 3
     for attempt in range(MAX_RETRIES):
         server = None
         try:
+            # 1. Get an available server (will wait if all are busy, with a timeout)
             server = await get_available_server()
             server.busy = True
             start_time = time.time()

+            # Print a less verbose message only on the first attempt
             if attempt == 0:
                 print(f"[{FLOW_ID}] Starting attempt on {image_path.name}...")

+            # 2. Prepare request data
             form_data = aiohttp.FormData()
             form_data.add_field('file',
                                 image_path.open('rb'),

                                 content_type='image/jpeg')
             form_data.add_field('model_choice', MODEL_TYPE)

+            # 3. Send request
             async with aiohttp.ClientSession() as session:
+                # Increased timeout to 10 minutes (600s) as requested by user's problem description
                 async with session.post(server.url, data=form_data, timeout=600) as resp:
                     if resp.status == 200:
                         result = await resp.json()
                         caption = result.get("caption")

                         if caption:
+                            # Update progress counter
                             progress_tracker['completed'] += 1
                             if progress_tracker['completed'] % 50 == 0:
                                 print(f"[{FLOW_ID}] PROGRESS: {progress_tracker['completed']}/{progress_tracker['total']} captions completed.")

+                            # Log success only if it's not a progress report interval
+                            if progress_tracker['completed'] % 50 != 0:
+                                print(f"[{FLOW_ID}] Success: {image_path.name} captioned by {server.url}")
+
                             return {
                                 "course": course_name,
                                 "image_path": image_path.name,

                             }
                         else:
                             print(f"[{FLOW_ID}] Server {server.url} returned success but no caption for {image_path.name}. Retrying...")
+                            continue  # Retry with a different server
                     else:
                         error_text = await resp.text()
                         print(f"[{FLOW_ID}] Error from server {server.url} for {image_path.name}: {resp.status} - {error_text}. Retrying...")
+                        continue  # Retry with a different server

         except (aiohttp.ClientError, asyncio.TimeoutError, TimeoutError) as e:
             print(f"[{FLOW_ID}] Connection/Timeout error for {image_path.name} on {server.url if server else 'unknown server'}: {e}. Retrying...")
+            continue  # Retry with a different server
         except Exception as e:
             print(f"[{FLOW_ID}] Unexpected error during captioning for {image_path.name}: {e}. Retrying...")
+            continue  # Retry with a different server
         finally:
             if server:
                 end_time = time.time()

     print(f"[{FLOW_ID}] FAILED after {MAX_RETRIES} attempts for {image_path.name}.")
     return None

+async def process_dataset_task(start_index: int):
+    """Main task to process the dataset sequentially starting from a given index."""

+    progress = load_progress()
+    file_list = await get_zip_file_list(progress)

+    if not file_list:
+        print(f"[{FLOW_ID}] ERROR: Cannot proceed. File list is empty.")
         return False

+    # Ensure start_index is within bounds
+    if start_index > len(file_list):
+        print(f"[{FLOW_ID}] WARNING: Start index {start_index} is greater than the total number of files ({len(file_list)}). Exiting.")
+        return True

+    # Determine the actual starting index in the 0-indexed list
+    start_list_index = start_index - 1
+
+    print(f"[{FLOW_ID}] Starting dataset processing from file index: {start_index} out of {len(file_list)}.")
+
+    global_success = True
+
+    for i in range(start_list_index, len(file_list)):
+        file_index = i + 1  # 1-indexed for user display and progress tracking
+        repo_file_full_path = file_list[i]
+        zip_full_name = Path(repo_file_full_path).name
+        course_name = zip_full_name.replace('.zip', '')  # Use the file name as the course/job name

+        # Check if the file has already been successfully processed
+        if str(file_index) in progress['processed_files']:
+            print(f"[{FLOW_ID}] Skipping file #{file_index} ({zip_full_name}): Already processed according to progress file.")
+            progress['last_processed_index'] = file_index
+            save_progress(progress)
             continue

         extract_dir = None
+        current_file_success = False

         try:
+            # 1. Download and Extract
+            extract_dir = await download_and_extract_zip_by_index(file_index, repo_file_full_path)

+            if not extract_dir:
                 raise Exception("Failed to download or extract zip file.")

+            # 2. Find Images
+            # Use recursive glob to find images in subdirectories
             image_paths = [p for p in extract_dir.glob("**/*") if p.is_file() and p.suffix.lower() in ['.jpg', '.jpeg', '.png']]
             print(f"[{FLOW_ID}] Found {len(image_paths)} images to process in {zip_full_name}.")

             if not image_paths:
                 print(f"[{FLOW_ID}] No images found in {zip_full_name}. Marking as complete.")
+                current_file_success = True
             else:
+                # 3. Process Images (Captioning)
                 progress_tracker = {
                     'total': len(image_paths),
                     'completed': 0
                 }
+                print(f"[{FLOW_ID}] Starting captioning for {progress_tracker['total']} images in {zip_full_name}...")
+
+                # Create a semaphore to limit concurrent tasks to the number of available servers
                 semaphore = asyncio.Semaphore(len(servers))
+
                 async def limited_send_image_for_captioning(image_path, course_name, progress_tracker):
                     async with semaphore:
                         return await send_image_for_captioning(image_path, course_name, progress_tracker)

+                # Create a list of tasks for parallel captioning
                 caption_tasks = [limited_send_image_for_captioning(p, course_name, progress_tracker) for p in image_paths]
+
+                # Run all captioning tasks concurrently
                 results = await asyncio.gather(*caption_tasks)
+
+                # Filter out failed results
                 all_captions = [r for r in results if r is not None]

+                # Final progress report for the current file
+                if len(all_captions) == len(image_paths):
+                    print(f"[{FLOW_ID}] FINAL PROGRESS for {zip_full_name}: Successfully completed all {len(all_captions)} captions.")
+                    current_file_success = True
+                else:
+                    print(f"[{FLOW_ID}] FINAL PROGRESS for {zip_full_name}: Completed with partial result: {len(all_captions)}/{len(image_paths)} captions. Marking as partial failure.")
+                    current_file_success = False

+                # 4. Upload Results
+                if all_captions:
+                    print(f"[{FLOW_ID}] Uploading {len(all_captions)} captions for {zip_full_name}...")
                     if await upload_captions_to_hf(zip_full_name, all_captions):
+                        print(f"[{FLOW_ID}] Successfully uploaded captions for {zip_full_name}.")
+                        # Partial success in captioning is still a success for the upload step
+                        pass
                     else:
+                        print(f"[{FLOW_ID}] Failed to upload captions for {zip_full_name}.")
+                        current_file_success = False
                 else:
+                    print(f"[{FLOW_ID}] No captions generated. Skipping upload for {zip_full_name}.")
+                    current_file_success = False

         except Exception as e:
+            print(f"[{FLOW_ID}] Critical error in process_dataset_task for file #{file_index} ({zip_full_name}): {e}")
+            current_file_success = False
+            global_success = False  # Mark overall task as failed if any file fails critically

         finally:
+            # 5. Cleanup and Update Progress
             if extract_dir and extract_dir.exists():
+                shutil.rmtree(extract_dir, ignore_errors=True)
+                print(f"[{FLOW_ID}] Cleaned up temporary directory {extract_dir}.")
+
+            if current_file_success:
+                # Update progress only on successful completion of the file
+                progress['last_processed_index'] = file_index
+                progress['processed_files'][str(file_index)] = repo_file_full_path
+                save_progress(progress)
+                print(f"[{FLOW_ID}] Progress saved: File #{file_index} marked as processed.")
+            else:
+                # If a file fails, we stop the continuous loop to allow for manual intervention or a fresh start
+                print(f"[{FLOW_ID}] File #{file_index} failed. Stopping continuous processing.")
+                global_success = False
+                break
+
+    print(f"[{FLOW_ID}] All processing loops complete. Overall success: {global_success}")
+    return global_success
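The semaphore-plus-gather combination above is the core of the fan-out: every image gets its own task immediately, but at most len(servers) of them hold a request slot at any moment. A standalone sketch of the same pattern (dummy worker in place of send_image_for_captioning, illustration only):

# Illustration: bounded concurrency with asyncio.Semaphore + gather.
import asyncio

async def caption(name: str) -> str:          # stand-in for send_image_for_captioning
    await asyncio.sleep(0.1)                  # simulate one HTTP round trip
    return f"caption for {name}"

async def main():
    sem = asyncio.Semaphore(3)                # e.g. 3 available caption servers

    async def limited(name):
        async with sem:                       # at most 3 coroutines past this point
            return await caption(name)

    results = await asyncio.gather(*(limited(f"img_{i}.jpg") for i in range(10)))
    print(len(results))                       # 10, completed in ~4 waves of 3

asyncio.run(main())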

 # --- FastAPI App and Endpoints ---

 app = FastAPI(
     title=f"Flow Server {FLOW_ID} API",
+    description="Sequentially processes zip files from a dataset, captions images, and tracks progress.",
+    version="1.0.0"
 )

 @app.on_event("startup")
 async def startup_event():
+    print(f"Flow Server {FLOW_ID} started on port {FLOW_PORT}.")

+    # Automatically start the processing task
+    progress = load_progress()
+    # Start from the last processed index + 1, or the hardcoded AUTO_START_INDEX if the progress file is new/empty
+    start_index = progress.get('last_processed_index', 0) + 1
+    if start_index < AUTO_START_INDEX:
+        start_index = AUTO_START_INDEX
+
+    # Note: FastAPI's startup events can't directly use BackgroundTasks, but we can use asyncio.create_task
+    # to run the long-running process in the background without blocking the server startup.
+    print(f"[{FLOW_ID}] Auto-starting processing from index: {start_index}...")
+    asyncio.create_task(process_dataset_task(start_index))
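One caveat worth knowing with this pattern: the asyncio event loop only keeps a weak reference to tasks created via create_task, so a fire-and-forget task can in principle be garbage-collected mid-flight (this is documented asyncio behavior). A common hardening, sketched here and not part of this commit, is to hold the handle somewhere:

# Sketch: keep a strong reference so the background task cannot be garbage-collected.
background_task_refs = set()

def spawn(coro):
    task = asyncio.create_task(coro)
    background_task_refs.add(task)                      # strong reference while running
    task.add_done_callback(background_task_refs.discard)
    return task

# Inside startup_event, one would then call: spawn(process_dataset_task(start_index))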

+@app.get("/")
+async def root():
+    progress = load_progress()
+    return {
+        "flow_id": FLOW_ID,
+        "status": "ready",
+        "last_processed_index": progress['last_processed_index'],
+        "total_files_in_list": len(progress['file_list']),
+        "processed_files_count": len(progress['processed_files']),
+        "total_servers": len(servers),
+        "busy_servers": sum(1 for s in servers if s.busy),
+    }
+
+@app.post("/start_processing")
+async def start_processing(request: ProcessStartRequest, background_tasks: BackgroundTasks):
+    """
+    Starts the sequential processing of zip files from the given index in the background.
+    """
+    start_index = request.start_index

+    print(f"[{FLOW_ID}] Received request to start processing from index: {start_index}. Starting background task.")

+    # Start the heavy processing in a background task so the API call returns immediately
+    # Note: The server is already auto-starting, but this allows for manual restart/override.
+    background_tasks.add_task(process_dataset_task, start_index)

+    return {"status": "processing", "start_index": start_index, "message": "Dataset processing started in background."}

 if __name__ == "__main__":
+    import uvicorn
+    # Note: When running in the sandbox, we need to use 0.0.0.0 to expose the port.
     uvicorn.run(app, host="0.0.0.0", port=FLOW_PORT)
|