Spaces:

Samfredoly
/

switches

Paused

App Files Community

Samfredoly committed on Nov 25, 2025

Commit

99dce0a

verified ·

1 Parent(s): 890b86d

Update app.py

Browse files

Files changed (1) hide show

app.py +352 -539

app.py CHANGED Viewed

@@ -5,10 +5,9 @@ import asyncio
 import aiohttp
 import zipfile
 import shutil
-import threading
 from typing import Dict, List, Set, Optional, Tuple, Any
 from urllib.parse import quote
-from datetime import datetime, timedelta
 from pathlib import Path
 import io
@@ -17,12 +16,12 @@ from pydantic import BaseModel, Field
 from huggingface_hub import HfApi, hf_hub_download
 # --- Configuration ---
-AUTO_START_INDEX = 1290
 FLOW_ID = os.getenv("FLOW_ID", "flow_default")
 FLOW_PORT = int(os.getenv("FLOW_PORT", 8001))
 HF_TOKEN = os.getenv("HF_TOKEN", "")
 HF_AUDIO_DATASET_ID = os.getenv("HF_AUDIO_DATASET_ID", "Samfredoly/BG_Vid")
-HF_OUTPUT_DATASET_ID = os.getenv("HF_OUTPUT_DATASET_ID", "samfred2/AVTF")
 # Progress and State Tracking
 PROGRESS_FILE = Path("processing_progress.json")
@@ -30,10 +29,12 @@ HF_STATE_FILE = "processing_state_transcriptions.json"
 LOCAL_STATE_FOLDER = Path(".state")
 LOCAL_STATE_FOLDER.mkdir(exist_ok=True)
-AUDIO_FILE_PREFIX = "audio/"
-# Reference dataset for filename mapping
-REFERENCE_REPO_ID = os.getenv("REFERENCE_REPO_ID", "Fred808/BG3")
 WHISPER_SERVERS = [
     "https://fred1012-switch3.hf.space/transcribe",
@@ -58,98 +59,58 @@ WHISPER_SERVERS = [
     "https://Eliasishere-mint-20.hf.space/transcribe"
 ]
-MODEL_TYPE = "whisper-small"
-ZIP_UPLOAD_THRESHOLD = 100  # Upload and zip after this many transcriptions
 # Temporary storage for audio files
 TEMP_DIR = Path(f"temp_audio_{FLOW_ID}")
 TEMP_DIR.mkdir(exist_ok=True)
-# Temporary storage for transcription results
-RESULTS_DIR = Path(f"transcription_results_{FLOW_ID}")
-RESULTS_DIR.mkdir(exist_ok=True)
 # --- Models ---
 class WhisperServer:
-    def __init__(self, url):
         self.url = url
-        self.busy = False
         self.total_processed = 0
-        self.total_time = 0
-        self.model = MODEL_TYPE
     @property
     def fps(self):
         return self.total_processed / self.total_time if self.total_time > 0 else 0
-class RateLimiter:
-    """Tracks uploads per hour with max limit of 120, stops at 128."""
-    def __init__(self, max_per_hour: int = 120, stop_at: int = 128):
-        self.max_per_hour = max_per_hour
-        self.stop_at = stop_at
-        self.uploads = []  # List of timestamps
-        self.lock = asyncio.Lock()
-    async def wait_if_needed(self) -> bool:
-        """
-        Returns True if upload can proceed, False if rate limit reached.
-        Waits if needed to stay within limits.
-        """
-        async with self.lock:
-            now = datetime.now()
-            one_hour_ago = now - timedelta(hours=1)
-            # Remove old uploads outside the 1-hour window
-            self.uploads = [ts for ts in self.uploads if ts > one_hour_ago]
-            # If we've reached the hard stop limit (128), return False
-            if len(self.uploads) >= self.stop_at:
-                print(f"[{FLOW_ID}] ⏸️  Upload limit ({self.stop_at}) reached. Waiting for next hour...")
-                return False
-            # If we're at the soft limit (120), add timestamp and continue
-            if len(self.uploads) < self.max_per_hour:
-                self.uploads.append(now)
-                remaining = self.max_per_hour - len(self.uploads)
-                print(f"[{FLOW_ID}] 📤 Upload #{len(self.uploads)}/120 this hour ({remaining} remaining)")
-                return True
-            # Between soft limit and hard stop, add and continue
-            self.uploads.append(now)
-            print(f"[{FLOW_ID}] ⚠️  Upload #{len(self.uploads)}/120 this hour (approaching limit)")
-            return True
-    async def can_upload(self) -> bool:
-        """Check if upload is allowed without waiting."""
-        async with self.lock:
-            now = datetime.now()
-            one_hour_ago = now - timedelta(hours=1)
-            self.uploads = [ts for ts in self.uploads if ts > one_hour_ago]
-            return len(self.uploads) < self.stop_at
-# Global rate limiter
-rate_limiter = RateLimiter(max_per_hour=120, stop_at=128)
 # Global state for whisper servers
 servers = [WhisperServer(url) for url in WHISPER_SERVERS]
-server_index = 0
 def load_progress() -> Dict:
     if PROGRESS_FILE.exists():
         try:
             with PROGRESS_FILE.open('r') as f:
                 return json.load(f)
         except json.JSONDecodeError:
             print(f"[{FLOW_ID}] WARNING: Progress file is corrupted. Starting fresh.")
     return {
         "last_processed_index": 0,
-        "processed_files": {},
-        "file_list": [],
-        "transcription_count": 0,
-        "reference_map": {},
     }
 def save_progress(progress_data: Dict):
@@ -175,13 +136,10 @@ def load_json_state(file_path: str, default_value: Dict[str, Any]) -> Dict[str,
                 if "next_download_index" not in data:
                     data["next_download_index"] = 0
-                if "transcription_count" not in data:
-                    data["transcription_count"] = 0
                 return data
         except json.JSONDecodeError:
             print(f"[{FLOW_ID}] WARNING: Corrupted state file: {file_path}")
-            return default_value
 def save_json_state(file_path: str, data: Dict[str, Any]):
     """Save state to JSON file"""
@@ -191,10 +149,10 @@ def save_json_state(file_path: str, data: Dict[str, Any]):
 async def download_hf_state() -> Dict[str, Any]:
     """Downloads the state file from Hugging Face or returns a default state."""
     local_path = LOCAL_STATE_FOLDER / HF_STATE_FILE
-    default_state = {"next_download_index": 0, "file_states": {}, "transcription_count": 0}
     try:
-        # Check if the file exists in the output repo
         files = HfApi(token=HF_TOKEN).list_repo_files(
             repo_id=HF_OUTPUT_DATASET_ID,
             repo_type="dataset"
@@ -229,13 +187,13 @@ async def upload_hf_state(state: Dict[str, Any]) -> bool:
         # Save state locally first
         save_json_state(str(local_path), state)
-        # Upload to output dataset
         HfApi(token=HF_TOKEN).upload_file(
             path_or_fileobj=str(local_path),
             path_in_repo=HF_STATE_FILE,
             repo_id=HF_OUTPUT_DATASET_ID,
             repo_type="dataset",
-            commit_message=f"Update transcription processing state: next_index={state['next_download_index']}, count={state.get('transcription_count', 0)}"
         )
         print(f"[{FLOW_ID}] Successfully uploaded state file.")
         return True
@@ -243,84 +201,52 @@ async def upload_hf_state(state: Dict[str, Any]) -> bool:
         print(f"[{FLOW_ID}] Failed to upload state file: {str(e)}")
         return False
-async def lock_file_for_processing(audio_filename: str, state: Dict[str, Any]) -> bool:
     """Marks a file as 'processing' in the state file and uploads the lock."""
-    print(f"[{FLOW_ID}] 🔒 Attempting to lock file: {audio_filename}")
     # Update state locally
-    state["file_states"][audio_filename] = "processing"
     # Upload the updated state file immediately to establish the lock
     if await upload_hf_state(state):
-        print(f"[{FLOW_ID}] ✅ Successfully locked file: {audio_filename}")
         return True
     else:
-        print(f"[{FLOW_ID}] ❌ Failed to lock file: {audio_filename}")
         # Revert local state
-        if audio_filename in state["file_states"]:
-            del state["file_states"][audio_filename]
         return False
-async def unlock_file_as_processed(audio_filename: str, state: Dict[str, Any], next_index: int) -> bool:
     """Marks a file as 'processed', updates the index, and uploads the state."""
-    print(f"[{FLOW_ID}] 🔓 Marking file as processed: {audio_filename}")
     # Update state locally
-    state["file_states"][audio_filename] = "processed"
     state["next_download_index"] = next_index
     # Upload the updated state
     if await upload_hf_state(state):
-        print(f"[{FLOW_ID}] ✅ Successfully marked as processed: {audio_filename}")
         return True
     else:
-        print(f"[{FLOW_ID}] ❌ Failed to update state for: {audio_filename}")
         return False
 # --- Hugging Face Utility Functions ---
-async def get_reference_map(reference_repo_id: str) -> Dict[str, str]:
-    """
-    Fetches the reference file list from the Hugging Face repo and creates a map
-    from audio filename (without extension) to reference filename.
-    """
-    print(f"[{FLOW_ID}] Fetching reference file list from {reference_repo_id}...")
-    try:
-        api = HfApi(token=HF_TOKEN)
-        repo_files = api.list_repo_files(repo_id=reference_repo_id, repo_type="dataset")
-        reference_map = {}
-        for file in repo_files:
-            base_name, ext = os.path.splitext(file)
-            if ext.lower() in ['.txt', '.json']:  # Consider text/json files as reference
-                reference_map[base_name] = file
-        print(f"[{FLOW_ID}] ✅ Successfully created reference map with {len(reference_map)} entries.")
-        return reference_map
-    except Exception as e:
-        print(f"[{FLOW_ID}] ⚠️ Failed to fetch reference map from Hugging Face: {e}")
-        return {}
-def find_matching_filename(audio_filename: str, reference_map: Dict[str, str]) -> Optional[str]:
-    """
-    Finds the matching reference filename for a given audio filename.
-    Returns the reference filename if found, otherwise None.
-    """
-    base_name, _ = os.path.splitext(audio_filename)
-    return reference_map.get(base_name)
 async def get_audio_file_list(progress_data: Dict) -> List[str]:
     """
-    Fetches the list of all audio files from the dataset, or uses the cached list.
     Updates the progress_data with the file list if a new list is fetched.
     """
     if progress_data['file_list']:
         print(f"[{FLOW_ID}] Using cached file list with {len(progress_data['file_list'])} files.")
         return progress_data['file_list']
-    print(f"[{FLOW_ID}] Fetching full list of audio files from {HF_AUDIO_DATASET_ID}...")
     try:
         api = HfApi(token=HF_TOKEN)
         repo_files = api.list_repo_files(
@@ -328,497 +254,384 @@ async def get_audio_file_list(progress_data: Dict) -> List[str]:
             repo_type="dataset"
         )
-        # Filter for audio files in the specified directory and sort them alphabetically for consistent indexing
-        audio_extensions = ['.mp3', '.wav', '.m4a', '.flac', '.ogg', '.aac']
-        audio_files = sorted([
             f for f in repo_files
-            if f.startswith(AUDIO_FILE_PREFIX) and any(f.lower().endswith(ext) for ext in audio_extensions)
         ])
-        if not audio_files:
-            raise FileNotFoundError(f"No audio files found in '{AUDIO_FILE_PREFIX}' directory of dataset '{HF_AUDIO_DATASET_ID}'.")
-        print(f"[{FLOW_ID}] Found {len(audio_files)} audio files.")
         # Update and save the progress data
-        progress_data['file_list'] = audio_files
         save_progress(progress_data)
-        return audio_files
     except Exception as e:
         print(f"[{FLOW_ID}] Error fetching file list from Hugging Face: {e}")
         return []
-async def download_audio_file(file_index: int, repo_file_full_path: str) -> Optional[Path]:
-    """Downloads the audio file for the given index."""
-    audio_filename = Path(repo_file_full_path).name
-    print(f"[{FLOW_ID}] Processing audio file #{file_index}: {repo_file_full_path}")
     try:
         # Use hf_hub_download to get the file path
-        audio_path = hf_hub_download(
             repo_id=HF_AUDIO_DATASET_ID,
             filename=repo_file_full_path,
             repo_type="dataset",
             token=HF_TOKEN,
         )
-        print(f"[{FLOW_ID}] Downloaded audio to {audio_path}.")
-        # Copy to temp directory
-        temp_path = TEMP_DIR / audio_filename
-        shutil.copy2(audio_path, temp_path)
-        return temp_path
     except Exception as e:
-        print(f"[{FLOW_ID}] Error downloading audio file {repo_file_full_path}: {e}")
         return None
-async def upload_json_to_dataset(json_file_path: Path, json_filename: str) -> bool:
-    """Uploads a single JSON transcription file directly to HF dataset."""
-    try:
-        # Check rate limit before uploading
-        if not await rate_limiter.wait_if_needed():
-            print(f"[{FLOW_ID}] ⏸️  Upload rate limit reached for {json_filename}. Waiting...")
-            return False
-        print(f"[{FLOW_ID}] 📤 Uploading JSON file: {json_filename}...")
-        api = HfApi(token=HF_TOKEN)
-        api.upload_file(
-            path_or_fileobj=str(json_file_path),
-            path_in_repo=f"transcriptions/{json_filename}",
-            repo_id=HF_OUTPUT_DATASET_ID,
-            repo_type="dataset",
-            commit_message=f"[{FLOW_ID}] Transcription: {json_filename}"
-        )
-        print(f"[{FLOW_ID}] ✅ Successfully uploaded: {json_filename}")
-        return True
-    except Exception as e:
-        print(f"[{FLOW_ID}] ❌ Error uploading {json_filename}: {e}")
-        return False
-async def zip_and_upload_transcriptions(transcription_files: List[Path], batch_number: int) -> bool:
-    """Zips transcription JSON files and uploads to dataset with batch numbering."""
-    if not transcription_files:
-        print(f"[{FLOW_ID}] No transcription files to zip.")
-        return False
     try:
-        zip_filename = f"audio_json_batch_{batch_number}.zip"
-        zip_path = RESULTS_DIR / zip_filename
-        print(f"[{FLOW_ID}] 📦 Creating zip file: {zip_filename} with {len(transcription_files)} files...")
-        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
-            for file_path in transcription_files:
-                if file_path.exists():
-                    zipf.write(file_path, arcname=file_path.name)
-        print(f"[{FLOW_ID}] 📤 Uploading zip file to {HF_OUTPUT_DATASET_ID}...")
         api = HfApi(token=HF_TOKEN)
         api.upload_file(
-            path_or_fileobj=str(zip_path),
-            path_in_repo=zip_filename,
             repo_id=HF_OUTPUT_DATASET_ID,
             repo_type="dataset",
-            commit_message=f"[{FLOW_ID}] Batch {batch_number}: {len(transcription_files)} transcriptions"
         )
-        print(f"[{FLOW_ID}] ✅ Successfully uploaded: {zip_filename}")
-        # Cleanup
-        os.remove(zip_path)
         return True
     except Exception as e:
-        print(f"[{FLOW_ID}] Error zipping and uploading transcriptions: {e}")
         return False
 # --- Core Processing Functions ---
-async def get_available_server(timeout: float = 300.0) -> WhisperServer:
-    """Round-robin selection of an available whisper server."""
-    global server_index
-    start_time = time.time()
-    while True:
-        # Round-robin check for an available server
-        for _ in range(len(servers)):
-            server = servers[server_index]
-            server_index = (server_index + 1) % len(servers)
-            if not server.busy:
-                return server
-        # If all servers are busy, wait for a short period and check again
-        await asyncio.sleep(0.5)
-        # Check if timeout has been reached
-        if time.time() - start_time > timeout:
-            raise TimeoutError(f"Timeout ({timeout}s) waiting for an available whisper server.")
-async def send_audio_for_transcription(audio_path: Path, progress_tracker: Dict) -> Optional[Dict]:
-    """Sends a single audio file to a whisper server for transcription."""
-    MAX_RETRIES = 3
-    for attempt in range(MAX_RETRIES):
-        server = None
-        try:
-            # 1. Get an available server
-            server = await get_available_server()
-            server.busy = True
-            start_time = time.time()
-            if attempt == 0:
-                print(f"[{FLOW_ID}] Starting transcription attempt on {audio_path.name}...")
-            # 2. Prepare request data - keep file open until request is done
-            with audio_path.open('rb') as f:
-                file_content = f.read()
-            form_data = aiohttp.FormData()
-            form_data.add_field('file',
-                                io.BytesIO(file_content),
-                                filename=audio_path.name,
-                                content_type='audio/mpeg')
-            # 3. Send request
-            async with aiohttp.ClientSession() as session:
-                print(f"[{FLOW_ID}] Sending audio file to {server.url}...")
-                async with session.post(server.url, data=form_data, timeout=aiohttp.ClientTimeout(total=600)) as resp:
-                    print(f"[{FLOW_ID}] Received response status: {resp.status}")
-                    if resp.status == 200:
-                        result = await resp.json()
-                        print(f"[{FLOW_ID}] Response data: {result}")
-                        # Check if response contains transcription data
-                        if result.get('text') or result.get('transcription'):
-                            # Update progress counter
-                            progress_tracker['completed'] += 1
-                            if progress_tracker['completed'] % 10 == 0:
-                                print(f"[{FLOW_ID}] PROGRESS: {progress_tracker['completed']}/{progress_tracker['total']} transcriptions completed.")
-                            print(f"[{FLOW_ID}] ✅ Success: {audio_path.name} transcribed by {server.url}")
-                            # Store the full transcription result
-                            return {
-                                "audio_file": audio_path.name,
-                                "text": result.get('text', result.get('transcription', '')),
-                                "language": result.get('language', 'unknown'),
-                                "confidence": result.get('confidence'),
-                                "duration": result.get('duration'),
-                            }
-                        else:
-                            print(f"[{FLOW_ID}] ⚠️ Server {server.url} returned invalid response format for {audio_path.name}. Response: {result}")
-                            continue
-                    else:
-                        error_text = await resp.text()
-                        print(f"[{FLOW_ID}] ❌ Error from server {server.url} for {audio_path.name}: {resp.status} - {error_text}. Retrying...")
-                        continue
-        except (aiohttp.ClientError, asyncio.TimeoutError, TimeoutError) as e:
-            print(f"[{FLOW_ID}] ❌ Connection/Timeout error for {audio_path.name} on {server.url if server else 'unknown server'}: {e}. Retrying...")
-            continue
-        except Exception as e:
-            print(f"[{FLOW_ID}] ❌ Unexpected error during transcription for {audio_path.name}: {e}. Retrying...")
-            import traceback
-            traceback.print_exc()
-            continue
-        finally:
-            if server:
-                end_time = time.time()
-                server.busy = False
-                server.total_processed += 1
-                server.total_time += (end_time - start_time)
-    print(f"[{FLOW_ID}] ❌ FAILED after {MAX_RETRIES} attempts for {audio_path.name}.")
-    return None
-# --- FastAPI App and Endpoints ---
-app = FastAPI(
-    title=f"Flow Server {FLOW_ID} API",
-    description="Processes audio files from a dataset, sends to whisper servers for transcription, and tracks progress.",
-    version="1.0.0"
-)
-@app.on_event("startup")
-async def startup_event():
-    print(f"[{FLOW_ID}] Flow Server started on port {FLOW_PORT}.")
-    print(f"[{FLOW_ID}] 🚀 Auto-starting background processing...")
-    # Create a background task to run the processing loop
-    thread = threading.Thread(target=lambda: asyncio.run(process_audio_files_background()), daemon=True)
-    thread.start()
-    print(f"[{FLOW_ID}] ✅ Background processing thread started")
-@app.post("/process")
-async def process_audio_files(background_tasks: BackgroundTasks):
     """
-    Manually trigger processing endpoint (in addition to auto-start on startup).
-    Orchestrates transcription of audio files with reference file mapping.
     """
-    print(f"[{FLOW_ID}] /process endpoint called, starting additional background task...")
-    background_tasks.add_task(process_audio_files_background)
-    return {
-        "status": "processing_started",
-        "flow_id": FLOW_ID,
-        "message": "Background processing task started. Check /status for progress."
-    }
-async def process_audio_files_background():
     """
-    Background task that processes audio files with reference mapping.
-    - Downloads batch of files (1 per server)
-    - Distributes to Whisper servers in parallel
-    - Uploads JSON results directly to HF dataset
-    - Updates processing state after each batch round (dynamically based on actual processed count)
-    - Respects rate limit: max 120 uploads/hour, stops at 128
     """
-    progress_data = load_progress()
-    reference_map = progress_data.get('reference_map', {})
-    # Fetch reference map if empty
-    if not reference_map:
-        print(f"[{FLOW_ID}] Reference map is empty. Fetching from {REFERENCE_REPO_ID}...")
-        reference_map = await get_reference_map(REFERENCE_REPO_ID)
-        progress_data['reference_map'] = reference_map
-        save_progress(progress_data)
-    audio_files = await get_audio_file_list(progress_data)
-    if not audio_files:
-        print(f"[{FLOW_ID}] No audio files found. Exiting.")
-        return
-    # Dynamic batch size: one file per server
-    BATCH_SIZE = len(servers)
-    print(f"[{FLOW_ID}] 📊 Configuration: {len(servers)} Whisper server(s) → Batch size: {BATCH_SIZE} (1 file per server)")
-    start_index = progress_data['last_processed_index']
-    print(f"[{FLOW_ID}] Starting batch processing from file #{start_index} (out of {len(audio_files)})...")
-    # Process in batches
-    for batch_start in range(start_index, len(audio_files), BATCH_SIZE):
-        batch_end = min(batch_start + BATCH_SIZE, len(audio_files))
-        batch_files = audio_files[batch_start:batch_end]
-        print(f"\n[{FLOW_ID}] 📦 BATCH ROUND: Processing files #{batch_start}-#{batch_end-1} ({len(batch_files)} files)")
-        # Step 1: Download all files in batch in parallel
-        print(f"[{FLOW_ID}] ⬇️  Downloading batch ({len(batch_files)} files)...")
-        download_tasks = []
-        for idx, repo_file_path in enumerate(batch_files):
-            file_index = batch_start + idx
-            download_tasks.append(download_audio_file(file_index, repo_file_path))
-        downloaded_paths = await asyncio.gather(*download_tasks, return_exceptions=True)
-        # Step 2: Send all downloaded files to Whisper servers in parallel
-        print(f"[{FLOW_ID}] 🎤 Distributing to {len(servers)} Whisper server(s) ({len(batch_files)} files)...")
-        transcription_tasks = []
-        file_metadata = []  # Track file info for results
-        for idx, (repo_file_path, audio_path) in enumerate(zip(batch_files, downloaded_paths)):
-            file_index = batch_start + idx
-            audio_filename = Path(repo_file_path).name
-            # Skip if download failed
-            if isinstance(audio_path, Exception):
-                print(f"[{FLOW_ID}] ⏭️  Skipping {audio_filename} (download failed)")
-                continue
-            if not audio_path or not audio_path.exists():
-                continue
-            reference_filename = find_matching_filename(audio_filename, reference_map)
-            file_metadata.append({
-                'audio_filename': audio_filename,
-                'audio_path': audio_path,
-                'reference_filename': reference_filename,
-                'file_index': file_index
-            })
-            # Create transcription task (will be awaited in parallel)
-            transcription_tasks.append(send_audio_for_transcription_task(audio_path, audio_filename))
-        if transcription_tasks:
-            print(f"[{FLOW_ID}] ⏳ Waiting for {len(transcription_tasks)} transcriptions (parallel)...")
-            transcription_results = await asyncio.gather(*transcription_tasks, return_exceptions=True)
-            # Step 3: Upload transcriptions directly to HF dataset
-            successful_uploads = 0
-            uploaded_files = []
-            state = await download_hf_state()
-            print(f"[{FLOW_ID}] 📤 Uploading {len([r for r in transcription_results if r and not isinstance(r, Exception)])}/{len(transcription_results)} transcriptions directly to dataset...")
-            for metadata, result in zip(file_metadata, transcription_results):
-                if isinstance(result, Exception):
-                    print(f"[{FLOW_ID}] ❌ Transcription failed for {metadata['audio_filename']}: {result}")
                     continue
-                if result:
-                    # Save JSON locally first
-                    json_filename = Path(metadata['reference_filename']).stem if metadata['reference_filename'] else Path(metadata['audio_filename']).stem
-                    json_file_path = Path(RESULTS_DIR) / f"{json_filename}.json"
-                    # Write JSON to file
-                    with open(json_file_path, 'w', encoding='utf-8') as f:
-                        json.dump(result, f, indent=2, ensure_ascii=False)
-                    # Upload directly to HF dataset
-                    if await upload_json_to_dataset(json_file_path, f"{json_filename}.json"):
-                        successful_uploads += 1
-                        uploaded_files.append(json_file_path)
-                        progress_data['transcription_count'] += 1
-                    # Cleanup local JSON file after upload
-                    if json_file_path.exists():
-                        os.remove(json_file_path)
-            # Step 4: Cleanup downloaded audio files
-            for metadata in file_metadata:
-                if metadata['audio_path'].exists():
-                    os.remove(metadata['audio_path'])
-            # Step 5: Update processing state after this batch round
-            # Update next_download_index based on actual files processed this round
-            files_processed_this_round = len([m for m in file_metadata if m])  # Count of files actually processed
-            new_download_index = batch_start + files_processed_this_round
-            print(f"[{FLOW_ID}] 🔄 Batch round complete: {files_processed_this_round} files distributed and processed")
-            print(f"[{FLOW_ID}] 📊 Updating state: next_download_index {state['next_download_index']} → {new_download_index}")
-            state['next_download_index'] = new_download_index
-            # Mark all files in this round as processed in the state
-            for metadata in file_metadata:
-                state['file_states'][metadata['audio_filename']] = "processed"
-            # Upload updated state
             await upload_hf_state(state)
-            # Save local progress
-            progress_data['last_processed_index'] = batch_end
-            save_progress(progress_data)
-            print(f"[{FLOW_ID}] ✅ State updated. Successful uploads this round: {successful_uploads}/{len(file_metadata)}")
-    print(f"\n[{FLOW_ID}] ✅ ALL DONE! Total transcriptions: {progress_data['transcription_count']}")
-async def send_audio_for_transcription_task(audio_path: Path, audio_filename: str) -> Optional[Dict]:
-    """Wrapper for transcription that can be used in asyncio.gather."""
-    MAX_RETRIES = 3
-    for attempt in range(MAX_RETRIES):
-        server = None
-        try:
-            server = await get_available_server()
-            server.busy = True
-            start_time = time.time()
-            # Read file content once
-            with audio_path.open('rb') as f:
-                file_content = f.read()
-            form_data = aiohttp.FormData()
-            form_data.add_field('file',
-                                io.BytesIO(file_content),
-                                filename=audio_filename,
-                                content_type='audio/mpeg')
-            async with aiohttp.ClientSession() as session:
-                async with session.post(server.url, data=form_data, timeout=aiohttp.ClientTimeout(total=600)) as resp:
-                    if resp.status == 200:
-                        result = await resp.json()
-                        if result.get('text') or result.get('transcription'):
-                            print(f"[{FLOW_ID}] ✅ {audio_filename}")
-                            return {
-                                "audio_file": audio_filename,
-                                "text": result.get('text', result.get('transcription', '')),
-                                "language": result.get('language', 'unknown'),
-                                "confidence": result.get('confidence'),
-                                "duration": result.get('duration'),
-                            }
-                        else:
-                            print(f"[{FLOW_ID}] ⚠️  Invalid response for {audio_filename}")
-                            continue
-                    else:
-                        error_text = await resp.text()
-                        print(f"[{FLOW_ID}] ❌ Server error {resp.status}: {audio_filename}")
-                        continue
-        except (aiohttp.ClientError, asyncio.TimeoutError, TimeoutError) as e:
-            print(f"[{FLOW_ID}] ⏱️  Timeout/Connection error for {audio_filename}")
-            continue
-        except Exception as e:
-            print(f"[{FLOW_ID}] ❌ Error for {audio_filename}: {str(e)[:50]}")
-            continue
-        finally:
-            if server:
-                end_time = time.time()
-                server.busy = False
-                server.total_processed += 1
-                server.total_time += (end_time - start_time)
-    return None
 @app.get("/")
 async def root():
     progress = load_progress()
     return {
         "flow_id": FLOW_ID,
         "status": "ready",
-        "last_processed_index": progress['last_processed_index'],
         "total_files_in_list": len(progress['file_list']),
-        "processed_files_count": len(progress['processed_files']),
-        "transcription_count": progress.get('transcription_count', 0),
         "total_servers": len(servers),
-        "busy_servers": sum(1 for s in servers if s.busy),
     }
-@app.get("/status")
-async def get_status():
-    """Returns detailed processing status with reference map info."""
-    progress = load_progress()
-    state = await download_hf_state()
-    return {
-        "flow_id": FLOW_ID,
-        "status": "processing" if state['next_download_index'] < len(progress.get('file_list', [])) else "idle",
-        "progress": {
-            "current_index": state['next_download_index'],
-            "total_files": len(progress.get('file_list', [])),
-            "percentage": (state['next_download_index'] / len(progress.get('file_list', [])) * 100) if progress.get('file_list') else 0
-        },
-        "transcription_count": progress.get('transcription_count', 0),
-        "reference_map_size": len(progress.get('reference_map', {})),
-        "server_stats": {
-            "total_servers": len(servers),
-            "busy_servers": sum(1 for s in servers if s.busy),
-            "details": [
-                {
-                    "url": s.url,
-                    "busy": s.busy,
-                    "total_processed": s.total_processed,
-                    "avg_time_per_file": s.total_time / s.total_processed if s.total_processed > 0 else 0
-                }
-                for s in servers
-            ]
-        },
-        "files_in_processing": list(state.get('file_states', {}).keys())
-    }
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=FLOW_PORT)

 import aiohttp
 import zipfile
 import shutil
 from typing import Dict, List, Set, Optional, Tuple, Any
 from urllib.parse import quote
+from datetime import datetime
 from pathlib import Path
 import io
 from huggingface_hub import HfApi, hf_hub_download
 # --- Configuration ---
+AUTO_START_INDEX = 1  # Hardcoded default start index if no progress is found
 FLOW_ID = os.getenv("FLOW_ID", "flow_default")
 FLOW_PORT = int(os.getenv("FLOW_PORT", 8001))
 HF_TOKEN = os.getenv("HF_TOKEN", "")
 HF_AUDIO_DATASET_ID = os.getenv("HF_AUDIO_DATASET_ID", "Samfredoly/BG_Vid")
+HF_OUTPUT_DATASET_ID = os.getenv("HF_OUTPUT_DATASET_ID", "samfred2/ATO")
 # Progress and State Tracking
 PROGRESS_FILE = Path("processing_progress.json")
 LOCAL_STATE_FOLDER = Path(".state")
 LOCAL_STATE_FOLDER.mkdir(exist_ok=True)
+# Processing configuration
+MAX_UPLOADS_BEFORE_PAUSE = 120  # Pause uploading after 120 files
+UPLOAD_PAUSE_ENABLED = True
+# Directory within the HF dataset where the audio files are located
+AUDIO_FILE_PREFIX = "audio/"
 WHISPER_SERVERS = [
     "https://fred1012-switch3.hf.space/transcribe",
     "https://Eliasishere-mint-20.hf.space/transcribe"
 ]
 # Temporary storage for audio files
 TEMP_DIR = Path(f"temp_audio_{FLOW_ID}")
 TEMP_DIR.mkdir(exist_ok=True)
 # --- Models ---
+# Request body accepted by the POST /start_processing endpoint.
+class ProcessStartRequest(BaseModel):
+    # 1-indexed position in the file list; validated to be >= 1 and defaulting to AUTO_START_INDEX.
+    start_index: int = Field(AUTO_START_INDEX, ge=1, description="The index number of the audio file to start processing from (1-indexed).")
 class WhisperServer:
+    """Bookkeeping for one remote Whisper endpoint: busy flag, current file and throughput counters."""
+    def __init__(self, url: str):
+        # Endpoint address and lifetime counters
         self.url = url
         self.total_processed = 0
+        self.total_time = 0.0
+        # Assignment state: idle until assign_file() is called
+        self.current_file_index: Optional[int] = None
+        self.is_processing = False
     @property
     def fps(self):
+        """Average throughput in files per second (0 while no time has been recorded)."""
+        if self.total_time > 0:
+            return self.total_processed / self.total_time
+        return 0
+    def assign_file(self, file_index: int):
+        """Mark the server busy with the given file index."""
+        self.current_file_index = file_index
+        self.is_processing = True
+    def release(self):
+        """Mark the server idle and forget its current file."""
+        self.current_file_index = None
+        self.is_processing = False
 # Global state for whisper servers
 servers = [WhisperServer(url) for url in WHISPER_SERVERS]
+# Serialises coroutine access to the servers' busy flags (an asyncio.Lock coordinates coroutines, it is not an OS-thread lock)
+server_lock = asyncio.Lock()
+# --- Progress and State Management Functions ---
 def load_progress() -> Dict:
+    """Return the progress dict stored in PROGRESS_FILE, or a fresh default structure.
+    The default is used when the file does not exist or does not contain valid JSON (the latter also logs a warning).
+    """
+    fresh_progress = {
+        "last_processed_index": 0,
+        "processed_files": {},  # {index: repo_path}
+        "file_list": [],  # cached list of dataset files
+    }
+    if not PROGRESS_FILE.exists():
+        return fresh_progress
+    try:
+        with PROGRESS_FILE.open('r') as handle:
+            stored_progress = json.load(handle)
+    except json.JSONDecodeError:
+        print(f"[{FLOW_ID}] WARNING: Progress file is corrupted. Starting fresh.")
+        return fresh_progress
+    return stored_progress
 def save_progress(progress_data: Dict):
                 if "next_download_index" not in data:
                     data["next_download_index"] = 0
                 return data
         except json.JSONDecodeError:
             print(f"[{FLOW_ID}] WARNING: Corrupted state file: {file_path}")
+    return default_value
 def save_json_state(file_path: str, data: Dict[str, Any]):
     """Save state to JSON file"""
 async def download_hf_state() -> Dict[str, Any]:
     """Downloads the state file from Hugging Face or returns a default state."""
     local_path = LOCAL_STATE_FOLDER / HF_STATE_FILE
+    default_state = {"next_download_index": 0, "file_states": {}}
     try:
+        # Check if the file exists in the helium repo
         files = HfApi(token=HF_TOKEN).list_repo_files(
             repo_id=HF_OUTPUT_DATASET_ID,
             repo_type="dataset"
         # Save state locally first
         save_json_state(str(local_path), state)
+        # Upload to helium dataset
         HfApi(token=HF_TOKEN).upload_file(
             path_or_fileobj=str(local_path),
             path_in_repo=HF_STATE_FILE,
             repo_id=HF_OUTPUT_DATASET_ID,
             repo_type="dataset",
+            commit_message=f"Update caption processing state: next_index={state['next_download_index']}"
         )
         print(f"[{FLOW_ID}] Successfully uploaded state file.")
         return True
         print(f"[{FLOW_ID}] Failed to upload state file: {str(e)}")
         return False
+async def lock_file_for_processing(zip_filename: str, state: Dict[str, Any]) -> bool:
+    """Flag ``zip_filename`` as "processing" in ``state`` and publish the state to establish the lock.
+    Returns True when the state upload succeeds. On failure the entry is removed from
+    ``state["file_states"]`` again and False is returned.
+    """
+    print(f"[{FLOW_ID}] 🔒 Attempting to lock file: {zip_filename}")
+    file_states = state["file_states"]
+    file_states[zip_filename] = "processing"
+    # The lock only counts once the remote state file carries it
+    lock_published = await upload_hf_state(state)
+    if not lock_published:
+        print(f"[{FLOW_ID}] ❌ Failed to lock file: {zip_filename}")
+        # Undo the local change so the in-memory state matches the remote one
+        state["file_states"].pop(zip_filename, None)
+        return False
+    print(f"[{FLOW_ID}] ✅ Successfully locked file: {zip_filename}")
+    return True
+async def unlock_file_as_processed(zip_filename: str, state: Dict[str, Any], next_index: int) -> bool:
+    """Record ``zip_filename`` as "processed", set ``next_download_index`` to ``next_index``, and publish the state.
+    Returns True when the state upload succeeds, False otherwise; the in-memory ``state``
+    keeps the new values in both cases.
+    """
+    print(f"[{FLOW_ID}] 🔓 Marking file as processed: {zip_filename}")
+    state.update({"next_download_index": next_index})
+    state["file_states"][zip_filename] = "processed"
+    state_published = await upload_hf_state(state)
+    if not state_published:
+        print(f"[{FLOW_ID}] ❌ Failed to update state for: {zip_filename}")
+        return False
+    print(f"[{FLOW_ID}] ✅ Successfully marked as processed: {zip_filename}")
+    return True
 # --- Hugging Face Utility Functions ---
 async def get_audio_file_list(progress_data: Dict) -> List[str]:
     """
+    Fetches the list of all WAV files from the dataset, or uses the cached list.
     Updates the progress_data with the file list if a new list is fetched.
     """
     if progress_data['file_list']:
         print(f"[{FLOW_ID}] Using cached file list with {len(progress_data['file_list'])} files.")
         return progress_data['file_list']
+    print(f"[{FLOW_ID}] Fetching full list of WAV files from {HF_AUDIO_DATASET_ID}...")
     try:
         api = HfApi(token=HF_TOKEN)
         repo_files = api.list_repo_files(
             repo_type="dataset"
         )
+        # Filter for WAV files and sort them alphabetically for consistent indexing
+        wav_files = sorted([
             f for f in repo_files
+            if f.endswith('.wav')
         ])
+        if not wav_files:
+            raise FileNotFoundError(f"No WAV files found in dataset '{HF_AUDIO_DATASET_ID}'.")
+        print(f"[{FLOW_ID}] Found {len(wav_files)} WAV files.")
         # Update and save the progress data
+        progress_data['file_list'] = wav_files
         save_progress(progress_data)
+        return wav_files
     except Exception as e:
         print(f"[{FLOW_ID}] Error fetching file list from Hugging Face: {e}")
         return []
+async def download_wav_file_by_index(file_index: int, repo_file_full_path: str) -> Optional[Path]:
+    """Downloads a WAV file from the repository.
+    ``file_index`` is used for logging only; ``repo_file_full_path`` is the file's path inside
+    HF_AUDIO_DATASET_ID. Returns the local path reported by hf_hub_download, or None when the
+    download raises (the error is logged).
+    """
+    print(f"[{FLOW_ID}] Downloading file #{file_index}: {repo_file_full_path}")
     try:
+        # hf_hub_download is a blocking call: run it in the default executor so that
+        # transcription requests already in flight keep progressing on the event loop.
+        loop = asyncio.get_running_loop()
+        wav_path = await loop.run_in_executor(
+            None,
+            lambda: hf_hub_download(
+                repo_id=HF_AUDIO_DATASET_ID,
+                filename=repo_file_full_path,
+                repo_type="dataset",
+                token=HF_TOKEN,
+            ),
+        )
+        print(f"[{FLOW_ID}] Downloaded WAV file to {wav_path}")
+        return Path(wav_path)
     except Exception as e:
+        print(f"[{FLOW_ID}] Error downloading WAV file {repo_file_full_path}: {e}")
         return None
+async def upload_transcription_to_hf(wav_filename: str, transcription_data: Dict) -> bool:
+    """Publish one transcription as a JSON file in HF_OUTPUT_DATASET_ID.
+    The target name is the WAV's base name with a .json suffix. Returns True on success;
+    any exception is logged and reported as False.
+    """
+    json_filename = Path(wav_filename).with_suffix('.json').name
+    try:
+        print(f"[{FLOW_ID}] Uploading transcription for {wav_filename} as {json_filename} to {HF_OUTPUT_DATASET_ID}...")
+        # Serialise in memory; nothing is written to local disk
+        serialized = json.dumps(transcription_data, indent=2, ensure_ascii=False)
+        payload = io.BytesIO(serialized.encode('utf-8'))
+        hub_client = HfApi(token=HF_TOKEN)
+        hub_client.upload_file(
+            path_or_fileobj=payload,
+            path_in_repo=json_filename,
             repo_id=HF_OUTPUT_DATASET_ID,
             repo_type="dataset",
+            commit_message=f"[{FLOW_ID}] Transcription for {wav_filename}"
         )
+        print(f"[{FLOW_ID}] Successfully uploaded transcription for {wav_filename}.")
         return True
+    except Exception as upload_error:
+        print(f"[{FLOW_ID}] Error uploading transcription for {wav_filename}: {upload_error}")
         return False
 # --- Core Processing Functions ---
+async def send_audio_to_whisper(wav_path: Path, server: WhisperServer) -> Optional[Dict]:
+    """Sends a WAV file to a Whisper server for transcription.
+    Returns a dict with the file name, the server's JSON reply, an ISO timestamp and the
+    elapsed seconds. Returns None on a non-200 reply, a timeout, or any other error; each
+    failure is logged. On success the server's total_processed / total_time counters are updated.
+    """
+    try:
+        print(f"[{FLOW_ID}] Sending {wav_path.name} to {server.url}...")
+        start_time = time.time()
+        # 10 minute timeout for transcription
+        request_timeout = aiohttp.ClientTimeout(total=600)
+        # Open the audio inside a context manager so the handle is closed even when
+        # the request fails before the body has been sent.
+        with wav_path.open('rb') as audio_file:
+            # Prepare multipart form data
+            form_data = aiohttp.FormData()
+            form_data.add_field('file',
+                                audio_file,
+                                filename=wav_path.name,
+                                content_type='audio/wav')
+            async with aiohttp.ClientSession() as session:
+                async with session.post(server.url, data=form_data, timeout=request_timeout) as resp:
+                    if resp.status != 200:
+                        error_text = await resp.text()
+                        print(f"[{FLOW_ID}] ✗ Error from {server.url}: {resp.status} - {error_text}")
+                        return None
+                    result = await resp.json()
+                    end_time = time.time()
+                    # Update server stats
+                    server.total_processed += 1
+                    server.total_time += (end_time - start_time)
+                    print(f"[{FLOW_ID}] ✓ {wav_path.name} transcribed successfully by {server.url}")
+                    return {
+                        "file": wav_path.name,
+                        "transcription": result,
+                        "timestamp": datetime.now().isoformat(),
+                        "processing_time_seconds": end_time - start_time
+                    }
+    except asyncio.TimeoutError:
+        print(f"[{FLOW_ID}] ✗ Timeout from {server.url} for {wav_path.name}")
+        return None
+    except Exception as e:
+        print(f"[{FLOW_ID}] ✗ Exception on {server.url} for {wav_path.name}: {e}")
+        return None
+async def get_available_servers() -> List[WhisperServer]:
+    """Return the servers whose is_processing flag is currently False.
+    The snapshot is taken while holding server_lock; it does not reserve any server.
+    """
+    async with server_lock:
+        return [candidate for candidate in servers if not candidate.is_processing]
+async def assign_file_to_server(file_index: int, server: WhisperServer):
+    """Mark ``server`` as busy with ``file_index`` while holding server_lock."""
+    async with server_lock:
+        server.assign_file(file_index=file_index)
+async def release_server(server: WhisperServer):
+    """Mark ``server`` as idle again while holding server_lock."""
+    async with server_lock:
+        WhisperServer.release(server)
+async def process_batch_dynamic(wav_files: List[str], start_batch_index: int, batch_size: int, state: Dict[str, Any], progress: Dict) -> Tuple[int, int]:
     """
+    Dynamically processes a batch of WAV files using available servers.
+    Files wav_files[start_batch_index:start_batch_index + batch_size] are downloaded one at a
+    time and handed to an idle server. Finished transcriptions are uploaded until the upload
+    cap is reached, and each file's status is recorded in state["file_states"].
+    Errors are logged, not raised.
+    Returns (next_batch_index, uploaded_count)
     """
+    batch_end = min(start_batch_index + batch_size, len(wav_files))
+    current_index = start_batch_index
+    uploaded_count = progress.get('uploaded_count', 0)
+    # Maps each in-flight transcription task to (file_index, wav_path, server)
+    pending_tasks: Dict[asyncio.Task, Tuple[int, Path, WhisperServer]] = {}
+    print(f"[{FLOW_ID}] Processing batch from index {start_batch_index} to {batch_end}")
+    try:
+        while current_index < batch_end or pending_tasks:
+            # Assign new files to available servers
+            while current_index < batch_end:
+                available_servers = await get_available_servers()
+                if not available_servers:
+                    if pending_tasks:
+                        # Servers are only released when their results are collected
+                        # below, so stop assigning and collect; waiting here would
+                        # never free a server.
+                        break
+                    # Nothing of ours is in flight: the servers are held elsewhere, wait a bit
+                    await asyncio.sleep(0.5)
                     continue
+                server = available_servers[0]
+                file_index = current_index
+                wav_file = wav_files[file_index]
+                wav_filename = Path(wav_file).name
+                # Mark file as processing in state
+                state["file_states"][wav_filename] = "processing"
+                # Download the WAV file (logged with its 1-indexed position)
+                wav_path = await download_wav_file_by_index(file_index + 1, wav_file)
+                if not wav_path:
+                    state["file_states"][wav_filename] = "failed"
+                    current_index += 1
+                    continue
+                # Assign to server and create task
+                await assign_file_to_server(file_index, server)
+                task = asyncio.create_task(send_audio_to_whisper(wav_path, server))
+                pending_tasks[task] = (file_index, wav_path, server)
+                current_index += 1
+            # Wait for at least one task to complete
+            if pending_tasks:
+                done, _ = await asyncio.wait(
+                    pending_tasks.keys(),
+                    return_when=asyncio.FIRST_COMPLETED
+                )
+                # Process completed tasks; popping them leaves only the unfinished ones in pending_tasks
+                for task in done:
+                    file_index, wav_path, server = pending_tasks.pop(task)
+                    wav_filename = Path(wav_path).name
+                    try:
+                        transcription_result = task.result()
+                        if transcription_result:
+                            # Check if we should pause uploading
+                            if UPLOAD_PAUSE_ENABLED and uploaded_count >= MAX_UPLOADS_BEFORE_PAUSE:
+                                print(f"[{FLOW_ID}] ⏸️  Upload limit reached ({uploaded_count}/{MAX_UPLOADS_BEFORE_PAUSE}). Pausing uploads but continuing processing...")
+                                # Mark as processed but don't upload
+                                state["file_states"][wav_filename] = "processed"
+                            else:
+                                # Upload transcription
+                                if await upload_transcription_to_hf(wav_filename, transcription_result):
+                                    state["file_states"][wav_filename] = "processed"
+                                    uploaded_count += 1
+                                    progress['uploaded_count'] = uploaded_count
+                                    save_progress(progress)
+                                else:
+                                    state["file_states"][wav_filename] = "failed"
+                        else:
+                            state["file_states"][wav_filename] = "failed"
+                    except Exception as e:
+                        print(f"[{FLOW_ID}] Error processing result for {wav_filename}: {e}")
+                        state["file_states"][wav_filename] = "failed"
+                    finally:
+                        # Release the server
+                        await release_server(server)
+                        # Clean up the WAV file
+                        if wav_path.exists():
+                            wav_path.unlink()
+            # Update HF state periodically
             await upload_hf_state(state)
+    except Exception as e:
+        print(f"[{FLOW_ID}] Error in process_batch_dynamic: {e}")
+        # Requests still in flight would otherwise keep their servers marked busy
+        # forever: cancel them and hand the servers back.
+        for task, (_, _, server) in pending_tasks.items():
+            task.cancel()
+            await release_server(server)
+        pending_tasks.clear()
+    return current_index, uploaded_count
+async def process_dataset_task(start_index: int):
+    """Main task to process the dataset using dynamic server assignment.
+    Transcribes every file from ``start_index`` (1-indexed) to the end of the file list, in
+    batches of two files per server, saving local progress after each batch.
+    Returns True when the list is exhausted or ``start_index`` lies past its end. Returns False
+    when the file list is empty, a batch makes no progress, or an unexpected error occurs.
+    """
+    # Load both local progress and HF state
+    progress = load_progress()
+    current_state = await download_hf_state()
+    file_list = await get_audio_file_list(progress)
+    if not file_list:
+        print(f"[{FLOW_ID}] ERROR: Cannot proceed. File list is empty.")
+        return False
+    # Ensure start_index is within bounds
+    if start_index > len(file_list):
+        print(f"[{FLOW_ID}] WARNING: Start index {start_index} is greater than the total number of files ({len(file_list)}). Exiting.")
+        return True
+    # Convert to a 0-indexed list position; clamp so a start_index below 1 cannot
+    # turn into a negative index that addresses the end of the list.
+    start_list_index = max(start_index - 1, 0)
+    print(f"[{FLOW_ID}] Starting audio transcription from file index: {start_index} out of {len(file_list)}.")
+    print(f"[{FLOW_ID}] Using {len(servers)} Whisper servers for dynamic processing.")
+    print(f"[{FLOW_ID}] Upload pause enabled: {UPLOAD_PAUSE_ENABLED}, Max uploads before pause: {MAX_UPLOADS_BEFORE_PAUSE}")
+    # Initialize progress tracking
+    progress.setdefault('uploaded_count', 0)
+    current_batch_index = start_list_index
+    batch_size = len(servers) * 2  # Two files per server in each batch
+    try:
+        while current_batch_index < len(file_list):
+            # Process a batch dynamically
+            next_index, uploaded_count = await process_batch_dynamic(
+                file_list,
+                current_batch_index,
+                batch_size,
+                current_state,
+                progress
+            )
+            if next_index <= current_batch_index:
+                # process_batch_dynamic logs its own errors instead of raising, and an
+                # empty server list yields an empty batch; retrying the same index
+                # would loop forever.
+                print(f"[{FLOW_ID}] ERROR: Batch starting at list index {current_batch_index} made no progress. Stopping.")
+                return False
+            # Update progress
+            progress['last_processed_index'] = next_index
+            progress['uploaded_count'] = uploaded_count
+            save_progress(progress)
+            # Update current batch index
+            current_batch_index = next_index
+            # Log statistics
+            print(f"[{FLOW_ID}] Batch complete. Progress: {current_batch_index}/{len(file_list)}, Uploaded: {uploaded_count}")
+            # Print server statistics
+            print(f"[{FLOW_ID}] Server Statistics:")
+            for i, server in enumerate(servers):
+                print(f"  Server {i+1}: {server.total_processed} files, {server.total_time:.2f}s total, {server.fps:.2f} files/sec")
+        print(f"[{FLOW_ID}] All files processed successfully!")
+        return True
+    except Exception as e:
+        print(f"[{FLOW_ID}] Critical error in process_dataset_task: {e}")
+        return False
+# --- FastAPI App and Endpoints ---
+app = FastAPI(
+    title=f"Flow Server {FLOW_ID} API",
+    # Shown in the generated API docs; describes the audio-transcription pipeline this file implements
+    description="Transcribes WAV files from a dataset with a pool of Whisper servers, uploads the results, and tracks progress.",
+    version="1.0.0"
+)
+@app.on_event("startup")
+async def startup_event():
+    """Choose the resume index and launch dataset processing in the background.
+    A positive next_download_index in the Hugging Face state takes precedence. Otherwise
+    processing resumes one past the local last_processed_index, never below AUTO_START_INDEX.
+    """
+    print(f"Flow Server {FLOW_ID} started on port {FLOW_PORT}.")
+    # Get both local progress and HF state
+    progress = load_progress()
+    current_state = await download_hf_state()
+    # Get the next_download_index from HF state if available
+    hf_next_index = current_state.get("next_download_index", 0)
+    # Any positive index in the HF state wins over local progress
+    # NOTE(review): the value is passed on unchanged as a 1-indexed start - confirm that
+    # matches how next_download_index is written.
+    if hf_next_index > 0:
+        start_index = hf_next_index
+        print(f"[{FLOW_ID}] Using next_download_index from HF state: {start_index}")
+    else:
+        # Fall back to local progress if HF state doesn't have a meaningful index
+        start_index = max(progress.get('last_processed_index', 0) + 1, AUTO_START_INDEX)
+    print(f"[{FLOW_ID}] Auto-starting processing from index: {start_index}...")
+    # Run the long job as a task so server startup is not blocked. The event loop keeps
+    # only a weak reference to tasks, so store a strong one on the app to prevent the
+    # run from being garbage-collected mid-flight.
+    app.state.processing_task = asyncio.create_task(process_dataset_task(start_index))
 @app.get("/")
 async def root():
+    """Return a status summary built from the local progress file and the in-memory server counters."""
     progress = load_progress()
+    uploaded_count = progress.get('uploaded_count', 0)
+    # Calculate server stats
+    total_processed = sum(s.total_processed for s in servers)
+    total_time = sum(s.total_time for s in servers)
+    avg_fps = total_processed / total_time if total_time > 0 else 0
     return {
         "flow_id": FLOW_ID,
         "status": "ready",
+        "last_processed_index": progress.get('last_processed_index', 0),
+        # .get keeps the endpoint working when the progress file lacks this key
+        "total_files_in_list": len(progress.get('file_list', [])),
+        "uploaded_count": uploaded_count,
         "total_servers": len(servers),
+        "processing_servers": sum(1 for s in servers if s.is_processing),
+        "total_files_processed_by_servers": total_processed,
+        "avg_files_per_second": avg_fps,
+        # Same condition the processing loop uses: uploads only pause when the feature is enabled
+        "upload_limit_paused": UPLOAD_PAUSE_ENABLED and uploaded_count >= MAX_UPLOADS_BEFORE_PAUSE
     }
+@app.post("/start_processing")
+async def start_processing(request: ProcessStartRequest, background_tasks: BackgroundTasks):
+    """
+    Schedules process_dataset_task from the requested 1-indexed start position as a background
+    task and returns immediately.
+    NOTE(review): nothing visible here stops a run that was auto-started at startup, so two
+    runs may share the same server pool - confirm this is intended.
+    """
+    start_index = request.start_index
+    print(f"[{FLOW_ID}] Received request to start processing from index: {start_index}. Starting background task.")
+    # Start the heavy processing in a background task so the API call returns immediately
+    # Note: The server is already auto-starting, but this allows for manual restart/override.
+    background_tasks.add_task(process_dataset_task, start_index)
+    return {"status": "processing", "start_index": start_index, "message": "Dataset processing started in background."}
 if __name__ == "__main__":
     import uvicorn
+    # Bind to all interfaces (0.0.0.0) so the port is exposed when running inside the sandbox.
     uvicorn.run(app, host="0.0.0.0", port=FLOW_PORT)