Samfredoly committed on
Commit
424a01e
·
verified ·
1 Parent(s): 963cadd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -60
app.py CHANGED
@@ -8,7 +8,7 @@ import shutil
8
  import threading
9
  from typing import Dict, List, Set, Optional, Tuple, Any
10
  from urllib.parse import quote
11
- from datetime import datetime
12
  from pathlib import Path
13
  import io
14
 
@@ -17,26 +17,24 @@ from pydantic import BaseModel, Field
17
  from huggingface_hub import HfApi, hf_hub_download
18
 
19
  # --- Configuration ---
20
- AUTO_START_INDEX = 1290 # Hardcoded default start index if no progress is found
21
  FLOW_ID = os.getenv("FLOW_ID", "flow_default")
22
  FLOW_PORT = int(os.getenv("FLOW_PORT", 8001))
23
  HF_TOKEN = os.getenv("HF_TOKEN", "")
24
- HF_AUDIO_DATASET_ID = os.getenv("HF_AUDIO_DATASET_ID", "Samfredoly/BG_Vid") # Source dataset for audio files
25
- HF_OUTPUT_DATASET_ID = os.getenv("HF_OUTPUT_DATASET_ID", "samfred2/AT2") # Target dataset for transcriptions
26
 
27
  # Progress and State Tracking
28
  PROGRESS_FILE = Path("processing_progress.json")
29
- HF_STATE_FILE = "processing_state_transcriptions.json" # State file in output dataset
30
- LOCAL_STATE_FOLDER = Path(".state") # Local folder for state file
31
  LOCAL_STATE_FOLDER.mkdir(exist_ok=True)
32
 
33
- # Directory within the HF dataset where audio files are located
34
  AUDIO_FILE_PREFIX = "audio/"
35
 
36
  # Reference dataset for filename mapping
37
- REFERENCE_REPO_ID = os.getenv("REFERENCE_REPO_ID", "Fred808/BG3") # For matching audio to reference files
38
 
39
- # Whisper server endpoints
40
  WHISPER_SERVERS = [
41
  "https://fred1012-switch3.hf.space/transcribe",
42
  "https://Eliasishere-mint-2.hf.space/transcribe",
@@ -84,14 +82,60 @@ class WhisperServer:
84
  def fps(self):
85
  return self.total_processed / self.total_time if self.total_time > 0 else 0
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  # Global state for whisper servers
88
  servers = [WhisperServer(url) for url in WHISPER_SERVERS]
89
  server_index = 0
90
 
91
- # --- Progress and State Management Functions ---
92
 
93
  def load_progress() -> Dict:
94
- """Loads the local processing progress from the JSON file."""
95
  if PROGRESS_FILE.exists():
96
  try:
97
  with PROGRESS_FILE.open('r') as f:
@@ -99,13 +143,13 @@ def load_progress() -> Dict:
99
  except json.JSONDecodeError:
100
  print(f"[{FLOW_ID}] WARNING: Progress file is corrupted. Starting fresh.")
101
 
102
- # Default structure
103
  return {
104
  "last_processed_index": 0,
105
- "processed_files": {}, # {index: audio_file_path}
106
- "file_list": [], # Full list of all audio files found in the dataset
107
- "transcription_count": 0, # Count of transcriptions saved
108
- "reference_map": {} # Mapping from audio filename to reference filename
109
  }
110
 
111
  def save_progress(progress_data: Dict):
@@ -137,7 +181,7 @@ def load_json_state(file_path: str, default_value: Dict[str, Any]) -> Dict[str,
137
  return data
138
  except json.JSONDecodeError:
139
  print(f"[{FLOW_ID}] WARNING: Corrupted state file: {file_path}")
140
- return default_value
141
 
142
  def save_json_state(file_path: str, data: Dict[str, Any]):
143
  """Save state to JSON file"""
@@ -334,6 +378,32 @@ async def download_audio_file(file_index: int, repo_file_full_path: str) -> Opti
334
  print(f"[{FLOW_ID}] Error downloading audio file {repo_file_full_path}: {e}")
335
  return None
336
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
  async def zip_and_upload_transcriptions(transcription_files: List[Path], batch_number: int) -> bool:
338
  """Zips transcription JSON files and uploads to dataset with batch numbering."""
339
  if not transcription_files:
@@ -504,7 +574,14 @@ async def process_audio_files(background_tasks: BackgroundTasks):
504
  }
505
 
506
  async def process_audio_files_background():
507
- """Background task that processes audio files with reference mapping using batch distribution."""
 
 
 
 
 
 
 
508
  progress_data = load_progress()
509
  reference_map = progress_data.get('reference_map', {})
510
 
@@ -525,8 +602,6 @@ async def process_audio_files_background():
525
  print(f"[{FLOW_ID}] 📊 Configuration: {len(servers)} Whisper server(s) → Batch size: {BATCH_SIZE} (1 file per server)")
526
 
527
  start_index = progress_data['last_processed_index']
528
- transcription_files = []
529
- batch_number = 1
530
 
531
  print(f"[{FLOW_ID}] Starting batch processing from file #{start_index} (out of {len(audio_files)})...")
532
 
@@ -535,7 +610,7 @@ async def process_audio_files_background():
535
  batch_end = min(batch_start + BATCH_SIZE, len(audio_files))
536
  batch_files = audio_files[batch_start:batch_end]
537
 
538
- print(f"\n[{FLOW_ID}] 📦 BATCH: Processing files #{batch_start}-#{batch_end-1} ({len(batch_files)} files)")
539
 
540
  # Step 1: Download all files in batch in parallel
541
  print(f"[{FLOW_ID}] ⬇️ Downloading batch ({len(batch_files)} files)...")
@@ -579,9 +654,12 @@ async def process_audio_files_background():
579
  print(f"[{FLOW_ID}] ⏳ Waiting for {len(transcription_tasks)} transcriptions (parallel)...")
580
  transcription_results = await asyncio.gather(*transcription_tasks, return_exceptions=True)
581
 
582
- # Step 3: Save transcriptions locally (don't upload individually)
583
- successful = len([r for r in transcription_results if not isinstance(r, Exception) and r])
584
- print(f"[{FLOW_ID}] 💾 Saving {successful}/{len(transcription_results)} transcriptions locally...")
 
 
 
585
 
586
  for metadata, result in zip(file_metadata, transcription_results):
587
  if isinstance(result, Exception):
@@ -589,7 +667,7 @@ async def process_audio_files_background():
589
  continue
590
 
591
  if result:
592
- # Save JSON locally
593
  json_filename = Path(metadata['reference_filename']).stem if metadata['reference_filename'] else Path(metadata['audio_filename']).stem
594
  json_file_path = Path(RESULTS_DIR) / f"{json_filename}.json"
595
 
@@ -597,49 +675,43 @@ async def process_audio_files_background():
597
  with open(json_file_path, 'w', encoding='utf-8') as f:
598
  json.dump(result, f, indent=2, ensure_ascii=False)
599
 
600
- transcription_files.append(json_file_path)
601
- progress_data['transcription_count'] += 1
 
 
 
602
 
603
- # Mark as processed
604
- state = await download_hf_state()
605
- await unlock_file_as_processed(
606
- metadata['audio_filename'],
607
- state,
608
- metadata['file_index'] + 1
609
- )
610
 
611
  # Step 4: Cleanup downloaded audio files
612
  for metadata in file_metadata:
613
  if metadata['audio_path'].exists():
614
  os.remove(metadata['audio_path'])
615
-
616
- # Save progress after batch
617
- progress_data['last_processed_index'] = batch_end
618
- save_progress(progress_data)
619
-
620
- # Step 5: Check if we've reached the batch threshold for zipping (100 files)
621
- if len(transcription_files) >= ZIP_UPLOAD_THRESHOLD:
622
- print(f"\n[{FLOW_ID}] 📦 Reached ZIP threshold ({ZIP_UPLOAD_THRESHOLD}). Creating and uploading batch {batch_number}...")
623
- files_to_zip = transcription_files[:ZIP_UPLOAD_THRESHOLD]
624
- await zip_and_upload_transcriptions(files_to_zip, batch_number)
625
 
626
- # Remove zipped files locally and update list
627
- for file_path in files_to_zip:
628
- if file_path.exists():
629
- os.remove(file_path)
630
 
631
- transcription_files = transcription_files[ZIP_UPLOAD_THRESHOLD:]
632
- batch_number += 1
633
-
634
- # Upload remaining transcriptions as final batch
635
- if transcription_files:
636
- print(f"\n[{FLOW_ID}] 📦 Uploading final batch {batch_number} with {len(transcription_files)} transcriptions...")
637
- await zip_and_upload_transcriptions(transcription_files, batch_number)
638
-
639
- # Cleanup
640
- for file_path in transcription_files:
641
- if file_path.exists():
642
- os.remove(file_path)
 
 
 
 
 
643
 
644
  print(f"\n[{FLOW_ID}] ✅ ALL DONE! Total transcriptions: {progress_data['transcription_count']}")
645
 
 
8
  import threading
9
  from typing import Dict, List, Set, Optional, Tuple, Any
10
  from urllib.parse import quote
11
+ from datetime import datetime, timedelta
12
  from pathlib import Path
13
  import io
14
 
 
17
  from huggingface_hub import HfApi, hf_hub_download
18
 
19
  # --- Configuration ---
20
+ AUTO_START_INDEX = 1290
21
  FLOW_ID = os.getenv("FLOW_ID", "flow_default")
22
  FLOW_PORT = int(os.getenv("FLOW_PORT", 8001))
23
  HF_TOKEN = os.getenv("HF_TOKEN", "")
24
+ HF_AUDIO_DATASET_ID = os.getenv("HF_AUDIO_DATASET_ID", "Samfredoly/BG_Vid")
25
+ HF_OUTPUT_DATASET_ID = os.getenv("HF_OUTPUT_DATASET_ID", "samfred2/AT2")
26
 
27
  # Progress and State Tracking
28
  PROGRESS_FILE = Path("processing_progress.json")
29
+ HF_STATE_FILE = "processing_state_transcriptions.json"
30
+ LOCAL_STATE_FOLDER = Path(".state")
31
  LOCAL_STATE_FOLDER.mkdir(exist_ok=True)
32
 
 
33
  AUDIO_FILE_PREFIX = "audio/"
34
 
35
  # Reference dataset for filename mapping
36
+ REFERENCE_REPO_ID = os.getenv("REFERENCE_REPO_ID", "Fred808/BG3")
37
 
 
38
  WHISPER_SERVERS = [
39
  "https://fred1012-switch3.hf.space/transcribe",
40
  "https://Eliasishere-mint-2.hf.space/transcribe",
 
82
  def fps(self):
83
  return self.total_processed / self.total_time if self.total_time > 0 else 0
84
 
85
class RateLimiter:
    """Sliding-window upload throttle for the HF dataset uploads.

    Keeps the timestamps of uploads made in the trailing hour.  Uploads are
    allowed freely up to ``max_per_hour`` (soft cap), allowed with a warning
    between the soft cap and ``stop_at`` (hard cap), and refused once the
    hard cap is reached.

    NOTE(review): despite its name, ``wait_if_needed`` never sleeps — when
    the hard cap is hit it returns False immediately and the caller decides
    what to do.
    """

    def __init__(self, max_per_hour: int = 120, stop_at: int = 128):
        self.max_per_hour = max_per_hour          # soft limit: warn above this
        self.stop_at = stop_at                    # hard limit: refuse at/above this
        self.uploads: List[datetime] = []         # timestamps within the 1-hour window
        self.lock = asyncio.Lock()                # serializes pruning + appends

    def _prune(self, now: datetime) -> None:
        """Drop timestamps older than one hour. Caller must hold the lock."""
        one_hour_ago = now - timedelta(hours=1)
        self.uploads = [ts for ts in self.uploads if ts > one_hour_ago]

    async def wait_if_needed(self) -> bool:
        """Record an upload attempt and return True if it may proceed.

        Returns False (without recording a timestamp) once ``stop_at``
        uploads have already happened in the past hour.  Does NOT sleep.
        """
        async with self.lock:
            now = datetime.now()
            self._prune(now)

            # Hard stop reached: refuse outright.
            if len(self.uploads) >= self.stop_at:
                print(f"[{FLOW_ID}] ⏸️ Upload limit ({self.stop_at}) reached. Waiting for next hour...")
                return False

            self.uploads.append(now)
            count = len(self.uploads)
            if count <= self.max_per_hour:
                # Within the soft cap: report remaining budget.
                remaining = self.max_per_hour - count
                print(f"[{FLOW_ID}] 📤 Upload #{count}/{self.max_per_hour} this hour ({remaining} remaining)")
            else:
                # Between soft cap and hard stop: allow, but warn.
                print(f"[{FLOW_ID}] ⚠️ Upload #{count}/{self.max_per_hour} this hour (approaching limit)")
            return True

    async def can_upload(self) -> bool:
        """Check (without recording an attempt) whether an upload is allowed."""
        async with self.lock:
            self._prune(datetime.now())
            return len(self.uploads) < self.stop_at

# Global rate limiter
rate_limiter = RateLimiter(max_per_hour=120, stop_at=128)
132
+
133
  # Global state for whisper servers
134
  servers = [WhisperServer(url) for url in WHISPER_SERVERS]
135
  server_index = 0
136
 
 
137
 
138
  def load_progress() -> Dict:
 
139
  if PROGRESS_FILE.exists():
140
  try:
141
  with PROGRESS_FILE.open('r') as f:
 
143
  except json.JSONDecodeError:
144
  print(f"[{FLOW_ID}] WARNING: Progress file is corrupted. Starting fresh.")
145
 
146
+
147
  return {
148
  "last_processed_index": 0,
149
+ "processed_files": {},
150
+ "file_list": [],
151
+ "transcription_count": 0,
152
+ "reference_map": {},
153
  }
154
 
155
  def save_progress(progress_data: Dict):
 
181
  return data
182
  except json.JSONDecodeError:
183
  print(f"[{FLOW_ID}] WARNING: Corrupted state file: {file_path}")
184
+ return default_value
185
 
186
  def save_json_state(file_path: str, data: Dict[str, Any]):
187
  """Save state to JSON file"""
 
378
  print(f"[{FLOW_ID}] Error downloading audio file {repo_file_full_path}: {e}")
379
  return None
380
 
381
async def upload_json_to_dataset(json_file_path: Path, json_filename: str) -> bool:
    """Upload a single transcription JSON file directly to the HF output dataset.

    Args:
        json_file_path: Local path of the JSON file to upload.
        json_filename: File name to use under ``transcriptions/`` in the repo.

    Returns:
        True on success; False when the hourly rate limit blocks the upload
        or the upload itself fails (failure is logged, never raised, so the
        caller can keep the local file and retry later).
    """
    try:
        # Respect the hourly rate limit before touching the network.
        if not await rate_limiter.wait_if_needed():
            print(f"[{FLOW_ID}] ⏸️ Upload rate limit reached for {json_filename}. Waiting...")
            return False

        print(f"[{FLOW_ID}] 📤 Uploading JSON file: {json_filename}...")

        api = HfApi(token=HF_TOKEN)
        # upload_file is a blocking HTTP call; run it in a worker thread so
        # the event loop (and the concurrent transcription tasks) keep going.
        await asyncio.to_thread(
            api.upload_file,
            path_or_fileobj=str(json_file_path),
            path_in_repo=f"transcriptions/{json_filename}",
            repo_id=HF_OUTPUT_DATASET_ID,
            repo_type="dataset",
            commit_message=f"[{FLOW_ID}] Transcription: {json_filename}",
        )

        print(f"[{FLOW_ID}] ✅ Successfully uploaded: {json_filename}")
        return True

    except Exception as e:
        # Best-effort: log and report failure instead of crashing the batch.
        print(f"[{FLOW_ID}] ❌ Error uploading {json_filename}: {e}")
        return False
406
+
407
  async def zip_and_upload_transcriptions(transcription_files: List[Path], batch_number: int) -> bool:
408
  """Zips transcription JSON files and uploads to dataset with batch numbering."""
409
  if not transcription_files:
 
574
  }
575
 
576
  async def process_audio_files_background():
577
+ """
578
+ Background task that processes audio files with reference mapping.
579
+ - Downloads batch of files (1 per server)
580
+ - Distributes to Whisper servers in parallel
581
+ - Uploads JSON results directly to HF dataset
582
+ - Updates processing state after each batch round (dynamically based on actual processed count)
583
+ - Respects rate limit: max 120 uploads/hour, stops at 128
584
+ """
585
  progress_data = load_progress()
586
  reference_map = progress_data.get('reference_map', {})
587
 
 
602
  print(f"[{FLOW_ID}] 📊 Configuration: {len(servers)} Whisper server(s) → Batch size: {BATCH_SIZE} (1 file per server)")
603
 
604
  start_index = progress_data['last_processed_index']
 
 
605
 
606
  print(f"[{FLOW_ID}] Starting batch processing from file #{start_index} (out of {len(audio_files)})...")
607
 
 
610
  batch_end = min(batch_start + BATCH_SIZE, len(audio_files))
611
  batch_files = audio_files[batch_start:batch_end]
612
 
613
+ print(f"\n[{FLOW_ID}] 📦 BATCH ROUND: Processing files #{batch_start}-#{batch_end-1} ({len(batch_files)} files)")
614
 
615
  # Step 1: Download all files in batch in parallel
616
  print(f"[{FLOW_ID}] ⬇️ Downloading batch ({len(batch_files)} files)...")
 
654
  print(f"[{FLOW_ID}] ⏳ Waiting for {len(transcription_tasks)} transcriptions (parallel)...")
655
  transcription_results = await asyncio.gather(*transcription_tasks, return_exceptions=True)
656
 
657
+ # Step 3: Upload transcriptions directly to HF dataset
658
+ successful_uploads = 0
659
+ uploaded_files = []
660
+ state = await download_hf_state()
661
+
662
+ print(f"[{FLOW_ID}] 📤 Uploading {len([r for r in transcription_results if r and not isinstance(r, Exception)])}/{len(transcription_results)} transcriptions directly to dataset...")
663
 
664
  for metadata, result in zip(file_metadata, transcription_results):
665
  if isinstance(result, Exception):
 
667
  continue
668
 
669
  if result:
670
+ # Save JSON locally first
671
  json_filename = Path(metadata['reference_filename']).stem if metadata['reference_filename'] else Path(metadata['audio_filename']).stem
672
  json_file_path = Path(RESULTS_DIR) / f"{json_filename}.json"
673
 
 
675
  with open(json_file_path, 'w', encoding='utf-8') as f:
676
  json.dump(result, f, indent=2, ensure_ascii=False)
677
 
678
+ # Upload directly to HF dataset
679
+ if await upload_json_to_dataset(json_file_path, f"{json_filename}.json"):
680
+ successful_uploads += 1
681
+ uploaded_files.append(json_file_path)
682
+ progress_data['transcription_count'] += 1
683
 
684
+ # Cleanup local JSON file after upload
685
+ if json_file_path.exists():
686
+ os.remove(json_file_path)
 
 
 
 
687
 
688
  # Step 4: Cleanup downloaded audio files
689
  for metadata in file_metadata:
690
  if metadata['audio_path'].exists():
691
  os.remove(metadata['audio_path'])
 
 
 
 
 
 
 
 
 
 
692
 
693
+ # Step 5: Update processing state after this batch round
694
+ # Update next_download_index based on actual files processed this round
695
+ files_processed_this_round = len([m for m in file_metadata if m]) # Count of files actually processed
696
+ new_download_index = batch_start + files_processed_this_round
697
 
698
+ print(f"[{FLOW_ID}] 🔄 Batch round complete: {files_processed_this_round} files distributed and processed")
699
+ print(f"[{FLOW_ID}] 📊 Updating state: next_download_index {state['next_download_index']} → {new_download_index}")
700
+
701
+ state['next_download_index'] = new_download_index
702
+
703
+ # Mark all files in this round as processed in the state
704
+ for metadata in file_metadata:
705
+ state['file_states'][metadata['audio_filename']] = "processed"
706
+
707
+ # Upload updated state
708
+ await upload_hf_state(state)
709
+
710
+ # Save local progress
711
+ progress_data['last_processed_index'] = batch_end
712
+ save_progress(progress_data)
713
+
714
+ print(f"[{FLOW_ID}] ✅ State updated. Successful uploads this round: {successful_uploads}/{len(file_metadata)}")
715
 
716
  print(f"\n[{FLOW_ID}] ✅ ALL DONE! Total transcriptions: {progress_data['transcription_count']}")
717