Spaces:

Samfredoly
/

switches

Paused

App Files Files Community

Samfredoly committed on Nov 19

Commit

bcbbd0a

verified ·

1 Parent(s): 818a8c7

Update app.py

Browse files

Files changed (1) hide show

app.py +214 -193

app.py CHANGED Viewed

@@ -21,7 +21,8 @@ app = FastAPI(title="Audio Transcriber", description="Audio transcription and up
 # ==== CONFIGURATION ====
 # The new backend URL for state management and transcription upload
-BACKEND_URL = "https://samfredoly-acp.hf.space"
 # The original Hugging Face repo IDs are still needed for fetching the audio files
 # and the reference file list, as the backend only handles transcription storage.
 SOURCE_REPO_ID = "Samfredoly/BG_Vid"  # Fetch audio files from here
@@ -256,248 +257,268 @@ def download_with_retry(url: str, dest_path: str, max_retries: int = 3) -> bool:
                         if chunk:
                             f.write(chunk)
-                log_message(f"✅ Download successful: {dest_path}", "INFO")
                 return True
         except requests.exceptions.RequestException as e:
-            log_message(f"❌ Download attempt {attempt + 1} failed for {url}: {str(e)}", "WARNING")
-            time.sleep(PROCESSING_DELAY)
         except Exception as e:
             log_message(f"❌ An unexpected error occurred during download: {str(e)}", "ERROR")
             return False
-    log_message(f"❌ Failed to download {url} after {max_retries} attempts.", "ERROR")
     return False
-def fetch_reference_files(repo_id: str) -> Dict[str, str]:
-    """Fetch all files from Fred808/BG3 repo to match with audio filenames."""
-    log_message(f"📋 Fetching file list from {repo_id}...", "INFO")
     try:
-        # This still uses the Hugging Face API
-        files_list = hf_api.list_repo_files(repo_id=repo_id, repo_type="dataset")
-        # Include all file types (zip, rar, wav, mp3, etc.)
-        all_files = [f for f in files_list]
-        # Create a mapping of base filename (without extension) to full path
-        filename_map = {}
-        for file_path in all_files:
-            base_name = os.path.splitext(os.path.basename(file_path))[0]
-            filename_map[base_name] = file_path
-        log_message(f"✅ Found {len(filename_map)} files in reference repo", "INFO")
-        return filename_map
     except Exception as e:
-        log_message(f"❌ Failed to fetch reference files: {str(e)}", "ERROR")
         return {}
-def find_matching_filename(transcribed_filename: str, reference_map: Dict[str, str]) -> Optional[str]:
-    """Find matching filename in reference map from Fred808/BG3."""
-    base_name = os.path.splitext(transcribed_filename)[0]
-    # Exact match first
-    if base_name in reference_map:
-        full_path = reference_map[base_name]
-        print(f"\n✅ EXACT MATCH FOUND:")
-        print(f"   Audio: {transcribed_filename}")
-        print(f"   File:  {full_path}")
-        log_message(f"✅ Found exact match: {transcribed_filename} -> {full_path}", "INFO")
-        return full_path
-    # Partial/fuzzy match (check if reference contains transcribed as substring)
-    matches = []
-    for ref_base, ref_full_path in reference_map.items():
-        if base_name.lower() in ref_base.lower() or ref_base.lower() in base_name.lower():
-            matches.append((ref_base, ref_full_path))
-    # Return first partial match if found
-    if matches:
-        ref_base, ref_full_path = matches[0]
-        print(f"\n✅ PARTIAL MATCH FOUND:")
-        print(f"   Audio: {transcribed_filename}")
-        print(f"   File:  {ref_full_path}")
-        log_message(f"✅ Found partial match: {transcribed_filename} -> {ref_full_path}", "INFO")
-        return ref_full_path
-    print(f"\n⚠️ NO EXACT/PARTIAL MATCH FOUND (will still process):")
-    print(f"   Audio: {transcribed_filename}")
-    log_message(f"⚠️ No matching filename found for: {transcribed_filename}. Will use original filename.", "WARNING")
-    return None
-def transcribe_audio(wav_path: str) -> Optional[Dict[str, Any]]:
-    """Transcribe audio file using Whisper from Transformers."""
-    log_message(f"🎤 Transcribing audio file: {wav_path}", "INFO")
     try:
-        # Import inside function to avoid global import issues if libraries are missing
-        from transformers import pipeline
-        import librosa
-        # Load audio with librosa
-        log_message(f"Loading audio file: {wav_path}", "INFO")
-        audio, sr = librosa.load(wav_path, sr=16000)
-        # Initialize Whisper pipeline
-        log_message(f"Loading Whisper {WHISPER_MODEL} model from Transformers...", "INFO")
-        pipe = pipeline(
-            "automatic-speech-recognition",
-            model=f"openai/whisper-{WHISPER_MODEL}",
-            device=0 if __import__('torch').cuda.is_available() else -1  # GPU if available, else CPU
-        )
-        # Transcribe
-        log_message("Transcribing audio...", "INFO")
-        result = pipe(audio)
-        # Format result to match openai-whisper format
-        formatted_result = {
-            "text": result["text"],
-            "segments": [{"text": result["text"]}]
-        }
-        log_message(f"✅ Successfully transcribed: {wav_path}", "INFO")
-        return formatted_result
-    except ImportError as e:
-        missing_lib = str(e)
-        log_message(f"❌ Missing library. Install with: pip install transformers librosa torch torchaudio", "ERROR")
-        log_message(f"   Error: {missing_lib}", "ERROR")
         return None
     except Exception as e:
-        log_message(f"❌ Failed to transcribe {wav_path}: {str(e)}", "ERROR")
         return None
-def process_audio_file(wav_path: str, reference_map: Dict[str, str], matched_filename: str) -> bool:
     """
-    Main processing logic for a single audio file:
-    1. Transcribe using Whisper
-    2. Save transcription as JSON
-    3. Upload to backend API
-    4. Clean up local files
     """
-    wav_filename = os.path.basename(wav_path)
-    # 1. Transcribe audio
-    transcription = transcribe_audio(wav_path)
-    if transcription is None:
-        log_failed_file(wav_filename, "Transcription failed")
-        return False
-    # 2. Save transcription as JSON
-    # The filename must be the one the backend expects for a transcription file
-    json_filename = os.path.splitext(matched_filename)[0] + "_transcription.json"
-    json_output_path = os.path.join(TRANSCRIPTIONS_FOLDER, json_filename)
     try:
-        os.makedirs(os.path.dirname(json_output_path), exist_ok=True)
-        with open(json_output_path, "w", encoding="utf-8") as f:
-            json.dump(transcription, f, indent=2, ensure_ascii=False)
-        log_message(f"✅ Saved transcription: {json_output_path}", "INFO")
-    except Exception as e:
-        log_message(f"❌ Failed to save transcription JSON: {str(e)}", "ERROR")
-        log_failed_file(wav_filename, f"Failed to save JSON: {str(e)}")
-        return False
-    # 3. Upload to backend API
-    if upload_transcription_to_api(json_output_path, matched_filename):
-        log_message(f"✅ Successfully uploaded transcription via API: {json_filename}", "INFO")
-        processing_status["transcribed_files"] += 1
-    else:
-        log_message(f"❌ Failed to upload transcription via API.", "ERROR")
-        log_failed_file(wav_filename, f"Failed to upload via API")
-        return False
-    # 4. Clean up local files
-    try:
-        os.remove(json_output_path)
-        log_message(f"🗑️ Cleaned up local transcription file: {json_output_path}", "INFO")
-    except:
-        pass
-    return True
-def get_next_file_to_process(repo_id: str, state: Dict[str, Any]) -> Optional[Dict[str, Any]]:
-    """
-    Finds the next audio file to process from the source repo in reverse order (oldest to newest).
-    Returns: { 'filename': str, 'url': str, 'index': int } or None
-    """
-    log_message(f"🔍 Searching for next audio file to process in {repo_id}", "INFO")
-    try:
-        # This still uses the Hugging Face API
-        files_list = hf_api.list_repo_files(repo_id=repo_id, repo_type="dataset")
-        # Filter for audio files and sort in reverse order (descending)
-        audio_files = sorted([f for f in files_list if f.endswith(('.wav', '.mp3'))], reverse=True)
-        if not audio_files:
-            log_message("ℹ️ No audio files found in the source repository.", "INFO")
-            return None
-        processing_status["total_files"] = len(audio_files)
-        start_index = state.get("next_download_index", 0)
-        for index in range(start_index, len(audio_files)):
-            filename = audio_files[index]
-            file_state = state["file_states"].get(filename)
-            if file_state is None or file_state == "failed":
-                # Use hf_hub_url to get the direct download URL
-                url = hf_hub_url(repo_id=repo_id, filename=filename, repo_type="dataset", subfolder=None)
-                log_message(f"✅ Found next audio file: {filename} at index {index}", "INFO")
-                return {
-                    'filename': filename,
-                    'url': url,
-                    'index': index
-                }
-            elif file_state == "processing":
-                log_message(f"⚠️ File {filename} is currently marked as 'processing'. Skipping for now.", "WARNING")
-            elif file_state == "processed":
-                log_message(f"ℹ️ File {filename} already processed. Skipping.", "INFO")
-        log_message("ℹ️ All files up to the current index have been processed or skipped.", "INFO")
-        if start_index >= len(audio_files):
-            log_message("ℹ️ Reached end of file list. Resetting index to 0 for next loop.", "INFO")
-            # We update the state locally to reset the index, and then upload it.
-            state["next_download_index"] = 0
-            upload_state_to_api(state)
         return None
     except Exception as e:
-        log_message(f"❌ Failed to list files from Hugging Face: {str(e)}", "ERROR")
         return None
 def main_processing_loop():
-    """The main loop that orchestrates the download, transcription, and upload cycle."""
     if processing_status["is_running"]:
-        log_message("⚠️ Processing loop is already running.", "WARNING")
         return
     processing_status["is_running"] = True
-    try:
-        log_message("🚀 Starting audio transcription processing loop...", "INFO")
-        # Fetch reference files from BG_Vid repo once at the start
-        reference_map = fetch_reference_files(REFERENCE_REPO_ID)
-        if not reference_map:
-            log_message("❌ No reference files found. Cannot proceed.", "ERROR")
-            return
         while processing_status["is_running"]:
             # 1. Download state from the new API
             current_state = download_state_from_api()
@@ -615,7 +636,7 @@ async def stop_processing():
     processing_status["is_running"] = False
     return JSONResponse(status_code=200, content={"message": "Processing stop requested. Will stop after current file."})
-# --- Main Execution (Unchanged) ---
 if __name__ == "__main__":
     # This block is for local testing and won't be used in the final sandbox execution

 # ==== CONFIGURATION ====
 # The new backend URL for state management and transcription upload
+# It is now read from an environment variable, falling back to the default if not set.
+BACKEND_URL = os.environ.get("BACKEND_URL", "https://samfredoly-acp.hf.space")
 # The original Hugging Face repo IDs are still needed for fetching the audio files
 # and the reference file list, as the backend only handles transcription storage.
 SOURCE_REPO_ID = "Samfredoly/BG_Vid"  # Fetch audio files from here
                         if chunk:
                             f.write(chunk)
+                log_message(f"✅ Download successful: {os.path.basename(dest_path)}", "INFO")
                 return True
         except requests.exceptions.RequestException as e:
+            log_message(f"⚠️ Download attempt {attempt + 1}/{max_retries} failed for {url}: {str(e)}", "WARNING")
+            if attempt < max_retries - 1:
+                time.sleep(2 ** attempt) # Exponential backoff
+            else:
+                log_message(f"❌ Download failed after {max_retries} attempts for {url}", "ERROR")
+                return False
         except Exception as e:
             log_message(f"❌ An unexpected error occurred during download: {str(e)}", "ERROR")
             return False
     return False
+def get_reference_map(reference_repo_id: str) -> Dict[str, str]:
+    """
+    Build a lookup table of the reference files in a Hugging Face dataset repo.
+
+    Every repo file whose extension is .txt or .json (case-insensitive) is
+    recorded under its repo path with the extension stripped; the value is the
+    unmodified repo path. Returns an empty dict when the listing fails.
+    """
+    log_message(f"Fetching reference file list from {reference_repo_id}...", "INFO")
     try:
+        # Ask the Hub for every file path in the reference dataset repo
+        listed_paths = hf_api.list_repo_files(repo_id=reference_repo_id, repo_type="dataset")
+        # Key: repo path without its extension -> value: full repo path.
+        # A later path with the same key replaces an earlier one.
+        reference_map = {}
+        for repo_path in listed_paths:
+            stem, suffix = os.path.splitext(repo_path)
+            if suffix.lower() not in ('.txt', '.json'):
+                continue
+            reference_map[stem] = repo_path
+        log_message(f"✅ Successfully created reference map with {len(reference_map)} entries.", "INFO")
+        return reference_map
     except Exception as e:
+        log_message(f"❌ Failed to fetch reference map from Hugging Face: {str(e)}", "ERROR")
         return {}
+def find_matching_filename(audio_filename: str, reference_map: Dict[str, str]) -> Optional[str]:
+    """
+    Look up the reference file for an audio file.
+
+    The extension is dropped from audio_filename and the remainder is used as
+    the key into reference_map. Returns the mapped value, or None when the key
+    is absent.
+    """
+    stem = os.path.splitext(audio_filename)[0]
+    if stem in reference_map:
+        return reference_map[stem]
+    return None
+def get_next_file_to_process(source_repo_id: str, state: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+    """
+    Pick the next audio file to process from the source Hugging Face dataset repo.
+
+    Candidates are the repo's .wav/.mp3 files (case-insensitive) in sorted order.
+    Scanning starts at state["next_download_index"]; the first file whose entry
+    in state["file_states"] is neither "processed" nor "processing" is chosen.
+    If none is found, the files before that index are scanned and the first one
+    marked "failed" is chosen for a retry.
+
+    Side effect: processing_status["total_files"] is set to the number of audio
+    files found.
+
+    Returns:
+        {"filename": ..., "url": ..., "index": ...} for the chosen file, or None
+        when there is nothing to process or an error occurs (the error is logged).
+    """
+    log_message(f"Determining next file to process from {source_repo_id}...", "INFO")
     try:
+        # 1. Get the list of all files in the source repo
+        repo_files = hf_api.list_repo_files(repo_id=source_repo_id, repo_type="dataset")
+        # Filter for audio files (e.g., .wav, .mp3)
+        audio_files = sorted(f for f in repo_files if f.lower().endswith(('.wav', '.mp3')))
+        processing_status["total_files"] = len(audio_files)
+        if not audio_files:
+            log_message("No audio files found in the source repository.", "INFO")
+            return None
+        # 2. Get the next index from the state; a missing or null value means 0.
+        # Clamp it to [0, len(audio_files)]: a stored index past the end of the
+        # list (e.g. the repo now holds fewer audio files) would otherwise make
+        # the retry scan below index beyond audio_files and raise IndexError.
+        next_index = state.get("next_download_index") or 0
+        next_index = max(0, min(next_index, len(audio_files)))
+        file_states = state.get("file_states") or {}
+
+        def build_entry(i: int) -> Dict[str, Any]:
+            # Shared by both scans so the returned shape cannot drift apart.
+            name = audio_files[i]
+            return {
+                "filename": name,
+                "url": hf_hub_url(repo_id=source_repo_id, filename=name, repo_type="dataset"),
+                "index": i
+            }
+
+        # 3. Find the next file that hasn't been processed or is not currently being processed
+        for i in range(next_index, len(audio_files)):
+            filename = audio_files[i]
+            # Skip files that are already processed or currently being processed
+            if file_states.get(filename, "unprocessed") in ("processed", "processing"):
+                continue
+            # Found an unprocessed (or previously failed) file
+            entry = build_entry(i)
+            log_message(f"Found next file at index {i}: {filename}", "INFO")
+            return entry
+        log_message("All files up to the current index have been processed or are locked.", "INFO")
+        # Nothing left from next_index onward: retry earlier files marked "failed"
+        for i in range(next_index):
+            filename = audio_files[i]
+            if file_states.get(filename, "unprocessed") == "failed":
+                entry = build_entry(i)
+                log_message(f"Found failed file for retry at index {i}: {filename}", "INFO")
+                return entry
         return None
     except Exception as e:
+        log_message(f"❌ Failed to get next file to process: {str(e)}", "ERROR")
         return None
+def run_whisper_transcription(audio_path: str, output_dir: str, model: str) -> Optional[str]:
     """
+    Runs the whisper command-line tool to transcribe the audio file.
+
+    Args:
+        audio_path: Path of the audio file to transcribe.
+        output_dir: Directory that receives the JSON transcript (created if missing).
+        model: Whisper model name passed to `--model`.
+
+    Returns:
+        The path to the generated JSON file on success, or None when the command
+        fails, times out, or does not produce the expected file.
+
+    Side effects:
+        The audio file is deleted once the whisper command has exited
+        successfully. On any failure it is left untouched at audio_path, so the
+        caller can retry or clean it up.
     """
+    log_message(f"🎙️ Starting transcription for {os.path.basename(audio_path)} with model {model}...", "INFO")
+    # The audio file is passed in place and whisper is told where to write via
+    # --output_dir, so no temporary move is needed and a failed run cannot
+    # strand the audio file in the output directory.
+    # Built before the try block so the error handler can always log it.
+    command = [
+        "whisper",
+        audio_path,
+        "--model", model,
+        "--output_dir", output_dir,
+        "--output_format", "json"
+    ]
     try:
+        os.makedirs(output_dir, exist_ok=True)
+        # Run the command
+        result = subprocess.run(
+            command,
+            capture_output=True,
+            text=True,
+            check=True,
+            timeout=3600 # 1 hour timeout for transcription
+        )
+        log_message(f"✅ Transcription successful. Output: {result.stdout.strip()}", "INFO")
+        # The output filename is the base name of the audio file with a .json extension
+        base_name, _ = os.path.splitext(os.path.basename(audio_path))
+        json_output_path = os.path.join(output_dir, f"{base_name}.json")
+        # The audio is no longer needed after a successful run. A failed delete
+        # must not discard a transcript that was produced, so it is only logged.
+        try:
+            os.remove(audio_path)
+        except OSError as e:
+            log_message(f"⚠️ Could not delete audio file {audio_path}: {str(e)}", "WARNING")
+        if os.path.exists(json_output_path):
+            return json_output_path
+        log_message(f"❌ Whisper ran successfully but did not produce the expected JSON file: {json_output_path}", "ERROR")
+        return None
+    except subprocess.CalledProcessError as e:
+        log_message(f"❌ Whisper command failed. Stderr: {(e.stderr or '').strip()}", "ERROR")
+        log_message(f"❌ Command: {' '.join(command)}", "ERROR")
+        return None
+    except subprocess.TimeoutExpired:
+        log_message("❌ Whisper command timed out.", "ERROR")
         return None
     except Exception as e:
+        log_message(f"❌ An unexpected error occurred during transcription: {str(e)}", "ERROR")
         return None
+def process_audio_file(audio_path: str, reference_map: Dict[str, str], output_filename: str) -> bool:
+    """
+    Transcribes the audio file, stores the transcript under a name derived from
+    output_filename, and uploads the result to the API.
+
+    Args:
+        audio_path: Local path of the audio file to transcribe.
+        reference_map: Not used by this function; kept so existing callers keep working.
+        output_filename: Name the transcript is based on. Its extension is replaced
+            by .json. If empty or None, the name whisper produced is kept.
+
+    Returns:
+        True when the transcript was produced and uploaded; False when
+        transcription, renaming, or the upload fails.
+
+    Side effects:
+        Increments processing_status["transcribed_files"] and deletes the local
+        transcript after a successful upload.
+    """
+    # 1. Run transcription
+    json_output_path = run_whisper_transcription(audio_path, TRANSCRIPTIONS_FOLDER, WHISPER_MODEL)
+    if not json_output_path:
+        return False
+    # 2. Rename the whisper output (named after the audio file) to the target name:
+    # output_filename with its extension replaced by .json, inside TRANSCRIPTIONS_FOLDER.
+    if not output_filename:
+        # No target name supplied: keep the name whisper produced instead of
+        # failing on os.path.splitext(None).
+        output_filename = os.path.basename(json_output_path)
+    base_name, _ = os.path.splitext(output_filename)
+    final_json_filename = f"{base_name}.json"
+    final_json_path = os.path.join(TRANSCRIPTIONS_FOLDER, final_json_filename)
+    try:
+        if json_output_path != final_json_path:
+            # output_filename may contain sub-directories (e.g. "folder/file.txt");
+            # create them so the move does not fail on a missing parent directory.
+            os.makedirs(os.path.dirname(final_json_path) or ".", exist_ok=True)
+            shutil.move(json_output_path, final_json_path)
+            log_message(f"✅ Renamed transcription to: {final_json_filename}", "INFO")
+    except Exception as e:
+        log_message(f"❌ Failed to rename transcription file: {str(e)}", "ERROR")
+        return False
+    # 3. Upload transcription to API
+    if not upload_transcription_to_api(final_json_path, final_json_filename):
+        log_message(f"❌ Failed to upload transcription to API: {final_json_filename}", "ERROR")
+        return False
+    processing_status["transcribed_files"] += 1
+    # Clean up the local transcription file after successful upload. A cleanup
+    # failure is logged but does not turn a successful upload into a failure.
+    try:
+        os.remove(final_json_path)
+        log_message(f"🗑️ Cleaned up local transcription file: {final_json_path}", "INFO")
+    except OSError as e:
+        log_message(f"❌ Failed to clean up transcription file: {str(e)}", "ERROR")
+    return True
 def main_processing_loop():
+    """The main loop that continuously checks for and processes new audio files."""
+    global processing_status
     if processing_status["is_running"]:
+        log_message("Processing loop is already running.", "WARNING")
         return
     processing_status["is_running"] = True
+    log_message("🚀 Audio transcription processing loop started.", "INFO")
+    # 1. Get the reference map once
+    reference_map = get_reference_map(REFERENCE_REPO_ID)
+    if not reference_map:
+        log_message("❌ Could not get reference map. Stopping loop.", "CRITICAL")
+        processing_status["is_running"] = False
+        return
+    try:
         while processing_status["is_running"]:
+            time.sleep(PROCESSING_DELAY)
             # 1. Download state from the new API
             current_state = download_state_from_api()
     processing_status["is_running"] = False
     return JSONResponse(status_code=200, content={"message": "Processing stop requested. Will stop after current file."})
+# --- Main Execution ---
 if __name__ == "__main__":
     # This block is for local testing and won't be used in the final sandbox execution