Update app.py
app.py
CHANGED
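This commit replaces the external whisper CLI invocation in run_whisper_transcription with an in-process transformers pipeline: it adds torch, librosa, and transformers imports, introduces a lazily initialized module-level pipeline cache (get_whisper_pipeline), loads audio with librosa instead of shelling out, and writes the transcription JSON itself, dropping the subprocess and ffmpeg dependency for this path.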
@@ -10,6 +10,9 @@ from typing import Dict, List, Optional, Any
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import JSONResponse
 import uvicorn
+import torch
+import librosa
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 
 # Fix Unicode encoding for Windows
 if sys.platform == 'win32':
@@ -38,6 +41,52 @@ os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
 os.makedirs(TRANSCRIPTIONS_FOLDER, exist_ok=True)
 os.makedirs(LOCAL_STATE_FOLDER, exist_ok=True)
 
+# Whisper Model Setup (using transformers)
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
+WHISPER_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
+
+# Global model cache
+_whisper_model = None
+_whisper_processor = None
+_whisper_pipeline = None
+
+def get_whisper_pipeline():
+    """Get or initialize the Whisper pipeline."""
+    global _whisper_model, _whisper_processor, _whisper_pipeline
+
+    if _whisper_pipeline is not None:
+        return _whisper_pipeline
+
+    try:
+        log_message(f"Loading Whisper model {WHISPER_MODEL_ID}...", "INFO")
+
+        model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            WHISPER_MODEL_ID,
+            torch_dtype=TORCH_DTYPE,
+            low_cpu_mem_usage=True,
+            use_safetensors=True
+        )
+        model = model.to(DEVICE)
+
+        processor = AutoProcessor.from_pretrained(WHISPER_MODEL_ID)
+
+        _whisper_pipeline = pipeline(
+            "automatic-speech-recognition",
+            model=model,
+            tokenizer=processor.tokenizer,
+            feature_extractor=processor.feature_extractor,
+            torch_dtype=TORCH_DTYPE,
+            device=DEVICE
+        )
+
+        log_message(f"✅ Whisper model loaded successfully on {DEVICE.upper()}", "INFO")
+        return _whisper_pipeline
+
+    except Exception as e:
+        log_message(f"❌ Failed to load Whisper model: {str(e)}", "ERROR")
+        raise
+
 # State Files
 FAILED_FILES_LOG = "failed_audio_files.log"
 HF_STATE_FILE = "processing_audio_state.json"  # This is the filename the backend uses
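As a reading aid, here is a minimal sketch of how the cached pipeline above might be exercised on its own. It assumes app.py can be imported without starting the server and that a local sample.wav exists; both are assumptions, not part of the commit:

# Hypothetical smoke test for the cached-pipeline pattern above.
# Assumes app.py imports cleanly (no server start on import) and that
# sample.wav is a real local file; adjust both to your setup.
import librosa

from app import get_whisper_pipeline

pipe = get_whisper_pipeline()   # first call loads the model; later calls reuse it
audio, _ = librosa.load("sample.wav", sr=16000)  # Whisper models expect 16 kHz mono
out = pipe(audio, chunk_length_s=30, return_timestamps=True)
print(out["text"])       # full transcript
print(out["chunks"][0])  # first timestamped segment

The module-level cache matters on a Space: the model weights load once per process rather than once per request, and float16 on CUDA roughly halves the memory footprint relative to float32.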
@@ -375,65 +424,56 @@ def get_next_file_to_process(source_repo_id: str, state: Dict[str, Any]) -> Opti
 
 def run_whisper_transcription(audio_path: str, output_dir: str, model: str) -> Optional[str]:
     """
-    Runs ...
+    Runs Whisper transcription using the transformers library.
     Returns the path to the generated JSON file on success.
+    No ffmpeg dependency required.
     """
     log_message(f"🎙️ Starting transcription for {os.path.basename(audio_path)} with model {model}...", "INFO")
 
-    # The whisper command-line tool saves output files in the current directory
-    # We need to run the command from the desired output directory
-
     try:
-        command = [
-            ...
-        ]
-
-        # Run the command
-        result = subprocess.run(
-            command,
-            cwd=output_dir,  # Change current working directory for the subprocess
-            capture_output=True,
-            text=True,
-            check=True,
-            timeout=3600  # 1 hour timeout for transcription
+        # Get the Whisper pipeline
+        pipe = get_whisper_pipeline()
+
+        # Load audio using librosa
+        log_message(f"Loading audio file: {audio_path}", "INFO")
+        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
+
+        # Run transcription
+        log_message(f"Running transcription...", "INFO")
+        result = pipe(
+            audio_data,
+            chunk_length_s=30,
+            batch_size=8,
+            return_timestamps=True
         )
 
-        ...
+        # Extract text and chunks
+        transcription_text = result.get("text", "")
+        chunks = result.get("chunks", [])
+
+        log_message(f"✅ Transcription successful: {len(transcription_text)} characters", "INFO")
 
-        ...
+        # Prepare output JSON structure
+        output_json = {
+            "text": transcription_text,
+            "chunks": chunks,
+            "language": result.get("language", "en")
+        }
+
+        # Save to JSON file
+        base_name, _ = os.path.splitext(os.path.basename(audio_path))
         json_output_path = os.path.join(output_dir, f"{base_name}.json")
 
-        ...
+        with open(json_output_path, "w", encoding="utf-8") as f:
+            json.dump(output_json, f, indent=2, ensure_ascii=False)
+
+        log_message(f"✅ Saved transcription to: {json_output_path}", "INFO")
+        return json_output_path
 
-        if os.path.exists(json_output_path):
-            return json_output_path
-        else:
-            log_message(f"❌ Whisper ran successfully but did not produce the expected JSON file: {json_output_path}", "ERROR")
-            return None
-
-    except subprocess.CalledProcessError as e:
-        log_message(f"❌ Whisper command failed. Stderr: {e.stderr.strip()}", "ERROR")
-        log_message(f"❌ Command: {' '.join(command)}", "ERROR")
-        return None
-    except subprocess.TimeoutExpired:
-        log_message("❌ Whisper command timed out.", "ERROR")
-        return None
     except Exception as e:
-        log_message(f"❌ An ...", "ERROR")
+        log_message(f"❌ An error occurred during transcription: {str(e)}", "ERROR")
+        import traceback
+        log_message(f"Traceback: {traceback.format_exc()}", "ERROR")
         return None
 
 def process_audio_file(audio_path: str, reference_map: Dict[str, str], output_filename: str) -> bool:
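For downstream consumers, a minimal sketch of reading the JSON this function now writes ("example.json" is a hypothetical path; the chunk layout follows the transformers ASR pipeline's return_timestamps=True output, a (start, end) tuple in seconds plus text):

# Hypothetical reader for a transcription JSON written by run_whisper_transcription.
# Keys mirror output_json above: "text", "chunks", "language".
import json

with open("example.json", encoding="utf-8") as f:
    data = json.load(f)

print(data["text"])                # full transcript
print(data.get("language", "en"))  # defaults like the writer does
for chunk in data.get("chunks", []):
    start, end = chunk["timestamp"]  # seconds; end can be None on the final chunk
    print(f"[{start}-{end}] {chunk['text']}")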
|