Samfredoly committed on
Commit
8f8df59
·
verified ·
1 Parent(s): 2e03387

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -47
app.py CHANGED
@@ -10,6 +10,9 @@ from typing import Dict, List, Optional, Any
10
  from fastapi import FastAPI, HTTPException
11
  from fastapi.responses import JSONResponse
12
  import uvicorn
 
 
 
13
 
14
  # Fix Unicode encoding for Windows
15
  if sys.platform == 'win32':
@@ -38,6 +41,52 @@ os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
38
  os.makedirs(TRANSCRIPTIONS_FOLDER, exist_ok=True)
39
  os.makedirs(LOCAL_STATE_FOLDER, exist_ok=True)
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  # State Files
42
  FAILED_FILES_LOG = "failed_audio_files.log"
43
  HF_STATE_FILE = "processing_audio_state.json" # This is the filename the backend uses
@@ -375,65 +424,56 @@ def get_next_file_to_process(source_repo_id: str, state: Dict[str, Any]) -> Opti
375
 
376
  def run_whisper_transcription(audio_path: str, output_dir: str, model: str) -> Optional[str]:
377
  """
378
- Runs the whisper command-line tool to transcribe the audio file.
379
  Returns the path to the generated JSON file on success.
 
380
  """
381
  log_message(f"🎙️ Starting transcription for {os.path.basename(audio_path)} with model {model}...", "INFO")
382
 
383
- # The whisper command-line tool saves output files in the current directory
384
- # We need to run the command from the desired output directory
385
-
386
  try:
387
- # The command is 'whisper <audio_path> --model <model> --output_dir <output_dir> --output_format json'
388
- # Since we want to run it from the output_dir, we need to adjust the audio_path
389
-
390
- # Move the audio file to the output directory temporarily
391
- temp_audio_path = os.path.join(output_dir, os.path.basename(audio_path))
392
- shutil.move(audio_path, temp_audio_path)
393
-
394
- # The whisper command will be executed in the output_dir
395
- command = [
396
- "whisper",
397
- os.path.basename(temp_audio_path), # Use the relative path in the output_dir
398
- "--model", model,
399
- "--output_dir", ".", # Output to the current directory (which is output_dir)
400
- "--output_format", "json"
401
- ]
402
-
403
- # Run the command
404
- result = subprocess.run(
405
- command,
406
- cwd=output_dir, # Change current working directory for the subprocess
407
- capture_output=True,
408
- text=True,
409
- check=True,
410
- timeout=3600 # 1 hour timeout for transcription
411
  )
412
 
413
- log_message(f"✅ Transcription successful. Output: {result.stdout.strip()}", "INFO")
 
 
 
 
414
 
415
- # The output filename is the base name of the audio file with a .json extension
416
- base_name, _ = os.path.splitext(os.path.basename(temp_audio_path))
 
 
 
 
 
 
 
417
  json_output_path = os.path.join(output_dir, f"{base_name}.json")
418
 
419
- # Move the audio file back (or just delete it, as it will be deleted later)
420
- os.remove(temp_audio_path)
 
 
 
421
 
422
- if os.path.exists(json_output_path):
423
- return json_output_path
424
- else:
425
- log_message(f"❌ Whisper ran successfully but did not produce the expected JSON file: {json_output_path}", "ERROR")
426
- return None
427
-
428
- except subprocess.CalledProcessError as e:
429
- log_message(f"❌ Whisper command failed. Stderr: {e.stderr.strip()}", "ERROR")
430
- log_message(f"❌ Command: {' '.join(command)}", "ERROR")
431
- return None
432
- except subprocess.TimeoutExpired:
433
- log_message("❌ Whisper command timed out.", "ERROR")
434
- return None
435
  except Exception as e:
436
- log_message(f"❌ An unexpected error occurred during transcription: {str(e)}", "ERROR")
 
 
437
  return None
438
 
439
  def process_audio_file(audio_path: str, reference_map: Dict[str, str], output_filename: str) -> bool:
 
10
  from fastapi import FastAPI, HTTPException
11
  from fastapi.responses import JSONResponse
12
  import uvicorn
13
+ import torch
14
+ import librosa
15
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
16
 
17
  # Fix Unicode encoding for Windows
18
  if sys.platform == 'win32':
 
41
  os.makedirs(TRANSCRIPTIONS_FOLDER, exist_ok=True)
42
  os.makedirs(LOCAL_STATE_FOLDER, exist_ok=True)
43
 
44
+ # Whisper Model Setup (using transformers)
45
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
46
+ TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
47
+ WHISPER_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
48
+
49
+ # Global model cache
50
+ _whisper_model = None
51
+ _whisper_processor = None
52
+ _whisper_pipeline = None
53
+
54
+ def get_whisper_pipeline():
55
+ """Get or initialize the Whisper pipeline."""
56
+ global _whisper_model, _whisper_processor, _whisper_pipeline
57
+
58
+ if _whisper_pipeline is not None:
59
+ return _whisper_pipeline
60
+
61
+ try:
62
+ log_message(f"Loading Whisper model {WHISPER_MODEL_ID}...", "INFO")
63
+
64
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
65
+ WHISPER_MODEL_ID,
66
+ torch_dtype=TORCH_DTYPE,
67
+ low_cpu_mem_usage=True,
68
+ use_safetensors=True
69
+ )
70
+ model = model.to(DEVICE)
71
+
72
+ processor = AutoProcessor.from_pretrained(WHISPER_MODEL_ID)
73
+
74
+ _whisper_pipeline = pipeline(
75
+ "automatic-speech-recognition",
76
+ model=model,
77
+ tokenizer=processor.tokenizer,
78
+ feature_extractor=processor.feature_extractor,
79
+ torch_dtype=TORCH_DTYPE,
80
+ device=DEVICE
81
+ )
82
+
83
+ log_message(f"✅ Whisper model loaded successfully on {DEVICE.upper()}", "INFO")
84
+ return _whisper_pipeline
85
+
86
+ except Exception as e:
87
+ log_message(f"❌ Failed to load Whisper model: {str(e)}", "ERROR")
88
+ raise
89
+
90
  # State Files
91
  FAILED_FILES_LOG = "failed_audio_files.log"
92
  HF_STATE_FILE = "processing_audio_state.json" # This is the filename the backend uses
 
424
 
425
  def run_whisper_transcription(audio_path: str, output_dir: str, model: str) -> Optional[str]:
426
  """
427
+ Runs Whisper transcription using the transformers library.
428
  Returns the path to the generated JSON file on success.
429
+ No ffmpeg dependency required.
430
  """
431
  log_message(f"🎙️ Starting transcription for {os.path.basename(audio_path)} with model {model}...", "INFO")
432
 
 
 
 
433
  try:
434
+ # Get the Whisper pipeline
435
+ pipe = get_whisper_pipeline()
436
+
437
+ # Load audio using librosa
438
+ log_message(f"Loading audio file: {audio_path}", "INFO")
439
+ audio_data, sample_rate = librosa.load(audio_path, sr=16000)
440
+
441
+ # Run transcription
442
+ log_message(f"Running transcription...", "INFO")
443
+ result = pipe(
444
+ audio_data,
445
+ chunk_length_s=30,
446
+ batch_size=8,
447
+ return_timestamps=True
 
 
 
 
 
 
 
 
 
 
448
  )
449
 
450
+ # Extract text and chunks
451
+ transcription_text = result.get("text", "")
452
+ chunks = result.get("chunks", [])
453
+
454
+ log_message(f"✅ Transcription successful: {len(transcription_text)} characters", "INFO")
455
 
456
+ # Prepare output JSON structure
457
+ output_json = {
458
+ "text": transcription_text,
459
+ "chunks": chunks,
460
+ "language": result.get("language", "en")
461
+ }
462
+
463
+ # Save to JSON file
464
+ base_name, _ = os.path.splitext(os.path.basename(audio_path))
465
  json_output_path = os.path.join(output_dir, f"{base_name}.json")
466
 
467
+ with open(json_output_path, "w", encoding="utf-8") as f:
468
+ json.dump(output_json, f, indent=2, ensure_ascii=False)
469
+
470
+ log_message(f"✅ Saved transcription to: {json_output_path}", "INFO")
471
+ return json_output_path
472
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
  except Exception as e:
474
+ log_message(f"❌ An error occurred during transcription: {str(e)}", "ERROR")
475
+ import traceback
476
+ log_message(f"Traceback: {traceback.format_exc()}", "ERROR")
477
  return None
478
 
479
  def process_audio_file(audio_path: str, reference_map: Dict[str, str], output_filename: str) -> bool: