Grinding committed on
Commit
ccedece
·
verified ·
1 Parent(s): f4750da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -6
app.py CHANGED
@@ -7,7 +7,7 @@ import subprocess
7
  import soundfile as sf
8
  import logging
9
 
10
- # Configure logging
11
  logging.basicConfig(level=logging.INFO)
12
  logger = logging.getLogger(__name__)
13
 
@@ -15,10 +15,15 @@ app = FastAPI()
15
 
16
  # Load the ASR model on startup
17
  try:
 
 
 
18
  asr_model = WhisperModel(
19
  "distil-whisper/distil-large-v3",
20
  device="cpu",
21
- compute_type="int8" # Quantization for speed and memory efficiency
 
 
22
  )
23
  logger.info("✅ ASR model loaded successfully with faster-whisper")
24
  except Exception as e:
@@ -37,13 +42,13 @@ async def transcribe_audio(audio_file: UploadFile = File(...)):
37
  try:
38
  # Save uploaded bytes to a temporary file
39
  suffix = os.path.splitext(audio_file.filename)[1] or ""
40
- with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tf:
41
  tf.write(audio_bytes)
42
  tf.flush()
43
  tmp_in = tf.name
44
 
45
  # Convert to 16kHz mono WAV PCM using ffmpeg
46
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tfwav:
47
  tmp_wav = tfwav.name
48
 
49
  ffmpeg_cmd = [
@@ -71,9 +76,14 @@ async def transcribe_audio(audio_file: UploadFile = File(...)):
71
  if speech.ndim > 1:
72
  speech = np.mean(speech, axis=1)
73
 
74
- # Transcribe using faster-whisper
75
  logger.info("Starting transcription")
76
- segments, _ = asr_model.transcribe(speech, beam_size=5)
 
 
 
 
 
77
  text = " ".join(segment.text.strip() for segment in segments)
78
  logger.info("Transcription completed")
79
 
 
7
  import soundfile as sf
8
  import logging
9
 
10
+ # Configure logging for debugging
11
  logging.basicConfig(level=logging.INFO)
12
  logger = logging.getLogger(__name__)
13
 
 
15
 
16
  # Load the ASR model on startup
17
  try:
18
+ # Use HF Spaces-compatible cache directory
19
+ cache_dir = os.getenv("HF_HOME", "/data/hf_cache")
20
+ os.makedirs(cache_dir, exist_ok=True)
21
  asr_model = WhisperModel(
22
  "distil-whisper/distil-large-v3",
23
  device="cpu",
24
+ compute_type="int8", # Quantization for speed/memory
25
+ local_files_only=True, # Use pre-downloaded model
26
+ model_dir=cache_dir
27
  )
28
  logger.info("✅ ASR model loaded successfully with faster-whisper")
29
  except Exception as e:
 
42
  try:
43
  # Save uploaded bytes to a temporary file
44
  suffix = os.path.splitext(audio_file.filename)[1] or ""
45
+ with tempfile.NamedTemporaryFile(suffix=suffix, delete=False, dir="/tmp") as tf:
46
  tf.write(audio_bytes)
47
  tf.flush()
48
  tmp_in = tf.name
49
 
50
  # Convert to 16kHz mono WAV PCM using ffmpeg
51
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/tmp") as tfwav:
52
  tmp_wav = tfwav.name
53
 
54
  ffmpeg_cmd = [
 
76
  if speech.ndim > 1:
77
  speech = np.mean(speech, axis=1)
78
 
79
+ # Transcribe using faster-whisper with optimized settings
80
  logger.info("Starting transcription")
81
+ segments, _ = asr_model.transcribe(
82
+ speech,
83
+ beam_size=5,
84
+ vad_filter=True, # Voice activity detection to skip silence
85
+ vad_parameters=dict(min_silence_duration_ms=500)
86
+ )
87
  text = " ".join(segment.text.strip() for segment in segments)
88
  logger.info("Transcription completed")
89