Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,7 +7,7 @@ import subprocess
|
|
| 7 |
import soundfile as sf
|
| 8 |
import logging
|
| 9 |
|
| 10 |
-
# Configure logging
|
| 11 |
logging.basicConfig(level=logging.INFO)
|
| 12 |
logger = logging.getLogger(__name__)
|
| 13 |
|
|
@@ -15,10 +15,15 @@ app = FastAPI()
|
|
| 15 |
|
| 16 |
# Load the ASR model on startup
|
| 17 |
try:
|
|
|
|
|
|
|
|
|
|
| 18 |
asr_model = WhisperModel(
|
| 19 |
"distil-whisper/distil-large-v3",
|
| 20 |
device="cpu",
|
| 21 |
-
compute_type="int8" # Quantization for speed
|
|
|
|
|
|
|
| 22 |
)
|
| 23 |
logger.info("✅ ASR model loaded successfully with faster-whisper")
|
| 24 |
except Exception as e:
|
|
@@ -37,13 +42,13 @@ async def transcribe_audio(audio_file: UploadFile = File(...)):
|
|
| 37 |
try:
|
| 38 |
# Save uploaded bytes to a temporary file
|
| 39 |
suffix = os.path.splitext(audio_file.filename)[1] or ""
|
| 40 |
-
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tf:
|
| 41 |
tf.write(audio_bytes)
|
| 42 |
tf.flush()
|
| 43 |
tmp_in = tf.name
|
| 44 |
|
| 45 |
# Convert to 16kHz mono WAV PCM using ffmpeg
|
| 46 |
-
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tfwav:
|
| 47 |
tmp_wav = tfwav.name
|
| 48 |
|
| 49 |
ffmpeg_cmd = [
|
|
@@ -71,9 +76,14 @@ async def transcribe_audio(audio_file: UploadFile = File(...)):
|
|
| 71 |
if speech.ndim > 1:
|
| 72 |
speech = np.mean(speech, axis=1)
|
| 73 |
|
| 74 |
-
# Transcribe using faster-whisper
|
| 75 |
logger.info("Starting transcription")
|
| 76 |
-
segments, _ = asr_model.transcribe(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
text = " ".join(segment.text.strip() for segment in segments)
|
| 78 |
logger.info("Transcription completed")
|
| 79 |
|
|
|
|
| 7 |
import soundfile as sf
|
| 8 |
import logging
|
| 9 |
|
| 10 |
+
# Configure logging for debugging
|
| 11 |
logging.basicConfig(level=logging.INFO)
|
| 12 |
logger = logging.getLogger(__name__)
|
| 13 |
|
|
|
|
| 15 |
|
| 16 |
# Load the ASR model on startup
|
| 17 |
try:
|
| 18 |
+
# Use HF Spaces-compatible cache directory
|
| 19 |
+
cache_dir = os.getenv("HF_HOME", "/data/hf_cache")
|
| 20 |
+
os.makedirs(cache_dir, exist_ok=True)
|
| 21 |
asr_model = WhisperModel(
|
| 22 |
"distil-whisper/distil-large-v3",
|
| 23 |
device="cpu",
|
| 24 |
+
compute_type="int8", # Quantization for speed/memory
|
| 25 |
+
local_files_only=True, # Use pre-downloaded model
|
| 26 |
+
model_dir=cache_dir
|
| 27 |
)
|
| 28 |
logger.info("✅ ASR model loaded successfully with faster-whisper")
|
| 29 |
except Exception as e:
|
|
|
|
| 42 |
try:
|
| 43 |
# Save uploaded bytes to a temporary file
|
| 44 |
suffix = os.path.splitext(audio_file.filename)[1] or ""
|
| 45 |
+
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False, dir="/tmp") as tf:
|
| 46 |
tf.write(audio_bytes)
|
| 47 |
tf.flush()
|
| 48 |
tmp_in = tf.name
|
| 49 |
|
| 50 |
# Convert to 16kHz mono WAV PCM using ffmpeg
|
| 51 |
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/tmp") as tfwav:
|
| 52 |
tmp_wav = tfwav.name
|
| 53 |
|
| 54 |
ffmpeg_cmd = [
|
|
|
|
| 76 |
if speech.ndim > 1:
|
| 77 |
speech = np.mean(speech, axis=1)
|
| 78 |
|
| 79 |
+
# Transcribe using faster-whisper with optimized settings
|
| 80 |
logger.info("Starting transcription")
|
| 81 |
+
segments, _ = asr_model.transcribe(
|
| 82 |
+
speech,
|
| 83 |
+
beam_size=5,
|
| 84 |
+
vad_filter=True, # Voice activity detection to skip silence
|
| 85 |
+
vad_parameters=dict(min_silence_duration_ms=500)
|
| 86 |
+
)
|
| 87 |
text = " ".join(segment.text.strip() for segment in segments)
|
| 88 |
logger.info("Transcription completed")
|
| 89 |
|