Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -17,6 +17,7 @@ import time
|
|
| 17 |
import shutil
|
| 18 |
from starlette.concurrency import run_in_threadpool
|
| 19 |
import gc
|
|
|
|
| 20 |
try:
|
| 21 |
import noisereduce as nr
|
| 22 |
HAVE_NOISEREDUCE = True
|
|
@@ -50,6 +51,9 @@ model_name = "large-v2"
|
|
| 50 |
ALIGN_MODEL_MAP = {
|
| 51 |
"ur": "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"}
|
| 52 |
global_align_model_cache = {}
|
|
|
|
|
|
|
|
|
|
| 53 |
class TimelineItem(BaseModel):
|
| 54 |
start: float
|
| 55 |
end: float
|
|
@@ -217,10 +221,16 @@ def analyze_audio(audio_file: str,
|
|
| 217 |
model = whisperx.load_model(model_name, device, compute_type="float32")
|
| 218 |
audio_loaded = whisperx.load_audio(audio_for_model)
|
| 219 |
print("Detecting language...")
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
print("Transcribing audio...")
|
| 225 |
transcribed_language = "ur"
|
| 226 |
result = model.transcribe(audio_loaded, batch_size=BATCH_SIZE, language= transcribed_language
|
|
|
|
| 17 |
import shutil
|
| 18 |
from starlette.concurrency import run_in_threadpool
|
| 19 |
import gc
|
| 20 |
+
from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor
|
| 21 |
try:
|
| 22 |
import noisereduce as nr
|
| 23 |
HAVE_NOISEREDUCE = True
|
|
|
|
| 51 |
ALIGN_MODEL_MAP = {
|
| 52 |
"ur": "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"}
|
| 53 |
global_align_model_cache = {}
|
| 54 |
+
# Language-ID model: the feature extractor and the sequence-classification
# weights are both pulled from the "facebook/mms-lid-4017" checkpoint, and the
# classifier is then moved to the CPU.
# NOTE(review): the name `model` is also assigned from
# `whisperx.load_model(...)` further down in this file; a distinct name
# (e.g. `lid_model`) would keep the two from being confused — confirm which
# object each later use of `model` expects.
processor = AutoFeatureExtractor.from_pretrained("facebook/mms-lid-4017")
|
| 55 |
+
model = Wav2Vec2ForSequenceClassification.from_pretrained("facebook/mms-lid-4017")
|
| 56 |
+
model.to("cpu")
|
| 57 |
class TimelineItem(BaseModel):
|
| 58 |
start: float
|
| 59 |
end: float
|
|
|
|
| 221 |
model = whisperx.load_model(model_name, device, compute_type="float32")
|
| 222 |
audio_loaded = whisperx.load_audio(audio_for_model)
|
| 223 |
print("Detecting language...")
|
| 224 |
+
# Language identification: feed the loaded audio to `processor`, take the
# arg-max over the resulting logits (gradients disabled), and map that index
# to a label through `config.id2label`; the label is stored in `languageCode`.
# NOTE(review): just above, `model` is assigned from
# `whisperx.load_model(...)`, while the `id2label` lookup below expects the
# sequence-classification model that the file also binds to the name `model`.
# If the two assignments live where they appear to (module level vs. this
# function), the local binding wins here and these calls hit the WhisperX
# model instead — confirm, and give the language-ID model its own name.
inputs = processor(audio_loaded, sampling_rate=target_sr, return_tensors="pt")
|
| 225 |
+
with torch.no_grad():
|
| 226 |
+
outputs = model(**inputs).logits
|
| 227 |
+
lang_id = torch.argmax(outputs, dim=-1)[0].item()
|
| 228 |
+
detected_language = model.config.id2label[lang_id]
|
| 229 |
+
languageCode = detected_language
|
| 230 |
+
# lang_result = model.transcribe(audio_loaded, batch_size=4, language=None)
|
| 231 |
+
# language_code_detected = lang_result.get("language") or lang_result.get("detected_language")
|
| 232 |
+
# languageCode = language_code_detected
|
| 233 |
+
# results.languageCode = languageCode
|
| 234 |
print("Transcribing audio...")
|
| 235 |
transcribed_language = "ur"
|
| 236 |
result = model.transcribe(audio_loaded, batch_size=BATCH_SIZE, language= transcribed_language
|