Spaces:

Merlintxu
/

Wav2Txt

Build error

App Files Community

Merlintxu committed on Jul 7, 2024

Commit

df609a3

verified ·

1 Parent(s): 2b71965

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -47

app.py CHANGED Viewed

@@ -1,11 +1,17 @@
 import gradio as gr
-from transformers import pipeline, AutoModelForCTC, AutoProcessor
 import torch
 import librosa
 import subprocess
 from langdetect import detect_langs
 import os
-import numpy as np
 # Updated models by language
 MODELS = {
@@ -34,56 +40,57 @@ def convert_audio_to_wav(audio_path):
     return wav_path
 def detect_language(audio_path):
-    speech, _ = librosa.load(audio_path, sr=16000, duration=30)  # Increased duration for better detection
-    # Use multiple models for transcription to improve accuracy
-    transcriptions = []
-    models = ["facebook/wav2vec2-large-xlsr-53-spanish", "facebook/wav2vec2-large-xlsr-53-portuguese", "facebook/wav2vec2-large-960h"]
-    for model_name in models:
-        processor = AutoProcessor.from_pretrained(model_name)
-        model = AutoModelForCTC.from_pretrained(model_name)
-        inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
-        with torch.no_grad():
-            logits = model(inputs.input_values).logits
-        predicted_ids = torch.argmax(logits, dim=-1)
-        transcription = processor.batch_decode(predicted_ids)[0]
-        transcriptions.append(transcription)
-    # Combine transcriptions and detect language
-    combined_text = " ".join(transcriptions)
-    langs = detect_langs(combined_text)
-    # Check confidence levels
     es_confidence = next((lang.prob for lang in langs if lang.lang == 'es'), 0)
     pt_confidence = next((lang.prob for lang in langs if lang.lang == 'pt'), 0)
-    # If Spanish and Portuguese are close, prefer Spanish for Latin American content
     if abs(es_confidence - pt_confidence) < 0.2:
         return 'es'
     return max(langs, key=lambda x: x.prob).lang
-def transcribe_audio(audio, model_name):
     wav_audio = convert_audio_to_wav(audio)
-    transcriber = pipeline("automatic-speech-recognition", model=model_name)
-    chunk_duration = 30  # seconds
-    speech, rate = librosa.load(wav_audio, sr=16000)
-    duration = len(speech) / rate
-    transcription = ""
-    for i in range(0, int(duration), chunk_duration):
-        end = min(i + chunk_duration, duration)
-        chunk = speech[int(i * rate):int(end * rate)]
-        transcription += transcriber(chunk)["text"] + " "
-    output_file = "transcription.txt"
-    with open(output_file, "w", encoding="utf-8") as file:
-        file.write(transcription.strip())
-    return output_file
 def detect_and_select_model(audio):
     wav_audio = convert_audio_to_wav(audio)
@@ -95,18 +102,19 @@ def combined_interface(audio):
     try:
         language, model_options = detect_and_select_model(audio)
         selected_model = model_options[0]
-        transcription_file = transcribe_audio(audio, selected_model)
-        with open(transcription_file, "r", encoding="utf-8") as file:
-            transcription_text = file.read()
         # Clean up temporary files
-        os.remove(transcription_file)
         os.remove("converted_audio.wav")
-        return language, gr.Dropdown.update(choices=model_options, value=selected_model), selected_model, transcription_text
     except Exception as e:
-        return str(e), gr.Dropdown.update(choices=[]), "", "An error occurred during processing."
 iface = gr.Interface(
     fn=combined_interface,
@@ -117,9 +125,10 @@ iface = gr.Interface(
         gr.Textbox(label="Selected Model"),
         gr.Textbox(label="Transcription", lines=10)
     ],
-    title="Multilingual Audio Transcriber (Latin American Spanish Optimized)",
-    description="Upload an audio file to detect the language, select the transcription model, and get the transcription. Optimized for Latin American Spanish detection."
 )
 if __name__ == "__main__":
-    iface.launch()

 import gradio as gr
+from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
 import torch
 import librosa
 import subprocess
 from langdetect import detect_langs
 import os
+import warnings
+from transformers import logging
+# Suppress warnings
+warnings.filterwarnings("ignore")
+logging.set_verbosity_error()
 # Updated models by language
 MODELS = {
     return wav_path
 def detect_language(audio_path):
+    speech, _ = librosa.load(audio_path, sr=16000, duration=30)
+    processor = WhisperProcessor.from_pretrained("openai/whisper-base")
+    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
+    input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
+    predicted_ids = model.generate(input_features)
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+    langs = detect_langs(transcription)
     es_confidence = next((lang.prob for lang in langs if lang.lang == 'es'), 0)
     pt_confidence = next((lang.prob for lang in langs if lang.lang == 'pt'), 0)
     if abs(es_confidence - pt_confidence) < 0.2:
         return 'es'
     return max(langs, key=lambda x: x.prob).lang
+def transcribe_audio_stream(audio, model_name):
     wav_audio = convert_audio_to_wav(audio)
+    if "whisper" in model_name:
+        processor = WhisperProcessor.from_pretrained(model_name)
+        model = WhisperForConditionalGeneration.from_pretrained(model_name)
+        chunk_duration = 30  # seconds
+        speech, rate = librosa.load(wav_audio, sr=16000)
+        duration = len(speech) / rate
+        for i in range(0, int(duration), chunk_duration):
+            end = min(i + chunk_duration, duration)
+            chunk = speech[int(i * rate):int(end * rate)]
+            input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
+            predicted_ids = model.generate(input_features)
+            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+            yield transcription
+    else:
+        transcriber = pipeline("automatic-speech-recognition", model=model_name)
+        chunk_duration = 10  # seconds
+        speech, rate = librosa.load(wav_audio, sr=16000)
+        duration = len(speech) / rate
+        for i in range(0, int(duration), chunk_duration):
+            end = min(i + chunk_duration, duration)
+            chunk = speech[int(i * rate):int(end * rate)]
+            result = transcriber(chunk)
+            yield result["text"]
 def detect_and_select_model(audio):
     wav_audio = convert_audio_to_wav(audio)
     try:
         language, model_options = detect_and_select_model(audio)
         selected_model = model_options[0]
+        yield language, gr.Dropdown.update(choices=model_options, value=selected_model), selected_model, ""
+        full_transcription = ""
+        for partial_transcription in transcribe_audio_stream(audio, selected_model):
+            full_transcription += partial_transcription + " "
+            yield language, gr.Dropdown.update(choices=model_options, value=selected_model), selected_model, full_transcription.strip()
         # Clean up temporary files
         os.remove("converted_audio.wav")
     except Exception as e:
+        yield str(e), gr.Dropdown.update(choices=[]), "", "An error occurred during processing."
 iface = gr.Interface(
     fn=combined_interface,
         gr.Textbox(label="Selected Model"),
         gr.Textbox(label="Transcription", lines=10)
     ],
+    title="Multilingual Audio Transcriber with Real-time Display",
+    description="Upload an audio file to detect the language, select the transcription model, and get the transcription in real-time. Optimized for Spanish and English.",
+    live=True
 )
 if __name__ == "__main__":
+    iface.queue().launch()