Spaces:

cigol123
/

Macedonian-ASR

Sleeping

App Files Community

cigol123 committed on Feb 21, 2025

Commit

9b83916

verified ·

1 Parent(s): 9f6ab6b

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -26

app.py CHANGED Viewed

@@ -5,34 +5,30 @@ import soundfile as sf
 import numpy as np
 from scipy import signal
-# Ensure the model runs on GPU if available
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Running on device: {device}")
-# Load the model and processor
-print("Loading Whisper model for Macedonian transcription...")
-processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
-model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3").to(device)
-print("✓ Model loaded successfully!")
 def process_audio(audio_path):
-    try:
-        # Load and resample to 16kHz using scipy
-        waveform, sr = sf.read(audio_path)
-        if len(waveform.shape) > 1:  # Convert stereo to mono
-            waveform = waveform.mean(axis=1)
-        if sr != 16000:  # Resample if necessary
-            num_samples = int(len(waveform) * 16000 / sr)
-            waveform = signal.resample(waveform, num_samples)
-        # Process the audio
-        inputs = processor(waveform, sampling_rate=16000, return_tensors="pt").to(device)
-        print("Transcribing...")
-        predicted_ids = model.generate(**inputs, language="mk")
-        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-        return transcription
-    except Exception as e:
-        return f"Error during transcription: {str(e)}"
 # Gradio interface
 demo = gr.Interface(
@@ -44,4 +40,4 @@ demo = gr.Interface(
 )
 if __name__ == "__main__":
-    demo.launch(share=True)

 import numpy as np
 from scipy import signal
+# Load the Whisper model and processor directly from Hugging Face
+def load_model():
+    print("Loading Whisper model and processor...")
+    processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
+    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
+    print("✓ Model and processor loaded successfully!")
+    return processor, model
+processor, model = load_model()
 def process_audio(audio_path):
+    # Load and resample to 16kHz using scipy
+    waveform, sr = sf.read(audio_path)
+    if len(waveform.shape) > 1:  # Convert stereo to mono
+        waveform = waveform.mean(axis=1)
+    if sr != 16000:  # Resample if necessary
+        num_samples = int(len(waveform) * 16000 / sr)
+        waveform = signal.resample(waveform, num_samples)
+    # Process the audio
+    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
+    predicted_ids = model.generate(**inputs, language="mk")
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+    return transcription
 # Gradio interface
 demo = gr.Interface(
 )
 if __name__ == "__main__":
+    demo.launch()