import gradio as gr
import librosa
import numpy as np
import torch
from peft import PeftModel
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Load the base Whisper model once at startup, then attach the
# Caribbean-accent LoRA adapter on top of it.
print("Loading model...")
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
base_model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-large-v3",
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
)
model = PeftModel.from_pretrained(base_model, "Dougsworth/caribbean-whisper-asr")
model.eval()  # inference only: disable dropout in any adapter layers
print("Model loaded!")


def enhance_phone_audio(audio):
    """Gentle enhancement for phone audio - just normalize, no filtering.

    Args:
        audio: 1-D numpy array of audio samples (any float dtype).

    Returns:
        float32 numpy array: DC-offset removed, peak-normalized to
        [-1, 1], with mild power-law compression (exponent 0.9) to
        boost quiet parts, then peak-normalized again.
    """
    audio = np.asarray(audio)
    # Guard: np.max of an empty array raises ValueError.
    if audio.size == 0:
        return audio.astype(np.float32)

    # Remove DC offset
    audio = audio - np.mean(audio)

    # Normalize to [-1, 1]
    max_val = np.max(np.abs(audio))
    if max_val > 0:
        audio = audio / max_val

    # Gentle compression to boost quiet parts
    audio = np.sign(audio) * np.power(np.abs(audio), 0.9)

    # Normalize again (compression can shift the peak)
    max_val = np.max(np.abs(audio))
    if max_val > 0:
        audio = audio / max_val

    return audio.astype(np.float32)


def transcribe(audio_path):
    """Transcribe an uploaded audio file to English text.

    Args:
        audio_path: Filesystem path supplied by the Gradio Audio
            component, or None when nothing was uploaded.

    Returns:
        The transcription string, or a user-facing message when the
        input is missing or empty.
    """
    if audio_path is None:
        return "Please upload an audio file."

    # Whisper expects 16 kHz mono; librosa resamples on load.
    audio, sr = librosa.load(audio_path, sr=16000)
    if audio.size == 0:
        return "Please upload an audio file."

    # Apply gentle enhancement
    audio = enhance_phone_audio(audio)

    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        predicted_ids = model.generate(
            inputs.input_features, language="en", task="transcribe"
        )
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription


demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload Voice Note"),
    outputs=gr.Textbox(label="Job Listing Transcription"),
    title="Linkup - Caribbean Speech to Text",
    description="Upload a voice note describing a job and get it transcribed. Built for Caribbean accents.",
    examples=[],
)

demo.launch()