import os
import warnings

# ── CPU performance tweaks ──
# OMP/MKL thread limits must be exported before torch first initializes its
# threading runtime, so they are set here, ahead of the torch import.
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"

import gradio as gr
import numpy as np
import torch
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    logging,
    pipeline,
)

warnings.simplefilter("ignore", FutureWarning)

torch.set_num_threads(4)
logging.set_verbosity_error()
# ── Model setup ──
model_id = "kingabzpro/whisper-base-urdu-full"

# Load and quantize to int8
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    use_safetensors=True,
)
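# Dynamic int8 quantization replaces the weights of every nn.Linear layer with
# int8 tensors and quantizes activations on the fly, which typically shrinks
# the model and speeds up CPU inference at a small cost in accuracy.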
model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
processor = AutoProcessor.from_pretrained(model_id)
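# AutoProcessor bundles the Whisper tokenizer and feature extractor; the
# pipeline below consumes them as separate components.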

# Build a CPU-based pipeline with chunking
transcriber = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=-1,  # CPU
    chunk_length_s=30,
    stride_length_s=(5, 5),
)
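# chunk_length_s=30 splits long recordings into 30-second windows (Whisper's
# native input length), and stride_length_s=(5, 5) overlaps each window with
# 5 seconds of its neighbors on each side, so words that fall on a chunk
# boundary are still seen whole by at least one chunk.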


def transcribe(audio):
    if audio is None:
        return "No audio provided. Please record or upload an audio file."
    sr, y = audio
    # mono & normalize
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak
    else:
        return "Audio appears to be silent. Please try again."
    # Inference under no_grad
    with torch.no_grad():
        result = transcriber({"sampling_rate": sr, "raw": y})
    text = result.get("text", "")
    # Add Urdu full stop (۔) if not present
    if text:
        text = text.rstrip()
        if text.endswith("."):
            text = text[:-1] + "۔"
        elif not text.endswith("۔"):
            text = text + "۔"
    return text


# ── Gradio UI ──
description = """
<p style='text-align: center'>
Record or upload audio in Urdu and get the transcribed text using the Whisper Base Urdu model.
</p>
"""

examples = [
    ["samples/audio1.mp3"],
    ["samples/audio2.mp3"],
    ["samples/audio3.mp3"],
]
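# The example clips are assumed to live in a samples/ directory next to this
# script; the paths are resolved relative to the app's working directory.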

demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="numpy",
        label="Record or Upload Audio (Urdu)",
    ),
    outputs=gr.Textbox(
        label="Transcribed Text (Urdu)",
        placeholder="Transcribed Urdu text will appear here...",
    ),
    title="⚡ Fast Urdu Speech Recognition",
    description=description,
    examples=examples,
    allow_flagging="never",
    theme=gr.themes.Soft(),
)

if __name__ == "__main__":
    demo.launch()