Spaces:
Sleeping
Sleeping
| # app.py | |
| import os, tempfile, subprocess, gradio as gr | |
| from dotenv import load_dotenv | |
| import whisper | |
| import pvfalcon | |
| # βββββββββββββββββββββββββββββββββββββββββββ | |
| # 1. ENVIRONMENT | |
| # βββββββββββββββββββββββββββββββββββββββββββ | |
# Pull variables from a local .env file (no-op when absent), then insist
# on the Picovoice Falcon key before any model is constructed.
load_dotenv()

FALCON_ACCESS_KEY = os.getenv("FALCON_ACCESS_KEY")
if FALCON_ACCESS_KEY is None or FALCON_ACCESS_KEY == "":
    raise RuntimeError(
        "Set FALCON_ACCESS_KEY in your environment or .env file "
        "(get one free at https://console.picovoice.ai)."
    )
| # βββββββββββββββββββββββββββββββββββββββββββ | |
| # 2. MODELS | |
| # βββββββββββββββββββββββββββββββββββββββββββ | |
# Load both models once at import time so every request reuses them.
# Whisper "base" is the small multilingual checkpoint — acceptable on CPU.
whisper_model = whisper.load_model("base")  # CPU-friendly
# Falcon speaker-diarization engine; uses the access key validated above.
falcon = pvfalcon.create(access_key=FALCON_ACCESS_KEY)
| # βββββββββββββββββββββββββββββββββββββββββββ | |
| # 3. CORE LOGIC | |
| # βββββββββββββββββββββββββββββββββββββββββββ | |
def process_video(file, language="Auto"):
    """Transcribe an uploaded media file and label each line by speaker.

    Parameters
    ----------
    file : str | file-like
        Path to the uploaded media. Gradio's ``gr.File(type="filepath")``
        passes a plain path string; older Gradio versions pass a tempfile
        wrapper exposing ``.name`` — both are accepted here.
    language : str
        "Auto" for Whisper language auto-detection, otherwise a language
        name Whisper understands (e.g. "English", "Hindi", "Urdu").

    Returns
    -------
    tuple[str, str]
        ``(speaker_transcript, paragraph_transcript)``. On audio
        extraction failure returns ``("Audio extraction failed.", "")``.
    """
    # BUGFIX: type="filepath" hands us a str, which has no .name attribute;
    # fall back to .name only for file-like objects (older Gradio).
    src_path = file if isinstance(file, str) else file.name

    # None lets Whisper auto-detect; otherwise lowercase language names
    # ("english", "hindi", ...) are accepted by transcribe().
    lang_code = None if language == "Auto" else language.lower()

    # Reserve a temp path for the mono 16-kHz 16-bit PCM WAV that both
    # Whisper and Falcon consume. delete=False so ffmpeg can reopen it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav:
        wav_path = wav.name
    try:
        proc = subprocess.run(
            ["ffmpeg", "-y", "-i", src_path,
             "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", wav_path],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
        )
        # BUGFIX: also check ffmpeg's exit status — a failed run can still
        # leave the pre-created (empty) file in place.
        if proc.returncode != 0 or not os.path.getsize(wav_path):
            return "Audio extraction failed.", ""

        # Speaker diarization: map Falcon's opaque speaker tags to stable
        # human-readable labels, numbered in order of first appearance.
        segments = falcon.process_file(wav_path)  # list[pvfalcon.Segment]
        diarized_map, label_map, counter = [], {}, 1
        for seg in segments:
            tag = seg.speaker_tag
            if tag not in label_map:
                label_map[tag] = f"Speaker {counter}"
                counter += 1
            diarized_map.append(
                dict(start=seg.start_sec, end=seg.end_sec, speaker=label_map[tag])
            )

        # Transcription (Whisper): plain paragraph plus timed segments.
        res = whisper_model.transcribe(wav_path, language=lang_code)
        paragraph_transcript = res["text"]

        # Attribute each Whisper segment to the diarized interval that
        # contains its start time; "Unknown" when no interval matches.
        speaker_lines = []
        for s in res.get("segments", []):
            speaker = next(
                (m["speaker"] for m in diarized_map
                 if m["start"] <= s["start"] <= m["end"]),
                "Unknown"
            )
            speaker_lines.append(f"{speaker}: {s['text']}")
        speaker_transcript = "\n".join(speaker_lines)

        return speaker_transcript, paragraph_transcript
    finally:
        # BUGFIX: the delete=False temp WAV was never removed, leaking one
        # file per request; best-effort cleanup on every exit path.
        try:
            os.remove(wav_path)
        except OSError:
            pass
| # βββββββββββββββββββββββββββββββββββββββββββ | |
| # 4. GRADIO UI | |
| # βββββββββββββββββββββββββββββββββββββββββββ | |
# Declarative UI: one file input + language dropdown, two copyable
# text outputs, ordered to match process_video's return tuple.
demo = gr.Interface(
    fn=process_video,
    inputs=[
        # type="filepath" hands process_video a plain path string.
        gr.File(label="Upload Video", type="filepath"),
        gr.Dropdown(["Auto", "English", "Hindi", "Urdu"], label="Language"),
    ],
    outputs=[
        gr.Textbox(label="Speaker-wise Transcript", show_copy_button=True),
        # BUGFIX: label previously had a stray leading space (" Transcription").
        gr.Textbox(label="Transcription", show_copy_button=True),
    ],
    title="Transcription + Speaker Segmentation",
    description="Whisper + Picovoice Falcon running fully on CPU.",
)

if __name__ == "__main__":
    demo.launch()