# app.py — Hugging Face Space by GaneshSarode (commit aa9b5fb, verified)
import gradio as gr
import whisper
# =========================
# Load Whisper model ONCE
# =========================
# Module-level load so the checkpoint is read a single time at process
# startup rather than on every request. "small" trades accuracy for
# speed/memory; other Whisper sizes ("tiny", "base", "medium", "large")
# are drop-in replacements here.
model = whisper.load_model("small")
# =========================
# ASR function
# =========================
def transcribe(audio_path, src_lang, tgt_lang):
    """Transcribe recorded speech to text with Whisper.

    Parameters
    ----------
    audio_path : str | None
        Filesystem path to the recorded audio (Gradio ``type="filepath"``),
        or ``None`` when no audio was provided.
    src_lang : str
        UI language choice ("Marathi" / "Hindi" / "English"); used as a
        language hint for Whisper when recognized, otherwise auto-detect.
    tgt_lang : str
        Target language for the later translation/cloning phase; currently
        unused (kept for interface stability with the UI wiring).

    Returns
    -------
    tuple[str, None]
        The transcribed text (or a status message) and ``None`` for the
        audio output slot (voice cloning arrives in Phase 4).
    """
    if audio_path is None:
        return "No audio provided", None

    # Forward the user's language choice to Whisper so it can skip
    # auto-detection; unknown choices map to None, which preserves the
    # original auto-detect behavior.
    lang_hint = {"Marathi": "mr", "Hindi": "hi", "English": "en"}.get(src_lang)
    result = model.transcribe(audio_path, language=lang_hint)

    text = result.get("text", "").strip()
    if not text:
        text = "No speech detected"
    # Audio output is intentionally None (voice cloning later)
    return text, None
# =========================
# UI
# =========================
with gr.Blocks(title="Multilingual Voice Clone") as demo:
    gr.Markdown("# ๐ŸŽ™ Multilingual Voice Cloning Demo")
    gr.Markdown("**Phase 2: Speech-to-Text (Whisper ASR)**")

    # Row 1: microphone capture plus a placeholder upload slot that stays
    # disabled until the voice-cloning phase uses it.
    with gr.Row():
        mic_input = gr.Audio(
            label="Speak (Input Speech)",
            sources=["microphone"],
            type="filepath",
        )
        gr.Audio(
            label="Upload Voice Sample (used later)",
            sources=["upload"],
            type="filepath",
            interactive=False,
        )

    # Row 2: language selectors (consumed fully in later phases).
    with gr.Row():
        input_language = gr.Dropdown(
            choices=["Marathi", "Hindi", "English"],
            value="English",
            label="Input Language (for later)",
        )
        output_language = gr.Dropdown(
            choices=["Hindi", "English", "Tamil"],
            value="Hindi",
            label="Output Language (for later)",
        )

    generate_btn = gr.Button("Generate Voice")

    transcript_box = gr.Textbox(
        label="Transcribed Text",
        placeholder="Speech-to-text output will appear here",
    )
    cloned_audio = gr.Audio(label="Cloned Voice Output (coming in Phase 4)")

    # Wire the button to the ASR function: text lands in the textbox,
    # the audio slot stays empty until Phase 4.
    generate_btn.click(
        fn=transcribe,
        inputs=[mic_input, input_language, output_language],
        outputs=[transcript_box, cloned_audio],
    )

demo.launch()