# app.py
"""Sofia — PersonaPlex voice companion.

Mounts a Gradio chat-style UI at "/" and exposes a JSON API at
"/generate", both delegating speech-to-speech inference to the
`moshi.offline` CLI run as a subprocess.
"""
import base64
import os
import subprocess
import tempfile

import gradio as gr
from fastapi import FastAPI, File, UploadFile, Form, HTTPException

app = FastAPI()

# Voice-prompt checkpoint filenames accepted by moshi.offline
# (presumably NAT*/VAR* = natural/varied, F/M = female/male — TODO confirm).
VOICE_CHOICES = [
    "NATF0.pt", "NATF1.pt", "NATF2.pt", "NATF3.pt",
    "NATM0.pt", "NATM1.pt", "NATM2.pt", "NATM3.pt",
    "VARF0.pt", "VARF1.pt", "VARF2.pt", "VARF3.pt", "VARF4.pt",
    "VARM0.pt", "VARM1.pt", "VARM2.pt", "VARM3.pt", "VARM4.pt"
]

DEFAULT_PERSONA = """You are Sofia, a warm, helpful, witty virtual assistant from Lagos. You love tech, music, Nollywood, and great conversations. Speak naturally, be empathetic, use a touch of Nigerian flair when it fits, and keep responses concise but engaging."""

# Upper bound on a single inference run — CPU-only hosts are very slow.
INFERENCE_TIMEOUT_S = 900  # 15 min


def run_offline_inference(input_path, text_prompt, voice_prompt, seed,
                          output_wav, output_json,
                          timeout=INFERENCE_TIMEOUT_S):
    """Run one `python -m moshi.offline` inference pass.

    Args:
        input_path: Path to the user's input WAV file.
        text_prompt: Persona/system prompt text; if truthy it is written to a
            temp file and passed via --text-prompt (the CLI takes a path).
        voice_prompt: Voice checkpoint filename (see VOICE_CHOICES).
        seed: Integer random seed forwarded to the CLI.
        output_wav: Destination path for the synthesized audio.
        output_json: Destination path for the response text.
        timeout: Max seconds to wait for the subprocess (default 15 min).

    Raises:
        RuntimeError: if the subprocess exceeds `timeout`.
        subprocess.CalledProcessError: if the CLI exits non-zero.
    """
    cmd = [
        "python", "-m", "moshi.offline",
        "--voice-prompt", voice_prompt,
        "--input-wav", input_path,
        "--seed", str(seed),
        "--output-wav", output_wav,
        "--output-text", output_json
    ]
    prompt_path = None
    try:
        if text_prompt:
            # delete=False so the path outlives the `with` and the subprocess
            # can read it; cleaned up in `finally` below.
            with tempfile.NamedTemporaryFile(
                mode="w", delete=False, suffix=".txt"
            ) as prompt_file:
                prompt_file.write(text_prompt)
                prompt_path = prompt_file.name
            cmd += ["--text-prompt", prompt_path]
        # BUG FIX: the original only bounded the prompt-file path with a
        # timeout; the no-prompt path called check_call(cmd) with no timeout
        # and could hang forever. Both paths are now bounded.
        subprocess.check_call(cmd, timeout=timeout)
    except subprocess.TimeoutExpired:
        raise RuntimeError("Inference timed out — CPU is slow, try shorter input audio.")
    finally:
        # Best-effort cleanup of the temp prompt file (if one was created).
        if prompt_path and os.path.exists(prompt_path):
            os.unlink(prompt_path)


def gradio_generate(input_audio, text_prompt, voice_prompt, seed):
    """Gradio generator handler: yields (audio_path, text) pairs.

    First yields a (None, status) placeholder so the UI shows progress, then
    the final (wav_path, response_text) once inference finishes.

    Raises:
        gr.Error: on missing input audio or any inference failure (wrapped).
    """
    if input_audio is None:
        raise gr.Error("Please record or upload audio for Sofia to hear you!")
    # Empty/whitespace custom persona falls back to the default one.
    full_prompt = text_prompt.strip() or DEFAULT_PERSONA
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            output_wav = os.path.join(tmpdir, "sofia_response.wav")
            output_json = os.path.join(tmpdir, "sofia_response.json")
            # Interim yield: lets the UI display a status message while the
            # (potentially multi-minute) subprocess runs.
            yield None, "Processing... Sofia is thinking (expect 1–5+ minutes on free CPU)..."
            run_offline_inference(input_audio, full_prompt, voice_prompt,
                                  seed, output_wav, output_json)
            with open(output_json, "r") as f:
                text = f.read().strip()
            yield output_wav, text
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}\n(Try shorter audio clips or check Space logs)")


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft(primary_hue="pink", secondary_hue="purple")) as demo:
    gr.Markdown("# Sofia — Your PersonaPlex AI Companion")
    gr.Markdown(
        "Record or upload short audio → Sofia responds! \n"
        "**CPU note:** First load takes time (model download + init). Responses: 1–5+ min. Use short inputs (5–15 sec)."
    )
    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Speak to Sofia (mic or upload WAV/MP3)",
                waveform_options=gr.WaveformOptions(
                    show_recording_waveform=True,  # show waveform while recording
                    show_controls=False  # hide extra player buttons
                )
            )
            text_prompt = gr.Textbox(
                label="Custom Persona / Role for Sofia (optional)",
                placeholder=DEFAULT_PERSONA,
                lines=4,
                value=""
            )
            voice_prompt = gr.Dropdown(
                choices=VOICE_CHOICES,
                label="Sofia's Voice Style",
                value="NATF2.pt"
            )
            seed = gr.Number(label="Random Seed", value=42424242, precision=0)
            submit_btn = gr.Button("Send to Sofia →", variant="primary")
        with gr.Column():
            output_audio = gr.Audio(label="Sofia's Response (Audio)", autoplay=True)
            output_text = gr.Textbox(label="Sofia's Response (Text)", lines=6)
            status = gr.Textbox(label="Status", interactive=False, value="Ready...")
    submit_btn.click(
        fn=gradio_generate,
        inputs=[input_audio, text_prompt, voice_prompt, seed],
        outputs=[output_audio, output_text],
    ).then(
        # BUG FIX: this string literal was broken across a raw newline in the
        # original source (a syntax error); it is now a single-line literal.
        lambda: "Done! Play Sofia's response above ↑",
        outputs=status
    )

# Serve the Gradio UI from the FastAPI app at the root path.
gr.mount_gradio_app(app, demo, path="/")


@app.post("/generate")
async def api_generate(
    input_audio: UploadFile = File(...),
    text_prompt: str = Form(None),
    voice_prompt: str = Form("NATF2.pt"),
    seed: int = Form(42424242)
):
    """JSON API: run inference on an uploaded audio file.

    Returns a JSON object with:
        audio: base64-encoded WAV bytes of Sofia's spoken response.
        text:  Sofia's response text.

    Raises:
        HTTPException(400): when no audio file is provided.
    """
    if not input_audio:
        raise HTTPException(400, "No audio file provided")
    text_prompt = text_prompt or DEFAULT_PERSONA
    with tempfile.TemporaryDirectory() as tmpdir:
        input_path = os.path.join(tmpdir, "input.wav")
        with open(input_path, "wb") as f:
            f.write(await input_audio.read())
        output_wav = os.path.join(tmpdir, "sofia_output.wav")
        output_json = os.path.join(tmpdir, "sofia_output.json")
        run_offline_inference(input_path, text_prompt, voice_prompt,
                              seed, output_wav, output_json)
        with open(output_wav, "rb") as f:
            audio_data = f.read()
        with open(output_json, "r") as f:
            text = f.read().strip()
    # BUG FIX: the original returned raw bytes in the JSON dict; FastAPI's
    # JSON encoder tries bytes.decode() and fails (UnicodeDecodeError) on
    # binary WAV data, so the endpoint could never respond successfully.
    # Base64-encode the audio so the response is valid JSON.
    return {"audio": base64.b64encode(audio_data).decode("ascii"), "text": text}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)