# SwaGPT / app.py — by Stanley03 (commit de7f0d0, verified)
import torch
from transformers import pipeline, VitsModel, AutoTokenizer
import scipy.io.wavfile
import gradio as gr
import tempfile
import os
import google.generativeai as genai
# ====================================================================
# SwaGPT Final Deployment Script (Optimized for Hugging Face Spaces)
# This script loads the GEMINI_API_KEY from the Space Secrets for security.
# ====================================================================

# 1. Model identifiers — tiny checkpoints chosen so the app fits on a
#    free CPU Space.
STT_MODEL = "openai/whisper-tiny"
# IMPORTANT: Replace with your custom trained model ID once uploaded!
TTS_MODEL_ID = "facebook/mms-tts-swh"

# Load all three components up front so the first request is fast.
print("Loading AI components...")
stt_pipe = pipeline("automatic-speech-recognition", model=STT_MODEL, device="cpu")
tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID)
tts_model = VitsModel.from_pretrained(TTS_MODEL_ID)
# 2. Configure the Gemini API.
# The key is read from the environment; on Hugging Face Spaces it is
# injected from the repository's Secrets settings.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    print("WARNING: GEMINI_API_KEY not found in environment variables. LLM will not work.")
else:
    genai.configure(api_key=GEMINI_API_KEY)
    print("Gemini API configured successfully from Secrets.")
def voice_agent_chat(audio_path):
    """Run the full voice-agent pipeline: listen, think, speak.

    Args:
        audio_path: Filesystem path to the user's recorded audio clip,
            or None when nothing was recorded.

    Returns:
        A ``(text, audio)`` tuple: the assistant's reply as a string and
        the path to a WAV file containing the spoken reply (``None`` for
        the audio part on early-exit errors).
    """
    if not GEMINI_API_KEY:
        return "ERROR: Gemini API Key is missing. Please set the GEMINI_API_KEY secret in your Space settings.", None
    if audio_path is None:
        return "Tafadhali rekodi sauti yako.", None

    # Step 1: Speech-to-Text (Listen)
    stt_result = stt_pipe(audio_path, generate_kwargs={"language": "swahili"})
    user_text = stt_result["text"].strip()
    if not user_text:
        # Silence/noise produced no words — ask the user to record again
        # instead of sending an empty prompt to Gemini.
        return "Tafadhali rekodi sauti yako.", None

    # Step 2: Gemini Intelligence (Think)
    try:
        model = genai.GenerativeModel('gemini-1.5-flash')
        # System Prompt for Kiswahili AI Personality
        system_instruction = "Wewe ni SwaGPT, msaidizi wa akili mnemba unayezungumza Kiswahili sanifu. Jibu kwa ufupi sana (sentensi 1-2)."
        prompt = f"{system_instruction}\n\nMtumiaji: {user_text}"
        response = model.generate_content(prompt)
        ai_response = response.text
    except Exception as e:
        # Keep the app usable on API failures (quota, network, ...): the
        # error text itself becomes the spoken reply.
        ai_response = f"Tatizo la API: {str(e)}. Huenda umefikia kikomo cha matumizi ya bure."

    # Step 3: Text-to-Speech (Speak)
    inputs = tts_tokenizer(ai_response, return_tensors="pt")
    with torch.no_grad():
        output = tts_model(**inputs).waveform
    sampling_rate = tts_model.config.sampling_rate
    # Fix: close the temp-file handle immediately (the original leaked one
    # file descriptor per call). delete=False keeps the file on disk for
    # Gradio; scipy then reopens it by name to write the waveform.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
        wav_path = temp_file.name
    scipy.io.wavfile.write(wav_path, rate=sampling_rate, data=output.float().numpy().T)
    return ai_response, wav_path
# Assemble the Gradio UI: microphone input on the left, the agent's text
# and audio reply on the right.
with gr.Blocks(title="SwaGPT Intelligent Voice Agent") as demo:
    gr.Markdown("# 🤖 SwaGPT Intelligent Voice Agent")
    gr.Markdown("Zungumza na SwaGPT! Mfumo huu unatumia Gemini kufikiri na SwaGPT kuzungumza.")

    with gr.Row():
        # Input column: record a clip and kick off the pipeline.
        with gr.Column():
            gr.Markdown("### 1. Zungumza (Talk)")
            mic_input = gr.Audio(label="Rekodi Sauti", type="filepath")
            talk_button = gr.Button("Anza Mazungumzo", variant="primary")
        # Output column: the agent's written and spoken reply.
        with gr.Column():
            gr.Markdown("### 2. Jibu (Response)")
            reply_text = gr.Textbox(label="Maandishi ya AI")
            reply_audio = gr.Audio(label="Sauti ya AI")

    talk_button.click(fn=voice_agent_chat, inputs=mic_input, outputs=[reply_text, reply_audio])

if __name__ == "__main__":
    demo.launch()