# Mind_Aid_Demo / app.py
import gradio as gr
import torch
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer
from gtts import gTTS
import numpy as np
import tempfile
import os
import google.generativeai as genai
# Configure Google GenAI from an environment variable; never hard-code API keys.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)
# Load GenAI model
print("Loading Google Generative AI model...")
gen_model = genai.GenerativeModel("gemini-1.5-pro")
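# gemini-1.5-pro is the primary responder; the local GPT-2 loaded below is used only as a fallback.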
# Load ASR
print("Loading ASR model...")
speech_to_text_pipeline = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
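# Note: wav2vec2-base-960h expects 16 kHz mono audio; the ASR pipeline resamples dict
# inputs whose "sampling_rate" differs (this relies on torchaudio being installed).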
# Load GPT-2
print("Loading GPT-2 model...")
response_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
response_model = GPT2LMHeadModel.from_pretrained("gpt2")
response_model.eval()
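# eval() switches off dropout for inference.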
# Main logic
def process_input(emotion, audio_input, text_input):
print(f"\n---\nEmotion: {emotion}")
# Handle audio input
audio_text = ""
if audio_input is not None:
print("Audio input detected. Transcribing...")
try:
            sample_rate, audio_data = audio_input
            # Gradio can hand back int16 and/or stereo samples; fold to mono float32 first.
            audio_data = np.asarray(audio_data, dtype=np.float32)
            if audio_data.ndim > 1:
                audio_data = audio_data.mean(axis=1)
            if audio_data.size == 0 or np.all(audio_data == 0):
                print("Silent or empty audio.")
            else:
                # Peak-normalize to [-1, 1]; safe because the all-zero case is handled above.
                audio_data = audio_data / np.max(np.abs(audio_data))
                audio_text = speech_to_text_pipeline({
                    "sampling_rate": sample_rate,
                    "array": audio_data,
                })["text"]
                print(f"Audio transcription: {audio_text}")
print(f"Audio transcription: {audio_text}")
except Exception as e:
print(f"Speech-to-text error: {e}")
audio_text = ""
# Combine input
combined_input_text = (text_input or "") + " " + (audio_text or "")
combined_input_text = combined_input_text.strip()
print(f"User input: {combined_input_text}")
if not combined_input_text:
return "Please provide text or audio input.", None
# Add emotion context
prompt = f"The user feels {emotion}. Respond supportively: {combined_input_text}"
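    # The call is stateless: each turn sends a single flat prompt, with no chat history carried over.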
print(f"Final prompt to model: {prompt}")
# Use Google GenAI
try:
gen_response = gen_model.generate_content(prompt)
text_output = gen_response.text.strip()
print(f"Google GenAI response: {text_output}")
except Exception as e:
print(f"GenAI Error: {e}")
# Fallback to GPT-2
print("Falling back to GPT-2...")
try:
            input_ids = response_tokenizer.encode(prompt, return_tensors='pt')[:, -512:]
            with torch.no_grad():
                output = response_model.generate(
                    input_ids=input_ids,
                    max_length=input_ids.shape[1] + 50,
                    num_beams=3,
                    # temperature is ignored by pure beam search, so it is omitted here
                    no_repeat_ngram_size=2,
                    early_stopping=True,
                    # GPT-2 has no pad token; reuse EOS to avoid a generate() warning
                    pad_token_id=response_tokenizer.eos_token_id,
                )
            # Decode only the newly generated tokens so the reply does not echo the prompt
            text_output = response_tokenizer.decode(
                output[0][input_ids.shape[1]:], skip_special_tokens=True
            )
print(f"GPT-2 fallback response: {text_output}")
except Exception as gpt_error:
print(f"GPT-2 Error: {gpt_error}")
text_output = "Sorry, I couldn't generate a response."
# TTS conversion
try:
print("Generating speech...")
tts = gTTS(text_output)
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        temp_file.close()  # release the OS handle so gTTS can write to the path portably
        tts.save(temp_file.name)
        audio_output_path = temp_file.name
print(f"TTS audio saved at: {audio_output_path}")
except Exception as e:
print(f"TTS Error: {e}")
audio_output_path = None
return text_output, audio_output_path
# Gradio Interface
iface = gr.Interface(
fn=process_input,
inputs=[
gr.Radio(["positive", "neutral", "negative"], label="Your Emotion"),
gr.Audio(type="numpy", label="Speak..."),
gr.Textbox(label="Text Input", placeholder="Or type here..."),
],
outputs=[
gr.Textbox(label="AI Response"),
gr.Audio(label="Spoken Response"),
],
title="Emotion-Aware Multimodal AI Assistant",
description="Choose your emotional state, then talk or type to the AI assistant. It responds based on your emotional context.",
)
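# Quick smoke test without the UI (hypothetical example input; uncomment to try):
#   reply, speech_path = process_input("negative", None, "I had a rough day.")
#   print(reply, speech_path)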
if __name__ == "__main__":
iface.launch()