# Mind_Aid_Demo / app.py
import gradio as gr
import torch
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer
from gtts import gTTS
import numpy as np
import tempfile
import os
import google.generativeai as genai
# Configure Google GenAI from an environment variable; never hard-code API keys.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)
# Load GenAI model
print("Loading Google Generative AI model...")
gen_model = genai.GenerativeModel("gemini-1.5-pro")
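# gemini-1.5-pro is the primary responder; the local GPT-2 loaded below is used only as a fallback.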
# Load ASR
print("Loading ASR model...")
speech_to_text_pipeline = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
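# Note: wav2vec2-base-960h expects 16 kHz mono audio; the ASR pipeline resamples dict
# inputs whose "sampling_rate" differs (this relies on torchaudio being installed).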
# Load GPT-2
print("Loading GPT-2 model...")
response_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
response_model = GPT2LMHeadModel.from_pretrained("gpt2")
response_model.eval()
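# eval() switches off dropout for inference.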
# Main logic
def process_input(emotion, audio_input, text_input):
print(f"\n---\nEmotion: {emotion}")
# Handle audio input
audio_text = ""
if audio_input is not None:
print("Audio input detected. Transcribing...")
try:
            sample_rate, audio_data = audio_input
            # Gradio can hand back int16 and/or stereo samples; fold to mono float32 first.
            audio_data = np.asarray(audio_data, dtype=np.float32)
            if audio_data.ndim > 1:
                audio_data = audio_data.mean(axis=1)
            if audio_data.size == 0 or np.all(audio_data == 0):
                print("Silent or empty audio.")
            else:
                # Peak-normalize to [-1, 1]; safe because the all-zero case is handled above.
                audio_data = audio_data / np.max(np.abs(audio_data))
                audio_text = speech_to_text_pipeline({
                    "sampling_rate": sample_rate,
                    "array": audio_data,
                })["text"]
                print(f"Audio transcription: {audio_text}")
print(f"Audio transcription: {audio_text}")
except Exception as e:
print(f"Speech-to-text error: {e}")
audio_text = ""
# Combine input
combined_input_text = (text_input or "") + " " + (audio_text or "")
combined_input_text = combined_input_text.strip()
print(f"User input: {combined_input_text}")
if not combined_input_text:
return "Please provide text or audio input.", None
# Add emotion context
prompt = f"The user feels {emotion}. Respond supportively: {combined_input_text}"
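    # The call is stateless: each turn sends a single flat prompt, with no chat history carried over.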
print(f"Final prompt to model: {prompt}")
# Use Google GenAI
try:
gen_response = gen_model.generate_content(prompt)
text_output = gen_response.text.strip()
print(f"Google GenAI response: {text_output}")
except Exception as e:
print(f"GenAI Error: {e}")
# Fallback to GPT-2
print("Falling back to GPT-2...")
try:
            input_ids = response_tokenizer.encode(prompt, return_tensors='pt')[:, -512:]
            with torch.no_grad():
                output = response_model.generate(
                    input_ids=input_ids,
                    max_length=input_ids.shape[1] + 50,
                    num_beams=3,
                    # temperature is ignored by pure beam search, so it is omitted here
                    no_repeat_ngram_size=2,
                    early_stopping=True,
                    # GPT-2 has no pad token; reuse EOS to avoid a generate() warning
                    pad_token_id=response_tokenizer.eos_token_id,
                )
            # Decode only the newly generated tokens so the reply does not echo the prompt
            text_output = response_tokenizer.decode(
                output[0][input_ids.shape[1]:], skip_special_tokens=True
            )
print(f"GPT-2 fallback response: {text_output}")
except Exception as gpt_error:
print(f"GPT-2 Error: {gpt_error}")
text_output = "Sorry, I couldn't generate a response."
# TTS conversion
try:
print("Generating speech...")
tts = gTTS(text_output)
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        temp_file.close()  # release the OS handle so gTTS can write to the path portably
        tts.save(temp_file.name)
        audio_output_path = temp_file.name
print(f"TTS audio saved at: {audio_output_path}")
except Exception as e:
print(f"TTS Error: {e}")
audio_output_path = None
return text_output, audio_output_path
# Gradio Interface
iface = gr.Interface(
fn=process_input,
inputs=[
gr.Radio(["positive", "neutral", "negative"], label="Your Emotion"),
gr.Audio(type="numpy", label="Speak..."),
gr.Textbox(label="Text Input", placeholder="Or type here..."),
],
outputs=[
gr.Textbox(label="AI Response"),
gr.Audio(label="Spoken Response"),
],
title="Emotion-Aware Multimodal AI Assistant",
description="Choose your emotional state, then talk or type to the AI assistant. It responds based on your emotional context.",
)
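# Quick smoke test without the UI (hypothetical example input; uncomment to try):
#   reply, speech_path = process_input("negative", None, "I had a rough day.")
#   print(reply, speech_path)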
if __name__ == "__main__":
iface.launch()