import gradio as gr
import torch
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer
from gtts import gTTS
import numpy as np
import tempfile
import os
import google.generativeai as genai

# Read the Google GenAI API key from an environment variable.
# Never hard-code credentials in source files.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

# Load GenAI model
print("Loading Google Generative AI model...")
gen_model = genai.GenerativeModel("gemini-1.5-pro")

# Load ASR model (wav2vec2 expects 16 kHz audio; given a sampling_rate,
# the pipeline resamples via torchaudio)
print("Loading ASR model...")
speech_to_text_pipeline = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
)

# Load GPT-2 as a local fallback generator
print("Loading GPT-2 model...")
response_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
response_model = GPT2LMHeadModel.from_pretrained("gpt2")
response_model.eval()


# Main logic
def process_input(emotion, audio_input, text_input):
    print(f"\n---\nEmotion: {emotion}")

    # Handle audio input
    audio_text = ""
    if audio_input is not None:
        print("Audio input detected. Transcribing...")
        try:
            sample_rate, audio_data = audio_input
            if len(audio_data) == 0 or np.all(audio_data == 0):
                print("Silent or empty audio.")
            else:
                # Gradio delivers int16 PCM; downmix stereo to mono,
                # then convert to float32 and peak-normalize
                if audio_data.ndim > 1:
                    audio_data = audio_data.mean(axis=1)
                audio_data = audio_data.astype(np.float32)
                audio_data = audio_data / np.max(np.abs(audio_data))
                audio_text = speech_to_text_pipeline({
                    "sampling_rate": sample_rate,
                    "array": audio_data,
                })["text"]
                print(f"Audio transcription: {audio_text}")
        except Exception as e:
            print(f"Speech-to-text error: {e}")
            audio_text = ""

    # Combine typed text and transcribed audio
    combined_input_text = ((text_input or "") + " " + (audio_text or "")).strip()
    print(f"User input: {combined_input_text}")

    if not combined_input_text:
        return "Please provide text or audio input.", None

    # Add emotion context
    prompt = f"The user feels {emotion}. Respond supportively: {combined_input_text}"
    print(f"Final prompt to model: {prompt}")

    # Use Google GenAI; fall back to local GPT-2 if the API call fails
    try:
        gen_response = gen_model.generate_content(prompt)
        text_output = gen_response.text.strip()
        print(f"Google GenAI response: {text_output}")
    except Exception as e:
        print(f"GenAI Error: {e}")
        print("Falling back to GPT-2...")
        try:
            # Keep the prompt within GPT-2's context window
            input_ids = response_tokenizer.encode(prompt, return_tensors="pt")[:, -512:]
            with torch.no_grad():
                output = response_model.generate(
                    input_ids=input_ids,
                    max_length=input_ids.shape[1] + 50,
                    num_beams=3,
                    do_sample=True,  # required for temperature to take effect
                    temperature=0.8,
                    no_repeat_ngram_size=2,
                    early_stopping=True,
                    pad_token_id=response_tokenizer.eos_token_id,
                )
            # Decode only the newly generated tokens, not the echoed prompt
            text_output = response_tokenizer.decode(
                output[0][input_ids.shape[1]:], skip_special_tokens=True
            )
            print(f"GPT-2 fallback response: {text_output}")
        except Exception as gpt_error:
            print(f"GPT-2 Error: {gpt_error}")
            text_output = "Sorry, I couldn't generate a response."
    # Convert the response to speech with gTTS
    try:
        print("Generating speech...")
        tts = gTTS(text_output)
        # Create the temp file, then close it before gTTS writes to the path
        # (keeps this portable, e.g. on Windows)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
            audio_output_path = temp_file.name
        tts.save(audio_output_path)
        print(f"TTS audio saved at: {audio_output_path}")
    except Exception as e:
        print(f"TTS Error: {e}")
        audio_output_path = None

    return text_output, audio_output_path


# Gradio Interface
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Radio(["positive", "neutral", "negative"], label="Your Emotion"),
        gr.Audio(type="numpy", label="Speak..."),
        gr.Textbox(label="Text Input", placeholder="Or type here..."),
    ],
    outputs=[
        gr.Textbox(label="AI Response"),
        gr.Audio(label="Spoken Response"),
    ],
    title="Emotion-Aware Multimodal AI Assistant",
    description="Choose your emotional state, then talk or type to the AI assistant. It responds based on your emotional context.",
)

if __name__ == "__main__":
    iface.launch()
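
# Example local run, a minimal sketch: the filename "app.py" and the exact
# package list below are assumptions, not part of the original script.
# torchaudio is included because the ASR pipeline uses it to resample
# microphone audio that isn't already 16 kHz.
#
#   pip install gradio torch torchaudio transformers gTTS google-generativeai numpy
#   export GOOGLE_API_KEY="your-key-here"
#   python app.py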