Update app.py

app.py CHANGED

@@ -5,97 +5,120 @@ from gtts import gTTS
  import numpy as np
  import tempfile
  import os

- # 1. Load ASR model
  print("Loading ASR model...")
- speech_to_text_pipeline = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")

- # 2. Load GPT-2 model
  print("Loading GPT-2 model...")
- response_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
- response_model = GPT2LMHeadModel.from_pretrained("gpt2")
  response_model.eval()

- # 3. Main logic
- def process_input(audio_input, text_input):
-     print("\n---")

-     # Handle audio input
      audio_text = ""
      if audio_input is not None:
-         print("Audio input detected. Transcribing...")
          try:
              sample_rate, audio_data = audio_input
              if len(audio_data) == 0 or np.all(audio_data == 0):
-                 print("Silent or empty audio.")
              else:
-                 audio_data = audio_data / np.max(np.abs(audio_data))
                  audio_text = speech_to_text_pipeline({
                      "sampling_rate": sample_rate,
                      "array": audio_data
                  })["text"]
-                 print(f"Audio transcription: {audio_text}")
          except Exception as e:
-             print(f"Speech-to-text error: {e}")
              audio_text = ""

-     # Combine input
      combined_input_text = (text_input or "") + " " + (audio_text or "")
      combined_input_text = combined_input_text.strip()
-     print(f"User input: {combined_input_text}")

-     # Generate response
-     if combined_input_text:
-         print("Generating response...")
-         input_ids = response_tokenizer.encode(combined_input_text, return_tensors='pt')
          try:
              with torch.no_grad():
                  output = response_model.generate(
                      input_ids=input_ids,
-                     max_length=input_ids.shape[1] + 50,
                      num_beams=3,
                      temperature=0.8,
                      no_repeat_ngram_size=2,
                      early_stopping=True
                  )
              text_output = response_tokenizer.decode(output[0], skip_special_tokens=True)
-             print(f"GPT-2 response: {text_output}")
-         except Exception as e:
-             print(f"Generation error: {e}")
              text_output = "Sorry, I couldn't generate a response."
-     else:
-         text_output = "Please provide audio or text input."
-     print(text_output)

-     # 4. Text-to-speech
      try:
-         print("Generating speech...")
          tts = gTTS(text_output)
          temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
          tts.save(temp_file.name)
          audio_output_path = temp_file.name
-         print(f"TTS audio saved at: {audio_output_path}")
      except Exception as e:
          print(f"TTS Error: {e}")
          audio_output_path = None

      return text_output, audio_output_path

- # 5. Gradio Interface
  iface = gr.Interface(
-     fn=process_input,
      inputs=[
-         gr.Audio(type="numpy", label="Speak..."),
          gr.Textbox(label="Text Input", placeholder="Or type here..."),
      ],
      outputs=[
          gr.Textbox(label="AI Response"),
          gr.Audio(label="Spoken Response"),
      ],
-     title="Multimodal AI Assistant",
-     description="...",
  )

- # 6. Launch
  if __name__ == "__main__":
      iface.launch()

  import numpy as np
  import tempfile
  import os
+ import google.generativeai as genai

+ # Set the Google GenAI API key from an environment variable
+ # (add GOOGLE_API_KEY as a Space secret; never hardcode keys in source)
+ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+ genai.configure(api_key=GOOGLE_API_KEY)
+
+ # Load GenAI model
+ print("Loading Google Generative AI model...")
+ gen_model = genai.GenerativeModel("gemini-1.5-pro")
+
+ # Load ASR
  print("Loading ASR model...")
+ speech_to_text_pipeline = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")

+ # Load GPT-2
  print("Loading GPT-2 model...")
+ response_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+ response_model = GPT2LMHeadModel.from_pretrained("gpt2")
  response_model.eval()

+ # Main logic
+ def process_input(emotion, audio_input, text_input):
+     print(f"\n---\nEmotion: {emotion}")

+     # Handle audio input
      audio_text = ""
      if audio_input is not None:
+         print("Audio input detected. Transcribing...")
          try:
              sample_rate, audio_data = audio_input
              if len(audio_data) == 0 or np.all(audio_data == 0):
+                 print("Silent or empty audio.")
              else:
+                 audio_data = audio_data / np.max(np.abs(audio_data))
                  audio_text = speech_to_text_pipeline({
                      "sampling_rate": sample_rate,
                      "array": audio_data
                  })["text"]
+                 print(f"Audio transcription: {audio_text}")
          except Exception as e:
+             print(f"Speech-to-text error: {e}")
              audio_text = ""

+     # Combine input
      combined_input_text = (text_input or "") + " " + (audio_text or "")
      combined_input_text = combined_input_text.strip()
+     print(f"User input: {combined_input_text}")
+
+     if not combined_input_text:
+         return "Please provide text or audio input.", None

+     # Add emotion context
+     prompt = f"The user feels {emotion}. Respond supportively: {combined_input_text}"
+     print(f"Final prompt to model: {prompt}")
+
+     # Use Google GenAI
+     try:
+         gen_response = gen_model.generate_content(prompt)
+         text_output = gen_response.text.strip()
+         print(f"Google GenAI response: {text_output}")
+     except Exception as e:
+         print(f"GenAI Error: {e}")
+         # Fallback to GPT-2
+         print("Falling back to GPT-2...")
          try:
+             input_ids = response_tokenizer.encode(prompt, return_tensors='pt')[:, -512:]
              with torch.no_grad():
                  output = response_model.generate(
                      input_ids=input_ids,
+                     max_length=input_ids.shape[1] + 50,
                      num_beams=3,
                      temperature=0.8,
                      no_repeat_ngram_size=2,
                      early_stopping=True
                  )
              text_output = response_tokenizer.decode(output[0], skip_special_tokens=True)
+             print(f"GPT-2 fallback response: {text_output}")
+         except Exception as gpt_error:
+             print(f"GPT-2 Error: {gpt_error}")
              text_output = "Sorry, I couldn't generate a response."

+     # TTS conversion
      try:
+         print("Generating speech...")
          tts = gTTS(text_output)
          temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
          tts.save(temp_file.name)
          audio_output_path = temp_file.name
+         print(f"TTS audio saved at: {audio_output_path}")
      except Exception as e:
          print(f"TTS Error: {e}")
          audio_output_path = None

      return text_output, audio_output_path

+ # Gradio Interface
  iface = gr.Interface(
+     fn=process_input,
      inputs=[
+         gr.Radio(["positive", "neutral", "negative"], label="Your Emotion"),
+         gr.Audio(type="numpy", label="Speak..."),
          gr.Textbox(label="Text Input", placeholder="Or type here..."),
      ],
      outputs=[
          gr.Textbox(label="AI Response"),
          gr.Audio(label="Spoken Response"),
      ],
+     title="Emotion-Aware Multimodal AI Assistant",
+     description="Choose your emotional state, then talk or type to the AI assistant. It responds based on your emotional context.",
  )

  if __name__ == "__main__":
      iface.launch()
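The refactored process_input can be exercised without the browser UI. Below is a minimal, hypothetical smoke test (not part of this commit): it assumes the Space's GOOGLE_API_KEY secret, or a local environment variable of the same name, is set so genai.configure picks it up at import time, and the emotion/text values are made-up examples. If the Gemini call fails, this exercises the GPT-2 fallback path instead.

# smoke_test.py - hypothetical local check, not part of the commit.
# Importing app triggers its model loading, so the first run is slow;
# iface.launch() is guarded by __main__ and does not run on import.
from app import process_input

reply, mp3_path = process_input(
    emotion="negative",               # one of the gr.Radio choices
    audio_input=None,                 # skip ASR; exercise the text-only path
    text_input="I had a rough day.",  # made-up example input
)
print("Reply:", reply)
print("Audio file:", mp3_path)        # path to the gTTS mp3, or None if TTS failed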