Fluospark128 committed
Commit 8a5ea2d · verified · 1 parent: 7a1a9b5

Update app.py

Files changed (1): app.py (+59 -36)

app.py CHANGED
@@ -5,97 +5,120 @@ from gtts import gTTS
 import numpy as np
 import tempfile
 import os
 
-# 1. Speech-to-Text pipeline
 print("Loading ASR model...")
-speech_to_text_pipeline = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")  # alternative: model="openai/whisper-small"
 
-# 2. Text generation model (GPT-2)
 print("Loading GPT-2 model...")
-response_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-response_model = GPT2LMHeadModel.from_pretrained('gpt2')
 response_model.eval()
 
-# 3. Main logic
-def process_audio_and_text(audio_input, text_input):
-    print("Processing input...")
 
-    # 1. Handle audio input
     audio_text = ""
     if audio_input is not None:
-        print("Audio input detected. Starting transcription...")
         try:
             sample_rate, audio_data = audio_input
             if len(audio_data) == 0 or np.all(audio_data == 0):
-                print("Empty or silent audio input.")
             else:
-                audio_data = audio_data / np.max(np.abs(audio_data))  # Normalize
                 audio_text = speech_to_text_pipeline({
                     "sampling_rate": sample_rate,
                     "array": audio_data
                 })["text"]
-                print(f"Transcribed Audio: {audio_text}")
         except Exception as e:
-            print(f"Speech-to-Text Error: {e}")
             audio_text = ""
 
-    # 2. Combine inputs
     combined_input_text = (text_input or "") + " " + (audio_text or "")
     combined_input_text = combined_input_text.strip()
-    print(f"Combined input: {combined_input_text}")
 
-    # 3. Generate response
-    if combined_input_text:
-        input_ids = response_tokenizer.encode(combined_input_text, return_tensors='pt')[:, -512:]  # trim context
-        print("Generating response...")
         try:
             with torch.no_grad():
                 output = response_model.generate(
                     input_ids=input_ids,
-                    max_length=input_ids.shape[1] + 50,  # short responses
                     num_beams=3,
                     temperature=0.8,
                     no_repeat_ngram_size=2,
                     early_stopping=True
                 )
             text_output = response_tokenizer.decode(output[0], skip_special_tokens=True)
-            print(f"Generated text: {text_output}")
-        except Exception as e:
-            print(f"Text generation error: {e}")
             text_output = "Sorry, I couldn't generate a response."
-    else:
-        text_output = "Please provide audio or text input."
-        print(text_output)
 
-    # 4. Convert to speech
     try:
-        print("Generating audio response...")
         tts = gTTS(text_output)
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
         tts.save(temp_file.name)
         audio_output_path = temp_file.name
-        print(f"Audio saved to {audio_output_path}")
     except Exception as e:
         print(f"TTS Error: {e}")
         audio_output_path = None
 
     return text_output, audio_output_path
 
-# 5. Gradio Interface
 iface = gr.Interface(
-    fn=process_audio_and_text,
     inputs=[
-        gr.Audio(type="numpy", label="Speak..."),  # max_duration=10 disabled
         gr.Textbox(label="Text Input", placeholder="Or type here..."),
     ],
     outputs=[
         gr.Textbox(label="AI Response"),
         gr.Audio(label="Spoken Response"),
     ],
-    title="Multimodal Conversational AI",
-    description="Talk or type to the AI assistant. It will reply with both text and voice.",
 )
 
-# 6. Launch
 if __name__ == "__main__":
     iface.launch()
 
 import numpy as np
 import tempfile
 import os
+import google.generativeai as genai
 
+# Set Google GenAI API key from environment variable; never hardcode secrets in the repository
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+genai.configure(api_key=GOOGLE_API_KEY)
+
+# Load GenAI model
+print("Loading Google Generative AI model...")
+gen_model = genai.GenerativeModel("gemini-1.5-pro")
+
+# Load ASR
 print("Loading ASR model...")
+speech_to_text_pipeline = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
 
+# Load GPT-2
 print("Loading GPT-2 model...")
+response_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+response_model = GPT2LMHeadModel.from_pretrained("gpt2")
 response_model.eval()
 
+# Main logic
+def process_input(emotion, audio_input, text_input):
+    print(f"\n---\nEmotion: {emotion}")
 
+    # Handle audio input
     audio_text = ""
     if audio_input is not None:
+        print("Audio input detected. Transcribing...")
         try:
             sample_rate, audio_data = audio_input
             if len(audio_data) == 0 or np.all(audio_data == 0):
+                print("Silent or empty audio.")
             else:
+                audio_data = audio_data / np.max(np.abs(audio_data))
                 audio_text = speech_to_text_pipeline({
                     "sampling_rate": sample_rate,
                     "array": audio_data
                 })["text"]
+                print(f"Audio transcription: {audio_text}")
         except Exception as e:
+            print(f"Speech-to-text error: {e}")
             audio_text = ""
 
+    # Combine input
     combined_input_text = (text_input or "") + " " + (audio_text or "")
     combined_input_text = combined_input_text.strip()
+    print(f"User input: {combined_input_text}")
+
+    if not combined_input_text:
+        return "Please provide text or audio input.", None
 
+    # Add emotion context
+    prompt = f"The user feels {emotion}. Respond supportively: {combined_input_text}"
+    print(f"Final prompt to model: {prompt}")
+
+    # Use Google GenAI
+    try:
+        gen_response = gen_model.generate_content(prompt)
+        text_output = gen_response.text.strip()
+        print(f"Google GenAI response: {text_output}")
+    except Exception as e:
+        print(f"GenAI Error: {e}")
+        # Fallback to GPT-2
+        print("Falling back to GPT-2...")
         try:
+            input_ids = response_tokenizer.encode(prompt, return_tensors='pt')[:, -512:]
             with torch.no_grad():
                 output = response_model.generate(
                     input_ids=input_ids,
+                    max_length=input_ids.shape[1] + 50,
                     num_beams=3,
                     temperature=0.8,
                     no_repeat_ngram_size=2,
                     early_stopping=True
                 )
             text_output = response_tokenizer.decode(output[0], skip_special_tokens=True)
+            print(f"GPT-2 fallback response: {text_output}")
+        except Exception as gpt_error:
+            print(f"GPT-2 Error: {gpt_error}")
             text_output = "Sorry, I couldn't generate a response."
 
+    # TTS conversion
     try:
+        print("Generating speech...")
         tts = gTTS(text_output)
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
         tts.save(temp_file.name)
         audio_output_path = temp_file.name
+        print(f"TTS audio saved at: {audio_output_path}")
     except Exception as e:
         print(f"TTS Error: {e}")
         audio_output_path = None
 
     return text_output, audio_output_path
 
+# Gradio Interface
 iface = gr.Interface(
+    fn=process_input,
     inputs=[
+        gr.Radio(["positive", "neutral", "negative"], label="Your Emotion"),
+        gr.Audio(type="numpy", label="Speak..."),
         gr.Textbox(label="Text Input", placeholder="Or type here..."),
     ],
     outputs=[
         gr.Textbox(label="AI Response"),
         gr.Audio(label="Spoken Response"),
     ],
+    title="Emotion-Aware Multimodal AI Assistant",
+    description="Choose your emotional state, then talk or type to the AI assistant. It responds based on your emotional context.",
 )
 
 if __name__ == "__main__":
     iface.launch()
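
The configuration block above reads the Gemini key from the environment. A minimal sketch of wiring that up safely, assuming the google-generativeai package and a GOOGLE_API_KEY secret set in the Space settings (the fail-fast check is an addition, not part of app.py):

import os
import google.generativeai as genai

# Fail fast if the secret is missing, rather than sending requests with api_key=None.
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("GOOGLE_API_KEY environment variable is not set")

genai.configure(api_key=api_key)
gen_model = genai.GenerativeModel("gemini-1.5-pro")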
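With gr.Audio(type="numpy"), Gradio hands process_input a (sample_rate, array) tuple, typically int16 and possibly stereo, while wav2vec2 works on float mono audio. A hedged pre-processing sketch; the prepare_audio helper is hypothetical and not part of app.py:

import numpy as np

def prepare_audio(audio_input):
    """Turn Gradio's (sample_rate, array) tuple into the dict the ASR pipeline expects."""
    sample_rate, audio_data = audio_input
    audio_data = audio_data.astype(np.float32)
    if audio_data.ndim > 1:              # stereo -> mono
        audio_data = audio_data.mean(axis=1)
    peak = np.max(np.abs(audio_data))
    if peak > 0:                         # skip normalization on silence; avoids division by zero
        audio_data = audio_data / peak
    return {"sampling_rate": sample_rate, "array": audio_data}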
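Because process_input is a plain function, the new code path can be exercised without launching the UI. A text-only smoke test, assuming the models loaded successfully (audio_input=None skips the ASR branch):

# Text-only call; returns (text_output, audio_output_path).
text, audio_path = process_input(
    emotion="negative",
    audio_input=None,
    text_input="I had a rough day at work.",
)
print(text)        # Gemini reply, or the GPT-2 fallback if the API call fails
print(audio_path)  # path to the gTTS .mp3, or None if TTS failed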