Kashif12334 committed on
Commit
deca404
·
verified ·
1 Parent(s): 7333ef7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -216
app.py CHANGED
@@ -1,93 +1,48 @@
1
- import os
2
- import json
3
- import tempfile
4
- import torch
5
- import soundfile as sf
6
  import gradio as gr
7
  import requests
8
- import io
9
-
10
- from dotenv import load_dotenv
11
  from groq import Groq
12
- from PIL import Image
13
 
 
 
14
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 
15
 
16
- load_dotenv()
17
 
18
- # =============================
19
  # API KEYS
20
- # =============================
21
 
22
# Read API credentials from the environment (populated from .env by load_dotenv).
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
HF_TOKEN = os.getenv("HF_TOKEN")

client = Groq(api_key=GROQ_API_KEY)

# On-disk persistence for the chat history and the user's style preferences.
CHAT_FILE = "chat_history.json"
PREF_FILE = "preferences.json"
29
-
30
- # =============================
31
- # JSON HELPERS
32
- # =============================
33
-
34
def load_json(file, default):
    """Load JSON data from *file*.

    Returns *default* when the file is missing, unreadable, or contains
    invalid JSON, so callers always get a usable value.
    """
    try:
        with open(file, "r") as f:
            return json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file (replaces the os.path.exists
        # pre-check, which was racy). ValueError: json.JSONDecodeError is
        # a subclass, so corrupt content falls back to the default too.
        # The original bare `except:` also swallowed KeyboardInterrupt etc.
        return default
42
-
43
-
44
def save_json(file, data):
    """Serialize *data* to *file* as pretty-printed (indent=4) JSON."""
    with open(file, "w") as handle:
        handle.write(json.dumps(data, indent=4))
47
-
48
 
49
# Restore persisted state; first run falls back to an empty history and
# the "Default" style preference.
conversation_history = load_json(CHAT_FILE, [])
user_preferences = load_json(PREF_FILE, {"style": "Default"})
51
 
52
- # =============================
53
- # LOAD TTS MODEL
54
- # =============================
55
 
56
# Load the SpeechT5 text-to-speech stack: text processor, acoustic model,
# and the HiFi-GAN vocoder that converts spectrograms to waveforms.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# NOTE(review): a random 512-dim vector is not a trained speaker x-vector;
# the synthesized voice will differ on every process start and may sound
# degraded — consider a real x-vector (e.g. cmu-arctic-xvectors). TODO confirm.
speaker_embeddings = torch.randn(1, 512)
61
-
62
- # =============================
63
- # HUGGING FACE IMAGE API
64
- # =============================
65
-
66
# Hugging Face Inference API endpoint used for image generation (SDXL-Turbo).
HF_API_URL = "https://api-inference.huggingface.co/models/stabilityai/sdxl-turbo"

# Bearer-token auth header reused by every image request.
headers = {
    "Authorization": f"Bearer {HF_TOKEN}"
}
71
-
72
- # =============================
73
- # SPEECH TO TEXT
74
- # =============================
75
-
76
def transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* via Groq's Whisper API.

    Returns the transcription text.
    """
    with open(audio_path, "rb") as fh:
        result = client.audio.transcriptions.create(
            file=fh,
            model="whisper-large-v3",
        )
    return result.text
 
 
86
 
87
 
88
- # =============================
89
- # TEXT TO SPEECH
90
- # =============================
91
 
92
  def text_to_speech(text):
93
 
@@ -111,41 +66,23 @@ def text_to_speech(text):
111
  return temp_audio.name
112
 
113
 
114
- # =============================
115
- # IMAGE TRIGGER
116
- # =============================
117
-
118
def should_generate_image(user_prompt):
    """Return True when *user_prompt* contains an image-related keyword.

    Matching is case-insensitive substring matching, so e.g. "SHOW ME"
    and "images" both trigger.
    """
    triggers = (
        "draw",
        "diagram",
        "visualize",
        "show me",
        "illustration",
        "picture",
        "image",
        "architecture",
    )
    lowered = user_prompt.lower()
    return any(trigger in lowered for trigger in triggers)
136
 
 
137
 
138
- # =============================
139
- # IMAGE GENERATION
140
- # =============================
141
 
142
- def generate_image(prompt):
 
 
143
 
144
- response = requests.post(
145
- HF_API_URL,
146
- headers=headers,
147
- json={"inputs": prompt}
148
- )
149
 
150
  print("HF STATUS:", response.status_code)
151
 
@@ -153,157 +90,83 @@ def generate_image(prompt):
153
  print(response.text)
154
  return None
155
 
156
- image = Image.open(io.BytesIO(response.content))
157
 
158
- return image
159
 
 
160
 
161
- # =============================
162
- # CHAT FUNCTION
163
- # =============================
164
 
165
def chat_with_memory(user_message, preference_text):
    """Run one chat turn with persistent memory and style preferences.

    Persists any new preference, sends the stored history plus the new
    message to the Groq LLM, appends the exchange to the on-disk history,
    and returns the assistant's reply text.
    """
    global conversation_history, user_preferences

    # A non-blank preference overwrites the stored style and is persisted.
    if preference_text and preference_text.strip():
        user_preferences["style"] = preference_text
        save_json(PREF_FILE, user_preferences)

    system_prompt = f"""
You are a helpful AI assistant.

User Preferences:
{user_preferences.get("style", "Default")}

Maintain conversational memory.
"""

    # System prompt first, then the whole stored history, then this turn.
    convo = [{"role": "system", "content": system_prompt}]
    convo += conversation_history
    convo += [{"role": "user", "content": user_message}]

    completion = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        max_tokens=200,
        messages=convo,
    )

    assistant_reply = completion.choices[0].message.content

    # Record both sides of the exchange and persist to disk.
    conversation_history += [
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": assistant_reply},
    ]
    save_json(CHAT_FILE, conversation_history)

    return assistant_reply
200
-
201
-
202
- # =============================
203
- # PROCESS TEXT
204
- # =============================
205
-
206
def process_text(user_message, preference_text, chat_display):
    """Handle a typed message: chat reply, TTS audio, optional image.

    Returns ("", updated chat, audio path, image-or-None); the empty
    string clears the input textbox in the UI.
    """
    # Ignore empty / whitespace-only submissions.
    if not user_message.strip():
        return "", chat_display, None, None

    assistant_reply = chat_with_memory(user_message, preference_text)

    chat_display += [
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": assistant_reply},
    ]

    audio_output = text_to_speech(assistant_reply)

    # Only generate an image when the prompt asks for one.
    image_output = (
        generate_image(user_message) if should_generate_image(user_message) else None
    )

    return "", chat_display, audio_output, image_output
224
-
225
 
226
- # =============================
227
- # PROCESS VOICE
228
- # =============================
229
 
230
def process_voice(audio_file, preference_text, chat_display):
    """Handle a voice message: transcribe, chat, speak the reply, maybe draw.

    Returns (updated chat, audio path, image-or-None).
    """
    user_text = transcribe_audio(audio_file)
    assistant_reply = chat_with_memory(user_text, preference_text)

    chat_display += [
        {"role": "user", "content": user_text},
        {"role": "assistant", "content": assistant_reply},
    ]

    audio_output = text_to_speech(assistant_reply)

    # Only generate an image when the transcribed prompt asks for one.
    image_output = (
        generate_image(user_text) if should_generate_image(user_text) else None
    )

    return chat_display, audio_output, image_output
247
 
248
-
249
- # =============================
250
- # CLEAR MEMORY
251
- # =============================
252
-
253
def clear_memory():
    """Wipe the in-memory and on-disk conversation history.

    Returns an empty list so the chat widget is cleared as well.
    """
    global conversation_history

    conversation_history = []
    save_json(CHAT_FILE, [])

    return []
262
-
263
-
264
- # =============================
265
  # GRADIO UI
266
- # =============================
267
 
268
# Build the Gradio interface: chat display, preference/text/voice inputs,
# audio and image outputs, and the three action buttons.
with gr.Blocks() as demo:

    gr.Markdown("# 🤖 Version 5 — Multimodal AI Assistant (Voice + Images)")

    # Chat window seeded with the persisted conversation history.
    chatbot = gr.Chatbot(label="Conversation", value=conversation_history)

    preference_input = gr.Textbox(label="User Preferences")

    user_message = gr.Textbox(label="Type message")

    audio_input = gr.Audio(
        sources=["microphone"],
        type="filepath",
        label="Voice Input"
    )

    audio_output = gr.Audio(label="Voice Response")

    image_output = gr.Image(label="Generated Image")

    send_btn = gr.Button("Send Text")
    voice_btn = gr.Button("Send Voice")
    clear_btn = gr.Button("Clear Memory")

    # Text path: clears the textbox and updates chat/audio/image outputs.
    send_btn.click(
        process_text,
        inputs=[user_message, preference_input, chatbot],
        outputs=[user_message, chatbot, audio_output, image_output]
    )

    # Voice path: same outputs, input comes from the microphone recording.
    voice_btn.click(
        process_voice,
        inputs=[audio_input, preference_input, chatbot],
        outputs=[chatbot, audio_output, image_output]
    )

    # Resets both the stored history and the on-screen chat widget.
    clear_btn.click(
        clear_memory,
        outputs=chatbot
    )

demo.launch()
 
 
 
 
 
 
1
  import gradio as gr
2
  import requests
3
+ import os
4
+ import tempfile
 
5
  from groq import Groq
 
6
 
7
+ import torch
8
+ import soundfile as sf
9
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
10
+ from datasets import load_dataset
11
 
 
12
 
13
+ # ==============================
14
  # API KEYS
15
+ # ==============================
16
 
 
17
# API credentials are read from environment variables.
HF_TOKEN = os.getenv("HF_TOKEN")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

groq_client = Groq(api_key=GROQ_API_KEY)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
 
 
22
 
23
+ # ==============================
24
+ # LOAD TTS MODELS
25
+ # ==============================
26
 
27
# Load the SpeechT5 text-to-speech stack: text processor, acoustic model,
# and the HiFi-GAN vocoder that converts spectrograms to waveforms.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker x-vectors for SpeechT5; a fixed index picks one consistent voice.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation"
)

# Shape (1, 512): one speaker embedding, batch dimension added by unsqueeze.
speaker_embeddings = torch.tensor(
    embeddings_dataset[7306]["xvector"]
).unsqueeze(0)
41
 
42
 
43
+ # ==============================
44
+ # TEXT β†’ SPEECH
45
+ # ==============================
46
 
47
  def text_to_speech(text):
48
 
 
66
  return temp_audio.name
67
 
68
 
69
+ # ==============================
70
+ # IMAGE GENERATION
71
+ # ==============================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
+ def generate_image(prompt):
74
 
75
+ API_URL = "https://router.huggingface.co/hf-inference/models/stabilityai/stable-diffusion-xl-base-1.0"
76
 
77
+ headers = {
78
+ "Authorization": f"Bearer {HF_TOKEN}"
79
+ }
80
 
81
+ payload = {
82
+ "inputs": prompt
83
+ }
84
 
85
+ response = requests.post(API_URL, headers=headers, json=payload)
 
 
 
 
86
 
87
  print("HF STATUS:", response.status_code)
88
 
 
90
  print(response.text)
91
  return None
92
 
93
+ image_bytes = response.content
94
 
95
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
96
 
97
+ temp_file.write(image_bytes)
98
 
99
+ temp_file.close()
 
 
100
 
101
+ return temp_file.name
102
 
 
103
 
104
+ # ==============================
105
+ # GROQ CHATBOT
106
+ # ==============================
107
 
108
def ask_llm(question):
    """Send *question* to the Groq LLM and return the reply text.

    Stateless: each call is a single-turn conversation with no history.
    """
    completion = groq_client.chat.completions.create(
        model="llama-3.1-8b-instant",
        max_tokens=200,
        messages=[
            {"role": "user", "content": question},
        ],
    )
    return completion.choices[0].message.content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
 
 
 
120
 
121
+ # ==============================
122
+ # MAIN ASSISTANT FUNCTION
123
+ # ==============================
124
 
125
def ai_assistant(user_input):
    """Answer *user_input* with (reply text, spoken audio path, image path).

    The image slot is None unless the prompt looks like an image request.
    """
    reply = ask_llm(user_input)

    # Case-insensitive keyword check decides whether to call the image API.
    lowered = user_input.lower()
    wants_image = "image" in lowered or "generate" in lowered
    image = generate_image(user_input) if wants_image else None

    audio = text_to_speech(reply)

    return reply, audio, image
 
137
 
 
138
 
139
+ # ==============================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  # GRADIO UI
141
+ # ==============================
142
 
143
# Assemble the Gradio UI: one text input driving three outputs
# (reply text, synthesized speech, optional generated image).
with gr.Blocks() as demo:

    gr.Markdown("# 🤖 AI Assistant (Chat + Voice + Image)")

    user_input = gr.Textbox(
        label="Ask something or request an image"
    )

    text_output = gr.Textbox(
        label="Assistant Response"
    )

    audio_output = gr.Audio(
        label="Voice Response"
    )

    image_output = gr.Image(
        label="Generated Image"
    )

    submit_btn = gr.Button("Submit")

    # ai_assistant returns (reply, audio, image) matching the outputs list.
    submit_btn.click(
        fn=ai_assistant,
        inputs=user_input,
        outputs=[text_output, audio_output, image_output]
    )


demo.launch()