asad9641 committed on
Commit 52a8c9a · verified · 1 Parent(s): b0d4c97

Update app.py

Files changed (1)
  1. app.py +81 -124
app.py CHANGED
@@ -1,32 +1,25 @@
# app.py
"""
Multi-Mode AI Assistant (Voice, PDF, Image)
- - Fixed Gradio v4+ Audio usage (no source=...).
- - Chatbot uses type="messages" (openai-style {"role","content"} dicts).
- - Voice tab: single mic + three buttons (Ask General / Ask PDF / Ask Image).
- - PDF tab: upload + text questions only (no voice controls).
- - PDF summary download returns a temporary .pdf file for gr.File.
- - OCR uses OCR.space (OCR_SPACE_API_KEY).
- - Uses Groq endpoints for transcription + chat completions (GROQ_API_KEY).
- - Embeddings via sentence-transformers (all-MiniLM-L6-v2).
"""
import os
import uuid
import tempfile
import requests
from dotenv import load_dotenv
from gtts import gTTS
from PyPDF2 import PdfReader
import gradio as gr
from sentence_transformers import SentenceTransformer, util
from fpdf import FPDF
- from datetime import datetime

# ------------------ Load API Keys ------------------
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()
OCR_SPACE_API_KEY = os.getenv("OCR_SPACE_API_KEY", "").strip()
-
if not GROQ_API_KEY:
raise ValueError("❌ GROQ_API_KEY missing. Set it in env / Hugging Face Secrets.")
if not OCR_SPACE_API_KEY:
@@ -34,20 +27,17 @@ if not OCR_SPACE_API_KEY:

HEADERS = {"Authorization": f"Bearer {GROQ_API_KEY}"}

- # ------------------ Global State ------------------
- SESSION_HISTORY = {} # session_id -> list of {"role","content"} messages for LLM
- CHAT_DISPLAY = {} # session_id -> list of (user_text, assistant_text) tuples (kept for conversion)
- PDF_CONTENT = {} # session_id -> list of chunks (strings)
- PDF_EMBEDS = {} # session_id -> embeddings tensor
- IMAGE_TEXT = {} # session_id -> list of image-text chunks
- IMAGE_EMBEDS = {} # session_id -> embeddings tensor
-
CHUNK_SIZE = 1500

- # Load embedding model once (can be heavy)
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

-
# ------------------ Helpers ------------------
def _get_path_from_gr_file(gr_file):
if not gr_file:
@@ -55,22 +45,18 @@ def _get_path_from_gr_file(gr_file):
if isinstance(gr_file, str) and os.path.exists(gr_file):
return gr_file
try:
- if hasattr(gr_file, "name") and isinstance(gr_file.name, str) and os.path.exists(gr_file.name):
return gr_file.name
- except Exception:
pass
if isinstance(gr_file, dict):
for key in ("name", "file_name", "filepath"):
- if key in gr_file:
- candidate = gr_file.get(key)
- if isinstance(candidate, str) and os.path.exists(candidate):
- return candidate
return None

-
def chunk_text(text, size=CHUNK_SIZE):
- return [text[i:i + size] for i in range(0, len(text), size)]
-

def synthesize_speech(text, lang="en"):
try:
@@ -83,7 +69,6 @@ def synthesize_speech(text, lang="en"):
print("TTS error:", e)
return None

-
def select_relevant_chunk(question, chunks, chunk_embeds):
if not chunks or chunk_embeds is None:
return ""
@@ -92,7 +77,6 @@ def select_relevant_chunk(question, chunks, chunk_embeds):
top_idx = int(scores.argmax().item())
return chunks[top_idx]

-
def _chat_display_to_messages(chat_display):
msgs = []
for user, assistant in chat_display:
@@ -100,8 +84,12 @@ def _chat_display_to_messages(chat_display):
msgs.append({"role": "assistant", "content": assistant})
return msgs


- # ------------------ Transcription & LLM ------------------
def transcribe_audio(audio_path):
if not audio_path or not os.path.exists(audio_path):
return "Error: audio file missing."
@@ -117,7 +105,6 @@ def transcribe_audio(audio_path):
print("transcription error:", e)
return f"Error transcribing audio: {e}"

-
def generate_response(session_id, user_text):
if session_id not in SESSION_HISTORY:
SESSION_HISTORY[session_id] = []
@@ -134,29 +121,25 @@ def generate_response(session_id, user_text):
print("generate_response error:", e)
return f"Error generating response: {e}"

-
- # ------------------ PDF handling ------------------
def handle_pdf_upload(pdf_file, session_id):
path = _get_path_from_gr_file(pdf_file)
if not path:
return "No file uploaded or file unreadable."
try:
reader = PdfReader(path)
- text = ""
- for page in reader.pages:
- text += (page.extract_text() or "") + "\n"
if not text.strip():
return "No extractable content found in PDF."
chunks = chunk_text(text)
PDF_CONTENT[session_id] = chunks
PDF_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
- return f"PDF processed: {len(chunks)} chunks ready."
except Exception as e:
print("PDF upload error:", e)
return f"Error processing PDF: {e}"

-
- def handle_pdf_question(question, session_id):
if session_id not in PDF_CONTENT:
return "Document not found. Upload first."
chunk = select_relevant_chunk(question, PDF_CONTENT[session_id], PDF_EMBEDS[session_id])
@@ -173,34 +156,6 @@ def handle_pdf_question(question, session_id):
print("PDF question error:", e)
return f"Error generating response: {e}"

-
- # ------------------ PDF Generation ------------------
- def generate_pdf_file(text, filename_prefix="summary"):
- pdf = FPDF()
- pdf.add_page()
- pdf.set_auto_page_break(auto=True, margin=15)
- pdf.set_font("Arial", size=12)
- for line in text.split("\n"):
- pdf.multi_cell(0, 6, line)
- file_path = f"/tmp/{filename_prefix}_{uuid.uuid4()}.pdf"
- pdf.output(file_path)
- return file_path
-
-
- def download_pdf_summary(session_pdf_id):
- summary_text = "\n".join([msg["content"] for msg in SESSION_HISTORY.get(session_pdf_id, []) if msg["role"]=="assistant"])
- if not summary_text:
- summary_text = "No summary available."
- return generate_pdf_file(summary_text, "pdf_summary")
-
-
- def download_image_summary(session_image_id):
- summary_text = "\n".join([msg["content"] for msg in SESSION_HISTORY.get(session_image_id, []) if msg["role"]=="assistant"])
- if not summary_text:
- summary_text = "No summary available."
- return generate_pdf_file(summary_text, "image_summary")
-
-
# ------------------ Image OCR ------------------
def ocr_space_file(image_path, api_key, language="eng"):
if not image_path or not os.path.exists(image_path):
@@ -213,7 +168,7 @@ def ocr_space_file(image_path, api_key, language="eng"):
r.raise_for_status()
j = r.json()
if j.get("IsErroredOnProcessing"):
- print("OCR.space processing error:", j)
return ""
parsed = [pr.get("ParsedText", "") for pr in j.get("ParsedResults", [])]
return "\n".join(parsed)
@@ -221,11 +176,10 @@ def ocr_space_file(image_path, api_key, language="eng"):
print("ocr_space_file error:", e)
return ""

-
def handle_image_upload(image_file, session_id):
path = _get_path_from_gr_file(image_file)
if not path:
- return "No image uploaded or file unreadable.", ""
parsed = ocr_space_file(path, OCR_SPACE_API_KEY)
if not parsed.strip():
return "No extractable text found in the image.", ""
@@ -234,8 +188,7 @@ def handle_image_upload(image_file, session_id):
IMAGE_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
return f"Image processed: {len(chunks)} chunks ready.", ""

-
- def handle_image_question(question, session_id):
if session_id not in IMAGE_TEXT:
return "Image not found. Upload first."
chunk = select_relevant_chunk(question, IMAGE_TEXT[session_id], IMAGE_EMBEDS[session_id])
@@ -252,14 +205,31 @@ def handle_image_question(question, session_id):
print("Image question error:", e)
return f"Error generating response: {e}"


- # ------------------ Voice routing (single mic) ------------------
- def _append_chat_display(session_id, user_text, assistant_text):
- if session_id not in CHAT_DISPLAY:
- CHAT_DISPLAY[session_id] = []
- CHAT_DISPLAY[session_id].append((user_text, assistant_text))


def handle_voice_general(audio_file, session_id, tts_lang="en"):
path = _get_path_from_gr_file(audio_file)
if not path:
@@ -270,44 +240,32 @@ def handle_voice_general(audio_file, session_id, tts_lang="en"):
audio_path = synthesize_speech(assistant_text, lang=tts_lang)
return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])

-
def handle_voice_pdf(audio_file, session_id, tts_lang="en"):
path = _get_path_from_gr_file(audio_file)
if not path:
return "No audio provided.", None, []
user_text = transcribe_audio(path)
- assistant_text = handle_pdf_question(user_text, session_id)
_append_chat_display(session_id, user_text, assistant_text)
audio_path = synthesize_speech(assistant_text, lang=tts_lang)
return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])

-
def handle_voice_image(audio_file, session_id, tts_lang="en"):
path = _get_path_from_gr_file(audio_file)
if not path:
return "No audio provided.", None, []
user_text = transcribe_audio(path)
- assistant_text = handle_image_question(user_text, session_id)
_append_chat_display(session_id, user_text, assistant_text)
audio_path = synthesize_speech(assistant_text, lang=tts_lang)
return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])

-
- # ------------------ Text handlers ------------------
def handle_text_general(user_text, session_id):
assistant = generate_response(session_id, user_text)
_append_chat_display(session_id, user_text, assistant)
return assistant, _chat_display_to_messages(CHAT_DISPLAY[session_id])

-
- def handle_text_pdf(question, session_id):
- return handle_pdf_question(question, session_id)
-
-
- def handle_text_image(question, session_id):
- return handle_image_question(question, session_id)
-
-
# ------------------ Gradio UI ------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown("## 🛠 Multi-Mode AI Assistant (Voice, PDF, Image)")
@@ -318,54 +276,54 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:

# ---------------- Voice Tab ----------------
with gr.Tab("🎤 Voice Chat"):
- chat_voice = gr.Chatbot(type="messages", height=380)
with gr.Row():
- mic = gr.Audio(type="filepath", label="🎤 Record Voice (hold & speak)")
- tts_lang = gr.Dropdown(choices=["en", "ur"], value="en", label="TTS Language")
with gr.Row():
- btn_general = gr.Button("Ask General (from recorded voice)")
- btn_pdf = gr.Button("Ask PDF (from recorded voice)")
- btn_image = gr.Button("Ask Image (from recorded voice)")
answer_voice = gr.Textbox(label="Assistant Answer (text)", lines=4)
- audio_output = gr.Audio(label="Assistant Voice Output", type="filepath")

with gr.Row():
- text_input = gr.Textbox(label="Or type a question (General)", placeholder="Type message here...")
- btn_send_text = gr.Button("Send (Text General)")

btn_general.click(
- handle_voice_general,
inputs=[mic, session_voice, tts_lang],
outputs=[answer_voice, audio_output, chat_voice],
)
btn_pdf.click(
- handle_voice_pdf,
inputs=[mic, session_pdf, tts_lang],
outputs=[answer_voice, audio_output, chat_voice],
)
btn_image.click(
- handle_voice_image,
inputs=[mic, session_image, tts_lang],
outputs=[answer_voice, audio_output, chat_voice],
)
-
btn_send_text.click(
- handle_text_general,
inputs=[text_input, session_voice],
outputs=[answer_voice, chat_voice],
)

# ---------------- PDF Tab ----------------
with gr.Tab("📄 PDF Summarizer"):
- pdf_output = gr.Textbox(label="Answer (Text Only)", lines=12)
- pdf_summary_file = gr.File(label="Download Summary File")
with gr.Row():
- pdf_upload_btn = gr.File(label="Upload PDF", file_types=[".pdf"])
- pdf_upload_msg = gr.Textbox(label="Upload Status", interactive=False)
pdf_question = gr.Textbox(label="Ask a question about PDF (text)", lines=2)
- pdf_send_btn = gr.Button("Ask (Text)")
- pdf_reset_btn = gr.Button("♻ Reset PDF")
- pdf_download_btn = gr.Button("📥 Download Summary")

pdf_upload_btn.upload(handle_pdf_upload, inputs=[pdf_upload_btn, session_pdf], outputs=[pdf_upload_msg])
pdf_send_btn.click(handle_text_pdf, inputs=[pdf_question, session_pdf], outputs=[pdf_output])
@@ -374,21 +332,20 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:

# ---------------- Image Tab ----------------
with gr.Tab("🖼 Image OCR"):
- image_output = gr.Textbox(label="Answer (Text Only)", lines=12)
- img_summary_file = gr.File(label="Download Summary File")
with gr.Row():
- image_upload_btn = gr.File(label="Upload Image", file_types=[".png", ".jpg", ".jpeg"])
- image_upload_msg = gr.Textbox(label="Upload Status", interactive=False)
image_question = gr.Textbox(label="Ask a question about Image (text)", lines=2)
- image_send_btn = gr.Button("Ask (Text)")
- image_reset_btn = gr.Button("♻ Reset Image")
- img_download_btn = gr.Button("📥 Download Summary")

image_upload_btn.upload(handle_image_upload, inputs=[image_upload_btn, session_image], outputs=[image_upload_msg, image_output])
image_send_btn.click(handle_text_image, inputs=[image_question, session_image], outputs=[image_output])
image_reset_btn.click(lambda: (str(uuid.uuid4()), ""), outputs=[session_image, image_output])
img_download_btn.click(download_image_summary, inputs=[session_image], outputs=[img_summary_file])

- # Launch
if __name__ == "__main__":
- demo.launch()
 
# app.py
"""
Multi-Mode AI Assistant (Voice, PDF, Image)
+ - Improved interactive UI: compact, visually appealing, emojis/icons, scrollable previews.
+ - All backend functionality preserved.
"""
import os
import uuid
import tempfile
import requests
+ from datetime import datetime
from dotenv import load_dotenv
from gtts import gTTS
from PyPDF2 import PdfReader
import gradio as gr
from sentence_transformers import SentenceTransformer, util
from fpdf import FPDF

# ------------------ Load API Keys ------------------
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()
OCR_SPACE_API_KEY = os.getenv("OCR_SPACE_API_KEY", "").strip()
if not GROQ_API_KEY:
raise ValueError("❌ GROQ_API_KEY missing. Set it in env / Hugging Face Secrets.")
if not OCR_SPACE_API_KEY:

HEADERS = {"Authorization": f"Bearer {GROQ_API_KEY}"}

+ # ------------------ Global States ------------------
+ SESSION_HISTORY = {}
+ CHAT_DISPLAY = {}
+ PDF_CONTENT = {}
+ PDF_EMBEDS = {}
+ IMAGE_TEXT = {}
+ IMAGE_EMBEDS = {}
CHUNK_SIZE = 1500

embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# ------------------ Helpers ------------------
def _get_path_from_gr_file(gr_file):
if not gr_file:
 
if isinstance(gr_file, str) and os.path.exists(gr_file):
return gr_file
try:
+ if hasattr(gr_file, "name") and os.path.exists(gr_file.name):
return gr_file.name
+ except:
pass
if isinstance(gr_file, dict):
for key in ("name", "file_name", "filepath"):
+ if key in gr_file and os.path.exists(gr_file[key]):
+ return gr_file[key]
return None

def chunk_text(text, size=CHUNK_SIZE):
+ return [text[i:i+size] for i in range(0, len(text), size)]

def synthesize_speech(text, lang="en"):
try:

print("TTS error:", e)
return None

def select_relevant_chunk(question, chunks, chunk_embeds):
if not chunks or chunk_embeds is None:
return ""

top_idx = int(scores.argmax().item())
return chunks[top_idx]

def _chat_display_to_messages(chat_display):
msgs = []
for user, assistant in chat_display:

msgs.append({"role": "assistant", "content": assistant})
return msgs

+ def _append_chat_display(session_id, user_text, assistant_text):
+ if session_id not in CHAT_DISPLAY:
+ CHAT_DISPLAY[session_id] = []
+ CHAT_DISPLAY[session_id].append((user_text, assistant_text))

+ # ------------------ Voice & LLM ------------------
def transcribe_audio(audio_path):
if not audio_path or not os.path.exists(audio_path):
return "Error: audio file missing."
 
print("transcription error:", e)
return f"Error transcribing audio: {e}"

def generate_response(session_id, user_text):
if session_id not in SESSION_HISTORY:
SESSION_HISTORY[session_id] = []

print("generate_response error:", e)
return f"Error generating response: {e}"

+ # ------------------ PDF Handling ------------------
def handle_pdf_upload(pdf_file, session_id):
path = _get_path_from_gr_file(pdf_file)
if not path:
return "No file uploaded or file unreadable."
try:
reader = PdfReader(path)
+ text = "".join([page.extract_text() or "" for page in reader.pages])
if not text.strip():
return "No extractable content found in PDF."
chunks = chunk_text(text)
PDF_CONTENT[session_id] = chunks
PDF_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
+ return f"PDF uploaded: {len(chunks)} chunks ready."
except Exception as e:
print("PDF upload error:", e)
return f"Error processing PDF: {e}"

+ def handle_text_pdf(question, session_id):
if session_id not in PDF_CONTENT:
return "Document not found. Upload first."
chunk = select_relevant_chunk(question, PDF_CONTENT[session_id], PDF_EMBEDS[session_id])

print("PDF question error:", e)
return f"Error generating response: {e}"
# ------------------ Image OCR ------------------
def ocr_space_file(image_path, api_key, language="eng"):
if not image_path or not os.path.exists(image_path):

r.raise_for_status()
j = r.json()
if j.get("IsErroredOnProcessing"):
+ print("OCR.space error:", j)
return ""
parsed = [pr.get("ParsedText", "") for pr in j.get("ParsedResults", [])]
return "\n".join(parsed)

print("ocr_space_file error:", e)
return ""

def handle_image_upload(image_file, session_id):
path = _get_path_from_gr_file(image_file)
if not path:
+ return "No image uploaded.", ""
parsed = ocr_space_file(path, OCR_SPACE_API_KEY)
if not parsed.strip():
return "No extractable text found in the image.", ""

IMAGE_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
return f"Image processed: {len(chunks)} chunks ready.", ""

+ def handle_text_image(question, session_id):
if session_id not in IMAGE_TEXT:
return "Image not found. Upload first."
chunk = select_relevant_chunk(question, IMAGE_TEXT[session_id], IMAGE_EMBEDS[session_id])

print("Image question error:", e)
return f"Error generating response: {e}"

+ # ------------------ PDF Generation ------------------
+ def generate_pdf_file(text, filename_prefix="summary"):
+ pdf = FPDF()
+ pdf.add_page()
+ pdf.set_auto_page_break(auto=True, margin=15)
+ pdf.set_font("Arial", size=12)
+ for line in text.split("\n"):
+ pdf.multi_cell(0, 6, line)
+ tmp_path = f"/tmp/{filename_prefix}_{uuid.uuid4()}.pdf"
+ pdf.output(tmp_path)
+ return tmp_path

+ def download_pdf_summary(session_pdf_id):
+ summary_text = "\n".join([msg["content"] for msg in SESSION_HISTORY.get(session_pdf_id, []) if msg["role"]=="assistant"])
+ if not summary_text:
+ summary_text = "No summary available."
+ return generate_pdf_file(summary_text, "pdf_summary")

+ def download_image_summary(session_image_id):
+ summary_text = "\n".join([msg["content"] for msg in SESSION_HISTORY.get(session_image_id, []) if msg["role"]=="assistant"])
+ if not summary_text:
+ summary_text = "No summary available."
+ return generate_pdf_file(summary_text, "image_summary")
+ # ------------------ Voice Handlers ------------------
def handle_voice_general(audio_file, session_id, tts_lang="en"):
path = _get_path_from_gr_file(audio_file)
if not path:

audio_path = synthesize_speech(assistant_text, lang=tts_lang)
return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])

def handle_voice_pdf(audio_file, session_id, tts_lang="en"):
path = _get_path_from_gr_file(audio_file)
if not path:
return "No audio provided.", None, []
user_text = transcribe_audio(path)
+ assistant_text = handle_text_pdf(user_text, session_id)
_append_chat_display(session_id, user_text, assistant_text)
audio_path = synthesize_speech(assistant_text, lang=tts_lang)
return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])

def handle_voice_image(audio_file, session_id, tts_lang="en"):
path = _get_path_from_gr_file(audio_file)
if not path:
return "No audio provided.", None, []
user_text = transcribe_audio(path)
+ assistant_text = handle_text_image(user_text, session_id)
_append_chat_display(session_id, user_text, assistant_text)
audio_path = synthesize_speech(assistant_text, lang=tts_lang)
return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])

+ # ------------------ Text Handlers ------------------
def handle_text_general(user_text, session_id):
assistant = generate_response(session_id, user_text)
_append_chat_display(session_id, user_text, assistant)
return assistant, _chat_display_to_messages(CHAT_DISPLAY[session_id])

# ------------------ Gradio UI ------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown("## 🛠 Multi-Mode AI Assistant (Voice, PDF, Image)")
# ---------------- Voice Tab ----------------
with gr.Tab("🎤 Voice Chat"):
+ chat_voice = gr.Chatbot(type="messages", height=350)
with gr.Row():
+ mic = gr.Audio(type="filepath", label="🎤 Record Voice (hold & speak)", show_download_button=False)
+ tts_lang = gr.Dropdown(choices=["en", "ur"], value="en", label="TTS Language", interactive=True, scale=1)
with gr.Row():
+ btn_general = gr.Button("Ask General 🎯", scale=1)
+ btn_pdf = gr.Button("Ask PDF 📄", scale=1)
+ btn_image = gr.Button("Ask Image 🖼", scale=1)
answer_voice = gr.Textbox(label="Assistant Answer (text)", lines=4)
+ audio_output = gr.Audio(label="Assistant Voice Output", type="filepath", interactive=False)

+ # Text-only general chat
with gr.Row():
+ text_input = gr.Textbox(label="Or type a question (General)", placeholder="Type message here...", lines=2)
+ btn_send_text = gr.Button("Send (Text General)", scale=1)

btn_general.click(
+ fn=handle_voice_general,
inputs=[mic, session_voice, tts_lang],
outputs=[answer_voice, audio_output, chat_voice],
)
btn_pdf.click(
+ fn=handle_voice_pdf,
inputs=[mic, session_pdf, tts_lang],
outputs=[answer_voice, audio_output, chat_voice],
)
btn_image.click(
+ fn=handle_voice_image,
inputs=[mic, session_image, tts_lang],
outputs=[answer_voice, audio_output, chat_voice],
)
btn_send_text.click(
+ fn=handle_text_general,
inputs=[text_input, session_voice],
outputs=[answer_voice, chat_voice],
)

# ---------------- PDF Tab ----------------
with gr.Tab("📄 PDF Summarizer"):
+ pdf_output = gr.Textbox(label="Answer (Text Only)", lines=6)
+ pdf_summary_file = gr.File(label="📥 Download PDF Summary")
with gr.Row():
+ pdf_upload_btn = gr.File(label="Upload PDF", file_types=[".pdf"], file_types_preview=False, interactive=True)
+ pdf_upload_msg = gr.Textbox(label="Upload Status", interactive=False, lines=1)
pdf_question = gr.Textbox(label="Ask a question about PDF (text)", lines=2)
+ pdf_send_btn = gr.Button("Ask (Text)", scale=1)
+ pdf_reset_btn = gr.Button("♻ Reset PDF", scale=1)
+ pdf_download_btn = gr.Button("📥 Download Summary", scale=1)

pdf_upload_btn.upload(handle_pdf_upload, inputs=[pdf_upload_btn, session_pdf], outputs=[pdf_upload_msg])
pdf_send_btn.click(handle_text_pdf, inputs=[pdf_question, session_pdf], outputs=[pdf_output])

# ---------------- Image Tab ----------------
with gr.Tab("🖼 Image OCR"):
+ image_output = gr.Textbox(label="Answer (Text Only)", lines=6)
+ img_summary_file = gr.File(label="📥 Download PDF Summary")
with gr.Row():
+ image_upload_btn = gr.File(label="Upload Image", file_types=[".png", ".jpg", ".jpeg"], interactive=True)
+ image_upload_msg = gr.Textbox(label="Upload Status", interactive=False, lines=1)
image_question = gr.Textbox(label="Ask a question about Image (text)", lines=2)
+ image_send_btn = gr.Button("Ask", scale=1)
+ image_reset_btn = gr.Button("♻ Reset Image", scale=1)
+ img_download_btn = gr.Button("📥 Download Summary", scale=1)

image_upload_btn.upload(handle_image_upload, inputs=[image_upload_btn, session_image], outputs=[image_upload_msg, image_output])
image_send_btn.click(handle_text_image, inputs=[image_question, session_image], outputs=[image_output])
image_reset_btn.click(lambda: (str(uuid.uuid4()), ""), outputs=[session_image, image_output])
img_download_btn.click(download_image_summary, inputs=[session_image], outputs=[img_summary_file])

if __name__ == "__main__":
+ demo.launch(server_name="0.0.0.0", server_port=7860)
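The original docstring states that app.py "Uses Groq endpoints for transcription + chat completions (GROQ_API_KEY)", but the request code itself falls outside the hunks shown above. A minimal sketch of such a chat-completions call, reusing the HEADERS dict defined in app.py; the endpoint is Groq's public OpenAI-compatible REST API, and the model name is a placeholder assumption, not the one this commit actually uses:

# Hypothetical sketch, not part of this commit: a Groq chat-completions request
# made with the bearer-token HEADERS defined in app.py.
import requests

GROQ_CHAT_URL = "https://api.groq.com/openai/v1/chat/completions"

def groq_chat(messages, headers, model="llama-3.1-8b-instant"):
    # messages is a list of {"role", "content"} dicts, as stored in SESSION_HISTORY
    payload = {"model": model, "messages": messages}
    r = requests.post(GROQ_CHAT_URL, headers=headers, json=payload, timeout=60)
    r.raise_for_status()
    return r.json()["choices"][0]["message"]["content"]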