asad9641 committed on
Commit
bb63563
·
verified ·
1 Parent(s): 1f64b26

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -93
app.py CHANGED
@@ -1,25 +1,31 @@
1
  # app.py
2
  """
3
  Multi-Mode AI Assistant (Voice, PDF, Image)
4
- - Fixed gr.File issue: removed unsupported 'file_types_preview' argument
5
- - Interactive, compact, visually appealing UI
 
 
 
 
 
6
  """
7
  import os
8
  import uuid
9
  import tempfile
10
  import requests
11
- from datetime import datetime
12
  from dotenv import load_dotenv
13
  from gtts import gTTS
14
  from PyPDF2 import PdfReader
15
  import gradio as gr
16
  from sentence_transformers import SentenceTransformer, util
17
  from fpdf import FPDF
 
18
 
19
  # ------------------ Load API Keys ------------------
20
  load_dotenv()
21
  GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()
22
  OCR_SPACE_API_KEY = os.getenv("OCR_SPACE_API_KEY", "").strip()
 
23
  if not GROQ_API_KEY:
24
  raise ValueError("❌ GROQ_API_KEY missing. Set it in env / Hugging Face Secrets.")
25
  if not OCR_SPACE_API_KEY:
@@ -27,15 +33,17 @@ if not OCR_SPACE_API_KEY:
27
 
28
  HEADERS = {"Authorization": f"Bearer {GROQ_API_KEY}"}
29
 
30
- # ------------------ Global States ------------------
31
- SESSION_HISTORY = {}
32
- CHAT_DISPLAY = {}
33
- PDF_CONTENT = {}
34
- PDF_EMBEDS = {}
35
- IMAGE_TEXT = {}
36
- IMAGE_EMBEDS = {}
 
37
  CHUNK_SIZE = 1500
38
 
 
39
  embed_model = SentenceTransformer("all-MiniLM-L6-v2")
40
 
41
  # ------------------ Helpers ------------------
@@ -47,16 +55,18 @@ def _get_path_from_gr_file(gr_file):
47
  try:
48
  if hasattr(gr_file, "name") and os.path.exists(gr_file.name):
49
  return gr_file.name
50
- except:
51
  pass
52
  if isinstance(gr_file, dict):
53
  for key in ("name", "file_name", "filepath"):
54
- if key in gr_file and os.path.exists(gr_file[key]):
55
- return gr_file[key]
 
 
56
  return None
57
 
58
  def chunk_text(text, size=CHUNK_SIZE):
59
- return [text[i:i+size] for i in range(0, len(text), size)]
60
 
61
  def synthesize_speech(text, lang="en"):
62
  try:
@@ -84,12 +94,7 @@ def _chat_display_to_messages(chat_display):
84
  msgs.append({"role": "assistant", "content": assistant})
85
  return msgs
86
 
87
- def _append_chat_display(session_id, user_text, assistant_text):
88
- if session_id not in CHAT_DISPLAY:
89
- CHAT_DISPLAY[session_id] = []
90
- CHAT_DISPLAY[session_id].append((user_text, assistant_text))
91
-
92
- # ------------------ Voice & LLM ------------------
93
  def transcribe_audio(audio_path):
94
  if not audio_path or not os.path.exists(audio_path):
95
  return "Error: audio file missing."
@@ -121,25 +126,27 @@ def generate_response(session_id, user_text):
121
  print("generate_response error:", e)
122
  return f"Error generating response: {e}"
123
 
124
- # ------------------ PDF Handling ------------------
125
  def handle_pdf_upload(pdf_file, session_id):
126
  path = _get_path_from_gr_file(pdf_file)
127
  if not path:
128
  return "No file uploaded or file unreadable."
129
  try:
130
  reader = PdfReader(path)
131
- text = "".join([page.extract_text() or "" for page in reader.pages])
 
 
132
  if not text.strip():
133
  return "No extractable content found in PDF."
134
  chunks = chunk_text(text)
135
  PDF_CONTENT[session_id] = chunks
136
  PDF_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
137
- return f"PDF uploaded: {len(chunks)} chunks ready."
138
  except Exception as e:
139
  print("PDF upload error:", e)
140
  return f"Error processing PDF: {e}"
141
 
142
- def handle_text_pdf(question, session_id):
143
  if session_id not in PDF_CONTENT:
144
  return "Document not found. Upload first."
145
  chunk = select_relevant_chunk(question, PDF_CONTENT[session_id], PDF_EMBEDS[session_id])
@@ -151,7 +158,12 @@ def handle_text_pdf(question, session_id):
151
  try:
152
  resp = requests.post("https://api.groq.com/openai/v1/chat/completions", headers=HEADERS, json=body, timeout=60)
153
  resp.raise_for_status()
154
- return resp.json()["choices"][0]["message"]["content"]
 
 
 
 
 
155
  except Exception as e:
156
  print("PDF question error:", e)
157
  return f"Error generating response: {e}"
@@ -168,7 +180,7 @@ def ocr_space_file(image_path, api_key, language="eng"):
168
  r.raise_for_status()
169
  j = r.json()
170
  if j.get("IsErroredOnProcessing"):
171
- print("OCR.space error:", j)
172
  return ""
173
  parsed = [pr.get("ParsedText", "") for pr in j.get("ParsedResults", [])]
174
  return "\n".join(parsed)
@@ -179,7 +191,7 @@ def ocr_space_file(image_path, api_key, language="eng"):
179
  def handle_image_upload(image_file, session_id):
180
  path = _get_path_from_gr_file(image_file)
181
  if not path:
182
- return "No image uploaded.", ""
183
  parsed = ocr_space_file(path, OCR_SPACE_API_KEY)
184
  if not parsed.strip():
185
  return "No extractable text found in the image.", ""
@@ -188,7 +200,7 @@ def handle_image_upload(image_file, session_id):
188
  IMAGE_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
189
  return f"Image processed: {len(chunks)} chunks ready.", ""
190
 
191
- def handle_text_image(question, session_id):
192
  if session_id not in IMAGE_TEXT:
193
  return "Image not found. Upload first."
194
  chunk = select_relevant_chunk(question, IMAGE_TEXT[session_id], IMAGE_EMBEDS[session_id])
@@ -200,12 +212,16 @@ def handle_text_image(question, session_id):
200
  try:
201
  resp = requests.post("https://api.groq.com/openai/v1/chat/completions", headers=HEADERS, json=body, timeout=60)
202
  resp.raise_for_status()
203
- return resp.json()["choices"][0]["message"]["content"]
 
 
 
 
204
  except Exception as e:
205
  print("Image question error:", e)
206
  return f"Error generating response: {e}"
207
 
208
- # ------------------ PDF Generation ------------------
209
  def generate_pdf_file(text, filename_prefix="summary"):
210
  pdf = FPDF()
211
  pdf.add_page()
@@ -213,23 +229,22 @@ def generate_pdf_file(text, filename_prefix="summary"):
213
  pdf.set_font("Arial", size=12)
214
  for line in text.split("\n"):
215
  pdf.multi_cell(0, 6, line)
216
- tmp_path = f"/tmp/{filename_prefix}_{uuid.uuid4()}.pdf"
217
- pdf.output(tmp_path)
218
- return tmp_path
219
 
220
- def download_pdf_summary(session_pdf_id):
221
- summary_text = "\n".join([msg["content"] for msg in SESSION_HISTORY.get(session_pdf_id, []) if msg["role"]=="assistant"])
222
  if not summary_text:
223
  summary_text = "No summary available."
224
- return generate_pdf_file(summary_text, "pdf_summary")
225
 
226
- def download_image_summary(session_image_id):
227
- summary_text = "\n".join([msg["content"] for msg in SESSION_HISTORY.get(session_image_id, []) if msg["role"]=="assistant"])
228
- if not summary_text:
229
- summary_text = "No summary available."
230
- return generate_pdf_file(summary_text, "image_summary")
231
 
232
- # ------------------ Voice Handlers ------------------
233
  def handle_voice_general(audio_file, session_id, tts_lang="en"):
234
  path = _get_path_from_gr_file(audio_file)
235
  if not path:
@@ -245,7 +260,7 @@ def handle_voice_pdf(audio_file, session_id, tts_lang="en"):
245
  if not path:
246
  return "No audio provided.", None, []
247
  user_text = transcribe_audio(path)
248
- assistant_text = handle_text_pdf(user_text, session_id)
249
  _append_chat_display(session_id, user_text, assistant_text)
250
  audio_path = synthesize_speech(assistant_text, lang=tts_lang)
251
  return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])
@@ -255,17 +270,23 @@ def handle_voice_image(audio_file, session_id, tts_lang="en"):
255
  if not path:
256
  return "No audio provided.", None, []
257
  user_text = transcribe_audio(path)
258
- assistant_text = handle_text_image(user_text, session_id)
259
  _append_chat_display(session_id, user_text, assistant_text)
260
  audio_path = synthesize_speech(assistant_text, lang=tts_lang)
261
  return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])
262
 
263
- # ------------------ Text Handlers ------------------
264
  def handle_text_general(user_text, session_id):
265
  assistant = generate_response(session_id, user_text)
266
  _append_chat_display(session_id, user_text, assistant)
267
  return assistant, _chat_display_to_messages(CHAT_DISPLAY[session_id])
268
 
 
 
 
 
 
 
269
  # ------------------ Gradio UI ------------------
270
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
271
  gr.Markdown("## πŸ›  Multi-Mode AI Assistant (Voice, PDF, Image)")
@@ -276,54 +297,37 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
276
 
277
  # ---------------- Voice Tab ----------------
278
  with gr.Tab("🎀 Voice Chat"):
279
- chat_voice = gr.Chatbot(type="messages", height=350)
280
  with gr.Row():
281
  mic = gr.Audio(type="filepath", label="🎀 Record Voice (hold & speak)", show_download_button=False)
282
- tts_lang = gr.Dropdown(choices=["en", "ur"], value="en", label="TTS Language", interactive=True, scale=1)
283
  with gr.Row():
284
- btn_general = gr.Button("Ask General 🎯", scale=1)
285
- btn_pdf = gr.Button("Ask PDF πŸ“„", scale=1)
286
- btn_image = gr.Button("Ask Image πŸ–Ό", scale=1)
287
  answer_voice = gr.Textbox(label="Assistant Answer (text)", lines=4)
288
- audio_output = gr.Audio(label="Assistant Voice Output", type="filepath", interactive=False)
289
 
290
- # Text-only general chat
291
  with gr.Row():
292
- text_input = gr.Textbox(label="Or type a question (General)", placeholder="Type message here...", lines=2)
293
- btn_send_text = gr.Button("Send (Text General)", scale=1)
294
-
295
- btn_general.click(
296
- fn=handle_voice_general,
297
- inputs=[mic, session_voice, tts_lang],
298
- outputs=[answer_voice, audio_output, chat_voice],
299
- )
300
- btn_pdf.click(
301
- fn=handle_voice_pdf,
302
- inputs=[mic, session_pdf, tts_lang],
303
- outputs=[answer_voice, audio_output, chat_voice],
304
- )
305
- btn_image.click(
306
- fn=handle_voice_image,
307
- inputs=[mic, session_image, tts_lang],
308
- outputs=[answer_voice, audio_output, chat_voice],
309
- )
310
- btn_send_text.click(
311
- fn=handle_text_general,
312
- inputs=[text_input, session_voice],
313
- outputs=[answer_voice, chat_voice],
314
- )
315
 
316
  # ---------------- PDF Tab ----------------
317
  with gr.Tab("πŸ“„ PDF Summarizer"):
318
- pdf_output = gr.Textbox(label="Answer (Text Only)", lines=6)
319
- pdf_summary_file = gr.File(label="πŸ“₯ Download PDF Summary")
320
  with gr.Row():
321
- pdf_upload_btn = gr.File(label="Upload PDF", file_types=[".pdf"], interactive=True)
322
- pdf_upload_msg = gr.Textbox(label="Upload Status", interactive=False, lines=1)
323
  pdf_question = gr.Textbox(label="Ask a question about PDF (text)", lines=2)
324
- pdf_send_btn = gr.Button("Ask (Text)", scale=1)
325
- pdf_reset_btn = gr.Button("β™» Reset PDF", scale=1)
326
- pdf_download_btn = gr.Button("πŸ“₯ Download Summary", scale=1)
327
 
328
  pdf_upload_btn.upload(handle_pdf_upload, inputs=[pdf_upload_btn, session_pdf], outputs=[pdf_upload_msg])
329
  pdf_send_btn.click(handle_text_pdf, inputs=[pdf_question, session_pdf], outputs=[pdf_output])
@@ -332,20 +336,20 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
332
 
333
  # ---------------- Image Tab ----------------
334
  with gr.Tab("πŸ–Ό Image OCR"):
335
- image_output = gr.Textbox(label="Answer (Text Only)", lines=6)
336
- img_summary_file = gr.File(label="πŸ“₯ Download PDF Summary")
337
- with gr.Row():
338
- image_upload_btn = gr.File(label="Upload Image", file_types=[".png", ".jpg", ".jpeg"], interactive=True)
339
- image_upload_msg = gr.Textbox(label="Upload Status", interactive=False, lines=1)
340
- image_question = gr.Textbox(label="Ask a question about Image (text)", lines=2)
341
- image_send_btn = gr.Button("Ask", scale=1)
342
- image_reset_btn = gr.Button("β™» Reset Image", scale=1)
343
- img_download_btn = gr.Button("πŸ“₯ Download Summary", scale=1)
344
 
345
  image_upload_btn.upload(handle_image_upload, inputs=[image_upload_btn, session_image], outputs=[image_upload_msg, image_output])
346
  image_send_btn.click(handle_text_image, inputs=[image_question, session_image], outputs=[image_output])
347
  image_reset_btn.click(lambda: (str(uuid.uuid4()), ""), outputs=[session_image, image_output])
348
- img_download_btn.click(download_image_summary, inputs=[session_image], outputs=[img_summary_file])
349
 
 
350
  if __name__ == "__main__":
351
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  # app.py
2
  """
3
  Multi-Mode AI Assistant (Voice, PDF, Image)
4
+ - Fixed PDF/Image summary download to match β€œAnswer (Text Only)” box.
5
+ - Chatbot uses type="messages" (openai-style {"role","content"} dicts).
6
+ - Voice tab: single mic + three buttons (Ask General / Ask PDF / Ask Image).
7
+ - PDF tab: upload + text questions only (no voice controls).
8
+ - OCR uses OCR.space (OCR_SPACE_API_KEY).
9
+ - Uses Groq endpoints for transcription + chat completions (GROQ_API_KEY).
10
+ - Embeddings via sentence-transformers (all-MiniLM-L6-v2).
11
  """
12
  import os
13
  import uuid
14
  import tempfile
15
  import requests
 
16
  from dotenv import load_dotenv
17
  from gtts import gTTS
18
  from PyPDF2 import PdfReader
19
  import gradio as gr
20
  from sentence_transformers import SentenceTransformer, util
21
  from fpdf import FPDF
22
+ from datetime import datetime
23
 
24
  # ------------------ Load API Keys ------------------
25
  load_dotenv()
26
  GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()
27
  OCR_SPACE_API_KEY = os.getenv("OCR_SPACE_API_KEY", "").strip()
28
+
29
  if not GROQ_API_KEY:
30
  raise ValueError("❌ GROQ_API_KEY missing. Set it in env / Hugging Face Secrets.")
31
  if not OCR_SPACE_API_KEY:
 
33
 
34
  HEADERS = {"Authorization": f"Bearer {GROQ_API_KEY}"}
35
 
36
+ # ------------------ Global State ------------------
37
+ SESSION_HISTORY = {} # session_id -> list of {"role","content"} messages for LLM
38
+ CHAT_DISPLAY = {} # session_id -> list of (user_text, assistant_text) tuples (kept for conversion)
39
+ PDF_CONTENT = {} # session_id -> list of chunks (strings)
40
+ PDF_EMBEDS = {} # session_id -> embeddings tensor
41
+ IMAGE_TEXT = {} # session_id -> list of image-text chunks
42
+ IMAGE_EMBEDS = {} # session_id -> embeddings tensor
43
+
44
  CHUNK_SIZE = 1500
45
 
46
+ # Load embedding model once
47
  embed_model = SentenceTransformer("all-MiniLM-L6-v2")
48
 
49
  # ------------------ Helpers ------------------
 
55
  try:
56
  if hasattr(gr_file, "name") and os.path.exists(gr_file.name):
57
  return gr_file.name
58
+ except Exception:
59
  pass
60
  if isinstance(gr_file, dict):
61
  for key in ("name", "file_name", "filepath"):
62
+ if key in gr_file:
63
+ candidate = gr_file.get(key)
64
+ if isinstance(candidate, str) and os.path.exists(candidate):
65
+ return candidate
66
  return None
67
 
68
  def chunk_text(text, size=CHUNK_SIZE):
69
+ return [text[i:i + size] for i in range(0, len(text), size)]
70
 
71
  def synthesize_speech(text, lang="en"):
72
  try:
 
94
  msgs.append({"role": "assistant", "content": assistant})
95
  return msgs
96
 
97
+ # ------------------ Transcription & LLM ------------------
 
 
 
 
 
98
  def transcribe_audio(audio_path):
99
  if not audio_path or not os.path.exists(audio_path):
100
  return "Error: audio file missing."
 
126
  print("generate_response error:", e)
127
  return f"Error generating response: {e}"
128
 
129
+ # ------------------ PDF handling ------------------
130
  def handle_pdf_upload(pdf_file, session_id):
131
  path = _get_path_from_gr_file(pdf_file)
132
  if not path:
133
  return "No file uploaded or file unreadable."
134
  try:
135
  reader = PdfReader(path)
136
+ text = ""
137
+ for page in reader.pages:
138
+ text += (page.extract_text() or "") + "\n"
139
  if not text.strip():
140
  return "No extractable content found in PDF."
141
  chunks = chunk_text(text)
142
  PDF_CONTENT[session_id] = chunks
143
  PDF_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
144
+ return f"PDF processed: {len(chunks)} chunks ready."
145
  except Exception as e:
146
  print("PDF upload error:", e)
147
  return f"Error processing PDF: {e}"
148
 
149
+ def handle_pdf_question(question, session_id):
150
  if session_id not in PDF_CONTENT:
151
  return "Document not found. Upload first."
152
  chunk = select_relevant_chunk(question, PDF_CONTENT[session_id], PDF_EMBEDS[session_id])
 
158
  try:
159
  resp = requests.post("https://api.groq.com/openai/v1/chat/completions", headers=HEADERS, json=body, timeout=60)
160
  resp.raise_for_status()
161
+ assistant_text = resp.json()["choices"][0]["message"]["content"]
162
+ # Store assistant answer in SESSION_HISTORY for PDF download
163
+ if session_id not in SESSION_HISTORY:
164
+ SESSION_HISTORY[session_id] = []
165
+ SESSION_HISTORY[session_id].append({"role": "assistant", "content": assistant_text})
166
+ return assistant_text
167
  except Exception as e:
168
  print("PDF question error:", e)
169
  return f"Error generating response: {e}"
 
180
  r.raise_for_status()
181
  j = r.json()
182
  if j.get("IsErroredOnProcessing"):
183
+ print("OCR.space processing error:", j)
184
  return ""
185
  parsed = [pr.get("ParsedText", "") for pr in j.get("ParsedResults", [])]
186
  return "\n".join(parsed)
 
191
  def handle_image_upload(image_file, session_id):
192
  path = _get_path_from_gr_file(image_file)
193
  if not path:
194
+ return "No image uploaded or file unreadable.", ""
195
  parsed = ocr_space_file(path, OCR_SPACE_API_KEY)
196
  if not parsed.strip():
197
  return "No extractable text found in the image.", ""
 
200
  IMAGE_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
201
  return f"Image processed: {len(chunks)} chunks ready.", ""
202
 
203
+ def handle_image_question(question, session_id):
204
  if session_id not in IMAGE_TEXT:
205
  return "Image not found. Upload first."
206
  chunk = select_relevant_chunk(question, IMAGE_TEXT[session_id], IMAGE_EMBEDS[session_id])
 
212
  try:
213
  resp = requests.post("https://api.groq.com/openai/v1/chat/completions", headers=HEADERS, json=body, timeout=60)
214
  resp.raise_for_status()
215
+ assistant_text = resp.json()["choices"][0]["message"]["content"]
216
+ if session_id not in SESSION_HISTORY:
217
+ SESSION_HISTORY[session_id] = []
218
+ SESSION_HISTORY[session_id].append({"role": "assistant", "content": assistant_text})
219
+ return assistant_text
220
  except Exception as e:
221
  print("Image question error:", e)
222
  return f"Error generating response: {e}"
223
 
224
+ # ------------------ PDF Generation for Download ------------------
225
  def generate_pdf_file(text, filename_prefix="summary"):
226
  pdf = FPDF()
227
  pdf.add_page()
 
229
  pdf.set_font("Arial", size=12)
230
  for line in text.split("\n"):
231
  pdf.multi_cell(0, 6, line)
232
+ file_path = f"/tmp/{filename_prefix}_{uuid.uuid4()}.pdf"
233
+ pdf.output(file_path)
234
+ return file_path
235
 
236
+ def download_pdf_summary(session_id):
237
+ summary_text = "\n".join([m["content"] for m in SESSION_HISTORY.get(session_id, []) if m["role"]=="assistant"])
238
  if not summary_text:
239
  summary_text = "No summary available."
240
+ return generate_pdf_file(summary_text, "summary")
241
 
242
+ # ------------------ Voice routing ------------------
243
+ def _append_chat_display(session_id, user_text, assistant_text):
244
+ if session_id not in CHAT_DISPLAY:
245
+ CHAT_DISPLAY[session_id] = []
246
+ CHAT_DISPLAY[session_id].append((user_text, assistant_text))
247
 
 
248
  def handle_voice_general(audio_file, session_id, tts_lang="en"):
249
  path = _get_path_from_gr_file(audio_file)
250
  if not path:
 
260
  if not path:
261
  return "No audio provided.", None, []
262
  user_text = transcribe_audio(path)
263
+ assistant_text = handle_pdf_question(user_text, session_id)
264
  _append_chat_display(session_id, user_text, assistant_text)
265
  audio_path = synthesize_speech(assistant_text, lang=tts_lang)
266
  return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])
 
270
  if not path:
271
  return "No audio provided.", None, []
272
  user_text = transcribe_audio(path)
273
+ assistant_text = handle_image_question(user_text, session_id)
274
  _append_chat_display(session_id, user_text, assistant_text)
275
  audio_path = synthesize_speech(assistant_text, lang=tts_lang)
276
  return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])
277
 
278
+ # ------------------ Text handlers ------------------
279
  def handle_text_general(user_text, session_id):
280
  assistant = generate_response(session_id, user_text)
281
  _append_chat_display(session_id, user_text, assistant)
282
  return assistant, _chat_display_to_messages(CHAT_DISPLAY[session_id])
283
 
284
+ def handle_text_pdf(question, session_id):
285
+ return handle_pdf_question(question, session_id)
286
+
287
+ def handle_text_image(question, session_id):
288
+ return handle_image_question(question, session_id)
289
+
290
  # ------------------ Gradio UI ------------------
291
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
292
  gr.Markdown("## πŸ›  Multi-Mode AI Assistant (Voice, PDF, Image)")
 
297
 
298
  # ---------------- Voice Tab ----------------
299
  with gr.Tab("🎀 Voice Chat"):
300
+ chat_voice = gr.Chatbot(type="messages", height=380)
301
  with gr.Row():
302
  mic = gr.Audio(type="filepath", label="🎀 Record Voice (hold & speak)", show_download_button=False)
303
+ tts_lang = gr.Dropdown(choices=["en", "ur"], value="en", label="TTS Language")
304
  with gr.Row():
305
+ btn_general = gr.Button("Ask General (from recorded voice)")
306
+ btn_pdf = gr.Button("Ask PDF (from recorded voice)")
307
+ btn_image = gr.Button("Ask Image (from recorded voice)")
308
  answer_voice = gr.Textbox(label="Assistant Answer (text)", lines=4)
309
+ audio_output = gr.Audio(label="Assistant Voice Output", type="filepath")
310
 
 
311
  with gr.Row():
312
+ text_input = gr.Textbox(label="Or type a question (General)", placeholder="Type message here...")
313
+ btn_send_text = gr.Button("Send (Text General)")
314
+
315
+ btn_general.click(fn=handle_voice_general, inputs=[mic, session_voice, tts_lang], outputs=[answer_voice, audio_output, chat_voice])
316
+ btn_pdf.click(fn=handle_voice_pdf, inputs=[mic, session_pdf, tts_lang], outputs=[answer_voice, audio_output, chat_voice])
317
+ btn_image.click(fn=handle_voice_image, inputs=[mic, session_image, tts_lang], outputs=[answer_voice, audio_output, chat_voice])
318
+ btn_send_text.click(fn=handle_text_general, inputs=[text_input, session_voice], outputs=[answer_voice, chat_voice])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
  # ---------------- PDF Tab ----------------
321
  with gr.Tab("πŸ“„ PDF Summarizer"):
322
+ pdf_output = gr.Textbox(label="Answer (Text Only)", lines=8)
323
+ pdf_summary_file = gr.File(label="Download Summary File")
324
  with gr.Row():
325
+ pdf_upload_btn = gr.File(label="Upload PDF", file_types=[".pdf"])
326
+ pdf_upload_msg = gr.Textbox(label="Upload Status", interactive=False)
327
  pdf_question = gr.Textbox(label="Ask a question about PDF (text)", lines=2)
328
+ pdf_send_btn = gr.Button("Ask (Text)")
329
+ pdf_reset_btn = gr.Button("β™» Reset PDF")
330
+ pdf_download_btn = gr.Button("πŸ“₯ Download Summary")
331
 
332
  pdf_upload_btn.upload(handle_pdf_upload, inputs=[pdf_upload_btn, session_pdf], outputs=[pdf_upload_msg])
333
  pdf_send_btn.click(handle_text_pdf, inputs=[pdf_question, session_pdf], outputs=[pdf_output])
 
336
 
337
  # ---------------- Image Tab ----------------
338
  with gr.Tab("πŸ–Ό Image OCR"):
339
+ image_output = gr.Textbox(label="Answer (Text Only)", lines=8)
340
+ image_upload_btn = gr.File(label="Upload Image", file_types=[".png", ".jpg", ".jpeg"])
341
+ image_upload_msg = gr.Textbox(label="Upload Status", interactive=False)
342
+ image_question = gr.Textbox(label="Ask question about Image", lines=2)
343
+ image_send_btn = gr.Button("Ask (Text)")
344
+ image_reset_btn = gr.Button("β™» Reset Image")
345
+ image_download_btn = gr.Button("πŸ“₯ Download Summary")
346
+ image_summary_file = gr.File(label="Download Summary File")
 
347
 
348
  image_upload_btn.upload(handle_image_upload, inputs=[image_upload_btn, session_image], outputs=[image_upload_msg, image_output])
349
  image_send_btn.click(handle_text_image, inputs=[image_question, session_image], outputs=[image_output])
350
  image_reset_btn.click(lambda: (str(uuid.uuid4()), ""), outputs=[session_image, image_output])
351
+ image_download_btn.click(download_pdf_summary, inputs=[session_image], outputs=[image_summary_file])
352
 
353
+ # Launch
354
  if __name__ == "__main__":
355
+ demo.launch()