asad9641 committed on
Commit
d9b53fd
·
verified ·
1 Parent(s): 52a8c9a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -171
app.py CHANGED
@@ -1,25 +1,33 @@
1
  # app.py
2
  """
3
  Multi-Mode AI Assistant (Voice, PDF, Image)
4
- - Improved interactive UI: compact, visually appealing, emojis/icons, scrollable previews.
5
- - All backend functionality preserved.
 
 
 
 
 
 
 
6
  """
7
  import os
8
  import uuid
9
  import tempfile
10
  import requests
11
- from datetime import datetime
12
  from dotenv import load_dotenv
13
  from gtts import gTTS
14
  from PyPDF2 import PdfReader
15
  import gradio as gr
16
  from sentence_transformers import SentenceTransformer, util
17
  from fpdf import FPDF
 
18
 
19
  # ------------------ Load API Keys ------------------
20
  load_dotenv()
21
  GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()
22
  OCR_SPACE_API_KEY = os.getenv("OCR_SPACE_API_KEY", "").strip()
 
23
  if not GROQ_API_KEY:
24
  raise ValueError("❌ GROQ_API_KEY missing. Set it in env / Hugging Face Secrets.")
25
  if not OCR_SPACE_API_KEY:
@@ -27,36 +35,22 @@ if not OCR_SPACE_API_KEY:
27
 
28
  HEADERS = {"Authorization": f"Bearer {GROQ_API_KEY}"}
29
 
30
- # ------------------ Global States ------------------
31
  SESSION_HISTORY = {}
32
- CHAT_DISPLAY = {}
33
  PDF_CONTENT = {}
34
  PDF_EMBEDS = {}
35
  IMAGE_TEXT = {}
36
  IMAGE_EMBEDS = {}
37
  CHUNK_SIZE = 1500
38
 
 
39
  embed_model = SentenceTransformer("all-MiniLM-L6-v2")
40
 
41
- # ------------------ Helpers ------------------
42
def _get_path_from_gr_file(gr_file):
    """Resolve a value handed over by a Gradio file/audio input to a path on disk.

    Three shapes are recognised, checked in this order:
      * a plain path string,
      * an object exposing a ``name`` attribute,
      * a dict carrying the path under ``"name"``, ``"file_name"`` or
        ``"filepath"``.

    Args:
        gr_file: The raw value received by an event handler (may be falsy).

    Returns:
        The first candidate that is a string naming an existing path,
        otherwise ``None``.
    """
    if not gr_file:
        return None

    def _existing(candidate):
        # Probe strings only: os.path.exists(None) raises TypeError, and an
        # int would be interpreted as an open file descriptor.
        if isinstance(candidate, str) and os.path.exists(candidate):
            return candidate
        return None

    if isinstance(gr_file, str):
        return _existing(gr_file)

    try:
        attr_value = getattr(gr_file, "name", None)
    except Exception:
        # Best effort: a misbehaving property must not break upload handling,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        attr_value = None
    found = _existing(attr_value)
    if found is not None:
        return found

    if isinstance(gr_file, dict):
        for key in ("name", "file_name", "filepath"):
            found = _existing(gr_file.get(key))
            if found is not None:
                return found
    return None
57
 
 
58
  def chunk_text(text, size=CHUNK_SIZE):
59
- return [text[i:i+size] for i in range(0, len(text), size)]
 
60
 
61
  def synthesize_speech(text, lang="en"):
62
  try:
@@ -69,6 +63,7 @@ def synthesize_speech(text, lang="en"):
69
  print("TTS error:", e)
70
  return None
71
 
 
72
  def select_relevant_chunk(question, chunks, chunk_embeds):
73
  if not chunks or chunk_embeds is None:
74
  return ""
@@ -77,6 +72,7 @@ def select_relevant_chunk(question, chunks, chunk_embeds):
77
  top_idx = int(scores.argmax().item())
78
  return chunks[top_idx]
79
 
 
80
  def _chat_display_to_messages(chat_display):
81
  msgs = []
82
  for user, assistant in chat_display:
@@ -84,12 +80,27 @@ def _chat_display_to_messages(chat_display):
84
  msgs.append({"role": "assistant", "content": assistant})
85
  return msgs
86
 
87
def _append_chat_display(session_id, user_text, assistant_text):
    """Record one (user, assistant) exchange in the session's display log."""
    # setdefault creates the per-session list the first time a session is seen.
    CHAT_DISPLAY.setdefault(session_id, []).append((user_text, assistant_text))
91
 
92
- # ------------------ Voice & LLM ------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  def transcribe_audio(audio_path):
94
  if not audio_path or not os.path.exists(audio_path):
95
  return "Error: audio file missing."
@@ -105,6 +116,7 @@ def transcribe_audio(audio_path):
105
  print("transcription error:", e)
106
  return f"Error transcribing audio: {e}"
107
 
 
108
  def generate_response(session_id, user_text):
109
  if session_id not in SESSION_HISTORY:
110
  SESSION_HISTORY[session_id] = []
@@ -121,6 +133,7 @@ def generate_response(session_id, user_text):
121
  print("generate_response error:", e)
122
  return f"Error generating response: {e}"
123
 
 
124
  # ------------------ PDF Handling ------------------
125
  def handle_pdf_upload(pdf_file, session_id):
126
  path = _get_path_from_gr_file(pdf_file)
@@ -128,17 +141,20 @@ def handle_pdf_upload(pdf_file, session_id):
128
  return "No file uploaded or file unreadable."
129
  try:
130
  reader = PdfReader(path)
131
- text = "".join([page.extract_text() or "" for page in reader.pages])
 
 
132
  if not text.strip():
133
  return "No extractable content found in PDF."
134
  chunks = chunk_text(text)
135
  PDF_CONTENT[session_id] = chunks
136
  PDF_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
137
- return f"PDF uploaded: {len(chunks)} chunks ready."
138
  except Exception as e:
139
  print("PDF upload error:", e)
140
  return f"Error processing PDF: {e}"
141
 
 
142
  def handle_text_pdf(question, session_id):
143
  if session_id not in PDF_CONTENT:
144
  return "Document not found. Upload first."
@@ -156,56 +172,7 @@ def handle_text_pdf(question, session_id):
156
  print("PDF question error:", e)
157
  return f"Error generating response: {e}"
158
 
159
- # ------------------ Image OCR ------------------
160
def ocr_space_file(image_path, api_key, language="eng"):
    """Upload an image to the OCR.space parse endpoint and return its text.

    Args:
        image_path: Path of the image file to send.
        api_key: OCR.space API key, sent as the ``apikey`` form field.
        language: Value of the ``language`` form field.

    Returns:
        The ``ParsedText`` of every entry in ``ParsedResults`` joined with
        newlines. An empty string is returned when the path is missing, the
        service flags ``IsErroredOnProcessing``, or any exception occurs
        (failures are printed, never raised).
    """
    if not (image_path and os.path.exists(image_path)):
        return ""
    try:
        # The file handle must stay open while requests streams the upload.
        with open(image_path, "rb") as image_fh:
            response = requests.post(
                "https://api.ocr.space/parse/image",
                files={"file": image_fh},
                data={"apikey": api_key, "language": language},
                timeout=60,
            )
        response.raise_for_status()
        result = response.json()
        if result.get("IsErroredOnProcessing"):
            print("OCR.space error:", result)
            return ""
        texts = []
        for entry in result.get("ParsedResults", []):
            texts.append(entry.get("ParsedText", ""))
        return "\n".join(texts)
    except Exception as e:
        print("ocr_space_file error:", e)
        return ""
178
-
179
def handle_image_upload(image_file, session_id):
    """OCR an uploaded image and index its text for later questions.

    On success the text chunks and their embeddings are stored in
    ``IMAGE_TEXT`` / ``IMAGE_EMBEDS`` under ``session_id``.

    Returns:
        A ``(status_message, "")`` pair; the second element is always empty.
    """
    image_path = _get_path_from_gr_file(image_file)
    if not image_path:
        return "No image uploaded.", ""

    ocr_text = ocr_space_file(image_path, OCR_SPACE_API_KEY)
    if not ocr_text.strip():
        return "No extractable text found in the image.", ""

    pieces = chunk_text(ocr_text)
    IMAGE_TEXT[session_id] = pieces
    IMAGE_EMBEDS[session_id] = embed_model.encode(pieces, convert_to_tensor=True)
    status = f"Image processed: {len(pieces)} chunks ready."
    return status, ""
190
-
191
def handle_text_image(question, session_id):
    """Answer a question about the session's OCR text via the Groq chat API.

    The chunk chosen by ``select_relevant_chunk`` is sent together with the
    question. Request failures are printed and reported in the returned
    string rather than raised.
    """
    if session_id not in IMAGE_TEXT:
        return "Image not found. Upload first."

    context = select_relevant_chunk(
        question, IMAGE_TEXT[session_id], IMAGE_EMBEDS[session_id]
    )
    payload = {
        "model": "llama-3.1-8b-instant",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant summarizing image text."},
            {"role": "user", "content": f"Image chunk:\n{context}\n\nQuestion: {question}"},
        ],
    }
    try:
        reply = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=HEADERS,
            json=payload,
            timeout=60,
        )
        reply.raise_for_status()
        return reply.json()["choices"][0]["message"]["content"]
    except Exception as e:
        print("Image question error:", e)
        return f"Error generating response: {e}"
207
 
208
- # ------------------ PDF Generation ------------------
209
  def generate_pdf_file(text, filename_prefix="summary"):
210
  pdf = FPDF()
211
  pdf.add_page()
@@ -213,58 +180,24 @@ def generate_pdf_file(text, filename_prefix="summary"):
213
  pdf.set_font("Arial", size=12)
214
  for line in text.split("\n"):
215
  pdf.multi_cell(0, 6, line)
216
- tmp_path = f"/tmp/{filename_prefix}_{uuid.uuid4()}.pdf"
217
- pdf.output(tmp_path)
218
- return tmp_path
 
219
 
220
  def download_pdf_summary(session_pdf_id):
221
- summary_text = "\n".join([msg["content"] for msg in SESSION_HISTORY.get(session_pdf_id, []) if msg["role"]=="assistant"])
222
  if not summary_text:
223
  summary_text = "No summary available."
224
  return generate_pdf_file(summary_text, "pdf_summary")
225
 
 
226
  def download_image_summary(session_image_id):
227
- summary_text = "\n".join([msg["content"] for msg in SESSION_HISTORY.get(session_image_id, []) if msg["role"]=="assistant"])
228
  if not summary_text:
229
  summary_text = "No summary available."
230
  return generate_pdf_file(summary_text, "image_summary")
231
 
232
- # ------------------ Voice Handlers ------------------
233
def handle_voice_general(audio_file, session_id, tts_lang="en"):
    """Transcribe a recording, answer it with the general chat model, and speak the reply.

    Returns:
        ``(assistant_text, tts_result, messages)`` where ``tts_result`` is the
        value produced by ``synthesize_speech`` and ``messages`` is the
        session's display log converted to role/content dicts. When no usable
        audio path is found the tuple is ``("No audio provided.", None, [])``.
    """
    recording = _get_path_from_gr_file(audio_file)
    if not recording:
        return "No audio provided.", None, []

    question = transcribe_audio(recording)
    reply = generate_response(session_id, question)
    _append_chat_display(session_id, question, reply)
    spoken = synthesize_speech(reply, lang=tts_lang)
    history = _chat_display_to_messages(CHAT_DISPLAY[session_id])
    return reply, spoken, history
242
-
243
def handle_voice_pdf(audio_file, session_id, tts_lang="en"):
    """Transcribe a recording, answer it against the uploaded PDF, and speak the reply.

    Returns:
        ``(assistant_text, tts_result, messages)`` where ``tts_result`` is the
        value produced by ``synthesize_speech`` and ``messages`` is the
        session's display log converted to role/content dicts. When no usable
        audio path is found the tuple is ``("No audio provided.", None, [])``.
    """
    recording = _get_path_from_gr_file(audio_file)
    if not recording:
        return "No audio provided.", None, []

    question = transcribe_audio(recording)
    reply = handle_text_pdf(question, session_id)
    _append_chat_display(session_id, question, reply)
    spoken = synthesize_speech(reply, lang=tts_lang)
    history = _chat_display_to_messages(CHAT_DISPLAY[session_id])
    return reply, spoken, history
252
-
253
def handle_voice_image(audio_file, session_id, tts_lang="en"):
    """Transcribe a recording, answer it against the OCR'd image text, and speak the reply.

    Returns:
        ``(assistant_text, tts_result, messages)`` where ``tts_result`` is the
        value produced by ``synthesize_speech`` and ``messages`` is the
        session's display log converted to role/content dicts. When no usable
        audio path is found the tuple is ``("No audio provided.", None, [])``.
    """
    recording = _get_path_from_gr_file(audio_file)
    if not recording:
        return "No audio provided.", None, []

    question = transcribe_audio(recording)
    reply = handle_text_image(question, session_id)
    _append_chat_display(session_id, question, reply)
    spoken = synthesize_speech(reply, lang=tts_lang)
    history = _chat_display_to_messages(CHAT_DISPLAY[session_id])
    return reply, spoken, history
262
-
263
- # ------------------ Text Handlers ------------------
264
def handle_text_general(user_text, session_id):
    """Answer a typed question with the general chat model and log the exchange.

    Returns:
        ``(assistant_text, messages)`` where ``messages`` is the session's
        display log converted to role/content dicts.
    """
    reply = generate_response(session_id, user_text)
    _append_chat_display(session_id, user_text, reply)
    history = _chat_display_to_messages(CHAT_DISPLAY[session_id])
    return reply, history
268
 
269
  # ------------------ Gradio UI ------------------
270
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
@@ -274,78 +207,51 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
274
  session_pdf = gr.State(str(uuid.uuid4()))
275
  session_image = gr.State(str(uuid.uuid4()))
276
 
277
- # ---------------- Voice Tab ----------------
278
  with gr.Tab("🎤 Voice Chat"):
279
- chat_voice = gr.Chatbot(type="messages", height=350)
280
  with gr.Row():
281
- mic = gr.Audio(type="filepath", label="🎤 Record Voice (hold & speak)", show_download_button=False)
282
- tts_lang = gr.Dropdown(choices=["en", "ur"], value="en", label="TTS Language", interactive=True, scale=1)
283
  with gr.Row():
284
- btn_general = gr.Button("Ask General 🎯", scale=1)
285
- btn_pdf = gr.Button("Ask PDF 📄", scale=1)
286
- btn_image = gr.Button("Ask Image 🖼", scale=1)
287
- answer_voice = gr.Textbox(label="Assistant Answer (text)", lines=4)
288
- audio_output = gr.Audio(label="Assistant Voice Output", type="filepath", interactive=False)
289
 
290
- # Text-only general chat
291
- with gr.Row():
292
- text_input = gr.Textbox(label="Or type a question (General)", placeholder="Type message here...", lines=2)
293
- btn_send_text = gr.Button("Send (Text General)", scale=1)
294
-
295
- btn_general.click(
296
- fn=handle_voice_general,
297
- inputs=[mic, session_voice, tts_lang],
298
- outputs=[answer_voice, audio_output, chat_voice],
299
- )
300
- btn_pdf.click(
301
- fn=handle_voice_pdf,
302
- inputs=[mic, session_pdf, tts_lang],
303
- outputs=[answer_voice, audio_output, chat_voice],
304
- )
305
- btn_image.click(
306
- fn=handle_voice_image,
307
- inputs=[mic, session_image, tts_lang],
308
- outputs=[answer_voice, audio_output, chat_voice],
309
- )
310
- btn_send_text.click(
311
- fn=handle_text_general,
312
- inputs=[text_input, session_voice],
313
- outputs=[answer_voice, chat_voice],
314
- )
315
-
316
- # ---------------- PDF Tab ----------------
317
  with gr.Tab("📄 PDF Summarizer"):
318
- pdf_output = gr.Textbox(label="Answer (Text Only)", lines=6)
319
- pdf_summary_file = gr.File(label="📥 Download PDF Summary")
320
  with gr.Row():
321
- pdf_upload_btn = gr.File(label="Upload PDF", file_types=[".pdf"], file_types_preview=False, interactive=True)
322
- pdf_upload_msg = gr.Textbox(label="Upload Status", interactive=False, lines=1)
323
  pdf_question = gr.Textbox(label="Ask a question about PDF (text)", lines=2)
324
- pdf_send_btn = gr.Button("Ask (Text)", scale=1)
325
- pdf_reset_btn = gr.Button("♻ Reset PDF", scale=1)
326
- pdf_download_btn = gr.Button("📥 Download Summary", scale=1)
327
 
328
  pdf_upload_btn.upload(handle_pdf_upload, inputs=[pdf_upload_btn, session_pdf], outputs=[pdf_upload_msg])
329
  pdf_send_btn.click(handle_text_pdf, inputs=[pdf_question, session_pdf], outputs=[pdf_output])
330
  pdf_reset_btn.click(lambda: (str(uuid.uuid4()), ""), outputs=[session_pdf, pdf_output])
331
  pdf_download_btn.click(download_pdf_summary, inputs=[session_pdf], outputs=[pdf_summary_file])
332
 
333
- # ---------------- Image Tab ----------------
334
  with gr.Tab("🖼 Image OCR"):
335
- image_output = gr.Textbox(label="Answer (Text Only)", lines=6)
336
- img_summary_file = gr.File(label="📥 Download PDF Summary")
337
  with gr.Row():
338
  image_upload_btn = gr.File(label="Upload Image", file_types=[".png", ".jpg", ".jpeg"], interactive=True)
339
- image_upload_msg = gr.Textbox(label="Upload Status", interactive=False, lines=1)
340
  image_question = gr.Textbox(label="Ask a question about Image (text)", lines=2)
341
- image_send_btn = gr.Button("Ask", scale=1)
342
- image_reset_btn = gr.Button("♻ Reset Image", scale=1)
343
- img_download_btn = gr.Button("📥 Download Summary", scale=1)
344
 
345
- image_upload_btn.upload(handle_image_upload, inputs=[image_upload_btn, session_image], outputs=[image_upload_msg, image_output])
346
  image_send_btn.click(handle_text_image, inputs=[image_question, session_image], outputs=[image_output])
347
  image_reset_btn.click(lambda: (str(uuid.uuid4()), ""), outputs=[session_image, image_output])
348
  img_download_btn.click(download_image_summary, inputs=[session_image], outputs=[img_summary_file])
349
 
350
  if __name__ == "__main__":
351
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  # app.py
2
  """
3
  Multi-Mode AI Assistant (Voice, PDF, Image)
4
+ - Fixed Gradio v4+ Audio usage (no source=...).
5
+ - Chatbot uses type="messages" (openai-style {"role","content"} dicts).
6
+ - Voice tab: single mic + three buttons (Ask General / Ask PDF / Ask Image).
7
+ - PDF tab: upload + text questions only (no voice controls).
8
+ - PDF & Image summary download fixed (now outputs same text as Answer box).
9
+ - OCR uses OCR.space (OCR_SPACE_API_KEY).
10
+ - Uses Groq endpoints for transcription + chat completions (GROQ_API_KEY).
11
+ - Embeddings via sentence-transformers (all-MiniLM-L6-v2).
12
+ - Improved interactive UI with attractive layout.
13
  """
14
  import os
15
  import uuid
16
  import tempfile
17
  import requests
 
18
  from dotenv import load_dotenv
19
  from gtts import gTTS
20
  from PyPDF2 import PdfReader
21
  import gradio as gr
22
  from sentence_transformers import SentenceTransformer, util
23
  from fpdf import FPDF
24
+ from datetime import datetime
25
 
26
  # ------------------ Load API Keys ------------------
27
  load_dotenv()
28
  GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()
29
  OCR_SPACE_API_KEY = os.getenv("OCR_SPACE_API_KEY", "").strip()
30
+
31
  if not GROQ_API_KEY:
32
  raise ValueError("❌ GROQ_API_KEY missing. Set it in env / Hugging Face Secrets.")
33
  if not OCR_SPACE_API_KEY:
 
35
 
36
  HEADERS = {"Authorization": f"Bearer {GROQ_API_KEY}"}
37
 
38
+ # ------------------ Global State ------------------
39
  SESSION_HISTORY = {}
 
40
  PDF_CONTENT = {}
41
  PDF_EMBEDS = {}
42
  IMAGE_TEXT = {}
43
  IMAGE_EMBEDS = {}
44
  CHUNK_SIZE = 1500
45
 
46
+ # Load embedding model once
47
  embed_model = SentenceTransformer("all-MiniLM-L6-v2")
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
+ # ------------------ Helpers ------------------
51
  def chunk_text(text, size=CHUNK_SIZE):
52
+ return [text[i:i + size] for i in range(0, len(text), size)]
53
+
54
 
55
  def synthesize_speech(text, lang="en"):
56
  try:
 
63
  print("TTS error:", e)
64
  return None
65
 
66
+
67
  def select_relevant_chunk(question, chunks, chunk_embeds):
68
  if not chunks or chunk_embeds is None:
69
  return ""
 
72
  top_idx = int(scores.argmax().item())
73
  return chunks[top_idx]
74
 
75
+
76
  def _chat_display_to_messages(chat_display):
77
  msgs = []
78
  for user, assistant in chat_display:
 
80
  msgs.append({"role": "assistant", "content": assistant})
81
  return msgs
82
 
 
 
 
 
83
 
84
def _get_path_from_gr_file(gr_file):
    """Resolve a value handed over by a Gradio file/audio input to a path on disk.

    Recognised shapes, checked in this order:
      * a path string or any ``os.PathLike`` (e.g. ``pathlib.Path``),
      * an object exposing a ``name`` or ``path`` attribute,
      * a dict carrying the path under ``"name"``, ``"file_name"``,
        ``"filepath"`` or ``"path"``.

    NOTE(review): the ``path`` key/attribute is meant to cover Gradio 4
    ``FileData`` payloads; confirm against the installed Gradio version.

    Args:
        gr_file: The raw value received by an event handler (may be falsy).

    Returns:
        The first candidate naming an existing path, as a ``str``, otherwise
        ``None``.
    """
    if not gr_file:
        return None

    def _as_existing_path(candidate):
        # Normalise PathLike objects to str so every caller gets one type.
        if isinstance(candidate, os.PathLike):
            candidate = os.fspath(candidate)
        # Probe strings only: os.path.exists(None) raises TypeError, and an
        # int would be interpreted as an open file descriptor.
        if isinstance(candidate, str) and os.path.exists(candidate):
            return candidate
        return None

    # Decide path-like inputs here: a Path's ``.name`` is only its basename,
    # so falling through to the attribute probe could match an unrelated file
    # in the current working directory.
    if isinstance(gr_file, (str, os.PathLike)):
        return _as_existing_path(gr_file)

    for attr in ("name", "path"):
        try:
            value = getattr(gr_file, attr, None)
        except Exception:
            # Best effort: a misbehaving property must not break uploads.
            continue
        found = _as_existing_path(value)
        if found is not None:
            return found

    if isinstance(gr_file, dict):
        for key in ("name", "file_name", "filepath", "path"):
            found = _as_existing_path(gr_file.get(key))
            if found is not None:
                return found
    return None
101
+
102
+
103
+ # ------------------ Transcription & LLM ------------------
104
  def transcribe_audio(audio_path):
105
  if not audio_path or not os.path.exists(audio_path):
106
  return "Error: audio file missing."
 
116
  print("transcription error:", e)
117
  return f"Error transcribing audio: {e}"
118
 
119
+
120
  def generate_response(session_id, user_text):
121
  if session_id not in SESSION_HISTORY:
122
  SESSION_HISTORY[session_id] = []
 
133
  print("generate_response error:", e)
134
  return f"Error generating response: {e}"
135
 
136
+
137
  # ------------------ PDF Handling ------------------
138
  def handle_pdf_upload(pdf_file, session_id):
139
  path = _get_path_from_gr_file(pdf_file)
 
141
  return "No file uploaded or file unreadable."
142
  try:
143
  reader = PdfReader(path)
144
+ text = ""
145
+ for page in reader.pages:
146
+ text += (page.extract_text() or "") + "\n"
147
  if not text.strip():
148
  return "No extractable content found in PDF."
149
  chunks = chunk_text(text)
150
  PDF_CONTENT[session_id] = chunks
151
  PDF_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
152
+ return f"PDF processed: {len(chunks)} chunks ready."
153
  except Exception as e:
154
  print("PDF upload error:", e)
155
  return f"Error processing PDF: {e}"
156
 
157
+
158
  def handle_text_pdf(question, session_id):
159
  if session_id not in PDF_CONTENT:
160
  return "Document not found. Upload first."
 
172
  print("PDF question error:", e)
173
  return f"Error generating response: {e}"
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
 
176
  def generate_pdf_file(text, filename_prefix="summary"):
177
  pdf = FPDF()
178
  pdf.add_page()
 
180
  pdf.set_font("Arial", size=12)
181
  for line in text.split("\n"):
182
  pdf.multi_cell(0, 6, line)
183
+ file_path = f"/tmp/{filename_prefix}_{uuid.uuid4()}.pdf"
184
+ pdf.output(file_path)
185
+ return file_path
186
+
187
 
188
  def download_pdf_summary(session_pdf_id):
189
+ summary_text = "\n".join([m["content"] for m in SESSION_HISTORY.get(session_pdf_id, []) if m["role"]=="assistant"])
190
  if not summary_text:
191
  summary_text = "No summary available."
192
  return generate_pdf_file(summary_text, "pdf_summary")
193
 
194
+
195
  def download_image_summary(session_image_id):
196
+ summary_text = "\n".join([m["content"] for m in SESSION_HISTORY.get(session_image_id, []) if m["role"]=="assistant"])
197
  if not summary_text:
198
  summary_text = "No summary available."
199
  return generate_pdf_file(summary_text, "image_summary")
200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
  # ------------------ Gradio UI ------------------
203
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
207
  session_pdf = gr.State(str(uuid.uuid4()))
208
  session_image = gr.State(str(uuid.uuid4()))
209
 
210
+ # --- Voice ---
211
  with gr.Tab("🎤 Voice Chat"):
212
+ chat_voice = gr.Chatbot(type="messages", height=380)
213
  with gr.Row():
214
+ mic = gr.Audio(label="Hold & speak", type="filepath")
215
+ tts_lang = gr.Dropdown(choices=["en", "ur"], value="en", label="TTS Language")
216
  with gr.Row():
217
+ btn_general = gr.Button("Ask General")
218
+ btn_pdf = gr.Button("Ask PDF")
219
+ btn_image = gr.Button("Ask Image")
220
+ audio_output = gr.Audio(label="Assistant Voice Output", type="filepath")
 
221
 
222
+ # --- PDF ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  with gr.Tab("📄 PDF Summarizer"):
224
+ pdf_output = gr.Textbox(label="Answer (Text Only)", lines=8)
225
+ pdf_summary_file = gr.File(label="Download Summary PDF")
226
  with gr.Row():
227
+ pdf_upload_btn = gr.File(label="Upload PDF", file_types=[".pdf"], interactive=True)
228
+ pdf_upload_msg = gr.Textbox(label="Upload Status", interactive=False)
229
  pdf_question = gr.Textbox(label="Ask a question about PDF (text)", lines=2)
230
+ pdf_send_btn = gr.Button("Ask (Text)")
231
+ pdf_reset_btn = gr.Button("♻ Reset PDF")
232
+ pdf_download_btn = gr.Button("📥 Download Summary")
233
 
234
  pdf_upload_btn.upload(handle_pdf_upload, inputs=[pdf_upload_btn, session_pdf], outputs=[pdf_upload_msg])
235
  pdf_send_btn.click(handle_text_pdf, inputs=[pdf_question, session_pdf], outputs=[pdf_output])
236
  pdf_reset_btn.click(lambda: (str(uuid.uuid4()), ""), outputs=[session_pdf, pdf_output])
237
  pdf_download_btn.click(download_pdf_summary, inputs=[session_pdf], outputs=[pdf_summary_file])
238
 
239
+ # --- Image ---
240
  with gr.Tab("🖼 Image OCR"):
241
+ image_output = gr.Textbox(label="Answer (Text Only)", lines=8)
242
+ img_summary_file = gr.File(label="Download Summary PDF")
243
  with gr.Row():
244
  image_upload_btn = gr.File(label="Upload Image", file_types=[".png", ".jpg", ".jpeg"], interactive=True)
245
+ image_upload_msg = gr.Textbox(label="Upload Status", interactive=False)
246
  image_question = gr.Textbox(label="Ask a question about Image (text)", lines=2)
247
+ image_send_btn = gr.Button("Ask (Text)")
248
+ image_reset_btn = gr.Button("♻ Reset Image")
249
+ img_download_btn = gr.Button("📥 Download Summary")
250
 
251
+ image_upload_btn.upload(handle_image_upload, inputs=[image_upload_btn, session_image], outputs=[image_upload_msg])
252
  image_send_btn.click(handle_text_image, inputs=[image_question, session_image], outputs=[image_output])
253
  image_reset_btn.click(lambda: (str(uuid.uuid4()), ""), outputs=[session_image, image_output])
254
  img_download_btn.click(download_image_summary, inputs=[session_image], outputs=[img_summary_file])
255
 
256
  if __name__ == "__main__":
257
+ demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)