asad9641 committed on
Commit
1f64b26
Β·
verified Β·
1 Parent(s): d9b53fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +170 -76
app.py CHANGED
@@ -1,33 +1,25 @@
1
  # app.py
2
  """
3
  Multi-Mode AI Assistant (Voice, PDF, Image)
4
- - Fixed Gradio v4+ Audio usage (no source=...).
5
- - Chatbot uses type="messages" (openai-style {"role","content"} dicts).
6
- - Voice tab: single mic + three buttons (Ask General / Ask PDF / Ask Image).
7
- - PDF tab: upload + text questions only (no voice controls).
8
- - PDF & Image summary download fixed (now outputs same text as Answer box).
9
- - OCR uses OCR.space (OCR_SPACE_API_KEY).
10
- - Uses Groq endpoints for transcription + chat completions (GROQ_API_KEY).
11
- - Embeddings via sentence-transformers (all-MiniLM-L6-v2).
12
- - Improved interactive UI with attractive layout.
13
  """
14
  import os
15
  import uuid
16
  import tempfile
17
  import requests
 
18
  from dotenv import load_dotenv
19
  from gtts import gTTS
20
  from PyPDF2 import PdfReader
21
  import gradio as gr
22
  from sentence_transformers import SentenceTransformer, util
23
  from fpdf import FPDF
24
- from datetime import datetime
25
 
26
  # ------------------ Load API Keys ------------------
27
  load_dotenv()
28
  GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()
29
  OCR_SPACE_API_KEY = os.getenv("OCR_SPACE_API_KEY", "").strip()
30
-
31
  if not GROQ_API_KEY:
32
  raise ValueError("❌ GROQ_API_KEY missing. Set it in env / Hugging Face Secrets.")
33
  if not OCR_SPACE_API_KEY:
@@ -35,22 +27,36 @@ if not OCR_SPACE_API_KEY:
35
 
36
  HEADERS = {"Authorization": f"Bearer {GROQ_API_KEY}"}
37
 
38
- # ------------------ Global State ------------------
39
  SESSION_HISTORY = {}
 
40
  PDF_CONTENT = {}
41
  PDF_EMBEDS = {}
42
  IMAGE_TEXT = {}
43
  IMAGE_EMBEDS = {}
44
  CHUNK_SIZE = 1500
45
 
46
- # Load embedding model once
47
  embed_model = SentenceTransformer("all-MiniLM-L6-v2")
48
 
49
-
50
  # ------------------ Helpers ------------------
51
- def chunk_text(text, size=CHUNK_SIZE):
52
- return [text[i:i + size] for i in range(0, len(text), size)]
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
 
 
54
 
55
  def synthesize_speech(text, lang="en"):
56
  try:
@@ -63,7 +69,6 @@ def synthesize_speech(text, lang="en"):
63
  print("TTS error:", e)
64
  return None
65
 
66
-
67
  def select_relevant_chunk(question, chunks, chunk_embeds):
68
  if not chunks or chunk_embeds is None:
69
  return ""
@@ -72,7 +77,6 @@ def select_relevant_chunk(question, chunks, chunk_embeds):
72
  top_idx = int(scores.argmax().item())
73
  return chunks[top_idx]
74
 
75
-
76
  def _chat_display_to_messages(chat_display):
77
  msgs = []
78
  for user, assistant in chat_display:
@@ -80,27 +84,12 @@ def _chat_display_to_messages(chat_display):
80
  msgs.append({"role": "assistant", "content": assistant})
81
  return msgs
82
 
 
 
 
 
83
 
84
- def _get_path_from_gr_file(gr_file):
85
- if not gr_file:
86
- return None
87
- if isinstance(gr_file, str) and os.path.exists(gr_file):
88
- return gr_file
89
- try:
90
- if hasattr(gr_file, "name") and isinstance(gr_file.name, str) and os.path.exists(gr_file.name):
91
- return gr_file.name
92
- except Exception:
93
- pass
94
- if isinstance(gr_file, dict):
95
- for key in ("name", "file_name", "filepath"):
96
- if key in gr_file:
97
- candidate = gr_file.get(key)
98
- if isinstance(candidate, str) and os.path.exists(candidate):
99
- return candidate
100
- return None
101
-
102
-
103
- # ------------------ Transcription & LLM ------------------
104
  def transcribe_audio(audio_path):
105
  if not audio_path or not os.path.exists(audio_path):
106
  return "Error: audio file missing."
@@ -116,7 +105,6 @@ def transcribe_audio(audio_path):
116
  print("transcription error:", e)
117
  return f"Error transcribing audio: {e}"
118
 
119
-
120
  def generate_response(session_id, user_text):
121
  if session_id not in SESSION_HISTORY:
122
  SESSION_HISTORY[session_id] = []
@@ -133,7 +121,6 @@ def generate_response(session_id, user_text):
133
  print("generate_response error:", e)
134
  return f"Error generating response: {e}"
135
 
136
-
137
  # ------------------ PDF Handling ------------------
138
  def handle_pdf_upload(pdf_file, session_id):
139
  path = _get_path_from_gr_file(pdf_file)
@@ -141,20 +128,17 @@ def handle_pdf_upload(pdf_file, session_id):
141
  return "No file uploaded or file unreadable."
142
  try:
143
  reader = PdfReader(path)
144
- text = ""
145
- for page in reader.pages:
146
- text += (page.extract_text() or "") + "\n"
147
  if not text.strip():
148
  return "No extractable content found in PDF."
149
  chunks = chunk_text(text)
150
  PDF_CONTENT[session_id] = chunks
151
  PDF_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
152
- return f"PDF processed: {len(chunks)} chunks ready."
153
  except Exception as e:
154
  print("PDF upload error:", e)
155
  return f"Error processing PDF: {e}"
156
 
157
-
158
  def handle_text_pdf(question, session_id):
159
  if session_id not in PDF_CONTENT:
160
  return "Document not found. Upload first."
@@ -172,7 +156,56 @@ def handle_text_pdf(question, session_id):
172
  print("PDF question error:", e)
173
  return f"Error generating response: {e}"
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
 
176
  def generate_pdf_file(text, filename_prefix="summary"):
177
  pdf = FPDF()
178
  pdf.add_page()
@@ -180,24 +213,58 @@ def generate_pdf_file(text, filename_prefix="summary"):
180
  pdf.set_font("Arial", size=12)
181
  for line in text.split("\n"):
182
  pdf.multi_cell(0, 6, line)
183
- file_path = f"/tmp/{filename_prefix}_{uuid.uuid4()}.pdf"
184
- pdf.output(file_path)
185
- return file_path
186
-
187
 
188
  def download_pdf_summary(session_pdf_id):
189
- summary_text = "\n".join([m["content"] for m in SESSION_HISTORY.get(session_pdf_id, []) if m["role"]=="assistant"])
190
  if not summary_text:
191
  summary_text = "No summary available."
192
  return generate_pdf_file(summary_text, "pdf_summary")
193
 
194
-
195
  def download_image_summary(session_image_id):
196
- summary_text = "\n".join([m["content"] for m in SESSION_HISTORY.get(session_image_id, []) if m["role"]=="assistant"])
197
  if not summary_text:
198
  summary_text = "No summary available."
199
  return generate_pdf_file(summary_text, "image_summary")
200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
  # ------------------ Gradio UI ------------------
203
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
@@ -207,51 +274,78 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
207
  session_pdf = gr.State(str(uuid.uuid4()))
208
  session_image = gr.State(str(uuid.uuid4()))
209
 
210
- # --- Voice ---
211
  with gr.Tab("🎀 Voice Chat"):
212
- chat_voice = gr.Chatbot(type="messages", height=380)
213
  with gr.Row():
214
- mic = gr.Audio(label="Hold & speak", type="filepath")
215
- tts_lang = gr.Dropdown(choices=["en", "ur"], value="en", label="TTS Language")
216
  with gr.Row():
217
- btn_general = gr.Button("Ask General")
218
- btn_pdf = gr.Button("Ask PDF")
219
- btn_image = gr.Button("Ask Image")
220
- audio_output = gr.Audio(label="Assistant Voice Output", type="filepath")
 
221
 
222
- # --- PDF ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  with gr.Tab("πŸ“„ PDF Summarizer"):
224
- pdf_output = gr.Textbox(label="Answer (Text Only)", lines=8)
225
- pdf_summary_file = gr.File(label="Download Summary PDF")
226
  with gr.Row():
227
  pdf_upload_btn = gr.File(label="Upload PDF", file_types=[".pdf"], interactive=True)
228
- pdf_upload_msg = gr.Textbox(label="Upload Status", interactive=False)
229
  pdf_question = gr.Textbox(label="Ask a question about PDF (text)", lines=2)
230
- pdf_send_btn = gr.Button("Ask (Text)")
231
- pdf_reset_btn = gr.Button("β™» Reset PDF")
232
- pdf_download_btn = gr.Button("πŸ“₯ Download Summary")
233
 
234
  pdf_upload_btn.upload(handle_pdf_upload, inputs=[pdf_upload_btn, session_pdf], outputs=[pdf_upload_msg])
235
  pdf_send_btn.click(handle_text_pdf, inputs=[pdf_question, session_pdf], outputs=[pdf_output])
236
  pdf_reset_btn.click(lambda: (str(uuid.uuid4()), ""), outputs=[session_pdf, pdf_output])
237
  pdf_download_btn.click(download_pdf_summary, inputs=[session_pdf], outputs=[pdf_summary_file])
238
 
239
- # --- Image ---
240
  with gr.Tab("πŸ–Ό Image OCR"):
241
- image_output = gr.Textbox(label="Answer (Text Only)", lines=8)
242
- img_summary_file = gr.File(label="Download Summary PDF")
243
  with gr.Row():
244
  image_upload_btn = gr.File(label="Upload Image", file_types=[".png", ".jpg", ".jpeg"], interactive=True)
245
- image_upload_msg = gr.Textbox(label="Upload Status", interactive=False)
246
  image_question = gr.Textbox(label="Ask a question about Image (text)", lines=2)
247
- image_send_btn = gr.Button("Ask (Text)")
248
- image_reset_btn = gr.Button("β™» Reset Image")
249
- img_download_btn = gr.Button("πŸ“₯ Download Summary")
250
 
251
- image_upload_btn.upload(handle_image_upload, inputs=[image_upload_btn, session_image], outputs=[image_upload_msg])
252
  image_send_btn.click(handle_text_image, inputs=[image_question, session_image], outputs=[image_output])
253
  image_reset_btn.click(lambda: (str(uuid.uuid4()), ""), outputs=[session_image, image_output])
254
  img_download_btn.click(download_image_summary, inputs=[session_image], outputs=[img_summary_file])
255
 
256
  if __name__ == "__main__":
257
- demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
 
1
  # app.py
2
  """
3
  Multi-Mode AI Assistant (Voice, PDF, Image)
4
+ - Fixed gr.File issue: removed unsupported 'file_types_preview' argument
5
+ - Interactive, compact, visually appealing UI
 
 
 
 
 
 
 
6
  """
7
  import os
8
  import uuid
9
  import tempfile
10
  import requests
11
+ from datetime import datetime
12
  from dotenv import load_dotenv
13
  from gtts import gTTS
14
  from PyPDF2 import PdfReader
15
  import gradio as gr
16
  from sentence_transformers import SentenceTransformer, util
17
  from fpdf import FPDF
 
18
 
19
  # ------------------ Load API Keys ------------------
20
  load_dotenv()
21
  GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()
22
  OCR_SPACE_API_KEY = os.getenv("OCR_SPACE_API_KEY", "").strip()
 
23
  if not GROQ_API_KEY:
24
  raise ValueError("❌ GROQ_API_KEY missing. Set it in env / Hugging Face Secrets.")
25
  if not OCR_SPACE_API_KEY:
 
27
 
28
  HEADERS = {"Authorization": f"Bearer {GROQ_API_KEY}"}
29
 
30
+ # ------------------ Global States ------------------
31
  SESSION_HISTORY = {}
32
+ CHAT_DISPLAY = {}
33
  PDF_CONTENT = {}
34
  PDF_EMBEDS = {}
35
  IMAGE_TEXT = {}
36
  IMAGE_EMBEDS = {}
37
  CHUNK_SIZE = 1500
38
 
 
39
  embed_model = SentenceTransformer("all-MiniLM-L6-v2")
40
 
 
41
  # ------------------ Helpers ------------------
42
+ def _get_path_from_gr_file(gr_file):
43
+ if not gr_file:
44
+ return None
45
+ if isinstance(gr_file, str) and os.path.exists(gr_file):
46
+ return gr_file
47
+ try:
48
+ if hasattr(gr_file, "name") and os.path.exists(gr_file.name):
49
+ return gr_file.name
50
+ except:
51
+ pass
52
+ if isinstance(gr_file, dict):
53
+ for key in ("name", "file_name", "filepath"):
54
+ if key in gr_file and os.path.exists(gr_file[key]):
55
+ return gr_file[key]
56
+ return None
57
 
58
def chunk_text(text, size=CHUNK_SIZE):
    """Split *text* into consecutive chunks of at most *size* characters."""
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + size])
        start += size
    return pieces
60
 
61
  def synthesize_speech(text, lang="en"):
62
  try:
 
69
  print("TTS error:", e)
70
  return None
71
 
 
72
  def select_relevant_chunk(question, chunks, chunk_embeds):
73
  if not chunks or chunk_embeds is None:
74
  return ""
 
77
  top_idx = int(scores.argmax().item())
78
  return chunks[top_idx]
79
 
 
80
  def _chat_display_to_messages(chat_display):
81
  msgs = []
82
  for user, assistant in chat_display:
 
84
  msgs.append({"role": "assistant", "content": assistant})
85
  return msgs
86
 
87
def _append_chat_display(session_id, user_text, assistant_text):
    """Record one (user, assistant) exchange in the per-session chat log."""
    CHAT_DISPLAY.setdefault(session_id, []).append((user_text, assistant_text))
91
 
92
+ # ------------------ Voice & LLM ------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  def transcribe_audio(audio_path):
94
  if not audio_path or not os.path.exists(audio_path):
95
  return "Error: audio file missing."
 
105
  print("transcription error:", e)
106
  return f"Error transcribing audio: {e}"
107
 
 
108
  def generate_response(session_id, user_text):
109
  if session_id not in SESSION_HISTORY:
110
  SESSION_HISTORY[session_id] = []
 
121
  print("generate_response error:", e)
122
  return f"Error generating response: {e}"
123
 
 
124
  # ------------------ PDF Handling ------------------
125
  def handle_pdf_upload(pdf_file, session_id):
126
  path = _get_path_from_gr_file(pdf_file)
 
128
  return "No file uploaded or file unreadable."
129
  try:
130
  reader = PdfReader(path)
131
+ text = "".join([page.extract_text() or "" for page in reader.pages])
 
 
132
  if not text.strip():
133
  return "No extractable content found in PDF."
134
  chunks = chunk_text(text)
135
  PDF_CONTENT[session_id] = chunks
136
  PDF_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
137
+ return f"PDF uploaded: {len(chunks)} chunks ready."
138
  except Exception as e:
139
  print("PDF upload error:", e)
140
  return f"Error processing PDF: {e}"
141
 
 
142
  def handle_text_pdf(question, session_id):
143
  if session_id not in PDF_CONTENT:
144
  return "Document not found. Upload first."
 
156
  print("PDF question error:", e)
157
  return f"Error generating response: {e}"
158
 
159
+ # ------------------ Image OCR ------------------
160
def ocr_space_file(image_path, api_key, language="eng"):
    """OCR the image at *image_path* via the OCR.space API.

    Returns the extracted text, or an empty string when the path is
    missing, the API reports an error, or the request fails.
    """
    if not image_path or not os.path.exists(image_path):
        return ""
    try:
        payload = {"apikey": api_key, "language": language}
        with open(image_path, "rb") as fh:
            response = requests.post(
                "https://api.ocr.space/parse/image",
                files={"file": fh},
                data=payload,
                timeout=60,
            )
        response.raise_for_status()
        result = response.json()
        if result.get("IsErroredOnProcessing"):
            print("OCR.space error:", result)
            return ""
        texts = []
        for parsed_result in result.get("ParsedResults", []):
            texts.append(parsed_result.get("ParsedText", ""))
        return "\n".join(texts)
    except Exception as e:
        print("ocr_space_file error:", e)
        return ""
178
+
179
def handle_image_upload(image_file, session_id):
    """OCR an uploaded image and index its text for Q&A.

    Returns a (status_message, "") pair; the second element clears the
    answer textbox in the UI.
    """
    image_path = _get_path_from_gr_file(image_file)
    if not image_path:
        return "No image uploaded.", ""
    extracted = ocr_space_file(image_path, OCR_SPACE_API_KEY)
    if not extracted.strip():
        return "No extractable text found in the image.", ""
    pieces = chunk_text(extracted)
    IMAGE_TEXT[session_id] = pieces
    IMAGE_EMBEDS[session_id] = embed_model.encode(pieces, convert_to_tensor=True)
    return f"Image processed: {len(pieces)} chunks ready.", ""
190
+
191
def handle_text_image(question, session_id):
    """Answer *question* from the OCR'd image text via the Groq chat API."""
    if session_id not in IMAGE_TEXT:
        return "Image not found. Upload first."
    relevant = select_relevant_chunk(
        question, IMAGE_TEXT[session_id], IMAGE_EMBEDS[session_id]
    )
    request_body = {
        "model": "llama-3.1-8b-instant",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant summarizing image text."},
            {"role": "user", "content": f"Image chunk:\n{relevant}\n\nQuestion: {question}"},
        ],
    }
    try:
        resp = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=HEADERS,
            json=request_body,
            timeout=60,
        )
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
    except Exception as e:
        print("Image question error:", e)
        return f"Error generating response: {e}"
207
 
208
+ # ------------------ PDF Generation ------------------
209
  def generate_pdf_file(text, filename_prefix="summary"):
210
  pdf = FPDF()
211
  pdf.add_page()
 
213
  pdf.set_font("Arial", size=12)
214
  for line in text.split("\n"):
215
  pdf.multi_cell(0, 6, line)
216
+ tmp_path = f"/tmp/{filename_prefix}_{uuid.uuid4()}.pdf"
217
+ pdf.output(tmp_path)
218
+ return tmp_path
 
219
 
220
def download_pdf_summary(session_pdf_id):
    """Collect all assistant replies for the PDF session into a downloadable PDF."""
    assistant_lines = [
        msg["content"]
        for msg in SESSION_HISTORY.get(session_pdf_id, [])
        if msg["role"] == "assistant"
    ]
    summary_text = "\n".join(assistant_lines) or "No summary available."
    return generate_pdf_file(summary_text, "pdf_summary")
225
 
 
226
def download_image_summary(session_image_id):
    """Collect all assistant replies for the image session into a downloadable PDF."""
    assistant_lines = [
        msg["content"]
        for msg in SESSION_HISTORY.get(session_image_id, [])
        if msg["role"] == "assistant"
    ]
    summary_text = "\n".join(assistant_lines) or "No summary available."
    return generate_pdf_file(summary_text, "image_summary")
231
 
232
+ # ------------------ Voice Handlers ------------------
233
def handle_voice_general(audio_file, session_id, tts_lang="en"):
    """Transcribe mic audio, answer via the general chat model, and speak the reply.

    Returns (answer_text, tts_audio_path_or_None, chat_messages).
    """
    audio_path = _get_path_from_gr_file(audio_file)
    if not audio_path:
        return "No audio provided.", None, []
    question = transcribe_audio(audio_path)
    answer = generate_response(session_id, question)
    _append_chat_display(session_id, question, answer)
    spoken = synthesize_speech(answer, lang=tts_lang)
    return answer, spoken, _chat_display_to_messages(CHAT_DISPLAY[session_id])
242
+
243
def handle_voice_pdf(audio_file, session_id, tts_lang="en"):
    """Transcribe mic audio, answer from the uploaded PDF, and speak the reply.

    Returns (answer_text, tts_audio_path_or_None, chat_messages).
    """
    audio_path = _get_path_from_gr_file(audio_file)
    if not audio_path:
        return "No audio provided.", None, []
    question = transcribe_audio(audio_path)
    answer = handle_text_pdf(question, session_id)
    _append_chat_display(session_id, question, answer)
    spoken = synthesize_speech(answer, lang=tts_lang)
    return answer, spoken, _chat_display_to_messages(CHAT_DISPLAY[session_id])
252
+
253
def handle_voice_image(audio_file, session_id, tts_lang="en"):
    """Transcribe mic audio, answer from the OCR'd image, and speak the reply.

    Returns (answer_text, tts_audio_path_or_None, chat_messages).
    """
    audio_path = _get_path_from_gr_file(audio_file)
    if not audio_path:
        return "No audio provided.", None, []
    question = transcribe_audio(audio_path)
    answer = handle_text_image(question, session_id)
    _append_chat_display(session_id, question, answer)
    spoken = synthesize_speech(answer, lang=tts_lang)
    return answer, spoken, _chat_display_to_messages(CHAT_DISPLAY[session_id])
262
+
263
+ # ------------------ Text Handlers ------------------
264
def handle_text_general(user_text, session_id):
    """Answer a typed general-chat question; returns (answer, chat_messages)."""
    reply = generate_response(session_id, user_text)
    _append_chat_display(session_id, user_text, reply)
    return reply, _chat_display_to_messages(CHAT_DISPLAY[session_id])
268
 
269
  # ------------------ Gradio UI ------------------
270
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
274
  session_pdf = gr.State(str(uuid.uuid4()))
275
  session_image = gr.State(str(uuid.uuid4()))
276
 
277
+ # ---------------- Voice Tab ----------------
278
  with gr.Tab("🎀 Voice Chat"):
279
+ chat_voice = gr.Chatbot(type="messages", height=350)
280
  with gr.Row():
281
+ mic = gr.Audio(type="filepath", label="🎀 Record Voice (hold & speak)", show_download_button=False)
282
+ tts_lang = gr.Dropdown(choices=["en", "ur"], value="en", label="TTS Language", interactive=True, scale=1)
283
  with gr.Row():
284
+ btn_general = gr.Button("Ask General 🎯", scale=1)
285
+ btn_pdf = gr.Button("Ask PDF πŸ“„", scale=1)
286
+ btn_image = gr.Button("Ask Image πŸ–Ό", scale=1)
287
+ answer_voice = gr.Textbox(label="Assistant Answer (text)", lines=4)
288
+ audio_output = gr.Audio(label="Assistant Voice Output", type="filepath", interactive=False)
289
 
290
+ # Text-only general chat
291
+ with gr.Row():
292
+ text_input = gr.Textbox(label="Or type a question (General)", placeholder="Type message here...", lines=2)
293
+ btn_send_text = gr.Button("Send (Text General)", scale=1)
294
+
295
+ btn_general.click(
296
+ fn=handle_voice_general,
297
+ inputs=[mic, session_voice, tts_lang],
298
+ outputs=[answer_voice, audio_output, chat_voice],
299
+ )
300
+ btn_pdf.click(
301
+ fn=handle_voice_pdf,
302
+ inputs=[mic, session_pdf, tts_lang],
303
+ outputs=[answer_voice, audio_output, chat_voice],
304
+ )
305
+ btn_image.click(
306
+ fn=handle_voice_image,
307
+ inputs=[mic, session_image, tts_lang],
308
+ outputs=[answer_voice, audio_output, chat_voice],
309
+ )
310
+ btn_send_text.click(
311
+ fn=handle_text_general,
312
+ inputs=[text_input, session_voice],
313
+ outputs=[answer_voice, chat_voice],
314
+ )
315
+
316
+ # ---------------- PDF Tab ----------------
317
  with gr.Tab("πŸ“„ PDF Summarizer"):
318
+ pdf_output = gr.Textbox(label="Answer (Text Only)", lines=6)
319
+ pdf_summary_file = gr.File(label="πŸ“₯ Download PDF Summary")
320
  with gr.Row():
321
  pdf_upload_btn = gr.File(label="Upload PDF", file_types=[".pdf"], interactive=True)
322
+ pdf_upload_msg = gr.Textbox(label="Upload Status", interactive=False, lines=1)
323
  pdf_question = gr.Textbox(label="Ask a question about PDF (text)", lines=2)
324
+ pdf_send_btn = gr.Button("Ask (Text)", scale=1)
325
+ pdf_reset_btn = gr.Button("β™» Reset PDF", scale=1)
326
+ pdf_download_btn = gr.Button("πŸ“₯ Download Summary", scale=1)
327
 
328
  pdf_upload_btn.upload(handle_pdf_upload, inputs=[pdf_upload_btn, session_pdf], outputs=[pdf_upload_msg])
329
  pdf_send_btn.click(handle_text_pdf, inputs=[pdf_question, session_pdf], outputs=[pdf_output])
330
  pdf_reset_btn.click(lambda: (str(uuid.uuid4()), ""), outputs=[session_pdf, pdf_output])
331
  pdf_download_btn.click(download_pdf_summary, inputs=[session_pdf], outputs=[pdf_summary_file])
332
 
333
+ # ---------------- Image Tab ----------------
334
  with gr.Tab("πŸ–Ό Image OCR"):
335
+ image_output = gr.Textbox(label="Answer (Text Only)", lines=6)
336
+ img_summary_file = gr.File(label="πŸ“₯ Download PDF Summary")
337
  with gr.Row():
338
  image_upload_btn = gr.File(label="Upload Image", file_types=[".png", ".jpg", ".jpeg"], interactive=True)
339
+ image_upload_msg = gr.Textbox(label="Upload Status", interactive=False, lines=1)
340
  image_question = gr.Textbox(label="Ask a question about Image (text)", lines=2)
341
+ image_send_btn = gr.Button("Ask", scale=1)
342
+ image_reset_btn = gr.Button("β™» Reset Image", scale=1)
343
+ img_download_btn = gr.Button("πŸ“₯ Download Summary", scale=1)
344
 
345
+ image_upload_btn.upload(handle_image_upload, inputs=[image_upload_btn, session_image], outputs=[image_upload_msg, image_output])
346
  image_send_btn.click(handle_text_image, inputs=[image_question, session_image], outputs=[image_output])
347
  image_reset_btn.click(lambda: (str(uuid.uuid4()), ""), outputs=[session_image, image_output])
348
  img_download_btn.click(download_image_summary, inputs=[session_image], outputs=[img_summary_file])
349
 
350
  if __name__ == "__main__":
351
+ demo.launch(server_name="0.0.0.0", server_port=7860)