asad9641 committed
Commit ec41711 · verified · 1 Parent(s): 0be9386

Update app.py

Files changed (1):
  1. app.py +251 -299
app.py CHANGED
@@ -1,10 +1,8 @@
# app.py
"""
- Multi-Mode AI Assistant (Voice, PDF, Image) with Wow-Factor Features
- - Preserves original features
- - Adds snippet highlighting, cross-modal memory, styled PDF generation
- - Live waveform placeholder for voice input
- - Modular & Hugging Face safe
"""
import os
import uuid
@@ -18,16 +16,12 @@ from sentence_transformers import SentenceTransformer, util
from fpdf import FPDF
from datetime import datetime

- # ------------------ Load API Keys ------------------
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()
OCR_SPACE_API_KEY = os.getenv("OCR_SPACE_API_KEY", "").strip()
-
- if not GROQ_API_KEY:
-     raise ValueError("❌ GROQ_API_KEY missing. Set it in env / Hugging Face Secrets.")
- if not OCR_SPACE_API_KEY:
-     raise ValueError("❌ OCR_SPACE_API_KEY missing. Set it in env / Hugging Face Secrets.")
-
HEADERS = {"Authorization": f"Bearer {GROQ_API_KEY}"}

# ------------------ Global State ------------------
@@ -38,339 +32,297 @@ PDF_EMBEDS = {}
IMAGE_TEXT = {}
IMAGE_EMBEDS = {}
CHUNK_SIZE = 1500
-
- # Load embedding model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# ------------------ Helpers ------------------
def _get_path_from_gr_file(gr_file):
-     if not gr_file:
-         return None
-     if isinstance(gr_file, str) and os.path.exists(gr_file):
-         return gr_file
    try:
-         if hasattr(gr_file, "name") and os.path.exists(gr_file.name):
-             return gr_file.name
-     except Exception:
-         pass
-     if isinstance(gr_file, dict):
-         for key in ("name", "file_name", "filepath"):
            if key in gr_file:
-                 candidate = gr_file.get(key)
-                 if isinstance(candidate, str) and os.path.exists(candidate):
-                     return candidate
    return None

- def chunk_text(text, size=CHUNK_SIZE):
-     return [text[i:i + size] for i in range(0, len(text), size)]

- def synthesize_speech(text, lang="en"):
-     try:
-         if not text:
-             return None
-         tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
-         gTTS(text=text, lang=lang).save(tmp.name)
-         return tmp.name
-     except Exception as e:
-         print("TTS error:", e)
-         return None
-
- def select_relevant_chunk(question, chunks, chunk_embeds):
-     if not chunks or chunk_embeds is None:
-         return ""
-     q_emb = embed_model.encode(question, convert_to_tensor=True)
-     scores = util.cos_sim(q_emb, chunk_embeds)[0]
-     top_idx = int(scores.argmax().item())
    return chunks[top_idx]

def _chat_display_to_messages(chat_display):
-     msgs = []
-     for user, assistant in chat_display:
-         msgs.append({"role": "user", "content": user})
-         msgs.append({"role": "assistant", "content": assistant})
    return msgs

- # ------------------ Transcription & LLM ------------------
def transcribe_audio(audio_path):
    if not audio_path or not os.path.exists(audio_path):
        return "Error: audio file missing."
    try:
-         url = "https://api.groq.com/openai/v1/audio/transcriptions"
-         with open(audio_path, "rb") as f:
-             files = {"file": (os.path.basename(audio_path), f, "audio/wav")}
-             data = {"model": "whisper-large-v3"}
-             resp = requests.post(url, headers=HEADERS, files=files, data=data, timeout=60)
        resp.raise_for_status()
-         return resp.json().get("text", "") or ""
    except Exception as e:
-         print("transcription error:", e)
        return f"Error transcribing audio: {e}"

def groq_chat_completion(messages):
-     body = {"model": "llama-3.1-8b-instant", "messages": messages}
    try:
-         resp = requests.post("https://api.groq.com/openai/v1/chat/completions", headers=HEADERS, json=body, timeout=60)
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
    except Exception as e:
-         print("groq_chat_completion error:", e)
        return f"Error generating response: {e}"

- def generate_response(session_id, user_text, enhancer_enabled=False, enhancer_tone="Helpful"):
-     if session_id not in SESSION_HISTORY:
-         SESSION_HISTORY[session_id] = []
-
-     SESSION_HISTORY[session_id].append({"role": "user", "content": user_text})
-     messages = [{"role": "system", "content": "You are a helpful AI assistant."}] + SESSION_HISTORY[session_id]
-
    if enhancer_enabled:
-         messages.append({"role": "user", "content": f"Enhance response. Tone: {enhancer_tone}. Question: {user_text}"})
-
-     assistant_text = groq_chat_completion(messages)
-     SESSION_HISTORY[session_id].append({"role": "assistant", "content": assistant_text})
    return assistant_text

- # ------------------ PDF handling ------------------
- def handle_pdf_upload(pdf_file, session_id):
-     path = _get_path_from_gr_file(pdf_file)
-     if not path:
-         return "No file uploaded or file unreadable."
    try:
-         reader = PdfReader(path)
-         text = ""
-         for page in reader.pages:
-             text += (page.extract_text() or "") + "\n"
-         if not text.strip():
-             return "No extractable content found in PDF."
-         chunks = chunk_text(text)
-         PDF_CONTENT[session_id] = chunks
-         PDF_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
-         return f"PDF processed: {len(chunks)} chunks ready."
-     except Exception as e:
-         print("PDF upload error:", e)
-         return f"Error processing PDF: {e}"
-
- def handle_pdf_question(question, session_id):
-     if session_id not in PDF_CONTENT:
-         return "Document not found. Upload first."
-     chunk = select_relevant_chunk(question, PDF_CONTENT[session_id], PDF_EMBEDS[session_id])
-     messages = [
-         {"role": "system", "content": "You are a helpful assistant summarizing PDF content."},
-         {"role": "user", "content": f"PDF chunk:\n{chunk}\n\nQuestion: {question}"}
-     ]
-     assistant_text = groq_chat_completion(messages)
-     # Add snippet highlighting for wow factor
-     assistant_text = f"**Snippet from PDF:**\n{chunk[:200]}...\n\n**Answer:**\n{assistant_text}"
-     if session_id not in SESSION_HISTORY:
-         SESSION_HISTORY[session_id] = []
-     SESSION_HISTORY[session_id].append({"role": "assistant", "content": assistant_text})
    return assistant_text

# ------------------ Image OCR ------------------
- def ocr_space_file(image_path, api_key, language="eng"):
-     if not image_path or not os.path.exists(image_path):
-         return ""
    try:
-         with open(image_path, "rb") as f:
-             payload = {"apikey": api_key, "language": language}
-             files = {"file": f}
-             r = requests.post("https://api.ocr.space/parse/image", files=files, data=payload, timeout=60)
        r.raise_for_status()
-         j = r.json()
-         if j.get("IsErroredOnProcessing"):
-             print("OCR.space processing error:", j)
-             return ""
-         parsed = [pr.get("ParsedText", "") for pr in j.get("ParsedResults", [])]
        return "\n".join(parsed)
-     except Exception as e:
-         print("ocr_space_file error:", e)
-         return ""
-
- def handle_image_upload(image_file, session_id):
-     path = _get_path_from_gr_file(image_file)
-     if not path:
-         return "No image uploaded or file unreadable.", ""
-     parsed = ocr_space_file(path, OCR_SPACE_API_KEY)
-     if not parsed.strip():
-         return "No extractable text found in the image.", ""
-     chunks = chunk_text(parsed)
-     IMAGE_TEXT[session_id] = chunks
-     IMAGE_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
-     return f"Image processed: {len(chunks)} chunks ready.", ""
-
- def handle_image_question(question, session_id):
-     if session_id not in IMAGE_TEXT:
-         return "Image not found. Upload first."
-     chunk = select_relevant_chunk(question, IMAGE_TEXT[session_id], IMAGE_EMBEDS[session_id])
-     messages = [
-         {"role": "system", "content": "You are a helpful assistant summarizing image text."},
-         {"role": "user", "content": f"Image chunk:\n{chunk}\n\nQuestion: {question}"}
-     ]
-     assistant_text = groq_chat_completion(messages)
-     assistant_text = f"**Snippet from Image:**\n{chunk[:200]}...\n\n**Answer:**\n{assistant_text}"
-     if session_id not in SESSION_HISTORY:
-         SESSION_HISTORY[session_id] = []
-     SESSION_HISTORY[session_id].append({"role": "assistant", "content": assistant_text})
    return assistant_text

- # ------------------ PDF Generation ------------------
- def generate_pdf_file(text, filename_prefix="summary"):
-     pdf = FPDF()
    pdf.add_page()
-     pdf.set_auto_page_break(auto=True, margin=15)
-     pdf.set_font("Arial", "B", size=14)
-     pdf.multi_cell(0, 8, f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n\n")
-     pdf.set_font("Arial", size=12)
-     for line in text.split("\n"):
-         pdf.multi_cell(0, 6, line)
-     file_path = f"/tmp/{filename_prefix}_{uuid.uuid4()}.pdf"
-     pdf.output(file_path)
-     return file_path

def download_pdf_summary(session_id):
-     summary_text = "\n".join([m["content"] for m in SESSION_HISTORY.get(session_id, []) if m["role"]=="assistant"])
-     if not summary_text:
-         summary_text = "No summary available."
-     return generate_pdf_file(summary_text, "summary")
-
- # ------------------ Voice & Chat Handlers ------------------
- def _append_chat_display(session_id, user_text, assistant_text):
-     if session_id not in CHAT_DISPLAY:
-         CHAT_DISPLAY[session_id] = []
-     CHAT_DISPLAY[session_id].append((user_text, assistant_text))
-
- def handle_voice_general(audio_file, session_id, tts_lang="en", enhancer_enabled=False, enhancer_tone="Helpful"):
-     path = _get_path_from_gr_file(audio_file)
-     if not path:
-         return "No audio provided.", None, []
-     user_text = transcribe_audio(path)
-     assistant_text = generate_response(session_id, user_text, enhancer_enabled, enhancer_tone)
-     _append_chat_display(session_id, user_text, assistant_text)
-     audio_path = synthesize_speech(assistant_text, lang=tts_lang)
-     return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])
-
- def handle_voice_pdf(audio_file, session_id, tts_lang="en"):
-     path = _get_path_from_gr_file(audio_file)
-     if not path:
-         return "No audio provided.", None, []
-     user_text = transcribe_audio(path)
-     assistant_text = handle_pdf_question(user_text, session_id)
-     _append_chat_display(session_id, user_text, assistant_text)
-     audio_path = synthesize_speech(assistant_text, lang=tts_lang)
-     return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])
-
- def handle_voice_image(audio_file, session_id, tts_lang="en"):
-     path = _get_path_from_gr_file(audio_file)
-     if not path:
-         return "No audio provided.", None, []
-     user_text = transcribe_audio(path)
-     assistant_text = handle_image_question(user_text, session_id)
-     _append_chat_display(session_id, user_text, assistant_text)
-     audio_path = synthesize_speech(assistant_text, lang=tts_lang)
-     return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])
-
- def handle_text_general(user_text, session_id, enhancer_enabled=False, enhancer_tone="Helpful"):
-     assistant = generate_response(session_id, user_text, enhancer_enabled, enhancer_tone)
-     _append_chat_display(session_id, user_text, assistant)
-     return assistant, _chat_display_to_messages(CHAT_DISPLAY[session_id])
-
- def handle_text_pdf(question, session_id):
-     return handle_pdf_question(question, session_id)
-
- def handle_text_image(question, session_id):
-     return handle_image_question(question, session_id)
-
- # ------------------ Gradio UI ------------------
with gr.Blocks() as demo:
    gr.HTML("""
    <style>
-     /* Change height + width of the audio recorder box */
-     #mic_box audio {
-         height: 50px !important; /* adjust height */
-         width: 200px !important; /* adjust width (optional) */
-     }
    </style>
    """)
-     gr.Markdown("## 🛠 Multi-Mode AI Assistant (Voice, PDF, Image)")
-
-     session_voice = gr.State(str(uuid.uuid4()))
-     session_pdf = gr.State(str(uuid.uuid4()))
-     session_image = gr.State(str(uuid.uuid4()))
-     # FIX: define pdf_summary_file BEFORE it is used
-     #pdf_summary_file = gr.File(label="Download Summary", visible=False)
-
-     with gr.Tab("🎤 Voice Chat"):
-         chat_voice = gr.Chatbot(t, height=320)
-         with gr.Row():
-             mic = gr.Audio(type="filepath",label="🎤 Record Voice (hold & speak)", elem_id="mic_box")
-             audio_output = gr.Audio(label="Assistant Voice Output", type="filepath", interactive=False)
-             tts_lang = gr.Dropdown(choices=["en", "ur"], value="en", label="TTS Language")
-
-         with gr.Row():
-             btn_general = gr.Button("⚡Ask General 🎯")
-             btn_pdf = gr.Button("⚡Ask PDF 📄")
-             btn_image = gr.Button("⚡Ask Image 🖼")
-             enhancer_toggle = gr.Checkbox(label="Enable Response Enhancer", value=False, scale =1)
-             tone_dropdown = gr.Dropdown(choices=["Helpful", "Formal", "Friendly"], value="Helpful", label="Enhancer Tone", scale =1)
-         with gr.Row():
-             btn_reset_logs = gr.Button(" Reset LOGs")
-             btn_download_logs = gr.Button("📥 Download Summary")
-             Voice_summary_file = gr.File(label="📥Download Summary File", interactive=False,scale =1)
-         #btn_general = gr.Button("⚡Ask General 🎯")
-         #btn_pdf = gr.Button("⚡Ask PDF 📄")
-         #btn_image = gr.Button("⚡Ask Image 🖼")
-         #with gr.Row():
-         #text_input = gr.Textbox(label="Or type a question (General)",visible=False)
-         #btn_send_text = gr.Button("Send (Text General)",visible=False)
-         #btn_reset_logs = gr.Button("♻ Reset LOGs")
-         answer_voice = gr.Textbox(label="Assistant Answer (text)", lines=2, visible=False)
-
-         btn_general.click(fn=handle_voice_general,
-                           inputs=[mic, session_voice, tts_lang, enhancer_toggle, tone_dropdown],
-                           outputs=[answer_voice, audio_output, chat_voice])
-         btn_pdf.click(fn=handle_voice_pdf, inputs=[mic, session_pdf, tts_lang], outputs=[answer_voice, audio_output, chat_voice])
-         btn_image.click(fn=handle_voice_image, inputs=[mic, session_image, tts_lang], outputs=[answer_voice, audio_output, chat_voice])
-         # btn_send_text.click(fn=handle_text_general, inputs=[text_input, session_voice, enhancer_toggle, tone_dropdown], outputs=[answer_voice, chat_voice])
-         btn_reset_logs.click(lambda: (str(uuid.uuid4()), [], None, None, ""), outputs=[session_voice, chat_voice, mic, audio_output, answer_voice])
-         btn_download_logs.click(download_pdf_summary, inputs=[session_voice], outputs=[Voice_summary_file])
-
-     with gr.Tab("📄 PDF Summarizer"):
-         pdf_output = gr.Textbox(label="Answer (Text Only)", lines=5)
-         with gr.Row():
-             pdf_upload_btn = gr.File(label="Upload PDF", file_types=[".pdf"], scale=1 )
-             pdf_question = gr.Textbox(label="Ask a question about PDF (text)", lines=3)
-             pdf_upload_msg = gr.Textbox(label="Upload Status", interactive=False)
-
-         with gr.Row():
-             pdf_send_btn = gr.Button("Ask (Questions)")
-             pdf_reset_btn = gr.Button(" Reset LOGs")
-         with gr.Row():
-             pdf_summary_file = gr.File(label="📥Download Summary File", interactive=False,scale =1)
-             pdf_download_btn = gr.Button("📥 Download Summary")
-
-         pdf_upload_btn.upload(handle_pdf_upload, inputs=[pdf_upload_btn, session_pdf], outputs=[pdf_upload_msg])
-         pdf_send_btn.click(handle_text_pdf, inputs=[pdf_question, session_pdf], outputs=[pdf_output])
-         pdf_reset_btn.click(lambda: (str(uuid.uuid4()), ""), outputs=[session_pdf, pdf_output])
-         pdf_download_btn.click(download_pdf_summary, inputs=[session_pdf], outputs=[pdf_summary_file])
-
-     with gr.Tab("🖼 Image OCR"):
-         image_output = gr.Textbox(label="Answer (Text Only)", lines=5)
-         with gr.Row():
-             image_upload_btn = gr.File(label="Upload Image", file_types=[".png", ".jpg", ".jpeg"], scale =1 )
-             image_question = gr.Textbox(label="Ask question about Image", lines=3)
-             image_upload_msg = gr.Textbox(label="Upload Status", interactive=False)
-
-         with gr.Row():
-             image_send_btn = gr.Button("Ask (Questions)")
-             image_reset_btn = gr.Button("♻ Reset LOGs")
-         with gr.Row():
-             image_summary_file = gr.File(label="📥Download Summary File", interactive=False,scale =1)
-             image_download_btn = gr.Button("📥 Download Summary")
-
-         image_upload_btn.upload(handle_image_upload, inputs=[image_upload_btn, session_image], outputs=[image_upload_msg, image_output])
-         image_send_btn.click(handle_text_image, inputs=[image_question, session_image], outputs=[image_output])
-         image_reset_btn.click(lambda: (str(uuid.uuid4()), ""), outputs=[session_image, image_output])
-         image_download_btn.click(download_pdf_summary, inputs=[session_image], outputs=[image_summary_file])
-
- if __name__ == "__main__":
-     demo.launch()

# app.py
"""
+ Multi-Mode AI Assistant (Voice, PDF, Image) with full colorful website-like UI
+ - All functionality remains intact
+ - Custom tabs, bright buttons, dark background, visible text
"""
import os
import uuid

from fpdf import FPDF
from datetime import datetime

+ # ------------------ API Keys ------------------
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()
OCR_SPACE_API_KEY = os.getenv("OCR_SPACE_API_KEY", "").strip()
+ if not GROQ_API_KEY: raise ValueError("❌ GROQ_API_KEY missing.")
+ if not OCR_SPACE_API_KEY: raise ValueError("❌ OCR_SPACE_API_KEY missing.")
HEADERS = {"Authorization": f"Bearer {GROQ_API_KEY}"}

# ------------------ Global State ------------------
IMAGE_TEXT = {}
IMAGE_EMBEDS = {}
CHUNK_SIZE = 1500
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# ------------------ Helpers ------------------
def _get_path_from_gr_file(gr_file):
+     if not gr_file: return None
+     if isinstance(gr_file,str) and os.path.exists(gr_file): return gr_file
    try:
+         if hasattr(gr_file,"name") and os.path.exists(gr_file.name): return gr_file.name
+     except: pass
+     if isinstance(gr_file,dict):
+         for key in ("name","file_name","filepath"):
            if key in gr_file:
+                 candidate=gr_file.get(key)
+                 if isinstance(candidate,str) and os.path.exists(candidate): return candidate
    return None
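
One small regression in this rewrite: the compacted `except: pass` is a bare except, which also swallows `KeyboardInterrupt` and `SystemExit`; the previous version's `except Exception:` avoided that. A minimal sketch of the same guard without the side effect:

    try:
        if hasattr(gr_file, "name") and os.path.exists(gr_file.name):
            return gr_file.name
    except Exception:
        pass  # non-fatal: fall through to the dict-based lookups below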

+ def chunk_text(text,size=CHUNK_SIZE):
+     return [text[i:i+size] for i in range(0,len(text),size)]
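
`chunk_text` still slices at fixed 1500-character offsets, which can cut a sentence mid-word at every chunk boundary and weaken retrieval. A hedged variant (the `overlap` parameter and name are hypothetical, not part of this commit) keeps consecutive chunks sharing a tail:

    def chunk_text_overlap(text, size=CHUNK_SIZE, overlap=200):
        # step < size, so each chunk repeats the last `overlap` characters of the previous one
        step = max(1, size - overlap)
        return [text[i:i + size] for i in range(0, len(text), step)]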

+ def synthesize_speech(text,lang="en"):
+     if not text: return None
+     tmp=tempfile.NamedTemporaryFile(delete=False,suffix=".mp3")
+     gTTS(text=text,lang=lang).save(tmp.name)
+     return tmp.name
+
+ def select_relevant_chunk(question,chunks,chunk_embeds):
+     if not chunks or chunk_embeds is None: return ""
+     q_emb=embed_model.encode(question,convert_to_tensor=True)
+     scores=util.cos_sim(q_emb,chunk_embeds)[0]
+     top_idx=int(scores.argmax().item())
    return chunks[top_idx]
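
For context: `util.cos_sim(q_emb, chunk_embeds)` returns a (1, num_chunks) similarity matrix, so `[0]` leaves one score per chunk and `argmax` picks the single best chunk. If one chunk proves too little context, a top-k variant is a natural extension (a sketch only; `k` and the joining strategy are assumptions, not in this commit):

    def select_top_k_chunks(question, chunks, chunk_embeds, k=3):
        q_emb = embed_model.encode(question, convert_to_tensor=True)
        scores = util.cos_sim(q_emb, chunk_embeds)[0]
        # take the k highest-scoring chunks and concatenate them for the prompt
        top = scores.topk(min(k, len(chunks))).indices.tolist()
        return "\n\n".join(chunks[i] for i in top)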

def _chat_display_to_messages(chat_display):
+     msgs=[]
+     for user,assistant in chat_display:
+         msgs.append({"role":"user","content":user})
+         msgs.append({"role":"assistant","content":assistant})
    return msgs

+ # ------------------ LLM & Transcription ------------------
def transcribe_audio(audio_path):
    if not audio_path or not os.path.exists(audio_path):
        return "Error: audio file missing."
    try:
+         url="https://api.groq.com/openai/v1/audio/transcriptions"
+         with open(audio_path,"rb") as f:
+             files={"file":(os.path.basename(audio_path),f,"audio/wav")}
+             data={"model":"whisper-large-v3"}
+             resp=requests.post(url,headers=HEADERS,files=files,data=data,timeout=60)
        resp.raise_for_status()
+         return resp.json().get("text","") or ""
    except Exception as e:
+         print("Transcription error:",e)
        return f"Error transcribing audio: {e}"

def groq_chat_completion(messages):
    try:
+         body={"model":"llama-3.1-8b-instant","messages":messages}
+         resp=requests.post("https://api.groq.com/openai/v1/chat/completions",headers=HEADERS,json=body,timeout=60)
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
    except Exception as e:
+         print("Chat error:",e)
        return f"Error generating response: {e}"

+ def generate_response(session_id,user_text,enhancer_enabled=False,enhancer_tone="Helpful"):
+     if session_id not in SESSION_HISTORY: SESSION_HISTORY[session_id]=[]
+     SESSION_HISTORY[session_id].append({"role":"user","content":user_text})
+     messages=[{"role":"system","content":"You are a helpful AI assistant."}]+SESSION_HISTORY[session_id]
    if enhancer_enabled:
+         messages.append({"role":"user","content":f"Enhance response. Tone: {enhancer_tone}. Question: {user_text}"})
+     assistant_text=groq_chat_completion(messages)
+     SESSION_HISTORY[session_id].append({"role":"assistant","content":assistant_text})
    return assistant_text
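
Note that with the enhancer on, the question reaches the model twice: once from SESSION_HISTORY and again inside the enhancer instruction. One possible cleanup (a sketch, not the committed behavior) folds the tone into the system prompt instead:

    def generate_response(session_id, user_text, enhancer_enabled=False, enhancer_tone="Helpful"):
        history = SESSION_HISTORY.setdefault(session_id, [])
        history.append({"role": "user", "content": user_text})
        system = "You are a helpful AI assistant."
        if enhancer_enabled:
            # steer tone via the system prompt rather than repeating the question
            system += f" Respond in a {enhancer_tone.lower()} tone."
        assistant_text = groq_chat_completion([{"role": "system", "content": system}] + history)
        history.append({"role": "assistant", "content": assistant_text})
        return assistant_text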

+ # ------------------ PDF ------------------
+ def handle_pdf_upload(pdf_file,session_id):
+     path=_get_path_from_gr_file(pdf_file)
+     if not path: return "No file uploaded."
    try:
+         reader=PdfReader(path)
+         text="".join([page.extract_text() or "" for page in reader.pages])
+         if not text.strip(): return "No extractable content."
+         chunks=chunk_text(text)
+         PDF_CONTENT[session_id]=chunks
+         PDF_EMBEDS[session_id]=embed_model.encode(chunks,convert_to_tensor=True)
+         return f"PDF processed: {len(chunks)} chunks."
+     except Exception as e: return f"PDF error: {e}"
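
Small regression here: the old loop appended "\n" after every page, while the new `"".join(...)` concatenates pages with no separator, so the last word of one page can fuse with the first word of the next and end up garbled inside a retrieval chunk. Restoring the separator is a one-line change:

    text = "\n".join(page.extract_text() or "" for page in reader.pages)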
+
+ def handle_pdf_question(question,session_id):
+     if session_id not in PDF_CONTENT: return "Upload PDF first."
+     chunk=select_relevant_chunk(question,PDF_CONTENT[session_id],PDF_EMBEDS[session_id])
+     messages=[{"role":"system","content":"Summarize PDF."},{"role":"user","content":f"PDF chunk:\n{chunk}\n\nQuestion:{question}"}]
+     assistant_text=groq_chat_completion(messages)
+     assistant_text=f"**Snippet from PDF:**\n{chunk[:200]}...\n\n**Answer:**\n{assistant_text}"
+     if session_id not in SESSION_HISTORY: SESSION_HISTORY[session_id]=[]
+     SESSION_HISTORY[session_id].append({"role":"assistant","content":assistant_text})
    return assistant_text

# ------------------ Image OCR ------------------
+ def ocr_space_file(image_path,api_key,language="eng"):
+     if not image_path or not os.path.exists(image_path): return ""
    try:
+         with open(image_path,"rb") as f:
+             payload={"apikey":api_key,"language":language}
+             files={"file":f}
+             r=requests.post("https://api.ocr.space/parse/image",files=files,data=payload,timeout=60)
        r.raise_for_status()
+         j=r.json()
+         if j.get("IsErroredOnProcessing"): return ""
+         parsed=[pr.get("ParsedText","") for pr in j.get("ParsedResults",[])]
        return "\n".join(parsed)
+     except Exception as e: print("OCR error:",e); return ""
+
+ def handle_image_upload(image_file,session_id):
+     path=_get_path_from_gr_file(image_file)
+     if not path: return "No image uploaded.",""
+     parsed=ocr_space_file(path,OCR_SPACE_API_KEY)
+     if not parsed.strip(): return "No text found.",""
+     chunks=chunk_text(parsed)
+     IMAGE_TEXT[session_id]=chunks
+     IMAGE_EMBEDS[session_id]=embed_model.encode(chunks,convert_to_tensor=True)
+     return f"Image processed: {len(chunks)} chunks.",""
+
+ def handle_image_question(question,session_id):
+     if session_id not in IMAGE_TEXT: return "Upload Image first."
+     chunk=select_relevant_chunk(question,IMAGE_TEXT[session_id],IMAGE_EMBEDS[session_id])
+     messages=[{"role":"system","content":"Summarize Image text"},{"role":"user","content":f"Image chunk:\n{chunk}\n\nQuestion:{question}"}]
+     assistant_text=groq_chat_completion(messages)
+     assistant_text=f"**Snippet from Image:**\n{chunk[:200]}...\n\n**Answer:**\n{assistant_text}"
+     if session_id not in SESSION_HISTORY: SESSION_HISTORY[session_id]=[]
+     SESSION_HISTORY[session_id].append({"role":"assistant","content":assistant_text})
    return assistant_text

+ # ------------------ PDF Download ------------------
+ def generate_pdf_file(text,filename_prefix="summary"):
+     pdf=FPDF()
    pdf.add_page()
+     pdf.set_auto_page_break(True,margin=15)
+     pdf.set_font("Arial","B",14)
+     pdf.multi_cell(0,8,f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n\n")
+     pdf.set_font("Arial","",12)
+     for line in text.split("\n"): pdf.multi_cell(0,6,line)
+     path=f"/tmp/{filename_prefix}_{uuid.uuid4()}.pdf"
+     pdf.output(path)
+     return path
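
A caveat on this download path: with FPDF's built-in Arial font, `pdf.output()` generally raises `UnicodeEncodeError` on characters outside latin-1, and the stored assistant answers can contain emoji or other non-latin text. A defensive sketch, assuming the classic core-font limitation (this helper is not part of the commit):

    def safe_pdf_text(text):
        # built-in fonts only cover latin-1; replace everything else with '?'
        return text.encode("latin-1", "replace").decode("latin-1")

Passing each line through `safe_pdf_text` before `pdf.multi_cell(0,6,line)` keeps the summary download from failing on such content.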
 
def download_pdf_summary(session_id):
+     summary="\n".join([m["content"] for m in SESSION_HISTORY.get(session_id,[]) if m["role"]=="assistant"])
+     if not summary: summary="No summary available."
+     return generate_pdf_file(summary,"summary")
+
+ # ------------------ Voice ------------------
+ def _append_chat_display(session_id,user_text,assistant_text):
+     if session_id not in CHAT_DISPLAY: CHAT_DISPLAY[session_id]=[]
+     CHAT_DISPLAY[session_id].append((user_text,assistant_text))
+
+ def handle_voice_general(audio_file,session_id,tts_lang="en",enhancer_enabled=False,enhancer_tone="Helpful"):
+     path=_get_path_from_gr_file(audio_file)
+     if not path: return "No audio",None,[]
+     user_text=transcribe_audio(path)
+     assistant_text=generate_response(session_id,user_text,enhancer_enabled,enhancer_tone)
+     _append_chat_display(session_id,user_text,assistant_text)
+     audio_path=synthesize_speech(assistant_text,tts_lang)
+     return assistant_text,audio_path,_chat_display_to_messages(CHAT_DISPLAY[session_id])
+
+ def handle_voice_pdf(audio_file,session_id,tts_lang="en"):
+     path=_get_path_from_gr_file(audio_file)
+     if not path: return "No audio",None,[]
+     user_text=transcribe_audio(path)
+     assistant_text=handle_pdf_question(user_text,session_id)
+     _append_chat_display(session_id,user_text,assistant_text)
+     audio_path=synthesize_speech(assistant_text,tts_lang)
+     return assistant_text,audio_path,_chat_display_to_messages(CHAT_DISPLAY[session_id])
+
+ def handle_voice_image(audio_file,session_id,tts_lang="en"):
+     path=_get_path_from_gr_file(audio_file)
+     if not path: return "No audio",None,[]
+     user_text=transcribe_audio(path)
+     assistant_text=handle_image_question(user_text,session_id)
+     _append_chat_display(session_id,user_text,assistant_text)
+     audio_path=synthesize_speech(assistant_text,tts_lang)
+     return assistant_text,audio_path,_chat_display_to_messages(CHAT_DISPLAY[session_id])
+
+ def handle_text_general(user_text,session_id,enhancer_enabled=False,enhancer_tone="Helpful"):
+     assistant=generate_response(session_id,user_text,enhancer_enabled,enhancer_tone)
+     _append_chat_display(session_id,user_text,assistant)
+     return assistant,_chat_display_to_messages(CHAT_DISPLAY[session_id])
+
+ def handle_text_pdf(question,session_id): return handle_pdf_question(question,session_id)
+ def handle_text_image(question,session_id): return handle_image_question(question,session_id)
+
+ # ------------------ Gradio Custom Tabs UI ------------------
with gr.Blocks() as demo:
    gr.HTML("""
    <style>
+     body{background:#0f172a;color:#f8fafc;font-family:sans-serif;}
+     h2{color:#facc15;}
+     .tab-btn{background:linear-gradient(90deg,#f472b6,#3b82f6);color:#fff;padding:10px;border-radius:12px;margin-right:5px;font-weight:bold;}
+     .tab-btn:hover{background:linear-gradient(90deg,#3b82f6,#f472b6);}
+     .hidden{display:none;}
+     .visible{display:block;}
+     .chat-msg.user{background:#6b7280;color:#fff;padding:5px;border-radius:10px;margin:3px 0;}
+     .chat-msg.assistant{background:#f59e0b;color:#111;padding:5px;border-radius:10px;margin:3px 0;}
+     .gr-button{background:linear-gradient(90deg,#f472b6,#3b82f6);color:#fff;font-weight:bold;border-radius:12px;padding:10px;}
+     .gr-button:hover{background:linear-gradient(90deg,#3b82f6,#f472b6);}
+     .gr-textbox,.gr-file,.gr-dropdown,.gr-checkbox{background:#1e293b;color:#f8fafc;border-radius:8px;border:1px solid #facc15;}
    </style>
    """)
+     gr.Markdown("## 🌟 Multi-Mode AI Assistant (Voice, PDF, Image)")
+
+     # Session States
+     session_voice=gr.State(str(uuid.uuid4()))
+     session_pdf=gr.State(str(uuid.uuid4()))
+     session_image=gr.State(str(uuid.uuid4()))
+
+     # --- Tab Buttons ---
+     with gr.Row():
+         btn_voice_tab=gr.Button("🎤 Voice Chat",elem_id="btn_voice")
+         btn_pdf_tab=gr.Button("📄 PDF Summarizer",elem_id="btn_pdf")
+         btn_image_tab=gr.Button("🖼 Image OCR",elem_id="btn_image")
+
+     # --- Voice Chat Column ---
+     col_voice=gr.Column(visible=True)
+     chat_voice=gr.Chatbot(height=300,parent=col_voice)
+     with gr.Row(parent=col_voice):
+         mic=gr.Audio(type="filepath",label="🎤 Record Voice")
+         audio_output=gr.Audio(type="filepath",label="Assistant Voice",interactive=False)
+         tts_lang=gr.Dropdown(choices=["en","ur"],value="en",label="TTS Language")
+     with gr.Row(parent=col_voice):
+         btn_general=gr.Button("⚡Ask General")
+         btn_pdf=gr.Button("⚡Ask PDF")
+         btn_image=gr.Button("⚡Ask Image")
+         enhancer_toggle=gr.Checkbox(label="Enable Enhancer",value=False)
+         tone_dropdown=gr.Dropdown(choices=["Helpful","Formal","Friendly"],value="Helpful",label="Tone")
+     with gr.Row(parent=col_voice):
+         btn_reset=gr.Button(" Reset Logs")
+         btn_download=gr.Button("📥 Download Summary")
+         summary_file=gr.File(interactive=False)
+     answer_voice=gr.Textbox(visible=False,parent=col_voice)
+
+     # --- PDF Column ---
+     col_pdf=gr.Column(visible=False)
+     pdf_output=gr.Textbox(lines=5,parent=col_pdf)
+     pdf_upload=gr.File(file_types=[".pdf"],parent=col_pdf)
+     pdf_question=gr.Textbox(lines=3,label="Ask PDF Question",parent=col_pdf)
+     pdf_upload_msg=gr.Textbox(interactive=False,parent=col_pdf)
+     pdf_send=gr.Button("Ask",parent=col_pdf)
+     pdf_reset=gr.Button("♻ Reset Logs",parent=col_pdf)
+     pdf_summary_file=gr.File(interactive=False,parent=col_pdf)
+     pdf_download=gr.Button("📥 Download Summary",parent=col_pdf)
+
+     # --- Image Column ---
+     col_image=gr.Column(visible=False)
+     image_output=gr.Textbox(lines=5,parent=col_image)
+     image_upload=gr.File(file_types=[".png",".jpg",".jpeg"],parent=col_image)
+     image_question=gr.Textbox(lines=3,label="Ask Image Question",parent=col_image)
+     image_upload_msg=gr.Textbox(interactive=False,parent=col_image)
+     image_send=gr.Button("Ask",parent=col_image)
+     image_reset=gr.Button("♻ Reset Logs",parent=col_image)
+     image_summary_file=gr.File(interactive=False,parent=col_image)
+     image_download=gr.Button("📥 Download Summary",parent=col_image)
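
A note on the `parent=` pattern used throughout these columns: Gradio's Blocks API places components by nesting them inside layout context managers, and `parent` is not a documented constructor argument as far as I can tell, so these components may land outside their columns or the keyword may be rejected outright. The conventional layout, sketched for the image column only:

    with gr.Column(visible=False) as col_image:
        image_output = gr.Textbox(lines=5)
        image_upload = gr.File(file_types=[".png", ".jpg", ".jpeg"])
        image_question = gr.Textbox(lines=3, label="Ask Image Question")
        # ...remaining image components declared inside the same with-block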
+
+     # --- Tab Switching Logic ---
+     def switch_tab(tab_name):
+         return (
+             tab_name=="voice",
+             tab_name=="pdf",
+             tab_name=="image"
+         )
+     btn_voice_tab.click(lambda:switch_tab("voice"),outputs=[col_voice,col_pdf,col_image])
+     btn_pdf_tab.click(lambda:switch_tab("pdf"),outputs=[col_voice,col_pdf,col_image])
+     btn_image_tab.click(lambda:switch_tab("image"),outputs=[col_voice,col_pdf,col_image])
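
Returning raw booleans to Column outputs is unlikely to toggle visibility: Gradio interprets returned values as new component values, and visibility changes normally go through `gr.update(...)`. A sketch of the usual pattern (assuming a Gradio version where `gr.update` is available):

    def switch_tab(tab_name):
        # one update per output component, changing only visibility
        return (
            gr.update(visible=tab_name == "voice"),
            gr.update(visible=tab_name == "pdf"),
            gr.update(visible=tab_name == "image"),
        )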
+
+     # --- Voice Click Handlers ---
+     btn_general.click(fn=handle_voice_general,inputs=[mic,session_voice,tts_lang,enhancer_toggle,tone_dropdown],outputs=[answer_voice,audio_output,chat_voice])
+     btn_pdf.click(fn=handle_voice_pdf,inputs=[mic,session_pdf,tts_lang],outputs=[answer_voice,audio_output,chat_voice])
+     btn_image.click(fn=handle_voice_image,inputs=[mic,session_image,tts_lang],outputs=[answer_voice,audio_output,chat_voice])
+     btn_reset.click(lambda:(str(uuid.uuid4()),[],None,None,""),outputs=[session_voice,chat_voice,mic,audio_output,answer_voice])
+     btn_download.click(download_pdf_summary,inputs=[session_voice],outputs=[summary_file])
+
+     # --- PDF Handlers ---
+     pdf_upload.upload(handle_pdf_upload,inputs=[pdf_upload,session_pdf],outputs=[pdf_upload_msg])
+     pdf_send.click(handle_text_pdf,inputs=[pdf_question,session_pdf],outputs=[pdf_output])
+     pdf_reset.click(lambda:(str(uuid.uuid4()),""),outputs=[session_pdf,pdf_output])
+     pdf_download.click(download_pdf_summary,inputs=[session_pdf],outputs=[pdf_summary_file])
+
+     # --- Image Handlers ---
+     image_upload.upload(handle_image_upload,inputs=[image_upload,session_image],outputs=[image_upload_msg,image_output])
+     image_send.click(handle_text_image,inputs=[image_question,session_image],outputs=[image_output])
+     image_reset.click(lambda:(str(uuid.uuid4()),""),outputs=[session_image,image_output])
+     image_download.click(download_pdf_summary,inputs=[session_image],outputs=[image_summary_file])
+
+ demo.launch()
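
Finally, the `if __name__ == "__main__":` guard from the previous version was dropped, so `demo.launch()` now runs at import time. That is harmless on a Hugging Face Space, but restoring the guard keeps the module importable elsewhere:

    if __name__ == "__main__":
        demo.launch()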