Spaces:

asad9641
/

AI-Powered-Talk-Bot

Sleeping

App Files Files Community

asad9641 committed on Nov 18, 2025

Commit

52a8c9a

verified ·

1 Parent(s): b0d4c97

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -124

app.py CHANGED Viewed

@@ -1,32 +1,25 @@
 # app.py
 """
 Multi-Mode AI Assistant (Voice, PDF, Image)
-- Fixed Gradio v4+ Audio usage (no source=...).
-- Chatbot uses type="messages" (openai-style {"role","content"} dicts).
-- Voice tab: single mic + three buttons (Ask General / Ask PDF / Ask Image).
-- PDF tab: upload + text questions only (no voice controls).
-- PDF summary download returns a temporary .pdf file for gr.File.
-- OCR uses OCR.space (OCR_SPACE_API_KEY).
-- Uses Groq endpoints for transcription + chat completions (GROQ_API_KEY).
-- Embeddings via sentence-transformers (all-MiniLM-L6-v2).
 """
 import os
 import uuid
 import tempfile
 import requests
 from dotenv import load_dotenv
 from gtts import gTTS
 from PyPDF2 import PdfReader
 import gradio as gr
 from sentence_transformers import SentenceTransformer, util
 from fpdf import FPDF
-from datetime import datetime
 # ------------------ Load API Keys ------------------
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()
 OCR_SPACE_API_KEY = os.getenv("OCR_SPACE_API_KEY", "").strip()
 if not GROQ_API_KEY:
     raise ValueError("❌ GROQ_API_KEY missing. Set it in env / Hugging Face Secrets.")
 if not OCR_SPACE_API_KEY:
@@ -34,20 +27,17 @@ if not OCR_SPACE_API_KEY:
 HEADERS = {"Authorization": f"Bearer {GROQ_API_KEY}"}
-# ------------------ Global State ------------------
-SESSION_HISTORY = {}   # session_id -> list of {"role","content"} messages for LLM
-CHAT_DISPLAY = {}      # session_id -> list of (user_text, assistant_text) tuples (kept for conversion)
-PDF_CONTENT = {}       # session_id -> list of chunks (strings)
-PDF_EMBEDS = {}        # session_id -> embeddings tensor
-IMAGE_TEXT = {}        # session_id -> list of image-text chunks
-IMAGE_EMBEDS = {}      # session_id -> embeddings tensor
 CHUNK_SIZE = 1500
-# Load embedding model once (can be heavy)
 embed_model = SentenceTransformer("all-MiniLM-L6-v2")
 # ------------------ Helpers ------------------
 def _get_path_from_gr_file(gr_file):
     if not gr_file:
@@ -55,22 +45,18 @@ def _get_path_from_gr_file(gr_file):
     if isinstance(gr_file, str) and os.path.exists(gr_file):
         return gr_file
     try:
-        if hasattr(gr_file, "name") and isinstance(gr_file.name, str) and os.path.exists(gr_file.name):
             return gr_file.name
-    except Exception:
         pass
     if isinstance(gr_file, dict):
         for key in ("name", "file_name", "filepath"):
-            if key in gr_file:
-                candidate = gr_file.get(key)
-                if isinstance(candidate, str) and os.path.exists(candidate):
-                    return candidate
     return None
 def chunk_text(text, size=CHUNK_SIZE):
-    return [text[i:i + size] for i in range(0, len(text), size)]
 def synthesize_speech(text, lang="en"):
     try:
@@ -83,7 +69,6 @@ def synthesize_speech(text, lang="en"):
         print("TTS error:", e)
         return None
 def select_relevant_chunk(question, chunks, chunk_embeds):
     if not chunks or chunk_embeds is None:
         return ""
@@ -92,7 +77,6 @@ def select_relevant_chunk(question, chunks, chunk_embeds):
     top_idx = int(scores.argmax().item())
     return chunks[top_idx]
 def _chat_display_to_messages(chat_display):
     msgs = []
     for user, assistant in chat_display:
@@ -100,8 +84,12 @@ def _chat_display_to_messages(chat_display):
         msgs.append({"role": "assistant", "content": assistant})
     return msgs
-# ------------------ Transcription & LLM ------------------
 def transcribe_audio(audio_path):
     if not audio_path or not os.path.exists(audio_path):
         return "Error: audio file missing."
@@ -117,7 +105,6 @@ def transcribe_audio(audio_path):
         print("transcription error:", e)
         return f"Error transcribing audio: {e}"
 def generate_response(session_id, user_text):
     if session_id not in SESSION_HISTORY:
         SESSION_HISTORY[session_id] = []
@@ -134,29 +121,25 @@ def generate_response(session_id, user_text):
         print("generate_response error:", e)
         return f"Error generating response: {e}"
-# ------------------ PDF handling ------------------
 def handle_pdf_upload(pdf_file, session_id):
     path = _get_path_from_gr_file(pdf_file)
     if not path:
         return "No file uploaded or file unreadable."
     try:
         reader = PdfReader(path)
-        text = ""
-        for page in reader.pages:
-            text += (page.extract_text() or "") + "\n"
         if not text.strip():
             return "No extractable content found in PDF."
         chunks = chunk_text(text)
         PDF_CONTENT[session_id] = chunks
         PDF_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
-        return f"PDF processed: {len(chunks)} chunks ready."
     except Exception as e:
         print("PDF upload error:", e)
         return f"Error processing PDF: {e}"
-def handle_pdf_question(question, session_id):
     if session_id not in PDF_CONTENT:
         return "Document not found. Upload first."
     chunk = select_relevant_chunk(question, PDF_CONTENT[session_id], PDF_EMBEDS[session_id])
@@ -173,34 +156,6 @@ def handle_pdf_question(question, session_id):
         print("PDF question error:", e)
         return f"Error generating response: {e}"
-# ------------------ PDF Generation ------------------
-def generate_pdf_file(text, filename_prefix="summary"):
-    pdf = FPDF()
-    pdf.add_page()
-    pdf.set_auto_page_break(auto=True, margin=15)
-    pdf.set_font("Arial", size=12)
-    for line in text.split("\n"):
-        pdf.multi_cell(0, 6, line)
-    file_path = f"/tmp/{filename_prefix}_{uuid.uuid4()}.pdf"
-    pdf.output(file_path)
-    return file_path
-def download_pdf_summary(session_pdf_id):
-    summary_text = "\n".join([msg["content"] for msg in SESSION_HISTORY.get(session_pdf_id, []) if msg["role"]=="assistant"])
-    if not summary_text:
-        summary_text = "No summary available."
-    return generate_pdf_file(summary_text, "pdf_summary")
-def download_image_summary(session_image_id):
-    summary_text = "\n".join([msg["content"] for msg in SESSION_HISTORY.get(session_image_id, []) if msg["role"]=="assistant"])
-    if not summary_text:
-        summary_text = "No summary available."
-    return generate_pdf_file(summary_text, "image_summary")
 # ------------------ Image OCR ------------------
 def ocr_space_file(image_path, api_key, language="eng"):
     if not image_path or not os.path.exists(image_path):
@@ -213,7 +168,7 @@ def ocr_space_file(image_path, api_key, language="eng"):
         r.raise_for_status()
         j = r.json()
         if j.get("IsErroredOnProcessing"):
-            print("OCR.space processing error:", j)
             return ""
         parsed = [pr.get("ParsedText", "") for pr in j.get("ParsedResults", [])]
         return "\n".join(parsed)
@@ -221,11 +176,10 @@ def ocr_space_file(image_path, api_key, language="eng"):
         print("ocr_space_file error:", e)
         return ""
 def handle_image_upload(image_file, session_id):
     path = _get_path_from_gr_file(image_file)
     if not path:
-        return "No image uploaded or file unreadable.", ""
     parsed = ocr_space_file(path, OCR_SPACE_API_KEY)
     if not parsed.strip():
         return "No extractable text found in the image.", ""
@@ -234,8 +188,7 @@ def handle_image_upload(image_file, session_id):
     IMAGE_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
     return f"Image processed: {len(chunks)} chunks ready.", ""
-def handle_image_question(question, session_id):
     if session_id not in IMAGE_TEXT:
         return "Image not found. Upload first."
     chunk = select_relevant_chunk(question, IMAGE_TEXT[session_id], IMAGE_EMBEDS[session_id])
@@ -252,14 +205,31 @@ def handle_image_question(question, session_id):
         print("Image question error:", e)
         return f"Error generating response: {e}"
-# ------------------ Voice routing (single mic) ------------------
-def _append_chat_display(session_id, user_text, assistant_text):
-    if session_id not in CHAT_DISPLAY:
-        CHAT_DISPLAY[session_id] = []
-    CHAT_DISPLAY[session_id].append((user_text, assistant_text))
 def handle_voice_general(audio_file, session_id, tts_lang="en"):
     path = _get_path_from_gr_file(audio_file)
     if not path:
@@ -270,44 +240,32 @@ def handle_voice_general(audio_file, session_id, tts_lang="en"):
     audio_path = synthesize_speech(assistant_text, lang=tts_lang)
     return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])
 def handle_voice_pdf(audio_file, session_id, tts_lang="en"):
     path = _get_path_from_gr_file(audio_file)
     if not path:
         return "No audio provided.", None, []
     user_text = transcribe_audio(path)
-    assistant_text = handle_pdf_question(user_text, session_id)
     _append_chat_display(session_id, user_text, assistant_text)
     audio_path = synthesize_speech(assistant_text, lang=tts_lang)
     return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])
 def handle_voice_image(audio_file, session_id, tts_lang="en"):
     path = _get_path_from_gr_file(audio_file)
     if not path:
         return "No audio provided.", None, []
     user_text = transcribe_audio(path)
-    assistant_text = handle_image_question(user_text, session_id)
     _append_chat_display(session_id, user_text, assistant_text)
     audio_path = synthesize_speech(assistant_text, lang=tts_lang)
     return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])
-# ------------------ Text handlers ------------------
 def handle_text_general(user_text, session_id):
     assistant = generate_response(session_id, user_text)
     _append_chat_display(session_id, user_text, assistant)
     return assistant, _chat_display_to_messages(CHAT_DISPLAY[session_id])
-def handle_text_pdf(question, session_id):
-    return handle_pdf_question(question, session_id)
-def handle_text_image(question, session_id):
-    return handle_image_question(question, session_id)
 # ------------------ Gradio UI ------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("## 🛠 Multi-Mode AI Assistant (Voice, PDF, Image)")
@@ -318,54 +276,54 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     # ---------------- Voice Tab ----------------
     with gr.Tab("🎤 Voice Chat"):
-        chat_voice = gr.Chatbot(type="messages", height=380)
         with gr.Row():
-            mic = gr.Audio(type="filepath", label="🎤 Record Voice (hold & speak)")
-            tts_lang = gr.Dropdown(choices=["en", "ur"], value="en", label="TTS Language")
         with gr.Row():
-            btn_general = gr.Button("Ask General (from recorded voice)")
-            btn_pdf = gr.Button("Ask PDF (from recorded voice)")
-            btn_image = gr.Button("Ask Image (from recorded voice)")
         answer_voice = gr.Textbox(label="Assistant Answer (text)", lines=4)
-        audio_output = gr.Audio(label="Assistant Voice Output", type="filepath")
         with gr.Row():
-            text_input = gr.Textbox(label="Or type a question (General)", placeholder="Type message here...")
-            btn_send_text = gr.Button("Send (Text General)")
         btn_general.click(
-            handle_voice_general,
             inputs=[mic, session_voice, tts_lang],
             outputs=[answer_voice, audio_output, chat_voice],
         )
         btn_pdf.click(
-            handle_voice_pdf,
             inputs=[mic, session_pdf, tts_lang],
             outputs=[answer_voice, audio_output, chat_voice],
         )
         btn_image.click(
-            handle_voice_image,
             inputs=[mic, session_image, tts_lang],
             outputs=[answer_voice, audio_output, chat_voice],
         )
         btn_send_text.click(
-            handle_text_general,
             inputs=[text_input, session_voice],
             outputs=[answer_voice, chat_voice],
         )
     # ---------------- PDF Tab ----------------
     with gr.Tab("📄 PDF Summarizer"):
-        pdf_output = gr.Textbox(label="Answer (Text Only)", lines=12)
-        pdf_summary_file = gr.File(label="Download Summary File")
         with gr.Row():
-            pdf_upload_btn = gr.File(label="Upload PDF", file_types=[".pdf"])
-            pdf_upload_msg = gr.Textbox(label="Upload Status", interactive=False)
         pdf_question = gr.Textbox(label="Ask a question about PDF (text)", lines=2)
-        pdf_send_btn = gr.Button("Ask (Text)")
-        pdf_reset_btn = gr.Button("♻ Reset PDF")
-        pdf_download_btn = gr.Button("📥 Download Summary")
         pdf_upload_btn.upload(handle_pdf_upload, inputs=[pdf_upload_btn, session_pdf], outputs=[pdf_upload_msg])
         pdf_send_btn.click(handle_text_pdf, inputs=[pdf_question, session_pdf], outputs=[pdf_output])
@@ -374,21 +332,20 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     # ---------------- Image Tab ----------------
     with gr.Tab("🖼 Image OCR"):
-        image_output = gr.Textbox(label="Answer (Text Only)", lines=12)
-        img_summary_file = gr.File(label="Download Summary File")
         with gr.Row():
-            image_upload_btn = gr.File(label="Upload Image", file_types=[".png", ".jpg", ".jpeg"])
-            image_upload_msg = gr.Textbox(label="Upload Status", interactive=False)
         image_question = gr.Textbox(label="Ask a question about Image (text)", lines=2)
-        image_send_btn = gr.Button("Ask (Text)")
-        image_reset_btn = gr.Button("♻ Reset Image")
-        img_download_btn = gr.Button("📥 Download Summary")
         image_upload_btn.upload(handle_image_upload, inputs=[image_upload_btn, session_image], outputs=[image_upload_msg, image_output])
         image_send_btn.click(handle_text_image, inputs=[image_question, session_image], outputs=[image_output])
         image_reset_btn.click(lambda: (str(uuid.uuid4()), ""), outputs=[session_image, image_output])
         img_download_btn.click(download_image_summary, inputs=[session_image], outputs=[img_summary_file])
-# Launch
 if __name__ == "__main__":
-    demo.launch()

 # app.py
 """
 Multi-Mode AI Assistant (Voice, PDF, Image)
+- Improved interactive UI: compact, visually appealing, emojis/icons, scrollable previews.
+- All backend functionality preserved.
 """
 import os
 import uuid
 import tempfile
 import requests
+from datetime import datetime
 from dotenv import load_dotenv
 from gtts import gTTS
 from PyPDF2 import PdfReader
 import gradio as gr
 from sentence_transformers import SentenceTransformer, util
 from fpdf import FPDF
 # ------------------ Load API Keys ------------------
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()
 OCR_SPACE_API_KEY = os.getenv("OCR_SPACE_API_KEY", "").strip()
 if not GROQ_API_KEY:
     raise ValueError("❌ GROQ_API_KEY missing. Set it in env / Hugging Face Secrets.")
 if not OCR_SPACE_API_KEY:
 HEADERS = {"Authorization": f"Bearer {GROQ_API_KEY}"}
+# ------------------ Global States ------------------
+SESSION_HISTORY = {}
+CHAT_DISPLAY = {}
+PDF_CONTENT = {}
+PDF_EMBEDS = {}
+IMAGE_TEXT = {}
+IMAGE_EMBEDS = {}
 CHUNK_SIZE = 1500
 embed_model = SentenceTransformer("all-MiniLM-L6-v2")
 # ------------------ Helpers ------------------
 def _get_path_from_gr_file(gr_file):
     if not gr_file:
     if isinstance(gr_file, str) and os.path.exists(gr_file):
         return gr_file
     try:
+        if hasattr(gr_file, "name") and os.path.exists(gr_file.name):
             return gr_file.name
+    except:
         pass
     if isinstance(gr_file, dict):
         for key in ("name", "file_name", "filepath"):
+            if key in gr_file and os.path.exists(gr_file[key]):
+                return gr_file[key]
     return None
 def chunk_text(text, size=CHUNK_SIZE):
+    return [text[i:i+size] for i in range(0, len(text), size)]
 def synthesize_speech(text, lang="en"):
     try:
         print("TTS error:", e)
         return None
 def select_relevant_chunk(question, chunks, chunk_embeds):
     if not chunks or chunk_embeds is None:
         return ""
     top_idx = int(scores.argmax().item())
     return chunks[top_idx]
 def _chat_display_to_messages(chat_display):
     msgs = []
     for user, assistant in chat_display:
         msgs.append({"role": "assistant", "content": assistant})
     return msgs
+def _append_chat_display(session_id, user_text, assistant_text):
+    if session_id not in CHAT_DISPLAY:
+        CHAT_DISPLAY[session_id] = []
+    CHAT_DISPLAY[session_id].append((user_text, assistant_text))
+# ------------------ Voice & LLM ------------------
 def transcribe_audio(audio_path):
     if not audio_path or not os.path.exists(audio_path):
         return "Error: audio file missing."
         print("transcription error:", e)
         return f"Error transcribing audio: {e}"
 def generate_response(session_id, user_text):
     if session_id not in SESSION_HISTORY:
         SESSION_HISTORY[session_id] = []
         print("generate_response error:", e)
         return f"Error generating response: {e}"
+# ------------------ PDF Handling ------------------
 def handle_pdf_upload(pdf_file, session_id):
     path = _get_path_from_gr_file(pdf_file)
     if not path:
         return "No file uploaded or file unreadable."
     try:
         reader = PdfReader(path)
+        text = "".join([page.extract_text() or "" for page in reader.pages])
         if not text.strip():
             return "No extractable content found in PDF."
         chunks = chunk_text(text)
         PDF_CONTENT[session_id] = chunks
         PDF_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
+        return f"PDF uploaded: {len(chunks)} chunks ready."
     except Exception as e:
         print("PDF upload error:", e)
         return f"Error processing PDF: {e}"
+def handle_text_pdf(question, session_id):
     if session_id not in PDF_CONTENT:
         return "Document not found. Upload first."
     chunk = select_relevant_chunk(question, PDF_CONTENT[session_id], PDF_EMBEDS[session_id])
         print("PDF question error:", e)
         return f"Error generating response: {e}"
 # ------------------ Image OCR ------------------
 def ocr_space_file(image_path, api_key, language="eng"):
     if not image_path or not os.path.exists(image_path):
         r.raise_for_status()
         j = r.json()
         if j.get("IsErroredOnProcessing"):
+            print("OCR.space error:", j)
             return ""
         parsed = [pr.get("ParsedText", "") for pr in j.get("ParsedResults", [])]
         return "\n".join(parsed)
         print("ocr_space_file error:", e)
         return ""
 def handle_image_upload(image_file, session_id):
     path = _get_path_from_gr_file(image_file)
     if not path:
+        return "No image uploaded.", ""
     parsed = ocr_space_file(path, OCR_SPACE_API_KEY)
     if not parsed.strip():
         return "No extractable text found in the image.", ""
     IMAGE_EMBEDS[session_id] = embed_model.encode(chunks, convert_to_tensor=True)
     return f"Image processed: {len(chunks)} chunks ready.", ""
+def handle_text_image(question, session_id):
     if session_id not in IMAGE_TEXT:
         return "Image not found. Upload first."
     chunk = select_relevant_chunk(question, IMAGE_TEXT[session_id], IMAGE_EMBEDS[session_id])
         print("Image question error:", e)
         return f"Error generating response: {e}"
+# ------------------ PDF Generation ------------------
+def generate_pdf_file(text, filename_prefix="summary"):
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.set_auto_page_break(auto=True, margin=15)
+    pdf.set_font("Arial", size=12)
+    for line in text.split("\n"):
+        pdf.multi_cell(0, 6, line)
+    tmp_path = f"/tmp/{filename_prefix}_{uuid.uuid4()}.pdf"
+    pdf.output(tmp_path)
+    return tmp_path
+def download_pdf_summary(session_pdf_id):
+    summary_text = "\n".join([msg["content"] for msg in SESSION_HISTORY.get(session_pdf_id, []) if msg["role"]=="assistant"])
+    if not summary_text:
+        summary_text = "No summary available."
+    return generate_pdf_file(summary_text, "pdf_summary")
+def download_image_summary(session_image_id):
+    summary_text = "\n".join([msg["content"] for msg in SESSION_HISTORY.get(session_image_id, []) if msg["role"]=="assistant"])
+    if not summary_text:
+        summary_text = "No summary available."
+    return generate_pdf_file(summary_text, "image_summary")
+# ------------------ Voice Handlers ------------------
 def handle_voice_general(audio_file, session_id, tts_lang="en"):
     path = _get_path_from_gr_file(audio_file)
     if not path:
     audio_path = synthesize_speech(assistant_text, lang=tts_lang)
     return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])
 def handle_voice_pdf(audio_file, session_id, tts_lang="en"):
     path = _get_path_from_gr_file(audio_file)
     if not path:
         return "No audio provided.", None, []
     user_text = transcribe_audio(path)
+    assistant_text = handle_text_pdf(user_text, session_id)
     _append_chat_display(session_id, user_text, assistant_text)
     audio_path = synthesize_speech(assistant_text, lang=tts_lang)
     return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])
 def handle_voice_image(audio_file, session_id, tts_lang="en"):
     path = _get_path_from_gr_file(audio_file)
     if not path:
         return "No audio provided.", None, []
     user_text = transcribe_audio(path)
+    assistant_text = handle_text_image(user_text, session_id)
     _append_chat_display(session_id, user_text, assistant_text)
     audio_path = synthesize_speech(assistant_text, lang=tts_lang)
     return assistant_text, audio_path, _chat_display_to_messages(CHAT_DISPLAY[session_id])
+# ------------------ Text Handlers ------------------
 def handle_text_general(user_text, session_id):
     assistant = generate_response(session_id, user_text)
     _append_chat_display(session_id, user_text, assistant)
     return assistant, _chat_display_to_messages(CHAT_DISPLAY[session_id])
 # ------------------ Gradio UI ------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("## 🛠 Multi-Mode AI Assistant (Voice, PDF, Image)")
     # ---------------- Voice Tab ----------------
     with gr.Tab("🎤 Voice Chat"):
+        chat_voice = gr.Chatbot(type="messages", height=350)
         with gr.Row():
+            mic = gr.Audio(type="filepath", label="🎤 Record Voice (hold & speak)", show_download_button=False)
+            tts_lang = gr.Dropdown(choices=["en", "ur"], value="en", label="TTS Language", interactive=True, scale=1)
         with gr.Row():
+            btn_general = gr.Button("Ask General 🎯", scale=1)
+            btn_pdf = gr.Button("Ask PDF 📄", scale=1)
+            btn_image = gr.Button("Ask Image 🖼", scale=1)
         answer_voice = gr.Textbox(label="Assistant Answer (text)", lines=4)
+        audio_output = gr.Audio(label="Assistant Voice Output", type="filepath", interactive=False)
+        # Text-only general chat
         with gr.Row():
+            text_input = gr.Textbox(label="Or type a question (General)", placeholder="Type message here...", lines=2)
+            btn_send_text = gr.Button("Send (Text General)", scale=1)
         btn_general.click(
+            fn=handle_voice_general,
             inputs=[mic, session_voice, tts_lang],
             outputs=[answer_voice, audio_output, chat_voice],
         )
         btn_pdf.click(
+            fn=handle_voice_pdf,
             inputs=[mic, session_pdf, tts_lang],
             outputs=[answer_voice, audio_output, chat_voice],
         )
         btn_image.click(
+            fn=handle_voice_image,
             inputs=[mic, session_image, tts_lang],
             outputs=[answer_voice, audio_output, chat_voice],
         )
         btn_send_text.click(
+            fn=handle_text_general,
             inputs=[text_input, session_voice],
             outputs=[answer_voice, chat_voice],
         )
     # ---------------- PDF Tab ----------------
     with gr.Tab("📄 PDF Summarizer"):
+        pdf_output = gr.Textbox(label="Answer (Text Only)", lines=6)
+        pdf_summary_file = gr.File(label="📥 Download PDF Summary")
         with gr.Row():
+            pdf_upload_btn = gr.File(label="Upload PDF", file_types=[".pdf"], file_types_preview=False, interactive=True)
+            pdf_upload_msg = gr.Textbox(label="Upload Status", interactive=False, lines=1)
         pdf_question = gr.Textbox(label="Ask a question about PDF (text)", lines=2)
+        pdf_send_btn = gr.Button("Ask (Text)", scale=1)
+        pdf_reset_btn = gr.Button("♻ Reset PDF", scale=1)
+        pdf_download_btn = gr.Button("📥 Download Summary", scale=1)
         pdf_upload_btn.upload(handle_pdf_upload, inputs=[pdf_upload_btn, session_pdf], outputs=[pdf_upload_msg])
         pdf_send_btn.click(handle_text_pdf, inputs=[pdf_question, session_pdf], outputs=[pdf_output])
     # ---------------- Image Tab ----------------
     with gr.Tab("🖼 Image OCR"):
+        image_output = gr.Textbox(label="Answer (Text Only)", lines=6)
+        img_summary_file = gr.File(label="📥 Download PDF Summary")
         with gr.Row():
+            image_upload_btn = gr.File(label="Upload Image", file_types=[".png", ".jpg", ".jpeg"], interactive=True)
+            image_upload_msg = gr.Textbox(label="Upload Status", interactive=False, lines=1)
         image_question = gr.Textbox(label="Ask a question about Image (text)", lines=2)
+        image_send_btn = gr.Button("Ask", scale=1)
+        image_reset_btn = gr.Button("♻ Reset Image", scale=1)
+        img_download_btn = gr.Button("📥 Download Summary", scale=1)
         image_upload_btn.upload(handle_image_upload, inputs=[image_upload_btn, session_image], outputs=[image_upload_msg, image_output])
         image_send_btn.click(handle_text_image, inputs=[image_question, session_image], outputs=[image_output])
         image_reset_btn.click(lambda: (str(uuid.uuid4()), ""), outputs=[session_image, image_output])
         img_download_btn.click(download_image_summary, inputs=[session_image], outputs=[img_summary_file])
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)