Sazid2 committed
Commit bf281e4 · verified · Parent: 1e98313

Create app.py

Files changed (1): app.py (+202 -81)
app.py CHANGED
@@ -1,8 +1,20 @@
 # app.py
+"""
+Jajabor – SEBA Assamese Class 10 Tutor (Gradio app)
+Full single-file app:
+- Loads PDFs from ./pdfs/class10
+- Builds FAISS index using sentence-transformers
+- Optional Hugging Face Inference API for LLM (set HF_API_TOKEN env var)
+- Login + sqlite interactions logging
+- OCR from images (pytesseract) with robust handling of gr.Image(type="filepath")
+"""
+
 import os
 import io
 import sqlite3
 from datetime import datetime
+import traceback
+
 import fitz  # PyMuPDF
 import numpy as np
 from PIL import Image
@@ -12,35 +24,40 @@ import pytesseract
 from sentence_transformers import SentenceTransformer
 import sympy as sp
 
-# Optional: huggingface inference
+# Optional HF inference
 from huggingface_hub import InferenceApi
 
-# ------------- CONFIG -------------
-APP_NAME = "Jajabor – SEBA Assamese Class 10 Tutor (Spaces)"
+# -------------------- CONFIG --------------------
+APP_NAME = "Jajabor – SEBA Assamese Class 10 Tutor"
+
 BASE_DIR = os.path.abspath(os.path.dirname(__file__))
 PDF_DIR = os.path.join(BASE_DIR, "pdfs", "class10")
 DB_PATH = os.path.join(BASE_DIR, "jajabor_users.db")
 
-# Embedding model - compact for Spaces. Swap if you run on stronger infra.
+# Embedding model (compact for Spaces)
 EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
 
-# LLM model to call via Inference API (optional)
-# WARNING: not all large models will run under a free plan; see docs.
-LLM_MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"  # can change to a hosted model
-USE_HF_INFERENCE = True  # set False if you plan to load a local small model
+# LLM: model to call via HF Inference API. Change if you have another hosted model.
+LLM_MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
+USE_HF_INFERENCE = True  # set False if you don't want to call HF Inference
 
 CHUNK_SIZE = 600
 CHUNK_OVERLAP = 120
 TOP_K = 5
 
 HUGGINGFACE_API_TOKEN = os.environ.get("HF_API_TOKEN", None)
-if USE_HF_INFERENCE:
-    if not HUGGINGFACE_API_TOKEN:
-        print("Warning: HF API token not found in env (HF_API_TOKEN). LLM calls will fail.")
-    else:
+if USE_HF_INFERENCE and HUGGINGFACE_API_TOKEN is None:
+    print("Warning: HF_API_TOKEN not set. LLM calls will fail until the token is provided in env.")
+
+inference = None
+if USE_HF_INFERENCE and HUGGINGFACE_API_TOKEN:
+    try:
         inference = InferenceApi(repo_id=LLM_MODEL_NAME, token=HUGGINGFACE_API_TOKEN)
+    except Exception as e:
+        print("Failed to initialize HF Inference API client:", e)
+        inference = None
 
-# ------------- DB helpers -------------
+# -------------------- DB helpers --------------------
 def init_db(db_path=DB_PATH):
     os.makedirs(os.path.dirname(db_path), exist_ok=True)
     conn = sqlite3.connect(db_path)
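
Reviewer note (not in the commit): the old guard bound `inference` only in the `else:` branch, so without a token any later call to `call_llm_via_hf` crashed with a NameError rather than failing gracefully. After this hunk `inference` always exists and callers can simply branch on None, a minimal sketch:

    if inference is None:
        print("LLM disabled: set HF_API_TOKEN to enable Inference API calls.")
    else:
        print("LLM ready:", LLM_MODEL_NAME)
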
@@ -117,14 +134,20 @@ def get_user_stats(user_id):
 
 init_db()
 
-# ------------- PDF loading + RAG -------------
+# -------------------- PDF loading + RAG --------------------
 def extract_text_from_pdf(pdf_path: str) -> str:
-    doc = fitz.open(pdf_path)
+    try:
+        doc = fitz.open(pdf_path)
+    except Exception:
+        return ""
     pages = []
     for page in doc:
-        txt = page.get_text("text")
-        if txt:
-            pages.append(txt)
+        try:
+            txt = page.get_text("text")
+            if txt:
+                pages.append(txt)
+        except Exception:
+            continue
     return "\n".join(pages)
 
 def load_all_pdfs(pdf_dir: str):
@@ -133,7 +156,7 @@ def load_all_pdfs(pdf_dir: str):
     if not os.path.isdir(pdf_dir):
         print("PDF_DIR not found:", pdf_dir)
         return texts, metas
-    for fname in os.listdir(pdf_dir):
+    for fname in sorted(os.listdir(pdf_dir)):
         if fname.lower().endswith(".pdf"):
             path = os.path.join(pdf_dir, fname)
             print("Reading:", path)
@@ -143,14 +166,19 @@ def load_all_pdfs(pdf_dir: str):
     return texts, metas
 
 def split_text(text: str, chunk_size=600, overlap=120):
+    if not text:
+        return []
     chunks = []
     start = 0
-    while start < len(text):
-        end = start + chunk_size
+    L = len(text)
+    # Keep stepping forward by chunk_size - overlap
+    step = max(chunk_size - overlap, 1)
+    while start < L:
+        end = min(start + chunk_size, L)
         chunk = text[start:end]
         if chunk.strip():
            chunks.append(chunk)
-        start = max(end - overlap, end)  # avoid infinite loop
+        start += step
     return chunks
 
 print("Loading embedding model:", EMBEDDING_MODEL_NAME)
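
Reviewer note (not in the commit): the old loop advanced with start = max(end - overlap, end), which always equals end, so the configured overlap was silently dropped. The rewrite steps by chunk_size - overlap instead. A quick sanity check of the new stepping:

    text = "x" * 1500
    chunks = split_text(text, chunk_size=600, overlap=120)  # step = 480
    print([len(c) for c in chunks])  # [600, 600, 540, 60]
    # chunk 0 covers [0, 600), chunk 1 covers [480, 1080): 120 shared characters
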
@@ -168,36 +196,43 @@ for text, meta in zip(all_texts, all_metas):
     corpus_metas.extend([meta] * len(chs))
 
 print("Total chunks:", len(corpus_chunks))
+index = None
 if len(corpus_chunks) > 0:
-    print("Encoding chunks...")
-    embs = embedding_model.encode(corpus_chunks, batch_size=32, show_progress_bar=False).astype("float32")
-    dim = embs.shape[1]
-    index = faiss.IndexFlatL2(dim)
-    index.add(embs)
-    print("FAISS index ready; dim:", dim)
+    print("Encoding chunks (this may take some seconds)...")
+    try:
+        embs = embedding_model.encode(corpus_chunks, batch_size=32, show_progress_bar=False).astype("float32")
+        dim = embs.shape[1]
+        index = faiss.IndexFlatL2(dim)
+        index.add(embs)
+        print("✅ FAISS index ready; dim:", dim)
+    except Exception as e:
+        print("Failed to encode/add to index:", e)
+        index = None
 else:
-    index = None
-    print("No corpus chunks - upload PDFs to the `pdfs/class10` folder in the repo.")
+    print("No corpus chunks found: upload PDFs to ./pdfs/class10")
 
 def rag_search(query: str, k: int = TOP_K):
     if index is None:
         return []
-    q_vec = embedding_model.encode([query]).astype("float32")
-    D, I = index.search(q_vec, k)
-    results = []
-    for dist, idx in zip(D[0], I[0]):
-        if idx == -1:
-            continue
-        results.append(
-            {
-                "score": float(dist),
-                "text": corpus_chunks[idx],
-                "meta": corpus_metas[idx],
-            }
-        )
-    return results
+    try:
+        q_vec = embedding_model.encode([query]).astype("float32")
+        D, I = index.search(q_vec, k)
+        results = []
+        for dist, idx in zip(D[0], I[0]):
+            if idx == -1:
+                continue
+            results.append(
+                {
+                    "score": float(dist),
+                    "text": corpus_chunks[idx],
+                    "meta": corpus_metas[idx],
+                }
+            )
+        return results
+    except Exception:
+        return []
 
-# ------------- LLM helpers -------------
+# -------------------- LLM helpers --------------------
 SYSTEM_PROMPT = """
 You are "Jajabor", an expert SEBA Assamese tutor for Class 10.
 Always prefer to answer in Assamese. If the student clearly asks for English, you may reply in English.
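
Reviewer note (not in the commit): a sketch of what rag_search now hands back; "score" is a FAISS L2 distance, so lower means more similar (the query below is illustrative):

    for hit in rag_search("সমীকৰণ সমাধান কৰাৰ নিয়ম", k=3):
        # each hit: {"score": <L2 distance>, "text": <chunk>, "meta": <source info>}
        print(round(hit["score"], 2), hit["text"][:60])
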
@@ -235,20 +270,25 @@ def build_rag_prompt(context_blocks, question, chat_history):
     return prompt
 
 def call_llm_via_hf(prompt: str, max_tokens=512):
-    if not HUGGINGFACE_API_TOKEN:
-        return "LLM not available: HF API token (env HF_API_TOKEN) is required to call the Inference API."
+    if inference is None:
+        return "LLM not available: HF Inference client not configured (set HF_API_TOKEN and ensure model name is accessible)."
     try:
-        # huggingface InferenceApi text-generation returns text (model-specific format)
+        # Some inference endpoints accept dict return, some strings. Handle flexibly.
         out = inference(inputs=prompt, params={"max_new_tokens": max_tokens, "temperature": 0.3})
-        # inference result may be a dict or string; try to extract
+        # Handle common return types
         if isinstance(out, dict) and "generated_text" in out:
             return out["generated_text"]
-        if isinstance(out, list) and len(out) > 0 and "generated_text" in out[0]:
-            return out[0]["generated_text"]
+        if isinstance(out, list) and len(out) > 0:
+            if isinstance(out[0], dict) and "generated_text" in out[0]:
+                return out[0]["generated_text"]
+            # sometimes list of strings
+            if isinstance(out[0], str):
+                return out[0]
         if isinstance(out, str):
             return out
         return str(out)
     except Exception as e:
+        traceback.print_exc()
         return f"LLM call failed: {e}"
 
 def llm_answer_with_rag(question: str, chat_history):
@@ -259,23 +299,34 @@ def llm_answer_with_rag(question: str, chat_history):
     else:
         return "LLM not configured (USE_HF_INFERENCE=False)."
 
-# ------------- OCR + math helpers -------------
+# -------------------- OCR + math helpers --------------------
 def ocr_from_image(img: Image.Image):
     if img is None:
         return ""
-    img = img.convert("RGB")
     try:
+        img = img.convert("RGB")
+    except Exception:
+        pass
+    try:
+        # try Assamese + English; fallback if languages not installed
         text = pytesseract.image_to_string(img, lang="asm+eng")
     except Exception:
-        text = pytesseract.image_to_string(img)
+        try:
+            text = pytesseract.image_to_string(img)
+        except Exception:
+            text = ""
     return text.strip()
 
 def is_likely_math(text: str) -> bool:
+    if not text:
+        return False
     math_chars = set("0123456789+-*/=^()%")
     if any(ch in text for ch in math_chars):
         return True
-    kws = ["গণিত", "সমীকৰণ", "উদাহৰণ", "প্ৰশ্ন", "বীজগণিত"]
-    return any(k in text for k in kws)
+    kws = ["গণিত", "সমীকৰণ", "উদাহৰণ", "প্ৰশ্ন", "বীজগণিত", "solve", "equation"]
+    if any(k in text for k in kws):
+        return True
+    return False
 
 def solve_math_expression(expr: str):
     try:
@@ -289,31 +340,33 @@ def solve_math_expression(expr: str):
             steps = []
             steps.append("প্ৰথমে সমীকৰণ লওঁ:")
             steps.append(f"{sp.pretty(eq)}")
-            steps.append("Sympy ৰ সহায়ত সমাধান পোৱা যায়:")
+            steps.append("Sympy ৰ সহায়ত সমাধান পোৱা যায়:")
             steps.append(str(sol))
             explanation = "ধাপ-ধাপে সমাধান (সংক্ষেপে):\n" + "\n".join(f"- {s}" for s in steps)
-            explanation += f"\n\nসেয়েহে সমাধান: {sol}"
+            explanation += f"\n\nসেয়ে সমাধান: {sol}"
         else:
             expr_s = sp.sympify(expr)
             simp = sp.simplify(expr_s)
             explanation = (
-                "প্ৰদত্ত গণিতীয় অভিব্যক্তি:\n"
-                f"{expr}\n\nসরলীকৰণ কৰাৰ পিছত পোৱা যায়:\n{simp}"
+                "প্ৰদত্ত গণিতীয় অভিব্যক্তি:\n"
+                f"{expr}\n\nসরলীকৰণ কৰাৰ পিছত পোৱা যায়:\n{simp}"
             )
         return explanation
     except Exception:
         return (
-            "মই সঠিকভাৱে গণিতীয় অভিব্যক্তি চিনাক্ত কৰিব নোৱাৰিলোঁ। "
-            "দয়া কৰি সমীকৰণটো অলপ বেছি স্পষ্টকৈ লিখা: উদাহৰণ – 2x + 3 = 7"
+            "মই সঠিকভাৱে গণিতীয় অভিব্যক্তি চিনাক্ত কৰিব নোৱাৰিলোঁ। "
+            "দয়া কৰি সমীকৰণটো অলপ বেছি স্পষ্টকৈ লিখা: উদাহৰণ – 2*x + 3 = 7"
         )
 
 def speech_to_text(audio):
+    # stub for future ASR integration
     return ""
 
 def text_to_speech(text: str):
+    # stub for TTS integration
     return None
 
-# ------------- Chat logic -------------
+# -------------------- Chat logic --------------------
 def login_user(username, user_state):
     username = (username or "").strip()
     if not username:
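
Reviewer note (not in the commit): the example in the error message changed from "2x + 3 = 7" to "2*x + 3 = 7" because sympy's sympify rejects implicit multiplication such as 2x by default. A standalone sketch of solving that input with sympy:

    import sympy as sp

    lhs, rhs = "2*x + 3 = 7".split("=")
    eq = sp.Eq(sp.sympify(lhs), sp.sympify(rhs))
    print(sp.solve(eq))  # [2]
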
@@ -336,28 +389,58 @@ def chat_logic(
     chat_history,
     user_state,
 ):
+    # Ensure chat_history is a list
+    if chat_history is None:
+        chat_history = []
+
     if not user_state or not user_state.get("user_id"):
         sys_msg = "⚠️ প্ৰথমে ওপৰত আপোনাৰ নাম লিখি **Login / লগিন** টিপক।"
         chat_history = chat_history + [[text_input or "", sys_msg]]
         return chat_history, user_state, None
 
     user_id = user_state["user_id"]
-
     final_query_parts = []
+
+    # audio (stub)
     voice_text = speech_to_text(audio_input)
     if voice_text:
         final_query_parts.append(voice_text)
 
+    # image handling (robust)
     ocr_text = ""
-    if image_input is not None:
+    if image_input is not None and image_input != "":
+        img = None
         try:
-            img = Image.open(io.BytesIO(image_input.read()))
+            # If Gradio returns a file path (string)
+            if isinstance(image_input, str):
+                try:
+                    img = Image.open(image_input)
+                except Exception:
+                    img = None
+            else:
+                # If it's a file-like object: has .read()
+                read_method = getattr(image_input, "read", None)
+                if callable(read_method):
+                    try:
+                        raw = image_input.read()
+                        img = Image.open(io.BytesIO(raw))
+                    except Exception:
+                        img = None
+                # If it's already a PIL Image
+                if img is None and isinstance(image_input, Image.Image):
+                    img = image_input
         except Exception:
-            img = image_input
-        ocr_text = ocr_from_image(img)
-        if ocr_text:
-            final_query_parts.append(ocr_text)
+            img = None
 
+        if img is not None:
+            try:
+                ocr_text = ocr_from_image(img)
+                if ocr_text:
+                    final_query_parts.append(ocr_text)
+            except Exception:
+                pass
+
+    # text input
     if text_input:
         final_query_parts.append(text_input)
 
@@ -367,6 +450,7 @@ def chat_logic(
         return chat_history, user_state, None
 
     full_query = "\n".join(final_query_parts)
+
     conv = []
     for u, b in chat_history:
         if u:
@@ -375,6 +459,7 @@
             conv.append(("Tutor", b))
 
     is_math = is_likely_math(full_query)
+
     if is_math:
         math_answer = solve_math_expression(full_query)
         combined_question = (
@@ -387,19 +472,25 @@ def chat_logic(
     else:
         final_answer = llm_answer_with_rag(full_query, conv)
 
+    # If LLM returns the whole prompt + generation, try to remove the prompt (best-effort)
+    if isinstance(final_answer, str) and final_answer.strip().startswith(SYSTEM_PROMPT.strip()):
+        # best-effort: don't leak huge prompts to chat UI; keep as-is if detection fails
+        # (Many HF inference responses do not include the prompt anyway)
+        pass
+
     log_interaction(user_id, full_query, final_answer, is_math)
     audio_out = text_to_speech(final_answer)
     display_question = text_input or voice_text or ocr_text or "(empty)"
     chat_history = chat_history + [[display_question, final_answer]]
     return chat_history, user_state, audio_out
 
-# ------------- Gradio UI -------------
-with gr.Blocks(title=APP_NAME) as demo:
+# -------------------- Gradio UI --------------------
+with gr.Blocks(title=APP_NAME, css=None) as demo:
     gr.Markdown(
         """
-# 🧭 জাজাবৰ – SEBA অসমীয়া ক্লাছ ১০ AI Tutor (Spaces)
+# 🧭 জাজাবৰ – SEBA অসমীয়া ক্লাছ ১০ AI Tutor
 
-- Upload your SEBA Class 10 PDFs to `pdfs/class10` in this Space repo
+- Upload your SEBA Class 10 PDFs to `pdfs/class10` in this repo (or when running locally, ensure folder exists)
 - Text + Image (OCR) input
 - Math step-by-step solutions
 - User login + progress
@@ -411,23 +502,50 @@ with gr.Blocks(title=APP_NAME) as demo:
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 👤 লগিন")
-            username_inp = gr.Textbox(label="নাম / ইউজাৰ আইডি", placeholder="উদাহৰণ: abu10")
+            username_inp = gr.Textbox(
+                label="নাম / ইউজাৰ আইডি",
+                placeholder="উদাহৰণ: abu10, student01 ..."
+            )
             login_btn = gr.Button("✅ Login / লগিন")
             stats_md = gr.Markdown("এতিয়ালৈকে লগিন হোৱা নাই।", elem_classes="stats-box")
+
+            gr.Markdown(
+                """
+                ### 💡 টিপছ
+                - "ক্লাছ ১০ গণিত: উদাহৰণ ৩.১ প্ৰশ্ন ২" – এই ধৰণৰ প্ৰশ্ন ভাল
+                - ফটো আপলোড কৰিলে টেক্স্টটো OCR কৰি পঢ়িব চেষ্টা কৰা হয়
+                - সম্ভৱ হলে প্ৰশ্নটো অসমীয়াত সোধক 🙂
+                """
+            )
+
         with gr.Column(scale=3):
             chat = gr.Chatbot(label="জাজাবৰ সৈতে কথোপকথন", height=500)
-            text_inp = gr.Textbox(label="আপোনাৰ প্ৰশ্ন লিখক", lines=2)
+            text_inp = gr.Textbox(
+                label="আপোনাৰ প্ৰশ্ন লিখক",
+                placeholder='উদাহৰণ: "ক্লাছ ১০ অসমীয়া: অনুচ্ছেদ পাঠ ১ ৰ মূল বিষয় কি?"',
+                lines=2,
+            )
+
             with gr.Row():
-                image_inp = gr.Image(label="📷 প্ৰশ্নৰ ছবি (Optional)", type="file")
-                audio_inp = gr.Audio(label="🎙️ কণ্ঠস্বৰ প্ৰশ্ন (Stub)", type="numpy")
+                # IMPORTANT: use type="filepath" so Gradio returns a local path string
+                image_inp = gr.Image(label="📷 প্ৰশ্নৰ ছবি (Optional)", type="filepath")
+                audio_inp = gr.Audio(label="🎙️ কণ্ঠস্বৰ প্ৰশ্ন (Stub — not used now)", type="numpy")
+
             with gr.Row():
                 ask_btn = gr.Button("🤖 জাজাবৰক সোধক")
-                audio_out = gr.Audio(label="🔊 উত্তৰৰ অডিঅ’ (TTS – future)", interactive=False)
+                audio_out = gr.Audio(label="🔊 উত্তৰৰ অডিঅ’ (TTS – future upgrade)", interactive=False)
 
-    login_btn.click(login_user, inputs=[username_inp, user_state], outputs=[user_state, stats_md])
+    login_btn.click(
+        login_user,
+        inputs=[username_inp, user_state],
+        outputs=[user_state, stats_md],
+    )
 
     def wrapped_chat(text, image, audio, history, user_state_inner, username_inner):
-        if user_state_inner and username_inner and not user_state_inner.get("username"):
+        # keep username in state if provided
+        if user_state_inner is None:
+            user_state_inner = {}
+        if username_inner and not user_state_inner.get("username"):
             user_state_inner["username"] = username_inner
         return chat_logic(username_inner, text, image, audio, history, user_state_inner)
 
@@ -436,11 +554,14 @@ with gr.Blocks(title=APP_NAME) as demo:
         inputs=[text_inp, image_inp, audio_inp, chat, user_state, username_inp],
         outputs=[chat, user_state, audio_out],
     )
+
     text_inp.submit(
         wrapped_chat,
         inputs=[text_inp, image_inp, audio_inp, chat, user_state, username_inp],
         outputs=[chat, user_state, audio_out],
     )
 
+# -------------------- Launch --------------------
 if __name__ == "__main__":
+    # For Spaces, demo.launch() is fine. Locally you can set server_name to "0.0.0.0"
     demo.launch()
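
Reviewer note (not in the commit): judging from the imports, the Space needs roughly this dependency set (assumed PyPI names; pytesseract additionally requires the tesseract binary, with asm and eng traineddata for the OCR path):

    gradio
    huggingface_hub
    pymupdf                # imported as fitz
    numpy
    pillow
    pytesseract
    faiss-cpu
    sentence-transformers
    sympy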
 