Spaces:

ahm1378
/

multimodal-rag-demo

Sleeping

App Files Files Community

amirhossein mohammadpour committed on Sep 16

Commit

73a17b2

1 Parent(s): 6ca41c8

handle speed

Browse files

Files changed (1) hide show

app.py +150 -64

app.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import os, io, gc, json, re, ast
 import numpy as np
 import pandas as pd
 import faiss
@@ -10,7 +12,9 @@ import gradio as gr
 from huggingface_hub import hf_hub_download
 from sentence_transformers import SentenceTransformer
 from transformers import AutoTokenizer, AutoModelForCausalLM
 # =========================
 # Config (override in Space → Settings → Variables & secrets)
 # =========================
@@ -27,13 +31,15 @@ HF_TOKEN          = os.getenv("HF_TOKEN", None)  # needed if DATASET_REPO is pri
 # Models (CPU-friendly defaults; override via env if desired)
 E5_ID       = os.getenv("E5_ID", "intfloat/multilingual-e5-small")
 CLIP_TXT_ID = os.getenv("CLIP_TXT_ID", "sentence-transformers/clip-ViT-B-32-multilingual-v1")
-LLM_ID      = os.getenv("LLM_ID", "Qwen/Qwen2-0.5B-Instruct")   # small enough for free CPU
-# Generation defaults (also controllable from UI)
-MAX_NEW_TOKENS_DEFAULT = int(os.getenv("MAX_NEW_TOKENS", "192"))
-TEMPERATURE_DEFAULT    = float(os.getenv("TEMPERATURE", "0.0"))  # deterministic by default on CPU
-TOP_P_DEFAULT          = float(os.getenv("TOP_P", "0.9"))
-TOP_K_DEFAULT          = int(os.getenv("TOP_K", "50"))
 # =========================
 # Helpers
@@ -163,15 +169,20 @@ model = AutoModelForCausalLM.from_pretrained(
     torch_dtype=dtype,
 ).to("cpu").eval()
 # =========================
 # Retrieval helpers
 # =========================
-@torch.no_grad()
-def _encode_query_e5(q: str) -> np.ndarray:
     qn = "query: " + normalize_digits_months(q)
     v = st_e5.encode([qn], batch_size=1, convert_to_numpy=True, normalize_embeddings=True)[0]
     return v.astype("float32")
 def _faiss_search(index, q_vec: np.ndarray, k: int):
     if q_vec.ndim == 1:
         q_vec = q_vec[None, :]
@@ -262,7 +273,7 @@ def make_query_embed(query_text: str,
 def search_fusion(query_text: str, image: Image.Image, k: int = 5, alpha_q: float = 0.7):
     if index_fusion is None:
         raise RuntimeError("Fusion index not available (upload FUSION_INDEX_FILE to dataset repo).")
-    qv = make_query_embed(query_text, image=image, alpha_q=alpha_q, use_aug=True, n_aug=3)
     return _faiss_search(index_fusion, qv, k)
 # =========================
@@ -288,7 +299,7 @@ def retrieve_context_auto(question: str, k: int = 5, image: Image.Image = None)
             ctxs.append({"index": int(idx), "id": row.get("id", idx), "score": float(score), "bio": str(row["bio"])})
     return {"route": route, "contexts": ctxs}
-def build_prompt(question: str, contexts: List[Dict[str, Any]], lang="fa", max_chars=5000) -> str:
     sys_fa = "تو یک دستیار پاسخ‌گو هستی که فقط بر اساس متن‌های داده‌شده پاسخ می‌دهی. اگر پاسخی در متن‌ها نبود، صادقانه بگو «در متن‌های بازیابی‌شده پاسخی پیدا نشد.»"
     sys_en = "You are a helpful assistant. Answer only using retrieved passages. If not found, say 'No answer found in retrieved passages.'"
     system_text = sys_fa if lang == "fa" else sys_en
@@ -311,20 +322,19 @@ def build_prompt(question: str, contexts: List[Dict[str, Any]], lang="fa", max_c
     return tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
 @torch.inference_mode()
-def llm_generate(prompt: str,
-                 max_new_tokens=MAX_NEW_TOKENS_DEFAULT,
-                 temperature=TEMPERATURE_DEFAULT,
-                 top_p=TOP_P_DEFAULT,
-                 top_k=TOP_K_DEFAULT,
-                 do_sample=False) -> str:
     inputs = tokenizer(prompt, return_tensors="pt")
     out = model.generate(
         **inputs,
-        max_new_tokens=int(max_new_tokens),
-        do_sample=bool(do_sample),
-        temperature=float(temperature),
-        top_p=float(top_p),
-        top_k=int(top_k),
         pad_token_id=tokenizer.eos_token_id,
         eos_token_id=tokenizer.eos_token_id,
     )
@@ -334,7 +344,7 @@ def llm_generate(prompt: str,
     return text.strip()
 # ---- MCQ helpers ----
-def build_mcq_prompt(question: str, options: List[str], contexts: List[Dict[str, Any]], lang="fa", max_chars=5000) -> str:
     sys_fa = (
         "تو یک دستیار پاسخ‌گو هستی که فقط بر اساس متن‌های داده‌شده پاسخ می‌دهی. "
         "باید دقیقاً فقط یک شیء JSON برگردانی و هیچ متن دیگری ننویسی."
@@ -389,33 +399,109 @@ def _strict_json_from_text(text: str):
     except Exception:
         return None
-def score_options_by_context(options: List[str], contexts: List[Dict[str, Any]]) -> int:
     """
-    فال‌بک:
-      1) اگر اسم گزینه به‌صورت substring در متون بود → امتیاز خیلی بالا
-      2) وگرنه شباهت embedding با mE5 بین گزینه و کل کانتکست‌ها
     """
-    text_blob = "\n".join([c.get("bio","") for c in contexts]).lower()
-    # 1) substring hit
-    hits = []
     for i, opt in enumerate(options):
-        o = normalize_digits_months(str(opt).strip().lower())
-        score = 0
-        if o and (o in text_blob):
-            score += 10_000
-        hits.append((score, i))
-    hits.sort(reverse=True)
-    if hits and hits[0][0] > 0:
-        return hits[0][1]
-    # 2) embedding similarity (mE5)
     try:
-        q_vecs = [_encode_query_e5(opt) for opt in options]   # (n, dim)
-        ctx_vec = _encode_query_e5(text_blob)                 # (dim,)
-        sims = [float(_np.dot(qv, ctx_vec)) for qv in q_vecs]
-        return int(_np.argmax(sims))
     except Exception:
-        return 0  # پیش‌فرض محافظه‌کارانه
 def parse_mcq_output_strict(text: str, options: List[str], contexts: List[Dict[str, Any]]) -> Dict[str, Any]:
     obj = _strict_json_from_text(text)
@@ -424,9 +510,8 @@ def parse_mcq_output_strict(text: str, options: List[str], contexts: List[Dict[s
         if isinstance(idx, int) and 0 <= idx < len(options):
             reason = str(obj.get("reason", "")).strip() or "—"
             return {"answer_index": idx, "reason": reason}
-    # اگر JSON درست نبود → فال‌بک
-    idx = score_options_by_context(options, contexts)
-    return {"answer_index": idx, "reason": "fallback_by_context_matching"}
 def parse_mcq_output(text: str, n: int) -> Dict[str, Any]:
     m = re.search(r'{"\s*answer_index"\s*:\s*([0-9]+)\s*,\s*"reason"\s*:\s*"(.*?)"}', text, re.S)
@@ -455,7 +540,7 @@ def ui_answer(question, image, topk, max_tokens, temperature, top_p, top_k):
         return "Please enter a question.", [], ""
     # Retrieve
     ret = retrieve_context_auto(question, k=int(topk), image=image)
-    prompt = build_prompt(question, ret["contexts"], lang="fa", max_chars=5000)
     ans = llm_generate(prompt, max_new_tokens=int(max_tokens),
                        temperature=float(temperature), top_p=float(top_p),
                        top_k=int(top_k), do_sample=False)
@@ -492,34 +577,35 @@ with gr.Blocks(title="Multimodal RAG (CPU) • E5 + CLIP Fusion + Qwen 0.5B") as
     gr.Markdown("### Free-tier CPU demo: text RAG (E5) + optional fusion (CLIP) → Qwen 0.5B")
     with gr.Tab("Ask"):
         with gr.Row():
-            q = gr.Textbox(label="Question", placeholder="سؤال خود را بنویسید…", lines=3)
-            img = gr.Image(type="pil", label="Optional image (fusion if provided)")
         with gr.Row():
-            topk = gr.Slider(1, 20, value=5, step=1, label="Top-K retrieve")
-            max_tokens = gr.Slider(32, 1024, value=MAX_NEW_TOKENS_DEFAULT, step=16, label="Max new tokens")
         with gr.Row():
-            temperature = gr.Slider(0.0, 1.5, value=TEMPERATURE_DEFAULT, step=0.1, label="Temperature")
-            top_p = gr.Slider(0.1, 1.0, value=TOP_P_DEFAULT, step=0.05, label="Top-p")
-            top_k = gr.Slider(1, 100, value=TOP_K_DEFAULT, step=1, label="Top-k")
         btn = gr.Button("Answer")
         ans = gr.Textbox(label="Answer", lines=8)
         route = gr.Textbox(label="Route used (text_e5 or fusion)")
         table = gr.Dataframe(headers=["#", "id", "score", "snippet"], interactive=False)
-        btn.click(ui_answer, [q, img, topk, max_tokens, temperature, top_p, top_k], [ans, table, route])
     with gr.Tab("MCQ"):
         with gr.Row():
             q_mcq = gr.Textbox(label="Question", lines=3)
             opts_mcq = gr.Textbox(label="Options (one per line)", lines=8)
-        img_mcq = gr.Image(type="pil", label="Optional image (fusion if provided)")
         with gr.Row():
-            topk2 = gr.Slider(1, 20, value=5, step=1, label="Top-K retrieve")
-            max_tokens2 = gr.Slider(32, 1024, value=MAX_NEW_TOKENS_DEFAULT, step=16, label="Max new tokens")
         with gr.Row():
-            temperature2 = gr.Slider(0.0, 1.5, value=TEMPERATURE_DEFAULT, step=0.1, label="Temperature")
-            top_p2 = gr.Slider(0.1, 1.0, value=TOP_P_DEFAULT, step=0.05, label="Top-p")
-            top_k2 = gr.Slider(1, 100, value=TOP_K_DEFAULT, step=1, label="Top-k")
         btn2 = gr.Button("Answer MCQ")
-        # 👇 باکس‌ها بزرگ‌تر
         result = gr.Textbox(label="Prediction", lines=12, max_lines=20)
         raw = gr.Textbox(label="Raw LLM output", lines=12, max_lines=20)
         route2 = gr.Textbox(label="Route used")

 import os, io, gc, json, re, ast
+from functools import lru_cache
 import numpy as np
 import pandas as pd
 import faiss
 from huggingface_hub import hf_hub_download
 from sentence_transformers import SentenceTransformer
 from transformers import AutoTokenizer, AutoModelForCausalLM
+import os, torch
+torch.set_num_threads(2)                 # vCPUهای Space معمولاً 2 تاست
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 # =========================
 # Config (override in Space → Settings → Variables & secrets)
 # =========================
 # Models (CPU-friendly defaults; override via env if desired)
 E5_ID       = os.getenv("E5_ID", "intfloat/multilingual-e5-small")
 CLIP_TXT_ID = os.getenv("CLIP_TXT_ID", "sentence-transformers/clip-ViT-B-32-multilingual-v1")
def getenv_compat(name: str, legacy_name: str, default: str) -> str:
    """Return env var *name*, else env var *legacy_name*, else *default*.

    The generation settings used to be read from ``MAX_NEW_TOKENS``,
    ``TEMPERATURE``, ``TOP_P`` and ``TOP_K``. Honouring those names as a
    fallback keeps overrides that were configured before the ``*_DEFAULT``
    rename working.
    """
    value = os.getenv(name)
    if value is None:
        value = os.getenv(legacy_name, default)
    return value


LLM_ID = os.getenv("LLM_ID", "Qwen/Qwen2-0.5B-Instruct")

# Shorter output (the previous default was 192).
MAX_NEW_TOKENS_DEFAULT = int(getenv_compat("MAX_NEW_TOKENS_DEFAULT", "MAX_NEW_TOKENS", "96"))
# Sampling switched off (deterministic and faster).
TEMPERATURE_DEFAULT = float(getenv_compat("TEMPERATURE_DEFAULT", "TEMPERATURE", "0.0"))
TOP_P_DEFAULT = float(getenv_compat("TOP_P_DEFAULT", "TOP_P", "1.0"))
TOP_K_DEFAULT = int(getenv_compat("TOP_K_DEFAULT", "TOP_K", "50"))
 # =========================
 # Helpers
     torch_dtype=dtype,
 ).to("cpu").eval()
 # =========================
 # Retrieval helpers
 # =========================
@lru_cache(maxsize=4096)
def _encode_query_e5_cached(q: str) -> np.ndarray:
    """Encode one query string with the E5 model, memoized per distinct string.

    The text is passed through ``normalize_digits_months`` and prefixed with
    ``"query: "`` before encoding. The result is the first (only) row of the
    encoder output, requested with ``normalize_embeddings=True`` and cast to
    float32.

    NOTE: the cache stores the array object itself, so the value returned
    here is shared between all callers. Do not modify it in place; go
    through ``_encode_query_e5`` to get a private copy.
    """
    qn = "query: " + normalize_digits_months(q)
    v = st_e5.encode([qn], batch_size=1, convert_to_numpy=True, normalize_embeddings=True)[0]
    return v.astype("float32")


def _encode_query_e5(q: str) -> np.ndarray:
    """Return the (cached) E5 query embedding for *q* as a fresh array.

    Drop-in replacement for the previous uncached encoder. A copy is
    returned so that an in-place operation by a caller cannot corrupt the
    entry held by the cache.
    """
    return _encode_query_e5_cached(q).copy()
 def _faiss_search(index, q_vec: np.ndarray, k: int):
     if q_vec.ndim == 1:
         q_vec = q_vec[None, :]
 def search_fusion(query_text: str, image: Image.Image, k: int = 5, alpha_q: float = 0.7):
     """Search the fusion index with an embedding built from text and image.

     Raises ``RuntimeError`` when no fusion index has been loaded. Otherwise
     builds the query embedding with ``make_query_embed`` (augmentation
     disabled) and returns whatever ``_faiss_search`` yields for the top *k*.
     """
     fusion_missing = index_fusion is None
     if fusion_missing:
         raise RuntimeError("Fusion index not available (upload FUSION_INDEX_FILE to dataset repo).")
     fused_query = make_query_embed(
         query_text,
         image=image,
         alpha_q=alpha_q,
         use_aug=False,
         n_aug=3,
     )
     hits = _faiss_search(index_fusion, fused_query, k)
     return hits
 # =========================
             ctxs.append({"index": int(idx), "id": row.get("id", idx), "score": float(score), "bio": str(row["bio"])})
     return {"route": route, "contexts": ctxs}
+def build_prompt(question: str, contexts: List[Dict[str, Any]], lang="fa", max_chars=1800) -> str:
     sys_fa = "تو یک دستیار پاسخ‌گو هستی که فقط بر اساس متن‌های داده‌شده پاسخ می‌دهی. اگر پاسخی در متن‌ها نبود، صادقانه بگو «در متن‌های بازیابی‌شده پاسخی پیدا نشد.»"
     sys_en = "You are a helpful assistant. Answer only using retrieved passages. If not found, say 'No answer found in retrieved passages.'"
     system_text = sys_fa if lang == "fa" else sys_en
     return tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
 @torch.inference_mode()
+def llm_generate(prompt: str, max_new_tokens=96, temperature=0.0, top_p=1.0, top_k=50, do_sample=False) -> str:
     inputs = tokenizer(prompt, return_tensors="pt")
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
     out = model.generate(
         **inputs,
+        max_new_tokens=max_new_tokens,
+        do_sample=False,             # قطعی
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        num_beams=1,                 # بدون beam-search
+        use_cache=True,              # سریع‌تر
         pad_token_id=tokenizer.eos_token_id,
         eos_token_id=tokenizer.eos_token_id,
     )
     return text.strip()
 # ---- MCQ helpers ----
+def build_mcq_prompt(question: str, options: List[str], contexts: List[Dict[str, Any]], lang="fa", max_chars=1800) -> str:
     sys_fa = (
         "تو یک دستیار پاسخ‌گو هستی که فقط بر اساس متن‌های داده‌شده پاسخ می‌دهی. "
         "باید دقیقاً فقط یک شیء JSON برگردانی و هیچ متن دیگری ننویسی."
     except Exception:
         return None
+import re as _re
+import numpy as _np
def _norm_text_for_match(s: str) -> str:
    """Normalize text before option/context matching.

    The input (``None`` or empty is treated as ``""``) is passed through
    ``normalize_digits_months``, zero-width non-joiners (U+200C) become
    spaces, the ends are trimmed, and the result is lower-cased with every
    whitespace run collapsed to a single space.
    """
    text = normalize_digits_months(s if s else "")
    text = text.replace("\u200c", " ").strip().lower()
    return _re.sub(r"\s+", " ", text)
+def _find_snippet(hay: str, needle: str, win: int = 60) -> str:
+    """یک تکه متن کوتاه اطراف اولین مچ را بده."""
+    try:
+        i = hay.index(needle)
+        start = max(0, i - win)
+        end   = min(len(hay), i + len(needle) + win)
+        return hay[start:end].replace("\n", " ")
+    except ValueError:
+        return ""
def score_options_by_context(
    options: List[str],
    contexts: List[Dict[str, Any]],
    return_snippet: bool = False
):
    """Choose the option best supported by the retrieved contexts (fallback).

    Two stages are tried in order:

    1. Literal match. Each option is normalized with ``_norm_text_for_match``
       and searched in every normalized context as a whitespace-delimited
       phrase. An option with at least one hit scores ``10000 + hits``; the
       highest score wins and the earliest option wins ties.
    2. Embedding similarity. Used only when no option matched literally.
       Every option is compared (dot product of ``_encode_query_e5`` vectors)
       with the embedding of all normalized contexts joined together.

    Args:
        options: Candidate answers; each is converted with ``str()``.
        contexts: Retrieved passages; the text is read from the ``"bio"`` key.
        return_snippet: When true, also return a short supporting excerpt.

    Returns:
        ``best_idx`` (int) when ``return_snippet`` is false, otherwise the
        tuple ``(best_idx, snippet)``. If the embedding stage fails, index
        ``0`` (with an empty snippet) is returned as a conservative default.
    """
    # A missing or None "bio" is treated as empty text.
    raw_ctxs = [c.get("bio", "") or "" for c in contexts]
    norm_ctxs = [_norm_text_for_match(x) for x in raw_ctxs]
    joined_norm = " \n ".join(norm_ctxs)

    # --- Stage 1: boundary-aware literal match with hit counting ---------
    best_idx, best_score, best_snip = 0, -1.0, ""
    for i, opt in enumerate(options):
        o_raw = str(opt).strip()
        o = _norm_text_for_match(o_raw)
        if not o:
            continue
        # Boundary = start/end of text or whitespace on both sides, so the
        # same rule works for scripts without ASCII word characters.
        pattern = _re.compile(r"(?<!\S)" + _re.escape(o) + r"(?!\S)")
        total_hits = 0
        first_snip = ""
        for raw, norm in zip(raw_ctxs, norm_ctxs):
            hits = len(pattern.findall(norm))
            if not hits:
                continue
            total_hits += hits
            if not first_snip:
                # Prefer an excerpt from the raw text (more readable); fall
                # back to the normalized text when normalization changed the
                # passage so much that the raw option cannot be located.
                first_snip = _find_snippet(raw, o_raw) or _find_snippet(norm, o)
        if total_hits > 0:
            score = 10000.0 + total_hits
            if score > best_score:
                best_score, best_idx, best_snip = score, i, first_snip
    if best_score > 0:
        return (best_idx, best_snip) if return_snippet else best_idx

    # --- Stage 2: no literal match, fall back to embedding similarity ----
    try:
        ctx_vec = _encode_query_e5(joined_norm)
        sims = [float(_np.dot(_encode_query_e5(str(opt)), ctx_vec)) for opt in options]
        best_idx = int(_np.argmax(sims))
    except Exception:
        # Deliberate best-effort: any failure (including empty ``options``)
        # yields the conservative default.
        return (0, "") if return_snippet else 0

    if not return_snippet:
        # The per-context encodings below only serve the snippet; skipping
        # them avoids one encoder call per context when it is not needed.
        return best_idx

    best_snip = ""
    try:
        winner = str(options[best_idx])
        opt_vec = _encode_query_e5(winner)
        # Score each context against the winning option and excerpt the
        # closest one.
        c_scores = [float(_np.dot(opt_vec, _encode_query_e5(norm))) for norm in norm_ctxs]
        j = int(_np.argmax(c_scores))
        best_snip = _find_snippet(raw_ctxs[j], winner) or raw_ctxs[j][:120].replace("\n", " ")
    except Exception:
        # The snippet is optional; keep the chosen index even if this fails.
        best_snip = ""
    return best_idx, best_snip
 def parse_mcq_output_strict(text: str, options: List[str], contexts: List[Dict[str, Any]]) -> Dict[str, Any]:
     obj = _strict_json_from_text(text)
         if isinstance(idx, int) and 0 <= idx < len(options):
             reason = str(obj.get("reason", "")).strip() or "—"
             return {"answer_index": idx, "reason": reason}
+    idx, snip = score_options_by_context(options, contexts, return_snippet=True)
+    return {"answer_index": idx, "reason": snip or "matched by context"}
 def parse_mcq_output(text: str, n: int) -> Dict[str, Any]:
     m = re.search(r'{"\s*answer_index"\s*:\s*([0-9]+)\s*,\s*"reason"\s*:\s*"(.*?)"}', text, re.S)
         return "Please enter a question.", [], ""
     # Retrieve
     ret = retrieve_context_auto(question, k=int(topk), image=image)
+    prompt = build_prompt(question, ret["contexts"], lang="fa", max_chars=1800)
     ans = llm_generate(prompt, max_new_tokens=int(max_tokens),
                        temperature=float(temperature), top_p=float(top_p),
                        top_k=int(top_k), do_sample=False)
     gr.Markdown("### Free-tier CPU demo: text RAG (E5) + optional fusion (CLIP) → Qwen 0.5B")
     with gr.Tab("Ask"):
         with gr.Row():
             q = gr.Textbox(label="Question", lines=3)
             img = gr.Image(type="pil", label="Optional image")
             use_fusion = gr.Checkbox(label="Use image fusion (slower on CPU)", value=False)
         with gr.Row():
             topk = gr.Slider(1, 20, value=3, step=1, label="Top-K retrieve")
             max_tokens = gr.Slider(16, 512, value=96, step=16, label="Max new tokens")
         with gr.Row():
             temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Temperature")
             top_p = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Top-p")
             top_k = gr.Slider(1, 100, value=50, step=1, label="Top-k")
         btn = gr.Button("Answer")
         ans = gr.Textbox(label="Answer", lines=8)
         route = gr.Textbox(label="Route used (text_e5 or fusion)")
         table = gr.Dataframe(headers=["#", "id", "score", "snippet"], interactive=False)

         def _ask_with_optional_fusion(question, image, fusion_enabled,
                                       topk_value, max_tokens_value,
                                       temperature_value, top_p_value, top_k_value):
             """Adapt the eight UI inputs to the seven-parameter ``ui_answer``.

             ``ui_answer`` is declared as (question, image, topk, max_tokens,
             temperature, top_p, top_k), so wiring the checkbox straight into
             its inputs would pass one argument too many. The checkbox is
             consumed here instead: the image is forwarded only when fusion
             is enabled.
             NOTE(review): assumes an image of ``None`` makes retrieval take
             the text-only route — confirm against ``retrieve_context_auto``.
             """
             image_for_retrieval = image if fusion_enabled else None
             return ui_answer(question, image_for_retrieval, topk_value, max_tokens_value,
                              temperature_value, top_p_value, top_k_value)

         btn.click(
             _ask_with_optional_fusion,
             [q, img, use_fusion, topk, max_tokens, temperature, top_p, top_k],
             [ans, table, route],
         )
     with gr.Tab("MCQ"):
         with gr.Row():
             q_mcq = gr.Textbox(label="Question", lines=3)
             opts_mcq = gr.Textbox(label="Options (one per line)", lines=8)
+        img_mcq = gr.Image(type="pil", label="Optional image (fusion if enabled)")
         with gr.Row():
+            topk2 = gr.Slider(1, 20, value=3, step=1, label="Top-K retrieve")
+            max_tokens2 = gr.Slider(16, 512, value=96, step=16, label="Max new tokens")
         with gr.Row():
+            temperature2 = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Temperature")
+            top_p2 = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Top-p")
+            top_k2 = gr.Slider(1, 100, value=50, step=1, label="Top-k")
         btn2 = gr.Button("Answer MCQ")
         result = gr.Textbox(label="Prediction", lines=12, max_lines=20)
         raw = gr.Textbox(label="Raw LLM output", lines=12, max_lines=20)
         route2 = gr.Textbox(label="Route used")