Update app.py
app.py (CHANGED)
@@ -3,6 +3,8 @@
 import os
 import re
 import json
+from difflib import SequenceMatcher  # 👈 for better fuzzy matching
+
 import gradio as gr
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
@@ -20,17 +22,31 @@ model = AutoModelForCausalLM.from_pretrained(
 DATA_PATH = "data/1_Year_U_1.jsonl"
 
 ENTRIES = []  # each entry is one JSON object (your schema)
-QA_INDEX = {}  # fast lookup: normalized question -> answer
+QA_INDEX: dict[str, str] = {}  # fast lookup: normalized question -> answer
 
 
 def _normalize_question(q: str) -> str:
+    """
+    Normalize Lao questions for matching:
+    - lowercase
+    - remove common punctuation
+    - collapse spaces
+    """
     q = q.lower()
     q = re.sub(r"[?!?!\.\,\:\;\"“”'‘’]", " ", q)
     q = re.sub(r"\s+", " ", q)
     return q.strip()
 
 
+def _similarity(a: str, b: str) -> float:
+    """
+    Character-level similarity between two normalized strings.
+    Works OK for Lao because we’re still matching on shared sequences.
+    """
+    return SequenceMatcher(None, a, b).ratio()
+
+
+# Load dataset and build QA index
 if os.path.exists(DATA_PATH):
     with open(DATA_PATH, "r", encoding="utf-8") as f:
         for line in f:
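For intuition about the new helper pair: SequenceMatcher.ratio() returns 2*M / (len(a) + len(b)), where M counts the characters inside matching blocks, so near-paraphrases of a normalized question score high while unrelated ones score low. A quick sanity check of that behavior (illustrative only; English strings stand in for the Lao questions the app actually compares):

from difflib import SequenceMatcher

a = "what is the capital of laos"
b = "what is the capital city of laos"    # close paraphrase of a
c = "when did the lan xang kingdom fall"  # unrelated question

# Matching blocks cover most of both strings -> high ratio (~0.92).
print(round(SequenceMatcher(None, a, b).ratio(), 2))

# Far less shared character structure -> below the 0.55 cutoff used later.
print(SequenceMatcher(None, a, c).ratio() < 0.55)  # True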
@@ -87,7 +103,7 @@ def retrieve_context(question: str, max_entries: int = 2) -> str:
         for e in chosen
     )
 
-    scored = []
+    scored: list[tuple[int, dict]] = []
 
     for e in ENTRIES:
         text = e.get("text", "")
@@ -157,13 +173,17 @@ def build_prompt(question: str) -> str:
 
 
 def generate_answer(question: str) -> str:
+    """
+    Use SeaLLM + retrieved context to generate an answer.
+    Kept fairly short for speed and to avoid rambling.
+    """
     prompt = build_prompt(question)
     inputs = tokenizer(prompt, return_tensors="pt")
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=160,  # shorter answers = faster
-            do_sample=False,  # greedy decoding → more stable &
+            max_new_tokens=160,  # shorter answers = faster, less chance to cut mid-sentence
+            do_sample=False,  # greedy decoding → more stable & deterministic
         )
 
     # slice off the prompt part
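The hunk ends just before the decode step, so for reference: with return_tensors="pt", outputs[0] holds the prompt tokens followed by the new tokens, and the usual way to "slice off the prompt part" is shown below. This is a sketch of the standard transformers pattern, assuming the app decodes only the newly generated tokens (the actual lines sit outside this diff):

# Sketch (assumption): decode only the tokens generated after the prompt.
prompt_len = inputs["input_ids"].shape[1]
new_tokens = outputs[0][prompt_len:]
answer = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()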
@@ -174,37 +194,56 @@ def generate_answer(question: str) -> str:
 
 def answer_from_qa(question: str) -> str | None:
     """
+    Try to answer directly from:
+    1) Exact QA pairs.
+    2) Fuzzy QA question similarity.
+    3) Fuzzy match to entry summaries/titles (good for 'ຄວາມສໍາຄັນ...' type questions).
+
+    If nothing is good enough, return None so the model will answer.
     """
     norm_q = _normalize_question(question)
+    if not norm_q:
+        return None
 
     # 1) exact match first
     if norm_q in QA_INDEX:
         return QA_INDEX[norm_q]
 
-    # 2) fuzzy match
-    if not q_terms:
-        return None
-
-    best_score = 0
+    # 2) fuzzy match over QA questions
+    best_ratio = 0.0
     best_answer = None
 
     for stored_q, a in QA_INDEX.items():
-            best_score = overlap
+        r = _similarity(norm_q, stored_q)
+        if r > best_ratio:
+            best_ratio = r
             best_answer = a
 
+    # threshold tuned so that very close questions (wording a bit different)
+    # still return the textbook QA answer
+    if best_ratio >= 0.55 and best_answer:
         return best_answer
 
+    # 3) fallback: fuzzy match over entry summaries / titles / keywords
+    best_ratio = 0.0
+    best_summary = None
+
+    for e in ENTRIES:
+        combined = f"{e.get('title','')} {e.get('summary','')} {' '.join(e.get('keywords', []))}"
+        combined_norm = _normalize_question(combined)
+        if not combined_norm:
+            continue
+
+        r = _similarity(norm_q, combined_norm)
+        if r > best_ratio:
+            best_ratio = r
+            best_summary = e.get("summary") or e.get("text")
+
+    # lower threshold here because we’re matching against shorter summaries
+    if best_ratio >= 0.35 and best_summary:
+        return best_summary
+
+    return None
 
 
 # 3. Gradio chat function
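(In the docstring, the Lao fragment 'ຄວາມສໍາຄັນ...' means roughly "the importance of ...".) The 0.55 cutoff is strict enough to reject unrelated questions yet loose enough to absorb rewordings; the summary fallback uses 0.35 because a short question never covers much of a long title+summary string. A minimal check of the stricter threshold, with a hypothetical stored question (mirroring what _similarity computes in stage 2):

from difflib import SequenceMatcher

stored = "when was the lan xang kingdom founded"  # hypothetical QA_INDEX key
asked = "when was lan xang founded"               # user's paraphrase

# Same computation as _similarity(norm_q, stored_q) above: ratio ~0.81.
print(SequenceMatcher(None, asked, stored).ratio() >= 0.55)  # True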
@@ -212,7 +251,7 @@ def laos_history_bot(message: str, history: list):
     if not message.strip():
         return "ກະລຸນາພິມຄຳຖາມກ່ອນ."
 
-    # 1) Try to answer directly from QA pairs (instant)
+    # 1) Try to answer directly from QA pairs or summaries (instant)
     direct = answer_from_qa(message)
    if direct:
         return direct
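The guard string "ກະລຸນາພິມຄຳຖາມກ່ອນ." means "Please type a question first." Since laos_history_bot(message, history) matches the callback signature gr.ChatInterface expects, the app presumably wires it up along these lines; the UI code is outside this diff, so this is only a sketch and the title string is hypothetical:

import gradio as gr

# Sketch (assumption): hook the chat function into a standard ChatInterface.
demo = gr.ChatInterface(
    fn=laos_history_bot,
    title="Laos History QA",  # hypothetical title, not taken from the diff
)

if __name__ == "__main__":
    demo.launch()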