Heng2004 committed on
Commit
c5298d8
·
verified ·
1 Parent(s): b2e6f17

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -6
app.py CHANGED
@@ -24,8 +24,11 @@ QA_INDEX = {} # fast lookup: normalized question -> answer
24
 
25
 
26
  def _normalize_question(q: str) -> str:
27
- # simple normalization: collapse spaces and strip
28
- return re.sub(r"\s+", " ", q).strip()
 
 
 
29
 
30
 
31
  if os.path.exists(DATA_PATH):
@@ -159,7 +162,7 @@ def generate_answer(question: str) -> str:
159
  with torch.no_grad():
160
  outputs = model.generate(
161
  **inputs,
162
- max_new_tokens=120, # shorter answers = faster
163
  do_sample=False, # greedy decoding → more stable & a bit faster
164
  )
165
 
@@ -171,11 +174,37 @@ def generate_answer(question: str) -> str:
171
 
172
  def answer_from_qa(question: str) -> str | None:
173
  """
174
- Fast path: if the question exactly matches a QA pair from the dataset,
175
- return that answer immediately (no model call).
 
176
  """
177
  norm_q = _normalize_question(question)
178
- return QA_INDEX.get(norm_q)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
 
181
  # 3. Gradio chat function
 
24
 
25
 
26
  def _normalize_question(q: str) -> str:
27
+ # lowercase, remove basic punctuation, collapse spaces
28
+ q = q.lower()
29
+ q = re.sub(r"[?!?!\.\,\:\;\"“”'‘’]", " ", q)
30
+ q = re.sub(r"\s+", " ", q)
31
+ return q.strip()
32
 
33
 
34
  if os.path.exists(DATA_PATH):
 
162
  with torch.no_grad():
163
  outputs = model.generate(
164
  **inputs,
165
+ max_new_tokens=160, # cap generated tokens to bound latency
166
  do_sample=False, # greedy decoding → more stable & a bit faster
167
  )
168
 
 
174
 
175
def answer_from_qa(question: str) -> str | None:
    """Answer a question from the preloaded QA dataset, if possible.

    1) Try an exact match of the normalized question in QA_INDEX.
    2) Otherwise fall back to a simple fuzzy match: return the answer
       of the stored question that shares the most words with the query.

    Returns None when no stored question shares at least one word
    (longer than one character) with the query, signalling the caller
    to fall back to the model.
    """
    norm_q = _normalize_question(question)

    # 1) exact match first — a single dict lookup avoids the model call
    if norm_q in QA_INDEX:
        return QA_INDEX[norm_q]

    # 2) fuzzy match on word overlap; single-character tokens are too
    #    noisy to count as evidence
    q_terms = [t for t in norm_q.split(" ") if len(t) > 1]
    if not q_terms:
        return None

    best_score = 0
    best_answer = None

    for stored_q, answer in QA_INDEX.items():
        # Build a set so each membership test is O(1); the original
        # scanned a list, making the loop O(terms × stored_terms).
        # Membership results are identical, so behavior is unchanged.
        stored_terms = {t for t in stored_q.split(" ") if len(t) > 1}
        overlap = sum(1 for t in q_terms if t in stored_terms)
        if overlap > best_score:
            best_score = overlap
            best_answer = answer

    # Require at least one overlapping word (e.g. Lao terms such as
    # "ປະຫວັດສາດ" (history) or "ຄວາມສໍາຄັນ" (importance)).
    # NOTE(review): ties keep the first entry in dict insertion order —
    # confirm that is the intended tie-break.
    if best_score >= 1:
        return best_answer

    return None
207
+
208
 
209
 
210
  # 3. Gradio chat function