Update model_utils.py

model_utils.py  CHANGED  (+137, -54)
@@ -2,56 +2,57 @@
 from typing import List, Optional
 import re
 
+import numpy as np
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from sentence_transformers import SentenceTransformer
 from sentence_transformers.util import cos_sim
 
 import qa_store
+from loader import load_curriculum, load_manual_qa, rebuild_combined_qa, load_glossary
 
 # -----------------------------
 # Base chat model
 # -----------------------------
 MODEL_NAME = "SeaLLMs/SeaLLMs-v3-1.5B-Chat"
+EMBED_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+# Use float16 on GPU to save memory, float32 on CPU
+dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=dtype)
+model.to(device)
 model.eval()
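A quick way to sanity-check the new fp16/CPU loading path is a one-off generation; this is an illustrative sketch, not part of the commit, and it assumes the SeaLLMs tokenizer ships a chat template:

    msgs = [{"role": "user", "content": "ສະບາຍດີ"}]  # "hello" (hypothetical test input)
    ids = tokenizer.apply_chat_template(msgs, add_generation_prompt=True, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model.generate(ids, max_new_tokens=32)
    print(tokenizer.decode(out[0][ids.shape[1]:], skip_special_tokens=True))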
 
 embed_model = SentenceTransformer(EMBED_MODEL_NAME)
 embed_model = embed_model.to(device)
 
+# Number of textbook entries to include in the RAG context
+MAX_CONTEXT_ENTRIES = 4
+
 
+# -----------------------------
+# Embedding builders
+# -----------------------------
 def _build_entry_embeddings() -> None:
     """
+    Build embeddings for each textbook entry using chapter + section + text
     and store them in qa_store.TEXT_EMBEDDINGS.
+
+    Call this after loading / reloading curriculum.
     """
+    if not getattr(qa_store, "ENTRIES", None):
         qa_store.TEXT_EMBEDDINGS = None
         return
 
+    texts: List[str] = []
     for e in qa_store.ENTRIES:
+        chapter = e.get("chapter_title", "") or e.get("chapter", "") or ""
+        section = e.get("section_title", "") or e.get("section", "") or ""
         text = e.get("text", "") or ""
+        combined = f"{chapter}\n{section}\n{text}"
         texts.append(combined)
 
     qa_store.TEXT_EMBEDDINGS = embed_model.encode(
@@ -61,30 +62,91 @@ def _build_entry_embeddings() -> None:
     )
 
 
+def _build_glossary_embeddings() -> None:
+    """Create embeddings for glossary terms + definitions."""
+    if not getattr(qa_store, "GLOSSARY", None):
+        qa_store.GLOSSARY_EMBEDDINGS = None
+        print("[INFO] No glossary terms to embed.")
+        return
+
+    # Embed term + definition together
+    texts = [
+        f"{item.get('term', '')} :: {item.get('definition', '')}"
+        for item in qa_store.GLOSSARY
+    ]
+
+    embeddings = embed_model.encode(
+        texts,
+        convert_to_numpy=True,
+        normalize_embeddings=True,
+    )
+    qa_store.GLOSSARY_EMBEDDINGS = embeddings
+    print(f"[INFO] Built glossary embeddings for {len(texts)} terms.")
+
+
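Because encode() is called with normalize_embeddings=True, every row of qa_store.GLOSSARY_EMBEDDINGS has unit L2 norm, which is what lets answer_from_glossary further down score matches with a plain dot product. A minimal check, assuming the glossary has been loaded:

    norms = np.linalg.norm(qa_store.GLOSSARY_EMBEDDINGS, axis=1)
    assert np.allclose(norms, 1.0, atol=1e-3)  # unit-norm rows: dot product == cosine similarity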
 # -----------------------------
 # Load data once at import time
 # -----------------------------
 load_curriculum()
 load_manual_qa()
+load_glossary()
 rebuild_combined_qa()
 _build_entry_embeddings()
+_build_glossary_embeddings()
 
+# -----------------------------
+# System prompt (Natural Science)
+# -----------------------------
 SYSTEM_PROMPT = (
+    "ທ່ານແມ່ນຜູ້ຊ່ວຍເຫຼືອດ້ານວິທະຍາສາດທໍາມະຊາດ "
+    "ສໍາລັບນັກຮຽນຊັ້ນ ມ.1-ມ.4. "
     "ຕອບແຕ່ພາສາລາວ ໃຫ້ຕອບສັ້ນໆ 2–3 ປະໂຫຍກ ແລະເຂົ້າໃຈງ່າຍ. "
+    "ໃຫ້ອີງຈາກຂໍ້ມູນອ້າງອີງຂ້າງລຸ່ມນີ້ເທົ່ານັ້ນ. "
     "ຖ້າຂໍ້ມູນບໍ່ພຽງພໍ ຫຼືບໍ່ຊັດເຈນ ໃຫ້ບອກວ່າບໍ່ແນ່ໃຈ."
 )
 
 
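In English, the updated SYSTEM_PROMPT reads roughly: "You are a natural-science assistant for students in grades M1 to M4. Answer only in Lao, in 2–3 short, easy-to-understand sentences. Rely only on the reference material below. If the information is insufficient or unclear, say you are not sure."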
+# -----------------------------
+# Helper: history formatting
+# -----------------------------
+def _format_history(history: Optional[List]) -> str:
+    """
+    Convert last few chat turns into a Lao conversation snippet
+    to give the model context for follow-up questions.
+    Gradio history format: [[user_msg, bot_msg], [user_msg, bot_msg], ...]
+    """
+    if not history:
+        return ""
+
+    # keep only the last 3 turns to avoid very long prompts
+    recent = history[-3:]
+
+    lines: List[str] = []
+    for turn in recent:
+        if not isinstance(turn, (list, tuple)) or len(turn) != 2:
+            continue
+        user_msg, bot_msg = turn
+        lines.append(f"ນັກຮຽນ: {user_msg}")
+        lines.append(f"ອາຈານ AI: {bot_msg}")
+
+    if not lines:
+        return ""
+
+    joined = "\n".join(lines) + "\n\n"
+    return joined
+
+
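For reference, a quick illustration of the helper on Gradio-style history (hypothetical messages; ນັກຮຽນ = "student", ອາຈານ AI = "AI teacher"):

    history = [["What is an atom?", "An atom is the smallest unit of an element."]]
    print(_format_history(history))
    # ນັກຮຽນ: What is an atom?
    # ອາຈານ AI: An atom is the smallest unit of an element.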
+# -----------------------------
+# RAG: retrieve textbook context
+# -----------------------------
 def retrieve_context(question: str, max_entries: int = MAX_CONTEXT_ENTRIES) -> str:
     """
     Embedding-based retrieval over textbook entries.
+    Falls back to concatenated raw knowledge if embeddings are missing.
     """
+    if not getattr(qa_store, "ENTRIES", None):
+        # Fallback: raw knowledge (if available) or empty string
+        return getattr(qa_store, "RAW_KNOWLEDGE", "")
 
     if qa_store.TEXT_EMBEDDINGS is None:
         top_entries = qa_store.ENTRIES[:max_entries]
@@ -99,11 +161,8 @@ def retrieve_context(question: str, max_entries: int = MAX_CONTEXT_ENTRIES) -> str:
     # 2) Cosine similarity with all entry embeddings
     sims = cos_sim(q_vec, qa_store.TEXT_EMBEDDINGS)[0]  # shape [N]
 
+    # 3) Take top-k
+    top_indices = torch.topk(sims, k=min(max_entries, sims.shape[0])).indices
     top_entries = [qa_store.ENTRIES[i] for i in top_indices.tolist()]
 
     # Build context string for the prompt
@@ -111,41 +170,54 @@ def retrieve_context(question: str, max_entries: int = MAX_CONTEXT_ENTRIES) -> str:
     for e in top_entries:
         header = (
             f"[ຊັ້ນ {e.get('grade','')}, "
+            f"ໜ່ວຍ {e.get('unit','')}, "
+            f"ບົດ {e.get('chapter_title','')}, "
+            f"ຫົວຂໍ້ {e.get('section_title','')}]"
         )
         context_blocks.append(f"{header}\n{e.get('text','')}")
 
     return "\n\n".join(context_blocks)
 
 
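The min(max_entries, sims.shape[0]) guard keeps torch.topk from raising when fewer entries than MAX_CONTEXT_ENTRIES are loaded. The retriever can also be exercised on its own; a sketch with a hypothetical question (output depends on the loaded curriculum):

    ctx = retrieve_context("ອະຕອມແມ່ນຫຍັງ?")  # hypothetical: "What is an atom?"
    print(ctx.split("\n\n")[0])  # first block: [ຊັ້ນ ..., ໜ່ວຍ ..., ບົດ ..., ຫົວຂໍ້ ...] header plus entry text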
+# -----------------------------
+# Glossary-based answering
+# -----------------------------
+def answer_from_glossary(message: str) -> Optional[str]:
     """
+    Try to answer using the glossary index.
+    Returns Lao answer string or None if not confident.
     """
+    if not getattr(qa_store, "GLOSSARY", None) or qa_store.GLOSSARY_EMBEDDINGS is None:
+        return None
 
+    # Encode question
+    q_emb = embed_model.encode(
+        [message],
+        convert_to_numpy=True,
+        normalize_embeddings=True,
+    )[0]
 
+    sims = np.dot(qa_store.GLOSSARY_EMBEDDINGS, q_emb)
+    best_idx = int(np.argmax(sims))
+    best_sim = float(sims[best_idx])
 
+    # tune this threshold later if needed
+    if best_sim < 0.55:
+        return None
+
+    item = qa_store.GLOSSARY[best_idx]
+    definition = item.get("definition", "").strip()
+    example = item.get("example", "").strip()
 
+    if example:
+        return f"{definition} ຕົວຢ່າງ: {example}"
+    else:
+        return definition
 
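Since both the stored glossary vectors and the query are unit-normalized, np.dot returns cosine similarities, and 0.55 is the commit's own "confident enough" cutoff, explicitly flagged for later tuning. Usage sketch with a hypothetical question:

    ans = answer_from_glossary("ໂມເລກຸນແມ່ນຫຍັງ?")  # hypothetical: "What is a molecule?"
    print(ans or "score below 0.55, caller falls through to QA lookup / RAG")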
 
+# -----------------------------
+# Prompt + LLM generation
+# -----------------------------
 def build_prompt(question: str, history: Optional[List] = None) -> str:
     context = retrieve_context(question, max_entries=MAX_CONTEXT_ENTRIES)
     history_block = _format_history(history)
@@ -174,12 +246,15 @@ def generate_answer(question: str, history: Optional[List] = None) -> str:
     generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
     answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
 
+    # Enforce 2–3 sentence answers for students
     sentences = re.split(r"(?<=[\.?!…])\s+", answer)
     short_answer = " ".join(sentences[:3]).strip()
     return short_answer if short_answer else answer
 
 
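The splitter counts a sentence boundary at whitespace following ., ?, ! or …; Lao prose that uses none of those terminators comes through as a single chunk, so the three-sentence cap mainly bites on punctuated output. A quick check of the regex:

    s = re.split(r"(?<=[\.?!…])\s+", "One. Two! Three? Four.")
    print(" ".join(s[:3]))  # -> One. Two! Three?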
+# -----------------------------
+# QA lookup (exact + fuzzy)
+# -----------------------------
 def answer_from_qa(question: str) -> Optional[str]:
     """
     1) Exact match in QA_INDEX
@@ -217,6 +292,9 @@ def answer_from_qa(question: str) -> Optional[str]:
     return None
 
 
+# -----------------------------
+# Main chatbot entry
+# -----------------------------
 def laos_history_bot(message: str, history: List) -> str:
     """
     Main chatbot function for Student tab (Gradio ChatInterface).
@@ -224,6 +302,11 @@ def laos_history_bot(message: str, history: List) -> str:
     if not message.strip():
         return "ກະລຸນາພິມຄໍາຖາມກ່ອນ."
 
+    # 0) Try glossary first for key concepts
+    gloss = answer_from_glossary(message)
+    if gloss:
+        return gloss
+
     # 1) Try exact / fuzzy Q&A first
     direct = answer_from_qa(message)
     if direct:
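With this commit, the student tab resolves answers in order: glossary match, then exact/fuzzy Q&A lookup, then RAG plus generation. A minimal sketch of wiring the entry point into Gradio (hypothetical app code, not part of this file):

    import gradio as gr

    demo = gr.ChatInterface(fn=laos_history_bot, title="ອາຈານ AI")
    demo.launch()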