Spaces:

Heng2004
/

Laos-Natural-Science-Chatbot

Running

File size: 1,292 Bytes

# qa_store.py
import re
import unicodedata
from typing import List, Dict, Any

# Module-level shared state for the QA store.
# Everything below starts empty (or None); nothing in this section fills it.

# Textbook chunks.
ENTRIES: List[Dict[str, Any]] = []
RAW_KNOWLEDGE: str = ""

# QA pairs auto-generated from the textbook (textbook JSONL).
AUTO_QA_KNOWLEDGE: List[Dict[str, Any]] = []

# Manual QA managed by the teacher (manual_qa.jsonl).
MANUAL_QA_LIST: List[Dict[str, Any]] = []
# NOTE(review): key is presumably the manual QA id or the normalized
# question -- confirm against the code that builds this index.
MANUAL_QA_INDEX: Dict[str, Dict[str, Any]] = {}

# Combined index for fast lookup (auto + manual).
# NOTE(review): presumably maps a normalized question to its answer text --
# confirm against the code that builds this index.
QA_INDEX: Dict[str, str] = {}
ALL_QA_KNOWLEDGE: List[Dict[str, Any]] = []

# Counter used when assigning IDs to new manual QA items.
NEXT_MANUAL_ID: int = 1

# Embeddings for textbook entries (one vector per ENTRIES item).
# Will be set to a torch.Tensor by _build_entry_embeddings() in model_utils.py.
TEXT_EMBEDDINGS = None  # torch.Tensor once populated; None until then

# Glossary data.
GLOSSARY: List[Dict[str, Any]] = []  # list of glossary items (dicts)
GLOSSARY_EMBEDDINGS = None  # numpy.ndarray once populated; None until then


# Invisible code points (zero-width space/joiners, word joiner, BOM) that are
# not matched by ``\s`` and would otherwise survive normalization.
_ZERO_WIDTH_RE = re.compile(r"[\u200b\u200c\u200d\u2060\ufeff]")
# Common punctuation, including full-width marks and curly quotes.
_PUNCTUATION_RE = re.compile(r"[?!？！\.\,\:\;\"“”'‘’]")
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_question(q: str) -> str:
    """
    Normalize Lao/English question text for matching.

    Steps, in order:
      1. Treat a falsy input (``None`` or ``""``) as the empty string.
      2. Lowercase.
      3. Apply Unicode NFC so canonically equivalent spellings (e.g. a tone
         mark and a vowel sign typed in either order, or a precomposed vs.
         decomposed accent) become the same code point sequence.
      4. Delete zero-width / invisible characters.
      5. Replace common punctuation with a space.
      6. Collapse whitespace runs to one space and strip both ends.

    Args:
        q: Raw question text. Falsy values are treated as "".

    Returns:
        The normalized string; "" when nothing remains.
    """
    text = (q or "").lower()
    # Canonical composition: visually identical input typed or encoded
    # differently must produce the same lookup key.
    text = unicodedata.normalize("NFC", text)
    # Removed outright (not replaced by a space): these characters are
    # invisible, so text with and without them should compare equal.
    text = _ZERO_WIDTH_RE.sub("", text)
    # Punctuation becomes a space so the words on either side stay separated.
    text = _PUNCTUATION_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()