# Provenance (Hugging Face file-viewer residue, kept as a comment so the module parses):
# author Heng2004 · commit 7502f70 (verified) "Update qa_store.py" · 1.12 kB
# qa_store.py
from typing import List, Dict, Any
import re
# Shared module-level state. Every container below starts empty here;
# presumably they are filled in by loader code elsewhere in the project
# (TODO confirm which module owns each one).

# Textbook chunks: one dict per chunk (schema is not visible in this file).
ENTRIES: List[Dict[str, Any]] = []
# Textbook knowledge as a single string; presumably the raw/concatenated
# textbook text -- verify against the loader.
RAW_KNOWLEDGE: str = ""

# QA from textbook JSONL (auto-generated from textbook)
AUTO_QA_KNOWLEDGE: List[Dict[str, Any]] = []

# Manual QA managed by teacher (manual_qa.jsonl)
MANUAL_QA_LIST: List[Dict[str, Any]] = []
# Lookup from a string key to a manual QA record. NOTE(review): the key is
# presumably the record ID or the normalized question -- confirm with callers.
MANUAL_QA_INDEX: Dict[str, Dict[str, Any]] = {}

# Combined index for fast lookup (auto + manual). Maps str -> str;
# presumably normalized question -> answer -- TODO confirm.
QA_INDEX: Dict[str, str] = {}
# Combined list of QA records (presumably auto + manual, like QA_INDEX).
ALL_QA_KNOWLEDGE: List[Dict[str, Any]] = []

# Counter for new manual IDs; starts at 1.
NEXT_MANUAL_ID: int = 1

# Embeddings for textbook entries (one vector per ENTRIES item)
# Will be set to a torch.Tensor by _build_entry_embeddings() in model_utils.py
# Stays None until then. Annotated as Any so this module does not need to
# import torch just for the type.
TEXT_EMBEDDINGS: Any = None
def normalize_question(q: str) -> str:
    """
    Normalize Lao/English question text for matching.

    Lowercases the text, replaces common punctuation with spaces, collapses
    every run of whitespace into a single space and trims both ends.

    Args:
        q: Raw question text. ``None`` or an empty string is treated as "".

    Returns:
        The normalized question. Returns "" when the input contains nothing
        but punctuation and/or whitespace.
    """
    text = (q or "").lower()
    # Punctuation to drop: ? ! . , : ; the straight quotes " and ', and the
    # curly quotes U+201C, U+201D, U+2018, U+2019.
    # The curly quotes are spelled as \u escapes on purpose: the literal
    # characters had been mis-decoded into mojibake, which made the character
    # class strip unrelated symbols (e.g. the Greek letter beta and the euro
    # sign) while leaving real curly quotes in the text.
    text = re.sub(r"[?!.,:;\"'\u201c\u201d\u2018\u2019]", " ", text)
    # Collapse whitespace runs, including the gaps left by removed punctuation.
    text = re.sub(r"\s+", " ", text)
    return text.strip()