File size: 1,292 Bytes
7171447
 
 
 
 
 
 
 
7502f70
7171447
 
7502f70
7171447
 
 
7502f70
7171447
 
 
 
 
 
076631b
7502f70
b4ddaff
076631b
7ced8a6
b4ddaff
 
7ced8a6
7171447
 
 
 
 
 
 
7502f70
7171447
7502f70
7171447
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# qa_store.py
from typing import List, Dict, Any
import re

# Shared in-memory state for the QA store. Every name below is initialised
# empty (or None) in this module.
# NOTE(review): presumably the loader modules fill these in at startup --
# confirm against the code that imports this module.

# Textbook chunks
ENTRIES: List[Dict[str, Any]] = []
RAW_KNOWLEDGE: str = ""

# QA from textbook JSONL (auto-generated from textbook)
AUTO_QA_KNOWLEDGE: List[Dict[str, Any]] = []

# Manual QA managed by teacher (manual_qa.jsonl)
MANUAL_QA_LIST: List[Dict[str, Any]] = []
MANUAL_QA_INDEX: Dict[str, Dict[str, Any]] = {}

# Combined index for fast lookup (auto + manual)
# NOTE(review): keys are presumably normalize_question() output -- confirm
# against the code that builds the index.
QA_INDEX: Dict[str, str] = {}
ALL_QA_KNOWLEDGE: List[Dict[str, Any]] = []

# Counter for new manual IDs
NEXT_MANUAL_ID: int = 1

# Embeddings for textbook entries (one vector per ENTRIES item)
# Will be set to a torch.Tensor by _build_entry_embeddings() in model_utils.py
TEXT_EMBEDDINGS = None  # torch.Tensor once built; None until then

# Glossary data
GLOSSARY: List[Dict[str, Any]] = []            # list of glossary items (dicts)
GLOSSARY_EMBEDDINGS = None  # numpy.ndarray once built; None until then


# Punctuation replaced by a space before matching: ASCII ? ! . , : ; " '
# plus the typographic quotes U+201C, U+201D, U+2018 and U+2019.
# The curly quotes are spelled as \u escapes (understood by the re module)
# so the pattern cannot be corrupted by an encoding round-trip of this file.
_PUNCTUATION_RE = re.compile(r"[?!.,:;\"'\u201c\u201d\u2018\u2019]")

# Any run of whitespace (spaces, tabs, newlines) collapses to one space.
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_question(q: str) -> str:
    """
    Normalize Lao/English question text for matching.

    Steps, in order: lowercase, replace punctuation with a space, collapse
    whitespace runs to a single space, strip leading/trailing whitespace.

    Args:
        q: Raw question text. ``None`` or an empty string is accepted.

    Returns:
        The normalized question; ``""`` when ``q`` is empty or ``None``.
    """
    text = (q or "").lower()
    # Punctuation becomes a space (not removed outright) so that words
    # separated only by punctuation, e.g. "a,b", do not fuse together.
    text = _PUNCTUATION_RE.sub(" ", text)
    text = _WHITESPACE_RE.sub(" ", text)
    return text.strip()