File size: 1,066 Bytes
9890c71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

from rag_config import (
    CHUNKS_PATH,
    THREADS_PATH,
    MESSAGES_PATH,
    EMBEDDINGS_PATH,
    CHUNK_IDS_PATH,
    load_json,
    load_jsonl,
)

# Load base data
chunks = load_jsonl(CHUNKS_PATH)
threads = load_json(THREADS_PATH)
messages = load_json(MESSAGES_PATH)

# Map chunk_id -> chunk
chunk_id_to_chunk = {c["chunk_id"]: c for c in chunks}

# BM25 corpus
corpus_tokens = [c["text"].split() for c in chunks]
bm25 = BM25Okapi(corpus_tokens)

# Semantic embeddings
embeddings = np.load(EMBEDDINGS_PATH)  # (N, D)

with CHUNK_IDS_PATH.open("r", encoding="utf-8") as f:
    chunk_ids = load_json(CHUNK_IDS_PATH)

# Map chunk_id -> index in embeddings
chunk_index = {cid: i for i, cid in enumerate(chunk_ids)}

# SentenceTransformer model (same as used in build_embeddings)
SEM_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
sem_model = SentenceTransformer(SEM_MODEL_NAME)

# Thread IDs for dropdown
THREAD_OPTIONS = sorted(list(threads.keys()))