raviix46 committed on
Commit
9890c71
·
verified ·
1 Parent(s): 4fc333a

Create rag_data.py

Browse files
Files changed (1) hide show
  1. rag_data.py +41 -0
rag_data.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Module-level retrieval data for the RAG app, loaded once at import time.

Importing this module reads the chunk, thread and message files, builds a
BM25 index over the chunk texts, loads the precomputed embedding matrix
together with its list of chunk ids, and instantiates the
SentenceTransformer model named by ``SEM_MODEL_NAME``.
"""

import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

from rag_config import (
    CHUNKS_PATH,
    THREADS_PATH,
    MESSAGES_PATH,
    EMBEDDINGS_PATH,
    CHUNK_IDS_PATH,
    load_json,
    load_jsonl,
)

# Load base data
chunks = load_jsonl(CHUNKS_PATH)
threads = load_json(THREADS_PATH)
messages = load_json(MESSAGES_PATH)

# Map chunk_id -> chunk
chunk_id_to_chunk = {c["chunk_id"]: c for c in chunks}

# BM25 corpus: each chunk's text is tokenized by plain whitespace splitting.
# NOTE(review): queries presumably need the same tokenization to match;
# confirm against the caller that scores with `bm25`.
corpus_tokens = [c["text"].split() for c in chunks]
bm25 = BM25Okapi(corpus_tokens)

# Semantic embeddings
embeddings = np.load(EMBEDDINGS_PATH)  # (N, D)

# load_json takes the path itself (as for THREADS_PATH above), so there is no
# need to open the file here first; the previous `with ... open()` wrapper
# created a handle that was never read.
chunk_ids = load_json(CHUNK_IDS_PATH)

# Map chunk_id -> index in embeddings
# NOTE(review): assumes row i of `embeddings` corresponds to chunk_ids[i];
# confirm against the script that writes both files.
chunk_index = {cid: i for i, cid in enumerate(chunk_ids)}

# SentenceTransformer model (same as used in build_embeddings)
SEM_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
sem_model = SentenceTransformer(SEM_MODEL_NAME)

# Thread IDs for dropdown (iterating the mapping yields its keys)
THREAD_OPTIONS = sorted(threads)