raviix46 committed on
Commit
59ae20b
·
verified ·
1 Parent(s): d0d4aae

Delete rag_data.py

Browse files
Files changed (1) hide show
  1. rag_data.py +0 -41
rag_data.py DELETED
@@ -1,41 +0,0 @@
1
- import numpy as np
2
- from rank_bm25 import BM25Okapi
3
- from sentence_transformers import SentenceTransformer
4
-
5
- from rag_config import (
6
- CHUNKS_PATH,
7
- THREADS_PATH,
8
- MESSAGES_PATH,
9
- EMBEDDINGS_PATH,
10
- CHUNK_IDS_PATH,
11
- load_json,
12
- load_jsonl,
13
- )
14
-
15
# Shared retrieval state, built once at import time and exposed as module
# globals. Everything below runs as a side effect of importing this module.

# Load base data
chunks = load_jsonl(CHUNKS_PATH)
threads = load_json(THREADS_PATH)
messages = load_json(MESSAGES_PATH)

# Map chunk_id -> chunk
chunk_id_to_chunk = {c["chunk_id"]: c for c in chunks}

# BM25 corpus: one whitespace-split token list per chunk, in the same order
# as `chunks`, so a BM25 score at position i refers to chunks[i].
corpus_tokens = [c["text"].split() for c in chunks]
bm25 = BM25Okapi(corpus_tokens)

# Semantic embeddings
embeddings = np.load(EMBEDDINGS_PATH)  # expected shape (N, D)

# load_json is given the path directly, exactly as for THREADS_PATH and
# MESSAGES_PATH above; no separate file handle is opened here because the
# handle would never be read.
chunk_ids = load_json(CHUNK_IDS_PATH)

# Map chunk_id -> position in chunk_ids.
# NOTE(review): this position is used as a row index into `embeddings`, which
# assumes chunk_ids is stored in the same order as the embedding rows --
# confirm against the script that writes both files.
chunk_index = {cid: i for i, cid in enumerate(chunk_ids)}

# SentenceTransformer model; per the original note this is the same model
# name used in build_embeddings, so query vectors and stored vectors match.
SEM_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
sem_model = SentenceTransformer(SEM_MODEL_NAME)

# Thread IDs for dropdown, in sorted order
THREAD_OPTIONS = sorted(threads.keys())