Spaces:
Sleeping
Sleeping
Delete rag_data.py
Browse files- rag_data.py +0 -41
rag_data.py
DELETED
|
@@ -1,41 +0,0 @@
|
|
| 1 |
-
"""Shared retrieval data loaded once at import time.

Builds the in-memory indexes used by the RAG app:
  - chunks / threads / messages loaded from the paths in rag_config
  - a BM25 index over whitespace-tokenized chunk text
  - a dense-embedding matrix plus chunk_id -> row-index map
  - the SentenceTransformer used to embed queries

NOTE(review): this module does file I/O and model loading on import;
importing it is expensive by design (app-startup cache).
"""

import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

from rag_config import (
    CHUNKS_PATH,
    THREADS_PATH,
    MESSAGES_PATH,
    EMBEDDINGS_PATH,
    CHUNK_IDS_PATH,
    load_json,
    load_jsonl,
)

# Load base data
chunks = load_jsonl(CHUNKS_PATH)
threads = load_json(THREADS_PATH)
messages = load_json(MESSAGES_PATH)

# Map chunk_id -> chunk for O(1) lookup when resolving retrieval hits.
chunk_id_to_chunk = {c["chunk_id"]: c for c in chunks}

# BM25 corpus: naive whitespace tokenization — the query side must
# tokenize the same way for scores to be meaningful.
corpus_tokens = [c["text"].split() for c in chunks]
bm25 = BM25Okapi(corpus_tokens)

# Semantic embeddings, shape (N, D); row order matches chunk_ids below.
embeddings = np.load(EMBEDDINGS_PATH)

# FIX: the original wrapped this in `with CHUNK_IDS_PATH.open(...) as f:`
# but never used the handle — load_json takes the *path* and re-opens the
# file itself, so the file was opened twice. Drop the dead `with`.
chunk_ids = load_json(CHUNK_IDS_PATH)

# Map chunk_id -> index in embeddings (row of the matrix above).
chunk_index = {cid: i for i, cid in enumerate(chunk_ids)}

# SentenceTransformer model (same as used in build_embeddings) — queries
# must be embedded with the identical model for cosine scores to be valid.
SEM_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
sem_model = SentenceTransformer(SEM_MODEL_NAME)

# Thread IDs for dropdown. sorted() already returns a new list from any
# iterable, so the original's redundant list() wrapper is removed (C414).
THREAD_OPTIONS = sorted(threads.keys())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|