Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files
- app.py +7 -46
- requirements.txt +0 -1
- selfrag_core.py +0 -0
app.py
CHANGED
|
@@ -12,11 +12,9 @@ from dataclasses import dataclass, field
|
|
| 12 |
from pathlib import Path
|
| 13 |
from typing import Dict, List, Optional, Tuple
|
| 14 |
|
| 15 |
-
import faiss
|
| 16 |
import gradio as gr
|
| 17 |
-
from sentence_transformers import SentenceTransformer
|
| 18 |
|
| 19 |
-
from
|
| 20 |
|
| 21 |
|
| 22 |
APP_NAME = "SourceTruth"
|
|
@@ -163,44 +161,15 @@ class EmptyRetriever:
|
|
| 163 |
return []
|
| 164 |
|
| 165 |
|
| 166 |
-
class SessionRetriever:
|
| 167 |
-
def __init__(self, chunks: List[Chunk]
|
| 168 |
-
|
| 169 |
-
self.
|
| 170 |
-
self.index = None
|
| 171 |
-
self._build_index()
|
| 172 |
-
|
| 173 |
-
def _build_index(self):
|
| 174 |
-
if not self.chunks:
|
| 175 |
-
return
|
| 176 |
-
texts = [f"{chunk.source_file} {chunk.text}" for chunk in self.chunks]
|
| 177 |
-
embeddings = self._encoder.encode(
|
| 178 |
-
texts,
|
| 179 |
-
convert_to_numpy=True,
|
| 180 |
-
normalize_embeddings=True,
|
| 181 |
-
show_progress_bar=False,
|
| 182 |
-
).astype("float32")
|
| 183 |
-
self.index = faiss.IndexFlatIP(embeddings.shape[1])
|
| 184 |
-
self.index.add(embeddings)
|
| 185 |
-
|
| 186 |
-
def retrieve(self, query: str, k: int = K_PASSAGES) -> List[Chunk]:
|
| 187 |
-
if self.index is None:
|
| 188 |
-
return []
|
| 189 |
-
query_embedding = self._encoder.encode(
|
| 190 |
-
[query],
|
| 191 |
-
convert_to_numpy=True,
|
| 192 |
-
normalize_embeddings=True,
|
| 193 |
-
show_progress_bar=False,
|
| 194 |
-
).astype("float32")
|
| 195 |
-
_, indices = self.index.search(query_embedding, min(k, len(self.chunks)))
|
| 196 |
-
return [self.chunks[i] for i in indices[0] if 0 <= i < len(self.chunks)]
|
| 197 |
|
| 198 |
|
| 199 |
SESSIONS: Dict[str, SessionData] = {}
|
| 200 |
SESSIONS_LOCK = threading.Lock()
|
| 201 |
MODEL_LOCK = threading.Lock()
|
| 202 |
-
EMBEDDER_LOCK = threading.Lock()
|
| 203 |
-
GLOBAL_EMBEDDER: Optional[SentenceTransformer] = None
|
| 204 |
GLOBAL_AGENT_TEMPLATE: Optional[AgenticSelfRAG] = None
|
| 205 |
|
| 206 |
|
|
@@ -388,14 +357,6 @@ def cleanup_expired_sessions():
|
|
| 388 |
log_event("session_expired", session_id=session_id)
|
| 389 |
|
| 390 |
|
| 391 |
-
def get_embedder() -> SentenceTransformer:
|
| 392 |
-
global GLOBAL_EMBEDDER
|
| 393 |
-
with EMBEDDER_LOCK:
|
| 394 |
-
if GLOBAL_EMBEDDER is None:
|
| 395 |
-
GLOBAL_EMBEDDER = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
|
| 396 |
-
return GLOBAL_EMBEDDER
|
| 397 |
-
|
| 398 |
-
|
| 399 |
def get_agent_template() -> AgenticSelfRAG:
|
| 400 |
global GLOBAL_AGENT_TEMPLATE
|
| 401 |
with MODEL_LOCK:
|
|
@@ -935,7 +896,7 @@ def build_session(file_path: str) -> SessionData:
|
|
| 935 |
|
| 936 |
file_hash = sha256_file(dest_path)
|
| 937 |
file_size_bytes = os.path.getsize(dest_path)
|
| 938 |
-
retriever = SessionRetriever(chunks
|
| 939 |
agent = build_session_agent(retriever)
|
| 940 |
|
| 941 |
return SessionData(
|
|
@@ -1091,7 +1052,7 @@ def build_corpus_session() -> SessionData:
|
|
| 1091 |
if file_name == "02_Validation_Master_Plan.pdf":
|
| 1092 |
structured["vmp_table"] = parse_vmp_table(doc_pages)
|
| 1093 |
|
| 1094 |
-
retriever = SessionRetriever(chunks
|
| 1095 |
agent = build_session_agent(retriever)
|
| 1096 |
corpus_hash = hashlib.sha256("|".join(file_hash_parts).encode("utf-8")).hexdigest()
|
| 1097 |
return SessionData(
|
|
|
|
| 12 |
from pathlib import Path
|
| 13 |
from typing import Dict, List, Optional, Tuple
|
| 14 |
|
|
|
|
| 15 |
import gradio as gr
|
|
|
|
| 16 |
|
| 17 |
+
from selfrag_core import AgenticSelfRAG, Chunk, K_PASSAGES, LightweightRetriever
|
| 18 |
|
| 19 |
|
| 20 |
APP_NAME = "SourceTruth"
|
|
|
|
| 161 |
return []
|
| 162 |
|
| 163 |
|
| 164 |
+
class SessionRetriever(LightweightRetriever):
|
| 165 |
+
def __init__(self, chunks: List[Chunk]):
|
| 166 |
+
super().__init__(device="cpu")
|
| 167 |
+
self.index_chunks(chunks)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
|
| 170 |
SESSIONS: Dict[str, SessionData] = {}
|
| 171 |
SESSIONS_LOCK = threading.Lock()
|
| 172 |
MODEL_LOCK = threading.Lock()
|
|
|
|
|
|
|
| 173 |
GLOBAL_AGENT_TEMPLATE: Optional[AgenticSelfRAG] = None
|
| 174 |
|
| 175 |
|
|
|
|
| 357 |
log_event("session_expired", session_id=session_id)
|
| 358 |
|
| 359 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
def get_agent_template() -> AgenticSelfRAG:
|
| 361 |
global GLOBAL_AGENT_TEMPLATE
|
| 362 |
with MODEL_LOCK:
|
|
|
|
| 896 |
|
| 897 |
file_hash = sha256_file(dest_path)
|
| 898 |
file_size_bytes = os.path.getsize(dest_path)
|
| 899 |
+
retriever = SessionRetriever(chunks)
|
| 900 |
agent = build_session_agent(retriever)
|
| 901 |
|
| 902 |
return SessionData(
|
|
|
|
| 1052 |
if file_name == "02_Validation_Master_Plan.pdf":
|
| 1053 |
structured["vmp_table"] = parse_vmp_table(doc_pages)
|
| 1054 |
|
| 1055 |
+
retriever = SessionRetriever(chunks)
|
| 1056 |
agent = build_session_agent(retriever)
|
| 1057 |
corpus_hash = hashlib.sha256("|".join(file_hash_parts).encode("utf-8")).hexdigest()
|
| 1058 |
return SessionData(
|
requirements.txt
CHANGED
|
@@ -6,6 +6,5 @@ bitsandbytes
|
|
| 6 |
sentence-transformers
|
| 7 |
sentencepiece
|
| 8 |
protobuf
|
| 9 |
-
faiss-cpu
|
| 10 |
pymupdf
|
| 11 |
pypdf
|
|
|
|
| 6 |
sentence-transformers
|
| 7 |
sentencepiece
|
| 8 |
protobuf
|
|
|
|
| 9 |
pymupdf
|
| 10 |
pypdf
|
selfrag_core.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|