Spaces:
Sleeping
Sleeping
Update api/rag_engine.py
Browse files- api/rag_engine.py +35 -23
api/rag_engine.py
CHANGED
|
@@ -8,7 +8,9 @@ Chunk format (MVP):
|
|
| 8 |
{
|
| 9 |
"text": str,
|
| 10 |
"source_file": str,
|
| 11 |
-
"section": str
|
|
|
|
|
|
|
| 12 |
}
|
| 13 |
"""
|
| 14 |
|
|
@@ -20,14 +22,11 @@ from pypdf import PdfReader
|
|
| 20 |
from docx import Document
|
| 21 |
from pptx import Presentation
|
| 22 |
|
| 23 |
-
#
|
| 24 |
-
|
| 25 |
-
|
| 26 |
|
| 27 |
|
| 28 |
-
# ----------------------------
|
| 29 |
-
# Helpers
|
| 30 |
-
# ----------------------------
|
| 31 |
def _clean_text(s: str) -> str:
|
| 32 |
s = (s or "").replace("\r", "\n")
|
| 33 |
s = re.sub(r"\n{3,}", "\n\n", s)
|
|
@@ -36,9 +35,9 @@ def _clean_text(s: str) -> str:
|
|
| 36 |
|
| 37 |
def _split_into_chunks(text: str, max_chars: int = 1400) -> List[str]:
|
| 38 |
"""
|
| 39 |
-
|
| 40 |
- split by blank lines
|
| 41 |
-
-
|
| 42 |
"""
|
| 43 |
text = _clean_text(text)
|
| 44 |
if not text:
|
|
@@ -69,14 +68,18 @@ def _file_label(path: str) -> str:
|
|
| 69 |
return os.path.basename(path) if path else "uploaded_file"
|
| 70 |
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
# ----------------------------
|
| 73 |
# Parsers
|
| 74 |
# ----------------------------
|
| 75 |
def _parse_pdf_to_text(path: str) -> List[Tuple[str, str]]:
|
| 76 |
-
"""
|
| 77 |
-
Returns list of (section_label, text)
|
| 78 |
-
section_label uses page numbers.
|
| 79 |
-
"""
|
| 80 |
reader = PdfReader(path)
|
| 81 |
out: List[Tuple[str, str]] = []
|
| 82 |
for i, page in enumerate(reader.pages):
|
|
@@ -125,7 +128,6 @@ def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
|
|
| 125 |
ext = os.path.splitext(path)[1].lower()
|
| 126 |
source_file = _file_label(path)
|
| 127 |
|
| 128 |
-
# Parse into (section, text blocks)
|
| 129 |
sections: List[Tuple[str, str]] = []
|
| 130 |
try:
|
| 131 |
if ext == ".pdf":
|
|
@@ -138,7 +140,6 @@ def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
|
|
| 138 |
with open(path, "r", encoding="utf-8", errors="ignore") as f:
|
| 139 |
sections = [("text", _clean_text(f.read()))]
|
| 140 |
else:
|
| 141 |
-
# Unsupported file type: return empty (safe)
|
| 142 |
print(f"[rag_engine] unsupported file type: {ext}")
|
| 143 |
return []
|
| 144 |
except Exception as e:
|
|
@@ -147,14 +148,16 @@ def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
|
|
| 147 |
|
| 148 |
chunks: List[Dict] = []
|
| 149 |
for section, text in sections:
|
| 150 |
-
# Split section text into smaller chunks
|
| 151 |
for j, piece in enumerate(_split_into_chunks(text), start=1):
|
|
|
|
|
|
|
| 152 |
chunks.append(
|
| 153 |
{
|
| 154 |
"text": piece,
|
| 155 |
"source_file": source_file,
|
| 156 |
"section": f"{section}#{j}",
|
| 157 |
"doc_type": doc_type,
|
|
|
|
| 158 |
}
|
| 159 |
)
|
| 160 |
|
|
@@ -162,33 +165,42 @@ def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
|
|
| 162 |
|
| 163 |
|
| 164 |
def retrieve_relevant_chunks(
|
| 165 |
-
query: str,
|
|
|
|
|
|
|
|
|
|
| 166 |
) -> Tuple[str, List[Dict]]:
|
| 167 |
"""
|
| 168 |
-
|
| 169 |
-
- score by token overlap
|
| 170 |
- return top-k chunks concatenated as context
|
| 171 |
"""
|
| 172 |
query = _clean_text(query)
|
| 173 |
if not query or not chunks:
|
| 174 |
return "", []
|
| 175 |
|
| 176 |
-
q_tokens =
|
| 177 |
if not q_tokens:
|
| 178 |
return "", []
|
| 179 |
|
| 180 |
scored: List[Tuple[int, Dict]] = []
|
| 181 |
for c in chunks:
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
score = len(q_tokens.intersection(t_tokens))
|
| 185 |
if score > 0:
|
| 186 |
scored.append((score, c))
|
| 187 |
|
|
|
|
|
|
|
|
|
|
| 188 |
scored.sort(key=lambda x: x[0], reverse=True)
|
| 189 |
top = [c for _, c in scored[:k]]
|
| 190 |
|
| 191 |
-
# Build context text
|
| 192 |
buf_parts: List[str] = []
|
| 193 |
used: List[Dict] = []
|
| 194 |
total = 0
|
|
|
|
| 8 |
{
|
| 9 |
"text": str,
|
| 10 |
"source_file": str,
|
| 11 |
+
"section": str,
|
| 12 |
+
"doc_type": str,
|
| 13 |
+
"_tokens": frozenset[str] # ✅ precomputed for fast retrieval (in-memory)
|
| 14 |
}
|
| 15 |
"""
|
| 16 |
|
|
|
|
| 22 |
from docx import Document
|
| 23 |
from pptx import Presentation
|
| 24 |
|
| 25 |
+
# precompiled regex for speed
|
| 26 |
+
_WORD_RE = re.compile(r"[a-zA-Z0-9]+")
|
| 27 |
+
_WS_RE = re.compile(r"\s+")
|
| 28 |
|
| 29 |
|
|
|
|
|
|
|
|
|
|
| 30 |
def _clean_text(s: str) -> str:
|
| 31 |
s = (s or "").replace("\r", "\n")
|
| 32 |
s = re.sub(r"\n{3,}", "\n\n", s)
|
|
|
|
| 35 |
|
| 36 |
def _split_into_chunks(text: str, max_chars: int = 1400) -> List[str]:
|
| 37 |
"""
|
| 38 |
+
Deterministic chunker:
|
| 39 |
- split by blank lines
|
| 40 |
+
- pack into <= max_chars
|
| 41 |
"""
|
| 42 |
text = _clean_text(text)
|
| 43 |
if not text:
|
|
|
|
| 68 |
return os.path.basename(path) if path else "uploaded_file"
|
| 69 |
|
| 70 |
|
| 71 |
+
def _tokenize(s: str) -> frozenset:
    """Return the distinct lowercase ASCII alphanumeric tokens found in *s*.

    The input is lowercased and every maximal run matched by ``_WORD_RE``
    (``[a-zA-Z0-9]+``) becomes one token. ``None`` or an empty string
    yields an empty ``frozenset``.

    Args:
        s: Text to tokenize; falsy values are treated as the empty string.

    Returns:
        A ``frozenset`` of token strings (duplicates collapsed, unordered).
    """
    # No whitespace pre-normalization: _WORD_RE never matches whitespace, so
    # collapsing it first cannot change the tokens and only costs an extra
    # regex pass over the whole text. findall("") already returns [], so the
    # empty case needs no special branch either.
    return frozenset(_WORD_RE.findall((s or "").lower()))
|
| 77 |
+
|
| 78 |
+
|
| 79 |
# ----------------------------
|
| 80 |
# Parsers
|
| 81 |
# ----------------------------
|
| 82 |
def _parse_pdf_to_text(path: str) -> List[Tuple[str, str]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
reader = PdfReader(path)
|
| 84 |
out: List[Tuple[str, str]] = []
|
| 85 |
for i, page in enumerate(reader.pages):
|
|
|
|
| 128 |
ext = os.path.splitext(path)[1].lower()
|
| 129 |
source_file = _file_label(path)
|
| 130 |
|
|
|
|
| 131 |
sections: List[Tuple[str, str]] = []
|
| 132 |
try:
|
| 133 |
if ext == ".pdf":
|
|
|
|
| 140 |
with open(path, "r", encoding="utf-8", errors="ignore") as f:
|
| 141 |
sections = [("text", _clean_text(f.read()))]
|
| 142 |
else:
|
|
|
|
| 143 |
print(f"[rag_engine] unsupported file type: {ext}")
|
| 144 |
return []
|
| 145 |
except Exception as e:
|
|
|
|
| 148 |
|
| 149 |
chunks: List[Dict] = []
|
| 150 |
for section, text in sections:
|
|
|
|
| 151 |
for j, piece in enumerate(_split_into_chunks(text), start=1):
|
| 152 |
+
# ✅ precompute tokens once
|
| 153 |
+
toks = _tokenize(piece)
|
| 154 |
chunks.append(
|
| 155 |
{
|
| 156 |
"text": piece,
|
| 157 |
"source_file": source_file,
|
| 158 |
"section": f"{section}#{j}",
|
| 159 |
"doc_type": doc_type,
|
| 160 |
+
"_tokens": toks,
|
| 161 |
}
|
| 162 |
)
|
| 163 |
|
|
|
|
| 165 |
|
| 166 |
|
| 167 |
def retrieve_relevant_chunks(
|
| 168 |
+
query: str,
|
| 169 |
+
chunks: List[Dict],
|
| 170 |
+
k: int = 3, # ✅ smaller default = faster + less prompt
|
| 171 |
+
max_context_chars: int = 2200, # ✅ smaller default = faster
|
| 172 |
) -> Tuple[str, List[Dict]]:
|
| 173 |
"""
|
| 174 |
+
Fast deterministic retrieval:
|
| 175 |
+
- score by token overlap using precomputed chunk tokens
|
| 176 |
- return top-k chunks concatenated as context
|
| 177 |
"""
|
| 178 |
query = _clean_text(query)
|
| 179 |
if not query or not chunks:
|
| 180 |
return "", []
|
| 181 |
|
| 182 |
+
q_tokens = _tokenize(query)
|
| 183 |
if not q_tokens:
|
| 184 |
return "", []
|
| 185 |
|
| 186 |
scored: List[Tuple[int, Dict]] = []
|
| 187 |
for c in chunks:
|
| 188 |
+
t_tokens = c.get("_tokens")
|
| 189 |
+
if not t_tokens:
|
| 190 |
+
# fallback if older chunks exist without tokens
|
| 191 |
+
t_tokens = _tokenize(c.get("text") or "")
|
| 192 |
+
c["_tokens"] = t_tokens
|
| 193 |
+
|
| 194 |
score = len(q_tokens.intersection(t_tokens))
|
| 195 |
if score > 0:
|
| 196 |
scored.append((score, c))
|
| 197 |
|
| 198 |
+
if not scored:
|
| 199 |
+
return "", []
|
| 200 |
+
|
| 201 |
scored.sort(key=lambda x: x[0], reverse=True)
|
| 202 |
top = [c for _, c in scored[:k]]
|
| 203 |
|
|
|
|
| 204 |
buf_parts: List[str] = []
|
| 205 |
used: List[Dict] = []
|
| 206 |
total = 0
|