Spaces:
Running
Running
Update rag_engine.py
Browse files- rag_engine.py +31 -43
rag_engine.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
"""
|
| 2 |
-
RAG Engine
|
| 3 |
-
Embeddings :
|
| 4 |
Vector DB : ChromaDB (local)
|
| 5 |
-
LLM : HuggingFace Router API
|
| 6 |
"""
|
| 7 |
|
| 8 |
import os
|
|
@@ -14,25 +14,23 @@ from typing import Tuple, List
|
|
| 14 |
|
| 15 |
from chromadb.config import Settings
|
| 16 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 17 |
-
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 18 |
from langchain_community.vectorstores import Chroma
|
| 19 |
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
| 20 |
|
| 21 |
# Configuration
|
| 22 |
-
EMBED_MODEL = "
|
| 23 |
-
CHUNK_SIZE =
|
| 24 |
-
CHUNK_OVERLAP =
|
| 25 |
-
TOP_K =
|
| 26 |
COLLECTION_NAME = "docmind_collection"
|
| 27 |
-
CHROMA_DIR = "
|
| 28 |
|
| 29 |
-
#
|
| 30 |
HF_API_URL = "https://router.huggingface.co/v1/chat/completions"
|
| 31 |
|
| 32 |
-
# Non-reasoning models only
|
| 33 |
CANDIDATE_MODELS = [
|
| 34 |
"mistralai/Mistral-7B-Instruct-v0.3:auto",
|
| 35 |
-
"microsoft/Phi-3.5-mini-instruct:auto",
|
| 36 |
"meta-llama/Llama-3.2-3B-Instruct:auto",
|
| 37 |
"meta-llama/Llama-3.1-8B-Instruct:auto",
|
| 38 |
]
|
|
@@ -51,6 +49,9 @@ class RAGEngine:
|
|
| 51 |
@property
|
| 52 |
def embeddings(self):
|
| 53 |
if self._embeddings is None:
|
|
|
|
|
|
|
|
|
|
| 54 |
self._embeddings = HuggingFaceEmbeddings(
|
| 55 |
model_name=EMBED_MODEL,
|
| 56 |
model_kwargs={"device": "cpu"},
|
|
@@ -72,6 +73,15 @@ class RAGEngine:
|
|
| 72 |
for doc in raw_docs:
|
| 73 |
doc.metadata["source"] = name or os.path.basename(path)
|
| 74 |
chunks = self._splitter.split_documents(raw_docs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
self._vectorstore = Chroma.from_documents(
|
| 76 |
documents=chunks,
|
| 77 |
embedding=self.embeddings,
|
|
@@ -84,9 +94,10 @@ class RAGEngine:
|
|
| 84 |
def query(self, question: str) -> Tuple[str, List[str]]:
|
| 85 |
if self._vectorstore is None:
|
| 86 |
return "Please upload a document first.", []
|
|
|
|
| 87 |
retriever = self._vectorstore.as_retriever(
|
| 88 |
search_type="mmr",
|
| 89 |
-
search_kwargs={"k": TOP_K, "fetch_k": TOP_K *
|
| 90 |
)
|
| 91 |
docs = retriever.invoke(question)
|
| 92 |
context = "\n\n---\n\n".join(
|
|
@@ -106,13 +117,9 @@ class RAGEngine:
|
|
| 106 |
|
| 107 |
system_prompt = (
|
| 108 |
"You are DocMind, a document Q&A assistant. "
|
| 109 |
-
"Answer the question using only the document context
|
| 110 |
-
"
|
| 111 |
-
"No preamble. No reasoning. No 'the user is asking'. "
|
| 112 |
-
"No 'let me', 'first', 'okay', or 'I need to'. "
|
| 113 |
-
"Just answer."
|
| 114 |
)
|
| 115 |
-
|
| 116 |
user_message = (
|
| 117 |
"Context:\n" + context +
|
| 118 |
"\n\n---\nQuestion: " + question +
|
|
@@ -157,57 +164,38 @@ class RAGEngine:
|
|
| 157 |
continue
|
| 158 |
|
| 159 |
return (
|
| 160 |
-
"AI
|
| 161 |
+ extract_best(question, context)
|
| 162 |
-
+ "\n\n(
|
| 163 |
)
|
| 164 |
|
| 165 |
|
| 166 |
def strip_thinking(text: str) -> str:
|
| 167 |
-
"""
|
| 168 |
-
Hard-strip any chain-of-thought reasoning that leaks into the response.
|
| 169 |
-
Keeps only the content that appears after all reasoning paragraphs end.
|
| 170 |
-
"""
|
| 171 |
-
|
| 172 |
-
# Pattern 1: Remove <think>...</think> blocks (some models use this tag)
|
| 173 |
text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()
|
| 174 |
-
|
| 175 |
-
# Pattern 2: If text starts with reasoning phrases, find where real answer begins
|
| 176 |
reasoning_starters = [
|
| 177 |
"okay", "ok,", "alright", "let me", "let's", "i need", "i will",
|
| 178 |
"i'll", "first,", "so,", "the user", "looking at", "going through",
|
| 179 |
"based on the chunk", "parsing", "to answer", "in order to",
|
| 180 |
-
"i should", "i must", "my task", "the question",
|
| 181 |
]
|
| 182 |
-
|
| 183 |
lines = text.split("\n")
|
| 184 |
clean = []
|
| 185 |
found_real = False
|
| 186 |
-
|
| 187 |
for line in lines:
|
| 188 |
-
|
| 189 |
-
lower = stripped.lower()
|
| 190 |
is_thinking = any(lower.startswith(p) for p in reasoning_starters)
|
| 191 |
-
|
| 192 |
if not found_real:
|
| 193 |
-
if
|
| 194 |
found_real = True
|
| 195 |
clean.append(line)
|
| 196 |
else:
|
| 197 |
clean.append(line)
|
| 198 |
-
|
| 199 |
result = "\n".join(clean).strip()
|
| 200 |
-
|
| 201 |
-
# Pattern 3: Last resort — if response has many paragraphs of reasoning
|
| 202 |
-
# take only the last paragraph as the final answer
|
| 203 |
if not result or len(result) > 1500:
|
| 204 |
paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
|
| 205 |
if paragraphs:
|
| 206 |
last = paragraphs[-1]
|
| 207 |
-
# Only use last paragraph if it looks like an answer (short enough)
|
| 208 |
if len(last) < 800:
|
| 209 |
return last
|
| 210 |
-
|
| 211 |
return result if result else text
|
| 212 |
|
| 213 |
|
|
@@ -227,4 +215,4 @@ def extract_best(question: str, context: str) -> str:
|
|
| 227 |
|
| 228 |
|
| 229 |
def get_suffix(name: str) -> str:
|
| 230 |
-
return os.path.splitext(name)[-1].lower() or ".txt"
|
|
|
|
| 1 |
"""
|
| 2 |
+
RAG Engine - Memory optimized for HuggingFace free tier
|
| 3 |
+
Embeddings : all-MiniLM-L6-v2 via sentence-transformers (CPU, ~90MB)
|
| 4 |
Vector DB : ChromaDB (local)
|
| 5 |
+
LLM : HuggingFace Router API (no local model loaded)
|
| 6 |
"""
|
| 7 |
|
| 8 |
import os
|
|
|
|
| 14 |
|
| 15 |
from chromadb.config import Settings
|
| 16 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
| 17 |
from langchain_community.vectorstores import Chroma
|
| 18 |
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
| 19 |
|
| 20 |
# Configuration
# Small sentence-transformers model (~90MB) — fits CPU-only free-tier hosts.
EMBED_MODEL = "all-MiniLM-L6-v2"
CHUNK_SIZE = 600        # characters per chunk handed to the text splitter
CHUNK_OVERLAP = 100     # characters shared between consecutive chunks
TOP_K = 3               # number of chunks retrieved per query
COLLECTION_NAME = "docmind_collection"
# Local ChromaDB persistence dir; /tmp is ephemeral on hosted Spaces.
CHROMA_DIR = "/tmp/chroma_db"

# HF Router URL
HF_API_URL = "https://router.huggingface.co/v1/chat/completions"

# Non-reasoning models only — tried in order until one responds.
CANDIDATE_MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.3:auto",
    "meta-llama/Llama-3.2-3B-Instruct:auto",
    "meta-llama/Llama-3.1-8B-Instruct:auto",
]
|
|
|
|
| 49 |
@property
|
| 50 |
def embeddings(self):
|
| 51 |
if self._embeddings is None:
|
| 52 |
+
# Use sentence-transformers directly - lighter than langchain wrapper
|
| 53 |
+
from sentence_transformers import SentenceTransformer
|
| 54 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 55 |
self._embeddings = HuggingFaceEmbeddings(
|
| 56 |
model_name=EMBED_MODEL,
|
| 57 |
model_kwargs={"device": "cpu"},
|
|
|
|
| 73 |
for doc in raw_docs:
|
| 74 |
doc.metadata["source"] = name or os.path.basename(path)
|
| 75 |
chunks = self._splitter.split_documents(raw_docs)
|
| 76 |
+
|
| 77 |
+
# Clear old vectorstore to free memory before creating new one
|
| 78 |
+
if self._vectorstore is not None:
|
| 79 |
+
try:
|
| 80 |
+
self._vectorstore._client.reset()
|
| 81 |
+
except Exception:
|
| 82 |
+
pass
|
| 83 |
+
self._vectorstore = None
|
| 84 |
+
|
| 85 |
self._vectorstore = Chroma.from_documents(
|
| 86 |
documents=chunks,
|
| 87 |
embedding=self.embeddings,
|
|
|
|
| 94 |
def query(self, question: str) -> Tuple[str, List[str]]:
|
| 95 |
if self._vectorstore is None:
|
| 96 |
return "Please upload a document first.", []
|
| 97 |
+
|
| 98 |
retriever = self._vectorstore.as_retriever(
|
| 99 |
search_type="mmr",
|
| 100 |
+
search_kwargs={"k": TOP_K, "fetch_k": TOP_K * 2},
|
| 101 |
)
|
| 102 |
docs = retriever.invoke(question)
|
| 103 |
context = "\n\n---\n\n".join(
|
|
|
|
| 117 |
|
| 118 |
system_prompt = (
|
| 119 |
"You are DocMind, a document Q&A assistant. "
|
| 120 |
+
"Answer the question using only the document context. "
|
| 121 |
+
"Be short and direct. No preamble. No reasoning. Just answer."
|
|
|
|
|
|
|
|
|
|
| 122 |
)
|
|
|
|
| 123 |
user_message = (
|
| 124 |
"Context:\n" + context +
|
| 125 |
"\n\n---\nQuestion: " + question +
|
|
|
|
| 164 |
continue
|
| 165 |
|
| 166 |
return (
|
| 167 |
+
"AI unavailable. Most relevant excerpt:\n\n"
|
| 168 |
+ extract_best(question, context)
|
| 169 |
+
+ "\n\n(Error: " + last_error + ")"
|
| 170 |
)
|
| 171 |
|
| 172 |
|
| 173 |
def strip_thinking(text: str) -> str:
    """
    Strip chain-of-thought reasoning that leaks into a model response.

    Three passes:
      1. Remove explicit ``<think>...</think>`` blocks (tags used by some
         models).
      2. Drop leading lines that start with known reasoning phrases
         ("okay", "let me", ...) until the first real answer line appears;
         everything after that line is kept verbatim.
      3. Last resort: if nothing survives, or the surviving text is
         implausibly long (> 1500 chars), fall back to the final paragraph
         of the original text — but only when it is short enough
         (< 800 chars) to look like a direct answer.

    Returns the cleaned text, or the original text unchanged if cleaning
    removed everything.
    """
    # Pass 1: explicit reasoning tags.
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()

    # Pass 2: leading reasoning lines. A tuple lets str.startswith test
    # every prefix in one C-level call instead of a per-prefix generator.
    reasoning_starters = (
        "okay", "ok,", "alright", "let me", "let's", "i need", "i will",
        "i'll", "first,", "so,", "the user", "looking at", "going through",
        "based on the chunk", "parsing", "to answer", "in order to",
    )

    clean = []
    found_real = False
    for line in text.split("\n"):
        if found_real:
            clean.append(line)
            continue
        stripped = line.strip()
        # First non-blank line that is not a reasoning phrase marks the
        # start of the real answer.
        if stripped and not stripped.lower().startswith(reasoning_starters):
            found_real = True
            clean.append(line)

    result = "\n".join(clean).strip()

    # Pass 3: everything was classified as reasoning, or the "answer" is
    # suspiciously long — take the last paragraph of the original text.
    if not result or len(result) > 1500:
        paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
        if paragraphs and len(paragraphs[-1]) < 800:
            return paragraphs[-1]

    return result if result else text
|
| 200 |
|
| 201 |
|
|
|
|
| 215 |
|
| 216 |
|
| 217 |
def get_suffix(name: str) -> str:
    """Return the lowercase file extension of *name*, or ".txt" if none."""
    extension = os.path.splitext(name)[-1]
    if not extension:
        return ".txt"
    return extension.lower()
|