Ryanfafa committed on
Commit
4b6c56e
·
verified ·
1 Parent(s): 6a019e1

Update rag_engine.py

Browse files
Files changed (1) hide show
  1. rag_engine.py +31 -43
rag_engine.py CHANGED
@@ -1,8 +1,8 @@
1
  """
2
- RAG Engine
3
- Embeddings : sentence-transformers/all-MiniLM-L6-v2
4
  Vector DB : ChromaDB (local)
5
- LLM : HuggingFace Router API - router.huggingface.co/v1
6
  """
7
 
8
  import os
@@ -14,25 +14,23 @@ from typing import Tuple, List
14
 
15
  from chromadb.config import Settings
16
  from langchain.text_splitter import RecursiveCharacterTextSplitter
17
- from langchain_community.embeddings import HuggingFaceEmbeddings
18
  from langchain_community.vectorstores import Chroma
19
  from langchain_community.document_loaders import PyPDFLoader, TextLoader
20
 
21
  # Configuration
22
- EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
23
- CHUNK_SIZE = 800
24
- CHUNK_OVERLAP = 150
25
- TOP_K = 4
26
  COLLECTION_NAME = "docmind_collection"
27
- CHROMA_DIR = "./chroma_db"
28
 
29
- # Correct HF Router URL
30
  HF_API_URL = "https://router.huggingface.co/v1/chat/completions"
31
 
32
- # Non-reasoning models only (no chain-of-thought leakage)
33
  CANDIDATE_MODELS = [
34
  "mistralai/Mistral-7B-Instruct-v0.3:auto",
35
- "microsoft/Phi-3.5-mini-instruct:auto",
36
  "meta-llama/Llama-3.2-3B-Instruct:auto",
37
  "meta-llama/Llama-3.1-8B-Instruct:auto",
38
  ]
@@ -51,6 +49,9 @@ class RAGEngine:
51
  @property
52
  def embeddings(self):
53
  if self._embeddings is None:
 
 
 
54
  self._embeddings = HuggingFaceEmbeddings(
55
  model_name=EMBED_MODEL,
56
  model_kwargs={"device": "cpu"},
@@ -72,6 +73,15 @@ class RAGEngine:
72
  for doc in raw_docs:
73
  doc.metadata["source"] = name or os.path.basename(path)
74
  chunks = self._splitter.split_documents(raw_docs)
 
 
 
 
 
 
 
 
 
75
  self._vectorstore = Chroma.from_documents(
76
  documents=chunks,
77
  embedding=self.embeddings,
@@ -84,9 +94,10 @@ class RAGEngine:
84
  def query(self, question: str) -> Tuple[str, List[str]]:
85
  if self._vectorstore is None:
86
  return "Please upload a document first.", []
 
87
  retriever = self._vectorstore.as_retriever(
88
  search_type="mmr",
89
- search_kwargs={"k": TOP_K, "fetch_k": TOP_K * 3},
90
  )
91
  docs = retriever.invoke(question)
92
  context = "\n\n---\n\n".join(
@@ -106,13 +117,9 @@ class RAGEngine:
106
 
107
  system_prompt = (
108
  "You are DocMind, a document Q&A assistant. "
109
- "Answer the question using only the document context below. "
110
- "Reply with a short, direct answer only. "
111
- "No preamble. No reasoning. No 'the user is asking'. "
112
- "No 'let me', 'first', 'okay', or 'I need to'. "
113
- "Just answer."
114
  )
115
-
116
  user_message = (
117
  "Context:\n" + context +
118
  "\n\n---\nQuestion: " + question +
@@ -157,57 +164,38 @@ class RAGEngine:
157
  continue
158
 
159
  return (
160
- "AI answer unavailable. Most relevant excerpt:\n\n"
161
  + extract_best(question, context)
162
- + "\n\n(Last error: " + last_error + ")"
163
  )
164
 
165
 
166
  def strip_thinking(text: str) -> str:
167
- """
168
- Hard-strip any chain-of-thought reasoning that leaks into the response.
169
- Keeps only the content that appears after all reasoning paragraphs end.
170
- """
171
-
172
- # Pattern 1: Remove <think>...</think> blocks (some models use this tag)
173
  text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()
174
-
175
- # Pattern 2: If text starts with reasoning phrases, find where real answer begins
176
  reasoning_starters = [
177
  "okay", "ok,", "alright", "let me", "let's", "i need", "i will",
178
  "i'll", "first,", "so,", "the user", "looking at", "going through",
179
  "based on the chunk", "parsing", "to answer", "in order to",
180
- "i should", "i must", "my task", "the question",
181
  ]
182
-
183
  lines = text.split("\n")
184
  clean = []
185
  found_real = False
186
-
187
  for line in lines:
188
- stripped = line.strip()
189
- lower = stripped.lower()
190
  is_thinking = any(lower.startswith(p) for p in reasoning_starters)
191
-
192
  if not found_real:
193
- if stripped and not is_thinking:
194
  found_real = True
195
  clean.append(line)
196
  else:
197
  clean.append(line)
198
-
199
  result = "\n".join(clean).strip()
200
-
201
- # Pattern 3: Last resort — if response has many paragraphs of reasoning
202
- # take only the last paragraph as the final answer
203
  if not result or len(result) > 1500:
204
  paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
205
  if paragraphs:
206
  last = paragraphs[-1]
207
- # Only use last paragraph if it looks like an answer (short enough)
208
  if len(last) < 800:
209
  return last
210
-
211
  return result if result else text
212
 
213
 
@@ -227,4 +215,4 @@ def extract_best(question: str, context: str) -> str:
227
 
228
 
229
  def get_suffix(name: str) -> str:
230
- return os.path.splitext(name)[-1].lower() or ".txt"
 
1
  """
2
+ RAG Engine - Memory optimized for HuggingFace free tier
3
+ Embeddings : all-MiniLM-L6-v2 via sentence-transformers (CPU, ~90MB)
4
  Vector DB : ChromaDB (local)
5
+ LLM : HuggingFace Router API (no local model loaded)
6
  """
7
 
8
  import os
 
14
 
15
  from chromadb.config import Settings
16
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
17
  from langchain_community.vectorstores import Chroma
18
  from langchain_community.document_loaders import PyPDFLoader, TextLoader
19
 
20
  # Configuration
21
+ EMBED_MODEL = "all-MiniLM-L6-v2"
22
+ CHUNK_SIZE = 600
23
+ CHUNK_OVERLAP = 100
24
+ TOP_K = 3
25
  COLLECTION_NAME = "docmind_collection"
26
+ CHROMA_DIR = "/tmp/chroma_db"
27
 
28
+ # HF Router URL
29
  HF_API_URL = "https://router.huggingface.co/v1/chat/completions"
30
 
31
+ # Non-reasoning models only
32
  CANDIDATE_MODELS = [
33
  "mistralai/Mistral-7B-Instruct-v0.3:auto",
 
34
  "meta-llama/Llama-3.2-3B-Instruct:auto",
35
  "meta-llama/Llama-3.1-8B-Instruct:auto",
36
  ]
 
49
  @property
50
  def embeddings(self):
51
  if self._embeddings is None:
52
+ # Imported here, on first access, rather than at module import; the model is still loaded through langchain's HuggingFaceEmbeddings wrapper
53
+ from sentence_transformers import SentenceTransformer
54
+ from langchain_community.embeddings import HuggingFaceEmbeddings
55
  self._embeddings = HuggingFaceEmbeddings(
56
  model_name=EMBED_MODEL,
57
  model_kwargs={"device": "cpu"},
 
73
  for doc in raw_docs:
74
  doc.metadata["source"] = name or os.path.basename(path)
75
  chunks = self._splitter.split_documents(raw_docs)
76
+
77
+ # Clear old vectorstore to free memory before creating new one
78
+ if self._vectorstore is not None:
79
+ try:
80
+ self._vectorstore._client.reset()
81
+ except Exception:
82
+ pass
83
+ self._vectorstore = None
84
+
85
  self._vectorstore = Chroma.from_documents(
86
  documents=chunks,
87
  embedding=self.embeddings,
 
94
  def query(self, question: str) -> Tuple[str, List[str]]:
95
  if self._vectorstore is None:
96
  return "Please upload a document first.", []
97
+
98
  retriever = self._vectorstore.as_retriever(
99
  search_type="mmr",
100
+ search_kwargs={"k": TOP_K, "fetch_k": TOP_K * 2},
101
  )
102
  docs = retriever.invoke(question)
103
  context = "\n\n---\n\n".join(
 
117
 
118
  system_prompt = (
119
  "You are DocMind, a document Q&A assistant. "
120
+ "Answer the question using only the document context. "
121
+ "Be short and direct. No preamble. No reasoning. Just answer."
 
 
 
122
  )
 
123
  user_message = (
124
  "Context:\n" + context +
125
  "\n\n---\nQuestion: " + question +
 
164
  continue
165
 
166
  return (
167
+ "AI unavailable. Most relevant excerpt:\n\n"
168
  + extract_best(question, context)
169
+ + "\n\n(Error: " + last_error + ")"
170
  )
171
 
172
 
173
  def strip_thinking(text: str) -> str:
 
 
 
 
 
 
174
  text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()
 
 
175
  reasoning_starters = [
176
  "okay", "ok,", "alright", "let me", "let's", "i need", "i will",
177
  "i'll", "first,", "so,", "the user", "looking at", "going through",
178
  "based on the chunk", "parsing", "to answer", "in order to",
 
179
  ]
 
180
  lines = text.split("\n")
181
  clean = []
182
  found_real = False
 
183
  for line in lines:
184
+ lower = line.strip().lower()
 
185
  is_thinking = any(lower.startswith(p) for p in reasoning_starters)
 
186
  if not found_real:
187
+ if line.strip() and not is_thinking:
188
  found_real = True
189
  clean.append(line)
190
  else:
191
  clean.append(line)
 
192
  result = "\n".join(clean).strip()
 
 
 
193
  if not result or len(result) > 1500:
194
  paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
195
  if paragraphs:
196
  last = paragraphs[-1]
 
197
  if len(last) < 800:
198
  return last
 
199
  return result if result else text
200
 
201
 
 
215
 
216
 
217
  def get_suffix(name: str) -> str:
218
+ return os.path.splitext(name)[-1].lower() or ".txt"