Ryanfafa committed on
Commit
ce4796a
·
verified ·
1 Parent(s): aa776c7

Update rag_engine.py

Browse files
Files changed (1) hide show
  1. rag_engine.py +80 -43
rag_engine.py CHANGED
@@ -1,13 +1,14 @@
1
  """
2
  RAG Engine - Memory optimized for HuggingFace free tier
3
- Embeddings : all-MiniLM-L6-v2 via sentence-transformers (CPU, ~90MB)
4
  Vector DB : ChromaDB (local)
5
- LLM : HuggingFace Router API (no local model loaded)
6
  """
7
 
8
  import os
9
  import re
10
  import json
 
11
  import tempfile
12
  import requests
13
  from typing import Tuple, List
@@ -16,23 +17,25 @@ from chromadb.config import Settings
16
  from langchain.text_splitter import RecursiveCharacterTextSplitter
17
  from langchain_community.vectorstores import Chroma
18
  from langchain_community.document_loaders import PyPDFLoader, TextLoader
 
 
19
 
20
- # Configuration
21
  EMBED_MODEL = "all-MiniLM-L6-v2"
22
  CHUNK_SIZE = 600
23
  CHUNK_OVERLAP = 100
24
  TOP_K = 3
25
  COLLECTION_NAME = "docmind_collection"
26
  CHROMA_DIR = "/tmp/chroma_db"
 
27
 
28
- # HF Router URL
29
- HF_API_URL = "https://router.huggingface.co/v1/chat/completions"
30
-
31
- # Non-reasoning models only
32
  CANDIDATE_MODELS = [
33
- "mistralai/Mistral-7B-Instruct-v0.3:auto",
34
- "meta-llama/Llama-3.2-3B-Instruct:auto",
35
- "meta-llama/Llama-3.1-8B-Instruct:auto",
 
36
  ]
37
 
38
 
@@ -45,13 +48,11 @@ class RAGEngine:
45
  chunk_overlap=CHUNK_OVERLAP,
46
  separators=["\n\n", "\n", ". ", " ", ""],
47
  )
 
48
 
49
  @property
50
  def embeddings(self):
51
  if self._embeddings is None:
52
- # Use sentence-transformers directly - lighter than langchain wrapper
53
- from sentence_transformers import SentenceTransformer
54
- from langchain_community.embeddings import HuggingFaceEmbeddings
55
  self._embeddings = HuggingFaceEmbeddings(
56
  model_name=EMBED_MODEL,
57
  model_kwargs={"device": "cpu"},
@@ -60,11 +61,26 @@ class RAGEngine:
60
  return self._embeddings
61
 
62
  def ingest_file(self, uploaded_file) -> int:
 
63
  suffix = get_suffix(uploaded_file.name)
64
- with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
65
- tmp.write(uploaded_file.read())
66
- tmp_path = tmp.name
67
- return self.ingest_path(tmp_path, uploaded_file.name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  def ingest_path(self, path: str, name: str = "") -> int:
70
  suffix = get_suffix(name or path)
@@ -73,21 +89,18 @@ class RAGEngine:
73
  for doc in raw_docs:
74
  doc.metadata["source"] = name or os.path.basename(path)
75
  chunks = self._splitter.split_documents(raw_docs)
76
-
77
- # Clear old vectorstore to free memory before creating new one
78
  if self._vectorstore is not None:
79
  try:
80
  self._vectorstore._client.reset()
81
  except Exception:
82
  pass
83
  self._vectorstore = None
84
-
85
  self._vectorstore = Chroma.from_documents(
86
- documents=chunks,
87
- embedding=self.embeddings,
88
- collection_name=COLLECTION_NAME,
89
- persist_directory=CHROMA_DIR,
90
- client_settings=Settings(anonymized_telemetry=False),
91
  )
92
  return len(chunks)
93
 
@@ -95,24 +108,46 @@ class RAGEngine:
95
  if self._vectorstore is None:
96
  return "Please upload a document first.", []
97
 
98
- retriever = self._vectorstore.as_retriever(
99
- search_type="mmr",
100
- search_kwargs={"k": TOP_K, "fetch_k": TOP_K * 2},
101
- )
102
- docs = retriever.invoke(question)
103
- context = "\n\n---\n\n".join(
104
- "[Chunk {}]\n{}".format(i + 1, d.page_content) for i, d in enumerate(docs)
105
- )
106
- sources = list({d.metadata.get("source", "Document") for d in docs})
107
- answer = self._generate(question, context)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  return answer, sources
109
 
110
- def _generate(self, question: str, context: str) -> str:
111
  hf_token = os.environ.get("HF_TOKEN", "")
112
  if not hf_token:
113
  return (
114
  "HF_TOKEN not set. Add it as a Secret in Space Settings.\n\n"
115
- "Best matching excerpt:\n\n" + extract_best(question, context)
 
116
  )
117
 
118
  system_prompt = (
@@ -125,7 +160,6 @@ class RAGEngine:
125
  "\n\n---\nQuestion: " + question +
126
  "\nAnswer:"
127
  )
128
-
129
  headers = {
130
  "Authorization": "Bearer " + hf_token,
131
  "Content-Type": "application/json",
@@ -135,8 +169,8 @@ class RAGEngine:
135
  for model_id in CANDIDATE_MODELS:
136
  try:
137
  payload = {
138
- "model": model_id,
139
- "messages": [
140
  {"role": "system", "content": system_prompt},
141
  {"role": "user", "content": user_message},
142
  ],
@@ -154,20 +188,23 @@ class RAGEngine:
154
  raw = resp.json()["choices"][0]["message"]["content"].strip()
155
  answer = strip_thinking(raw)
156
  if answer:
157
- return answer
158
  else:
159
  last_error = "Model {} -> {}: {}".format(
160
  model_id, resp.status_code, resp.text[:200]
161
  )
 
162
  except Exception as e:
163
  last_error = str(e)
 
164
  continue
165
 
166
- return (
167
  "AI unavailable. Most relevant excerpt:\n\n"
168
  + extract_best(question, context)
169
  + "\n\n(Error: " + last_error + ")"
170
  )
 
171
 
172
 
173
  def strip_thinking(text: str) -> str:
@@ -215,4 +252,4 @@ def extract_best(question: str, context: str) -> str:
215
 
216
 
217
  def get_suffix(name: str) -> str:
218
- return os.path.splitext(name)[-1].lower() or ".txt"
 
1
  """
2
  RAG Engine - Memory optimized for HuggingFace free tier
3
+ Embeddings : all-MiniLM-L6-v2 (CPU, ~90MB)
4
  Vector DB : ChromaDB (local)
5
+ LLM : HuggingFace Router API with correct provider suffixes
6
  """
7
 
8
  import os
9
  import re
10
  import json
11
+ import time
12
  import tempfile
13
  import requests
14
  from typing import Tuple, List
 
17
  from langchain.text_splitter import RecursiveCharacterTextSplitter
18
  from langchain_community.vectorstores import Chroma
19
  from langchain_community.document_loaders import PyPDFLoader, TextLoader
20
+ from langchain_community.embeddings import HuggingFaceEmbeddings
21
+ import monitor
22
 
 
23
  EMBED_MODEL = "all-MiniLM-L6-v2"
24
  CHUNK_SIZE = 600
25
  CHUNK_OVERLAP = 100
26
  TOP_K = 3
27
  COLLECTION_NAME = "docmind_collection"
28
  CHROMA_DIR = "/tmp/chroma_db"
29
+ HF_API_URL = "https://router.huggingface.co/v1/chat/completions"
30
 
31
+ # Correct provider suffixes verified from HuggingFace docs (2025)
32
+ # Format: "model-id:provider"
33
+ # cerebras = fast free GPU, hf-inference = HF own CPU servers
 
34
  CANDIDATE_MODELS = [
35
+ "meta-llama/Llama-3.1-8B-Instruct:cerebras", # fast, free, no reasoning leak
36
+ "meta-llama/Llama-3.3-70B-Instruct:cerebras", # larger, still free on cerebras
37
+ "mistralai/Mistral-7B-Instruct-v0.3:fireworks-ai", # fireworks free tier
38
+ "HuggingFaceTB/SmolLM3-3B:hf-inference", # HF's own server, always available
39
  ]
40
 
41
 
 
48
  chunk_overlap=CHUNK_OVERLAP,
49
  separators=["\n\n", "\n", ". ", " ", ""],
50
  )
51
+ monitor.log_startup()
52
 
53
  @property
54
  def embeddings(self):
55
  if self._embeddings is None:
 
 
 
56
  self._embeddings = HuggingFaceEmbeddings(
57
  model_name=EMBED_MODEL,
58
  model_kwargs={"device": "cpu"},
 
61
  return self._embeddings
62
 
63
  def ingest_file(self, uploaded_file) -> int:
64
+ t0 = time.time()
65
  suffix = get_suffix(uploaded_file.name)
66
+ error = ""
67
+ chunks = 0
68
+ try:
69
+ with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
70
+ tmp.write(uploaded_file.read())
71
+ tmp_path = tmp.name
72
+ chunks = self.ingest_path(tmp_path, uploaded_file.name)
73
+ except Exception as e:
74
+ error = str(e)
75
+ raise
76
+ finally:
77
+ monitor.log_ingestion(
78
+ filename = uploaded_file.name,
79
+ chunk_count = chunks,
80
+ latency_ms = (time.time() - t0) * 1000,
81
+ error = error,
82
+ )
83
+ return chunks
84
 
85
  def ingest_path(self, path: str, name: str = "") -> int:
86
  suffix = get_suffix(name or path)
 
89
  for doc in raw_docs:
90
  doc.metadata["source"] = name or os.path.basename(path)
91
  chunks = self._splitter.split_documents(raw_docs)
 
 
92
  if self._vectorstore is not None:
93
  try:
94
  self._vectorstore._client.reset()
95
  except Exception:
96
  pass
97
  self._vectorstore = None
 
98
  self._vectorstore = Chroma.from_documents(
99
+ documents = chunks,
100
+ embedding = self.embeddings,
101
+ collection_name = COLLECTION_NAME,
102
+ persist_directory = CHROMA_DIR,
103
+ client_settings = Settings(anonymized_telemetry=False),
104
  )
105
  return len(chunks)
106
 
 
108
  if self._vectorstore is None:
109
  return "Please upload a document first.", []
110
 
111
+ t0 = time.time()
112
+ error = ""
113
+ answer = ""
114
+ sources = []
115
+ model_used = ""
116
+
117
+ try:
118
+ retriever = self._vectorstore.as_retriever(
119
+ search_type="mmr",
120
+ search_kwargs={"k": TOP_K, "fetch_k": TOP_K * 2},
121
+ )
122
+ docs = retriever.invoke(question)
123
+ context = "\n\n---\n\n".join(
124
+ "[Chunk {}]\n{}".format(i + 1, d.page_content) for i, d in enumerate(docs)
125
+ )
126
+ sources = list({d.metadata.get("source", "Document") for d in docs})
127
+ answer, model_used = self._generate(question, context)
128
+ except Exception as e:
129
+ error = str(e)
130
+ answer = "Error: " + error
131
+ finally:
132
+ monitor.log_query(
133
+ question = question,
134
+ answer = answer,
135
+ sources = sources,
136
+ latency_ms = (time.time() - t0) * 1000,
137
+ model_used = model_used,
138
+ chunk_count = TOP_K,
139
+ error = error,
140
+ )
141
+
142
  return answer, sources
143
 
144
+ def _generate(self, question: str, context: str) -> Tuple[str, str]:
145
  hf_token = os.environ.get("HF_TOKEN", "")
146
  if not hf_token:
147
  return (
148
  "HF_TOKEN not set. Add it as a Secret in Space Settings.\n\n"
149
+ "Best matching excerpt:\n\n" + extract_best(question, context),
150
+ "none"
151
  )
152
 
153
  system_prompt = (
 
160
  "\n\n---\nQuestion: " + question +
161
  "\nAnswer:"
162
  )
 
163
  headers = {
164
  "Authorization": "Bearer " + hf_token,
165
  "Content-Type": "application/json",
 
169
  for model_id in CANDIDATE_MODELS:
170
  try:
171
  payload = {
172
+ "model": model_id,
173
+ "messages": [
174
  {"role": "system", "content": system_prompt},
175
  {"role": "user", "content": user_message},
176
  ],
 
188
  raw = resp.json()["choices"][0]["message"]["content"].strip()
189
  answer = strip_thinking(raw)
190
  if answer:
191
+ return answer, model_id
192
  else:
193
  last_error = "Model {} -> {}: {}".format(
194
  model_id, resp.status_code, resp.text[:200]
195
  )
196
+ print("[DocMind] " + last_error)
197
  except Exception as e:
198
  last_error = str(e)
199
+ print("[DocMind] Exception on {}: {}".format(model_id, last_error))
200
  continue
201
 
202
+ fallback = (
203
  "AI unavailable. Most relevant excerpt:\n\n"
204
  + extract_best(question, context)
205
  + "\n\n(Error: " + last_error + ")"
206
  )
207
+ return fallback, "fallback"
208
 
209
 
210
  def strip_thinking(text: str) -> str:
 
252
 
253
 
254
def get_suffix(name: str) -> str:
    """Return the lowercased file extension of *name*, or ".txt" if none."""
    _, ext = os.path.splitext(name)
    return ext.lower() if ext else ".txt"