Spaces:
Running
Running
Update rag_engine.py
Browse files- rag_engine.py +31 -43
rag_engine.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
"""
|
| 2 |
-
RAG Engine
|
| 3 |
-
Embeddings :
|
| 4 |
Vector DB : ChromaDB (local)
|
| 5 |
-
LLM : HuggingFace Router API
|
| 6 |
"""
|
| 7 |
|
| 8 |
import os
|
|
@@ -14,25 +14,23 @@ from typing import Tuple, List
|
|
| 14 |
|
| 15 |
from chromadb.config import Settings
|
| 16 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 17 |
-
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 18 |
from langchain_community.vectorstores import Chroma
|
| 19 |
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
| 20 |
|
| 21 |
# Configuration
|
| 22 |
-
EMBED_MODEL = "
|
| 23 |
-
CHUNK_SIZE =
|
| 24 |
-
CHUNK_OVERLAP =
|
| 25 |
-
TOP_K =
|
| 26 |
COLLECTION_NAME = "docmind_collection"
|
| 27 |
-
CHROMA_DIR = "
|
| 28 |
|
| 29 |
-
#
|
| 30 |
HF_API_URL = "https://router.huggingface.co/v1/chat/completions"
|
| 31 |
|
| 32 |
-
# Non-reasoning models only
|
| 33 |
CANDIDATE_MODELS = [
|
| 34 |
"mistralai/Mistral-7B-Instruct-v0.3:auto",
|
| 35 |
-
"microsoft/Phi-3.5-mini-instruct:auto",
|
| 36 |
"meta-llama/Llama-3.2-3B-Instruct:auto",
|
| 37 |
"meta-llama/Llama-3.1-8B-Instruct:auto",
|
| 38 |
]
|
|
@@ -51,6 +49,9 @@ class RAGEngine:
|
|
| 51 |
@property
|
| 52 |
def embeddings(self):
|
| 53 |
if self._embeddings is None:
|
|
|
|
|
|
|
|
|
|
| 54 |
self._embeddings = HuggingFaceEmbeddings(
|
| 55 |
model_name=EMBED_MODEL,
|
| 56 |
model_kwargs={"device": "cpu"},
|
|
@@ -72,6 +73,15 @@ class RAGEngine:
|
|
| 72 |
for doc in raw_docs:
|
| 73 |
doc.metadata["source"] = name or os.path.basename(path)
|
| 74 |
chunks = self._splitter.split_documents(raw_docs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
self._vectorstore = Chroma.from_documents(
|
| 76 |
documents=chunks,
|
| 77 |
embedding=self.embeddings,
|
|
@@ -84,9 +94,10 @@ class RAGEngine:
|
|
| 84 |
def query(self, question: str) -> Tuple[str, List[str]]:
|
| 85 |
if self._vectorstore is None:
|
| 86 |
return "Please upload a document first.", []
|
|
|
|
| 87 |
retriever = self._vectorstore.as_retriever(
|
| 88 |
search_type="mmr",
|
| 89 |
-
search_kwargs={"k": TOP_K, "fetch_k": TOP_K *
|
| 90 |
)
|
| 91 |
docs = retriever.invoke(question)
|
| 92 |
context = "\n\n---\n\n".join(
|
|
@@ -106,13 +117,9 @@ class RAGEngine:
|
|
| 106 |
|
| 107 |
system_prompt = (
|
| 108 |
"You are DocMind, a document Q&A assistant. "
|
| 109 |
-
"Answer the question using only the document context
|
| 110 |
-
"
|
| 111 |
-
"No preamble. No reasoning. No 'the user is asking'. "
|
| 112 |
-
"No 'let me', 'first', 'okay', or 'I need to'. "
|
| 113 |
-
"Just answer."
|
| 114 |
)
|
| 115 |
-
|
| 116 |
user_message = (
|
| 117 |
"Context:\n" + context +
|
| 118 |
"\n\n---\nQuestion: " + question +
|
|
@@ -157,57 +164,38 @@ class RAGEngine:
|
|
| 157 |
continue
|
| 158 |
|
| 159 |
return (
|
| 160 |
-
"AI
|
| 161 |
+ extract_best(question, context)
|
| 162 |
-
+ "\n\n(
|
| 163 |
)
|
| 164 |
|
| 165 |
|
| 166 |
def strip_thinking(text: str) -> str:
|
| 167 |
-
"""
|
| 168 |
-
Hard-strip any chain-of-thought reasoning that leaks into the response.
|
| 169 |
-
Keeps only the content that appears after all reasoning paragraphs end.
|
| 170 |
-
"""
|
| 171 |
-
|
| 172 |
-
# Pattern 1: Remove <think>...</think> blocks (some models use this tag)
|
| 173 |
text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()
|
| 174 |
-
|
| 175 |
-
# Pattern 2: If text starts with reasoning phrases, find where real answer begins
|
| 176 |
reasoning_starters = [
|
| 177 |
"okay", "ok,", "alright", "let me", "let's", "i need", "i will",
|
| 178 |
"i'll", "first,", "so,", "the user", "looking at", "going through",
|
| 179 |
"based on the chunk", "parsing", "to answer", "in order to",
|
| 180 |
-
"i should", "i must", "my task", "the question",
|
| 181 |
]
|
| 182 |
-
|
| 183 |
lines = text.split("\n")
|
| 184 |
clean = []
|
| 185 |
found_real = False
|
| 186 |
-
|
| 187 |
for line in lines:
|
| 188 |
-
|
| 189 |
-
lower = stripped.lower()
|
| 190 |
is_thinking = any(lower.startswith(p) for p in reasoning_starters)
|
| 191 |
-
|
| 192 |
if not found_real:
|
| 193 |
-
if
|
| 194 |
found_real = True
|
| 195 |
clean.append(line)
|
| 196 |
else:
|
| 197 |
clean.append(line)
|
| 198 |
-
|
| 199 |
result = "\n".join(clean).strip()
|
| 200 |
-
|
| 201 |
-
# Pattern 3: Last resort — if response has many paragraphs of reasoning
|
| 202 |
-
# take only the last paragraph as the final answer
|
| 203 |
if not result or len(result) > 1500:
|
| 204 |
paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
|
| 205 |
if paragraphs:
|
| 206 |
last = paragraphs[-1]
|
| 207 |
-
# Only use last paragraph if it looks like an answer (short enough)
|
| 208 |
if len(last) < 800:
|
| 209 |
return last
|
| 210 |
-
|
| 211 |
return result if result else text
|
| 212 |
|
| 213 |
|
|
@@ -227,4 +215,4 @@ def extract_best(question: str, context: str) -> str:
|
|
| 227 |
|
| 228 |
|
| 229 |
def get_suffix(name: str) -> str:
|
| 230 |
-
return os.path.splitext(name)[-1].lower() or ".txt"
|
|
|
|
| 1 |
"""
|
| 2 |
+
RAG Engine - Memory optimized for HuggingFace free tier
|
| 3 |
+
Embeddings : all-MiniLM-L6-v2 via sentence-transformers (CPU, ~90MB)
|
| 4 |
Vector DB : ChromaDB (local)
|
| 5 |
+
LLM : HuggingFace Router API (no local model loaded)
|
| 6 |
"""
|
| 7 |
|
| 8 |
import os
|
|
|
|
| 14 |
|
| 15 |
from chromadb.config import Settings
|
| 16 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
| 17 |
from langchain_community.vectorstores import Chroma
|
| 18 |
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
| 19 |
|
| 20 |
# Configuration
# Small sentence-transformers model (~90MB) — fits CPU-only free-tier hosts.
EMBED_MODEL = "all-MiniLM-L6-v2"
CHUNK_SIZE = 600        # characters per chunk handed to the text splitter
CHUNK_OVERLAP = 100     # characters shared between consecutive chunks
TOP_K = 3               # number of chunks retrieved per query
COLLECTION_NAME = "docmind_collection"
# Local ChromaDB persistence dir; /tmp is ephemeral on hosted Spaces.
CHROMA_DIR = "/tmp/chroma_db"

# HF Router URL
HF_API_URL = "https://router.huggingface.co/v1/chat/completions"

# Non-reasoning models only — tried in order until one responds.
CANDIDATE_MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.3:auto",
    "meta-llama/Llama-3.2-3B-Instruct:auto",
    "meta-llama/Llama-3.1-8B-Instruct:auto",
]
|
|
|
|
| 49 |
@property
|
| 50 |
def embeddings(self):
|
| 51 |
if self._embeddings is None:
|
| 52 |
+
# Use sentence-transformers directly - lighter than langchain wrapper
|
| 53 |
+
from sentence_transformers import SentenceTransformer
|
| 54 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 55 |
self._embeddings = HuggingFaceEmbeddings(
|
| 56 |
model_name=EMBED_MODEL,
|
| 57 |
model_kwargs={"device": "cpu"},
|
|
|
|
| 73 |
for doc in raw_docs:
|
| 74 |
doc.metadata["source"] = name or os.path.basename(path)
|
| 75 |
chunks = self._splitter.split_documents(raw_docs)
|
| 76 |
+
|
| 77 |
+
# Clear old vectorstore to free memory before creating new one
|
| 78 |
+
if self._vectorstore is not None:
|
| 79 |
+
try:
|
| 80 |
+
self._vectorstore._client.reset()
|
| 81 |
+
except Exception:
|
| 82 |
+
pass
|
| 83 |
+
self._vectorstore = None
|
| 84 |
+
|
| 85 |
self._vectorstore = Chroma.from_documents(
|
| 86 |
documents=chunks,
|
| 87 |
embedding=self.embeddings,
|
|
|
|
| 94 |
def query(self, question: str) -> Tuple[str, List[str]]:
|
| 95 |
if self._vectorstore is None:
|
| 96 |
return "Please upload a document first.", []
|
| 97 |
+
|
| 98 |
retriever = self._vectorstore.as_retriever(
|
| 99 |
search_type="mmr",
|
| 100 |
+
search_kwargs={"k": TOP_K, "fetch_k": TOP_K * 2},
|
| 101 |
)
|
| 102 |
docs = retriever.invoke(question)
|
| 103 |
context = "\n\n---\n\n".join(
|
|
|
|
| 117 |
|
| 118 |
system_prompt = (
|
| 119 |
"You are DocMind, a document Q&A assistant. "
|
| 120 |
+
"Answer the question using only the document context. "
|
| 121 |
+
"Be short and direct. No preamble. No reasoning. Just answer."
|
|
|
|
|
|
|
|
|
|
| 122 |
)
|
|
|
|
| 123 |
user_message = (
|
| 124 |
"Context:\n" + context +
|
| 125 |
"\n\n---\nQuestion: " + question +
|
|
|
|
| 164 |
continue
|
| 165 |
|
| 166 |
return (
|
| 167 |
+
"AI unavailable. Most relevant excerpt:\n\n"
|
| 168 |
+ extract_best(question, context)
|
| 169 |
+
+ "\n\n(Error: " + last_error + ")"
|
| 170 |
)
|
| 171 |
|
| 172 |
|
| 173 |
def strip_thinking(text: str) -> str:
    """
    Strip chain-of-thought reasoning that leaks into a model response.

    Three passes:
      1. Remove explicit ``<think>...</think>`` blocks (tags used by some
         models).
      2. Drop leading lines that start with known reasoning phrases
         ("okay", "let me", ...) until the first real answer line appears;
         everything after that line is kept verbatim.
      3. Last resort: if nothing survives, or the surviving text is
         implausibly long (> 1500 chars), fall back to the final paragraph
         of the original text — but only when it is short enough
         (< 800 chars) to look like a direct answer.

    Returns the cleaned text, or the original text unchanged if cleaning
    removed everything.
    """
    # Pass 1: explicit reasoning tags.
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()

    # Pass 2: leading reasoning lines. A tuple lets str.startswith test
    # every prefix in one C-level call instead of a per-prefix generator.
    reasoning_starters = (
        "okay", "ok,", "alright", "let me", "let's", "i need", "i will",
        "i'll", "first,", "so,", "the user", "looking at", "going through",
        "based on the chunk", "parsing", "to answer", "in order to",
    )

    clean = []
    found_real = False
    for line in text.split("\n"):
        if found_real:
            clean.append(line)
            continue
        stripped = line.strip()
        # First non-blank line that is not a reasoning phrase marks the
        # start of the real answer.
        if stripped and not stripped.lower().startswith(reasoning_starters):
            found_real = True
            clean.append(line)

    result = "\n".join(clean).strip()

    # Pass 3: everything was classified as reasoning, or the "answer" is
    # suspiciously long — take the last paragraph of the original text.
    if not result or len(result) > 1500:
        paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
        if paragraphs and len(paragraphs[-1]) < 800:
            return paragraphs[-1]

    return result if result else text
|
| 200 |
|
| 201 |
|
|
|
|
| 215 |
|
| 216 |
|
| 217 |
def get_suffix(name: str) -> str:
    """Return the lowercase file extension of *name*, or ".txt" if none."""
    extension = os.path.splitext(name)[-1]
    if not extension:
        return ".txt"
    return extension.lower()
|