Update rag_engine.py
rag_engine.py  +80 -43  (CHANGED)
Old version (removed lines are prefixed with -):

@@ -1,13 +1,14 @@
 """
 RAG Engine - Memory optimized for HuggingFace free tier
-Embeddings : all-MiniLM-L6-v2
 Vector DB : ChromaDB (local)
-LLM : HuggingFace Router API
 """

 import os
 import re
 import json
 import tempfile
 import requests
 from typing import Tuple, List
@@ -16,23 +17,25 @@ from chromadb.config import Settings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
 from langchain_community.document_loaders import PyPDFLoader, TextLoader

-# Configuration
 EMBED_MODEL = "all-MiniLM-L6-v2"
 CHUNK_SIZE = 600
 CHUNK_OVERLAP = 100
 TOP_K = 3
 COLLECTION_NAME = "docmind_collection"
 CHROMA_DIR = "/tmp/chroma_db"

-#
-
-
-# Non-reasoning models only
 CANDIDATE_MODELS = [
-    "
-    "meta-llama/Llama-3.
-    "
 ]
@@ -45,13 +48,11 @@ class RAGEngine:
             chunk_overlap=CHUNK_OVERLAP,
             separators=["\n\n", "\n", ". ", " ", ""],
         )

     @property
     def embeddings(self):
         if self._embeddings is None:
-            # Use sentence-transformers directly - lighter than langchain wrapper
-            from sentence_transformers import SentenceTransformer
-            from langchain_community.embeddings import HuggingFaceEmbeddings
             self._embeddings = HuggingFaceEmbeddings(
                 model_name=EMBED_MODEL,
                 model_kwargs={"device": "cpu"},
@@ -60,11 +61,26 @@ class RAGEngine:
         return self._embeddings

     def ingest_file(self, uploaded_file) -> int:
         suffix = get_suffix(uploaded_file.name)
-
-
-
-

     def ingest_path(self, path: str, name: str = "") -> int:
         suffix = get_suffix(name or path)
@@ -73,21 +89,18 @@
         for doc in raw_docs:
             doc.metadata["source"] = name or os.path.basename(path)
         chunks = self._splitter.split_documents(raw_docs)
-
-        # Clear old vectorstore to free memory before creating new one
         if self._vectorstore is not None:
             try:
                 self._vectorstore._client.reset()
             except Exception:
                 pass
             self._vectorstore = None
-
         self._vectorstore = Chroma.from_documents(
-            documents=chunks,
-            embedding=self.embeddings,
-            collection_name=COLLECTION_NAME,
-            persist_directory=CHROMA_DIR,
-            client_settings=Settings(anonymized_telemetry=False),
         )
         return len(chunks)
@@ -95,24 +108,46 @@ class RAGEngine:
         if self._vectorstore is None:
             return "Please upload a document first.", []

-
-
-
-
-
-
-
-
-
-
         return answer, sources

-    def _generate(self, question: str, context: str) -> str:
         hf_token = os.environ.get("HF_TOKEN", "")
         if not hf_token:
             return (
                 "HF_TOKEN not set. Add it as a Secret in Space Settings.\n\n"
-                "Best matching excerpt:\n\n" + extract_best(question, context)
             )

         system_prompt = (
@@ -125,7 +160,6 @@ class RAGEngine:
             "\n\n---\nQuestion: " + question +
             "\nAnswer:"
         )
-
         headers = {
             "Authorization": "Bearer " + hf_token,
             "Content-Type": "application/json",
@@ -135,8 +169,8 @@ class RAGEngine:
         for model_id in CANDIDATE_MODELS:
             try:
                 payload = {
-                    "model":
-                    "messages":
                         {"role": "system", "content": system_prompt},
                         {"role": "user", "content": user_message},
                     ],
@@ -154,20 +188,23 @@ class RAGEngine:
                     raw = resp.json()["choices"][0]["message"]["content"].strip()
                     answer = strip_thinking(raw)
                     if answer:
-                        return answer
                 else:
                     last_error = "Model {} -> {}: {}".format(
                         model_id, resp.status_code, resp.text[:200]
                     )
             except Exception as e:
                 last_error = str(e)
                 continue

-
             "AI unavailable. Most relevant excerpt:\n\n"
             + extract_best(question, context)
             + "\n\n(Error: " + last_error + ")"
         )


 def strip_thinking(text: str) -> str:
@@ -215,4 +252,4 @@ def extract_best(question: str, context: str) -> str:


 def get_suffix(name: str) -> str:
-    return os.path.splitext(name)[-1].lower() or ".txt"

New version (added lines are prefixed with +):

@@ -1,13 +1,14 @@
 """
 RAG Engine - Memory optimized for HuggingFace free tier
+Embeddings : all-MiniLM-L6-v2 (CPU, ~90MB)
 Vector DB : ChromaDB (local)
+LLM : HuggingFace Router API with correct provider suffixes
 """

 import os
 import re
 import json
+import time
 import tempfile
 import requests
 from typing import Tuple, List
@@ -16,23 +17,25 @@ from chromadb.config import Settings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
 from langchain_community.document_loaders import PyPDFLoader, TextLoader
+from langchain_community.embeddings import HuggingFaceEmbeddings
+import monitor

 EMBED_MODEL = "all-MiniLM-L6-v2"
 CHUNK_SIZE = 600
 CHUNK_OVERLAP = 100
 TOP_K = 3
 COLLECTION_NAME = "docmind_collection"
 CHROMA_DIR = "/tmp/chroma_db"
+HF_API_URL = "https://router.huggingface.co/v1/chat/completions"

+# Correct provider suffixes verified from HuggingFace docs (2025)
+# Format: "model-id:provider"
+# cerebras = fast free GPU, hf-inference = HF own CPU servers
 CANDIDATE_MODELS = [
+    "meta-llama/Llama-3.1-8B-Instruct:cerebras",        # fast, free, no reasoning leak
+    "meta-llama/Llama-3.3-70B-Instruct:cerebras",       # larger, still free on cerebras
+    "mistralai/Mistral-7B-Instruct-v0.3:fireworks-ai",  # fireworks free tier
+    "HuggingFaceTB/SmolLM3-3B:hf-inference",            # HF's own server, always available
 ]
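The "model-id:provider" addressing above can be sanity-checked outside the Space with a plain chat-completions request to the same router URL. A minimal sketch follows; only the URL, the Authorization header, the response parsing, and the model/messages fields come from this file, while the prompt, max_tokens, and timeout values are illustrative:

import os
import requests

HF_API_URL = "https://router.huggingface.co/v1/chat/completions"

payload = {
    "model": "meta-llama/Llama-3.1-8B-Instruct:cerebras",  # "model-id:provider" format
    "messages": [{"role": "user", "content": "Say hello in one word."}],
    "max_tokens": 32,  # illustrative value, not taken from the diff
}
resp = requests.post(
    HF_API_URL,
    headers={
        "Authorization": "Bearer " + os.environ["HF_TOKEN"],  # same secret the Space uses
        "Content-Type": "application/json",
    },
    json=payload,
    timeout=60,
)
print(resp.status_code)
print(resp.json()["choices"][0]["message"]["content"])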
@@ -45,13 +48,11 @@ class RAGEngine:
             chunk_overlap=CHUNK_OVERLAP,
             separators=["\n\n", "\n", ". ", " ", ""],
         )
+        monitor.log_startup()

     @property
     def embeddings(self):
         if self._embeddings is None:
             self._embeddings = HuggingFaceEmbeddings(
                 model_name=EMBED_MODEL,
                 model_kwargs={"device": "cpu"},
@@ -60,11 +61,26 @@ class RAGEngine:
         return self._embeddings

     def ingest_file(self, uploaded_file) -> int:
+        t0 = time.time()
         suffix = get_suffix(uploaded_file.name)
+        error = ""
+        chunks = 0
+        try:
+            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+                tmp.write(uploaded_file.read())
+                tmp_path = tmp.name
+            chunks = self.ingest_path(tmp_path, uploaded_file.name)
+        except Exception as e:
+            error = str(e)
+            raise
+        finally:
+            monitor.log_ingestion(
+                filename = uploaded_file.name,
+                chunk_count = chunks,
+                latency_ms = (time.time() - t0) * 1000,
+                error = error,
+            )
+        return chunks

     def ingest_path(self, path: str, name: str = "") -> int:
         suffix = get_suffix(name or path)
@@ -73,21 +89,18 @@
         for doc in raw_docs:
             doc.metadata["source"] = name or os.path.basename(path)
         chunks = self._splitter.split_documents(raw_docs)
         if self._vectorstore is not None:
             try:
                 self._vectorstore._client.reset()
             except Exception:
                 pass
             self._vectorstore = None
         self._vectorstore = Chroma.from_documents(
+            documents = chunks,
+            embedding = self.embeddings,
+            collection_name = COLLECTION_NAME,
+            persist_directory = CHROMA_DIR,
+            client_settings = Settings(anonymized_telemetry=False),
         )
         return len(chunks)
@@ -95,24 +108,46 @@ class RAGEngine:
         if self._vectorstore is None:
             return "Please upload a document first.", []

+        t0 = time.time()
+        error = ""
+        answer = ""
+        sources = []
+        model_used = ""
+
+        try:
+            retriever = self._vectorstore.as_retriever(
+                search_type="mmr",
+                search_kwargs={"k": TOP_K, "fetch_k": TOP_K * 2},
+            )
+            docs = retriever.invoke(question)
+            context = "\n\n---\n\n".join(
+                "[Chunk {}]\n{}".format(i + 1, d.page_content) for i, d in enumerate(docs)
+            )
+            sources = list({d.metadata.get("source", "Document") for d in docs})
+            answer, model_used = self._generate(question, context)
+        except Exception as e:
+            error = str(e)
+            answer = "Error: " + error
+        finally:
+            monitor.log_query(
+                question = question,
+                answer = answer,
+                sources = sources,
+                latency_ms = (time.time() - t0) * 1000,
+                model_used = model_used,
+                chunk_count = TOP_K,
+                error = error,
+            )
+
         return answer, sources

+    def _generate(self, question: str, context: str) -> Tuple[str, str]:
         hf_token = os.environ.get("HF_TOKEN", "")
         if not hf_token:
             return (
                 "HF_TOKEN not set. Add it as a Secret in Space Settings.\n\n"
+                "Best matching excerpt:\n\n" + extract_best(question, context),
+                "none"
             )

         system_prompt = (
@@ -125,7 +160,6 @@ class RAGEngine:
             "\n\n---\nQuestion: " + question +
             "\nAnswer:"
         )
         headers = {
             "Authorization": "Bearer " + hf_token,
             "Content-Type": "application/json",
@@ -135,8 +169,8 @@ class RAGEngine:
         for model_id in CANDIDATE_MODELS:
             try:
                 payload = {
+                    "model": model_id,
+                    "messages": [
                         {"role": "system", "content": system_prompt},
                         {"role": "user", "content": user_message},
                     ],
@@ -154,20 +188,23 @@ class RAGEngine:
                     raw = resp.json()["choices"][0]["message"]["content"].strip()
                     answer = strip_thinking(raw)
                     if answer:
+                        return answer, model_id
                 else:
                     last_error = "Model {} -> {}: {}".format(
                         model_id, resp.status_code, resp.text[:200]
                     )
+                    print("[DocMind] " + last_error)
             except Exception as e:
                 last_error = str(e)
+                print("[DocMind] Exception on {}: {}".format(model_id, last_error))
                 continue

+        fallback = (
             "AI unavailable. Most relevant excerpt:\n\n"
             + extract_best(question, context)
             + "\n\n(Error: " + last_error + ")"
         )
+        return fallback, "fallback"


 def strip_thinking(text: str) -> str:
@@ -215,4 +252,4 @@ def extract_best(question: str, context: str) -> str:


 def get_suffix(name: str) -> str:
+    return os.path.splitext(name)[-1].lower() or ".txt"
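strip_thinking() is applied to the raw model output before it is returned, and the CANDIDATE_MODELS comment ("no reasoning leak") suggests it removes chain-of-thought markup. Its body sits outside the hunks shown in this commit, so the following is only a hypothetical sketch of a typical implementation, not the Space's actual code:

import re

def strip_thinking(text: str) -> str:
    # Hypothetical: drop <think>...</think> blocks some reasoning models emit,
    # then return whatever visible answer remains.
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL | re.IGNORECASE)
    return cleaned.strip()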
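The new version also imports a monitor module and calls monitor.log_startup(), monitor.log_ingestion(...), and monitor.log_query(...); that module is not part of this diff. A minimal sketch that satisfies those call sites is shown below, assuming JSON-lines logging under /tmp (the file location, field layout, and truncation are assumptions, not taken from the Space):

# monitor.py - hypothetical sketch matching the call sites in rag_engine.py above
import json
import time

LOG_PATH = "/tmp/docmind_metrics.jsonl"  # assumed location, not from the diff

def _write(event: str, **fields) -> None:
    # Append one JSON record per event so the Space can tail or download the log.
    record = {"event": event, "ts": time.time(), **fields}
    with open(LOG_PATH, "a") as fh:
        fh.write(json.dumps(record, default=str) + "\n")

def log_startup() -> None:
    _write("startup")

def log_ingestion(filename, chunk_count, latency_ms, error="") -> None:
    _write("ingestion", filename=filename, chunk_count=chunk_count,
           latency_ms=round(latency_ms, 1), error=error)

def log_query(question, answer, sources, latency_ms, model_used, chunk_count, error="") -> None:
    _write("query", question=question, answer=answer[:500],  # truncate long answers (sketch-only choice)
           sources=sources, latency_ms=round(latency_ms, 1),
           model_used=model_used, chunk_count=chunk_count, error=error)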