Ryanfafa committed on
Commit
4b89f17
·
verified ·
1 Parent(s): 746cdfd

Update rag_engine.py

Browse files
Files changed (1) hide show
  1. rag_engine.py +45 -40
rag_engine.py CHANGED
@@ -2,13 +2,14 @@
2
  RAG Engine
3
  Embeddings : sentence-transformers/all-MiniLM-L6-v2
4
  Vector DB : ChromaDB (local)
5
- LLM : HuggingFace InferenceClient (official library, auto-routing)
6
- Chunking : Recursive character splitter with overlap
7
  """
8
 
9
  import os
10
  import re
 
11
  import tempfile
 
12
  from typing import Tuple, List
13
 
14
  from chromadb.config import Settings
@@ -25,18 +26,18 @@ TOP_K = 4
25
  COLLECTION_NAME = "docmind_collection"
26
  CHROMA_DIR = "./chroma_db"
27
 
28
- # Models to try in order (all free with HF token)
 
 
 
29
  CANDIDATE_MODELS = [
30
  "mistralai/Mistral-7B-Instruct-v0.3",
31
  "microsoft/Phi-3.5-mini-instruct",
32
  "HuggingFaceH4/zephyr-7b-beta",
33
- "google/gemma-2-2b-it",
34
  ]
35
 
36
 
37
  class RAGEngine:
38
- """Full RAG pipeline: ingest, embed, store, retrieve, generate."""
39
-
40
  def __init__(self):
41
  self._embeddings = None
42
  self._vectorstore = None
@@ -67,12 +68,9 @@ class RAGEngine:
67
  suffix = get_suffix(name or path)
68
  loader = PyPDFLoader(path) if suffix == ".pdf" else TextLoader(path, encoding="utf-8")
69
  raw_docs = loader.load()
70
-
71
  for doc in raw_docs:
72
  doc.metadata["source"] = name or os.path.basename(path)
73
-
74
  chunks = self._splitter.split_documents(raw_docs)
75
-
76
  self._vectorstore = Chroma.from_documents(
77
  documents=chunks,
78
  embedding=self.embeddings,
@@ -85,7 +83,6 @@ class RAGEngine:
85
  def query(self, question: str) -> Tuple[str, List[str]]:
86
  if self._vectorstore is None:
87
  return "Please upload a document first.", []
88
-
89
  retriever = self._vectorstore.as_retriever(
90
  search_type="mmr",
91
  search_kwargs={"k": TOP_K, "fetch_k": TOP_K * 3},
@@ -100,20 +97,13 @@ class RAGEngine:
100
 
101
  def _generate(self, question: str, context: str) -> str:
102
  hf_token = os.environ.get("HF_TOKEN", "")
103
-
104
  if not hf_token:
105
- excerpt = extract_best(question, context)
106
  return (
107
- "HF_TOKEN not set. Add it as a Secret in your Space Settings.\n\n"
108
- "Best matching excerpt:\n\n" + excerpt
 
109
  )
110
 
111
- # Use the official huggingface_hub InferenceClient
112
- try:
113
- from huggingface_hub import InferenceClient
114
- except ImportError:
115
- return "huggingface_hub not installed. Check requirements.txt."
116
-
117
  system_prompt = (
118
  "You are DocMind, an expert document analyst. "
119
  "Answer using ONLY the provided document context. "
@@ -122,35 +112,50 @@ class RAGEngine:
122
  )
123
  user_message = "Document context:\n" + context + "\n\nQuestion: " + question
124
 
 
 
 
 
 
125
  last_error = ""
126
  for model_id in CANDIDATE_MODELS:
 
 
 
 
 
 
 
 
 
 
 
 
127
  try:
128
- client = InferenceClient(
129
- model=model_id,
130
- token=hf_token,
 
131
  timeout=60,
 
132
  )
133
- result = client.chat_completion(
134
- messages=[
135
- {"role": "system", "content": system_prompt},
136
- {"role": "user", "content": user_message},
137
- ],
138
- max_tokens=512,
139
- temperature=0.2,
140
- )
141
- answer = result.choices[0].message.content.strip()
142
- if answer:
143
- return answer
144
  except Exception as e:
145
  last_error = str(e)
146
  continue
147
 
148
- # All models failed — use extractive fallback
149
- excerpt = extract_best(question, context)
150
  return (
151
- "AI answer unavailable. Here is the most relevant part of your document:\n\n"
152
- + excerpt
153
- + "\n\n(Error: " + last_error + ")"
154
  )
155
 
156
 
@@ -170,4 +175,4 @@ def extract_best(question: str, context: str) -> str:
170
 
171
 
172
  def get_suffix(name: str) -> str:
173
- return os.path.splitext(name)[-1].lower() or ".txt"
 
2
  RAG Engine
3
  Embeddings : sentence-transformers/all-MiniLM-L6-v2
4
  Vector DB : ChromaDB (local)
5
+ LLM : HuggingFace Router API (direct requests, correct URL)
 
6
  """
7
 
8
  import os
9
  import re
10
+ import json
11
  import tempfile
12
+ import requests
13
  from typing import Tuple, List
14
 
15
  from chromadb.config import Settings
 
26
  COLLECTION_NAME = "docmind_collection"
27
  CHROMA_DIR = "./chroma_db"
28
 
29
+ # Correct router base (NOT api-inference.huggingface.co)
30
+ HF_ROUTER_BASE = "https://router.huggingface.co/hf-inference/models"
31
+
32
+ # Models to try in order
33
  CANDIDATE_MODELS = [
34
  "mistralai/Mistral-7B-Instruct-v0.3",
35
  "microsoft/Phi-3.5-mini-instruct",
36
  "HuggingFaceH4/zephyr-7b-beta",
 
37
  ]
38
 
39
 
40
  class RAGEngine:
 
 
41
  def __init__(self):
42
  self._embeddings = None
43
  self._vectorstore = None
 
68
  suffix = get_suffix(name or path)
69
  loader = PyPDFLoader(path) if suffix == ".pdf" else TextLoader(path, encoding="utf-8")
70
  raw_docs = loader.load()
 
71
  for doc in raw_docs:
72
  doc.metadata["source"] = name or os.path.basename(path)
 
73
  chunks = self._splitter.split_documents(raw_docs)
 
74
  self._vectorstore = Chroma.from_documents(
75
  documents=chunks,
76
  embedding=self.embeddings,
 
83
  def query(self, question: str) -> Tuple[str, List[str]]:
84
  if self._vectorstore is None:
85
  return "Please upload a document first.", []
 
86
  retriever = self._vectorstore.as_retriever(
87
  search_type="mmr",
88
  search_kwargs={"k": TOP_K, "fetch_k": TOP_K * 3},
 
97
 
98
  def _generate(self, question: str, context: str) -> str:
99
  hf_token = os.environ.get("HF_TOKEN", "")
 
100
  if not hf_token:
 
101
  return (
102
+ "HF_TOKEN not set.\n"
103
+ "Go to Space Settings -> Secrets -> add HF_TOKEN with your token from huggingface.co/settings/tokens\n\n"
104
+ "Best matching excerpt:\n\n" + extract_best(question, context)
105
  )
106
 
 
 
 
 
 
 
107
  system_prompt = (
108
  "You are DocMind, an expert document analyst. "
109
  "Answer using ONLY the provided document context. "
 
112
  )
113
  user_message = "Document context:\n" + context + "\n\nQuestion: " + question
114
 
115
+ headers = {
116
+ "Authorization": "Bearer " + hf_token,
117
+ "Content-Type": "application/json",
118
+ }
119
+
120
  last_error = ""
121
  for model_id in CANDIDATE_MODELS:
122
+ # Build URL directly - no library, no redirects
123
+ url = "{}/{}/v1/chat/completions".format(HF_ROUTER_BASE, model_id)
124
+ payload = {
125
+ "model": model_id,
126
+ "messages": [
127
+ {"role": "system", "content": system_prompt},
128
+ {"role": "user", "content": user_message},
129
+ ],
130
+ "max_tokens": 512,
131
+ "temperature": 0.2,
132
+ "stream": False,
133
+ }
134
  try:
135
+ resp = requests.post(
136
+ url,
137
+ headers=headers,
138
+ data=json.dumps(payload),
139
  timeout=60,
140
+ allow_redirects=False, # prevent redirect to old endpoint
141
  )
142
+ if resp.status_code == 200:
143
+ data = resp.json()
144
+ answer = data["choices"][0]["message"]["content"].strip()
145
+ if answer:
146
+ return answer
147
+ else:
148
+ last_error = "Model {} returned {}: {}".format(
149
+ model_id, resp.status_code, resp.text[:300]
150
+ )
 
 
151
  except Exception as e:
152
  last_error = str(e)
153
  continue
154
 
 
 
155
  return (
156
+ "AI answer unavailable. Most relevant excerpt:\n\n"
157
+ + extract_best(question, context)
158
+ + "\n\n(Last error: " + last_error + ")"
159
  )
160
 
161
 
 
175
 
176
 
177
  def get_suffix(name: str) -> str:
178
+ return os.path.splitext(name)[-1].lower() or ".txt"