Update rag_engine.py

rag_engine.py  CHANGED  (+310 -129)
--- rag_engine.py (old; "…" marks text lost in the page capture)

@@ -1,58 +1,79 @@
  """
- …
- …
- …
- LLM : HuggingFace Router API with correct provider suffixes
  """

  import os
  import re
  import json
  import time
  import tempfile
  import requests
- …

  from chromadb.config import Settings
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain_community.vectorstores import Chroma
- from langchain_community.document_loaders import PyPDFLoader, TextLoader
  from langchain_community.embeddings import HuggingFaceEmbeddings
  import monitor

  EMBED_MODEL = "all-MiniLM-L6-v2"
  CHUNK_SIZE = 600
  CHUNK_OVERLAP = 100
- TOP_K = …
- COLLECTION_NAME = "…
  CHROMA_DIR = "/tmp/chroma_db"
  HF_API_URL = "https://router.huggingface.co/v1/chat/completions"

- # Correct provider suffixes verified from HuggingFace docs (2025)
- # Format: "model-id:provider"
- # cerebras = fast free GPU, hf-inference = HF own CPU servers
  CANDIDATE_MODELS = [
-     "meta-llama/Llama-3.1-8B-Instruct:cerebras",
-     "meta-llama/Llama-3.3-70B-Instruct:cerebras",
-     "mistralai/Mistral-7B-Instruct-v0.3:fireworks-ai",
-     "HuggingFaceTB/SmolLM3-3B:hf-inference",
  ]


  class RAGEngine:
      def __init__(self):
-         self._embeddings = None
-         self._vectorstore = None
-         self._splitter …
              chunk_size=CHUNK_SIZE,
              chunk_overlap=CHUNK_OVERLAP,
              separators=["\n\n", "\n", ". ", " ", ""],
          )
          monitor.log_startup()

      @property
      def embeddings(self):
          if self._embeddings is None:
              self._embeddings = HuggingFaceEmbeddings(
                  model_name=EMBED_MODEL,
                  model_kwargs={"device": "cpu"},

@@ -60,35 +81,198 @@ class RAGEngine:
              )
          return self._embeddings

      def ingest_file(self, uploaded_file) -> int:
- …
- …
- …
- …
          try:
- …
- …
- …
- …
          except Exception as e:
              error = str(e)
              raise
          finally:
-             monitor.log_ingestion(
-                 filename = uploaded_file.name,
-                 chunk_count = chunks,
-                 latency_ms = (time.time() - t0) * 1000,
-                 error = error,
-             )
          return chunks

      def ingest_path(self, path: str, name: str = "") -> int:
- …
- …
- …
- …
- …
-         chunks = self.…
          if self._vectorstore is not None:
              try:
                  self._vectorstore._client.reset()

@@ -96,160 +280,157 @@ class RAGEngine:
                  pass
              self._vectorstore = None
          self._vectorstore = Chroma.from_documents(
-             documents…
-             embedding…
-             collection_name…
-             persist_directory…
-             client_settings…
          )
          return len(chunks)

      def query(self, question: str) -> Tuple[str, List[str]]:
          if self._vectorstore is None:
              return "Please upload a document first.", []

-         t0 …
-         error …
- …
-         sources = []
-         model_used = ""

          try:
              retriever = self._vectorstore.as_retriever(
                  search_type="mmr",
-                 search_kwargs={"k": TOP_K, "fetch_k": TOP_K * …
              )
-             docs …
              context = "\n\n---\n\n".join(
-                 "[Chunk {…
              )
-             sources …
              answer, model_used = self._generate(question, context)
          except Exception as e:
              error = str(e)
-             answer = "Error: …
          finally:
-             monitor.log_query(
-                 question = question,
-                 answer = answer,
-                 sources = sources,
-                 latency_ms = (time.time() - t0) * 1000,
-                 model_used = model_used,
-                 chunk_count = TOP_K,
-                 error = error,
-             )

          return answer, sources

      def _generate(self, question: str, context: str) -> Tuple[str, str]:
          hf_token = os.environ.get("HF_TOKEN", "")
          if not hf_token:
              return (
                  "HF_TOKEN not set. Add it as a Secret in Space Settings.\n\n"
-                 "Best matching excerpt:\n\n" + …
                  "none"
              )

          system_prompt = (
-             "You are DocMind, …
-             "…
-             "…
          )
-         user_message = (
-             "Context:\n" + context +
-             "\n\n---\nQuestion: " + question +
-             "\nAnswer:"
-         )
-         headers = {
-             "Authorization": "Bearer " + hf_token,
-             "Content-Type": "application/json",
-         }

          last_error = ""
          for model_id in CANDIDATE_MODELS:
              try:
-                 payload = {
-                     "model": model_id,
-                     "messages": [
-                         {"role": "system", "content": system_prompt},
-                         {"role": "user", "content": user_message},
-                     ],
-                     "max_tokens": 400,
-                     "temperature": 0.05,
-                     "stream": False,
-                 }
                  resp = requests.post(
                      HF_API_URL,
                      headers=headers,
-                     data=json.dumps(…
                      timeout=60,
                  )
                  if resp.status_code == 200:
                      raw = resp.json()["choices"][0]["message"]["content"].strip()
-                     answer = …
                      if answer:
                          return answer, model_id
                  else:
-                     last_error = "…
- …
-                     )
-                     print("[DocMind] " + last_error)
              except Exception as e:
                  last_error = str(e)
- …
                  continue

- …
              "AI unavailable. Most relevant excerpt:\n\n"
-             + …
-             + "\n\n(Error: …
          )
-         return fallback, "fallback"


- …
      text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()
- …
          "okay", "ok,", "alright", "let me", "let's", "i need", "i will",
          "i'll", "first,", "so,", "the user", "looking at", "going through",
          "based on the chunk", "parsing", "to answer", "in order to",
      ]
-     lines …
-     clean …
-     found_real = False
      for line in lines:
-         lower …
- …
- …
- …
-             found_real = True
              clean.append(line)
          else:
              clean.append(line)
- …
-     if not result or len(result) > 1500:
-         paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
-         if paragraphs:
-             last = paragraphs[-1]
-             if len(last) < 800:
-                 return last
-     return result if result else text


- def …
      keywords = set(re.findall(r'\b\w{4,}\b', question.lower()))
- …
-     best_score = 0
      for chunk in context.split("---"):
- …
- …
- …
- …
-         best_chunk = chunk.strip()
-     if not best_chunk:
-         return "No relevant content found."
-     return best_chunk[:600] + ("..." if len(best_chunk) > 600 else "")
-
-
- def get_suffix(name: str) -> str:
-     return os.path.splitext(name)[-1].lower() or ".txt"
+++ rag_engine.py (new; added lines marked "+")

@@ -1,58 +1,79 @@
  """
+ rag_engine.py — Multimodal RAG Engine with Conversation Memory
+ Supports: PDF, TXT, DOCX, CSV, XLSX, Images (JPG/PNG/WEBP)
+ Memory: sliding window of last 6 exchanges
  """

  import os
  import re
+ import io
  import json
  import time
  import tempfile
  import requests
+ import logging
+ from pathlib import Path
+ from typing import Tuple, List, Optional

  from chromadb.config import Settings
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain_community.vectorstores import Chroma
  from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.document_loaders import PyPDFLoader, TextLoader
+ from langchain.schema import Document
+
  import monitor

+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # ── Constants ────────────────────────────────────────────────────────────────
  EMBED_MODEL = "all-MiniLM-L6-v2"
  CHUNK_SIZE = 600
  CHUNK_OVERLAP = 100
+ TOP_K = 4
+ COLLECTION_NAME = "docmind_multimodal"
  CHROMA_DIR = "/tmp/chroma_db"
  HF_API_URL = "https://router.huggingface.co/v1/chat/completions"
+ MEMORY_WINDOW = 6  # number of past Q&A pairs to keep
+
+ SUPPORTED_EXTENSIONS = {
+     ".pdf", ".txt",
+     ".docx", ".doc",
+     ".csv", ".xlsx", ".xls",
+     ".jpg", ".jpeg", ".png", ".webp",
+ }

  CANDIDATE_MODELS = [
+     "meta-llama/Llama-3.1-8B-Instruct:cerebras",
+     "meta-llama/Llama-3.3-70B-Instruct:cerebras",
+     "mistralai/Mistral-7B-Instruct-v0.3:fireworks-ai",
+     "HuggingFaceTB/SmolLM3-3B:hf-inference",
  ]


+ def get_suffix(name: str) -> str:
+     return Path(name).suffix.lower() or ".txt"
+
+
  class RAGEngine:
      def __init__(self):
+         self._embeddings: Optional[HuggingFaceEmbeddings] = None
+         self._vectorstore: Optional[Chroma] = None
+         self._splitter = RecursiveCharacterTextSplitter(
              chunk_size=CHUNK_SIZE,
              chunk_overlap=CHUNK_OVERLAP,
              separators=["\n\n", "\n", ". ", " ", ""],
          )
+         self._memory: List[dict] = []
+         self._doc_name: str = ""
+         self._doc_type: str = ""
          monitor.log_startup()

      @property
      def embeddings(self):
          if self._embeddings is None:
+             logger.info("Loading embedding model...")
              self._embeddings = HuggingFaceEmbeddings(
                  model_name=EMBED_MODEL,
                  model_kwargs={"device": "cpu"},

@@ -60,35 +81,198 @@ class RAGEngine:
              )
          return self._embeddings

+     # ── Memory ───────────────────────────────────────────────────────────────
+
+     def clear_memory(self):
+         self._memory = []
+
+     def add_to_memory(self, question: str, answer: str):
+         self._memory.append({"role": "user", "content": question})
+         self._memory.append({"role": "assistant", "content": answer})
+         max_msgs = MEMORY_WINDOW * 2
+         if len(self._memory) > max_msgs:
+             self._memory = self._memory[-max_msgs:]
+
+     def get_memory_messages(self) -> List[dict]:
+         return self._memory.copy()
+
+     def get_memory_count(self) -> int:
+         return len(self._memory) // 2
+
+     # ── Ingestion ────────────────────────────────────────────────────────────
+
      def ingest_file(self, uploaded_file) -> int:
+         """Accept FastAPI UploadFile or Streamlit UploadedFile."""
+         t0 = time.time()
+         filename = getattr(uploaded_file, "name", None) or getattr(uploaded_file, "filename", "file")
+         suffix = get_suffix(filename)
+         error = ""
+         chunks = 0
+
+         if suffix not in SUPPORTED_EXTENSIONS:
+             raise ValueError(
+                 f"Unsupported: {suffix}. Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
+             )
          try:
+             if hasattr(uploaded_file, "read"):
+                 data = uploaded_file.read()
+                 if hasattr(uploaded_file, "seek"):
+                     uploaded_file.seek(0)
+             else:
+                 data = uploaded_file.file.read()
+
+             docs = self._route(data, filename, suffix)
+             chunks = self._index(docs, filename)
+             self._doc_name = filename
+             self._doc_type = suffix
+             self.clear_memory()
          except Exception as e:
              error = str(e)
+             logger.error(f"Ingestion error: {e}")
              raise
          finally:
+             monitor.log_ingestion(filename, chunks, (time.time()-t0)*1000, error)
          return chunks

      def ingest_path(self, path: str, name: str = "") -> int:
+         filename = name or Path(path).name
+         suffix = get_suffix(filename)
+         with open(path, "rb") as f:
+             data = f.read()
+         docs = self._route(data, filename, suffix)
+         chunks = self._index(docs, filename)
+         self._doc_name = filename
+         self._doc_type = suffix
+         self.clear_memory()
+         return chunks
+
+     def _route(self, data: bytes, filename: str, suffix: str) -> List[Document]:
+         if suffix == ".pdf":
+             return self._load_pdf(data, filename)
+         elif suffix == ".txt":
+             return self._load_text(data, filename)
+         elif suffix in {".docx", ".doc"}:
+             return self._load_docx(data, filename)
+         elif suffix == ".csv":
+             return self._load_csv(data, filename)
+         elif suffix in {".xlsx", ".xls"}:
+             return self._load_excel(data, filename)
+         elif suffix in {".jpg", ".jpeg", ".png", ".webp"}:
+             return self._load_image(data, filename)
+         raise ValueError(f"No loader for {suffix}")
+
+     # ── Loaders ──────────────────────────────────────────────────────────────
+
+     def _load_pdf(self, data: bytes, filename: str) -> List[Document]:
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+             tmp.write(data)
+             tmp_path = tmp.name
+         try:
+             docs = PyPDFLoader(tmp_path).load()
+             for doc in docs:
+                 doc.metadata.update({"source": filename, "type": "pdf"})
+             return docs
+         finally:
+             os.unlink(tmp_path)
+
+     def _load_text(self, data: bytes, filename: str) -> List[Document]:
+         return [Document(
+             page_content=data.decode("utf-8", errors="replace"),
+             metadata={"source": filename, "type": "text"}
+         )]
+
+     def _load_docx(self, data: bytes, filename: str) -> List[Document]:
+         try:
+             import docx2txt
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
+                 tmp.write(data)
+                 tmp_path = tmp.name
+             try:
+                 text = docx2txt.process(tmp_path)
+             finally:
+                 os.unlink(tmp_path)
+         except ImportError:
+             text = data.decode("utf-8", errors="replace")
+         return [Document(page_content=text, metadata={"source": filename, "type": "docx"})]
+
+     def _load_csv(self, data: bytes, filename: str) -> List[Document]:
+         import pandas as pd
+         df = pd.read_csv(io.BytesIO(data))
+         docs = []
+
+         summary = (
+             f"File: {filename}\n"
+             f"Shape: {df.shape[0]} rows × {df.shape[1]} columns\n"
+             f"Columns: {', '.join(df.columns.tolist())}\n\n"
+             f"First 10 rows:\n{df.head(10).to_string(index=False)}"
+         )
+         docs.append(Document(page_content=summary, metadata={"source": filename, "type": "csv_summary"}))
+
+         try:
+             stats = "Statistical summary:\n" + df.describe(include="all").to_string()
+             docs.append(Document(page_content=stats, metadata={"source": filename, "type": "csv_stats"}))
+         except Exception:
+             pass
+
+         for i in range(0, min(len(df), 500), 50):
+             chunk = f"Rows {i}–{i+50}:\n{df.iloc[i:i+50].to_string(index=False)}"
+             docs.append(Document(page_content=chunk, metadata={"source": filename, "type": "csv_rows"}))
+
+         return docs
+
+     def _load_excel(self, data: bytes, filename: str) -> List[Document]:
+         import pandas as pd
+         xl = pd.ExcelFile(io.BytesIO(data))
+         docs = []
+         for sheet in xl.sheet_names:
+             df = xl.parse(sheet)
+             text = (
+                 f"Sheet: {sheet} | {df.shape[0]} rows × {df.shape[1]} cols\n"
+                 f"Columns: {', '.join(str(c) for c in df.columns)}\n\n"
+                 f"{df.head(10).to_string(index=False)}"
+             )
+             docs.append(Document(page_content=text, metadata={"source": filename, "type": "excel", "sheet": sheet}))
+         return docs
+
+     def _load_image(self, data: bytes, filename: str) -> List[Document]:
+         caption = self._caption_image(data, filename)
+         text = (
+             f"Image file: {filename}\n\n"
+             f"AI-generated image description:\n{caption}\n\n"
+             f"The above description represents the full visual content of this image."
+         )
+         return [Document(
+             page_content=text,
+             metadata={"source": filename, "type": "image", "caption": caption}
+         )]
+
+     def _caption_image(self, data: bytes, filename: str) -> str:
+         hf_token = os.environ.get("HF_TOKEN", "")
+         if not hf_token:
+             return f"[Image: {filename}] — Add HF_TOKEN secret to enable AI image captioning."
+         try:
+             import base64
+             resp = requests.post(
+                 "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large",
+                 headers={"Authorization": f"Bearer {hf_token}"},
+                 json={"inputs": base64.b64encode(data).decode()},
+                 timeout=30,
+             )
+             if resp.status_code == 200:
+                 result = resp.json()
+                 if isinstance(result, list) and result:
+                     caption = result[0].get("generated_text", "")
+                     if caption:
+                         logger.info(f"Image caption: {caption[:80]}")
+                         return caption
+         except Exception as e:
+             logger.warning(f"Caption failed: {e}")
+         return f"[Image: {filename}] — Visual content uploaded (captioning unavailable)"
+
+     # ── Indexing ─────────────────────────────────────────────────────────────
+
+     def _index(self, docs: List[Document], filename: str) -> int:
+         chunks = self._splitter.split_documents(docs)
          if self._vectorstore is not None:
              try:
                  self._vectorstore._client.reset()

@@ -96,160 +280,157 @@ class RAGEngine:
                  pass
              self._vectorstore = None
          self._vectorstore = Chroma.from_documents(
+             documents=chunks,
+             embedding=self.embeddings,
+             collection_name=COLLECTION_NAME,
+             persist_directory=CHROMA_DIR,
+             client_settings=Settings(anonymized_telemetry=False),
          )
+         logger.info(f"Indexed {len(chunks)} chunks from {filename}")
          return len(chunks)

+     # ── Query ────────────────────────────────────────────────────────────────
+
      def query(self, question: str) -> Tuple[str, List[str]]:
          if self._vectorstore is None:
              return "Please upload a document first.", []

+         t0 = time.time()
+         error = answer = model_used = ""
+         sources = []

          try:
              retriever = self._vectorstore.as_retriever(
                  search_type="mmr",
+                 search_kwargs={"k": TOP_K, "fetch_k": TOP_K * 3},
              )
+             docs = retriever.invoke(question)
              context = "\n\n---\n\n".join(
+                 f"[Chunk {i+1} | {d.metadata.get('type','text')}]\n{d.page_content}"
+                 for i, d in enumerate(docs)
              )
+             sources = list({d.metadata.get("source", "Document") for d in docs})
              answer, model_used = self._generate(question, context)
+             self.add_to_memory(question, answer)
+
          except Exception as e:
              error = str(e)
+             answer = f"Error: {error}"
+             logger.error(f"Query error: {e}")
          finally:
+             monitor.log_query(question, answer, sources, (time.time()-t0)*1000, model_used, TOP_K, error)

          return answer, sources

+     # ── LLM ──────────────────────────────────────────────────────────────────
+
      def _generate(self, question: str, context: str) -> Tuple[str, str]:
          hf_token = os.environ.get("HF_TOKEN", "")
          if not hf_token:
              return (
                  "HF_TOKEN not set. Add it as a Secret in Space Settings.\n\n"
+                 "Best matching excerpt:\n\n" + _extract_best(question, context),
                  "none"
              )

+         doc_type_hint = ""
+         if self._doc_type in {".jpg", ".jpeg", ".png", ".webp"}:
+             doc_type_hint = "The document is an IMAGE described by an AI caption. Base your answer on the caption."
+         elif self._doc_type in {".csv", ".xlsx", ".xls"}:
+             doc_type_hint = "The document is tabular data (spreadsheet/CSV). Refer to column names and values precisely."
+
          system_prompt = (
+             f"You are DocMind AI, an expert document analyst built by Ryan Farahani.\n"
+             f"You are analyzing: '{self._doc_name}'.\n"
+             f"{doc_type_hint}\n"
+             "Answer using ONLY the provided document context. "
+             "Be concise and precise. No preamble. No reasoning out loud. Just answer.\n"
+             "If asked a follow-up question, use the conversation history for context."
          )

+         # Build messages with memory
+         messages = [{"role": "system", "content": system_prompt}]
+         memory = self.get_memory_messages()
+
+         if memory:
+             # Context injection before history
+             messages.append({
+                 "role": "system",
+                 "content": f"Current document context:\n{context}"
+             })
+             messages.extend(memory)
+             messages.append({"role": "user", "content": question})
+         else:
+             messages.append({
+                 "role": "user",
+                 "content": f"Document context:\n{context}\n\n---\nQuestion: {question}"
+             })
+
+         headers = {"Authorization": f"Bearer {hf_token}", "Content-Type": "application/json"}
          last_error = ""
+
          for model_id in CANDIDATE_MODELS:
              try:
                  resp = requests.post(
                      HF_API_URL,
                      headers=headers,
+                     data=json.dumps({
+                         "model": model_id,
+                         "messages": messages,
+                         "max_tokens": 500,
+                         "temperature": 0.1,
+                         "stream": False,
+                     }),
                      timeout=60,
                  )
                  if resp.status_code == 200:
                      raw = resp.json()["choices"][0]["message"]["content"].strip()
+                     answer = _strip_thinking(raw)
                      if answer:
                          return answer, model_id
                  else:
+                     last_error = f"{model_id} → {resp.status_code}: {resp.text[:150]}"
+                     logger.warning(last_error)
              except Exception as e:
                  last_error = str(e)
+                 logger.warning(f"Exception on {model_id}: {e}")
                  continue

+         return (
              "AI unavailable. Most relevant excerpt:\n\n"
+             + _extract_best(question, context)
+             + f"\n\n(Error: {last_error})",
+             "fallback"
          )


+ # ── Helpers ──────────────────────────────────────────────────────────────────
+
+ def _strip_thinking(text: str) -> str:
      text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()
+     starters = [
          "okay", "ok,", "alright", "let me", "let's", "i need", "i will",
          "i'll", "first,", "so,", "the user", "looking at", "going through",
          "based on the chunk", "parsing", "to answer", "in order to",
      ]
+     lines = text.split("\n")
+     clean, found = [], False
      for line in lines:
+         lower = line.strip().lower()
+         if not found:
+             if line.strip() and not any(lower.startswith(p) for p in starters):
+                 found = True
                  clean.append(line)
          else:
              clean.append(line)
+     return "\n".join(clean).strip() or text


+ def _extract_best(question: str, context: str) -> str:
      keywords = set(re.findall(r'\b\w{4,}\b', question.lower()))
+     best, score = "", 0
      for chunk in context.split("---"):
+         s = len(keywords & set(re.findall(r'\b\w{4,}\b', chunk.lower())))
+         if s > score:
+             score, best = s, chunk.strip()
+     return (best[:600] + "...") if len(best) > 600 else best or "No relevant content found."
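
Usage, as a minimal sketch (the file name and questions below are hypothetical; HF_TOKEN must be set as a Space secret to get LLM answers rather than the excerpt fallback):

    from rag_engine import RAGEngine

    engine = RAGEngine()

    # Ingesting a document builds a fresh Chroma index and clears memory.
    n_chunks = engine.ingest_path("report.pdf")
    print(f"indexed {n_chunks} chunks")

    # First query: the retrieved context travels inline with the question.
    answer, sources = engine.query("What is the report's main conclusion?")

    # Follow-up query: the sliding window (MEMORY_WINDOW = 6 Q&A pairs,
    # i.e. at most 12 messages) lets the model resolve "it".
    answer, sources = engine.query("When was it published?")
    print(engine.get_memory_count())  # number of stored Q&A pairs, here 2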
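
The monitor module itself is not part of this commit, but the diff changes its call sites from keyword to positional arguments. A stub consistent with the new call sites could look roughly like the sketch below; the field names are assumptions carried over from the old keyword arguments:

    # monitor.py (hypothetical stub matching the call sites above)
    import json
    import time

    def log_startup() -> None:
        print(json.dumps({"event": "startup", "ts": time.time()}))

    def log_ingestion(filename: str, chunk_count: int,
                      latency_ms: float, error: str) -> None:
        print(json.dumps({"event": "ingestion", "filename": filename,
                          "chunk_count": chunk_count,
                          "latency_ms": round(latency_ms, 1), "error": error}))

    def log_query(question: str, answer: str, sources: list,
                  latency_ms: float, model_used: str,
                  chunk_count: int, error: str) -> None:
        print(json.dumps({"event": "query", "question": question,
                          "answer": answer, "sources": sources,
                          "latency_ms": round(latency_ms, 1),
                          "model_used": model_used,
                          "chunk_count": chunk_count, "error": error}))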