Update rag_engine.py
rag_engine.py  +26 -91  CHANGED
@@ -1,119 +1,54 @@
 import re
-import textwrap
 import faiss
 import numpy as np
 import requests
 import spacy
-from typing import Any
 from bs4 import BeautifulSoup
 from huggingface_hub import InferenceClient
 from pypdf import PdfReader
-from sentence_transformers import …
-
-# ── Config ──────────────────────────────────────────────────────────────────
-EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
-LLM_MODEL = "HuggingFaceH4/zephyr-7b-beta"
-CHUNK_SIZE = 400
-CHUNK_OVERLAP = 80
-TOP_K = 4
-
-INTENT_MAP = {
-    "summarise": ["summarise", "summarize", "summary", "overview", "brief", "key points"],
-    "explain": ["explain", "what is", "what are", "define", "describe", "tell me about"],
-    "list": ["list", "enumerate", "give me a list", "what are the types"],
-}
+from sentence_transformers import SentenceTransformer
 
 class RAGEngine:
     def __init__(self):
-        self.embed_model = SentenceTransformer(EMBED_MODEL)
+        self.embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
         self.hf_client = InferenceClient()
-
-        # Robust model loading for HuggingFace
         try:
             self.nlp = spacy.load("en_core_web_sm")
         except:
-            os.system("python -m spacy download en_core_web_sm")
-            self.nlp = spacy.load("en_core_web_sm")
-
+            self.nlp = None  # Fallback if model fails to load
        self.reset()
 
     def reset(self):
-        self.chunks = []
-        self.index = None
-        self.ready = False
+        self.chunks, self.index, self.ready = [], None, False
 
-    def …
+    def _index_text(self, text):
         text = re.sub(r'\s+', ' ', text).strip()
-        new_chunks = []
-        for i in range(0, len(text), CHUNK_SIZE - CHUNK_OVERLAP):
-            chunk = text[i : i + CHUNK_SIZE]
-            if len(chunk) > 20:
-                new_chunks.append(chunk)
-
-        self.chunks = new_chunks
+        self.chunks = [text[i:i+400] for i in range(0, len(text), 320) if len(text[i:i+400]) > 20]
         embeddings = self.embed_model.encode(self.chunks)
         self.index = faiss.IndexFlatL2(embeddings.shape[1])
         self.index.add(np.array(embeddings).astype("float32"))
         self.ready = True
 
-    def load_pdf(self, path…
-        …
-        self.…
-        return f"✅ Loaded PDF: {len(self.chunks)} chunks indexed."
+    def load_pdf(self, path):
+        text = "".join([p.extract_text() for p in PdfReader(path).pages])
+        self._index_text(text)
+        return f"✅ Indexed {len(self.chunks)} PDF sections."
 
-    def load_url(self, url…
+    def load_url(self, url):
         res = requests.get(url, timeout=10)
         soup = BeautifulSoup(res.text, "html.parser")
-        …
-        self.…
-        …
-        return "…
-
-    def extract_entities(self, text: str) -> dict:
-        doc = self.nlp(text)
-        entities = {}
-        for ent in doc.ents:
-            if ent.label_ not in entities:
-                entities[ent.label_] = []
-            if ent.text not in entities[ent.label_]:
-                entities[ent.label_].append(ent.text)
-        return entities
-
-    def _retrieve(self, query: str) -> list:
-        query_vec = self.embed_model.encode([query]).astype("float32")
-        _, indices = self.index.search(query_vec, TOP_K)
-        return [self.chunks[i] for i in indices[0]]
-
-    def answer(self, query: str) -> str:
-        if not self.ready:
-            return "⚠️ No knowledge source loaded."
-        chunks = self._retrieve(query)
-        context = "\n".join([f"- {c}" for c in chunks])
-        prompt = f"<|system|>\nUse context to answer accurately.\n</s>\n<|user|>\nContext: {context}\nQuestion: {query}\n</s>\n<|assistant|>"
-
-        try:
-            response = self.hf_client.text_generation(prompt, model=LLM_MODEL, max_new_tokens=512)
-            return response.strip()
-        except Exception as e:
-            return f"⚠️ LLM Error. Top match: {chunks[0]}"
-
-    def answer_with_nlp(self, query: str) -> tuple:
-        ans = self.answer(query)
-        info = {
-            "intent": self.detect_intent(query),
-            "entities_found": self.extract_entities(ans)
-        }
-        return ans, info
+        self._index_text(soup.get_text())
+        return f"✅ Indexed {len(self.chunks)} URL sections."
+
+    def answer_with_nlp(self, query):
+        if not self.ready: return "Please upload data first.", {}
+        # Retrieval
+        q_vec = self.embed_model.encode([query]).astype("float32")
+        _, idx = self.index.search(q_vec, 3)
+        context = "\n".join([self.chunks[i] for i in idx[0]])
+        # Generation
+        prompt = f"<|system|>Answer based on context.</s><|user|>Context: {context}\nQuestion: {query}</s><|assistant|>"
+        ans = self.hf_client.text_generation(prompt, model="HuggingFaceH4/zephyr-7b-beta", max_new_tokens=512).strip()
+        # NLP Analysis
+        entities = {ent.label_: ent.text for ent in self.nlp(ans).ents} if self.nlp else {}
+        return ans, {"entities": entities}
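
For context, a minimal usage sketch of the class as it stands after this commit. Everything here is hypothetical (the import path, the URL, and the question are placeholders); it also assumes a Hugging Face token is available to InferenceClient, network access for the fetch and for hosted inference, and an installed en_core_web_sm model (entity extraction silently degrades to an empty dict without it):

# Hypothetical driver script; not part of the commit.
from rag_engine import RAGEngine

engine = RAGEngine()

# A source must be indexed first; answer_with_nlp() refuses to answer otherwise.
print(engine.load_url("https://example.com/some-article"))  # placeholder URL

answer, info = engine.answer_with_nlp("What is this page about?")
print(answer)
print(info["entities"])  # {} when the spaCy model is unavailable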
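
One detail worth noting in _index_text: the comprehension slides a 400-character window with a stride of 320, so consecutive chunks overlap by 80 characters (matching CHUNK_SIZE - CHUNK_OVERLAP = 400 - 80 from the removed constants). A quick sketch of that behavior on toy input:

# Sliding-window chunking as in _index_text: window 400, stride 320,
# so adjacent chunks share 80 characters; fragments of 20 chars or fewer are dropped.
text = "x" * 1000
chunks = [text[i:i+400] for i in range(0, len(text), 320) if len(text[i:i+400]) > 20]
print([len(c) for c in chunks])  # [400, 400, 360, 40]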