sofzcc committed on
Commit
dd1add7
·
verified ·
1 Parent(s): 28c97dd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +297 -79
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import os
2
  import glob
3
  import yaml
4
- from typing import List, Tuple
5
 
6
  import faiss
7
  import numpy as np
@@ -16,8 +16,49 @@ import docx
16
  # CONFIG
17
  # -----------------------------
18
 
19
- with open("config.yaml", "r", encoding="utf-8") as f:
20
- CONFIG = yaml.safe_load(f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  KB_DIR = CONFIG["kb"]["directory"]
23
  INDEX_DIR = CONFIG["kb"]["index_directory"]
@@ -35,46 +76,96 @@ NO_ANSWER_MSG = CONFIG["messages"]["no_answer"]
35
  # -----------------------------
36
 
37
  def chunk_text(text: str, chunk_size: int, overlap: int) -> List[str]:
38
- if not text:
 
39
  return []
 
40
  chunks = []
41
  start = 0
42
- while start < len(text):
43
- end = min(start + chunk_size, len(text))
 
 
44
  chunk = text[start:end].strip()
45
- if chunk:
 
46
  chunks.append(chunk)
 
 
 
 
47
  start += chunk_size - overlap
 
48
  return chunks
49
 
50
 
51
  def load_file_text(path: str) -> str:
 
 
 
 
52
  ext = os.path.splitext(path)[1].lower()
53
- if ext == ".pdf":
54
- reader = PdfReader(path)
55
- return "\n".join(page.extract_text() or "" for page in reader.pages)
56
- elif ext in [".docx", ".doc"]:
57
- doc = docx.Document(path)
58
- return "\n".join(p.text for p in doc.paragraphs)
59
- else:
60
- with open(path, "r", encoding="utf-8") as f:
61
- return f.read()
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
 
64
  def load_kb_documents(kb_dir: str) -> List[Tuple[str, str]]:
 
65
  docs = []
66
- if os.path.isdir(kb_dir):
67
- paths = glob.glob(os.path.join(kb_dir, "*.txt")) \
68
- + glob.glob(os.path.join(kb_dir, "*.md")) \
69
- + glob.glob(os.path.join(kb_dir, "*.pdf")) \
70
- + glob.glob(os.path.join(kb_dir, "*.docx"))
71
- for path in paths:
72
- try:
73
- text = load_file_text(path)
74
- if text.strip():
75
- docs.append((os.path.basename(path), text))
76
- except Exception as e:
77
- print(f"Could not read {path}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  return docs
79
 
80
 
@@ -84,49 +175,94 @@ def load_kb_documents(kb_dir: str) -> List[Tuple[str, str]]:
84
 
85
  class RAGIndex:
86
  def __init__(self):
87
- print("Loading embedding model...")
88
- self.embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
89
- print("Loading QA model...")
90
- self.qa_pipeline = pipeline(
91
- "question-answering",
92
- model=AutoModelForQuestionAnswering.from_pretrained(QA_MODEL_NAME),
93
- tokenizer=AutoTokenizer.from_pretrained(QA_MODEL_NAME),
94
- handle_impossible_answer=True,
95
- )
96
  self.chunks: List[str] = []
97
  self.chunk_sources: List[str] = []
98
  self.index = None
99
- self._build_or_load_index()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  def _build_or_load_index(self):
 
102
  os.makedirs(INDEX_DIR, exist_ok=True)
103
  idx_path = os.path.join(INDEX_DIR, "kb.index")
104
  meta_path = os.path.join(INDEX_DIR, "kb_meta.npy")
105
 
 
106
  if os.path.exists(idx_path) and os.path.exists(meta_path):
107
- print("Loading existing FAISS index...")
108
- self.index = faiss.read_index(idx_path)
109
- meta = np.load(meta_path, allow_pickle=True).item()
110
- self.chunks = meta["chunks"]
111
- self.chunk_sources = meta["sources"]
112
- print("Index loaded.")
113
- return
 
 
 
 
114
 
115
- print("Building new FAISS index...")
 
116
  docs = load_kb_documents(KB_DIR)
 
 
 
 
 
 
 
117
  all_chunks = []
118
  all_sources = []
 
119
  for source, text in docs:
120
- for chunk in chunk_text(text, CHUNK_SIZE, CHUNK_OVERLAP):
 
121
  all_chunks.append(chunk)
122
  all_sources.append(source)
123
 
124
  if not all_chunks:
125
- print("⚠️ No KB documents found, index will stay empty.")
126
  self.index = None
127
  return
128
 
129
- embeddings = self.embedder.encode(all_chunks, show_progress_bar=True, convert_to_numpy=True)
 
 
 
 
 
 
 
 
 
130
  dimension = embeddings.shape[1]
131
  index = faiss.IndexFlatIP(dimension)
132
 
@@ -134,59 +270,118 @@ class RAGIndex:
134
  faiss.normalize_L2(embeddings)
135
  index.add(embeddings)
136
 
137
- faiss.write_index(index, idx_path)
138
- np.save(meta_path, {"chunks": np.array(all_chunks, dtype=object), "sources": np.array(all_sources, dtype=object)})
 
 
 
 
 
 
 
 
139
 
140
  self.index = index
141
  self.chunks = all_chunks
142
  self.chunk_sources = all_sources
143
- print("FAISS index ready.")
144
 
145
  def retrieve(self, query: str, top_k: int = 5) -> List[Tuple[str, str, float]]:
146
- if not query.strip() or self.index is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  return []
148
- q_emb = self.embedder.encode([query], convert_to_numpy=True)
149
- faiss.normalize_L2(q_emb)
150
- scores, idxs = self.index.search(q_emb, top_k)
151
- results = []
152
- for score, idx in zip(scores[0], idxs[0]):
153
- if idx == -1:
154
- continue
155
- if score < SIM_THRESHOLD:
156
- continue
157
- results.append((self.chunks[idx], self.chunk_sources[idx], float(score)))
158
- return results
159
 
160
  def answer(self, question: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  contexts = self.retrieve(question, top_k=3)
 
162
  if not contexts:
163
- return NO_ANSWER_MSG
 
 
 
164
 
 
165
  answers = []
166
  for ctx, source, score in contexts:
167
- qa_input = {"question": question, "context": ctx}
 
 
 
 
 
168
  try:
169
  result = self.qa_pipeline(qa_input)
170
- text = result.get("answer", "").strip()
171
- if text:
172
- answers.append((text, source, result.get("score", 0.0)))
 
 
 
173
  except Exception as e:
174
- print(f"QA error: {e}")
 
175
 
176
  if not answers:
177
- return NO_ANSWER_MSG
178
-
179
- # Pick best answer
180
- answers.sort(key=lambda x: x[2], reverse=True)
181
- best_answer, best_source, best_score = answers[0]
 
 
 
 
 
 
 
182
 
183
  return (
184
  f"**Answer:** {best_answer}\n\n"
185
- f"**Source:** {best_source} (confidence: {best_score:.2f})"
 
186
  )
187
 
188
 
 
 
189
  rag_index = RAGIndex()
 
190
 
191
 
192
  # -----------------------------
@@ -194,19 +389,42 @@ rag_index = RAGIndex()
194
  # -----------------------------
195
 
196
  def rag_respond(message: str, history):
 
 
 
 
197
  return rag_index.answer(message)
198
 
199
 
200
- description = CONFIG["messages"]["welcome"]
 
 
 
 
 
 
 
 
 
 
 
201
 
202
  chat = gr.ChatInterface(
203
  fn=rag_respond,
204
  title=CONFIG["client"]["name"],
205
  description=description,
206
  type="messages",
207
- examples=[qa["query"] for qa in CONFIG.get("quick_actions", [])],
208
  cache_examples=False,
 
 
 
209
  )
210
 
211
  if __name__ == "__main__":
212
- chat.launch()
 
 
 
 
 
 
1
  import os
2
  import glob
3
  import yaml
4
+ from typing import List, Tuple, Optional
5
 
6
  import faiss
7
  import numpy as np
 
16
  # CONFIG
17
  # -----------------------------
18
 
19
def load_config():
    """Load configuration from config.yaml, falling back to built-in defaults.

    Returns:
        dict: The parsed configuration mapping. Falls back to
        ``get_default_config()`` when the file is missing, unreadable, or
        does not contain a YAML mapping.
    """
    try:
        with open("config.yaml", "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)
        # BUG FIX: yaml.safe_load returns None for an empty file (and may
        # return a scalar/list for malformed content). Returning that
        # directly made later CONFIG["..."] lookups raise TypeError/KeyError,
        # so treat any non-dict result as "no usable config".
        if not isinstance(config, dict):
            print("⚠️ config.yaml is empty or invalid, using defaults")
            return get_default_config()
        return config
    except FileNotFoundError:
        print("⚠️ config.yaml not found, using defaults")
        return get_default_config()
    except Exception as e:
        print(f"⚠️ Error loading config: {e}, using defaults")
        return get_default_config()
30
+
31
+
32
def get_default_config():
    """Return the built-in fallback configuration.

    Mirrors the schema expected from config.yaml so the rest of the app can
    index into it without extra existence checks.
    """
    kb_settings = {
        "directory": "./knowledge_base",
        "index_directory": "./index",
    }
    model_settings = {
        "embedding": "all-MiniLM-L6-v2",
        "qa": "deepset/roberta-base-squad2",
    }
    chunk_settings = {"chunk_size": 500, "overlap": 50}
    message_settings = {
        "welcome": "Ask me anything about the documents in the knowledge base!",
        "no_answer": "I couldn't find a relevant answer in the knowledge base.",
    }
    return {
        "kb": kb_settings,
        "models": model_settings,
        "chunking": chunk_settings,
        "thresholds": {"similarity": 0.3},
        "messages": message_settings,
        "client": {"name": "RAG AI Assistant"},
        "quick_actions": [],
    }
59
+
60
+
61
+ CONFIG = load_config()
62
 
63
  KB_DIR = CONFIG["kb"]["directory"]
64
  INDEX_DIR = CONFIG["kb"]["index_directory"]
 
76
  # -----------------------------
77
 
78
def chunk_text(text: str, chunk_size: int, overlap: int) -> List[str]:
    """Split text into overlapping chunks.

    Args:
        text: Source text to split.
        chunk_size: Target length of each chunk, in characters.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        List of stripped chunks, each longer than 20 characters.
    """
    if not text or not text.strip():
        return []

    # BUG FIX: when overlap >= chunk_size the original step
    # (chunk_size - overlap) was <= 0, so `start` never advanced and the
    # loop spun forever. Clamp the advance to at least one character.
    step = max(1, chunk_size - overlap)

    chunks = []
    start = 0
    text_len = len(text)

    while start < text_len:
        end = min(start + chunk_size, text_len)
        chunk = text[start:end].strip()

        if chunk and len(chunk) > 20:  # Avoid tiny chunks
            chunks.append(chunk)

        if end >= text_len:
            break

        start += step

    return chunks
100
 
101
 
102
def load_file_text(path: str) -> str:
    """Read the textual content of one document, dispatching on extension.

    PDF files go through PdfReader, Word files through python-docx, and
    everything else (.txt, .md, ...) is read as plain UTF-8 text.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        Exception: Re-raised after logging when the file cannot be parsed.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")

    extension = os.path.splitext(path)[1].lower()

    try:
        if extension == ".pdf":
            pages = PdfReader(path).pages
            extracted = (page.extract_text() for page in pages)
            return "\n".join(part for part in extracted if part)

        if extension in (".docx", ".doc"):
            paragraphs = docx.Document(path).paragraphs
            return "\n".join(p.text for p in paragraphs if p.text.strip())

        # Plain-text fallback; errors="ignore" skips undecodable bytes.
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()

    except Exception as e:
        print(f"Error reading {path}: {e}")
        raise
130
 
131
 
132
def load_kb_documents(kb_dir: str) -> List[Tuple[str, str]]:
    """Collect (filename, text) pairs for every readable document in kb_dir.

    Creates the directory when it is missing and skips unreadable or empty
    files, printing one status line per file so startup problems are visible.
    """
    docs = []

    if not os.path.exists(kb_dir):
        print(f"⚠️ Knowledge base directory not found: {kb_dir}")
        print(f"Creating directory: {kb_dir}")
        os.makedirs(kb_dir, exist_ok=True)
        return docs

    if not os.path.isdir(kb_dir):
        print(f"⚠️ {kb_dir} is not a directory")
        return docs

    # Gather every supported file format in one pass.
    paths = [
        match
        for pattern in ("*.txt", "*.md", "*.pdf", "*.docx", "*.doc")
        for match in glob.glob(os.path.join(kb_dir, pattern))
    ]

    if not paths:
        print(f"⚠️ No documents found in {kb_dir}")
        return docs

    print(f"Found {len(paths)} documents in knowledge base")

    for path in paths:
        name = os.path.basename(path)
        try:
            content = load_file_text(path)
        except Exception as e:
            print(f"✗ Could not read {path}: {e}")
            continue
        if content and content.strip():
            docs.append((name, content))
            print(f"✓ Loaded: {name}")
        else:
            print(f"⚠️ Empty file: {name}")

    return docs
170
 
171
 
 
175
 
176
class RAGIndex:
    """Retrieval-augmented QA over a FAISS index of knowledge-base chunks.

    Attributes set here and read elsewhere in the module:
    ``initialized`` (startup succeeded) and ``index`` (None when the
    knowledge base is empty).
    """

    def __init__(self):
        # Models are populated by _initialize_models; None until then.
        self.embedder = None
        self.qa_pipeline = None
        self.chunks: List[str] = []
        self.chunk_sources: List[str] = []
        self.index = None
        self.initialized = False

        try:
            print("🔄 Initializing RAG Assistant...")
            self._initialize_models()
            self._build_or_load_index()
            self.initialized = True
            print("✅ RAG Assistant ready!")
        except Exception as e:
            # Keep the process alive; answer() reports the degraded state.
            print(f"❌ Initialization error: {e}")
            print("The assistant will run in limited mode.")

    def _initialize_models(self):
        """Load the sentence-embedding model and the extractive QA pipeline."""
        try:
            print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}")
            self.embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)

            print(f"Loading QA model: {QA_MODEL_NAME}")
            qa_model = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL_NAME)
            qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME)
            self.qa_pipeline = pipeline(
                "question-answering",
                model=qa_model,
                tokenizer=qa_tokenizer,
                handle_impossible_answer=True,
            )
        except Exception as e:
            print(f"Error loading models: {e}")
            raise

    def _build_or_load_index(self):
        """Load a persisted FAISS index, or build one from the knowledge base."""
        os.makedirs(INDEX_DIR, exist_ok=True)
        index_path = os.path.join(INDEX_DIR, "kb.index")
        metadata_path = os.path.join(INDEX_DIR, "kb_meta.npy")

        # Reuse a previously persisted index when both artifacts exist.
        if os.path.exists(index_path) and os.path.exists(metadata_path):
            try:
                print("Loading existing FAISS index...")
                self.index = faiss.read_index(index_path)
                metadata = np.load(metadata_path, allow_pickle=True).item()
                self.chunks = list(metadata["chunks"])
                self.chunk_sources = list(metadata["sources"])
                print(f"✓ Index loaded with {len(self.chunks)} chunks")
                return
            except Exception as e:
                # Corrupt/stale artifacts: fall through and rebuild.
                print(f"⚠️ Could not load existing index: {e}")
                print("Building new index...")

        print("Building new FAISS index from knowledge base...")
        docs = load_kb_documents(KB_DIR)

        if not docs:
            print("⚠️ No documents found in knowledge base")
            print(f" Please add .txt, .md, .pdf, or .docx files to: {KB_DIR}")
            self.index = None
            return

        all_chunks = []
        all_sources = []
        for source, text in docs:
            for chunk in chunk_text(text, CHUNK_SIZE, CHUNK_OVERLAP):
                all_chunks.append(chunk)
                all_sources.append(source)

        if not all_chunks:
            print("⚠️ No valid chunks created from documents")
            self.index = None
            return

        print(f"Created {len(all_chunks)} chunks from {len(docs)} documents")
        print("Generating embeddings...")

        embeddings = self.embedder.encode(
            all_chunks,
            show_progress_bar=True,
            convert_to_numpy=True,
            batch_size=32,
        )

        dimension = embeddings.shape[1]
        new_index = faiss.IndexFlatIP(dimension)

        # L2-normalized vectors + inner product == cosine similarity.
        faiss.normalize_L2(embeddings)
        new_index.add(embeddings)

        # Persist the index; failing to save is non-fatal.
        try:
            faiss.write_index(new_index, index_path)
            np.save(metadata_path, {
                "chunks": np.array(all_chunks, dtype=object),
                "sources": np.array(all_sources, dtype=object),
            })
            print("✓ Index saved successfully")
        except Exception as e:
            print(f"⚠️ Could not save index: {e}")

        self.index = new_index
        self.chunks = all_chunks
        self.chunk_sources = all_sources

    def retrieve(self, query: str, top_k: int = 5) -> List[Tuple[str, str, float]]:
        """Return up to top_k (chunk, source, score) tuples above SIM_THRESHOLD."""
        if not query or not query.strip():
            return []

        if self.index is None or not self.initialized:
            return []

        try:
            query_embedding = self.embedder.encode([query], convert_to_numpy=True)
            faiss.normalize_L2(query_embedding)
            k = min(top_k, len(self.chunks))
            scores, indices = self.index.search(query_embedding, k)

            hits = []
            for score, idx in zip(scores[0], indices[0]):
                if idx == -1 or idx >= len(self.chunks):
                    continue
                if score < SIM_THRESHOLD:
                    continue
                hits.append((self.chunks[idx], self.chunk_sources[idx], float(score)))

            return hits

        except Exception as e:
            print(f"Retrieval error: {e}")
            return []

    def answer(self, question: str) -> str:
        """Answer a question with retrieval + extractive QA; returns markdown."""
        if not self.initialized:
            return "❌ Assistant not properly initialized. Please check the logs."

        if not question or not question.strip():
            return "Please ask a question."

        if self.index is None:
            return (
                f"📚 Knowledge base is empty.\n\n"
                f"Please add documents to: `{KB_DIR}`\n"
                f"Supported formats: .txt, .md, .pdf, .docx"
            )

        contexts = self.retrieve(question, top_k=3)

        if not contexts:
            return (
                f"{NO_ANSWER_MSG}\n\n"
                f"💡 Try rephrasing your question or check if relevant documents exist in the knowledge base."
            )

        # Run extractive QA against each retrieved context.
        max_context_length = 2000  # characters, roughly 512 tokens
        candidates = []
        for ctx, source, retrieval_score in contexts:
            qa_input = {"question": question, "context": ctx[:max_context_length]}
            try:
                result = self.qa_pipeline(qa_input)
                answer_text = result.get("answer", "").strip()
                answer_score = result.get("score", 0.0)
                if answer_text and answer_score > 0.01:  # Minimum confidence threshold
                    candidates.append((answer_text, source, answer_score, retrieval_score))
            except Exception as e:
                print(f"QA error on context from {source}: {e}")
                continue

        if not candidates:
            # No extractable answer: surface the best raw context instead.
            best_ctx, best_src, _ = contexts[0]
            preview = best_ctx[:300] + "..." if len(best_ctx) > 300 else best_ctx
            return (
                f"I found relevant information but couldn't extract a specific answer.\n\n"
                f"**Relevant context from {best_src}:**\n{preview}\n\n"
                f"💡 Try asking a more specific question."
            )

        # Best answer = highest product of QA confidence and retrieval score.
        candidates.sort(key=lambda item: item[2] * item[3], reverse=True)
        best_answer, best_source, qa_score, _ = candidates[0]

        return (
            f"**Answer:** {best_answer}\n\n"
            f"**Source:** {best_source}\n"
            f"**Confidence:** {qa_score:.2%}"
        )
379
 
380
 
381
# Initialize the global RAG system at import time so the Gradio app can use it.
_BANNER = "=" * 50
print(_BANNER)
rag_index = RAGIndex()
print(_BANNER)
385
 
386
 
387
  # -----------------------------
 
389
  # -----------------------------
390
 
391
  def rag_respond(message: str, history):
392
+ """Handle chat messages"""
393
+ if not message or not message.strip():
394
+ return "Please enter a question."
395
+
396
  return rag_index.answer(message)
397
 
398
 
399
# Build interface
description = WELCOME_MSG
if not rag_index.initialized or rag_index.index is None:
    description += f"\n\n⚠️ **Note:** Knowledge base is empty. Add documents to `{KB_DIR}` and restart."

# Use configured quick actions as examples; fall back to generic prompts
# only when the knowledge base actually has content to answer them with.
examples = [qa.get("query") for qa in CONFIG.get("quick_actions", []) if qa.get("query")]
if not examples and rag_index.initialized and rag_index.index is not None:
    examples = [
        "What is this document about?",
        "Can you summarize the main points?",
        "What are the key findings?"
    ]

# BUG FIX: retry_btn / undo_btn / clear_btn were removed from
# gr.ChatInterface in modern Gradio (the versions that support
# type="messages"); passing them raises TypeError at startup, so they
# are omitted here. The default chat UI provides retry/undo/clear.
chat = gr.ChatInterface(
    fn=rag_respond,
    title=CONFIG["client"]["name"],
    description=description,
    type="messages",
    examples=examples if examples else None,
    cache_examples=False,
)
423
 
424
if __name__ == "__main__":
    # Launch with better settings for Hugging Face Spaces
    launch_options = {
        "server_name": "0.0.0.0",  # listen on all interfaces inside the container
        "server_port": 7860,       # the port HF Spaces routes traffic to
        "share": False,
    }
    chat.launch(**launch_options)