sofzcc commited on
Commit
8e14def
·
verified ·
1 Parent(s): 02a1b59

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +545 -325
app.py CHANGED
@@ -1,420 +1,640 @@
1
  import os
2
  import glob
 
 
 
3
  from typing import List, Tuple
4
- import time
5
 
6
- import gradio as gr
7
  import numpy as np
8
- from sentence_transformers import SentenceTransformer
 
 
 
 
 
9
 
10
  # -----------------------------
11
  # CONFIG
12
  # -----------------------------
13
- KB_DIR = "./kb" # folder with .txt or .md files
14
- EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
15
- TOP_K = 3
16
- CHUNK_SIZE = 500 # characters
17
- CHUNK_OVERLAP = 100 # characters
18
- MIN_SIMILARITY_THRESHOLD = 0.3 # Minimum similarity score to include results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  # -----------------------------
21
  # UTILITIES
22
  # -----------------------------
23
 
24
- def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
25
- """Split long text into overlapping chunks so retrieval is more precise."""
26
- if not text:
27
  return []
28
 
29
  chunks = []
30
  start = 0
31
- length = len(text)
32
 
33
- while start < length:
34
- end = min(start + chunk_size, length)
35
  chunk = text[start:end].strip()
36
- if chunk:
 
37
  chunks.append(chunk)
 
 
 
 
38
  start += chunk_size - overlap
39
 
40
  return chunks
41
 
42
 
43
- def load_kb_texts(kb_dir: str = KB_DIR) -> List[Tuple[str, str]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  """
45
- Load all .txt and .md files from the KB directory.
46
- Returns a list of (source_name, content).
 
 
47
  """
48
- texts = []
 
 
49
 
50
- if os.path.isdir(kb_dir):
51
- paths = glob.glob(os.path.join(kb_dir, "*.txt")) + glob.glob(os.path.join(kb_dir, "*.md"))
52
- for path in paths:
53
- try:
54
- with open(path, "r", encoding="utf-8") as f:
55
- content = f.read()
56
- if content.strip():
57
- texts.append((os.path.basename(path), content))
58
- except Exception as e:
59
- print(f"Could not read {path}: {e}")
60
 
61
- # If no files found, fall back to built-in demo content
62
- if not texts:
63
- print("No KB files found. Using built-in demo content.")
64
- demo_text = """
65
- Welcome to the Self-Service KB Assistant.
66
 
67
- This assistant is meant to help you find information inside a knowledge base.
68
- In a real setup, it would be connected to your own articles, procedures,
69
- troubleshooting guides and FAQs.
70
 
71
- Good knowledge base content is:
72
- - Clear and structured with headings, steps and expected outcomes.
73
- - Written in a customer-friendly tone.
74
- - Easy to scan, with short paragraphs and bullet points.
75
- - Maintained regularly to reflect product and process changes.
76
 
77
- Example use cases for a KB assistant:
78
- - Agents quickly searching for internal procedures.
79
- - Customers asking "how do I…" style questions.
80
- - Managers analyzing gaps in documentation based on repeated queries.
81
- """
82
- texts.append(("demo_content.txt", demo_text))
83
 
84
- return texts
 
 
 
 
 
 
 
85
 
86
 
87
  # -----------------------------
88
- # KB INDEX
89
  # -----------------------------
90
 
91
- class KBIndex:
92
- def __init__(self, model_name: str = EMBEDDING_MODEL_NAME):
93
- print("Loading embedding model...")
94
- self.model = SentenceTransformer(model_name)
95
- print("Embedding model loaded.")
96
  self.chunks: List[str] = []
97
  self.chunk_sources: List[str] = []
98
- self.embeddings = None
99
- self.build_index()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
- def build_index(self):
102
- """Load KB texts, split into chunks, and build an embedding index."""
103
- texts = load_kb_texts(KB_DIR)
104
- all_chunks = []
105
- all_sources = []
106
 
107
- for source_name, content in texts:
108
- for chunk in chunk_text(content):
 
109
  all_chunks.append(chunk)
110
- all_sources.append(source_name)
111
 
112
  if not all_chunks:
113
- print("⚠️ No chunks found for KB index.")
 
114
  self.chunks = []
115
  self.chunk_sources = []
116
- self.embeddings = None
117
  return
118
 
119
- print(f"Creating embeddings for {len(all_chunks)} chunks...")
120
- embeddings = self.model.encode(all_chunks, show_progress_bar=False, convert_to_numpy=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  self.chunks = all_chunks
122
  self.chunk_sources = all_sources
123
- self.embeddings = embeddings
124
- print("KB index ready.")
125
 
126
- def search(self, query: str, top_k: int = TOP_K) -> List[Tuple[str, str, float]]:
127
- """Return top-k (chunk, source_name, score) for a given query."""
128
- if not query.strip():
129
  return []
130
 
131
- if self.embeddings is None or not len(self.chunks):
132
  return []
133
 
134
- query_vec = self.model.encode([query], show_progress_bar=False, convert_to_numpy=True)[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
- # Cosine similarity
137
- dot_scores = np.dot(self.embeddings, query_vec)
138
- norm_docs = np.linalg.norm(self.embeddings, axis=1)
139
- norm_query = np.linalg.norm(query_vec) + 1e-10
140
- scores = dot_scores / (norm_docs * norm_query + 1e-10)
141
 
142
- top_idx = np.argsort(scores)[::-1][:top_k]
143
- results = []
144
- for idx in top_idx:
145
- results.append((self.chunks[idx], self.chunk_sources[idx], float(scores[idx])))
 
 
146
 
147
- return results
 
 
 
 
148
 
 
 
 
 
149
 
150
- # Initialize KB index
151
- print("Initializing KB index...")
152
- kb_index = KBIndex()
153
 
154
- # Initialize LLM for answer generation
155
- print("Loading LLM for answer generation...")
156
- try:
157
- from transformers import AutoTokenizer, AutoModelForCausalLM
158
- import torch
159
 
160
- # Use a small but capable model for faster responses
161
- LLM_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # Fast and good quality
162
-
163
- print(f"Loading {LLM_MODEL_NAME}...")
164
- llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
165
- llm_model = AutoModelForCausalLM.from_pretrained(
166
- LLM_MODEL_NAME,
167
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
168
- device_map="auto" if torch.cuda.is_available() else None,
169
- )
170
 
171
- if not torch.cuda.is_available():
172
- llm_model = llm_model.to("cpu")
173
-
174
- llm_model.eval()
175
- print(f" LLM loaded successfully on {'GPU' if torch.cuda.is_available() else 'CPU'}")
176
- llm_available = True
177
 
178
- except Exception as e:
179
- print(f"⚠️ Could not load LLM: {e}")
180
- print("⚠️ Will use fallback mode (direct retrieval)")
181
- llm_available = False
182
- llm_tokenizer = None
183
- llm_model = None
184
-
185
- print("✅ KB Assistant ready!")
186
-
187
- # -----------------------------
188
- # CHAT LOGIC (With LLM Answer Generation)
189
- # -----------------------------
190
-
191
- def clean_context(text: str) -> str:
192
- """Clean up text for context, removing markdown and excess whitespace."""
193
- # Remove markdown headers
194
- text = text.replace('#', '')
195
- # Remove multiple spaces
196
- text = ' '.join(text.split())
197
- return text.strip()
198
-
199
-
200
- def generate_answer_with_llm(query: str, context: str, sources: List[str]) -> str:
201
- """
202
- Generate a natural, conversational answer using LLM based on retrieved context.
203
- """
204
- if not llm_available:
205
- return None
206
 
207
- # Create a focused prompt
208
- prompt = f"""<|system|>
209
- You are a helpful knowledge base assistant. Answer the user's question based ONLY on the provided context. Be conversational, clear, and concise. If the context doesn't contain enough information, say so.
210
- </s>
211
- <|user|>
212
- Context from knowledge base:
213
- {context}
214
-
215
- Question: {query}
216
- </s>
217
- <|assistant|>
218
- """
219
 
220
- try:
221
- # Tokenize
222
- inputs = llm_tokenizer(
223
- prompt,
224
- return_tensors="pt",
225
- truncation=True,
226
- max_length=1024
227
- )
228
 
229
- if torch.cuda.is_available():
230
- inputs = {k: v.to("cuda") for k, v in inputs.items()}
 
 
 
 
 
231
 
232
- # Generate
233
- with torch.no_grad():
234
- outputs = llm_model.generate(
235
- **inputs,
236
- max_new_tokens=256,
237
- temperature=0.7,
238
- top_p=0.9,
239
- do_sample=True,
240
- pad_token_id=llm_tokenizer.eos_token_id,
241
  )
242
 
243
- # Decode
244
- full_response = llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
245
-
246
- # Extract only the assistant's response
247
- if "<|assistant|>" in full_response:
248
- answer = full_response.split("<|assistant|>")[-1].strip()
249
- else:
250
- answer = full_response.strip()
251
-
252
- # Clean up the answer
253
- answer = answer.replace("</s>", "").strip()
254
-
255
- # Add source attribution
256
- sources_text = ", ".join(sources)
257
- final_answer = f"{answer}\n\n---\n📚 **Sources:** {sources_text}"
258
-
259
- return final_answer
260
 
261
- except Exception as e:
262
- print(f"Error in LLM generation: {e}")
263
- return None
264
-
265
-
266
- def format_fallback_answer(results: List[Tuple[str, str, float]]) -> str:
267
- """
268
- Fallback formatting when LLM is not available or fails.
269
- """
270
- if not results:
271
- return (
272
- "I couldn't find any relevant information in the knowledge base.\n\n"
273
- "**Try:**\n"
274
- "- Rephrasing your question\n"
275
- "- Using different keywords\n"
276
- "- Breaking down complex questions"
277
- )
278
 
279
- # Get best result
280
- best_chunk, best_source, best_score = results[0]
281
 
282
- # Clean markdown
283
- cleaned = clean_context(best_chunk)
284
 
285
- # Format nicely
286
- answer = f"**From {best_source}:**\n\n{cleaned}"
287
 
288
- # Add other sources if available
289
- if len(results) > 1:
290
- other_sources = list(set([src for _, src, _ in results[1:]]))
291
- if other_sources:
292
- answer += f"\n\n💡 **Also see:** {', '.join(other_sources)}"
293
-
294
- return answer
295
-
296
-
297
- def build_answer(query: str) -> str:
298
- """
299
- Main answer generation function using LLM for natural responses.
 
 
 
300
 
301
- Process:
302
- 1. Retrieve relevant chunks from KB
303
- 2. Build context from top results
304
- 3. Use LLM to generate natural answer
305
- 4. Cite sources
306
- """
307
- # Step 1: Search the knowledge base
308
- results = kb_index.search(query, top_k=TOP_K)
309
 
310
- if not results:
311
  return (
312
- "I couldn't find any relevant information in the knowledge base to answer your question.\n\n"
313
- "**Suggestions:**\n"
314
- "- Try rephrasing with different words\n"
315
- "- Check if the topic is covered in the KB\n"
316
- "- Be more specific about what you're looking for"
317
  )
318
-
319
- # Step 2: Filter by similarity threshold
320
- filtered_results = [
321
- (chunk, src, score)
322
- for chunk, src, score in results
323
- if score >= MIN_SIMILARITY_THRESHOLD
324
- ]
325
-
326
- if not filtered_results:
 
 
 
327
  return (
328
- "I found some content, but it doesn't seem relevant enough to your question.\n\n"
329
- "Please try being more specific or using different keywords."
330
  )
331
-
332
- # Step 3: Build context from top results
333
- context_parts = []
334
- sources = []
335
-
336
- for chunk, source, score in filtered_results[:2]: # Top 2 most relevant
337
- cleaned = clean_context(chunk)
338
- context_parts.append(cleaned)
339
- if source not in sources:
340
- sources.append(source)
341
-
342
- # Combine context (limit to 1000 chars for speed)
343
- context = " ".join(context_parts)[:1000]
344
-
345
- # Step 4: Generate answer with LLM
346
- if llm_available:
347
- llm_answer = generate_answer_with_llm(query, context, sources)
348
- if llm_answer:
349
- return llm_answer
350
-
351
- # Step 5: Fallback if LLM fails or unavailable
352
- return format_fallback_answer(filtered_results)
353
 
354
 
355
- def chat_respond(message: str, history):
356
- """
357
- Gradio ChatInterface callback.
358
-
359
- Args:
360
- message: Latest user message (str)
361
- history: List of previous messages (handled by Gradio)
362
-
363
- Returns:
364
- Assistant's reply as a string
365
- """
366
- if not message or not message.strip():
367
- return "Please ask me a question about the knowledge base."
368
-
369
- try:
370
- answer = build_answer(message.strip())
371
- return answer
372
- except Exception as e:
373
- print(f"Error generating answer: {e}")
374
- return f"Sorry, I encountered an error processing your question: {str(e)}"
375
 
376
 
377
  # -----------------------------
378
- # GRADIO UI
379
  # -----------------------------
380
 
381
- description = """
382
- 🚀 **Fast Knowledge Base Search Assistant**
 
 
 
 
 
 
 
 
 
 
 
 
383
 
384
- Ask questions and get instant answers from the knowledge base.
385
- This assistant uses semantic search to find the most relevant information quickly.
386
 
387
- **Tips for better results:**
388
- - Be specific in your questions
389
- - Use keywords related to your topic
390
- - Ask one question at a time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
 
393
- # Create ChatInterface (without 'type' parameter for compatibility)
394
- chat_interface = gr.ChatInterface(
395
- fn=chat_respond,
396
- title="🤖 Self-Service KB Assistant",
397
- description=description,
398
- examples=[
399
- "What makes a good knowledge base article?",
400
- "How could a KB assistant help agents?",
401
- "Why is self-service important for customer support?",
402
- ],
403
- cache_examples=False,
404
- )
405
-
406
- # Launch
407
  if __name__ == "__main__":
408
- # Detect environment and launch appropriately
409
- is_huggingface = os.getenv('SPACE_ID') is not None
410
- is_container = os.path.exists('/.dockerenv') or os.getenv('KUBERNETES_SERVICE_HOST') is not None
411
-
412
- if is_huggingface:
413
- print("🤗 Launching on HuggingFace Spaces...")
414
- chat_interface.launch(server_name="0.0.0.0", server_port=7860)
415
- elif is_container:
416
- print("🐳 Launching in container environment...")
417
- chat_interface.launch(server_name="0.0.0.0", server_port=7860, share=False)
418
- else:
419
- print("💻 Launching locally...")
420
- chat_interface.launch(share=False)
 
1
  import os
2
  import glob
3
+ import yaml
4
+ import shutil
5
+ import re
6
  from typing import List, Tuple
 
7
 
8
+ import faiss
9
  import numpy as np
10
+ import gradio as gr
11
+ from sentence_transformers import SentenceTransformer
12
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
13
+ from PyPDF2 import PdfReader
14
+ import docx
15
+
16
 
17
  # -----------------------------
18
  # CONFIG
19
  # -----------------------------
20
+
21
def load_config():
    """Load configuration from ``config.yaml``, falling back to built-in defaults.

    Returns:
        dict: Parsed configuration mapping. If the file is missing, unreadable,
        empty, or does not parse to a mapping, the default configuration is
        returned so downstream ``CONFIG[...]`` lookups never fail.
    """
    try:
        with open("config.yaml", "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)
    except FileNotFoundError:
        print("⚠️ config.yaml not found, using defaults")
        return get_default_config()
    except Exception as e:
        print(f"⚠️ Error loading config: {e}, using defaults")
        return get_default_config()

    # safe_load returns None for an empty file, and a bare list/scalar for
    # malformed YAML; either would break CONFIG["kb"] lookups later, so
    # treat anything that is not a mapping as "no usable config".
    if not isinstance(config, dict):
        print("⚠️ config.yaml is empty or malformed, using defaults")
        return get_default_config()

    return config
32
+
33
+
34
def get_default_config():
    """Return the built-in fallback configuration used when config.yaml is absent."""
    kb_settings = {
        "directory": "./knowledge_base",  # can be overridden in config.yaml (e.g., ./kb)
        "index_directory": "./index",
    }
    model_settings = {
        "embedding": "sentence-transformers/all-MiniLM-L6-v2",
        "qa": "google/flan-t5-small",
    }
    chunk_settings = {
        "chunk_size": 1200,
        "overlap": 200,
    }
    message_settings = {
        "welcome": "Ask me anything about the documents in the knowledge base!",
        "no_answer": "I couldn't find a relevant answer in the knowledge base.",
    }

    return {
        "kb": kb_settings,
        "models": model_settings,
        "chunking": chunk_settings,
        "thresholds": {"similarity": 0.1},
        "messages": message_settings,
        "client": {"name": "RAG AI Assistant"},
        "quick_actions": [],
    }
61
+
62
+
63
# Load the configuration once at import time; every module-level setting
# below is derived from it.
CONFIG = load_config()

KB_DIR = CONFIG["kb"]["directory"]             # folder holding KB documents
INDEX_DIR = CONFIG["kb"]["index_directory"]    # where the FAISS index is persisted
EMBEDDING_MODEL_NAME = CONFIG["models"]["embedding"]
# .get() so an older config.yaml without a "qa" entry still works.
QA_MODEL_NAME = CONFIG["models"].get("qa", "google/flan-t5-small")
CHUNK_SIZE = CONFIG["chunking"]["chunk_size"]      # characters per chunk
CHUNK_OVERLAP = CONFIG["chunking"]["overlap"]      # character overlap between chunks
SIM_THRESHOLD = CONFIG["thresholds"]["similarity"]  # minimum cosine score to keep a hit
WELCOME_MSG = CONFIG["messages"]["welcome"]
NO_ANSWER_MSG = CONFIG["messages"]["no_answer"]
74
+
75
 
76
  # -----------------------------
77
  # UTILITIES
78
  # -----------------------------
79
 
80
def chunk_text(text: str, chunk_size: int, overlap: int) -> List[str]:
    """Split *text* into overlapping character chunks.

    Args:
        text: Raw document text.
        chunk_size: Maximum length of each chunk, in characters.
        overlap: Characters shared between consecutive chunks.

    Returns:
        List of stripped chunks, each longer than 20 characters (tiny
        fragments are dropped). Returns [] for empty/whitespace-only input.
    """
    if not text or not text.strip():
        return []

    # Guard against a non-positive step (overlap >= chunk_size), which would
    # otherwise make this loop never terminate.
    step = max(1, chunk_size - overlap)

    chunks = []
    start = 0
    text_len = len(text)

    while start < text_len:
        end = min(start + chunk_size, text_len)
        chunk = text[start:end].strip()

        if chunk and len(chunk) > 20:  # Avoid tiny chunks
            chunks.append(chunk)

        if end >= text_len:
            break

        start += step

    return chunks
102
 
103
 
104
def load_file_text(path: str) -> str:
    """Read the textual content of a single document.

    Supports PDF (via PyPDF2), Word documents (via python-docx) and any
    plain-text format. Raises FileNotFoundError for a missing path and
    re-raises reader failures after logging them.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")

    extension = os.path.splitext(path)[1].lower()

    try:
        if extension == ".pdf":
            # Concatenate the extractable text of every page.
            page_texts = (page.extract_text() for page in PdfReader(path).pages)
            return "\n".join(extracted for extracted in page_texts if extracted)

        if extension in (".docx", ".doc"):
            document = docx.Document(path)
            return "\n".join(
                paragraph.text
                for paragraph in document.paragraphs
                if paragraph.text.strip()
            )

        # Default: treat as plain text (.txt, .md, ...).
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            return handle.read()

    except Exception as e:
        print(f"Error reading {path}: {e}")
        raise
132
+
133
+
134
def load_kb_documents(kb_dir: str) -> List[Tuple[str, str]]:
    """Collect (filename, text) pairs for every supported document in *kb_dir*.

    Creates the directory when it does not exist yet, and skips files that
    are empty or unreadable, logging each outcome to stdout.
    """
    docs: List[Tuple[str, str]] = []

    if not os.path.exists(kb_dir):
        print(f"⚠️ Knowledge base directory not found: {kb_dir}")
        print(f"Creating directory: {kb_dir}")
        os.makedirs(kb_dir, exist_ok=True)
        return docs

    if not os.path.isdir(kb_dir):
        print(f"⚠️ {kb_dir} is not a directory")
        return docs

    # Gather candidate files across all supported extensions.
    paths = [
        candidate
        for pattern in ("*.txt", "*.md", "*.pdf", "*.docx", "*.doc")
        for candidate in glob.glob(os.path.join(kb_dir, pattern))
    ]

    if not paths:
        print(f"⚠️ No documents found in {kb_dir}")
        return docs

    print(f"Found {len(paths)} documents in knowledge base")

    for path in paths:
        basename = os.path.basename(path)
        try:
            text = load_file_text(path)
        except Exception as e:
            print(f"✗ Could not read {path}: {e}")
            continue

        if text and text.strip():
            docs.append((basename, text))
            print(f"✓ Loaded: {basename}")
        else:
            print(f"⚠️ Empty file: {basename}")

    return docs
172
+
173
+
174
def clean_context_text(text: str) -> str:
    """Normalise raw document text before it is fed to the generator.

    Strips markdown headings (#, ##, ...), ordered-list prefixes (``1.``,
    ``2)``) and bullet markers (``-``, ``*``), drops blank lines, very short
    noise lines (< 5 chars), and exact duplicates — preserving line order.
    """
    seen = set()
    kept = []

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # Strip leading markdown/list decoration in the same order as before:
        # heading hashes, then "1." / "2)" numbering, then "-" / "*" bullets.
        for prefix_pattern in (r"^#+\s*", r"^\d+[\.\)]\s*", r"^[-*]\s*"):
            line = re.sub(prefix_pattern, "", line)

        # Skip noise lines and exact duplicates.
        if len(line) < 5 or line in seen:
            continue

        seen.add(line)
        kept.append(line)

    return "\n".join(kept)
211
 
212
 
213
  # -----------------------------
214
+ # KB INDEX (FAISS)
215
  # -----------------------------
216
 
217
+ class RAGIndex:
218
+ def __init__(self):
219
+ self.embedder = None
220
+ self.qa_tokenizer = None
221
+ self.qa_model = None
222
  self.chunks: List[str] = []
223
  self.chunk_sources: List[str] = []
224
+ self.index = None
225
+ self.initialized = False
226
+
227
+ try:
228
+ print("🔄 Initializing RAG Assistant...")
229
+ self._initialize_models()
230
+ self._build_or_load_index()
231
+ self.initialized = True
232
+ print("✅ RAG Assistant ready!")
233
+ except Exception as e:
234
+ print(f"❌ Initialization error: {e}")
235
+ print("The assistant will run in limited mode.")
236
+
237
+ def _initialize_models(self):
238
+ """Initialize embedding and QA models"""
239
+ try:
240
+ print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}")
241
+ self.embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
242
+
243
+ print(f"Loading QA (seq2seq) model: {QA_MODEL_NAME}")
244
+ self.qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME)
245
+ self.qa_model = AutoModelForSeq2SeqLM.from_pretrained(QA_MODEL_NAME)
246
+ except Exception as e:
247
+ print(f"Error loading models: {e}")
248
+ raise
249
+
250
+ def _build_or_load_index(self):
251
+ """Build or load FAISS index from knowledge base"""
252
+ os.makedirs(INDEX_DIR, exist_ok=True)
253
+ idx_path = os.path.join(INDEX_DIR, "kb.index")
254
+ meta_path = os.path.join(INDEX_DIR, "kb_meta.npy")
255
+
256
+ # Try to load existing index
257
+ if os.path.exists(idx_path) and os.path.exists(meta_path):
258
+ try:
259
+ print("Loading existing FAISS index...")
260
+ self.index = faiss.read_index(idx_path)
261
+ meta = np.load(meta_path, allow_pickle=True).item()
262
+ self.chunks = list(meta["chunks"])
263
+ self.chunk_sources = list(meta["sources"])
264
+ print(f"✓ Index loaded with {len(self.chunks)} chunks")
265
+ return
266
+ except Exception as e:
267
+ print(f"⚠️ Could not load existing index: {e}")
268
+ print("Building new index...")
269
+
270
+ # Build new index
271
+ print("Building new FAISS index from knowledge base...")
272
+ docs = load_kb_documents(KB_DIR)
273
+
274
+ if not docs:
275
+ print("⚠️ No documents found in knowledge base")
276
+ print(f" Please add .txt, .md, .pdf, or .docx files to: {KB_DIR}")
277
+ self.index = None
278
+ self.chunks = []
279
+ self.chunk_sources = []
280
+ return
281
 
282
+ all_chunks: List[str] = []
283
+ all_sources: List[str] = []
 
 
 
284
 
285
+ for source, text in docs:
286
+ chunks = chunk_text(text, CHUNK_SIZE, CHUNK_OVERLAP)
287
+ for chunk in chunks:
288
  all_chunks.append(chunk)
289
+ all_sources.append(source)
290
 
291
  if not all_chunks:
292
+ print("⚠️ No valid chunks created from documents")
293
+ self.index = None
294
  self.chunks = []
295
  self.chunk_sources = []
 
296
  return
297
 
298
+ print(f"Created {len(all_chunks)} chunks from {len(docs)} documents")
299
+ print("Generating embeddings...")
300
+
301
+ embeddings = self.embedder.encode(
302
+ all_chunks,
303
+ show_progress_bar=True,
304
+ convert_to_numpy=True,
305
+ batch_size=32,
306
+ )
307
+
308
+ dimension = embeddings.shape[1]
309
+ index = faiss.IndexFlatIP(dimension)
310
+
311
+ # Normalize for cosine similarity
312
+ faiss.normalize_L2(embeddings)
313
+ index.add(embeddings)
314
+
315
+ # Save index
316
+ try:
317
+ faiss.write_index(index, idx_path)
318
+ np.save(
319
+ meta_path,
320
+ {
321
+ "chunks": np.array(all_chunks, dtype=object),
322
+ "sources": np.array(all_sources, dtype=object),
323
+ },
324
+ )
325
+ print("✓ Index saved successfully")
326
+ except Exception as e:
327
+ print(f"⚠️ Could not save index: {e}")
328
+
329
+ self.index = index
330
  self.chunks = all_chunks
331
  self.chunk_sources = all_sources
 
 
332
 
333
+ def retrieve(self, query: str, top_k: int = 5) -> List[Tuple[str, str, float]]:
334
+ """Retrieve relevant chunks for a query"""
335
+ if not query or not query.strip():
336
  return []
337
 
338
+ if self.index is None or not self.initialized:
339
  return []
340
 
341
+ try:
342
+ q_emb = self.embedder.encode([query], convert_to_numpy=True)
343
+ faiss.normalize_L2(q_emb)
344
+ k = min(top_k, len(self.chunks)) if self.chunks else 0
345
+ if k == 0:
346
+ return []
347
+ scores, idxs = self.index.search(q_emb, k)
348
+
349
+ results: List[Tuple[str, str, float]] = []
350
+ for score, idx in zip(scores[0], idxs[0]):
351
+ if idx == -1 or idx >= len(self.chunks):
352
+ continue
353
+ if score < SIM_THRESHOLD:
354
+ continue
355
+ results.append(
356
+ (self.chunks[idx], self.chunk_sources[idx], float(score))
357
+ )
358
+
359
+ return results
360
+
361
+ except Exception as e:
362
+ print(f"Retrieval error: {e}")
363
+ return []
364
 
365
+ def _generate_from_context(self, prompt: str, max_new_tokens: int = 128) -> str:
366
+ """Run Flan-T5 on the given prompt and return the decoded answer."""
367
+ if self.qa_model is None or self.qa_tokenizer is None:
368
+ raise RuntimeError("QA model not loaded.")
 
369
 
370
+ inputs = self.qa_tokenizer(
371
+ prompt,
372
+ return_tensors="pt",
373
+ truncation=True,
374
+ max_length=768,
375
+ )
376
 
377
+ outputs = self.qa_model.generate(
378
+ **inputs,
379
+ max_new_tokens=max_new_tokens,
380
+ do_sample=False,
381
+ )
382
 
383
+ answer = self.qa_tokenizer.decode(
384
+ outputs[0],
385
+ skip_special_tokens=True,
386
+ ).strip()
387
 
388
+ return answer
 
 
389
 
390
+ def answer(self, question: str) -> str:
391
+ """Answer a question using RAG with simplified, clearer prompting."""
392
+ if not self.initialized:
393
+ return "❌ Assistant not properly initialized. Please check the logs."
 
394
 
395
+ if not question or not question.strip():
396
+ return "Please ask a question."
 
 
 
 
 
 
 
 
397
 
398
+ if self.index is None or not self.chunks:
399
+ return (
400
+ f"📚 Knowledge base is empty.\n\n"
401
+ f"Please add documents to: `{KB_DIR}`\n"
402
+ f"Supported formats: .txt, .md, .pdf, .docx"
403
+ )
404
 
405
+ # 1) Retrieve relevant contexts
406
+ contexts = self.retrieve(question, top_k=3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
 
408
+ if not contexts:
409
+ return (
410
+ f"{NO_ANSWER_MSG}\n\n"
411
+ f"💡 Try rephrasing your question or check if relevant documents exist in the knowledge base."
412
+ )
 
 
 
 
 
 
 
413
 
414
+ used_sources = set()
 
 
 
 
 
 
 
415
 
416
+ # 2) Collect and clean the best contexts
417
+ evidence_parts = []
418
+ for ctx, source, score in contexts:
419
+ used_sources.add(source)
420
+ cleaned_ctx = clean_context_text(ctx)
421
+ if cleaned_ctx.strip():
422
+ evidence_parts.append(cleaned_ctx)
423
 
424
+ if not evidence_parts:
425
+ return (
426
+ f"{NO_ANSWER_MSG}\n\n"
427
+ f"💡 Try rephrasing your question or adding more detailed documents to the knowledge base."
 
 
 
 
 
428
  )
429
 
430
+ # Combine contexts (limit to avoid overwhelming the model)
431
+ combined_context = " ".join(evidence_parts[:2])[:1000]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
 
433
+ # 3) FIXED: Simple, direct prompt (no complex instructions)
434
+ answer_prompt = f"""Answer this question using the context below. Be concise and natural.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
435
 
436
+ Context: {combined_context}
 
437
 
438
+ Question: {question}
 
439
 
440
+ Answer:"""
 
441
 
442
+ try:
443
+ answer_text = self._generate_from_context(answer_prompt, max_new_tokens=150)
444
+ answer_text = answer_text.strip()
445
+
446
+ # Safety check: if model leaked instructions, try simpler prompt
447
+ if answer_text.startswith("Do NOT") or answer_text.startswith("You are") or len(answer_text) < 10:
448
+ simple_prompt = f"Context: {combined_context}\n\nQ: {question}\nA:"
449
+ answer_text = self._generate_from_context(simple_prompt, max_new_tokens=150).strip()
450
+
451
+ except Exception as e:
452
+ print(f"Generation error: {e}")
453
+ return (
454
+ "There was an error while generating the answer. "
455
+ "Please try again with a shorter question or different wording."
456
+ )
457
 
458
+ sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
 
 
 
 
 
 
 
459
 
 
460
  return (
461
+ f"**Answer:** {answer_text}\n\n"
462
+ f"**Sources:** {sources_str}"
 
 
 
463
  )
464
+
465
+ try:
466
+ answer_text = self._generate_from_context(answer_prompt, max_new_tokens=128)
467
+ except Exception as e:
468
+ print(f"Generation error: {e}")
469
+ return (
470
+ "There was an error while generating the answer. "
471
+ "Please try again with a shorter question or different wording."
472
+ )
473
+
474
+ sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
475
+
476
  return (
477
+ f"**Answer:** {answer_text}\n\n"
478
+ f"**Sources:** {sources_str}"
479
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
 
481
 
482
# Initialize the global RAG pipeline once at import time (loads both models
# and builds/loads the FAISS index). The banner lines make this potentially
# slow start-up phase easy to spot in the logs.
print("=" * 50)
rag_index = RAGIndex()
print("=" * 50)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
 
487
 
488
  # -----------------------------
489
+ # GRADIO APP (BLOCKS)
490
  # -----------------------------
491
 
492
def rag_respond(message, history):
    """Gradio chat callback using the "messages" history format.

    Appends the user's turn and the assistant's reply (as role/content
    dicts) to *history*, and returns ("", history) so the textbox is
    cleared after every submission.
    """
    if history is None:
        history = []

    # Ignore empty or whitespace-only submissions.
    if not message or not str(message).strip():
        return "", history

    user_msg = str(message)
    history.append({"role": "user", "content": user_msg})

    # Delegate the actual answering to the global RAG pipeline.
    history.append({"role": "assistant", "content": rag_index.answer(user_msg)})

    return "", history
515
+
516
+
517
def upload_to_kb(files):
    """Copy uploaded files into the KB directory and report the outcome.

    Accepts a single Gradio file object or a list of them; returns a
    human-readable status string for the UI.
    """
    if not files:
        return "No files uploaded."

    file_list = files if isinstance(files, list) else [files]

    os.makedirs(KB_DIR, exist_ok=True)
    saved_files = []

    for uploaded in file_list:
        # Gradio file objects expose the temp path via .name; fall back to str().
        src_path = getattr(uploaded, "name", None) or str(uploaded)
        if not os.path.exists(src_path):
            continue

        filename = os.path.basename(src_path)
        try:
            shutil.copy(src_path, os.path.join(KB_DIR, filename))
        except Exception as e:
            print(f"Error saving file (unknown): {e}")
        else:
            saved_files.append(filename)

    if not saved_files:
        return "No files could be saved. Check logs."

    listing = "\n- ".join(saved_files)
    return (
        f"✅ Saved {len(saved_files)} file(s) to knowledge base:\n- {listing}"
        "\n\nClick **Rebuild index** to include them in search."
    )
550
+
551
+
552
def rebuild_index():
    """UI hook: rebuild the FAISS index and return a human-readable status."""
    rag_index._build_or_load_index()

    # Report failure when either the index or the chunk list is missing.
    index_usable = rag_index.index is not None and bool(rag_index.chunks)
    if not index_usable:
        return (
            "⚠️ Index rebuild finished, but no documents or chunks were found.\n"
            f"Add files to `{KB_DIR}` and try again."
        )

    return (
        f"✅ Index rebuilt successfully.\n"
        f"Chunks in index: {len(rag_index.chunks)}"
    )
564
+
565
+
566
# Description + optional examples shown on the Chat tab.
# If the KB is unusable (failed init, missing index, or no chunks), extend
# the welcome text with a pointer to the Knowledge Base tab.
description = WELCOME_MSG
if not rag_index.initialized or rag_index.index is None or not rag_index.chunks:
    description += (
        f"\n\n⚠️ **Note:** Knowledge base is currently empty or index is not built.\n"
        f"Upload documents in the **Knowledge Base** tab and click **Rebuild index**."
    )

# Example prompts: prefer config-defined quick actions (entries with a
# non-empty "query"), falling back to generic examples only when the index
# is actually usable.
examples = [
    qa.get("query")
    for qa in CONFIG.get("quick_actions", [])
    if qa.get("query")
]
if not examples and rag_index.initialized and rag_index.index is not None and rag_index.chunks:
    examples = [
        "What is a knowledge base?",
        "What are best practices for maintaining a KB?",
        "How should I structure knowledge base articles?",
    ]
585
+
586
+
587
# Two-tab Gradio app: a chat interface plus a KB management tab.
with gr.Blocks(title=CONFIG["client"]["name"]) as demo:
    gr.Markdown(f"# {CONFIG['client']['name']}")
    gr.Markdown(description)

    with gr.Tab("Chat"):
        chatbot = gr.Chatbot(label="RAG Chat")

        with gr.Row():
            txt = gr.Textbox(
                show_label=False,
                placeholder="Ask a question about your documents and press Enter to send...",
                lines=1,  # single line so Enter submits
            )

        with gr.Row():
            send_btn = gr.Button("Send")
            clear_btn = gr.Button("Clear")

        # Both pressing Enter and clicking Send route through rag_respond,
        # which clears the textbox and extends the chat history.
        txt.submit(rag_respond, [txt, chatbot], [txt, chatbot])
        send_btn.click(rag_respond, [txt, chatbot], [txt, chatbot])
        # Clear resets chat history and empties the textbox in one step.
        clear_btn.click(lambda: ([], ""), None, [chatbot, txt])

    with gr.Tab("Knowledge Base"):
        gr.Markdown(
            f"""
### Manage Knowledge Base

- Supported formats: `.txt`, `.md`, `.pdf`, `.docx`, `.doc`
- Files are stored in: `{KB_DIR}`
- After uploading, click **Rebuild index** so the assistant can use the new content.
"""
        )
        kb_upload = gr.File(
            label="Upload documents",
            file_count="multiple",
        )
        kb_status = gr.Textbox(
            label="Status",
            lines=6,
            interactive=False,
        )
        rebuild_btn = gr.Button("Rebuild index")

        # Uploading copies files into KB_DIR; rebuilding re-indexes them.
        kb_upload.change(upload_to_kb, inputs=kb_upload, outputs=kb_status)
        rebuild_btn.click(rebuild_index, inputs=None, outputs=kb_status)
632
+
633
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
634
if __name__ == "__main__":
    # Honour the PORT env var (set by most hosting platforms); default 7860.
    port = int(os.environ.get("PORT", 7860))
    # Bind to all interfaces so the app is reachable from inside containers
    # and HF Spaces; share=False disables the public Gradio tunnel.
    demo.launch(
        server_name="0.0.0.0",
        server_port=port,
        share=False,
    )