Spaces:

sofzcc
/

Full_RAG_Assistant

Sleeping

App Files Files Community

sofzcc committed on Dec 2, 2025

Commit

02a1b59

verified ·

1 Parent(s): 7494e47

Update app.py

Browse files

Files changed (1) hide show

app.py +341 -557

app.py CHANGED Viewed

@@ -1,636 +1,420 @@
 import os
 import glob
-import yaml
-import shutil
-import re
 from typing import List, Tuple
-import faiss
-import numpy as np
 import gradio as gr
-from sentence_transformers import SentenceTransformer
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-from PyPDF2 import PdfReader
-import docx
 # -----------------------------
 # CONFIG
 # -----------------------------
-def load_config():
-    """Load configuration with error handling"""
-    try:
-        with open("config.yaml", "r", encoding="utf-8") as f:
-            return yaml.safe_load(f)
-    except FileNotFoundError:
-        print("⚠️ config.yaml not found, using defaults")
-        return get_default_config()
-    except Exception as e:
-        print(f"⚠️ Error loading config: {e}, using defaults")
-        return get_default_config()
-def get_default_config():
-    """Provide default configuration"""
-    return {
-        "kb": {
-            "directory": "./knowledge_base",   # can be overridden in config.yaml (e.g., ./kb)
-            "index_directory": "./index",
-        },
-        "models": {
-            "embedding": "sentence-transformers/all-MiniLM-L6-v2",
-            "qa": "google/flan-t5-small",
-        },
-        "chunking": {
-            "chunk_size": 1200,
-            "overlap": 200,
-        },
-        "thresholds": {
-            "similarity": 0.1,
-        },
-        "messages": {
-            "welcome": "Ask me anything about the documents in the knowledge base!",
-            "no_answer": "I couldn't find a relevant answer in the knowledge base.",
-        },
-        "client": {
-            "name": "RAG AI Assistant",
-        },
-        "quick_actions": [],
-    }
-CONFIG = load_config()
-KB_DIR = CONFIG["kb"]["directory"]
-INDEX_DIR = CONFIG["kb"]["index_directory"]
-EMBEDDING_MODEL_NAME = CONFIG["models"]["embedding"]
-QA_MODEL_NAME = CONFIG["models"].get("qa", "google/flan-t5-small")
-CHUNK_SIZE = CONFIG["chunking"]["chunk_size"]
-CHUNK_OVERLAP = CONFIG["chunking"]["overlap"]
-SIM_THRESHOLD = CONFIG["thresholds"]["similarity"]
-WELCOME_MSG = CONFIG["messages"]["welcome"]
-NO_ANSWER_MSG = CONFIG["messages"]["no_answer"]
 # -----------------------------
 # UTILITIES
 # -----------------------------
-def chunk_text(text: str, chunk_size: int, overlap: int) -> List[str]:
-    """Split text into overlapping chunks"""
-    if not text or not text.strip():
         return []
     chunks = []
     start = 0
-    text_len = len(text)
-    while start < text_len:
-        end = min(start + chunk_size, text_len)
         chunk = text[start:end].strip()
-        if chunk and len(chunk) > 20:  # Avoid tiny chunks
             chunks.append(chunk)
-        if end >= text_len:
-            break
         start += chunk_size - overlap
     return chunks
-def load_file_text(path: str) -> str:
-    """Load text from various file formats with error handling"""
-    if not os.path.exists(path):
-        raise FileNotFoundError(f"File not found: {path}")
-    ext = os.path.splitext(path)[1].lower()
-    try:
-        if ext == ".pdf":
-            reader = PdfReader(path)
-            text_parts = []
-            for page in reader.pages:
-                page_text = page.extract_text()
-                if page_text:
-                    text_parts.append(page_text)
-            return "\n".join(text_parts)
-        elif ext in [".docx", ".doc"]:
-            doc = docx.Document(path)
-            return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
-        else:  # .txt, .md, etc.
-            with open(path, "r", encoding="utf-8", errors="ignore") as f:
-                return f.read()
-    except Exception as e:
-        print(f"Error reading {path}: {e}")
-        raise
-def load_kb_documents(kb_dir: str) -> List[Tuple[str, str]]:
-    """Load all documents from knowledge base directory"""
-    docs: List[Tuple[str, str]] = []
-    if not os.path.exists(kb_dir):
-        print(f"⚠️ Knowledge base directory not found: {kb_dir}")
-        print(f"Creating directory: {kb_dir}")
-        os.makedirs(kb_dir, exist_ok=True)
-        return docs
-    if not os.path.isdir(kb_dir):
-        print(f"⚠️ {kb_dir} is not a directory")
-        return docs
-    # Support multiple file formats
-    patterns = ["*.txt", "*.md", "*.pdf", "*.docx", "*.doc"]
-    paths = []
-    for pattern in patterns:
-        paths.extend(glob.glob(os.path.join(kb_dir, pattern)))
-    if not paths:
-        print(f"⚠️ No documents found in {kb_dir}")
-        return docs
-    print(f"Found {len(paths)} documents in knowledge base")
-    for path in paths:
-        try:
-            text = load_file_text(path)
-            if text and text.strip():
-                docs.append((os.path.basename(path), text))
-                print(f"✓ Loaded: {os.path.basename(path)}")
-            else:
-                print(f"⚠️ Empty file: {os.path.basename(path)}")
-        except Exception as e:
-            print(f"✗ Could not read {path}: {e}")
-    return docs
-def clean_context_text(text: str) -> str:
     """
-    Clean raw document context before sending to the generator:
-    - Remove markdown headings (#, ##, ###)
-    - Remove list markers (1., 2), -, *)
-    - Remove duplicate lines
     """
-    lines = text.splitlines()
-    cleaned = []
-    seen = set()
-    for line in lines:
-        l = line.strip()
-        if not l:
-            continue
-        # Remove markdown headings like "# 1. Title", "## Section"
-        l = re.sub(r"^#+\s*", "", l)
-        # Remove ordered list prefixes like "1. ", "2) "
-        l = re.sub(r"^\d+[\.\)]\s*", "", l)
-        # Remove bullet markers like "- ", "* "
-        l = re.sub(r"^[-*]\s*", "", l)
-        # Skip very short "noise" lines
-        if len(l) < 5:
-            continue
-        # Avoid exact duplicates
-        if l in seen:
-            continue
-        seen.add(l)
-        cleaned.append(l)
-    return "\n".join(cleaned)
 # -----------------------------
-# KB INDEX (FAISS)
 # -----------------------------
-class RAGIndex:
-    def __init__(self):
-        self.embedder = None
-        self.qa_tokenizer = None
-        self.qa_model = None
         self.chunks: List[str] = []
         self.chunk_sources: List[str] = []
-        self.index = None
-        self.initialized = False
-        try:
-            print("🔄 Initializing RAG Assistant...")
-            self._initialize_models()
-            self._build_or_load_index()
-            self.initialized = True
-            print("✅ RAG Assistant ready!")
-        except Exception as e:
-            print(f"❌ Initialization error: {e}")
-            print("The assistant will run in limited mode.")
-    def _initialize_models(self):
-        """Initialize embedding and QA models"""
-        try:
-            print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}")
-            self.embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
-            print(f"Loading QA (seq2seq) model: {QA_MODEL_NAME}")
-            self.qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME)
-            self.qa_model = AutoModelForSeq2SeqLM.from_pretrained(QA_MODEL_NAME)
-        except Exception as e:
-            print(f"Error loading models: {e}")
-            raise
-    def _build_or_load_index(self):
-        """Build or load FAISS index from knowledge base"""
-        os.makedirs(INDEX_DIR, exist_ok=True)
-        idx_path = os.path.join(INDEX_DIR, "kb.index")
-        meta_path = os.path.join(INDEX_DIR, "kb_meta.npy")
-        # Try to load existing index
-        if os.path.exists(idx_path) and os.path.exists(meta_path):
-            try:
-                print("Loading existing FAISS index...")
-                self.index = faiss.read_index(idx_path)
-                meta = np.load(meta_path, allow_pickle=True).item()
-                self.chunks = list(meta["chunks"])
-                self.chunk_sources = list(meta["sources"])
-                print(f"✓ Index loaded with {len(self.chunks)} chunks")
-                return
-            except Exception as e:
-                print(f"⚠️ Could not load existing index: {e}")
-                print("Building new index...")
-        # Build new index
-        print("Building new FAISS index from knowledge base...")
-        docs = load_kb_documents(KB_DIR)
-        if not docs:
-            print("⚠️ No documents found in knowledge base")
-            print(f"   Please add .txt, .md, .pdf, or .docx files to: {KB_DIR}")
-            self.index = None
-            self.chunks = []
-            self.chunk_sources = []
-            return
-        all_chunks: List[str] = []
-        all_sources: List[str] = []
-        for source, text in docs:
-            chunks = chunk_text(text, CHUNK_SIZE, CHUNK_OVERLAP)
-            for chunk in chunks:
                 all_chunks.append(chunk)
-                all_sources.append(source)
         if not all_chunks:
-            print("⚠️ No valid chunks created from documents")
-            self.index = None
             self.chunks = []
             self.chunk_sources = []
             return
-        print(f"Created {len(all_chunks)} chunks from {len(docs)} documents")
-        print("Generating embeddings...")
-        embeddings = self.embedder.encode(
-            all_chunks,
-            show_progress_bar=True,
-            convert_to_numpy=True,
-            batch_size=32,
-        )
-        dimension = embeddings.shape[1]
-        index = faiss.IndexFlatIP(dimension)
-        # Normalize for cosine similarity
-        faiss.normalize_L2(embeddings)
-        index.add(embeddings)
-        # Save index
-        try:
-            faiss.write_index(index, idx_path)
-            np.save(
-                meta_path,
-                {
-                    "chunks": np.array(all_chunks, dtype=object),
-                    "sources": np.array(all_sources, dtype=object),
-                },
-            )
-            print("✓ Index saved successfully")
-        except Exception as e:
-            print(f"⚠️ Could not save index: {e}")
-        self.index = index
         self.chunks = all_chunks
         self.chunk_sources = all_sources
-    def retrieve(self, query: str, top_k: int = 5) -> List[Tuple[str, str, float]]:
-        """Retrieve relevant chunks for a query"""
-        if not query or not query.strip():
             return []
-        if self.index is None or not self.initialized:
             return []
-        try:
-            q_emb = self.embedder.encode([query], convert_to_numpy=True)
-            faiss.normalize_L2(q_emb)
-            k = min(top_k, len(self.chunks)) if self.chunks else 0
-            if k == 0:
-                return []
-            scores, idxs = self.index.search(q_emb, k)
-            results: List[Tuple[str, str, float]] = []
-            for score, idx in zip(scores[0], idxs[0]):
-                if idx == -1 or idx >= len(self.chunks):
-                    continue
-                if score < SIM_THRESHOLD:
-                    continue
-                results.append(
-                    (self.chunks[idx], self.chunk_sources[idx], float(score))
-                )
-            return results
-        except Exception as e:
-            print(f"Retrieval error: {e}")
-            return []
-    def _generate_from_context(self, prompt: str, max_new_tokens: int = 128) -> str:
-        """Run Flan-T5 on the given prompt and return the decoded answer."""
-        if self.qa_model is None or self.qa_tokenizer is None:
-            raise RuntimeError("QA model not loaded.")
-        inputs = self.qa_tokenizer(
             prompt,
             return_tensors="pt",
             truncation=True,
-            max_length=768,
-        )
-        outputs = self.qa_model.generate(
-            **inputs,
-            max_new_tokens=max_new_tokens,
-            do_sample=False,
         )
-        answer = self.qa_tokenizer.decode(
-            outputs[0],
-            skip_special_tokens=True,
-        ).strip()
-        return answer
-    def answer(self, question: str) -> str:
-        """Answer a question using RAG + two-step summarization + generation."""
-        if not self.initialized:
-            return "❌ Assistant not properly initialized. Please check the logs."
-        if not question or not question.strip():
-            return "Please ask a question."
-        if self.index is None or not self.chunks:
-            return (
-                f"📚 Knowledge base is empty.\n\n"
-                f"Please add documents to: `{KB_DIR}`\n"
-                f"Supported formats: .txt, .md, .pdf, .docx"
-            )
-        # 1) Retrieve relevant contexts
-        contexts = self.retrieve(question, top_k=3)
-        if not contexts:
-            return (
-                f"{NO_ANSWER_MSG}\n\n"
-                f"💡 Try rephrasing your question or check if relevant documents exist in the knowledge base."
-            )
-        used_sources = set()
-        # 2) Summarize each retrieved chunk into 1 sentence
-        summaries = []
-        for ctx, source, score in contexts:
-            used_sources.add(source)
-            cleaned_ctx = clean_context_text(ctx)
-            if not cleaned_ctx.strip():
-                continue
-            summary_prompt = (
-                "Summarize the following text in ONE concise sentence, keeping only the main idea. "
-                "Do not include headings, numbers, or bullet markers.\n\n"
-                f"{cleaned_ctx}\n\n"
-                "Summary:"
-            )
-            try:
-                summary = self._generate_from_context(summary_prompt, max_new_tokens=64)
-                summaries.append(summary)
-            except Exception as e:
-                print(f"Summary generation error: {e}")
-                continue
-        if not summaries:
-            return (
-                f"{NO_ANSWER_MSG}\n\n"
-                f"💡 Try rephrasing your question or adding more detailed documents to the knowledge base."
-            )
-        # 3) Combine summaries into an evidence pool
-        evidence = " ".join(summaries)
-        # 4) Ask the model to answer using only the summaries
-        answer_prompt = (
-            "You are an AI assistant that answers questions using only the summarized evidence below.\n"
-            "Write a clear, helpful answer in 1–3 sentences, in your own words.\n"
-            "- Do NOT include headings, section numbers, markdown, or bullet symbols.\n"
-            "- Do NOT mention file names or sources in the answer.\n"
-            "- If the answer cannot be found in the evidence, reply exactly: "
-            "\"I don't know based on the provided documents.\"\n\n"
-            f"Evidence:\n{evidence}\n\n"
-            f"Question: {question}\n\n"
-            "Answer:"
-        )
-        try:
-            answer_text = self._generate_from_context(answer_prompt, max_new_tokens=128)
-        except Exception as e:
-            print(f"Generation error: {e}")
-            return (
-                "There was an error while generating the answer. "
-                "Please try again with a shorter question or different wording."
             )
-        sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
         return (
-            f"**Answer:** {answer_text}\n\n"
-            f"**Sources:** {sources_str}"
         )
-# Initialize RAG system
-print("=" * 50)
-rag_index = RAGIndex()
-print("=" * 50)
-# -----------------------------
-# GRADIO APP (BLOCKS)
-# -----------------------------
-def rag_respond(message, history):
-    """Handle chat messages for chatbot UI (messages format)"""
-    if history is None:
-        history = []
-    if not message or not str(message).strip():
-        return "", history
-    user_msg = str(message)
-    history.append({
-        "role": "user",
-        "content": user_msg,
-    })
-    bot_reply = rag_index.answer(user_msg)
-    history.append({
-        "role": "assistant",
-        "content": bot_reply,
-    })
-    return "", history
-def upload_to_kb(files):
-    """Save uploaded files into the KB directory"""
-    if not files:
-        return "No files uploaded."
-    if not isinstance(files, list):
-        files = [files]
-    os.makedirs(KB_DIR, exist_ok=True)
-    saved_files = []
-    for f in files:
-        src_path = getattr(f, "name", None) or str(f)
-        if not os.path.exists(src_path):
-            continue
-        filename = os.path.basename(src_path)
-        dest_path = os.path.join(KB_DIR, filename)
-        try:
-            shutil.copy(src_path, dest_path)
-            saved_files.append(filename)
-        except Exception as e:
-            print(f"Error saving file {filename}: {e}")
-    if not saved_files:
-        return "No files could be saved. Check logs."
-    return (
-        f"✅ Saved {len(saved_files)} file(s) to knowledge base:\n- "
-        + "\n- ".join(saved_files)
-        + "\n\nClick **Rebuild index** to include them in search."
-    )
-def rebuild_index():
-    """Trigger index rebuild from UI"""
-    rag_index._build_or_load_index()
-    if rag_index.index is None or not rag_index.chunks:
         return (
-            "⚠️ Index rebuild finished, but no documents or chunks were found.\n"
-            f"Add files to `{KB_DIR}` and try again."
         )
-    return (
-        f"✅ Index rebuilt successfully.\n"
-        f"Chunks in index: {len(rag_index.chunks)}"
-    )
-# Description + optional examples
-description = WELCOME_MSG
-if not rag_index.initialized or rag_index.index is None or not rag_index.chunks:
-    description += (
-        f"\n\n⚠️ **Note:** Knowledge base is currently empty or index is not built.\n"
-        f"Upload documents in the **Knowledge Base** tab and click **Rebuild index**."
-    )
-examples = [
-    qa.get("query")
-    for qa in CONFIG.get("quick_actions", [])
-    if qa.get("query")
-]
-if not examples and rag_index.initialized and rag_index.index is not None and rag_index.chunks:
-    examples = [
-        "What is a knowledge base?",
-        "What are best practices for maintaining a KB?",
-        "How should I structure knowledge base articles?",
     ]
-with gr.Blocks(title=CONFIG["client"]["name"]) as demo:
-    gr.Markdown(f"# {CONFIG['client']['name']}")
-    gr.Markdown(description)
-    with gr.Tab("Chat"):
-        chatbot = gr.Chatbot(label="RAG Chat")
-        with gr.Row():
-            txt = gr.Textbox(
-                show_label=False,
-                placeholder="Ask a question about your documents and press Enter to send...",
-                lines=1,  # single line so Enter submits
-            )
-        with gr.Row():
-            send_btn = gr.Button("Send")
-            clear_btn = gr.Button("Clear")
-        txt.submit(rag_respond, [txt, chatbot], [txt, chatbot])
-        send_btn.click(rag_respond, [txt, chatbot], [txt, chatbot])
-        clear_btn.click(lambda: ([], ""), None, [chatbot, txt])
-    with gr.Tab("Knowledge Base"):
-        gr.Markdown(
-            f"""
-### Manage Knowledge Base
-- Supported formats: `.txt`, `.md`, `.pdf`, `.docx`, `.doc`
-- Files are stored in: `{KB_DIR}`
-- After uploading, click **Rebuild index** so the assistant can use the new content.
 """
-        )
-        kb_upload = gr.File(
-            label="Upload documents",
-            file_count="multiple",
-        )
-        kb_status = gr.Textbox(
-            label="Status",
-            lines=6,
-            interactive=False,
-        )
-        rebuild_btn = gr.Button("Rebuild index")
-        kb_upload.change(upload_to_kb, inputs=kb_upload, outputs=kb_status)
-        rebuild_btn.click(rebuild_index, inputs=None, outputs=kb_status)
 if __name__ == "__main__":
-    port = int(os.environ.get("PORT", 7860))
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=port,
-        share=False,
-    )

 import os
 import glob
 from typing import List, Tuple
+import time
 import gradio as gr
+import numpy as np
+from sentence_transformers import SentenceTransformer
 # -----------------------------
 # CONFIG
 # -----------------------------
# Folder scanned for knowledge-base documents (.txt or .md files).
KB_DIR = "./kb"

# Sentence-embedding model used to vectorise both chunks and queries.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Default number of chunks returned by a search.
TOP_K = 3

# Chunk geometry, measured in characters.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

# Search hits scoring below this cosine similarity are discarded.
MIN_SIMILARITY_THRESHOLD = 0.3
 # -----------------------------
 # UTILITIES
 # -----------------------------
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split long text into overlapping chunks so retrieval is more precise.

    Args:
        text: The text to split. Empty or ``None`` yields an empty list.
        chunk_size: Maximum chunk length in characters; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The whitespace-stripped, non-empty chunks in document order.

    Raises:
        ValueError: If ``chunk_size``/``overlap`` would make the window
            stand still or move backwards (previously an endless loop).
    """
    if not text:
        return []
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of characters")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    length = len(text)
    chunks = []
    start = 0
    while start < length:
        end = min(start + chunk_size, length)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # Once a window reaches the end of the text, any further window
        # would only repeat the tail of this one, so stop here.
        if end >= length:
            break
        start += step
    return chunks
def load_kb_texts(kb_dir: str = KB_DIR) -> List[Tuple[str, str]]:
    """
    Load all .txt and .md files from the KB directory.

    Files are read as UTF-8 in sorted path order so that the resulting
    index is built in the same order on every run (``glob`` itself makes
    no ordering promise). Unreadable files are reported and skipped;
    files containing only whitespace are ignored.

    Args:
        kb_dir: Directory to scan (non-recursive).

    Returns:
        A list of (source_name, content). When no usable file is found
        (including when ``kb_dir`` is not a directory) a single built-in
        demo document is returned instead, so the list is never empty.
    """
    texts: List[Tuple[str, str]] = []
    if os.path.isdir(kb_dir):
        paths = sorted(
            glob.glob(os.path.join(kb_dir, "*.txt"))
            + glob.glob(os.path.join(kb_dir, "*.md"))
        )
        for path in paths:
            # Keep the try body down to the read itself: only I/O and
            # decoding failures are treated as "skip this file".
            try:
                with open(path, "r", encoding="utf-8") as f:
                    content = f.read()
            except (OSError, UnicodeError) as e:
                print(f"Could not read {path}: {e}")
                continue
            if content.strip():
                texts.append((os.path.basename(path), content))
    # If no files found, fall back to built-in demo content
    if not texts:
        print("No KB files found. Using built-in demo content.")
        demo_text = """
        Welcome to the Self-Service KB Assistant.
        This assistant is meant to help you find information inside a knowledge base.
        In a real setup, it would be connected to your own articles, procedures,
        troubleshooting guides and FAQs.
        Good knowledge base content is:
        - Clear and structured with headings, steps and expected outcomes.
        - Written in a customer-friendly tone.
        - Easy to scan, with short paragraphs and bullet points.
        - Maintained regularly to reflect product and process changes.
        Example use cases for a KB assistant:
        - Agents quickly searching for internal procedures.
        - Customers asking "how do I…" style questions.
        - Managers analyzing gaps in documentation based on repeated queries.
        """
        texts.append(("demo_content.txt", demo_text))
    return texts
 # -----------------------------
+# KB INDEX
 # -----------------------------
class KBIndex:
    """In-memory semantic index over the knowledge-base chunks.

    Holds three parallel structures: ``chunks`` (text), ``chunk_sources``
    (file name each chunk came from) and ``embeddings`` (one vector per
    chunk, or ``None`` when the index is empty).
    """

    def __init__(self, model_name: str = EMBEDDING_MODEL_NAME):
        """Load the embedding model and build the index immediately."""
        print("Loading embedding model...")
        self.model = SentenceTransformer(model_name)
        print("Embedding model loaded.")
        self.chunks: List[str] = []
        self.chunk_sources: List[str] = []
        self.embeddings = None
        self.build_index()

    def build_index(self):
        """Load KB texts, split into chunks, and build an embedding index."""
        # Pair every chunk with the document it was cut from.
        labelled = [
            (piece, source_name)
            for source_name, content in load_kb_texts(KB_DIR)
            for piece in chunk_text(content)
        ]
        if not labelled:
            print("⚠️ No chunks found for KB index.")
            self.chunks = []
            self.chunk_sources = []
            self.embeddings = None
            return

        all_chunks = [piece for piece, _ in labelled]
        all_sources = [source_name for _, source_name in labelled]
        print(f"Creating embeddings for {len(all_chunks)} chunks...")
        vectors = self.model.encode(all_chunks, show_progress_bar=False, convert_to_numpy=True)
        # State is only replaced after encoding succeeded.
        self.chunks = all_chunks
        self.chunk_sources = all_sources
        self.embeddings = vectors
        print("KB index ready.")

    def search(self, query: str, top_k: int = TOP_K) -> List[Tuple[str, str, float]]:
        """Return top-k (chunk, source_name, score) for a given query.

        Scores are cosine similarities between the query vector and each
        chunk vector, best first. A blank query or an empty index gives
        an empty list.
        """
        if not query.strip():
            return []
        if self.embeddings is None or not len(self.chunks):
            return []

        query_vec = self.model.encode([query], show_progress_bar=False, convert_to_numpy=True)[0]

        # Cosine similarity; the epsilons guard against zero-length vectors.
        doc_norms = np.linalg.norm(self.embeddings, axis=1)
        query_norm = np.linalg.norm(query_vec) + 1e-10
        scores = np.dot(self.embeddings, query_vec) / (doc_norms * query_norm + 1e-10)

        best_first = np.argsort(scores)[::-1][:top_k]
        return [
            (self.chunks[i], self.chunk_sources[i], float(scores[i]))
            for i in best_first
        ]
# Build the knowledge-base index once, at import time.
print("Initializing KB index...")
kb_index = KBIndex()

# Load the generator LLM. Any failure here (missing package, download
# error, out of memory, ...) is tolerated: the app then runs in
# retrieval-only mode with llm_available set to False.
print("Loading LLM for answer generation...")
try:
    from transformers import AutoTokenizer, AutoModelForCausalLM
    import torch

    # Small chat model chosen for fast responses.
    LLM_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Fast and good quality
    print(f"Loading {LLM_MODEL_NAME}...")
    llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
    if torch.cuda.is_available():
        # Half precision, automatic device placement on GPU.
        llm_model = AutoModelForCausalLM.from_pretrained(
            LLM_MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
        )
    else:
        # Full precision, pinned to the CPU.
        llm_model = AutoModelForCausalLM.from_pretrained(
            LLM_MODEL_NAME,
            torch_dtype=torch.float32,
            device_map=None,
        )
        llm_model = llm_model.to("cpu")
    llm_model.eval()
    print(f"✅ LLM loaded successfully on {'GPU' if torch.cuda.is_available() else 'CPU'}")
    llm_available = True
except Exception as e:
    print(f"⚠️ Could not load LLM: {e}")
    print("⚠️ Will use fallback mode (direct retrieval)")
    llm_available = False
    llm_tokenizer = None
    llm_model = None
print("✅ KB Assistant ready!")
+# -----------------------------
+# CHAT LOGIC (With LLM Answer Generation)
+# -----------------------------
def clean_context(text: str) -> str:
    """Clean up text for context, removing markdown and excess whitespace.

    Only Markdown heading markers are removed: a run of one to six ``#``
    at the start of a line that is followed by whitespace or the end of
    the line. A ``#`` that is part of the content ("C#", "issue #42") is
    left alone. All runs of whitespace, including newlines, are then
    collapsed to single spaces.

    Args:
        text: Raw chunk text.

    Returns:
        The cleaned, single-line text.
    """
    import re  # local import: the module header does not import re

    # Strip heading markers at line starts instead of every '#' character.
    without_headings = re.sub(r"(?m)^[ \t]*#{1,6}(?=[ \t]|$)", "", text)
    # Collapse all whitespace runs (str.split() also trims both ends).
    return " ".join(without_headings.split())
def generate_answer_with_llm(query: str, context: str, sources: List[str]) -> "str | None":
    """
    Generate a natural, conversational answer using LLM based on retrieved context.

    Args:
        query: The user's question.
        context: Retrieved knowledge-base text the answer must be based on.
        sources: Source names appended to the answer as attribution.

    Returns:
        The answer followed by a sources footer, or ``None`` when the LLM
        is unavailable, generation fails, or the model produced no text.
        ``None`` tells the caller to use its fallback formatting.
    """
    if not llm_available:
        return None
    # Create a focused prompt
    prompt = f"""<|system|>
You are a helpful knowledge base assistant. Answer the user's question based ONLY on the provided context. Be conversational, clear, and concise. If the context doesn't contain enough information, say so.
</s>
<|user|>
Context from knowledge base:
{context}
Question: {query}
</s>
<|assistant|>
"""
    try:
        # Tokenize
        inputs = llm_tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=1024
        )
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}
        # Remember how many tokens belong to the prompt so that only the
        # continuation is decoded afterwards.
        prompt_len = inputs["input_ids"].shape[-1]
        # Generate
        with torch.no_grad():
            outputs = llm_model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=llm_tokenizer.eos_token_id,
            )
        # A causal LM returns prompt + continuation. Slicing by token count
        # isolates the reply reliably, whereas splitting the decoded text on
        # "<|assistant|>" breaks if the marker was truncated away or is
        # repeated by the model.
        new_tokens = outputs[0][prompt_len:]
        answer = llm_tokenizer.decode(new_tokens, skip_special_tokens=True)
        # Clean up the answer
        answer = answer.replace("</s>", "").strip()
        if not answer:
            # Nothing usable was generated; let the caller fall back rather
            # than return a sources-only message.
            return None
        # Add source attribution
        sources_text = ", ".join(sources)
        return f"{answer}\n\n---\n📚 **Sources:** {sources_text}"
    except Exception as e:
        # Best effort: any generation failure degrades to the fallback path.
        print(f"Error in LLM generation: {e}")
        return None
def format_fallback_answer(results: List[Tuple[str, str, float]]) -> str:
    """
    Fallback formatting when LLM is not available or fails.

    Args:
        results: Search hits as (chunk, source_name, score), best first.

    Returns:
        A Markdown message: the cleaned best chunk with its source, plus
        an "Also see" line naming the other distinct sources in ranking
        order. With no results, a short "nothing found" help text.
    """
    if not results:
        return (
            "I couldn't find any relevant information in the knowledge base.\n\n"
            "**Try:**\n"
            "- Rephrasing your question\n"
            "- Using different keywords\n"
            "- Breaking down complex questions"
        )
    # Get best result
    best_chunk, best_source, _ = results[0]
    # Clean markdown
    cleaned = clean_context(best_chunk)
    # Format nicely
    answer = f"**From {best_source}:**\n\n{cleaned}"
    # Collect the remaining sources in ranking order, without duplicates and
    # without repeating the source already shown as the primary one.
    other_sources: List[str] = []
    for _, src, _ in results[1:]:
        if src != best_source and src not in other_sources:
            other_sources.append(src)
    if other_sources:
        answer += f"\n\n💡 **Also see:** {', '.join(other_sources)}"
    return answer
def build_answer(query: str) -> str:
    """
    Produce the assistant's reply for one question.

    Pipeline:
    1. Search the KB index for the TOP_K closest chunks.
    2. Drop hits scoring below MIN_SIMILARITY_THRESHOLD.
    3. Join the two best remaining chunks into a context of at most
       1000 characters and collect their distinct sources.
    4. Ask the LLM for an answer; if it is unavailable or returns
       nothing, format the retrieved chunks directly instead.
    """
    # Step 1: retrieval
    hits = kb_index.search(query, top_k=TOP_K)
    if not hits:
        return (
            "I couldn't find any relevant information in the knowledge base to answer your question.\n\n"
            "**Suggestions:**\n"
            "- Try rephrasing with different words\n"
            "- Check if the topic is covered in the KB\n"
            "- Be more specific about what you're looking for"
        )

    # Step 2: keep only sufficiently similar hits
    relevant = [hit for hit in hits if hit[2] >= MIN_SIMILARITY_THRESHOLD]
    if not relevant:
        return (
            "I found some content, but it doesn't seem relevant enough to your question.\n\n"
            "Please try being more specific or using different keywords."
        )

    # Step 3: context from the two most relevant chunks
    top_hits = relevant[:2]
    # dict.fromkeys de-duplicates while keeping first-seen order.
    sources = list(dict.fromkeys(source for _, source, _ in top_hits))
    # Capped at 1000 characters for speed.
    context = " ".join(clean_context(chunk) for chunk, _, _ in top_hits)[:1000]

    # Step 4: LLM answer when possible
    if llm_available:
        llm_answer = generate_answer_with_llm(query, context, sources)
        if llm_answer:
            return llm_answer

    # Step 5: retrieval-only fallback
    return format_fallback_answer(relevant)
def chat_respond(message: str, history):
    """
    Callback wired into the Gradio ChatInterface.

    Args:
        message: Latest user message (str)
        history: Previous messages; not used by this function
    Returns:
        The assistant's reply as a string. Blank input gets a prompt to
        ask a question; any error raised while answering is reported in
        the reply instead of propagating.
    """
    question = message.strip() if message else ""
    if not question:
        return "Please ask me a question about the knowledge base."

    try:
        return build_answer(question)
    except Exception as e:
        print(f"Error generating answer: {e}")
        return f"Sorry, I encountered an error processing your question: {str(e)}"
+# -----------------------------
+# GRADIO UI
+# -----------------------------
# Markdown blurb rendered under the title of the chat UI.
description = """
🚀 **Fast Knowledge Base Search Assistant**
Ask questions and get instant answers from the knowledge base.
This assistant uses semantic search to find the most relevant information quickly.
**Tips for better results:**
- Be specific in your questions
- Use keywords related to your topic
- Ask one question at a time
"""

# The 'type' argument is deliberately not passed, for compatibility.
# Examples are not cached.
chat_interface = gr.ChatInterface(
    title="🤖 Self-Service KB Assistant",
    description=description,
    fn=chat_respond,
    cache_examples=False,
    examples=[
        "What makes a good knowledge base article?",
        "How could a KB assistant help agents?",
        "Why is self-service important for customer support?",
    ],
)
+# Launch
 if __name__ == "__main__":
+    # Detect environment and launch appropriately
+    is_huggingface = os.getenv('SPACE_ID') is not None
+    is_container = os.path.exists('/.dockerenv') or os.getenv('KUBERNETES_SERVICE_HOST') is not None
+    if is_huggingface:
+        print("🤗 Launching on HuggingFace Spaces...")
+        chat_interface.launch(server_name="0.0.0.0", server_port=7860)
+    elif is_container:
+        print("🐳 Launching in container environment...")
+        chat_interface.launch(server_name="0.0.0.0", server_port=7860, share=False)
+    else:
+        print("💻 Launching locally...")
+        chat_interface.launch(share=False)