Spaces:

sofzcc
/

Self-Service-KB-Assistant

Sleeping

App Files Files Community

sofzcc committed on Dec 2, 2025

Commit

461f357

verified ·

1 Parent(s): 777fd2c

Update app.py

Browse files

Files changed (1) hide show

app.py +176 -89

app.py CHANGED Viewed

@@ -5,8 +5,7 @@ import time
 import gradio as gr
 import numpy as np
-from sentence_transformers import SentenceTransformer
-from transformers import pipeline
 # -----------------------------
 # CONFIG
@@ -151,118 +150,206 @@ class KBIndex:
 # Initialize KB index
 # Builds the module-level KBIndex instance that build_answer() queries via
 # kb_index.search(). This runs at import time, so the two prints bracket
 # the index construction in the startup log.
 print("Initializing KB index...")
 kb_index = KBIndex()
 print("✅ KB Assistant ready!")
 # -----------------------------
-# CHAT LOGIC (Retrieval-Only, No LLM)
 # -----------------------------
def format_answer_from_results(query: str, results: List[Tuple[str, str, float]]) -> str:
    """
    Format a helpful answer from retrieved chunks without using an LLM.

    Args:
        query: The user's question. Currently unused; kept so the signature
            stays stable for callers.
        results: ``(chunk, source, score)`` tuples. The first entry is treated
            as the best match (assumes the search returns results ordered
            best-first -- confirm against ``KBIndex.search``).

    Returns:
        A markdown string: a "not found" / "not relevant" hint when nothing
        passes ``MIN_SIMILARITY_THRESHOLD``, otherwise the cleaned best chunk
        followed by the other sources and a sources footer.
    """
    if not results:
        return (
            "❌ **I couldn't find anything relevant in the knowledge base for this query.**\n\n"
            "**Suggestions:**\n"
            "- Try rephrasing your question\n"
            "- Use different keywords\n"
            "- Check if the information exists in the knowledge base\n\n"
            "If this information should be available, consider adding it to the KB."
        )

    # Keep only the hits that clear the similarity threshold.
    filtered_results = [
        (chunk, src, score)
        for chunk, src, score in results
        if score >= MIN_SIMILARITY_THRESHOLD
    ]
    if not filtered_results:
        return (
            "⚠️ **I found some related content, but it doesn't seem very relevant to your question.**\n\n"
            "**Try:**\n"
            "- Being more specific in your question\n"
            "- Using different terminology\n"
            "- Breaking down complex questions into simpler parts"
        )

    best_chunk, best_source, best_score = filtered_results[0]
    cleaned_content = clean_markdown(best_chunk)

    # Traffic-light indicator for how confident the match is.
    relevance_emoji = "🟢" if best_score > 0.7 else "🟡" if best_score > 0.5 else "🟠"
    answer_parts = [
        f"{relevance_emoji} **Answer from: {best_source}**\n",
        cleaned_content,
    ]

    # dict.fromkeys de-duplicates while preserving rank order; a set would
    # make the order of the listed sources vary from run to run.
    unique_sources = list(dict.fromkeys(src for _, src, _ in filtered_results[1:]))
    if unique_sources:
        answer_parts.append(
            f"\n\n💡 **Additional information available in:** {', '.join(unique_sources)}"
        )

    # Footer listing every contributing source, best match first.
    answer_parts.append("\n\n---")
    all_sources = list(dict.fromkeys(src for _, src, _ in filtered_results))
    answer_parts.append(f"📚 **Sources:** {', '.join(all_sources)}")
    return "\n".join(answer_parts)
def clean_markdown(text: str) -> str:
    """
    Clean up markdown text for better readability.

    Blank lines are dropped, every remaining line is stripped, and markdown
    headers (lines starting with ``#``) become bold text preceded by a blank
    line. A header with no text after the ``#`` symbols is dropped. All other
    lines (list items, numbered items, regular text) are kept unchanged.

    Args:
        text: Raw markdown text.

    Returns:
        The cleaned text with surrounding whitespace removed.
    """
    cleaned_lines = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('#'):
            # Convert the header to bold text; the leading newline sets it
            # apart from the preceding paragraph.
            header_text = line.lstrip('#').strip()
            if header_text:
                cleaned_lines.append(f"\n**{header_text}**")
            continue
        # List items, numbered items and plain text are all kept verbatim.
        cleaned_lines.append(line)

    # Each entry contributes at most one leading newline, so the joined text
    # can never contain three consecutive newlines; no collapsing is needed.
    return '\n'.join(cleaned_lines).strip()
def build_answer(query: str) -> str:
    """
    Answer a query purely by retrieval, with no LLM generation step.

    Looks up the top ``TOP_K`` matches in the knowledge base index and hands
    them to ``format_answer_from_results`` for formatting.
    """
    kb_hits = kb_index.search(query, top_k=TOP_K)
    return format_answer_from_results(query, kb_hits)
 def chat_respond(message: str, history):

 import gradio as gr
 import numpy as np
+from sentence_transformers import SentenceTransformer
 # -----------------------------
 # CONFIG
 # Initialize KB index
 print("Initializing KB index...")
 kb_index = KBIndex()
+# Initialize LLM for answer generation
+# The imports and the model load all sit inside one try block, so any
+# exception raised here leaves the app running with llm_available = False
+# instead of aborting startup.
+print("Loading LLM for answer generation...")
+try:
+    from transformers import AutoTokenizer, AutoModelForCausalLM
+    import torch
+    # Use a small but capable model for faster responses
+    LLM_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Fast and good quality
+    print(f"Loading {LLM_MODEL_NAME}...")
+    llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
+    # float16 weights and device_map="auto" are requested only when CUDA is
+    # available; otherwise the model is loaded in float32 with no device map.
+    llm_model = AutoModelForCausalLM.from_pretrained(
+        LLM_MODEL_NAME,
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+        device_map="auto" if torch.cuda.is_available() else None,
+    )
+    if not torch.cuda.is_available():
+        llm_model = llm_model.to("cpu")
+    # Put the model in evaluation mode; it is only used for generation.
+    llm_model.eval()
+    print(f"✅ LLM loaded successfully on {'GPU' if torch.cuda.is_available() else 'CPU'}")
+    llm_available = True
+except Exception as e:
+    # Fallback mode: generate_answer_with_llm() returns None when
+    # llm_available is False, and build_answer() then uses
+    # format_fallback_answer().
+    print(f"⚠️ Could not load LLM: {e}")
+    print("⚠️ Will use fallback mode (direct retrieval)")
+    llm_available = False
+    llm_tokenizer = None
+    llm_model = None
 print("✅ KB Assistant ready!")
 # -----------------------------
+# CHAT LOGIC (With LLM Answer Generation)
 # -----------------------------
def clean_context(text: str) -> str:
    """Flatten a KB chunk into one line of plain text for use as context.

    Markdown heading markers (a run of ``#`` at the start of a line, followed
    by whitespace or nothing) are removed, and every run of whitespace,
    including newlines, is collapsed to a single space. A ``#`` that is part
    of the content (``C#``, ``#42``) is kept.

    Args:
        text: Raw chunk text, possibly containing markdown headings.

    Returns:
        The cleaned single-line text; an empty string for empty input.
    """
    lines = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        without_hashes = line.lstrip('#')
        # Only treat the hashes as heading syntax when they are followed by
        # whitespace or end the line; "#42 is open" is content, not a heading.
        if without_hashes != line and (not without_hashes or without_hashes[0].isspace()):
            line = without_hashes
        lines.append(line)
    # split()/join collapses every whitespace run and trims both ends.
    return ' '.join(' '.join(lines).split())
def generate_answer_with_llm(query: str, context: str, sources: List[str]) -> str:
    """
    Generate a natural, conversational answer using LLM based on retrieved context.

    Args:
        query: The user's question.
        context: Cleaned text retrieved from the knowledge base; inserted into
            the prompt as the only material the model should rely on.
        sources: Source names appended to the answer as a "Sources" footer.

    Returns:
        The generated answer followed by the sources footer, or ``None`` when
        the LLM is unavailable, generation raises, or the model produces no
        text. Callers treat ``None`` as "use the fallback formatter".
    """
    if not llm_available:
        return None
    # Create a focused prompt
    prompt = f"""<|system|>
You are a helpful knowledge base assistant. Answer the user's question based ONLY on the provided context. Be conversational, clear, and concise. If the context doesn't contain enough information, say so.
</s>
<|user|>
Context from knowledge base:
{context}
Question: {query}
</s>
<|assistant|>
"""
    try:
        inputs = llm_tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=1024,
        )
        # Send the tensors to wherever the model actually lives rather than
        # assuming "cuda"; on CPU this is a no-op.
        inputs = {name: tensor.to(llm_model.device) for name, tensor in inputs.items()}

        with torch.no_grad():
            outputs = llm_model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=llm_tokenizer.eos_token_id,
            )

        # generate() returns prompt + completion for causal LMs. Decode only
        # the newly generated tokens, so the prompt can never leak into the
        # answer, even if the context or query contains "<|assistant|>".
        prompt_length = inputs["input_ids"].shape[-1]
        answer = llm_tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )
        answer = answer.replace("</s>", "").strip()

        # An empty generation would yield a footer-only reply; report failure
        # instead so the caller falls back to direct retrieval.
        if not answer:
            return None

        sources_text = ", ".join(sources)
        return f"{answer}\n\n---\n📚 **Sources:** {sources_text}"
    except Exception as e:
        # Best effort: any generation error degrades to the fallback path.
        print(f"Error in LLM generation: {e}")
        return None
def format_fallback_answer(results: List[Tuple[str, str, float]]) -> str:
    """
    Fallback formatting when LLM is not available or fails.

    Args:
        results: ``(chunk, source, score)`` tuples; the first entry is shown
            as the answer (assumes the list is ordered best-first -- confirm
            against the caller).

    Returns:
        A markdown string: a "not found" hint for empty input, otherwise the
        cleaned best chunk under its source name, plus an "Also see" line
        naming the sources of the remaining results.
    """
    if not results:
        return (
            "I couldn't find any relevant information in the knowledge base.\n\n"
            "**Try:**\n"
            "- Rephrasing your question\n"
            "- Using different keywords\n"
            "- Breaking down complex questions"
        )

    best_chunk, best_source, _ = results[0]
    answer = f"**From {best_source}:**\n\n{clean_context(best_chunk)}"

    # dict.fromkeys de-duplicates while keeping rank order; a set would make
    # the order of the "Also see" list vary from run to run.
    other_sources = list(dict.fromkeys(src for _, src, _ in results[1:]))
    if other_sources:
        answer += f"\n\n💡 **Also see:** {', '.join(other_sources)}"
    return answer
def build_answer(query: str) -> str:
    """
    Produce the reply for a user query, preferring an LLM-written answer.

    Steps:
    1. Search the knowledge base for the top ``TOP_K`` chunks.
    2. Drop hits below ``MIN_SIMILARITY_THRESHOLD``.
    3. Join the two best cleaned chunks into a context of at most 1000 chars.
    4. Ask the LLM for an answer with source attribution.
    5. If the LLM is unavailable or yields nothing, format the hits directly.
    """
    hits = kb_index.search(query, top_k=TOP_K)
    if not hits:
        return (
            "I couldn't find any relevant information in the knowledge base to answer your question.\n\n"
            "**Suggestions:**\n"
            "- Try rephrasing with different words\n"
            "- Check if the topic is covered in the KB\n"
            "- Be more specific about what you're looking for"
        )

    relevant = [
        (text, origin, similarity)
        for text, origin, similarity in hits
        if similarity >= MIN_SIMILARITY_THRESHOLD
    ]
    if not relevant:
        return (
            "I found some content, but it doesn't seem relevant enough to your question.\n\n"
            "Please try being more specific or using different keywords."
        )

    # Only the two strongest hits feed the prompt; the 1000-char cap keeps
    # generation fast.
    top_hits = relevant[:2]
    context = " ".join(clean_context(text) for text, _, _ in top_hits)[:1000]
    # Unique source names in rank order.
    sources = list(dict.fromkeys(origin for _, origin, _ in top_hits))

    llm_answer = (
        generate_answer_with_llm(query, context, sources) if llm_available else None
    )
    if llm_answer:
        return llm_answer

    # LLM missing, failed, or returned nothing usable.
    return format_fallback_answer(relevant)
 def chat_respond(message: str, history):