Update app.py
app.py
CHANGED
@@ -8,20 +8,16 @@ from sentence_transformers import SentenceTransformer
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 
-
-
 # -----------------------------
 # CONFIG
 # -----------------------------
-KB_DIR = "./kb" #
+KB_DIR = "./kb" # folder with .txt or .md files
 EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
 GEN_MODEL_NAME = "google/flan-t5-base"
 TOP_K = 3
 CHUNK_SIZE = 500 # characters
 CHUNK_OVERLAP = 100 # characters
 
-
-
 # -----------------------------
 # UTILITIES
 # -----------------------------
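Note: CHUNK_SIZE and CHUNK_OVERLAP feed a chunking utility in the unchanged UTILITIES section, so it doesn't appear in this diff. A minimal character-window chunker consistent with these constants (the name chunk_text is assumed) would look something like:

def chunk_text(text: str, size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list:
    # Slide a fixed-size character window; stepping by (size - overlap)
    # makes consecutive chunks share `overlap` characters of context.
    chunks = []
    step = max(1, size - overlap)
    for start in range(0, len(text), step):
        piece = text[start:start + size]
        if piece.strip():
            chunks.append(piece)
    return chunks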
@@ -63,7 +59,7 @@ def load_kb_texts(kb_dir: str = KB_DIR) -> List[Tuple[str, str]]:
         except Exception as e:
             print(f"Could not read {path}: {e}")
 
-    # If no files found, fall back to
+    # If no files found, fall back to built-in demo content
     if not texts:
        print("No KB files found. Using built-in demo content.")
        demo_text = """
@@ -81,7 +77,7 @@ def load_kb_texts(kb_dir: str = KB_DIR) -> List[Tuple[str, str]]:
 
 Example use cases for a KB assistant:
 - Agents quickly searching for internal procedures.
-- Customers asking
+- Customers asking "how do I…" style questions.
 - Managers analyzing gaps in documentation based on repeated queries.
 """
         texts.append(("demo_content.txt", demo_text))
@@ -97,10 +93,10 @@ class KBIndex:
     def __init__(self, model_name: str = EMBEDDING_MODEL_NAME):
         print("Loading embedding model...")
         self.model = SentenceTransformer(model_name)
-        print("
+        print("Embedding model loaded.")
         self.chunks: List[str] = []
         self.chunk_sources: List[str] = []
-        self.embeddings
+        self.embeddings = None
         self.build_index()
 
     def build_index(self):
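Note: build_index and the search method that yields (chunk, source, score) tuples sit outside this diff. Given that __init__ now initializes self.embeddings = None before calling build_index, a plausible sketch of the retrieval side (assuming embeddings are L2-normalized at index time; this is a KBIndex method) is:

import numpy as np

def search(self, query: str, top_k: int = TOP_K):
    # Embed the query and rank all chunks by cosine similarity;
    # with normalized vectors, the dot product is the cosine score.
    query_emb = self.model.encode([query], normalize_embeddings=True)[0]
    scores = self.embeddings @ query_emb
    best = np.argsort(-scores)[:top_k]
    return [(self.chunks[i], self.chunk_sources[i], float(scores[i])) for i in best]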
@@ -152,49 +148,18 @@ class KBIndex:
         return results
 
 
+# Initialize KB index
+print("Initializing KB index...")
 kb_index = KBIndex()
 
+# Initialize generation model
 print("Loading generation model...")
 gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
 gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 gen_model.to(device)
 gen_model.eval()
-print("Generation model ready.")
-
-# -----------------------------
-# LLM (FLAN-T5-Large) - lazy load
-# -----------------------------
-
-_llm_pipeline = None
-
-
-def get_llm():
-    """
-    Lazily load FLAN-T5-Large as a text2text-generation pipeline.
-    This avoids blocking startup too much.
-    """
-    global _llm_pipeline
-    if _llm_pipeline is not None:
-        return _llm_pipeline
-
-    print("Loading FLAN-T5-Large model...")
-    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-    import torch
-
-    tokenizer = AutoTokenizer.from_pretrained(FLAN_MODEL_NAME)
-    model = AutoModelForSeq2SeqLM.from_pretrained(FLAN_MODEL_NAME)
-
-    device = 0 if torch.cuda.is_available() else -1
-    _llm_pipeline = pipeline(
-        "text2text-generation",
-        model=model,
-        tokenizer=tokenizer,
-        device=device,
-    )
-    print("FLAN-T5-Large loaded.")
-    return _llm_pipeline
-
+print(f"Generation model ready on {device}.")
 
 # -----------------------------
 # CHAT LOGIC
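Note: the body of build_answer between retrieval and decoding isn't shown in this diff. Given the gen_tokenizer.decode(output_ids[0], ...) call in a later hunk, the generation step presumably resembles the following (prompt wording and length limits assumed):

prompt = f"Answer the question using only the context.\n\nContext:\n{context}\n\nQuestion: {query}"
inputs = gen_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
with torch.no_grad():
    output_ids = gen_model.generate(**inputs, max_new_tokens=256)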
@@ -206,7 +171,6 @@ def build_context_from_results(results: List[Tuple[str, str, float]]) -> str:
     """
     context_parts = []
     for chunk, source, score in results:
-        # Keep it concise; we don't need every line label
         cleaned = chunk.strip()
         context_parts.append(f"From {source}:\n{cleaned}")
     return "\n\n".join(context_parts)
@@ -230,7 +194,7 @@ def build_answer(query: str) -> str:
     # Build context for the model
     context = build_context_from_results(results)
 
-    # Short list of sources for
+    # Short list of sources for citation
     source_names = list({src for _, src, _ in results})
     source_line = "Based on: " + ", ".join(source_names)
 
@@ -262,23 +226,32 @@ def build_answer(query: str) -> str:
 
     answer_text = gen_tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
 
-    # Add
+    # Add source citation at the end
     final_answer = f"{answer_text}\n\n— {source_line}"
 
     return final_answer
 
 
-
 def chat_respond(message: str, history):
     """
-    Gradio ChatInterface
-
-
-
-
+    Gradio ChatInterface callback.
+
+    Args:
+        message: Latest user message (str)
+        history: List of previous messages (handled by Gradio)
+
+    Returns:
+        Assistant's reply as a string
     """
-
-
+    if not message or not message.strip():
+        return "Please ask me a question about the knowledge base."
+
+    try:
+        answer = build_answer(message.strip())
+        return answer
+    except Exception as e:
+        print(f"Error generating answer: {e}")
+        return f"Sorry, I encountered an error processing your question: {str(e)}"
 
 
 # -----------------------------
@@ -292,9 +265,10 @@ help center or internal documentation. Here, it's using a small demo
 knowledge base to show how retrieval-based self-service can work.
 """
 
-chat = gr.ChatInterface(
-
-
+# Create ChatInterface
+chat_interface = gr.ChatInterface(
+    fn=chat_respond,
+    title="🤖 Self-Service KB Assistant",
     description=description,
     type="messages",
     examples=[
@@ -305,6 +279,18 @@ chat = gr.ChatInterface(
     cache_examples=False,
 )
 
-
+# Launch
 if __name__ == "__main__":
-    chat.launch()
+    # Detect environment and launch appropriately
+    is_huggingface = os.getenv('SPACE_ID') is not None
+    is_container = os.path.exists('/.dockerenv') or os.getenv('KUBERNETES_SERVICE_HOST') is not None
+
+    if is_huggingface:
+        print("🤗 Launching on HuggingFace Spaces...")
+        chat_interface.launch(server_name="0.0.0.0", server_port=7860)
+    elif is_container:
+        print("🐳 Launching in container environment...")
+        chat_interface.launch(server_name="0.0.0.0", server_port=7860, share=False)
+    else:
+        print("💻 Launching locally...")
+        chat_interface.launch(share=False)
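Note: the new launch logic relies on environment markers: Hugging Face sets SPACE_ID inside Spaces, Docker creates /.dockerenv in containers, and Kubernetes injects KUBERNETES_SERVICE_HOST into pods. It also assumes os is imported near the top of app.py, which these hunks don't show. The detection can be sanity-checked on its own:

import os

print("SPACE_ID:", os.getenv("SPACE_ID"))             # set on Hugging Face Spaces
print("dockerenv:", os.path.exists("/.dockerenv"))    # present in Docker containers
print("k8s:", os.getenv("KUBERNETES_SERVICE_HOST"))   # set inside Kubernetes pods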