ChatBotAgenticRAG1

Build error

App Files Community

Phoenix21 committed on Jan 13, 2025

Commit

1eb0002

verified ·

1 Parent(s): 0b20500

Added pydantic error handling

Browse files

Files changed (1) hide show

pipeline.py +214 -150

pipeline.py CHANGED Viewed

@@ -2,7 +2,7 @@ import os
 import getpass
 import spacy
 import pandas as pd
-from typing import Optional
 import subprocess
 from langchain.llms.base import LLM
 from langchain.docstore.document import Document
@@ -10,7 +10,7 @@ from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.chains import RetrievalQA
 from smolagents import CodeAgent, DuckDuckGoSearchTool, ManagedAgent, LiteLLMModel
-from pydantic import BaseModel, ValidationError, validator
 from mistralai import Mistral
 from langchain.prompts import PromptTemplate
@@ -25,7 +25,33 @@ from prompts import classification_prompt, refusal_prompt, tailor_prompt
 mistral_api_key = os.environ.get("MISTRAL_API_KEY")
 client = Mistral(api_key=mistral_api_key)
-# Load spaCy model for NER and download it if not already installed
 def install_spacy_model():
     try:
         spacy.load("en_core_web_sm")
@@ -38,99 +64,121 @@ def install_spacy_model():
 install_spacy_model()
 nlp = spacy.load("en_core_web_sm")
-# Function to extract the main topic from the query using spaCy NER
 def extract_main_topic(query: str) -> str:
-    doc = nlp(query)
-    main_topic = None
-    for ent in doc.ents:
-        if ent.label_ in ["ORG", "PRODUCT", "PERSON", "GPE", "TIME"]:
-            main_topic = ent.text
-            break
-    if not main_topic:
-        for token in doc:
-            if token.pos_ in ["NOUN", "PROPN"]:
-                main_topic = token.text
                 break
-    return main_topic if main_topic else "this topic"
-# Pydantic model to handle string input validation
-class QueryInput(BaseModel):
-    query: str
-    # Validator to ensure the query is always a string
-    @validator('query')
-    def check_query_is_string(cls, v):
-        if not isinstance(v, str):
-            raise ValueError("Query must be a valid string.")
-        return v
-# Function to classify query based on wellness topics
-def classify_query(query: str) -> str:
-    wellness_keywords = ["box breathing", "meditation", "yoga", "mindfulness", "breathing exercises"]
-    if any(keyword in query.lower() for keyword in wellness_keywords):
-        return "Wellness"
-    # Fallback to classification chain if not directly recognized
-    class_result = classification_chain.invoke({"query": query})
-    classification = class_result.get("text", "").strip()
-    return classification if classification != "OutOfScope" else "OutOfScope"
-# Function to moderate text using Mistral moderation API (sync version)
-def moderate_text(query: str) -> str:
     try:
-        # Use Pydantic to validate text input
-        query_input = QueryInput(query=query)  # This will validate that the query is a string
     except ValidationError as e:
-        print(f"Error validating text: {e}")
-        return "Invalid text format."
-    # Call the Mistral moderation API
-    response = client.classifiers.moderate_chat(
-        model="mistral-moderation-latest",
-        inputs=[{"role": "user", "content": query}]
-    )
-    # Check if harmful categories are present in the response
-    if hasattr(response, 'results') and response.results:
-        categories = response.results[0].categories
-        if categories.get("violence_and_threats", False) or \
-           categories.get("hate_and_discrimination", False) or \
-           categories.get("dangerous_and_criminal_content", False) or \
-           categories.get("selfharm", False):
-            return "OutOfScope"
-    return query
-# Function to build or load the vector store from CSV data
 def build_or_load_vectorstore(csv_path: str, store_dir: str) -> FAISS:
-    if os.path.exists(store_dir):
-        print(f"DEBUG: Found existing FAISS store at '{store_dir}'. Loading...")
-        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
-        vectorstore = FAISS.load_local(store_dir, embeddings)
-        return vectorstore
-    else:
-        print(f"DEBUG: Building new store from CSV: {csv_path}")
         df = pd.read_csv(csv_path)
         df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
         df.columns = df.columns.str.strip()
         if "Answer" in df.columns:
             df.rename(columns={"Answer": "Answers"}, inplace=True)
         if "Question" not in df.columns and "Question " in df.columns:
             df.rename(columns={"Question ": "Question"}, inplace=True)
         if "Question" not in df.columns or "Answers" not in df.columns:
-            raise ValueError("CSV must have 'Question' and 'Answers' columns.")
-        docs = []
-        for _, row in df.iterrows():
-            q = str(row["Question"])
-            ans = str(row["Answers"])
-            doc = Document(page_content=ans, metadata={"question": q})
-            docs.append(doc)
         embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
         vectorstore = FAISS.from_documents(docs, embedding=embeddings)
         vectorstore.save_local(store_dir)
         return vectorstore
-# Function to build RAG chain
 def build_rag_chain(llm_model: LiteLLMModel, vectorstore: FAISS) -> RetrievalQA:
     class GeminiLangChainLLM(LLM):
         def _call(self, prompt: str, stop: Optional[list] = None, **kwargs) -> str:
@@ -141,87 +189,103 @@ def build_rag_chain(llm_model: LiteLLMModel, vectorstore: FAISS) -> RetrievalQA:
         def _llm_type(self) -> str:
             return "custom_gemini"
-    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
-    gemini_as_llm = GeminiLangChainLLM()
-    rag_chain = RetrievalQA.from_chain_type(
-        llm=gemini_as_llm,
-        chain_type="stuff",
-        retriever=retriever,
-        return_source_documents=True
-    )
-    return rag_chain
-# Function to perform web search using DuckDuckGo
 def do_web_search(query: str) -> str:
-    search_tool = DuckDuckGoSearchTool()
-    web_agent = CodeAgent(tools=[search_tool], model=pydantic_agent)
-    managed_web_agent = ManagedAgent(agent=web_agent, name="web_search", description="Runs web search for you.")
-    manager_agent = CodeAgent(tools=[], model=pydantic_agent, managed_agents=[managed_web_agent])
-    search_query = f"Give me relevant info: {query}"
-    response = manager_agent.run(search_query)
-    return response
-# Function to combine web and knowledge base responses
 def merge_responses(kb_answer: str, web_answer: str) -> str:
-    # Merge both answers with a cohesive response
-    final_answer = f"Knowledge Base Answer: {kb_answer}\n\nWeb Search Result: {web_answer}"
-    return final_answer.strip()
-# Orchestrate the entire workflow
 def run_pipeline(query: str) -> str:
-    # Moderate the query for harmful content
-    moderated_query = moderate_text(query)
-    if moderated_query == "OutOfScope":
-        return "Sorry, this query contains harmful or inappropriate content."
-    # Classify the query manually
-    classification = classify_query(moderated_query)
-    if classification == "OutOfScope":
         refusal_text = refusal_chain.run({"topic": "this topic"})
-        final_refusal = tailor_chain.run({"response": refusal_text})
-        return final_refusal.strip()
-    if classification == "Wellness":
-        rag_result = wellness_rag_chain({"query": moderated_query})
-        csv_answer = rag_result["result"].strip()
-        web_answer = ""  # Empty if we found an answer from the knowledge base
-        if not csv_answer:
-            web_answer = do_web_search(moderated_query)
-        final_merged = merge_responses(csv_answer, web_answer)
-        final_answer = tailor_chain.run({"response": final_merged})
-        return final_answer.strip()
-    if classification == "Brand":
-        rag_result = brand_rag_chain({"query": moderated_query})
-        csv_answer = rag_result["result"].strip()
-        final_merged = merge_responses(csv_answer, "")
-        final_answer = tailor_chain.run({"response": final_merged})
-        return final_answer.strip()
-    refusal_text = refusal_chain.run({"topic": "this topic"})
-    final_refusal = tailor_chain.run({"response": refusal_text})
-    return final_refusal.strip()
-# Initialize chains
-classification_chain = get_classification_chain()
-refusal_chain = get_refusal_chain()
-tailor_chain = get_tailor_chain()
-cleaner_chain = get_cleaner_chain()
-wellness_csv = "AIChatbot.csv"
-brand_csv = "BrandAI.csv"
-wellness_store_dir = "faiss_wellness_store"
-brand_store_dir = "faiss_brand_store"
-wellness_vectorstore = build_or_load_vectorstore(wellness_csv, wellness_store_dir)
-brand_vectorstore = build_or_load_vectorstore(brand_csv, brand_store_dir)
-gemini_llm = LiteLLMModel(model_id="gemini/gemini-pro", api_key=os.environ.get("GEMINI_API_KEY"))
-wellness_rag_chain = build_rag_chain(gemini_llm, wellness_vectorstore)
-brand_rag_chain = build_rag_chain(gemini_llm, brand_vectorstore)
-# Function to wrap up and run the chain
 def run_with_chain(query: str) -> str:
-    return run_pipeline(query)

 import getpass
 import spacy
 import pandas as pd
+from typing import Optional, List, Dict, Any
 import subprocess
 from langchain.llms.base import LLM
 from langchain.docstore.document import Document
 from langchain.vectorstores import FAISS
 from langchain.chains import RetrievalQA
 from smolagents import CodeAgent, DuckDuckGoSearchTool, ManagedAgent, LiteLLMModel
+from pydantic import BaseModel, Field, ValidationError, validator
 from mistralai import Mistral
 from langchain.prompts import PromptTemplate
 mistral_api_key = os.environ.get("MISTRAL_API_KEY")
 client = Mistral(api_key=mistral_api_key)
+# Pydantic models for validation and type safety
+class QueryInput(BaseModel):
+    query: str = Field(..., min_length=1, description="The input query string")
+    @validator('query')
+    def check_query_is_string(cls, v):
+        if not isinstance(v, str):
+            raise ValueError("Query must be a valid string")
+        if v.strip() == "":
+            raise ValueError("Query cannot be empty or just whitespace")
+        return v.strip()
+class ClassificationResult(BaseModel):
+    category: str = Field(..., description="The classification category")
+    confidence: float = Field(..., ge=0.0, le=1.0, description="Classification confidence score")
+class ModerationResult(BaseModel):
+    is_safe: bool = Field(..., description="Whether the content is safe")
+    categories: Dict[str, bool] = Field(default_factory=dict, description="Detected content categories")
+    original_text: str = Field(..., description="The original input text")
+class RAGResponse(BaseModel):
+    answer: str = Field(..., description="The generated answer")
+    sources: List[str] = Field(default_factory=list, description="Source documents used")
+    confidence: float = Field(..., ge=0.0, le=1.0, description="Confidence score of the answer")
+# Load spaCy model for NER
 def install_spacy_model():
     try:
         spacy.load("en_core_web_sm")
 install_spacy_model()
 nlp = spacy.load("en_core_web_sm")
 def extract_main_topic(query: str) -> str:
+    try:
+        query_input = QueryInput(query=query)
+        doc = nlp(query_input.query)
+        main_topic = None
+        # Try to find named entities first
+        for ent in doc.ents:
+            if ent.label_ in ["ORG", "PRODUCT", "PERSON", "GPE", "TIME"]:
+                main_topic = ent.text
                 break
+        # If no named entities found, look for nouns
+        if not main_topic:
+            for token in doc:
+                if token.pos_ in ["NOUN", "PROPN"]:
+                    main_topic = token.text
+                    break
+        return main_topic if main_topic else "this topic"
+    except Exception as e:
+        print(f"Error extracting main topic: {e}")
+        return "this topic"
+def moderate_text(query: str) -> ModerationResult:
     try:
+        query_input = QueryInput(query=query)
+        response = client.classifiers.moderate_chat(
+            model="mistral-moderation-latest",
+            inputs=[{"role": "user", "content": query_input.query}]
+        )
+        is_safe = True
+        categories = {}
+        if hasattr(response, 'results') and response.results:
+            categories = {
+                "violence": response.results[0].categories.get("violence_and_threats", False),
+                "hate": response.results[0].categories.get("hate_and_discrimination", False),
+                "dangerous": response.results[0].categories.get("dangerous_and_criminal_content", False),
+                "selfharm": response.results[0].categories.get("selfharm", False)
+            }
+            is_safe = not any(categories.values())
+        return ModerationResult(
+            is_safe=is_safe,
+            categories=categories,
+            original_text=query_input.query
+        )
     except ValidationError as e:
+        raise ValueError(f"Input validation failed: {str(e)}")
+    except Exception as e:
+        raise RuntimeError(f"Moderation failed: {str(e)}")
+def classify_query(query: str) -> ClassificationResult:
+    try:
+        query_input = QueryInput(query=query)
+        wellness_keywords = ["box breathing", "meditation", "yoga", "mindfulness", "breathing exercises"]
+        if any(keyword in query_input.query.lower() for keyword in wellness_keywords):
+            return ClassificationResult(category="Wellness", confidence=0.9)
+        class_result = classification_chain.invoke({"query": query_input.query})
+        classification = class_result.get("text", "").strip()
+        confidence_map = {
+            "Wellness": 0.8,
+            "Brand": 0.8,
+            "OutOfScope": 0.6
+        }
+        return ClassificationResult(
+            category=classification if classification != "" else "OutOfScope",
+            confidence=confidence_map.get(classification, 0.5)
+        )
+    except ValidationError as e:
+        raise ValueError(f"Classification input validation failed: {str(e)}")
+    except Exception as e:
+        raise RuntimeError(f"Classification failed: {str(e)}")
 def build_or_load_vectorstore(csv_path: str, store_dir: str) -> FAISS:
+    try:
+        if os.path.exists(store_dir):
+            print(f"Loading existing FAISS store from '{store_dir}'")
+            embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
+            return FAISS.load_local(store_dir, embeddings)
+        print(f"Building new FAISS store from CSV: {csv_path}")
         df = pd.read_csv(csv_path)
         df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
         df.columns = df.columns.str.strip()
+        # Handle column name variations
         if "Answer" in df.columns:
             df.rename(columns={"Answer": "Answers"}, inplace=True)
         if "Question" not in df.columns and "Question " in df.columns:
             df.rename(columns={"Question ": "Question"}, inplace=True)
         if "Question" not in df.columns or "Answers" not in df.columns:
+            raise ValueError("CSV must have 'Question' and 'Answers' columns")
+        docs = [
+            Document(page_content=str(row["Answers"]), metadata={"question": str(row["Question"])})
+            for _, row in df.iterrows()
+        ]
         embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
         vectorstore = FAISS.from_documents(docs, embedding=embeddings)
         vectorstore.save_local(store_dir)
         return vectorstore
+    except Exception as e:
+        raise RuntimeError(f"Error building/loading vector store: {str(e)}")
 def build_rag_chain(llm_model: LiteLLMModel, vectorstore: FAISS) -> RetrievalQA:
     class GeminiLangChainLLM(LLM):
         def _call(self, prompt: str, stop: Optional[list] = None, **kwargs) -> str:
         def _llm_type(self) -> str:
             return "custom_gemini"
+    try:
+        retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
+        gemini_as_llm = GeminiLangChainLLM()
+        return RetrievalQA.from_chain_type(
+            llm=gemini_as_llm,
+            chain_type="stuff",
+            retriever=retriever,
+            return_source_documents=True
+        )
+    except Exception as e:
+        raise RuntimeError(f"Error building RAG chain: {str(e)}")
 def do_web_search(query: str) -> str:
+    try:
+        query_input = QueryInput(query=query)
+        search_tool = DuckDuckGoSearchTool()
+        web_agent = CodeAgent(tools=[search_tool], model=pydantic_agent)
+        managed_web_agent = ManagedAgent(agent=web_agent, name="web_search", description="Performs web searches")
+        manager_agent = CodeAgent(tools=[], model=pydantic_agent, managed_agents=[managed_web_agent])
+        search_query = f"Give me relevant info: {query_input.query}"
+        return manager_agent.run(search_query)
+    except Exception as e:
+        return f"Web search failed: {str(e)}"
 def merge_responses(kb_answer: str, web_answer: str) -> str:
+    try:
+        if not kb_answer and not web_answer:
+            return "No relevant information found."
+        if not web_answer:
+            return kb_answer.strip()
+        if not kb_answer:
+            return web_answer.strip()
+        return f"Knowledge Base Answer: {kb_answer.strip()}\n\nWeb Search Result: {web_answer.strip()}"
+    except Exception as e:
+        return f"Error merging responses: {str(e)}"
 def run_pipeline(query: str) -> str:
+    try:
+        # Validate and moderate input
+        moderation_result = moderate_text(query)
+        if not moderation_result.is_safe:
+            return "Sorry, this query contains harmful or inappropriate content."
+        # Classify the query
+        classification_result = classify_query(moderation_result.original_text)
+        if classification_result.category == "OutOfScope":
+            refusal_text = refusal_chain.run({"topic": "this topic"})
+            return tailor_chain.run({"response": refusal_text}).strip()
+        # Handle different classifications
+        if classification_result.category == "Wellness":
+            rag_result = wellness_rag_chain({"query": moderation_result.original_text})
+            csv_answer = rag_result["result"].strip()
+            web_answer = "" if csv_answer else do_web_search(moderation_result.original_text)
+            final_merged = merge_responses(csv_answer, web_answer)
+            return tailor_chain.run({"response": final_merged}).strip()
+        if classification_result.category == "Brand":
+            rag_result = brand_rag_chain({"query": moderation_result.original_text})
+            csv_answer = rag_result["result"].strip()
+            final_merged = merge_responses(csv_answer, "")
+            return tailor_chain.run({"response": final_merged}).strip()
+        # Default fallback
         refusal_text = refusal_chain.run({"topic": "this topic"})
+        return tailor_chain.run({"response": refusal_text}).strip()
+    except Exception as e:
+        return f"An error occurred while processing your request: {str(e)}"
+# Initialize chains and vectorstores
+try:
+    classification_chain = get_classification_chain()
+    refusal_chain = get_refusal_chain()
+    tailor_chain = get_tailor_chain()
+    cleaner_chain = get_cleaner_chain()
+    wellness_csv = "AIChatbot.csv"
+    brand_csv = "BrandAI.csv"
+    wellness_store_dir = "faiss_wellness_store"
+    brand_store_dir = "faiss_brand_store"
+    wellness_vectorstore = build_or_load_vectorstore(wellness_csv, wellness_store_dir)
+    brand_vectorstore = build_or_load_vectorstore(brand_csv, brand_store_dir)
+    gemini_llm = LiteLLMModel(model_id="gemini/gemini-pro", api_key=os.environ.get("GEMINI_API_KEY"))
+    wellness_rag_chain = build_rag_chain(gemini_llm, wellness_vectorstore)
+    brand_rag_chain = build_rag_chain(gemini_llm, brand_vectorstore)
+    print("Pipeline initialized successfully!")
+except Exception as e:
+    print(f"Error initializing pipeline: {str(e)}")
 def run_with_chain(query: str) -> str:
+    return run_pipeline(query)