Update pipeline.py

pipeline.py  CHANGED  (+108 -8)
@@ -4,6 +4,7 @@ import spacy
 import pandas as pd
 from typing import Optional
 import subprocess
+import asyncio  # Needed for managing async tasks
 from langchain.llms.base import LLM
 from langchain.docstore.document import Document
 from langchain.embeddings import HuggingFaceEmbeddings
@@ -12,7 +13,14 @@ from langchain.chains import RetrievalQA
 from smolagents import CodeAgent, DuckDuckGoSearchTool, ManagedAgent, LiteLLMModel
 from pydantic_ai import Agent  # Import Pydantic AI's Agent
 from mistralai import Mistral
-
+from langchain.prompts import PromptTemplate
+
+# Import chains and tools
+from classification_chain import get_classification_chain
+from cleaner_chain import get_cleaner_chain
+from refusal_chain import get_refusal_chain
+from tailor_chain import get_tailor_chain
+from prompts import classification_prompt, refusal_prompt, tailor_prompt
 
 # Initialize Mistral API client
 mistral_api_key = os.environ.get("MISTRAL_API_KEY")
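
Side note on the newly imported PromptTemplate: it is not referenced in the visible hunks, so it presumably serves the prompt definitions elsewhere in the file. For reference, a minimal self-contained LangChain usage sketch (the template text here is invented):

from langchain.prompts import PromptTemplate

# Invented template, purely to illustrate the imported class.
template = PromptTemplate(
    input_variables=["query"],
    template="Classify the following query as Wellness, Brand, or OutOfScope: {query}",
)
print(template.format(query="How do I start meditating?"))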
@@ -54,23 +62,30 @@ def classify_query(query: str) -> str:
     wellness_keywords = ["box breathing", "meditation", "yoga", "mindfulness", "breathing exercises"]
     if any(keyword in query.lower() for keyword in wellness_keywords):
         return "Wellness"
+    # Fallback to classification chain if not directly recognized
     class_result = classification_chain.invoke({"query": query})
     classification = class_result.get("text", "").strip()
     return classification if classification != "OutOfScope" else "OutOfScope"
 
-# Function to moderate text using Mistral moderation API (
-
+# Function to moderate text using Mistral moderation API (sync version)
+def moderate_text(query: str) -> str:
     try:
-
+        # Use Pydantic AI for text validation synchronously
+        pydantic_agent.run(query)  # This is a synchronous call
     except Exception as e:
         print(f"Error validating text: {e}")
         return "Invalid text format."
 
-
+    # Mistral moderation, no need for await as it's synchronous
+    response = client.classifiers.moderate_chat(
         model="mistral-moderation-latest",
         inputs=[{"role": "user", "content": query}]
     )
+
+    # Extract moderation categories
     categories = response['results'][0]['categories']
+
+    # Check for harmful categories and return "OutOfScope" if any are found
     if categories.get("violence_and_threats", False) or \
        categories.get("hate_and_discrimination", False) or \
        categories.get("dangerous_and_criminal_content", False) or \
@@ -79,7 +94,74 @@ async def moderate_text(query: str) -> str:
 
     return query
 
-#
+# Function to build or load the vector store from CSV data
+def build_or_load_vectorstore(csv_path: str, store_dir: str) -> FAISS:
+    if os.path.exists(store_dir):
+        print(f"DEBUG: Found existing FAISS store at '{store_dir}'. Loading...")
+        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
+        vectorstore = FAISS.load_local(store_dir, embeddings)
+        return vectorstore
+    else:
+        print(f"DEBUG: Building new store from CSV: {csv_path}")
+        df = pd.read_csv(csv_path)
+        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
+        df.columns = df.columns.str.strip()
+        if "Answer" in df.columns:
+            df.rename(columns={"Answer": "Answers"}, inplace=True)
+        if "Question" not in df.columns and "Question " in df.columns:
+            df.rename(columns={"Question ": "Question"}, inplace=True)
+        if "Question" not in df.columns or "Answers" not in df.columns:
+            raise ValueError("CSV must have 'Question' and 'Answers' columns.")
+        docs = []
+        for _, row in df.iterrows():
+            q = str(row["Question"])
+            ans = str(row["Answers"])
+            doc = Document(page_content=ans, metadata={"question": q})
+            docs.append(doc)
+        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")
+        vectorstore = FAISS.from_documents(docs, embedding=embeddings)
+        vectorstore.save_local(store_dir)
+        return vectorstore
+
+# Function to build RAG chain
+def build_rag_chain(llm_model: LiteLLMModel, vectorstore: FAISS) -> RetrievalQA:
+    class GeminiLangChainLLM(LLM):
+        def _call(self, prompt: str, stop: Optional[list] = None, **kwargs) -> str:
+            messages = [{"role": "user", "content": prompt}]
+            return llm_model(messages, stop_sequences=stop)
+
+        @property
+        def _llm_type(self) -> str:
+            return "custom_gemini"
+
+    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
+    gemini_as_llm = GeminiLangChainLLM()
+    rag_chain = RetrievalQA.from_chain_type(
+        llm=gemini_as_llm,
+        chain_type="stuff",
+        retriever=retriever,
+        return_source_documents=True
+    )
+    return rag_chain
+
+# Function to perform web search using DuckDuckGo
+async def do_web_search(query: str) -> str:
+    search_tool = DuckDuckGoSearchTool()
+    web_agent = CodeAgent(tools=[search_tool], model=pydantic_agent)
+    managed_web_agent = ManagedAgent(agent=web_agent, name="web_search", description="Runs web search for you.")
+    manager_agent = CodeAgent(tools=[], model=pydantic_agent, managed_agents=[managed_web_agent])
+
+    search_query = f"Give me relevant info: {query}"
+    response = manager_agent.run(search_query)
+    return response
+
+# Function to combine web and knowledge base responses
+async def merge_responses(kb_answer: str, web_answer: str) -> str:
+    # Merge both answers with a cohesive response
+    final_answer = f"Knowledge Base Answer: {kb_answer}\n\nWeb Search Result: {web_answer}"
+    return final_answer.strip()
+
+# Orchestrate the entire workflow
 async def run_async_pipeline(query: str) -> str:
     # Moderate the query for harmful content (async)
     moderated_query = await moderate_text(query)
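
To illustrate the CSV schema build_or_load_vectorstore expects, a hedged sketch with an invented one-row knowledge base (the file and store names are made up); the loader drops stray 'Unnamed' columns and normalizes 'Answer'/'Question ' headers before embedding:

import pandas as pd

# Hypothetical one-row knowledge base matching the required schema.
pd.DataFrame({
    "Question": ["What is box breathing?"],
    "Answers": ["A paced 4-4-4-4 inhale/hold/exhale/hold breathing technique."],
}).to_csv("toy_kb.csv", index=False)

store = build_or_load_vectorstore("toy_kb.csv", "faiss_toy_store")
hits = store.similarity_search("breathing technique", k=1)
print(hits[0].metadata["question"], "->", hits[0].page_content)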
@@ -100,14 +182,14 @@ async def run_async_pipeline(query: str) -> str:
         web_answer = ""  # Empty if we found an answer from the knowledge base
         if not csv_answer:
             web_answer = await do_web_search(moderated_query)
-        final_merged =
+        final_merged = await merge_responses(csv_answer, web_answer)
         final_answer = tailor_chain.run({"response": final_merged})
         return final_answer.strip()
 
     if classification == "Brand":
         rag_result = brand_rag_chain({"query": moderated_query})
         csv_answer = rag_result["result"].strip()
-        final_merged =
+        final_merged = await merge_responses(csv_answer, "")
         final_answer = tailor_chain.run({"response": final_merged})
         return final_answer.strip()
 
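
For reference, what merge_responses yields for the two call sites above (a standalone sketch; the input strings are invented):

import asyncio

merged = asyncio.run(merge_responses("KB answer here", "web result here"))
print(merged)
# Knowledge Base Answer: KB answer here
#
# Web Search Result: web result here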
@@ -118,3 +200,21 @@ async def run_async_pipeline(query: str) -> str:
 # Run the pipeline with the event loop
 def run_with_chain(query: str) -> str:
     return asyncio.run(run_async_pipeline(query))
+
+# Initialize chains here
+classification_chain = get_classification_chain()
+refusal_chain = get_refusal_chain()
+tailor_chain = get_tailor_chain()
+cleaner_chain = get_cleaner_chain()
+
+wellness_csv = "AIChatbot.csv"
+brand_csv = "BrandAI.csv"
+wellness_store_dir = "faiss_wellness_store"
+brand_store_dir = "faiss_brand_store"
+
+wellness_vectorstore = build_or_load_vectorstore(wellness_csv, wellness_store_dir)
+brand_vectorstore = build_or_load_vectorstore(brand_csv, brand_store_dir)
+
+gemini_llm = LiteLLMModel(model_id="gemini/gemini-pro", api_key=os.environ.get("GEMINI_API_KEY"))
+wellness_rag_chain = build_rag_chain(gemini_llm, wellness_vectorstore)
+brand_rag_chain = build_rag_chain(gemini_llm, brand_vectorstore)