Spaces:

can-org
/

Testing-AI-Contain

Sleeping

App Files Files Community

Sangyog10 committed on Aug 4, 2025

Commit

273204e

1 Parent(s): 7ce4837

set up rag pipeline for chatbot

Browse files

Files changed (4) hide show

README.md +3 -1
app.py +3 -0
features/rag_chatbot/rag_pipeline.py +40 -151
requirements.txt +8 -0

README.md CHANGED Viewed

@@ -131,7 +131,9 @@ AI-Checker/
 2. **Run the API**
    ```bash
-   uvicorn app:app --reload
    ```
 3. **Build Docker (optional)**

 2. **Run the API**
    ```bash
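+   chroma run --path ./chroma_database  # start the ChromaDB server locally first (the RAG pipeline connects to it on port 8000)
+   uvicorn app:app --reload --port 8001  # then start the FastAPI app on port 8001 so it does not clash with ChromaDB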
    ```
 3. **Build Docker (optional)**

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ from features.nepali_text_classifier.routes import (
 )
 from features.image_classifier.routes import router as image_classifier_router
 from features.image_edit_detector.routes import router as image_edit_detector_router
 from fastapi.staticfiles import StaticFiles
 from config import ACCESS_RATE
@@ -41,6 +42,8 @@ app.include_router(text_classifier_router, prefix="/text")
 app.include_router(nepali_text_classifier_router, prefix="/NP")
 app.include_router(image_classifier_router, prefix="/AI-image")
 app.include_router(image_edit_detector_router, prefix="/detect")
 @app.get("/")

 )
 from features.image_classifier.routes import router as image_classifier_router
 from features.image_edit_detector.routes import router as image_edit_detector_router
+from features.rag_chatbot.routes import router as rag_router
 from fastapi.staticfiles import StaticFiles
 from config import ACCESS_RATE
 app.include_router(nepali_text_classifier_router, prefix="/NP")
 app.include_router(image_classifier_router, prefix="/AI-image")
 app.include_router(image_edit_detector_router, prefix="/detect")
+app.include_router(rag_router, prefix="/rag")
 @app.get("/")

features/rag_chatbot/rag_pipeline.py CHANGED Viewed

@@ -3,99 +3,38 @@ import chromadb
 from dotenv import load_dotenv
 from langchain_core.documents import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_community.llms import OpenAI
 from langchain.chains.question_answering import load_qa_chain
 from langchain_community.vectorstores import Chroma
 from langchain.chains import LLMChain
 from langchain.prompts import PromptTemplate
-from langchain.chat_models import ChatOpenAI
 load_dotenv()
-# ChromaDB configuration
-CHROMA_HOST = os.getenv("CHROMA_HOST", "localhost") # change in env in production when hosted
 COLLECTION_NAME = "company_docs_collection"
-# LLM Provider Configuration
-LLM_PROVIDER = os.getenv("LLM_PROVIDER", "openai").lower()
-LLM_API_KEY = os.getenv("LLM_API_KEY")
-LLM_MODEL = os.getenv("LLM_MODEL", "gpt-3.5-turbo")
-LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0"))
-LLM_MAX_TOKENS = int(os.getenv("LLM_MAX_TOKENS", "2048"))
-# Provider-specific configurations
-PROVIDER_CONFIGS = {
-    "openai": {
-        "api_base": "https://api.openai.com/v1",
-        "default_model": "gpt-3.5-turbo"
-    },
-    "groq": {
-        "api_base": "https://api.groq.com/openai/v1",
-        "default_model": "llama-3.3-70b-versatile"
-    },
-    "openrouter": {
-        "api_base": "https://openrouter.ai/api/v1",
-        "default_model": "mistralai/mistral-small-3.2-24b-instruct:free"
-    }
-}
 vector_store = None
 company_qa_chain = None
 query_router_chain = None
 cybersecurity_chain = None
-llm = None
-def get_llm_config():
-    """Get the appropriate LLM configuration based on the provider."""
-    if LLM_PROVIDER not in PROVIDER_CONFIGS:
-        raise ValueError(f"Unsupported LLM provider: {LLM_PROVIDER}. Supported: {list(PROVIDER_CONFIGS.keys())}")
-    config = PROVIDER_CONFIGS[LLM_PROVIDER].copy()
-    # Use provided model or fall back to default
-    model = LLM_MODEL if LLM_MODEL != "gpt-3.5-turbo" else config["default_model"]
-    return {
-        "model": model,
-        "openai_api_key": LLM_API_KEY,
-        "openai_api_base": config["api_base"],
-        "temperature": LLM_TEMPERATURE,
-        "max_tokens": LLM_MAX_TOKENS,
-    }
-def initialize_llm():
-    """Initialize the LLM based on the configured provider."""
-    if not LLM_API_KEY:
-        raise ValueError(f"LLM_API_KEY environment variable is required for {LLM_PROVIDER}")
-    config = get_llm_config()
-    print(f"Initializing {LLM_PROVIDER.upper()} with model: {config['model']}")
-    return ChatOpenAI(**config)
 def initialize_pipelines():
     """Initializes all required models, chains, and the vector store."""
     global vector_store, company_qa_chain, query_router_chain, cybersecurity_chain, llm
     try:
-        # Initialize LLM
-        llm = initialize_llm()
-        # Initialize embeddings
-        embeddings = HuggingFaceEmbeddings(
-            model_name="all-MiniLM-L6-v2",
-            model_kwargs={'device': 'cpu'},
-            encode_kwargs={'normalize_embeddings': True}
-        )
         # Initialize ChromaDB client
         try:
             chroma_client = chromadb.HttpClient(host=CHROMA_HOST, port=8000)
-            chroma_client.heartbeat()
         except Exception as e:
             raise ConnectionError("Failed to connect to ChromaDB.") from e
         # Initialize vector store
@@ -106,14 +45,16 @@ def initialize_pipelines():
         )
         # Query Router Chain
-        router_template = """You are a query classifier. Classify the following query into one of these categories:
-- COMPANY: Questions about our company, its products, services, or general information
-- CYBERSECURITY: Questions about cybersecurity, security threats, best practices, or vulnerabilities
-- OFF_TOPIC: Questions that don't fit the above categories
-Query: {query}
-Respond with only the category name (COMPANY, CYBERSECURITY, or OFF_TOPIC):"""
         router_prompt = PromptTemplate(
             input_variables=["query"],
@@ -125,34 +66,17 @@ Respond with only the category name (COMPANY, CYBERSECURITY, or OFF_TOPIC):"""
             prompt=router_prompt
         )
-        # Custom Company QA Chain
-        company_qa_template = """You are a helpful assistant for CyberAlertNepal. Answer the following question about our company using the information provided and links if only available. Give a natural, direct and polite response.
-Question: {question}
-Information:
-{context}
-Answer:"""
-        company_qa_prompt = PromptTemplate(
-            input_variables=["question", "context"],
-            template=company_qa_template
-        )
-        company_qa_chain = LLMChain(
-            llm=llm,
-            prompt=company_qa_prompt
-        )
-        # Cybersecurity Chain
-        cybersecurity_template = """You are a cybersecurity professional. Answer the following question truthfully and concisely.
-If you are not 100% sure about the answer, simply respond with: "I am not sure about the answer."
-Do not add extra explanations or assumptions. Do not provide false or speculative information.
-Question: {question}
-Provide a comprehensive and accurate answer about cybersecurity:"""
         cybersecurity_prompt = PromptTemplate(
             input_variables=["question"],
@@ -164,8 +88,8 @@ Provide a comprehensive and accurate answer about cybersecurity:"""
             prompt=cybersecurity_prompt
         )
-        print(f"Successfully initialized pipelines with {LLM_PROVIDER.upper()}")
     except Exception as e:
         print(f"Error initializing pipelines: {e}")
         raise
@@ -188,6 +112,7 @@ def add_document_to_rag(text: str, metadata: dict):
             print("Document was empty after splitting, not adding to ChromaDB.")
             return False
         vector_store.add_documents(docs)
         print("Successfully added documents.")
         return True
@@ -208,6 +133,7 @@ def route_and_process_query(query: str):
         route_result = query_router_chain.run(query)
         route = route_result.strip().upper()
         # 2. Route to appropriate logic
         if "CYBERSECURITY" in route:
@@ -215,47 +141,38 @@ def route_and_process_query(query: str):
             return {
                 "answer": answer,
                 "source": "Cybersecurity Knowledge Base",
-                "route": "CYBERSECURITY",
-                "provider": LLM_PROVIDER.upper(),
-                "model": get_llm_config()["model"]
             }
         elif "COMPANY" in route:
             # Perform similarity search on ChromaDB
             docs = vector_store.similarity_search(query, k=3)
             if not docs:
                 return {
                     "answer": "I could not find any relevant information to answer your question.",
                     "source": "Company Documents",
-                    "route": "COMPANY",
-                    "provider": LLM_PROVIDER.upper(),
-                    "model": get_llm_config()["model"]
                 }
-            # Combine document content for context
-            context = "\n\n".join([doc.page_content for doc in docs])
-            # Run the custom QA chain
-            answer = company_qa_chain.run(question=query, context=context)
             sources = list(set([doc.metadata.get("source", "Unknown") for doc in docs]))
             return {
                 "answer": answer,
                 "source": "Company Documents",
                 "documents": sources,
-                "route": "COMPANY",
-                "provider": LLM_PROVIDER.upper(),
-                "model": get_llm_config()["model"]
             }
         else:  # OFF_TOPIC
             return {
                 "answer": "I am a specialized assistant of CyberAlertNepal. I cannot answer questions outside of cybersecurity topics.",
                 "source": "N/A",
-                "route": "OFF_TOPIC",
-                "provider": LLM_PROVIDER.upper(),
-                "model": get_llm_config()["model"]
             }
     except Exception as e:
@@ -263,9 +180,6 @@ def route_and_process_query(query: str):
         return {
             "answer": "I encountered an error while processing your query. Please try again.",
             "source": "Error",
-            "route": None,
-            "documents": None,
-            "provider": LLM_PROVIDER.upper(),
             "error": str(e)
         }
@@ -281,42 +195,17 @@ def check_system_health():
             "vector_store": vector_store is not None,
             "company_qa_chain": company_qa_chain is not None,
             "query_router_chain": query_router_chain is not None,
-            "cybersecurity_chain": cybersecurity_chain is not None,
-            "llm": llm is not None
         }
         return {
             "status": "healthy" if all(components.values()) else "unhealthy",
-            "components": components,
-            "provider": LLM_PROVIDER.upper(),
-            "model": get_llm_config()["model"] if llm else "Not initialized"
         }
     except Exception as e:
         return {
             "status": "unhealthy",
-            "error": str(e),
-            "provider": LLM_PROVIDER.upper()
-        }
-def test_llm_connection():
-    """Test the LLM API connection."""
-    try:
-        if not llm:
-            initialize_pipelines()
-        # Simple test query
-        test_response = llm("Say 'Hello, LLM is working!'")
-        return {
-            "success": True,
-            "provider": LLM_PROVIDER.upper(),
-            "model": get_llm_config()["model"],
-            "response": str(test_response)
-        }
-    except Exception as e:
-        return {
-            "success": False,
-            "provider": LLM_PROVIDER.upper(),
             "error": str(e)
         }

 from dotenv import load_dotenv
 from langchain_core.documents import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_openai import OpenAIEmbeddings, OpenAI
 from langchain.chains.question_answering import load_qa_chain
 from langchain_community.vectorstores import Chroma
 from langchain.chains import LLMChain
 from langchain.prompts import PromptTemplate
 load_dotenv()
+CHROMA_HOST = os.getenv("CHROMA_HOST", "localhost")
 COLLECTION_NAME = "company_docs_collection"
 vector_store = None
 company_qa_chain = None
 query_router_chain = None
 cybersecurity_chain = None
+llm = OpenAI(temperature=0)
 def initialize_pipelines():
     """Initializes all required models, chains, and the vector store."""
     global vector_store, company_qa_chain, query_router_chain, cybersecurity_chain, llm
     try:
+        embeddings = OpenAIEmbeddings()
         # Initialize ChromaDB client
         try:
             chroma_client = chromadb.HttpClient(host=CHROMA_HOST, port=8000)
+            chroma_client.heartbeat() # Heartbeat check to confirm the connection
+            print("Successfully connected to ChromaDB.")
         except Exception as e:
+            print(f"FATAL: Could not connect to ChromaDB at {CHROMA_HOST}:8000. Please ensure the ChromaDB server is running.")
+            print(f"Error details: {e}")
             raise ConnectionError("Failed to connect to ChromaDB.") from e
         # Initialize vector store
         )
         # Query Router Chain
+        router_template = """
+        You are a query classifier. Classify the following query into one of these categories:
+        - COMPANY: Questions about company policies, procedures, documents, or internal information
+        - CYBERSECURITY: Questions about cybersecurity, security threats, best practices, or vulnerabilities
+        - OFF_TOPIC: Questions that don't fit the above categories
+        Query: {query}
+        Respond with only the category name (COMPANY, CYBERSECURITY, or OFF_TOPIC):
+        """
         router_prompt = PromptTemplate(
             input_variables=["query"],
             prompt=router_prompt
         )
+        # Company QA Chain
+        company_qa_chain = load_qa_chain(llm, chain_type="stuff")
+        # Cybersecurity Chain
+        cybersecurity_template = """
+        You are a cybersecurity expert. Answer the following cybersecurity question based on your knowledge:
+        Question: {question}
+        Provide a comprehensive and accurate answer about cybersecurity:
+        """
         cybersecurity_prompt = PromptTemplate(
             input_variables=["question"],
             prompt=cybersecurity_prompt
         )
+        print("All pipelines initialized successfully!")
     except Exception as e:
         print(f"Error initializing pipelines: {e}")
         raise
             print("Document was empty after splitting, not adding to ChromaDB.")
             return False
+        print(f"Adding {len(docs)} document chunks to ChromaDB...")
         vector_store.add_documents(docs)
         print("Successfully added documents.")
         return True
         route_result = query_router_chain.run(query)
         route = route_result.strip().upper()
+        print(f"Query routed to: {route}")
         # 2. Route to appropriate logic
         if "CYBERSECURITY" in route:
             return {
                 "answer": answer,
                 "source": "Cybersecurity Knowledge Base",
+                "route": "CYBERSECURITY"
             }
         elif "COMPANY" in route:
             # Perform similarity search on ChromaDB
             docs = vector_store.similarity_search(query, k=3)
+            print(f"Found {len(docs)} relevant documents.")
+            print(f"Documents: {[doc.metadata.get('source', 'Unknown') for doc in docs]}")
             if not docs:
                 return {
                     "answer": "I could not find any relevant information to answer your question.",
                     "source": "Company Documents",
+                    "route": "COMPANY"
                 }
+            # Run the QA chain
+            answer = company_qa_chain.run(input_documents=docs, question=query)
             sources = list(set([doc.metadata.get("source", "Unknown") for doc in docs]))
             return {
                 "answer": answer,
                 "source": "Company Documents",
                 "documents": sources,
+                "route": "COMPANY"
             }
         else:  # OFF_TOPIC
             return {
                 "answer": "I am a specialized assistant of CyberAlertNepal. I cannot answer questions outside of cybersecurity topics.",
                 "source": "N/A",
+                "route": "OFF_TOPIC"
             }
     except Exception as e:
         return {
             "answer": "I encountered an error while processing your query. Please try again.",
             "source": "Error",
             "error": str(e)
         }
             "vector_store": vector_store is not None,
             "company_qa_chain": company_qa_chain is not None,
             "query_router_chain": query_router_chain is not None,
+            "cybersecurity_chain": cybersecurity_chain is not None
         }
         return {
             "status": "healthy" if all(components.values()) else "unhealthy",
+            "components": components
         }
     except Exception as e:
         return {
             "status": "unhealthy",
             "error": str(e)
         }

requirements.txt CHANGED Viewed

@@ -21,3 +21,11 @@ tools
 pandas
 requests
 beautifulsoup4

 pandas
 requests
 beautifulsoup4
+langchain
+langchain-community
+langchain-openai
+faiss-cpu
+PyPDF2
+tiktoken
+chromadb
+langchain_chroma