Spaces:

youdata-ai
/

rites-pdf

Sleeping

App Files Files Community

akshansh36 committed on Feb 13, 2025

Commit

7d4344b

verified ·

1 Parent(s): 2311dd1

Upload 3 files

Browse files

Files changed (3) hide show

app.py +187 -0
requirements.txt +0 -0
tools.py +73 -0

app.py ADDED Viewed

	@@ -0,0 +1,187 @@

+import logging
+import os
+from datetime import datetime
+
+import streamlit as st
+import streamlit_chat
+from bson import ObjectId
+from dotenv import load_dotenv
+from langchain.schema import HumanMessage, AIMessage
+from langchain_openai import ChatOpenAI
+from langgraph.prebuilt import create_react_agent
+from pymongo import MongoClient
+from pytz import timezone, utc
+
+from tools import get_context
+# Load variables from a local .env file into the environment before they are read below.
+load_dotenv()
+# Streamlit page settings: wide layout, browser tab title and icon.
+st.set_page_config(layout="wide", page_title="RITES Bot", page_icon="📄")
+# Credentials are read from the environment; os.getenv returns None when a variable is unset.
+OPENAI_KEY = os.getenv("OPENAI_API_KEY")
+MONGO_URI = os.getenv("MONGO_URI")
+# Chat model handed to the agent below.
+model = ChatOpenAI(
+    model="gpt-4o-mini",
+    temperature=0,
+    openai_api_key=OPENAI_KEY,
+    streaming=True
+)
+# MongoDB handles: chat sessions live in the "rites_pdf_chat" collection of the "rites" database.
+client = MongoClient(MONGO_URI)
+db = client["rites"]
+chat_sessions = db["rites_pdf_chat"]
+# Tools the agent may call; get_context is imported from the tools module.
+tools = [get_context]
+# System prompt for the agent. It is passed to create_react_agent below, so this is
+# runtime text: editing any wording here changes what the model is instructed to do.
+system_prompt = """
+You are an AI-powered assistant for the RITES website, providing users with accurate and relevant information sourced from official PDF documents.
+- These documents include job openings, annual returns, financial reports, approved vendor lists, banned vendor lists, and press releases.
+- To answer the user query you will be provided a get_context tool, which allows you to retrieve data chunks from the relevant documents based on user query.
+Follow these instructions carefully:
+1. **Tool Usage**
+   - You can use this tool as needed to fetch information from the knowledgebase.
+2. **History Utilization**:
+   - You will be provided with conversation history to track context. If the user’s question relates to prior responses, try to answer from memory without invoking the search tool.
+   - If additional information is required, reformulate the query to be self-contained before invoking the search tool again.
+3. **General Messages and Salutations**:
+   - If the user says "Hi," "Hello," "How are you?" or similar, respond conversationally without invoking the search tool.
+4. **Unrelated Questions**:
+   - If a user asks something outside the scope of RITES (e.g., sports, movies, general trivia), politely decline by saying:
+   "I can assist you with information related to RITES, such as job openings, financial reports, and vendor details. Let me know how I can help."
+5. **Response Formation**:
+   - Each retrieved chunk will have a PDF URL associated with it; you must cite that PDF URL if you use any information from it.
+   - Each retrieved chunk will also have a start_page and end_page indicating the span of pages containing the information. Cite these page numbers if used.
+   - Do not cite the same URL or page number multiple times; combine the citations at the end.
+   - If using multiple PDFs, provide the information separately for each, with clear citations.
+   - Respond in a friendly, well-formatted manner without mentioning internal terms like "chunk" or "chunk number."
+6. **Clear and Complete Responses**:
+   - Provide clear explanations with all relevant details. Never omit important information.
+   - If the user query cannot be answered from the available data, politely ask for clarification.
+## List of tools available
+1. 'get_context'
+"""
+# Build the agent from the model, the tool list and the system prompt (supplied as state_modifier).
+agent_executor = create_react_agent(model, tools, state_modifier=system_prompt)
+# Initialize session state variables if not present
+if 'current_chat_id' not in st.session_state:
+    st.session_state['current_chat_id'] = None
+if 'chat_history' not in st.session_state:
+    st.session_state['chat_history'] = []  # Now a list of message dicts with "role" and "content"
+# Function to create a new chat session in MongoDB
+def create_new_chat_session():
+    # Get the current time in IST
+    ind_time = datetime.now(timezone("Asia/Kolkata"))
+    # Convert IST time to UTC for storing in MongoDB
+    utc_time = ind_time.astimezone(utc)
+    new_session = {
+        "created_at": utc_time,  # Store in UTC
+        "messages": []  # Initially empty
+    }
+    session_id = chat_sessions.insert_one(new_session).inserted_id
+    return str(session_id)
+# Function to load a chat session by MongoDB ID (loads full history for display)
+def load_chat_session(session_id):
+    session = chat_sessions.find_one({"_id": ObjectId(session_id)})
+    if session:
+        st.session_state['chat_history'] = session.get('messages', [])
+# Function to update a chat session in MongoDB by appending new messages
+def update_chat_session(session_id, new_messages):
+    """
+    Append new messages to the chat session.
+    Args:
+        session_id (str): The MongoDB session ID.
+        new_messages (list): A list of message dictionaries, each with keys "role" and "content".
+    """
+    chat_sessions.update_one(
+        {"_id": ObjectId(session_id)},
+        {"$push": {"messages": {"$each": new_messages}}}
+    )
+# Sidebar: Chat sessions management
+st.sidebar.header("Chat Sessions")
+# Button to create a new chat session
+if st.sidebar.button("New Chat"):
+    new_chat_id = create_new_chat_session()
+    st.session_state['current_chat_id'] = new_chat_id
+    st.session_state['chat_history'] = []
+# List existing chat sessions with delete option
+existing_sessions = chat_sessions.find().sort("created_at", -1)
+for session in existing_sessions:
+    session_id = str(session['_id'])
+    # Convert stored UTC time to IST for display
+    utc_time = session['created_at']
+    ist_time = utc_time.replace(tzinfo=utc).astimezone(timezone("Asia/Kolkata"))
+    session_date = ist_time.strftime("%Y-%m-%d %H:%M:%S")
+    col1, col2 = st.sidebar.columns([8, 1])
+    with col1:
+        if st.button(f"Session {session_date}", key=session_id):
+            st.session_state['current_chat_id'] = session_id
+            load_chat_session(session_id)
+    with col2:
+        if st.button("🗑️", key=f"delete_{session_id}"):
+            chat_sessions.delete_one({"_id": ObjectId(session_id)})
+            st.rerun()  # Refresh to update the sidebar
+# Main Chat Interface
+st.markdown('<div class="fixed-header"><h1>Welcome To "RITES" Chatbot</h1></div>', unsafe_allow_html=True)
+st.markdown("<hr>", unsafe_allow_html=True)
+# Input box for the user question
+user_question = st.chat_input("Ask a Question related to RITES PDFs")
+if user_question:
+    # Create a new session if none exists
+    if not st.session_state['current_chat_id']:
+        new_chat_id = create_new_chat_session()
+        st.session_state['current_chat_id'] = new_chat_id
+    with st.spinner("Please wait, I am thinking!!"):
+        # Append the new user message to the full history
+        user_message = {"role": "user", "content": user_question}
+        st.session_state['chat_history'].append(user_message)
+        # Prepare the last 5 messages for the agent input
+        recent_messages = st.session_state['chat_history'][-5:]
+        messages = []
+        for msg in recent_messages:
+            if msg["role"] == "user":
+                messages.append(HumanMessage(content=msg["content"]))
+            else:
+                messages.append(AIMessage(content=msg["content"]))
+        inputs = {"messages": messages}
+        response = agent_executor.invoke(inputs)
+        if response:
+            reply = response["messages"][-1].content
+            assistant_message = {"role": "assistant", "content": reply}
+            st.session_state['chat_history'].append(assistant_message)
+            # Update MongoDB with both the user and assistant messages
+            if st.session_state['current_chat_id']:
+                update_chat_session(
+                    st.session_state['current_chat_id'],
+                    [user_message, assistant_message]
+                )
+        else:
+            st.error("Error processing your request, please try again later.")
+# Display the last 15 messages in the UI
+for i, msg in enumerate(st.session_state['chat_history'][-15:]):
+    if msg["role"] == "user":
+        streamlit_chat.message(msg["content"], is_user=True, key=f"chat_message_user_{i}")
+    else:
+        streamlit_chat.message(msg["content"], is_user=False, key=f"chat_message_assistant_{i}")

requirements.txt ADDED Viewed

Binary file (4.25 kB). View file

tools.py ADDED Viewed

	@@ -0,0 +1,73 @@

+from langchain_core.tools import tool
+import pinecone
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+import os
+from dotenv import load_dotenv
+# Load variables from a local .env file into the environment before the keys are read below.
+load_dotenv()
+# Note: the constant is named GOOGLE_API_KEY but it is read from the GEMINI_API_KEY variable.
+GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
+PINECONE_API = os.getenv("PINECONE_API_KEY")
+# Embedding client that get_context uses to vectorise the search query.
+google_embeddings = GoogleGenerativeAIEmbeddings(
+    model="models/embedding-001",  # Correct model name
+    google_api_key=GOOGLE_API_KEY
+)
+# Pinecone client and a handle to the index that get_context queries.
+pc = pinecone.Pinecone(
+    api_key=PINECONE_API
+)
+PINECONE_INDEX = "rites-pdf"
+index = pc.Index(PINECONE_INDEX)
+@tool
+def get_context(query: str) -> str:
+    """
+    Retrieve context information by performing a semantic search on indexed document chunks.
+    This tool embeds the provided user query using a Google Generative AI embeddings model,
+    then queries a Pinecone index to fetch the top 10 matching document chunks. Each match
+    includes metadata such as the text chunk, starting page, ending page, and the source PDF URL.
+    The function aggregates these details into a formatted string.
+    Args:
+        query (str): A user query search string used for semantic matching against the document index.
+    Returns:
+        str: A formatted string containing the matched document chunks along with their associated metadata,
+             including start page, end page, and PDF URL.
+    """
+    embedding = google_embeddings.embed_query(query)
+    search_results = index.query(
+        vector=embedding,
+        top_k=10,  # Retrieve top 10 results
+        include_metadata=True
+    )
+    context = " "
+    count = 1
+    for match in search_results["matches"]:
+        chunk = match["metadata"].get("chunk")
+        url = match["metadata"].get("pdf_url")
+        start_page = match["metadata"].get("start_page")
+        end_page = match["metadata"].get("end_page")
+        context += f"""
+        Chunk {count}:
+        {chunk}
+        start_page: {start_page}
+        end_page: {end_page}
+        pdf_url: {url}
+        #########################################
+        """
+        count += 1
+    return context