"""Streamlit hybrid-search RAG app.

Retrieves document chunks from a Pinecone hybrid index (dense
SentenceTransformer embeddings + sparse BM25 vectors), reranks candidates
with a cross-encoder, and answers user questions with a Groq-hosted Llama
model grounded only in the retrieved chunks.
"""

import os
import pickle
import time
import uuid  # NOTE(review): currently unused here; kept in case other code relies on it

import markdown2
import nltk
import streamlit as st
from groq import Groq
from pinecone import Pinecone
# BM25Encoder must be importable so the pickled encoder below can be unpickled.
from pinecone_text.sparse import BM25Encoder
from sentence_transformers import CrossEncoder, SentenceTransformer

# BM25 / sentence tokenization depends on NLTK's punkt models.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
HF_TOKEN = os.environ.get("HF_TOKEN")

# Single source of truth for the generation model.
# (Bug fix: the sidebar previously advertised "Llama-3.1-70B" while the code
# actually called llama-3.3-70b-versatile; both now read this constant.)
GROQ_MODEL = "llama-3.3-70b-versatile"

# -------------------------------
# Page Configuration
# -------------------------------
st.set_page_config(
    page_title="AI Document Search & Chat",
    page_icon="🔍",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS for modern styling (currently empty placeholder).
st.markdown("""
""", unsafe_allow_html=True)


# -------------------------------
# Load models with better caching
# -------------------------------
@st.cache_resource(show_spinner=False)
def load_models():
    """Load and cache the embedding model and the cross-encoder reranker.

    Returns:
        tuple[SentenceTransformer, CrossEncoder]: (embedding model, reranker).
    """
    with st.spinner("🤖 Loading AI models..."):
        embed_model = SentenceTransformer(
            "google/embeddinggemma-300m",
            token=HF_TOKEN,
        )
        reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        return embed_model, reranker


@st.cache_resource(show_spinner=False)
def initialize_pinecone():
    """Connect to Pinecone and return a handle to the hybrid index."""
    pc = Pinecone(api_key=PINECONE_API_KEY)
    return pc.Index("rag-latest")


@st.cache_resource(show_spinner=False)
def initialize_bm25():
    """Load the pre-fitted BM25 sparse encoder from disk.

    NOTE(review): pickle.load is only safe because this is a repo-local
    artifact; never point this at untrusted input.
    """
    with open("src/bm25_model.pkl", "rb") as f:
        return pickle.load(f)


@st.cache_resource(show_spinner=False)
def initialize_groq():
    """Create and cache the Groq API client."""
    return Groq(api_key=GROQ_API_KEY)


# Initialize models and services (all cached across reruns).
with st.spinner("🚀 Initializing AI services..."):
    embed_model, reranker = load_models()
    index = initialize_pinecone()
    bm25 = initialize_bm25()
    groq_client = initialize_groq()

# Initialize session state. chat_mode=True means "ask the LLM"; False means
# raw retrieval only (the toggle itself is not currently rendered).
if "chat_mode" not in st.session_state:
    st.session_state.chat_mode = True


# -------------------------------
# Helper Functions
# -------------------------------
def search_documents(query, filter_dict, top_k, rerank_top_n=3):
    """Hybrid-search Pinecone for *query* and rerank with the cross-encoder.

    Args:
        query: Natural-language user query.
        filter_dict: Pinecone metadata filter (may be empty).
        top_k: Number of candidates to retrieve from Pinecone.
            (Bug fix: this parameter was previously ignored in favor of a
            hard-coded top_k=10.)
        rerank_top_n: How many reranked matches to return (default 3,
            matching the original behavior).

    Returns:
        Up to *rerank_top_n* matches sorted by descending cross-encoder
        score, or an empty list when nothing matched.
    """
    dense_query = embed_model.encode(query).tolist()
    sparse_query = bm25.encode_queries([query])[0]

    # Query Pinecone with both dense and sparse vectors (hybrid search).
    res = index.query(
        vector=dense_query,
        sparse_vector=sparse_query,
        top_k=top_k,
        include_metadata=True,
        hybrid=True,
        filter=filter_dict,
    )
    candidates = res["matches"]
    if not candidates:
        return []

    # Rerank: score each (query, chunk-text) pair with the cross-encoder.
    pairs = [(query, m["metadata"].get("text", "")) for m in candidates]
    scores = reranker.predict(pairs)
    for match, score in zip(candidates, scores):
        match["rerank_score"] = float(score)
    reranked = sorted(candidates, key=lambda x: x["rerank_score"], reverse=True)
    return reranked[:rerank_top_n]


def generate_ai_response(query, relevant_docs):
    """Answer *query* with the Groq LLM, grounded only in *relevant_docs*.

    Args:
        query: The user's question.
        relevant_docs: Reranked Pinecone matches from search_documents().

    Returns:
        Markdown answer string, or an error message prefixed with "❌".
    """
    if not relevant_docs:
        # Bug fix: previously the LLM was called with an empty context;
        # short-circuit with the canonical "not found" answer instead.
        return "The document does not provide enough information to answer this question."

    # Prepare context from relevant documents.
    context_parts = []
    sources = []  # collected for potential UI use; currently unused
    for i, doc in enumerate(relevant_docs, 1):
        metadata = doc["metadata"]
        text = metadata.get("text")
        doc_id = metadata.get("doc_id")
        title = metadata.get("title")
        fiscal_year = metadata.get("fiscal_year")
        page_no = metadata.get("page_no")

        # Context chunk for the LLM, tagged so it can cite [DOC i].
        context_parts.append(
            f"[CHUNK {i} DOC {doc_id} {title} fiscal year {fiscal_year} ] (Page {page_no})\n{text}"
        )
        sources.append({
            "id": i,
            "title": title,
            "page": page_no,
            "doc_type": metadata.get("doc_type", ""),
        })

    context = "\n\n".join(context_parts)

    # User-turn prompt: question plus the retrieved evidence.
    prompt = f"""
You will answer the question using ONLY the provided document excerpts.
When you use information from a document, cite it with the format [DOC i], where i corresponds to the document number given in CONTEXT DOCUMENTS. If multiple docs are relevant, cite all of them (e.g., [DOC 1][DOC 3]).

CONTEXT DOCUMENTS:
{context}

USER QUESTION: {query}

ANSWER:
"""

    try:
        # Call Groq API (non-streaming).
        chat_completion = groq_client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    # Bug fix: corrected "retrun" typo and the 5 -> 7
                    # numbering gap in the original instructions.
                    "content": """You are a professional assistant that answers user questions based **only on the content of provided document excerpts**. The user will ask a question, and you will also receive related text chunks retrieved from company documents or PDFs.

Instructions:
1. Use **only** the retrieved chunks to answer the user's question. Do **not** add information from memory or outside sources.
2. If multiple chunks provide relevant info, combine them into a **clear, concise answer**.
3. If the answer is **not found** in the chunks, respond exactly with: "The document does not provide enough information to answer this question."
4. Keep the style **professional, factual, and concise**.
5. Return the response in markdown format.
6. Refuse to answer or speculate if no reliable evidence is found in the chunks.
""",
                },
                {"role": "user", "content": prompt},
            ],
            model=GROQ_MODEL,
            stream=False,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        # Surface API failures in the UI rather than crashing the app.
        return f"❌ Error generating AI response: {str(e)}"


# -------------------------------
# Header
# -------------------------------
st.markdown(
    """
<div style="text-align: center;">
  <h1>Hybrid Search RAG</h1>
  <p>Using Groq LLM, Pinecone, and Sentence Transformers</p>
</div>
""",
    unsafe_allow_html=True,
)


# -------------------------------
# Sidebar for filters and mode toggle
# -------------------------------
def clear_all_filters():
    """Reset the query box and every filter widget via session state.

    Bug fix: the contract-report filter keys (the widgets that actually
    exist) were previously left uncleared; the legacy annual-report keys
    are kept for backward compatibility even though their widgets are not
    currently rendered.
    """
    for key in (
        # Common
        "search_query",
        "page_no_filter",
        # Legacy annual-report filters (widgets not currently rendered)
        "company_filter",
        "fiscal_year_filter",
        "currency_filter",
        "unit_filter",
        # Contract-report filters
        "agreement_date_filter",
        "promoter_filter",
        "allottee_filter",
        "project_name_filter",
        "apartment_block_filter",
        "apartment_floor_filter",
        "apartment_type_filter",
        "jurisdiction_filter",
    ):
        st.session_state[key] = ""


with st.sidebar:
    st.markdown("### 🎯 Search Filters")

    # annual_report was removed; contract_report is the only option for now.
    doc_type = st.selectbox(
        "Document Type",
        ["contract_report"],
        key="doc_type_filter",
    )

    # Contract Report filters
    if doc_type == "contract_report":
        with st.expander("Contract Report Filters", expanded=False):
            agreement_date = st.text_input("Agreement Date", placeholder="YYYY-MM-DD", key="agreement_date_filter")
            promoter = st.text_input("Promoter / Developer", placeholder="Enter promoter name...", key="promoter_filter")
            allottee = st.text_input("Allottee (Buyer)", placeholder="Enter allottee name...", key="allottee_filter")
            project_name = st.text_input("Project Name", placeholder="Enter project name...", key="project_name_filter")
            apartment_block = st.text_input("Block", placeholder="e.g., Tower A", key="apartment_block_filter")
            apartment_floor = st.text_input("Floor", placeholder="e.g., 10th floor", key="apartment_floor_filter")
            apartment_type = st.text_input("Apartment Type", placeholder="e.g., 2BHK", key="apartment_type_filter")
            jurisdiction = st.text_input("Jurisdiction", placeholder="e.g., Madras High Court", key="jurisdiction_filter")

    page_no = st.text_input("Page Number", placeholder="e.g., 15", key="page_no_filter")

    # Reset button (callback runs before the next rerun, so widgets refresh).
    st.button("Clear All Filters", on_click=clear_all_filters)

    # Model info
    st.markdown("---")
    st.markdown("### ℹ️ Model Info")
    st.info(
        "**Embedding**: Google EmbeddingGemma-300M\n"
        "**Reranker**: MS-MARCO MiniLM-L-6-v2\n"
        f"**LLM**: Groq {GROQ_MODEL}"
    )

# -------------------------------
# Main search interface
# -------------------------------
col1, col2 = st.columns([3, 1])

with col1:
    if st.session_state.chat_mode:
        query = st.text_input(
            "💬 Ask a question about your documents",
            placeholder="What would you like to know from the documents?",
            label_visibility="collapsed",
            key="search_query",
        )
    else:
        query = st.text_input(
            "🔍 Search Query",
            placeholder="What would you like to find in the documents?",
            label_visibility="collapsed",
            key="search_query",
        )

with col2:
    if st.session_state.chat_mode:
        search_clicked = st.button("💬 Ask AI", type="primary")
    else:
        search_clicked = st.button("🚀 Search", type="primary")

# -------------------------------
# Search functionality
# -------------------------------
if search_clicked or (query and len(query.strip()) > 0):
    if not query.strip():
        st.warning("⚠️ Please enter a search query to continue.")
    else:
        # Build the Pinecone metadata filter from sidebar inputs.
        filter_dict = {}

        # Common filters
        if doc_type and doc_type != "All Types":
            filter_dict["doc_type"] = {"$eq": doc_type}

        if page_no and page_no.strip():
            try:
                filter_dict["page_no"] = {"$eq": int(page_no.strip())}
            except ValueError:
                st.error("⚠️ Page number must be a valid integer.")
                st.stop()

        # Contract Report filters: map each non-empty sidebar field to its
        # Pinecone metadata key (replaces the repetitive if-chain).
        if doc_type == "contract_report":
            contract_filters = {
                "agreement_date": agreement_date,
                "promoter_legal_name": promoter,
                "allottee_name": allottee,
                "project_name": project_name,
                "apartment_block": apartment_block,
                "apartment_floor": apartment_floor,
                "apartment_type": apartment_type,
                "jurisdiction": jurisdiction,
            }
            for field, value in contract_filters.items():
                if value and value.strip():
                    filter_dict[field] = {"$eq": value.strip()}

        # Perform search with progress indicators.
        start_time = time.time()
        with st.spinner("🔍 Searching through documents..."):
            relevant_docs = search_documents(query, filter_dict, top_k=5)

        # Generate AI response if in chat mode.
        if st.session_state.chat_mode:
            with st.spinner("🤖 Generating AI response..."):
                ai_response = generate_ai_response(query, relevant_docs)

            # Render the markdown answer as HTML.
            st.markdown(markdown2.markdown(ai_response), unsafe_allow_html=True)
            st.markdown("---")

        if relevant_docs:
            search_time = time.time() - start_time  # retained for future display

            # Display source documents.
            if st.session_state.chat_mode:
                st.markdown("### Evidence")

            for i, result in enumerate(relevant_docs, start=1):
                metadata = result["metadata"]
                text_content = metadata.get("text", "No text available")
                doc_id = metadata.get("doc_id", "N/A")
                # Renamed from page_no to avoid shadowing the sidebar filter.
                result_page = metadata.get("page_no", "N/A")
                title = metadata.get("title")

                st.markdown(
                    f"#### [{i}] DOC : {doc_id} | Page: {result_page} | Title: {title}",
                    unsafe_allow_html=True,
                )
                st.markdown(markdown2.markdown(text_content), unsafe_allow_html=True)

                # Expandable full metadata.
                doc_label = "Source" if st.session_state.chat_mode else "Result"
                with st.expander(f"🔍 View full metadata for {doc_label} #{i}"):
                    st.json(metadata)

            st.markdown("", unsafe_allow_html=True)
        else:
            # No results found.
            st.markdown(
                """
### 🤷‍♂️ No results found

Try adjusting your search query or filters to find what you're looking for.

💡 **Search Tips:**
""",
                unsafe_allow_html=True,
            )

# -------------------------------
# Usage Instructions
# -------------------------------
if not query:
    st.markdown("---")
    st.markdown("### 💡 How to Use")
    st.markdown("""
**💬 AI Chat Mode:**
- Ask natural language questions
- Get AI-generated answers based on documents
- View source documents used for the response
""")

# -------------------------------
# Footer
# -------------------------------
st.markdown("---")
st.markdown(
    """
<div style="text-align: center;">
🤖 Powered by Groq, Sentence Transformers, Pinecone, and Streamlit | Built with ❤️ for intelligent document search and chat
</div>
""",
    unsafe_allow_html=True,
)