"""Streamlit hybrid-search RAG app.

Retrieves document chunks from a Pinecone hybrid index (dense
SentenceTransformer embeddings + sparse BM25 vectors), reranks candidates
with a cross-encoder, and answers user questions with a Groq-hosted Llama
model grounded only in the retrieved chunks.
"""

import os
import pickle
import time
import uuid  # NOTE(review): currently unused here; kept in case other code relies on it

import markdown2
import nltk
import streamlit as st
from groq import Groq
from pinecone import Pinecone
# BM25Encoder must be importable so the pickled encoder below can be unpickled.
from pinecone_text.sparse import BM25Encoder
from sentence_transformers import CrossEncoder, SentenceTransformer

# BM25 / sentence tokenization depends on NLTK's punkt models.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
HF_TOKEN = os.environ.get("HF_TOKEN")

# Single source of truth for the generation model.
# (Bug fix: the sidebar previously advertised "Llama-3.1-70B" while the code
# actually called llama-3.3-70b-versatile; both now read this constant.)
GROQ_MODEL = "llama-3.3-70b-versatile"

# -------------------------------
# Page Configuration
# -------------------------------
st.set_page_config(
    page_title="AI Document Search & Chat",
    page_icon="🔍",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS for modern styling (currently empty placeholder).
st.markdown("""
""", unsafe_allow_html=True)


# -------------------------------
# Load models with better caching
# -------------------------------
@st.cache_resource(show_spinner=False)
def load_models():
    """Load and cache the embedding model and the cross-encoder reranker.

    Returns:
        tuple[SentenceTransformer, CrossEncoder]: (embedding model, reranker).
    """
    with st.spinner("🤖 Loading AI models..."):
        embed_model = SentenceTransformer(
            "google/embeddinggemma-300m",
            token=HF_TOKEN,
        )
        reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        return embed_model, reranker


@st.cache_resource(show_spinner=False)
def initialize_pinecone():
    """Connect to Pinecone and return a handle to the hybrid index."""
    pc = Pinecone(api_key=PINECONE_API_KEY)
    return pc.Index("rag-latest")


@st.cache_resource(show_spinner=False)
def initialize_bm25():
    """Load the pre-fitted BM25 sparse encoder from disk.

    NOTE(review): pickle.load is only safe because this is a repo-local
    artifact; never point this at untrusted input.
    """
    with open("src/bm25_model.pkl", "rb") as f:
        return pickle.load(f)


@st.cache_resource(show_spinner=False)
def initialize_groq():
    """Create and cache the Groq API client."""
    return Groq(api_key=GROQ_API_KEY)


# Initialize models and services (all cached across reruns).
with st.spinner("🚀 Initializing AI services..."):
    embed_model, reranker = load_models()
    index = initialize_pinecone()
    bm25 = initialize_bm25()
    groq_client = initialize_groq()

# Initialize session state. chat_mode=True means "ask the LLM"; False means
# raw retrieval only (the toggle itself is not currently rendered).
if "chat_mode" not in st.session_state:
    st.session_state.chat_mode = True


# -------------------------------
# Helper Functions
# -------------------------------
def search_documents(query, filter_dict, top_k, rerank_top_n=3):
    """Hybrid-search Pinecone for *query* and rerank with the cross-encoder.

    Args:
        query: Natural-language user query.
        filter_dict: Pinecone metadata filter (may be empty).
        top_k: Number of candidates to retrieve from Pinecone.
            (Bug fix: this parameter was previously ignored in favor of a
            hard-coded top_k=10.)
        rerank_top_n: How many reranked matches to return (default 3,
            matching the original behavior).

    Returns:
        Up to *rerank_top_n* matches sorted by descending cross-encoder
        score, or an empty list when nothing matched.
    """
    dense_query = embed_model.encode(query).tolist()
    sparse_query = bm25.encode_queries([query])[0]

    # Query Pinecone with both dense and sparse vectors (hybrid search).
    res = index.query(
        vector=dense_query,
        sparse_vector=sparse_query,
        top_k=top_k,
        include_metadata=True,
        hybrid=True,
        filter=filter_dict,
    )
    candidates = res["matches"]
    if not candidates:
        return []

    # Rerank: score each (query, chunk-text) pair with the cross-encoder.
    pairs = [(query, m["metadata"].get("text", "")) for m in candidates]
    scores = reranker.predict(pairs)
    for match, score in zip(candidates, scores):
        match["rerank_score"] = float(score)
    reranked = sorted(candidates, key=lambda x: x["rerank_score"], reverse=True)
    return reranked[:rerank_top_n]


def generate_ai_response(query, relevant_docs):
    """Answer *query* with the Groq LLM, grounded only in *relevant_docs*.

    Args:
        query: The user's question.
        relevant_docs: Reranked Pinecone matches from search_documents().

    Returns:
        Markdown answer string, or an error message prefixed with "❌".
    """
    if not relevant_docs:
        # Bug fix: previously the LLM was called with an empty context;
        # short-circuit with the canonical "not found" answer instead.
        return "The document does not provide enough information to answer this question."

    # Prepare context from relevant documents.
    context_parts = []
    sources = []  # collected for potential UI use; currently unused
    for i, doc in enumerate(relevant_docs, 1):
        metadata = doc["metadata"]
        text = metadata.get("text")
        doc_id = metadata.get("doc_id")
        title = metadata.get("title")
        fiscal_year = metadata.get("fiscal_year")
        page_no = metadata.get("page_no")

        # Context chunk for the LLM, tagged so it can cite [DOC i].
        context_parts.append(
            f"[CHUNK {i} DOC {doc_id} {title} fiscal year {fiscal_year} ] (Page {page_no})\n{text}"
        )
        sources.append({
            "id": i,
            "title": title,
            "page": page_no,
            "doc_type": metadata.get("doc_type", ""),
        })

    context = "\n\n".join(context_parts)

    # User-turn prompt: question plus the retrieved evidence.
    prompt = f"""
You will answer the question using ONLY the provided document excerpts.
When you use information from a document, cite it with the format [DOC i], where i corresponds to the document number given in CONTEXT DOCUMENTS. If multiple docs are relevant, cite all of them (e.g., [DOC 1][DOC 3]).

CONTEXT DOCUMENTS:
{context}

USER QUESTION: {query}

ANSWER:
"""

    try:
        # Call Groq API (non-streaming).
        chat_completion = groq_client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    # Bug fix: corrected "retrun" typo and the 5 -> 7
                    # numbering gap in the original instructions.
                    "content": """You are a professional assistant that answers user questions based **only on the content of provided document excerpts**. The user will ask a question, and you will also receive related text chunks retrieved from company documents or PDFs.

Instructions:
1. Use **only** the retrieved chunks to answer the user's question. Do **not** add information from memory or outside sources.
2. If multiple chunks provide relevant info, combine them into a **clear, concise answer**.
3. If the answer is **not found** in the chunks, respond exactly with: "The document does not provide enough information to answer this question."
4. Keep the style **professional, factual, and concise**.
5. Return the response in markdown format.
6. Refuse to answer or speculate if no reliable evidence is found in the chunks.
""",
                },
                {"role": "user", "content": prompt},
            ],
            model=GROQ_MODEL,
            stream=False,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        # Surface API failures in the UI rather than crashing the app.
        return f"❌ Error generating AI response: {str(e)}"


# -------------------------------
# Header
# -------------------------------
st.markdown(
    """
<div style="text-align: center;">
  <h1>Hybrid Search RAG</h1>
  <p>Using Groq LLM, Pinecone, and Sentence Transformers</p>
</div>
""",
    unsafe_allow_html=True,
)


# -------------------------------
# Sidebar for filters and mode toggle
# -------------------------------
def clear_all_filters():
    """Reset the query box and every filter widget via session state.

    Bug fix: the contract-report filter keys (the widgets that actually
    exist) were previously left uncleared; the legacy annual-report keys
    are kept for backward compatibility even though their widgets are not
    currently rendered.
    """
    for key in (
        # Common
        "search_query",
        "page_no_filter",
        # Legacy annual-report filters (widgets not currently rendered)
        "company_filter",
        "fiscal_year_filter",
        "currency_filter",
        "unit_filter",
        # Contract-report filters
        "agreement_date_filter",
        "promoter_filter",
        "allottee_filter",
        "project_name_filter",
        "apartment_block_filter",
        "apartment_floor_filter",
        "apartment_type_filter",
        "jurisdiction_filter",
    ):
        st.session_state[key] = ""


with st.sidebar:
    st.markdown("### 🎯 Search Filters")

    # annual_report was removed; contract_report is the only option for now.
    doc_type = st.selectbox(
        "Document Type",
        ["contract_report"],
        key="doc_type_filter",
    )

    # Contract Report filters
    if doc_type == "contract_report":
        with st.expander("Contract Report Filters", expanded=False):
            agreement_date = st.text_input("Agreement Date", placeholder="YYYY-MM-DD", key="agreement_date_filter")
            promoter = st.text_input("Promoter / Developer", placeholder="Enter promoter name...", key="promoter_filter")
            allottee = st.text_input("Allottee (Buyer)", placeholder="Enter allottee name...", key="allottee_filter")
            project_name = st.text_input("Project Name", placeholder="Enter project name...", key="project_name_filter")
            apartment_block = st.text_input("Block", placeholder="e.g., Tower A", key="apartment_block_filter")
            apartment_floor = st.text_input("Floor", placeholder="e.g., 10th floor", key="apartment_floor_filter")
            apartment_type = st.text_input("Apartment Type", placeholder="e.g., 2BHK", key="apartment_type_filter")
            jurisdiction = st.text_input("Jurisdiction", placeholder="e.g., Madras High Court", key="jurisdiction_filter")

    page_no = st.text_input("Page Number", placeholder="e.g., 15", key="page_no_filter")

    # Reset button (callback runs before the next rerun, so widgets refresh).
    st.button("Clear All Filters", on_click=clear_all_filters)

    # Model info
    st.markdown("---")
    st.markdown("### ℹ️ Model Info")
    st.info(
        "**Embedding**: Google EmbeddingGemma-300M\n"
        "**Reranker**: MS-MARCO MiniLM-L-6-v2\n"
        f"**LLM**: Groq {GROQ_MODEL}"
    )

# -------------------------------
# Main search interface
# -------------------------------
col1, col2 = st.columns([3, 1])

with col1:
    if st.session_state.chat_mode:
        query = st.text_input(
            "💬 Ask a question about your documents",
            placeholder="What would you like to know from the documents?",
            label_visibility="collapsed",
            key="search_query",
        )
    else:
        query = st.text_input(
            "🔍 Search Query",
            placeholder="What would you like to find in the documents?",
            label_visibility="collapsed",
            key="search_query",
        )

with col2:
    if st.session_state.chat_mode:
        search_clicked = st.button("💬 Ask AI", type="primary")
    else:
        search_clicked = st.button("🚀 Search", type="primary")

# -------------------------------
# Search functionality
# -------------------------------
if search_clicked or (query and len(query.strip()) > 0):
    if not query.strip():
        st.warning("⚠️ Please enter a search query to continue.")
    else:
        # Build the Pinecone metadata filter from sidebar inputs.
        filter_dict = {}

        # Common filters
        if doc_type and doc_type != "All Types":
            filter_dict["doc_type"] = {"$eq": doc_type}

        if page_no and page_no.strip():
            try:
                filter_dict["page_no"] = {"$eq": int(page_no.strip())}
            except ValueError:
                st.error("⚠️ Page number must be a valid integer.")
                st.stop()

        # Contract Report filters: map each non-empty sidebar field to its
        # Pinecone metadata key (replaces the repetitive if-chain).
        if doc_type == "contract_report":
            contract_filters = {
                "agreement_date": agreement_date,
                "promoter_legal_name": promoter,
                "allottee_name": allottee,
                "project_name": project_name,
                "apartment_block": apartment_block,
                "apartment_floor": apartment_floor,
                "apartment_type": apartment_type,
                "jurisdiction": jurisdiction,
            }
            for field, value in contract_filters.items():
                if value and value.strip():
                    filter_dict[field] = {"$eq": value.strip()}

        # Perform search with progress indicators.
        start_time = time.time()
        with st.spinner("🔍 Searching through documents..."):
            relevant_docs = search_documents(query, filter_dict, top_k=5)

        # Generate AI response if in chat mode.
        if st.session_state.chat_mode:
            with st.spinner("🤖 Generating AI response..."):
                ai_response = generate_ai_response(query, relevant_docs)

            # Render the markdown answer as HTML.
            st.markdown(markdown2.markdown(ai_response), unsafe_allow_html=True)
            st.markdown("---")

        if relevant_docs:
            search_time = time.time() - start_time  # retained for future display

            # Display source documents.
            if st.session_state.chat_mode:
                st.markdown("### Evidence")

            for i, result in enumerate(relevant_docs, start=1):
                metadata = result["metadata"]
                text_content = metadata.get("text", "No text available")
                doc_id = metadata.get("doc_id", "N/A")
                # Renamed from page_no to avoid shadowing the sidebar filter.
                result_page = metadata.get("page_no", "N/A")
                title = metadata.get("title")

                st.markdown(
                    f"#### [{i}] DOC : {doc_id} | Page: {result_page} | Title: {title}",
                    unsafe_allow_html=True,
                )
                st.markdown(markdown2.markdown(text_content), unsafe_allow_html=True)

                # Expandable full metadata.
                doc_label = "Source" if st.session_state.chat_mode else "Result"
                with st.expander(f"🔍 View full metadata for {doc_label} #{i}"):
                    st.json(metadata)

            st.markdown("", unsafe_allow_html=True)
        else:
            # No results found.
            st.markdown(
                """
### 🤷‍♂️ No results found

Try adjusting your search query or filters to find what you're looking for.

💡 **Search Tips:**
""",
                unsafe_allow_html=True,
            )

# -------------------------------
# Usage Instructions
# -------------------------------
if not query:
    st.markdown("---")
    st.markdown("### 💡 How to Use")
    st.markdown("""
**💬 AI Chat Mode:**
- Ask natural language questions
- Get AI-generated answers based on documents
- View source documents used for the response
""")

# -------------------------------
# Footer
# -------------------------------
st.markdown("---")
st.markdown(
    """
<div style="text-align: center;">
🤖 Powered by Groq, Sentence Transformers, Pinecone, and Streamlit | Built with ❤️ for intelligent document search and chat
</div>
""",
    unsafe_allow_html=True,
)