Spaces:

yuvabe-ai
/

AI-DocumentSearch

Sleeping

App Files Files Community

vthamaraikannan1@gmail.com committed on Oct 3, 2025

Commit

646b9b3

1 Parent(s): eab9192

Enhance streamlit_app.py with improved document context handling and UI updates; add .gitignore for environment and build files

Browse files

Files changed (2) hide show

.gitignore +37 -0
src/streamlit_app.py +141 -118

.gitignore ADDED Viewed

	@@ -0,0 +1,37 @@

+# Ignore Python bytecode caches and local database files
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.db
+*.sqlite3
+# Ignore virtual environment and IDE/editor folders
+venv/
+env/
+ENV/
+.venv/
+.idea/
+.vscode/
+# Ignore OS generated files
+.DS_Store
+Thumbs.db
+# Ignore logs and temp files
+*.log
+*.tmp
+# Ignore coverage output and packaging/build artifacts
+.coverage
+htmlcov/
+*.egg-info/
+dist/
+build/
+# Ignore node modules if present
+node_modules/
+# Ignore Docker files themselves
+Dockerfile
+.dockerignore

src/streamlit_app.py CHANGED Viewed

@@ -16,10 +16,12 @@ nltk.download("punkt_tab", quiet=True)
 PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
 GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 # -------------------------------
 # Page Configuration
 # -------------------------------
@@ -214,7 +216,7 @@ def initialize_pinecone():
 @st.cache_resource(show_spinner=False)
 def initialize_bm25():
-    with open("src/bm25_model.pkl", "rb") as f:
         bm25 = pickle.load(f)
     return bm25
@@ -273,20 +275,47 @@ def generate_ai_response(query, relevant_docs):
     # Prepare context from relevant documents
     context_parts = []
     for i, doc in enumerate(relevant_docs, 1):
         metadata = doc["metadata"]
         text = metadata.get("text")
-        context_parts.append(f"""{text}""")
-    context = "\n".join(context_parts)
     # Create the prompt for Groq
     prompt = f"""
             CONTEXT DOCUMENTS:
             {context}
             USER QUESTION: {query}
             """
     try:
@@ -295,7 +324,7 @@ def generate_ai_response(query, relevant_docs):
             messages=[
                 {
                     "role": "system",
-                    "content": """You are a professional assistant that answers user questions based **only on the content of provided document excerpts**. The user will ask a question, and you will also receive related text chunks retrieved from company documents or PDFs.
                         Instructions:
                         1. Use **only** the retrieved chunks to answer the user’s question. Do **not** add information from memory or outside sources.
@@ -327,9 +356,9 @@ def generate_ai_response(query, relevant_docs):
 # -------------------------------
 st.markdown("""
 <div class="main-header">
-    <h1 style="margin: 0; font-size: 1.9rem;">🔍 AI Document Search & Chat</h1>
     <p style="margin: 0.5rem 0 0 0; font-size: 1.1rem; opacity: 0.9;">
-        Intelligent document retrieval with AI-powered question answering
     </p>
 </div>
 """, unsafe_allow_html=True)
@@ -338,57 +367,63 @@ st.markdown("""
 # Sidebar for filters and mode toggle
 # -------------------------------
 def clear_all_filters():
-    st.session_state.doc_type_filter = "All Types"
     st.session_state.company_filter = ""
     st.session_state.fiscal_year_filter = ""
-    st.session_state.page_no_filter = ""
-    st.session_state.search_query = ""
 with st.sidebar:
-    # Mode toggle
-    st.markdown("### 🤖 Search Mode")
-    chat_mode = st.toggle(
-        "💬 AI Chat Mode",
-        value=st.session_state.chat_mode,
-        help="Enable AI chat responses based on document content"
-    )
-    st.session_state.chat_mode = chat_mode
-    if chat_mode:
-        st.success("🤖 AI Chat Mode: ON\nGet AI-generated responses based on document content")
-    else:
-        st.info("📋 Search Mode: Document results only")
-    st.markdown("---")
     st.markdown("### 🎯 Search Filters")
     doc_type = st.selectbox(
-        "📄 Document Type",
-        ["All Types", "annual_report", "contract_report"],
         key="doc_type_filter"
     )
-    # company = st.text_input(
-    #     "🏢 Company",
-    #     placeholder="Enter company name...",
-    #     key="company_filter"
-    # )
-    # fiscal_year = st.text_input(
-    #     "📅 Fiscal Year",
-    #     placeholder="e.g., 2023",
-    #     key="fiscal_year_filter"
-    # )
-    page_no = st.text_input(
-        "📃 Page Number",
-        placeholder="e.g., 15",
-        key="page_no_filter"
-    )
-    # Clear filters button
-    st.button("🗑️ Clear All Filters", on_click=clear_all_filters)
     # Model info
     st.markdown("---")
@@ -431,18 +466,49 @@ if search_clicked or (query and len(query.strip()) > 0):
     else:
         # Build filter dictionary
         filter_dict = {}
         if doc_type and doc_type != "All Types":
             filter_dict["doc_type"] = {"$eq": doc_type}
-        # if company.strip():
-        #     filter_dict["company"] = {"$eq": company.strip()}
-        # if fiscal_year.strip():
-        #     filter_dict["fiscal_year"] = {"$eq": fiscal_year.strip()}
-        if page_no.strip():
             try:
                 filter_dict["page_no"] = {"$eq": int(page_no.strip())}
             except ValueError:
                 st.error("⚠️ Page number must be a valid integer.")
                 st.stop()
@@ -459,67 +525,35 @@ if search_clicked or (query and len(query.strip()) > 0):
                     ai_response = generate_ai_response(query, relevant_docs)
                 # Display AI response
-                st.markdown(ai_response,unsafe_allow_html=True)
         st.markdown("---")
-        # Display applied filters
-        if filter_dict:
-            st.markdown("### 📌 Applied Filters")
-            filter_chips = ""
-            for key, value in filter_dict.items():
-                filter_value = value.get("$eq", "")
-                filter_chips += f'<span class="metadata-chip">{key}: {filter_value}</span>'
-            st.markdown(filter_chips, unsafe_allow_html=True)
         if relevant_docs:
             search_time = time.time() - start_time
-            # Display search statistics
-            st.markdown(f"""
-            <div class="stats-container">
-                <div style="display: flex; justify-content: space-between; align-items: center;">
-                    <div>
-                        <strong>🎯 Found {len(relevant_docs)} relevant results</strong>
-                    </div>
-                    <div>
-                        <strong>⚡ {search_time:.2f}s</strong>
-                    </div>
-                </div>
-            </div>
-            """, unsafe_allow_html=True)
             # Display source documents
             if st.session_state.chat_mode:
-                st.markdown("### 📚 Source Documents")
-            else:
-                st.markdown("### 📋 Search Results")
             for i, result in enumerate(relevant_docs, start=1):
                 metadata = result["metadata"]
                 text_content = metadata.get("text", "No text available")
-                rerank_score = result["rerank_score"]
-                # Create result card
-                st.markdown(f"""
-                <div >
-                    <div style="display: flex; justify-content: between; align-items: flex-start; margin-bottom: 1rem;">
-                        <h4 style="margin: 0; color: #f2f5f7; flex-grow: 1;">{"" if st.session_state.chat_mode else "Result"} #{i}</h4>
-                    </div>
-                """, unsafe_allow_html=True)
-                # Display metadata as chips
-                st.markdown("#### 📊 Metadata:")
-                metadata_chips = ""
-                for key, value in metadata.items():
-                    if key != "text":  # Don't show text in metadata chips
-                        metadata_chips += f'<span class="metadata-chip">{key}: {value}</span>'
-                if metadata_chips:
-                    st.markdown(metadata_chips, unsafe_allow_html=True)
-                # Display text content
-                st.markdown(f"#### 📝 Content:")
                 st.markdown(f'<div style="background: #303336; padding: 1rem; border-radius: 8px; margin: 1rem 0; line-height: 1.6;">{text_content}</div>', unsafe_allow_html=True)
@@ -554,24 +588,13 @@ if search_clicked or (query and len(query.strip()) > 0):
 if not query:
     st.markdown("---")
     st.markdown("### 💡 How to Use")
-    col1, col2 = st.columns(2)
-    with col1:
-        st.markdown("""
-        **🔍 Search Mode:**
-        - Enter keywords to find relevant documents
-        - Results show document excerpts and metadata
-        - Use filters to narrow down results
-        """)
-    with col2:
-        st.markdown("""
-        **💬 AI Chat Mode:**
-        - Ask natural language questions
-        - Get AI-generated answers based on documents
-        - View source documents used for the response
-        """)
 # -------------------------------
 # Footer

 PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
 GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 # -------------------------------
 # Page Configuration
 # -------------------------------
 @st.cache_resource(show_spinner=False)
 def initialize_bm25():
+    with open(r"D:\rag_hugging\AI-DocumentSearch\src\bm25_model.pkl", "rb") as f:
         bm25 = pickle.load(f)
     return bm25
     # Prepare context from relevant documents
     context_parts = []
+    sources = []
     for i, doc in enumerate(relevant_docs, 1):
         metadata = doc["metadata"]
         text = metadata.get("text")
+        doc_id = metadata.get("doc_id")
+        title = metadata.get("title")
+        fiscal_year = metadata.get("fiscal_year")
+        page_no = metadata.get("page_no")
+        # Context for LLM
+        context_parts.append(f"[CHUNK {i} DOC {doc_id} {title} fiscal year {fiscal_year} ] (Page {page_no})\n{text}")
+        # Collect for UI
+        sources.append({
+            "id": i,
+            "title": title,
+            "page": page_no,
+            "doc_type": metadata.get("doc_type", ""),
+        })
+    context = "\n\n".join(context_parts)
     # Create the prompt for Groq
     prompt = f"""
+    You will answer the question using ONLY the provided document excerpts.
+    When you use information from a document, cite it with the format [DOC i],
+    where i corresponds to the document number given in CONTEXT DOCUMENTS.
+    If multiple docs are relevant, cite all of them (e.g., [DOC 1][DOC 3]).
             CONTEXT DOCUMENTS:
             {context}
             USER QUESTION: {query}
+            ANSWER : " "
             """
     try:
             messages=[
                 {
                     "role": "system",
+                    "content": """You are a professional assistant that answers user questions based **only on the content of provided document excerpts**. The user will ask a question, and you will also receive related text chunks retrieved from company documents or PDFs.
                         Instructions:
                         1. Use **only** the retrieved chunks to answer the user’s question. Do **not** add information from memory or outside sources.
 # -------------------------------
 st.markdown("""
 <div class="main-header">
+    <h1 style="margin: 0; font-size: 1.9rem;"> Hybrid Search RAG </h1>
     <p style="margin: 0.5rem 0 0 0; font-size: 1.1rem; opacity: 0.9;">
+        Using Groq LLM, Pinecone, and Sentence Transformers
     </p>
 </div>
 """, unsafe_allow_html=True)
 # Sidebar for search filters
 # -------------------------------
 def clear_all_filters():
+    # Common
+    st.session_state.search_query = ""
+    st.session_state.page_no_filter = ""
+    # Annual Report
     st.session_state.company_filter = ""
     st.session_state.fiscal_year_filter = ""
+    st.session_state.currency_filter = ""
+    st.session_state.segment_filter = ""
+    # Contract Report
+    st.session_state.agreement_date_filter = ""
+    st.session_state.promoter_filter = ""
+    st.session_state.allottee_filter = ""
+    st.session_state.project_name_filter = ""
+    st.session_state.apartment_block_filter = ""
+    st.session_state.apartment_floor_filter = ""
+    st.session_state.apartment_type_filter = ""
+    # st.session_state.carpet_area_filter = ""  # if you add this back
+    st.session_state.jurisdiction_filter = ""
 with st.sidebar:
     st.markdown("### 🎯 Search Filters")
     doc_type = st.selectbox(
+        "Document Type",
+        ["annual_report", "contract_report"],
         key="doc_type_filter"
     )
+    # Annual Report filters
+    if doc_type == "annual_report":
+        with st.expander("Annual Report Filters", expanded=False):
+            company = st.text_input("Company", placeholder="Enter company name...", key="company_filter")
+            fiscal_year = st.text_input("Fiscal Year", placeholder="e.g., 2024", key="fiscal_year_filter")
+            currency = st.text_input("Currency", placeholder="e.g., USD", key="currency_filter")
+            segment = st.text_input("Segment", placeholder="e.g., Paint Stores Group", key="segment_filter")
+            page_no = st.text_input("Page Number", placeholder="e.g., 15", key="page_no_filter")
+    # Contract Report filters
+    elif doc_type == "contract_report":
+        with st.expander("Contract Report Filters", expanded=False):
+            agreement_date = st.text_input("Agreement Date", placeholder="YYYY-MM-DD", key="agreement_date_filter")
+            promoter = st.text_input("Promoter / Developer", placeholder="Enter promoter name...", key="promoter_filter")
+            allottee = st.text_input("Allottee (Buyer)", placeholder="Enter allottee name...", key="allottee_filter")
+            project_name = st.text_input("Project Name", placeholder="Enter project name...", key="project_name_filter")
+            apartment_block = st.text_input("Block", placeholder="e.g., Tower A", key="apartment_block_filter")
+            apartment_floor = st.text_input("Floor", placeholder="e.g., 10th floor", key="apartment_floor_filter")
+            apartment_type = st.text_input("Apartment Type", placeholder="e.g., 2BHK", key="apartment_type_filter")
+            jurisdiction = st.text_input("Jurisdiction", placeholder="e.g., Madras High Court", key="jurisdiction_filter")
+            page_no = st.text_input("Page Number", placeholder="e.g., 15", key="page_no_filter")
+    # Reset button
+    st.button("Clear All Filters", on_click=clear_all_filters)
     # Model info
     st.markdown("---")
     else:
         # Build filter dictionary
         filter_dict = {}
+        # Common filters
         if doc_type and doc_type != "All Types":
             filter_dict["doc_type"] = {"$eq": doc_type}
+        if page_no and page_no.strip():
             try:
                 filter_dict["page_no"] = {"$eq": int(page_no.strip())}
             except ValueError:
                 st.error("⚠️ Page number must be a valid integer.")
                 st.stop()
+        # Annual Report filters
+        if doc_type == "annual_report":
+            if company and company.strip():
+                filter_dict["company"] = {"$eq": company.strip()}
+            if fiscal_year and fiscal_year.strip():
+                filter_dict["fiscal_year"] = {"$eq": fiscal_year.strip()}
+            if currency and currency.strip():
+                filter_dict["currency"] = {"$eq": currency.strip()}
+            if segment and segment.strip():
+                filter_dict["segment"] = {"$eq": segment.strip()}
+        # Contract Report filters
+        elif doc_type == "contract_report":
+            if agreement_date and agreement_date.strip():
+                filter_dict["agreement_date"] = {"$eq": agreement_date.strip()}
+            if promoter and promoter.strip():
+                filter_dict["promoter_legal_name"] = {"$eq": promoter.strip()}
+            if allottee and allottee.strip():
+                filter_dict["allottee_name"] = {"$eq": allottee.strip()}
+            if project_name and project_name.strip():
+                filter_dict["project_name"] = {"$eq": project_name.strip()}
+            if apartment_block and apartment_block.strip():
+                filter_dict["apartment_block"] = {"$eq": apartment_block.strip()}
+            if apartment_floor and apartment_floor.strip():
+                filter_dict["apartment_floor"] = {"$eq": apartment_floor.strip()}
+            if apartment_type and apartment_type.strip():
+                filter_dict["apartment_type"] = {"$eq": apartment_type.strip()}
+            if jurisdiction and jurisdiction.strip():
+                filter_dict["jurisdiction"] = {"$eq": jurisdiction.strip()}
                     ai_response = generate_ai_response(query, relevant_docs)
                 # Display AI response
+                # st.markdown(ai_response,unsafe_allow_html=True)
+                st.markdown(f'<div style="background: #303336; padding: 1rem; border-radius: 8px; margin: 1rem 0; line-height: 1.6;">{ai_response}</div>', unsafe_allow_html=True)
         st.markdown("---")
         if relevant_docs:
             search_time = time.time() - start_time
             # Display source documents
             if st.session_state.chat_mode:
+                st.markdown("### Evidence")
+            # else:
+            #     st.markdown("### 📋 Search Results")
             for i, result in enumerate(relevant_docs, start=1):
                 metadata = result["metadata"]
                 text_content = metadata.get("text", "No text available")
+                doc_id = metadata.get("doc_id", "N/A")
+                page_no = metadata.get("page_no", "N/A")
+                title = metadata.get("title")
+                st.markdown("#### [{i}] DOC : {doc_id}  |  Page: {page_no} | Title {title}".format(i=i, doc_id=doc_id, page_no=page_no, title=title))
                 st.markdown(f'<div style="background: #303336; padding: 1rem; border-radius: 8px; margin: 1rem 0; line-height: 1.6;">{text_content}</div>', unsafe_allow_html=True)
 if not query:
     st.markdown("---")
     st.markdown("### 💡 How to Use")
+    st.markdown("""
+    **💬 AI Chat Mode:**
+    - Ask natural language questions
+    - Get AI-generated answers based on documents
+    - View source documents used for the response
+    """)
 # -------------------------------
 # Footer