"""Streamlit app: document search + chat over annual reports.

Pipeline: ColPali (multi-vector visual embeddings) -> Qdrant (vector search
with optional payload filters) -> llama-4-scout (answer generation via the
project-local `analyze_image_with_query` helper).
"""

import os
import time

import markdown2  # NOTE(review): imported but unused in this file — kept in case other code relies on the import side effect; verify before removing.
import streamlit as st
import torch
from qdrant_client import QdrantClient
from qdrant_client import models
from qdrant_client.models import Filter, FieldCondition, Range, MatchValue

from llm import analyze_image_with_query

# -------------------------------
# Page Configuration
# -------------------------------
st.set_page_config(
    page_title="AI Document Search & Chat",
    page_icon="🔍",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS for modern styling.
# NOTE(review): the CSS payload appears to have been stripped from this copy
# of the file (the string is empty) — restore the original stylesheet here.
st.markdown(""" """, unsafe_allow_html=True)


# -------------------------------
# Load models with better caching
# -------------------------------
@st.cache_resource(show_spinner=False)
def initialize_qdrant():
    """Create and cache a Qdrant client for the app's lifetime.

    Returns:
        QdrantClient: a connected client instance (cached across reruns).
    """
    # SECURITY: the URL and API key used to be hard-coded here. They are now
    # read from the environment, with the original values kept only as
    # backward-compatible fallbacks. The exposed key should be ROTATED and the
    # fallbacks removed — prefer st.secrets or environment-only configuration.
    qdrant_client = QdrantClient(
        url=os.environ.get(
            "QDRANT_URL",
            "https://c75f218b-ecf5-4693-9b49-a2253478cf80.us-east4-0.gcp.cloud.qdrant.io:6333",
        ),
        api_key=os.environ.get(
            "QDRANT_API_KEY",
            "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.ErMufKlEvB6nuEzFTwXZWDvb0IrEC2VUkPpQ1ZaLcXc",
        ),
    )
    return qdrant_client


@st.cache_resource(show_spinner=False)
def initialize_colpali():
    """Load and cache the ColPali model and its processor.

    Returns:
        tuple: (model, processor), or (None, None) if colpali-engine is not
        installed — callers must check for None before use.
    """
    try:
        # ColPali requires a separate installation; import lazily so the app
        # can surface a friendly error instead of crashing at import time.
        from colpali_engine.models import ColPali, ColPaliProcessor

        model_name = "vidore/colpali-v1.2"
        colpali_model = ColPali.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="cuda:0" if torch.cuda.is_available() else "cpu",
        )
        colpali_processor = ColPaliProcessor.from_pretrained(
            "vidore/colpaligemma-3b-pt-448-base"
        )
        return colpali_model, colpali_processor
    except ImportError:
        st.error("ColPali not installed. Please install with: pip install colpali-engine")
        return None, None


# Initialize models and services
with st.spinner("🚀 Initializing AI services..."):
    qdrant_client = initialize_qdrant()
    colpali_model, colpali_processor = initialize_colpali()

# Halt the script run entirely if the embedding stack is unavailable.
if colpali_model is None or colpali_processor is None:
    st.stop()

# Initialize session state (chat mode is the default and, currently, the only
# mode — the toggle itself is not rendered anywhere in this file).
if "chat_mode" not in st.session_state:
    st.session_state.chat_mode = True

# Collection name
collection_name = "colpaliWithQdrant"


# -------------------------------
# Helper Functions
# -------------------------------
def search_documents(query, filter_dict):
    """Search for relevant documents via ColPali embeddings + Qdrant.

    Args:
        query: free-text user query to embed.
        filter_dict: mapping of payload field -> {"$eq": value}; only the
            keys fiscal_year / page_number / company / currency are honored.

    Returns:
        list[dict]: up to 3 hits as {"id", "score", "metadata"}; empty list
        on any failure (errors are reported through st.error/st.warning).
    """
    if colpali_model is None or colpali_processor is None:
        st.error("ColPali model not initialized")
        return []
    try:
        # Generate the multi-vector query embedding (no autograd needed).
        with torch.no_grad():
            batch_query = colpali_processor.process_queries([query])
            if batch_query is None:
                st.error("ColPali processor failed to process query.")
                return []
            batch_query = batch_query.to(colpali_model.device)
            query_embedding = colpali_model(**batch_query)
            if query_embedding is None:
                st.error("ColPali model failed to generate embedding.")
                return []
        # ColPali emits one multi-vector per query; take the first (only) one.
        multivector_query = query_embedding[0].cpu().float().numpy().tolist()

        # These payload fields are indexed as strings in Qdrant, so their
        # filter values must be stringified even when numeric (page_number).
        string_fields = ["fiscal_year", "page_number"]
        must_conditions = []
        for key in ["fiscal_year", "page_number", "company", "currency"]:
            if filter_dict.get(key):
                value = filter_dict[key]["$eq"]
                if key in string_fields:
                    value = str(value)
                must_conditions.append(
                    FieldCondition(key=key, match=MatchValue(value=value))
                )
        filter_by_tag = Filter(must=must_conditions)

        # Search in Qdrant with filters
        start_time = time.time()
        try:
            search_result = qdrant_client.query_points(
                collection_name=collection_name,
                query=multivector_query,
                query_filter=filter_by_tag,
                limit=3,
                timeout=100,
                search_params=models.SearchParams(
                    quantization=models.QuantizationSearchParams(
                        ignore=False,
                        rescore=True,
                        oversampling=2.0,
                    )
                ),
            )
        except Exception as filter_error:
            # Qdrant raises "Index required" when a payload field used in a
            # filter has no payload index; degrade to an unfiltered search.
            if "Index required" in str(filter_error):
                st.warning(
                    "⚠️ Metadata filtering not available. Performing search "
                    "without filters. Please create payload indexes."
                )
                # Retry without filters
                search_result = qdrant_client.query_points(
                    collection_name=collection_name,
                    query=multivector_query,
                    limit=3,
                    timeout=100,
                    search_params=models.SearchParams(
                        quantization=models.QuantizationSearchParams(
                            ignore=False,
                            rescore=True,
                            oversampling=2.0,
                        )
                    ),
                )
            else:
                raise
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Search completed in {elapsed_time:.4f} seconds")

        # Convert search results to list of documents
        print(f"Found {len(search_result.points)} results")
        relevant_docs = []
        for point in search_result.points:
            relevant_docs.append({
                "id": point.id,
                "score": point.score,
                "metadata": point.payload,
            })
        return relevant_docs
    except Exception as e:
        st.error(f"Search error: {str(e)}")
        return []


# -------------------------------
# Header
# -------------------------------
# NOTE(review): the surrounding HTML markup of this header appears to have
# been stripped from this copy of the file; only the text content survives.
st.markdown(
    """
Document Search RAG

Using ColPali, Qdrant and llama-4-scout-17b-16e-instruct.
""",
    unsafe_allow_html=True,
)


# -------------------------------
# Sidebar for filters and mode toggle
# -------------------------------
def clear_all_filters():
    """Reset every filter widget's session-state value to empty."""
    # Common
    st.session_state.search_query = ""
    st.session_state.page_no_filter = ""
    # Annual Report
    st.session_state.company_filter = ""
    st.session_state.fiscal_year_filter = ""
    st.session_state.currency_filter = ""


with st.sidebar:
    st.markdown("### 🎯 Search Filters")
    doc_type = st.selectbox(
        "Document Type",
        ["annual_report"],
        key="doc_type_filter",
    )

    # Annual Report filters.
    # NOTE(review): company/fiscal_year/currency/page_no are only assigned
    # inside this branch; this is safe today because "annual_report" is the
    # sole option, but adding a second document type without defining these
    # variables would raise NameError further down — confirm before extending.
    if doc_type == "annual_report":
        with st.expander("Annual Report Filters", expanded=False):
            # Example: company dropdown (populate with known companies)
            companies_list = ["Sherwin Williams", "MSCI", "Eaton"]
            company = st.selectbox(
                "Company", [""] + companies_list, index=0, key="company_filter"
            )
            # Fiscal year dropdown
            fiscal_years = ["2024"]
            fiscal_year = st.selectbox(
                "Fiscal Year", [""] + fiscal_years, index=0, key="fiscal_year_filter"
            )
            # Currency dropdown
            currencies = ["USD"]
            currency = st.selectbox(
                "Currency", [""] + currencies, index=0, key="currency_filter"
            )
            # Page number as free text (validated to int before filtering)
            page_no = st.text_input(
                "Page Number", placeholder="e.g., 15", key="page_no_filter"
            )

    # Reset button
    st.button("Clear All Filters", on_click=clear_all_filters)

    # Model info
    st.markdown("---")
    st.markdown("### ℹ️ Model Info")
    st.info(
        "**Embedding**: ColPali v1.2\n**Vector DB**: Qdrant\n**LLM**:llama-4-scout-17b-16e-instruct"
    )

# -------------------------------
# Main search interface
# -------------------------------
col1, col2 = st.columns([3, 1])

with col1:
    if st.session_state.chat_mode:
        query = st.text_input(
            "💬 Ask a question about your documents",
            placeholder="What would you like to know from the documents?",
            label_visibility="collapsed",
            key="search_query",
        )
    else:
        query = st.text_input(
            "🔍 Search Query",
            placeholder="What would you like to find in the documents?",
            label_visibility="collapsed",
            key="search_query",
        )

with col2:
    if st.session_state.chat_mode:
        search_clicked = st.button("💬 Ask AI", type="primary")
    else:
        search_clicked = st.button("🚀 Search", type="primary")

# -------------------------------
# Search functionality
# -------------------------------
if search_clicked or (query and len(query.strip()) > 0):
    if not query.strip():
        st.warning("⚠️ Please enter a search query to continue.")
    else:
        # Build filter dictionary (Mongo-style {"$eq": ...} leaves, translated
        # to Qdrant conditions inside search_documents).
        filter_dict = {}

        # Common filters
        if doc_type and doc_type != "All Types":
            filter_dict["doc_type"] = {"$eq": doc_type}
        if page_no and page_no.strip():
            try:
                filter_dict["page_number"] = {"$eq": int(page_no.strip())}
            except ValueError:
                st.error("⚠️ Page number must be a valid integer.")
                st.stop()

        # Annual Report filters
        if doc_type == "annual_report":
            if company and company.strip():
                filter_dict["company"] = {"$eq": company.strip()}
            if fiscal_year and fiscal_year.strip():
                filter_dict["fiscal_year"] = {"$eq": fiscal_year.strip()}
            if currency and currency.strip():
                filter_dict["currency"] = {"$eq": currency.strip()}

        # Perform search with progress indicators
        start_time = time.time()
        with st.spinner("🔍 Searching through documents..."):
            relevant_docs = search_documents(query, filter_dict)

        # Generate AI response if in chat mode
        if st.session_state.chat_mode and relevant_docs:
            with st.spinner("🤖 Generating AI response..."):
                llm_answer = analyze_image_with_query(relevant_docs, query)
            if llm_answer:
                # NOTE(review): surrounding HTML wrapper for the answer appears
                # stripped from this copy; only the interpolation survives.
                st.markdown(
                    f"""
{llm_answer}
""",
                    unsafe_allow_html=True,
                )
            else:
                st.warning("No AI response received.")
            st.markdown("---")

        if relevant_docs:
            search_time = time.time() - start_time  # currently unused; kept for future display

            # Display source documents
            if st.session_state.chat_mode:
                st.markdown("### 📚 Evidence")

            for i, result in enumerate(relevant_docs, start=1):
                metadata = result["metadata"]
                image_url = metadata.get("image_url")  # URL of the page image
                doc_id = metadata.get("doc_id", "N/A")
                # Renamed from `page_no` to avoid shadowing the sidebar filter value.
                result_page_no = metadata.get("page_number", "N/A")
                title = metadata.get("title", "N/A")

                # Display document info
                st.markdown(
                    f"#### [{i}] DOC : {doc_id} | Page: {result_page_no} | Title: {title}",
                    unsafe_allow_html=True,
                )

                # Display image from URL
                if image_url:
                    st.image(
                        image_url,
                        caption=f"{title} | Page {result_page_no}",
                        width='stretch',
                    )
                else:
                    st.warning("No image available for this document.")

                # Expandable full metadata
                doc_label = "Source" if st.session_state.chat_mode else "Result"
                with st.expander(f"🔍 View full metadata for {doc_label} #{i}"):
                    st.json(metadata)
        else:
            # No results found
            # NOTE(review): HTML wrapper appears stripped; text content kept.
            st.markdown(
                """
🤷‍♂️ No results found

Try adjusting your search query or filters to find what you're looking for.

💡 Search Tips:
""",
                unsafe_allow_html=True,
            )

# -------------------------------
# Usage Instructions
# -------------------------------
if not query:
    st.markdown("---")
    st.markdown("### 💡 How to Use")
    st.markdown("""
    **💬 AI Chat Mode:**
    - Ask natural language questions
    - Get AI-generated answers based on documents
    - View source documents used for the response
    """)

# -------------------------------
# Footer
# -------------------------------
st.markdown("---")
st.markdown(
    """
🤖 Powered by Groq, ColPali, Qdrant, and Streamlit | Built with ❤️ for intelligent document search and chat
""",
    unsafe_allow_html=True,
)