import os
import hashlib
import time

import streamlit as st
from pinecone import Pinecone
import google.generativeai as genai

# Import your data processing functions
from data_processor import (
    get_document_text,
    split_text_into_chunks,
    generate_embeddings,
    index_chunks_in_pinecone,
)

# --- Page Configuration ---
st.set_page_config(
    page_title="Insurance DocAI 🤖",
    page_icon="📄",
    layout="wide",
)

# --- API and Client Initialization ---
# Use st.secrets for secure handling of API keys on Streamlit Cloud/Hugging Face.
# A missing or malformed secret halts the app immediately with a visible banner.
try:
    GOOGLE_API_KEY = st.secrets["GOOGLE_API_KEY"]
    PINECONE_API_KEY = st.secrets["PINECONE_API_KEY"]
    genai.configure(api_key=GOOGLE_API_KEY)
    pc = Pinecone(api_key=PINECONE_API_KEY)
    INDEX_NAME = "hackrx-policy-index"
except Exception as e:
    # FIX: the caught exception was previously discarded; include it so the
    # operator can tell a missing key apart from a client-init failure.
    st.error(
        "🚨 Could not find API keys. Please add them to the secrets "
        f"management in your deployment environment. ({e})",
        icon="🚨",
    )
    st.stop()


# --- Helper Functions (adapted from your main.py) ---
def create_doc_id_from_url(url: str) -> str:
    """Create a stable SHA-256 hex digest of the URL to use as a document ID (namespace)."""
    return hashlib.sha256(url.encode('utf-8')).hexdigest()


def generate_answer_with_gemini(question: str, context: str) -> str:
    """Generate an answer with Gemini grounded ONLY in the supplied context.

    Args:
        question: The user's question.
        context: Concatenated document chunks retrieved from Pinecone.

    Returns:
        The model's answer text, a placeholder when the response is empty,
        or an error description string (this function never raises).
    """
    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    prompt = f"""
    You are an expert insurance policy analyst. Based ONLY on the context provided below from an insurance document, answer the user's question concisely. Do not use any external knowledge or make assumptions. If the answer cannot be found in the provided context, state that clearly.

    CONTEXT:
    ---
    {context}
    ---

    QUESTION: {question}

    ANSWER:
    """
    try:
        response = model.generate_content(prompt)
        # response.text raises when no candidate parts exist, so gate on parts.
        return response.text.strip() if response.parts else "The model's response was empty."
    except Exception as e:
        return f"An error occurred while generating the answer: {e}"


# --- Caching ---
# Use Streamlit's caching to avoid re-processing the same document repeatedly.
@st.cache_data(show_spinner=False)
def process_document(doc_url):
    """
    Full pipeline: downloads, chunks, embeds, and indexes a document.

    Cached per URL, so the heavy work only runs once per document.
    Returns the Pinecone namespace for the document, or None on failure.

    NOTE(review): st.cache_data also memoises a None (failed) result, so a
    transient download failure is remembered until the cache is cleared —
    consider raising on failure instead if retries matter.
    """
    with st.spinner(f"Processing document: {doc_url}... This may take a moment."):
        namespace = create_doc_id_from_url(doc_url)
        index = pc.Index(INDEX_NAME)

        # Skip the pipeline entirely if this namespace already holds vectors.
        stats = index.describe_index_stats()
        if stats.get('namespaces', {}).get(namespace, {}).get('vector_count', 0) > 0:
            st.success(f"Document '{doc_url}' is already processed and ready for questions.")
            return namespace

        # Full processing pipeline: each stage bails out with a visible error.
        document_text = get_document_text(doc_url)
        if not document_text:
            st.error("Failed to retrieve or extract text from the document.")
            return None

        chunks = split_text_into_chunks(document_text)
        if not chunks:
            st.error("Failed to split document into chunks.")
            return None

        embeddings = generate_embeddings(chunks)
        if not embeddings:
            st.error("Failed to generate embeddings.")
            return None

        index_chunks_in_pinecone(chunks, embeddings, INDEX_NAME, namespace=namespace)
        st.success(f"Successfully processed and indexed document: {doc_url}")
        return namespace


# --- Streamlit UI ---
st.title("📄 Insurance DocAI: Your Insurance Policy Expert")
st.markdown("Enter the URL of an insurance policy document (PDF) and ask questions about it.")

# Initialize session state for conversation history (survives reruns).
if "messages" not in st.session_state:
    st.session_state.messages = []

# Input for document URL
doc_url = st.text_input(
    "Enter the Document URL",
    placeholder="https://your-document-url.pdf",
    key="doc_url_input",
)

# BUG FIX: namespace was only bound inside `if doc_url:` while the chat input
# below runs unconditionally, so asking a question before entering a URL
# raised a NameError at index.query(...). Default to None and guard below.
namespace = None
if doc_url:
    # Process the document and get the namespace (cached per URL).
    namespace = process_document(doc_url)
    if namespace:
        st.info("Document is ready. You can now ask questions below.")

# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Accept user input
if prompt := st.chat_input("Ask a question about the policy"):
    if not namespace:
        # Graceful guard instead of the former NameError crash.
        st.warning("Please enter a document URL above before asking questions.")
        st.stop()

    # Add user message to chat history and echo it in the chat container.
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        message_placeholder = st.empty()
        with st.spinner("Thinking..."):
            # 1. Embed the question with the same embedding model used at
            #    index time so query/document vectors are comparable.
            question_embedding_response = genai.embed_content(
                model="models/embedding-001",
                content=prompt,
                task_type="retrieval_query",
            )
            question_embedding = question_embedding_response['embedding']

            # 2. Query Pinecone for the most relevant chunks in this
            #    document's namespace.
            index = pc.Index(INDEX_NAME)
            search_results = index.query(
                vector=question_embedding,
                top_k=5,
                include_metadata=True,
                namespace=namespace,
            )

            # 3. Assemble the context and generate the grounded answer.
            context_chunks = [match.metadata['text'] for match in search_results.matches]
            context = "\n\n".join(context_chunks)
            answer = generate_answer_with_gemini(prompt, context)
            message_placeholder.markdown(answer)

    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": answer})