File size: 6,441 Bytes
87c78a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
888e988
87c78a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
888e988
87c78a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import os
import streamlit as st
import hashlib
import time
from pinecone import Pinecone
import google.generativeai as genai

# Import your data processing functions
from data_processor import (
    get_document_text,
    split_text_into_chunks,
    generate_embeddings,
    index_chunks_in_pinecone,
)

# --- Page Configuration ---
st.set_page_config(
    page_title="Insurance DocAI πŸ€–",  # fixed mojibake: was "πŸ€–" (UTF-8 emoji decoded as cp1252)
    page_icon="πŸ“„",  # fixed mojibake: was "πŸ“„"
    layout="wide"
)

# --- API and Client Initialization ---
# Use st.secrets for secure handling of API keys on Streamlit Cloud/Hugging Face.
try:
    GOOGLE_API_KEY = st.secrets["GOOGLE_API_KEY"]
    PINECONE_API_KEY = st.secrets["PINECONE_API_KEY"]

    genai.configure(api_key=GOOGLE_API_KEY)
    pc = Pinecone(api_key=PINECONE_API_KEY)
    # Single shared index; each document gets its own namespace within it.
    INDEX_NAME = "hackrx-policy-index"

except Exception:
    # Missing secrets (KeyError) and client-init failures both land here;
    # the app is unusable without keys, so stop this script run outright.
    st.error("🚨 Could not find API keys. Please add them to the secrets management in your deployment environment.", icon="🚨")
    st.stop()


# --- Helper Functions (adapted from your main.py) ---

def create_doc_id_from_url(url: str) -> str:
    """Derive a stable document ID (used as a Pinecone namespace) from *url*.

    The same URL always yields the same SHA-256 hex digest, so a document
    is indexed into one and only one namespace.
    """
    hasher = hashlib.sha256(url.encode('utf-8'))
    return hasher.hexdigest()

def generate_answer_with_gemini(question: str, context: str) -> str:
    """Answer *question* using only the supplied *context* via Gemini.

    Returns the model's text, a placeholder message when the response has
    no parts, or an error description string if the API call fails (the
    exception is never propagated to the caller).
    """
    prompt = f"""
    You are an expert insurance policy analyst.
    Based ONLY on the context provided below from an insurance document, answer the user's question concisely.
    Do not use any external knowledge or make assumptions.
    If the answer cannot be found in the provided context, state that clearly.

    CONTEXT:
    ---
    {context}
    ---

    QUESTION: {question}

    ANSWER:
    """
    llm = genai.GenerativeModel('gemini-1.5-flash-latest')
    try:
        reply = llm.generate_content(prompt)
        if not reply.parts:
            return "The model's response was empty."
        return reply.text.strip()
    except Exception as exc:
        return f"An error occurred while generating the answer: {exc}"

# --- Caching ---
# Use Streamlit's caching to avoid re-processing the same document repeatedly.
@st.cache_data(show_spinner=False)
def process_document(doc_url: str):
    """
    Full pipeline: downloads, chunks, embeds, and indexes a document.

    Cached by Streamlit on ``doc_url``, so each URL is processed at most once
    per cache lifetime. Returns the Pinecone namespace (SHA-256 of the URL)
    on success, or None on any stage failure.

    NOTE(review): st.cache_data also memoizes the None failure result, so a
    transient download/embedding error will not be retried for the same URL
    until the cache is cleared — confirm this is acceptable.
    """
    with st.spinner(f"Processing document: {doc_url}... This may take a moment."):
        # One namespace per document keeps vectors from different URLs isolated.
        namespace = create_doc_id_from_url(doc_url)
        index = pc.Index(INDEX_NAME)
        
        # Fast path: skip the pipeline if this namespace already holds vectors.
        stats = index.describe_index_stats()
        if stats.get('namespaces', {}).get(namespace, {}).get('vector_count', 0) > 0:
            st.success(f"Document '{doc_url}' is already processed and ready for questions.")
            return namespace

        # Full processing pipeline; each stage reports an error and bails with None.
        document_text = get_document_text(doc_url)
        if not document_text:
            st.error("Failed to retrieve or extract text from the document.")
            return None
        
        chunks = split_text_into_chunks(document_text)
        if not chunks:
            st.error("Failed to split document into chunks.")
            return None
            
        embeddings = generate_embeddings(chunks)
        if not embeddings:
            st.error("Failed to generate embeddings.")
            return None
            
        index_chunks_in_pinecone(chunks, embeddings, INDEX_NAME, namespace=namespace)
        st.success(f"Successfully processed and indexed document: {doc_url}")
        return namespace

# --- Streamlit UI ---

st.title("πŸ“„ Insurance DocAI: Your Insurance Policy Expert")  # fixed mojibake: was "πŸ“„"
st.markdown("Enter the URL of an insurance policy document (PDF) and ask questions about it.")

# Initialize session state for conversation history (survives Streamlit reruns).
if "messages" not in st.session_state:
    st.session_state.messages = []

# Input for document URL; document processing is cached per URL.
doc_url = st.text_input("Enter the Document URL", placeholder="https://your-document-url.pdf", key="doc_url_input")

if doc_url:
    # Download/chunk/embed/index the document (no-op if already indexed).
    namespace = process_document(doc_url)
    
    if namespace:
        st.info("Document is ready. You can now ask questions below.")
        
        # Replay the conversation so far — the whole script reruns on every interaction.
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

        # Accept user input
        if prompt := st.chat_input("Ask a question about the policy"):
            # Record and echo the user's question.
            st.session_state.messages.append({"role": "user", "content": prompt})
            with st.chat_message("user"):
                st.markdown(prompt)

            # Display assistant response in chat message container
            with st.chat_message("assistant"):
                message_placeholder = st.empty()
                with st.spinner("Thinking..."):
                    # 1. Generate a retrieval-query embedding for the question.
                    question_embedding_response = genai.embed_content(
                        model="models/embedding-001",
                        content=prompt,
                        task_type="retrieval_query"
                    )
                    question_embedding = question_embedding_response['embedding']
                    
                    # 2. Query Pinecone for the most relevant chunks in this
                    #    document's namespace.
                    index = pc.Index(INDEX_NAME)
                    search_results = index.query(
                        vector=question_embedding,
                        top_k=5,
                        include_metadata=True,
                        namespace=namespace
                    )
                    
                    # 3. Assemble the context and generate the answer. Guard
                    #    against matches with missing/empty metadata instead of
                    #    crashing the app with a KeyError/TypeError.
                    context_chunks = [
                        match.metadata.get('text', '')
                        for match in search_results.matches
                        if match.metadata
                    ]
                    context = "\n\n".join(context_chunks)
                    
                    answer = generate_answer_with_gemini(prompt, context)
                    
                    message_placeholder.markdown(answer)
            
            # Persist the assistant's reply in the chat history.
            st.session_state.messages.append({"role": "assistant", "content": answer})