import os
import streamlit as st
import hashlib
import time
from pinecone import Pinecone
import google.generativeai as genai
# Import your data processing functions
from data_processor import (
get_document_text,
split_text_into_chunks,
generate_embeddings,
index_chunks_in_pinecone,
)
# --- Page Configuration ---
# Must be the first Streamlit command executed in the script.
st.set_page_config(
    page_title="Insurance DocAI π€",  # NOTE(review): "π€" looks like a mojibake'd emoji — confirm intended glyph
    page_icon="π",  # NOTE(review): likewise looks mis-encoded — confirm
    layout="wide"
)

# --- API and Client Initialization ---
# Use st.secrets for secure handling of API keys on Streamlit Cloud/Hugging Face
try:
    GOOGLE_API_KEY = st.secrets["GOOGLE_API_KEY"]
    PINECONE_API_KEY = st.secrets["PINECONE_API_KEY"]
    genai.configure(api_key=GOOGLE_API_KEY)
    pc = Pinecone(api_key=PINECONE_API_KEY)
    # Name of the pre-created Pinecone index all documents are stored in.
    INDEX_NAME = "hackrx-policy-index"
except Exception as e:
    # Missing/invalid secrets make the whole app unusable — halt rendering here.
    st.error("π¨ Could not find API keys. Please add them to the secrets management in your deployment environment.", icon="π¨")
    st.stop()
# --- Helper Functions (adapted from your main.py) ---
def create_doc_id_from_url(url: str) -> str:
    """Derive a deterministic document ID (Pinecone namespace) from a URL.

    The same URL always maps to the same SHA-256 hex digest, so repeated
    submissions of one document land in one namespace.
    """
    digest = hashlib.sha256(url.encode('utf-8'))
    return digest.hexdigest()
def generate_answer_with_gemini(question: str, context: str) -> str:
    """Generates an answer using Gemini based on the provided context.

    Args:
        question: The user's natural-language question.
        context: Concatenated document chunks retrieved from Pinecone.

    Returns:
        The model's answer text, or a human-readable error string when the
        call fails or the response carries no parts.
    """
    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    # The prompt lines are kept unindented on purpose: leading whitespace
    # inside the triple-quoted f-string would become part of the prompt text.
    prompt = f"""
You are an expert insurance policy analyst.
Based ONLY on the context provided below from an insurance document, answer the user's question concisely.
Do not use any external knowledge or make assumptions.
If the answer cannot be found in the provided context, state that clearly.
CONTEXT:
---
{context}
---
QUESTION: {question}
ANSWER:
"""
    try:
        response = model.generate_content(prompt)
        # response.parts is empty when the model returned no usable candidate
        # (e.g. blocked output); accessing .text would raise in that case.
        return response.text.strip() if response.parts else "The model's response was empty."
    except Exception as e:
        # Surface API/network failures as a chat answer instead of crashing the UI.
        return f"An error occurred while generating the answer: {e}"
# --- Caching ---
# Successful results are memoized per session in st.session_state rather than
# with @st.cache_data: st.cache_data caches the function's return value even
# when it is None, so a transient download/embedding failure would be cached
# permanently for that URL and could never be retried.
def process_document(doc_url):
    """
    Full pipeline: Downloads, chunks, embeds, and indexes a document.

    Args:
        doc_url: Public URL of the policy document (PDF).

    Returns:
        The Pinecone namespace (SHA-256 of the URL) on success, or None when
        any stage of the pipeline fails. Only successes are cached, so a
        failed URL is retried on the next rerun.
    """
    cache = st.session_state.setdefault("_processed_docs", {})
    if doc_url in cache:
        return cache[doc_url]

    with st.spinner(f"Processing document: {doc_url}... This may take a moment."):
        namespace = create_doc_id_from_url(doc_url)
        index = pc.Index(INDEX_NAME)

        # If vectors already exist under this namespace, the document was
        # indexed previously (possibly in another session) — skip the pipeline.
        stats = index.describe_index_stats()
        if stats.get('namespaces', {}).get(namespace, {}).get('vector_count', 0) > 0:
            st.success(f"Document '{doc_url}' is already processed and ready for questions.")
            cache[doc_url] = namespace
            return namespace

        # Full processing pipeline; each stage bails out with a visible error.
        document_text = get_document_text(doc_url)
        if not document_text:
            st.error("Failed to retrieve or extract text from the document.")
            return None
        chunks = split_text_into_chunks(document_text)
        if not chunks:
            st.error("Failed to split document into chunks.")
            return None
        embeddings = generate_embeddings(chunks)
        if not embeddings:
            st.error("Failed to generate embeddings.")
            return None

        index_chunks_in_pinecone(chunks, embeddings, INDEX_NAME, namespace=namespace)
        st.success(f"Successfully processed and indexed document: {doc_url}")
        cache[doc_url] = namespace
        return namespace
# --- Streamlit UI ---
st.title("π Insurance DocAI: Your Insurance Policy Expert")  # NOTE(review): "π" looks like a mojibake'd emoji — confirm
st.markdown("Enter the URL of an insurance policy document (PDF) and ask questions about it.")

# Conversation history survives reruns via session state.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Input for document URL
doc_url = st.text_input("Enter the Document URL", placeholder="https://your-document-url.pdf", key="doc_url_input")

# namespace stays None until a document has been processed successfully.
# (Previously it was assigned only inside `if doc_url:`, so submitting a
# question before entering a URL raised a NameError at query time.)
namespace = None
if doc_url:
    namespace = process_document(doc_url)
    if namespace:
        st.info("Document is ready. You can now ask questions below.")

# Replay chat messages from history on every app rerun.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Accept user input
if prompt := st.chat_input("Ask a question about the policy"):
    # Record and display the user's message.
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        if namespace is None:
            # Guard: no successfully processed document to query yet.
            answer = "Please enter a valid document URL above and wait for processing to finish before asking questions."
            st.markdown(answer)
        else:
            message_placeholder = st.empty()
            with st.spinner("Thinking..."):
                # 1. Generate an embedding for the question.
                question_embedding_response = genai.embed_content(
                    model="models/embedding-001",
                    content=prompt,
                    task_type="retrieval_query"
                )
                question_embedding = question_embedding_response['embedding']

                # 2. Query Pinecone for relevant context, scoped to this
                #    document's namespace.
                index = pc.Index(INDEX_NAME)
                search_results = index.query(
                    vector=question_embedding,
                    top_k=5,
                    include_metadata=True,
                    namespace=namespace
                )

                # 3. Assemble the context and generate the answer.
                context_chunks = [match.metadata['text'] for match in search_results.matches]
                context = "\n\n".join(context_chunks)
                answer = generate_answer_with_gemini(prompt, context)
            message_placeholder.markdown(answer)

    # Add the assistant response to chat history.
    st.session_state.messages.append({"role": "assistant", "content": answer})