Spaces:
Sleeping
Sleeping
Initial project commit with app files
Browse files- .gitattributes +0 -35
- Dockerfile +0 -20
- README.md +0 -20
- app.py +168 -0
- data_processor.py +189 -0
- packages.txt +1 -0
- requirements.txt +7 -3
- src/streamlit_app.py +0 -40
.gitattributes
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dockerfile
DELETED
|
@@ -1,20 +0,0 @@
|
|
| 1 |
-
FROM python:3.13.5-slim
|
| 2 |
-
|
| 3 |
-
WORKDIR /app
|
| 4 |
-
|
| 5 |
-
RUN apt-get update && apt-get install -y \
|
| 6 |
-
build-essential \
|
| 7 |
-
curl \
|
| 8 |
-
git \
|
| 9 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
-
|
| 11 |
-
COPY requirements.txt ./
|
| 12 |
-
COPY src/ ./src/
|
| 13 |
-
|
| 14 |
-
RUN pip3 install -r requirements.txt
|
| 15 |
-
|
| 16 |
-
EXPOSE 8501
|
| 17 |
-
|
| 18 |
-
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 19 |
-
|
| 20 |
-
ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
DELETED
|
@@ -1,20 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Insurance DocAI
|
| 3 |
-
emoji: 🚀
|
| 4 |
-
colorFrom: red
|
| 5 |
-
colorTo: red
|
| 6 |
-
sdk: docker
|
| 7 |
-
app_port: 8501
|
| 8 |
-
tags:
|
| 9 |
-
- streamlit
|
| 10 |
-
pinned: false
|
| 11 |
-
short_description: HackRx 6.0- Bajaj Finserv Annual Flagship Hackathon
|
| 12 |
-
license: mit
|
| 13 |
-
---
|
| 14 |
-
|
| 15 |
-
# Welcome to Streamlit!
|
| 16 |
-
|
| 17 |
-
Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
|
| 18 |
-
|
| 19 |
-
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
|
| 20 |
-
forums](https://discuss.streamlit.io).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import hashlib
|
| 4 |
+
import time
|
| 5 |
+
from pinecone import Pinecone
|
| 6 |
+
import google.generativeai as genai
|
| 7 |
+
|
| 8 |
+
# Import your data processing functions
|
| 9 |
+
from data_processor import (
|
| 10 |
+
get_document_text,
|
| 11 |
+
split_text_into_chunks,
|
| 12 |
+
generate_embeddings,
|
| 13 |
+
index_chunks_in_pinecone,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
# --- Page Configuration ---
st.set_page_config(
    page_title="ClarityClaim AI 🤖",
    page_icon="📄",
    layout="wide"
)

# --- API and Client Initialization ---
# Use st.secrets for secure handling of API keys on Streamlit Cloud/Hugging Face
try:
    GOOGLE_API_KEY = st.secrets["GOOGLE_API_KEY"]
    PINECONE_API_KEY = st.secrets["PINECONE_API_KEY"]

    genai.configure(api_key=GOOGLE_API_KEY)
    pc = Pinecone(api_key=PINECONE_API_KEY)
    # Single shared Pinecone index; each document gets its own namespace in it.
    INDEX_NAME = "hackrx-policy-index"

except Exception as e:
    # Missing secrets (KeyError) and client-construction failures both land
    # here; the app cannot work without keys, so halt this script run.
    st.error("🚨 Could not find API keys. Please add them to the secrets management in your deployment environment.", icon="🚨")
    st.stop()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# --- Helper Functions (adapted from your main.py) ---
|
| 39 |
+
|
| 40 |
+
def create_doc_id_from_url(url: str) -> str:
    """Derive a deterministic document ID (Pinecone namespace) from a URL.

    The SHA-256 hex digest of the UTF-8-encoded URL is stable across runs,
    so the same document URL always maps to the same namespace.
    """
    encoded = url.encode('utf-8')
    return hashlib.sha256(encoded).hexdigest()
|
| 43 |
+
|
| 44 |
+
def generate_answer_with_gemini(question: str, context: str) -> str:
    """Generates an answer using Gemini based on the provided context.

    Args:
        question: The user's natural-language question.
        context: Retrieved policy chunks joined into a single string.

    Returns:
        The model's answer text, or a human-readable error / empty-response
        message. This function never raises; API failures are returned as
        strings so the chat UI can display them.
    """
    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    # The prompt restricts the model to the retrieved context only (to limit
    # hallucination) and tells it to say so when the answer is absent.
    prompt = f"""
    You are an expert insurance policy analyst.
    Based ONLY on the context provided below from an insurance document, answer the user's question concisely.
    Do not use any external knowledge or make assumptions.
    If the answer cannot be found in the provided context, state that clearly.

    CONTEXT:
    ---
    {context}
    ---

    QUESTION: {question}

    ANSWER:
    """
    try:
        response = model.generate_content(prompt)
        # response.parts is empty when the model returned nothing (e.g. the
        # reply was blocked); accessing .text would raise in that case.
        return response.text.strip() if response.parts else "The model's response was empty."
    except Exception as e:
        return f"An error occurred while generating the answer: {e}"
|
| 67 |
+
|
| 68 |
+
# --- Caching ---
|
| 69 |
+
# Use Streamlit's caching to avoid re-processing the same document repeatedly.
|
| 70 |
+
@st.cache_data(show_spinner=False)
def process_document(doc_url):
    """
    Full pipeline: Downloads, chunks, embeds, and indexes a document.
    This function is cached, so it only runs once per URL.

    Returns the Pinecone namespace (a hash of the URL) on success, or None
    if any stage fails.

    NOTE(review): st.cache_data also caches the None returned on failure, so
    a transient download error is remembered for this URL until the cache is
    cleared — confirm whether that is intended.
    """
    with st.spinner(f"Processing document: {doc_url}... This may take a moment."):
        # The namespace isolates this document's vectors inside the shared index.
        namespace = create_doc_id_from_url(doc_url)
        index = pc.Index(INDEX_NAME)

        # Check if the document is already processed by checking the namespace
        stats = index.describe_index_stats()
        if stats.get('namespaces', {}).get(namespace, {}).get('vector_count', 0) > 0:
            st.success(f"Document '{doc_url}' is already processed and ready for questions.")
            return namespace

        # Full processing pipeline: download/extract -> chunk -> embed -> index
        document_text = get_document_text(doc_url)
        if not document_text:
            st.error("Failed to retrieve or extract text from the document.")
            return None

        chunks = split_text_into_chunks(document_text)
        if not chunks:
            st.error("Failed to split document into chunks.")
            return None

        embeddings = generate_embeddings(chunks)
        if not embeddings:
            st.error("Failed to generate embeddings.")
            return None

        index_chunks_in_pinecone(chunks, embeddings, INDEX_NAME, namespace=namespace)
        st.success(f"Successfully processed and indexed document: {doc_url}")
        return namespace
|
| 105 |
+
|
| 106 |
+
# --- Streamlit UI ---

st.title("📄 ClarityClaim AI: Your Insurance Policy Expert")
st.markdown("Enter the URL of an insurance policy document (PDF) and ask questions about it.")

# Initialize session state for conversation history (survives Streamlit reruns)
if "messages" not in st.session_state:
    st.session_state.messages = []

# Input for document URL
doc_url = st.text_input("Enter the Document URL", placeholder="https://your-document-url.pdf", key="doc_url_input")

if doc_url:
    # Process the document and get the namespace (cached per URL, see above)
    namespace = process_document(doc_url)

    if namespace:
        st.info("Document is ready. You can now ask questions below.")

        # Display chat messages from history on app rerun
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

        # Accept user input
        if prompt := st.chat_input("Ask a question about the policy"):
            # Add user message to chat history
            st.session_state.messages.append({"role": "user", "content": prompt})
            # Display user message in chat message container
            with st.chat_message("user"):
                st.markdown(prompt)

            # Display assistant response in chat message container
            with st.chat_message("assistant"):
                message_placeholder = st.empty()
                with st.spinner("Thinking..."):
                    # 1. Generate embedding for the question.
                    # task_type="retrieval_query" marks this as a query-side
                    # embedding for retrieval against the indexed documents.
                    question_embedding_response = genai.embed_content(
                        model="models/embedding-001",
                        content=prompt,
                        task_type="retrieval_query"
                    )
                    question_embedding = question_embedding_response['embedding']

                    # 2. Query Pinecone for relevant context, restricted to
                    # this document's namespace.
                    index = pc.Index(INDEX_NAME)
                    search_results = index.query(
                        vector=question_embedding,
                        top_k=5,
                        include_metadata=True,
                        namespace=namespace
                    )

                    # 3. Assemble the context and generate the answer
                    context_chunks = [match.metadata['text'] for match in search_results.matches]
                    context = "\n\n".join(context_chunks)

                    answer = generate_answer_with_gemini(prompt, context)

                    message_placeholder.markdown(answer)

            # Add assistant response to chat history
            st.session_state.messages.append({"role": "assistant", "content": answer})
|
data_processor.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import fitz
|
| 3 |
+
import textwrap
|
| 4 |
+
import os
|
| 5 |
+
import google.generativeai as genai
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
from pinecone import Pinecone, ServerlessSpec
|
| 8 |
+
import hashlib
|
| 9 |
+
import time
|
| 10 |
+
|
| 11 |
+
# Load environment variables from .env file
|
| 12 |
+
load_dotenv()
|
| 13 |
+
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
|
| 14 |
+
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
|
| 15 |
+
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT")
|
| 16 |
+
|
| 17 |
+
# Initialize clients
|
| 18 |
+
genai.configure(api_key=GOOGLE_API_KEY)
|
| 19 |
+
pc = Pinecone(api_key=PINECONE_API_KEY)
|
| 20 |
+
|
| 21 |
+
# --- CORRECTED FUNCTION: Handles both URLs and binary file content ---
|
| 22 |
+
def get_document_text(source) -> str:
    """
    Extracts text from a PDF document, handling either a URL or raw binary content.

    Args:
        source: Either a URL string to download, or the raw ``bytes`` of an
            already-uploaded file.

    Returns:
        The concatenated text of every page, or "" on any failure
        (errors are printed, not raised).
    """
    document_content = None

    if isinstance(source, str):  # If the source is a URL string
        print(f"Downloading document from {source}...")
        try:
            # requests has NO default timeout; without one a dead host hangs
            # the app forever, so bound the download explicitly.
            response = requests.get(source, timeout=60)
            response.raise_for_status()
            document_content = response.content
        except requests.exceptions.RequestException as e:
            print(f"Error downloading the document: {e}")
            return ""
    elif isinstance(source, bytes):  # If the source is raw file content (from upload)
        print("Processing uploaded document content...")
        document_content = source
    else:
        print("Invalid source type provided to get_document_text.")
        return ""

    if not document_content:
        return ""

    print("Extracting text from the document...")
    document_text = ""
    try:
        # fitz (PyMuPDF) can open a PDF directly from an in-memory stream.
        pdf_document = fitz.open(stream=document_content, filetype="pdf")
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            document_text += page.get_text()
    except Exception as e:
        print(f"Error extracting text: {e}")
        return ""

    return document_text
|
| 59 |
+
|
| 60 |
+
def create_document_id(source: str) -> str:
    """Return a stable SHA-256 hex digest of *source* for use as a document ID."""
    digest = hashlib.sha256(source.encode())
    return digest.hexdigest()
|
| 63 |
+
|
| 64 |
+
def split_text_into_chunks(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[str]:
    """
    Splits a large text document into smaller, overlapping chunks using a
    recursive strategy.

    The text is split on the coarsest separator first; any piece still larger
    than chunk_size is re-split with the next, finer separator. Adjacent
    pieces are then greedily glued back together (re-inserting the current
    separator) while the running chunk fits within chunk_size + chunk_overlap.
    Whitespace-only chunks are dropped.
    """
    separators = ["\n\n", "\n", ". ", " "]

    def _split(piece, seps):
        # No separators left: hard-wrap at chunk_size as a last resort.
        if not seps:
            return textwrap.wrap(piece, chunk_size)

        sep, finer = seps[0], seps[1:]

        fragments = []
        for fragment in piece.split(sep):
            if len(fragment) > chunk_size:
                fragments.extend(_split(fragment, finer))
            else:
                fragments.append(fragment)

        if not fragments:
            return []

        # Greedy re-merge pass at this separator level.
        merged = []
        running = fragments[0]
        for fragment in fragments[1:]:
            if len(running) + len(fragment) <= chunk_size + chunk_overlap:
                running = running + sep + fragment
            else:
                merged.append(running)
                running = fragment
        merged.append(running)

        return [m for m in merged if m.strip()]

    return _split(text, separators)
|
| 101 |
+
|
| 102 |
+
def generate_embeddings(text_chunks: list[str]) -> list:
    """
    Generates vector embeddings for a list of text chunks using the Gemini
    embedding API (models/embedding-001).

    Chunks are sent in batches of at most 100 because the embed_content
    endpoint rejects larger batches; a full policy document easily exceeds
    that, which made the previous single-call version fail. Results are
    concatenated in the original chunk order.

    Returns:
        A list of embedding vectors (one per chunk), or an empty/partial list
        if an API call fails (the error is printed, not raised).
    """
    print(f"Generating embeddings for {len(text_chunks)} chunks using Gemini...")
    embeddings = []
    batch_size = 100  # Gemini batch-embedding limit per request
    try:
        for start in range(0, len(text_chunks), batch_size):
            batch = text_chunks[start:start + batch_size]
            response = genai.embed_content(
                model="models/embedding-001",
                content=batch
            )
            # With a list input, 'embedding' is a list of vectors.
            embeddings.extend(response['embedding'])
        print("Embeddings generated successfully.")
    except Exception as e:
        print(f"Error generating embeddings: {e}")

    return embeddings
|
| 119 |
+
|
| 120 |
+
def index_chunks_in_pinecone(chunks: list[str], embeddings: list, index_name: str, namespace: str):
    """
    Indexes the text chunks and their embeddings in a specific Pinecone namespace.

    Args:
        chunks: The text chunks (stored as metadata alongside each vector).
        embeddings: One embedding vector per chunk, in the same order.
        index_name: Name of the Pinecone index; created on first use.
        namespace: Namespace isolating this document's vectors in the index.

    Errors are printed rather than raised.
    """
    print(f"Indexing {len(chunks)} chunks in Pinecone index '{index_name}' under namespace '{namespace}'...")
    try:
        # Check if index exists, and create if it doesn't
        if index_name not in pc.list_indexes().names():
            print(f"Creating new Pinecone index: '{index_name}'")
            pc.create_index(
                name=index_name,
                # Dimension is inferred from the first embedding vector.
                dimension=len(embeddings[0]),
                metric='cosine',
                spec=ServerlessSpec(cloud='aws', region='us-east-1')
            )
            print("Index created successfully. Waiting for it to become ready...")
            # Wait for index to be ready (poll once per second)
            while not pc.describe_index(index_name).status.ready:
                time.sleep(1)

        index = pc.Index(index_name)

        # Prepare data for upsert
        vectors_to_upsert = []
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            vectors_to_upsert.append({
                "id": f"chunk-{namespace}-{i}",  # Make ID unique across namespaces
                "values": embedding,
                "metadata": {"text": chunk}
            })

        # Upsert in batches to keep each request within Pinecone size limits
        batch_size = 100
        for i in range(0, len(vectors_to_upsert), batch_size):
            batch = vectors_to_upsert[i:i + batch_size]
            index.upsert(vectors=batch, namespace=namespace)  # <-- USE THE NAMESPACE
            print(f"Upserted batch {i // batch_size + 1} into namespace '{namespace}'")

        print(f"Successfully indexed {len(chunks)} chunks in namespace '{namespace}'.")
        # Give a moment for the index to become queryable
        time.sleep(5)

    except Exception as e:
        print(f"Error indexing in Pinecone: {e}")
|
| 164 |
+
|
| 165 |
+
if __name__ == "__main__":
    # Standalone smoke test: download a sample policy PDF, chunk it, embed the
    # chunks, and index them under a namespace derived from the URL.
    sample_url = "https://hackrx.blob.core.windows.net/assets/hackrx_6/policies/BAJHLIP23020V012223.pdf?sv=2023-01-03&st=2025-07-30T06%3A46%3A49Z&se=2025-09-01T06%3A46%3A00Z&sr=c&sp=rl&sig=9szykRKdGYj0BVm1skP%2BX8N9%2FRENEn2k7MQPUp33jyQ%3D"
    index_name = "hackrx-policy-index"

    document_content = get_document_text(sample_url)

    if document_content:
        chunks = split_text_into_chunks(document_content)
        print(f"\n--- Document Split into {len(chunks)} Chunks ---")

        embeddings = generate_embeddings(chunks)

        if embeddings:
            print(f"Generated {len(embeddings)} embeddings.")
            print(f"Size of each embedding vector: {len(embeddings[0])}")

            # Index the chunks in Pinecone
            print("--- Running standalone script test ---")
            test_namespace = create_document_id(sample_url)  # Use the new function!
            index_chunks_in_pinecone(chunks, embeddings, index_name, namespace=test_namespace)
        else:
            print("Failed to generate embeddings. Pinecone indexing skipped.")

    else:
        print("Failed to process document content.")
|
packages.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
poppler-utils
|
requirements.txt
CHANGED
|
@@ -1,3 +1,7 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
requests
|
| 3 |
+
pymupdf
|
| 4 |
+
google-generativeai
|
| 5 |
+
python-dotenv
|
| 6 |
+
pinecone-client
|
| 7 |
+
# hashlib is a Python standard-library module — it must not be listed as a pip dependency
|
src/streamlit_app.py
DELETED
|
@@ -1,40 +0,0 @@
|
|
| 1 |
-
import altair as alt
|
| 2 |
-
import numpy as np
|
| 3 |
-
import pandas as pd
|
| 4 |
-
import streamlit as st
|
| 5 |
-
|
| 6 |
-
"""
|
| 7 |
-
# Welcome to Streamlit!
|
| 8 |
-
|
| 9 |
-
Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
|
| 10 |
-
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
|
| 11 |
-
forums](https://discuss.streamlit.io).
|
| 12 |
-
|
| 13 |
-
In the meantime, below is an example of what you can do with just a few lines of code:
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
|
| 17 |
-
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
|
| 18 |
-
|
| 19 |
-
indices = np.linspace(0, 1, num_points)
|
| 20 |
-
theta = 2 * np.pi * num_turns * indices
|
| 21 |
-
radius = indices
|
| 22 |
-
|
| 23 |
-
x = radius * np.cos(theta)
|
| 24 |
-
y = radius * np.sin(theta)
|
| 25 |
-
|
| 26 |
-
df = pd.DataFrame({
|
| 27 |
-
"x": x,
|
| 28 |
-
"y": y,
|
| 29 |
-
"idx": indices,
|
| 30 |
-
"rand": np.random.randn(num_points),
|
| 31 |
-
})
|
| 32 |
-
|
| 33 |
-
st.altair_chart(alt.Chart(df, height=700, width=700)
|
| 34 |
-
.mark_point(filled=True)
|
| 35 |
-
.encode(
|
| 36 |
-
x=alt.X("x", axis=None),
|
| 37 |
-
y=alt.Y("y", axis=None),
|
| 38 |
-
color=alt.Color("idx", legend=None, scale=alt.Scale()),
|
| 39 |
-
size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
|
| 40 |
-
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|