Spaces:
Sleeping
Sleeping
Initial project commit with app files
Browse files- .gitattributes +0 -35
- Dockerfile +0 -20
- README.md +0 -20
- app.py +168 -0
- data_processor.py +189 -0
- packages.txt +1 -0
- requirements.txt +7 -3
- src/streamlit_app.py +0 -40
.gitattributes
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dockerfile
DELETED
|
@@ -1,20 +0,0 @@
|
|
| 1 |
-
FROM python:3.13.5-slim
|
| 2 |
-
|
| 3 |
-
WORKDIR /app
|
| 4 |
-
|
| 5 |
-
RUN apt-get update && apt-get install -y \
|
| 6 |
-
build-essential \
|
| 7 |
-
curl \
|
| 8 |
-
git \
|
| 9 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
-
|
| 11 |
-
COPY requirements.txt ./
|
| 12 |
-
COPY src/ ./src/
|
| 13 |
-
|
| 14 |
-
RUN pip3 install -r requirements.txt
|
| 15 |
-
|
| 16 |
-
EXPOSE 8501
|
| 17 |
-
|
| 18 |
-
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 19 |
-
|
| 20 |
-
ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
DELETED
|
@@ -1,20 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Insurance DocAI
|
| 3 |
-
emoji: 🚀
|
| 4 |
-
colorFrom: red
|
| 5 |
-
colorTo: red
|
| 6 |
-
sdk: docker
|
| 7 |
-
app_port: 8501
|
| 8 |
-
tags:
|
| 9 |
-
- streamlit
|
| 10 |
-
pinned: false
|
| 11 |
-
short_description: HackRx 6.0- Bajaj Finserv Annual Flagship Hackathon
|
| 12 |
-
license: mit
|
| 13 |
-
---
|
| 14 |
-
|
| 15 |
-
# Welcome to Streamlit!
|
| 16 |
-
|
| 17 |
-
Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
|
| 18 |
-
|
| 19 |
-
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
|
| 20 |
-
forums](https://discuss.streamlit.io).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import hashlib
|
| 4 |
+
import time
|
| 5 |
+
from pinecone import Pinecone
|
| 6 |
+
import google.generativeai as genai
|
| 7 |
+
|
| 8 |
+
# Import your data processing functions
|
| 9 |
+
from data_processor import (
|
| 10 |
+
get_document_text,
|
| 11 |
+
split_text_into_chunks,
|
| 12 |
+
generate_embeddings,
|
| 13 |
+
index_chunks_in_pinecone,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
# --- Page Configuration ---
st.set_page_config(
    page_title="ClarityClaim AI 🤖",
    page_icon="📄",
    layout="wide"
)

# --- API and Client Initialization ---
# Use st.secrets for secure handling of API keys on Streamlit Cloud/Hugging Face
try:
    GOOGLE_API_KEY = st.secrets["GOOGLE_API_KEY"]
    PINECONE_API_KEY = st.secrets["PINECONE_API_KEY"]

    genai.configure(api_key=GOOGLE_API_KEY)
    pc = Pinecone(api_key=PINECONE_API_KEY)
    # Single shared Pinecone index; each document gets its own namespace in it.
    INDEX_NAME = "hackrx-policy-index"

except Exception as e:
    # Missing secrets (KeyError) and client-construction failures both land
    # here; the app cannot work without keys, so halt this script run.
    st.error("🚨 Could not find API keys. Please add them to the secrets management in your deployment environment.", icon="🚨")
    st.stop()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# --- Helper Functions (adapted from your main.py) ---
|
| 39 |
+
|
| 40 |
+
def create_doc_id_from_url(url: str) -> str:
    """Derive a deterministic document ID (Pinecone namespace) from a URL.

    The SHA-256 hex digest of the UTF-8-encoded URL is stable across runs,
    so the same document URL always maps to the same namespace.
    """
    encoded = url.encode('utf-8')
    return hashlib.sha256(encoded).hexdigest()
|
| 43 |
+
|
| 44 |
+
def generate_answer_with_gemini(question: str, context: str) -> str:
    """Generates an answer using Gemini based on the provided context.

    Args:
        question: The user's natural-language question.
        context: Retrieved policy chunks joined into a single string.

    Returns:
        The model's answer text, or a human-readable error / empty-response
        message. This function never raises; API failures are returned as
        strings so the chat UI can display them.
    """
    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    # The prompt restricts the model to the retrieved context only (to limit
    # hallucination) and tells it to say so when the answer is absent.
    prompt = f"""
    You are an expert insurance policy analyst.
    Based ONLY on the context provided below from an insurance document, answer the user's question concisely.
    Do not use any external knowledge or make assumptions.
    If the answer cannot be found in the provided context, state that clearly.

    CONTEXT:
    ---
    {context}
    ---

    QUESTION: {question}

    ANSWER:
    """
    try:
        response = model.generate_content(prompt)
        # response.parts is empty when the model returned nothing (e.g. the
        # reply was blocked); accessing .text would raise in that case.
        return response.text.strip() if response.parts else "The model's response was empty."
    except Exception as e:
        return f"An error occurred while generating the answer: {e}"
|
| 67 |
+
|
| 68 |
+
# --- Caching ---
|
| 69 |
+
# Use Streamlit's caching to avoid re-processing the same document repeatedly.
|
| 70 |
+
@st.cache_data(show_spinner=False)
def process_document(doc_url):
    """
    Full pipeline: Downloads, chunks, embeds, and indexes a document.
    This function is cached, so it only runs once per URL.

    Returns the Pinecone namespace (a hash of the URL) on success, or None
    if any stage fails.

    NOTE(review): st.cache_data also caches the None returned on failure, so
    a transient download error is remembered for this URL until the cache is
    cleared — confirm whether that is intended.
    """
    with st.spinner(f"Processing document: {doc_url}... This may take a moment."):
        # The namespace isolates this document's vectors inside the shared index.
        namespace = create_doc_id_from_url(doc_url)
        index = pc.Index(INDEX_NAME)

        # Check if the document is already processed by checking the namespace
        stats = index.describe_index_stats()
        if stats.get('namespaces', {}).get(namespace, {}).get('vector_count', 0) > 0:
            st.success(f"Document '{doc_url}' is already processed and ready for questions.")
            return namespace

        # Full processing pipeline: download/extract -> chunk -> embed -> index
        document_text = get_document_text(doc_url)
        if not document_text:
            st.error("Failed to retrieve or extract text from the document.")
            return None

        chunks = split_text_into_chunks(document_text)
        if not chunks:
            st.error("Failed to split document into chunks.")
            return None

        embeddings = generate_embeddings(chunks)
        if not embeddings:
            st.error("Failed to generate embeddings.")
            return None

        index_chunks_in_pinecone(chunks, embeddings, INDEX_NAME, namespace=namespace)
        st.success(f"Successfully processed and indexed document: {doc_url}")
        return namespace
|
| 105 |
+
|
| 106 |
+
# --- Streamlit UI ---

st.title("📄 ClarityClaim AI: Your Insurance Policy Expert")
st.markdown("Enter the URL of an insurance policy document (PDF) and ask questions about it.")

# Initialize session state for conversation history (survives Streamlit reruns)
if "messages" not in st.session_state:
    st.session_state.messages = []

# Input for document URL
doc_url = st.text_input("Enter the Document URL", placeholder="https://your-document-url.pdf", key="doc_url_input")

if doc_url:
    # Process the document and get the namespace (cached per URL, see above)
    namespace = process_document(doc_url)

    if namespace:
        st.info("Document is ready. You can now ask questions below.")

        # Display chat messages from history on app rerun
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

        # Accept user input
        if prompt := st.chat_input("Ask a question about the policy"):
            # Add user message to chat history
            st.session_state.messages.append({"role": "user", "content": prompt})
            # Display user message in chat message container
            with st.chat_message("user"):
                st.markdown(prompt)

            # Display assistant response in chat message container
            with st.chat_message("assistant"):
                message_placeholder = st.empty()
                with st.spinner("Thinking..."):
                    # 1. Generate embedding for the question.
                    # task_type="retrieval_query" marks this as a query-side
                    # embedding for retrieval against the indexed documents.
                    question_embedding_response = genai.embed_content(
                        model="models/embedding-001",
                        content=prompt,
                        task_type="retrieval_query"
                    )
                    question_embedding = question_embedding_response['embedding']

                    # 2. Query Pinecone for relevant context, restricted to
                    # this document's namespace.
                    index = pc.Index(INDEX_NAME)
                    search_results = index.query(
                        vector=question_embedding,
                        top_k=5,
                        include_metadata=True,
                        namespace=namespace
                    )

                    # 3. Assemble the context and generate the answer
                    context_chunks = [match.metadata['text'] for match in search_results.matches]
                    context = "\n\n".join(context_chunks)

                    answer = generate_answer_with_gemini(prompt, context)

                    message_placeholder.markdown(answer)

            # Add assistant response to chat history
            st.session_state.messages.append({"role": "assistant", "content": answer})
|
data_processor.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import fitz
|
| 3 |
+
import textwrap
|
| 4 |
+
import os
|
| 5 |
+
import google.generativeai as genai
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
from pinecone import Pinecone, ServerlessSpec
|
| 8 |
+
import hashlib
|
| 9 |
+
import time
|
| 10 |
+
|
| 11 |
+
# Load environment variables from .env file
|
| 12 |
+
load_dotenv()
|
| 13 |
+
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
|
| 14 |
+
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
|
| 15 |
+
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT")
|
| 16 |
+
|
| 17 |
+
# Initialize clients
|
| 18 |
+
genai.configure(api_key=GOOGLE_API_KEY)
|
| 19 |
+
pc = Pinecone(api_key=PINECONE_API_KEY)
|
| 20 |
+
|
| 21 |
+
# --- CORRECTED FUNCTION: Handles both URLs and binary file content ---
|
| 22 |
+
def get_document_text(source) -> str:
    """
    Extracts text from a PDF document, handling either a URL or raw binary content.

    Args:
        source: Either a URL string to download, or the raw ``bytes`` of an
            already-uploaded file.

    Returns:
        The concatenated text of every page, or "" on any failure
        (errors are printed, not raised).
    """
    document_content = None

    if isinstance(source, str):  # If the source is a URL string
        print(f"Downloading document from {source}...")
        try:
            # requests has NO default timeout; without one a dead host hangs
            # the app forever, so bound the download explicitly.
            response = requests.get(source, timeout=60)
            response.raise_for_status()
            document_content = response.content
        except requests.exceptions.RequestException as e:
            print(f"Error downloading the document: {e}")
            return ""
    elif isinstance(source, bytes):  # If the source is raw file content (from upload)
        print("Processing uploaded document content...")
        document_content = source
    else:
        print("Invalid source type provided to get_document_text.")
        return ""

    if not document_content:
        return ""

    print("Extracting text from the document...")
    document_text = ""
    try:
        # fitz (PyMuPDF) can open a PDF directly from an in-memory stream.
        pdf_document = fitz.open(stream=document_content, filetype="pdf")
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            document_text += page.get_text()
    except Exception as e:
        print(f"Error extracting text: {e}")
        return ""

    return document_text
|
| 59 |
+
|
| 60 |
+
def create_document_id(source: str) -> str:
    """Return a stable SHA-256 hex digest of *source* for use as a document ID."""
    digest = hashlib.sha256(source.encode())
    return digest.hexdigest()
|
| 63 |
+
|
| 64 |
+
def split_text_into_chunks(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[str]:
    """
    Splits a large text document into smaller, overlapping chunks using a
    recursive strategy.

    The text is split on the coarsest separator first; any piece still larger
    than chunk_size is re-split with the next, finer separator. Adjacent
    pieces are then greedily glued back together (re-inserting the current
    separator) while the running chunk fits within chunk_size + chunk_overlap.
    Whitespace-only chunks are dropped.
    """
    separators = ["\n\n", "\n", ". ", " "]

    def _split(piece, seps):
        # No separators left: hard-wrap at chunk_size as a last resort.
        if not seps:
            return textwrap.wrap(piece, chunk_size)

        sep, finer = seps[0], seps[1:]

        fragments = []
        for fragment in piece.split(sep):
            if len(fragment) > chunk_size:
                fragments.extend(_split(fragment, finer))
            else:
                fragments.append(fragment)

        if not fragments:
            return []

        # Greedy re-merge pass at this separator level.
        merged = []
        running = fragments[0]
        for fragment in fragments[1:]:
            if len(running) + len(fragment) <= chunk_size + chunk_overlap:
                running = running + sep + fragment
            else:
                merged.append(running)
                running = fragment
        merged.append(running)

        return [m for m in merged if m.strip()]

    return _split(text, separators)
|
| 101 |
+
|
| 102 |
+
def generate_embeddings(text_chunks: list[str]) -> list:
    """
    Generates vector embeddings for a list of text chunks using the Gemini
    embedding API (models/embedding-001).

    Chunks are sent in batches of at most 100 because the embed_content
    endpoint rejects larger batches; a full policy document easily exceeds
    that, which made the previous single-call version fail. Results are
    concatenated in the original chunk order.

    Returns:
        A list of embedding vectors (one per chunk), or an empty/partial list
        if an API call fails (the error is printed, not raised).
    """
    print(f"Generating embeddings for {len(text_chunks)} chunks using Gemini...")
    embeddings = []
    batch_size = 100  # Gemini batch-embedding limit per request
    try:
        for start in range(0, len(text_chunks), batch_size):
            batch = text_chunks[start:start + batch_size]
            response = genai.embed_content(
                model="models/embedding-001",
                content=batch
            )
            # With a list input, 'embedding' is a list of vectors.
            embeddings.extend(response['embedding'])
        print("Embeddings generated successfully.")
    except Exception as e:
        print(f"Error generating embeddings: {e}")

    return embeddings
|
| 119 |
+
|
| 120 |
+
def index_chunks_in_pinecone(chunks: list[str], embeddings: list, index_name: str, namespace: str):
    """
    Indexes the text chunks and their embeddings in a specific Pinecone namespace.

    Args:
        chunks: The text chunks (stored as metadata alongside each vector).
        embeddings: One embedding vector per chunk, in the same order.
        index_name: Name of the Pinecone index; created on first use.
        namespace: Namespace isolating this document's vectors in the index.

    Errors are printed rather than raised.
    """
    print(f"Indexing {len(chunks)} chunks in Pinecone index '{index_name}' under namespace '{namespace}'...")
    try:
        # Check if index exists, and create if it doesn't
        if index_name not in pc.list_indexes().names():
            print(f"Creating new Pinecone index: '{index_name}'")
            pc.create_index(
                name=index_name,
                # Dimension is inferred from the first embedding vector.
                dimension=len(embeddings[0]),
                metric='cosine',
                spec=ServerlessSpec(cloud='aws', region='us-east-1')
            )
            print("Index created successfully. Waiting for it to become ready...")
            # Wait for index to be ready (poll once per second)
            while not pc.describe_index(index_name).status.ready:
                time.sleep(1)

        index = pc.Index(index_name)

        # Prepare data for upsert
        vectors_to_upsert = []
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            vectors_to_upsert.append({
                "id": f"chunk-{namespace}-{i}",  # Make ID unique across namespaces
                "values": embedding,
                "metadata": {"text": chunk}
            })

        # Upsert in batches to keep each request within Pinecone size limits
        batch_size = 100
        for i in range(0, len(vectors_to_upsert), batch_size):
            batch = vectors_to_upsert[i:i + batch_size]
            index.upsert(vectors=batch, namespace=namespace)  # <-- USE THE NAMESPACE
            print(f"Upserted batch {i // batch_size + 1} into namespace '{namespace}'")

        print(f"Successfully indexed {len(chunks)} chunks in namespace '{namespace}'.")
        # Give a moment for the index to become queryable
        time.sleep(5)

    except Exception as e:
        print(f"Error indexing in Pinecone: {e}")
|
| 164 |
+
|
| 165 |
+
if __name__ == "__main__":
    # Standalone smoke test: download a sample policy PDF, chunk it, embed the
    # chunks, and index them under a namespace derived from the URL.
    sample_url = "https://hackrx.blob.core.windows.net/assets/hackrx_6/policies/BAJHLIP23020V012223.pdf?sv=2023-01-03&st=2025-07-30T06%3A46%3A49Z&se=2025-09-01T06%3A46%3A00Z&sr=c&sp=rl&sig=9szykRKdGYj0BVm1skP%2BX8N9%2FRENEn2k7MQPUp33jyQ%3D"
    index_name = "hackrx-policy-index"

    document_content = get_document_text(sample_url)

    if document_content:
        chunks = split_text_into_chunks(document_content)
        print(f"\n--- Document Split into {len(chunks)} Chunks ---")

        embeddings = generate_embeddings(chunks)

        if embeddings:
            print(f"Generated {len(embeddings)} embeddings.")
            print(f"Size of each embedding vector: {len(embeddings[0])}")

            # Index the chunks in Pinecone
            print("--- Running standalone script test ---")
            test_namespace = create_document_id(sample_url)  # Use the new function!
            index_chunks_in_pinecone(chunks, embeddings, index_name, namespace=test_namespace)
        else:
            print("Failed to generate embeddings. Pinecone indexing skipped.")

    else:
        print("Failed to process document content.")
|
packages.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
poppler-utils
|
requirements.txt
CHANGED
|
@@ -1,3 +1,7 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
requests
|
| 3 |
+
pymupdf
|
| 4 |
+
google-generativeai
|
| 5 |
+
python-dotenv
|
| 6 |
+
pinecone-client
|
| 7 |
+
# hashlib is a Python standard-library module — it must not be listed as a pip dependency
|
src/streamlit_app.py
DELETED
|
@@ -1,40 +0,0 @@
|
|
| 1 |
-
import altair as alt
|
| 2 |
-
import numpy as np
|
| 3 |
-
import pandas as pd
|
| 4 |
-
import streamlit as st
|
| 5 |
-
|
| 6 |
-
"""
|
| 7 |
-
# Welcome to Streamlit!
|
| 8 |
-
|
| 9 |
-
Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
|
| 10 |
-
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
|
| 11 |
-
forums](https://discuss.streamlit.io).
|
| 12 |
-
|
| 13 |
-
In the meantime, below is an example of what you can do with just a few lines of code:
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
|
| 17 |
-
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
|
| 18 |
-
|
| 19 |
-
indices = np.linspace(0, 1, num_points)
|
| 20 |
-
theta = 2 * np.pi * num_turns * indices
|
| 21 |
-
radius = indices
|
| 22 |
-
|
| 23 |
-
x = radius * np.cos(theta)
|
| 24 |
-
y = radius * np.sin(theta)
|
| 25 |
-
|
| 26 |
-
df = pd.DataFrame({
|
| 27 |
-
"x": x,
|
| 28 |
-
"y": y,
|
| 29 |
-
"idx": indices,
|
| 30 |
-
"rand": np.random.randn(num_points),
|
| 31 |
-
})
|
| 32 |
-
|
| 33 |
-
st.altair_chart(alt.Chart(df, height=700, width=700)
|
| 34 |
-
.mark_point(filled=True)
|
| 35 |
-
.encode(
|
| 36 |
-
x=alt.X("x", axis=None),
|
| 37 |
-
y=alt.Y("y", axis=None),
|
| 38 |
-
color=alt.Color("idx", legend=None, scale=alt.Scale()),
|
| 39 |
-
size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
|
| 40 |
-
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|