archis99 committed on
Commit
87c78a9
·
1 Parent(s): c6b1f4e

Initial project commit with app files

Browse files
Files changed (8) hide show
  1. .gitattributes +0 -35
  2. Dockerfile +0 -20
  3. README.md +0 -20
  4. app.py +168 -0
  5. data_processor.py +189 -0
  6. packages.txt +1 -0
  7. requirements.txt +7 -3
  8. src/streamlit_app.py +0 -40
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Dockerfile DELETED
@@ -1,20 +0,0 @@
1
- FROM python:3.13.5-slim
2
-
3
- WORKDIR /app
4
-
5
- RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- curl \
8
- git \
9
- && rm -rf /var/lib/apt/lists/*
10
-
11
- COPY requirements.txt ./
12
- COPY src/ ./src/
13
-
14
- RUN pip3 install -r requirements.txt
15
-
16
- EXPOSE 8501
17
-
18
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
-
20
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md DELETED
@@ -1,20 +0,0 @@
1
- ---
2
- title: Insurance DocAI
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: HackRx 6.0- Bajaj Finserv Annual Flagship Hackathon
12
- license: mit
13
- ---
14
-
15
- # Welcome to Streamlit!
16
-
17
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
18
-
19
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
20
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import streamlit as st
import hashlib
import time
from pinecone import Pinecone
import google.generativeai as genai

# Import your data processing functions
from data_processor import (
    get_document_text,
    split_text_into_chunks,
    generate_embeddings,
    index_chunks_in_pinecone,
)

# --- Page Configuration ---
st.set_page_config(
    page_title="ClarityClaim AI 🤖",
    page_icon="📄",
    layout="wide"
)

# --- API and Client Initialization ---
# Use st.secrets for secure handling of API keys on Streamlit Cloud/Hugging Face.
try:
    GOOGLE_API_KEY = st.secrets["GOOGLE_API_KEY"]
    PINECONE_API_KEY = st.secrets["PINECONE_API_KEY"]

    genai.configure(api_key=GOOGLE_API_KEY)
    pc = Pinecone(api_key=PINECONE_API_KEY)
    # Single shared index; per-document isolation is handled via namespaces.
    INDEX_NAME = "hackrx-policy-index"

except Exception as e:
    # Missing secrets raise KeyError, but client construction can fail for
    # other reasons too — surface the underlying error instead of always
    # claiming the keys are missing, then halt the app.
    st.error(f"🚨 Could not initialize API clients (check the API keys in your deployment's secrets): {e}", icon="🚨")
    st.stop()
37
+
38
+ # --- Helper Functions (adapted from your main.py) ---
39
+
40
def create_doc_id_from_url(url: str) -> str:
    """Derive a deterministic document ID (used as the Pinecone namespace).

    The same URL always maps to the same SHA-256 hex digest, so resubmitting
    a document reuses its existing namespace instead of re-indexing.
    """
    digest = hashlib.sha256(url.encode('utf-8'))
    return digest.hexdigest()
43
+
44
def generate_answer_with_gemini(question: str, context: str) -> str:
    """Answer *question* strictly from *context* using Gemini.

    Returns the model's trimmed answer text, a fixed message when the model
    returns no parts, or an error string if the API call raises — callers
    always receive a displayable string, never an exception.
    """
    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    prompt = f"""
    You are an expert insurance policy analyst.
    Based ONLY on the context provided below from an insurance document, answer the user's question concisely.
    Do not use any external knowledge or make assumptions.
    If the answer cannot be found in the provided context, state that clearly.

    CONTEXT:
    ---
    {context}
    ---

    QUESTION: {question}

    ANSWER:
    """
    try:
        response = model.generate_content(prompt)
        if not response.parts:
            return "The model's response was empty."
        return response.text.strip()
    except Exception as e:
        return f"An error occurred while generating the answer: {e}"
67
+
68
# --- Caching ---
# Use Streamlit's caching to avoid re-processing the same document repeatedly.
@st.cache_data(show_spinner=False)
def process_document(doc_url):
    """
    Full pipeline: Downloads, chunks, embeds, and indexes a document.
    This function is cached, so it only runs once per URL.

    Args:
        doc_url: Public URL of the PDF policy document.

    Returns:
        The Pinecone namespace (SHA-256 of the URL) on success, or None if
        any pipeline stage fails.

    NOTE(review): st.cache_data also caches a None (failure) result, so a
    transient failure will not be retried for the same URL until the cache
    is cleared — confirm this is acceptable.
    """
    with st.spinner(f"Processing document: {doc_url}... This may take a moment."):
        namespace = create_doc_id_from_url(doc_url)
        index = pc.Index(INDEX_NAME)

        # Check if the document is already processed by checking the namespace:
        # a non-zero vector count means a previous run indexed this URL.
        stats = index.describe_index_stats()
        if stats.get('namespaces', {}).get(namespace, {}).get('vector_count', 0) > 0:
            st.success(f"Document '{doc_url}' is already processed and ready for questions.")
            return namespace

        # Full processing pipeline: download/extract -> chunk -> embed -> index.
        # Each stage returns a falsy value on failure, reported to the UI here.
        document_text = get_document_text(doc_url)
        if not document_text:
            st.error("Failed to retrieve or extract text from the document.")
            return None

        chunks = split_text_into_chunks(document_text)
        if not chunks:
            st.error("Failed to split document into chunks.")
            return None

        embeddings = generate_embeddings(chunks)
        if not embeddings:
            st.error("Failed to generate embeddings.")
            return None

        index_chunks_in_pinecone(chunks, embeddings, INDEX_NAME, namespace=namespace)
        st.success(f"Successfully processed and indexed document: {doc_url}")
        return namespace
105
+
106
# --- Streamlit UI ---
# Flat script section: Streamlit re-runs this top-to-bottom on every user
# interaction, so chat history must live in st.session_state to survive reruns.

st.title("📄 ClarityClaim AI: Your Insurance Policy Expert")
st.markdown("Enter the URL of an insurance policy document (PDF) and ask questions about it.")

# Initialize session state for conversation history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Input for document URL
doc_url = st.text_input("Enter the Document URL", placeholder="https://your-document-url.pdf", key="doc_url_input")

if doc_url:
    # Process the document and get the namespace (cached — cheap on reruns)
    namespace = process_document(doc_url)

    if namespace:
        st.info("Document is ready. You can now ask questions below.")

        # Display chat messages from history on app rerun
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

        # Accept user input
        if prompt := st.chat_input("Ask a question about the policy"):
            # Add user message to chat history
            st.session_state.messages.append({"role": "user", "content": prompt})
            # Display user message in chat message container
            with st.chat_message("user"):
                st.markdown(prompt)

            # Display assistant response in chat message container
            with st.chat_message("assistant"):
                message_placeholder = st.empty()
                with st.spinner("Thinking..."):
                    # 1. Generate embedding for the question
                    #    (retrieval_query task type matches the retrieval_document
                    #    embeddings indexed at processing time)
                    question_embedding_response = genai.embed_content(
                        model="models/embedding-001",
                        content=prompt,
                        task_type="retrieval_query"
                    )
                    question_embedding = question_embedding_response['embedding']

                    # 2. Query Pinecone for relevant context, scoped to this
                    #    document's namespace only
                    index = pc.Index(INDEX_NAME)
                    search_results = index.query(
                        vector=question_embedding,
                        top_k=5,
                        include_metadata=True,
                        namespace=namespace
                    )

                    # 3. Assemble the context and generate the answer
                    context_chunks = [match.metadata['text'] for match in search_results.matches]
                    context = "\n\n".join(context_chunks)

                    answer = generate_answer_with_gemini(prompt, context)

                message_placeholder.markdown(answer)

            # Add assistant response to chat history
            st.session_state.messages.append({"role": "assistant", "content": answer})
data_processor.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import requests
import fitz  # PyMuPDF — PDF parsing
import textwrap
import os
import google.generativeai as genai
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
import hashlib
import time

# Load environment variables from .env file (no-op if the file is absent,
# e.g. when keys come from the deployment environment instead).
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# NOTE(review): PINECONE_ENVIRONMENT is read but not used anywhere in this
# module — confirm whether it is needed before removing.
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT")

# Initialize clients at import time so the functions below can share them.
genai.configure(api_key=GOOGLE_API_KEY)
pc = Pinecone(api_key=PINECONE_API_KEY)
20
+
21
+ # --- CORRECTED FUNCTION: Handles both URLs and binary file content ---
22
def get_document_text(source) -> str:
    """
    Extract the full text of a PDF, handling either a URL or raw binary content.

    Args:
        source: Either a URL string (the PDF is downloaded first) or the raw
            PDF bytes (e.g. from an upload widget).

    Returns:
        The concatenated text of every page, or "" on any failure (bad source
        type, download error, or unparseable PDF) — callers test truthiness
        rather than catching exceptions.
    """
    document_content = None

    if isinstance(source, str):  # If the source is a URL string
        print(f"Downloading document from {source}...")
        try:
            # A timeout prevents the whole app from hanging indefinitely on an
            # unresponsive host (requests.get blocks forever without one).
            response = requests.get(source, timeout=60)
            response.raise_for_status()
            document_content = response.content
        except requests.exceptions.RequestException as e:
            print(f"Error downloading the document: {e}")
            return ""
    elif isinstance(source, bytes):  # If the source is raw file content (from upload)
        print("Processing uploaded document content...")
        document_content = source
    else:
        print("Invalid source type provided to get_document_text.")
        return ""

    if not document_content:
        return ""

    print("Extracting text from the document...")
    document_text = ""
    try:
        # fitz (PyMuPDF) parses the PDF directly from the in-memory bytes.
        pdf_document = fitz.open(stream=document_content, filetype="pdf")
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            document_text += page.get_text()
    except Exception as e:
        print(f"Error extracting text: {e}")
        return ""

    return document_text
59
+
60
def create_document_id(source: str) -> str:
    """Return the SHA-256 hex digest of *source*, a stable per-document ID."""
    hasher = hashlib.sha256()
    hasher.update(source.encode())
    return hasher.hexdigest()
63
+
64
def split_text_into_chunks(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[str]:
    """
    Split a large document into chunks using a recursive separator strategy.

    The text is split on progressively finer separators (paragraphs, lines,
    sentences, words); oversized fragments recurse to the next separator, and
    adjacent small fragments are merged back together as long as the combined
    length stays within chunk_size + chunk_overlap. Blank fragments are
    dropped from the result.
    """
    def _split(segment, seps, limit, slack):
        # Base case: no separators left — fall back to hard wrapping.
        if not seps:
            return textwrap.wrap(segment, limit)

        sep, finer = seps[0], seps[1:]

        # Break on the current separator; recurse into oversized fragments.
        pieces = []
        for fragment in segment.split(sep):
            if len(fragment) > limit:
                pieces.extend(_split(fragment, finer, limit, slack))
            else:
                pieces.append(fragment)

        if not pieces:
            return []

        # Greedily merge neighbours while they fit within limit + slack,
        # rejoining with the separator they were split on.
        merged = []
        buffer = pieces[0]
        for piece in pieces[1:]:
            if len(buffer) + len(piece) <= limit + slack:
                buffer = buffer + sep + piece
            else:
                merged.append(buffer)
                buffer = piece
        merged.append(buffer)

        return [chunk for chunk in merged if chunk.strip()]

    return _split(text, ["\n\n", "\n", ". ", " "], chunk_size, chunk_overlap)
101
+
102
def generate_embeddings(text_chunks: list[str]) -> list:
    """
    Embed every chunk with the Gemini embedding-001 model in one batch call.

    Returns the list of embedding vectors, or an empty list when the API call
    (or response unpacking) fails — the error is printed, never raised.
    """
    print(f"Generating embeddings for {len(text_chunks)} chunks using Gemini Pro...")
    try:
        response = genai.embed_content(
            model="models/embedding-001",
            content=text_chunks,
        )
        vectors = response['embedding']
        print("Embeddings generated successfully.")
    except Exception as e:
        print(f"Error generating embeddings: {e}")
        return []
    return vectors
119
+
120
def index_chunks_in_pinecone(chunks: list[str], embeddings: list, index_name: str, namespace: str):
    """
    Indexes the text chunks and their embeddings in a specific Pinecone namespace.

    Creates the serverless index on first use (dimension inferred from the
    first embedding), then upserts id/vector/metadata records in batches of
    100. All errors are caught and printed rather than raised.

    Args:
        chunks: Text chunks; each is stored as the vector's "text" metadata.
        embeddings: One embedding vector per chunk, in the same order.
        index_name: Name of the Pinecone index to create/use.
        namespace: Per-document namespace isolating this document's vectors.
    """
    print(f"Indexing {len(chunks)} chunks in Pinecone index '{index_name}' under namespace '{namespace}'...")
    try:
        # Check if index exists, and create if it doesn't
        if index_name not in pc.list_indexes().names():
            print(f"Creating new Pinecone index: '{index_name}'")
            pc.create_index(
                name=index_name,
                # Dimension must match the embedding model's output size.
                dimension=len(embeddings[0]),
                metric='cosine',
                spec=ServerlessSpec(cloud='aws', region='us-east-1')
            )
            print("Index created successfully. Waiting for it to become ready...")
            # Wait for index to be ready (poll once per second)
            while not pc.describe_index(index_name).status.ready:
                time.sleep(1)

        index = pc.Index(index_name)

        # Prepare data for upsert
        vectors_to_upsert = []
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            vectors_to_upsert.append({
                "id": f"chunk-{namespace}-{i}",  # Make ID unique across namespaces
                "values": embedding,
                "metadata": {"text": chunk}
            })

        # Upsert in batches (Pinecone recommends bounded batch sizes)
        batch_size = 100
        for i in range(0, len(vectors_to_upsert), batch_size):
            batch = vectors_to_upsert[i:i + batch_size]
            index.upsert(vectors=batch, namespace=namespace)  # <-- USE THE NAMESPACE
            print(f"Upserted batch {i // batch_size + 1} into namespace '{namespace}'")

        print(f"Successfully indexed {len(chunks)} chunks in namespace '{namespace}'.")
        # Give a moment for the index to become queryable
        # NOTE(review): a fixed 5s sleep is a heuristic — Pinecone upserts are
        # eventually consistent; confirm this is long enough in practice.
        time.sleep(5)

    except Exception as e:
        print(f"Error indexing in Pinecone: {e}")
164
+
165
if __name__ == "__main__":
    # Standalone smoke test: run the full pipeline (download -> chunk ->
    # embed -> index) against a sample policy PDF without the Streamlit UI.
    # NOTE(review): the SAS token in this URL has an expiry (se=...) — the
    # test will start failing with 403s once it lapses.
    sample_url = "https://hackrx.blob.core.windows.net/assets/hackrx_6/policies/BAJHLIP23020V012223.pdf?sv=2023-01-03&st=2025-07-30T06%3A46%3A49Z&se=2025-09-01T06%3A46%3A00Z&sr=c&sp=rl&sig=9szykRKdGYj0BVm1skP%2BX8N9%2FRENEn2k7MQPUp33jyQ%3D"
    index_name = "hackrx-policy-index"

    document_content = get_document_text(sample_url)

    if document_content:
        chunks = split_text_into_chunks(document_content)
        print(f"\n--- Document Split into {len(chunks)} Chunks ---")

        embeddings = generate_embeddings(chunks)

        if embeddings:
            print(f"Generated {len(embeddings)} embeddings.")
            print(f"Size of each embedding vector: {len(embeddings[0])}")

            # Index the chunks in Pinecone
            print("--- Running standalone script test ---")
            test_namespace = create_document_id(sample_url)  # Use the new function!
            index_chunks_in_pinecone(chunks, embeddings, index_name, namespace=test_namespace)
        else:
            print("Failed to generate embeddings. Pinecone indexing skipped.")

    else:
        print("Failed to process document content.")
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ poppler-utils
requirements.txt CHANGED
@@ -1,3 +1,7 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
1
+ streamlit
2
+ requests
3
+ pymupdf
4
+ google-generativeai
5
+ python-dotenv
6
+ pinecone-client
7
+ # NOTE: "hashlib" removed — it is a Python standard-library module, not a pip package
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
- import streamlit as st
5
-
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))