# Importing libraries for web scraping
import requests # For making HTTP requests
from bs4 import BeautifulSoup # For parsing HTML content
# Importing library for data handling
import pandas as pd
# OS and file handling libraries
import os
import shutil
# LlamaIndex imports for document indexing and retrieval
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import HierarchicalNodeParser
# Importing ChromaDB for persistent vector storage
import chromadb
# LlamaIndex wrapper for using Chroma as a vector store
from llama_index.vector_stores.chroma import ChromaVectorStore
# HuggingFace embedding model for generating vector representations
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Ingestion pipeline to preprocess and ingest documents into a vector store
from llama_index.core.ingestion import IngestionPipeline
# Tools for creating complex metadata-based filters for search and retrieval
from llama_index.core.vector_stores import MetadataFilters, ExactMatchFilter, MetadataFilter, FilterOperator, FilterCondition
# For retrieving relevant documents using a vector index
from llama_index.core.retrievers import VectorIndexRetriever
# OpenRouter LLM wrapper to use models via OpenRouter platform
from llama_index.llms.openrouter import OpenRouter
# Synthesizer to generate responses from retrieved documents
from llama_index.core.response_synthesizers import get_response_synthesizer
# Query engine that combines retriever and synthesizer for answering queries
from llama_index.core.query_engine import RetrieverQueryEngine
# Import core classes from CrewAI
from crewai import Crew, Agent, Task
def fetch_and_download_policy_documents(insurer, UIN, results, save_path, timeout=30):
    """
    Fetches health insurance policy documents from the IRDAI website using the insurer name and UIN.
    Downloads the associated PDF files and saves metadata as a CSV.
    Args:
        insurer (str): Name of the insurance provider.
        UIN (str): Unique Identification Number for the insurance product.
        results (int): Number of search results to fetch.
        save_path (str): Local directory path where documents will be downloaded.
        timeout (int): Seconds to wait for each HTTP request before giving up.
    Returns:
        pd.DataFrame: DataFrame containing metadata of the downloaded documents.
    Raises:
        ValueError: If no results table is found in the fetched page.
        requests.exceptions.HTTPError: If the search request itself fails.
    """
    # Construct the URL for IRDAI document search with filters applied
    url = (
        f'https://irdai.gov.in/health-insurance-products'
        f'?p_p_id=com_irdai_document_media_IRDAIDocumentMediaPortlet'
        f'&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view'
        f'&_com_irdai_document_media_IRDAIDocumentMediaPortlet_filterInsurer={insurer}'
        f'&_com_irdai_document_media_IRDAIDocumentMediaPortlet_filterUIN={UIN}'
        f'&_com_irdai_document_media_IRDAIDocumentMediaPortlet_filterApprovalDateFrom=01%2F01%2F2020'
        f'&_com_irdai_document_media_IRDAIDocumentMediaPortlet_resetCur=false'
        f'&_com_irdai_document_media_IRDAIDocumentMediaPortlet_delta={results}'
    )
    # Set headers to mimic a browser request
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    # Make a GET request and parse the HTML content.
    # timeout prevents an unresponsive server from hanging the caller forever;
    # raise_for_status surfaces HTTP errors immediately instead of producing a
    # misleading "No table found" further down.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    # Find the table containing policy data
    table = soup.find("table")
    if not table:
        raise ValueError("No table found – the content structure may have changed.")
    # Extract all rows in the table
    rows = table.find_all("tr")
    data = []
    # Extract column headers and append additional metadata columns
    header_row = rows[0]
    header_cols = [th.text.strip() for th in header_row.find_all("th")]
    header_cols.append("Document URL")
    header_cols.append("Document Name")
    # Parse each row to extract text data and document link info.
    # rows[1:-1] skips the header row and the trailing pagination/footer row.
    for row in rows[1:-1]:
        cols = row.find_all("td")
        text_data = [ele.text.strip() for ele in cols]
        # Extract the document link and name from the relevant column.
        # Guard against short/malformed rows that lack the document column,
        # which previously raised an IndexError.
        link_tag = cols[7].find("a") if len(cols) > 7 else None
        href = link_tag['href'] if link_tag and 'href' in link_tag.attrs else None
        doc_name = link_tag.text.strip() if link_tag else None
        text_data.append(href)
        text_data.append(doc_name)
        data.append(text_data)
    # Create a DataFrame from the extracted data
    df = pd.DataFrame(data, columns=header_cols)
    # Remove the directory if it already exists to avoid old file conflicts
    try:
        shutil.rmtree(save_path)
    except FileNotFoundError:
        pass  # Ignore if directory does not exist
    # Create directory for saving documents
    os.makedirs(save_path, exist_ok=True)
    # Download each document using the extracted URLs
    for index, row in df.iterrows():
        document_url = row['Document URL']
        if document_url:
            try:
                # Stream download to avoid loading entire file in memory
                response = requests.get(document_url, stream=True, headers=headers, timeout=timeout)
                response.raise_for_status()
                # Construct filename using UIN and save to file
                filename = row['UIN'] + '.pdf'
                filepath = os.path.join(save_path, filename)
                with open(filepath, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
            except requests.exceptions.RequestException as e:
                print(f"Error downloading {document_url}: {e}")
        else:
            # Skip rows without a valid document link
            print(f"Skipping row {index}: No document URL found.")
    # Save the DataFrame with document metadata to a CSV file
    csv_file_path = './policy_documents_metadata.csv'
    df.to_csv(csv_file_path, index=False)
    print('./policy_documents_metadata.csv has been saved')
    return df
async def create_vDB(doc_path, vDB_path, vDB_colection, embedding_model):
    """
    Asynchronously creates a vector database (vDB) using ChromaDB and stores embedded document data.
    Args:
        doc_path (str): Path to the folder containing input documents.
        vDB_path (str): Directory path for storing the persistent ChromaDB vector database.
        vDB_colection (str): Name of the vector collection inside ChromaDB.
            (Name kept as-is, including the typo, for backward compatibility with existing callers.)
        embedding_model (str): Name of the HuggingFace model used for embedding text.
    Returns:
        ChromaVectorStore: An instance of the vector store containing embedded document nodes.
    """
    # Load all documents from the specified directory
    documents = SimpleDirectoryReader(doc_path).load_data()
    # Add 'UIN' metadata to each document from the filename stem.
    # os.path.splitext is robust to any extension length, unlike the previous
    # [:-4] slice which assumed exactly a 4-character suffix such as ".pdf".
    for doc in documents:
        doc.metadata['UIN'] = os.path.splitext(doc.metadata['file_name'])[0]
    # Parse documents into hierarchical nodes for structured semantic representation
    node_parser = HierarchicalNodeParser.from_defaults()
    nodes = node_parser.get_nodes_from_documents(documents)
    # Create a persistent Chroma client using the specified vector DB path
    db = chromadb.PersistentClient(path=vDB_path)
    # Remove the existing collection if it exists (for a fresh start)
    try:
        db.delete_collection(name=vDB_colection)
    except Exception:
        pass  # Ignore errors if the collection does not exist
    # Create or retrieve a vector collection in ChromaDB
    chroma_collection = db.get_or_create_collection(name=vDB_colection)
    # Initialize the Chroma-based vector store
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    # Set up an ingestion pipeline that includes HuggingFace embedding transformation
    pipeline = IngestionPipeline(
        transformations=[
            HuggingFaceEmbedding(model_name=embedding_model),
        ],
        vector_store=vector_store,
    )
    # Set batch size to control memory usage during ingestion
    BATCH_SIZE = 1000

    # Asynchronously ingest nodes into the vector store in batches
    async def ingest_in_batches(nodes):
        for i in range(0, len(nodes), BATCH_SIZE):
            batch = nodes[i:i + BATCH_SIZE]
            print(f"Ingesting batch {i // BATCH_SIZE + 1} ({len(batch)} nodes)...")
            await pipeline.arun(nodes=batch)

    # Run the batch ingestion process
    await ingest_in_batches(nodes)
    # Return the vector store instance for further querying or retrieval
    return vector_store
def create_query_engine(UIN, embedding_model, vector_store, similarity_top_k, llm_model, api_key):
    """
    Builds a RetrieverQueryEngine that performs UIN-filtered semantic search and
    synthesizes answers with an LLM served via OpenRouter.
    Args:
        UIN (str): Unique Identification Number used to filter relevant documents.
        embedding_model (str): Name of the HuggingFace model used for embedding text.
        vector_store (ChromaVectorStore): Pre-built vector store containing embedded documents.
        similarity_top_k (int): Number of most semantically similar nodes to retrieve.
        llm_model (str): Name of the language model served via OpenRouter for generating responses.
        api_key (str): API key for accessing the OpenRouter platform.
    Returns:
        RetrieverQueryEngine: A query engine capable of semantic search and LLM-powered response generation.
    """
    # Rehydrate a vector index on top of the existing store, embedding queries
    # with the same HuggingFace model that produced the stored vectors
    embedder = HuggingFaceEmbedding(model_name=embedding_model)
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embedder,
    )
    # Restrict retrieval to nodes whose 'UIN' metadata exactly matches the request
    uin_filter = MetadataFilters(filters=[ExactMatchFilter(key="UIN", value=UIN)])
    # Combine vector similarity with the metadata filter; similarity_top_k caps
    # how many candidate nodes are handed to the synthesizer
    retriever = VectorIndexRetriever(
        index=index,
        filters=uin_filter,
        similarity_top_k=similarity_top_k,
    )
    # LLM-backed synthesizer turns retrieved nodes into a natural-language answer
    llm = OpenRouter(api_key=api_key, model=llm_model)
    synthesizer = get_response_synthesizer(llm=llm)
    # Wire retriever and synthesizer into a single query engine
    return RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=synthesizer,
    )
def archive_vdb(vdb_path, archive_path):
    """
    Archives the vDB (vector database) directory into a ZIP file.
    Args:
        vdb_path (str): Path to the directory containing the vector database to archive.
        archive_path (str): Full path (including .zip extension) where the archive will be saved.
    Returns:
        None
    """
    # shutil.make_archive wants the target path WITHOUT the extension.
    # os.path.splitext strips whatever extension is present; the previous
    # archive_path[:-4] slice silently produced a wrong name unless the path
    # ended in exactly a 4-character suffix like ".zip".
    base_name, _ = os.path.splitext(archive_path)
    try:
        # Create a ZIP archive of the vDB directory
        shutil.make_archive(base_name, 'zip', vdb_path)
        print(f"vDB successfully archived to {archive_path}")
    except FileNotFoundError:
        # Handle case where vDB path does not exist
        print(f"Error: vDB directory not found at {vdb_path}")
    except Exception as e:
        # Catch-all for any unexpected errors during archiving
        print(f"An error occurred during archiving: {e}")
def load_vdb_from_archive(archive_path, vdb_path, collection):
    """
    Extracts and loads a Chroma-based vector database (vDB) from a ZIP archive.
    Args:
        archive_path (str): Full path to the ZIP archive containing the vDB.
        vdb_path (str): Destination directory where the archive contents will be extracted.
        collection (str): Name of the Chroma collection within the vDB.
    Returns:
        ChromaVectorStore or None: A vector store object ready for use, or None if loading fails.
    """
    try:
        # Unpack the archive into the destination directory
        shutil.unpack_archive(archive_path, vdb_path)
        print(f"vDB archive extracted to {vdb_path}")
        # Open a persistent ChromaDB client rooted at the extracted directory
        client = chromadb.PersistentClient(path=vdb_path)
        # Fetch (or lazily create) the requested collection by name
        chroma_coll = client.get_or_create_collection(name=collection)
        # Expose the collection to LlamaIndex via the ChromaVectorStore adapter
        store = ChromaVectorStore(chroma_collection=chroma_coll)
        print("ChromaDB loaded successfully from archive.")
        return store
    except FileNotFoundError:
        # The ZIP archive itself was not found
        print(f"Error: vDB archive not found at {archive_path}")
        return None
    except Exception as e:
        # Any other failure while extracting or opening the database
        print(f"An error occurred during loading: {e}")
        return None
|