import os
import re
import json
import tempfile
import requests
import fitz
import pytesseract
from PIL import Image
from docx import Document
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
from fastapi import FastAPI, Request

app = FastAPI()
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Utility function: Download file from URL to temp directory
def download_file(url: str, dest_dir: str) -> str:
    ext = url.split('.')[-1].split('?')[0]
    local_path = os.path.join(dest_dir, f"file_{abs(hash(url))}.{ext}")
    resp = requests.get(url, stream=True)
    resp.raise_for_status()
    with open(local_path, "wb") as f:
        for chunk in resp.iter_content(8192):
            f.write(chunk)
    return local_path

# Extract text from PDF, DOCX, or Images
def extract_text(file_path: str, max_pages: int = 3) -> str:
    ext = file_path.split('.')[-1].lower()
    if ext == "pdf":
        doc = fitz.open(file_path)
        # fitz.Document does not support slicing; read only the first max_pages pages explicitly
        return "\n".join(doc[i].get_text() for i in range(min(max_pages, doc.page_count)))
    elif ext == "docx":
        doc = Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)
    elif ext in {"jpg", "jpeg", "png"}:
        return pytesseract.image_to_string(Image.open(file_path))
    else:
        raise ValueError(f"Unsupported file type: {ext}")

# Extract parameters like age, gender, procedure, location, policy_duration from text
def extract_params(text: str) -> dict:
    age_m = re.search(r"(\d{2})[- ]?year[- ]?old", text, re.IGNORECASE)
    gender_m = re.search(r"\b(male|female)\b", text, re.IGNORECASE)
    proc_m = re.search(r"(\w+(?:\s\w+)*\s(?:surgery|replacement|operation|treatment))", text, re.IGNORECASE)
    loc_m = re.search(r"(?:in|at)\s([A-Z][a-z]+(?:\s[A-Z][a-z]+)?)", text)
    # restrict the gap to non-digit text so an earlier number (e.g. the age) cannot start the match
    dur_m = re.search(r"(\d+)[- ]?(?:month|year)[- ]?old[^0-9]*?insurance", text, re.IGNORECASE)
    return {
        "age": int(age_m.group(1)) if age_m else None,
        "gender": gender_m.group(1).lower() if gender_m else None,
        "procedure": proc_m.group(1).strip() if proc_m else None,
        "location": loc_m.group(1).strip() if loc_m else None,
        "policy_duration": (
            dur_m.group(1) + (" months" if "month" in dur_m.group(0) else " years")
        ) if dur_m else None
    }
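
# Illustrative example of the extraction above (hypothetical query, not from the original source):
#   extract_params("46-year-old male, knee surgery in Pune, 3-month-old insurance policy")
#   -> {"age": 46, "gender": "male", "procedure": "knee surgery",
#       "location": "Pune", "policy_duration": "3 months"}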

# Chunk large text into overlapping pieces
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100) -> list:
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks
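
# Sketch of the stride produced above (illustrative numbers: chunk_size=5, overlap=2 on a 12-word text):
#   words[0:5], words[3:8], words[6:11], words[9:12] -- each chunk repeats the last 2 words of the
#   previous one, so a clause is less likely to be split cleanly across chunk boundaries.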

# Prepare FAISS index from list of policy document file paths
def prepare_policy_index(policy_file_paths: list) -> tuple:
    all_chunks, chunk_sources = [], []
    for path in policy_file_paths:
        text = extract_text(path)
        chunks = chunk_text(text)
        all_chunks.extend(chunks)
        chunk_sources.extend([os.path.basename(path)] * len(chunks))
    embeddings = embedding_model.encode(all_chunks, show_progress_bar=True)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))
    return all_chunks, chunk_sources, index

# Semantic search over the FAISS index for a query string
def semantic_search(query: str, chunks: list, chunk_sources: list, index, top_k: int = 3) -> list:
    query_embedding = embedding_model.encode([query])
    D, I = index.search(np.array(query_embedding), top_k)
    # FAISS pads results with -1 when fewer than top_k vectors are indexed; skip those slots
    return [(chunks[i], chunk_sources[i]) for i in I[0] if i != -1]

# Call Gemini LLM for final decision
def get_llm_decision_gemini(structured_json: dict, retrieved_clauses: list, gemini_api_key: str) -> str:
    genai.configure(api_key=gemini_api_key)
    llm = genai.GenerativeModel("gemini-1.5-flash")
    # Join the retrieved clause texts; handles the case where fewer than three clauses were returned
    clauses_text = "\n".join(clause for clause, _ in retrieved_clauses[:3])
    prompt = f"""
You are an insurance claim decision model.

Claim Info:
{json.dumps(structured_json, indent=2)}

Relevant Policy Clauses:
{clauses_text}

Your task is to:
1. Decide if the claim should be approved or rejected
2. Mention amount if applicable (else null)
3. Give clear justification pointing to the relevant clauses

Respond only in JSON:
{{"Decision": "...", "Amount": "...", "Justification": "..."}}
"""
    response = llm.generate_content(prompt)
    return response.text

# The FastAPI /hackrx/run endpoint
@app.post("/hackrx/run")
async def hackrx_run(request: Request):
    data = await request.json()
    document_urls = data.get("documents")
    questions = data.get("questions", [])

    if not document_urls:
        return {"error": "No documents provided."}

    if isinstance(document_urls, str):
        document_urls = [document_urls]

    gemini_api_key = os.environ.get("GOOGLE_API_KEY")
    if not gemini_api_key:
        return {"error": "API key not configured in environment variables."}

    with tempfile.TemporaryDirectory() as tmpdir:
        # Download all policy docs
        policy_paths = [download_file(url, tmpdir) for url in document_urls]
        # Extract text and build FAISS index once per request
        chunks, chunk_sources, index = prepare_policy_index(policy_paths)

        answers = []
        for question in questions:
            # Extract structured info from question (optional; can also use raw question text)
            structured_query = extract_params(question)
            # Compose query text for semantic search
            # fall back to the raw question when no structured fields could be extracted
            query_text = " ".join(str(v) for v in structured_query.values() if v) or question
            # Retrieve top relevant clauses
            retrieved_clauses = semantic_search(query_text, chunks, chunk_sources, index)
            # Get final decision from Gemini
            answer = get_llm_decision_gemini(structured_query, retrieved_clauses, gemini_api_key)
            answers.append(answer)

    return {"answers": answers}
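
# Example client call (illustrative sketch, not part of the original service). It assumes the app
# is served locally, e.g. `uvicorn app:app --port 8000` (module name is a placeholder), and uses a
# placeholder document URL and question:
#
#   import requests
#   payload = {
#       "documents": "https://example.com/policy.pdf",
#       "questions": ["46-year-old male, knee surgery in Pune, 3-month-old insurance policy"],
#   }
#   resp = requests.post("http://localhost:8000/hackrx/run", json=payload)
#   print(resp.json()["answers"])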