Vansh180 committed on
Commit
4598839
·
1 Parent(s): 89f49d8

Initial commit

Browse files
Files changed (2) hide show
  1. README copy.md +11 -0
  2. main.py +151 -0
README copy.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Hackrx6
3
+ emoji: 📊
4
+ colorFrom: blue
5
+ colorTo: yellow
6
+ sdk: docker
7
+ pinned: false
8
+ license: apache-2.0
9
+ ---
10
+
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
main.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import tempfile
5
+ import requests
6
+ import fitz
7
+ import pytesseract
8
+ from PIL import Image
9
+ from docx import Document
10
+ import numpy as np
11
+ import faiss
12
+ from sentence_transformers import SentenceTransformer
13
+ import google.generativeai as genai
14
+ from fastapi import FastAPI, Request
15
+
16
# Module-level FastAPI application; the /hackrx/run route below is registered on it.
+ app = FastAPI()
17
# Sentence-embedding model, constructed once at import time and shared by
# prepare_policy_index() and semantic_search() through its .encode() method.
+ embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
18
+
19
+ # Utility function: Download file from URL to temp directory
20
def download_file(url: str, dest_dir: str) -> str:
    """Download ``url`` into ``dest_dir`` and return the path of the saved file.

    The local name is ``file_<hash of url>[.<ext>]``. The extension is taken
    from the *path* component of the URL only, so dots in the host name or in
    the query string (for example signed-URL parameters) cannot leak into the
    file name, and a URL without an extension no longer produces a name that
    contains ``/``.

    Args:
        url: HTTP(S) location of the file.
        dest_dir: Existing directory the file is written into.

    Returns:
        Absolute or relative path (as joined with ``dest_dir``) of the file.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.RequestException: Connection problems or timeouts.
    """
    from urllib.parse import urlparse

    # Only the URL path carries the real file name; ignore query and fragment.
    url_path = urlparse(url).path
    ext = os.path.splitext(url_path)[1].lstrip(".")

    file_name = f"file_{abs(hash(url))}"
    if ext:
        file_name = f"{file_name}.{ext}"
    local_path = os.path.join(dest_dir, file_name)

    # (connect, read) timeouts so a stalled server cannot hang the caller
    # forever; the context manager releases the streamed connection.
    with requests.get(url, stream=True, timeout=(10, 60)) as resp:
        resp.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(8192):
                f.write(chunk)
    return local_path
29
+
30
+ # Extract text from PDF, DOCX, or Images
31
def extract_text(file_path: str, max_pages: int = 3) -> str:
    """Return the text content of a PDF, DOCX or image file.

    The file type is chosen from the (case-insensitive) file extension:

    * ``pdf``  - text of the first ``max_pages`` pages, joined with newlines.
    * ``docx`` - text of every paragraph, joined with newlines.
    * ``jpg`` / ``jpeg`` / ``png`` - OCR output from pytesseract.

    Args:
        file_path: Path of the file to read.
        max_pages: Number of leading PDF pages to read. Ignored for the other
            file types.

    Returns:
        The extracted text.

    Raises:
        ValueError: The extension is not one of the supported types.
    """
    ext = os.path.splitext(file_path)[1].lstrip(".").lower()

    if ext == "pdf":
        # Close the document deterministically so the underlying file handle
        # is released before any temporary directory is cleaned up.
        with fitz.open(file_path) as doc:
            return "\n".join(page.get_text() for page in doc[:max_pages])

    if ext == "docx":
        doc = Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)

    if ext in {"jpg", "jpeg", "png"}:
        # Image.open is lazy and keeps the file open; the context manager
        # closes it once OCR has finished.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)

    raise ValueError(f"Unsupported file type: {ext}")
43
+
44
+ # Extract parameters like age, gender, procedure, location, policy_duration from text
45
def extract_params(text: str) -> dict:
    """Pull claim parameters out of a free-text query with regular expressions.

    Example: ``"46-year-old male, knee surgery in Pune, 3-month-old insurance
    policy"`` yields age 46, gender ``"male"``, procedure ``"knee surgery"``,
    location ``"Pune"`` and policy duration ``"3 months"``.

    Args:
        text: The query to analyse.

    Returns:
        A dict with the keys ``age`` (int), ``gender`` (lower-case str),
        ``procedure`` (str), ``location`` (str) and ``policy_duration``
        (``"<n> months"`` / ``"<n> years"``). Any value that is not found is
        ``None``.
    """
    # What follows an "N-month/year(-old)" phrase when it describes the policy
    # rather than the patient.
    policy_tail = r"\s+(?:(?:health|medical)\s+)?(?:insurance|policy)"

    # Age: 1-3 digits, optional plural, and explicitly NOT a policy-age phrase
    # such as "2-year-old insurance policy".
    age_m = re.search(
        r"\b(\d{1,3})[- ]?years?[- ]?old\b(?!" + policy_tail + r")",
        text,
        re.IGNORECASE,
    )
    gender_m = re.search(r"\b(male|female)\b", text, re.IGNORECASE)
    # NOTE(review): the leading word run is greedy, so extra words directly in
    # front of the procedure keyword are captured as well.
    proc_m = re.search(
        r"(\w+(?:\s\w+)*\s(?:surgery|replacement|operation|treatment))",
        text,
        re.IGNORECASE,
    )
    # Word boundary keeps "in"/"at" from matching as the tail of another word.
    loc_m = re.search(r"\b(?:in|at)\s([A-Z][a-z]+(?:\s[A-Z][a-z]+)?)", text)
    # Duration: the number must belong to the phrase that is directly followed
    # by "insurance"/"policy"; previously the match could start at the age and
    # run lazily up to "insurance", reporting the age as the duration.
    dur_m = re.search(
        r"\b(\d+)[- ]?(month|year)s?(?:[- ]?old)?" + policy_tail,
        text,
        re.IGNORECASE,
    )

    policy_duration = None
    if dur_m:
        # Use the captured unit (case-normalised) instead of scanning the
        # whole match for the substring "month".
        policy_duration = f"{dur_m.group(1)} {dur_m.group(2).lower()}s"

    return {
        "age": int(age_m.group(1)) if age_m else None,
        "gender": gender_m.group(1).lower() if gender_m else None,
        "procedure": proc_m.group(1).strip() if proc_m else None,
        "location": loc_m.group(1).strip() if loc_m else None,
        "policy_duration": policy_duration,
    }
60
+
61
+ # Chunk large text into overlapping pieces
62
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100) -> list:
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart and contain
    at most ``chunk_size`` words. Chunking stops as soon as a chunk reaches the
    end of the text, so no trailing chunk made up solely of words already
    contained in the previous chunk is produced.

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings (empty for empty / whitespace-only text).

    Raises:
        ValueError: ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would either crash range() or silently yield
        # no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            # This window already covers the tail of the text.
            break
    return chunks
69
+
70
+ # Prepare FAISS index from list of policy document file paths
71
def prepare_policy_index(policy_file_paths: list) -> tuple:
    """Build a FAISS L2 index over the text chunks of the given documents.

    Each file is read with ``extract_text``, split with ``chunk_text`` and the
    resulting chunks are embedded with the module-level ``embedding_model``.

    Args:
        policy_file_paths: Paths of the policy documents to index.

    Returns:
        ``(all_chunks, chunk_sources, index)`` where ``all_chunks[i]`` is the
        text of chunk ``i``, ``chunk_sources[i]`` is the base name of the file
        it came from, and ``index`` is a ``faiss.IndexFlatL2`` whose row ``i``
        is the embedding of chunk ``i``.

    Raises:
        ValueError: No text chunk could be produced from any document (or the
            path list is empty); also propagated from ``extract_text`` for
            unsupported file types.
    """
    all_chunks, chunk_sources = [], []
    for path in policy_file_paths:
        chunks = chunk_text(extract_text(path))
        all_chunks.extend(chunks)
        chunk_sources.extend([os.path.basename(path)] * len(chunks))

    if not all_chunks:
        # Encoding an empty list gives no embedding dimension to size the
        # index with, so fail with a clear message instead.
        raise ValueError("No text could be extracted from the provided documents.")

    # FAISS expects a C-contiguous float32 matrix. The progress bar is turned
    # off because this runs inside a request handler, not an interactive job.
    embeddings = np.ascontiguousarray(
        embedding_model.encode(all_chunks, show_progress_bar=False),
        dtype=np.float32,
    )
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return all_chunks, chunk_sources, index
83
+
84
+ # Semantic search over the FAISS index for a query string
85
def semantic_search(query: str, chunks: list, chunk_sources: list, index, top_k: int = 3) -> list:
    """Return the chunks whose embeddings are closest to ``query``.

    Args:
        query: Text to search for.
        chunks: Chunk texts, in the same order as the vectors in ``index``.
        chunk_sources: Source label for each entry of ``chunks``.
        index: FAISS index to search (as built by ``prepare_policy_index``).
        top_k: Maximum number of results.

    Returns:
        List of ``(chunk_text, source)`` tuples, best match first. The list is
        shorter than ``top_k`` when fewer chunks exist and empty when there is
        nothing to search or ``top_k`` is not positive.
    """
    if not chunks or top_k <= 0:
        return []

    # Never request more neighbours than there are chunks.
    k = min(top_k, len(chunks))
    query_embedding = np.ascontiguousarray(
        embedding_model.encode([query]), dtype=np.float32
    )
    _distances, neighbours = index.search(query_embedding, k)

    # FAISS pads missing results with -1; used as a Python index that would
    # silently return the *last* chunk, so drop anything out of range.
    return [
        (chunks[i], chunk_sources[i])
        for i in neighbours[0]
        if 0 <= i < len(chunks)
    ]
89
+
90
+ # Call Gemini LLM for final decision
91
def get_llm_decision_gemini(structured_json: dict, retrieved_clauses: list, gemini_api_key: str) -> str:
    """Ask Gemini for an approve/reject decision on a claim.

    Args:
        structured_json: Claim information; serialised into the prompt as JSON.
        retrieved_clauses: Sequence of items whose first element is the clause
            text (e.g. the ``(chunk, source)`` tuples from ``semantic_search``).
            Every clause is included; an empty sequence is allowed.
        gemini_api_key: API key passed to ``genai.configure``.

    Returns:
        The raw text of the model response. The prompt asks for a JSON object
        with ``Decision``, ``Amount`` and ``Justification``; the text is
        returned as-is and is not parsed or validated here.
    """
    genai.configure(api_key=gemini_api_key)
    llm = genai.GenerativeModel("gemini-1.5-flash")

    # Include however many clauses were retrieved instead of indexing three
    # fixed positions, which raised IndexError when the list was empty.
    clauses_block = "\n".join(str(item[0]) for item in retrieved_clauses)
    if not clauses_block.strip():
        clauses_block = "(no relevant policy clauses were retrieved)"

    prompt = f"""
You are an insurance claim decision model.

Claim Info:
{json.dumps(structured_json, indent=2)}

Relevant Policy Clauses:
{clauses_block}

Your task is to:
1. Decide if the claim should be approved or rejected
2. Mention amount if applicable (else null)
3. Give clear justification pointing to the relevant clauses

Respond only in JSON:
{{"Decision": "...", "Amount": "...", "Justification": "..."}}
"""
    response = llm.generate_content(prompt)
    return response.text
115
+
116
+ # The FastAPI /hackrx/run endpoint
117
@app.post("/hackrx/run")
async def hackrx_run(request: Request):
    """Answer questions about policy documents referenced by URL.

    Expected JSON body::

        {"documents": "<url>" | ["<url>", ...], "questions": ["...", ...]}

    The documents are downloaded into a temporary directory, indexed once per
    request, and each question is answered by retrieving the closest clauses
    and passing them to Gemini.

    Returns:
        ``{"answers": [...]}`` with one raw model response per question, or
        ``{"error": "..."}`` when the body is invalid, no documents are given,
        the ``GOOGLE_API_KEY`` environment variable is missing, or the
        documents cannot be downloaded / read.
    """
    from fastapi.concurrency import run_in_threadpool

    try:
        data = await request.json()
    except ValueError:
        return {"error": "Request body must be valid JSON."}
    if not isinstance(data, dict):
        return {"error": "Request body must be a JSON object."}

    document_urls = data.get("documents")
    questions = data.get("questions") or []

    if not document_urls:
        return {"error": "No documents provided."}

    if isinstance(document_urls, str):
        document_urls = [document_urls]
    if isinstance(questions, str):
        # A bare string would otherwise be iterated character by character.
        questions = [questions]

    gemini_api_key = os.environ.get("GOOGLE_API_KEY")
    if not gemini_api_key:
        return {"error": "API key not configured in environment variables."}

    def _build_index() -> tuple:
        """Download every document and build the search index."""
        # NOTE(review): the URLs are caller-supplied and fetched server-side;
        # consider an allow-list if this service can reach internal hosts.
        with tempfile.TemporaryDirectory() as tmpdir:
            policy_paths = [download_file(url, tmpdir) for url in document_urls]
            return prepare_policy_index(policy_paths)

    def _answer(question: str) -> str:
        """Retrieve clauses for one question and ask the model."""
        structured_query = extract_params(question)
        # Fall back to the raw question when no parameter was recognised;
        # otherwise the search would run on an empty string.
        query_text = " ".join(str(v) for v in structured_query.values() if v) or question
        retrieved_clauses = semantic_search(query_text, chunks, chunk_sources, index)
        # Pass the original question along so the model sees it even when
        # every extracted field is None.
        claim_info = {"question": question, **structured_query}
        return get_llm_decision_gemini(claim_info, retrieved_clauses, gemini_api_key)

    # Downloading, embedding and the LLM call are all blocking; run them in
    # the thread pool so the event loop stays free for other requests.
    try:
        chunks, chunk_sources, index = await run_in_threadpool(_build_index)
    except (requests.RequestException, ValueError) as exc:
        return {"error": f"Could not process documents: {exc}"}

    answers = []
    for question in questions:
        answers.append(await run_in_threadpool(_answer, question))

    return {"answers": answers}