Initial commit

Files changed:
- README copy.md (+11, -0)
- main.py (+151, -0)
README copy.md
ADDED
@@ -0,0 +1,11 @@
---
title: Hackrx6
emoji: 📊
colorFrom: blue
colorTo: yellow
sdk: docker
pinned: false
license: apache-2.0
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
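
Since the Space uses the Docker SDK, Hugging Face serves the app from the container's exposed port (7860 by default; the config reference above documents an optional `app_port` field to override it).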
main.py
ADDED
@@ -0,0 +1,151 @@
import os
import re
import json
import tempfile
import requests
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
from docx import Document
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
from fastapi import FastAPI, Request

app = FastAPI()
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Utility function: download a file from a URL into a temp directory
def download_file(url: str, dest_dir: str) -> str:
    # Infer the extension from the URL, dropping any query string
    ext = url.split('.')[-1].split('?')[0]
    local_path = os.path.join(dest_dir, f"file_{abs(hash(url))}.{ext}")
    resp = requests.get(url, stream=True)
    resp.raise_for_status()
    with open(local_path, "wb") as f:
        for chunk in resp.iter_content(8192):
            f.write(chunk)
    return local_path
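
# Example (illustrative; placeholder URL):
#   with tempfile.TemporaryDirectory() as d:
#       path = download_file("https://example.com/policy.pdf?sig=abc", d)
#       # the query string is dropped, so `path` ends in ".pdf"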

# Extract text from PDF, DOCX, or image files
def extract_text(file_path: str, max_pages: int = 3) -> str:
    ext = file_path.split('.')[-1].lower()
    if ext == "pdf":
        # Only the first max_pages pages are read, which caps indexing cost
        # (later pages are never indexed, so never retrievable)
        doc = fitz.open(file_path)
        try:
            return "\n".join(
                doc[i].get_text() for i in range(min(max_pages, doc.page_count))
            )
        finally:
            doc.close()
    elif ext == "docx":
        doc = Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)
    elif ext in {"jpg", "jpeg", "png"}:
        return pytesseract.image_to_string(Image.open(file_path))
    else:
        raise ValueError(f"Unsupported file type: {ext}")
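
# Note: pytesseract only wraps the Tesseract CLI, so the tesseract binary must be
# installed in the runtime image; Document(...).paragraphs also skips table text.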

# Extract parameters (age, gender, procedure, location, policy_duration) from text
def extract_params(text: str) -> dict:
    age_m = re.search(r"(\d{2})[- ]?year[- ]?old", text, re.IGNORECASE)
    gender_m = re.search(r"\b(male|female)\b", text, re.IGNORECASE)
    proc_m = re.search(r"(\w+(?:\s\w+)*\s(?:surgery|replacement|operation|treatment))", text, re.IGNORECASE)
    loc_m = re.search(r"(?:in|at)\s([A-Z][a-z]+(?:\s[A-Z][a-z]+)?)", text)
    # Require "insurance"/"policy" right after the duration so the pattern cannot
    # latch onto the "NN-year-old" age phrase earlier in the sentence
    dur_m = re.search(r"(\d+)[- ]?(month|year)s?[- ]?old\s+(?:insurance|policy)", text, re.IGNORECASE)
    return {
        "age": int(age_m.group(1)) if age_m else None,
        "gender": gender_m.group(1).lower() if gender_m else None,
        "procedure": proc_m.group(1).strip() if proc_m else None,
        "location": loc_m.group(1).strip() if loc_m else None,
        "policy_duration": f"{dur_m.group(1)} {dur_m.group(2).lower()}s" if dur_m else None,
    }
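
# Illustrative behaviour (comment only, not executed):
#   extract_params("46-year-old male, knee replacement surgery in Pune, "
#                  "3-month-old insurance policy")
#   -> {"age": 46, "gender": "male", "procedure": "knee replacement surgery",
#       "location": "Pune", "policy_duration": "3 months"}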

# Chunk large text into overlapping pieces
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100) -> list:
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunks.append(" ".join(words[i:i + chunk_size]))
    return chunks
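
# Example (illustrative): with chunk_size=500 and overlap=100, chunks start at word
# offsets 0, 400, 800, ..., so a 900-word document yields chunks of 500, 500, and
# 100 words, each sharing its first 100 words with the previous chunk.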

# Prepare a FAISS index from a list of policy document file paths
def prepare_policy_index(policy_file_paths: list) -> tuple:
    all_chunks, chunk_sources = [], []
    for path in policy_file_paths:
        text = extract_text(path)
        chunks = chunk_text(text)
        all_chunks.extend(chunks)
        chunk_sources.extend([os.path.basename(path)] * len(chunks))
    embeddings = embedding_model.encode(all_chunks, show_progress_bar=True)
    dimension = embeddings.shape[1]
    # Exact (brute-force) nearest-neighbour search over L2 distance
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))
    return all_chunks, chunk_sources, index
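
# Note: all-MiniLM-L6-v2 produces 384-dimensional embeddings. encode() does not
# normalise them by default, so ranking here is by L2 distance rather than cosine
# similarity; normalising and using faiss.IndexFlatIP would give cosine ranking.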

# Semantic search over the FAISS index for a query string
def semantic_search(query: str, chunks: list, chunk_sources: list, index, top_k: int = 3) -> list:
    query_embedding = embedding_model.encode([query])
    D, I = index.search(np.array(query_embedding), top_k)
    # FAISS pads the result with -1 when fewer than top_k vectors exist; skip those
    return [(chunks[i], chunk_sources[i]) for i in I[0] if i != -1]
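
# Example (illustrative): semantic_search("knee replacement surgery Pune", chunks,
# chunk_sources, index) returns up to three (chunk_text, source_filename) pairs,
# nearest first.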

# Call the Gemini LLM for the final decision
def get_llm_decision_gemini(structured_json: dict, retrieved_clauses: list, gemini_api_key: str) -> str:
    genai.configure(api_key=gemini_api_key)
    llm = genai.GenerativeModel("gemini-1.5-flash")
    # Join however many clauses were retrieved (may be fewer than three)
    clause_block = "\n".join(clause for clause, _source in retrieved_clauses)
    prompt = f"""
You are an insurance claim decision model.

Claim Info:
{json.dumps(structured_json, indent=2)}

Relevant Policy Clauses:
{clause_block}

Your task is to:
1. Decide if the claim should be approved or rejected
2. Mention the amount if applicable (else null)
3. Give a clear justification pointing to the relevant clauses

Respond only in JSON:
{{"Decision": "...", "Amount": "...", "Justification": "..."}}
"""
    response = llm.generate_content(prompt)
    return response.text
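
# Note: Gemini often wraps JSON output in markdown fences even when told to
# respond only in JSON; callers that parse the answer may first need something
# like (illustrative):
#   raw = answer.strip().removeprefix("```json").removesuffix("```").strip()
#   decision = json.loads(raw)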

# The FastAPI /hackrx/run endpoint
@app.post("/hackrx/run")
async def hackrx_run(request: Request):
    data = await request.json()
    document_urls = data.get("documents")
    questions = data.get("questions", [])

    if not document_urls:
        return {"error": "No documents provided."}

    if isinstance(document_urls, str):
        document_urls = [document_urls]

    gemini_api_key = os.environ.get("GOOGLE_API_KEY")
    if not gemini_api_key:
        return {"error": "API key not configured in environment variables."}

    with tempfile.TemporaryDirectory() as tmpdir:
        # Download all policy docs
        policy_paths = [download_file(url, tmpdir) for url in document_urls]
        # Extract text and build the FAISS index once per request
        chunks, chunk_sources, index = prepare_policy_index(policy_paths)

        answers = []
        for question in questions:
            # Extract structured info from the question
            structured_query = extract_params(question)
            # Compose the query text for semantic search, falling back to the
            # raw question when no parameters were extracted
            query_text = " ".join(str(v) for v in structured_query.values() if v) or question
            # Retrieve the top relevant clauses
            retrieved_clauses = semantic_search(query_text, chunks, chunk_sources, index)
            # Get the final decision from Gemini
            answer = get_llm_decision_gemini(structured_query, retrieved_clauses, gemini_api_key)
            answers.append(answer)

    return {"answers": answers}
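
For reference, a minimal client sketch for this endpoint; the host, port, and document URL are placeholders, not values from this commit:

# client_example.py — hypothetical caller for POST /hackrx/run
import requests

payload = {
    "documents": "https://example.com/policy.pdf",  # a single URL or a list of URLs
    "questions": [
        "46-year-old male, knee replacement surgery in Pune, 3-month-old insurance policy"
    ],
}
resp = requests.post("http://localhost:7860/hackrx/run", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json())  # {"answers": ["<JSON decision string from Gemini>", ...]}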