Spaces:
Sleeping
Sleeping
Add Files
Browse files- .dockerignore +26 -0
- .gitattributes copy +35 -0
- .gitignore +61 -0
- Dockerfile +39 -0
- app.py +244 -0
- embedder.py +45 -0
- llm.py +122 -0
- pdf_parser.py +86 -0
- requirements.txt +12 -0
- retriever.py +11 -0
.dockerignore
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.gitignore
|
| 3 |
+
README.md
|
| 4 |
+
DEPLOYMENT.md
|
| 5 |
+
render.yaml
|
| 6 |
+
start.sh
|
| 7 |
+
__pycache__
|
| 8 |
+
*.pyc
|
| 9 |
+
*.pyo
|
| 10 |
+
*.pyd
|
| 11 |
+
.Python
|
| 12 |
+
env
|
| 13 |
+
pip-log.txt
|
| 14 |
+
pip-delete-this-directory.txt
|
| 15 |
+
.tox
|
| 16 |
+
.coverage
|
| 17 |
+
.coverage.*
|
| 18 |
+
.cache
|
| 19 |
+
nosetests.xml
|
| 20 |
+
coverage.xml
|
| 21 |
+
*.cover
|
| 22 |
+
*.log
|
| 23 |
+
.git
|
| 24 |
+
.mypy_cache
|
| 25 |
+
.pytest_cache
|
| 26 |
+
.hypothesis
|
.gitattributes copy
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment variables
|
| 2 |
+
.env
|
| 3 |
+
.env.local
|
| 4 |
+
.env.production
|
| 5 |
+
|
| 6 |
+
# Python
|
| 7 |
+
__pycache__/
|
| 8 |
+
*.py[cod]
|
| 9 |
+
*$py.class
|
| 10 |
+
*.so
|
| 11 |
+
.Python
|
| 12 |
+
build/
|
| 13 |
+
develop-eggs/
|
| 14 |
+
dist/
|
| 15 |
+
downloads/
|
| 16 |
+
eggs/
|
| 17 |
+
.eggs/
|
| 18 |
+
lib/
|
| 19 |
+
lib64/
|
| 20 |
+
parts/
|
| 21 |
+
sdist/
|
| 22 |
+
var/
|
| 23 |
+
wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
.cache
|
| 29 |
+
# Virtual environments
|
| 30 |
+
venv/
|
| 31 |
+
env/
|
| 32 |
+
ENV/
|
| 33 |
+
env.bak/
|
| 34 |
+
venv.bak/
|
| 35 |
+
|
| 36 |
+
# IDE
|
| 37 |
+
.vscode/
|
| 38 |
+
.idea/
|
| 39 |
+
*.swp
|
| 40 |
+
*.swo
|
| 41 |
+
*~
|
| 42 |
+
|
| 43 |
+
# OS
|
| 44 |
+
.DS_Store
|
| 45 |
+
Thumbs.db
|
| 46 |
+
|
| 47 |
+
# Logs
|
| 48 |
+
*.log
|
| 49 |
+
|
| 50 |
+
# Temporary files
|
| 51 |
+
*.tmp
|
| 52 |
+
*.temp
|
| 53 |
+
|
| 54 |
+
# FAISS index files
|
| 55 |
+
*.index
|
| 56 |
+
*.faiss
|
| 57 |
+
|
| 58 |
+
# PDF files (if you don't want to commit them)
|
| 59 |
+
*.pdf
|
| 60 |
+
|
| 61 |
+
DEPLOYMENT.md
|
Dockerfile
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.9-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Install system dependencies
|
| 6 |
+
RUN apt-get update && apt-get install -y \
|
| 7 |
+
build-essential \
|
| 8 |
+
tesseract-ocr \
|
| 9 |
+
libglib2.0-0 \
|
| 10 |
+
libsm6 \
|
| 11 |
+
libxext6 \
|
| 12 |
+
libxrender-dev \
|
| 13 |
+
poppler-utils \
|
| 14 |
+
&& apt-get clean \
|
| 15 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 16 |
+
|
| 17 |
+
# Create a non-root user
|
| 18 |
+
RUN useradd --create-home --shell /bin/bash appuser
|
| 19 |
+
|
| 20 |
+
# Copy requirements first for better caching
|
| 21 |
+
COPY requirements.txt .
|
| 22 |
+
|
| 23 |
+
# Install Python dependencies
|
| 24 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 25 |
+
|
| 26 |
+
# Copy application code
|
| 27 |
+
COPY . .
|
| 28 |
+
|
| 29 |
+
# Create cache directory with proper permissions
|
| 30 |
+
RUN mkdir -p /app/.cache && chown -R appuser:appuser /app
|
| 31 |
+
|
| 32 |
+
# Switch to non-root user
|
| 33 |
+
USER appuser
|
| 34 |
+
|
| 35 |
+
# Expose port
|
| 36 |
+
EXPOSE 7860
|
| 37 |
+
|
| 38 |
+
# Run the application
|
| 39 |
+
CMD ["python", "app.py"]
|
app.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import warnings
|
| 3 |
+
import logging
|
| 4 |
+
import time
|
| 5 |
+
import json
|
| 6 |
+
import hashlib
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 9 |
+
from threading import Lock
|
| 10 |
+
|
| 11 |
+
# Set up cache directory for HuggingFace models
|
| 12 |
+
cache_dir = os.path.join(os.getcwd(), ".cache")
|
| 13 |
+
os.makedirs(cache_dir, exist_ok=True)
|
| 14 |
+
os.environ['HF_HOME'] = cache_dir
|
| 15 |
+
os.environ['TRANSFORMERS_CACHE'] = cache_dir
|
| 16 |
+
|
| 17 |
+
# Suppress TensorFlow warnings
|
| 18 |
+
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
|
| 19 |
+
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
|
| 20 |
+
os.environ['TF_LOGGING_LEVEL'] = 'ERROR'
|
| 21 |
+
os.environ['TF_ENABLE_DEPRECATION_WARNINGS'] = '0'
|
| 22 |
+
|
| 23 |
+
warnings.filterwarnings('ignore', category=DeprecationWarning, module='tensorflow')
|
| 24 |
+
logging.getLogger('tensorflow').setLevel(logging.ERROR)
|
| 25 |
+
|
| 26 |
+
from fastapi import FastAPI, HTTPException, Depends, Header
|
| 27 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 28 |
+
from pydantic import BaseModel
|
| 29 |
+
from pdf_parser import parse_pdf_from_url_multithreaded as parse_pdf_from_url, parse_pdf_from_file_multithreaded as parse_pdf_from_file
|
| 30 |
+
from embedder import build_faiss_index, preload_model
|
| 31 |
+
from retriever import retrieve_chunks
|
| 32 |
+
from llm import query_gemini
|
| 33 |
+
import uvicorn
|
| 34 |
+
|
| 35 |
+
app = FastAPI(title="HackRx Insurance Policy Assistant", version="1.0.0")

# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# disallowed by the CORS spec (browsers will reject credentialed wildcard
# responses) and exposes the API to any origin — confirm whether credentialed
# cross-origin access is actually required.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 44 |
+
|
| 45 |
+
@app.on_event("startup")
async def startup_event():
    """Warm the sentence-transformer model at boot so the first request
    does not pay the (slow) model-download/load cost."""
    print("Starting up HackRx Insurance Policy Assistant...")
    print("Preloading sentence transformer model...")
    preload_model()
    print("Model preloading completed. API is ready to serve requests!")
|
| 51 |
+
|
| 52 |
+
@app.get("/")
async def root():
    """Liveness banner for the API root."""
    banner = {"message": "HackRx Insurance Policy Assistant API is running!"}
    return banner
|
| 55 |
+
|
| 56 |
+
@app.get("/health")
async def health_check():
    """Health-probe endpoint for load balancers / orchestration."""
    status_payload = {"status": "healthy"}
    return status_payload
|
| 59 |
+
|
| 60 |
+
class QueryRequest(BaseModel):
    """Request body for /api/v1/hackrx/run."""
    # URL of the policy document to download and parse.
    documents: str
    # Questions to answer against that document.
    questions: list[str]
|
| 63 |
+
|
| 64 |
+
class LocalQueryRequest(BaseModel):
    """Request body for /api/v1/hackrx/local."""
    # Filesystem path of a local PDF to parse.
    document_path: str
    # Questions to answer against that document.
    questions: list[str]
|
| 67 |
+
|
| 68 |
+
def verify_token(authorization: str = Header(None)):
    """FastAPI dependency: extract the bearer token from the Authorization header.

    Raises HTTP 401 when the header is missing, not a Bearer scheme, or
    carries an empty token. The token's value is not validated here —
    any non-empty bearer token is accepted.
    """
    if not authorization or not authorization.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Invalid authorization header")
    # Fix: str.replace("Bearer ", "") removed *every* occurrence of the
    # substring, corrupting tokens that happen to contain "Bearer ".
    # Slice off only the leading scheme prefix.
    token = authorization[len("Bearer "):]
    if not token:
        raise HTTPException(status_code=401, detail="Invalid token")
    return token
|
| 75 |
+
|
| 76 |
+
def process_batch(batch_questions, context_chunks):
    """Thin wrapper so ThreadPoolExecutor.submit can run one LLM batch call."""
    return query_gemini(batch_questions, context_chunks)
|
| 78 |
+
|
| 79 |
+
def get_document_id_from_url(url: str) -> str:
    """Derive a stable cache key for a document URL (MD5 hex digest).

    MD5 is used purely as a cache key here, not for security.
    """
    digest = hashlib.md5(url.encode())
    return digest.hexdigest()
|
| 81 |
+
|
| 82 |
+
# Document cache with thread safety.
# Maps md5(document URL) -> {"chunks", "index", "texts"} so repeat requests
# for the same URL skip download/parse/indexing.
# NOTE(review): the cache is never evicted, so memory grows with every
# distinct document URL — confirm whether a size/TTL bound is needed.
doc_cache = {}
doc_cache_lock = Lock()
|
| 85 |
+
|
| 86 |
+
@app.post("/api/v1/hackrx/run")
async def run_query(request: QueryRequest, token: str = Depends(verify_token)):
    """Answer questions about a policy document fetched from a URL.

    Pipeline: download/parse the PDF -> build (or reuse a cached) FAISS
    index -> retrieve top chunks per question -> answer questions in
    batched, threaded LLM calls. Returns {"answers": [...]} aligned with
    the incoming question order. Any unhandled failure becomes HTTP 500.
    """
    start_time = time.time()
    timing_data = {}
    try:
        print("=== INPUT JSON ===")
        print(json.dumps({"documents": request.documents, "questions": request.questions}, indent=2))
        print("==================\n")

        print(f"Processing {len(request.questions)} questions...")

        # PDF Parsing and FAISS Caching.
        # NOTE(review): the lock is held across download + indexing, so
        # first-time requests for different documents serialize here —
        # confirm this is acceptable for expected traffic. On a cache hit,
        # timing_data has no pdf_parsing / faiss_index_building entries.
        doc_id = get_document_id_from_url(request.documents)
        with doc_cache_lock:
            if doc_id in doc_cache:
                print("✅ Using cached document...")
                cached = doc_cache[doc_id]
                text_chunks = cached["chunks"]
                index = cached["index"]
                texts = cached["texts"]
            else:
                print("⚙️ Parsing and indexing new document...")
                pdf_start = time.time()
                text_chunks = parse_pdf_from_url(request.documents)
                timing_data['pdf_parsing'] = round(time.time() - pdf_start, 2)

                index_start = time.time()
                index, texts = build_faiss_index(text_chunks)
                timing_data['faiss_index_building'] = round(time.time() - index_start, 2)

                doc_cache[doc_id] = {
                    "chunks": text_chunks,
                    "index": index,
                    "texts": texts
                }

        # Chunk Retrieval: union of top chunks across all questions (a set,
        # so duplicates collapse and document order is not preserved).
        retrieval_start = time.time()
        all_chunks = set()
        for question in request.questions:
            top_chunks = retrieve_chunks(index, texts, question)
            all_chunks.update(top_chunks)
        timing_data['chunk_retrieval'] = round(time.time() - retrieval_start, 2)
        print(f"Retrieved {len(all_chunks)} unique chunks")

        # LLM Batch Processing: questions answered in batches of 10, fanned
        # out over up to 5 worker threads; answers keyed by absolute index.
        # NOTE(review): an empty questions list makes max_workers 0, which
        # ThreadPoolExecutor rejects — confirm callers always send >= 1.
        questions = request.questions
        context_chunks = list(all_chunks)
        batch_size = 10
        batches = [(i, questions[i:i + batch_size]) for i in range(0, len(questions), batch_size)]

        llm_start = time.time()
        results_dict = {}
        with ThreadPoolExecutor(max_workers=min(5, len(batches))) as executor:
            futures = [executor.submit(process_batch, batch, context_chunks) for _, batch in batches]
            # zip is safe: futures was built in the same order as batches.
            for (start_idx, batch), future in zip(batches, futures):
                try:
                    result = future.result()
                    if isinstance(result, dict) and "answers" in result:
                        for j, answer in enumerate(result["answers"]):
                            results_dict[start_idx + j] = answer
                    else:
                        for j in range(len(batch)):
                            results_dict[start_idx + j] = "Error in response"
                except Exception as e:
                    # A failed batch yields per-question error strings rather
                    # than failing the whole request.
                    for j in range(len(batch)):
                        results_dict[start_idx + j] = f"Error: {str(e)}"
        timing_data['llm_processing'] = round(time.time() - llm_start, 2)

        # Reassemble answers in original question order.
        responses = [results_dict.get(i, "Not Found") for i in range(len(questions))]
        timing_data['total_time'] = round(time.time() - start_time, 2)

        print(f"\n=== TIMING BREAKDOWN ===")
        for k, v in timing_data.items():
            print(f"{k}: {v}s")
        print(f"=======================\n")

        print(f"=== OUTPUT JSON ===")
        print(json.dumps({"answers": responses}, indent=2))
        print(f"==================\n")

        return {"answers": responses}

    except Exception as e:
        print(f"Error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 172 |
+
|
| 173 |
+
@app.post("/api/v1/hackrx/local")
async def run_local_query(request: LocalQueryRequest):
    """Answer questions about a local PDF file (development/testing path).

    Same pipeline as /api/v1/hackrx/run but reads the PDF from disk, does
    not use the document cache, and uses a larger LLM batch size (20).
    NOTE(review): unlike the URL endpoint, this route has no auth
    dependency — confirm that is intentional.
    """
    start_time = time.time()
    timing_data = {}
    try:
        print("=== INPUT JSON ===")
        print(json.dumps({"document_path": request.document_path, "questions": request.questions}, indent=2))
        print("==================\n")

        print(f"Processing {len(request.questions)} questions locally...")

        # Parse the local PDF into page-level text chunks.
        pdf_start = time.time()
        text_chunks = parse_pdf_from_file(request.document_path)
        timing_data['pdf_parsing'] = round(time.time() - pdf_start, 2)
        print(f"Extracted {len(text_chunks)} text chunks from PDF")

        # Embed and index the chunks for similarity search.
        index_start = time.time()
        index, texts = build_faiss_index(text_chunks)
        timing_data['faiss_index_building'] = round(time.time() - index_start, 2)

        # Union of top chunks across all questions (set: dedup, unordered).
        retrieval_start = time.time()
        all_chunks = set()
        for question in request.questions:
            top_chunks = retrieve_chunks(index, texts, question)
            all_chunks.update(top_chunks)
        timing_data['chunk_retrieval'] = round(time.time() - retrieval_start, 2)
        print(f"Retrieved {len(all_chunks)} unique chunks")

        # Batched, threaded LLM calls; answers keyed by absolute index.
        questions = request.questions
        context_chunks = list(all_chunks)
        batch_size = 20
        batches = [(i, questions[i:i + batch_size]) for i in range(0, len(questions), batch_size)]

        llm_start = time.time()
        results_dict = {}
        with ThreadPoolExecutor(max_workers=min(5, len(batches))) as executor:
            futures = [executor.submit(process_batch, batch, context_chunks) for _, batch in batches]
            # zip is safe: futures was built in the same order as batches.
            for (start_idx, batch), future in zip(batches, futures):
                try:
                    result = future.result()
                    if isinstance(result, dict) and "answers" in result:
                        for j, answer in enumerate(result["answers"]):
                            results_dict[start_idx + j] = answer
                    else:
                        for j in range(len(batch)):
                            results_dict[start_idx + j] = "Error in response"
                except Exception as e:
                    # A failed batch yields per-question error strings.
                    for j in range(len(batch)):
                        results_dict[start_idx + j] = f"Error: {str(e)}"
        timing_data['llm_processing'] = round(time.time() - llm_start, 2)

        # Reassemble answers in original question order.
        responses = [results_dict.get(i, "Not Found") for i in range(len(questions))]
        timing_data['total_time'] = round(time.time() - start_time, 2)

        print(f"\n=== TIMING BREAKDOWN ===")
        for k, v in timing_data.items():
            print(f"{k}: {v}s")
        print(f"=======================\n")

        print(f"=== OUTPUT JSON ===")
        print(json.dumps({"answers": responses}, indent=2))
        print(f"==================\n")

        return {"answers": responses}

    except Exception as e:
        print(f"Error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
| 241 |
+
|
| 242 |
+
if __name__ == "__main__":
    # Default to 7860 — the port EXPOSEd by the Dockerfile — unless the
    # platform injects PORT (e.g. Render / Heroku-style hosting).
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run("app:app", host="0.0.0.0", port=port)
|
embedder.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import faiss
|
| 2 |
+
import numpy as np
|
| 3 |
+
import os
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
|
| 6 |
+
cache_dir = os.path.join(os.getcwd(), ".cache")
|
| 7 |
+
os.makedirs(cache_dir, exist_ok=True)
|
| 8 |
+
os.environ['HF_HOME'] = cache_dir
|
| 9 |
+
os.environ['TRANSFORMERS_CACHE'] = cache_dir
|
| 10 |
+
|
| 11 |
+
_model = None
|
| 12 |
+
|
| 13 |
+
def preload_model(model_name="paraphrase-MiniLM-L3-v2"):
    """Load (once) and return the process-wide SentenceTransformer.

    Idempotent: after the first successful load, the cached instance is
    returned and `model_name` is ignored. Falls back to the fully-qualified
    hub id if the bare model name fails to resolve.
    """
    global _model
    if _model is not None:
        return _model

    print(f"Preloading sentence transformer model: {model_name}...")
    try:
        _model = SentenceTransformer(model_name, cache_folder=cache_dir)
    except Exception as e:
        # Retry with the "sentence-transformers/" org prefix; some hub
        # versions require the fully-qualified repository id.
        print(f"Primary model load failed: {e}")
        fallback_name = "sentence-transformers/" + model_name
        print(f"Trying fallback: {fallback_name}")
        _model = SentenceTransformer(fallback_name, cache_folder=cache_dir)

    print("✅ Model ready.")
    return _model
|
| 29 |
+
|
| 30 |
+
def get_model():
    """Return the shared embedding model (loading it on first use)."""
    # Alias kept for callers; preload_model() is idempotent after first load.
    return preload_model()
|
| 32 |
+
|
| 33 |
+
def build_faiss_index(chunks, batch_size=128, show_progress_bar=False):
    """Embed *chunks* and add them to a flat (exact-search) FAISS index.

    Returns (index, chunks) so callers can map search hits back to the
    original text. Embeddings are L2-normalized, so L2 distance ranks
    results identically to cosine similarity.
    """
    model = get_model()
    embeddings = model.encode(
        chunks,
        batch_size=batch_size,
        show_progress_bar=show_progress_bar,
        convert_to_numpy=True,
        normalize_embeddings=True
    )
    # NOTE(review): an empty `chunks` list would fail on shape[1] below —
    # confirm callers always pass at least one chunk.
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    return index, chunks
|
llm.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import google.generativeai as genai
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
|
| 6 |
+
load_dotenv()
|
| 7 |
+
|
| 8 |
+
# Support multiple Gemini keys (comma-separated or single key)
|
| 9 |
+
api_keys = os.getenv("GOOGLE_API_KEYS") or os.getenv("GOOGLE_API_KEY")
|
| 10 |
+
if not api_keys:
|
| 11 |
+
raise ValueError("No Gemini API keys found in GOOGLE_API_KEYS or GOOGLE_API_KEY environment variable.")
|
| 12 |
+
|
| 13 |
+
api_keys = [k.strip() for k in api_keys.split(",") if k.strip()]
|
| 14 |
+
print(f"Loaded {len(api_keys)} Gemini API key(s)")
|
| 15 |
+
|
| 16 |
+
def query_gemini(questions, contexts, max_retries=3):
    """Answer *questions* from *contexts* via Gemini, rotating API keys.

    Each configured key is tried up to `max_retries` times, round-robin.
    On success returns the parsed {"answers": [...]} dict; on total failure
    returns a same-length answers list carrying the last error message
    (never raises).
    """
    import itertools

    context = "\n\n".join(contexts)
    questions_text = "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])

    prompt = f"""
You are an expert insurance assistant responsible for drafting formal, policy-aligned answers to user questions. Each response must follow structured formatting, proper terminology, and clean grammar. The tone must reflect official insurance documentation but remain simple and understandable to any reader.

FORMAT & STYLE GUIDELINES:

- Use third-person professional language only. Avoid “you”, “we”, or “policyholder”.
- Begin answers with “Yes,” “No,” or “Can apply” where applicable.
- Each answer must contain 2–3 clear sentences, each with a defined role:
  1. First sentence: Direct answer (Yes/No/Definition).
  2. Second sentence: Clarification, eligibility, limits, or conditions.
  3. Optional third (if needed): Legal basis or policy clause (e.g., specific Act, PPN rule).
- Write numbers in word–digit format (e.g., “thirty-six (36) months”).
- Use formal but human-readable insurance terms (e.g., “Sum Insured”, “grace period”, “renewal”, “direct complications”, “capped”, “continuous coverage”).
- Avoid passive constructions unless required by tone. Use precise, subject-led sentences.
- Maintain consistency in describing timeframes and benefits:
  - “A grace period of thirty (30) days is provided…”
  - “The benefit is limited to two (2) deliveries during the policy period.”
- Always include limits, duration, eligibility, and conditions, when relevant.

STRUCTURED ANSWERING BEHAVIOR:

- If an answer is Yes/No/Conditional:
  - Start with that term and follow up with explanation.
- If the answer defines a feature (e.g., "What is hospital?"):
  - Start with the clean definition.
- Never elaborate with theory, history, or deep medical details.
- Do not repeat terms or explain known insurance concepts.
- Avoid vague statements — prefer clarity: "is capped at", "must be", "is covered under", etc.

DO NOT:

- Say “according to the document” or “based on context”.
- Use markdown, emojis, or formatting symbols like %, ₹, or bullets.
- Give long explanations, bullet points, or repeat words/ideas.
- Mention “context”, “source”, or “document” at all.
- Use uncertain or filler language (e.g., “It might”, “It appears”, “It could be”).

✅ DO:
- Write in clean, informative language.
- Give complete answers in 2–3 sentences maximum.

📝 EXAMPLE ANSWERS:
- "Yes, the policy covers damage to personal property caused by fire, up to a limit of $50,000."
- "No, the policy does not cover pre-existing conditions."
- "The waiting period for coverage to begin is 30 days from the start date of the policy."

📤 OUTPUT FORMAT (strict):
Respond with only the following JSON — no explanations, no comments, no markdown:

{{
  "answers": [
    "Answer to question 1",
    "Answer to question 2",
    ...
  ]
}}


📚 CONTEXT:
{context}

❓ QUESTIONS:
{questions_text}

Your task: For each question, provide a complete, professional, and clearly written answer in 2–3 sentences using a formal but readable tone.
"""

    last_exception = None
    total_attempts = len(api_keys) * max_retries
    key_cycle = itertools.cycle(api_keys)

    for attempt in range(total_attempts):
        key = next(key_cycle)
        try:
            genai.configure(api_key=key)
            model = genai.GenerativeModel("gemini-2.5-flash-lite")
            response = model.generate_content(prompt)
            response_text = getattr(response, "text", "").strip()

            if not response_text:
                raise ValueError("Empty response received from Gemini API.")

            # Strip a markdown code fence if the model wrapped its JSON.
            if response_text.startswith("```json"):
                response_text = response_text.replace("```json", "").replace("```", "").strip()
            elif response_text.startswith("```"):
                response_text = response_text.replace("```", "").strip()

            parsed = json.loads(response_text)
            if "answers" in parsed and isinstance(parsed["answers"], list):
                return parsed
            else:
                raise ValueError("Invalid response format received from Gemini.")

        except Exception as e:
            # Any failure (quota, network, malformed JSON) rotates to the
            # next key. (Fix: removed dead local `msg = str(e).lower()` —
            # it was computed but never used.)
            last_exception = e
            print(f"[Retry {attempt+1}/{total_attempts}] Gemini key {key[:8]}... failed: {e}")
            continue

    print(f"All Gemini API attempts failed. Last error: {last_exception}")
    return {"answers": [f"Error generating response: {str(last_exception)}"] * len(questions)}
|
pdf_parser.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz # PyMuPDF
|
| 2 |
+
import requests
|
| 3 |
+
from io import BytesIO
|
| 4 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 5 |
+
from PIL import Image
|
| 6 |
+
import pytesseract
|
| 7 |
+
import imghdr
|
| 8 |
+
|
| 9 |
+
def _extract_text(page):
|
| 10 |
+
text = page.get_text()
|
| 11 |
+
return text.strip() if text and text.strip() else None
|
| 12 |
+
|
| 13 |
+
def is_image(content):
    """Return True when *content* begins with a known raster-image signature.

    Recognizes the same formats the previous imghdr-based check did:
    JPEG, PNG, BMP, GIF, TIFF, and WebP. Re-implemented with explicit
    magic-byte sniffing because the `imghdr` module is deprecated (PEP 594)
    and removed in Python 3.13.
    """
    if not content:
        return False
    signatures = (
        b"\xff\xd8\xff",          # JPEG
        b"\x89PNG\r\n\x1a\n",     # PNG
        b"BM",                    # BMP
        b"GIF87a",                # GIF (1987)
        b"GIF89a",                # GIF (1989)
        b"II*\x00",               # TIFF, little-endian
        b"MM\x00*",               # TIFF, big-endian
    )
    if content.startswith(signatures):
        return True
    # WebP: RIFF container whose fourcc at offset 8 is "WEBP".
    return content[:4] == b"RIFF" and content[8:12] == b"WEBP"
|
| 15 |
+
|
| 16 |
+
def extract_text_from_image_bytes(image_bytes):
    """OCR the given raw image bytes with Tesseract; returns stripped text.

    Raises whatever PIL/pytesseract raise on undecodable input — callers
    are expected to catch.
    """
    image = Image.open(BytesIO(image_bytes))
    return pytesseract.image_to_string(image).strip()
|
| 19 |
+
|
| 20 |
+
def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
    """
    Download document (PDF or Image) from URL, extract text accordingly.
    Gracefully return fallback message if unsupported or failed.

    Always returns a non-empty list of text chunks; on any failure it
    returns a single-element list with a human-readable "No data found..."
    marker instead of raising.
    """
    try:
        # Fix: the original request had no timeout, so a stalled server
        # could hang a worker thread indefinitely. 30s comfortably covers
        # large policy PDFs.
        res = requests.get(url, timeout=30)
        content = res.content
        content_type = res.headers.get("content-type", "").lower()
    except Exception as e:
        print(f"❌ Failed to download: {str(e)}")
        return [f"No data found in this document (download error)"]

    # Check for unsupported content
    if "zip" in content_type or url.endswith(".zip"):
        return ["No data found in this document (zip)"]
    if "octet-stream" in content_type or url.endswith(".bin"):
        return ["No data found in this document (bin)"]

    # OCR for image files
    if "image" in content_type or is_image(content):
        print("📷 Detected image file. Using OCR...")
        try:
            text = extract_text_from_image_bytes(content)
            return [text] if text else ["No data found in this document (image empty)"]
        except Exception as e:
            print(f"❌ OCR failed: {str(e)}")
            return [f"No data found in this document (image/OCR error)"]

    # Try PDF fallback
    try:
        with fitz.open(stream=BytesIO(content), filetype="pdf") as doc:
            pages = list(doc)
            # executor.map preserves page order even with parallel workers.
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                texts = list(executor.map(_extract_text, pages))
            if chunk_size > 1:
                # Group every `chunk_size` consecutive pages into one chunk.
                chunks = []
                for i in range(0, len(texts), chunk_size):
                    chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
                    if chunk:
                        chunks.append(chunk)
                return chunks if chunks else ["No data found in this document (empty PDF)"]
            return [t for t in texts if t] or ["No data found in this document (empty PDF)"]
    except Exception as e:
        print(f"❌ Failed to parse as PDF: {str(e)}")
        return [f"No data found in this document (not PDF or corrupted)"]
|
| 66 |
+
|
| 67 |
+
def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
    """
    Parse a local PDF file, extract text in parallel, optionally chunk pages.

    Returns a non-empty list of text chunks; on failure returns a
    single-element "No data found..." marker instead of raising.
    """
    try:
        with fitz.open(file_path) as doc:
            pages = list(doc)
            # executor.map preserves page order even with parallel workers.
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                texts = list(executor.map(_extract_text, pages))
            if chunk_size > 1:
                # Group every `chunk_size` consecutive pages into one chunk.
                chunks = []
                for i in range(0, len(texts), chunk_size):
                    chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
                    if chunk:
                        chunks.append(chunk)
                return chunks if chunks else ["No data found in this document (local PDF empty)"]
            return [t for t in texts if t] or ["No data found in this document (local PDF empty)"]
    except Exception as e:
        print(f"❌ Failed to open local file: {str(e)}")
        return [f"No data found in this document (local file error)"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
requests
|
| 4 |
+
faiss-cpu
|
| 5 |
+
sentence-transformers
|
| 6 |
+
PyMuPDF
|
| 7 |
+
python-dotenv
|
| 8 |
+
tf-keras
|
| 9 |
+
google-generativeai
|
| 10 |
+
pytesseract
|
| 11 |
+
Pillow
|
| 12 |
+
|
retriever.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sentence_transformers.util import cos_sim
|
| 2 |
+
from embedder import get_model
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
def retrieve_chunks(index, texts, question, top_k=15):
    """Return up to *top_k* text chunks most similar to *question*.

    Embeds the question with the shared model (normalized, matching how the
    index was built) and runs an exact FAISS search.
    """
    model = get_model()
    q_embedding = model.encode([question], convert_to_numpy=True, normalize_embeddings=True)[0]

    scores, indices = index.search(np.array([q_embedding]), top_k)
    # Fix: FAISS pads results with -1 when fewer than top_k vectors are
    # indexed; texts[-1] would then silently return the *last* chunk as a
    # bogus match. Keep only valid indices.
    selected = [texts[i] for i in indices[0] if 0 <= i < len(texts)]
    return selected
|