TharanJ commited on
Commit
5281ffe
·
1 Parent(s): 22fda55
Files changed (10) hide show
  1. .dockerignore +26 -0
  2. .gitattributes copy +35 -0
  3. .gitignore +61 -0
  4. Dockerfile +39 -0
  5. app.py +244 -0
  6. embedder.py +45 -0
  7. llm.py +122 -0
  8. pdf_parser.py +86 -0
  9. requirements.txt +12 -0
  10. retriever.py +11 -0
.dockerignore ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .gitignore
3
+ README.md
4
+ DEPLOYMENT.md
5
+ render.yaml
6
+ start.sh
7
+ __pycache__
8
+ *.pyc
9
+ *.pyo
10
+ *.pyd
11
+ .Python
12
+ env
13
+ pip-log.txt
14
+ pip-delete-this-directory.txt
15
+ .tox
16
+ .coverage
17
+ .coverage.*
18
+ .cache
19
+ nosetests.xml
20
+ coverage.xml
21
+ *.cover
22
+ *.log
23
+ .git
24
+ .mypy_cache
25
+ .pytest_cache
26
+ .hypothesis
.gitattributes copy ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment variables
2
+ .env
3
+ .env.local
4
+ .env.production
5
+
6
+ # Python
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+ *.so
11
+ .Python
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+ .cache
29
+ # Virtual environments
30
+ venv/
31
+ env/
32
+ ENV/
33
+ env.bak/
34
+ venv.bak/
35
+
36
+ # IDE
37
+ .vscode/
38
+ .idea/
39
+ *.swp
40
+ *.swo
41
+ *~
42
+
43
+ # OS
44
+ .DS_Store
45
+ Thumbs.db
46
+
47
+ # Logs
48
+ *.log
49
+
50
+ # Temporary files
51
+ *.tmp
52
+ *.temp
53
+
54
+ # FAISS index files
55
+ *.index
56
+ *.faiss
57
+
58
+ # PDF files (if you don't want to commit them)
59
+ *.pdf
60
+
61
+ DEPLOYMENT.md
Dockerfile ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9-slim

WORKDIR /app

# Install system dependencies
# (tesseract-ocr backs pytesseract OCR; poppler-utils and the X libs support
# PDF/image handling; build-essential lets pip compile native extensions)
RUN apt-get update && apt-get install -y \
    build-essential \
    tesseract-ocr \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    poppler-utils \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Create a non-root user
RUN useradd --create-home --shell /bin/bash appuser

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create cache directory with proper permissions
# (app.py and embedder.py download HuggingFace models into ./.cache at runtime)
RUN mkdir -p /app/.cache && chown -R appuser:appuser /app

# Switch to non-root user
USER appuser

# Expose port (app.py defaults to 7860 when PORT is unset)
EXPOSE 7860

# Run the application
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import warnings
3
+ import logging
4
+ import time
5
+ import json
6
+ import hashlib
7
+ from datetime import datetime
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ from threading import Lock
10
+
11
# Set up cache directory for HuggingFace models
# (must run before transformers/sentence-transformers are imported, since
# they read these variables at import time)
cache_dir = os.path.join(os.getcwd(), ".cache")
os.makedirs(cache_dir, exist_ok=True)
os.environ['HF_HOME'] = cache_dir
# NOTE(review): TRANSFORMERS_CACHE is deprecated in newer transformers
# releases in favor of HF_HOME — kept here for older versions; confirm
# against the pinned transformers version.
os.environ['TRANSFORMERS_CACHE'] = cache_dir

# Suppress TensorFlow warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['TF_LOGGING_LEVEL'] = 'ERROR'
os.environ['TF_ENABLE_DEPRECATION_WARNINGS'] = '0'

warnings.filterwarnings('ignore', category=DeprecationWarning, module='tensorflow')
logging.getLogger('tensorflow').setLevel(logging.ERROR)
25
+
26
+ from fastapi import FastAPI, HTTPException, Depends, Header
27
+ from fastapi.middleware.cors import CORSMiddleware
28
+ from pydantic import BaseModel
29
+ from pdf_parser import parse_pdf_from_url_multithreaded as parse_pdf_from_url, parse_pdf_from_file_multithreaded as parse_pdf_from_file
30
+ from embedder import build_faiss_index, preload_model
31
+ from retriever import retrieve_chunks
32
+ from llm import query_gemini
33
+ import uvicorn
34
+
35
app = FastAPI(title="HackRx Insurance Policy Assistant", version="1.0.0")

# Open CORS so arbitrary frontends can call the API.
# NOTE(review): browsers reject allow_credentials=True combined with
# allow_origins=["*"] per the CORS spec — credentialed requests would need
# explicit origins; confirm whether credentials are actually used.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
44
+
45
@app.on_event("startup")
async def startup_event():
    """Preload the sentence-transformer model at boot so the first request
    does not pay the model-download/load latency.

    NOTE(review): @app.on_event is deprecated in recent FastAPI in favor of
    lifespan handlers; functional, but worth migrating.
    """
    print("Starting up HackRx Insurance Policy Assistant...")
    print("Preloading sentence transformer model...")
    preload_model()
    print("Model preloading completed. API is ready to serve requests!")
51
+
52
@app.get("/")
async def root():
    """Root endpoint: returns a human-readable liveness message."""
    return {"message": "HackRx Insurance Policy Assistant API is running!"}
55
+
56
@app.get("/health")
async def health_check():
    """Simple health probe endpoint; always reports healthy."""
    return {"status": "healthy"}
59
+
60
class QueryRequest(BaseModel):
    """Request body for POST /api/v1/hackrx/run."""
    documents: str        # URL of the policy document (PDF or image)
    questions: list[str]  # questions to answer against the document
63
+
64
class LocalQueryRequest(BaseModel):
    """Request body for POST /api/v1/hackrx/local (local file, no auth)."""
    document_path: str    # filesystem path to a PDF on the server
    questions: list[str]  # questions to answer against the document
67
+
68
def verify_token(authorization: str = Header(None)):
    """Validate the `Authorization: Bearer <token>` header and return the token.

    Raises:
        HTTPException: 401 when the header is missing, lacks the Bearer
        scheme, or the token part is empty.
    """
    if not authorization or not authorization.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Invalid authorization header")
    # str.replace("Bearer ", "") would strip EVERY occurrence of the
    # substring, corrupting tokens that happen to contain "Bearer ";
    # removeprefix strips only the leading scheme.
    token = authorization.removeprefix("Bearer ")
    if not token:
        raise HTTPException(status_code=401, detail="Invalid token")
    return token
75
+
76
def process_batch(batch_questions, context_chunks):
    # Thin, picklable-free wrapper so ThreadPoolExecutor can fan out one
    # Gemini call per question batch.
    return query_gemini(batch_questions, context_chunks)
78
+
79
def get_document_id_from_url(url: str) -> str:
    """Derive a stable cache key for a document URL (hex MD5 digest).

    MD5 is used purely as a fast, stable fingerprint for cache lookup,
    not for any security purpose.
    """
    digest = hashlib.md5(url.encode())
    return digest.hexdigest()
81
+
82
# Document cache with thread safety.
# Maps md5(document URL) -> {"chunks", "index", "texts"}.
# NOTE(review): grows without bound for the process lifetime (no eviction).
doc_cache = {}
doc_cache_lock = Lock()
85
+
86
@app.post("/api/v1/hackrx/run")
async def run_query(request: QueryRequest, token: str = Depends(verify_token)):
    """Answer a batch of questions about a remote policy document.

    Pipeline: download/parse the document (cached per URL), build or reuse a
    FAISS index, retrieve relevant chunks per question, then answer the
    questions in parallel LLM batches.

    Returns:
        {"answers": [...]} aligned one-to-one with the input question order.

    Raises:
        HTTPException: 500 on any unexpected failure.
    """
    start_time = time.time()
    timing_data = {}
    try:
        print("=== INPUT JSON ===")
        print(json.dumps({"documents": request.documents, "questions": request.questions}, indent=2))
        print("==================\n")

        print(f"Processing {len(request.questions)} questions...")

        # Guard: an empty question list would later create a
        # ThreadPoolExecutor with max_workers=0, which raises ValueError.
        if not request.questions:
            return {"answers": []}

        # --- PDF parsing and FAISS indexing (cached per document URL) ---
        # NOTE(review): the lock is held across parsing/indexing, so one slow
        # document serializes all requests (and this CPU-bound work runs in
        # an async handler, blocking the event loop) — consider per-document
        # locks and offloading to a thread.
        doc_id = get_document_id_from_url(request.documents)
        with doc_cache_lock:
            if doc_id in doc_cache:
                print("✅ Using cached document...")
                cached = doc_cache[doc_id]
                text_chunks = cached["chunks"]
                index = cached["index"]
                texts = cached["texts"]
            else:
                print("⚙️ Parsing and indexing new document...")
                pdf_start = time.time()
                text_chunks = parse_pdf_from_url(request.documents)
                timing_data['pdf_parsing'] = round(time.time() - pdf_start, 2)

                index_start = time.time()
                index, texts = build_faiss_index(text_chunks)
                timing_data['faiss_index_building'] = round(time.time() - index_start, 2)

                doc_cache[doc_id] = {
                    "chunks": text_chunks,
                    "index": index,
                    "texts": texts,
                }

        # --- Chunk retrieval ---
        # Dedupe while preserving retrieval order: a set() yields a
        # nondeterministic chunk ordering in the LLM context between runs.
        retrieval_start = time.time()
        unique_chunks = dict.fromkeys(
            chunk
            for question in request.questions
            for chunk in retrieve_chunks(index, texts, question)
        )
        timing_data['chunk_retrieval'] = round(time.time() - retrieval_start, 2)
        print(f"Retrieved {len(unique_chunks)} unique chunks")

        # --- LLM batch processing: fan batches out across worker threads ---
        questions = request.questions
        context_chunks = list(unique_chunks)
        batch_size = 10
        batches = [(i, questions[i:i + batch_size]) for i in range(0, len(questions), batch_size)]

        llm_start = time.time()
        results_dict = {}
        with ThreadPoolExecutor(max_workers=min(5, len(batches))) as executor:
            futures = [executor.submit(process_batch, batch, context_chunks) for _, batch in batches]
            for (start_idx, batch), future in zip(batches, futures):
                try:
                    result = future.result()
                    if isinstance(result, dict) and "answers" in result:
                        for j, answer in enumerate(result["answers"]):
                            results_dict[start_idx + j] = answer
                    else:
                        for j in range(len(batch)):
                            results_dict[start_idx + j] = "Error in response"
                except Exception as e:
                    # A failed batch degrades to per-question error strings
                    # rather than failing the whole request.
                    for j in range(len(batch)):
                        results_dict[start_idx + j] = f"Error: {str(e)}"
        timing_data['llm_processing'] = round(time.time() - llm_start, 2)

        responses = [results_dict.get(i, "Not Found") for i in range(len(questions))]
        timing_data['total_time'] = round(time.time() - start_time, 2)

        print("\n=== TIMING BREAKDOWN ===")
        for k, v in timing_data.items():
            print(f"{k}: {v}s")
        print("=======================\n")

        print("=== OUTPUT JSON ===")
        print(json.dumps({"answers": responses}, indent=2))
        print("==================\n")

        return {"answers": responses}

    except Exception as e:
        print(f"Error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
172
+
173
@app.post("/api/v1/hackrx/local")
async def run_local_query(request: LocalQueryRequest):
    """Answer a batch of questions about a local PDF file (no auth, no cache).

    Same pipeline as /api/v1/hackrx/run but parses a filesystem path and
    uses a larger LLM batch size (20).

    Returns:
        {"answers": [...]} aligned one-to-one with the input question order.

    Raises:
        HTTPException: 500 on any unexpected failure.
    """
    start_time = time.time()
    timing_data = {}
    try:
        print("=== INPUT JSON ===")
        print(json.dumps({"document_path": request.document_path, "questions": request.questions}, indent=2))
        print("==================\n")

        print(f"Processing {len(request.questions)} questions locally...")

        # Guard: an empty question list would later create a
        # ThreadPoolExecutor with max_workers=0, which raises ValueError.
        if not request.questions:
            return {"answers": []}

        pdf_start = time.time()
        text_chunks = parse_pdf_from_file(request.document_path)
        timing_data['pdf_parsing'] = round(time.time() - pdf_start, 2)
        print(f"Extracted {len(text_chunks)} text chunks from PDF")

        index_start = time.time()
        index, texts = build_faiss_index(text_chunks)
        timing_data['faiss_index_building'] = round(time.time() - index_start, 2)

        # Dedupe retrieved chunks while preserving order (a set() would make
        # the LLM context ordering nondeterministic between runs).
        retrieval_start = time.time()
        unique_chunks = dict.fromkeys(
            chunk
            for question in request.questions
            for chunk in retrieve_chunks(index, texts, question)
        )
        timing_data['chunk_retrieval'] = round(time.time() - retrieval_start, 2)
        print(f"Retrieved {len(unique_chunks)} unique chunks")

        questions = request.questions
        context_chunks = list(unique_chunks)
        batch_size = 20
        batches = [(i, questions[i:i + batch_size]) for i in range(0, len(questions), batch_size)]

        llm_start = time.time()
        results_dict = {}
        with ThreadPoolExecutor(max_workers=min(5, len(batches))) as executor:
            futures = [executor.submit(process_batch, batch, context_chunks) for _, batch in batches]
            for (start_idx, batch), future in zip(batches, futures):
                try:
                    result = future.result()
                    if isinstance(result, dict) and "answers" in result:
                        for j, answer in enumerate(result["answers"]):
                            results_dict[start_idx + j] = answer
                    else:
                        for j in range(len(batch)):
                            results_dict[start_idx + j] = "Error in response"
                except Exception as e:
                    # A failed batch degrades to per-question error strings
                    # rather than failing the whole request.
                    for j in range(len(batch)):
                        results_dict[start_idx + j] = f"Error: {str(e)}"
        timing_data['llm_processing'] = round(time.time() - llm_start, 2)

        responses = [results_dict.get(i, "Not Found") for i in range(len(questions))]
        timing_data['total_time'] = round(time.time() - start_time, 2)

        print("\n=== TIMING BREAKDOWN ===")
        for k, v in timing_data.items():
            print(f"{k}: {v}s")
        print("=======================\n")

        print("=== OUTPUT JSON ===")
        print(json.dumps({"answers": responses}, indent=2))
        print("==================\n")

        return {"answers": responses}

    except Exception as e:
        print(f"Error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
241
+
242
if __name__ == "__main__":
    # Respect a platform-provided PORT, defaulting to 7860 — the port the
    # Dockerfile EXPOSEs.
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run("app:app", host="0.0.0.0", port=port)
embedder.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import numpy as np
3
+ import os
4
+ from sentence_transformers import SentenceTransformer
5
+
6
# HF cache setup (same directory app.py configures) so the embedder also
# works when imported standalone; must run before model loading.
cache_dir = os.path.join(os.getcwd(), ".cache")
os.makedirs(cache_dir, exist_ok=True)
os.environ['HF_HOME'] = cache_dir
os.environ['TRANSFORMERS_CACHE'] = cache_dir
10
+
11
+ _model = None
12
+
13
def preload_model(model_name="paraphrase-MiniLM-L3-v2"):
    """Load (once) and cache the shared sentence-transformer encoder.

    Falls back to the fully-qualified "sentence-transformers/<name>" model id
    if the short id fails to resolve. Subsequent calls return the cached
    instance.

    NOTE(review): not thread-safe — two concurrent first calls may both load
    the model (wasteful but harmless, last assignment wins).
    """
    global _model
    if _model is not None:
        return _model

    print(f"Preloading sentence transformer model: {model_name}...")
    try:
        _model = SentenceTransformer(model_name, cache_folder=cache_dir)
    except Exception as e:
        # Some hub versions only resolve the org-prefixed id.
        print(f"Primary model load failed: {e}")
        fallback_name = "sentence-transformers/" + model_name
        print(f"Trying fallback: {fallback_name}")
        _model = SentenceTransformer(fallback_name, cache_folder=cache_dir)

    print("✅ Model ready.")
    return _model
29
+
30
def get_model():
    """Return the shared encoder, loading it lazily on first use."""
    return preload_model()
32
+
33
def build_faiss_index(chunks, batch_size=128, show_progress_bar=False):
    """Embed text chunks and build an exact (brute-force) FAISS index.

    Args:
        chunks: list of text strings to embed.
        batch_size: encoder batch size.
        show_progress_bar: forwarded to SentenceTransformer.encode.

    Returns:
        (index, chunks): the FAISS index plus the chunk list in index order,
        so row i of the index corresponds to chunks[i].
    """
    model = get_model()
    embeddings = model.encode(
        chunks,
        batch_size=batch_size,
        show_progress_bar=show_progress_bar,
        convert_to_numpy=True,
        # Unit-norm vectors make L2 distance a monotonic function of cosine
        # similarity, so IndexFlatL2 ranks like cosine search.
        normalize_embeddings=True
    )
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    # NOTE(review): an empty `chunks` list would make embeddings.shape[1]
    # raise — callers currently always pass at least one (fallback) chunk.
    return index, chunks
llm.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import google.generativeai as genai
2
+ import os
3
+ import json
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+
8
# Support multiple Gemini keys (comma-separated or single key).
# GOOGLE_API_KEYS takes precedence over GOOGLE_API_KEY.
api_keys = os.getenv("GOOGLE_API_KEYS") or os.getenv("GOOGLE_API_KEY")
if not api_keys:
    # Fail fast at import time rather than on the first request.
    raise ValueError("No Gemini API keys found in GOOGLE_API_KEYS or GOOGLE_API_KEY environment variable.")

# Normalize to a list, tolerating stray whitespace and empty segments.
api_keys = [k.strip() for k in api_keys.split(",") if k.strip()]
print(f"Loaded {len(api_keys)} Gemini API key(s)")
15
+
16
def query_gemini(questions, contexts, max_retries=3):
    """Answer a batch of questions with Gemini, rotating across API keys.

    Args:
        questions: list of question strings; answers come back in order.
        contexts: list of context chunk strings shared by all questions.
        max_retries: retry rounds per key; total attempts = len(api_keys) * max_retries.

    Returns:
        dict with an "answers" list on success; on total failure, a dict
        whose "answers" are identical error strings, one per question.
    """
    import itertools

    context = "\n\n".join(contexts)
    questions_text = "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])

    prompt = f"""
You are an expert insurance assistant responsible for drafting formal, policy-aligned answers to user questions. Each response must follow structured formatting, proper terminology, and clean grammar. The tone must reflect official insurance documentation but remain simple and understandable to any reader.

FORMAT & STYLE GUIDELINES:

- Use third-person professional language only. Avoid “you”, “we”, or “policyholder”.
- Begin answers with “Yes,” “No,” or “Can apply” where applicable.
- Each answer must contain 2–3 clear sentences, each with a defined role:
  1. First sentence: Direct answer (Yes/No/Definition).
  2. Second sentence: Clarification, eligibility, limits, or conditions.
  3. Optional third (if needed): Legal basis or policy clause (e.g., specific Act, PPN rule).
- Write numbers in word–digit format (e.g., “thirty-six (36) months”).
- Use formal but human-readable insurance terms (e.g., “Sum Insured”, “grace period”, “renewal”, “direct complications”, “capped”, “continuous coverage”).
- Avoid passive constructions unless required by tone. Use precise, subject-led sentences.
- Maintain consistency in describing timeframes and benefits:
  - “A grace period of thirty (30) days is provided…”
  - “The benefit is limited to two (2) deliveries during the policy period.”
- Always include limits, duration, eligibility, and conditions, when relevant.

STRUCTURED ANSWERING BEHAVIOR:

- If an answer is Yes/No/Conditional:
  - Start with that term and follow up with explanation.
- If the answer defines a feature (e.g., "What is hospital?"):
  - Start with the clean definition.
- Never elaborate with theory, history, or deep medical details.
- Do not repeat terms or explain known insurance concepts.
- Avoid vague statements — prefer clarity: "is capped at", "must be", "is covered under", etc.

DO NOT:

- Say “according to the document” or “based on context”.
- Use markdown, emojis, or formatting symbols like %, ₹, or bullets.
- Give long explanations, bullet points, or repeat words/ideas.
- Mention “context”, “source”, or “document” at all.
- Use uncertain or filler language (e.g., “It might”, “It appears”, “It could be”).

✅ DO:
- Write in clean, informative language.
- Give complete answers in 2–3 sentences maximum.

📝 EXAMPLE ANSWERS:
- "Yes, the policy covers damage to personal property caused by fire, up to a limit of $50,000."
- "No, the policy does not cover pre-existing conditions."
- "The waiting period for coverage to begin is 30 days from the start date of the policy."

📤 OUTPUT FORMAT (strict):
Respond with only the following JSON — no explanations, no comments, no markdown:

{{
"answers": [
"Answer to question 1",
"Answer to question 2",
...
]
}}


📚 CONTEXT:
{context}

❓ QUESTIONS:
{questions_text}

Your task: For each question, provide a complete, professional, and clearly written answer in 2–3 sentences using a formal but readable tone.
"""

    last_exception = None
    total_attempts = len(api_keys) * max_retries
    key_cycle = itertools.cycle(api_keys)

    for attempt in range(total_attempts):
        key = next(key_cycle)
        try:
            # NOTE(review): genai.configure mutates process-global state; with
            # the concurrent callers in app.py (ThreadPoolExecutor), a request
            # may execute under a different key than the one configured here.
            genai.configure(api_key=key)
            model = genai.GenerativeModel("gemini-2.5-flash-lite")
            response = model.generate_content(prompt)
            response_text = getattr(response, "text", "").strip()

            if not response_text:
                raise ValueError("Empty response received from Gemini API.")

            # Strip markdown code fences that some model responses wrap
            # their JSON in.
            if response_text.startswith("```json"):
                response_text = response_text.replace("```json", "").replace("```", "").strip()
            elif response_text.startswith("```"):
                response_text = response_text.replace("```", "").strip()

            parsed = json.loads(response_text)
            if "answers" in parsed and isinstance(parsed["answers"], list):
                return parsed
            else:
                raise ValueError("Invalid response format received from Gemini.")

        except Exception as e:
            # Any failure (network, quota, bad JSON) rotates to the next key.
            # (Removed an unused `msg = str(e).lower()` dead local.)
            last_exception = e
            print(f"[Retry {attempt+1}/{total_attempts}] Gemini key {key[:8]}... failed: {e}")
            continue

    print(f"All Gemini API attempts failed. Last error: {last_exception}")
    return {"answers": [f"Error generating response: {str(last_exception)}"] * len(questions)}
pdf_parser.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import requests
3
+ from io import BytesIO
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ from PIL import Image
6
+ import pytesseract
7
+ import imghdr
8
+
9
+ def _extract_text(page):
10
+ text = page.get_text()
11
+ return text.strip() if text and text.strip() else None
12
+
13
def is_image(content):
    # Sniff the raw byte signature to detect common raster image formats.
    # NOTE(review): imghdr is deprecated since Python 3.11 and removed in
    # 3.13 — fine on the pinned python:3.9 Docker image, but must be
    # replaced before any base-image upgrade.
    return imghdr.what(None, h=content) in ["jpeg", "png", "bmp", "gif", "tiff", "webp"]
15
+
16
def extract_text_from_image_bytes(image_bytes):
    """Run Tesseract OCR over raw image bytes and return the stripped text."""
    stream = BytesIO(image_bytes)
    picture = Image.open(stream)
    extracted = pytesseract.image_to_string(picture)
    return extracted.strip()
19
+
20
def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
    """
    Download document (PDF or Image) from URL, extract text accordingly.
    Gracefully return fallback message if unsupported or failed.

    Args:
        url: document URL (PDF or common raster image).
        max_workers: thread pool size for per-page text extraction.
        chunk_size: number of pages merged into each output chunk.

    Returns:
        A non-empty list of text chunks; on any failure a single-element
        fallback message list, so callers never receive an empty list.
    """
    try:
        # timeout prevents a hung download from blocking a worker forever;
        # raise_for_status routes HTTP errors (404/500) to the fallback path
        # instead of parsing an HTML error page as a PDF.
        res = requests.get(url, timeout=30)
        res.raise_for_status()
        content = res.content
        content_type = res.headers.get("content-type", "").lower()
    except Exception as e:
        print(f"❌ Failed to download: {str(e)}")
        return ["No data found in this document (download error)"]

    # Check for unsupported content
    if "zip" in content_type or url.endswith(".zip"):
        return ["No data found in this document (zip)"]
    if "octet-stream" in content_type or url.endswith(".bin"):
        return ["No data found in this document (bin)"]

    # OCR for image files
    if "image" in content_type or is_image(content):
        print("📷 Detected image file. Using OCR...")
        try:
            text = extract_text_from_image_bytes(content)
            return [text] if text else ["No data found in this document (image empty)"]
        except Exception as e:
            print(f"❌ OCR failed: {str(e)}")
            return ["No data found in this document (image/OCR error)"]

    # Try PDF fallback
    try:
        with fitz.open(stream=BytesIO(content), filetype="pdf") as doc:
            pages = list(doc)
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                texts = list(executor.map(_extract_text, pages))
            if chunk_size > 1:
                # Merge consecutive pages into larger chunks, skipping blanks.
                chunks = []
                for i in range(0, len(texts), chunk_size):
                    chunk = ' '.join([t for t in texts[i:i+chunk_size] if t])
                    if chunk:
                        chunks.append(chunk)
                return chunks if chunks else ["No data found in this document (empty PDF)"]
            return [t for t in texts if t] or ["No data found in this document (empty PDF)"]
    except Exception as e:
        print(f"❌ Failed to parse as PDF: {str(e)}")
        return ["No data found in this document (not PDF or corrupted)"]
66
+
67
def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
    """
    Parse a local PDF file, extract text in parallel, optionally chunk pages.

    Returns a non-empty list of page texts (or merged page chunks when
    chunk_size > 1); a single-element fallback message on failure.
    """
    try:
        with fitz.open(file_path) as doc:
            page_list = list(doc)
            with ThreadPoolExecutor(max_workers=max_workers) as pool:
                page_texts = list(pool.map(_extract_text, page_list))
            if chunk_size > 1:
                # Merge consecutive pages, dropping blank ones.
                grouped = []
                for start in range(0, len(page_texts), chunk_size):
                    merged = ' '.join(t for t in page_texts[start:start + chunk_size] if t)
                    if merged:
                        grouped.append(merged)
                return grouped if grouped else ["No data found in this document (local PDF empty)"]
            non_empty = [t for t in page_texts if t]
            return non_empty or ["No data found in this document (local PDF empty)"]
    except Exception as e:
        print(f"❌ Failed to open local file: {str(e)}")
        return [f"No data found in this document (local file error)"]
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ requests
4
+ faiss-cpu
5
+ sentence-transformers
6
+ PyMuPDF
7
+ python-dotenv
8
+ tf-keras
9
+ google-generativeai
10
+ pytesseract
11
+ Pillow
12
+
retriever.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers.util import cos_sim
2
+ from embedder import get_model
3
+ import numpy as np
4
+
5
def retrieve_chunks(index, texts, question, top_k=15):
    """Return up to `top_k` chunks from `texts` most similar to `question`.

    Encodes the question with the shared sentence-transformer model and
    searches the FAISS index built over `texts`.
    """
    model = get_model()
    q_embedding = model.encode([question], convert_to_numpy=True, normalize_embeddings=True)[0]

    scores, indices = index.search(np.array([q_embedding]), top_k)
    # FAISS pads the result with -1 ids when the index holds fewer than
    # top_k vectors; texts[-1] would silently duplicate the last chunk,
    # so drop the padding entries.
    return [texts[i] for i in indices[0] if i >= 0]