rohannsinghal commited on
Commit
774aab5
·
0 Parent(s):

Initial commit of hackrx6.0 RAG System

Browse files
Files changed (5) hide show
  1. .gitignore +25 -0
  2. app/main_api.py +188 -0
  3. app/parser.py +457 -0
  4. requirements.txt +13 -0
  5. run.py +16 -0
.gitignore ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ *build/
7
+ *dist/
8
+
9
+ # Virtual Environments
10
+ venv/
11
+ hackrxenv/
12
+ unstructuredenv/
13
+
14
+ # Secrets - VERY IMPORTANT
15
+ .env
16
+
17
+ # Local data
18
+ app/documents/
19
+ app/chroma_db/
20
+ _fast_parsed_output.json
21
+
22
+ # IDE and OS files
23
+ .vscode/
24
+ .idea/
25
+ .DS_Store
app/main_api.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/main_api.py
2
+
3
+ import os
4
+ import json
5
+ import uuid
6
+ from typing import List, Dict, Any, Optional
7
+ import logging
8
+ import asyncio
9
+ from itertools import cycle
10
+
11
+ # FastAPI and core dependencies
12
+ from fastapi import FastAPI, Body, HTTPException
13
+ from fastapi.middleware.cors import CORSMiddleware
14
+ from pydantic import BaseModel, Field
15
+
16
+ # Embeddings and Vector DB
17
+ from sentence_transformers import SentenceTransformer
18
+ import chromadb
19
+
20
+ # LLM Integration
21
+ import groq
22
+
23
+ # Direct import from our local parser module
24
+ from .parser import FastDocumentParserService
25
+
26
+ # HTTP Client for downloading documents
27
+ import httpx
28
+
29
+ # NEW: Library to load environment variables from .env file
30
+ from dotenv import load_dotenv
31
+
32
+ # Setup
33
+ load_dotenv() # Load environment variables from .env file
34
+ logging.basicConfig(level=logging.INFO)
35
+ logger = logging.getLogger(__name__)
36
+
37
+ app = FastAPI(title="HackRx 6.0 RAG System", version="FINAL")
38
+
39
+ # CORS
40
+ app.add_middleware(
41
+ CORSMiddleware,
42
+ allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"],
43
+ )
44
+
45
+ # --- CONFIGURATION & INITIALIZATION ---
46
+ # API Key Rotation Setup
47
+ GROQ_API_KEYS = os.getenv("GROQ_API_KEYS", "").split(',')
48
+ if not all(GROQ_API_KEYS):
49
+ logger.warning("GROQ_API_KEYS not found in .env file. Using placeholder.")
50
+ # Provide a fallback or raise an error if keys are essential
51
+ GROQ_API_KEYS = ["gsk_YourDefaultKeyHere"]
52
+
53
+ api_key_cycler = cycle(GROQ_API_KEYS)
54
+
55
+ def get_next_api_key():
56
+ return next(api_key_cycler)
57
+
58
+ EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"
59
+ CHROMA_PERSIST_DIR = "./app/chroma_db"
60
+ UPLOAD_DIR = "./app/documents"
61
+
62
+ try:
63
+ embedding_model = SentenceTransformer(EMBEDDING_MODEL)
64
+ chroma_client = chromadb.PersistentClient(path=CHROMA_PERSIST_DIR)
65
+ # The client's API key will be updated per-request
66
+ groq_client = groq.Groq(api_key=get_next_api_key())
67
+ parsing_service = FastDocumentParserService()
68
+ except Exception as e:
69
+ logger.error(f"FATAL: Could not initialize models. Error: {e}")
70
+
71
+ # Pydantic Models for Hackathon
72
+ class SubmissionRequest(BaseModel):
73
+ documents: List[str]
74
+ questions: List[str]
75
+
76
+ class Answer(BaseModel):
77
+ question: str
78
+ answer: str
79
+
80
+ class SubmissionResponse(BaseModel):
81
+ answers: List[Answer]
82
+
83
+ # RAG Pipeline Class
84
+ class RAGPipeline:
85
+ def __init__(self, collection_name: str):
86
+ self.collection_name = collection_name
87
+ self.collection = chroma_client.get_or_create_collection(name=self.collection_name)
88
+
89
+ def add_documents(self, chunks: List[Dict]):
90
+ if not chunks: return
91
+ self.collection.add(
92
+ embeddings=embedding_model.encode([c["content"] for c in chunks], show_progress_bar=True).tolist(),
93
+ documents=[c["content"] for c in chunks],
94
+ metadatas=[c["metadata"] for c in chunks],
95
+ ids=[c["chunk_id"] for c in chunks]
96
+ )
97
+
98
+ def query_documents(self, query: str, n_results: int = 5) -> List[Dict]:
99
+ if not self.collection.count(): return []
100
+ results = self.collection.query(
101
+ query_embeddings=embedding_model.encode([query]).tolist(),
102
+ n_results=min(n_results, self.collection.count()),
103
+ include=["documents", "metadatas"]
104
+ )
105
+ return [{"content": doc, "metadata": meta} for doc, meta in zip(results["documents"][0], results["metadatas"][0])]
106
+
107
+ async def generate_answer(self, query: str, context_docs: List[Dict]) -> str:
108
+ context = "\n\n".join([f"--- REFERENCE TEXT ---\n{doc['content']}" for doc in context_docs])
109
+ system_prompt = "You are an expert AI assistant. Your task is to answer the user's question based *only* on the provided reference text. Do not use any outside knowledge. If the answer is not contained within the text, you must state 'The answer could not be found in the provided document.' Be concise and directly answer the question."
110
+ user_prompt = f"REFERENCE TEXT:\n{context}\n\nQUESTION: {query}"
111
+
112
+ try:
113
+ # --- API KEY ROTATION ---
114
+ groq_client.api_key = get_next_api_key()
115
+ logger.info(f"Using Groq API key ending in ...{groq_client.api_key[-4:]}")
116
+
117
+ response = await asyncio.to_thread(
118
+ groq_client.chat.completions.create,
119
+ model="llama3-8b-8192",
120
+ messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
121
+ temperature=0.0,
122
+ max_tokens=300
123
+ )
124
+ return response.choices[0].message.content.strip()
125
+ except Exception as e:
126
+ logger.error(f"Groq API call failed: {e}")
127
+ return "Error: Could not generate an answer from the language model."
128
+
129
+ # --- Main Hackathon Endpoint ---
130
+ @app.post("/hackrx/run", response_model=SubmissionResponse)
131
+ async def run_submission(request: SubmissionRequest = Body(...)):
132
+
133
+ # 1. Cleanup and Setup
134
+ try:
135
+ for collection in chroma_client.list_collections():
136
+ if collection.name.startswith("hackrx_session_"):
137
+ chroma_client.delete_collection(name=collection.name)
138
+ except Exception as e:
139
+ logger.warning(f"Could not clean up old collections: {e}")
140
+
141
+ session_collection_name = f"hackrx_session_{uuid.uuid4().hex}"
142
+ rag_pipeline = RAGPipeline(collection_name=session_collection_name)
143
+
144
+ # 2. Download and Process Documents
145
+ all_chunks = []
146
+ async with httpx.AsyncClient(timeout=120.0) as client:
147
+ for doc_url in request.documents:
148
+ try:
149
+ logger.info(f"Downloading document from: {doc_url}")
150
+ response = await client.get(doc_url, follow_redirects=True)
151
+ response.raise_for_status()
152
+
153
+ file_name = os.path.basename(doc_url.split('?')[0])
154
+ temp_file_path = os.path.join(UPLOAD_DIR, f"temp_{uuid.uuid4()}_{file_name}")
155
+ os.makedirs(UPLOAD_DIR, exist_ok=True)
156
+ with open(temp_file_path, "wb") as f:
157
+ f.write(response.content)
158
+
159
+ # Your proven parsing logic
160
+ chunks = parsing_service.process_pdf_ultrafast(temp_file_path)
161
+ all_chunks.extend(chunks)
162
+ os.remove(temp_file_path)
163
+
164
+ except Exception as e:
165
+ logger.error(f"Failed to process document at {doc_url}: {e}")
166
+ continue
167
+
168
+ if not all_chunks:
169
+ failed_answers = [Answer(question=q, answer="A valid document could not be processed, so an answer could not be found.") for q in request.questions]
170
+ return SubmissionResponse(answers=failed_answers)
171
+
172
+ # 3. Add to Vector DB
173
+ rag_pipeline.add_documents(all_chunks)
174
+
175
+ # 4. Asynchronously answer all questions
176
+ async def answer_question(question: str):
177
+ relevant_docs = rag_pipeline.query_documents(question)
178
+ answer_text = await rag_pipeline.generate_answer(question, relevant_docs)
179
+ return Answer(question=question, answer=answer_text)
180
+
181
+ tasks = [answer_question(q) for q in request.questions]
182
+ answers = await asyncio.gather(*tasks)
183
+
184
+ return SubmissionResponse(answers=answers)
185
+
186
+ @app.get("/")
187
+ def read_root():
188
+ return {"message": "HackRx 6.0 RAG System is running. See /docs for API details."}
app/parser.py ADDED
@@ -0,0 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FAST Document Parser - Optimized for Speed and Large Documents
2
+
3
+ import os
4
+ import json
5
+ import uuid
6
+ import logging
7
+ import uvicorn
8
+ import gc
9
+ from typing import List, Dict, Any, Optional
10
+ from pathlib import Path
11
+ from fastapi import FastAPI, UploadFile, File, HTTPException
12
+
13
+ # Minimal dependencies for speed
14
+ import fitz # PyMuPDF - faster than Unstructured
15
+ import pdfplumber # Only for tables
16
+ import mammoth
17
+ import email
18
+ import email.policy
19
+ from bs4 import BeautifulSoup
20
+
21
+ # Setup logging
22
+ logging.basicConfig(level=logging.INFO)
23
+ logger = logging.getLogger(__name__)
24
+
25
+ class DocumentChunk:
26
+ """Simple data class for document chunks"""
27
+ def __init__(self, content: str, metadata: Dict[str, Any], chunk_id: str):
28
+ self.content = content
29
+ self.metadata = metadata
30
+ self.chunk_id = chunk_id
31
+
32
+ def to_dict(self):
33
+ return {
34
+ "content": self.content,
35
+ "metadata": self.metadata,
36
+ "chunk_id": self.chunk_id
37
+ }
38
+
39
+ class FastDocumentParserService:
40
+ """Ultra-fast document parsing service"""
41
+
42
+ def __init__(self):
43
+ self.chunk_size = 2000 # Larger chunks = fewer chunks
44
+ self.chunk_overlap = 200 # Minimal overlap
45
+ self.max_chunks = 500 # Hard limit on total chunks
46
+ self.table_row_limit = 20 # Max rows per table
47
+ logger.info("FastDocumentParserService initialized with speed optimizations")
48
+
49
+ def fast_text_split(self, text: str, source: str) -> List[str]:
50
+ """Super fast text splitting with hard limits"""
51
+ if not text or len(text) < 100:
52
+ return [text] if text else []
53
+
54
+ # If text is small enough, return as single chunk
55
+ if len(text) <= self.chunk_size:
56
+ return [text]
57
+
58
+ chunks = []
59
+ start = 0
60
+ chunk_count = 0
61
+
62
+ while start < len(text) and chunk_count < self.max_chunks:
63
+ end = min(start + self.chunk_size, len(text))
64
+
65
+ # Quick sentence boundary check (no complex searching)
66
+ if end < len(text):
67
+ # Look back max 200 chars for period
68
+ search_start = max(start, end - 200)
69
+ period_pos = text.rfind('.', search_start, end)
70
+ if period_pos > search_start:
71
+ end = period_pos + 1
72
+
73
+ chunk = text[start:end].strip()
74
+ if chunk:
75
+ chunks.append(chunk)
76
+ chunk_count += 1
77
+
78
+ start = end - self.chunk_overlap
79
+
80
+ # Safety break for infinite loops
81
+ if start <= 0:
82
+ start = end
83
+
84
+ logger.info(f"Split {source} into {len(chunks)} chunks (limit: {self.max_chunks})")
85
+ return chunks[:self.max_chunks] # Hard limit
86
+
87
+ def extract_tables_fast(self, file_path: str) -> str:
88
+ """Fast table extraction with smart limits"""
89
+ table_text = ""
90
+ table_count = 0
91
+ max_tables = 25 # Increased for better coverage
92
+
93
+ try:
94
+ with pdfplumber.open(file_path) as pdf:
95
+ total_pages = len(pdf.pages)
96
+
97
+ # Better sampling strategy
98
+ if total_pages <= 20:
99
+ step = 1 # Process ALL pages for small docs
100
+ elif total_pages <= 40:
101
+ step = 2 # Process every 2nd page for medium docs
102
+ else:
103
+ step = 3 # Process every 3rd page for large docs
104
+
105
+ pages_to_process = list(range(0, min(total_pages, 50), step)) # Increased to 50 pages max
106
+
107
+ logger.info(f"📊 Smart table scan: processing {len(pages_to_process)} of {total_pages} pages (step={step})")
108
+
109
+ for page_num in pages_to_process:
110
+ if table_count >= max_tables:
111
+ break
112
+
113
+ page = pdf.pages[page_num]
114
+ tables = page.find_tables()
115
+
116
+ for table_idx, table in enumerate(tables):
117
+ if table_count >= max_tables:
118
+ break
119
+
120
+ try:
121
+ table_data = table.extract()
122
+ if table_data and len(table_data) >= 2:
123
+ # Better table processing
124
+ limited_data = table_data[:min(30, len(table_data))] # Up to 30 rows
125
+
126
+ # Smart markdown conversion with better formatting
127
+ if len(limited_data[0]) <= 6: # Reasonable number of columns
128
+ header = " | ".join(str(cell or "").strip()[:60] for cell in limited_data[0]) # 60 chars per cell
129
+ separator = " | ".join(["---"] * len(limited_data[0]))
130
+
131
+ rows = []
132
+ for row in limited_data[1:]:
133
+ # Pad row to match header length
134
+ padded_row = list(row) + [None] * (len(limited_data[0]) - len(row))
135
+ row_str = " | ".join(str(cell or "").strip()[:60] for cell in padded_row)
136
+ rows.append(row_str)
137
+
138
+ table_md = f"\n**TABLE {table_count + 1} - Page {page_num + 1}**\n"
139
+ table_md += f"*{len(limited_data)} rows × {len(limited_data[0])} columns*\n\n"
140
+ table_md += f"| {header} |\n| {separator} |\n"
141
+ for row in rows:
142
+ table_md += f"| {row} |\n"
143
+ table_md += "\n"
144
+
145
+ table_text += table_md
146
+ table_count += 1
147
+ logger.info(f"⚡ Table {table_count}: {len(limited_data)}×{len(limited_data[0])} from page {page_num + 1}")
148
+ else:
149
+ logger.info(f"⚠️ Skipped wide table ({len(limited_data[0])} cols) on page {page_num + 1}")
150
+
151
+ except Exception as e:
152
+ logger.warning(f"⚠️ Skip table on page {page_num + 1}: {e}")
153
+
154
+ logger.info(f"🎯 Extracted {table_count} tables in fast mode")
155
+
156
+ except Exception as e:
157
+ logger.error(f"❌ Fast table extraction failed: {e}")
158
+
159
+ return table_text
160
+
161
+ def process_pdf_ultrafast(self, file_path: str) -> List[DocumentChunk]:
162
+ """Ultra-fast PDF processing - under 1 minute target"""
163
+ logger.info(f"🚀 ULTRA-FAST PDF processing: {os.path.basename(file_path)}")
164
+ start_time = __import__('time').time()
165
+
166
+ chunks = []
167
+
168
+ try:
169
+ # STEP 1: Fast table extraction (parallel to text extraction)
170
+ logger.info("📊 Fast table extraction...")
171
+ table_content = self.extract_tables_fast(file_path)
172
+
173
+ # STEP 2: Fast text extraction with PyMuPDF
174
+ logger.info("📄 Fast text extraction with PyMuPDF...")
175
+ doc = fitz.open(file_path)
176
+
177
+ full_text = ""
178
+ total_pages = len(doc)
179
+
180
+ # Process pages in chunks for large documents
181
+ if total_pages > 40:
182
+ # For very large docs, process every 2nd page
183
+ pages_to_process = list(range(0, min(total_pages, 60), 2))
184
+ logger.info(f"📑 Large document: processing {len(pages_to_process)} of {total_pages} pages")
185
+ else:
186
+ pages_to_process = list(range(total_pages))
187
+
188
+ for page_num in pages_to_process:
189
+ try:
190
+ page = doc[page_num]
191
+ page_text = page.get_text()
192
+
193
+ # Clean and limit page text
194
+ page_text = page_text.strip()
195
+ if len(page_text) > 10000: # Limit page size
196
+ page_text = page_text[:10000] + f"\n[Page {page_num + 1} truncated for speed]"
197
+
198
+ full_text += f"\n\n--- Page {page_num + 1} ---\n{page_text}"
199
+
200
+ except Exception as e:
201
+ logger.warning(f"⚠️ Error processing page {page_num + 1}: {e}")
202
+
203
+ doc.close()
204
+
205
+ # STEP 3: Append tables at the end
206
+ if table_content:
207
+ full_text += f"\n\n{'='*50}\nEXTRACTED TABLES\n{'='*50}\n{table_content}"
208
+
209
+ # STEP 4: Fast chunking with hard limits
210
+ logger.info("📦 Creating chunks...")
211
+ text_chunks = self.fast_text_split(full_text, os.path.basename(file_path))
212
+
213
+ # STEP 5: Create DocumentChunk objects
214
+ for idx, chunk_text in enumerate(text_chunks):
215
+ has_tables = "**TABLE" in chunk_text or "EXTRACTED TABLES" in chunk_text
216
+
217
+ chunks.append(DocumentChunk(
218
+ content=chunk_text,
219
+ metadata={
220
+ "source": os.path.basename(file_path),
221
+ "chunk_index": idx,
222
+ "document_type": "pdf_ultrafast",
223
+ "has_tables": has_tables,
224
+ "total_pages": total_pages,
225
+ "pages_processed": len(pages_to_process),
226
+ "processing_method": "ultrafast_pymupdf"
227
+ },
228
+ chunk_id=str(uuid.uuid4())
229
+ ))
230
+
231
+ elapsed = __import__('time').time() - start_time
232
+ logger.info(f"✅ ULTRA-FAST processing complete in {elapsed:.2f}s: {len(chunks)} chunks")
233
+
234
+ if elapsed > 90: # 1.5 minutes
235
+ logger.warning(f"⚠️ Processing took {elapsed:.2f}s - consider reducing document size")
236
+
237
+ return chunks
238
+
239
+ except Exception as e:
240
+ logger.error(f"❌ Ultra-fast processing failed: {e}")
241
+ return self._emergency_fallback(file_path)
242
+
243
+ def _emergency_fallback(self, file_path: str) -> List[DocumentChunk]:
244
+ """Emergency fallback - text only, no tables"""
245
+ logger.info("🆘 Emergency fallback: text-only extraction")
246
+
247
+ try:
248
+ doc = fitz.open(file_path)
249
+
250
+ # Process only first 10 pages
251
+ max_pages = min(10, len(doc))
252
+ text_parts = []
253
+
254
+ for page_num in range(max_pages):
255
+ page = doc[page_num]
256
+ page_text = page.get_text()
257
+ if len(page_text) > 5000:
258
+ page_text = page_text[:5000] + f"\n[Page {page_num + 1} truncated]"
259
+ text_parts.append(f"Page {page_num + 1}:\n{page_text}")
260
+
261
+ doc.close()
262
+
263
+ full_text = "\n\n".join(text_parts)
264
+ chunks = []
265
+
266
+ # Create max 10 chunks
267
+ chunk_size = len(full_text) // 10 + 1
268
+ for i in range(0, len(full_text), chunk_size):
269
+ chunk_text = full_text[i:i + chunk_size]
270
+ chunks.append(DocumentChunk(
271
+ content=chunk_text,
272
+ metadata={
273
+ "source": os.path.basename(file_path),
274
+ "chunk_index": len(chunks),
275
+ "document_type": "pdf_emergency_fallback",
276
+ "has_tables": False,
277
+ "pages_processed": max_pages
278
+ },
279
+ chunk_id=str(uuid.uuid4())
280
+ ))
281
+
282
+ return chunks
283
+
284
+ except Exception as e:
285
+ logger.error(f"Emergency fallback failed: {e}")
286
+ raise Exception("All processing methods failed")
287
+
288
+ def process_word_doc_fast(self, file_path: str) -> List[DocumentChunk]:
289
+ """Fast Word document processing"""
290
+ chunks = []
291
+
292
+ try:
293
+ with open(file_path, "rb") as docx_file:
294
+ result = mammoth.convert_to_html(docx_file)
295
+ soup = BeautifulSoup(result.html, 'html.parser')
296
+
297
+ # Quick table conversion
298
+ tables = soup.find_all('table')
299
+ for idx, table in enumerate(tables[:10]): # Max 10 tables
300
+ rows = table.find_all('tr')[:15] # Max 15 rows per table
301
+ table_md = f"\n**TABLE {idx + 1}**\n"
302
+
303
+ for row in rows:
304
+ cells = [cell.get_text(strip=True)[:30] for cell in row.find_all(['td', 'th'])]
305
+ table_md += "| " + " | ".join(cells) + " |\n"
306
+
307
+ table.replace_with(table_md)
308
+
309
+ text_content = soup.get_text()
310
+ text_chunks = self.fast_text_split(text_content, os.path.basename(file_path))
311
+
312
+ for idx, chunk in enumerate(text_chunks):
313
+ chunks.append(DocumentChunk(
314
+ content=chunk,
315
+ metadata={
316
+ "source": os.path.basename(file_path),
317
+ "chunk_index": idx,
318
+ "document_type": "docx_fast",
319
+ "has_tables": "**TABLE" in chunk
320
+ },
321
+ chunk_id=str(uuid.uuid4())
322
+ ))
323
+
324
+ except Exception as e:
325
+ logger.error(f"Fast Word processing failed: {e}")
326
+ raise Exception(f"Word processing failed: {e}")
327
+
328
+ return chunks
329
+
330
+ def process_email_fast(self, file_path: str) -> List[DocumentChunk]:
331
+ """Fast email processing"""
332
+ chunks = []
333
+
334
+ try:
335
+ with open(file_path, 'rb') as email_file:
336
+ msg = email.message_from_bytes(email_file.read(), policy=email.policy.default)
337
+
338
+ subject = msg.get('Subject', 'No Subject')
339
+ sender = msg.get('From', 'Unknown Sender')
340
+ date = msg.get('Date', 'Unknown Date')
341
+
342
+ # Get body content quickly
343
+ body_content = ""
344
+ if msg.is_multipart():
345
+ for part in msg.walk():
346
+ if part.get_content_type() == "text/plain":
347
+ content = part.get_content()[:5000] # Limit size
348
+ body_content += content
349
+ break # Take first text part only
350
+ else:
351
+ body_content = msg.get_content()[:5000]
352
+
353
+ email_content = f"EMAIL: {subject}\nFrom: {sender}\nDate: {date}\n\n{body_content}"
354
+ text_chunks = self.fast_text_split(email_content, os.path.basename(file_path))
355
+
356
+ for idx, chunk in enumerate(text_chunks):
357
+ chunks.append(DocumentChunk(
358
+ content=chunk,
359
+ metadata={
360
+ "source": os.path.basename(file_path),
361
+ "chunk_index": idx,
362
+ "document_type": "email_fast",
363
+ "subject": subject
364
+ },
365
+ chunk_id=str(uuid.uuid4())
366
+ ))
367
+
368
+ except Exception as e:
369
+ logger.error(f"Fast email processing failed: {e}")
370
+ raise Exception(f"Email processing failed: {e}")
371
+
372
+ return chunks
373
+
374
+
375
+ # Create the fast parser service
376
+ parser_service = FastDocumentParserService()
377
+
378
+ # FastAPI app
379
+ app = FastAPI(title="Ultra-Fast Document Parser", version="3.0.0")
380
+
381
+ @app.get("/health")
382
+ async def health_check():
383
+ return {"status": "healthy", "message": "Ultra-fast document parser running"}
384
+
385
+ @app.post("/parse")
386
+ async def parse_file(file: UploadFile = File(...)):
387
+ """Ultra-fast file parsing - target < 60 seconds"""
388
+ temp_file_path = None
389
+ start_time = __import__('time').time()
390
+
391
+ try:
392
+ gc.collect() # Clean start
393
+
394
+ temp_file_path = f"./temp_{uuid.uuid4()}_{file.filename}"
395
+
396
+ # Fast file write
397
+ with open(temp_file_path, "wb") as buffer:
398
+ content = await file.read()
399
+ buffer.write(content)
400
+
401
+ file_extension = Path(file.filename).suffix.lower()
402
+ logger.info(f"⚡ FAST processing: {file.filename} ({file_extension})")
403
+
404
+ # Route to appropriate fast processor
405
+ if file_extension == '.pdf':
406
+ chunks = parser_service.process_pdf_ultrafast(temp_file_path)
407
+ elif file_extension in ['.docx', '.doc']:
408
+ chunks = parser_service.process_word_doc_fast(temp_file_path)
409
+ elif file_extension in ['.eml', '.msg']:
410
+ chunks = parser_service.process_email_fast(temp_file_path)
411
+ else:
412
+ raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")
413
+
414
+ # Convert to response format
415
+ chunk_dicts = [chunk.to_dict() for chunk in chunks]
416
+
417
+ elapsed = __import__('time').time() - start_time
418
+
419
+ # Save minimal debug info
420
+ try:
421
+ with open("./_fast_parsed_output.json", "w") as f:
422
+ json.dump({
423
+ "filename": file.filename,
424
+ "total_chunks": len(chunks),
425
+ "processing_time_seconds": elapsed,
426
+ "first_chunk_preview": chunks[0].content[:200] if chunks else "No chunks"
427
+ }, f, indent=2)
428
+ except:
429
+ pass
430
+
431
+ logger.info(f"🎯 COMPLETED {file.filename} in {elapsed:.2f}s: {len(chunks)} chunks")
432
+
433
+ return {
434
+ "filename": file.filename,
435
+ "status": "success",
436
+ "chunks": chunk_dicts,
437
+ "total_chunks": len(chunks),
438
+ "processing_time_seconds": round(elapsed, 2),
439
+ "processing_method": "ultrafast"
440
+ }
441
+
442
+ except Exception as e:
443
+ elapsed = __import__('time').time() - start_time
444
+ logger.error(f"❌ Processing failed after {elapsed:.2f}s: {e}")
445
+ raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
446
+
447
+ finally:
448
+ if temp_file_path and os.path.exists(temp_file_path):
449
+ try:
450
+ os.remove(temp_file_path)
451
+ except:
452
+ pass
453
+ gc.collect()
454
+
455
+ if __name__ == "__main__":
456
+ logger.info("🚀 Starting Ultra-Fast Document Parser...")
457
+ uvicorn.run(app, host="0.0.0.0", port=8001, log_level="info")
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ pydantic
4
+ sentence-transformers
5
+ chromadb
6
+ groq
7
+ langchain
8
+ PyMuPDF # Corrected from 'fitz'
9
+ pdfplumber
10
+ mammoth # Corrected from 'mammoth-convert'
11
+ beautifulsoup4
12
+ httpx
13
+ python-multipart
14
+ python-dotenv  # required by app/main_api.py (load_dotenv)
run.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # run.py
2
+
3
+ import uvicorn
4
+ import sys
5
+ import os
6
+
7
+ # This line is crucial for running from the top-level directory
8
+ # It tells Python to look for modules inside the 'app' folder
9
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'app')))
10
+
11
+ from main_api import app
12
+
13
+ if __name__ == "__main__":
14
+ print("🚀 Starting HackRx 6.0 RAG Server...")
15
+ print("API Documentation available at http://127.0.0.1:8000/docs")
16
+ uvicorn.run("main_api:app", host="0.0.0.0", port=8000, reload=True, app_dir="app")