Spaces:
Sleeping
Sleeping
Commit ·
774aab5
0
Parent(s):
Initial commit of hackrx6.0 RAG System
Browse files- .gitignore +25 -0
- app/main_api.py +188 -0
- app/parser.py +457 -0
- requirements.txt +13 -0
- run.py +16 -0
.gitignore
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
*.pyo
|
| 5 |
+
*.pyd
|
| 6 |
+
*build/
|
| 7 |
+
*dist/
|
| 8 |
+
|
| 9 |
+
# Virtual Environments
|
| 10 |
+
venv/
|
| 11 |
+
hackrxenv/
|
| 12 |
+
unstructuredenv/
|
| 13 |
+
|
| 14 |
+
# Secrets - VERY IMPORTANT
|
| 15 |
+
.env
|
| 16 |
+
|
| 17 |
+
# Local data
|
| 18 |
+
app/documents/
|
| 19 |
+
app/chroma_db/
|
| 20 |
+
_fast_parsed_output.json
|
| 21 |
+
|
| 22 |
+
# IDE and OS files
|
| 23 |
+
.vscode/
|
| 24 |
+
.idea/
|
| 25 |
+
.DS_Store
|
app/main_api.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app/main_api.py
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import json
|
| 5 |
+
import uuid
|
| 6 |
+
from typing import List, Dict, Any, Optional
|
| 7 |
+
import logging
|
| 8 |
+
import asyncio
|
| 9 |
+
from itertools import cycle
|
| 10 |
+
|
| 11 |
+
# FastAPI and core dependencies
|
| 12 |
+
from fastapi import FastAPI, Body, HTTPException
|
| 13 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 14 |
+
from pydantic import BaseModel, Field
|
| 15 |
+
|
| 16 |
+
# Embeddings and Vector DB
|
| 17 |
+
from sentence_transformers import SentenceTransformer
|
| 18 |
+
import chromadb
|
| 19 |
+
|
| 20 |
+
# LLM Integration
|
| 21 |
+
import groq
|
| 22 |
+
|
| 23 |
+
# Direct import from our local parser module
|
| 24 |
+
from .parser import FastDocumentParserService
|
| 25 |
+
|
| 26 |
+
# HTTP Client for downloading documents
|
| 27 |
+
import httpx
|
| 28 |
+
|
| 29 |
+
# NEW: Library to load environment variables from .env file
|
| 30 |
+
from dotenv import load_dotenv
|
| 31 |
+
|
| 32 |
+
# Setup
load_dotenv()  # Load environment variables from .env file (GROQ_API_KEYS, etc.)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="HackRx 6.0 RAG System", version="FINAL")

# CORS
# NOTE(review): wide-open CORS ("*" origins together with credentials) is
# acceptable for a hackathon demo but should be narrowed before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"],
)
|
| 44 |
+
|
| 45 |
+
# --- CONFIGURATION & INITIALIZATION ---
# API Key Rotation Setup.
# GROQ_API_KEYS is a comma-separated list in the environment. Strip
# whitespace and drop empty entries: the previous plain split(',') kept
# padded keys (" k2") and blank keys from trailing commas, and "".split(',')
# yields [""] rather than an empty list.
GROQ_API_KEYS = [key.strip() for key in os.getenv("GROQ_API_KEYS", "").split(',') if key.strip()]
if not GROQ_API_KEYS:
    logger.warning("GROQ_API_KEYS not found in .env file. Using placeholder.")
    # Provide a fallback or raise an error if keys are essential
    GROQ_API_KEYS = ["gsk_YourDefaultKeyHere"]

# Infinite round-robin iterator over the configured keys.
api_key_cycler = cycle(GROQ_API_KEYS)

def get_next_api_key():
    """Return the next Groq API key in round-robin order."""
    return next(api_key_cycler)

EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"
CHROMA_PERSIST_DIR = "./app/chroma_db"
UPLOAD_DIR = "./app/documents"
|
| 61 |
+
|
| 62 |
+
# Initialize the heavyweight singletons once at import time.
try:
    embedding_model = SentenceTransformer(EMBEDDING_MODEL)
    chroma_client = chromadb.PersistentClient(path=CHROMA_PERSIST_DIR)
    # The client's API key will be updated per-request
    groq_client = groq.Groq(api_key=get_next_api_key())
    parsing_service = FastDocumentParserService()
except Exception as e:
    logger.error(f"FATAL: Could not initialize models. Error: {e}")
    # BUG FIX: previously the error was only logged, leaving these names
    # undefined and producing confusing NameErrors on the first request.
    # Fail fast at startup instead.
    raise
|
| 70 |
+
|
| 71 |
+
# Pydantic Models for Hackathon
class SubmissionRequest(BaseModel):
    """Request body for /hackrx/run: document URLs plus the questions to answer."""
    # URLs of documents to download and index for this session.
    documents: List[str]
    # Natural-language questions to answer against those documents.
    questions: List[str]

class Answer(BaseModel):
    """One question together with the generated answer."""
    question: str
    answer: str

class SubmissionResponse(BaseModel):
    """Response body: one Answer per submitted question, in order."""
    answers: List[Answer]
|
| 82 |
+
|
| 83 |
+
# RAG Pipeline Class
class RAGPipeline:
    """Retrieval-augmented-generation pipeline over one Chroma collection.

    Embeds chunks with the module-level SentenceTransformer, stores and
    queries them in ChromaDB, and asks a Groq-hosted LLM to answer strictly
    from the retrieved context.
    """

    def __init__(self, collection_name: str):
        self.collection_name = collection_name
        self.collection = chroma_client.get_or_create_collection(name=self.collection_name)

    def add_documents(self, chunks: List[Dict]):
        """Embed and store parsed chunks (dicts with content/metadata/chunk_id keys)."""
        if not chunks:
            return
        # Extract the texts once instead of building the same list twice.
        contents = [c["content"] for c in chunks]
        self.collection.add(
            embeddings=embedding_model.encode(contents, show_progress_bar=True).tolist(),
            documents=contents,
            metadatas=[c["metadata"] for c in chunks],
            ids=[c["chunk_id"] for c in chunks],
        )

    def query_documents(self, query: str, n_results: int = 5) -> List[Dict]:
        """Return up to n_results stored chunks most similar to `query`."""
        # Hoisted: count() was previously called twice per query.
        count = self.collection.count()
        if not count:
            return []
        results = self.collection.query(
            query_embeddings=embedding_model.encode([query]).tolist(),
            n_results=min(n_results, count),
            include=["documents", "metadatas"],
        )
        return [
            {"content": doc, "metadata": meta}
            for doc, meta in zip(results["documents"][0], results["metadatas"][0])
        ]

    async def generate_answer(self, query: str, context_docs: List[Dict]) -> str:
        """Answer `query` using only `context_docs`; returns an error string on failure."""
        context = "\n\n".join([f"--- REFERENCE TEXT ---\n{doc['content']}" for doc in context_docs])
        system_prompt = "You are an expert AI assistant. Your task is to answer the user's question based *only* on the provided reference text. Do not use any outside knowledge. If the answer is not contained within the text, you must state 'The answer could not be found in the provided document.' Be concise and directly answer the question."
        user_prompt = f"REFERENCE TEXT:\n{context}\n\nQUESTION: {query}"

        try:
            # --- API KEY ROTATION ---
            # BUG FIX: the old code mutated the shared module-level
            # groq_client.api_key, but questions are answered concurrently
            # (asyncio.gather in run_submission), so parallel calls raced on
            # that attribute. Build a lightweight per-call client instead.
            api_key = get_next_api_key()
            client = groq.Groq(api_key=api_key)
            logger.info(f"Using Groq API key ending in ...{api_key[-4:]}")

            # The Groq SDK call is blocking; run it off the event loop.
            response = await asyncio.to_thread(
                client.chat.completions.create,
                model="llama3-8b-8192",
                messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
                temperature=0.0,
                max_tokens=300
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            logger.error(f"Groq API call failed: {e}")
            return "Error: Could not generate an answer from the language model."
|
| 128 |
+
|
| 129 |
+
# --- Main Hackathon Endpoint ---
@app.post("/hackrx/run", response_model=SubmissionResponse)
async def run_submission(request: SubmissionRequest = Body(...)):
    """Download the requested documents, index them, and answer every question."""

    # 1. Cleanup and Setup: drop collections left behind by earlier sessions.
    try:
        for collection in chroma_client.list_collections():
            if collection.name.startswith("hackrx_session_"):
                chroma_client.delete_collection(name=collection.name)
    except Exception as e:
        logger.warning(f"Could not clean up old collections: {e}")

    session_collection_name = f"hackrx_session_{uuid.uuid4().hex}"
    rag_pipeline = RAGPipeline(collection_name=session_collection_name)

    # 2. Download and Process Documents
    all_chunks = []
    async with httpx.AsyncClient(timeout=120.0) as client:
        for doc_url in request.documents:
            temp_file_path = None
            try:
                logger.info(f"Downloading document from: {doc_url}")
                response = await client.get(doc_url, follow_redirects=True)
                response.raise_for_status()

                # Strip query strings so signed URLs still yield a file name.
                file_name = os.path.basename(doc_url.split('?')[0])
                temp_file_path = os.path.join(UPLOAD_DIR, f"temp_{uuid.uuid4()}_{file_name}")
                os.makedirs(UPLOAD_DIR, exist_ok=True)
                with open(temp_file_path, "wb") as f:
                    f.write(response.content)

                # BUG FIX: the parser returns DocumentChunk objects, but
                # RAGPipeline.add_documents subscripts chunks as dicts
                # (c["content"]), which raised TypeError. Convert here.
                chunks = parsing_service.process_pdf_ultrafast(temp_file_path)
                all_chunks.extend(c.to_dict() if hasattr(c, "to_dict") else c for c in chunks)

            except Exception as e:
                logger.error(f"Failed to process document at {doc_url}: {e}")
                continue
            finally:
                # BUG FIX: the temp file used to be removed only on the
                # success path and leaked whenever parsing raised.
                if temp_file_path and os.path.exists(temp_file_path):
                    try:
                        os.remove(temp_file_path)
                    except OSError:
                        pass

    if not all_chunks:
        failed_answers = [Answer(question=q, answer="A valid document could not be processed, so an answer could not be found.") for q in request.questions]
        return SubmissionResponse(answers=failed_answers)

    # 3. Add to Vector DB
    rag_pipeline.add_documents(all_chunks)

    # 4. Asynchronously answer all questions
    async def answer_question(question: str):
        relevant_docs = rag_pipeline.query_documents(question)
        answer_text = await rag_pipeline.generate_answer(question, relevant_docs)
        return Answer(question=question, answer=answer_text)

    tasks = [answer_question(q) for q in request.questions]
    answers = await asyncio.gather(*tasks)

    return SubmissionResponse(answers=answers)
|
| 185 |
+
|
| 186 |
+
@app.get("/")
def read_root():
    """Landing endpoint: confirms the service is up and points at /docs."""
    greeting = "HackRx 6.0 RAG System is running. See /docs for API details."
    return {"message": greeting}
|
app/parser.py
ADDED
|
@@ -0,0 +1,457 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FAST Document Parser - Optimized for Speed and Large Documents
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import json
|
| 5 |
+
import uuid
|
| 6 |
+
import logging
|
| 7 |
+
import uvicorn
|
| 8 |
+
import gc
|
| 9 |
+
from typing import List, Dict, Any, Optional
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from fastapi import FastAPI, UploadFile, File, HTTPException
|
| 12 |
+
|
| 13 |
+
# Minimal dependencies for speed
|
| 14 |
+
import fitz # PyMuPDF - faster than Unstructured
|
| 15 |
+
import pdfplumber # Only for tables
|
| 16 |
+
import mammoth
|
| 17 |
+
import email
|
| 18 |
+
import email.policy
|
| 19 |
+
from bs4 import BeautifulSoup
|
| 20 |
+
|
| 21 |
+
# Setup logging
|
| 22 |
+
logging.basicConfig(level=logging.INFO)
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
class DocumentChunk:
    """One parsed piece of a document: its text, metadata, and unique id."""

    def __init__(self, content: str, metadata: Dict[str, Any], chunk_id: str):
        self.content = content
        self.metadata = metadata
        self.chunk_id = chunk_id

    def to_dict(self):
        """Serialize to the plain-dict shape the API responses expect."""
        return {field: getattr(self, field) for field in ("content", "metadata", "chunk_id")}
|
| 38 |
+
|
| 39 |
+
class FastDocumentParserService:
    """Ultra-fast document parsing service.

    Trades completeness for speed: page/table sampling, hard chunk limits,
    and lightweight extractors (PyMuPDF for text, pdfplumber for tables,
    mammoth + BeautifulSoup for Word, stdlib email for .eml).
    """

    def __init__(self):
        self.chunk_size = 2000       # Larger chunks = fewer chunks
        self.chunk_overlap = 200     # Minimal overlap
        self.max_chunks = 500        # Hard limit on total chunks
        self.table_row_limit = 20    # Max rows per table
        logger.info("FastDocumentParserService initialized with speed optimizations")

    def fast_text_split(self, text: str, source: str) -> List[str]:
        """Split `text` into overlapping chunks of at most self.chunk_size chars.

        Prefers to end a chunk at a sentence boundary (last '.' within the
        final 200 chars of the window). Hard-capped at self.max_chunks.
        """
        if not text or len(text) < 100:
            return [text] if text else []

        # If text is small enough, return as single chunk
        if len(text) <= self.chunk_size:
            return [text]

        chunks = []
        start = 0
        chunk_count = 0

        while start < len(text) and chunk_count < self.max_chunks:
            end = min(start + self.chunk_size, len(text))

            # Quick sentence boundary check (no complex searching)
            if end < len(text):
                # Look back max 200 chars for a period
                search_start = max(start, end - 200)
                period_pos = text.rfind('.', search_start, end)
                if period_pos > search_start:
                    end = period_pos + 1

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)
                chunk_count += 1

            # BUG FIX: stop once the end of the text is consumed. Previously
            # `start = end - overlap` landed on the same offset forever when
            # end == len(text), so ANY text longer than chunk_size emitted
            # identical tail chunks until the max_chunks cap kicked in.
            if end >= len(text):
                break

            prev_start = start
            start = end - self.chunk_overlap
            # Guarantee forward progress even with odd overlap/boundary math.
            if start <= prev_start:
                start = end

        logger.info(f"Split {source} into {len(chunks)} chunks (limit: {self.max_chunks})")
        return chunks[:self.max_chunks]  # Hard limit

    def extract_tables_fast(self, file_path: str) -> str:
        """Extract up to 25 tables as markdown from a sampled set of PDF pages."""
        table_text = ""
        table_count = 0
        max_tables = 25  # Increased for better coverage

        try:
            with pdfplumber.open(file_path) as pdf:
                total_pages = len(pdf.pages)

                # Sampling strategy: all pages for small docs, every 2nd for
                # medium, every 3rd for large documents.
                if total_pages <= 20:
                    step = 1
                elif total_pages <= 40:
                    step = 2
                else:
                    step = 3

                # Never scan past the first 50 pages regardless of step.
                pages_to_process = list(range(0, min(total_pages, 50), step))

                logger.info(f"📊 Smart table scan: processing {len(pages_to_process)} of {total_pages} pages (step={step})")

                for page_num in pages_to_process:
                    if table_count >= max_tables:
                        break

                    page = pdf.pages[page_num]
                    tables = page.find_tables()

                    for table_idx, table in enumerate(tables):
                        if table_count >= max_tables:
                            break

                        try:
                            table_data = table.extract()
                            if table_data and len(table_data) >= 2:
                                # Cap at 30 rows per table to bound output size.
                                limited_data = table_data[:min(30, len(table_data))]

                                # Only convert reasonably narrow tables to markdown.
                                if len(limited_data[0]) <= 6:
                                    header = " | ".join(str(cell or "").strip()[:60] for cell in limited_data[0])
                                    separator = " | ".join(["---"] * len(limited_data[0]))

                                    rows = []
                                    for row in limited_data[1:]:
                                        # Pad row to match header length
                                        padded_row = list(row) + [None] * (len(limited_data[0]) - len(row))
                                        row_str = " | ".join(str(cell or "").strip()[:60] for cell in padded_row)
                                        rows.append(row_str)

                                    table_md = f"\n**TABLE {table_count + 1} - Page {page_num + 1}**\n"
                                    table_md += f"*{len(limited_data)} rows × {len(limited_data[0])} columns*\n\n"
                                    table_md += f"| {header} |\n| {separator} |\n"
                                    for row in rows:
                                        table_md += f"| {row} |\n"
                                    table_md += "\n"

                                    table_text += table_md
                                    table_count += 1
                                    logger.info(f"⚡ Table {table_count}: {len(limited_data)}×{len(limited_data[0])} from page {page_num + 1}")
                                else:
                                    logger.info(f"⚠️ Skipped wide table ({len(limited_data[0])} cols) on page {page_num + 1}")

                        except Exception as e:
                            # Best-effort: a single bad table must not abort the scan.
                            logger.warning(f"⚠️ Skip table on page {page_num + 1}: {e}")

                logger.info(f"🎯 Extracted {table_count} tables in fast mode")

        except Exception as e:
            logger.error(f"❌ Fast table extraction failed: {e}")

        return table_text

    def process_pdf_ultrafast(self, file_path: str) -> List[DocumentChunk]:
        """Ultra-fast PDF processing - under 1 minute target."""
        import time  # replaces the old __import__('time') anti-idiom

        logger.info(f"🚀 ULTRA-FAST PDF processing: {os.path.basename(file_path)}")
        start_time = time.time()

        chunks = []

        try:
            # STEP 1: Fast table extraction
            logger.info("📊 Fast table extraction...")
            table_content = self.extract_tables_fast(file_path)

            # STEP 2: Fast text extraction with PyMuPDF
            logger.info("📄 Fast text extraction with PyMuPDF...")
            doc = fitz.open(file_path)

            full_text = ""
            total_pages = len(doc)

            # Large documents: sample every 2nd page, capped at the first 60.
            if total_pages > 40:
                pages_to_process = list(range(0, min(total_pages, 60), 2))
                logger.info(f"📑 Large document: processing {len(pages_to_process)} of {total_pages} pages")
            else:
                pages_to_process = list(range(total_pages))

            for page_num in pages_to_process:
                try:
                    page = doc[page_num]
                    page_text = page.get_text()

                    # Clean and limit page text
                    page_text = page_text.strip()
                    if len(page_text) > 10000:  # Limit page size
                        page_text = page_text[:10000] + f"\n[Page {page_num + 1} truncated for speed]"

                    full_text += f"\n\n--- Page {page_num + 1} ---\n{page_text}"

                except Exception as e:
                    logger.warning(f"⚠️ Error processing page {page_num + 1}: {e}")

            doc.close()

            # STEP 3: Append tables at the end
            if table_content:
                full_text += f"\n\n{'='*50}\nEXTRACTED TABLES\n{'='*50}\n{table_content}"

            # STEP 4: Fast chunking with hard limits
            logger.info("📦 Creating chunks...")
            text_chunks = self.fast_text_split(full_text, os.path.basename(file_path))

            # STEP 5: Create DocumentChunk objects
            for idx, chunk_text in enumerate(text_chunks):
                has_tables = "**TABLE" in chunk_text or "EXTRACTED TABLES" in chunk_text

                chunks.append(DocumentChunk(
                    content=chunk_text,
                    metadata={
                        "source": os.path.basename(file_path),
                        "chunk_index": idx,
                        "document_type": "pdf_ultrafast",
                        "has_tables": has_tables,
                        "total_pages": total_pages,
                        "pages_processed": len(pages_to_process),
                        "processing_method": "ultrafast_pymupdf"
                    },
                    chunk_id=str(uuid.uuid4())
                ))

            elapsed = time.time() - start_time
            logger.info(f"✅ ULTRA-FAST processing complete in {elapsed:.2f}s: {len(chunks)} chunks")

            if elapsed > 90:  # 1.5 minutes
                logger.warning(f"⚠️ Processing took {elapsed:.2f}s - consider reducing document size")

            return chunks

        except Exception as e:
            logger.error(f"❌ Ultra-fast processing failed: {e}")
            return self._emergency_fallback(file_path)

    def _emergency_fallback(self, file_path: str) -> List[DocumentChunk]:
        """Emergency fallback - text only, no tables, first 10 pages only."""
        logger.info("🆘 Emergency fallback: text-only extraction")

        try:
            doc = fitz.open(file_path)

            # Process only first 10 pages
            max_pages = min(10, len(doc))
            text_parts = []

            for page_num in range(max_pages):
                page = doc[page_num]
                page_text = page.get_text()
                if len(page_text) > 5000:
                    page_text = page_text[:5000] + f"\n[Page {page_num + 1} truncated]"
                text_parts.append(f"Page {page_num + 1}:\n{page_text}")

            doc.close()

            full_text = "\n\n".join(text_parts)
            chunks = []

            # Create at most 10 roughly equal chunks.
            chunk_size = len(full_text) // 10 + 1
            for i in range(0, len(full_text), chunk_size):
                chunk_text = full_text[i:i + chunk_size]
                chunks.append(DocumentChunk(
                    content=chunk_text,
                    metadata={
                        "source": os.path.basename(file_path),
                        "chunk_index": len(chunks),
                        "document_type": "pdf_emergency_fallback",
                        "has_tables": False,
                        "pages_processed": max_pages
                    },
                    chunk_id=str(uuid.uuid4())
                ))

            return chunks

        except Exception as e:
            logger.error(f"Emergency fallback failed: {e}")
            raise Exception("All processing methods failed")

    def process_word_doc_fast(self, file_path: str) -> List[DocumentChunk]:
        """Fast Word (.docx) processing via mammoth HTML + BeautifulSoup."""
        chunks = []

        try:
            with open(file_path, "rb") as docx_file:
                result = mammoth.convert_to_html(docx_file)
                soup = BeautifulSoup(result.html, 'html.parser')

                # Quick table conversion: replace HTML tables with markdown text.
                tables = soup.find_all('table')
                for idx, table in enumerate(tables[:10]):  # Max 10 tables
                    rows = table.find_all('tr')[:15]  # Max 15 rows per table
                    table_md = f"\n**TABLE {idx + 1}**\n"

                    for row in rows:
                        cells = [cell.get_text(strip=True)[:30] for cell in row.find_all(['td', 'th'])]
                        table_md += "| " + " | ".join(cells) + " |\n"

                    table.replace_with(table_md)

                text_content = soup.get_text()
                text_chunks = self.fast_text_split(text_content, os.path.basename(file_path))

                for idx, chunk in enumerate(text_chunks):
                    chunks.append(DocumentChunk(
                        content=chunk,
                        metadata={
                            "source": os.path.basename(file_path),
                            "chunk_index": idx,
                            "document_type": "docx_fast",
                            "has_tables": "**TABLE" in chunk
                        },
                        chunk_id=str(uuid.uuid4())
                    ))

        except Exception as e:
            logger.error(f"Fast Word processing failed: {e}")
            raise Exception(f"Word processing failed: {e}")

        return chunks

    def process_email_fast(self, file_path: str) -> List[DocumentChunk]:
        """Fast .eml processing: headers plus the first text/plain body part."""
        chunks = []

        try:
            with open(file_path, 'rb') as email_file:
                msg = email.message_from_bytes(email_file.read(), policy=email.policy.default)

            subject = msg.get('Subject', 'No Subject')
            sender = msg.get('From', 'Unknown Sender')
            date = msg.get('Date', 'Unknown Date')

            # Get body content quickly
            body_content = ""
            if msg.is_multipart():
                for part in msg.walk():
                    if part.get_content_type() == "text/plain":
                        content = part.get_content()[:5000]  # Limit size
                        body_content += content
                        break  # Take first text part only
            else:
                body_content = msg.get_content()[:5000]

            email_content = f"EMAIL: {subject}\nFrom: {sender}\nDate: {date}\n\n{body_content}"
            text_chunks = self.fast_text_split(email_content, os.path.basename(file_path))

            for idx, chunk in enumerate(text_chunks):
                chunks.append(DocumentChunk(
                    content=chunk,
                    metadata={
                        "source": os.path.basename(file_path),
                        "chunk_index": idx,
                        "document_type": "email_fast",
                        "subject": subject
                    },
                    chunk_id=str(uuid.uuid4())
                ))

        except Exception as e:
            logger.error(f"Fast email processing failed: {e}")
            raise Exception(f"Email processing failed: {e}")

        return chunks
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
# Create the fast parser service
# Module-level singleton used by the endpoints below.
parser_service = FastDocumentParserService()

# FastAPI app
# NOTE(review): this standalone parser app (port 8001) is separate from the
# RAG app in app/main_api.py, which imports FastDocumentParserService
# directly; confirm the standalone service is still needed.
app = FastAPI(title="Ultra-Fast Document Parser", version="3.0.0")
|
| 380 |
+
|
| 381 |
+
@app.get("/health")
async def health_check():
    """Liveness probe for the standalone parser service."""
    payload = {"status": "healthy", "message": "Ultra-fast document parser running"}
    return payload
|
| 384 |
+
|
| 385 |
+
@app.post("/parse")
async def parse_file(file: UploadFile = File(...)):
    """Ultra-fast file parsing - target < 60 seconds.

    Saves the upload to a temp file, routes it to the matching fast
    processor by extension, and returns the chunks plus timing info.
    """
    import time  # replaces the old __import__('time') anti-idiom

    temp_file_path = None
    start_time = time.time()

    try:
        gc.collect()  # Clean start

        temp_file_path = f"./temp_{uuid.uuid4()}_{file.filename}"

        # Fast file write
        with open(temp_file_path, "wb") as buffer:
            content = await file.read()
            buffer.write(content)

        file_extension = Path(file.filename).suffix.lower()
        logger.info(f"⚡ FAST processing: {file.filename} ({file_extension})")

        # Route to appropriate fast processor
        if file_extension == '.pdf':
            chunks = parser_service.process_pdf_ultrafast(temp_file_path)
        elif file_extension in ['.docx', '.doc']:
            chunks = parser_service.process_word_doc_fast(temp_file_path)
        elif file_extension in ['.eml', '.msg']:
            chunks = parser_service.process_email_fast(temp_file_path)
        else:
            raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")

        # Convert to response format
        chunk_dicts = [chunk.to_dict() for chunk in chunks]

        elapsed = time.time() - start_time

        # Save minimal debug info (best effort; never fail the request).
        # BUG FIX: was a bare `except:` that also swallowed SystemExit and
        # KeyboardInterrupt.
        try:
            with open("./_fast_parsed_output.json", "w") as f:
                json.dump({
                    "filename": file.filename,
                    "total_chunks": len(chunks),
                    "processing_time_seconds": elapsed,
                    "first_chunk_preview": chunks[0].content[:200] if chunks else "No chunks"
                }, f, indent=2)
        except Exception:
            pass

        logger.info(f"🎯 COMPLETED {file.filename} in {elapsed:.2f}s: {len(chunks)} chunks")

        return {
            "filename": file.filename,
            "status": "success",
            "chunks": chunk_dicts,
            "total_chunks": len(chunks),
            "processing_time_seconds": round(elapsed, 2),
            "processing_method": "ultrafast"
        }

    except HTTPException:
        # BUG FIX: deliberate HTTP errors (e.g. the 400 for unsupported
        # types above) were previously caught by the generic handler below
        # and re-raised as 500s. Propagate them unchanged.
        raise
    except Exception as e:
        elapsed = time.time() - start_time
        logger.error(f"❌ Processing failed after {elapsed:.2f}s: {e}")
        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")

    finally:
        # Always remove the temp file; failure to delete is non-fatal.
        if temp_file_path and os.path.exists(temp_file_path):
            try:
                os.remove(temp_file_path)
            except OSError:
                pass
        gc.collect()
|
| 454 |
+
|
| 455 |
+
if __name__ == "__main__":
    # Standalone mode: run the parser as its own service on port 8001
    # (the main RAG API in app/main_api.py runs separately on port 8000).
    logger.info("🚀 Starting Ultra-Fast Document Parser...")
    uvicorn.run(app, host="0.0.0.0", port=8001, log_level="info")
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn[standard]
|
| 3 |
+
pydantic
|
| 4 |
+
sentence-transformers
|
| 5 |
+
chromadb
|
| 6 |
+
groq
|
| 7 |
+
langchain
|
| 8 |
+
PyMuPDF # Corrected from 'fitz'
|
| 9 |
+
pdfplumber
|
| 10 |
+
mammoth # Corrected from 'mammoth-convert'
|
| 11 |
+
beautifulsoup4
|
| 12 |
+
httpx
|
| 13 |
+
python-multipart
python-dotenv  # required: app/main_api.py does `from dotenv import load_dotenv`
|
run.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# run.py
|
| 2 |
+
|
| 3 |
+
import uvicorn
|
| 4 |
+
import sys
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
# This line is crucial for running from the top-level directory
|
| 8 |
+
# It tells Python to look for modules inside the 'app' folder
|
| 9 |
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'app')))
|
| 10 |
+
|
| 11 |
+
from main_api import app
|
| 12 |
+
|
| 13 |
+
if __name__ == "__main__":
|
| 14 |
+
print("🚀 Starting HackRx 6.0 RAG Server...")
|
| 15 |
+
print("API Documentation available at http://127.0.0.1:8000/docs")
|
| 16 |
+
uvicorn.run("main_api:app", host="0.0.0.0", port=8000, reload=True, app_dir="app")
|