Commit Β·
c4c5f40
1
Parent(s): 9d7e0bb
few changes
Browse files- app/main_api.py +26 -48
app/main_api.py
CHANGED
|
@@ -985,20 +985,10 @@ doc_processor = UniversalDocumentProcessor()
|
|
| 985 |
kaggle_client = LazyKaggleModelClient()
|
| 986 |
|
| 987 |
# --- API MODELS ---
|
| 988 |
-
# In main_api.py
|
| 989 |
-
from pydantic import BaseModel, validator
|
| 990 |
-
from typing import List
|
| 991 |
-
|
| 992 |
class SubmissionRequest(BaseModel):
|
| 993 |
-
documents:
|
| 994 |
questions: List[str]
|
| 995 |
|
| 996 |
-
@validator('documents', pre=True)
|
| 997 |
-
def allow_single_string(cls, v):
|
| 998 |
-
if isinstance(v, str):
|
| 999 |
-
return [v] # Automatically convert string to a list
|
| 1000 |
-
return v
|
| 1001 |
-
|
| 1002 |
class SubmissionResponse(BaseModel):
|
| 1003 |
answers: List[str]
|
| 1004 |
|
|
@@ -1023,7 +1013,8 @@ def test_endpoint():
|
|
| 1023 |
@app.post("/api/v1/hackrx/run", response_model=SubmissionResponse, dependencies=[Depends(verify_bearer_token)])
|
| 1024 |
async def run_submission(request: Request, submission_request: SubmissionRequest = Body(...)):
|
| 1025 |
start_time = time.time()
|
| 1026 |
-
|
|
|
|
| 1027 |
|
| 1028 |
try:
|
| 1029 |
# LAZY INITIALIZATION: Only now do we connect to Kaggle!
|
|
@@ -1040,47 +1031,34 @@ async def run_submission(request: Request, submission_request: SubmissionRequest
|
|
| 1040 |
session_id = f"kaggle_{uuid.uuid4().hex[:6]}" # Shorter UUID
|
| 1041 |
rag_pipeline = DeadlockFreeRAGPipeline(session_id, multi_llm, kaggle_client)
|
| 1042 |
|
| 1043 |
-
# Process
|
| 1044 |
all_chunks = []
|
| 1045 |
|
| 1046 |
async with httpx.AsyncClient(
|
| 1047 |
timeout=45.0,
|
| 1048 |
headers={"ngrok-skip-browser-warning": "true"}
|
| 1049 |
-
) as client:
|
| 1050 |
-
# SPEED OPTIMIZATION: Higher concurrency
|
| 1051 |
-
semaphore = asyncio.Semaphore(5) # Increased from 3
|
| 1052 |
|
| 1053 |
async def process_single_document(doc_idx: int, doc_url: str):
|
| 1054 |
-
|
| 1055 |
-
|
| 1056 |
-
|
| 1057 |
-
|
| 1058 |
-
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
-
|
| 1064 |
-
|
| 1065 |
-
|
| 1066 |
-
logger.info(f"β
Document {doc_idx + 1}: {len(chunks)} chunks")
|
| 1067 |
-
return chunks
|
| 1068 |
-
|
| 1069 |
-
except Exception as e:
|
| 1070 |
-
logger.error(f"β Document {doc_idx + 1} failed: {e}")
|
| 1071 |
-
return []
|
| 1072 |
-
|
| 1073 |
-
# Process all documents concurrently
|
| 1074 |
-
tasks = [
|
| 1075 |
-
process_single_document(i, url)
|
| 1076 |
-
for i, url in enumerate(submission_request.documents)
|
| 1077 |
-
]
|
| 1078 |
-
|
| 1079 |
-
results = await asyncio.gather(*tasks)
|
| 1080 |
|
| 1081 |
-
#
|
| 1082 |
-
|
| 1083 |
-
|
|
|
|
|
|
|
|
|
|
| 1084 |
|
| 1085 |
logger.info(f"π Total chunks processed: {len(all_chunks)}")
|
| 1086 |
|
|
@@ -1097,8 +1075,7 @@ async def run_submission(request: Request, submission_request: SubmissionRequest
|
|
| 1097 |
# SPEED OPTIMIZATION: Full parallel question answering
|
| 1098 |
logger.info(f"β‘ Answering questions in parallel...")
|
| 1099 |
|
| 1100 |
-
|
| 1101 |
-
semaphore = asyncio.Semaphore(4) # Increased from 2
|
| 1102 |
|
| 1103 |
async def answer_single_question(question: str) -> str:
|
| 1104 |
async with semaphore:
|
|
@@ -1120,7 +1097,8 @@ async def run_submission(request: Request, submission_request: SubmissionRequest
|
|
| 1120 |
"Processing error occurred. Please try again."
|
| 1121 |
for _ in submission_request.questions
|
| 1122 |
])
|
| 1123 |
-
|
|
|
|
| 1124 |
# --- HEALTH ENDPOINTS (YOUR EXCELLENT ORIGINAL + DEADLOCK-FREE INFO) ---
|
| 1125 |
@app.get("/")
|
| 1126 |
def read_root():
|
|
|
|
| 985 |
kaggle_client = LazyKaggleModelClient()
|
| 986 |
|
| 987 |
# --- API MODELS ---
|
|
|
|
|
|
|
|
|
|
|
|
|
| 988 |
class SubmissionRequest(BaseModel):
    """Request body for the /api/v1/hackrx/run endpoint.

    Deserialized and validated by pydantic when FastAPI receives the POST.
    """

    # URL of the single document to process. NOTE(review): earlier revisions
    # accepted a list here (with a validator coercing str -> list); the
    # handler now reads this as exactly one URL.
    documents: str
    # Natural-language questions to answer against the document.
    questions: List[str]
|
| 991 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 992 |
class SubmissionResponse(BaseModel):
    """Response body for the /api/v1/hackrx/run endpoint."""

    # One answer string per submitted question (presumably in request
    # order — the error fallback builds one entry per question).
    answers: List[str]
| 994 |
|
|
|
|
| 1013 |
@app.post("/api/v1/hackrx/run", response_model=SubmissionResponse, dependencies=[Depends(verify_bearer_token)])
|
| 1014 |
async def run_submission(request: Request, submission_request: SubmissionRequest = Body(...)):
|
| 1015 |
start_time = time.time()
|
| 1016 |
+
# This log is changed to reflect one document
|
| 1017 |
+
logger.info(f"π― DEADLOCK-FREE KAGGLE-POWERED PROCESSING: 1 doc, {len(submission_request.questions)} questions")
|
| 1018 |
|
| 1019 |
try:
|
| 1020 |
# LAZY INITIALIZATION: Only now do we connect to Kaggle!
|
|
|
|
| 1031 |
session_id = f"kaggle_{uuid.uuid4().hex[:6]}" # Shorter UUID
|
| 1032 |
rag_pipeline = DeadlockFreeRAGPipeline(session_id, multi_llm, kaggle_client)
|
| 1033 |
|
| 1034 |
+
# Process the single document
|
| 1035 |
all_chunks = []
|
| 1036 |
|
| 1037 |
async with httpx.AsyncClient(
|
| 1038 |
timeout=45.0,
|
| 1039 |
headers={"ngrok-skip-browser-warning": "true"}
|
| 1040 |
+
) as client:
|
|
|
|
|
|
|
| 1041 |
|
| 1042 |
async def process_single_document(doc_idx: int, doc_url: str):
|
| 1043 |
+
# This inner function remains the same
|
| 1044 |
+
try:
|
| 1045 |
+
logger.info(f"π₯ Downloading document {doc_idx + 1}")
|
| 1046 |
+
response = await client.get(doc_url, follow_redirects=True)
|
| 1047 |
+
response.raise_for_status()
|
| 1048 |
+
filename = os.path.basename(doc_url.split('?')[0]) or f"document_{doc_idx}"
|
| 1049 |
+
chunks = await doc_processor.process_document(filename, response.content)
|
| 1050 |
+
logger.info(f"β
Document {doc_idx + 1}: {len(chunks)} chunks")
|
| 1051 |
+
return chunks
|
| 1052 |
+
except Exception as e:
|
| 1053 |
+
logger.error(f"β Document {doc_idx + 1} failed: {e}")
|
| 1054 |
+
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1055 |
|
| 1056 |
+
# --- THIS IS THE CORRECTED LOGIC ---
|
| 1057 |
+
# It now processes only the single string from submission_request.documents
|
| 1058 |
+
single_doc_url = submission_request.documents
|
| 1059 |
+
chunks_for_single_doc = await process_single_document(0, single_doc_url)
|
| 1060 |
+
all_chunks.extend(chunks_for_single_doc)
|
| 1061 |
+
# ------------------------------------
|
| 1062 |
|
| 1063 |
logger.info(f"π Total chunks processed: {len(all_chunks)}")
|
| 1064 |
|
|
|
|
| 1075 |
# SPEED OPTIMIZATION: Full parallel question answering
|
| 1076 |
logger.info(f"β‘ Answering questions in parallel...")
|
| 1077 |
|
| 1078 |
+
semaphore = asyncio.Semaphore(4)
|
|
|
|
| 1079 |
|
| 1080 |
async def answer_single_question(question: str) -> str:
|
| 1081 |
async with semaphore:
|
|
|
|
| 1097 |
"Processing error occurred. Please try again."
|
| 1098 |
for _ in submission_request.questions
|
| 1099 |
])
|
| 1100 |
+
|
| 1101 |
+
|
| 1102 |
# --- HEALTH ENDPOINTS (YOUR EXCELLENT ORIGINAL + DEADLOCK-FREE INFO) ---
|
| 1103 |
@app.get("/")
|
| 1104 |
def read_root():
|