Spaces:

Tahasaif3
/

DataExtract

Sleeping

App Files Community

Tahasaif3 committed on Oct 27, 2025

Commit

ad56575

verified ·

1 Parent(s): 59dd903

Update main.py

Browse files

Files changed (1) hide show

main.py +23 -208

main.py CHANGED Viewed

@@ -44,7 +44,6 @@ class StudentRecord(BaseModel):
 class ExtractResponse(BaseModel):
     students: List[StudentRecord] = Field(default_factory=list)
-    total_extracted: int = Field(default=0, description="Total number of students extracted")
 class PDFRequest(BaseModel):
     pdfUrl: str
@@ -54,76 +53,30 @@ student_agent = Agent(
     name="StudentPDFExtractor",
     model=Model,
     instructions="""
-You are a precise data extraction agent specialized in extracting student records from PDF text.
-CRITICAL INSTRUCTIONS:
-1. Extract ALL students from the provided text - do not skip any entries
-2. The text contains student data in tabular format with these columns:
-   - Name
-   - Roll No.
-   - Class
-   - Section
-   - Mobile
-3. IGNORE these lines:
-   - Headers like "Student Data Report", "Total Students:", "Generated on:"
-   - Column headers (Name, Roll No., Class, Section, Mobile)
-   - Page breaks or separator lines
-4. EXTRACT every student entry that has at least a name and roll number
-5. Data patterns to handle:
-   - Some entries may have missing sections or mobile numbers
-   - Class names can be multi-word (e.g., "BS Zoology 2023-2027")
-   - Some class info includes year ranges like "2023-2027" or "2022-2026"
-   - Mobile numbers are typically 11 digits starting with 0
-   - Roll numbers are typically 8 digits (e.g., 00234429)
-6. For each student, extract:
-   - name: The student's full name (first column)
-   - roll_no: The roll number (typically 8 digits)
-   - class_name: The full class/program name
-   - section: The section (Evening, Morning, A, B, etc.) - use empty string if not present
-   - mobile: The mobile number - use empty string if not present
-7. Return ALL students in the JSON format specified in the output schema
-8. Be thorough - if the text contains 100+ students, extract all of them
-EXAMPLE INPUT:
 Name Roll No. Class Section Mobile
-Nana 00234429 BS Zoology 2023-2027 Evening 03156654438
-Noor Fatima 00243403 Bs IR 3rd sem 03010071997
-EXAMPLE OUTPUT:
 {
   "students": [
     {
-      "name": "Nana",
-      "roll_no": "00234429",
-      "class_name": "BS Zoology 2023-2027",
-      "section": "Evening",
-      "mobile": "03156654438"
-    },
-    {
-      "name": "Noor Fatima",
-      "roll_no": "00243403",
-      "class_name": "Bs IR",
-      "section": "3rd sem",
-      "mobile": "03010071997"
     }
-  ],
-  "total_extracted": 2
 }
-IMPORTANT: Extract EVERY single student record. Do not truncate or summarize.
 """,
     output_type=ExtractResponse,
-    model_settings=ModelSettings(
-        temperature=0.1,  # Lower temperature for more deterministic extraction
-        top_p=0.9,
-        max_tokens=16000  # Increased for large responses
-    )
 )
 runner = Runner()
@@ -132,7 +85,7 @@ runner = Runner()
 def download_and_extract_text(pdf_url: str) -> str:
     """Downloads a PDF from Cloudinary and extracts text"""
     print(f"📥 Downloading PDF from: {pdf_url}")
-    response = requests.get(pdf_url, timeout=30)
     response.raise_for_status()
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
@@ -140,115 +93,28 @@ def download_and_extract_text(pdf_url: str) -> str:
         tmp_path = tmp.name
     doc = fitz.open(tmp_path)
-    # Extract text from all pages with better formatting
-    text_parts = []
-    for page_num, page in enumerate(doc, 1):
-        page_text = page.get_text("text")
-        text_parts.append(f"\n--- Page {page_num} ---\n{page_text}")
-    text = "\n".join(text_parts)
     doc.close()
     os.remove(tmp_path)
-    print(f"✅ PDF text extracted successfully. Total length: {len(text)} characters")
-    print(f"📄 Total pages processed: {len(doc)}")
     return text
 async def extract_from_text(text: str) -> dict:
     """Runs the agent to extract structured data"""
     print(f"📄 Extracting from {len(text)} characters...")
-    print(f"🔍 Estimated student count (based on 'Roll No.' occurrences): {text.count('00')}")
     resp = await runner.run(
         student_agent,
-        f"Extract all student records from this PDF text:\n\n{text}",
         session=SQLiteSession("student_trace.db")
     )
     if hasattr(resp, "output"):
-        result = resp.output.model_dump()
     elif hasattr(resp, "final_output"):
-        result = resp.final_output.model_dump()
-    else:
-        result = {"students": [], "total_extracted": 0}
-    # Set total_extracted if not set by agent
-    if "total_extracted" not in result or result["total_extracted"] == 0:
-        result["total_extracted"] = len(result.get("students", []))
-    print(f"✅ Extraction complete. Found {result['total_extracted']} students")
-    return result
-def chunk_text(text: str, chunk_size: int = 15000) -> List[str]:
-    """
-    Splits text into chunks for processing large PDFs.
-    Tries to split at page boundaries or double newlines.
-    """
-    if len(text) <= chunk_size:
-        return [text]
-    chunks = []
-    current_chunk = ""
-    # Split by pages first
-    pages = text.split("--- Page")
-    for page in pages:
-        if not page.strip():
-            continue
-        page_text = "--- Page" + page if not page.startswith("--- Page") else page
-        if len(current_chunk) + len(page_text) <= chunk_size:
-            current_chunk += page_text
-        else:
-            if current_chunk:
-                chunks.append(current_chunk)
-            current_chunk = page_text
-    if current_chunk:
-        chunks.append(current_chunk)
-    return chunks
-async def extract_from_large_text(text: str) -> dict:
-    """
-    Handles extraction from large PDFs by chunking if necessary
-    """
-    # If text is small enough, process directly
-    if len(text) < 30000:
-        return await extract_from_text(text)
-    print(f"📚 Large PDF detected. Chunking for processing...")
-    chunks = chunk_text(text, chunk_size=20000)
-    print(f"📦 Split into {len(chunks)} chunks")
-    all_students = []
-    for i, chunk in enumerate(chunks, 1):
-        print(f"🔄 Processing chunk {i}/{len(chunks)}...")
-        result = await extract_from_text(chunk)
-        chunk_students = result.get("students", [])
-        all_students.extend(chunk_students)
-        print(f"   Found {len(chunk_students)} students in chunk {i}")
-    # Deduplicate based on roll_no
-    seen_rolls = set()
-    unique_students = []
-    for student in all_students:
-        if student["roll_no"] and student["roll_no"] not in seen_rolls:
-            seen_rolls.add(student["roll_no"])
-            unique_students.append(student)
-    print(f"✅ Total unique students after deduplication: {len(unique_students)}")
-    return {
-        "students": unique_students,
-        "total_extracted": len(unique_students)
-    }
 # ---------------- FastAPI Endpoint ----------------
 @app.post("/extract-student")
@@ -256,66 +122,15 @@ async def extract_student(req: PDFRequest):
     """
     Accepts a Cloudinary PDF URL,
     downloads it, extracts text, and returns structured student data.
-    Handles large PDFs with 200+ students.
     """
     try:
         text = download_and_extract_text(req.pdfUrl)
-        structured = await extract_from_large_text(text)
         return {
             "success": True,
             "pdfUrl": req.pdfUrl,
-            "total_students": structured.get("total_extracted", 0),
             "structured": structured,
-            "raw_text_length": len(text),
             "raw_text_preview": text[:800]  # trimmed preview
         }
     except Exception as e:
-        print(f"❌ Error: {str(e)}")
-        import traceback
-        traceback.print_exc()
-        return {
-            "success": False,
-            "error": str(e),
-            "error_type": type(e).__name__
-        }
-# Optional: Add a test endpoint for debugging
-@app.post("/test-extract")
-async def test_extract(req: PDFRequest):
-    """
-    Test endpoint that shows more debugging information
-    """
-    try:
-        text = download_and_extract_text(req.pdfUrl)
-        # Count potential student records
-        lines = text.split('\n')
-        potential_students = [line for line in lines if any(char.isdigit() for char in line)]
-        structured = await extract_from_large_text(text)
-        return {
-            "success": True,
-            "pdfUrl": req.pdfUrl,
-            "total_students_extracted": structured.get("total_extracted", 0),
-            "text_length": len(text),
-            "total_lines": len(lines),
-            "lines_with_numbers": len(potential_students),
-            "first_10_students": structured.get("students", [])[:10],
-            "raw_text_preview": text[:1500]
-        }
-    except Exception as e:
-        print(f"❌ Error: {str(e)}")
-        import traceback
-        traceback.print_exc()
-        return {
-            "success": False,
-            "error": str(e)
-        }
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)

 class ExtractResponse(BaseModel):
     students: List[StudentRecord] = Field(default_factory=list)
 class PDFRequest(BaseModel):
     pdfUrl: str
     name="StudentPDFExtractor",
     model=Model,
     instructions="""
+You are a precise data extraction agent. Read the provided text extracted from a student report PDF and return structured student data.
+The PDF text typically includes:
+Student Data Report - hyderabad sspo
+Generated on: 10/24/2025
 Name Roll No. Class Section Mobile
+John Doe 05738999 12 A 09338488484848388
+Ignore headers like 'Student Data Report' and 'Generated on:'.
+Return all students in JSON with this schema:
 {
   "students": [
     {
+      "name": "string",
+      "roll_no": "string",
+      "class_name": "string",
+      "section": "string",
+      "mobile": "string"
     }
+  ]
 }
 """,
     output_type=ExtractResponse,
+    model_settings=ModelSettings(temperature=0.2, top_p=0.85)
 )
 runner = Runner()
 def download_and_extract_text(pdf_url: str) -> str:
     """Downloads a PDF from Cloudinary and extracts text"""
     print(f"📥 Downloading PDF from: {pdf_url}")
+    response = requests.get(pdf_url)
     response.raise_for_status()
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
         tmp_path = tmp.name
     doc = fitz.open(tmp_path)
+    text = "\n".join(page.get_text("text") for page in doc)
     doc.close()
     os.remove(tmp_path)
+    print("✅ PDF text extracted successfully.")
     return text
 async def extract_from_text(text: str) -> dict:
     """Runs the agent to extract structured data"""
     print(f"📄 Extracting from {len(text)} characters...")
     resp = await runner.run(
         student_agent,
+        text,  # ✅ plain text only
         session=SQLiteSession("student_trace.db")
     )
     if hasattr(resp, "output"):
+        return resp.output.model_dump()
     elif hasattr(resp, "final_output"):
+        return resp.final_output.model_dump()
+    return {"students": []}
 # ---------------- FastAPI Endpoint ----------------
 @app.post("/extract-student")
     """
     Accepts a Cloudinary PDF URL,
     downloads it, extracts text, and returns structured student data.
     """
     try:
         text = download_and_extract_text(req.pdfUrl)
+        structured = await extract_from_text(text)
         return {
             "success": True,
             "pdfUrl": req.pdfUrl,
             "structured": structured,
             "raw_text_preview": text[:800]  # trimmed preview
         }
     except Exception as e:
+        return {"success": False, "error": str(e)}