Spaces:

Tahasaif3
/

DataExtract

Sleeping

App Files Files Community

Tahasaif3 committed on Oct 27, 2025

Commit

96a7aae

verified ·

1 Parent(s): eeabf12

Update main.py

Browse files

Files changed (1) hide show

main.py +31 -171

main.py CHANGED Viewed

@@ -2,8 +2,6 @@ import os
 import fitz  # PyMuPDF
 import tempfile
 import requests
-import re
-import json
 from typing import List
 from fastapi import FastAPI
 from pydantic import BaseModel, Field
@@ -44,6 +42,9 @@ class StudentRecord(BaseModel):
     section: str = Field(default="", description="Section letter")
     mobile: str = Field(default="", description="Mobile number")
 class PDFRequest(BaseModel):
     pdfUrl: str
@@ -52,15 +53,16 @@ student_agent = Agent(
     name="StudentPDFExtractor",
     model=Model,
     instructions="""
-You are a precise data extraction agent. Extract ALL student records from the provided text.
-CRITICAL RULES:
-1. Extract EVERY single student record - do NOT skip any
-2. Do NOT truncate or limit the output
-3. Return a complete, valid JSON with all students found
-4. Each student has: name, roll_no, class_name, section, mobile
-Return ONLY this JSON format:
 {
   "students": [
     {
@@ -72,10 +74,9 @@ Return ONLY this JSON format:
     }
   ]
 }
-IMPORTANT: Ensure ALL students are included. No truncation allowed. Close all JSON arrays and objects properly.
 """,
-    model_settings=ModelSettings(temperature=0.05, top_p=0.7, max_tokens=8000)
 )
 runner = Runner()
@@ -95,182 +96,41 @@ def download_and_extract_text(pdf_url: str) -> str:
     text = "\n".join(page.get_text("text") for page in doc)
     doc.close()
     os.remove(tmp_path)
-    print(f"✅ PDF text extracted successfully ({len(text)} characters)")
     return text
-def parse_json_from_output(output: str) -> dict:
-    """Parse JSON from agent output, handling truncation and errors"""
-    if not output:
-        return {"students": []}
-    # Try direct JSON parse first
-    try:
-        return json.loads(output)
-    except json.JSONDecodeError:
-        pass
-    # Try to extract JSON from markdown or other formatting
-    json_match = re.search(r'\{[\s\S]*\}', output)
-    if not json_match:
-        return {"students": []}
-    json_str = json_match.group(0)
-    # Try to fix incomplete JSON
-    try:
-        return json.loads(json_str)
-    except json.JSONDecodeError:
-        # Try closing the JSON if it's truncated
-        open_braces = json_str.count('{') - json_str.count('}')
-        open_brackets = json_str.count('[') - json_str.count(']')
-        json_str = json_str.rstrip().rstrip(',') + ']' * open_brackets + '}' * open_braces
-        try:
-            return json.loads(json_str)
-        except json.JSONDecodeError as e:
-            print(f"⚠️ Failed to parse JSON even after fixing: {e}")
-            return {"students": []}
-def regex_fallback_extraction(text: str) -> dict:
-    """Robust regex-based extraction for when agent fails"""
-    print("🔄 Using regex fallback for extraction...")
-    students = []
-    # Try multiple regex patterns for flexibility
-    patterns = [
-        # Pattern 1: Name | Roll | Class | Section | Mobile
-        r'^([A-Za-z\s]+?)\s*\|\s*(\d+)\s*\|\s*([\w\d\s\.,-]+?)\s*\|\s*([A-Za-z0-9\s,.-]+?)\s*\|\s*(\d+)',
-        # Pattern 2: Space-separated format
-        r'^([A-Za-z\s]+?)\s+(\d{8,})\s+([\w\d\s\.,-]+?)\s+([A-Za-z0-9\s,.-]+?)\s+(\d{11,})',
-        # Pattern 3: Tab-separated
-        r'^([A-Za-z\s]+?)\t+(\d{8,})\t+([\w\d\s\.,-]+?)\t+([A-Za-z0-9\s,.-]+?)\t+(\d{11,})',
-    ]
-    seen = set()
-    for line in text.splitlines():
-        line = line.strip()
-        if not line or "name" in line.lower() or "roll" in line.lower() or "generated" in line.lower():
-            continue
-        for pattern in patterns:
-            match = re.search(pattern, line)
-            if match:
-                name = match.group(1).strip()
-                roll_no = match.group(2)
-                class_name = match.group(3).strip()
-                section = match.group(4).strip()
-                mobile = match.group(5)
-                key = (name, roll_no)
-                if key not in seen and name and roll_no:
-                    seen.add(key)
-                    students.append({
-                        "name": name,
-                        "roll_no": roll_no,
-                        "class_name": class_name,
-                        "section": section,
-                        "mobile": mobile
-                    })
-                break
-    print(f"✅ Regex extracted {len(students)} students")
-    return {"students": students}
-async def extract_from_text_chunked(text: str) -> dict:
-    """Runs the agent with flexible JSON parsing for large datasets"""
     print(f"📄 Extracting from {len(text)} characters...")
-    try:
-        resp = await runner.run(
-            student_agent,
-            text,
-            session=SQLiteSession("student_trace.db")
-        )
-        output = None
-        if hasattr(resp, "output"):
-            output = resp.output
-        elif hasattr(resp, "final_output"):
-            output = resp.final_output
-        if output:
-            # Convert to string if needed
-            if isinstance(output, str):
-                output_str = output
-            else:
-                output_str = str(output)
-            # Parse JSON flexibly
-            result = parse_json_from_output(output_str)
-            student_count = len(result.get("students", []))
-            print(f"✅ Agent extracted {student_count} students")
-            if student_count > 0:
-                return result
-            else:
-                print("⚠️ Agent returned empty results")
-    except Exception as e:
-        print(f"⚠️ Agent extraction error: {e}")
-    # Fallback to regex if agent fails or returns empty
-    return regex_fallback_extraction(text)
-def clean_and_deduplicate(students: List[dict]) -> List[dict]:
-    """Remove duplicates and clean data"""
-    seen = set()
-    unique = []
-    for s in students:
-        name = str(s.get("name", "")).strip()
-        roll_no = str(s.get("roll_no", "")).strip()
-        key = (name, roll_no)
-        if key and key[0] and key[1] and key not in seen:
-            seen.add(key)
-            unique.append(s)
-    print(f"📋 After deduplication: {len(unique)} unique students")
-    return unique
 # ---------------- FastAPI Endpoint ----------------
 @app.post("/extract-student")
 async def extract_student(req: PDFRequest):
     """
     Accepts a Cloudinary PDF URL,
-    downloads it, extracts text, and returns ALL structured student data.
-    Features:
-    - Handles large datasets (200+ students)
-    - No strict validation - flexible JSON parsing
-    - Regex backup for comprehensive coverage
-    - Automatic deduplication
     """
     try:
         text = download_and_extract_text(req.pdfUrl)
-        structured = await extract_from_text_chunked(text)
-        # Clean and deduplicate
-        students = structured.get("students", [])
-        cleaned = clean_and_deduplicate(students)
         return {
             "success": True,
             "pdfUrl": req.pdfUrl,
-            "total_students": len(cleaned),
-            "students": cleaned
         }
     except Exception as e:
-        print(f"❌ Error: {e}")
-        import traceback
-        traceback.print_exc()
-        return {
-            "success": False,
-            "error": str(e),
-            "total_students": 0,
-            "students": []
-        }

 import fitz  # PyMuPDF
 import tempfile
 import requests
 from typing import List
 from fastapi import FastAPI
 from pydantic import BaseModel, Field
     section: str = Field(default="", description="Section letter")
     mobile: str = Field(default="", description="Mobile number")
+class ExtractResponse(BaseModel):
+    students: List[StudentRecord] = Field(default_factory=list)
 class PDFRequest(BaseModel):
     pdfUrl: str
     name="StudentPDFExtractor",
     model=Model,
     instructions="""
+You are a precise data extraction agent. Read the provided text extracted from a student report PDF and return structured student data.
+The PDF text typically includes:
+Student Data Report - hyderabad sspo
+Generated on: 10/24/2025
+Name Roll No. Class Section Mobile
+John Doe 05738999 12 A 09338488484848388
+Ignore headers like 'Student Data Report' and 'Generated on:'.
+Return all students in JSON with this schema:
 {
   "students": [
     {
     }
   ]
 }
 """,
+    output_type=ExtractResponse,
+    model_settings=ModelSettings(temperature=0.2, top_p=0.85)
 )
 runner = Runner()
     text = "\n".join(page.get_text("text") for page in doc)
     doc.close()
     os.remove(tmp_path)
+    print("✅ PDF text extracted successfully.")
     return text
+async def extract_from_text(text: str) -> dict:
+    """Runs the agent to extract structured data"""
     print(f"📄 Extracting from {len(text)} characters...")
+    resp = await runner.run(
+        student_agent,
+        text,  # ✅ plain text only
+        session=SQLiteSession("student_trace.db")
+    )
+    if hasattr(resp, "output"):
+        return resp.output.model_dump()
+    elif hasattr(resp, "final_output"):
+        return resp.final_output.model_dump()
+    return {"students": []}
 # ---------------- FastAPI Endpoint ----------------
 @app.post("/extract-student")
 async def extract_student(req: PDFRequest):
     """
     Accepts a Cloudinary PDF URL,
+    downloads it, extracts text, and returns structured student data.
     """
     try:
         text = download_and_extract_text(req.pdfUrl)
+        structured = await extract_from_text(text)
         return {
             "success": True,
             "pdfUrl": req.pdfUrl,
+            "structured": structured,
+            "raw_text_preview": text[:800]  # trimmed preview
         }
     except Exception as e:
+        return {"success": False, "error": str(e)}