Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -44,7 +44,6 @@ class StudentRecord(BaseModel):
|
|
| 44 |
|
| 45 |
class ExtractResponse(BaseModel):
|
| 46 |
students: List[StudentRecord] = Field(default_factory=list)
|
| 47 |
-
total_extracted: int = Field(default=0, description="Total number of students extracted")
|
| 48 |
|
| 49 |
class PDFRequest(BaseModel):
|
| 50 |
pdfUrl: str
|
|
@@ -54,76 +53,30 @@ student_agent = Agent(
|
|
| 54 |
name="StudentPDFExtractor",
|
| 55 |
model=Model,
|
| 56 |
instructions="""
|
| 57 |
-
You are a precise data extraction agent
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
- Name
|
| 63 |
-
- Roll No.
|
| 64 |
-
- Class
|
| 65 |
-
- Section
|
| 66 |
-
- Mobile
|
| 67 |
-
|
| 68 |
-
3. IGNORE these lines:
|
| 69 |
-
- Headers like "Student Data Report", "Total Students:", "Generated on:"
|
| 70 |
-
- Column headers (Name, Roll No., Class, Section, Mobile)
|
| 71 |
-
- Page breaks or separator lines
|
| 72 |
-
|
| 73 |
-
4. EXTRACT every student entry that has at least a name and roll number
|
| 74 |
-
|
| 75 |
-
5. Data patterns to handle:
|
| 76 |
-
- Some entries may have missing sections or mobile numbers
|
| 77 |
-
- Class names can be multi-word (e.g., "BS Zoology 2023-2027")
|
| 78 |
-
- Some class info includes year ranges like "2023-2027" or "2022-2026"
|
| 79 |
-
- Mobile numbers are typically 11 digits starting with 0
|
| 80 |
-
- Roll numbers are typically 8 digits (e.g., 00234429)
|
| 81 |
-
|
| 82 |
-
6. For each student, extract:
|
| 83 |
-
- name: The student's full name (first column)
|
| 84 |
-
- roll_no: The roll number (typically 8 digits)
|
| 85 |
-
- class_name: The full class/program name
|
| 86 |
-
- section: The section (Evening, Morning, A, B, etc.) - use empty string if not present
|
| 87 |
-
- mobile: The mobile number - use empty string if not present
|
| 88 |
-
|
| 89 |
-
7. Return ALL students in the JSON format specified in the output schema
|
| 90 |
-
|
| 91 |
-
8. Be thorough - if the text contains 100+ students, extract all of them
|
| 92 |
-
|
| 93 |
-
EXAMPLE INPUT:
|
| 94 |
Name Roll No. Class Section Mobile
|
| 95 |
-
|
| 96 |
-
Noor Fatima 00243403 Bs IR 3rd sem 03010071997
|
| 97 |
|
| 98 |
-
|
|
|
|
| 99 |
{
|
| 100 |
"students": [
|
| 101 |
{
|
| 102 |
-
"name": "
|
| 103 |
-
"roll_no": "
|
| 104 |
-
"class_name": "
|
| 105 |
-
"section": "
|
| 106 |
-
"mobile": "
|
| 107 |
-
},
|
| 108 |
-
{
|
| 109 |
-
"name": "Noor Fatima",
|
| 110 |
-
"roll_no": "00243403",
|
| 111 |
-
"class_name": "Bs IR",
|
| 112 |
-
"section": "3rd sem",
|
| 113 |
-
"mobile": "03010071997"
|
| 114 |
}
|
| 115 |
-
]
|
| 116 |
-
"total_extracted": 2
|
| 117 |
}
|
| 118 |
-
|
| 119 |
-
IMPORTANT: Extract EVERY single student record. Do not truncate or summarize.
|
| 120 |
""",
|
| 121 |
output_type=ExtractResponse,
|
| 122 |
-
model_settings=ModelSettings(
|
| 123 |
-
temperature=0.1, # Lower temperature for more deterministic extraction
|
| 124 |
-
top_p=0.9,
|
| 125 |
-
max_tokens=16000 # Increased for large responses
|
| 126 |
-
)
|
| 127 |
)
|
| 128 |
|
| 129 |
runner = Runner()
|
|
@@ -132,7 +85,7 @@ runner = Runner()
|
|
| 132 |
def download_and_extract_text(pdf_url: str) -> str:
|
| 133 |
"""Downloads a PDF from Cloudinary and extracts text"""
|
| 134 |
print(f"📥 Downloading PDF from: {pdf_url}")
|
| 135 |
-
response = requests.get(pdf_url
|
| 136 |
response.raise_for_status()
|
| 137 |
|
| 138 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
|
|
@@ -140,115 +93,28 @@ def download_and_extract_text(pdf_url: str) -> str:
|
|
| 140 |
tmp_path = tmp.name
|
| 141 |
|
| 142 |
doc = fitz.open(tmp_path)
|
| 143 |
-
|
| 144 |
-
text_parts = []
|
| 145 |
-
for page_num, page in enumerate(doc, 1):
|
| 146 |
-
page_text = page.get_text("text")
|
| 147 |
-
text_parts.append(f"\n--- Page {page_num} ---\n{page_text}")
|
| 148 |
-
|
| 149 |
-
text = "\n".join(text_parts)
|
| 150 |
doc.close()
|
| 151 |
os.remove(tmp_path)
|
| 152 |
-
|
| 153 |
-
print(f"✅ PDF text extracted successfully. Total length: {len(text)} characters")
|
| 154 |
-
print(f"📄 Total pages processed: {len(doc)}")
|
| 155 |
return text
|
| 156 |
|
| 157 |
|
| 158 |
async def extract_from_text(text: str) -> dict:
|
| 159 |
"""Runs the agent to extract structured data"""
|
| 160 |
print(f"📄 Extracting from {len(text)} characters...")
|
| 161 |
-
print(f"🔍 Estimated student count (based on 'Roll No.' occurrences): {text.count('00')}")
|
| 162 |
-
|
| 163 |
resp = await runner.run(
|
| 164 |
student_agent,
|
| 165 |
-
|
| 166 |
session=SQLiteSession("student_trace.db")
|
| 167 |
)
|
| 168 |
|
| 169 |
if hasattr(resp, "output"):
|
| 170 |
-
|
| 171 |
elif hasattr(resp, "final_output"):
|
| 172 |
-
|
| 173 |
-
else:
|
| 174 |
-
result = {"students": [], "total_extracted": 0}
|
| 175 |
-
|
| 176 |
-
# Set total_extracted if not set by agent
|
| 177 |
-
if "total_extracted" not in result or result["total_extracted"] == 0:
|
| 178 |
-
result["total_extracted"] = len(result.get("students", []))
|
| 179 |
-
|
| 180 |
-
print(f"✅ Extraction complete. Found {result['total_extracted']} students")
|
| 181 |
-
return result
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
def chunk_text(text: str, chunk_size: int = 15000) -> List[str]:
|
| 185 |
-
"""
|
| 186 |
-
Splits text into chunks for processing large PDFs.
|
| 187 |
-
Tries to split at page boundaries or double newlines.
|
| 188 |
-
"""
|
| 189 |
-
if len(text) <= chunk_size:
|
| 190 |
-
return [text]
|
| 191 |
-
|
| 192 |
-
chunks = []
|
| 193 |
-
current_chunk = ""
|
| 194 |
-
|
| 195 |
-
# Split by pages first
|
| 196 |
-
pages = text.split("--- Page")
|
| 197 |
-
|
| 198 |
-
for page in pages:
|
| 199 |
-
if not page.strip():
|
| 200 |
-
continue
|
| 201 |
-
|
| 202 |
-
page_text = "--- Page" + page if not page.startswith("--- Page") else page
|
| 203 |
-
|
| 204 |
-
if len(current_chunk) + len(page_text) <= chunk_size:
|
| 205 |
-
current_chunk += page_text
|
| 206 |
-
else:
|
| 207 |
-
if current_chunk:
|
| 208 |
-
chunks.append(current_chunk)
|
| 209 |
-
current_chunk = page_text
|
| 210 |
-
|
| 211 |
-
if current_chunk:
|
| 212 |
-
chunks.append(current_chunk)
|
| 213 |
-
|
| 214 |
-
return chunks
|
| 215 |
|
| 216 |
-
|
| 217 |
-
async def extract_from_large_text(text: str) -> dict:
|
| 218 |
-
"""
|
| 219 |
-
Handles extraction from large PDFs by chunking if necessary
|
| 220 |
-
"""
|
| 221 |
-
# If text is small enough, process directly
|
| 222 |
-
if len(text) < 30000:
|
| 223 |
-
return await extract_from_text(text)
|
| 224 |
-
|
| 225 |
-
print(f"📚 Large PDF detected. Chunking for processing...")
|
| 226 |
-
chunks = chunk_text(text, chunk_size=20000)
|
| 227 |
-
print(f"📦 Split into {len(chunks)} chunks")
|
| 228 |
-
|
| 229 |
-
all_students = []
|
| 230 |
-
|
| 231 |
-
for i, chunk in enumerate(chunks, 1):
|
| 232 |
-
print(f"🔄 Processing chunk {i}/{len(chunks)}...")
|
| 233 |
-
result = await extract_from_text(chunk)
|
| 234 |
-
chunk_students = result.get("students", [])
|
| 235 |
-
all_students.extend(chunk_students)
|
| 236 |
-
print(f" Found {len(chunk_students)} students in chunk {i}")
|
| 237 |
-
|
| 238 |
-
# Deduplicate based on roll_no
|
| 239 |
-
seen_rolls = set()
|
| 240 |
-
unique_students = []
|
| 241 |
-
for student in all_students:
|
| 242 |
-
if student["roll_no"] and student["roll_no"] not in seen_rolls:
|
| 243 |
-
seen_rolls.add(student["roll_no"])
|
| 244 |
-
unique_students.append(student)
|
| 245 |
-
|
| 246 |
-
print(f"✅ Total unique students after deduplication: {len(unique_students)}")
|
| 247 |
-
|
| 248 |
-
return {
|
| 249 |
-
"students": unique_students,
|
| 250 |
-
"total_extracted": len(unique_students)
|
| 251 |
-
}
|
| 252 |
|
| 253 |
# ---------------- FastAPI Endpoint ----------------
|
| 254 |
@app.post("/extract-student")
|
|
@@ -256,66 +122,15 @@ async def extract_student(req: PDFRequest):
|
|
| 256 |
"""
|
| 257 |
Accepts a Cloudinary PDF URL,
|
| 258 |
downloads it, extracts text, and returns structured student data.
|
| 259 |
-
Handles large PDFs with 200+ students.
|
| 260 |
"""
|
| 261 |
try:
|
| 262 |
text = download_and_extract_text(req.pdfUrl)
|
| 263 |
-
structured = await
|
| 264 |
-
|
| 265 |
return {
|
| 266 |
"success": True,
|
| 267 |
"pdfUrl": req.pdfUrl,
|
| 268 |
-
"total_students": structured.get("total_extracted", 0),
|
| 269 |
"structured": structured,
|
| 270 |
-
"raw_text_length": len(text),
|
| 271 |
"raw_text_preview": text[:800] # trimmed preview
|
| 272 |
}
|
| 273 |
except Exception as e:
|
| 274 |
-
|
| 275 |
-
import traceback
|
| 276 |
-
traceback.print_exc()
|
| 277 |
-
return {
|
| 278 |
-
"success": False,
|
| 279 |
-
"error": str(e),
|
| 280 |
-
"error_type": type(e).__name__
|
| 281 |
-
}
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
# Optional: Add a test endpoint for debugging
|
| 285 |
-
@app.post("/test-extract")
|
| 286 |
-
async def test_extract(req: PDFRequest):
|
| 287 |
-
"""
|
| 288 |
-
Test endpoint that shows more debugging information
|
| 289 |
-
"""
|
| 290 |
-
try:
|
| 291 |
-
text = download_and_extract_text(req.pdfUrl)
|
| 292 |
-
|
| 293 |
-
# Count potential student records
|
| 294 |
-
lines = text.split('\n')
|
| 295 |
-
potential_students = [line for line in lines if any(char.isdigit() for char in line)]
|
| 296 |
-
|
| 297 |
-
structured = await extract_from_large_text(text)
|
| 298 |
-
|
| 299 |
-
return {
|
| 300 |
-
"success": True,
|
| 301 |
-
"pdfUrl": req.pdfUrl,
|
| 302 |
-
"total_students_extracted": structured.get("total_extracted", 0),
|
| 303 |
-
"text_length": len(text),
|
| 304 |
-
"total_lines": len(lines),
|
| 305 |
-
"lines_with_numbers": len(potential_students),
|
| 306 |
-
"first_10_students": structured.get("students", [])[:10],
|
| 307 |
-
"raw_text_preview": text[:1500]
|
| 308 |
-
}
|
| 309 |
-
except Exception as e:
|
| 310 |
-
print(f"❌ Error: {str(e)}")
|
| 311 |
-
import traceback
|
| 312 |
-
traceback.print_exc()
|
| 313 |
-
return {
|
| 314 |
-
"success": False,
|
| 315 |
-
"error": str(e)
|
| 316 |
-
}
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
if __name__ == "__main__":
|
| 320 |
-
import uvicorn
|
| 321 |
-
uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
| 44 |
|
| 45 |
class ExtractResponse(BaseModel):
|
| 46 |
students: List[StudentRecord] = Field(default_factory=list)
|
|
|
|
| 47 |
|
| 48 |
class PDFRequest(BaseModel):
|
| 49 |
pdfUrl: str
|
|
|
|
| 53 |
name="StudentPDFExtractor",
|
| 54 |
model=Model,
|
| 55 |
instructions="""
|
| 56 |
+
You are a precise data extraction agent. Read the provided text extracted from a student report PDF and return structured student data.
|
| 57 |
|
| 58 |
+
The PDF text typically includes:
|
| 59 |
+
Student Data Report - hyderabad sspo
|
| 60 |
+
Generated on: 10/24/2025
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
Name Roll No. Class Section Mobile
|
| 62 |
+
John Doe 05738999 12 A 09338488484848388
|
|
|
|
| 63 |
|
| 64 |
+
Ignore headers like 'Student Data Report' and 'Generated on:'.
|
| 65 |
+
Return all students in JSON with this schema:
|
| 66 |
{
|
| 67 |
"students": [
|
| 68 |
{
|
| 69 |
+
"name": "string",
|
| 70 |
+
"roll_no": "string",
|
| 71 |
+
"class_name": "string",
|
| 72 |
+
"section": "string",
|
| 73 |
+
"mobile": "string"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
}
|
| 75 |
+
]
|
|
|
|
| 76 |
}
|
|
|
|
|
|
|
| 77 |
""",
|
| 78 |
output_type=ExtractResponse,
|
| 79 |
+
model_settings=ModelSettings(temperature=0.2, top_p=0.85)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
)
|
| 81 |
|
| 82 |
runner = Runner()
|
|
|
|
| 85 |
def download_and_extract_text(pdf_url: str) -> str:
|
| 86 |
"""Downloads a PDF from Cloudinary and extracts text"""
|
| 87 |
print(f"📥 Downloading PDF from: {pdf_url}")
|
| 88 |
+
response = requests.get(pdf_url)
|
| 89 |
response.raise_for_status()
|
| 90 |
|
| 91 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
|
|
|
|
| 93 |
tmp_path = tmp.name
|
| 94 |
|
| 95 |
doc = fitz.open(tmp_path)
|
| 96 |
+
text = "\n".join(page.get_text("text") for page in doc)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
doc.close()
|
| 98 |
os.remove(tmp_path)
|
| 99 |
+
print("✅ PDF text extracted successfully.")
|
|
|
|
|
|
|
| 100 |
return text
|
| 101 |
|
| 102 |
|
| 103 |
async def extract_from_text(text: str) -> dict:
|
| 104 |
"""Runs the agent to extract structured data"""
|
| 105 |
print(f"📄 Extracting from {len(text)} characters...")
|
|
|
|
|
|
|
| 106 |
resp = await runner.run(
|
| 107 |
student_agent,
|
| 108 |
+
text, # ✅ plain text only
|
| 109 |
session=SQLiteSession("student_trace.db")
|
| 110 |
)
|
| 111 |
|
| 112 |
if hasattr(resp, "output"):
|
| 113 |
+
return resp.output.model_dump()
|
| 114 |
elif hasattr(resp, "final_output"):
|
| 115 |
+
return resp.final_output.model_dump()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
+
return {"students": []}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
# ---------------- FastAPI Endpoint ----------------
|
| 120 |
@app.post("/extract-student")
|
|
|
|
| 122 |
"""
|
| 123 |
Accepts a Cloudinary PDF URL,
|
| 124 |
downloads it, extracts text, and returns structured student data.
|
|
|
|
| 125 |
"""
|
| 126 |
try:
|
| 127 |
text = download_and_extract_text(req.pdfUrl)
|
| 128 |
+
structured = await extract_from_text(text)
|
|
|
|
| 129 |
return {
|
| 130 |
"success": True,
|
| 131 |
"pdfUrl": req.pdfUrl,
|
|
|
|
| 132 |
"structured": structured,
|
|
|
|
| 133 |
"raw_text_preview": text[:800] # trimmed preview
|
| 134 |
}
|
| 135 |
except Exception as e:
|
| 136 |
+
return {"success": False, "error": str(e)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|