import os
import fitz # PyMuPDF
import tempfile
import requests
from typing import List
from fastapi import FastAPI
from pydantic import BaseModel, Field
from agents import (
Agent,
Runner,
AsyncOpenAI,
OpenAIChatCompletionsModel,
set_tracing_disabled,
SQLiteSession,
ModelSettings
)
from dotenv import load_dotenv
# ---------------- Setup ----------------
# Load variables from a .env file (python-dotenv) before the environment is
# read below.
load_dotenv()
# Switch off the Agents SDK's tracing for this process.
set_tracing_disabled(True)

# Gemini API key; os.getenv returns None when GEMINI_API_KEY is not set.
API_KEY = os.getenv("GEMINI_API_KEY")

# OpenAI-style async client pointed at Google's OpenAI-compatible Gemini
# endpoint instead of the default OpenAI base URL.
client_provider = AsyncOpenAI(
    api_key=API_KEY,
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
)

# Chat-completions model wrapper that sends requests through the client above.
Model = OpenAIChatCompletionsModel(
    model="gemini-2.0-flash",
    openai_client=client_provider,
)

# FastAPI application object; routes are registered on it further down.
app = FastAPI(title="Student Data Extractor API")
# ---------------- Schemas ----------------
# One student row. Every field is a string that defaults to "", so a record
# with a missing value still validates.
# (Kept as comments rather than a docstring: a class docstring would be added
# to the JSON schema pydantic generates for this model.)
class StudentRecord(BaseModel):
    name: str = Field(default="", description="Student's name")
    roll_no: str = Field(default="", description="Roll number")
    class_name: str = Field(default="", description="Class level")
    section: str = Field(default="", description="Section letter")
    mobile: str = Field(default="", description="Mobile number")
# Container for the extracted rows; used as the agent's output_type.
class ExtractResponse(BaseModel):
    # default_factory=list gives each instance its own empty list.
    students: List[StudentRecord] = Field(default_factory=list)
# Request body for POST /extract-student.
class PDFRequest(BaseModel):
    # URL the PDF is downloaded from; required, no default.
    pdfUrl: str
# ---------------- Agent Definition ----------------
# Agent that turns raw report text into an ExtractResponse.
student_agent = Agent(
    name="StudentPDFExtractor",
    model=Model,
    # The prompt below is sent to the model verbatim; its text (including the
    # sample row and the JSON shape) is runtime data, not documentation.
    instructions="""
You are a precise data extraction agent. Read the provided text extracted from a student report PDF and return structured student data.
The PDF text typically includes:
Student Data Report - hyderabad sspo
Generated on: 10/24/2025
Name Roll No. Class Section Mobile
John Doe 05738999 12 A 09338488484848388
Ignore headers like 'Student Data Report' and 'Generated on:'.
Return all students in JSON with this schema:
{
"students": [
{
"name": "string",
"roll_no": "string",
"class_name": "string",
"section": "string",
"mobile": "string"
}
]
}
""",
    # Structured output type requested from the SDK for this agent.
    output_type=ExtractResponse,
    # Low temperature / narrowed top_p — presumably to keep extraction output
    # stable between runs; confirm the values are intentional.
    model_settings=ModelSettings(temperature=0.2, top_p=0.85)
)

# Shared Runner instance used by extract_from_text.
# NOTE(review): Runner's run methods look like classmethods in the Agents SDK,
# so this instance may be unnecessary — confirm before removing.
runner = Runner()
# ---------------- Helper Functions ----------------
def download_and_extract_text(pdf_url: str, timeout: float = 30.0) -> str:
    """Download a PDF over HTTP and return its plain text.

    The response body is handed to PyMuPDF directly from memory, so no
    temporary file is created and nothing is left on disk if parsing fails.

    Args:
        pdf_url: URL of the PDF to fetch (callers pass a Cloudinary URL).
        timeout: Seconds to wait on the server before giving up. Without an
            explicit timeout ``requests`` can block indefinitely.

    Returns:
        The text of every page, joined with newlines.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).

    Any error PyMuPDF raises while opening or reading the document propagates
    unchanged.
    """
    # NOTE(review): pdf_url arrives unvalidated from the request body; consider
    # restricting it to the expected host before fetching.
    print(f"📥 Downloading PDF from: {pdf_url}")
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()

    # The context manager closes the document even if text extraction raises.
    with fitz.open(stream=response.content, filetype="pdf") as doc:
        text = "\n".join(page.get_text("text") for page in doc)

    print("✅ PDF text extracted successfully.")
    return text
async def extract_from_text(text: str) -> dict:
    """Run the extraction agent over raw PDF text and return a plain dict.

    Args:
        text: Plain text previously extracted from the PDF.

    Returns:
        The agent's structured result dumped via ``model_dump()``, or
        ``{"students": []}`` when the run result carries no structured output
        (attribute missing, ``None``, or not a model).
    """
    print(f"🔍 Extracting from {len(text)} characters...")
    resp = await runner.run(
        student_agent,
        text,  # plain text only
        # NOTE(review): SQLiteSession's first positional argument appears to
        # be a session id, not a database path — confirm "student_trace.db"
        # is doing what was intended here.
        session=SQLiteSession("student_trace.db"),
    )

    # Same lookup order as before (`output`, then `final_output`), but skip a
    # candidate that is None or has no model_dump instead of raising, so the
    # empty fallback below is actually reachable.
    for attr in ("output", "final_output"):
        dump = getattr(getattr(resp, attr, None), "model_dump", None)
        if callable(dump):
            return dump()
    return {"students": []}
# ---------------- FastAPI Endpoint ----------------
@app.post("/extract-student")
async def extract_student(req: PDFRequest):
    """
    Fetch the PDF at ``req.pdfUrl``, extract its text, run the agent on it,
    and return the structured students plus a short preview of the raw text.

    Any exception is reported as ``{"success": False, "error": <message>}``
    rather than being raised.
    """
    # NOTE(review): download_and_extract_text is a synchronous call made from
    # an async handler — confirm blocking the event loop here is acceptable.
    try:
        raw_text = download_and_extract_text(req.pdfUrl)
        extracted = await extract_from_text(raw_text)
        payload = {
            "success": True,
            "pdfUrl": req.pdfUrl,
            "structured": extracted,
            "raw_text_preview": raw_text[:800],  # first 800 characters only
        }
    except Exception as exc:
        payload = {"success": False, "error": str(exc)}
    return payload
|