# DataExtract / main.py
# Tahasaif3's picture
# Update main.py
# ad56575 verified
import asyncio
import logging
import os
import tempfile
from typing import List

import fitz  # PyMuPDF
import requests
from agents import (
    Agent,
    Runner,
    AsyncOpenAI,
    OpenAIChatCompletionsModel,
    set_tracing_disabled,
    SQLiteSession,
    ModelSettings,
)
from dotenv import load_dotenv
from fastapi import FastAPI
from pydantic import BaseModel, Field
# ---------------- Setup ----------------
# Load variables from a .env file (if any) before the environment is read below.
load_dotenv()
# Disable the agents SDK's tracing for this process.
set_tracing_disabled(True)

# Credential for the Gemini endpoint; None when GEMINI_API_KEY is not set.
API_KEY = os.environ.get("GEMINI_API_KEY")

# OpenAI-style async client aimed at the Google generativelanguage
# "/openai/" base URL instead of the default OpenAI host.
client_provider = AsyncOpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=API_KEY,
)

# Chat-completions model wrapper that the extraction agent runs on.
Model = OpenAIChatCompletionsModel(
    openai_client=client_provider,
    model="gemini-2.0-flash",
)

# ASGI application object; routes are attached to it further down the file.
app = FastAPI(title="Student Data Extractor API")
# ---------------- Schemas ----------------
class StudentRecord(BaseModel):
    # One student row from the report. Every field is a string that defaults
    # to "" so a row with missing columns still validates.
    # (Kept as `#` comments rather than a docstring: a class docstring would
    # be emitted as the "description" of the generated JSON schema.)
    name: str = Field("", description="Student's name")
    roll_no: str = Field("", description="Roll number")
    class_name: str = Field("", description="Class level")
    section: str = Field("", description="Section letter")
    mobile: str = Field("", description="Mobile number")
class ExtractResponse(BaseModel):
    # Top-level extraction result: the list of student rows found in a PDF.
    # Used as the agent's `output_type`; defaults to an empty list (a fresh
    # list per instance via `default_factory`).
    students: List[StudentRecord] = Field(default_factory=list)
class PDFRequest(BaseModel):
    # Request body of POST /extract-student.
    # `pdfUrl` is the location of the PDF to download; it is accepted as a
    # plain string, so no URL validation happens at this layer.
    pdfUrl: str
# ---------------- Agent Definition ----------------
# Prompt given to the model: describes the expected report layout, which
# header lines to skip, and the JSON shape to answer with.
_EXTRACTION_INSTRUCTIONS = """
You are a precise data extraction agent. Read the provided text extracted from a student report PDF and return structured student data.
The PDF text typically includes:
Student Data Report - hyderabad sspo
Generated on: 10/24/2025
Name Roll No. Class Section Mobile
John Doe 05738999 12 A 09338488484848388
Ignore headers like 'Student Data Report' and 'Generated on:'.
Return all students in JSON with this schema:
{
"students": [
{
"name": "string",
"roll_no": "string",
"class_name": "string",
"section": "string",
"mobile": "string"
}
]
}
"""

# Sampling parameters passed to the model for every extraction run.
_EXTRACTION_SETTINGS = ModelSettings(top_p=0.85, temperature=0.2)

# Agent that turns raw report text into an ExtractResponse.
student_agent = Agent(
    name="StudentPDFExtractor",
    instructions=_EXTRACTION_INSTRUCTIONS,
    model=Model,
    model_settings=_EXTRACTION_SETTINGS,
    output_type=ExtractResponse,
)

# Shared runner instance used to execute the agent.
runner = Runner()
# ---------------- Helper Functions ----------------
def download_and_extract_text(pdf_url: str, timeout: float = 30.0) -> str:
    """Download a PDF over HTTP and return the plain text of all its pages.

    Args:
        pdf_url: Location of the PDF to fetch (the API endpoint passes a
            Cloudinary URL here).
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without it a stalled server would block the
            caller indefinitely.

    Returns:
        The text of every page, in page order, joined with newlines.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
        Exception: Whatever PyMuPDF raises when the payload cannot be opened
            as a PDF is propagated unchanged.
    """
    # NOTE(review): pdf_url is caller-supplied; if this service can reach
    # internal hosts, consider restricting the allowed hosts (SSRF).
    print(f"πŸ“₯ Downloading PDF from: {pdf_url}")
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()

    # Open the document straight from the downloaded bytes. This avoids the
    # temporary file entirely, so nothing is left on disk if parsing fails,
    # and the context manager closes the document even when get_text raises.
    with fitz.open(stream=response.content, filetype="pdf") as doc:
        text = "\n".join(page.get_text("text") for page in doc)

    print("βœ… PDF text extracted successfully.")
    return text
async def extract_from_text(text: str) -> dict:
    """Run the extraction agent on raw PDF text and return its result as a dict.

    Args:
        text: Plain text previously extracted from the PDF.

    Returns:
        The agent's structured output converted with ``model_dump()``. Falls
        back to ``{"students": []}`` when the run result carries no usable
        structured output.
    """
    print(f"πŸ“„ Extracting from {len(text)} characters...")
    # NOTE(review): the string passed to SQLiteSession is presumably the
    # session id rather than a database file path -- confirm against the
    # agents SDK, and confirm that reusing one id across requests is intended.
    resp = await runner.run(
        student_agent,
        text,  # plain text only
        session=SQLiteSession("student_trace.db"),
    )

    # Probe both attribute names the original code supported, in the same
    # order. Skip values that are missing, None, or not a model, so that such
    # a result reaches the empty fallback instead of raising AttributeError.
    for attr_name in ("output", "final_output"):
        payload = getattr(resp, attr_name, None)
        if payload is not None and hasattr(payload, "model_dump"):
            return payload.model_dump()

    return {"students": []}
# ---------------- FastAPI Endpoint ----------------
@app.post("/extract-student")
async def extract_student(req: PDFRequest):
    """
    Accepts a Cloudinary PDF URL,
    downloads it, extracts text, and returns structured student data.

    On success the response holds ``success``, ``pdfUrl``, ``structured`` and
    ``raw_text_preview``; on any failure it holds ``success: False`` and the
    error message.
    """
    try:
        # The download and PDF parsing are blocking calls; run them in a
        # worker thread so this coroutine does not stall the event loop for
        # every other request while the PDF is fetched.
        text = await asyncio.to_thread(download_and_extract_text, req.pdfUrl)
        structured = await extract_from_text(text)
    except Exception as e:
        # API boundary: keep the existing error payload for clients, but log
        # the traceback so failures are diagnosable on the server.
        logging.getLogger(__name__).exception(
            "Student extraction failed for %s", req.pdfUrl
        )
        return {"success": False, "error": str(e)}

    return {
        "success": True,
        "pdfUrl": req.pdfUrl,
        "structured": structured,
        "raw_text_preview": text[:800],  # trimmed preview
    }