"""FastAPI service that turns a student-report PDF into structured records.

The single endpoint downloads a PDF from a URL, extracts its plain text with
PyMuPDF, and asks an LLM agent to return the students it finds as JSON
matching :class:`ExtractResponse`.
"""

import asyncio
import logging
import os
import tempfile
from typing import List

import fitz  # PyMuPDF
import requests
from agents import (
    Agent,
    AsyncOpenAI,
    ModelSettings,
    OpenAIChatCompletionsModel,
    Runner,
    SQLiteSession,
    set_tracing_disabled,
)
from dotenv import load_dotenv
from fastapi import FastAPI
from pydantic import BaseModel, Field

logger = logging.getLogger(__name__)

# ---------------- Setup ----------------
load_dotenv()
set_tracing_disabled(True)

API_KEY = os.getenv("GEMINI_API_KEY")

# The agent talks to Gemini through its OpenAI-compatible endpoint.
client_provider = AsyncOpenAI(
    api_key=API_KEY,
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
)

Model = OpenAIChatCompletionsModel(
    model="gemini-2.0-flash",
    openai_client=client_provider,
)

app = FastAPI(title="Student Data Extractor API")

# Seconds `requests` may wait to connect and between received bytes before
# giving up; without it a stalled server would hang the request forever.
DOWNLOAD_TIMEOUT_SECONDS = 30


# ---------------- Schemas ----------------
class StudentRecord(BaseModel):
    """One student row extracted from the report; every field defaults to ""."""

    name: str = Field(default="", description="Student's name")
    roll_no: str = Field(default="", description="Roll number")
    class_name: str = Field(default="", description="Class level")
    section: str = Field(default="", description="Section letter")
    mobile: str = Field(default="", description="Mobile number")


class ExtractResponse(BaseModel):
    """Structured agent output: the list of students found in the text."""

    students: List[StudentRecord] = Field(default_factory=list)


class PDFRequest(BaseModel):
    """Request body for ``/extract-student``."""

    pdfUrl: str


# ---------------- Agent Definition ----------------
# The prompt text is assembled from adjacent literals purely for readability;
# the resulting string is unchanged, so the agent sees the same instructions.
_AGENT_INSTRUCTIONS = (
    " You are a precise data extraction agent. "
    "Read the provided text extracted from a student report PDF "
    "and return structured student data. "
    "The PDF text typically includes: "
    "Student Data Report - hyderabad sspo "
    "Generated on: 10/24/2025 "
    "Name Roll No. Class Section Mobile "
    "John Doe 05738999 12 A 09338488484848388 "
    "Ignore headers like 'Student Data Report' and 'Generated on:'. \n"
    "Return all students in JSON with this schema: "
    '{ "students": [ { "name": "string", "roll_no": "string", '
    '"class_name": "string", "section": "string", "mobile": "string" } ] } '
)

student_agent = Agent(
    name="StudentPDFExtractor",
    model=Model,
    instructions=_AGENT_INSTRUCTIONS,
    output_type=ExtractResponse,
    model_settings=ModelSettings(temperature=0.2, top_p=0.85),
)

runner = Runner()


# ---------------- Helper Functions ----------------
def download_and_extract_text(
    pdf_url: str, timeout: float = DOWNLOAD_TIMEOUT_SECONDS
) -> str:
    """Download the PDF at *pdf_url* and return its text, pages joined by newlines.

    This function blocks (network I/O and PDF parsing); call it from a worker
    thread when inside an event loop.

    Args:
        pdf_url: URL of the PDF to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        The plain text of every page, separated by ``"\\n"``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        Exception: Whatever PyMuPDF raises if the payload cannot be opened.
    """
    print(f"📥 Downloading PDF from: {pdf_url}")
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()

    # delete=False so the file can be reopened by path after we close our
    # handle; the `finally` below guarantees it is removed on every path.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        with tmp:
            tmp.write(response.content)

        doc = fitz.open(tmp.name)
        try:
            text = "\n".join(page.get_text("text") for page in doc)
        finally:
            # Close before removing the file (required on Windows).
            doc.close()
    finally:
        os.remove(tmp.name)

    print("✅ PDF text extracted successfully.")
    return text


async def extract_from_text(text: str) -> dict:
    """Run the extraction agent on *text* and return its output as a dict.

    Args:
        text: Plain text previously extracted from the PDF.

    Returns:
        The agent's structured output dumped to a dict, or
        ``{"students": []}`` when the run result carries no output.
    """
    print(f"📄 Extracting from {len(text)} characters...")

    # NOTE(review): the first positional argument of SQLiteSession looks like
    # a session id rather than a database path — confirm against the SDK. If
    # this session is ever made persistent, earlier PDFs would be replayed
    # into later extractions, which is probably not wanted here.
    resp = await runner.run(
        student_agent,
        text,  # plain text only
        session=SQLiteSession("student_trace.db"),
    )

    # Accept either attribute name, and skip a missing/None value instead of
    # crashing on `None.model_dump()`.
    for attr in ("output", "final_output"):
        result = getattr(resp, attr, None)
        if result is not None:
            return result.model_dump()

    return {"students": []}


# ---------------- FastAPI Endpoint ----------------
@app.post("/extract-student")
async def extract_student(req: PDFRequest):
    """Download the PDF at ``req.pdfUrl`` and return the students found in it.

    On success the response contains ``success=True``, the echoed ``pdfUrl``,
    the ``structured`` extraction result, and the first 800 characters of the
    raw text as ``raw_text_preview``. On any failure it contains
    ``success=False`` and the error message; the HTTP status is not changed.
    """
    # NOTE(review): pdfUrl is fetched server-side exactly as supplied. If this
    # endpoint is reachable by untrusted callers, consider restricting the
    # allowed hosts to avoid server-side request forgery.
    try:
        # The download/parse step is blocking, so run it off the event loop
        # to keep other requests responsive.
        text = await asyncio.to_thread(download_and_extract_text, req.pdfUrl)
        structured = await extract_from_text(text)
    except Exception as e:
        # Keep the existing response contract, but record the traceback so
        # failures are diagnosable from the server logs.
        logger.exception("Student extraction failed for %s", req.pdfUrl)
        return {"success": False, "error": str(e)}

    return {
        "success": True,
        "pdfUrl": req.pdfUrl,
        "structured": structured,
        "raw_text_preview": text[:800],  # trimmed preview
    }