# Spaces: Sleeping
import asyncio
import logging
import os
import tempfile
from typing import List

import fitz  # PyMuPDF
import requests
from agents import (
    Agent,
    AsyncOpenAI,
    ModelSettings,
    OpenAIChatCompletionsModel,
    Runner,
    SQLiteSession,
    set_tracing_disabled,
)
from dotenv import load_dotenv
from fastapi import FastAPI
from pydantic import BaseModel, Field
# ---------------- Setup ----------------
# Load environment configuration first so the key lookup below can see it,
# then turn agent tracing off for this process.
load_dotenv()
set_tracing_disabled(True)

# None when GEMINI_API_KEY is absent from the environment.
API_KEY = os.environ.get("GEMINI_API_KEY")

# OpenAI-style async client aimed at the Gemini OpenAI-compatible endpoint.
client_provider = AsyncOpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=API_KEY,
)

# Chat-completions model wrapper bound to the client above.
Model = OpenAIChatCompletionsModel(
    openai_client=client_provider,
    model="gemini-2.0-flash",
)

app = FastAPI(title="Student Data Extractor API")
# ---------------- Schemas ----------------
# One student row taken from the report text. Every field is a string and
# defaults to "" when no value is supplied.
# (Documented with comments rather than a docstring so the generated schema
# description stays unchanged.)
class StudentRecord(BaseModel):
    name: str = Field("", description="Student's name")
    roll_no: str = Field("", description="Roll number")
    class_name: str = Field("", description="Class level")
    section: str = Field("", description="Section letter")
    mobile: str = Field("", description="Mobile number")
# Container for the extraction result: a list of student rows, empty by default.
class ExtractResponse(BaseModel):
    # default_factory gives each instance its own fresh list.
    students: List[StudentRecord] = Field(
        default_factory=list,
    )
# Request body for the extraction endpoint.
class PDFRequest(BaseModel):
    # Location of the PDF to download; camelCase is the wire-format key.
    pdfUrl: str
# ---------------- Agent Definition ----------------
# Prompt handed to the agent: describes the report layout, which header lines
# to skip, and the JSON shape to return.
_EXTRACTION_INSTRUCTIONS = """
You are a precise data extraction agent. Read the provided text extracted from a student report PDF and return structured student data.
The PDF text typically includes:
Student Data Report - hyderabad sspo
Generated on: 10/24/2025
Name Roll No. Class Section Mobile
John Doe 05738999 12 A 09338488484848388
Ignore headers like 'Student Data Report' and 'Generated on:'.
Return all students in JSON with this schema:
{
"students": [
{
"name": "string",
"roll_no": "string",
"class_name": "string",
"section": "string",
"mobile": "string"
}
]
}
"""

# Agent that turns raw report text into an ExtractResponse.
student_agent = Agent(
    name="StudentPDFExtractor",
    instructions=_EXTRACTION_INSTRUCTIONS,
    model=Model,
    model_settings=ModelSettings(
        temperature=0.2,
        top_p=0.85,
    ),
    output_type=ExtractResponse,
)

# Module-level runner used by extract_from_text.
runner = Runner()
# ---------------- Helper Functions ----------------
def download_and_extract_text(pdf_url: str, timeout: float = 30.0) -> str:
    """Download the PDF at ``pdf_url`` and return the text of all its pages.

    The document is opened directly from the downloaded bytes, so no
    temporary file is written and nothing is left on disk if parsing fails.
    This call is blocking (network + parsing).

    Args:
        pdf_url: HTTP(S) location of the PDF.
        timeout: Seconds passed to ``requests.get`` as its timeout; without
            one a stalled server would block the caller indefinitely.

    Returns:
        The plain text of every page, joined with newlines.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status (via ``raise_for_status``).
        Exception: Whatever PyMuPDF raises when the payload cannot be opened
            as a PDF.
    """
    print(f"π₯ Downloading PDF from: {pdf_url}")
    # NOTE(review): pdf_url is caller-supplied and fetched as-is; consider an
    # allow-list of hosts if this service can reach internal networks.
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()

    # The context manager closes the document even if text extraction raises.
    with fitz.open(stream=response.content, filetype="pdf") as doc:
        text = "\n".join(page.get_text("text") for page in doc)

    print("β PDF text extracted successfully.")
    return text
async def extract_from_text(text: str) -> dict:
    """Run the extraction agent on ``text`` and return its result as a dict.

    Args:
        text: Plain text previously extracted from the PDF.

    Returns:
        The agent's structured output dumped to a dict, or
        ``{"students": []}`` when the run produced no dumpable result.

    Raises:
        Exception: Anything raised by the agent run itself is propagated.
    """
    print(f"π Extracting from {len(text)} characters...")
    # NOTE(review): the string below is given as SQLiteSession's first
    # positional argument; confirm whether that parameter is a session id or
    # a database path, since the value looks like a file name.
    resp = await runner.run(
        student_agent,
        text,  # plain text only
        session=SQLiteSession("student_trace.db"),
    )

    # Inspect the attribute values rather than mere attribute presence: an
    # attribute that exists but holds None (or something without model_dump)
    # must fall through to the empty payload instead of raising.
    for attr in ("output", "final_output"):
        dump = getattr(getattr(resp, attr, None), "model_dump", None)
        if callable(dump):
            return dump()
    return {"students": []}
# ---------------- FastAPI Endpoint ----------------
# NOTE(review): no route decorator is visible on this function in this view;
# confirm it is registered on `app`.
async def extract_student(req: PDFRequest):
    """Download the requested PDF, extract its text and return student data.

    Args:
        req: Request body carrying ``pdfUrl``, the location of the PDF.

    Returns:
        On success: ``{"success": True, "pdfUrl", "structured",
        "raw_text_preview"}``, where the preview is the first 800 characters
        of the extracted text.
        On any failure: ``{"success": False, "error": <message>}``.
    """
    try:
        # The download/parse helper is blocking; run it in a worker thread so
        # the event loop can keep serving other requests meanwhile.
        text = await asyncio.to_thread(download_and_extract_text, req.pdfUrl)
        structured = await extract_from_text(text)
    except Exception as e:
        # Boundary handler: keep the JSON error contract, but record the
        # traceback so failures stay diagnosable.
        logging.getLogger(__name__).exception(
            "Student extraction failed for %s", req.pdfUrl
        )
        return {"success": False, "error": str(e)}

    return {
        "success": True,
        "pdfUrl": req.pdfUrl,
        "structured": structured,
        "raw_text_preview": text[:800],  # trimmed preview
    }