Spaces:

Tahasaif3
/

DataExtract

Sleeping

App Files Files Community

DataExtract / main.py

Tahasaif3

Update main.py

ad56575 verified 5 months ago

raw

history blame contribute delete

3.9 kB

	import os
	import fitz # PyMuPDF
	import tempfile
	import requests
	from typing import List
	from fastapi import FastAPI
	from pydantic import BaseModel, Field
	from agents import (
	Agent,
	Runner,
	AsyncOpenAI,
	OpenAIChatCompletionsModel,
	set_tracing_disabled,
	SQLiteSession,
	ModelSettings
	)
	from dotenv import load_dotenv

	# ---------------- Setup ----------------
	# Load variables from a local .env file (if one exists) into the process
	# environment; this must run before the API key is read below.
	load_dotenv()
	# Switch off the agents SDK's tracing for this process.
	set_tracing_disabled(True)

	# Key for the Gemini endpoint; os.getenv returns None when the variable is unset.
	API_KEY = os.getenv("GEMINI_API_KEY")

	# Async OpenAI-style client pointed at Google's Generative Language
	# OpenAI-compatibility base URL instead of the default OpenAI endpoint.
	client_provider = AsyncOpenAI(
	api_key=API_KEY,
	base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
	)

	# Chat-completions model wrapper bound to the client above; passed to the
	# agent as its `model`.
	Model = OpenAIChatCompletionsModel(
	model="gemini-2.0-flash",
	openai_client=client_provider,
	)

	# FastAPI application object; the /extract-student route is registered on it.
	app = FastAPI(title="Student Data Extractor API")

	# ---------------- Schemas ----------------
	class StudentRecord(BaseModel):
	    # One student row. Every column is kept as text and falls back to an
	    # empty string when it is not supplied.
	    # (Plain comments are used instead of a docstring so the generated
	    # JSON schema stays exactly as before.)
	    name: str = Field("", description="Student's name")
	    roll_no: str = Field("", description="Roll number")
	    class_name: str = Field("", description="Class level")
	    section: str = Field("", description="Section letter")
	    mobile: str = Field("", description="Mobile number")

	class ExtractResponse(BaseModel):
	    # Envelope for the extraction result. A fresh empty list is created
	    # per instance when no students are supplied.
	    students: List[StudentRecord] = Field(
	        default_factory=list,
	    )

	class PDFRequest(BaseModel):
	    # Request body for the extraction endpoint.
	    # `pdfUrl` is required (no default) and is accepted as a plain string.
	    pdfUrl: str = Field(...)

	# ---------------- Agent Definition ----------------
	# Agent that receives text extracted from a student-report PDF and is asked
	# to return it in the ExtractResponse shape.
	# The `instructions` literal is runtime text handed to the agent: its wording
	# and whitespace are part of program behaviour, so edit it deliberately.
	student_agent = Agent(
	name="StudentPDFExtractor",
	model=Model,
	instructions="""
	You are a precise data extraction agent. Read the provided text extracted from a student report PDF and return structured student data.

	The PDF text typically includes:
	Student Data Report - hyderabad sspo
	Generated on: 10/24/2025
	Name Roll No. Class Section Mobile
	John Doe 05738999 12 A 09338488484848388

	Ignore headers like 'Student Data Report' and 'Generated on:'.
	Return all students in JSON with this schema:
	{
	"students": [
	{
	"name": "string",
	"roll_no": "string",
	"class_name": "string",
	"section": "string",
	"mobile": "string"
	}
	]
	}
	""",
	# Pydantic model the run's final output is expected to conform to.
	output_type=ExtractResponse,
	# Sampling settings: temperature 0.2 and top_p 0.85.
	model_settings=ModelSettings(temperature=0.2, top_p=0.85)
	)

	# Module-level Runner instance; extract_from_text awaits runner.run(...) on
	# it to execute student_agent.
	runner = Runner()

	# ---------------- Helper Functions ----------------
	def download_and_extract_text(pdf_url: str, timeout: float = 30.0) -> str:
	    """Download a PDF over HTTP and return the text of all its pages.

	    The response body is parsed directly from memory, so no temporary file
	    is created and nothing is left on disk when parsing fails.

	    Args:
	        pdf_url: Address of the PDF to fetch.
	        timeout: Seconds to wait on the server before giving up. Without a
	            timeout a stalled server would block the caller indefinitely.

	    Returns:
	        The plain text of every page, joined with newlines.

	    Raises:
	        requests.RequestException: If the download fails, times out, or
	            the server answers with an error status.
	        Exception: Any error PyMuPDF raises when the payload cannot be
	            opened as a PDF (exact type depends on the PyMuPDF version).
	    """
	    # NOTE(review): pdf_url arrives unvalidated from the request body, so
	    # this server will fetch whatever address the client names. Consider an
	    # allow-list of hosts if the service is reachable by untrusted callers.
	    print(f"📥 Downloading PDF from: {pdf_url}")
	    response = requests.get(pdf_url, timeout=timeout)
	    response.raise_for_status()

	    # Open from the in-memory bytes; the context manager closes the
	    # document even if text extraction raises part-way through.
	    with fitz.open(stream=response.content, filetype="pdf") as doc:
	        text = "\n".join(page.get_text("text") for page in doc)

	    print("✅ PDF text extracted successfully.")
	    return text


	async def extract_from_text(text: str) -> dict:
	    """Run the extraction agent on PDF text and return its output as a dict.

	    Args:
	        text: Plain text previously extracted from the PDF.

	    Returns:
	        The agent's structured output converted with ``model_dump()``, or
	        ``{"students": []}`` when the run produced no output object.

	    Raises:
	        Exception: Anything raised by the agent run itself is propagated
	            to the caller.
	    """
	    print(f"📄 Extracting from {len(text)} characters...")
	    # NOTE(review): a new SQLiteSession is built on every call with the
	    # fixed argument "student_trace.db". Confirm against the agents SDK
	    # whether that argument is a session id or a database path, and
	    # whether history should really be shared between unrelated PDFs.
	    resp = await runner.run(
	        student_agent,
	        text,  # plain text only
	        session=SQLiteSession("student_trace.db"),
	    )

	    # Probe both attribute names the result may expose, in the original
	    # order. An attribute that exists but holds None is skipped rather than
	    # dereferenced, so the empty fallback below is actually reachable.
	    for attr in ("output", "final_output"):
	        result = getattr(resp, attr, None)
	        if result is not None:
	            return result.model_dump()

	    return {"students": []}

	# ---------------- FastAPI Endpoint ----------------
	@app.post("/extract-student")
	async def extract_student(req: PDFRequest):
	    """Download the PDF at ``req.pdfUrl`` and return its student records.

	    Args:
	        req: Request body carrying the PDF address in ``pdfUrl``.

	    Returns:
	        On success, a dict with ``success`` set to True, the echoed
	        ``pdfUrl``, the agent output under ``structured`` and the first 800
	        characters of the extracted text under ``raw_text_preview``.
	        On any failure, a dict with ``success`` set to False and the
	        exception message under ``error``.
	    """
	    import asyncio  # stdlib; imported here so the module import list is untouched

	    try:
	        # The download and PDF parsing are synchronous. Running them in the
	        # default executor keeps the event loop free to serve other
	        # requests while this one waits on the network.
	        loop = asyncio.get_running_loop()
	        text = await loop.run_in_executor(
	            None, download_and_extract_text, req.pdfUrl
	        )
	        structured = await extract_from_text(text)
	    except Exception as e:
	        # Endpoint boundary: clients rely on the success/error envelope, so
	        # failures are reported in the body instead of being raised.
	        return {"success": False, "error": str(e)}

	    return {
	        "success": True,
	        "pdfUrl": req.pdfUrl,
	        "structured": structured,
	        "raw_text_preview": text[:800],  # trimmed preview
	    }