Tahasaif3 committed on
Commit
8d736da
·
1 Parent(s): 8c03e3b
Files changed (2) hide show
  1. main.py +136 -0
  2. requirements.txt +7 -0
main.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import fitz # PyMuPDF
3
+ import tempfile
4
+ import requests
5
+ from typing import List
6
+ from fastapi import FastAPI
7
+ from pydantic import BaseModel, Field
8
+ from agents import (
9
+ Agent,
10
+ Runner,
11
+ AsyncOpenAI,
12
+ OpenAIChatCompletionsModel,
13
+ set_tracing_disabled,
14
+ SQLiteSession,
15
+ ModelSettings
16
+ )
17
+ from dotenv import load_dotenv
18
+
19
# ---------------- Setup ----------------
# Load variables from a .env file (if any) before the environment is read.
load_dotenv()
set_tracing_disabled(True)

# Gemini credential; None when GEMINI_API_KEY is not set in the environment.
API_KEY = os.environ.get("GEMINI_API_KEY")

# OpenAI-style async client pointed at Google's OpenAI-compatible endpoint.
client_provider = AsyncOpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=API_KEY,
)

# Chat-completions model wrapper that student_agent is configured with.
Model = OpenAIChatCompletionsModel(
    openai_client=client_provider,
    model="gemini-2.0-flash",
)

app = FastAPI(title="Student Data Extractor API")
36
+
37
+ # ---------------- Schemas ----------------
38
class StudentRecord(BaseModel):
    # One student row from the report. Every field is a string and falls
    # back to "" when it is not supplied.
    name: str = Field("", description="Student's name")
    roll_no: str = Field("", description="Roll number")
    class_name: str = Field("", description="Class level")
    section: str = Field("", description="Section letter")
    mobile: str = Field("", description="Mobile number")
44
+
45
class ExtractResponse(BaseModel):
    # Structured result of an extraction; `students` starts as a fresh empty
    # list when none are provided.
    students: List[StudentRecord] = Field(default_factory=list)
47
+
48
class PDFRequest(BaseModel):
    # Request body of POST /extract-student: the URL of the PDF to process.
    pdfUrl: str
50
+
51
+ # ---------------- Agent Definition ----------------
52
# Prompt given to the agent as its instructions: describes the expected
# report layout and the JSON shape to return.
_STUDENT_EXTRACTION_PROMPT = """
You are a precise data extraction agent. Read the provided text extracted from a student report PDF and return structured student data.

The PDF text typically includes:
Student Data Report - hyderabad sspo
Generated on: 10/24/2025
Name Roll No. Class Section Mobile
John Doe 05738999 12 A 09338488484848388

Ignore headers like 'Student Data Report' and 'Generated on:'.
Return all students in JSON with this schema:
{
"students": [
{
"name": "string",
"roll_no": "string",
"class_name": "string",
"section": "string",
"mobile": "string"
}
]
}
"""

# Agent that turns raw report text into an ExtractResponse.
student_agent = Agent(
    name="StudentPDFExtractor",
    instructions=_STUDENT_EXTRACTION_PROMPT,
    model=Model,
    model_settings=ModelSettings(temperature=0.2, top_p=0.85),
    output_type=ExtractResponse,
)

# Runner instance used by extract_from_text to execute the agent.
runner = Runner()
83
+
84
+ # ---------------- Helper Functions ----------------
85
def download_and_extract_text(pdf_url: str, timeout: float = 30.0) -> str:
    """Download a PDF over HTTP and return the plain text of all its pages.

    Args:
        pdf_url: URL of the PDF to fetch (the API expects a Cloudinary URL).
        timeout: Seconds to wait on the remote server before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        The text of every page, in page order, joined with newlines.

    Raises:
        requests.RequestException: The download failed, timed out, or the
            server answered with an HTTP error status.
        Exception: PyMuPDF could not open or read the downloaded bytes.
    """
    print(f"📥 Downloading PDF from: {pdf_url}")
    # NOTE(review): pdf_url is taken from the request body, so this fetches a
    # caller-chosen URL. Consider restricting it to trusted hosts (SSRF).
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()

    # Parse straight from memory: no temporary file is created, so nothing
    # is left on disk when opening or reading the document fails.
    doc = fitz.open(stream=response.content, filetype="pdf")
    try:
        text = "\n".join(page.get_text("text") for page in doc)
    finally:
        # Release the document even if text extraction raises.
        doc.close()

    print("✅ PDF text extracted successfully.")
    return text
101
+
102
+
103
async def extract_from_text(text: str) -> dict:
    """Run the extraction agent on report text and return its result as a dict.

    Args:
        text: Plain text extracted from a student report PDF.

    Returns:
        The agent's structured output dumped to a plain dict, or
        ``{"students": []}`` when the run produced no structured output.
    """
    print(f"📄 Extracting from {len(text)} characters...")
    # Each request is independent, so no session is attached. The earlier
    # SQLiteSession("student_trace.db") call passed the file name as the
    # session id, which opened a new in-memory database on every call that
    # was never closed and kept no history.
    resp = await runner.run(student_agent, text)

    # Same lookup order as before ("output", then "final_output"), but skip a
    # missing/None value or one that is not a pydantic model instead of
    # raising AttributeError.
    for attr in ("output", "final_output"):
        result = getattr(resp, attr, None)
        if result is not None and hasattr(result, "model_dump"):
            return result.model_dump()

    return {"students": []}
118
+
119
+ # ---------------- FastAPI Endpoint ----------------
120
@app.post("/extract-student")
async def extract_student(req: PDFRequest):
    """Download the PDF at ``req.pdfUrl`` and return structured student data.

    Args:
        req: Request body carrying the PDF URL (a Cloudinary URL is expected).

    Returns:
        On success: ``success=True``, the echoed ``pdfUrl``, the extracted
        ``structured`` data, and ``raw_text_preview`` (first 800 characters
        of the PDF text). On any failure: ``success=False`` and the error
        message; the error is reported in the body, not via an HTTP status.
    """
    # Function-scope import so this handler does not depend on a change to
    # the module's import block.
    import asyncio

    try:
        # The download/parse helper is blocking; run it in a worker thread so
        # the event loop can keep serving other requests meanwhile.
        text = await asyncio.to_thread(download_and_extract_text, req.pdfUrl)
        structured = await extract_from_text(text)
    except Exception as e:
        return {"success": False, "error": str(e)}

    return {
        "success": True,
        "pdfUrl": req.pdfUrl,
        "structured": structured,
        "raw_text_preview": text[:800],  # trimmed preview
    }
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ python-dotenv
3
+ uvicorn
4
+ pydantic
5
+ openai-agents
6
+ PyMuPDF
7
+ requests