Tahasaif3 committed on
Commit
96a7aae
·
verified ·
1 Parent(s): eeabf12

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +31 -171
main.py CHANGED
@@ -2,8 +2,6 @@ import os
2
  import fitz # PyMuPDF
3
  import tempfile
4
  import requests
5
- import re
6
- import json
7
  from typing import List
8
  from fastapi import FastAPI
9
  from pydantic import BaseModel, Field
@@ -44,6 +42,9 @@ class StudentRecord(BaseModel):
44
  section: str = Field(default="", description="Section letter")
45
  mobile: str = Field(default="", description="Mobile number")
46
 
 
 
 
47
  class PDFRequest(BaseModel):
48
  pdfUrl: str
49
 
@@ -52,15 +53,16 @@ student_agent = Agent(
52
  name="StudentPDFExtractor",
53
  model=Model,
54
  instructions="""
55
- You are a precise data extraction agent. Extract ALL student records from the provided text.
56
 
57
- CRITICAL RULES:
58
- 1. Extract EVERY single student record - do NOT skip any
59
- 2. Do NOT truncate or limit the output
60
- 3. Return a complete, valid JSON with all students found
61
- 4. Each student has: name, roll_no, class_name, section, mobile
62
 
63
- Return ONLY this JSON format:
 
64
  {
65
  "students": [
66
  {
@@ -72,10 +74,9 @@ Return ONLY this JSON format:
72
  }
73
  ]
74
  }
75
-
76
- IMPORTANT: Ensure ALL students are included. No truncation allowed. Close all JSON arrays and objects properly.
77
  """,
78
- model_settings=ModelSettings(temperature=0.05, top_p=0.7, max_tokens=8000)
 
79
  )
80
 
81
  runner = Runner()
@@ -95,182 +96,41 @@ def download_and_extract_text(pdf_url: str) -> str:
95
  text = "\n".join(page.get_text("text") for page in doc)
96
  doc.close()
97
  os.remove(tmp_path)
98
- print(f"✅ PDF text extracted successfully ({len(text)} characters)")
99
  return text
100
 
101
 
102
def parse_json_from_output(output: str) -> dict:
    """Best-effort JSON recovery from raw agent output.

    Handles three cases in order: clean JSON, JSON embedded in other
    formatting (e.g. markdown fences), and truncated JSON whose
    brackets/braces need closing. Falls back to an empty student list.
    """
    if not output:
        return {"students": []}

    # Fast path: the output is already valid JSON.
    try:
        return json.loads(output)
    except json.JSONDecodeError:
        pass

    # Pull the outermost {...} span out of any surrounding formatting.
    candidate = re.search(r'\{[\s\S]*\}', output)
    if candidate is None:
        return {"students": []}
    payload = candidate.group(0)

    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        pass

    # Truncated output: drop a trailing comma, then balance the
    # unclosed brackets and braces before one last parse attempt.
    missing_braces = payload.count('{') - payload.count('}')
    missing_brackets = payload.count('[') - payload.count(']')
    repaired = payload.rstrip().rstrip(',') + ']' * missing_brackets + '}' * missing_braces

    try:
        return json.loads(repaired)
    except json.JSONDecodeError as exc:
        print(f"⚠️ Failed to parse JSON even after fixing: {exc}")
        return {"students": []}
135
-
136
-
137
def regex_fallback_extraction(text: str) -> dict:
    """Line-by-line regex extraction used when the agent yields nothing.

    Tries pipe-, whitespace-, and tab-separated row layouts in turn,
    skips header/footer lines, and deduplicates on (name, roll_no).
    """
    print("🔄 Using regex fallback for extraction...")

    row_patterns = (
        # Pipe-delimited: Name | Roll | Class | Section | Mobile
        r'^([A-Za-z\s]+?)\s*\|\s*(\d+)\s*\|\s*([\w\d\s\.,-]+?)\s*\|\s*([A-Za-z0-9\s,.-]+?)\s*\|\s*(\d+)',
        # Whitespace-delimited columns
        r'^([A-Za-z\s]+?)\s+(\d{8,})\s+([\w\d\s\.,-]+?)\s+([A-Za-z0-9\s,.-]+?)\s+(\d{11,})',
        # Tab-delimited columns
        r'^([A-Za-z\s]+?)\t+(\d{8,})\t+([\w\d\s\.,-]+?)\t+([A-Za-z0-9\s,.-]+?)\t+(\d{11,})',
    )

    extracted = []
    seen_keys = set()
    for raw_line in text.splitlines():
        row = raw_line.strip()
        lowered = row.lower()
        # Skip blanks plus header/footer rows (column titles, report stamp).
        if not row or "name" in lowered or "roll" in lowered or "generated" in lowered:
            continue

        for pattern in row_patterns:
            hit = re.search(pattern, row)
            if hit is None:
                continue
            # Groups 2 and 5 are digit-only, so stripping all five is safe.
            name, roll_no, class_name, section, mobile = (g.strip() for g in hit.groups())

            key = (name, roll_no)
            if name and roll_no and key not in seen_keys:
                seen_keys.add(key)
                extracted.append({
                    "name": name,
                    "roll_no": roll_no,
                    "class_name": class_name,
                    "section": section,
                    "mobile": mobile
                })
            break

    print(f"✅ Regex extracted {len(extracted)} students")
    return {"students": extracted}
181
-
182
-
183
async def extract_from_text_chunked(text: str) -> dict:
    """Extract students via the agent, falling back to regex on failure.

    The agent response is parsed leniently with parse_json_from_output;
    any exception, a missing output attribute, or an empty student list
    triggers regex_fallback_extraction over the raw text.
    """
    print(f"📄 Extracting from {len(text)} characters...")

    try:
        resp = await runner.run(
            student_agent,
            text,
            session=SQLiteSession("student_trace.db")
        )

        # Different agent-SDK versions expose the result under
        # different attribute names; prefer `output`.
        output = None
        if hasattr(resp, "output"):
            output = resp.output
        elif hasattr(resp, "final_output"):
            output = resp.final_output

        if output:
            raw = output if isinstance(output, str) else str(output)

            parsed = parse_json_from_output(raw)
            found = len(parsed.get("students", []))
            print(f"✅ Agent extracted {found} students")

            if found > 0:
                return parsed
            print("⚠️ Agent returned empty results")
    except Exception as e:
        print(f"⚠️ Agent extraction error: {e}")

    # Agent failed or found nothing — scrape the raw text instead.
    return regex_fallback_extraction(text)
221
 
 
 
 
 
222
 
223
def clean_and_deduplicate(students: List[dict]) -> List[dict]:
    """Remove duplicate student records keyed on (name, roll_no).

    Records with an empty name or roll number are dropped. Order of
    first occurrence is preserved; the returned list references the
    original dicts unmodified.
    """
    seen: set = set()
    unique: List[dict] = []

    for s in students:
        name = str(s.get("name", "")).strip()
        roll_no = str(s.get("roll_no", "")).strip()
        key = (name, roll_no)

        # A tuple is always truthy, so test the fields directly
        # (the previous `if key and key[0] and key[1]` was redundant).
        if name and roll_no and key not in seen:
            seen.add(key)
            unique.append(s)

    print(f"📋 After deduplication: {len(unique)} unique students")
    return unique
239
 
240
# ---------------- FastAPI Endpoint ----------------
@app.post("/extract-student")
async def extract_student(req: PDFRequest):
    """
    Accepts a Cloudinary PDF URL,
    downloads it, extracts text, and returns ALL structured student data.

    Features:
    - Handles large datasets (200+ students)
    - No strict validation - flexible JSON parsing
    - Regex backup for comprehensive coverage
    - Automatic deduplication
    """
    try:
        extracted_text = download_and_extract_text(req.pdfUrl)
        parsed = await extract_from_text_chunked(extracted_text)

        # Deduplicate before building the response.
        cleaned = clean_and_deduplicate(parsed.get("students", []))

        return {
            "success": True,
            "pdfUrl": req.pdfUrl,
            "total_students": len(cleaned),
            "students": cleaned
        }
    except Exception as e:
        print(f" Error: {e}")
        import traceback
        traceback.print_exc()
        # Report the failure in-band so the client always gets JSON.
        return {
            "success": False,
            "error": str(e),
            "total_students": 0,
            "students": []
        }
 
2
  import fitz # PyMuPDF
3
  import tempfile
4
  import requests
 
 
5
  from typing import List
6
  from fastapi import FastAPI
7
  from pydantic import BaseModel, Field
 
42
  section: str = Field(default="", description="Section letter")
43
  mobile: str = Field(default="", description="Mobile number")
44
 
45
class ExtractResponse(BaseModel):
    # Structured-output schema handed to the agent via `output_type`;
    # defaults to an empty student list when nothing is extracted.
    students: List[StudentRecord] = Field(default_factory=list)
48
  class PDFRequest(BaseModel):
49
  pdfUrl: str
50
 
 
53
  name="StudentPDFExtractor",
54
  model=Model,
55
  instructions="""
56
+ You are a precise data extraction agent. Read the provided text extracted from a student report PDF and return structured student data.
57
 
58
+ The PDF text typically includes:
59
+ Student Data Report - hyderabad sspo
60
+ Generated on: 10/24/2025
61
+ Name Roll No. Class Section Mobile
62
+ John Doe 05738999 12 A 09338488484848388
63
 
64
+ Ignore headers like 'Student Data Report' and 'Generated on:'.
65
+ Return all students in JSON with this schema:
66
  {
67
  "students": [
68
  {
 
74
  }
75
  ]
76
  }
 
 
77
  """,
78
+ output_type=ExtractResponse,
79
+ model_settings=ModelSettings(temperature=0.2, top_p=0.85)
80
  )
81
 
82
  runner = Runner()
 
96
  text = "\n".join(page.get_text("text") for page in doc)
97
  doc.close()
98
  os.remove(tmp_path)
99
+ print("✅ PDF text extracted successfully.")
100
  return text
101
 
102
 
103
async def extract_from_text(text: str) -> dict:
    """Run the extraction agent on raw PDF text and return a plain dict.

    Returns a dict shaped like {"students": [...]}; falls back to an
    empty student list when the agent produces no usable output.
    """
    print(f"📄 Extracting from {len(text)} characters...")
    resp = await runner.run(
        student_agent,
        text,  # plain text only
        session=SQLiteSession("student_trace.db")
    )

    # Different agent-SDK versions expose the result under different
    # attribute names; check both. Guard against the attribute existing
    # but holding None, and against a non-pydantic payload — the
    # previous `resp.output.model_dump()` crashed in both cases.
    for attr in ("output", "final_output"):
        result = getattr(resp, attr, None)
        if result is None:
            continue
        if hasattr(result, "model_dump"):
            # output_type=ExtractResponse normally yields a pydantic model.
            return result.model_dump()
        if isinstance(result, dict):
            return result

    return {"students": []}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
# ---------------- FastAPI Endpoint ----------------
@app.post("/extract-student")
async def extract_student(req: PDFRequest):
    """
    Accepts a Cloudinary PDF URL,
    downloads it, extracts text, and returns structured student data.
    """
    try:
        pdf_text = download_and_extract_text(req.pdfUrl)
        structured = await extract_from_text(pdf_text)
        return {
            "success": True,
            "pdfUrl": req.pdfUrl,
            "structured": structured,
            "raw_text_preview": pdf_text[:800]  # trimmed preview
        }
    except Exception as e:
        # Report the failure in-band so the client always gets JSON.
        return {"success": False, "error": str(e)}