Tahasaif3 committed on
Commit
ad56575
·
verified ·
1 Parent(s): 59dd903

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +23 -208
main.py CHANGED
@@ -44,7 +44,6 @@ class StudentRecord(BaseModel):
44
 
45
  class ExtractResponse(BaseModel):
46
  students: List[StudentRecord] = Field(default_factory=list)
47
- total_extracted: int = Field(default=0, description="Total number of students extracted")
48
 
49
  class PDFRequest(BaseModel):
50
  pdfUrl: str
@@ -54,76 +53,30 @@ student_agent = Agent(
54
  name="StudentPDFExtractor",
55
  model=Model,
56
  instructions="""
57
- You are a precise data extraction agent specialized in extracting student records from PDF text.
58
 
59
- CRITICAL INSTRUCTIONS:
60
- 1. Extract ALL students from the provided text - do not skip any entries
61
- 2. The text contains student data in tabular format with these columns:
62
- - Name
63
- - Roll No.
64
- - Class
65
- - Section
66
- - Mobile
67
-
68
- 3. IGNORE these lines:
69
- - Headers like "Student Data Report", "Total Students:", "Generated on:"
70
- - Column headers (Name, Roll No., Class, Section, Mobile)
71
- - Page breaks or separator lines
72
-
73
- 4. EXTRACT every student entry that has at least a name and roll number
74
-
75
- 5. Data patterns to handle:
76
- - Some entries may have missing sections or mobile numbers
77
- - Class names can be multi-word (e.g., "BS Zoology 2023-2027")
78
- - Some class info includes year ranges like "2023-2027" or "2022-2026"
79
- - Mobile numbers are typically 11 digits starting with 0
80
- - Roll numbers are typically 8 digits (e.g., 00234429)
81
-
82
- 6. For each student, extract:
83
- - name: The student's full name (first column)
84
- - roll_no: The roll number (typically 8 digits)
85
- - class_name: The full class/program name
86
- - section: The section (Evening, Morning, A, B, etc.) - use empty string if not present
87
- - mobile: The mobile number - use empty string if not present
88
-
89
- 7. Return ALL students in the JSON format specified in the output schema
90
-
91
- 8. Be thorough - if the text contains 100+ students, extract all of them
92
-
93
- EXAMPLE INPUT:
94
  Name Roll No. Class Section Mobile
95
- Nana 00234429 BS Zoology 2023-2027 Evening 03156654438
96
- Noor Fatima 00243403 Bs IR 3rd sem 03010071997
97
 
98
- EXAMPLE OUTPUT:
 
99
  {
100
  "students": [
101
  {
102
- "name": "Nana",
103
- "roll_no": "00234429",
104
- "class_name": "BS Zoology 2023-2027",
105
- "section": "Evening",
106
- "mobile": "03156654438"
107
- },
108
- {
109
- "name": "Noor Fatima",
110
- "roll_no": "00243403",
111
- "class_name": "Bs IR",
112
- "section": "3rd sem",
113
- "mobile": "03010071997"
114
  }
115
- ],
116
- "total_extracted": 2
117
  }
118
-
119
- IMPORTANT: Extract EVERY single student record. Do not truncate or summarize.
120
  """,
121
  output_type=ExtractResponse,
122
- model_settings=ModelSettings(
123
- temperature=0.1, # Lower temperature for more deterministic extraction
124
- top_p=0.9,
125
- max_tokens=16000 # Increased for large responses
126
- )
127
  )
128
 
129
  runner = Runner()
@@ -132,7 +85,7 @@ runner = Runner()
132
  def download_and_extract_text(pdf_url: str) -> str:
133
  """Downloads a PDF from Cloudinary and extracts text"""
134
  print(f"📥 Downloading PDF from: {pdf_url}")
135
- response = requests.get(pdf_url, timeout=30)
136
  response.raise_for_status()
137
 
138
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
@@ -140,115 +93,28 @@ def download_and_extract_text(pdf_url: str) -> str:
140
  tmp_path = tmp.name
141
 
142
  doc = fitz.open(tmp_path)
143
- # Extract text from all pages with better formatting
144
- text_parts = []
145
- for page_num, page in enumerate(doc, 1):
146
- page_text = page.get_text("text")
147
- text_parts.append(f"\n--- Page {page_num} ---\n{page_text}")
148
-
149
- text = "\n".join(text_parts)
150
  doc.close()
151
  os.remove(tmp_path)
152
-
153
- print(f"✅ PDF text extracted successfully. Total length: {len(text)} characters")
154
- print(f"📄 Total pages processed: {len(doc)}")
155
  return text
156
 
157
 
158
  async def extract_from_text(text: str) -> dict:
159
  """Runs the agent to extract structured data"""
160
  print(f"📄 Extracting from {len(text)} characters...")
161
- print(f"🔍 Estimated student count (based on 'Roll No.' occurrences): {text.count('00')}")
162
-
163
  resp = await runner.run(
164
  student_agent,
165
- f"Extract all student records from this PDF text:\n\n{text}",
166
  session=SQLiteSession("student_trace.db")
167
  )
168
 
169
  if hasattr(resp, "output"):
170
- result = resp.output.model_dump()
171
  elif hasattr(resp, "final_output"):
172
- result = resp.final_output.model_dump()
173
- else:
174
- result = {"students": [], "total_extracted": 0}
175
-
176
- # Set total_extracted if not set by agent
177
- if "total_extracted" not in result or result["total_extracted"] == 0:
178
- result["total_extracted"] = len(result.get("students", []))
179
-
180
- print(f"✅ Extraction complete. Found {result['total_extracted']} students")
181
- return result
182
-
183
-
184
- def chunk_text(text: str, chunk_size: int = 15000) -> List[str]:
185
- """
186
- Splits text into chunks for processing large PDFs.
187
- Tries to split at page boundaries or double newlines.
188
- """
189
- if len(text) <= chunk_size:
190
- return [text]
191
-
192
- chunks = []
193
- current_chunk = ""
194
-
195
- # Split by pages first
196
- pages = text.split("--- Page")
197
-
198
- for page in pages:
199
- if not page.strip():
200
- continue
201
-
202
- page_text = "--- Page" + page if not page.startswith("--- Page") else page
203
-
204
- if len(current_chunk) + len(page_text) <= chunk_size:
205
- current_chunk += page_text
206
- else:
207
- if current_chunk:
208
- chunks.append(current_chunk)
209
- current_chunk = page_text
210
-
211
- if current_chunk:
212
- chunks.append(current_chunk)
213
-
214
- return chunks
215
 
216
-
217
- async def extract_from_large_text(text: str) -> dict:
218
- """
219
- Handles extraction from large PDFs by chunking if necessary
220
- """
221
- # If text is small enough, process directly
222
- if len(text) < 30000:
223
- return await extract_from_text(text)
224
-
225
- print(f"📚 Large PDF detected. Chunking for processing...")
226
- chunks = chunk_text(text, chunk_size=20000)
227
- print(f"📦 Split into {len(chunks)} chunks")
228
-
229
- all_students = []
230
-
231
- for i, chunk in enumerate(chunks, 1):
232
- print(f"🔄 Processing chunk {i}/{len(chunks)}...")
233
- result = await extract_from_text(chunk)
234
- chunk_students = result.get("students", [])
235
- all_students.extend(chunk_students)
236
- print(f" Found {len(chunk_students)} students in chunk {i}")
237
-
238
- # Deduplicate based on roll_no
239
- seen_rolls = set()
240
- unique_students = []
241
- for student in all_students:
242
- if student["roll_no"] and student["roll_no"] not in seen_rolls:
243
- seen_rolls.add(student["roll_no"])
244
- unique_students.append(student)
245
-
246
- print(f"✅ Total unique students after deduplication: {len(unique_students)}")
247
-
248
- return {
249
- "students": unique_students,
250
- "total_extracted": len(unique_students)
251
- }
252
 
253
  # ---------------- FastAPI Endpoint ----------------
254
  @app.post("/extract-student")
@@ -256,66 +122,15 @@ async def extract_student(req: PDFRequest):
256
  """
257
  Accepts a Cloudinary PDF URL,
258
  downloads it, extracts text, and returns structured student data.
259
- Handles large PDFs with 200+ students.
260
  """
261
  try:
262
  text = download_and_extract_text(req.pdfUrl)
263
- structured = await extract_from_large_text(text)
264
-
265
  return {
266
  "success": True,
267
  "pdfUrl": req.pdfUrl,
268
- "total_students": structured.get("total_extracted", 0),
269
  "structured": structured,
270
- "raw_text_length": len(text),
271
  "raw_text_preview": text[:800] # trimmed preview
272
  }
273
  except Exception as e:
274
- print(f" Error: {str(e)}")
275
- import traceback
276
- traceback.print_exc()
277
- return {
278
- "success": False,
279
- "error": str(e),
280
- "error_type": type(e).__name__
281
- }
282
-
283
-
284
- # Optional: Add a test endpoint for debugging
285
- @app.post("/test-extract")
286
- async def test_extract(req: PDFRequest):
287
- """
288
- Test endpoint that shows more debugging information
289
- """
290
- try:
291
- text = download_and_extract_text(req.pdfUrl)
292
-
293
- # Count potential student records
294
- lines = text.split('\n')
295
- potential_students = [line for line in lines if any(char.isdigit() for char in line)]
296
-
297
- structured = await extract_from_large_text(text)
298
-
299
- return {
300
- "success": True,
301
- "pdfUrl": req.pdfUrl,
302
- "total_students_extracted": structured.get("total_extracted", 0),
303
- "text_length": len(text),
304
- "total_lines": len(lines),
305
- "lines_with_numbers": len(potential_students),
306
- "first_10_students": structured.get("students", [])[:10],
307
- "raw_text_preview": text[:1500]
308
- }
309
- except Exception as e:
310
- print(f"❌ Error: {str(e)}")
311
- import traceback
312
- traceback.print_exc()
313
- return {
314
- "success": False,
315
- "error": str(e)
316
- }
317
-
318
-
319
- if __name__ == "__main__":
320
- import uvicorn
321
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
44
 
45
  class ExtractResponse(BaseModel):
46
  students: List[StudentRecord] = Field(default_factory=list)
 
47
 
48
  class PDFRequest(BaseModel):
49
  pdfUrl: str
 
53
  name="StudentPDFExtractor",
54
  model=Model,
55
  instructions="""
56
+ You are a precise data extraction agent. Read the provided text extracted from a student report PDF and return structured student data.
57
 
58
+ The PDF text typically includes:
59
+ Student Data Report - hyderabad sspo
60
+ Generated on: 10/24/2025
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  Name Roll No. Class Section Mobile
62
+ John Doe 05738999 12 A 09338488484848388
 
63
 
64
+ Ignore headers like 'Student Data Report' and 'Generated on:'.
65
+ Return all students in JSON with this schema:
66
  {
67
  "students": [
68
  {
69
+ "name": "string",
70
+ "roll_no": "string",
71
+ "class_name": "string",
72
+ "section": "string",
73
+ "mobile": "string"
 
 
 
 
 
 
 
74
  }
75
+ ]
 
76
  }
 
 
77
  """,
78
  output_type=ExtractResponse,
79
+ model_settings=ModelSettings(temperature=0.2, top_p=0.85)
 
 
 
 
80
  )
81
 
82
  runner = Runner()
 
85
  def download_and_extract_text(pdf_url: str) -> str:
86
  """Downloads a PDF from Cloudinary and extracts text"""
87
  print(f"📥 Downloading PDF from: {pdf_url}")
88
+ response = requests.get(pdf_url)
89
  response.raise_for_status()
90
 
91
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
 
93
  tmp_path = tmp.name
94
 
95
  doc = fitz.open(tmp_path)
96
+ text = "\n".join(page.get_text("text") for page in doc)
 
 
 
 
 
 
97
  doc.close()
98
  os.remove(tmp_path)
99
+ print("✅ PDF text extracted successfully.")
 
 
100
  return text
101
 
102
 
103
  async def extract_from_text(text: str) -> dict:
104
  """Runs the agent to extract structured data"""
105
  print(f"📄 Extracting from {len(text)} characters...")
 
 
106
  resp = await runner.run(
107
  student_agent,
108
+ text, # plain text only
109
  session=SQLiteSession("student_trace.db")
110
  )
111
 
112
  if hasattr(resp, "output"):
113
+ return resp.output.model_dump()
114
  elif hasattr(resp, "final_output"):
115
+ return resp.final_output.model_dump()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
+ return {"students": []}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  # ---------------- FastAPI Endpoint ----------------
120
  @app.post("/extract-student")
 
122
  """
123
  Accepts a Cloudinary PDF URL,
124
  downloads it, extracts text, and returns structured student data.
 
125
  """
126
  try:
127
  text = download_and_extract_text(req.pdfUrl)
128
+ structured = await extract_from_text(text)
 
129
  return {
130
  "success": True,
131
  "pdfUrl": req.pdfUrl,
 
132
  "structured": structured,
 
133
  "raw_text_preview": text[:800] # trimmed preview
134
  }
135
  except Exception as e:
136
+ return {"success": False, "error": str(e)}