Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -252,26 +252,10 @@ class LearningStyleQuiz:
|
|
| 252 |
# Initialize learning style quiz
|
| 253 |
learning_style_quiz = LearningStyleQuiz()
|
| 254 |
|
| 255 |
-
# ========== TRANSCRIPT PARSER ==========
|
| 256 |
-
class
|
| 257 |
def __init__(self):
|
| 258 |
-
|
| 259 |
-
self.format1_patterns = {
|
| 260 |
-
'student_info': re.compile(
|
| 261 |
-
r"(\d{7}) - (.*?)\s*\|\s*Current Grade:\s*(\d+)\s*\|\s*YOG\s*(\d{4})"
|
| 262 |
-
r"\s*\|\s*Weighted GPA\s*([\d.]+)\s*\|\s*Comm Serv Date\s*(\d{2}/\d{2}/\d{4})"
|
| 263 |
-
r"\s*\|\s*Total Credits Earned\s*([\d.]+)"
|
| 264 |
-
),
|
| 265 |
-
'requirement': re.compile(
|
| 266 |
-
r"([A-Z]-[A-Za-z ]+)\s*\|\s*([^|]+)\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([^|]+)%"
|
| 267 |
-
),
|
| 268 |
-
'course': re.compile(
|
| 269 |
-
r"([A-Z]-[A-Za-z ]+)\s*\|\s*(\d{4}-\d{4})\s*\|\s*(\d{2})\s*\|\s*([A-Z0-9]+)\s*\|\s*([^|]+)\|"
|
| 270 |
-
r"\s*([A-Z0-9])\s*\|\s*(\d+)\s*\|\s*([A-Z])\s*\|\s*([A-Z])\s*\|\s*([\d.]+|inProgress)"
|
| 271 |
-
)
|
| 272 |
-
}
|
| 273 |
-
|
| 274 |
-
self.format2_patterns = {
|
| 275 |
'student_info': re.compile(
|
| 276 |
r"LEGAL NAME:\s*([A-Z]+,\s*[A-Z]+).*?"
|
| 277 |
r"GRADE LEVEL:\s*(\d+).*?"
|
|
@@ -297,98 +281,42 @@ class MiamiDadeTranscriptParser:
|
|
| 297 |
r"BIOLOGY ASSESSMENT PASSED|"
|
| 298 |
r"DISTRICT COMM/VOL SERVICE RQMT MET:\s*(YES).*?HRS:\s*(\d+)",
|
| 299 |
re.DOTALL
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
)
|
| 301 |
}
|
| 302 |
-
|
| 303 |
def parse_transcript(self, file_path: str) -> Dict:
|
| 304 |
-
"""Parse Miami-Dade transcript PDF
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
else:
|
| 318 |
-
raise ValueError("Unrecognized transcript format")
|
| 319 |
-
|
| 320 |
-
def _parse_format1(self, text: str) -> Dict:
|
| 321 |
-
"""Parse the first transcript format"""
|
| 322 |
-
parsed_data = {
|
| 323 |
-
'student_info': self._parse_format1_student_info(text),
|
| 324 |
-
'requirements': self._parse_format1_requirements(text),
|
| 325 |
-
'course_history': self._parse_format1_courses(text),
|
| 326 |
-
'format': 'progress_summary'
|
| 327 |
-
}
|
| 328 |
-
return parsed_data
|
| 329 |
-
|
| 330 |
-
def _parse_format1_student_info(self, text: str) -> Dict:
|
| 331 |
-
"""Extract student information from format 1"""
|
| 332 |
-
match = self.format1_patterns['student_info'].search(text)
|
| 333 |
-
if not match:
|
| 334 |
-
return {}
|
| 335 |
-
|
| 336 |
-
return {
|
| 337 |
-
'id': match.group(1),
|
| 338 |
-
'name': match.group(2).strip(),
|
| 339 |
-
'grade': match.group(3),
|
| 340 |
-
'year_of_graduation': match.group(4),
|
| 341 |
-
'weighted_gpa': float(match.group(5)),
|
| 342 |
-
'community_service_date': match.group(6),
|
| 343 |
-
'total_credits': float(match.group(7)),
|
| 344 |
-
'district': 'Miami-Dade'
|
| 345 |
-
}
|
| 346 |
-
|
| 347 |
-
def _parse_format1_requirements(self, text: str) -> Dict:
|
| 348 |
-
"""Parse graduation requirements section from format 1"""
|
| 349 |
-
requirements = {}
|
| 350 |
-
for match in self.format1_patterns['requirement'].finditer(text):
|
| 351 |
-
requirements[match.group(1).strip()] = {
|
| 352 |
-
'description': match.group(2).strip(),
|
| 353 |
-
'required': float(match.group(3)),
|
| 354 |
-
'waived': float(match.group(4)),
|
| 355 |
-
'completed': float(match.group(5)),
|
| 356 |
-
'percent_complete': float(match.group(6))
|
| 357 |
-
}
|
| 358 |
-
return requirements
|
| 359 |
-
|
| 360 |
-
def _parse_format1_courses(self, text: str) -> List[Dict]:
|
| 361 |
-
"""Parse course history section from format 1"""
|
| 362 |
-
courses = []
|
| 363 |
-
for match in self.format1_patterns['course'].finditer(text):
|
| 364 |
-
courses.append({
|
| 365 |
-
'requirement': match.group(1).strip(),
|
| 366 |
-
'school_year': match.group(2),
|
| 367 |
-
'grade_level': match.group(3),
|
| 368 |
-
'course_code': match.group(4),
|
| 369 |
-
'description': match.group(5).strip(),
|
| 370 |
-
'term': match.group(6),
|
| 371 |
-
'district_number': match.group(7),
|
| 372 |
-
'included': match.group(8),
|
| 373 |
-
'credits': 0 if 'inProgress' in match.group(9) else float(match.group(9)),
|
| 374 |
-
'status': 'In Progress' if 'inProgress' in match.group(9) else 'Completed'
|
| 375 |
-
})
|
| 376 |
-
return courses
|
| 377 |
|
| 378 |
-
def
|
| 379 |
-
"""Parse the
|
| 380 |
parsed_data = {
|
| 381 |
-
'student_info': self.
|
| 382 |
-
'academic_summary': self.
|
| 383 |
-
'course_history': self.
|
| 384 |
-
'assessments': self.
|
| 385 |
-
'format': '
|
| 386 |
}
|
| 387 |
return parsed_data
|
| 388 |
|
| 389 |
-
def
|
| 390 |
-
"""Extract student information
|
| 391 |
-
match = self.
|
| 392 |
if not match:
|
| 393 |
return {}
|
| 394 |
|
|
@@ -411,10 +339,11 @@ class MiamiDadeTranscriptParser:
|
|
| 411 |
eth_match = re.search(r"ETHNICITY:\s*([^\n]+)", text)
|
| 412 |
return eth_match.group(1).strip() if eth_match else None
|
| 413 |
|
| 414 |
-
def
|
| 415 |
-
"""Parse academic summary section
|
| 416 |
-
gpa_match = self.
|
| 417 |
-
credits_matches = self.
|
|
|
|
| 418 |
|
| 419 |
summary = {
|
| 420 |
'gpa': {
|
|
@@ -422,7 +351,10 @@ class MiamiDadeTranscriptParser:
|
|
| 422 |
'state': float(gpa_match.group(2)) if gpa_match else None
|
| 423 |
},
|
| 424 |
'credits': {},
|
| 425 |
-
'class_rank':
|
|
|
|
|
|
|
|
|
|
| 426 |
}
|
| 427 |
|
| 428 |
for match in credits_matches:
|
|
@@ -435,21 +367,10 @@ class MiamiDadeTranscriptParser:
|
|
| 435 |
|
| 436 |
return summary
|
| 437 |
|
| 438 |
-
def
|
| 439 |
-
"""
|
| 440 |
-
rank_match = re.search(
|
| 441 |
-
r"\*\s+PERCENTILE:\s*(\d+)\s*\*\s*TOTAL NUMBER IN CLASS:\s*(\d+)",
|
| 442 |
-
text
|
| 443 |
-
)
|
| 444 |
-
return {
|
| 445 |
-
'percentile': int(rank_match.group(1)) if rank_match else None,
|
| 446 |
-
'class_size': int(rank_match.group(2)) if rank_match else None
|
| 447 |
-
}
|
| 448 |
-
|
| 449 |
-
def _parse_format2_courses(self, text: str) -> List[Dict]:
|
| 450 |
-
"""Parse course history section from format 2"""
|
| 451 |
courses = []
|
| 452 |
-
for match in self.
|
| 453 |
courses.append({
|
| 454 |
'term': match.group(1),
|
| 455 |
'course_code': match.group(2),
|
|
@@ -463,9 +384,9 @@ class MiamiDadeTranscriptParser:
|
|
| 463 |
})
|
| 464 |
return courses
|
| 465 |
|
| 466 |
-
def
|
| 467 |
-
"""Parse assessment and requirement information
|
| 468 |
-
matches = self.
|
| 469 |
assessments = {
|
| 470 |
'ela_passed_date': None,
|
| 471 |
'algebra_passed': False,
|
|
@@ -491,8 +412,8 @@ class MiamiDadeTranscriptParser:
|
|
| 491 |
|
| 492 |
return assessments
|
| 493 |
|
| 494 |
-
# Initialize
|
| 495 |
-
transcript_parser =
|
| 496 |
|
| 497 |
# ========== ACADEMIC ANALYZER ==========
|
| 498 |
class AcademicAnalyzer:
|
|
@@ -600,7 +521,6 @@ class AcademicAnalyzer:
|
|
| 600 |
|
| 601 |
try:
|
| 602 |
if parsed_data.get('format') == 'progress_summary':
|
| 603 |
-
# Format 1 analysis
|
| 604 |
total_match = re.search(r'Total\s*\|\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)%', text)
|
| 605 |
if total_match:
|
| 606 |
analysis['completion_percentage'] = float(total_match.group(4))
|
|
@@ -628,7 +548,6 @@ class AcademicAnalyzer:
|
|
| 628 |
if req and float(req.get('completed', 0)) < float(req.get('required', 0))
|
| 629 |
]
|
| 630 |
else:
|
| 631 |
-
# Format 2 analysis
|
| 632 |
credits = parsed_data.get('academic_summary', {}).get('credits', {})
|
| 633 |
total_required = sum(
|
| 634 |
v.get('required', 0)
|
|
@@ -1466,7 +1385,7 @@ class EnhancedTeachingAssistant:
|
|
| 1466 |
service_hours = transcript.get('student_info', {}).get('community_service_hours', 0)
|
| 1467 |
else:
|
| 1468 |
gpa = transcript.get('academic_summary', {}).get('gpa', {}).get('district', None)
|
| 1469 |
-
service_hours = transcript.get('assessments', {}).get('community_service', {}).get('hours', 0)
|
| 1470 |
|
| 1471 |
learning_style = re.search(r"Your primary learning style is\s*\*\*(.*?)\*\*",
|
| 1472 |
profile.get('learning_style', ''))
|
|
|
|
| 252 |
# Initialize learning style quiz
|
| 253 |
learning_style_quiz = LearningStyleQuiz()
|
| 254 |
|
| 255 |
+
# ========== ENHANCED TRANSCRIPT PARSER ==========
|
| 256 |
+
class EnhancedMiamiDadeTranscriptParser:
|
| 257 |
def __init__(self):
|
| 258 |
+
self.patterns = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
'student_info': re.compile(
|
| 260 |
r"LEGAL NAME:\s*([A-Z]+,\s*[A-Z]+).*?"
|
| 261 |
r"GRADE LEVEL:\s*(\d+).*?"
|
|
|
|
| 281 |
r"BIOLOGY ASSESSMENT PASSED|"
|
| 282 |
r"DISTRICT COMM/VOL SERVICE RQMT MET:\s*(YES).*?HRS:\s*(\d+)",
|
| 283 |
re.DOTALL
|
| 284 |
+
),
|
| 285 |
+
'class_rank': re.compile(
|
| 286 |
+
r"\*\s+PERCENTILE:\s*(\d+)\s*\*\s*TOTAL NUMBER IN CLASS:\s*(\d+)",
|
| 287 |
+
re.DOTALL
|
| 288 |
)
|
| 289 |
}
|
| 290 |
+
|
| 291 |
def parse_transcript(self, file_path: str) -> Dict:
    """Parse a Miami-Dade transcript PDF with enhanced pattern matching.

    Opens the PDF with pdfplumber, joins the text of every page, normalizes
    whitespace, and hands the result to ``_parse_format``.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The dictionary produced by ``_parse_format``.

    Raises:
        ValueError: If opening the file, extracting text, or parsing fails
            for any reason. The original exception is chained as the cause.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may return None for a page with no text layer
            # (e.g. a scanned image); substitute "" so join() cannot raise
            # TypeError on such pages.
            text = "\n".join(
                (page.extract_text() or "") for page in pdf.pages
            )

        # Clean up text: collapse every whitespace run (newlines included)
        # into one space, then remove whitespace that separates two digits.
        # NOTE(review): the second substitution also fuses adjacent numeric
        # fields ("3.5 4.0" -> "3.54.0"), and flattening newlines defeats
        # any pattern that stops at "\n" -- confirm both are intended.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'(?<=\d)\s+(?=\d)', '', text)

        return self._parse_format(text)
    except Exception as e:
        logger.error("Error parsing transcript: %s", e)
        # Chain the cause so the original traceback is not lost.
        raise ValueError(f"Error processing transcript: {str(e)}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
|
| 306 |
+
def _parse_format(self, text: str) -> Dict:
|
| 307 |
+
"""Parse the transcript format shown in the example"""
|
| 308 |
parsed_data = {
|
| 309 |
+
'student_info': self._parse_student_info(text),
|
| 310 |
+
'academic_summary': self._parse_academic_summary(text),
|
| 311 |
+
'course_history': self._parse_courses(text),
|
| 312 |
+
'assessments': self._parse_assessments(text),
|
| 313 |
+
'format': 'cumulative_summary_v2'
|
| 314 |
}
|
| 315 |
return parsed_data
|
| 316 |
|
| 317 |
+
def _parse_student_info(self, text: str) -> Dict:
|
| 318 |
+
"""Extract student information"""
|
| 319 |
+
match = self.patterns['student_info'].search(text)
|
| 320 |
if not match:
|
| 321 |
return {}
|
| 322 |
|
|
|
|
| 339 |
eth_match = re.search(r"ETHNICITY:\s*([^\n]+)", text)
|
| 340 |
return eth_match.group(1).strip() if eth_match else None
|
| 341 |
|
| 342 |
+
def _parse_academic_summary(self, text: str) -> Dict:
|
| 343 |
+
"""Parse academic summary section"""
|
| 344 |
+
gpa_match = self.patterns['gpa'].search(text)
|
| 345 |
+
credits_matches = self.patterns['credits'].finditer(text)
|
| 346 |
+
rank_match = self.patterns['class_rank'].search(text)
|
| 347 |
|
| 348 |
summary = {
|
| 349 |
'gpa': {
|
|
|
|
| 351 |
'state': float(gpa_match.group(2)) if gpa_match else None
|
| 352 |
},
|
| 353 |
'credits': {},
|
| 354 |
+
'class_rank': {
|
| 355 |
+
'percentile': int(rank_match.group(1)) if rank_match else None,
|
| 356 |
+
'class_size': int(rank_match.group(2)) if rank_match else None
|
| 357 |
+
}
|
| 358 |
}
|
| 359 |
|
| 360 |
for match in credits_matches:
|
|
|
|
| 367 |
|
| 368 |
return summary
|
| 369 |
|
| 370 |
+
def _parse_courses(self, text: str) -> List[Dict]:
|
| 371 |
+
"""Parse course history section"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
courses = []
|
| 373 |
+
for match in self.patterns['course'].finditer(text):
|
| 374 |
courses.append({
|
| 375 |
'term': match.group(1),
|
| 376 |
'course_code': match.group(2),
|
|
|
|
| 384 |
})
|
| 385 |
return courses
|
| 386 |
|
| 387 |
+
def _parse_assessments(self, text: str) -> Dict:
|
| 388 |
+
"""Parse assessment and requirement information"""
|
| 389 |
+
matches = self.patterns['assessment'].finditer(text)
|
| 390 |
assessments = {
|
| 391 |
'ela_passed_date': None,
|
| 392 |
'algebra_passed': False,
|
|
|
|
| 412 |
|
| 413 |
return assessments
|
| 414 |
|
| 415 |
+
# Initialize the enhanced parser
|
| 416 |
+
transcript_parser = EnhancedMiamiDadeTranscriptParser()
|
| 417 |
|
| 418 |
# ========== ACADEMIC ANALYZER ==========
|
| 419 |
class AcademicAnalyzer:
|
|
|
|
| 521 |
|
| 522 |
try:
|
| 523 |
if parsed_data.get('format') == 'progress_summary':
|
|
|
|
| 524 |
total_match = re.search(r'Total\s*\|\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)%', text)
|
| 525 |
if total_match:
|
| 526 |
analysis['completion_percentage'] = float(total_match.group(4))
|
|
|
|
| 548 |
if req and float(req.get('completed', 0)) < float(req.get('required', 0))
|
| 549 |
]
|
| 550 |
else:
|
|
|
|
| 551 |
credits = parsed_data.get('academic_summary', {}).get('credits', {})
|
| 552 |
total_required = sum(
|
| 553 |
v.get('required', 0)
|
|
|
|
| 1385 |
service_hours = transcript.get('student_info', {}).get('community_service_hours', 0)
|
| 1386 |
else:
|
| 1387 |
gpa = transcript.get('academic_summary', {}).get('gpa', {}).get('district', None)
|
| 1388 |
+
# Each .get() supplies a default, so a missing level of the nested lookup yields 0.
service_hours = transcript.get('assessments', {}).get('community_service', {}).get('hours', 0)
|
| 1389 |
|
| 1390 |
learning_style = re.search(r"Your primary learning style is\s*\*\*(.*?)\*\*",
|
| 1391 |
profile.get('learning_style', ''))
|