Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -23,6 +23,7 @@ import asyncio
|
|
| 23 |
from functools import lru_cache
|
| 24 |
import hashlib
|
| 25 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
| 26 |
|
| 27 |
# ========== CONFIGURATION ==========
|
| 28 |
PROFILES_DIR = "student_profiles"
|
|
@@ -244,6 +245,33 @@ def remove_sensitive_info(text: str) -> str:
|
|
| 244 |
return text
|
| 245 |
|
| 246 |
# ========== TRANSCRIPT PARSING ==========
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
class TranscriptParser:
|
| 248 |
def __init__(self):
|
| 249 |
self.student_data = {}
|
|
@@ -253,27 +281,98 @@ class TranscriptParser:
|
|
| 253 |
self.graduation_status = {}
|
| 254 |
|
| 255 |
def parse_transcript(self, text: str) -> Dict:
|
| 256 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
try:
|
| 258 |
parsed_data = {
|
| 259 |
'student_info': {},
|
| 260 |
-
'
|
|
|
|
|
|
|
| 261 |
}
|
| 262 |
|
| 263 |
-
# Extract student
|
| 264 |
-
|
| 265 |
-
if
|
| 266 |
-
parsed_data['student_info']['
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
-
|
| 269 |
-
if
|
| 270 |
-
parsed_data['student_info']['
|
| 271 |
|
| 272 |
-
|
| 273 |
-
if
|
| 274 |
-
parsed_data['student_info']['
|
| 275 |
|
| 276 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
course_pattern = r'([A-Z]{2,4}\s?\d{3})\s+(.*?)\s+([A-F][+-]?)\s+([0-9.]+)'
|
| 278 |
courses = re.findall(course_pattern, text)
|
| 279 |
for course in courses:
|
|
@@ -287,8 +386,41 @@ class TranscriptParser:
|
|
| 287 |
return parsed_data
|
| 288 |
|
| 289 |
except Exception as e:
|
| 290 |
-
logging.
|
| 291 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
|
| 294 |
"""Process transcript file and return simple confirmation"""
|
|
|
|
| 23 |
from functools import lru_cache
|
| 24 |
import hashlib
|
| 25 |
from concurrent.futures import ThreadPoolExecutor
|
| 26 |
+
from pydantic import BaseModel
|
| 27 |
|
| 28 |
# ========== CONFIGURATION ==========
|
| 29 |
PROFILES_DIR = "student_profiles"
|
|
|
|
| 245 |
return text
|
| 246 |
|
| 247 |
# ========== TRANSCRIPT PARSING ==========
|
| 248 |
+
class Course(BaseModel):
    # One course entry of a transcript. Every field is declared as plain
    # text (str); nothing is converted to numbers or dates by this model.

    # Where the entry sits in the transcript.
    requirement: str
    school_year: str
    grade_level: str

    # Identification of the course itself.
    course_code: str
    description: str
    term: str
    district_number: str

    # Result columns.
    fg: str  # NOTE(review): presumably the final grade; confirm against the transcript layout
    included: str
    credits: str
|
| 259 |
+
|
| 260 |
+
class GraduationProgress(BaseModel):
    # Student-level summary of a transcript together with its per-section
    # details (requirements, courses, assessments).

    # Identity and standing.
    student_name: str
    student_id: str
    current_grade: str
    year_of_graduation: str

    # Grade point averages.
    unweighted_gpa: float
    weighted_gpa: float

    # Community service.
    community_service_hours: int
    community_service_date: str

    # Credits and virtual-grade marker.
    total_credits_earned: float
    virtual_grade: str

    # Requirement code -> {metric name -> numeric value}.
    # NOTE(review): the detailed parser in this file also stores a string
    # "description" inside each requirement entry, which this float-only
    # mapping would not accept; confirm before validating parser output here.
    requirements: Dict[str, Dict[str, float]]

    # Course rows and assessment name -> status text.
    courses: List[Course]
    assessments: Dict[str, str]
|
| 274 |
+
|
| 275 |
class TranscriptParser:
|
| 276 |
def __init__(self):
|
| 277 |
self.student_data = {}
|
|
|
|
| 281 |
self.graduation_status = {}
|
| 282 |
|
| 283 |
def parse_transcript(self, text: str) -> Dict:
    """Parse transcript text and return structured data.

    The detailed parser is tried first. When it returns a falsy value
    (it returns ``None`` after logging its own failure), the simplified
    parser is used instead.

    Args:
        text: Raw transcript text.

    Returns:
        The dictionary produced by whichever parser supplied the result.

    Raises:
        ValueError: If parsing fails in either parser. The triggering
            exception is attached as ``__cause__`` so the original
            traceback is not lost.
    """
    try:
        # First try the new detailed parser
        detailed = self._parse_detailed_transcript(text)
        if detailed:
            return detailed

        # Fall back to simplified parser if detailed parsing fails
        return self._parse_simplified_transcript(text)

    except Exception as e:
        logging.error(f"Error parsing transcript: {str(e)}")
        # Chain explicitly so callers catching ValueError can still reach
        # the root cause.
        raise ValueError(f"Couldn't parse transcript: {str(e)}") from e
|
| 297 |
+
|
| 298 |
+
def _parse_detailed_transcript(self, text: str) -> Optional[Dict]:
|
| 299 |
+
"""Parse detailed transcript format"""
|
| 300 |
try:
|
| 301 |
parsed_data = {
|
| 302 |
'student_info': {},
|
| 303 |
+
'requirements': {},
|
| 304 |
+
'course_history': [],
|
| 305 |
+
'assessments': {}
|
| 306 |
}
|
| 307 |
|
| 308 |
+
# Extract student info
|
| 309 |
+
student_info_match = re.search(r"(\d{7}) - (.*?)\n", text)
|
| 310 |
+
if student_info_match:
|
| 311 |
+
parsed_data['student_info']['id'] = student_info_match.group(1)
|
| 312 |
+
parsed_data['student_info']['name'] = student_info_match.group(2).strip()
|
| 313 |
+
|
| 314 |
+
current_grade_match = re.search(r"Current Grade: (\d+)", text)
|
| 315 |
+
if current_grade_match:
|
| 316 |
+
parsed_data['student_info']['grade'] = current_grade_match.group(1)
|
| 317 |
+
|
| 318 |
+
yog_match = re.search(r"YOG (\d{4})", text)
|
| 319 |
+
if yog_match:
|
| 320 |
+
parsed_data['student_info']['year_of_graduation'] = yog_match.group(1)
|
| 321 |
+
|
| 322 |
+
unweighted_gpa_match = re.search(r"Un-weighted GPA (\d+\.\d+)", text)
|
| 323 |
+
if unweighted_gpa_match:
|
| 324 |
+
parsed_data['student_info']['unweighted_gpa'] = float(unweighted_gpa_match.group(1))
|
| 325 |
+
|
| 326 |
+
weighted_gpa_match = re.search(r"Weighted GPA (\d+\.\d+)", text)
|
| 327 |
+
if weighted_gpa_match:
|
| 328 |
+
parsed_data['student_info']['weighted_gpa'] = float(weighted_gpa_match.group(1))
|
| 329 |
+
|
| 330 |
+
service_hours_match = re.search(r"Comm Serv Hours (\d+)", text)
|
| 331 |
+
if service_hours_match:
|
| 332 |
+
parsed_data['student_info']['community_service_hours'] = int(service_hours_match.group(1))
|
| 333 |
|
| 334 |
+
service_date_match = re.search(r"Comm Serv Date (\d{2}/\d{2}/\d{4})", text)
|
| 335 |
+
if service_date_match:
|
| 336 |
+
parsed_data['student_info']['community_service_date'] = service_date_match.group(1)
|
| 337 |
|
| 338 |
+
credits_match = re.search(r"Total Credits Earned (\d+\.\d+)", text)
|
| 339 |
+
if credits_match:
|
| 340 |
+
parsed_data['student_info']['total_credits'] = float(credits_match.group(1))
|
| 341 |
|
| 342 |
+
virtual_grade_match = re.search(r"Virtual Grade (\w+)", text)
|
| 343 |
+
if virtual_grade_match:
|
| 344 |
+
parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)
|
| 345 |
+
|
| 346 |
+
# Extract requirements
|
| 347 |
+
req_pattern = re.compile(r"([A-Z]-.*?)\s*\|\s*(.*?)\s*\|\s*(\d+\.\d+)\s*\|\s*(\d+\.\d+)\s*\|\s*(\d+\.\d+)\s*\|\s*(\d+) %")
|
| 348 |
+
for match in req_pattern.finditer(text):
|
| 349 |
+
code = match.group(1).strip()
|
| 350 |
+
desc = match.group(2).strip()
|
| 351 |
+
required = float(match.group(3))
|
| 352 |
+
waived = float(match.group(4))
|
| 353 |
+
completed = float(match.group(5))
|
| 354 |
+
percent = float(match.group(6))
|
| 355 |
+
parsed_data['requirements'][code] = {
|
| 356 |
+
"description": desc,
|
| 357 |
+
"required": required,
|
| 358 |
+
"waived": waived,
|
| 359 |
+
"completed": completed,
|
| 360 |
+
"percent_complete": percent
|
| 361 |
+
}
|
| 362 |
+
|
| 363 |
+
# Extract assessments
|
| 364 |
+
assess_pattern = re.compile(r"Z-Assessment: (.*?)\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+) %")
|
| 365 |
+
for match in assess_pattern.finditer(text):
|
| 366 |
+
name = f"Assessment: {match.group(1)}"
|
| 367 |
+
status = match.group(3)
|
| 368 |
+
parsed_data['assessments'][name] = status
|
| 369 |
+
|
| 370 |
+
for z_item in ["Community Service Hours", "GPA"]:
|
| 371 |
+
if re.search(fr"Z-{z_item.replace(' ', '.*?')}\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+) %", text):
|
| 372 |
+
status = re.search(fr"Z-{z_item.replace(' ', '.*?')}\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+) %", text).group(2)
|
| 373 |
+
parsed_data['assessments'][z_item] = status
|
| 374 |
+
|
| 375 |
+
# Extract courses (simplified for now - can be enhanced)
|
| 376 |
course_pattern = r'([A-Z]{2,4}\s?\d{3})\s+(.*?)\s+([A-F][+-]?)\s+([0-9.]+)'
|
| 377 |
courses = re.findall(course_pattern, text)
|
| 378 |
for course in courses:
|
|
|
|
| 386 |
return parsed_data
|
| 387 |
|
| 388 |
except Exception as e:
|
| 389 |
+
logging.warning(f"Detailed transcript parsing failed, falling back to simple parser: {str(e)}")
|
| 390 |
+
return None
|
| 391 |
+
|
| 392 |
+
def _parse_simplified_transcript(self, text: str) -> Dict:
|
| 393 |
+
"""Fallback simplified transcript parser that extracts key information"""
|
| 394 |
+
parsed_data = {
|
| 395 |
+
'student_info': {},
|
| 396 |
+
'course_history': []
|
| 397 |
+
}
|
| 398 |
+
|
| 399 |
+
# Extract student information
|
| 400 |
+
name_match = re.search(r'(?:Name|Student)[:\s]+([A-Za-z,\s]+)', text, re.IGNORECASE)
|
| 401 |
+
if name_match:
|
| 402 |
+
parsed_data['student_info']['name'] = name_match.group(1).strip()
|
| 403 |
+
|
| 404 |
+
id_match = re.search(r'(?:ID|Student\s*ID)[:\s]+([A-Za-z0-9-]+)', text, re.IGNORECASE)
|
| 405 |
+
if id_match:
|
| 406 |
+
parsed_data['student_info']['id'] = id_match.group(1).strip()
|
| 407 |
+
|
| 408 |
+
gpa_match = re.search(r'(?:GPA|Grade\s*Point\s*Average)[:\s]+([0-9.]+)', text, re.IGNORECASE)
|
| 409 |
+
if gpa_match:
|
| 410 |
+
parsed_data['student_info']['gpa'] = float(gpa_match.group(1))
|
| 411 |
+
|
| 412 |
+
# Extract courses (simplified pattern)
|
| 413 |
+
course_pattern = r'([A-Z]{2,4}\s?\d{3})\s+(.*?)\s+([A-F][+-]?)\s+([0-9.]+)'
|
| 414 |
+
courses = re.findall(course_pattern, text)
|
| 415 |
+
for course in courses:
|
| 416 |
+
parsed_data['course_history'].append({
|
| 417 |
+
'course_code': course[0],
|
| 418 |
+
'description': course[1],
|
| 419 |
+
'grade': course[2],
|
| 420 |
+
'credits': float(course[3])
|
| 421 |
+
})
|
| 422 |
+
|
| 423 |
+
return parsed_data
|
| 424 |
|
| 425 |
def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
|
| 426 |
"""Process transcript file and return simple confirmation"""
|