Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -235,12 +235,170 @@ def remove_sensitive_info(text: str) -> str:
|
|
| 235 |
return text
|
| 236 |
|
| 237 |
# ========== TRANSCRIPT PARSING ==========
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
|
| 239 |
"""Use AI model to parse transcript text with progress feedback"""
|
| 240 |
model, tokenizer = model_loader.load_model(model_loader.current_model or DEFAULT_MODEL, progress)
|
| 241 |
if model is None or tokenizer is None:
|
| 242 |
raise gr.Error(f"Model failed to load. {model_loader.error or 'Please try loading a model first.'}")
|
| 243 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
# Pre-process the text
|
| 245 |
text = remove_sensitive_info(text[:15000]) # Limit input size
|
| 246 |
|
|
@@ -263,7 +421,7 @@ def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
|
|
| 263 |
"""
|
| 264 |
|
| 265 |
try:
|
| 266 |
-
progress(0.1, desc="Processing transcript...")
|
| 267 |
|
| 268 |
# Tokenize and generate response
|
| 269 |
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
|
@@ -271,7 +429,7 @@ def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
|
|
| 271 |
|
| 272 |
outputs = model.generate(
|
| 273 |
**inputs,
|
| 274 |
-
max_new_tokens=1500,
|
| 275 |
temperature=0.1,
|
| 276 |
do_sample=True
|
| 277 |
)
|
|
|
|
| 235 |
return text
|
| 236 |
|
| 237 |
# ========== TRANSCRIPT PARSING ==========
|
| 238 |
+
class TranscriptParser:
    """Regex-based parser for pipe-delimited student transcript text.

    Extracts student header info, graduation requirements, course history,
    and in-progress courses, and summarizes credit completion.
    """

    def __init__(self):
        # id / name / GPA / etc. from the transcript header
        self.student_data = {}
        # requirement name -> {"required", "completed", "status"}
        self.requirements = {}
        # subset of course_history still marked "inProgress"
        self.current_courses = []
        # one dict per parsed course row
        self.course_history = []

    def parse_transcript(self, text: str) -> Dict:
        """Parse *text* and return all extracted sections in one dict."""
        self._extract_student_info(text)
        self._extract_requirements(text)
        self._extract_course_history(text)
        self._extract_current_courses(text)

        return {
            "student_info": self.student_data,
            "requirements": self.requirements,
            "current_courses": self.current_courses,
            "course_history": self.course_history,
            "completion_status": self._calculate_completion()
        }

    def _extract_student_info(self, text: str):
        """Extract student personal information from the header lines."""
        # Header shape: "<7-digit id> - <name> | Cohort X | Un-weighted GPA n.nn | Comm Serv Hours n"
        header_match = re.search(
            r"(\d{7}) - ([\w\s,]+)\s*\|\s*Cohort \w+\s*\|\s*Un-weighted GPA ([\d.]+)\s*\|\s*Comm Serv Hours (\d+)",
            text
        )
        if header_match:
            self.student_data = {
                "id": header_match.group(1),
                "name": header_match.group(2).strip(),
                "unweighted_gpa": float(header_match.group(3)),
                "community_service_hours": int(header_match.group(4))
            }

        # Second header line: grade level, graduation year, weighted GPA, credits
        grade_match = re.search(
            r"Current Grade: (\d+)\s*\|\s*YOG (\d{4})\s*\|\s*Weighted GPA ([\d.]+)\s*\|\s*Total Credits Earned ([\d.]+)",
            text
        )
        if grade_match:
            self.student_data.update({
                "current_grade": grade_match.group(1),
                "graduation_year": grade_match.group(2),
                "weighted_gpa": float(grade_match.group(3)),
                "total_credits": float(grade_match.group(4))
            })

    def _extract_requirements(self, text: str):
        """Parse the graduation requirements table."""
        # NOTE(review): columns 1 and 3 of each matched row are intentionally
        # unused here (presumably a description and an intermediate credit
        # column) — confirm against a sample transcript.
        req_table = re.findall(
            r"\|([A-Z]-[\w\s]+)\s*\|([^\|]+)\|([\d.]+)\s*\|([\d.]+)\s*\|([\d.]+)\s*\|([^\|]+)\|",
            text
        )

        for row in req_table:
            req_name = row[0].strip()
            self.requirements[req_name] = {
                "required": float(row[2]),
                "completed": float(row[4]),
                "status": f"{row[5].strip()}%"
            }

    def _extract_course_history(self, text: str):
        """Parse the detailed course history table into dicts."""
        course_lines = re.findall(
            r"\|([A-Z]-[\w\s&\(\)]+)\s*\|(\d{4}-\d{4})\s*\|(\d{2})\s*\|([A-Z0-9]+)\s*\|([^\|]+)\|([^\|]+)\|([^\|]+)\|([A-Z])\s*\|([YRXW]?)\s*\|([^\|]+)\|",
            text
        )

        for course in course_lines:
            self.course_history.append({
                "requirement_category": course[0].strip(),
                "school_year": course[1],
                "grade_level": course[2],
                "course_code": course[3],
                "description": course[4].strip(),
                "term": course[5].strip(),
                "district_number": course[6].strip(),
                "grade": course[7],
                "inclusion_status": course[8],
                "credits": course[9].strip()
            })

    def _extract_current_courses(self, text: str):
        """Identify courses currently in progress (credits column says 'inProgress')."""
        in_progress = [c for c in self.course_history if "inProgress" in c["credits"]]
        self.current_courses = [
            {
                "course": c["description"],
                "category": c["requirement_category"],
                "term": c["term"],
                "credits": c["credits"]
            }
            for c in in_progress
        ]

    def _calculate_completion(self) -> Dict:
        """Calculate overall credit-completion status across all requirements."""
        total_required = sum(req["required"] for req in self.requirements.values())
        total_completed = sum(req["completed"] for req in self.requirements.values())

        # BUGFIX: guard against ZeroDivisionError when the requirements table
        # was absent or unparseable (total_required == 0).
        if total_required:
            percent_complete = round((total_completed / total_required) * 100, 1)
        else:
            percent_complete = 0.0

        return {
            "total_required": total_required,
            "total_completed": total_completed,
            "percent_complete": percent_complete,
            "remaining_credits": total_required - total_completed
        }

    def to_json(self) -> str:
        """Export all parsed data as a pretty-printed JSON string."""
        return json.dumps({
            "student_info": self.student_data,
            "requirements": self.requirements,
            "current_courses": self.current_courses,
            "course_history": self.course_history,
            "completion_status": self._calculate_completion()
        }, indent=2)
| 358 |
def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
    """Use AI model to parse transcript text with progress feedback.

    Tries the deterministic ``TranscriptParser`` first; on any failure,
    falls back to LLM-based extraction. Raises ``gr.Error`` if no model
    can be loaded.
    """
    # NOTE(review): the model is loaded eagerly even when the structured
    # path succeeds and never touches it — confirm this is intentional.
    model, tokenizer = model_loader.load_model(model_loader.current_model or DEFAULT_MODEL, progress)
    if model is None or tokenizer is None:
        raise gr.Error(f"Model failed to load. {model_loader.error or 'Please try loading a model first.'}")

    try:
        # Fast path: regex-based structured parsing, no LLM inference.
        progress(0.1, desc="Parsing transcript structure...")
        parsed_data = TranscriptParser().parse_transcript(text)
        progress(0.9, desc="Formatting results...")

        student = parsed_data["student_info"]
        formatted_data = {
            "grade_level": student.get("current_grade", "Unknown"),
            "gpa": {
                "weighted": student.get("weighted_gpa", "N/A"),
                "unweighted": student.get("unweighted_gpa", "N/A"),
            },
            # Re-key each history entry into the schema downstream code expects.
            "courses": [
                {
                    "code": entry["course_code"],
                    "name": entry["description"],
                    "grade": entry["grade"],
                    "credits": entry["credits"],
                    "year": entry["school_year"],
                    "grade_level": entry["grade_level"],
                }
                for entry in parsed_data["course_history"]
            ],
        }

        progress(1.0)
        return validate_parsed_data(formatted_data)

    except Exception as e:
        # Any structural-parse failure degrades to the slower AI path.
        print(f"Structured parsing failed, falling back to AI: {str(e)}")
        return parse_transcript_with_ai_fallback(text, progress)
| 400 |
+
def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict:
|
| 401 |
+
"""Fallback AI parsing method"""
|
| 402 |
# Pre-process the text
|
| 403 |
text = remove_sensitive_info(text[:15000]) # Limit input size
|
| 404 |
|
|
|
|
| 421 |
"""
|
| 422 |
|
| 423 |
try:
|
| 424 |
+
progress(0.1, desc="Processing transcript with AI...")
|
| 425 |
|
| 426 |
# Tokenize and generate response
|
| 427 |
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
|
|
|
| 429 |
|
| 430 |
outputs = model.generate(
|
| 431 |
**inputs,
|
| 432 |
+
max_new_tokens=1500,
|
| 433 |
temperature=0.1,
|
| 434 |
do_sample=True
|
| 435 |
)
|