Spaces:

Dannyar608
/

Final_project

Runtime error

App Files Files Community

Dannyar608 committed on May 19, 2025

Commit

33513a8

verified ·

1 Parent(s): dc1d757

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -52

app.py CHANGED Viewed

@@ -66,6 +66,15 @@ class ModelLoader:
     def load_model(self, progress: gr.Progress = None) -> Tuple[Optional[AutoModelForCausalLM], Optional[AutoTokenizer]]:
         """Lazy load the model with progress feedback"""
         try:
             if progress:
                 progress(0.1, desc="Checking GPU availability...")
@@ -117,6 +126,8 @@ class ModelLoader:
             self.error = f"Model loading failed: {str(e)}"
             logging.error(self.error)
             return None, None
 # Initialize model loader
 model_loader = ModelLoader()
@@ -170,6 +181,13 @@ def validate_file(file_obj) -> None:
         raise ValueError(f"File too large. Maximum size is {MAX_FILE_SIZE_MB}MB.")
 # ========== TEXT EXTRACTION FUNCTIONS ==========
 def extract_text_from_file(file_path: str, file_ext: str) -> str:
     text = ""
@@ -312,6 +330,8 @@ class TranscriptParser:
     def parse_transcript(self, text: str) -> Dict:
         """Parse transcript text and return structured data"""
         try:
             # First try the new detailed parser
             parsed_data = self._parse_detailed_transcript(text)
             if parsed_data:
@@ -349,28 +369,29 @@ class TranscriptParser:
             if yog_match:
                 parsed_data['student_info']['year_of_graduation'] = yog_match.group(1)
-            # Improved GPA extraction
-            gpa_matches = re.findall(r"(?:Un-weighted|Weighted)\s*GPA\s*([\d.]+)", text, re.IGNORECASE)
-            if len(gpa_matches) >= 2:
                 parsed_data['student_info']['unweighted_gpa'] = float(gpa_matches[0])
                 parsed_data['student_info']['weighted_gpa'] = float(gpa_matches[1])
             # Community service info
-            service_hours_match = re.search(r"Comm\s*Serv\s*Hours\s*(\d+)", text, re.IGNORECASE)
             if service_hours_match:
                 parsed_data['student_info']['community_service_hours'] = int(service_hours_match.group(1))
-            service_date_match = re.search(r"Comm\s*Serv\s*Date\s*(\d{2}/\d{2}/\d{4})", text, re.IGNORECASE)
             if service_date_match:
                 parsed_data['student_info']['community_service_date'] = service_date_match.group(1)
             # Credits info
-            credits_match = re.search(r"Total\s*Credits\s*Earned\s*([\d.]+)", text, re.IGNORECASE)
             if credits_match:
                 parsed_data['student_info']['total_credits'] = float(credits_match.group(1))
             # Virtual grade
-            virtual_grade_match = re.search(r"Virtual\s*Grade\s*(\w+)", text, re.IGNORECASE)
             if virtual_grade_match:
                 parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)
@@ -379,10 +400,10 @@ class TranscriptParser:
             for match in req_pattern.finditer(text):
                 code = match.group(1).strip()
                 desc = match.group(2).strip()
-                required = float(match.group(3))
-                waived = float(match.group(4))
-                completed = float(match.group(5))
-                percent = float(match.group(6))
                 parsed_data['requirements'][code] = {
                     "description": desc,
                     "required": required,
@@ -392,7 +413,7 @@ class TranscriptParser:
                 }
             # Extract assessments with more flexible pattern
-            assess_pattern = re.compile(r"Z-Assessment:\s*(.*?)\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+)\s*%")
             for match in assess_pattern.finditer(text):
                 name = f"Assessment: {match.group(1).strip()}"
                 status = match.group(3).strip()
@@ -406,22 +427,22 @@ class TranscriptParser:
                     parsed_data['assessments'][z_item] = status
             # Extract course history with more robust pattern
-            course_history_section = re.search(r"Requirement.*?School Year.*?GradeLv1.*?CrsNum.*?Description.*?Term.*?DstNumber.*?FG.*?Incl.*?Credits(.*?)(?:\n\s*\n|$)", text, re.DOTALL)
             if course_history_section:
                 course_lines = [line.strip() for line in course_history_section.group(1).split('\n') if line.strip()]
                 for line in course_lines:
                     parts = [part.strip() for part in line.split('|')]
                     if len(parts) >= 9:
                         course = {
-                            'requirement': parts[0],
-                            'school_year': parts[1],
-                            'grade_level': parts[2],
-                            'course_code': parts[3],
-                            'description': parts[4],
-                            'term': parts[5],
-                            'district_number': parts[6],
-                            'fg': parts[7],
-                            'included': parts[8],
                             'credits': parts[9] if len(parts) > 9 else "0"
                         }
                         parsed_data['course_history'].append(course)
@@ -435,7 +456,7 @@ class TranscriptParser:
     def _parse_simplified_transcript(self, text: str) -> Dict:
         """Fallback simplified transcript parser with multiple pattern attempts"""
         patterns = [
-            (r'(?:Course|Subject)\s*Code.*?Grade.*?Credits(.*?)(?:\n\s*\n|\Z)', 'table'),
             (r'([A-Z]{2,4}\s?\d{3}[A-Z]?)\s+(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)', 'line'),
             (r'(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)', 'minimal')
         ]
@@ -444,8 +465,10 @@ class TranscriptParser:
             try:
                 if pattern_type == 'table':
                     # Parse tabular data
-                    courses = re.findall(r'([A-Z]{2,4}\s?\d{3}[A-Z]?)\s+(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)',
-                                       re.search(pattern, text, re.DOTALL).group(1))
                 elif pattern_type == 'line':
                     courses = re.findall(pattern, text)
                 else:
@@ -454,14 +477,22 @@ class TranscriptParser:
                 if courses:
                     parsed_data = {'course_history': []}
                     for course in courses:
-                        parsed_data['course_history'].append({
-                            'course_code': course[0].strip(),
-                            'description': course[1].strip() if len(course) > 1 else '',
-                            'grade': course[2].strip() if len(course) > 2 else '',
-                            'credits': float(course[3]) if len(course) > 3 else 0.0
-                        })
                     return parsed_data
-            except:
                 continue
         raise ValueError("Could not identify course information in transcript")
@@ -469,7 +500,7 @@ class TranscriptParser:
 # ========== ENHANCED ANALYSIS FUNCTIONS ==========
 def analyze_gpa(parsed_data: Dict) -> str:
     try:
-        gpa = float(parsed_data['student_info']['weighted_gpa'])
         if gpa >= 4.5:
             return "🌟 Excellent GPA! You're in the top tier of students."
         elif gpa >= 3.5:
@@ -484,15 +515,15 @@ def analyze_gpa(parsed_data: Dict) -> str:
 def analyze_graduation_status(parsed_data: Dict) -> str:
     try:
         total_required = sum(
-            float(req['required'])
-            for req in parsed_data['requirements'].values()
-            if req.get('required') and str(req['required']).replace('.', '').isdigit()
         )
         total_completed = sum(
-            float(req['completed'])
-            for req in parsed_data['requirements'].values()
-            if req.get('completed') and str(req['completed']).replace('.', '').isdigit()
         )
         completion_percentage = (total_completed / total_required) * 100 if total_required > 0 else 0
@@ -513,7 +544,7 @@ def generate_advice(parsed_data: Dict) -> str:
     # GPA advice
     try:
-        gpa = float(parsed_data['student_info']['weighted_gpa'])
         if gpa < 3.0:
             advice.append("📚 Your GPA could improve. Consider:\n- Seeking tutoring for challenging subjects\n- Meeting with teachers during office hours\n- Developing better study habits")
     except (TypeError, ValueError, KeyError, AttributeError):
@@ -521,7 +552,7 @@ def generate_advice(parsed_data: Dict) -> str:
     # Community service advice
     try:
-        service_hours = int(parsed_data['student_info']['community_service_hours'])
         if service_hours < 100:
             advice.append("🤝 Consider more community service:\n- Many colleges value 100+ hours\n- Look for opportunities that align with your interests")
     except (TypeError, ValueError, KeyError, AttributeError):
@@ -530,19 +561,20 @@ def generate_advice(parsed_data: Dict) -> str:
     # Missing requirements advice
     try:
         missing_reqs = [
-            req for code, req in parsed_data['requirements'].items()
-            if float(req['percent_complete']) < 100 and not code.startswith("Z-Assessment")
         ]
         if missing_reqs:
-            req_list = "\n- ".join([f"{code}: {req['description']}" for code, req in missing_reqs])
             advice.append(f"🎓 Focus on completing these requirements:\n- {req_list}")
     except (TypeError, ValueError, KeyError, AttributeError):
         pass
     # Course rigor advice
     try:
-        ap_count = sum(1 for course in parsed_data['course_history'] if "Advanced Placement" in course['description'])
         if ap_count < 3:
             advice.append("🧠 Consider taking more challenging courses:\n- AP/IB courses can strengthen college applications\n- Shows academic rigor to admissions officers")
     except (TypeError, KeyError, AttributeError):
@@ -552,9 +584,10 @@ def generate_advice(parsed_data: Dict) -> str:
 def generate_college_recommendations(parsed_data: Dict) -> str:
     try:
-        gpa = float(parsed_data['student_info']['weighted_gpa'])
-        ap_count = sum(1 for course in parsed_data['course_history'] if "Advanced Placement" in course['description'])
-        service_hours = int(parsed_data['student_info']['community_service_hours']) if parsed_data['student_info'].get('community_service_hours') else 0
         recommendations = []
@@ -589,8 +622,8 @@ def create_gpa_visualization(parsed_data: Dict):
         gpa_data = {
             "Type": ["Weighted GPA", "Unweighted GPA"],
             "Value": [
-                float(parsed_data['student_info']['weighted_gpa']),
-                float(parsed_data['student_info']['unweighted_gpa'])
             ]
         }
         df = pd.DataFrame(gpa_data)
@@ -606,8 +639,8 @@ def create_gpa_visualization(parsed_data: Dict):
 def create_requirements_visualization(parsed_data: Dict):
     try:
         req_data = []
-        for code, req in parsed_data['requirements'].items():
-            if req.get('percent_complete'):
                 completion = float(req['percent_complete'])
                 req_data.append({
                     "Requirement": code,
@@ -663,6 +696,8 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Di
         parser = TranscriptParser()
         try:
             parsed_data = parser.parse_transcript(text)
         except Exception as e:
             raise ValueError(f"Couldn't parse transcript content. Error: {str(e)}")

     def load_model(self, progress: gr.Progress = None) -> Tuple[Optional[AutoModelForCausalLM], Optional[AutoTokenizer]]:
         """Lazy load the model with progress feedback"""
+        if self.loaded:
+            return self.model, self.tokenizer
+        if self.loading:
+            while self.loading:
+                time.sleep(0.1)
+            return self.model, self.tokenizer
+        self.loading = True
         try:
             if progress:
                 progress(0.1, desc="Checking GPU availability...")
             self.error = f"Model loading failed: {str(e)}"
             logging.error(self.error)
             return None, None
+        finally:
+            self.loading = False
 # Initialize model loader
 model_loader = ModelLoader()
         raise ValueError(f"File too large. Maximum size is {MAX_FILE_SIZE_MB}MB.")
 # ========== TEXT EXTRACTION FUNCTIONS ==========
+def preprocess_text(text: str) -> str:
+    """Normalize text for more reliable parsing"""
+    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
+    text = text.replace('|', ' ')      # Handle common OCR errors
+    text = text.upper()                # Standardize case for certain fields
+    return text
 def extract_text_from_file(file_path: str, file_ext: str) -> str:
     text = ""
     def parse_transcript(self, text: str) -> Dict:
         """Parse transcript text and return structured data"""
         try:
+            text = preprocess_text(text)
             # First try the new detailed parser
             parsed_data = self._parse_detailed_transcript(text)
             if parsed_data:
             if yog_match:
                 parsed_data['student_info']['year_of_graduation'] = yog_match.group(1)
+            # Improved GPA extraction with more flexible patterns
+            gpa_matches = re.findall(r"(?:UNWEIGHTED|WEIGHTED)\s*GPA\s*([\d.]+)", text, re.IGNORECASE)
+            if len(gpa_matches) >= 1:
                 parsed_data['student_info']['unweighted_gpa'] = float(gpa_matches[0])
+            if len(gpa_matches) >= 2:
                 parsed_data['student_info']['weighted_gpa'] = float(gpa_matches[1])
             # Community service info
+            service_hours_match = re.search(r"COMM\s*SERV\s*HOURS\s*(\d+)", text, re.IGNORECASE)
             if service_hours_match:
                 parsed_data['student_info']['community_service_hours'] = int(service_hours_match.group(1))
+            service_date_match = re.search(r"COMM\s*SERV\s*DATE\s*(\d{2}/\d{2}/\d{4})", text, re.IGNORECASE)
             if service_date_match:
                 parsed_data['student_info']['community_service_date'] = service_date_match.group(1)
             # Credits info
+            credits_match = re.search(r"TOTAL\s*CREDITS\s*EARNED\s*([\d.]+)", text, re.IGNORECASE)
             if credits_match:
                 parsed_data['student_info']['total_credits'] = float(credits_match.group(1))
             # Virtual grade
+            virtual_grade_match = re.search(r"VIRTUAL\s*GRADE\s*(\w+)", text, re.IGNORECASE)
             if virtual_grade_match:
                 parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)
             for match in req_pattern.finditer(text):
                 code = match.group(1).strip()
                 desc = match.group(2).strip()
+                required = float(match.group(3)) if match.group(3) else 0.0
+                waived = float(match.group(4)) if match.group(4) else 0.0
+                completed = float(match.group(5)) if match.group(5) else 0.0
+                percent = float(match.group(6)) if match.group(6) else 0.0
                 parsed_data['requirements'][code] = {
                     "description": desc,
                     "required": required,
                 }
             # Extract assessments with more flexible pattern
+            assess_pattern = re.compile(r"Z-ASSESSMENT:\s*(.*?)\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+)\s*%", re.IGNORECASE)
             for match in assess_pattern.finditer(text):
                 name = f"Assessment: {match.group(1).strip()}"
                 status = match.group(3).strip()
                     parsed_data['assessments'][z_item] = status
             # Extract course history with more robust pattern
+            course_history_section = re.search(r"REQUIREMENT.*?SCHOOL YEAR.*?GRADELV1.*?CRSNUM.*?DESCRIPTION.*?TERM.*?DSTNUMBER.*?FG.*?INCL.*?CREDITS(.*?)(?:\n\s*\n|$)", text, re.DOTALL | re.IGNORECASE)
             if course_history_section:
                 course_lines = [line.strip() for line in course_history_section.group(1).split('\n') if line.strip()]
                 for line in course_lines:
                     parts = [part.strip() for part in line.split('|')]
                     if len(parts) >= 9:
                         course = {
+                            'requirement': parts[0] if len(parts) > 0 else "",
+                            'school_year': parts[1] if len(parts) > 1 else "",
+                            'grade_level': parts[2] if len(parts) > 2 else "",
+                            'course_code': parts[3] if len(parts) > 3 else "",
+                            'description': parts[4] if len(parts) > 4 else "",
+                            'term': parts[5] if len(parts) > 5 else "",
+                            'district_number': parts[6] if len(parts) > 6 else "",
+                            'fg': parts[7] if len(parts) > 7 else "",
+                            'included': parts[8] if len(parts) > 8 else "",
                             'credits': parts[9] if len(parts) > 9 else "0"
                         }
                         parsed_data['course_history'].append(course)
     def _parse_simplified_transcript(self, text: str) -> Dict:
         """Fallback simplified transcript parser with multiple pattern attempts"""
         patterns = [
+            (r'(?:COURSE|SUBJECT)\s*CODE.*?GRADE.*?CREDITS(.*?)(?:\n\s*\n|\Z)', 'table'),
             (r'([A-Z]{2,4}\s?\d{3}[A-Z]?)\s+(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)', 'line'),
             (r'(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)', 'minimal')
         ]
             try:
                 if pattern_type == 'table':
                     # Parse tabular data
+                    table_section = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
+                    if table_section:
+                        courses = re.findall(r'([A-Z]{2,4}\s?\d{3}[A-Z]?)\s+(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)',
+                                           table_section.group(1))
                 elif pattern_type == 'line':
                     courses = re.findall(pattern, text)
                 else:
                 if courses:
                     parsed_data = {'course_history': []}
                     for course in courses:
+                        if len(course) >= 4:
+                            parsed_data['course_history'].append({
+                                'course_code': course[0].strip(),
+                                'description': course[1].strip(),
+                                'grade': course[2].strip(),
+                                'credits': float(course[3]) if course[3] else 0.0
+                            })
+                        elif len(course) == 3:
+                            parsed_data['course_history'].append({
+                                'description': course[0].strip(),
+                                'grade': course[1].strip(),
+                                'credits': float(course[2]) if course[2] else 0.0
+                            })
                     return parsed_data
+            except Exception as e:
+                logging.warning(f"Pattern {pattern} failed: {str(e)}")
                 continue
         raise ValueError("Could not identify course information in transcript")
 # ========== ENHANCED ANALYSIS FUNCTIONS ==========
 def analyze_gpa(parsed_data: Dict) -> str:
     try:
+        gpa = float(parsed_data['student_info'].get('weighted_gpa', 0))
         if gpa >= 4.5:
             return "🌟 Excellent GPA! You're in the top tier of students."
         elif gpa >= 3.5:
 def analyze_graduation_status(parsed_data: Dict) -> str:
     try:
         total_required = sum(
+            float(req.get('required', 0))
+            for req in parsed_data.get('requirements', {}).values()
+            if req and str(req.get('required', '0')).replace('.', '').isdigit()
         )
         total_completed = sum(
+            float(req.get('completed', 0))
+            for req in parsed_data.get('requirements', {}).values()
+            if req and str(req.get('completed', '0')).replace('.', '').isdigit()
         )
         completion_percentage = (total_completed / total_required) * 100 if total_required > 0 else 0
     # GPA advice
     try:
+        gpa = float(parsed_data.get('student_info', {}).get('weighted_gpa', 0))
         if gpa < 3.0:
             advice.append("📚 Your GPA could improve. Consider:\n- Seeking tutoring for challenging subjects\n- Meeting with teachers during office hours\n- Developing better study habits")
     except (TypeError, ValueError, KeyError, AttributeError):
     # Community service advice
     try:
+        service_hours = int(parsed_data.get('student_info', {}).get('community_service_hours', 0))
         if service_hours < 100:
             advice.append("🤝 Consider more community service:\n- Many colleges value 100+ hours\n- Look for opportunities that align with your interests")
     except (TypeError, ValueError, KeyError, AttributeError):
     # Missing requirements advice
     try:
         missing_reqs = [
+            req for code, req in parsed_data.get('requirements', {}).items()
+            if req and float(req.get('percent_complete', 0)) < 100 and not code.startswith("Z-Assessment")
         ]
         if missing_reqs:
+            req_list = "\n- ".join([f"{code}: {req.get('description', '')}" for code, req in missing_reqs])
             advice.append(f"🎓 Focus on completing these requirements:\n- {req_list}")
     except (TypeError, ValueError, KeyError, AttributeError):
         pass
     # Course rigor advice
     try:
+        ap_count = sum(1 for course in parsed_data.get('course_history', [])
+                      if course and "ADVANCED PLACEMENT" in course.get('description', '').upper())
         if ap_count < 3:
             advice.append("🧠 Consider taking more challenging courses:\n- AP/IB courses can strengthen college applications\n- Shows academic rigor to admissions officers")
     except (TypeError, KeyError, AttributeError):
 def generate_college_recommendations(parsed_data: Dict) -> str:
     try:
+        gpa = float(parsed_data.get('student_info', {}).get('weighted_gpa', 0))
+        ap_count = sum(1 for course in parsed_data.get('course_history', [])
+                      if course and "ADVANCED PLACEMENT" in course.get('description', '').upper())
+        service_hours = int(parsed_data.get('student_info', {}).get('community_service_hours', 0))
         recommendations = []
         gpa_data = {
             "Type": ["Weighted GPA", "Unweighted GPA"],
             "Value": [
+                float(parsed_data.get('student_info', {}).get('weighted_gpa', 0)),
+                float(parsed_data.get('student_info', {}).get('unweighted_gpa', 0))
             ]
         }
         df = pd.DataFrame(gpa_data)
 def create_requirements_visualization(parsed_data: Dict):
     try:
         req_data = []
+        for code, req in parsed_data.get('requirements', {}).items():
+            if req and req.get('percent_complete'):
                 completion = float(req['percent_complete'])
                 req_data.append({
                     "Requirement": code,
         parser = TranscriptParser()
         try:
             parsed_data = parser.parse_transcript(text)
+            if not parsed_data:
+                raise ValueError("No data could be parsed from the transcript.")
         except Exception as e:
             raise ValueError(f"Couldn't parse transcript content. Error: {str(e)}")