Spaces:

Dannyar608
/

Final_project

Runtime error

App Files Files Community

Dannyar608 committed on May 20, 2025

Commit

df3101e

verified ·

1 Parent(s): e0ad8bb

Update app.py

Browse files

Files changed (1) hide show

app.py +122 -92

app.py CHANGED Viewed

@@ -184,7 +184,6 @@ def validate_file(file_obj) -> None:
 def preprocess_text(text: str) -> str:
     """Normalize text for more reliable parsing"""
     text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
-    text = text.replace('|', ' ')      # Handle common OCR errors
     text = text.upper()                # Standardize case for certain fields
     return text
@@ -198,29 +197,32 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
                 import pdfplumber
                 with pdfplumber.open(file_path) as pdf:
                     for page in pdf.pages:
-                        # Try tables first
-                        tables = page.extract_tables()
                         if tables:
                             for table in tables:
-                                text += "\n".join(
-                                    " | ".join(str(cell) for cell in row if cell is not None)
-                                    for row in table
-                                ) + "\n"
-                        # Fall back to text extraction
                         page_text = page.extract_text()
                         if page_text:
                             text += page_text + "\n"
                 if not text.strip():
                     raise ValueError("PDFPlumber returned empty text")
             except Exception as e:
                 logging.warning(f"PDFPlumber failed: {str(e)}. Trying PyMuPDF...")
                 doc = fitz.open(file_path)
                 for page in doc:
                     text += page.get_text("text") + '\n'
-                if not text.strip():
-                    logging.warning("PyMuPDF returned empty text, trying OCR fallback...")
-                    text = extract_text_from_pdf_with_ocr(file_path)
         elif file_ext in ['.png', '.jpg', '.jpeg']:
             text = extract_text_with_ocr(file_path)
@@ -233,7 +235,7 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
     except Exception as e:
         logging.error(f"Text extraction error: {str(e)}")
-        raise gr.Error(f"Failed to extract text: {str(e)}\n\nPossible solutions:\n1. Try a different file format\n2. Ensure text is clear and not handwritten\n3. Check file size (<5MB)")
 def extract_text_from_pdf_with_ocr(file_path: str) -> str:
     try:
@@ -271,18 +273,35 @@ def extract_text_with_ocr(file_path: str) -> str:
         raise ValueError(f"OCR processing failed: {str(e)}")
 def clean_extracted_text(text: str) -> str:
     text = re.sub(r'\s+', ' ', text).strip()
     replacements = {
-        '|': 'I',
-        '‘': "'",
-        '’': "'",
-        '“': '"',
-        '”': '"',
-        'ﬁ': 'fi',
-        'ﬂ': 'fl'
     }
-    for wrong, right in replacements.items():
-        text = text.replace(wrong, right)
     return text
 def remove_sensitive_info(text: str) -> str:
@@ -345,7 +364,7 @@ class TranscriptParser:
             raise ValueError(f"Couldn't parse transcript content. Error: {str(e)}")
     def _parse_detailed_transcript(self, text: str) -> Optional[Dict]:
-        """Parse detailed transcript format with improved patterns"""
         try:
             parsed_data = {
                 'student_info': {},
@@ -354,102 +373,113 @@ class TranscriptParser:
                 'assessments': {}
             }
-            # Extract student info with more flexible patterns
             student_info_match = re.search(r"(\d{7}) - (.*?)\n", text)
             if student_info_match:
                 parsed_data['student_info']['id'] = student_info_match.group(1)
                 parsed_data['student_info']['name'] = student_info_match.group(2).strip()
-            # More flexible grade and year extraction
-            current_grade_match = re.search(r"Current Grade:\s*(\d+)", text, re.IGNORECASE)
-            if current_grade_match:
-                parsed_data['student_info']['grade'] = current_grade_match.group(1)
-            yog_match = re.search(r"YOG\s*(\d{4})", text, re.IGNORECASE)
             if yog_match:
                 parsed_data['student_info']['year_of_graduation'] = yog_match.group(1)
-            # Improved GPA extraction with more flexible patterns
-            gpa_matches = re.findall(r"(?:UN.?WEIGHTED|WEIGHTED)\s*GPA\s*([\d.]+)", text, re.IGNORECASE)
             if len(gpa_matches) >= 1:
                 parsed_data['student_info']['unweighted_gpa'] = float(gpa_matches[0])
             if len(gpa_matches) >= 2:
                 parsed_data['student_info']['weighted_gpa'] = float(gpa_matches[1])
-            # Community service info
-            service_hours_match = re.search(r"COMM\s*SERV\s*HOURS\s*(\d+)", text, re.IGNORECASE)
             if service_hours_match:
                 parsed_data['student_info']['community_service_hours'] = int(service_hours_match.group(1))
-            service_date_match = re.search(r"COMM\s*SERV\s*DATE\s*(\d{2}/\d{2}/\d{4})", text, re.IGNORECASE)
             if service_date_match:
                 parsed_data['student_info']['community_service_date'] = service_date_match.group(1)
-            # Credits info
-            credits_match = re.search(r"TOTAL\s*CREDITS\s*EARNED\s*([\d.]+)", text, re.IGNORECASE)
             if credits_match:
                 parsed_data['student_info']['total_credits'] = float(credits_match.group(1))
-            # Virtual grade
-            virtual_grade_match = re.search(r"VIRTUAL\s*GRADE\s*(\w+)", text, re.IGNORECASE)
             if virtual_grade_match:
                 parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)
-            # Extract requirements with improved pattern
-            req_pattern = re.compile(r"([A-Z]-[^\|]+)\s*\|\s*([^\|]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d]+)\s*%")
-            for match in req_pattern.finditer(text):
-                code = match.group(1).strip()
-                desc = match.group(2).strip()
-                required = float(match.group(3)) if match.group(3) else 0.0
-                waived = float(match.group(4)) if match.group(4) else 0.0
-                completed = float(match.group(5)) if match.group(5) else 0.0
-                percent = float(match.group(6)) if match.group(6) else 0.0
-                parsed_data['requirements'][code] = {
-                    "description": desc,
-                    "required": required,
-                    "waived": waived,
-                    "completed": completed,
-                    "percent_complete": percent
-                }
-            # Extract assessments with more flexible pattern
-            assess_pattern = re.compile(r"Z-([^\|]+)\s*\|\s*([^\|]*)\s*\|\s*([^\|]*)\s*\|\s*([^\|]*)\s*%", re.IGNORECASE)
-            for match in assess_pattern.finditer(text):
-                name = f"Assessment: {match.group(1).strip()}"
-                status = match.group(3).strip() if match.group(3) else ""
-                if status:
-                    parsed_data['assessments'][name] = status
-            # Handle other Z items
-            for z_item in ["Community Service Hours", "GPA"]:
-                z_match = re.search(fr"Z-{z_item.replace(' ', '.*?')}\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+)\s*%", text, re.IGNORECASE)
-                if z_match:
-                    status = z_match.group(2).strip()
-                    parsed_data['assessments'][z_item] = status
-            # Extract course history with more robust pattern
-            course_history_section = re.search(r"Requirement.*?School Year.*?GradeLv1.*?CrsNum.*?Description.*?Term.*?DstNumber.*?FG.*?Incl.*?Credits(.*?)(?:\n\s*\n|$)", text, re.DOTALL | re.IGNORECASE)
-            if course_history_section:
-                course_lines = [line.strip() for line in course_history_section.group(1).split('\n') if line.strip() and '|' in line]
                 for line in course_lines:
-                    parts = [part.strip() for part in line.split('|')]
-                    if len(parts) >= 9:
-                        course = {
-                            'requirement': parts[0] if len(parts) > 0 else "",
-                            'school_year': parts[1] if len(parts) > 1 else "",
-                            'grade_level': parts[2] if len(parts) > 2 else "",
-                            'course_code': parts[3] if len(parts) > 3 else "",
-                            'description': parts[4] if len(parts) > 4 else "",
-                            'term': parts[5] if len(parts) > 5 else "",
-                            'district_number': parts[6] if len(parts) > 6 else "",
-                            'fg': parts[7] if len(parts) > 7 else "",
-                            'included': parts[8] if len(parts) > 8 else "",
-                            'credits': parts[9] if len(parts) > 9 else "0"
-                        }
-                        # Handle "inProgress" credits
-                        if "inProgress" in course['credits'].lower():
-                            course['credits'] = "0"
-                        parsed_data['course_history'].append(course)
             return parsed_data

 def preprocess_text(text: str) -> str:
     """Return *text* with whitespace collapsed and every letter upper-cased.

     Each run of whitespace (spaces, tabs, newlines) becomes one space; a
     leading or trailing run is reduced to a single space, not stripped.
     The whole string is upper-cased, so any pattern applied to the result
     must be upper-case or case-insensitive.

     Args:
         text: Raw text to normalize.

     Returns:
         The single-spaced, upper-cased text.
     """
     single_spaced = re.sub(r'\s+', ' ', text)
     return single_spaced.upper()
                 import pdfplumber
                 with pdfplumber.open(file_path) as pdf:
                     for page in pdf.pages:
+                        # Try to extract tables first
+                        tables = page.extract_tables({
+                            "vertical_strategy": "text",
+                            "horizontal_strategy": "text",
+                            "intersection_y_tolerance": 10
+                        })
                         if tables:
                             for table in tables:
+                                for row in table:
+                                    text += " | ".join(str(cell).strip() for cell in row if cell) + "\n"
+                        # Fall back to text extraction if tables are empty
                         page_text = page.extract_text()
                         if page_text:
                             text += page_text + "\n"
                 if not text.strip():
                     raise ValueError("PDFPlumber returned empty text")
             except Exception as e:
                 logging.warning(f"PDFPlumber failed: {str(e)}. Trying PyMuPDF...")
                 doc = fitz.open(file_path)
                 for page in doc:
                     text += page.get_text("text") + '\n'
         elif file_ext in ['.png', '.jpg', '.jpeg']:
             text = extract_text_with_ocr(file_path)
     except Exception as e:
         logging.error(f"Text extraction error: {str(e)}")
+        raise ValueError(f"Failed to extract text: {str(e)}")
 def extract_text_from_pdf_with_ocr(file_path: str) -> str:
     try:
         raise ValueError(f"OCR processing failed: {str(e)}")
 def clean_extracted_text(text: str) -> str:
     """Special cleaning for Miami-Dade transcripts.

     Collapses all whitespace (newlines included) to single spaces, then
     repairs a fixed set of extraction/OCR glitches: misread column headers,
     abbreviations, spaced-out requirement codes, split course codes and
     "in progress" credit markers.

     Args:
         text: Raw text extracted from a transcript.

     Returns:
         The cleaned text, on a single line with no leading or trailing
         whitespace.
     """
     # Normalize whitespace; note this also replaces every newline with a space.
     text = re.sub(r'\s+', ' ', text).strip()

     # Literal misreads and abbreviations. They are escaped and anchored on
     # word boundaries so that e.g. 'YOG' does not rewrite 'YOGA' and
     # 'Comm Serv' does not rewrite the already-correct 'Comm Service'.
     literal_fixes = {
         'GradeLv1': 'GradeLvl',
         'CrsNu m': 'CrsNum',
         'YOG': 'Year of Graduation',
         'Comm Serv': 'Community Service',
     }
     for wrong, right in literal_fixes.items():
         text = re.sub(rf'\b{re.escape(wrong)}\b', right, text)

     # Fix requirement codes: a standalone letter A-G or Z followed by a
     # hyphen loses the whitespace around that hyphen ('A - Eng' -> 'A-Eng').
     # NOTE(review): this also matches letter grades such as 'A- ', pulling
     # the following token up against the hyphen - confirm that is acceptable.
     text = re.sub(r'\b([A-GZ])\s*-\s*', r'\1-', text)

     # Fix course codes with spaces ('MATH 101' -> 'MATH101').
     text = re.sub(r'(\b[A-Z]{2,4})\s(\d{3}[A-Z]?\b)', r'\1\2', text)

     # Fix common OCR errors in credits; the word boundaries keep phrases
     # such as 'Main Progress' from being fused into 'MainProgress'.
     text = re.sub(r'\bin\s*Progress\b', 'inProgress', text, flags=re.IGNORECASE)
     return text
 def remove_sensitive_info(text: str) -> str:
             raise ValueError(f"Couldn't parse transcript content. Error: {str(e)}")
     def _parse_detailed_transcript(self, text: str) -> Optional[Dict]:
+        """Parse detailed transcript format with improved patterns for Miami-Dade format"""
         try:
             parsed_data = {
                 'student_info': {},
                 'assessments': {}
             }
+            # Extract student info
             student_info_match = re.search(r"(\d{7}) - (.*?)\n", text)
             if student_info_match:
                 parsed_data['student_info']['id'] = student_info_match.group(1)
                 parsed_data['student_info']['name'] = student_info_match.group(2).strip()
+            # Extract grade and year info
+            grade_match = re.search(r"Current Grade:\s*(\d+)", text)
+            if grade_match:
+                parsed_data['student_info']['grade'] = grade_match.group(1)
+            yog_match = re.search(r"YOG\s*(\d{4})", text)
             if yog_match:
                 parsed_data['student_info']['year_of_graduation'] = yog_match.group(1)
+            # Extract GPA information
+            gpa_matches = re.findall(r"(?:Un-weighted|Weighted)\s*GPA\s*([\d.]+)", text)
             if len(gpa_matches) >= 1:
                 parsed_data['student_info']['unweighted_gpa'] = float(gpa_matches[0])
             if len(gpa_matches) >= 2:
                 parsed_data['student_info']['weighted_gpa'] = float(gpa_matches[1])
+            # Extract community service info
+            service_hours_match = re.search(r"Comm\s*Serv\s*Hours\s*(\d+)", text, re.IGNORECASE)
             if service_hours_match:
                 parsed_data['student_info']['community_service_hours'] = int(service_hours_match.group(1))
+            service_date_match = re.search(r"Comm\s*Serv\s*Date\s*(\d{2}/\d{2}/\d{4})", text, re.IGNORECASE)
             if service_date_match:
                 parsed_data['student_info']['community_service_date'] = service_date_match.group(1)
+            # Extract credits info
+            credits_match = re.search(r"Total\s*Credits\s*Earned\s*([\d.]+)", text, re.IGNORECASE)
             if credits_match:
                 parsed_data['student_info']['total_credits'] = float(credits_match.group(1))
+            # Extract virtual grade
+            virtual_grade_match = re.search(r"Virtual\s*Grade\s*([A-Z])", text, re.IGNORECASE)
             if virtual_grade_match:
                 parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)
+            # Extract requirements - specific to this format
+            req_section = re.search(r"Code\s*Description\s*Required\s*Waived\s*Completed\s*Status(.*?)(?:\n\s*\n|$)", text, re.DOTALL)
+            if req_section:
+                req_lines = [line.strip() for line in req_section.group(1).split('\n') if line.strip()]
+                for line in req_lines:
+                    if '|' in line:  # Table format
+                        parts = [part.strip() for part in line.split('|')]
+                        if len(parts) >= 6:
+                            code = parts[0]
+                            description = parts[1]
+                            required = float(parts[2]) if parts[2] and parts[2].replace('.','').isdigit() else 0.0
+                            waived = float(parts[3]) if parts[3] and parts[3].replace('.','').isdigit() else 0.0
+                            completed = float(parts[4]) if parts[4] and parts[4].replace('.','').isdigit() else 0.0
+                            status = parts[5]
+                            # Extract percentage if available
+                            percent = 0.0
+                            percent_match = re.search(r"(\d+)%", status)
+                            if percent_match:
+                                percent = float(percent_match.group(1))
+                            parsed_data['requirements'][code] = {
+                                "description": description,
+                                "required": required,
+                                "waived": waived,
+                                "completed": completed,
+                                "percent_complete": percent,
+                                "status": status
+                            }
+            # Extract assessments
+            assess_section = re.search(r"Z-Assessment.*?\n(.*?)(?:\n\s*\n|$)", text, re.DOTALL)
+            if assess_section:
+                assess_lines = [line.strip() for line in assess_section.group(1).split('\n') if line.strip()]
+                for line in assess_lines:
+                    if '|' in line:
+                        parts = [part.strip() for part in line.split('|')]
+                        if len(parts) >= 5 and parts[0].startswith('Z-'):
+                            name = parts[0].replace('Z-', '').strip()
+                            status = parts[4]
+                            parsed_data['assessments'][name] = status
+            # Extract course history - specific to this format
+            course_section = re.search(r"Requirement\s*School Year\s*GradeLv1\s*CrsNum\s*Description\s*Term\s*DstNumber\s*FG\s*Incl\s*Credits(.*?)(?:\n\s*\n|$)", text, re.DOTALL)
+            if course_section:
+                course_lines = [line.strip() for line in course_section.group(1).split('\n') if line.strip()]
                 for line in course_lines:
+                    if '|' in line:
+                        parts = [part.strip() for part in line.split('|')]
+                        if len(parts) >= 9:
+                            course = {
+                                'requirement': parts[0],
+                                'school_year': parts[1],
+                                'grade_level': parts[2],
+                                'course_code': parts[3],
+                                'description': parts[4],
+                                'term': parts[5],
+                                'district_number': parts[6],
+                                'fg': parts[7],
+                                'included': parts[8],
+                                'credits': parts[9] if len(parts) > 9 else "0"
+                            }
+                            # Handle inProgress credits
+                            if "inProgress" in course['credits'].lower():
+                                course['credits'] = "0"
+                            parsed_data['course_history'].append(course)
             return parsed_data