Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -188,7 +188,7 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
|
|
| 188 |
for page in doc:
|
| 189 |
text += page.get_text("text") + '\n'
|
| 190 |
if not text.strip():
|
| 191 |
-
|
| 192 |
text = extract_text_from_pdf_with_ocr(file_path)
|
| 193 |
|
| 194 |
elif file_ext in ['.png', '.jpg', '.jpeg']:
|
|
@@ -203,30 +203,31 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
|
|
| 203 |
|
| 204 |
except Exception as e:
|
| 205 |
logging.error(f"Text extraction error: {str(e)}")
|
| 206 |
-
raise gr.Error(f"Failed to extract text: {str(e)}")
|
| 207 |
|
| 208 |
def extract_text_from_pdf_with_ocr(file_path: str) -> str:
|
| 209 |
-
text = ""
|
| 210 |
try:
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
| 227 |
except Exception as e:
|
| 228 |
-
raise ValueError(f"
|
| 229 |
-
return text
|
| 230 |
|
| 231 |
def extract_text_with_ocr(file_path: str) -> str:
|
| 232 |
try:
|
|
@@ -406,47 +407,56 @@ class TranscriptParser:
|
|
| 406 |
return None
|
| 407 |
|
| 408 |
def _parse_simplified_transcript(self, text: str) -> Dict:
|
| 409 |
-
"""Fallback simplified transcript parser
|
| 410 |
-
|
| 411 |
-
'
|
| 412 |
-
'
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
# Extract student information
|
| 416 |
-
name_match = re.search(r'(?:Name|Student)[:\s]+([A-Za-z,\s]+)', text, re.IGNORECASE)
|
| 417 |
-
if name_match:
|
| 418 |
-
parsed_data['student_info']['name'] = name_match.group(1).strip()
|
| 419 |
-
|
| 420 |
-
id_match = re.search(r'(?:ID|Student\s*ID)[:\s]+([A-Za-z0-9-]+)', text, re.IGNORECASE)
|
| 421 |
-
if id_match:
|
| 422 |
-
parsed_data['student_info']['id'] = id_match.group(1).strip()
|
| 423 |
-
|
| 424 |
-
gpa_match = re.search(r'(?:GPA|Grade\s*Point\s*Average)[:\s]+([0-9.]+)', text, re.IGNORECASE)
|
| 425 |
-
if gpa_match:
|
| 426 |
-
parsed_data['student_info']['gpa'] = float(gpa_match.group(1))
|
| 427 |
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
'
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
|
| 439 |
-
|
| 440 |
|
| 441 |
def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
|
| 442 |
"""Process transcript file and return simple confirmation"""
|
| 443 |
try:
|
| 444 |
if not file_obj:
|
| 445 |
-
raise
|
| 446 |
|
| 447 |
validate_file(file_obj)
|
| 448 |
file_ext = os.path.splitext(file_obj.name)[1].lower()
|
| 449 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
if progress:
|
| 451 |
progress(0.2, desc="Extracting text from file...")
|
| 452 |
|
|
@@ -476,8 +486,7 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Di
|
|
| 476 |
except Exception as e:
|
| 477 |
error_msg = f"Error processing transcript: {str(e)}"
|
| 478 |
logging.error(error_msg)
|
| 479 |
-
|
| 480 |
-
return error_msg, None
|
| 481 |
|
| 482 |
# ========== LEARNING STYLE QUIZ ==========
|
| 483 |
class LearningStyleQuiz:
|
|
@@ -866,11 +875,14 @@ def create_interface():
|
|
| 866 |
.completed-tab { background: #4CAF50 !important; color: white !important; }
|
| 867 |
.incomplete-tab { background: #E0E0E0 !important; }
|
| 868 |
.nav-message { padding: 10px; margin: 10px 0; border-radius: 4px; background-color: #ffebee; color: #c62828; }
|
| 869 |
-
.file-upload { border: 2px dashed #4CAF50 !important; padding: 20px !important; border-radius: 8px !important; }
|
|
|
|
| 870 |
.progress-bar { height: 5px; background: linear-gradient(to right, #4CAF50, #8BC34A); margin-bottom: 15px; border-radius: 3px; }
|
| 871 |
.quiz-question { margin-bottom: 15px; padding: 15px; background: #f5f5f5; border-radius: 5px; }
|
| 872 |
.quiz-results { margin-top: 20px; padding: 20px; background: #e8f5e9; border-radius: 8px; }
|
| 873 |
.error-message { color: #d32f2f; background-color: #ffebee; padding: 10px; border-radius: 4px; margin: 10px 0; }
|
|
|
|
|
|
|
| 874 |
|
| 875 |
.dark .tab-content { background-color: #2d2d2d !important; border-color: #444 !important; }
|
| 876 |
.dark .quiz-question { background-color: #3d3d3d !important; }
|
|
@@ -927,10 +939,21 @@ def create_interface():
|
|
| 927 |
transcript_output = gr.Textbox(
|
| 928 |
label="Analysis Results",
|
| 929 |
lines=5,
|
| 930 |
-
interactive=False
|
|
|
|
| 931 |
)
|
| 932 |
transcript_data = gr.State()
|
| 933 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 934 |
upload_btn.click(
|
| 935 |
fn=parse_transcript,
|
| 936 |
inputs=[file_input, tab_completed],
|
|
@@ -1143,20 +1166,22 @@ def create_interface():
|
|
| 1143 |
if tab_index <= current_tab:
|
| 1144 |
return gr.Tabs(selected=tab_index), gr.update(visible=False)
|
| 1145 |
|
| 1146 |
-
|
| 1147 |
-
|
| 1148 |
-
|
| 1149 |
-
|
| 1150 |
-
|
| 1151 |
-
|
| 1152 |
-
|
| 1153 |
-
|
| 1154 |
-
|
| 1155 |
-
|
| 1156 |
-
|
| 1157 |
-
|
|
|
|
|
|
|
|
|
|
| 1158 |
)
|
| 1159 |
-
)
|
| 1160 |
|
| 1161 |
return gr.Tabs(selected=tab_index), gr.update(visible=False)
|
| 1162 |
|
|
|
|
| 188 |
for page in doc:
|
| 189 |
text += page.get_text("text") + '\n'
|
| 190 |
if not text.strip():
|
| 191 |
+
logging.warning("PyMuPDF returned empty text, trying OCR fallback...")
|
| 192 |
text = extract_text_from_pdf_with_ocr(file_path)
|
| 193 |
|
| 194 |
elif file_ext in ['.png', '.jpg', '.jpeg']:
|
|
|
|
| 203 |
|
| 204 |
except Exception as e:
|
| 205 |
logging.error(f"Text extraction error: {str(e)}")
|
| 206 |
+
raise gr.Error(f"Failed to extract text: {str(e)}\n\nPossible solutions:\n1. Try a different file format\n2. Ensure text is clear and not handwritten\n3. Check file size (<5MB)")
|
| 207 |
|
| 208 |
def extract_text_from_pdf_with_ocr(file_path: str) -> str:
    """Extract text from a PDF by rendering its pages and running Tesseract OCR.

    Every page is rendered at 300 DPI, converted to grayscale and thresholded
    to pure black/white before being passed to ``pytesseract``. A page only
    contributes to the result when its stripped OCR output is longer than
    20 characters; each contributing page is prefixed with ``PAGE <n>:``.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated per-page text, or the literal string
        ``"No readable text found"`` when no page produced usable output.

    Raises:
        ValueError: If ``pdf2image`` cannot be imported or the PDF cannot be
            converted to images. The original exception is attached as
            ``__cause__`` so the real traceback is not lost.
    """
    try:
        # Imported lazily so that a missing optional dependency is reported
        # through the same ValueError as any other OCR failure.
        import pdf2image

        images = pdf2image.convert_from_path(file_path, dpi=300)
        custom_config = r'--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;()-/ '

        page_sections = []
        for page_number, img in enumerate(images, start=1):
            # Pre-process image: grayscale, then hard threshold at 140 so
            # every pixel becomes either black or white.
            img = img.convert('L')
            img = img.point(lambda x: 0 if x < 140 else 255)

            # A failure on one page is logged and that page is skipped; the
            # remaining pages are still processed (there is no retry).
            try:
                page_text = pytesseract.image_to_string(img, config=custom_config)
            except Exception as page_err:
                logging.warning(f"OCR failed on page {page_number}: {str(page_err)}")
                continue

            if len(page_text.strip()) > 20:  # Minimum viable text
                page_sections.append(f"PAGE {page_number}:\n{page_text}\n\n")

        text = "".join(page_sections)
        return text if text else "No readable text found"
    except Exception as e:
        # Chain the cause so the underlying error stays visible in tracebacks.
        raise ValueError(f"OCR processing failed: {str(e)}") from e
|
|
|
|
| 231 |
|
| 232 |
def extract_text_with_ocr(file_path: str) -> str:
|
| 233 |
try:
|
|
|
|
| 407 |
return None
|
| 408 |
|
| 409 |
def _parse_simplified_transcript(self, text: str) -> Dict:
    """Fallback simplified transcript parser with multiple pattern attempts.

    Three strategies are tried in order and the first one that yields at
    least one course wins:

    1. ``table``   - locate a "Course/Subject Code ... Grade ... Credits"
       header and parse course rows from the text that follows it, up to
       the next blank line or the end of the text.
    2. ``line``    - find ``<code> <description> <grade> <credits>`` rows
       anywhere in the text.
    3. ``minimal`` - find ``<label> <grade> <credits>`` rows. This pattern
       has no separate course-code group, so the label is stored as
       ``course_code`` and ``description`` is left empty.

    Args:
        text: Raw transcript text.

    Returns:
        ``{'course_history': [...]}`` where every entry has the keys
        ``course_code``, ``description``, ``grade`` and ``credits`` (float).

    Raises:
        ValueError: If none of the strategies finds any course rows.
    """
    course_row = r'([A-Z]{2,4}\s?\d{3}[A-Z]?)\s+(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)'
    patterns = [
        (r'(?:Course|Subject)\s*Code.*?Grade.*?Credits(.*?)(?:\n\s*\n|\Z)', 'table'),
        (course_row, 'line'),
        (r'(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)', 'minimal'),
    ]

    for pattern, pattern_type in patterns:
        try:
            if pattern_type == 'table':
                # Parse tabular data: only the section after the header.
                table = re.search(pattern, text, re.DOTALL)
                if table is None:
                    continue
                courses = re.findall(course_row, table.group(1))
            else:
                courses = re.findall(pattern, text)
        except TypeError:
            # Non-string input: keep the best-effort contract and fall
            # through to the ValueError below instead of crashing here.
            continue

        if not courses:
            continue

        if pattern_type == 'minimal':
            # Three groups only: (label, grade, credits). Mapping them with
            # the four-group layout would put the grade into 'description'
            # and the credits into 'grade'.
            history = [
                {
                    'course_code': label.strip(),
                    'description': '',
                    'grade': grade.strip(),
                    'credits': float(credit_hours),
                }
                for label, grade, credit_hours in courses
            ]
        else:
            history = [
                {
                    'course_code': code.strip(),
                    'description': description.strip(),
                    'grade': grade.strip(),
                    'credits': float(credit_hours),
                }
                for code, description, grade, credit_hours in courses
            ]
        return {'course_history': history}

    raise ValueError("Could not identify course information in transcript")
|
| 442 |
|
| 443 |
def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
|
| 444 |
"""Process transcript file and return simple confirmation"""
|
| 445 |
try:
|
| 446 |
if not file_obj:
|
| 447 |
+
raise gr.Error("Please upload a transcript file first (PDF or image)")
|
| 448 |
|
| 449 |
validate_file(file_obj)
|
| 450 |
file_ext = os.path.splitext(file_obj.name)[1].lower()
|
| 451 |
|
| 452 |
+
# Additional PDF validation
|
| 453 |
+
if file_ext == '.pdf':
|
| 454 |
+
try:
|
| 455 |
+
with open(file_obj.name, 'rb') as f:
|
| 456 |
+
PdfReader(f) # Test if PDF is readable
|
| 457 |
+
except Exception as e:
|
| 458 |
+
raise gr.Error(f"Invalid PDF file: {str(e)}. Please upload a non-corrupted PDF.")
|
| 459 |
+
|
| 460 |
if progress:
|
| 461 |
progress(0.2, desc="Extracting text from file...")
|
| 462 |
|
|
|
|
| 486 |
except Exception as e:
|
| 487 |
error_msg = f"Error processing transcript: {str(e)}"
|
| 488 |
logging.error(error_msg)
|
| 489 |
+
raise gr.Error(f"{error_msg}\n\nPossible solutions:\n1. Try a different file format\n2. Ensure text is clear and not handwritten\n3. Check file size (<5MB)")
|
|
|
|
| 490 |
|
| 491 |
# ========== LEARNING STYLE QUIZ ==========
|
| 492 |
class LearningStyleQuiz:
|
|
|
|
| 875 |
.completed-tab { background: #4CAF50 !important; color: white !important; }
|
| 876 |
.incomplete-tab { background: #E0E0E0 !important; }
|
| 877 |
.nav-message { padding: 10px; margin: 10px 0; border-radius: 4px; background-color: #ffebee; color: #c62828; }
|
| 878 |
+
.file-upload { border: 2px dashed #4CAF50 !important; padding: 20px !important; border-radius: 8px !important; text-align: center; }
|
| 879 |
+
.file-upload:hover { background: #f5f5f5; }
|
| 880 |
.progress-bar { height: 5px; background: linear-gradient(to right, #4CAF50, #8BC34A); margin-bottom: 15px; border-radius: 3px; }
|
| 881 |
.quiz-question { margin-bottom: 15px; padding: 15px; background: #f5f5f5; border-radius: 5px; }
|
| 882 |
.quiz-results { margin-top: 20px; padding: 20px; background: #e8f5e9; border-radius: 8px; }
|
| 883 |
.error-message { color: #d32f2f; background-color: #ffebee; padding: 10px; border-radius: 4px; margin: 10px 0; }
|
| 884 |
+
.transcript-results { border-left: 4px solid #4CAF50 !important; padding: 15px !important; background: #f8f8f8 !important; }
|
| 885 |
+
.error-box { border: 1px solid #ff4444 !important; background: #fff8f8 !important; }
|
| 886 |
|
| 887 |
.dark .tab-content { background-color: #2d2d2d !important; border-color: #444 !important; }
|
| 888 |
.dark .quiz-question { background-color: #3d3d3d !important; }
|
|
|
|
| 939 |
transcript_output = gr.Textbox(
|
| 940 |
label="Analysis Results",
|
| 941 |
lines=5,
|
| 942 |
+
interactive=False,
|
| 943 |
+
elem_classes="transcript-results"
|
| 944 |
)
|
| 945 |
transcript_data = gr.State()
|
| 946 |
|
| 947 |
+
file_input.change(
|
| 948 |
+
fn=lambda f: (
|
| 949 |
+
gr.update(visible=False),
|
| 950 |
+
gr.update(value="File ready for analysis!", visible=True) if f
|
| 951 |
+
else gr.update(value="Please upload a file", visible=False)
|
| 952 |
+
),
|
| 953 |
+
inputs=file_input,
|
| 954 |
+
outputs=[file_error, transcript_output]
|
| 955 |
+
)
|
| 956 |
+
|
| 957 |
upload_btn.click(
|
| 958 |
fn=parse_transcript,
|
| 959 |
inputs=[file_input, tab_completed],
|
|
|
|
| 1166 |
if tab_index <= current_tab:
|
| 1167 |
return gr.Tabs(selected=tab_index), gr.update(visible=False)
|
| 1168 |
|
| 1169 |
+
# Check all previous tabs are completed
|
| 1170 |
+
for i in range(tab_index):
|
| 1171 |
+
if not tab_completed_status.get(i, False):
|
| 1172 |
+
messages = [
|
| 1173 |
+
"Please complete the transcript analysis first",
|
| 1174 |
+
"Please complete the learning style quiz first",
|
| 1175 |
+
"Please fill out your personal information first",
|
| 1176 |
+
"Please save your profile first"
|
| 1177 |
+
]
|
| 1178 |
+
return (
|
| 1179 |
+
gr.Tabs(selected=i),
|
| 1180 |
+
gr.update(
|
| 1181 |
+
value=f"<div class='error-message'>⛔ {messages[i]}</div>",
|
| 1182 |
+
visible=True
|
| 1183 |
+
)
|
| 1184 |
)
|
|
|
|
| 1185 |
|
| 1186 |
return gr.Tabs(selected=tab_index), gr.update(visible=False)
|
| 1187 |
|