Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,31 +6,39 @@ import os
|
|
| 6 |
import re
|
| 7 |
from datetime import datetime
|
| 8 |
from io import BytesIO
|
| 9 |
-
import
|
|
|
|
| 10 |
|
| 11 |
# Page config MUST be first
|
| 12 |
st.set_page_config(
|
| 13 |
page_title="Medical School Personal Statement Analyzer",
|
| 14 |
page_icon="π₯",
|
| 15 |
-
layout="wide"
|
|
|
|
| 16 |
)
|
| 17 |
|
| 18 |
# Import ML libraries
|
| 19 |
-
from sentence_transformers import SentenceTransformer
|
|
|
|
| 20 |
from sklearn.preprocessing import StandardScaler
|
| 21 |
-
from sklearn.ensemble import RandomForestClassifier
|
| 22 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
| 23 |
import xgboost as xgb
|
|
|
|
| 24 |
|
| 25 |
# Import PDF generation libraries
|
| 26 |
-
|
| 27 |
-
from reportlab.lib
|
| 28 |
-
from reportlab.
|
| 29 |
-
from reportlab.
|
| 30 |
-
from reportlab.lib.
|
| 31 |
-
from reportlab.lib.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
# Categories
|
| 34 |
CATEGORIES = {
|
| 35 |
'Spark': {
|
| 36 |
'description': 'Opening that spurs interest in medicine (typically in opening paragraph)',
|
|
@@ -47,6 +55,10 @@ CATEGORIES = {
|
|
| 47 |
2: 'somewhat connected but unclear',
|
| 48 |
3: 'connected and clear',
|
| 49 |
4: 'engaging and logically flows into becoming a doctor'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
}
|
| 51 |
},
|
| 52 |
'Healthcare Experience': {
|
|
@@ -64,6 +76,10 @@ CATEGORIES = {
|
|
| 64 |
2: 'bland/boring but not problematic',
|
| 65 |
3: 'interesting and relevant',
|
| 66 |
4: 'vivid, active, thoughtful, relevant, memorable, positive'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
}
|
| 68 |
},
|
| 69 |
'Showing Doctor Qualities': {
|
|
@@ -81,6 +97,10 @@ CATEGORIES = {
|
|
| 81 |
2: 'bland/boring but not problematic',
|
| 82 |
3: 'shows some understanding',
|
| 83 |
4: 'realistic, self-aware, mature, humble, specific understanding'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
}
|
| 85 |
},
|
| 86 |
'Spin': {
|
|
@@ -97,123 +117,404 @@ CATEGORIES = {
|
|
| 97 |
2: 'some connection but generic',
|
| 98 |
3: 'clear connection',
|
| 99 |
4: 'direct, logical, and specific argument'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
}
|
| 101 |
}
|
| 102 |
}
|
| 103 |
|
| 104 |
@st.cache_resource
|
| 105 |
-
def
|
| 106 |
-
"""Load the
|
| 107 |
-
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
-
def segment_text(text):
|
| 111 |
-
"""Segment text
|
| 112 |
-
# Try to split by double newlines first
|
| 113 |
paragraphs = re.split(r'\n\s*\n', text)
|
| 114 |
paragraphs = [p.strip() for p in paragraphs if p.strip() and len(p.strip()) > 50]
|
| 115 |
|
| 116 |
-
# If only one paragraph, try to split by sentences
|
| 117 |
if len(paragraphs) <= 1:
|
| 118 |
sentences = re.split(r'(?<=[.!?])\s+', text)
|
| 119 |
sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
|
| 120 |
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
segments = []
|
| 123 |
-
current_segment = []
|
| 124 |
-
|
| 125 |
|
| 126 |
-
for
|
| 127 |
-
|
| 128 |
-
current_length += len(sent.split())
|
| 129 |
|
| 130 |
-
if
|
| 131 |
segments.append(' '.join(current_segment))
|
| 132 |
-
current_segment = []
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
if current_segment:
|
| 136 |
segments.append(' '.join(current_segment))
|
| 137 |
|
| 138 |
-
return segments
|
| 139 |
|
| 140 |
return paragraphs
|
| 141 |
|
| 142 |
-
def
|
| 143 |
-
"""
|
|
|
|
| 144 |
text_lower = text.lower()
|
| 145 |
words = text.split()
|
| 146 |
|
| 147 |
-
#
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
|
|
|
| 150 |
for cat_name, cat_info in CATEGORIES.items():
|
| 151 |
-
|
| 152 |
-
keyword_matches = sum(1 for kw in
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
-
# Pattern matching
|
| 156 |
pattern_matches = 0
|
| 157 |
-
for pattern in cat_info
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
else:
|
| 189 |
-
|
| 190 |
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
negative_indicators = ['vague', 'generic', 'unclear', 'disconnected', 'simplistic']
|
| 194 |
|
| 195 |
-
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
elif negative_count > positive_count and score > 1:
|
| 201 |
-
score = max(score - 1, 1)
|
| 202 |
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
'
|
| 207 |
-
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
-
def
|
| 211 |
"""Analyze complete personal statement"""
|
| 212 |
-
segments = segment_text(text)
|
| 213 |
|
| 214 |
segment_results = []
|
| 215 |
for i, segment in enumerate(segments):
|
| 216 |
-
result =
|
| 217 |
result['segment_num'] = i + 1
|
| 218 |
segment_results.append(result)
|
| 219 |
|
|
@@ -244,8 +545,11 @@ def analyze_full_statement(text, embedder):
|
|
| 244 |
|
| 245 |
return segment_results, category_results
|
| 246 |
|
| 247 |
-
def create_pdf_report(segment_results, category_results
|
| 248 |
-
"""Create
|
|
|
|
|
|
|
|
|
|
| 249 |
buffer = BytesIO()
|
| 250 |
doc = SimpleDocTemplate(buffer, pagesize=letter, rightMargin=72, leftMargin=72,
|
| 251 |
topMargin=72, bottomMargin=18)
|
|
@@ -307,92 +611,6 @@ def create_pdf_report(segment_results, category_results, statement_text):
|
|
| 307 |
]))
|
| 308 |
|
| 309 |
elements.append(summary_table)
|
| 310 |
-
elements.append(Spacer(1, 30))
|
| 311 |
-
|
| 312 |
-
# Category Analysis
|
| 313 |
-
elements.append(Paragraph("CATEGORY ANALYSIS", heading_style))
|
| 314 |
-
|
| 315 |
-
category_data = [['Category', 'Status', 'Score', 'Confidence', 'Segments']]
|
| 316 |
-
for cat in CATEGORIES.keys():
|
| 317 |
-
if category_results[cat]['detected']:
|
| 318 |
-
status = "β Detected"
|
| 319 |
-
score = f"{category_results[cat]['score']}/4"
|
| 320 |
-
confidence = f"{category_results[cat]['confidence']:.1%}"
|
| 321 |
-
segments = str(category_results[cat]['num_segments'])
|
| 322 |
-
else:
|
| 323 |
-
status = "β Not Found"
|
| 324 |
-
score = "N/A"
|
| 325 |
-
confidence = "N/A"
|
| 326 |
-
segments = "0"
|
| 327 |
-
category_data.append([cat, status, score, confidence, segments])
|
| 328 |
-
|
| 329 |
-
category_table = Table(category_data, colWidths=[2*inch, 1.2*inch, 0.8*inch, 1*inch, 1*inch])
|
| 330 |
-
category_table.setStyle(TableStyle([
|
| 331 |
-
('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#1f4788')),
|
| 332 |
-
('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
|
| 333 |
-
('ALIGN', (0, 0), (-1, -1), 'CENTER'),
|
| 334 |
-
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
| 335 |
-
('FONTSIZE', (0, 0), (-1, 0), 11),
|
| 336 |
-
('BOTTOMPADDING', (0, 0), (-1, 0), 12),
|
| 337 |
-
('BACKGROUND', (0, 1), (-1, -1), colors.beige),
|
| 338 |
-
('GRID', (0, 0), (-1, -1), 1, colors.black)
|
| 339 |
-
]))
|
| 340 |
-
|
| 341 |
-
elements.append(category_table)
|
| 342 |
-
elements.append(PageBreak())
|
| 343 |
-
|
| 344 |
-
# Detailed Recommendations
|
| 345 |
-
elements.append(Paragraph("RECOMMENDATIONS", heading_style))
|
| 346 |
-
|
| 347 |
-
missing_cats = [cat for cat, res in category_results.items() if not res['detected']]
|
| 348 |
-
low_score_cats = [cat for cat, res in category_results.items()
|
| 349 |
-
if res['detected'] and res['score'] and res['score'] < 3]
|
| 350 |
-
|
| 351 |
-
if missing_cats:
|
| 352 |
-
elements.append(Paragraph("<b>Missing Categories:</b>", styles['Heading3']))
|
| 353 |
-
for cat in missing_cats:
|
| 354 |
-
elements.append(Paragraph(f"β’ Add content for {cat}: {CATEGORIES[cat]['description']}", styles['Normal']))
|
| 355 |
-
elements.append(Paragraph(f" Include keywords: {', '.join(CATEGORIES[cat]['keywords'][:5])}...", styles['Normal']))
|
| 356 |
-
elements.append(Spacer(1, 12))
|
| 357 |
-
|
| 358 |
-
if low_score_cats:
|
| 359 |
-
elements.append(Paragraph("<b>Areas for Improvement:</b>", styles['Heading3']))
|
| 360 |
-
for cat in low_score_cats:
|
| 361 |
-
score = category_results[cat]['score']
|
| 362 |
-
elements.append(Paragraph(f"β’ Improve {cat} (current score: {score}/4)", styles['Normal']))
|
| 363 |
-
elements.append(Paragraph(f" Target: {CATEGORIES[cat]['rubric'][4]}", styles['Normal']))
|
| 364 |
-
elements.append(Spacer(1, 12))
|
| 365 |
-
|
| 366 |
-
if not missing_cats and not low_score_cats:
|
| 367 |
-
elements.append(Paragraph("Excellent work! All categories are present with good scores.", styles['Normal']))
|
| 368 |
-
|
| 369 |
-
# Segment Analysis Summary
|
| 370 |
-
elements.append(PageBreak())
|
| 371 |
-
elements.append(Paragraph("SEGMENT ANALYSIS", heading_style))
|
| 372 |
-
|
| 373 |
-
for segment in segment_results[:10]: # Limit to first 10 segments
|
| 374 |
-
elements.append(Paragraph(f"<b>Segment {segment['segment_num']}</b>", styles['Heading3']))
|
| 375 |
-
|
| 376 |
-
detail_data = [
|
| 377 |
-
['Category', segment['category']],
|
| 378 |
-
['Score', f"{segment['score']}/4"],
|
| 379 |
-
['Confidence', f"{segment['confidence']:.1%}"]
|
| 380 |
-
]
|
| 381 |
-
|
| 382 |
-
detail_table = Table(detail_data, colWidths=[1.5*inch, 4*inch])
|
| 383 |
-
detail_table.setStyle(TableStyle([
|
| 384 |
-
('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
|
| 385 |
-
('ALIGN', (0, 0), (-1, -1), 'LEFT'),
|
| 386 |
-
('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
|
| 387 |
-
('GRID', (0, 0), (-1, -1), 1, colors.black)
|
| 388 |
-
]))
|
| 389 |
-
|
| 390 |
-
elements.append(detail_table)
|
| 391 |
-
elements.append(Spacer(1, 6))
|
| 392 |
-
|
| 393 |
-
text_preview = segment['text'][:200] + "..." if len(segment['text']) > 200 else segment['text']
|
| 394 |
-
elements.append(Paragraph(f"<i>{text_preview}</i>", styles['Normal']))
|
| 395 |
-
elements.append(Spacer(1, 12))
|
| 396 |
|
| 397 |
# Build PDF
|
| 398 |
doc.build(elements)
|
|
@@ -402,99 +620,187 @@ def create_pdf_report(segment_results, category_results, statement_text):
|
|
| 402 |
# Main Application
|
| 403 |
def main():
|
| 404 |
st.title("π₯ Medical School Personal Statement Analyzer")
|
| 405 |
-
st.markdown("
|
| 406 |
st.markdown("---")
|
| 407 |
|
| 408 |
-
#
|
| 409 |
-
|
| 410 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
st.markdown("""
|
| 412 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
|
| 414 |
-
|
| 415 |
-
|
|
|
|
|
|
|
| 416 |
|
| 417 |
-
|
| 418 |
-
Clinical and medical experiences
|
| 419 |
|
| 420 |
-
|
| 421 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
|
| 423 |
-
|
| 424 |
-
|
|
|
|
|
|
|
| 425 |
|
| 426 |
-
|
| 427 |
-
- 4 = Excellent
|
| 428 |
-
- 3 = Good
|
| 429 |
-
- 2 = Below Average
|
| 430 |
-
- 1 = Poor
|
| 431 |
-
""")
|
| 432 |
|
| 433 |
-
st.markdown("
|
| 434 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 435 |
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
# Load model
|
| 442 |
-
embedder = load_pretrained_model()
|
| 443 |
-
|
| 444 |
-
# Main content area
|
| 445 |
-
st.header("π Upload Your Personal Statement")
|
| 446 |
-
|
| 447 |
-
# Input method selection
|
| 448 |
-
input_method = st.radio(
|
| 449 |
-
"Choose input method:",
|
| 450 |
-
["Upload Text File (.txt)", "Paste Text Directly"],
|
| 451 |
-
horizontal=True
|
| 452 |
-
)
|
| 453 |
-
|
| 454 |
-
statement_text = None
|
| 455 |
-
|
| 456 |
-
if input_method == "Upload Text File (.txt)":
|
| 457 |
-
uploaded_file = st.file_uploader(
|
| 458 |
-
"Choose a text file",
|
| 459 |
-
type=['txt'],
|
| 460 |
-
help="Upload your personal statement as a .txt file"
|
| 461 |
)
|
| 462 |
|
| 463 |
-
|
| 464 |
-
statement_text = str(uploaded_file.read(), 'utf-8')
|
| 465 |
-
st.success(f"β
File uploaded successfully ({len(statement_text)} characters)")
|
| 466 |
-
|
| 467 |
-
# Show preview
|
| 468 |
-
with st.expander("Preview Statement"):
|
| 469 |
-
st.text(statement_text[:500] + "..." if len(statement_text) > 500 else statement_text)
|
| 470 |
-
|
| 471 |
-
else: # Paste Text Directly
|
| 472 |
-
statement_text = st.text_area(
|
| 473 |
-
"Paste your personal statement here:",
|
| 474 |
-
height=400,
|
| 475 |
-
placeholder="Enter your complete personal statement...",
|
| 476 |
-
help="Paste your entire personal statement for analysis"
|
| 477 |
-
)
|
| 478 |
|
| 479 |
-
if
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
with st.spinner("Analyzing your personal statement..."):
|
| 487 |
-
# Perform analysis
|
| 488 |
-
segment_results, category_results = analyze_full_statement(statement_text, embedder)
|
| 489 |
-
|
| 490 |
-
st.success("β
Analysis Complete!")
|
| 491 |
-
st.balloons()
|
| 492 |
|
| 493 |
-
|
| 494 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
|
| 496 |
-
|
| 497 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
|
| 499 |
# Metrics
|
| 500 |
col1, col2, col3, col4 = st.columns(4)
|
|
@@ -502,11 +808,7 @@ def main():
|
|
| 502 |
detected_cats = [cat for cat, res in category_results.items() if res['detected']]
|
| 503 |
|
| 504 |
with col1:
|
| 505 |
-
st.metric(
|
| 506 |
-
"Categories Found",
|
| 507 |
-
f"{len(detected_cats)}/4",
|
| 508 |
-
delta=f"{len(detected_cats)-4}" if len(detected_cats) < 4 else "Complete"
|
| 509 |
-
)
|
| 510 |
|
| 511 |
with col2:
|
| 512 |
if detected_cats:
|
|
@@ -521,126 +823,159 @@ def main():
|
|
| 521 |
with col4:
|
| 522 |
if detected_cats:
|
| 523 |
avg_score = np.mean([category_results[cat]['score'] for cat in detected_cats])
|
| 524 |
-
if avg_score >= 3.5
|
| 525 |
-
|
| 526 |
-
color = "π’"
|
| 527 |
-
elif avg_score >= 2.5:
|
| 528 |
-
quality = "Good"
|
| 529 |
-
color = "π‘"
|
| 530 |
-
else:
|
| 531 |
-
quality = "Needs Work"
|
| 532 |
-
color = "π΄"
|
| 533 |
-
st.metric("Overall Quality", f"{color} {quality}")
|
| 534 |
else:
|
| 535 |
st.metric("Overall Quality", "N/A")
|
| 536 |
|
| 537 |
-
# Category
|
| 538 |
-
st.
|
|
|
|
| 539 |
|
| 540 |
for cat in CATEGORIES.keys():
|
| 541 |
res = category_results[cat]
|
| 542 |
-
col1, col2, col3, col4 = st.columns([3, 1, 1, 1])
|
| 543 |
-
|
| 544 |
-
with col1:
|
| 545 |
-
if res['detected']:
|
| 546 |
-
st.write(f"β
**{cat}**")
|
| 547 |
-
else:
|
| 548 |
-
st.write(f"β **{cat}** *(Not detected)*")
|
| 549 |
-
|
| 550 |
-
with col2:
|
| 551 |
-
if res['detected']:
|
| 552 |
-
st.write(f"Score: {res['score']}/4")
|
| 553 |
-
else:
|
| 554 |
-
st.write("Score: -")
|
| 555 |
-
|
| 556 |
-
with col3:
|
| 557 |
-
if res['detected']:
|
| 558 |
-
st.write(f"Confidence: {res['confidence']:.1%}")
|
| 559 |
-
else:
|
| 560 |
-
st.write("Confidence: -")
|
| 561 |
-
|
| 562 |
-
with col4:
|
| 563 |
-
if res['detected']:
|
| 564 |
-
st.write(f"Segments: {res['num_segments']}")
|
| 565 |
-
else:
|
| 566 |
-
st.write("Segments: 0")
|
| 567 |
-
|
| 568 |
if res['detected']:
|
| 569 |
-
|
|
|
|
| 570 |
st.progress(res['score'] / 4)
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 574 |
|
| 575 |
for segment in segment_results:
|
|
|
|
|
|
|
|
|
|
| 576 |
with st.expander(f"Segment {segment['segment_num']}: {segment['category']} (Score: {segment['score']}/4)"):
|
| 577 |
col1, col2 = st.columns([1, 3])
|
| 578 |
|
| 579 |
with col1:
|
| 580 |
st.metric("Category", segment['category'])
|
| 581 |
-
st.metric("Score", f"{segment['score']}/4")
|
| 582 |
st.metric("Confidence", f"{segment['confidence']:.1%}")
|
| 583 |
|
| 584 |
with col2:
|
| 585 |
st.write("**Text:**")
|
| 586 |
-
st.write(segment['text'])
|
| 587 |
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
st.
|
|
|
|
| 594 |
|
| 595 |
missing_cats = [cat for cat, res in category_results.items() if not res['detected']]
|
| 596 |
low_score_cats = [cat for cat, res in category_results.items()
|
| 597 |
if res['detected'] and res['score'] and res['score'] < 3]
|
| 598 |
|
| 599 |
if missing_cats:
|
| 600 |
-
st.error("
|
| 601 |
for cat in missing_cats:
|
| 602 |
-
st.write(f"
|
| 603 |
-
st.write(f"
|
| 604 |
-
st.write(f"**Keywords to include:** {', '.join(CATEGORIES[cat]['keywords'][:8])}")
|
| 605 |
-
st.write(f"**Target Quality:** {CATEGORIES[cat]['rubric'][4]}")
|
| 606 |
-
st.write("---")
|
| 607 |
|
| 608 |
if low_score_cats:
|
| 609 |
-
st.warning("
|
| 610 |
for cat in low_score_cats:
|
| 611 |
-
|
| 612 |
-
st.write(f"
|
| 613 |
-
st.write(f"
|
| 614 |
-
st.write(f"**Current Level:** {CATEGORIES[cat]['rubric'][current_score]}")
|
| 615 |
-
st.write(f"**Target Level:** {CATEGORIES[cat]['rubric'][4]}")
|
| 616 |
-
st.write(f"**Improvement Tips:** Add more {', '.join(CATEGORIES[cat]['keywords'][:5])}")
|
| 617 |
-
st.write("---")
|
| 618 |
|
| 619 |
if not missing_cats and not low_score_cats:
|
| 620 |
-
st.success("
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 625 |
|
| 626 |
-
#
|
| 627 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 628 |
|
| 629 |
-
|
| 630 |
-
st.download_button(
|
| 631 |
-
label="π₯ Download PDF Report",
|
| 632 |
-
data=pdf_buffer,
|
| 633 |
-
file_name=f"personal_statement_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf",
|
| 634 |
-
mime="application/pdf",
|
| 635 |
-
use_container_width=True
|
| 636 |
-
)
|
| 637 |
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 644 |
|
| 645 |
# Run the application
|
| 646 |
if __name__ == "__main__":
|
|
|
|
| 6 |
import re
|
| 7 |
from datetime import datetime
|
| 8 |
from io import BytesIO
|
| 9 |
+
import warnings
|
| 10 |
+
warnings.filterwarnings('ignore')
|
| 11 |
|
| 12 |
# Page config MUST be first
|
| 13 |
st.set_page_config(
|
| 14 |
page_title="Medical School Personal Statement Analyzer",
|
| 15 |
page_icon="π₯",
|
| 16 |
+
layout="wide",
|
| 17 |
+
initial_sidebar_state="expanded"
|
| 18 |
)
|
| 19 |
|
| 20 |
# Import ML libraries
|
| 21 |
+
from sentence_transformers import SentenceTransformer, util
|
| 22 |
+
from sklearn.model_selection import train_test_split
|
| 23 |
from sklearn.preprocessing import StandardScaler
|
|
|
|
| 24 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 25 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 26 |
import xgboost as xgb
|
| 27 |
+
import torch
|
| 28 |
|
| 29 |
# Import PDF generation libraries
|
| 30 |
+
try:
|
| 31 |
+
from reportlab.lib import colors
|
| 32 |
+
from reportlab.lib.pagesizes import letter
|
| 33 |
+
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak
|
| 34 |
+
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
| 35 |
+
from reportlab.lib.units import inch
|
| 36 |
+
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
|
| 37 |
+
PDF_AVAILABLE = True
|
| 38 |
+
except ImportError:
|
| 39 |
+
PDF_AVAILABLE = False
|
| 40 |
|
| 41 |
+
# Categories with detailed rubric alignment
|
| 42 |
CATEGORIES = {
|
| 43 |
'Spark': {
|
| 44 |
'description': 'Opening that spurs interest in medicine (typically in opening paragraph)',
|
|
|
|
| 55 |
2: 'somewhat connected but unclear',
|
| 56 |
3: 'connected and clear',
|
| 57 |
4: 'engaging and logically flows into becoming a doctor'
|
| 58 |
+
},
|
| 59 |
+
'rubric_features': {
|
| 60 |
+
'positive': ['engaging', 'logical', 'clear connection', 'compelling', 'authentic'],
|
| 61 |
+
'negative': ['disconnected', 'confusing', 'random', 'unclear', 'generic']
|
| 62 |
}
|
| 63 |
},
|
| 64 |
'Healthcare Experience': {
|
|
|
|
| 76 |
2: 'bland/boring but not problematic',
|
| 77 |
3: 'interesting and relevant',
|
| 78 |
4: 'vivid, active, thoughtful, relevant, memorable, positive'
|
| 79 |
+
},
|
| 80 |
+
'rubric_features': {
|
| 81 |
+
'positive': ['vivid', 'active', 'thoughtful', 'memorable', 'optimistic', 'engaged'],
|
| 82 |
+
'negative': ['passive', 'uninteresting', 'irrelevant', 'problematic', 'pessimistic']
|
| 83 |
}
|
| 84 |
},
|
| 85 |
'Showing Doctor Qualities': {
|
|
|
|
| 97 |
2: 'bland/boring but not problematic',
|
| 98 |
3: 'shows some understanding',
|
| 99 |
4: 'realistic, self-aware, mature, humble, specific understanding'
|
| 100 |
+
},
|
| 101 |
+
'rubric_features': {
|
| 102 |
+
'positive': ['realistic', 'self-aware', 'mature', 'humble', 'specific', 'clear'],
|
| 103 |
+
'negative': ['arrogant', 'immature', 'overly confident', 'simplistic', 'inaccurate']
|
| 104 |
}
|
| 105 |
},
|
| 106 |
'Spin': {
|
|
|
|
| 117 |
2: 'some connection but generic',
|
| 118 |
3: 'clear connection',
|
| 119 |
4: 'direct, logical, and specific argument'
|
| 120 |
+
},
|
| 121 |
+
'rubric_features': {
|
| 122 |
+
'positive': ['direct', 'logical', 'specific', 'clear argument', 'compelling'],
|
| 123 |
+
'negative': ['brief', 'vague', 'simplistic', 'generic', 'weak']
|
| 124 |
}
|
| 125 |
}
|
| 126 |
}
|
| 127 |
|
| 128 |
@st.cache_resource
|
| 129 |
+
def load_sentence_transformer():
|
| 130 |
+
"""Load the e5-large-v2 sentence transformer model"""
|
| 131 |
+
try:
|
| 132 |
+
# Try to load the preferred model
|
| 133 |
+
model = SentenceTransformer('intfloat/e5-large-v2')
|
| 134 |
+
return model, 'intfloat/e5-large-v2'
|
| 135 |
+
except:
|
| 136 |
+
# Fallback to lighter model if e5-large-v2 fails
|
| 137 |
+
try:
|
| 138 |
+
model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 139 |
+
return model, 'all-MiniLM-L6-v2'
|
| 140 |
+
except Exception as e:
|
| 141 |
+
st.error(f"Failed to load transformer: {e}")
|
| 142 |
+
return None, None
|
| 143 |
+
|
| 144 |
+
def load_training_data_from_files():
|
| 145 |
+
"""Load and combine training data from the two Excel files"""
|
| 146 |
+
try:
|
| 147 |
+
# File paths for the Excel files
|
| 148 |
+
file1_path = "DedooseChartExcerpts_2025_8_5_1025.xlsx"
|
| 149 |
+
file2_path = "Personal Statements Coded.xlsx"
|
| 150 |
+
|
| 151 |
+
# Check if files exist
|
| 152 |
+
if not os.path.exists(file1_path) or not os.path.exists(file2_path):
|
| 153 |
+
return None
|
| 154 |
+
|
| 155 |
+
# Load Excel files
|
| 156 |
+
df1 = pd.read_excel(file1_path)
|
| 157 |
+
df2 = pd.read_excel(file2_path)
|
| 158 |
+
|
| 159 |
+
# Combine dataframes
|
| 160 |
+
combined_df = pd.concat([df1, df2], ignore_index=True)
|
| 161 |
+
|
| 162 |
+
processed_data = []
|
| 163 |
+
|
| 164 |
+
for _, row in combined_df.iterrows():
|
| 165 |
+
text = None
|
| 166 |
+
# Look for text columns
|
| 167 |
+
for col_name in ['Excerpt Copy', 'Excerpt', 'Text', 'Content']:
|
| 168 |
+
if col_name in row and pd.notna(row[col_name]):
|
| 169 |
+
text = str(row[col_name])
|
| 170 |
+
break
|
| 171 |
+
|
| 172 |
+
if not text or text.strip() == '':
|
| 173 |
+
continue
|
| 174 |
+
|
| 175 |
+
data_point = {
|
| 176 |
+
'text': text.strip(),
|
| 177 |
+
'media_title': row.get('Media Title', 'Unknown')
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
# Process categories
|
| 181 |
+
for category in CATEGORIES.keys():
|
| 182 |
+
col_applied = f"Code: {category} Applied"
|
| 183 |
+
col_weight = f"Code: {category} Weight"
|
| 184 |
+
|
| 185 |
+
is_applied = False
|
| 186 |
+
if col_applied in row:
|
| 187 |
+
applied_val = str(row[col_applied]).lower()
|
| 188 |
+
is_applied = applied_val in ['true', '1', 'yes', 't']
|
| 189 |
+
|
| 190 |
+
data_point[f"{category}_applied"] = is_applied
|
| 191 |
+
|
| 192 |
+
if is_applied and col_weight in row:
|
| 193 |
+
weight = row[col_weight]
|
| 194 |
+
if pd.isna(weight) or weight == '':
|
| 195 |
+
weight = 2
|
| 196 |
+
else:
|
| 197 |
+
try:
|
| 198 |
+
weight = int(float(weight))
|
| 199 |
+
weight = max(1, min(4, weight))
|
| 200 |
+
except:
|
| 201 |
+
weight = 2
|
| 202 |
+
else:
|
| 203 |
+
weight = 0
|
| 204 |
+
|
| 205 |
+
data_point[f"{category}_score"] = weight
|
| 206 |
+
|
| 207 |
+
processed_data.append(data_point)
|
| 208 |
+
|
| 209 |
+
return pd.DataFrame(processed_data)
|
| 210 |
+
|
| 211 |
+
except Exception as e:
|
| 212 |
+
st.error(f"Error loading training data: {str(e)}")
|
| 213 |
+
return None
|
| 214 |
|
| 215 |
+
def segment_text(text, embedder):
|
| 216 |
+
"""Segment text using semantic similarity"""
|
|
|
|
| 217 |
paragraphs = re.split(r'\n\s*\n', text)
|
| 218 |
paragraphs = [p.strip() for p in paragraphs if p.strip() and len(p.strip()) > 50]
|
| 219 |
|
|
|
|
| 220 |
if len(paragraphs) <= 1:
|
| 221 |
sentences = re.split(r'(?<=[.!?])\s+', text)
|
| 222 |
sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
|
| 223 |
|
| 224 |
+
if len(sentences) < 3:
|
| 225 |
+
return [text]
|
| 226 |
+
|
| 227 |
+
# Use embeddings for semantic segmentation
|
| 228 |
+
embeddings = embedder.encode(sentences, convert_to_tensor=True)
|
| 229 |
+
|
| 230 |
segments = []
|
| 231 |
+
current_segment = [sentences[0]]
|
| 232 |
+
current_embedding = embeddings[0]
|
| 233 |
|
| 234 |
+
for i in range(1, len(sentences)):
|
| 235 |
+
similarity = util.cos_sim(current_embedding, embeddings[i]).item()
|
|
|
|
| 236 |
|
| 237 |
+
if similarity < 0.7 or len(' '.join(current_segment)) > 500:
|
| 238 |
segments.append(' '.join(current_segment))
|
| 239 |
+
current_segment = [sentences[i]]
|
| 240 |
+
current_embedding = embeddings[i]
|
| 241 |
+
else:
|
| 242 |
+
current_segment.append(sentences[i])
|
| 243 |
+
current_embedding = (current_embedding + embeddings[i]) / 2
|
| 244 |
|
| 245 |
if current_segment:
|
| 246 |
segments.append(' '.join(current_segment))
|
| 247 |
|
| 248 |
+
return segments
|
| 249 |
|
| 250 |
return paragraphs
|
| 251 |
|
| 252 |
+
def extract_features(text, embedder, category_focus=None):
|
| 253 |
+
"""Extract features for classification"""
|
| 254 |
+
features = []
|
| 255 |
text_lower = text.lower()
|
| 256 |
words = text.split()
|
| 257 |
|
| 258 |
+
# Basic text statistics
|
| 259 |
+
features.extend([
|
| 260 |
+
len(text),
|
| 261 |
+
len(words),
|
| 262 |
+
len(set(words)) / max(len(words), 1),
|
| 263 |
+
len(re.findall(r'[.!?]', text)),
|
| 264 |
+
text.count('I') / max(len(words), 1),
|
| 265 |
+
])
|
| 266 |
|
| 267 |
+
# Process all categories
|
| 268 |
for cat_name, cat_info in CATEGORIES.items():
|
| 269 |
+
keywords = cat_info['keywords']
|
| 270 |
+
keyword_matches = sum(1 for kw in keywords if kw.lower() in text_lower)
|
| 271 |
+
keyword_density = keyword_matches / max(len(keywords), 1)
|
| 272 |
+
|
| 273 |
+
if category_focus == cat_name:
|
| 274 |
+
keyword_density *= 2
|
| 275 |
+
|
| 276 |
+
features.append(keyword_density * 10)
|
| 277 |
|
|
|
|
| 278 |
pattern_matches = 0
|
| 279 |
+
for pattern in cat_info.get('patterns', []):
|
| 280 |
+
matches = re.findall(pattern, text_lower)
|
| 281 |
+
pattern_matches += len(matches)
|
| 282 |
+
features.append(pattern_matches)
|
| 283 |
+
|
| 284 |
+
positive_count = sum(1 for word in cat_info['rubric_features']['positive']
|
| 285 |
+
if word in text_lower)
|
| 286 |
+
negative_count = sum(1 for word in cat_info['rubric_features']['negative']
|
| 287 |
+
if word in text_lower)
|
| 288 |
+
|
| 289 |
+
features.extend([
|
| 290 |
+
positive_count / max(len(words), 1) * 100,
|
| 291 |
+
negative_count / max(len(words), 1) * 100
|
| 292 |
+
])
|
| 293 |
+
|
| 294 |
+
# Get embeddings
|
| 295 |
+
try:
|
| 296 |
+
embedding = embedder.encode(text, convert_to_tensor=False, normalize_embeddings=True)
|
| 297 |
+
if hasattr(embedding, 'cpu'):
|
| 298 |
+
embedding = embedding.cpu().numpy()
|
| 299 |
+
embedding = embedding.flatten()
|
| 300 |
+
# Limit embedding size for memory efficiency
|
| 301 |
+
embedding = embedding[:512] if len(embedding) > 512 else embedding
|
| 302 |
+
except:
|
| 303 |
+
embedding = np.zeros(512)
|
| 304 |
+
|
| 305 |
+
# Category similarity
|
| 306 |
+
if category_focus and category_focus in CATEGORIES:
|
| 307 |
+
category_text = f"{CATEGORIES[category_focus]['description']} {' '.join(CATEGORIES[category_focus]['keywords'][:10])}"
|
| 308 |
+
try:
|
| 309 |
+
category_embedding = embedder.encode(category_text, normalize_embeddings=True)
|
| 310 |
+
if hasattr(category_embedding, 'cpu'):
|
| 311 |
+
category_embedding = category_embedding.cpu().numpy()
|
| 312 |
+
category_embedding = category_embedding.flatten()[:512]
|
| 313 |
+
similarity = cosine_similarity([embedding[:512]], [category_embedding])[0][0]
|
| 314 |
+
features.append(similarity * 10)
|
| 315 |
+
except:
|
| 316 |
+
features.append(0)
|
| 317 |
else:
|
| 318 |
+
features.append(0)
|
| 319 |
|
| 320 |
+
features = np.array(features, dtype=np.float32)
|
| 321 |
+
combined_features = np.concatenate([features, embedding])
|
|
|
|
| 322 |
|
| 323 |
+
return combined_features
|
| 324 |
+
|
| 325 |
+
def train_models(df, embedder):
    """Train per-category ensemble models on coded personal-statement excerpts.

    For each row, features are extracted once per rubric category; the feature
    vector of the first true category (or the mean over all categories when none
    apply) represents the sample. Per category, an XGBoost classifier is trained
    when at least 5 positive samples exist, a balanced RandomForest is always
    trained (ensemble member, and sole classifier for rare categories), and an
    XGBoost regressor (or a constant DummyRegressor fallback) learns the 1-4
    rubric score normalized to [0, 1].

    Args:
        df: DataFrame with a 'text' column plus '<cat>_applied' booleans and
            '<cat>_score' columns for every category in CATEGORIES.
        embedder: sentence-transformer passed through to extract_features().

    Returns:
        (scaler, classifiers, scorers, thresholds, accuracies, ensemble)
    """
    from sklearn.dummy import DummyRegressor  # fallback scorer for sparse categories

    all_features = []

    progress_bar = st.progress(0)
    status_text = st.empty()

    status_text.text("Extracting features from training data...")

    # NOTE: iterrows() yields index *labels*, which need not be 0..n-1 after
    # concatenating/filtering the source Excel files. Streamlit's progress()
    # rejects values > 1, so track position explicitly.
    for pos, (_, row) in enumerate(df.iterrows()):
        text = row['text']

        category_features = {}
        for cat in CATEGORIES.keys():
            features = extract_features(text, embedder, category_focus=cat)
            category_features[cat] = features

        true_categories = [cat for cat in CATEGORIES.keys() if row[f"{cat}_applied"]]

        if true_categories:
            # Represent the sample by its first labeled category's features.
            features = category_features[true_categories[0]]
        else:
            # No label applied: average the category-conditioned feature vectors.
            features = np.mean(list(category_features.values()), axis=0)

        all_features.append(features)
        progress_bar.progress((pos + 1) / len(df))

    X = np.array(all_features)

    categories = list(CATEGORIES.keys())
    y_class = df[[f"{cat}_applied" for cat in categories]].values.astype(float)

    # Normalize rubric scores (1-4) to [0, 1]; 0 where the category is absent.
    y_score = []
    for _, row in df.iterrows():
        scores = []
        for cat in categories:
            if row[f"{cat}_applied"]:
                scores.append(row[f"{cat}_score"] / 4.0)
            else:
                scores.append(0)
        y_score.append(scores)
    y_score = np.array(y_score)

    status_text.text("Training models...")

    # Split data (classification and score targets kept aligned).
    X_train, X_test, y_class_train, y_class_test, y_score_train, y_score_test = train_test_split(
        X, y_class, y_score, test_size=0.2, random_state=42
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train classifiers and scorers
    classifiers = {}
    scorers = {}
    thresholds = {}
    ensemble = {}

    for i, cat in enumerate(categories):
        n_positive = np.sum(y_class_train[:, i])

        models = []

        # XGBoost classifier only when there is enough positive signal.
        # (use_label_encoder was removed in XGBoost >= 2.0 and is not passed.)
        if n_positive >= 5:
            xgb_clf = xgb.XGBClassifier(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.1,
                random_state=42,
                eval_metric='logloss'
            )
            xgb_clf.fit(X_train_scaled, y_class_train[:, i])
            models.append(('xgb', xgb_clf))
            classifiers[cat] = xgb_clf

        # Random Forest as backup (rare categories) or ensemble member.
        rf_clf = RandomForestClassifier(
            n_estimators=100,
            max_depth=6,
            class_weight='balanced',
            random_state=42
        )
        rf_clf.fit(X_train_scaled, y_class_train[:, i])
        models.append(('rf', rf_clf))

        if n_positive < 5:
            classifiers[cat] = rf_clf

        ensemble[cat] = models
        thresholds[cat] = 0.5

        # Train scorer on positive samples only; constant fallback when sparse.
        mask = y_class_train[:, i] == 1
        if np.sum(mask) > 5:
            scorer = xgb.XGBRegressor(
                n_estimators=100,
                max_depth=4,
                random_state=42
            )
            scorer.fit(X_train_scaled[mask], y_score_train[mask, i])
        else:
            scorer = DummyRegressor(strategy='constant', constant=0.5)
            scorer.fit(X_train_scaled, y_score_train[:, i])

        scorers[cat] = scorer

    # Held-out accuracy per category (hard predictions from the primary classifier).
    accuracies = []
    for i, cat in enumerate(categories):
        preds = classifiers[cat].predict(X_test_scaled)
        acc = np.mean(preds == y_class_test[:, i])
        accuracies.append(acc)

    status_text.empty()
    progress_bar.empty()

    return scaler, classifiers, scorers, thresholds, accuracies, ensemble
|
| 448 |
+
|
| 449 |
+
def classify_segment(text, embedder, scaler, classifiers, scorers, thresholds, ensemble=None):
    """Classify a segment of text into one rubric category and score it 1-4.

    Computes a probability per category (averaged over ensemble members when an
    ensemble is provided, otherwise from the single classifier), picks the most
    probable category, and — if its probability clears that category's
    threshold — predicts a rubric score via the category's regressor.

    Args:
        text: segment text to classify.
        embedder: sentence-transformer passed through to extract_features().
        scaler: fitted StandardScaler from train_models().
        classifiers, scorers, thresholds, ensemble: outputs of train_models().

    Returns:
        dict with 'category', 'score' (1-4 or None), 'confidence', 'text',
        and 'all_probabilities' (per-category probability map).
    """
    categories = list(CATEGORIES.keys())
    category_results = {}
    # Cache scaled features per category so the winning category's vector is
    # not recomputed below (extract_features runs an expensive encode call).
    scaled_features = {}

    for cat in categories:
        features = extract_features(text, embedder, category_focus=cat)
        features_scaled = scaler.transform([features])
        scaled_features[cat] = features_scaled

        if ensemble and cat in ensemble:
            probs = []
            for name, model in ensemble[cat]:
                if hasattr(model, 'predict_proba'):
                    model_probs = model.predict_proba(features_scaled)
                    # Only use models that saw both classes during training.
                    if model_probs.shape[1] == 2:
                        probs.append(model_probs[0, 1])

            avg_prob = np.mean(probs) if probs else 0.5
        else:
            if hasattr(classifiers[cat], 'predict_proba'):
                probs = classifiers[cat].predict_proba(features_scaled)
                if probs.shape[1] == 2:
                    avg_prob = probs[0, 1]
                else:
                    avg_prob = 0.5
            else:
                avg_prob = 0.5

        category_results[cat] = avg_prob

    best_category = max(category_results, key=category_results.get)
    best_prob = category_results[best_category]

    if best_prob > thresholds.get(best_category, 0.5):
        # Reuse the cached scaled vector for the winning category.
        features_scaled = scaled_features[best_category]

        try:
            score_normalized = scorers[best_category].predict(features_scaled)[0]
            # Map normalized [0, 1] prediction back onto the 1-4 rubric scale.
            score = int(np.clip(np.round(score_normalized * 4), 1, 4))
        except Exception:
            # Fall back to a neutral score if the regressor fails.
            score = 2

        return {
            'category': best_category,
            'score': score,
            'confidence': float(best_prob),
            'text': text,
            'all_probabilities': category_results
        }
    else:
        return {
            'category': 'Unclassified',
            'score': None,
            'confidence': 0,
            'text': text,
            'all_probabilities': category_results
        }
|
| 510 |
|
| 511 |
+
def analyze_statement(text, embedder, scaler, classifiers, scorers, thresholds, ensemble=None):
|
| 512 |
"""Analyze complete personal statement"""
|
| 513 |
+
segments = segment_text(text, embedder)
|
| 514 |
|
| 515 |
segment_results = []
|
| 516 |
for i, segment in enumerate(segments):
|
| 517 |
+
result = classify_segment(segment, embedder, scaler, classifiers, scorers, thresholds, ensemble)
|
| 518 |
result['segment_num'] = i + 1
|
| 519 |
segment_results.append(result)
|
| 520 |
|
|
|
|
| 545 |
|
| 546 |
return segment_results, category_results
|
| 547 |
|
| 548 |
+
def create_pdf_report(segment_results, category_results):
|
| 549 |
+
"""Create PDF report"""
|
| 550 |
+
if not PDF_AVAILABLE:
|
| 551 |
+
return None
|
| 552 |
+
|
| 553 |
buffer = BytesIO()
|
| 554 |
doc = SimpleDocTemplate(buffer, pagesize=letter, rightMargin=72, leftMargin=72,
|
| 555 |
topMargin=72, bottomMargin=18)
|
|
|
|
| 611 |
]))
|
| 612 |
|
| 613 |
elements.append(summary_table)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 614 |
|
| 615 |
# Build PDF
|
| 616 |
doc.build(elements)
|
|
|
|
| 620 |
# Main Application
|
| 621 |
def main():
|
| 622 |
st.title("π₯ Medical School Personal Statement Analyzer")
|
| 623 |
+
st.markdown("*AI-powered analysis based on medical school admission rubrics*")
|
| 624 |
st.markdown("---")
|
| 625 |
|
| 626 |
+
# Initialize session state
|
| 627 |
+
if 'model_trained' not in st.session_state:
|
| 628 |
+
st.session_state.model_trained = False
|
| 629 |
+
if 'embedder' not in st.session_state:
|
| 630 |
+
st.session_state.embedder = None
|
| 631 |
+
if 'scaler' not in st.session_state:
|
| 632 |
+
st.session_state.scaler = None
|
| 633 |
+
if 'classifiers' not in st.session_state:
|
| 634 |
+
st.session_state.classifiers = None
|
| 635 |
+
if 'scorers' not in st.session_state:
|
| 636 |
+
st.session_state.scorers = None
|
| 637 |
+
if 'thresholds' not in st.session_state:
|
| 638 |
+
st.session_state.thresholds = None
|
| 639 |
+
if 'ensemble' not in st.session_state:
|
| 640 |
+
st.session_state.ensemble = None
|
| 641 |
+
|
| 642 |
+
# Create three tabs
|
| 643 |
+
tab1, tab2, tab3 = st.tabs(["π Step 1: Train Model", "π Step 2: Analyze Statements", "π Step 3: View Rubrics"])
|
| 644 |
+
|
| 645 |
+
# STEP 1: TRAIN MODEL
|
| 646 |
+
with tab1:
|
| 647 |
+
st.header("Step 1: Train the AI Model")
|
| 648 |
st.markdown("""
|
| 649 |
+
### Instructions:
|
| 650 |
+
Click the 'Train Model' button to automatically train the AI using:
|
| 651 |
+
- Pre-loaded Excel training files
|
| 652 |
+
- State-of-the-art e5-large-v2 transformer model
|
| 653 |
+
- Ensemble classification algorithms
|
| 654 |
+
""")
|
| 655 |
|
| 656 |
+
# Check if models already exist in session
|
| 657 |
+
if st.session_state.model_trained:
|
| 658 |
+
st.success("β
Model is already trained and ready for analysis!")
|
| 659 |
+
st.info("You can proceed to Step 2 to analyze statements, or retrain if needed.")
|
| 660 |
|
| 661 |
+
st.markdown("---")
|
|
|
|
| 662 |
|
| 663 |
+
# Train button
|
| 664 |
+
if st.button("π Train Model", type="primary", use_container_width=True):
|
| 665 |
+
# Load training data
|
| 666 |
+
with st.spinner("Loading training data from Excel files..."):
|
| 667 |
+
df = load_training_data_from_files()
|
| 668 |
+
|
| 669 |
+
if df is None or df.empty:
|
| 670 |
+
st.error("""
|
| 671 |
+
β Could not load training data. Please ensure these files are present:
|
| 672 |
+
- DedooseChartExcerpts_2025_8_5_1025.xlsx
|
| 673 |
+
- Personal Statements Coded.xlsx
|
| 674 |
+
""")
|
| 675 |
+
st.stop()
|
| 676 |
+
|
| 677 |
+
st.success(f"β
Loaded {len(df)} training samples")
|
| 678 |
+
|
| 679 |
+
# Show data distribution
|
| 680 |
+
st.subheader("Training Data Distribution:")
|
| 681 |
+
dist_cols = st.columns(4)
|
| 682 |
+
for idx, cat in enumerate(CATEGORIES.keys()):
|
| 683 |
+
if f"{cat}_applied" in df.columns:
|
| 684 |
+
count = df[f"{cat}_applied"].sum()
|
| 685 |
+
with dist_cols[idx % 4]:
|
| 686 |
+
st.metric(cat, f"{int(count)} samples")
|
| 687 |
+
|
| 688 |
+
# Load transformer model
|
| 689 |
+
with st.spinner("Loading e5-large-v2 transformer model..."):
|
| 690 |
+
if st.session_state.embedder is None:
|
| 691 |
+
embedder, embedder_name = load_sentence_transformer()
|
| 692 |
+
st.session_state.embedder = embedder
|
| 693 |
+
else:
|
| 694 |
+
embedder = st.session_state.embedder
|
| 695 |
+
embedder_name = 'intfloat/e5-large-v2'
|
| 696 |
+
|
| 697 |
+
if embedder is None:
|
| 698 |
+
st.error("Failed to load transformer model")
|
| 699 |
+
st.stop()
|
| 700 |
+
|
| 701 |
+
st.info(f"Using model: {embedder_name}")
|
| 702 |
+
|
| 703 |
+
# Train models
|
| 704 |
+
st.subheader("Training Progress:")
|
| 705 |
+
scaler, classifiers, scorers, thresholds, accuracies, ensemble = train_models(df, embedder)
|
| 706 |
+
|
| 707 |
+
# Save to session state
|
| 708 |
+
st.session_state.scaler = scaler
|
| 709 |
+
st.session_state.classifiers = classifiers
|
| 710 |
+
st.session_state.scorers = scorers
|
| 711 |
+
st.session_state.thresholds = thresholds
|
| 712 |
+
st.session_state.ensemble = ensemble
|
| 713 |
+
st.session_state.model_trained = True
|
| 714 |
+
|
| 715 |
+
st.success("β
Training Complete!")
|
| 716 |
+
|
| 717 |
+
# Show performance metrics
|
| 718 |
+
st.subheader("Model Performance:")
|
| 719 |
+
metrics_cols = st.columns(4)
|
| 720 |
+
for idx, (cat, acc) in enumerate(zip(CATEGORIES.keys(), accuracies)):
|
| 721 |
+
with metrics_cols[idx % 4]:
|
| 722 |
+
st.metric(cat, f"{acc:.1%} accuracy")
|
| 723 |
+
|
| 724 |
+
avg_accuracy = np.mean(accuracies)
|
| 725 |
+
st.metric("**Overall Model Accuracy**", f"{avg_accuracy:.1%}")
|
| 726 |
+
|
| 727 |
+
st.balloons()
|
| 728 |
+
|
| 729 |
+
# STEP 2: ANALYZE STATEMENTS
|
| 730 |
+
with tab2:
|
| 731 |
+
st.header("Step 2: Analyze Personal Statements")
|
| 732 |
|
| 733 |
+
# Check if models are trained
|
| 734 |
+
if not st.session_state.model_trained:
|
| 735 |
+
st.warning("β οΈ No trained models found. Please complete Step 1: Train Model first.")
|
| 736 |
+
st.stop()
|
| 737 |
|
| 738 |
+
st.success("β
Models loaded successfully")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 739 |
|
| 740 |
+
st.markdown("""
|
| 741 |
+
### Instructions:
|
| 742 |
+
Upload or paste a personal statement to receive:
|
| 743 |
+
- Category detection and scoring (1-4)
|
| 744 |
+
- Segment-by-segment analysis
|
| 745 |
+
- Detailed recommendations
|
| 746 |
+
- Downloadable PDF report
|
| 747 |
+
""")
|
| 748 |
|
| 749 |
+
# Input method selection
|
| 750 |
+
input_method = st.radio(
|
| 751 |
+
"Choose input method:",
|
| 752 |
+
["Upload Text File (.txt)", "Paste Text Directly"],
|
| 753 |
+
horizontal=True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 754 |
)
|
| 755 |
|
| 756 |
+
statement_text = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 757 |
|
| 758 |
+
if input_method == "Upload Text File (.txt)":
|
| 759 |
+
uploaded_file = st.file_uploader(
|
| 760 |
+
"Choose a text file",
|
| 761 |
+
type=['txt'],
|
| 762 |
+
help="Upload your personal statement as a .txt file"
|
| 763 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 764 |
|
| 765 |
+
if uploaded_file is not None:
|
| 766 |
+
statement_text = str(uploaded_file.read(), 'utf-8')
|
| 767 |
+
st.success(f"β
File uploaded ({len(statement_text)} characters)")
|
| 768 |
+
|
| 769 |
+
with st.expander("Preview Statement"):
|
| 770 |
+
st.text(statement_text[:500] + "..." if len(statement_text) > 500 else statement_text)
|
| 771 |
+
|
| 772 |
+
else: # Paste Text Directly
|
| 773 |
+
statement_text = st.text_area(
|
| 774 |
+
"Paste your personal statement here:",
|
| 775 |
+
height=400,
|
| 776 |
+
placeholder="Enter your complete personal statement...",
|
| 777 |
+
help="Paste your entire personal statement for analysis"
|
| 778 |
+
)
|
| 779 |
|
| 780 |
+
if statement_text:
|
| 781 |
+
st.info(f"π Statement length: {len(statement_text)} characters, {len(statement_text.split())} words")
|
| 782 |
+
|
| 783 |
+
# Analyze button
|
| 784 |
+
if statement_text and len(statement_text) > 100:
|
| 785 |
+
if st.button("π¬ Analyze Statement", type="primary", use_container_width=True):
|
| 786 |
+
|
| 787 |
+
with st.spinner("Analyzing your personal statement..."):
|
| 788 |
+
segment_results, category_results = analyze_statement(
|
| 789 |
+
statement_text,
|
| 790 |
+
st.session_state.embedder,
|
| 791 |
+
st.session_state.scaler,
|
| 792 |
+
st.session_state.classifiers,
|
| 793 |
+
st.session_state.scorers,
|
| 794 |
+
st.session_state.thresholds,
|
| 795 |
+
st.session_state.ensemble
|
| 796 |
+
)
|
| 797 |
+
|
| 798 |
+
st.success("β
Analysis Complete!")
|
| 799 |
+
st.balloons()
|
| 800 |
+
|
| 801 |
+
# Display results
|
| 802 |
+
st.markdown("---")
|
| 803 |
+
st.subheader("π Overall Summary")
|
| 804 |
|
| 805 |
# Metrics
|
| 806 |
col1, col2, col3, col4 = st.columns(4)
|
|
|
|
| 808 |
detected_cats = [cat for cat, res in category_results.items() if res['detected']]
|
| 809 |
|
| 810 |
with col1:
|
| 811 |
+
st.metric("Categories Found", f"{len(detected_cats)}/4")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
|
| 813 |
with col2:
|
| 814 |
if detected_cats:
|
|
|
|
| 823 |
with col4:
|
| 824 |
if detected_cats:
|
| 825 |
avg_score = np.mean([category_results[cat]['score'] for cat in detected_cats])
|
| 826 |
+
quality = "Excellent" if avg_score >= 3.5 else "Good" if avg_score >= 2.5 else "Needs Work"
|
| 827 |
+
st.metric("Overall Quality", quality)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 828 |
else:
|
| 829 |
st.metric("Overall Quality", "N/A")
|
| 830 |
|
| 831 |
+
# Category Analysis
|
| 832 |
+
st.markdown("---")
|
| 833 |
+
st.subheader("π Category Analysis")
|
| 834 |
|
| 835 |
for cat in CATEGORIES.keys():
|
| 836 |
res = category_results[cat]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 837 |
if res['detected']:
|
| 838 |
+
icon = "β
" if res['score'] >= 3 else "β οΈ" if res['score'] >= 2 else "β"
|
| 839 |
+
st.write(f"{icon} **{cat}**: Score {res['score']}/4 (Confidence: {res['confidence']:.1%})")
|
| 840 |
st.progress(res['score'] / 4)
|
| 841 |
+
else:
|
| 842 |
+
st.write(f"β **{cat}**: Not detected")
|
| 843 |
+
st.progress(0)
|
| 844 |
+
|
| 845 |
+
# Segment Details
|
| 846 |
+
st.markdown("---")
|
| 847 |
+
st.subheader("π Segment-by-Segment Analysis")
|
| 848 |
|
| 849 |
for segment in segment_results:
|
| 850 |
+
quality_map = {1: "Poor", 2: "Below Average", 3: "Good", 4: "Excellent", None: "N/A"}
|
| 851 |
+
quality = quality_map.get(segment['score'], "N/A")
|
| 852 |
+
|
| 853 |
with st.expander(f"Segment {segment['segment_num']}: {segment['category']} (Score: {segment['score']}/4)"):
|
| 854 |
col1, col2 = st.columns([1, 3])
|
| 855 |
|
| 856 |
with col1:
|
| 857 |
st.metric("Category", segment['category'])
|
| 858 |
+
st.metric("Score", f"{segment['score']}/4" if segment['score'] else "N/A")
|
| 859 |
st.metric("Confidence", f"{segment['confidence']:.1%}")
|
| 860 |
|
| 861 |
with col2:
|
| 862 |
st.write("**Text:**")
|
| 863 |
+
st.write(segment['text'][:500] + "..." if len(segment['text']) > 500 else segment['text'])
|
| 864 |
|
| 865 |
+
if segment['category'] != 'Unclassified' and segment['score']:
|
| 866 |
+
st.write("**Rubric:**")
|
| 867 |
+
st.info(CATEGORIES[segment['category']]['rubric'][segment['score']])
|
| 868 |
+
|
| 869 |
+
# Recommendations
|
| 870 |
+
st.markdown("---")
|
| 871 |
+
st.subheader("π‘ Recommendations")
|
| 872 |
|
| 873 |
missing_cats = [cat for cat, res in category_results.items() if not res['detected']]
|
| 874 |
low_score_cats = [cat for cat, res in category_results.items()
|
| 875 |
if res['detected'] and res['score'] and res['score'] < 3]
|
| 876 |
|
| 877 |
if missing_cats:
|
| 878 |
+
st.error("**Missing Categories - Must Add:**")
|
| 879 |
for cat in missing_cats:
|
| 880 |
+
st.write(f"**{cat}:** {CATEGORIES[cat]['description']}")
|
| 881 |
+
st.write(f"Keywords: {', '.join(CATEGORIES[cat]['keywords'][:8])}")
|
|
|
|
|
|
|
|
|
|
| 882 |
|
| 883 |
if low_score_cats:
|
| 884 |
+
st.warning("**Low-Scoring Categories - Improve:**")
|
| 885 |
for cat in low_score_cats:
|
| 886 |
+
score = category_results[cat]['score']
|
| 887 |
+
st.write(f"**{cat}** (Score: {score}/4)")
|
| 888 |
+
st.write(f"Target: {CATEGORIES[cat]['rubric'][4]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 889 |
|
| 890 |
if not missing_cats and not low_score_cats:
|
| 891 |
+
st.success("Excellent! All categories present with good scores.")
|
| 892 |
+
|
| 893 |
+
# Download Report
|
| 894 |
+
st.markdown("---")
|
| 895 |
+
if PDF_AVAILABLE:
|
| 896 |
+
pdf_buffer = create_pdf_report(segment_results, category_results)
|
| 897 |
+
if pdf_buffer:
|
| 898 |
+
st.download_button(
|
| 899 |
+
label="π₯ Download PDF Report",
|
| 900 |
+
data=pdf_buffer,
|
| 901 |
+
file_name=f"analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf",
|
| 902 |
+
mime="application/pdf",
|
| 903 |
+
use_container_width=True
|
| 904 |
+
)
|
| 905 |
+
else:
|
| 906 |
+
# CSV fallback
|
| 907 |
+
results_data = []
|
| 908 |
+
for seg in segment_results:
|
| 909 |
+
results_data.append({
|
| 910 |
+
'Segment': seg['segment_num'],
|
| 911 |
+
'Category': seg['category'],
|
| 912 |
+
'Score': seg['score'],
|
| 913 |
+
'Confidence': seg['confidence']
|
| 914 |
+
})
|
| 915 |
+
|
| 916 |
+
results_df = pd.DataFrame(results_data)
|
| 917 |
+
csv = results_df.to_csv(index=False)
|
| 918 |
+
|
| 919 |
+
st.download_button(
|
| 920 |
+
label="π₯ Download CSV Report",
|
| 921 |
+
data=csv,
|
| 922 |
+
file_name=f"analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
|
| 923 |
+
mime="text/csv",
|
| 924 |
+
use_container_width=True
|
| 925 |
+
)
|
| 926 |
+
|
| 927 |
+
elif statement_text and len(statement_text) <= 100:
|
| 928 |
+
st.warning("β οΈ Please enter a longer statement (minimum 100 characters)")
|
| 929 |
+
else:
|
| 930 |
+
st.info("π Please upload or paste your personal statement to begin analysis")
|
| 931 |
+
|
| 932 |
+
# STEP 3: VIEW RUBRICS
|
| 933 |
+
with tab3:
|
| 934 |
+
st.header("Step 3: Understanding the Scoring Rubrics")
|
| 935 |
+
|
| 936 |
+
st.markdown("""
|
| 937 |
+
The AI model evaluates personal statements based on **4 key categories**,
|
| 938 |
+
each scored on a scale of **1 (Poor) to 4 (Excellent)**.
|
| 939 |
+
""")
|
| 940 |
+
|
| 941 |
+
for category, info in CATEGORIES.items():
|
| 942 |
+
with st.expander(f"**{category}** - {info['description']}", expanded=False):
|
| 943 |
|
| 944 |
+
# Scoring Criteria
|
| 945 |
+
st.subheader("Scoring Criteria:")
|
| 946 |
+
for score in [4, 3, 2, 1]:
|
| 947 |
+
quality = ['Poor', 'Below Average', 'Good', 'Excellent'][score-1]
|
| 948 |
+
if score == 4:
|
| 949 |
+
st.success(f"**Score {score} ({quality}):** {info['rubric'][score]}")
|
| 950 |
+
elif score == 3:
|
| 951 |
+
st.info(f"**Score {score} ({quality}):** {info['rubric'][score]}")
|
| 952 |
+
elif score == 2:
|
| 953 |
+
st.warning(f"**Score {score} ({quality}):** {info['rubric'][score]}")
|
| 954 |
+
else:
|
| 955 |
+
st.error(f"**Score {score} ({quality}):** {info['rubric'][score]}")
|
| 956 |
|
| 957 |
+
st.markdown("---")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 958 |
|
| 959 |
+
# Keywords and indicators
|
| 960 |
+
col1, col2 = st.columns(2)
|
| 961 |
+
|
| 962 |
+
with col1:
|
| 963 |
+
st.markdown("**Key Terms:**")
|
| 964 |
+
st.write(', '.join(info['keywords'][:10]))
|
| 965 |
+
|
| 966 |
+
with col2:
|
| 967 |
+
st.markdown("**Quality Indicators:**")
|
| 968 |
+
st.write(f"β
Positive: {', '.join(info['rubric_features']['positive'][:5])}")
|
| 969 |
+
st.write(f"β Avoid: {', '.join(info['rubric_features']['negative'][:5])}")
|
| 970 |
+
|
| 971 |
+
st.markdown("---")
|
| 972 |
+
st.info("""
|
| 973 |
+
### Tips for High Scores:
|
| 974 |
+
- **Spark (4/4):** Create an engaging opening that clearly connects to your medical journey
|
| 975 |
+
- **Healthcare Experience (4/4):** Show active participation with vivid, thoughtful descriptions
|
| 976 |
+
- **Doctor Qualities (4/4):** Demonstrate mature, realistic understanding with specific examples
|
| 977 |
+
- **Spin (4/4):** Make direct, logical connections between experiences and medical career
|
| 978 |
+
""")
|
| 979 |
|
| 980 |
# Run the application
|
| 981 |
if __name__ == "__main__":
|