stevafernandes committed on
Commit
e8445d0
·
verified ·
1 Parent(s): 7b1f193

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +678 -343
app.py CHANGED
@@ -6,31 +6,39 @@ import os
6
  import re
7
  from datetime import datetime
8
  from io import BytesIO
9
- import base64
 
10
 
11
  # Page config MUST be first
12
  st.set_page_config(
13
  page_title="Medical School Personal Statement Analyzer",
14
  page_icon="πŸ₯",
15
- layout="wide"
 
16
  )
17
 
18
  # Import ML libraries
19
- from sentence_transformers import SentenceTransformer
 
20
  from sklearn.preprocessing import StandardScaler
21
- from sklearn.ensemble import RandomForestClassifier
22
  from sklearn.metrics.pairwise import cosine_similarity
 
23
  import xgboost as xgb
 
24
 
25
  # Import PDF generation libraries
26
- from reportlab.lib import colors
27
- from reportlab.lib.pagesizes import letter
28
- from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak
29
- from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
30
- from reportlab.lib.units import inch
31
- from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
 
 
 
 
32
 
33
- # Categories definition with rubrics
34
  CATEGORIES = {
35
  'Spark': {
36
  'description': 'Opening that spurs interest in medicine (typically in opening paragraph)',
@@ -47,6 +55,10 @@ CATEGORIES = {
47
  2: 'somewhat connected but unclear',
48
  3: 'connected and clear',
49
  4: 'engaging and logically flows into becoming a doctor'
 
 
 
 
50
  }
51
  },
52
  'Healthcare Experience': {
@@ -64,6 +76,10 @@ CATEGORIES = {
64
  2: 'bland/boring but not problematic',
65
  3: 'interesting and relevant',
66
  4: 'vivid, active, thoughtful, relevant, memorable, positive'
 
 
 
 
67
  }
68
  },
69
  'Showing Doctor Qualities': {
@@ -81,6 +97,10 @@ CATEGORIES = {
81
  2: 'bland/boring but not problematic',
82
  3: 'shows some understanding',
83
  4: 'realistic, self-aware, mature, humble, specific understanding'
 
 
 
 
84
  }
85
  },
86
  'Spin': {
@@ -97,123 +117,404 @@ CATEGORIES = {
97
  2: 'some connection but generic',
98
  3: 'clear connection',
99
  4: 'direct, logical, and specific argument'
 
 
 
 
100
  }
101
  }
102
  }
103
 
104
  @st.cache_resource
105
- def load_pretrained_model():
106
- """Load the pre-trained sentence transformer model"""
107
- model = SentenceTransformer('all-MiniLM-L6-v2')
108
- return model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
- def segment_text(text):
111
- """Segment text into meaningful paragraphs/chunks"""
112
- # Try to split by double newlines first
113
  paragraphs = re.split(r'\n\s*\n', text)
114
  paragraphs = [p.strip() for p in paragraphs if p.strip() and len(p.strip()) > 50]
115
 
116
- # If only one paragraph, try to split by sentences
117
  if len(paragraphs) <= 1:
118
  sentences = re.split(r'(?<=[.!?])\s+', text)
119
  sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
120
 
121
- # Group sentences into segments of ~300 words
 
 
 
 
 
122
  segments = []
123
- current_segment = []
124
- current_length = 0
125
 
126
- for sent in sentences:
127
- current_segment.append(sent)
128
- current_length += len(sent.split())
129
 
130
- if current_length > 100: # About 300-400 characters
131
  segments.append(' '.join(current_segment))
132
- current_segment = []
133
- current_length = 0
 
 
 
134
 
135
  if current_segment:
136
  segments.append(' '.join(current_segment))
137
 
138
- return segments if segments else [text]
139
 
140
  return paragraphs
141
 
142
- def analyze_segment(text, embedder):
143
- """Analyze a single segment to determine category and score"""
 
144
  text_lower = text.lower()
145
  words = text.split()
146
 
147
- # Calculate scores for each category
148
- category_scores = {}
 
 
 
 
 
 
149
 
 
150
  for cat_name, cat_info in CATEGORIES.items():
151
- # Keyword matching
152
- keyword_matches = sum(1 for kw in cat_info['keywords'] if kw.lower() in text_lower)
153
- keyword_score = keyword_matches / len(cat_info['keywords'])
 
 
 
 
 
154
 
155
- # Pattern matching
156
  pattern_matches = 0
157
- for pattern in cat_info['patterns']:
158
- if re.search(pattern, text_lower):
159
- pattern_matches += 1
160
- pattern_score = pattern_matches / len(cat_info['patterns']) if cat_info['patterns'] else 0
161
-
162
- # Semantic similarity using embeddings
163
- category_text = f"{cat_info['description']} {' '.join(cat_info['keywords'][:10])}"
164
- text_embedding = embedder.encode(text)
165
- category_embedding = embedder.encode(category_text)
166
-
167
- if hasattr(text_embedding, 'cpu'):
168
- text_embedding = text_embedding.cpu().numpy()
169
- category_embedding = category_embedding.cpu().numpy()
170
-
171
- similarity = cosine_similarity([text_embedding], [category_embedding])[0][0]
172
-
173
- # Combined score
174
- combined_score = (keyword_score * 0.3 + pattern_score * 0.2 + similarity * 0.5)
175
- category_scores[cat_name] = combined_score
176
-
177
- # Select best category
178
- best_category = max(category_scores, key=category_scores.get)
179
- confidence = category_scores[best_category]
180
-
181
- # Determine quality score (1-4) based on rubric
182
- if confidence > 0.7:
183
- score = 4
184
- elif confidence > 0.5:
185
- score = 3
186
- elif confidence > 0.3:
187
- score = 2
 
 
 
 
 
 
 
188
  else:
189
- score = 1
190
 
191
- # Adjust score based on text quality indicators
192
- positive_indicators = ['vivid', 'thoughtful', 'specific', 'logical', 'mature']
193
- negative_indicators = ['vague', 'generic', 'unclear', 'disconnected', 'simplistic']
194
 
195
- positive_count = sum(1 for ind in positive_indicators if ind in text_lower)
196
- negative_count = sum(1 for ind in negative_indicators if ind in text_lower)
 
 
 
197
 
198
- if positive_count > negative_count and score < 4:
199
- score = min(score + 1, 4)
200
- elif negative_count > positive_count and score > 1:
201
- score = max(score - 1, 1)
202
 
203
- return {
204
- 'category': best_category,
205
- 'score': score,
206
- 'confidence': confidence,
207
- 'text': text
208
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
- def analyze_full_statement(text, embedder):
211
  """Analyze complete personal statement"""
212
- segments = segment_text(text)
213
 
214
  segment_results = []
215
  for i, segment in enumerate(segments):
216
- result = analyze_segment(segment, embedder)
217
  result['segment_num'] = i + 1
218
  segment_results.append(result)
219
 
@@ -244,8 +545,11 @@ def analyze_full_statement(text, embedder):
244
 
245
  return segment_results, category_results
246
 
247
- def create_pdf_report(segment_results, category_results, statement_text):
248
- """Create a professional PDF report"""
 
 
 
249
  buffer = BytesIO()
250
  doc = SimpleDocTemplate(buffer, pagesize=letter, rightMargin=72, leftMargin=72,
251
  topMargin=72, bottomMargin=18)
@@ -307,92 +611,6 @@ def create_pdf_report(segment_results, category_results, statement_text):
307
  ]))
308
 
309
  elements.append(summary_table)
310
- elements.append(Spacer(1, 30))
311
-
312
- # Category Analysis
313
- elements.append(Paragraph("CATEGORY ANALYSIS", heading_style))
314
-
315
- category_data = [['Category', 'Status', 'Score', 'Confidence', 'Segments']]
316
- for cat in CATEGORIES.keys():
317
- if category_results[cat]['detected']:
318
- status = "βœ“ Detected"
319
- score = f"{category_results[cat]['score']}/4"
320
- confidence = f"{category_results[cat]['confidence']:.1%}"
321
- segments = str(category_results[cat]['num_segments'])
322
- else:
323
- status = "βœ— Not Found"
324
- score = "N/A"
325
- confidence = "N/A"
326
- segments = "0"
327
- category_data.append([cat, status, score, confidence, segments])
328
-
329
- category_table = Table(category_data, colWidths=[2*inch, 1.2*inch, 0.8*inch, 1*inch, 1*inch])
330
- category_table.setStyle(TableStyle([
331
- ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#1f4788')),
332
- ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
333
- ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
334
- ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
335
- ('FONTSIZE', (0, 0), (-1, 0), 11),
336
- ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
337
- ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
338
- ('GRID', (0, 0), (-1, -1), 1, colors.black)
339
- ]))
340
-
341
- elements.append(category_table)
342
- elements.append(PageBreak())
343
-
344
- # Detailed Recommendations
345
- elements.append(Paragraph("RECOMMENDATIONS", heading_style))
346
-
347
- missing_cats = [cat for cat, res in category_results.items() if not res['detected']]
348
- low_score_cats = [cat for cat, res in category_results.items()
349
- if res['detected'] and res['score'] and res['score'] < 3]
350
-
351
- if missing_cats:
352
- elements.append(Paragraph("<b>Missing Categories:</b>", styles['Heading3']))
353
- for cat in missing_cats:
354
- elements.append(Paragraph(f"β€’ Add content for {cat}: {CATEGORIES[cat]['description']}", styles['Normal']))
355
- elements.append(Paragraph(f" Include keywords: {', '.join(CATEGORIES[cat]['keywords'][:5])}...", styles['Normal']))
356
- elements.append(Spacer(1, 12))
357
-
358
- if low_score_cats:
359
- elements.append(Paragraph("<b>Areas for Improvement:</b>", styles['Heading3']))
360
- for cat in low_score_cats:
361
- score = category_results[cat]['score']
362
- elements.append(Paragraph(f"β€’ Improve {cat} (current score: {score}/4)", styles['Normal']))
363
- elements.append(Paragraph(f" Target: {CATEGORIES[cat]['rubric'][4]}", styles['Normal']))
364
- elements.append(Spacer(1, 12))
365
-
366
- if not missing_cats and not low_score_cats:
367
- elements.append(Paragraph("Excellent work! All categories are present with good scores.", styles['Normal']))
368
-
369
- # Segment Analysis Summary
370
- elements.append(PageBreak())
371
- elements.append(Paragraph("SEGMENT ANALYSIS", heading_style))
372
-
373
- for segment in segment_results[:10]: # Limit to first 10 segments
374
- elements.append(Paragraph(f"<b>Segment {segment['segment_num']}</b>", styles['Heading3']))
375
-
376
- detail_data = [
377
- ['Category', segment['category']],
378
- ['Score', f"{segment['score']}/4"],
379
- ['Confidence', f"{segment['confidence']:.1%}"]
380
- ]
381
-
382
- detail_table = Table(detail_data, colWidths=[1.5*inch, 4*inch])
383
- detail_table.setStyle(TableStyle([
384
- ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
385
- ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
386
- ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
387
- ('GRID', (0, 0), (-1, -1), 1, colors.black)
388
- ]))
389
-
390
- elements.append(detail_table)
391
- elements.append(Spacer(1, 6))
392
-
393
- text_preview = segment['text'][:200] + "..." if len(segment['text']) > 200 else segment['text']
394
- elements.append(Paragraph(f"<i>{text_preview}</i>", styles['Normal']))
395
- elements.append(Spacer(1, 12))
396
 
397
  # Build PDF
398
  doc.build(elements)
@@ -402,99 +620,187 @@ def create_pdf_report(segment_results, category_results, statement_text):
402
  # Main Application
403
  def main():
404
  st.title("πŸ₯ Medical School Personal Statement Analyzer")
405
- st.markdown("Faith Marie Kurtyka, Cole Krudwig, Sean Dore, Sara Avila, George (Guy) McHendry, Steven Fernandes")
406
  st.markdown("---")
407
 
408
- # Sidebar with information
409
- with st.sidebar:
410
- st.header("ℹ️ About This Tool")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  st.markdown("""
412
- This analyzer evaluates personal statements across 4 key categories:
 
 
 
 
 
413
 
414
- **πŸ“Œ Spark**
415
- Opening that shows your interest in medicine
 
 
416
 
417
- **πŸ₯ Healthcare Experience**
418
- Clinical and medical experiences
419
 
420
- **πŸ’ͺ Doctor Qualities**
421
- Leadership and character traits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
 
423
- **πŸ”— Spin**
424
- Connecting experiences to medical career
 
 
425
 
426
- Each category is scored 1-4:
427
- - 4 = Excellent
428
- - 3 = Good
429
- - 2 = Below Average
430
- - 1 = Poor
431
- """)
432
 
433
- st.markdown("---")
434
- st.markdown("### πŸ“Š Scoring Rubrics")
 
 
 
 
 
 
435
 
436
- for cat_name, cat_info in CATEGORIES.items():
437
- with st.expander(cat_name):
438
- for score in [4, 3, 2, 1]:
439
- st.write(f"**Score {score}:** {cat_info['rubric'][score]}")
440
-
441
- # Load model
442
- embedder = load_pretrained_model()
443
-
444
- # Main content area
445
- st.header("πŸ“ Upload Your Personal Statement")
446
-
447
- # Input method selection
448
- input_method = st.radio(
449
- "Choose input method:",
450
- ["Upload Text File (.txt)", "Paste Text Directly"],
451
- horizontal=True
452
- )
453
-
454
- statement_text = None
455
-
456
- if input_method == "Upload Text File (.txt)":
457
- uploaded_file = st.file_uploader(
458
- "Choose a text file",
459
- type=['txt'],
460
- help="Upload your personal statement as a .txt file"
461
  )
462
 
463
- if uploaded_file is not None:
464
- statement_text = str(uploaded_file.read(), 'utf-8')
465
- st.success(f"βœ… File uploaded successfully ({len(statement_text)} characters)")
466
-
467
- # Show preview
468
- with st.expander("Preview Statement"):
469
- st.text(statement_text[:500] + "..." if len(statement_text) > 500 else statement_text)
470
-
471
- else: # Paste Text Directly
472
- statement_text = st.text_area(
473
- "Paste your personal statement here:",
474
- height=400,
475
- placeholder="Enter your complete personal statement...",
476
- help="Paste your entire personal statement for analysis"
477
- )
478
 
479
- if statement_text:
480
- st.info(f"πŸ“Š Statement length: {len(statement_text)} characters, {len(statement_text.split())} words")
481
-
482
- # Analyze button
483
- if statement_text and len(statement_text) > 100:
484
- if st.button("πŸ”¬ Analyze Statement", type="primary", use_container_width=True):
485
-
486
- with st.spinner("Analyzing your personal statement..."):
487
- # Perform analysis
488
- segment_results, category_results = analyze_full_statement(statement_text, embedder)
489
-
490
- st.success("βœ… Analysis Complete!")
491
- st.balloons()
492
 
493
- # Display results in tabs
494
- tab1, tab2, tab3, tab4 = st.tabs(["πŸ“Š Summary", "πŸ“ Segments", "πŸ’‘ Recommendations", "πŸ“₯ Download Report"])
 
 
 
 
 
 
 
 
 
 
 
 
495
 
496
- with tab1:
497
- st.header("Overall Summary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
 
499
  # Metrics
500
  col1, col2, col3, col4 = st.columns(4)
@@ -502,11 +808,7 @@ def main():
502
  detected_cats = [cat for cat, res in category_results.items() if res['detected']]
503
 
504
  with col1:
505
- st.metric(
506
- "Categories Found",
507
- f"{len(detected_cats)}/4",
508
- delta=f"{len(detected_cats)-4}" if len(detected_cats) < 4 else "Complete"
509
- )
510
 
511
  with col2:
512
  if detected_cats:
@@ -521,126 +823,159 @@ def main():
521
  with col4:
522
  if detected_cats:
523
  avg_score = np.mean([category_results[cat]['score'] for cat in detected_cats])
524
- if avg_score >= 3.5:
525
- quality = "Excellent"
526
- color = "🟒"
527
- elif avg_score >= 2.5:
528
- quality = "Good"
529
- color = "🟑"
530
- else:
531
- quality = "Needs Work"
532
- color = "πŸ”΄"
533
- st.metric("Overall Quality", f"{color} {quality}")
534
  else:
535
  st.metric("Overall Quality", "N/A")
536
 
537
- # Category breakdown
538
- st.subheader("Category Analysis")
 
539
 
540
  for cat in CATEGORIES.keys():
541
  res = category_results[cat]
542
- col1, col2, col3, col4 = st.columns([3, 1, 1, 1])
543
-
544
- with col1:
545
- if res['detected']:
546
- st.write(f"βœ… **{cat}**")
547
- else:
548
- st.write(f"❌ **{cat}** *(Not detected)*")
549
-
550
- with col2:
551
- if res['detected']:
552
- st.write(f"Score: {res['score']}/4")
553
- else:
554
- st.write("Score: -")
555
-
556
- with col3:
557
- if res['detected']:
558
- st.write(f"Confidence: {res['confidence']:.1%}")
559
- else:
560
- st.write("Confidence: -")
561
-
562
- with col4:
563
- if res['detected']:
564
- st.write(f"Segments: {res['num_segments']}")
565
- else:
566
- st.write("Segments: 0")
567
-
568
  if res['detected']:
569
- # Progress bar for score
 
570
  st.progress(res['score'] / 4)
571
-
572
- with tab2:
573
- st.header("Segment-by-Segment Analysis")
 
 
 
 
574
 
575
  for segment in segment_results:
 
 
 
576
  with st.expander(f"Segment {segment['segment_num']}: {segment['category']} (Score: {segment['score']}/4)"):
577
  col1, col2 = st.columns([1, 3])
578
 
579
  with col1:
580
  st.metric("Category", segment['category'])
581
- st.metric("Score", f"{segment['score']}/4")
582
  st.metric("Confidence", f"{segment['confidence']:.1%}")
583
 
584
  with col2:
585
  st.write("**Text:**")
586
- st.write(segment['text'])
587
 
588
- # Show rubric for this score
589
- st.write("**Rubric for this score:**")
590
- st.info(CATEGORIES[segment['category']]['rubric'][segment['score']])
591
-
592
- with tab3:
593
- st.header("Recommendations for Improvement")
 
594
 
595
  missing_cats = [cat for cat, res in category_results.items() if not res['detected']]
596
  low_score_cats = [cat for cat, res in category_results.items()
597
  if res['detected'] and res['score'] and res['score'] < 3]
598
 
599
  if missing_cats:
600
- st.error("🚨 **Missing Categories - Must Add:**")
601
  for cat in missing_cats:
602
- st.write(f"### {cat}")
603
- st.write(f"**Description:** {CATEGORIES[cat]['description']}")
604
- st.write(f"**Keywords to include:** {', '.join(CATEGORIES[cat]['keywords'][:8])}")
605
- st.write(f"**Target Quality:** {CATEGORIES[cat]['rubric'][4]}")
606
- st.write("---")
607
 
608
  if low_score_cats:
609
- st.warning("⚠️ **Low-Scoring Categories - Should Improve:**")
610
  for cat in low_score_cats:
611
- current_score = category_results[cat]['score']
612
- st.write(f"### {cat}")
613
- st.write(f"**Current Score:** {current_score}/4")
614
- st.write(f"**Current Level:** {CATEGORIES[cat]['rubric'][current_score]}")
615
- st.write(f"**Target Level:** {CATEGORIES[cat]['rubric'][4]}")
616
- st.write(f"**Improvement Tips:** Add more {', '.join(CATEGORIES[cat]['keywords'][:5])}")
617
- st.write("---")
618
 
619
  if not missing_cats and not low_score_cats:
620
- st.success("πŸŽ‰ Excellent work! All categories are present with good scores.")
621
- st.write("Your personal statement effectively covers all required elements.")
622
-
623
- with tab4:
624
- st.header("Download Analysis Report")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
625
 
626
- # Generate PDF
627
- pdf_buffer = create_pdf_report(segment_results, category_results, statement_text)
 
 
 
 
 
 
 
 
 
 
628
 
629
- # Download button
630
- st.download_button(
631
- label="πŸ“₯ Download PDF Report",
632
- data=pdf_buffer,
633
- file_name=f"personal_statement_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf",
634
- mime="application/pdf",
635
- use_container_width=True
636
- )
637
 
638
- st.info("The PDF report includes detailed analysis, scores, and recommendations for your personal statement.")
639
-
640
- elif statement_text and len(statement_text) <= 100:
641
- st.warning("⚠️ Please enter a longer statement (minimum 100 characters) for meaningful analysis.")
642
- else:
643
- st.info("πŸ‘† Please upload or paste your personal statement to begin analysis.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
644
 
645
  # Run the application
646
  if __name__ == "__main__":
 
6
  import re
7
  from datetime import datetime
8
  from io import BytesIO
9
+ import warnings
10
+ warnings.filterwarnings('ignore')
11
 
12
  # Page config MUST be first
13
  st.set_page_config(
14
  page_title="Medical School Personal Statement Analyzer",
15
  page_icon="πŸ₯",
16
+ layout="wide",
17
+ initial_sidebar_state="expanded"
18
  )
19
 
20
  # Import ML libraries
21
+ from sentence_transformers import SentenceTransformer, util
22
+ from sklearn.model_selection import train_test_split
23
  from sklearn.preprocessing import StandardScaler
 
24
  from sklearn.metrics.pairwise import cosine_similarity
25
+ from sklearn.ensemble import RandomForestClassifier
26
  import xgboost as xgb
27
+ import torch
28
 
29
  # Import PDF generation libraries
30
+ try:
31
+ from reportlab.lib import colors
32
+ from reportlab.lib.pagesizes import letter
33
+ from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak
34
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
35
+ from reportlab.lib.units import inch
36
+ from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
37
+ PDF_AVAILABLE = True
38
+ except ImportError:
39
+ PDF_AVAILABLE = False
40
 
41
+ # Categories with detailed rubric alignment
42
  CATEGORIES = {
43
  'Spark': {
44
  'description': 'Opening that spurs interest in medicine (typically in opening paragraph)',
 
55
  2: 'somewhat connected but unclear',
56
  3: 'connected and clear',
57
  4: 'engaging and logically flows into becoming a doctor'
58
+ },
59
+ 'rubric_features': {
60
+ 'positive': ['engaging', 'logical', 'clear connection', 'compelling', 'authentic'],
61
+ 'negative': ['disconnected', 'confusing', 'random', 'unclear', 'generic']
62
  }
63
  },
64
  'Healthcare Experience': {
 
76
  2: 'bland/boring but not problematic',
77
  3: 'interesting and relevant',
78
  4: 'vivid, active, thoughtful, relevant, memorable, positive'
79
+ },
80
+ 'rubric_features': {
81
+ 'positive': ['vivid', 'active', 'thoughtful', 'memorable', 'optimistic', 'engaged'],
82
+ 'negative': ['passive', 'uninteresting', 'irrelevant', 'problematic', 'pessimistic']
83
  }
84
  },
85
  'Showing Doctor Qualities': {
 
97
  2: 'bland/boring but not problematic',
98
  3: 'shows some understanding',
99
  4: 'realistic, self-aware, mature, humble, specific understanding'
100
+ },
101
+ 'rubric_features': {
102
+ 'positive': ['realistic', 'self-aware', 'mature', 'humble', 'specific', 'clear'],
103
+ 'negative': ['arrogant', 'immature', 'overly confident', 'simplistic', 'inaccurate']
104
  }
105
  },
106
  'Spin': {
 
117
  2: 'some connection but generic',
118
  3: 'clear connection',
119
  4: 'direct, logical, and specific argument'
120
+ },
121
+ 'rubric_features': {
122
+ 'positive': ['direct', 'logical', 'specific', 'clear argument', 'compelling'],
123
+ 'negative': ['brief', 'vague', 'simplistic', 'generic', 'weak']
124
  }
125
  }
126
  }
127
 
128
@st.cache_resource
def load_sentence_transformer():
    """Load a sentence-transformer embedding model, cached across reruns.

    Tries the higher-quality 'intfloat/e5-large-v2' model first and falls
    back to the lighter 'all-MiniLM-L6-v2' if that load fails (e.g. download
    or memory error).

    Returns:
        tuple: (model, model_name) on success, or (None, None) if neither
        model could be loaded; in that case the error is surfaced in the
        Streamlit UI via st.error.
    """
    try:
        # Try to load the preferred (larger, higher-quality) model.
        model = SentenceTransformer('intfloat/e5-large-v2')
        return model, 'intfloat/e5-large-v2'
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
        # propagate. Fall back to a lighter model if e5-large-v2 fails.
        try:
            model = SentenceTransformer('all-MiniLM-L6-v2')
            return model, 'all-MiniLM-L6-v2'
        except Exception as e:
            st.error(f"Failed to load transformer: {e}")
            return None, None
143
+
144
def load_training_data_from_files():
    """Load and combine labeled training data from the two Excel files.

    Reads the Dedoose export and the hand-coded statements workbook, merges
    them, and normalizes each row into a flat record with the excerpt text
    plus, per category, an `<cat>_applied` boolean and an `<cat>_score`
    integer (0 when not applied, otherwise clamped to 1-4).

    Returns:
        pandas.DataFrame: One row per usable excerpt, or None when the
        files are missing or any unexpected error occurs (the error is
        shown via st.error).
    """
    try:
        # File paths for the Excel files (expected next to the app).
        file1_path = "DedooseChartExcerpts_2025_8_5_1025.xlsx"
        file2_path = "Personal Statements Coded.xlsx"

        # Both files are required; silently signal "no data" otherwise.
        if not os.path.exists(file1_path) or not os.path.exists(file2_path):
            return None

        df1 = pd.read_excel(file1_path)
        df2 = pd.read_excel(file2_path)

        # Stack the two sources; ignore_index gives a clean RangeIndex.
        combined_df = pd.concat([df1, df2], ignore_index=True)

        processed_data = []

        for _, row in combined_df.iterrows():
            text = None
            # The two workbooks use different column names for the excerpt
            # text; take the first non-empty candidate.
            for col_name in ['Excerpt Copy', 'Excerpt', 'Text', 'Content']:
                if col_name in row and pd.notna(row[col_name]):
                    text = str(row[col_name])
                    break

            if not text or text.strip() == '':
                continue

            data_point = {
                'text': text.strip(),
                'media_title': row.get('Media Title', 'Unknown')
            }

            # Per-category label columns follow the Dedoose export naming.
            for category in CATEGORIES.keys():
                col_applied = f"Code: {category} Applied"
                col_weight = f"Code: {category} Weight"

                is_applied = False
                if col_applied in row:
                    # Accept several truthy spellings from the export.
                    applied_val = str(row[col_applied]).lower()
                    is_applied = applied_val in ['true', '1', 'yes', 't']

                data_point[f"{category}_applied"] = is_applied

                if is_applied and col_weight in row:
                    weight = row[col_weight]
                    if pd.isna(weight) or weight == '':
                        # Applied but unweighted: default to a middling 2.
                        weight = 2
                    else:
                        try:
                            weight = int(float(weight))
                            weight = max(1, min(4, weight))  # clamp to rubric 1-4
                        except (ValueError, TypeError):
                            # Narrowed from a bare `except:`; these are the
                            # only errors int(float(...)) raises on bad cells.
                            weight = 2
                else:
                    # Not applied -> score 0 (distinct from rubric 1-4).
                    weight = 0

                data_point[f"{category}_score"] = weight

            processed_data.append(data_point)

        return pd.DataFrame(processed_data)

    except Exception as e:
        st.error(f"Error loading training data: {str(e)}")
        return None
214
 
215
def segment_text(text, embedder):
    """Segment text using semantic similarity.

    Strategy: split on blank lines first; if that yields at most one usable
    paragraph, fall back to sentence-level splitting and merge consecutive
    sentences into segments while their embeddings remain similar
    (cosine >= 0.7) and the running segment stays under ~500 characters.

    Args:
        text: The full personal-statement text.
        embedder: Sentence-transformer model providing .encode(); its
            output is used with util.cos_sim, so tensors are expected.

    Returns:
        list[str]: Non-empty text segments (falls back to [text] when
        there are fewer than 3 sentences to work with).
    """
    # Primary split: paragraphs separated by blank lines, dropping
    # fragments of 50 characters or fewer.
    paragraphs = re.split(r'\n\s*\n', text)
    paragraphs = [p.strip() for p in paragraphs if p.strip() and len(p.strip()) > 50]

    if len(paragraphs) <= 1:
        # Fallback: split on sentence-terminating punctuation, keeping
        # only sentences longer than 20 characters.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 20]

        if len(sentences) < 3:
            # Too little material to segment meaningfully.
            return [text]

        # Use embeddings for semantic segmentation
        embeddings = embedder.encode(sentences, convert_to_tensor=True)

        segments = []
        current_segment = [sentences[0]]
        current_embedding = embeddings[0]

        for i in range(1, len(sentences)):
            similarity = util.cos_sim(current_embedding, embeddings[i]).item()

            # Break on a topic shift (low similarity) or when the segment
            # grows past ~500 characters.
            if similarity < 0.7 or len(' '.join(current_segment)) > 500:
                segments.append(' '.join(current_segment))
                current_segment = [sentences[i]]
                current_embedding = embeddings[i]
            else:
                current_segment.append(sentences[i])
                # Averaging pairwise each step weights recent sentences
                # exponentially more than early ones — intentional-looking
                # heuristic, but NOTE(review): confirm this drift is desired.
                current_embedding = (current_embedding + embeddings[i]) / 2

        # Flush the trailing segment.
        if current_segment:
            segments.append(' '.join(current_segment))

        return segments

    return paragraphs
251
 
252
def extract_features(text, embedder, category_focus=None):
    """Extract a numeric feature vector for classification.

    The vector concatenates: basic text statistics, per-category keyword /
    regex-pattern / rubric-word counts (keyword density doubled for the
    focused category), an optional text-vs-category embedding similarity,
    and the (truncated) sentence embedding itself.

    Args:
        text: Segment text to featurize.
        embedder: Sentence-transformer model providing .encode().
        category_focus: Optional category name from CATEGORIES whose
            keyword density is boosted and whose description-similarity
            feature is computed; otherwise that feature is 0.

    Returns:
        numpy.ndarray: float32 hand-crafted features concatenated with the
        embedding (embedding capped at 512 dims, or zeros on failure).
    """
    features = []
    text_lower = text.lower()
    words = text.split()

    # Basic text statistics (max(..., 1) guards division on empty text).
    features.extend([
        len(text),
        len(words),
        len(set(words)) / max(len(words), 1),     # lexical diversity
        len(re.findall(r'[.!?]', text)),          # sentence-ish count
        text.count('I') / max(len(words), 1),     # first-person density
    ])

    # Per-category hand-crafted features, in CATEGORIES order.
    for cat_name, cat_info in CATEGORIES.items():
        keywords = cat_info['keywords']
        keyword_matches = sum(1 for kw in keywords if kw.lower() in text_lower)
        keyword_density = keyword_matches / max(len(keywords), 1)

        # Boost the category this feature vector is being built for.
        if category_focus == cat_name:
            keyword_density *= 2

        features.append(keyword_density * 10)

        pattern_matches = 0
        for pattern in cat_info.get('patterns', []):
            matches = re.findall(pattern, text_lower)
            pattern_matches += len(matches)
        features.append(pattern_matches)

        positive_count = sum(1 for word in cat_info['rubric_features']['positive']
                             if word in text_lower)
        negative_count = sum(1 for word in cat_info['rubric_features']['negative']
                             if word in text_lower)

        # Rubric-word rates per 100 words.
        features.extend([
            positive_count / max(len(words), 1) * 100,
            negative_count / max(len(words), 1) * 100
        ])

    # Text embedding; zeros fallback keeps the vector length stable.
    # Narrowed from a bare `except:` so interrupts still propagate.
    try:
        embedding = embedder.encode(text, convert_to_tensor=False, normalize_embeddings=True)
        if hasattr(embedding, 'cpu'):
            embedding = embedding.cpu().numpy()
        embedding = embedding.flatten()
        # Limit embedding size for memory efficiency
        embedding = embedding[:512] if len(embedding) > 512 else embedding
    except Exception:
        embedding = np.zeros(512)

    # Similarity between the text and the focused category's description.
    if category_focus and category_focus in CATEGORIES:
        category_text = f"{CATEGORIES[category_focus]['description']} {' '.join(CATEGORIES[category_focus]['keywords'][:10])}"
        try:
            category_embedding = embedder.encode(category_text, normalize_embeddings=True)
            if hasattr(category_embedding, 'cpu'):
                category_embedding = category_embedding.cpu().numpy()
            category_embedding = category_embedding.flatten()[:512]
            similarity = cosine_similarity([embedding[:512]], [category_embedding])[0][0]
            features.append(similarity * 10)
        except Exception:
            # Narrowed from a bare `except:`; missing similarity -> 0.
            features.append(0)
    else:
        features.append(0)

    features = np.array(features, dtype=np.float32)
    combined_features = np.concatenate([features, embedding])

    return combined_features
324
+
325
def train_models(df, embedder):
    """Train per-category ensemble classifiers and score regressors.

    For each row, builds a feature vector via extract_features() focused on
    the row's first applied category (or the mean over all category-focused
    vectors when none applied), then trains, per category: an XGBoost
    classifier when >= 5 positives exist, always a balanced RandomForest,
    and an XGBoost regressor (or constant DummyRegressor) for the 1-4
    rubric score normalized to [0, 1].

    Args:
        df: Training DataFrame from load_training_data_from_files(); must
            have 'text', '<cat>_applied', '<cat>_score' columns.
            # assumes a clean RangeIndex — `idx` below is the index label,
            # so progress would misbehave on a non-default index; TODO confirm
        embedder: Sentence-transformer model passed to extract_features().

    Returns:
        tuple: (scaler, classifiers, scorers, thresholds, accuracies,
        ensemble) — StandardScaler fit on train split; dicts keyed by
        category; per-category 0.5 thresholds; held-out accuracies in
        CATEGORIES order; and {cat: [(name, model), ...]} ensembles.
    """
    all_features = []

    # Streamlit UI feedback while features are extracted / models trained.
    progress_bar = st.progress(0)
    status_text = st.empty()

    status_text.text("Extracting features from training data...")

    for idx, row in df.iterrows():
        text = row['text']

        # One focused feature vector per category for this text.
        category_features = {}
        for cat in CATEGORIES.keys():
            features = extract_features(text, embedder, category_focus=cat)
            category_features[cat] = features

        true_categories = [cat for cat in CATEGORIES.keys() if row[f"{cat}_applied"]]

        if true_categories:
            # Use the vector focused on the first applied category.
            features = category_features[true_categories[0]]
        else:
            # No label: average the category-focused vectors.
            features = np.mean(list(category_features.values()), axis=0)

        all_features.append(features)
        progress_bar.progress((idx + 1) / len(df))

    X = np.array(all_features)

    categories = list(CATEGORIES.keys())
    # Multi-label target matrix: one 0/1 column per category.
    y_class = df[[f"{cat}_applied" for cat in categories]].values.astype(float)

    # Score targets normalized from the 1-4 rubric to [0.25, 1]; 0 when
    # the category is not applied.
    y_score = []
    for _, row in df.iterrows():
        scores = []
        for cat in categories:
            if row[f"{cat}_applied"]:
                scores.append(row[f"{cat}_score"] / 4.0)
            else:
                scores.append(0)
        y_score.append(scores)
    y_score = np.array(y_score)

    status_text.text("Training models...")

    # Split data
    X_train, X_test, y_class_train, y_class_test, y_score_train, y_score_test = train_test_split(
        X, y_class, y_score, test_size=0.2, random_state=42
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train classifiers and scorers
    classifiers = {}
    scorers = {}
    thresholds = {}
    ensemble = {}

    for i, cat in enumerate(categories):
        n_positive = np.sum(y_class_train[:, i])

        models = []

        # XGBoost classifier — only when enough positive examples exist.
        if n_positive >= 5:
            xgb_clf = xgb.XGBClassifier(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.1,
                random_state=42,
                # NOTE(review): use_label_encoder is deprecated/removed in
                # newer xgboost releases — confirm the pinned version.
                use_label_encoder=False,
                eval_metric='logloss'
            )
            xgb_clf.fit(X_train_scaled, y_class_train[:, i])
            models.append(('xgb', xgb_clf))
            classifiers[cat] = xgb_clf

        # Random Forest as backup or ensemble member
        rf_clf = RandomForestClassifier(
            n_estimators=100,
            max_depth=6,
            class_weight='balanced',
            random_state=42
        )
        rf_clf.fit(X_train_scaled, y_class_train[:, i])
        models.append(('rf', rf_clf))

        # With too few positives, RF alone is the primary classifier.
        if n_positive < 5:
            classifiers[cat] = rf_clf

        ensemble[cat] = models
        thresholds[cat] = 0.5

        # Train scorer on positive examples only (where a rubric score exists).
        mask = y_class_train[:, i] == 1
        if np.sum(mask) > 5:
            scorer = xgb.XGBRegressor(
                n_estimators=100,
                max_depth=4,
                random_state=42
            )
            scorer.fit(X_train_scaled[mask], y_score_train[mask, i])
        else:
            # Too few positives: constant mid-range prediction (0.5).
            from sklearn.dummy import DummyRegressor
            scorer = DummyRegressor(strategy='constant', constant=0.5)
            scorer.fit(X_train_scaled, y_score_train[:, i])

        scorers[cat] = scorer

    # Calculate accuracies on the held-out split, per category.
    accuracies = []
    for i, cat in enumerate(categories):
        preds = classifiers[cat].predict(X_test_scaled)
        acc = np.mean(preds == y_class_test[:, i])
        accuracies.append(acc)

    # Clear the transient UI widgets.
    status_text.empty()
    progress_bar.empty()

    return scaler, classifiers, scorers, thresholds, accuracies, ensemble
448
+
449
def classify_segment(text, embedder, scaler, classifiers, scorers, thresholds, ensemble=None):
    """Classify a text segment into one rubric category and score it.

    Args:
        text: The segment text to classify.
        embedder: Sentence-transformer model used by ``extract_features``.
        scaler: Fitted ``StandardScaler`` for the feature vectors.
        classifiers: Per-category classifiers (dict keyed by category name).
        scorers: Per-category regressors predicting a normalized score in [0, 1].
        thresholds: Per-category detection thresholds (probability cutoffs).
        ensemble: Optional dict mapping category -> list of (name, model)
            pairs; when present for a category, member probabilities are
            averaged instead of using the single classifier.

    Returns:
        dict with keys ``category``, ``score`` (1-4 or None), ``confidence``,
        ``text`` and ``all_probabilities``. A segment whose best probability
        does not clear its threshold is returned as 'Unclassified' with
        ``score`` None and ``confidence`` 0.
    """
    categories = list(CATEGORIES.keys())
    category_results = {}
    # Cache the scaled feature vector per category so the winning category's
    # scorer can reuse it instead of re-running the (expensive) embedding a
    # second time, as the original implementation did.
    scaled_features = {}

    for cat in categories:
        features = extract_features(text, embedder, category_focus=cat)
        features_scaled = scaler.transform([features])
        scaled_features[cat] = features_scaled

        if ensemble and cat in ensemble:
            # Average the positive-class probability over ensemble members
            # that expose predict_proba and were trained on both classes.
            probs = []
            for _name, model in ensemble[cat]:
                if hasattr(model, 'predict_proba'):
                    model_probs = model.predict_proba(features_scaled)
                    if model_probs.shape[1] == 2:
                        probs.append(model_probs[0, 1])
            avg_prob = np.mean(probs) if probs else 0.5
        elif hasattr(classifiers[cat], 'predict_proba'):
            probs = classifiers[cat].predict_proba(features_scaled)
            # A single proba column means the model saw only one class during
            # training; fall back to a neutral probability in that case.
            avg_prob = probs[0, 1] if probs.shape[1] == 2 else 0.5
        else:
            avg_prob = 0.5

        category_results[cat] = avg_prob

    best_category = max(category_results, key=category_results.get)
    best_prob = category_results[best_category]

    # Below threshold: report the segment as unclassified (guard clause).
    if best_prob <= thresholds.get(best_category, 0.5):
        return {
            'category': 'Unclassified',
            'score': None,
            'confidence': 0,
            'text': text,
            'all_probabilities': category_results
        }

    try:
        # Scorers predict a normalized value in [0, 1]; map it onto the
        # 1-4 rubric scale, clamping to the valid range.
        score_normalized = scorers[best_category].predict(scaled_features[best_category])[0]
        score = int(np.clip(np.round(score_normalized * 4), 1, 4))
    except Exception:
        # Narrowed from a bare `except:` — only swallow genuine prediction
        # failures, not KeyboardInterrupt/SystemExit. Default to mid-range.
        score = 2

    return {
        'category': best_category,
        'score': score,
        'confidence': float(best_prob),
        'text': text,
        'all_probabilities': category_results
    }
510
 
511
+ def analyze_statement(text, embedder, scaler, classifiers, scorers, thresholds, ensemble=None):
512
  """Analyze complete personal statement"""
513
+ segments = segment_text(text, embedder)
514
 
515
  segment_results = []
516
  for i, segment in enumerate(segments):
517
+ result = classify_segment(segment, embedder, scaler, classifiers, scorers, thresholds, ensemble)
518
  result['segment_num'] = i + 1
519
  segment_results.append(result)
520
 
 
545
 
546
  return segment_results, category_results
547
 
548
+ def create_pdf_report(segment_results, category_results):
549
+ """Create PDF report"""
550
+ if not PDF_AVAILABLE:
551
+ return None
552
+
553
  buffer = BytesIO()
554
  doc = SimpleDocTemplate(buffer, pagesize=letter, rightMargin=72, leftMargin=72,
555
  topMargin=72, bottomMargin=18)
 
611
  ]))
612
 
613
  elements.append(summary_table)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
614
 
615
  # Build PDF
616
  doc.build(elements)
 
620
  # Main Application
621
  def main():
622
  st.title("πŸ₯ Medical School Personal Statement Analyzer")
623
+ st.markdown("*AI-powered analysis based on medical school admission rubrics*")
624
  st.markdown("---")
625
 
626
+ # Initialize session state
627
+ if 'model_trained' not in st.session_state:
628
+ st.session_state.model_trained = False
629
+ if 'embedder' not in st.session_state:
630
+ st.session_state.embedder = None
631
+ if 'scaler' not in st.session_state:
632
+ st.session_state.scaler = None
633
+ if 'classifiers' not in st.session_state:
634
+ st.session_state.classifiers = None
635
+ if 'scorers' not in st.session_state:
636
+ st.session_state.scorers = None
637
+ if 'thresholds' not in st.session_state:
638
+ st.session_state.thresholds = None
639
+ if 'ensemble' not in st.session_state:
640
+ st.session_state.ensemble = None
641
+
642
+ # Create three tabs
643
+ tab1, tab2, tab3 = st.tabs(["πŸ“š Step 1: Train Model", "πŸ“ Step 2: Analyze Statements", "πŸ“Š Step 3: View Rubrics"])
644
+
645
+ # STEP 1: TRAIN MODEL
646
+ with tab1:
647
+ st.header("Step 1: Train the AI Model")
648
  st.markdown("""
649
+ ### Instructions:
650
+ Click the 'Train Model' button to automatically train the AI using:
651
+ - Pre-loaded Excel training files
652
+ - State-of-the-art e5-large-v2 transformer model
653
+ - Ensemble classification algorithms
654
+ """)
655
 
656
+ # Check if models already exist in session
657
+ if st.session_state.model_trained:
658
+ st.success("βœ… Model is already trained and ready for analysis!")
659
+ st.info("You can proceed to Step 2 to analyze statements, or retrain if needed.")
660
 
661
+ st.markdown("---")
 
662
 
663
+ # Train button
664
+ if st.button("πŸš€ Train Model", type="primary", use_container_width=True):
665
+ # Load training data
666
+ with st.spinner("Loading training data from Excel files..."):
667
+ df = load_training_data_from_files()
668
+
669
+ if df is None or df.empty:
670
+ st.error("""
671
+ ❌ Could not load training data. Please ensure these files are present:
672
+ - DedooseChartExcerpts_2025_8_5_1025.xlsx
673
+ - Personal Statements Coded.xlsx
674
+ """)
675
+ st.stop()
676
+
677
+ st.success(f"βœ… Loaded {len(df)} training samples")
678
+
679
+ # Show data distribution
680
+ st.subheader("Training Data Distribution:")
681
+ dist_cols = st.columns(4)
682
+ for idx, cat in enumerate(CATEGORIES.keys()):
683
+ if f"{cat}_applied" in df.columns:
684
+ count = df[f"{cat}_applied"].sum()
685
+ with dist_cols[idx % 4]:
686
+ st.metric(cat, f"{int(count)} samples")
687
+
688
+ # Load transformer model
689
+ with st.spinner("Loading e5-large-v2 transformer model..."):
690
+ if st.session_state.embedder is None:
691
+ embedder, embedder_name = load_sentence_transformer()
692
+ st.session_state.embedder = embedder
693
+ else:
694
+ embedder = st.session_state.embedder
695
+ embedder_name = 'intfloat/e5-large-v2'
696
+
697
+ if embedder is None:
698
+ st.error("Failed to load transformer model")
699
+ st.stop()
700
+
701
+ st.info(f"Using model: {embedder_name}")
702
+
703
+ # Train models
704
+ st.subheader("Training Progress:")
705
+ scaler, classifiers, scorers, thresholds, accuracies, ensemble = train_models(df, embedder)
706
+
707
+ # Save to session state
708
+ st.session_state.scaler = scaler
709
+ st.session_state.classifiers = classifiers
710
+ st.session_state.scorers = scorers
711
+ st.session_state.thresholds = thresholds
712
+ st.session_state.ensemble = ensemble
713
+ st.session_state.model_trained = True
714
+
715
+ st.success("βœ… Training Complete!")
716
+
717
+ # Show performance metrics
718
+ st.subheader("Model Performance:")
719
+ metrics_cols = st.columns(4)
720
+ for idx, (cat, acc) in enumerate(zip(CATEGORIES.keys(), accuracies)):
721
+ with metrics_cols[idx % 4]:
722
+ st.metric(cat, f"{acc:.1%} accuracy")
723
+
724
+ avg_accuracy = np.mean(accuracies)
725
+ st.metric("**Overall Model Accuracy**", f"{avg_accuracy:.1%}")
726
+
727
+ st.balloons()
728
+
729
+ # STEP 2: ANALYZE STATEMENTS
730
+ with tab2:
731
+ st.header("Step 2: Analyze Personal Statements")
732
 
733
+ # Check if models are trained
734
+ if not st.session_state.model_trained:
735
+ st.warning("⚠️ No trained models found. Please complete Step 1: Train Model first.")
736
+ st.stop()
737
 
738
+ st.success("βœ… Models loaded successfully")
 
 
 
 
 
739
 
740
+ st.markdown("""
741
+ ### Instructions:
742
+ Upload or paste a personal statement to receive:
743
+ - Category detection and scoring (1-4)
744
+ - Segment-by-segment analysis
745
+ - Detailed recommendations
746
+ - Downloadable PDF report
747
+ """)
748
 
749
+ # Input method selection
750
+ input_method = st.radio(
751
+ "Choose input method:",
752
+ ["Upload Text File (.txt)", "Paste Text Directly"],
753
+ horizontal=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
754
  )
755
 
756
+ statement_text = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
757
 
758
+ if input_method == "Upload Text File (.txt)":
759
+ uploaded_file = st.file_uploader(
760
+ "Choose a text file",
761
+ type=['txt'],
762
+ help="Upload your personal statement as a .txt file"
763
+ )
 
 
 
 
 
 
 
764
 
765
+ if uploaded_file is not None:
766
+ statement_text = str(uploaded_file.read(), 'utf-8')
767
+ st.success(f"βœ… File uploaded ({len(statement_text)} characters)")
768
+
769
+ with st.expander("Preview Statement"):
770
+ st.text(statement_text[:500] + "..." if len(statement_text) > 500 else statement_text)
771
+
772
+ else: # Paste Text Directly
773
+ statement_text = st.text_area(
774
+ "Paste your personal statement here:",
775
+ height=400,
776
+ placeholder="Enter your complete personal statement...",
777
+ help="Paste your entire personal statement for analysis"
778
+ )
779
 
780
+ if statement_text:
781
+ st.info(f"πŸ“Š Statement length: {len(statement_text)} characters, {len(statement_text.split())} words")
782
+
783
+ # Analyze button
784
+ if statement_text and len(statement_text) > 100:
785
+ if st.button("πŸ”¬ Analyze Statement", type="primary", use_container_width=True):
786
+
787
+ with st.spinner("Analyzing your personal statement..."):
788
+ segment_results, category_results = analyze_statement(
789
+ statement_text,
790
+ st.session_state.embedder,
791
+ st.session_state.scaler,
792
+ st.session_state.classifiers,
793
+ st.session_state.scorers,
794
+ st.session_state.thresholds,
795
+ st.session_state.ensemble
796
+ )
797
+
798
+ st.success("βœ… Analysis Complete!")
799
+ st.balloons()
800
+
801
+ # Display results
802
+ st.markdown("---")
803
+ st.subheader("πŸ“Š Overall Summary")
804
 
805
  # Metrics
806
  col1, col2, col3, col4 = st.columns(4)
 
808
  detected_cats = [cat for cat, res in category_results.items() if res['detected']]
809
 
810
  with col1:
811
+ st.metric("Categories Found", f"{len(detected_cats)}/4")
 
 
 
 
812
 
813
  with col2:
814
  if detected_cats:
 
823
  with col4:
824
  if detected_cats:
825
  avg_score = np.mean([category_results[cat]['score'] for cat in detected_cats])
826
+ quality = "Excellent" if avg_score >= 3.5 else "Good" if avg_score >= 2.5 else "Needs Work"
827
+ st.metric("Overall Quality", quality)
 
 
 
 
 
 
 
 
828
  else:
829
  st.metric("Overall Quality", "N/A")
830
 
831
+ # Category Analysis
832
+ st.markdown("---")
833
+ st.subheader("πŸ“‹ Category Analysis")
834
 
835
  for cat in CATEGORIES.keys():
836
  res = category_results[cat]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
837
  if res['detected']:
838
+ icon = "βœ…" if res['score'] >= 3 else "⚠️" if res['score'] >= 2 else "❌"
839
+ st.write(f"{icon} **{cat}**: Score {res['score']}/4 (Confidence: {res['confidence']:.1%})")
840
  st.progress(res['score'] / 4)
841
+ else:
842
+ st.write(f"❌ **{cat}**: Not detected")
843
+ st.progress(0)
844
+
845
+ # Segment Details
846
+ st.markdown("---")
847
+ st.subheader("πŸ“ Segment-by-Segment Analysis")
848
 
849
  for segment in segment_results:
850
+ quality_map = {1: "Poor", 2: "Below Average", 3: "Good", 4: "Excellent", None: "N/A"}
851
+ quality = quality_map.get(segment['score'], "N/A")
852
+
853
  with st.expander(f"Segment {segment['segment_num']}: {segment['category']} (Score: {segment['score']}/4)"):
854
  col1, col2 = st.columns([1, 3])
855
 
856
  with col1:
857
  st.metric("Category", segment['category'])
858
+ st.metric("Score", f"{segment['score']}/4" if segment['score'] else "N/A")
859
  st.metric("Confidence", f"{segment['confidence']:.1%}")
860
 
861
  with col2:
862
  st.write("**Text:**")
863
+ st.write(segment['text'][:500] + "..." if len(segment['text']) > 500 else segment['text'])
864
 
865
+ if segment['category'] != 'Unclassified' and segment['score']:
866
+ st.write("**Rubric:**")
867
+ st.info(CATEGORIES[segment['category']]['rubric'][segment['score']])
868
+
869
+ # Recommendations
870
+ st.markdown("---")
871
+ st.subheader("πŸ’‘ Recommendations")
872
 
873
  missing_cats = [cat for cat, res in category_results.items() if not res['detected']]
874
  low_score_cats = [cat for cat, res in category_results.items()
875
  if res['detected'] and res['score'] and res['score'] < 3]
876
 
877
  if missing_cats:
878
+ st.error("**Missing Categories - Must Add:**")
879
  for cat in missing_cats:
880
+ st.write(f"**{cat}:** {CATEGORIES[cat]['description']}")
881
+ st.write(f"Keywords: {', '.join(CATEGORIES[cat]['keywords'][:8])}")
 
 
 
882
 
883
  if low_score_cats:
884
+ st.warning("**Low-Scoring Categories - Improve:**")
885
  for cat in low_score_cats:
886
+ score = category_results[cat]['score']
887
+ st.write(f"**{cat}** (Score: {score}/4)")
888
+ st.write(f"Target: {CATEGORIES[cat]['rubric'][4]}")
 
 
 
 
889
 
890
  if not missing_cats and not low_score_cats:
891
+ st.success("Excellent! All categories present with good scores.")
892
+
893
+ # Download Report
894
+ st.markdown("---")
895
+ if PDF_AVAILABLE:
896
+ pdf_buffer = create_pdf_report(segment_results, category_results)
897
+ if pdf_buffer:
898
+ st.download_button(
899
+ label="πŸ“₯ Download PDF Report",
900
+ data=pdf_buffer,
901
+ file_name=f"analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf",
902
+ mime="application/pdf",
903
+ use_container_width=True
904
+ )
905
+ else:
906
+ # CSV fallback
907
+ results_data = []
908
+ for seg in segment_results:
909
+ results_data.append({
910
+ 'Segment': seg['segment_num'],
911
+ 'Category': seg['category'],
912
+ 'Score': seg['score'],
913
+ 'Confidence': seg['confidence']
914
+ })
915
+
916
+ results_df = pd.DataFrame(results_data)
917
+ csv = results_df.to_csv(index=False)
918
+
919
+ st.download_button(
920
+ label="πŸ“₯ Download CSV Report",
921
+ data=csv,
922
+ file_name=f"analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
923
+ mime="text/csv",
924
+ use_container_width=True
925
+ )
926
+
927
+ elif statement_text and len(statement_text) <= 100:
928
+ st.warning("⚠️ Please enter a longer statement (minimum 100 characters)")
929
+ else:
930
+ st.info("πŸ‘† Please upload or paste your personal statement to begin analysis")
931
+
932
+ # STEP 3: VIEW RUBRICS
933
+ with tab3:
934
+ st.header("Step 3: Understanding the Scoring Rubrics")
935
+
936
+ st.markdown("""
937
+ The AI model evaluates personal statements based on **4 key categories**,
938
+ each scored on a scale of **1 (Poor) to 4 (Excellent)**.
939
+ """)
940
+
941
+ for category, info in CATEGORIES.items():
942
+ with st.expander(f"**{category}** - {info['description']}", expanded=False):
943
 
944
+ # Scoring Criteria
945
+ st.subheader("Scoring Criteria:")
946
+ for score in [4, 3, 2, 1]:
947
+ quality = ['Poor', 'Below Average', 'Good', 'Excellent'][score-1]
948
+ if score == 4:
949
+ st.success(f"**Score {score} ({quality}):** {info['rubric'][score]}")
950
+ elif score == 3:
951
+ st.info(f"**Score {score} ({quality}):** {info['rubric'][score]}")
952
+ elif score == 2:
953
+ st.warning(f"**Score {score} ({quality}):** {info['rubric'][score]}")
954
+ else:
955
+ st.error(f"**Score {score} ({quality}):** {info['rubric'][score]}")
956
 
957
+ st.markdown("---")
 
 
 
 
 
 
 
958
 
959
+ # Keywords and indicators
960
+ col1, col2 = st.columns(2)
961
+
962
+ with col1:
963
+ st.markdown("**Key Terms:**")
964
+ st.write(', '.join(info['keywords'][:10]))
965
+
966
+ with col2:
967
+ st.markdown("**Quality Indicators:**")
968
+ st.write(f"βœ… Positive: {', '.join(info['rubric_features']['positive'][:5])}")
969
+ st.write(f"❌ Avoid: {', '.join(info['rubric_features']['negative'][:5])}")
970
+
971
+ st.markdown("---")
972
+ st.info("""
973
+ ### Tips for High Scores:
974
+ - **Spark (4/4):** Create an engaging opening that clearly connects to your medical journey
975
+ - **Healthcare Experience (4/4):** Show active participation with vivid, thoughtful descriptions
976
+ - **Doctor Qualities (4/4):** Demonstrate mature, realistic understanding with specific examples
977
+ - **Spin (4/4):** Make direct, logical connections between experiences and medical career
978
+ """)
979
 
980
  # Run the application
981
  if __name__ == "__main__":