Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,674 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pickle
|
| 5 |
+
import os
|
| 6 |
+
import torch
|
| 7 |
+
from sentence_transformers import SentenceTransformer, util
|
| 8 |
+
from sklearn.model_selection import train_test_split
|
| 9 |
+
from sklearn.preprocessing import StandardScaler
|
| 10 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 11 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 12 |
+
import xgboost as xgb
|
| 13 |
+
import re
|
| 14 |
+
import warnings
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
import base64
|
| 17 |
+
from io import BytesIO
|
| 18 |
+
warnings.filterwarnings('ignore')
|
| 19 |
+
|
| 20 |
+
# Set page config
# NOTE: st.set_page_config must be the first Streamlit call executed in the script.
st.set_page_config(
    page_title="Medical School Personal Statement Analyzer",
    page_icon="π₯",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Categories with detailed rubric alignment.
# Each category drives three things in this file:
#   * extract_features(): 'keywords', 'patterns', and 'rubric_features'
#     produce per-category feature values,
#   * classify_segment()/analyze_statement(): category names are the label set,
#   * the "View Rubrics" tab in main(): 'description' and 'rubric' are displayed.
# NOTE(review): several string literals contain mojibake-looking glyphs
# (e.g. the page icon); confirm the intended emoji against the deployed app.
CATEGORIES = {
    'Spark': {
        'description': 'Opening that spurs interest in medicine (typically in opening paragraph)',
        'keywords': ['growing up', 'childhood', 'family', 'realized', 'inspired', 'first',
                     'beginning', 'early', 'experience that', 'moment', 'when I was',
                     'journey began', 'sparked my interest', 'drew me to medicine',
                     'passion for medicine', 'calling', 'fascinated', 'curiosity'],
        'patterns': [
            r'when I was \d+', r'at age \d+', r'since I was', r'as a child',
            r'early in my life', r'growing up', r'my journey to medicine'
        ],
        # Rubric text keyed by score 1 (worst) .. 4 (best); shown in the UI.
        'rubric': {
            1: 'disconnected from being a doctor or confusing/random',
            2: 'somewhat connected but unclear',
            3: 'connected and clear',
            4: 'engaging and logically flows into becoming a doctor'
        },
        # Words counted as positive/negative signals by extract_features().
        'rubric_features': {
            'positive': ['engaging', 'logical', 'clear connection', 'compelling', 'authentic'],
            'negative': ['disconnected', 'confusing', 'random', 'unclear', 'generic']
        }
    },
    'Healthcare Experience': {
        'description': 'Watching/participating in healthcare - medical professional at work',
        'keywords': ['shadowed', 'clinical', 'hospital', 'patient', 'doctor', 'physician',
                     'medical', 'treatment', 'observed', 'volunteer', 'clinic', 'rounds',
                     'surgery', 'emergency', 'ICU', 'residency', 'internship', 'scrubs',
                     'stethoscope', 'diagnosis', 'prognosis', 'bedside', 'ward', 'unit',
                     'healthcare', 'care team', 'medical team', 'attending', 'resident'],
        'patterns': [
            r'\d+ hours', r'volunteered at', r'shadowing', r'clinical experience',
            r'medical mission', r'worked in .+ hospital', r'during my rotation'
        ],
        'rubric': {
            1: 'passive observation, uninteresting, irrelevant, problematic, negative tone',
            2: 'bland/boring but not problematic',
            3: 'interesting and relevant',
            4: 'vivid, active, thoughtful, relevant, memorable, positive and optimistic'
        },
        'rubric_features': {
            'positive': ['vivid', 'active', 'thoughtful', 'memorable', 'optimistic', 'engaged'],
            'negative': ['passive', 'uninteresting', 'irrelevant', 'problematic', 'pessimistic']
        }
    },
    'Showing Doctor Qualities': {
        'description': 'Stories/examples portraying vision of doctor role and appealing aspects',
        'keywords': ['leadership', 'empathy', 'compassion', 'responsibility', 'communication',
                     'advocate', 'caring', 'helping', 'service', 'volunteer', 'president',
                     'led', 'organized', 'taught', 'mentored', 'integrity', 'ethical',
                     'professional', 'dedication', 'perseverance', 'resilience', 'humble',
                     'self-aware', 'mature', 'understanding', 'patient-centered', 'holistic'],
        'patterns': [
            r'as (president|leader|captain)', r'I organized', r'I founded',
            r'demonstrated .+ leadership', r'showed .+ compassion'
        ],
        'rubric': {
            1: 'arrogant, immature, overly confident, inaccurate understanding, negative tone',
            2: 'bland/boring but not problematic',
            3: 'shows some understanding',
            4: 'realistic, self-aware, mature, humble, specific and clear understanding, positive'
        },
        'rubric_features': {
            'positive': ['realistic', 'self-aware', 'mature', 'humble', 'specific', 'clear'],
            'negative': ['arrogant', 'immature', 'overly confident', 'simplistic', 'inaccurate']
        }
    },
    'Spin': {
        'description': 'Explaining why experiences qualify them to be a doctor',
        'keywords': ['learned', 'taught me', 'showed me', 'realized', 'understood',
                     'because', 'therefore', 'this experience', 'through this',
                     'as a doctor', 'future physician', 'will help me', 'prepared me',
                     'equipped me', 'qualified', 'ready', 'capable', 'competent',
                     'skills necessary', 'attributes required', 'prepared for'],
        'patterns': [
            r'this .+ taught me', r'I learned that', r'prepared me for',
            r'qualified me to', r'because of this', r'therefore I'
        ],
        'rubric': {
            1: 'brief, vague, simplistic connection to being a doctor, generic',
            2: 'some connection but generic',
            3: 'clear connection',
            4: 'direct, logical, and specific argument connecting experience to profession'
        },
        'rubric_features': {
            'positive': ['direct', 'logical', 'specific', 'clear argument', 'compelling connection'],
            'negative': ['brief', 'vague', 'simplistic', 'generic', 'weak connection']
        }
    }
}

# Model paths — artifacts written by save_models() and read by load_saved_models().
MODEL_DIR = "trained_models"
EMBEDDER_PATH = os.path.join(MODEL_DIR, "embedder_name.txt")   # name of the sentence-transformer used
CLASSIFIER_PATH = os.path.join(MODEL_DIR, "classifier.pkl")    # dict of per-category classifiers
SCORER_PATH = os.path.join(MODEL_DIR, "scorer.pkl")            # dict of per-category score regressors
SCALER_PATH = os.path.join(MODEL_DIR, "scaler.pkl")            # StandardScaler fitted during training
THRESHOLD_PATH = os.path.join(MODEL_DIR, "thresholds.pkl")     # dict of per-category probability thresholds
ENSEMBLE_PATH = os.path.join(MODEL_DIR, "ensemble.pkl")        # NOTE(review): never read or written in this file — appears unused
| 128 |
+
@st.cache_resource
def load_sentence_transformer():
    """Load a sentence-transformer embedding model.

    Tries a short list of known-good model names in order and returns the
    first one that loads. The name is returned alongside the model so it can
    be persisted (see save_models) and the same embedder rebuilt later by
    load_saved_models().

    Returns:
        tuple: (SentenceTransformer model, str model_name)
    """
    models_to_try = [
        'all-MiniLM-L6-v2',   # Lightweight and reliable
        'all-mpnet-base-v2'   # Good alternative
    ]

    for model_name in models_to_try:
        try:
            return SentenceTransformer(model_name), model_name
        except Exception:
            # Loading can fail for ordinary reasons (no network, corrupted
            # cache); fall through to the next candidate. A bare `except:`
            # here would also swallow KeyboardInterrupt/SystemExit.
            continue

    # Last resort: retry the default model and let any error propagate so the
    # user sees a real traceback instead of a silent failure.
    return SentenceTransformer('all-MiniLM-L6-v2'), 'all-MiniLM-L6-v2'
|
| 144 |
+
|
| 145 |
+
def segment_text(text, embedder):
    """Split a statement into analyzable chunks.

    Preference order: paragraphs (blank-line separated, > 50 chars each);
    failing that, groups of sentences accumulated until a group exceeds
    ~300 characters; failing that, the whole text as a single chunk.

    Note: `embedder` is accepted for interface symmetry with the other
    analysis helpers but is not used by this function.
    """
    stripped = (part.strip() for part in re.split(r'\n\s*\n', text))
    chunks = [part for part in stripped if part and len(part) > 50]

    # Multiple substantial paragraphs: use them directly.
    if len(chunks) > 1:
        return chunks

    # Fall back to sentence grouping.
    pieces = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text)]
    pieces = [s for s in pieces if len(s) > 20]

    if len(pieces) < 3:
        # Too little material to group — treat the whole text as one segment.
        return [text]

    grouped, buffer = [], []
    for sentence in pieces:
        buffer.append(sentence)
        if len(' '.join(buffer)) > 300:
            grouped.append(' '.join(buffer))
            buffer = []
    if buffer:
        grouped.append(' '.join(buffer))
    return grouped
|
| 170 |
+
|
| 171 |
+
def extract_features(text, embedder, category_focus=None):
    """Extract a fixed-length feature vector for classification/scoring.

    Layout: 5 basic text statistics, then 4 values per category (boosted
    keyword density, regex pattern hits, positive/negative rubric-term
    densities), one category-similarity value, then the first 256 dims of
    the sentence embedding.

    Args:
        text: segment text to featurize.
        embedder: sentence-transformer exposing .encode().
        category_focus: optional category name; doubles that category's
            keyword density and enables the similarity feature.

    Returns:
        np.ndarray of float32 features concatenated with the embedding slice.
    """
    features = []
    text_lower = text.lower()
    words = text.split()

    # Basic text statistics
    features.extend([
        len(text),
        len(words),
        len(set(words)) / max(len(words), 1),   # lexical diversity
        len(re.findall(r'[.!?]', text)),        # rough sentence count
        text.count('I') / max(len(words), 1),   # first-person density
    ])

    # Process all categories (order follows CATEGORIES insertion order, so the
    # feature layout is stable across calls).
    for cat_name, cat_info in CATEGORIES.items():
        keywords = cat_info['keywords']
        keyword_matches = sum(1 for kw in keywords if kw.lower() in text_lower)
        keyword_density = keyword_matches / max(len(keywords), 1)

        if category_focus == cat_name:
            keyword_density *= 2  # emphasize the category being evaluated

        features.append(keyword_density * 10)

        pattern_matches = 0
        for pattern in cat_info.get('patterns', []):
            pattern_matches += len(re.findall(pattern, text_lower))
        features.append(pattern_matches)

        positive_count = sum(1 for word in cat_info['rubric_features']['positive']
                             if word in text_lower)
        negative_count = sum(1 for word in cat_info['rubric_features']['negative']
                             if word in text_lower)

        features.extend([
            positive_count / max(len(words), 1) * 100,
            negative_count / max(len(words), 1) * 100
        ])

    # Get embeddings. Older sentence-transformers versions may not accept the
    # keyword arguments, so retry with the plain call on failure.
    try:
        embedding = embedder.encode(text, convert_to_tensor=False, normalize_embeddings=True)
    except Exception:
        embedding = embedder.encode(text)

    # Category similarity: cosine similarity between the segment embedding and
    # an embedding of the focused category's description + top keywords.
    if category_focus and category_focus in CATEGORIES:
        category_text = f"{CATEGORIES[category_focus]['description']} {' '.join(CATEGORIES[category_focus]['keywords'][:10])}"
        try:
            category_embedding = embedder.encode(category_text, normalize_embeddings=True)
            similarity = cosine_similarity([embedding], [category_embedding])[0][0]
            features.append(similarity * 10)
        except Exception:
            # Keep the vector length fixed even when encoding fails.
            features.append(0)
    else:
        features.append(0)

    features = np.array(features, dtype=np.float32)
    combined_features = np.concatenate([features, embedding[:256]])  # Limit embedding size

    return combined_features
|
| 235 |
+
|
| 236 |
+
def load_training_data(file1, file2):
    """Load and combine training data from two Excel files.

    Each row must carry excerpt text in one of several known columns and,
    per category, optional "Code: <cat> Applied" / "Code: <cat> Weight"
    columns produced by the coding workflow.

    Args:
        file1, file2: paths or file-like objects accepted by pd.read_excel.

    Returns:
        DataFrame with columns 'text', '<cat>_applied' (bool) and
        '<cat>_score' (0 when not applied, else an int clamped to 1-4).
        Empty DataFrame when either file cannot be read.
    """
    try:
        df1 = pd.read_excel(file1)
        df2 = pd.read_excel(file2)
    except Exception as e:
        st.error(f"Error reading Excel files: {str(e)}")
        return pd.DataFrame()

    combined_df = pd.concat([df1, df2], ignore_index=True)
    processed_data = []

    for _, row in combined_df.iterrows():
        # Accept any of the known excerpt column names; first non-empty wins.
        text = None
        for col_name in ['Excerpt Copy', 'Excerpt', 'Text', 'Content']:
            if col_name in row and pd.notna(row[col_name]):
                text = str(row[col_name])
                break

        if not text or text.strip() == '':
            continue

        data_point = {'text': text.strip()}

        for category in CATEGORIES.keys():
            col_applied = f"Code: {category} Applied"
            col_weight = f"Code: {category} Weight"

            is_applied = False
            if col_applied in row:
                applied_val = str(row[col_applied]).lower()
                is_applied = applied_val in ['true', '1', 'yes', 't']

            data_point[f"{category}_applied"] = is_applied

            if is_applied and col_weight in row:
                weight = row[col_weight]
                if pd.isna(weight) or weight == '':
                    weight = 2  # blank weight cell -> default mid score
                else:
                    try:
                        weight = int(float(weight))
                        weight = max(1, min(4, weight))  # clamp to rubric range 1-4
                    except (ValueError, TypeError):
                        # Unparseable weight value -> default mid score.
                        weight = 2
            else:
                weight = 0  # 0 means "category not applied"

            data_point[f"{category}_score"] = weight

        processed_data.append(data_point)

    return pd.DataFrame(processed_data)
|
| 289 |
+
|
| 290 |
+
def train_models(df, embedder):
    """Train per-category classifiers and score regressors.

    Args:
        df: output of load_training_data() — columns 'text',
            '<cat>_applied' and '<cat>_score' for each category.
        embedder: sentence-transformer used by extract_features().

    Returns:
        tuple: (scaler, classifiers, scorers, thresholds) where the last
        three are dicts keyed by category name.
    """
    all_features = []

    # Streamlit progress widgets; both are cleared before returning.
    progress_bar = st.progress(0)
    status_text = st.empty()

    status_text.text("Extracting features from training data...")

    for idx, row in df.iterrows():
        text = row['text']

        # One feature vector per category focus (focus doubles that
        # category's keyword-density feature inside extract_features()).
        category_features = {}
        for cat in CATEGORIES.keys():
            features = extract_features(text, embedder, category_focus=cat)
            category_features[cat] = features

        true_categories = [cat for cat in CATEGORIES.keys() if row[f"{cat}_applied"]]

        if true_categories:
            # Labeled sample: use the vector focused on its first true category.
            features = category_features[true_categories[0]]
        else:
            # Unlabeled sample: average the per-category vectors.
            features = np.mean(list(category_features.values()), axis=0)

        all_features.append(features)
        # NOTE: assumes df has a 0-based RangeIndex (true for the DataFrame
        # built by load_training_data) so idx + 1 tracks progress correctly.
        progress_bar.progress((idx + 1) / len(df))

    X = np.array(all_features)

    categories = list(CATEGORIES.keys())
    # Multi-label classification targets: one 0/1 column per category.
    y_class = df[[f"{cat}_applied" for cat in categories]].values.astype(float)

    # Regression targets: rubric weight normalized to [0, 1]; 0 if not applied.
    y_score = []
    for _, row in df.iterrows():
        scores = []
        for cat in categories:
            if row[f"{cat}_applied"]:
                scores.append(row[f"{cat}_score"] / 4.0)
            else:
                scores.append(0)
        y_score.append(scores)
    y_score = np.array(y_score)

    status_text.text("Training models...")

    # Split data
    X_train, X_test, y_class_train, y_class_test, y_score_train, y_score_test = train_test_split(
        X, y_class, y_score, test_size=0.2, random_state=42
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    # NOTE(review): X_test_scaled and the *_test splits are computed but never
    # evaluated — no held-out metrics are reported anywhere in this function.
    X_test_scaled = scaler.transform(X_test)

    # Train classifiers
    classifiers = {}
    scorers = {}
    thresholds = {}

    for i, cat in enumerate(categories):
        # Train classifier: one binary RandomForest per category.
        clf = RandomForestClassifier(
            n_estimators=100,
            max_depth=6,
            class_weight='balanced',
            random_state=42
        )
        clf.fit(X_train_scaled, y_class_train[:, i])
        classifiers[cat] = clf

        # Train scorer only on rows where the category actually applies.
        mask = y_class_train[:, i] == 1
        if np.sum(mask) > 5:
            scorer = xgb.XGBRegressor(
                n_estimators=100,
                max_depth=4,
                random_state=42
            )
            scorer.fit(X_train_scaled[mask], y_score_train[mask, i])
        else:
            # Too few positive samples: fall back to a constant mid predictor.
            from sklearn.dummy import DummyRegressor
            scorer = DummyRegressor(strategy='constant', constant=0.5)
            scorer.fit(X_train_scaled, y_score_train[:, i])

        scorers[cat] = scorer
        # Fixed decision threshold; not tuned per category.
        thresholds[cat] = 0.5

    status_text.empty()
    progress_bar.empty()

    return scaler, classifiers, scorers, thresholds
|
| 382 |
+
|
| 383 |
+
def save_models(embedder_name, scaler, classifiers, scorers, thresholds):
    """Persist all trained artifacts under MODEL_DIR.

    The embedder itself is not pickled — only its model name is written, so
    load_saved_models() can rebuild it from the hub/cache later.
    """
    os.makedirs(MODEL_DIR, exist_ok=True)

    with open(EMBEDDER_PATH, 'w') as handle:
        handle.write(embedder_name)

    # Pickle the remaining artifacts, one file each.
    artifacts = [
        (SCALER_PATH, scaler),
        (CLASSIFIER_PATH, classifiers),
        (SCORER_PATH, scorers),
        (THRESHOLD_PATH, thresholds),
    ]
    for path, obj in artifacts:
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)
|
| 401 |
+
|
| 402 |
+
def load_saved_models():
    """Load all artifacts previously written by save_models().

    Returns:
        tuple: (embedder, scaler, classifiers, scorers, thresholds), or
        (None, None, None, None, None) when any artifact is missing or
        fails to load — callers treat that as "model not trained yet".
    """
    try:
        with open(EMBEDDER_PATH, 'r') as f:
            embedder_name = f.read().strip()

        # Rebuild the embedder from its saved name (may download on first use).
        embedder = SentenceTransformer(embedder_name)

        with open(SCALER_PATH, 'rb') as f:
            scaler = pickle.load(f)

        with open(CLASSIFIER_PATH, 'rb') as f:
            classifiers = pickle.load(f)

        with open(SCORER_PATH, 'rb') as f:
            scorers = pickle.load(f)

        with open(THRESHOLD_PATH, 'rb') as f:
            thresholds = pickle.load(f)

        return embedder, scaler, classifiers, scorers, thresholds
    except Exception:
        # Missing files, pickle/version mismatches, and embedder download
        # failures all degrade to "not trained". Deliberately broad, but no
        # longer a bare `except:` (which would also catch SystemExit).
        return None, None, None, None, None
|
| 425 |
+
|
| 426 |
+
def classify_segment(text, embedder, scaler, classifiers, scorers, thresholds):
    """Classify one text segment into its best-matching rubric category.

    Picks the category whose classifier gives the highest probability; if
    that probability clears the category's threshold, a 1-4 score is
    predicted with the matching regressor.

    Returns:
        dict with keys 'category' (name or 'Unclassified'), 'score'
        (int 1-4 or None), 'confidence' (float), 'text'.
    """
    categories = list(CATEGORIES.keys())
    category_results = {}
    features_by_cat = {}

    for cat in categories:
        features = extract_features(text, embedder, category_focus=cat)
        features_by_cat[cat] = features  # cache: reused below for scoring
        features_scaled = scaler.transform([features])

        clf = classifiers[cat]
        prob = 0.0
        if hasattr(clf, 'predict_proba'):
            proba = clf.predict_proba(features_scaled)[0]
            classes = list(clf.classes_)
            # Guard: a classifier fitted on a single class returns one
            # probability column, so unconditional [0, 1] indexing would
            # raise IndexError. Look up the positive class explicitly.
            if 1 in classes:
                prob = float(proba[classes.index(1)])
        category_results[cat] = prob

    best_category = max(category_results, key=category_results.get)
    best_prob = category_results[best_category]

    if best_prob > thresholds.get(best_category, 0.5):
        # Reuse the already-computed feature vector instead of re-encoding
        # the text a second time.
        features_scaled = scaler.transform([features_by_cat[best_category]])

        try:
            score_normalized = scorers[best_category].predict(features_scaled)[0]
            # Regressor output is in [0, 1]; map back to the 1-4 rubric scale.
            score = int(np.clip(np.round(score_normalized * 4), 1, 4))
        except Exception:
            score = 2  # fall back to the mid score if prediction fails

        return {
            'category': best_category,
            'score': score,
            'confidence': float(best_prob),
            'text': text
        }
    else:
        return {
            'category': 'Unclassified',
            'score': None,
            'confidence': 0,
            'text': text
        }
|
| 464 |
+
|
| 465 |
+
def analyze_statement(text, embedder, scaler, classifiers, scorers, thresholds):
    """Analyze a complete personal statement.

    Segments the text, classifies each segment, then aggregates per-category
    results (rounded average score, max confidence, segment count).

    Returns:
        tuple: (segment_results, category_results) — a list of per-segment
        dicts and a dict keyed by category name.
    """
    segment_results = []
    for position, chunk in enumerate(segment_text(text, embedder), start=1):
        outcome = classify_segment(chunk, embedder, scaler, classifiers, scorers, thresholds)
        outcome['segment_num'] = position
        segment_results.append(outcome)

    category_results = {}
    for cat in CATEGORIES.keys():
        matching = [r for r in segment_results if r['category'] == cat]

        if not matching:
            category_results[cat] = {
                'detected': False,
                'score': None,
                'confidence': 0,
                'num_segments': 0
            }
            continue

        mean_score = np.mean([m['score'] for m in matching])
        category_results[cat] = {
            'detected': True,
            'score': int(np.round(mean_score)),
            'confidence': max(m['confidence'] for m in matching),
            'num_segments': len(matching)
        }

    return segment_results, category_results
|
| 498 |
+
|
| 499 |
+
# Main application
def main():
    """Render the Streamlit UI: a train tab, an analyze tab, and a rubric
    reference tab.

    NOTE(review): several glyph string literals below look mojibake-encoded
    (likely originally emoji such as checkmarks/warnings); confirm against
    the deployed file before editing them.
    """
    st.title("π₯ Medical School Personal Statement Analyzer")
    st.markdown("*AI-powered analysis based on medical school admission rubrics*")
    st.markdown("---")

    # Sidebar
    with st.sidebar:
        st.header("βΉοΈ About")
        st.markdown("""
        This tool analyzes personal statements based on 4 key categories:
        - **Spark**: Opening that shows interest in medicine
        - **Healthcare Experience**: Clinical/medical experiences
        - **Doctor Qualities**: Leadership and character traits
        - **Spin**: Connecting experiences to medical career

        Each category is scored 1-4 (Poor to Excellent)
        """)

    # Create tabs
    tab1, tab2, tab3 = st.tabs(["π Train Model", "π Analyze Statement", "π View Rubrics"])

    # Train Model Tab
    with tab1:
        st.header("Train the AI Model")

        # Presence of the pickled artifacts is used as the "already trained" signal.
        if all(os.path.exists(p) for p in [CLASSIFIER_PATH, SCORER_PATH, SCALER_PATH]):
            st.success("β Models already trained. You can analyze statements or retrain.")

        st.markdown("Upload training data files (Excel format with coded excerpts)")

        col1, col2 = st.columns(2)
        with col1:
            file1 = st.file_uploader("Training File 1", type=['xlsx'], key="file1")
        with col2:
            file2 = st.file_uploader("Training File 2", type=['xlsx'], key="file2")

        if file1 and file2:
            if st.button("Start Training", type="primary"):
                try:
                    # Load data
                    with st.spinner("Loading training data..."):
                        df = load_training_data(file1, file2)

                    if df.empty:
                        st.error("No valid training data found.")
                        # NOTE(review): this returns from main(), so tab3's
                        # rubric content is not rendered on this run.
                        return

                    st.success(f"β Loaded {len(df)} training samples")

                    # Load embedder
                    with st.spinner("Loading transformer model..."):
                        embedder, embedder_name = load_sentence_transformer()

                    # Train
                    scaler, classifiers, scorers, thresholds = train_models(df, embedder)

                    # Save
                    save_models(embedder_name, scaler, classifiers, scorers, thresholds)
                    st.success("β Training complete! Models saved.")

                except Exception as e:
                    st.error(f"Training failed: {str(e)}")

    # Analyze Statement Tab
    with tab2:
        st.header("Analyze Personal Statement")

        if not all(os.path.exists(p) for p in [CLASSIFIER_PATH, SCORER_PATH, SCALER_PATH]):
            st.warning("β οΈ Please train the model first (Tab 1)")
            # NOTE(review): returning here also skips rendering tab3 below.
            return

        # Load models
        embedder, scaler, classifiers, scorers, thresholds = load_saved_models()

        if embedder is None:
            st.error("Failed to load models. Please retrain.")
            return

        # Input method
        input_method = st.radio("Choose input method:", ["Paste Text", "Upload File"])

        text_to_analyze = None

        if input_method == "Paste Text":
            text_to_analyze = st.text_area(
                "Paste your personal statement here:",
                height=300,
                placeholder="Enter your personal statement..."
            )
        else:
            uploaded_file = st.file_uploader("Upload statement (.txt)", type=['txt'])
            if uploaded_file:
                # Uploaded file bytes are decoded as UTF-8.
                text_to_analyze = str(uploaded_file.read(), 'utf-8')
                st.success("File uploaded successfully!")

        if text_to_analyze and st.button("Analyze Statement", type="primary"):
            with st.spinner("Analyzing..."):
                segment_results, category_results = analyze_statement(
                    text_to_analyze, embedder, scaler, classifiers, scorers, thresholds
                )

            # Display results
            st.success("β Analysis complete!")

            # Summary
            st.subheader("π Overall Summary")
            cols = st.columns(4)

            detected = [cat for cat, res in category_results.items() if res['detected']]

            with cols[0]:
                st.metric("Categories Found", f"{len(detected)}/4")
            with cols[1]:
                if detected:
                    avg_score = np.mean([category_results[cat]['score'] for cat in detected])
                    st.metric("Average Score", f"{avg_score:.1f}/4")
                else:
                    st.metric("Average Score", "N/A")
            with cols[2]:
                st.metric("Total Segments", len(segment_results))
            with cols[3]:
                # avg_score is only evaluated when len(detected) == 4, in which
                # case it was assigned above — `and` short-circuits otherwise.
                quality = "Excellent" if len(detected) == 4 and avg_score >= 3.5 else "Good" if len(detected) >= 3 else "Needs Work"
                st.metric("Overall", quality)

            # Category breakdown
            st.subheader("π Category Analysis")
            for cat in CATEGORIES.keys():
                res = category_results[cat]
                if res['detected']:
                    # NOTE(review): this literal was split across lines in the
                    # rendered source; the glyphs are presumed mojibake emoji.
                    icon = "β" if res['score'] >= 3 else "β οΈ" if res['score'] >= 2 else "β"
                    st.write(f"{icon} **{cat}**: Score {res['score']}/4 (Confidence: {res['confidence']:.1%})")
                else:
                    st.write(f"β **{cat}**: Not detected")

            # Segment details
            st.subheader("π Segment Details")
            for seg in segment_results:
                with st.expander(f"Segment {seg['segment_num']}: {seg['category']}"):
                    # NOTE(review): when score is None this writes the bare
                    # string "N/A" without the "**Score:**" label.
                    st.write(f"**Score:** {seg['score']}/4" if seg['score'] else "N/A")
                    st.write(f"**Confidence:** {seg['confidence']:.1%}")
                    st.write(f"**Text:** {seg['text'][:300]}...")

            # Recommendations
            st.subheader("π‘ Recommendations")
            missing = [cat for cat, res in category_results.items() if not res['detected']]
            low_score = [cat for cat, res in category_results.items()
                         if res['detected'] and res['score'] and res['score'] < 3]

            if missing:
                st.warning("**Missing Categories:**")
                for cat in missing:
                    st.write(f"β’ Add content for **{cat}**: {CATEGORIES[cat]['description']}")

            if low_score:
                st.info("**Areas to Improve:**")
                for cat in low_score:
                    st.write(f"β’ Strengthen **{cat}** (current score: {category_results[cat]['score']}/4)")

            if not missing and not low_score:
                st.success("Excellent work! All categories present with good scores.")

    # View Rubrics Tab
    with tab3:
        st.header("Scoring Rubrics")

        for category, info in CATEGORIES.items():
            with st.expander(f"**{category}**"):
                st.write(f"**Description:** {info['description']}")
                st.write("**Scoring Criteria:**")
                # Display best-to-worst (4 down to 1).
                for score in [4, 3, 2, 1]:
                    st.write(f"β’ **Score {score}:** {info['rubric'][score]}")
                st.write(f"**Key Terms:** {', '.join(info['keywords'][:8])}")
| 672 |
+
|
| 673 |
+
# Script entry point. Streamlit executes the module top-to-bottom on each
# rerun, so this guard also fires when the app is run via `streamlit run`.
if __name__ == "__main__":
    main()
|