Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,665 +3,300 @@ import pandas as pd
|
|
| 3 |
import numpy as np
|
| 4 |
import pickle
|
| 5 |
import os
|
| 6 |
-
import
|
| 7 |
-
from sentence_transformers import SentenceTransformer, util
|
| 8 |
from sklearn.model_selection import train_test_split
|
| 9 |
from sklearn.preprocessing import StandardScaler
|
| 10 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 11 |
from sklearn.ensemble import RandomForestClassifier
|
|
|
|
| 12 |
import xgboost as xgb
|
| 13 |
import re
|
| 14 |
import warnings
|
| 15 |
-
from datetime import datetime
|
| 16 |
-
import base64
|
| 17 |
-
from io import BytesIO
|
| 18 |
warnings.filterwarnings('ignore')
|
| 19 |
|
| 20 |
-
#
|
| 21 |
st.set_page_config(
|
| 22 |
page_title="Medical School Personal Statement Analyzer",
|
| 23 |
page_icon="🏥",
|
| 24 |
-
layout="wide"
|
| 25 |
-
initial_sidebar_state="expanded"
|
| 26 |
)
|
| 27 |
|
| 28 |
-
# Categories
|
| 29 |
CATEGORIES = {
|
| 30 |
'Spark': {
|
| 31 |
-
'description': 'Opening that spurs interest in medicine
|
| 32 |
'keywords': ['growing up', 'childhood', 'family', 'realized', 'inspired', 'first',
|
| 33 |
-
'beginning', 'early', 'experience that', 'moment', 'when I was',
|
| 34 |
-
|
| 35 |
-
'passion for medicine', 'calling', 'fascinated', 'curiosity'],
|
| 36 |
-
'patterns': [
|
| 37 |
-
r'when I was \d+', r'at age \d+', r'since I was', r'as a child',
|
| 38 |
-
r'early in my life', r'growing up', r'my journey to medicine'
|
| 39 |
-
],
|
| 40 |
'rubric': {
|
| 41 |
-
1: 'disconnected
|
| 42 |
2: 'somewhat connected but unclear',
|
| 43 |
3: 'connected and clear',
|
| 44 |
-
4: 'engaging and
|
| 45 |
},
|
| 46 |
'rubric_features': {
|
| 47 |
-
'positive': ['engaging', 'logical', 'clear
|
| 48 |
'negative': ['disconnected', 'confusing', 'random', 'unclear', 'generic']
|
| 49 |
}
|
| 50 |
},
|
| 51 |
'Healthcare Experience': {
|
| 52 |
-
'description': '
|
| 53 |
'keywords': ['shadowed', 'clinical', 'hospital', 'patient', 'doctor', 'physician',
|
| 54 |
-
'medical', 'treatment', 'observed', 'volunteer', 'clinic',
|
| 55 |
-
|
| 56 |
-
'stethoscope', 'diagnosis', 'prognosis', 'bedside', 'ward', 'unit',
|
| 57 |
-
'healthcare', 'care team', 'medical team', 'attending', 'resident'],
|
| 58 |
-
'patterns': [
|
| 59 |
-
r'\d+ hours', r'volunteered at', r'shadowing', r'clinical experience',
|
| 60 |
-
r'medical mission', r'worked in .+ hospital', r'during my rotation'
|
| 61 |
-
],
|
| 62 |
'rubric': {
|
| 63 |
-
1: 'passive
|
| 64 |
-
2: 'bland
|
| 65 |
3: 'interesting and relevant',
|
| 66 |
-
4: 'vivid, active, thoughtful,
|
| 67 |
},
|
| 68 |
'rubric_features': {
|
| 69 |
-
'positive': ['vivid', 'active', 'thoughtful', 'memorable', 'optimistic'
|
| 70 |
-
'negative': ['passive', 'uninteresting', 'irrelevant', 'problematic'
|
| 71 |
}
|
| 72 |
},
|
| 73 |
'Showing Doctor Qualities': {
|
| 74 |
-
'description': '
|
| 75 |
'keywords': ['leadership', 'empathy', 'compassion', 'responsibility', 'communication',
|
| 76 |
-
'advocate', 'caring', 'helping', 'service', 'volunteer',
|
| 77 |
-
|
| 78 |
-
'professional', 'dedication', 'perseverance', 'resilience', 'humble',
|
| 79 |
-
'self-aware', 'mature', 'understanding', 'patient-centered', 'holistic'],
|
| 80 |
-
'patterns': [
|
| 81 |
-
r'as (president|leader|captain)', r'I organized', r'I founded',
|
| 82 |
-
r'demonstrated .+ leadership', r'showed .+ compassion'
|
| 83 |
-
],
|
| 84 |
'rubric': {
|
| 85 |
-
1: 'arrogant, immature,
|
| 86 |
-
2: 'bland
|
| 87 |
3: 'shows some understanding',
|
| 88 |
-
4: 'realistic,
|
| 89 |
},
|
| 90 |
'rubric_features': {
|
| 91 |
-
'positive': ['realistic', 'self-aware', 'mature', 'humble', 'specific'
|
| 92 |
-
'negative': ['arrogant', 'immature', 'overly confident', 'simplistic'
|
| 93 |
}
|
| 94 |
},
|
| 95 |
'Spin': {
|
| 96 |
-
'description': '
|
| 97 |
'keywords': ['learned', 'taught me', 'showed me', 'realized', 'understood',
|
| 98 |
-
'because', 'therefore', 'this experience', '
|
| 99 |
-
|
| 100 |
-
'equipped me', 'qualified', 'ready', 'capable', 'competent',
|
| 101 |
-
'skills necessary', 'attributes required', 'prepared for'],
|
| 102 |
-
'patterns': [
|
| 103 |
-
r'this .+ taught me', r'I learned that', r'prepared me for',
|
| 104 |
-
r'qualified me to', r'because of this', r'therefore I'
|
| 105 |
-
],
|
| 106 |
'rubric': {
|
| 107 |
-
1: '
|
| 108 |
2: 'some connection but generic',
|
| 109 |
3: 'clear connection',
|
| 110 |
-
4: 'direct, logical,
|
| 111 |
},
|
| 112 |
'rubric_features': {
|
| 113 |
-
'positive': ['direct', 'logical', 'specific', 'clear argument'
|
| 114 |
-
'negative': ['brief', 'vague', 'simplistic', 'generic'
|
| 115 |
}
|
| 116 |
}
|
| 117 |
}
|
| 118 |
|
| 119 |
# Model paths
|
| 120 |
MODEL_DIR = "trained_models"
|
| 121 |
-
EMBEDDER_PATH = os.path.join(MODEL_DIR, "embedder_name.txt")
|
| 122 |
-
CLASSIFIER_PATH = os.path.join(MODEL_DIR, "classifier.pkl")
|
| 123 |
-
SCORER_PATH = os.path.join(MODEL_DIR, "scorer.pkl")
|
| 124 |
-
SCALER_PATH = os.path.join(MODEL_DIR, "scaler.pkl")
|
| 125 |
-
THRESHOLD_PATH = os.path.join(MODEL_DIR, "thresholds.pkl")
|
| 126 |
|
|
|
|
| 127 |
@st.cache_resource
|
| 128 |
-
def
|
| 129 |
-
"""Load sentence transformer model"""
|
| 130 |
try:
|
| 131 |
-
|
| 132 |
-
return model, 'all-MiniLM-L6-v2'
|
| 133 |
except:
|
| 134 |
-
|
| 135 |
-
return None, None
|
| 136 |
|
| 137 |
-
def
|
| 138 |
-
"""Segment text into meaningful chunks"""
|
| 139 |
-
paragraphs = re.split(r'\n\s*\n', text)
|
| 140 |
-
paragraphs = [p.strip() for p in paragraphs if p.strip() and len(p.strip()) > 50]
|
| 141 |
-
|
| 142 |
-
if len(paragraphs) <= 1:
|
| 143 |
-
sentences = re.split(r'(?<=[.!?])\s+', text)
|
| 144 |
-
sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
|
| 145 |
-
|
| 146 |
-
if len(sentences) < 3:
|
| 147 |
-
return [text]
|
| 148 |
-
|
| 149 |
-
segments = []
|
| 150 |
-
current_segment = []
|
| 151 |
-
for sent in sentences:
|
| 152 |
-
current_segment.append(sent)
|
| 153 |
-
if len(' '.join(current_segment)) > 300:
|
| 154 |
-
segments.append(' '.join(current_segment))
|
| 155 |
-
current_segment = []
|
| 156 |
-
if current_segment:
|
| 157 |
-
segments.append(' '.join(current_segment))
|
| 158 |
-
return segments
|
| 159 |
-
|
| 160 |
-
return paragraphs
|
| 161 |
-
|
| 162 |
-
def extract_features(text, embedder, category_focus=None):
|
| 163 |
-
"""Extract features for classification"""
|
| 164 |
features = []
|
| 165 |
text_lower = text.lower()
|
| 166 |
words = text.split()
|
| 167 |
|
| 168 |
-
# Basic
|
| 169 |
features.extend([
|
| 170 |
len(text),
|
| 171 |
len(words),
|
| 172 |
-
len(set(words)) / max(len(words), 1)
|
| 173 |
-
len(re.findall(r'[.!?]', text)),
|
| 174 |
-
text.count('I') / max(len(words), 1),
|
| 175 |
])
|
| 176 |
|
| 177 |
-
#
|
| 178 |
for cat_name, cat_info in CATEGORIES.items():
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
keyword_density = keyword_matches / max(len(keywords), 1)
|
| 182 |
-
|
| 183 |
-
if category_focus == cat_name:
|
| 184 |
-
keyword_density *= 2
|
| 185 |
-
|
| 186 |
-
features.append(keyword_density * 10)
|
| 187 |
-
|
| 188 |
-
pattern_matches = 0
|
| 189 |
-
for pattern in cat_info.get('patterns', []):
|
| 190 |
-
matches = re.findall(pattern, text_lower)
|
| 191 |
-
pattern_matches += len(matches)
|
| 192 |
-
features.append(pattern_matches)
|
| 193 |
-
|
| 194 |
-
positive_count = sum(1 for word in cat_info['rubric_features']['positive']
|
| 195 |
-
if word in text_lower)
|
| 196 |
-
negative_count = sum(1 for word in cat_info['rubric_features']['negative']
|
| 197 |
-
if word in text_lower)
|
| 198 |
-
|
| 199 |
-
features.extend([
|
| 200 |
-
positive_count / max(len(words), 1) * 100,
|
| 201 |
-
negative_count / max(len(words), 1) * 100
|
| 202 |
-
])
|
| 203 |
|
| 204 |
-
# Get
|
| 205 |
try:
|
| 206 |
-
embedding = embedder.encode(text
|
| 207 |
if hasattr(embedding, 'cpu'):
|
| 208 |
embedding = embedding.cpu().numpy()
|
| 209 |
-
embedding = embedding.flatten()[:
|
| 210 |
except:
|
| 211 |
-
embedding = np.zeros(
|
| 212 |
-
|
| 213 |
-
# Category similarity
|
| 214 |
-
if category_focus and category_focus in CATEGORIES:
|
| 215 |
-
category_text = f"{CATEGORIES[category_focus]['description']} {' '.join(CATEGORIES[category_focus]['keywords'][:10])}"
|
| 216 |
-
try:
|
| 217 |
-
category_embedding = embedder.encode(category_text)
|
| 218 |
-
if hasattr(category_embedding, 'cpu'):
|
| 219 |
-
category_embedding = category_embedding.cpu().numpy()
|
| 220 |
-
category_embedding = category_embedding.flatten()
|
| 221 |
-
similarity = cosine_similarity([embedding], [category_embedding[:256]])[0][0]
|
| 222 |
-
features.append(similarity * 10)
|
| 223 |
-
except:
|
| 224 |
-
features.append(0)
|
| 225 |
-
else:
|
| 226 |
-
features.append(0)
|
| 227 |
|
| 228 |
-
|
| 229 |
-
combined_features = np.concatenate([features, embedding])
|
| 230 |
-
|
| 231 |
-
return combined_features
|
| 232 |
|
| 233 |
-
def
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
df1 = pd.read_excel(file1)
|
| 237 |
-
df2 = pd.read_excel(file2)
|
| 238 |
-
except Exception as e:
|
| 239 |
-
st.error(f"Error reading Excel files: {str(e)}")
|
| 240 |
-
return pd.DataFrame()
|
| 241 |
-
|
| 242 |
-
combined_df = pd.concat([df1, df2], ignore_index=True)
|
| 243 |
-
processed_data = []
|
| 244 |
|
| 245 |
-
for _, row in combined_df.iterrows():
|
| 246 |
-
text = None
|
| 247 |
-
for col_name in ['Excerpt Copy', 'Excerpt', 'Text', 'Content']:
|
| 248 |
-
if col_name in row and pd.notna(row[col_name]):
|
| 249 |
-
text = str(row[col_name])
|
| 250 |
-
break
|
| 251 |
-
|
| 252 |
-
if not text or text.strip() == '':
|
| 253 |
-
continue
|
| 254 |
-
|
| 255 |
-
data_point = {'text': text.strip()}
|
| 256 |
-
|
| 257 |
-
for category in CATEGORIES.keys():
|
| 258 |
-
col_applied = f"Code: {category} Applied"
|
| 259 |
-
col_weight = f"Code: {category} Weight"
|
| 260 |
-
|
| 261 |
-
is_applied = False
|
| 262 |
-
if col_applied in row:
|
| 263 |
-
applied_val = str(row[col_applied]).lower()
|
| 264 |
-
is_applied = applied_val in ['true', '1', 'yes', 't']
|
| 265 |
-
|
| 266 |
-
data_point[f"{category}_applied"] = is_applied
|
| 267 |
-
|
| 268 |
-
if is_applied and col_weight in row:
|
| 269 |
-
weight = row[col_weight]
|
| 270 |
-
if pd.isna(weight) or weight == '':
|
| 271 |
-
weight = 2
|
| 272 |
-
else:
|
| 273 |
-
try:
|
| 274 |
-
weight = int(float(weight))
|
| 275 |
-
weight = max(1, min(4, weight))
|
| 276 |
-
except:
|
| 277 |
-
weight = 2
|
| 278 |
-
else:
|
| 279 |
-
weight = 0
|
| 280 |
-
|
| 281 |
-
data_point[f"{category}_score"] = weight
|
| 282 |
-
|
| 283 |
-
processed_data.append(data_point)
|
| 284 |
-
|
| 285 |
-
return pd.DataFrame(processed_data)
|
| 286 |
-
|
| 287 |
-
def train_models(df, embedder):
|
| 288 |
-
"""Train classification and scoring models"""
|
| 289 |
-
all_features = []
|
| 290 |
-
|
| 291 |
-
progress_bar = st.progress(0)
|
| 292 |
-
status_text = st.empty()
|
| 293 |
-
|
| 294 |
-
status_text.text("Extracting features from training data...")
|
| 295 |
-
|
| 296 |
-
for idx, row in df.iterrows():
|
| 297 |
-
text = row['text']
|
| 298 |
-
|
| 299 |
-
category_features = {}
|
| 300 |
-
for cat in CATEGORIES.keys():
|
| 301 |
-
features = extract_features(text, embedder, category_focus=cat)
|
| 302 |
-
category_features[cat] = features
|
| 303 |
-
|
| 304 |
-
true_categories = [cat for cat in CATEGORIES.keys() if row[f"{cat}_applied"]]
|
| 305 |
-
|
| 306 |
-
if true_categories:
|
| 307 |
-
features = category_features[true_categories[0]]
|
| 308 |
-
else:
|
| 309 |
-
features = np.mean(list(category_features.values()), axis=0)
|
| 310 |
-
|
| 311 |
-
all_features.append(features)
|
| 312 |
-
progress_bar.progress((idx + 1) / len(df))
|
| 313 |
-
|
| 314 |
-
X = np.array(all_features)
|
| 315 |
-
|
| 316 |
-
categories = list(CATEGORIES.keys())
|
| 317 |
-
y_class = df[[f"{cat}_applied" for cat in categories]].values.astype(float)
|
| 318 |
-
|
| 319 |
-
y_score = []
|
| 320 |
for _, row in df.iterrows():
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
)
|
| 336 |
-
|
| 337 |
-
#
|
| 338 |
scaler = StandardScaler()
|
| 339 |
-
|
| 340 |
-
X_test_scaled = scaler.transform(X_test)
|
| 341 |
-
|
| 342 |
-
# Train classifiers
|
| 343 |
-
classifiers = {}
|
| 344 |
-
scorers = {}
|
| 345 |
-
thresholds = {}
|
| 346 |
-
|
| 347 |
-
for i, cat in enumerate(categories):
|
| 348 |
-
# Train classifier
|
| 349 |
-
clf = RandomForestClassifier(
|
| 350 |
-
n_estimators=100,
|
| 351 |
-
max_depth=6,
|
| 352 |
-
class_weight='balanced',
|
| 353 |
-
random_state=42
|
| 354 |
-
)
|
| 355 |
-
clf.fit(X_train_scaled, y_class_train[:, i])
|
| 356 |
-
classifiers[cat] = clf
|
| 357 |
-
|
| 358 |
-
# Train scorer
|
| 359 |
-
mask = y_class_train[:, i] == 1
|
| 360 |
-
if np.sum(mask) > 5:
|
| 361 |
-
scorer = xgb.XGBRegressor(
|
| 362 |
-
n_estimators=100,
|
| 363 |
-
max_depth=4,
|
| 364 |
-
random_state=42
|
| 365 |
-
)
|
| 366 |
-
scorer.fit(X_train_scaled[mask], y_score_train[mask, i])
|
| 367 |
-
else:
|
| 368 |
-
from sklearn.dummy import DummyRegressor
|
| 369 |
-
scorer = DummyRegressor(strategy='constant', constant=0.5)
|
| 370 |
-
scorer.fit(X_train_scaled, y_score_train[:, i])
|
| 371 |
-
|
| 372 |
-
scorers[cat] = scorer
|
| 373 |
-
thresholds[cat] = 0.5
|
| 374 |
-
|
| 375 |
-
status_text.empty()
|
| 376 |
-
progress_bar.empty()
|
| 377 |
-
|
| 378 |
-
return scaler, classifiers, scorers, thresholds
|
| 379 |
-
|
| 380 |
-
def save_models(embedder_name, scaler, classifiers, scorers, thresholds):
|
| 381 |
-
"""Save all trained models"""
|
| 382 |
-
os.makedirs(MODEL_DIR, exist_ok=True)
|
| 383 |
|
| 384 |
-
|
| 385 |
-
|
| 386 |
|
| 387 |
-
|
| 388 |
-
pickle.dump(scaler, f)
|
| 389 |
-
|
| 390 |
-
with open(CLASSIFIER_PATH, 'wb') as f:
|
| 391 |
-
pickle.dump(classifiers, f)
|
| 392 |
-
|
| 393 |
-
with open(SCORER_PATH, 'wb') as f:
|
| 394 |
-
pickle.dump(scorers, f)
|
| 395 |
-
|
| 396 |
-
with open(THRESHOLD_PATH, 'wb') as f:
|
| 397 |
-
pickle.dump(thresholds, f)
|
| 398 |
|
| 399 |
-
def
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
embedder_name = f.read().strip()
|
| 404 |
-
|
| 405 |
-
embedder = SentenceTransformer(embedder_name)
|
| 406 |
-
|
| 407 |
-
with open(SCALER_PATH, 'rb') as f:
|
| 408 |
-
scaler = pickle.load(f)
|
| 409 |
-
|
| 410 |
-
with open(CLASSIFIER_PATH, 'rb') as f:
|
| 411 |
-
classifiers = pickle.load(f)
|
| 412 |
-
|
| 413 |
-
with open(SCORER_PATH, 'rb') as f:
|
| 414 |
-
scorers = pickle.load(f)
|
| 415 |
-
|
| 416 |
-
with open(THRESHOLD_PATH, 'rb') as f:
|
| 417 |
-
thresholds = pickle.load(f)
|
| 418 |
-
|
| 419 |
-
return embedder, scaler, classifiers, scorers, thresholds
|
| 420 |
-
except:
|
| 421 |
-
return None, None, None, None, None
|
| 422 |
-
|
| 423 |
-
def classify_segment(text, embedder, scaler, classifiers, scorers, thresholds):
|
| 424 |
-
"""Classify a segment of text"""
|
| 425 |
-
categories = list(CATEGORIES.keys())
|
| 426 |
-
category_results = {}
|
| 427 |
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
features_scaled = scaler.transform([features])
|
| 431 |
-
|
| 432 |
-
prob = classifiers[cat].predict_proba(features_scaled)[0, 1] if hasattr(classifiers[cat], 'predict_proba') else 0
|
| 433 |
-
category_results[cat] = prob
|
| 434 |
-
|
| 435 |
-
best_category = max(category_results, key=category_results.get)
|
| 436 |
-
best_prob = category_results[best_category]
|
| 437 |
|
| 438 |
-
|
| 439 |
-
|
|
|
|
| 440 |
features_scaled = scaler.transform([features])
|
| 441 |
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
score = int(np.clip(np.round(score_normalized * 4), 1, 4))
|
| 445 |
-
except:
|
| 446 |
-
score = 2
|
| 447 |
|
| 448 |
-
|
| 449 |
-
'
|
| 450 |
-
'
|
| 451 |
-
'confidence':
|
| 452 |
-
'text':
|
| 453 |
-
}
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
'category': 'Unclassified',
|
| 457 |
-
'score': None,
|
| 458 |
-
'confidence': 0,
|
| 459 |
-
'text': text
|
| 460 |
-
}
|
| 461 |
|
| 462 |
-
|
| 463 |
-
"""Analyze complete personal statement"""
|
| 464 |
-
segments = segment_text(text, embedder)
|
| 465 |
-
|
| 466 |
-
segment_results = []
|
| 467 |
-
for i, segment in enumerate(segments):
|
| 468 |
-
result = classify_segment(segment, embedder, scaler, classifiers, scorers, thresholds)
|
| 469 |
-
result['segment_num'] = i + 1
|
| 470 |
-
segment_results.append(result)
|
| 471 |
-
|
| 472 |
-
category_results = {}
|
| 473 |
-
for cat in CATEGORIES.keys():
|
| 474 |
-
cat_segments = [r for r in segment_results if r['category'] == cat]
|
| 475 |
-
if cat_segments:
|
| 476 |
-
scores = [s['score'] for s in cat_segments]
|
| 477 |
-
avg_score = np.mean(scores)
|
| 478 |
-
max_confidence = max([s['confidence'] for s in cat_segments])
|
| 479 |
-
|
| 480 |
-
category_results[cat] = {
|
| 481 |
-
'detected': True,
|
| 482 |
-
'score': int(np.round(avg_score)),
|
| 483 |
-
'confidence': max_confidence,
|
| 484 |
-
'num_segments': len(cat_segments)
|
| 485 |
-
}
|
| 486 |
-
else:
|
| 487 |
-
category_results[cat] = {
|
| 488 |
-
'detected': False,
|
| 489 |
-
'score': None,
|
| 490 |
-
'confidence': 0,
|
| 491 |
-
'num_segments': 0
|
| 492 |
-
}
|
| 493 |
-
|
| 494 |
-
return segment_results, category_results
|
| 495 |
-
|
| 496 |
-
# Main UI Code
|
| 497 |
st.title("🏥 Medical School Personal Statement Analyzer")
|
| 498 |
-
st.markdown("
|
| 499 |
-
st.markdown("---")
|
| 500 |
|
| 501 |
-
#
|
| 502 |
-
|
| 503 |
-
st.
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
- **Doctor Qualities**: Leadership and character traits
|
| 509 |
-
- **Spin**: Connecting experiences to medical career
|
| 510 |
-
|
| 511 |
-
Each category is scored 1-4 (Poor to Excellent)
|
| 512 |
-
""")
|
| 513 |
|
| 514 |
-
#
|
| 515 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
|
| 517 |
-
# Train Model Tab
|
| 518 |
with tab1:
|
| 519 |
-
st.header("Train the
|
| 520 |
-
|
| 521 |
-
if all(os.path.exists(p) for p in [CLASSIFIER_PATH, SCORER_PATH, SCALER_PATH]):
|
| 522 |
-
st.success("✓ Models already trained. You can analyze statements or retrain.")
|
| 523 |
|
| 524 |
-
st.markdown("Upload
|
| 525 |
|
| 526 |
-
|
| 527 |
-
with col1:
|
| 528 |
-
file1 = st.file_uploader("Training File 1", type=['xlsx'], key="file1")
|
| 529 |
-
with col2:
|
| 530 |
-
file2 = st.file_uploader("Training File 2", type=['xlsx'], key="file2")
|
| 531 |
|
| 532 |
-
if
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 538 |
|
| 539 |
-
if
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
scaler,
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
st.success("
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
|
| 561 |
-
# Analyze Statement Tab
|
| 562 |
with tab2:
|
| 563 |
-
st.header("Analyze Personal Statement")
|
| 564 |
|
| 565 |
-
if not
|
| 566 |
-
st.warning("
|
| 567 |
else:
|
| 568 |
-
|
| 569 |
-
embedder, scaler, classifiers, scorers, thresholds = load_saved_models()
|
| 570 |
|
| 571 |
-
if
|
| 572 |
-
st.
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
|
|
|
|
|
|
|
|
|
| 576 |
|
| 577 |
-
|
| 578 |
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
placeholder="Enter your personal statement..."
|
| 584 |
-
)
|
| 585 |
-
else:
|
| 586 |
-
uploaded_file = st.file_uploader("Upload statement (.txt)", type=['txt'])
|
| 587 |
-
if uploaded_file:
|
| 588 |
-
text_to_analyze = str(uploaded_file.read(), 'utf-8')
|
| 589 |
-
st.success("File uploaded successfully!")
|
| 590 |
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
)
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
with cols[0]:
|
| 607 |
-
st.metric("Categories Found", f"{len(detected)}/4")
|
| 608 |
-
with cols[1]:
|
| 609 |
-
if detected:
|
| 610 |
-
avg_score = np.mean([category_results[cat]['score'] for cat in detected])
|
| 611 |
-
st.metric("Average Score", f"{avg_score:.1f}/4")
|
| 612 |
-
else:
|
| 613 |
-
st.metric("Average Score", "N/A")
|
| 614 |
-
with cols[2]:
|
| 615 |
-
st.metric("Total Segments", len(segment_results))
|
| 616 |
-
with cols[3]:
|
| 617 |
-
quality = "Excellent" if len(detected) == 4 and avg_score >= 3.5 else "Good" if len(detected) >= 3 else "Needs Work"
|
| 618 |
-
st.metric("Overall", quality)
|
| 619 |
-
|
| 620 |
-
# Category breakdown
|
| 621 |
-
st.subheader("📋 Category Analysis")
|
| 622 |
-
for cat in CATEGORIES.keys():
|
| 623 |
-
res = category_results[cat]
|
| 624 |
-
if res['detected']:
|
| 625 |
-
icon = "✅" if res['score'] >= 3 else "⚠️" if res['score'] >= 2 else "❌"
|
| 626 |
-
st.write(f"{icon} **{cat}**: Score {res['score']}/4 (Confidence: {res['confidence']:.1%})")
|
| 627 |
-
else:
|
| 628 |
-
st.write(f"❌ **{cat}**: Not detected")
|
| 629 |
-
|
| 630 |
-
# Segment details
|
| 631 |
-
st.subheader("📝 Segment Details")
|
| 632 |
-
for seg in segment_results:
|
| 633 |
-
with st.expander(f"Segment {seg['segment_num']}: {seg['category']}"):
|
| 634 |
-
st.write(f"**Score:** {seg['score']}/4" if seg['score'] else "N/A")
|
| 635 |
-
st.write(f"**Confidence:** {seg['confidence']:.1%}")
|
| 636 |
-
st.write(f"**Text:** {seg['text'][:300]}...")
|
| 637 |
-
|
| 638 |
-
# Recommendations
|
| 639 |
-
st.subheader("💡 Recommendations")
|
| 640 |
-
missing = [cat for cat, res in category_results.items() if not res['detected']]
|
| 641 |
-
low_score = [cat for cat, res in category_results.items()
|
| 642 |
-
if res['detected'] and res['score'] and res['score'] < 3]
|
| 643 |
-
|
| 644 |
-
if missing:
|
| 645 |
-
st.warning("**Missing Categories:**")
|
| 646 |
-
for cat in missing:
|
| 647 |
-
st.write(f"• Add content for **{cat}**: {CATEGORIES[cat]['description']}")
|
| 648 |
-
|
| 649 |
-
if low_score:
|
| 650 |
-
st.info("**Areas to Improve:**")
|
| 651 |
-
for cat in low_score:
|
| 652 |
-
st.write(f"• Strengthen **{cat}** (current score: {category_results[cat]['score']}/4)")
|
| 653 |
-
|
| 654 |
-
if not missing and not low_score:
|
| 655 |
-
st.success("Excellent work! All categories present with good scores.")
|
| 656 |
|
| 657 |
-
# View Rubrics Tab
|
| 658 |
with tab3:
|
| 659 |
st.header("Scoring Rubrics")
|
| 660 |
|
| 661 |
for category, info in CATEGORIES.items():
|
| 662 |
-
with st.expander(
|
| 663 |
st.write(f"**Description:** {info['description']}")
|
| 664 |
-
st.write("**Scoring
|
| 665 |
for score in [4, 3, 2, 1]:
|
| 666 |
-
st.write(f"•
|
| 667 |
-
st.write(f"**
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
import pickle
|
| 5 |
import os
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
|
|
|
| 7 |
from sklearn.model_selection import train_test_split
|
| 8 |
from sklearn.preprocessing import StandardScaler
|
| 9 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 10 |
from sklearn.ensemble import RandomForestClassifier
|
| 11 |
+
from sklearn.dummy import DummyRegressor
|
| 12 |
import xgboost as xgb
|
| 13 |
import re
|
| 14 |
import warnings
|
|
|
|
|
|
|
|
|
|
| 15 |
warnings.filterwarnings('ignore')
|
| 16 |
|
| 17 |
+
# Initialize Streamlit - MUST BE AT THE TOP
|
| 18 |
st.set_page_config(
|
| 19 |
page_title="Medical School Personal Statement Analyzer",
|
| 20 |
page_icon="🏥",
|
| 21 |
+
layout="wide"
|
|
|
|
| 22 |
)
|
| 23 |
|
| 24 |
+
# Categories definition: rubric metadata for the four statement categories.
# Per category:
#   description     - one-line summary (shown in the UI and rubric tab)
#   keywords        - substrings counted by extract_features (keyword density)
#   patterns        - regexes for this category (retained for display/feature
#                     use; NOTE(review): the current extract_features only
#                     reads 'keywords' -- confirm 'patterns' is still needed)
#   rubric          - 1-4 score descriptions rendered in the rubric tab
#   rubric_features - positive/negative indicator words for this rubric
CATEGORIES = {
    'Spark': {
        'description': 'Opening that spurs interest in medicine',
        'keywords': ['growing up', 'childhood', 'family', 'realized', 'inspired', 'first',
                     'beginning', 'early', 'experience that', 'moment', 'when I was'],
        'patterns': [r'when I was \d+', r'at age \d+', r'since I was', r'as a child'],
        'rubric': {
            1: 'disconnected or confusing',
            2: 'somewhat connected but unclear',
            3: 'connected and clear',
            4: 'engaging and logical flow'
        },
        'rubric_features': {
            'positive': ['engaging', 'logical', 'clear', 'compelling', 'authentic'],
            'negative': ['disconnected', 'confusing', 'random', 'unclear', 'generic']
        }
    },
    'Healthcare Experience': {
        'description': 'Clinical/medical experiences',
        'keywords': ['shadowed', 'clinical', 'hospital', 'patient', 'doctor', 'physician',
                     'medical', 'treatment', 'observed', 'volunteer', 'clinic'],
        'patterns': [r'\d+ hours', r'volunteered at', r'shadowing', r'clinical experience'],
        'rubric': {
            1: 'passive, uninteresting, negative',
            2: 'bland but not problematic',
            3: 'interesting and relevant',
            4: 'vivid, active, thoughtful, memorable'
        },
        'rubric_features': {
            'positive': ['vivid', 'active', 'thoughtful', 'memorable', 'optimistic'],
            'negative': ['passive', 'uninteresting', 'irrelevant', 'problematic']
        }
    },
    'Showing Doctor Qualities': {
        'description': 'Leadership and doctor qualities',
        'keywords': ['leadership', 'empathy', 'compassion', 'responsibility', 'communication',
                     'advocate', 'caring', 'helping', 'service', 'volunteer'],
        'patterns': [r'as (president|leader|captain)', r'I organized', r'I founded'],
        'rubric': {
            1: 'arrogant, immature, inaccurate',
            2: 'bland but not problematic',
            3: 'shows some understanding',
            4: 'realistic, mature, humble, clear'
        },
        'rubric_features': {
            'positive': ['realistic', 'self-aware', 'mature', 'humble', 'specific'],
            'negative': ['arrogant', 'immature', 'overly confident', 'simplistic']
        }
    },
    'Spin': {
        'description': 'Connecting experiences to medical career',
        'keywords': ['learned', 'taught me', 'showed me', 'realized', 'understood',
                     'because', 'therefore', 'this experience', 'prepared me'],
        'patterns': [r'this .+ taught me', r'I learned that', r'prepared me for'],
        'rubric': {
            1: 'vague, simplistic, generic',
            2: 'some connection but generic',
            3: 'clear connection',
            4: 'direct, logical, specific argument'
        },
        'rubric_features': {
            'positive': ['direct', 'logical', 'specific', 'clear argument'],
            'negative': ['brief', 'vague', 'simplistic', 'generic']
        }
    }
}
|
| 91 |
|
| 92 |
# Model paths
|
| 93 |
MODEL_DIR = "trained_models"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
+
# Helper functions
|
| 96 |
@st.cache_resource
|
| 97 |
+
def load_transformer():
|
|
|
|
| 98 |
try:
|
| 99 |
+
return SentenceTransformer('all-MiniLM-L6-v2')
|
|
|
|
| 100 |
except:
|
| 101 |
+
return None
|
|
|
|
| 102 |
|
| 103 |
+
def extract_features(text, embedder):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
features = []
|
| 105 |
text_lower = text.lower()
|
| 106 |
words = text.split()
|
| 107 |
|
| 108 |
+
# Basic stats
|
| 109 |
features.extend([
|
| 110 |
len(text),
|
| 111 |
len(words),
|
| 112 |
+
len(set(words)) / max(len(words), 1)
|
|
|
|
|
|
|
| 113 |
])
|
| 114 |
|
| 115 |
+
# Category features
|
| 116 |
for cat_name, cat_info in CATEGORIES.items():
|
| 117 |
+
keyword_count = sum(1 for kw in cat_info['keywords'] if kw.lower() in text_lower)
|
| 118 |
+
features.append(keyword_count / len(cat_info['keywords']))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
+
# Get embedding
|
| 121 |
try:
|
| 122 |
+
embedding = embedder.encode(text)
|
| 123 |
if hasattr(embedding, 'cpu'):
|
| 124 |
embedding = embedding.cpu().numpy()
|
| 125 |
+
embedding = embedding.flatten()[:128] # Reduced size
|
| 126 |
except:
|
| 127 |
+
embedding = np.zeros(128)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
+
return np.concatenate([features, embedding])
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
+
def train_simple_model(df, embedder):
|
| 132 |
+
X = []
|
| 133 |
+
y_labels = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
for _, row in df.iterrows():
|
| 136 |
+
if 'text' in row:
|
| 137 |
+
text = str(row['text'])
|
| 138 |
+
features = extract_features(text, embedder)
|
| 139 |
+
X.append(features)
|
| 140 |
+
|
| 141 |
+
# Find category
|
| 142 |
+
label = 'Unknown'
|
| 143 |
+
for cat in CATEGORIES.keys():
|
| 144 |
+
if f"Code: {cat} Applied" in row:
|
| 145 |
+
if row[f"Code: {cat} Applied"] in [True, 1, '1', 'true', 'True']:
|
| 146 |
+
label = cat
|
| 147 |
+
break
|
| 148 |
+
y_labels.append(label)
|
| 149 |
+
|
| 150 |
+
X = np.array(X)
|
| 151 |
+
|
| 152 |
+
# Train classifier
|
| 153 |
scaler = StandardScaler()
|
| 154 |
+
X_scaled = scaler.fit_transform(X)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
+
clf = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
|
| 157 |
+
clf.fit(X_scaled, y_labels)
|
| 158 |
|
| 159 |
+
return scaler, clf
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
+
def analyze_text(text, embedder, scaler, clf):
    """Classify each paragraph of a personal statement into a rubric category.

    Returns a list of dicts, one per segment, with keys:
    'segment' (1-based index), 'category' (predicted label),
    'confidence' (top class probability), and 'text' (preview, <=200 chars).
    """
    # Segment on blank lines and drop fragments too short to classify reliably.
    segments = [chunk.strip() for chunk in text.split('\n\n')]
    segments = [seg for seg in segments if len(seg) > 50]

    # Fall back to treating the whole statement as a single segment.
    if not segments:
        segments = [text]

    results = []
    for idx, segment in enumerate(segments, start=1):
        vec = extract_features(segment, embedder)
        vec_scaled = scaler.transform([vec])

        category = clf.predict(vec_scaled)[0]
        confidence = max(clf.predict_proba(vec_scaled)[0])

        preview = segment[:200] + '...' if len(segment) > 200 else segment
        results.append({
            'segment': idx,
            'category': category,
            'confidence': confidence,
            'text': preview,
        })

    return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
+
# ---------------------------------------------------------------------------
# MAIN APP: page header, session-state defaults, embedder load, tab layout
# ---------------------------------------------------------------------------
st.title("🏥 Medical School Personal Statement Analyzer")
st.markdown("Analyze personal statements based on medical school rubrics")

# Seed session state on first run so later tabs can read these keys safely.
_SESSION_DEFAULTS = {'model_trained': False, 'scaler': None, 'clf': None}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default

# Load the sentence-transformer embedder; the app cannot run without it.
embedder = load_transformer()
if embedder is None:
    st.error("Failed to load model. Please refresh the page.")
    st.stop()

# Three workflow tabs: training, analysis, rubric reference.
tab1, tab2, tab3 = st.tabs(["Train Model", "Analyze Statement", "View Rubrics"])
|
| 206 |
|
|
|
|
| 207 |
with tab1:
    st.header("Step 1: Train the Model")

    st.markdown("Upload Excel files with coded personal statement excerpts")

    uploaded_file = st.file_uploader("Upload Training Data", type=['xlsx', 'csv'])

    if uploaded_file:
        try:
            # Dispatch on the uploaded file's extension: CSV vs. Excel.
            reader = pd.read_csv if uploaded_file.name.endswith('.csv') else pd.read_excel
            df = reader(uploaded_file)

            st.success(f"Loaded {len(df)} rows")

            # Keep rows whose text appears in any known excerpt column,
            # carrying the "Code: ..." coding columns along for labeling.
            processed_data = []
            for _, row in df.iterrows():
                text_col = next(
                    (c for c in ['Excerpt Copy', 'Excerpt', 'Text', 'Content']
                     if c in row and pd.notna(row[c])),
                    None,
                )
                if text_col:
                    record = {'text': str(row[text_col])}
                    record.update({c: row[c] for c in row.index if 'Code:' in c})
                    processed_data.append(record)

            if processed_data:
                train_df = pd.DataFrame(processed_data)

                if st.button("Train Model"):
                    with st.spinner("Training..."):
                        scaler, clf = train_simple_model(train_df, embedder)
                        st.session_state.scaler = scaler
                        st.session_state.clf = clf
                        st.session_state.model_trained = True
                        st.success("Model trained successfully!")
            else:
                st.error("No valid text data found")

        except Exception as e:
            st.error(f"Error: {str(e)}")
|
| 253 |
|
|
|
|
| 254 |
with tab2:
    st.header("Step 2: Analyze Personal Statement")

    if not st.session_state.model_trained:
        st.warning("Please train the model first in Step 1")
    else:
        text_input = st.text_area("Paste your personal statement:", height=300)

        if text_input and st.button("Analyze"):
            with st.spinner("Analyzing..."):
                results = analyze_text(
                    text_input,
                    embedder,
                    st.session_state.scaler,
                    st.session_state.clf
                )

            st.success("Analysis Complete!")

            # Summary: how many distinct rubric categories were detected.
            st.subheader("Summary")
            categories_found = list(set(r['category'] for r in results if r['category'] != 'Unknown'))
            # BUGFIX: denominator was hard-coded to 4; derive it from the
            # rubric dict so the metric stays correct if CATEGORIES changes.
            st.metric("Categories Found", f"{len(categories_found)}/{len(CATEGORIES)}")

            # Per-segment breakdown with classifier confidence.
            st.subheader("Segment Analysis")
            for result in results:
                with st.expander(f"Segment {result['segment']}: {result['category']}"):
                    st.write(f"**Confidence:** {result['confidence']:.1%}")
                    st.write(f"**Text:** {result['text']}")

            # Suggest rubric categories the statement never touched.
            st.subheader("Recommendations")
            missing = [cat for cat in CATEGORIES if cat not in categories_found]
            if missing:
                st.warning("Missing categories:")
                for cat in missing:
                    st.write(f"• Add {cat}: {CATEGORIES[cat]['description']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
|
|
|
|
| 293 |
with tab3:
    st.header("Scoring Rubrics")

    # One collapsible panel per rubric category, scores shown best-first.
    for category, info in CATEGORIES.items():
        with st.expander(category):
            st.write(f"**Description:** {info['description']}")
            st.write("**Scoring:**")
            rubric = info['rubric']
            for score in (4, 3, 2, 1):
                st.write(f"• Score {score}: {rubric[score]}")
            top_keywords = ', '.join(info['keywords'][:5])
            st.write(f"**Keywords:** {top_keywords}...")
|