stevafernandes committed on
Commit
6ddadb4
·
verified ·
1 Parent(s): e96c629

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +185 -550
app.py CHANGED
@@ -3,665 +3,300 @@ import pandas as pd
3
  import numpy as np
4
  import pickle
5
  import os
6
- import torch
7
- from sentence_transformers import SentenceTransformer, util
8
  from sklearn.model_selection import train_test_split
9
  from sklearn.preprocessing import StandardScaler
10
  from sklearn.metrics.pairwise import cosine_similarity
11
  from sklearn.ensemble import RandomForestClassifier
 
12
  import xgboost as xgb
13
  import re
14
  import warnings
15
- from datetime import datetime
16
- import base64
17
- from io import BytesIO
18
  warnings.filterwarnings('ignore')
19
 
20
- # Set page config
21
  st.set_page_config(
22
  page_title="Medical School Personal Statement Analyzer",
23
  page_icon="🏥",
24
- layout="wide",
25
- initial_sidebar_state="expanded"
26
  )
27
 
28
- # Categories with detailed rubric alignment
29
  CATEGORIES = {
30
  'Spark': {
31
- 'description': 'Opening that spurs interest in medicine (typically in opening paragraph)',
32
  'keywords': ['growing up', 'childhood', 'family', 'realized', 'inspired', 'first',
33
- 'beginning', 'early', 'experience that', 'moment', 'when I was',
34
- 'journey began', 'sparked my interest', 'drew me to medicine',
35
- 'passion for medicine', 'calling', 'fascinated', 'curiosity'],
36
- 'patterns': [
37
- r'when I was \d+', r'at age \d+', r'since I was', r'as a child',
38
- r'early in my life', r'growing up', r'my journey to medicine'
39
- ],
40
  'rubric': {
41
- 1: 'disconnected from being a doctor or confusing/random',
42
  2: 'somewhat connected but unclear',
43
  3: 'connected and clear',
44
- 4: 'engaging and logically flows into becoming a doctor'
45
  },
46
  'rubric_features': {
47
- 'positive': ['engaging', 'logical', 'clear connection', 'compelling', 'authentic'],
48
  'negative': ['disconnected', 'confusing', 'random', 'unclear', 'generic']
49
  }
50
  },
51
  'Healthcare Experience': {
52
- 'description': 'Watching/participating in healthcare - medical professional at work',
53
  'keywords': ['shadowed', 'clinical', 'hospital', 'patient', 'doctor', 'physician',
54
- 'medical', 'treatment', 'observed', 'volunteer', 'clinic', 'rounds',
55
- 'surgery', 'emergency', 'ICU', 'residency', 'internship', 'scrubs',
56
- 'stethoscope', 'diagnosis', 'prognosis', 'bedside', 'ward', 'unit',
57
- 'healthcare', 'care team', 'medical team', 'attending', 'resident'],
58
- 'patterns': [
59
- r'\d+ hours', r'volunteered at', r'shadowing', r'clinical experience',
60
- r'medical mission', r'worked in .+ hospital', r'during my rotation'
61
- ],
62
  'rubric': {
63
- 1: 'passive observation, uninteresting, irrelevant, problematic, negative tone',
64
- 2: 'bland/boring but not problematic',
65
  3: 'interesting and relevant',
66
- 4: 'vivid, active, thoughtful, relevant, memorable, positive and optimistic'
67
  },
68
  'rubric_features': {
69
- 'positive': ['vivid', 'active', 'thoughtful', 'memorable', 'optimistic', 'engaged'],
70
- 'negative': ['passive', 'uninteresting', 'irrelevant', 'problematic', 'pessimistic']
71
  }
72
  },
73
  'Showing Doctor Qualities': {
74
- 'description': 'Stories/examples portraying vision of doctor role and appealing aspects',
75
  'keywords': ['leadership', 'empathy', 'compassion', 'responsibility', 'communication',
76
- 'advocate', 'caring', 'helping', 'service', 'volunteer', 'president',
77
- 'led', 'organized', 'taught', 'mentored', 'integrity', 'ethical',
78
- 'professional', 'dedication', 'perseverance', 'resilience', 'humble',
79
- 'self-aware', 'mature', 'understanding', 'patient-centered', 'holistic'],
80
- 'patterns': [
81
- r'as (president|leader|captain)', r'I organized', r'I founded',
82
- r'demonstrated .+ leadership', r'showed .+ compassion'
83
- ],
84
  'rubric': {
85
- 1: 'arrogant, immature, overly confident, inaccurate understanding, negative tone',
86
- 2: 'bland/boring but not problematic',
87
  3: 'shows some understanding',
88
- 4: 'realistic, self-aware, mature, humble, specific and clear understanding, positive'
89
  },
90
  'rubric_features': {
91
- 'positive': ['realistic', 'self-aware', 'mature', 'humble', 'specific', 'clear'],
92
- 'negative': ['arrogant', 'immature', 'overly confident', 'simplistic', 'inaccurate']
93
  }
94
  },
95
  'Spin': {
96
- 'description': 'Explaining why experiences qualify them to be a doctor',
97
  'keywords': ['learned', 'taught me', 'showed me', 'realized', 'understood',
98
- 'because', 'therefore', 'this experience', 'through this',
99
- 'as a doctor', 'future physician', 'will help me', 'prepared me',
100
- 'equipped me', 'qualified', 'ready', 'capable', 'competent',
101
- 'skills necessary', 'attributes required', 'prepared for'],
102
- 'patterns': [
103
- r'this .+ taught me', r'I learned that', r'prepared me for',
104
- r'qualified me to', r'because of this', r'therefore I'
105
- ],
106
  'rubric': {
107
- 1: 'brief, vague, simplistic connection to being a doctor, generic',
108
  2: 'some connection but generic',
109
  3: 'clear connection',
110
- 4: 'direct, logical, and specific argument connecting experience to profession'
111
  },
112
  'rubric_features': {
113
- 'positive': ['direct', 'logical', 'specific', 'clear argument', 'compelling connection'],
114
- 'negative': ['brief', 'vague', 'simplistic', 'generic', 'weak connection']
115
  }
116
  }
117
  }
118
 
119
  # Model paths
120
  MODEL_DIR = "trained_models"
121
- EMBEDDER_PATH = os.path.join(MODEL_DIR, "embedder_name.txt")
122
- CLASSIFIER_PATH = os.path.join(MODEL_DIR, "classifier.pkl")
123
- SCORER_PATH = os.path.join(MODEL_DIR, "scorer.pkl")
124
- SCALER_PATH = os.path.join(MODEL_DIR, "scaler.pkl")
125
- THRESHOLD_PATH = os.path.join(MODEL_DIR, "thresholds.pkl")
126
 
 
127
  @st.cache_resource
128
- def load_sentence_transformer():
129
- """Load sentence transformer model"""
130
  try:
131
- model = SentenceTransformer('all-MiniLM-L6-v2')
132
- return model, 'all-MiniLM-L6-v2'
133
  except:
134
- st.error("Failed to load sentence transformer model")
135
- return None, None
136
 
137
- def segment_text(text, embedder):
138
- """Segment text into meaningful chunks"""
139
- paragraphs = re.split(r'\n\s*\n', text)
140
- paragraphs = [p.strip() for p in paragraphs if p.strip() and len(p.strip()) > 50]
141
-
142
- if len(paragraphs) <= 1:
143
- sentences = re.split(r'(?<=[.!?])\s+', text)
144
- sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
145
-
146
- if len(sentences) < 3:
147
- return [text]
148
-
149
- segments = []
150
- current_segment = []
151
- for sent in sentences:
152
- current_segment.append(sent)
153
- if len(' '.join(current_segment)) > 300:
154
- segments.append(' '.join(current_segment))
155
- current_segment = []
156
- if current_segment:
157
- segments.append(' '.join(current_segment))
158
- return segments
159
-
160
- return paragraphs
161
-
162
- def extract_features(text, embedder, category_focus=None):
163
- """Extract features for classification"""
164
  features = []
165
  text_lower = text.lower()
166
  words = text.split()
167
 
168
- # Basic text statistics
169
  features.extend([
170
  len(text),
171
  len(words),
172
- len(set(words)) / max(len(words), 1),
173
- len(re.findall(r'[.!?]', text)),
174
- text.count('I') / max(len(words), 1),
175
  ])
176
 
177
- # Process all categories
178
  for cat_name, cat_info in CATEGORIES.items():
179
- keywords = cat_info['keywords']
180
- keyword_matches = sum(1 for kw in keywords if kw.lower() in text_lower)
181
- keyword_density = keyword_matches / max(len(keywords), 1)
182
-
183
- if category_focus == cat_name:
184
- keyword_density *= 2
185
-
186
- features.append(keyword_density * 10)
187
-
188
- pattern_matches = 0
189
- for pattern in cat_info.get('patterns', []):
190
- matches = re.findall(pattern, text_lower)
191
- pattern_matches += len(matches)
192
- features.append(pattern_matches)
193
-
194
- positive_count = sum(1 for word in cat_info['rubric_features']['positive']
195
- if word in text_lower)
196
- negative_count = sum(1 for word in cat_info['rubric_features']['negative']
197
- if word in text_lower)
198
-
199
- features.extend([
200
- positive_count / max(len(words), 1) * 100,
201
- negative_count / max(len(words), 1) * 100
202
- ])
203
 
204
- # Get embeddings
205
  try:
206
- embedding = embedder.encode(text, convert_to_tensor=False)
207
  if hasattr(embedding, 'cpu'):
208
  embedding = embedding.cpu().numpy()
209
- embedding = embedding.flatten()[:256] # Limit size
210
  except:
211
- embedding = np.zeros(256)
212
-
213
- # Category similarity
214
- if category_focus and category_focus in CATEGORIES:
215
- category_text = f"{CATEGORIES[category_focus]['description']} {' '.join(CATEGORIES[category_focus]['keywords'][:10])}"
216
- try:
217
- category_embedding = embedder.encode(category_text)
218
- if hasattr(category_embedding, 'cpu'):
219
- category_embedding = category_embedding.cpu().numpy()
220
- category_embedding = category_embedding.flatten()
221
- similarity = cosine_similarity([embedding], [category_embedding[:256]])[0][0]
222
- features.append(similarity * 10)
223
- except:
224
- features.append(0)
225
- else:
226
- features.append(0)
227
 
228
- features = np.array(features, dtype=np.float32)
229
- combined_features = np.concatenate([features, embedding])
230
-
231
- return combined_features
232
 
233
- def load_training_data(file1, file2):
234
- """Load and combine training data from Excel files"""
235
- try:
236
- df1 = pd.read_excel(file1)
237
- df2 = pd.read_excel(file2)
238
- except Exception as e:
239
- st.error(f"Error reading Excel files: {str(e)}")
240
- return pd.DataFrame()
241
-
242
- combined_df = pd.concat([df1, df2], ignore_index=True)
243
- processed_data = []
244
 
245
- for _, row in combined_df.iterrows():
246
- text = None
247
- for col_name in ['Excerpt Copy', 'Excerpt', 'Text', 'Content']:
248
- if col_name in row and pd.notna(row[col_name]):
249
- text = str(row[col_name])
250
- break
251
-
252
- if not text or text.strip() == '':
253
- continue
254
-
255
- data_point = {'text': text.strip()}
256
-
257
- for category in CATEGORIES.keys():
258
- col_applied = f"Code: {category} Applied"
259
- col_weight = f"Code: {category} Weight"
260
-
261
- is_applied = False
262
- if col_applied in row:
263
- applied_val = str(row[col_applied]).lower()
264
- is_applied = applied_val in ['true', '1', 'yes', 't']
265
-
266
- data_point[f"{category}_applied"] = is_applied
267
-
268
- if is_applied and col_weight in row:
269
- weight = row[col_weight]
270
- if pd.isna(weight) or weight == '':
271
- weight = 2
272
- else:
273
- try:
274
- weight = int(float(weight))
275
- weight = max(1, min(4, weight))
276
- except:
277
- weight = 2
278
- else:
279
- weight = 0
280
-
281
- data_point[f"{category}_score"] = weight
282
-
283
- processed_data.append(data_point)
284
-
285
- return pd.DataFrame(processed_data)
286
-
287
- def train_models(df, embedder):
288
- """Train classification and scoring models"""
289
- all_features = []
290
-
291
- progress_bar = st.progress(0)
292
- status_text = st.empty()
293
-
294
- status_text.text("Extracting features from training data...")
295
-
296
- for idx, row in df.iterrows():
297
- text = row['text']
298
-
299
- category_features = {}
300
- for cat in CATEGORIES.keys():
301
- features = extract_features(text, embedder, category_focus=cat)
302
- category_features[cat] = features
303
-
304
- true_categories = [cat for cat in CATEGORIES.keys() if row[f"{cat}_applied"]]
305
-
306
- if true_categories:
307
- features = category_features[true_categories[0]]
308
- else:
309
- features = np.mean(list(category_features.values()), axis=0)
310
-
311
- all_features.append(features)
312
- progress_bar.progress((idx + 1) / len(df))
313
-
314
- X = np.array(all_features)
315
-
316
- categories = list(CATEGORIES.keys())
317
- y_class = df[[f"{cat}_applied" for cat in categories]].values.astype(float)
318
-
319
- y_score = []
320
  for _, row in df.iterrows():
321
- scores = []
322
- for cat in categories:
323
- if row[f"{cat}_applied"]:
324
- scores.append(row[f"{cat}_score"] / 4.0)
325
- else:
326
- scores.append(0)
327
- y_score.append(scores)
328
- y_score = np.array(y_score)
329
-
330
- status_text.text("Training models...")
331
-
332
- # Split data
333
- X_train, X_test, y_class_train, y_class_test, y_score_train, y_score_test = train_test_split(
334
- X, y_class, y_score, test_size=0.2, random_state=42
335
- )
336
-
337
- # Scale features
338
  scaler = StandardScaler()
339
- X_train_scaled = scaler.fit_transform(X_train)
340
- X_test_scaled = scaler.transform(X_test)
341
-
342
- # Train classifiers
343
- classifiers = {}
344
- scorers = {}
345
- thresholds = {}
346
-
347
- for i, cat in enumerate(categories):
348
- # Train classifier
349
- clf = RandomForestClassifier(
350
- n_estimators=100,
351
- max_depth=6,
352
- class_weight='balanced',
353
- random_state=42
354
- )
355
- clf.fit(X_train_scaled, y_class_train[:, i])
356
- classifiers[cat] = clf
357
-
358
- # Train scorer
359
- mask = y_class_train[:, i] == 1
360
- if np.sum(mask) > 5:
361
- scorer = xgb.XGBRegressor(
362
- n_estimators=100,
363
- max_depth=4,
364
- random_state=42
365
- )
366
- scorer.fit(X_train_scaled[mask], y_score_train[mask, i])
367
- else:
368
- from sklearn.dummy import DummyRegressor
369
- scorer = DummyRegressor(strategy='constant', constant=0.5)
370
- scorer.fit(X_train_scaled, y_score_train[:, i])
371
-
372
- scorers[cat] = scorer
373
- thresholds[cat] = 0.5
374
-
375
- status_text.empty()
376
- progress_bar.empty()
377
-
378
- return scaler, classifiers, scorers, thresholds
379
-
380
- def save_models(embedder_name, scaler, classifiers, scorers, thresholds):
381
- """Save all trained models"""
382
- os.makedirs(MODEL_DIR, exist_ok=True)
383
 
384
- with open(EMBEDDER_PATH, 'w') as f:
385
- f.write(embedder_name)
386
 
387
- with open(SCALER_PATH, 'wb') as f:
388
- pickle.dump(scaler, f)
389
-
390
- with open(CLASSIFIER_PATH, 'wb') as f:
391
- pickle.dump(classifiers, f)
392
-
393
- with open(SCORER_PATH, 'wb') as f:
394
- pickle.dump(scorers, f)
395
-
396
- with open(THRESHOLD_PATH, 'wb') as f:
397
- pickle.dump(thresholds, f)
398
 
399
- def load_saved_models():
400
- """Load all saved models"""
401
- try:
402
- with open(EMBEDDER_PATH, 'r') as f:
403
- embedder_name = f.read().strip()
404
-
405
- embedder = SentenceTransformer(embedder_name)
406
-
407
- with open(SCALER_PATH, 'rb') as f:
408
- scaler = pickle.load(f)
409
-
410
- with open(CLASSIFIER_PATH, 'rb') as f:
411
- classifiers = pickle.load(f)
412
-
413
- with open(SCORER_PATH, 'rb') as f:
414
- scorers = pickle.load(f)
415
-
416
- with open(THRESHOLD_PATH, 'rb') as f:
417
- thresholds = pickle.load(f)
418
-
419
- return embedder, scaler, classifiers, scorers, thresholds
420
- except:
421
- return None, None, None, None, None
422
-
423
- def classify_segment(text, embedder, scaler, classifiers, scorers, thresholds):
424
- """Classify a segment of text"""
425
- categories = list(CATEGORIES.keys())
426
- category_results = {}
427
 
428
- for cat in categories:
429
- features = extract_features(text, embedder, category_focus=cat)
430
- features_scaled = scaler.transform([features])
431
-
432
- prob = classifiers[cat].predict_proba(features_scaled)[0, 1] if hasattr(classifiers[cat], 'predict_proba') else 0
433
- category_results[cat] = prob
434
-
435
- best_category = max(category_results, key=category_results.get)
436
- best_prob = category_results[best_category]
437
 
438
- if best_prob > thresholds.get(best_category, 0.5):
439
- features = extract_features(text, embedder, category_focus=best_category)
 
440
  features_scaled = scaler.transform([features])
441
 
442
- try:
443
- score_normalized = scorers[best_category].predict(features_scaled)[0]
444
- score = int(np.clip(np.round(score_normalized * 4), 1, 4))
445
- except:
446
- score = 2
447
 
448
- return {
449
- 'category': best_category,
450
- 'score': score,
451
- 'confidence': float(best_prob),
452
- 'text': text
453
- }
454
- else:
455
- return {
456
- 'category': 'Unclassified',
457
- 'score': None,
458
- 'confidence': 0,
459
- 'text': text
460
- }
461
 
462
- def analyze_statement(text, embedder, scaler, classifiers, scorers, thresholds):
463
- """Analyze complete personal statement"""
464
- segments = segment_text(text, embedder)
465
-
466
- segment_results = []
467
- for i, segment in enumerate(segments):
468
- result = classify_segment(segment, embedder, scaler, classifiers, scorers, thresholds)
469
- result['segment_num'] = i + 1
470
- segment_results.append(result)
471
-
472
- category_results = {}
473
- for cat in CATEGORIES.keys():
474
- cat_segments = [r for r in segment_results if r['category'] == cat]
475
- if cat_segments:
476
- scores = [s['score'] for s in cat_segments]
477
- avg_score = np.mean(scores)
478
- max_confidence = max([s['confidence'] for s in cat_segments])
479
-
480
- category_results[cat] = {
481
- 'detected': True,
482
- 'score': int(np.round(avg_score)),
483
- 'confidence': max_confidence,
484
- 'num_segments': len(cat_segments)
485
- }
486
- else:
487
- category_results[cat] = {
488
- 'detected': False,
489
- 'score': None,
490
- 'confidence': 0,
491
- 'num_segments': 0
492
- }
493
-
494
- return segment_results, category_results
495
-
496
- # Main UI Code
497
  st.title("🏥 Medical School Personal Statement Analyzer")
498
- st.markdown("*AI-powered analysis based on medical school admission rubrics*")
499
- st.markdown("---")
500
 
501
- # Sidebar
502
- with st.sidebar:
503
- st.header("ℹ️ About")
504
- st.markdown("""
505
- This tool analyzes personal statements based on 4 key categories:
506
- - **Spark**: Opening that shows interest in medicine
507
- - **Healthcare Experience**: Clinical/medical experiences
508
- - **Doctor Qualities**: Leadership and character traits
509
- - **Spin**: Connecting experiences to medical career
510
-
511
- Each category is scored 1-4 (Poor to Excellent)
512
- """)
513
 
514
- # Create tabs
515
- tab1, tab2, tab3 = st.tabs(["📚 Train Model", "📝 Analyze Statement", "📊 View Rubrics"])
 
 
 
 
 
 
516
 
517
- # Train Model Tab
518
  with tab1:
519
- st.header("Train the AI Model")
520
-
521
- if all(os.path.exists(p) for p in [CLASSIFIER_PATH, SCORER_PATH, SCALER_PATH]):
522
- st.success("✓ Models already trained. You can analyze statements or retrain.")
523
 
524
- st.markdown("Upload training data files (Excel format with coded excerpts)")
525
 
526
- col1, col2 = st.columns(2)
527
- with col1:
528
- file1 = st.file_uploader("Training File 1", type=['xlsx'], key="file1")
529
- with col2:
530
- file2 = st.file_uploader("Training File 2", type=['xlsx'], key="file2")
531
 
532
- if file1 and file2:
533
- if st.button("Start Training", type="primary"):
534
- try:
535
- # Load data
536
- with st.spinner("Loading training data..."):
537
- df = load_training_data(file1, file2)
 
 
 
 
 
 
 
 
 
 
 
538
 
539
- if df.empty:
540
- st.error("No valid training data found.")
541
- else:
542
- st.success(f"✓ Loaded {len(df)} training samples")
543
-
544
- # Load embedder
545
- with st.spinner("Loading transformer model..."):
546
- embedder, embedder_name = load_sentence_transformer()
547
-
548
- if embedder is not None:
549
- # Train
550
- scaler, classifiers, scorers, thresholds = train_models(df, embedder)
551
-
552
- # Save
553
- save_models(embedder_name, scaler, classifiers, scorers, thresholds)
554
- st.success(" Training complete! Models saved.")
555
- else:
556
- st.error("Failed to load transformer model")
557
-
558
- except Exception as e:
559
- st.error(f"Training failed: {str(e)}")
560
 
561
- # Analyze Statement Tab
562
  with tab2:
563
- st.header("Analyze Personal Statement")
564
 
565
- if not all(os.path.exists(p) for p in [CLASSIFIER_PATH, SCORER_PATH, SCALER_PATH]):
566
- st.warning("⚠️ Please train the model first (Tab 1)")
567
  else:
568
- # Load models
569
- embedder, scaler, classifiers, scorers, thresholds = load_saved_models()
570
 
571
- if embedder is None:
572
- st.error("Failed to load models. Please retrain.")
573
- else:
574
- # Input method
575
- input_method = st.radio("Choose input method:", ["Paste Text", "Upload File"])
 
 
 
576
 
577
- text_to_analyze = None
578
 
579
- if input_method == "Paste Text":
580
- text_to_analyze = st.text_area(
581
- "Paste your personal statement here:",
582
- height=300,
583
- placeholder="Enter your personal statement..."
584
- )
585
- else:
586
- uploaded_file = st.file_uploader("Upload statement (.txt)", type=['txt'])
587
- if uploaded_file:
588
- text_to_analyze = str(uploaded_file.read(), 'utf-8')
589
- st.success("File uploaded successfully!")
590
 
591
- if text_to_analyze and st.button("Analyze Statement", type="primary"):
592
- with st.spinner("Analyzing..."):
593
- segment_results, category_results = analyze_statement(
594
- text_to_analyze, embedder, scaler, classifiers, scorers, thresholds
595
- )
596
-
597
- # Display results
598
- st.success("✓ Analysis complete!")
599
-
600
- # Summary
601
- st.subheader("📊 Overall Summary")
602
- cols = st.columns(4)
603
-
604
- detected = [cat for cat, res in category_results.items() if res['detected']]
605
-
606
- with cols[0]:
607
- st.metric("Categories Found", f"{len(detected)}/4")
608
- with cols[1]:
609
- if detected:
610
- avg_score = np.mean([category_results[cat]['score'] for cat in detected])
611
- st.metric("Average Score", f"{avg_score:.1f}/4")
612
- else:
613
- st.metric("Average Score", "N/A")
614
- with cols[2]:
615
- st.metric("Total Segments", len(segment_results))
616
- with cols[3]:
617
- quality = "Excellent" if len(detected) == 4 and avg_score >= 3.5 else "Good" if len(detected) >= 3 else "Needs Work"
618
- st.metric("Overall", quality)
619
-
620
- # Category breakdown
621
- st.subheader("📋 Category Analysis")
622
- for cat in CATEGORIES.keys():
623
- res = category_results[cat]
624
- if res['detected']:
625
- icon = "✅" if res['score'] >= 3 else "⚠️" if res['score'] >= 2 else "❌"
626
- st.write(f"{icon} **{cat}**: Score {res['score']}/4 (Confidence: {res['confidence']:.1%})")
627
- else:
628
- st.write(f"❌ **{cat}**: Not detected")
629
-
630
- # Segment details
631
- st.subheader("📝 Segment Details")
632
- for seg in segment_results:
633
- with st.expander(f"Segment {seg['segment_num']}: {seg['category']}"):
634
- st.write(f"**Score:** {seg['score']}/4" if seg['score'] else "N/A")
635
- st.write(f"**Confidence:** {seg['confidence']:.1%}")
636
- st.write(f"**Text:** {seg['text'][:300]}...")
637
-
638
- # Recommendations
639
- st.subheader("💡 Recommendations")
640
- missing = [cat for cat, res in category_results.items() if not res['detected']]
641
- low_score = [cat for cat, res in category_results.items()
642
- if res['detected'] and res['score'] and res['score'] < 3]
643
-
644
- if missing:
645
- st.warning("**Missing Categories:**")
646
- for cat in missing:
647
- st.write(f"• Add content for **{cat}**: {CATEGORIES[cat]['description']}")
648
-
649
- if low_score:
650
- st.info("**Areas to Improve:**")
651
- for cat in low_score:
652
- st.write(f"• Strengthen **{cat}** (current score: {category_results[cat]['score']}/4)")
653
-
654
- if not missing and not low_score:
655
- st.success("Excellent work! All categories present with good scores.")
656
 
657
- # View Rubrics Tab
658
  with tab3:
659
  st.header("Scoring Rubrics")
660
 
661
  for category, info in CATEGORIES.items():
662
- with st.expander(f"**{category}**"):
663
  st.write(f"**Description:** {info['description']}")
664
- st.write("**Scoring Criteria:**")
665
  for score in [4, 3, 2, 1]:
666
- st.write(f"• **Score {score}:** {info['rubric'][score]}")
667
- st.write(f"**Key Terms:** {', '.join(info['keywords'][:8])}")
 
3
  import numpy as np
4
  import pickle
5
  import os
6
+ from sentence_transformers import SentenceTransformer
 
7
  from sklearn.model_selection import train_test_split
8
  from sklearn.preprocessing import StandardScaler
9
  from sklearn.metrics.pairwise import cosine_similarity
10
  from sklearn.ensemble import RandomForestClassifier
11
+ from sklearn.dummy import DummyRegressor
12
  import xgboost as xgb
13
  import re
14
  import warnings
 
 
 
15
  warnings.filterwarnings('ignore')
16
 
17
+ # Initialize Streamlit - MUST BE AT THE TOP
18
  st.set_page_config(
19
  page_title="Medical School Personal Statement Analyzer",
20
  page_icon="🏥",
21
+ layout="wide"
 
22
  )
23
 
24
+ # Categories definition
25
  CATEGORIES = {
26
  'Spark': {
27
+ 'description': 'Opening that spurs interest in medicine',
28
  'keywords': ['growing up', 'childhood', 'family', 'realized', 'inspired', 'first',
29
+ 'beginning', 'early', 'experience that', 'moment', 'when I was'],
30
+ 'patterns': [r'when I was \d+', r'at age \d+', r'since I was', r'as a child'],
 
 
 
 
 
31
  'rubric': {
32
+ 1: 'disconnected or confusing',
33
  2: 'somewhat connected but unclear',
34
  3: 'connected and clear',
35
+ 4: 'engaging and logical flow'
36
  },
37
  'rubric_features': {
38
+ 'positive': ['engaging', 'logical', 'clear', 'compelling', 'authentic'],
39
  'negative': ['disconnected', 'confusing', 'random', 'unclear', 'generic']
40
  }
41
  },
42
  'Healthcare Experience': {
43
+ 'description': 'Clinical/medical experiences',
44
  'keywords': ['shadowed', 'clinical', 'hospital', 'patient', 'doctor', 'physician',
45
+ 'medical', 'treatment', 'observed', 'volunteer', 'clinic'],
46
+ 'patterns': [r'\d+ hours', r'volunteered at', r'shadowing', r'clinical experience'],
 
 
 
 
 
 
47
  'rubric': {
48
+ 1: 'passive, uninteresting, negative',
49
+ 2: 'bland but not problematic',
50
  3: 'interesting and relevant',
51
+ 4: 'vivid, active, thoughtful, memorable'
52
  },
53
  'rubric_features': {
54
+ 'positive': ['vivid', 'active', 'thoughtful', 'memorable', 'optimistic'],
55
+ 'negative': ['passive', 'uninteresting', 'irrelevant', 'problematic']
56
  }
57
  },
58
  'Showing Doctor Qualities': {
59
+ 'description': 'Leadership and doctor qualities',
60
  'keywords': ['leadership', 'empathy', 'compassion', 'responsibility', 'communication',
61
+ 'advocate', 'caring', 'helping', 'service', 'volunteer'],
62
+ 'patterns': [r'as (president|leader|captain)', r'I organized', r'I founded'],
 
 
 
 
 
 
63
  'rubric': {
64
+ 1: 'arrogant, immature, inaccurate',
65
+ 2: 'bland but not problematic',
66
  3: 'shows some understanding',
67
+ 4: 'realistic, mature, humble, clear'
68
  },
69
  'rubric_features': {
70
+ 'positive': ['realistic', 'self-aware', 'mature', 'humble', 'specific'],
71
+ 'negative': ['arrogant', 'immature', 'overly confident', 'simplistic']
72
  }
73
  },
74
  'Spin': {
75
+ 'description': 'Connecting experiences to medical career',
76
  'keywords': ['learned', 'taught me', 'showed me', 'realized', 'understood',
77
+ 'because', 'therefore', 'this experience', 'prepared me'],
78
+ 'patterns': [r'this .+ taught me', r'I learned that', r'prepared me for'],
 
 
 
 
 
 
79
  'rubric': {
80
+ 1: 'vague, simplistic, generic',
81
  2: 'some connection but generic',
82
  3: 'clear connection',
83
+ 4: 'direct, logical, specific argument'
84
  },
85
  'rubric_features': {
86
+ 'positive': ['direct', 'logical', 'specific', 'clear argument'],
87
+ 'negative': ['brief', 'vague', 'simplistic', 'generic']
88
  }
89
  }
90
  }
91
 
92
  # Model paths
93
  MODEL_DIR = "trained_models"
 
 
 
 
 
94
 
95
+ # Helper functions
96
  @st.cache_resource
97
+ def load_transformer():
 
98
  try:
99
+ return SentenceTransformer('all-MiniLM-L6-v2')
 
100
  except:
101
+ return None
 
102
 
103
+ def extract_features(text, embedder):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  features = []
105
  text_lower = text.lower()
106
  words = text.split()
107
 
108
+ # Basic stats
109
  features.extend([
110
  len(text),
111
  len(words),
112
+ len(set(words)) / max(len(words), 1)
 
 
113
  ])
114
 
115
+ # Category features
116
  for cat_name, cat_info in CATEGORIES.items():
117
+ keyword_count = sum(1 for kw in cat_info['keywords'] if kw.lower() in text_lower)
118
+ features.append(keyword_count / len(cat_info['keywords']))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
+ # Get embedding
121
  try:
122
+ embedding = embedder.encode(text)
123
  if hasattr(embedding, 'cpu'):
124
  embedding = embedding.cpu().numpy()
125
+ embedding = embedding.flatten()[:128] # Reduced size
126
  except:
127
+ embedding = np.zeros(128)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
+ return np.concatenate([features, embedding])
 
 
 
130
 
131
+ def train_simple_model(df, embedder):
132
+ X = []
133
+ y_labels = []
 
 
 
 
 
 
 
 
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  for _, row in df.iterrows():
136
+ if 'text' in row:
137
+ text = str(row['text'])
138
+ features = extract_features(text, embedder)
139
+ X.append(features)
140
+
141
+ # Find category
142
+ label = 'Unknown'
143
+ for cat in CATEGORIES.keys():
144
+ if f"Code: {cat} Applied" in row:
145
+ if row[f"Code: {cat} Applied"] in [True, 1, '1', 'true', 'True']:
146
+ label = cat
147
+ break
148
+ y_labels.append(label)
149
+
150
+ X = np.array(X)
151
+
152
+ # Train classifier
153
  scaler = StandardScaler()
154
+ X_scaled = scaler.fit_transform(X)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
+ clf = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
157
+ clf.fit(X_scaled, y_labels)
158
 
159
+ return scaler, clf
 
 
 
 
 
 
 
 
 
 
160
 
161
+ def analyze_text(text, embedder, scaler, clf):
162
+ # Split into paragraphs
163
+ paragraphs = text.split('\n\n')
164
+ paragraphs = [p.strip() for p in paragraphs if len(p.strip()) > 50]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
+ if not paragraphs:
167
+ paragraphs = [text]
 
 
 
 
 
 
 
168
 
169
+ results = []
170
+ for i, para in enumerate(paragraphs):
171
+ features = extract_features(para, embedder)
172
  features_scaled = scaler.transform([features])
173
 
174
+ pred = clf.predict(features_scaled)[0]
175
+ prob = max(clf.predict_proba(features_scaled)[0])
 
 
 
176
 
177
+ results.append({
178
+ 'segment': i + 1,
179
+ 'category': pred,
180
+ 'confidence': prob,
181
+ 'text': para[:200] + '...' if len(para) > 200 else para
182
+ })
183
+
184
+ return results
 
 
 
 
 
185
 
186
+ # MAIN APP STARTS HERE
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  st.title("🏥 Medical School Personal Statement Analyzer")
188
+ st.markdown("Analyze personal statements based on medical school rubrics")
 
189
 
190
+ # Initialize session state
191
+ if 'model_trained' not in st.session_state:
192
+ st.session_state.model_trained = False
193
+ if 'scaler' not in st.session_state:
194
+ st.session_state.scaler = None
195
+ if 'clf' not in st.session_state:
196
+ st.session_state.clf = None
 
 
 
 
 
197
 
198
+ # Load transformer
199
+ embedder = load_transformer()
200
+ if embedder is None:
201
+ st.error("Failed to load model. Please refresh the page.")
202
+ st.stop()
203
+
204
+ # Tabs
205
+ tab1, tab2, tab3 = st.tabs(["Train Model", "Analyze Statement", "View Rubrics"])
206
 
 
207
  with tab1:
208
+ st.header("Step 1: Train the Model")
 
 
 
209
 
210
+ st.markdown("Upload Excel files with coded personal statement excerpts")
211
 
212
+ uploaded_file = st.file_uploader("Upload Training Data", type=['xlsx', 'csv'])
 
 
 
 
213
 
214
+ if uploaded_file:
215
+ try:
216
+ if uploaded_file.name.endswith('.csv'):
217
+ df = pd.read_csv(uploaded_file)
218
+ else:
219
+ df = pd.read_excel(uploaded_file)
220
+
221
+ st.success(f"Loaded {len(df)} rows")
222
+
223
+ # Process data
224
+ processed_data = []
225
+ for _, row in df.iterrows():
226
+ text_col = None
227
+ for col in ['Excerpt Copy', 'Excerpt', 'Text', 'Content']:
228
+ if col in row and pd.notna(row[col]):
229
+ text_col = col
230
+ break
231
 
232
+ if text_col:
233
+ processed_data.append({
234
+ 'text': str(row[text_col]),
235
+ **{col: row[col] for col in row.index if 'Code:' in col}
236
+ })
237
+
238
+ if processed_data:
239
+ train_df = pd.DataFrame(processed_data)
240
+
241
+ if st.button("Train Model"):
242
+ with st.spinner("Training..."):
243
+ scaler, clf = train_simple_model(train_df, embedder)
244
+ st.session_state.scaler = scaler
245
+ st.session_state.clf = clf
246
+ st.session_state.model_trained = True
247
+ st.success("Model trained successfully!")
248
+ else:
249
+ st.error("No valid text data found")
250
+
251
+ except Exception as e:
252
+ st.error(f"Error: {str(e)}")
253
 
 
254
  with tab2:
255
+ st.header("Step 2: Analyze Personal Statement")
256
 
257
+ if not st.session_state.model_trained:
258
+ st.warning("Please train the model first in Step 1")
259
  else:
260
+ text_input = st.text_area("Paste your personal statement:", height=300)
 
261
 
262
+ if text_input and st.button("Analyze"):
263
+ with st.spinner("Analyzing..."):
264
+ results = analyze_text(
265
+ text_input,
266
+ embedder,
267
+ st.session_state.scaler,
268
+ st.session_state.clf
269
+ )
270
 
271
+ st.success("Analysis Complete!")
272
 
273
+ # Summary
274
+ st.subheader("Summary")
275
+ categories_found = list(set([r['category'] for r in results if r['category'] != 'Unknown']))
276
+ st.metric("Categories Found", f"{len(categories_found)}/4")
 
 
 
 
 
 
 
277
 
278
+ # Details
279
+ st.subheader("Segment Analysis")
280
+ for result in results:
281
+ with st.expander(f"Segment {result['segment']}: {result['category']}"):
282
+ st.write(f"**Confidence:** {result['confidence']:.1%}")
283
+ st.write(f"**Text:** {result['text']}")
284
+
285
+ # Recommendations
286
+ st.subheader("Recommendations")
287
+ missing = [cat for cat in CATEGORIES.keys() if cat not in categories_found]
288
+ if missing:
289
+ st.warning("Missing categories:")
290
+ for cat in missing:
291
+ st.write(f"• Add {cat}: {CATEGORIES[cat]['description']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
 
293
  with tab3:
294
  st.header("Scoring Rubrics")
295
 
296
  for category, info in CATEGORIES.items():
297
+ with st.expander(category):
298
  st.write(f"**Description:** {info['description']}")
299
+ st.write("**Scoring:**")
300
  for score in [4, 3, 2, 1]:
301
+ st.write(f"• Score {score}: {info['rubric'][score]}")
302
+ st.write(f"**Keywords:** {', '.join(info['keywords'][:5])}...")