stevafernandes commited on
Commit
7737fc0
Β·
verified Β·
1 Parent(s): 4448b93

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +674 -0
app.py ADDED
@@ -0,0 +1,674 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import pickle
5
+ import os
6
+ import torch
7
+ from sentence_transformers import SentenceTransformer, util
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.preprocessing import StandardScaler
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+ from sklearn.ensemble import RandomForestClassifier
12
+ import xgboost as xgb
13
+ import re
14
+ import warnings
15
+ from datetime import datetime
16
+ import base64
17
+ from io import BytesIO
18
+ warnings.filterwarnings('ignore')
19
+
20
# Set page config
# NOTE: st.set_page_config must be the first Streamlit command executed in the
# script; "wide" layout gives the analysis columns more horizontal room.
st.set_page_config(
    page_title="Medical School Personal Statement Analyzer",
    page_icon="πŸ₯",
    layout="wide",
    initial_sidebar_state="expanded"
)
27
+
28
# Categories with detailed rubric alignment
# Each rubric category maps to:
#   description     - what the category covers in a personal statement
#   keywords        - surface terms counted for the keyword-density features
#   patterns        - regexes matched against the text for phrase-level cues
#   rubric          - human-readable meaning of each 1-4 score
#   rubric_features - tone words used as weak positive/negative signals
# The dict's key order matters: extract_features iterates it to build the
# feature vector, so reordering keys would invalidate trained models.
CATEGORIES = {
    'Spark': {
        'description': 'Opening that spurs interest in medicine (typically in opening paragraph)',
        'keywords': ['growing up', 'childhood', 'family', 'realized', 'inspired', 'first',
                     'beginning', 'early', 'experience that', 'moment', 'when I was',
                     'journey began', 'sparked my interest', 'drew me to medicine',
                     'passion for medicine', 'calling', 'fascinated', 'curiosity'],
        'patterns': [
            r'when I was \d+', r'at age \d+', r'since I was', r'as a child',
            r'early in my life', r'growing up', r'my journey to medicine'
        ],
        'rubric': {
            1: 'disconnected from being a doctor or confusing/random',
            2: 'somewhat connected but unclear',
            3: 'connected and clear',
            4: 'engaging and logically flows into becoming a doctor'
        },
        'rubric_features': {
            'positive': ['engaging', 'logical', 'clear connection', 'compelling', 'authentic'],
            'negative': ['disconnected', 'confusing', 'random', 'unclear', 'generic']
        }
    },
    'Healthcare Experience': {
        'description': 'Watching/participating in healthcare - medical professional at work',
        'keywords': ['shadowed', 'clinical', 'hospital', 'patient', 'doctor', 'physician',
                     'medical', 'treatment', 'observed', 'volunteer', 'clinic', 'rounds',
                     'surgery', 'emergency', 'ICU', 'residency', 'internship', 'scrubs',
                     'stethoscope', 'diagnosis', 'prognosis', 'bedside', 'ward', 'unit',
                     'healthcare', 'care team', 'medical team', 'attending', 'resident'],
        'patterns': [
            r'\d+ hours', r'volunteered at', r'shadowing', r'clinical experience',
            r'medical mission', r'worked in .+ hospital', r'during my rotation'
        ],
        'rubric': {
            1: 'passive observation, uninteresting, irrelevant, problematic, negative tone',
            2: 'bland/boring but not problematic',
            3: 'interesting and relevant',
            4: 'vivid, active, thoughtful, relevant, memorable, positive and optimistic'
        },
        'rubric_features': {
            'positive': ['vivid', 'active', 'thoughtful', 'memorable', 'optimistic', 'engaged'],
            'negative': ['passive', 'uninteresting', 'irrelevant', 'problematic', 'pessimistic']
        }
    },
    'Showing Doctor Qualities': {
        'description': 'Stories/examples portraying vision of doctor role and appealing aspects',
        'keywords': ['leadership', 'empathy', 'compassion', 'responsibility', 'communication',
                     'advocate', 'caring', 'helping', 'service', 'volunteer', 'president',
                     'led', 'organized', 'taught', 'mentored', 'integrity', 'ethical',
                     'professional', 'dedication', 'perseverance', 'resilience', 'humble',
                     'self-aware', 'mature', 'understanding', 'patient-centered', 'holistic'],
        'patterns': [
            r'as (president|leader|captain)', r'I organized', r'I founded',
            r'demonstrated .+ leadership', r'showed .+ compassion'
        ],
        'rubric': {
            1: 'arrogant, immature, overly confident, inaccurate understanding, negative tone',
            2: 'bland/boring but not problematic',
            3: 'shows some understanding',
            4: 'realistic, self-aware, mature, humble, specific and clear understanding, positive'
        },
        'rubric_features': {
            'positive': ['realistic', 'self-aware', 'mature', 'humble', 'specific', 'clear'],
            'negative': ['arrogant', 'immature', 'overly confident', 'simplistic', 'inaccurate']
        }
    },
    'Spin': {
        'description': 'Explaining why experiences qualify them to be a doctor',
        'keywords': ['learned', 'taught me', 'showed me', 'realized', 'understood',
                     'because', 'therefore', 'this experience', 'through this',
                     'as a doctor', 'future physician', 'will help me', 'prepared me',
                     'equipped me', 'qualified', 'ready', 'capable', 'competent',
                     'skills necessary', 'attributes required', 'prepared for'],
        'patterns': [
            r'this .+ taught me', r'I learned that', r'prepared me for',
            r'qualified me to', r'because of this', r'therefore I'
        ],
        'rubric': {
            1: 'brief, vague, simplistic connection to being a doctor, generic',
            2: 'some connection but generic',
            3: 'clear connection',
            4: 'direct, logical, and specific argument connecting experience to profession'
        },
        'rubric_features': {
            'positive': ['direct', 'logical', 'specific', 'clear argument', 'compelling connection'],
            'negative': ['brief', 'vague', 'simplistic', 'generic', 'weak connection']
        }
    }
}
118
+
119
# Model paths
# All trained artifacts are persisted under MODEL_DIR so the app can skip
# retraining across reruns of the Streamlit script.
MODEL_DIR = "trained_models"
EMBEDDER_PATH = os.path.join(MODEL_DIR, "embedder_name.txt")   # name of the sentence-transformer (model itself is re-downloaded)
CLASSIFIER_PATH = os.path.join(MODEL_DIR, "classifier.pkl")    # dict of per-category RandomForest classifiers
SCORER_PATH = os.path.join(MODEL_DIR, "scorer.pkl")            # dict of per-category score regressors
SCALER_PATH = os.path.join(MODEL_DIR, "scaler.pkl")            # StandardScaler fitted on training features
THRESHOLD_PATH = os.path.join(MODEL_DIR, "thresholds.pkl")     # dict of per-category decision thresholds
ENSEMBLE_PATH = os.path.join(MODEL_DIR, "ensemble.pkl")        # NOTE(review): defined but never written/read below — presumably reserved
127
+
128
@st.cache_resource
def load_sentence_transformer():
    """Load a sentence-transformer embedding model.

    Tries a small list of known-good models in order of preference and
    returns the first one that loads. Cached by Streamlit so the model is
    downloaded/initialized at most once per server process.

    Returns:
        tuple: (SentenceTransformer instance, model name string)
    """
    models_to_try = [
        'all-MiniLM-L6-v2',  # Lightweight and reliable
        'all-mpnet-base-v2'  # Good alternative
    ]

    for model_name in models_to_try:
        try:
            model = SentenceTransformer(model_name)
            return model, model_name
        except Exception:
            # Model may be unavailable (network/cache issue); try the next.
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate.
            continue

    # Last resort: retry the default model and let any failure surface.
    return SentenceTransformer('all-MiniLM-L6-v2'), 'all-MiniLM-L6-v2'
144
+
145
def segment_text(text, embedder):
    """Split a personal statement into analyzable chunks.

    Prefers blank-line-separated paragraphs longer than 50 characters; when
    the text is effectively a single paragraph, falls back to grouping
    sentences into roughly 300-character segments. Returns the whole text as
    one chunk when there is too little material to segment.

    Note: `embedder` is accepted for interface compatibility but unused.
    """
    chunks = [part.strip() for part in re.split(r'\n\s*\n', text)]
    paragraphs = [chunk for chunk in chunks if len(chunk) > 50]

    if len(paragraphs) > 1:
        return paragraphs

    # Single (or no) usable paragraph: fall back to sentence grouping.
    raw_sentences = re.split(r'(?<=[.!?])\s+', text)
    sentences = [raw.strip() for raw in raw_sentences if len(raw.strip()) > 20]

    if len(sentences) < 3:
        # Too little material to segment meaningfully.
        return [text]

    segments = []
    buffer = []
    for sentence in sentences:
        buffer.append(sentence)
        joined = ' '.join(buffer)
        if len(joined) > 300:
            segments.append(joined)
            buffer = []
    if buffer:
        segments.append(' '.join(buffer))
    return segments
170
+
171
def extract_features(text, embedder, category_focus=None):
    """Extract a numeric feature vector for one text segment.

    Combines hand-crafted statistics (length, lexical diversity, per-category
    keyword/pattern/tone counts), an optional similarity to the focused
    category's description, and the first 256 dims of the sentence embedding.

    NOTE: feature ORDER is part of the trained models' contract — do not
    reorder the append/extend calls below.

    Args:
        text: segment text to featurize.
        embedder: SentenceTransformer used for the embedding.
        category_focus: optional category name; doubles that category's
            keyword density and adds a description-similarity feature.

    Returns:
        np.ndarray (float32 hand-crafted features + embedding prefix).
    """
    features = []
    text_lower = text.lower()
    words = text.split()

    # Basic text statistics
    features.extend([
        len(text),
        len(words),
        len(set(words)) / max(len(words), 1),   # lexical diversity
        len(re.findall(r'[.!?]', text)),        # rough sentence count
        text.count('I') / max(len(words), 1),   # first-person density
    ])

    # Per-category keyword, pattern, and tone features
    for cat_name, cat_info in CATEGORIES.items():
        keywords = cat_info['keywords']
        keyword_matches = sum(1 for kw in keywords if kw.lower() in text_lower)
        keyword_density = keyword_matches / max(len(keywords), 1)

        if category_focus == cat_name:
            keyword_density *= 2  # emphasize the category being evaluated

        features.append(keyword_density * 10)

        pattern_matches = 0
        for pattern in cat_info.get('patterns', []):
            # Bug fix: several patterns contain an uppercase "I"
            # (e.g. r'when I was \d+', r'I organized') and could never match
            # the lowercased text without IGNORECASE.
            matches = re.findall(pattern, text_lower, flags=re.IGNORECASE)
            pattern_matches += len(matches)
        features.append(pattern_matches)

        positive_count = sum(1 for word in cat_info['rubric_features']['positive']
                             if word in text_lower)
        negative_count = sum(1 for word in cat_info['rubric_features']['negative']
                             if word in text_lower)

        features.extend([
            positive_count / max(len(words), 1) * 100,
            negative_count / max(len(words), 1) * 100
        ])

    # Get embeddings
    try:
        embedding = embedder.encode(text, convert_to_tensor=False, normalize_embeddings=True)
    except Exception:
        # Older sentence-transformers versions may not accept these kwargs.
        embedding = embedder.encode(text)

    # Similarity between the segment and the focused category's description
    if category_focus and category_focus in CATEGORIES:
        category_text = f"{CATEGORIES[category_focus]['description']} {' '.join(CATEGORIES[category_focus]['keywords'][:10])}"
        try:
            category_embedding = embedder.encode(category_text, normalize_embeddings=True)
            similarity = cosine_similarity([embedding], [category_embedding])[0][0]
            features.append(similarity * 10)
        except Exception:
            features.append(0)
    else:
        features.append(0)

    features = np.array(features, dtype=np.float32)
    combined_features = np.concatenate([features, embedding[:256]])  # Limit embedding size

    return combined_features
235
+
236
def load_training_data(file1, file2):
    """Load and combine coded training excerpts from two Excel files.

    Each row contributes a text excerpt plus, per category, whether the code
    was applied and its 1-4 weight (0 when not applied; 2 when applied but
    the weight cell is blank/unparseable). Returns an empty DataFrame when
    the files cannot be read.
    """
    try:
        frames = [pd.read_excel(file1), pd.read_excel(file2)]
    except Exception as e:
        st.error(f"Error reading Excel files: {str(e)}")
        return pd.DataFrame()

    merged = pd.concat(frames, ignore_index=True)
    records = []

    for _, row in merged.iterrows():
        # Accept any of several known column names for the excerpt text.
        excerpt = None
        for candidate in ('Excerpt Copy', 'Excerpt', 'Text', 'Content'):
            if candidate in row and pd.notna(row[candidate]):
                excerpt = str(row[candidate])
                break

        if not excerpt or not excerpt.strip():
            continue

        record = {'text': excerpt.strip()}

        for category in CATEGORIES:
            applied_col = f"Code: {category} Applied"
            weight_col = f"Code: {category} Weight"

            applied = False
            if applied_col in row:
                applied = str(row[applied_col]).lower() in ('true', '1', 'yes', 't')

            record[f"{category}_applied"] = applied

            if applied and weight_col in row:
                raw_weight = row[weight_col]
                if pd.isna(raw_weight) or raw_weight == '':
                    score = 2  # applied but weight missing: default to mid-scale
                else:
                    try:
                        score = max(1, min(4, int(float(raw_weight))))
                    except Exception:
                        score = 2  # unparseable weight: default to mid-scale
            else:
                score = 0  # not applied (or weight column absent entirely)

            record[f"{category}_score"] = score

        records.append(record)

    return pd.DataFrame(records)
289
+
290
def train_models(df, embedder):
    """Train per-category classifiers and score regressors.

    For every training excerpt, features are extracted once per category
    focus; the vector focused on the first applied category is used (or the
    mean over all foci when no category applies). One RandomForest classifier
    (applied / not applied) and one XGBoost regressor (normalized 0-1 score)
    are trained per category; a constant DummyRegressor stands in when a
    category has too few positive samples. Progress is reported through
    Streamlit widgets.

    Args:
        df: DataFrame from load_training_data ('text', '*_applied', '*_score' columns).
        embedder: SentenceTransformer used by extract_features.

    Returns:
        tuple: (fitted StandardScaler, {cat: classifier}, {cat: scorer},
        {cat: decision threshold}).
    """
    all_features = []

    progress_bar = st.progress(0)
    status_text = st.empty()

    status_text.text("Extracting features from training data...")

    for idx, row in df.iterrows():
        text = row['text']

        # Extract one feature vector per category focus; pick from them below.
        category_features = {}
        for cat in CATEGORIES.keys():
            features = extract_features(text, embedder, category_focus=cat)
            category_features[cat] = features

        true_categories = [cat for cat in CATEGORIES.keys() if row[f"{cat}_applied"]]

        if true_categories:
            # Use the vector focused on the first applied category.
            features = category_features[true_categories[0]]
        else:
            # Unlabeled excerpt: average the per-focus vectors.
            features = np.mean(list(category_features.values()), axis=0)

        all_features.append(features)
        progress_bar.progress((idx + 1) / len(df))

    X = np.array(all_features)

    categories = list(CATEGORIES.keys())
    # Multi-label targets: one applied/not-applied column per category.
    y_class = df[[f"{cat}_applied" for cat in categories]].values.astype(float)

    # Score targets normalized to [0, 1] (raw weights are 1-4; 0 = not applied).
    y_score = []
    for _, row in df.iterrows():
        scores = []
        for cat in categories:
            if row[f"{cat}_applied"]:
                scores.append(row[f"{cat}_score"] / 4.0)
            else:
                scores.append(0)
        y_score.append(scores)
    y_score = np.array(y_score)

    status_text.text("Training models...")

    # Split data (held-out test split is currently unused beyond scaling)
    X_train, X_test, y_class_train, y_class_test, y_score_train, y_score_test = train_test_split(
        X, y_class, y_score, test_size=0.2, random_state=42
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train one classifier + one scorer per category
    classifiers = {}
    scorers = {}
    thresholds = {}

    for i, cat in enumerate(categories):
        # Train classifier (balanced weights: per-category labels are sparse)
        clf = RandomForestClassifier(
            n_estimators=100,
            max_depth=6,
            class_weight='balanced',
            random_state=42
        )
        clf.fit(X_train_scaled, y_class_train[:, i])
        classifiers[cat] = clf

        # Train scorer only on positive samples, and only when there are
        # enough of them to learn from.
        mask = y_class_train[:, i] == 1
        if np.sum(mask) > 5:
            scorer = xgb.XGBRegressor(
                n_estimators=100,
                max_depth=4,
                random_state=42
            )
            scorer.fit(X_train_scaled[mask], y_score_train[mask, i])
        else:
            from sklearn.dummy import DummyRegressor
            scorer = DummyRegressor(strategy='constant', constant=0.5)
            scorer.fit(X_train_scaled, y_score_train[:, i])

        scorers[cat] = scorer
        thresholds[cat] = 0.5  # fixed decision threshold for every category

    status_text.empty()
    progress_bar.empty()

    return scaler, classifiers, scorers, thresholds
382
+
383
def save_models(embedder_name, scaler, classifiers, scorers, thresholds):
    """Persist the embedder name and all trained artifacts under MODEL_DIR."""
    os.makedirs(MODEL_DIR, exist_ok=True)

    # The embedder is stored by name only; load_saved_models re-instantiates it.
    with open(EMBEDDER_PATH, 'w') as f:
        f.write(embedder_name)

    # Everything else is pickled as-is, one file per artifact.
    artifacts = (
        (SCALER_PATH, scaler),
        (CLASSIFIER_PATH, classifiers),
        (SCORER_PATH, scorers),
        (THRESHOLD_PATH, thresholds),
    )
    for path, artifact in artifacts:
        with open(path, 'wb') as f:
            pickle.dump(artifact, f)
401
+
402
def load_saved_models():
    """Load all saved models from MODEL_DIR.

    Returns:
        tuple: (embedder, scaler, classifiers, scorers, thresholds), or a
        tuple of five Nones when any artifact is missing or unreadable —
        callers treat that as "not trained yet".
    """
    try:
        with open(EMBEDDER_PATH, 'r') as f:
            embedder_name = f.read().strip()

        # Re-instantiate the transformer from its saved name.
        embedder = SentenceTransformer(embedder_name)

        with open(SCALER_PATH, 'rb') as f:
            scaler = pickle.load(f)

        with open(CLASSIFIER_PATH, 'rb') as f:
            classifiers = pickle.load(f)

        with open(SCORER_PATH, 'rb') as f:
            scorers = pickle.load(f)

        with open(THRESHOLD_PATH, 'rb') as f:
            thresholds = pickle.load(f)

        return embedder, scaler, classifiers, scorers, thresholds
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit); missing or corrupt artifacts simply
        # mean the app must be (re)trained.
        return None, None, None, None, None
425
+
426
def classify_segment(text, embedder, scaler, classifiers, scorers, thresholds):
    """Classify one text segment into a rubric category and score it.

    For each category, extracts category-focused features and asks that
    category's classifier for an applied-probability; the highest-probability
    category wins if it clears its threshold, and its regressor produces a
    1-4 score. Otherwise the segment is 'Unclassified'.

    Returns:
        dict with keys 'category', 'score' (1-4 or None), 'confidence', 'text'.
    """
    category_probs = {}
    # Cache scaled features so the winning category is not featurized twice
    # (the original recomputed extract_features + scaler.transform for it).
    scaled_features = {}

    for cat in CATEGORIES:
        features = extract_features(text, embedder, category_focus=cat)
        features_scaled = scaler.transform([features])
        scaled_features[cat] = features_scaled

        # Fallback models may lack predict_proba; treat them as never firing.
        if hasattr(classifiers[cat], 'predict_proba'):
            prob = classifiers[cat].predict_proba(features_scaled)[0, 1]
        else:
            prob = 0
        category_probs[cat] = prob

    best_category = max(category_probs, key=category_probs.get)
    best_prob = category_probs[best_category]

    if best_prob > thresholds.get(best_category, 0.5):
        try:
            # The regressor predicts a normalized [0, 1] score; map to 1-4.
            score_normalized = scorers[best_category].predict(scaled_features[best_category])[0]
            score = int(np.clip(np.round(score_normalized * 4), 1, 4))
        except Exception:
            score = 2  # scorer failure: fall back to mid-scale

        return {
            'category': best_category,
            'score': score,
            'confidence': float(best_prob),
            'text': text
        }

    return {
        'category': 'Unclassified',
        'score': None,
        'confidence': 0,
        'text': text
    }
464
+
465
def analyze_statement(text, embedder, scaler, classifiers, scorers, thresholds):
    """Analyze a complete personal statement.

    Segments the text, classifies every segment, then aggregates per-category
    summaries (rounded mean score, max confidence, segment count).

    Returns:
        tuple: (list of per-segment result dicts, dict of per-category summaries)
    """
    segment_results = []
    for index, chunk in enumerate(segment_text(text, embedder), start=1):
        outcome = classify_segment(chunk, embedder, scaler, classifiers, scorers, thresholds)
        outcome['segment_num'] = index
        segment_results.append(outcome)

    category_results = {}
    for cat in CATEGORIES:
        matching = [res for res in segment_results if res['category'] == cat]

        if not matching:
            category_results[cat] = {
                'detected': False,
                'score': None,
                'confidence': 0,
                'num_segments': 0
            }
            continue

        mean_score = np.mean([res['score'] for res in matching])
        category_results[cat] = {
            'detected': True,
            'score': int(np.round(mean_score)),
            'confidence': max(res['confidence'] for res in matching),
            'num_segments': len(matching)
        }

    return segment_results, category_results
498
+
499
# Main application
def _render_analysis_tab(embedder, scaler, classifiers, scorers, thresholds):
    """Body of the 'Analyze Statement' tab, rendered once models are loaded."""
    # Input method
    input_method = st.radio("Choose input method:", ["Paste Text", "Upload File"])

    text_to_analyze = None

    if input_method == "Paste Text":
        text_to_analyze = st.text_area(
            "Paste your personal statement here:",
            height=300,
            placeholder="Enter your personal statement..."
        )
    else:
        uploaded_file = st.file_uploader("Upload statement (.txt)", type=['txt'])
        if uploaded_file:
            text_to_analyze = str(uploaded_file.read(), 'utf-8')
            st.success("File uploaded successfully!")

    if text_to_analyze and st.button("Analyze Statement", type="primary"):
        with st.spinner("Analyzing..."):
            segment_results, category_results = analyze_statement(
                text_to_analyze, embedder, scaler, classifiers, scorers, thresholds
            )

        st.success("βœ“ Analysis complete!")

        # Summary
        st.subheader("πŸ“Š Overall Summary")
        cols = st.columns(4)

        detected = [cat for cat, res in category_results.items() if res['detected']]
        # Compute once, up front: the original only defined avg_score inside a
        # branch and relied on `and` short-circuiting to avoid a NameError.
        avg_score = np.mean([category_results[cat]['score'] for cat in detected]) if detected else None

        with cols[0]:
            st.metric("Categories Found", f"{len(detected)}/4")
        with cols[1]:
            if detected:
                st.metric("Average Score", f"{avg_score:.1f}/4")
            else:
                st.metric("Average Score", "N/A")
        with cols[2]:
            st.metric("Total Segments", len(segment_results))
        with cols[3]:
            quality = "Excellent" if len(detected) == 4 and avg_score >= 3.5 else "Good" if len(detected) >= 3 else "Needs Work"
            st.metric("Overall", quality)

        # Category breakdown
        st.subheader("πŸ“‹ Category Analysis")
        for cat in CATEGORIES.keys():
            res = category_results[cat]
            if res['detected']:
                icon = "βœ…" if res['score'] >= 3 else "⚠️" if res['score'] >= 2 else "❌"
                st.write(f"{icon} **{cat}**: Score {res['score']}/4 (Confidence: {res['confidence']:.1%})")
            else:
                st.write(f"❌ **{cat}**: Not detected")

        # Segment details
        st.subheader("πŸ“ Segment Details")
        for seg in segment_results:
            with st.expander(f"Segment {seg['segment_num']}: {seg['category']}"):
                # Bug fix: the original conditional wrapped the whole f-string,
                # so unclassified segments printed a bare "N/A" with no label.
                st.write(f"**Score:** {seg['score']}/4" if seg['score'] else "**Score:** N/A")
                st.write(f"**Confidence:** {seg['confidence']:.1%}")
                st.write(f"**Text:** {seg['text'][:300]}...")

        # Recommendations
        st.subheader("πŸ’‘ Recommendations")
        missing = [cat for cat, res in category_results.items() if not res['detected']]
        low_score = [cat for cat, res in category_results.items()
                     if res['detected'] and res['score'] and res['score'] < 3]

        if missing:
            st.warning("**Missing Categories:**")
            for cat in missing:
                st.write(f"β€’ Add content for **{cat}**: {CATEGORIES[cat]['description']}")

        if low_score:
            st.info("**Areas to Improve:**")
            for cat in low_score:
                st.write(f"β€’ Strengthen **{cat}** (current score: {category_results[cat]['score']}/4)")

        if not missing and not low_score:
            st.success("Excellent work! All categories present with good scores.")


def main():
    """Render the Streamlit UI: training, analysis, and rubric tabs.

    Bug fix: the original used bare `return` statements inside the `with tab1:`
    and `with tab2:` blocks; returning from main() there aborted the whole
    script, so the Rubrics tab never rendered when models were untrained or
    failed to load. Each tab now guards its own content with if/else instead.
    """
    st.title("πŸ₯ Medical School Personal Statement Analyzer")
    st.markdown("*AI-powered analysis based on medical school admission rubrics*")
    st.markdown("---")

    # Sidebar
    with st.sidebar:
        st.header("ℹ️ About")
        st.markdown("""
        This tool analyzes personal statements based on 4 key categories:
        - **Spark**: Opening that shows interest in medicine
        - **Healthcare Experience**: Clinical/medical experiences
        - **Doctor Qualities**: Leadership and character traits
        - **Spin**: Connecting experiences to medical career

        Each category is scored 1-4 (Poor to Excellent)
        """)

    # Create tabs
    tab1, tab2, tab3 = st.tabs(["πŸ“š Train Model", "πŸ“ Analyze Statement", "πŸ“Š View Rubrics"])

    # Train Model Tab
    with tab1:
        st.header("Train the AI Model")

        if all(os.path.exists(p) for p in [CLASSIFIER_PATH, SCORER_PATH, SCALER_PATH]):
            st.success("βœ“ Models already trained. You can analyze statements or retrain.")

        st.markdown("Upload training data files (Excel format with coded excerpts)")

        col1, col2 = st.columns(2)
        with col1:
            file1 = st.file_uploader("Training File 1", type=['xlsx'], key="file1")
        with col2:
            file2 = st.file_uploader("Training File 2", type=['xlsx'], key="file2")

        if file1 and file2:
            if st.button("Start Training", type="primary"):
                try:
                    # Load data
                    with st.spinner("Loading training data..."):
                        df = load_training_data(file1, file2)

                    if df.empty:
                        # Previously a bare `return` here killed the rest of the UI.
                        st.error("No valid training data found.")
                    else:
                        st.success(f"βœ“ Loaded {len(df)} training samples")

                        # Load embedder
                        with st.spinner("Loading transformer model..."):
                            embedder, embedder_name = load_sentence_transformer()

                        # Train
                        scaler, classifiers, scorers, thresholds = train_models(df, embedder)

                        # Save
                        save_models(embedder_name, scaler, classifiers, scorers, thresholds)
                        st.success("βœ“ Training complete! Models saved.")

                except Exception as e:
                    st.error(f"Training failed: {str(e)}")

    # Analyze Statement Tab
    with tab2:
        st.header("Analyze Personal Statement")

        if not all(os.path.exists(p) for p in [CLASSIFIER_PATH, SCORER_PATH, SCALER_PATH]):
            st.warning("⚠️ Please train the model first (Tab 1)")
        else:
            # Load models
            embedder, scaler, classifiers, scorers, thresholds = load_saved_models()

            if embedder is None:
                st.error("Failed to load models. Please retrain.")
            else:
                _render_analysis_tab(embedder, scaler, classifiers, scorers, thresholds)

    # View Rubrics Tab
    with tab3:
        st.header("Scoring Rubrics")

        for category, info in CATEGORIES.items():
            with st.expander(f"**{category}**"):
                st.write(f"**Description:** {info['description']}")
                st.write("**Scoring Criteria:**")
                for score in [4, 3, 2, 1]:
                    st.write(f"β€’ **Score {score}:** {info['rubric'][score]}")
                st.write(f"**Key Terms:** {', '.join(info['keywords'][:8])}")
673
# Script entry point: run the Streamlit app.
if __name__ == "__main__":
    main()