Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import pickle | |
| import os | |
| import re | |
| from datetime import datetime | |
| from io import BytesIO | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
# Page config MUST be first
# Sets the browser-tab title and icon, a wide layout, and an expanded sidebar.
# This call is placed ahead of every other Streamlit call in this script.
st.set_page_config(
    page_title="Medical School Personal Statement Analyzer",
    page_icon="π₯",
    layout="wide",
    initial_sidebar_state="expanded"
)
| # Import ML libraries | |
| from sentence_transformers import SentenceTransformer, util | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from sklearn.ensemble import RandomForestClassifier | |
| import xgboost as xgb | |
| import torch | |
| # Import PDF generation libraries | |
| try: | |
| from reportlab.lib import colors | |
| from reportlab.lib.pagesizes import letter | |
| from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak | |
| from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle | |
| from reportlab.lib.units import inch | |
| from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY | |
| PDF_AVAILABLE = True | |
| except ImportError: | |
| PDF_AVAILABLE = False | |
# Categories with detailed rubric alignment
#
# Maps each category name to the metadata used across the app:
#   'description'     - one-line summary shown in the rubric and recommendation views
#   'keywords'        - phrases counted as case-insensitive substrings by extract_features
#   'patterns'        - regular expressions whose matches are counted by extract_features
#   'rubric'          - score (1-4) -> descriptor text displayed next to a scored segment
#   'rubric_features' - 'positive' / 'negative' words counted as substrings by extract_features
# The category names are also used to build training-column names
# ("Code: <name> Applied" / "Code: <name> Weight") when the Excel data is loaded.
CATEGORIES = {
    'Spark': {
        'description': 'Opening that spurs interest in medicine (typically in opening paragraph)',
        'keywords': ['growing up', 'childhood', 'family', 'realized', 'inspired', 'first',
                     'beginning', 'early', 'experience that', 'moment', 'when I was',
                     'journey began', 'sparked my interest', 'drew me to medicine',
                     'passion for medicine', 'calling', 'fascinated', 'curiosity'],
        'patterns': [
            r'when I was \d+', r'at age \d+', r'since I was', r'as a child',
            r'early in my life', r'growing up', r'my journey to medicine'
        ],
        'rubric': {
            1: 'disconnected from being a doctor or confusing/random',
            2: 'somewhat connected but unclear',
            3: 'connected and clear',
            4: 'engaging and logically flows into becoming a doctor'
        },
        'rubric_features': {
            'positive': ['engaging', 'logical', 'clear connection', 'compelling', 'authentic'],
            'negative': ['disconnected', 'confusing', 'random', 'unclear', 'generic']
        }
    },
    'Healthcare Experience': {
        'description': 'Watching/participating in healthcare - medical professional at work',
        'keywords': ['shadowed', 'clinical', 'hospital', 'patient', 'doctor', 'physician',
                     'medical', 'treatment', 'observed', 'volunteer', 'clinic', 'rounds',
                     'surgery', 'emergency', 'ICU', 'residency', 'internship', 'scrubs',
                     'stethoscope', 'diagnosis', 'prognosis', 'bedside', 'ward', 'unit'],
        'patterns': [
            r'\d+ hours', r'volunteered at', r'shadowing', r'clinical experience',
            r'medical mission', r'worked in .+ hospital', r'during my rotation'
        ],
        'rubric': {
            1: 'passive observation, uninteresting, irrelevant, negative tone',
            2: 'bland/boring but not problematic',
            3: 'interesting and relevant',
            4: 'vivid, active, thoughtful, relevant, memorable, positive'
        },
        'rubric_features': {
            'positive': ['vivid', 'active', 'thoughtful', 'memorable', 'optimistic', 'engaged'],
            'negative': ['passive', 'uninteresting', 'irrelevant', 'problematic', 'pessimistic']
        }
    },
    'Showing Doctor Qualities': {
        'description': 'Stories/examples portraying vision of doctor role and appealing aspects',
        'keywords': ['leadership', 'empathy', 'compassion', 'responsibility', 'communication',
                     'advocate', 'caring', 'helping', 'service', 'volunteer', 'president',
                     'led', 'organized', 'taught', 'mentored', 'integrity', 'ethical',
                     'professional', 'dedication', 'perseverance', 'resilience', 'humble'],
        'patterns': [
            r'as (president|leader|captain)', r'I organized', r'I founded',
            r'demonstrated .+ leadership', r'showed .+ compassion'
        ],
        'rubric': {
            1: 'arrogant, immature, overly confident, inaccurate understanding',
            2: 'bland/boring but not problematic',
            3: 'shows some understanding',
            4: 'realistic, self-aware, mature, humble, specific understanding'
        },
        'rubric_features': {
            'positive': ['realistic', 'self-aware', 'mature', 'humble', 'specific', 'clear'],
            'negative': ['arrogant', 'immature', 'overly confident', 'simplistic', 'inaccurate']
        }
    },
    'Spin': {
        'description': 'Explaining why experiences qualify them to be a doctor',
        'keywords': ['learned', 'taught me', 'showed me', 'realized', 'understood',
                     'because', 'therefore', 'this experience', 'through this',
                     'as a doctor', 'future physician', 'will help me', 'prepared me'],
        'patterns': [
            r'this .+ taught me', r'I learned that', r'prepared me for',
            r'qualified me to', r'because of this', r'therefore I'
        ],
        'rubric': {
            1: 'brief, vague, simplistic connection, generic',
            2: 'some connection but generic',
            3: 'clear connection',
            4: 'direct, logical, and specific argument'
        },
        'rubric_features': {
            'positive': ['direct', 'logical', 'specific', 'clear argument', 'compelling'],
            'negative': ['brief', 'vague', 'simplistic', 'generic', 'weak']
        }
    }
}
def load_sentence_transformer():
    """Load a sentence-embedding model, preferring e5-large-v2.

    Tries 'intfloat/e5-large-v2' first and falls back to the lighter
    'all-MiniLM-L6-v2' if the preferred model cannot be loaded.

    Returns:
        tuple: ``(model, model_name)`` for the first model that loads, or
        ``(None, None)`` after reporting the last error via ``st.error``.
    """
    candidates = ('intfloat/e5-large-v2', 'all-MiniLM-L6-v2')
    last_error = None
    for model_name in candidates:
        try:
            return SentenceTransformer(model_name), model_name
        except Exception as e:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit are not swallowed while a
            # (potentially long) model download is in progress.
            last_error = e
    st.error(f"Failed to load transformer: {last_error}")
    return None, None
def _is_code_applied(value):
    """Return True when an 'Applied' spreadsheet cell marks the code as applied."""
    flag = str(value).strip().lower()
    if flag in ('true', 'yes', 't'):
        return True
    try:
        # Compare numerically so that both '1' and '1.0' count as applied;
        # a 0/1 column that also contains blanks is read back as floats.
        return float(flag) == 1.0
    except ValueError:
        return False


def _parse_code_weight(value, default=2):
    """Convert a 'Weight' cell to an int score clamped to 1..4 (default if unusable)."""
    if pd.isna(value) or value == '':
        return default
    try:
        return max(1, min(4, int(float(value))))
    except (TypeError, ValueError, OverflowError):
        return default


def load_training_data_from_files():
    """Load and combine training data from the two Excel files.

    Reads both workbooks from the current working directory, concatenates
    them, and converts every row that has usable excerpt text into a record
    with the keys ``text``, ``media_title`` and, for each category in
    ``CATEGORIES``, ``<category>_applied`` (bool) and ``<category>_score``
    (0 when the code is not applied, otherwise 1-4).

    Returns:
        pandas.DataFrame | None: the processed records, or ``None`` when
        either file is missing or loading fails (the failure is reported
        through ``st.error``).
    """
    try:
        # File paths for the Excel files
        file1_path = "DedooseChartExcerpts_2025_8_5_1025.xlsx"
        file2_path = "Personal Statements Coded.xlsx"

        # Both files are required; bail out quietly if either is absent
        if not (os.path.exists(file1_path) and os.path.exists(file2_path)):
            return None

        combined_df = pd.concat(
            [pd.read_excel(file1_path), pd.read_excel(file2_path)],
            ignore_index=True,
        )

        text_columns = ('Excerpt Copy', 'Excerpt', 'Text', 'Content')
        processed_data = []
        for _, row in combined_df.iterrows():
            # Use the first text column that is present and non-null
            text = next(
                (str(row[col]) for col in text_columns
                 if col in row and pd.notna(row[col])),
                None,
            )
            if not text or not text.strip():
                continue

            data_point = {
                'text': text.strip(),
                'media_title': row.get('Media Title', 'Unknown')
            }

            for category in CATEGORIES:
                col_applied = f"Code: {category} Applied"
                col_weight = f"Code: {category} Weight"

                is_applied = col_applied in row and _is_code_applied(row[col_applied])
                data_point[f"{category}_applied"] = is_applied

                if not is_applied:
                    weight = 0
                elif col_weight in row:
                    weight = _parse_code_weight(row[col_weight])
                else:
                    # Applied but no weight column: use the same default as a
                    # blank weight instead of recording an out-of-range 0.
                    weight = 2
                data_point[f"{category}_score"] = weight

            processed_data.append(data_point)

        return pd.DataFrame(processed_data)
    except Exception as e:
        st.error(f"Error loading training data: {str(e)}")
        return None
def segment_text(text, embedder, similarity_threshold=0.7, max_segment_chars=500,
                 min_paragraph_chars=50, min_sentence_chars=20):
    """Split a statement into segments for classification.

    If the text contains more than one blank-line-separated paragraph longer
    than ``min_paragraph_chars``, those paragraphs are returned as-is.
    Otherwise the text is split into sentences, and consecutive sentences
    are grouped while they stay semantically similar.

    Args:
        text: The full statement.
        embedder: Object with ``encode(sentences, convert_to_tensor=True)``;
            only used on the sentence-grouping path.
        similarity_threshold: A new segment starts when the cosine similarity
            between the running segment embedding and the next sentence
            drops below this value.
        max_segment_chars: A new segment also starts once the current
            segment's joined length exceeds this many characters.
        min_paragraph_chars: Paragraphs of this length or shorter are dropped.
        min_sentence_chars: Sentences of this length or shorter are dropped.

    Returns:
        list[str]: The segments. ``[text]`` when fewer than three usable
        sentences are found.
    """
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text)]
    paragraphs = [p for p in paragraphs if p and len(p) > min_paragraph_chars]
    if len(paragraphs) > 1:
        return paragraphs

    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text)]
    sentences = [s for s in sentences if len(s) > min_sentence_chars]
    if len(sentences) < 3:
        return [text]

    # Use embeddings for semantic segmentation
    embeddings = embedder.encode(sentences, convert_to_tensor=True)

    segments = []
    current_segment = [sentences[0]]
    # Length of ' '.join(current_segment), maintained incrementally so the
    # segment is not re-joined on every iteration.
    current_length = len(sentences[0])
    current_embedding = embeddings[0]

    for sentence, embedding in zip(sentences[1:], embeddings[1:]):
        similarity = util.cos_sim(current_embedding, embedding).item()
        if similarity < similarity_threshold or current_length > max_segment_chars:
            segments.append(' '.join(current_segment))
            current_segment = [sentence]
            current_length = len(sentence)
            current_embedding = embedding
        else:
            current_segment.append(sentence)
            current_length += 1 + len(sentence)
            # Pairwise averaging: recent sentences weigh more than older ones.
            current_embedding = (current_embedding + embedding) / 2

    segments.append(' '.join(current_segment))
    return segments
def _embedding_width(embedder, cap):
    """Return the embedder's output width limited to ``cap`` (``cap`` if unknown)."""
    try:
        return min(int(embedder.get_sentence_embedding_dimension()), cap)
    except Exception:
        return cap


def extract_features(text, embedder, category_focus=None):
    """Build the feature vector used by the classifiers and scorers.

    The vector is the concatenation of:
      * five basic text statistics,
      * four values per category in ``CATEGORIES`` (keyword density, regex
        pattern count, positive and negative rubric-word rates),
      * one similarity value between the text and ``category_focus``
        (0 when no focus category is given),
      * the sentence embedding of the text, truncated to 512 values.

    Args:
        text: Segment text to featurize.
        embedder: Sentence-embedding model exposing ``encode``.
        category_focus: Optional category name; its keyword density is
            doubled and its description is used for the similarity feature.

    Returns:
        numpy.ndarray: 1-D feature vector.
    """
    max_embedding_dim = 512  # limit embedding size for memory efficiency
    text_lower = text.lower()
    words = text.split()
    word_count = max(len(words), 1)

    # Basic text statistics
    features = [
        len(text),
        len(words),
        len(set(words)) / word_count,
        len(re.findall(r'[.!?]', text)),
        # Count the standalone pronoun "I" rather than every capital I
        # (which would also count "ICU", "It", "In", ...).
        len(re.findall(r'\bI\b', text)) / word_count,
    ]

    # Process all categories
    for cat_name, cat_info in CATEGORIES.items():
        keywords = cat_info['keywords']
        keyword_matches = sum(1 for kw in keywords if kw.lower() in text_lower)
        keyword_density = keyword_matches / max(len(keywords), 1)
        if category_focus == cat_name:
            keyword_density *= 2
        features.append(keyword_density * 10)

        # Match case-insensitively: several patterns contain a capital "I"
        # and could never match against lowercased text.
        pattern_matches = sum(
            len(re.findall(pattern, text, flags=re.IGNORECASE))
            for pattern in cat_info.get('patterns', [])
        )
        features.append(pattern_matches)

        rubric_words = cat_info['rubric_features']
        positive_count = sum(1 for word in rubric_words['positive'] if word in text_lower)
        negative_count = sum(1 for word in rubric_words['negative'] if word in text_lower)
        features.extend([
            positive_count / word_count * 100,
            negative_count / word_count * 100
        ])

    # Get embeddings
    try:
        embedding = embedder.encode(text, convert_to_tensor=False, normalize_embeddings=True)
        if hasattr(embedding, 'cpu'):
            embedding = embedding.cpu().numpy()
        embedding = np.asarray(embedding).flatten()[:max_embedding_dim]
    except Exception:
        # Fall back to zeros of the SAME width a successful encode would
        # produce, so every feature vector keeps a consistent length.
        embedding = np.zeros(_embedding_width(embedder, max_embedding_dim), dtype=np.float32)

    # Category similarity
    similarity_feature = 0
    if category_focus and category_focus in CATEGORIES:
        focus = CATEGORIES[category_focus]
        category_text = f"{focus['description']} {' '.join(focus['keywords'][:10])}"
        try:
            category_embedding = embedder.encode(category_text, normalize_embeddings=True)
            if hasattr(category_embedding, 'cpu'):
                category_embedding = category_embedding.cpu().numpy()
            category_embedding = category_embedding.flatten()[:max_embedding_dim]
            similarity = cosine_similarity([embedding], [category_embedding])[0][0]
            similarity_feature = similarity * 10
        except Exception:
            similarity_feature = 0
    features.append(similarity_feature)

    features = np.array(features, dtype=np.float32)
    return np.concatenate([features, embedding])
def train_models(df, embedder):
    """Train per-category classifiers and score regressors.

    Args:
        df: Training frame with a ``text`` column plus ``<category>_applied``
            and ``<category>_score`` columns for every category in
            ``CATEGORIES``.
        embedder: Sentence-embedding model passed through to
            ``extract_features``.

    Returns:
        tuple: ``(scaler, classifiers, scorers, thresholds, accuracies,
        ensemble)`` where ``classifiers``/``scorers``/``thresholds``/
        ``ensemble`` are dicts keyed by category name and ``accuracies`` is
        a list of held-out accuracies in category order.
    """
    from sklearn.dummy import DummyRegressor

    categories = list(CATEGORIES.keys())
    progress_bar = st.progress(0)
    status_text = st.empty()
    status_text.text("Extracting features from training data...")

    # NOTE(review): features for a labelled row are built with
    # category_focus set to its TRUE category, while at inference every
    # category is tried in turn. That leaks label information into the
    # training features - confirm whether this is intended.
    all_features = []
    total_rows = len(df)
    # Track progress by row position: the index label is not guaranteed to
    # be 0..n-1, and a fraction above 1.0 is rejected by st.progress.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        text = row['text']
        true_categories = [cat for cat in categories if row[f"{cat}_applied"]]
        if true_categories:
            # Only the first applied category's features are used, so only
            # that one extraction is performed.
            features = extract_features(text, embedder, category_focus=true_categories[0])
        else:
            features = np.mean(
                [extract_features(text, embedder, category_focus=cat) for cat in categories],
                axis=0,
            )
        all_features.append(features)
        progress_bar.progress(position / total_rows)

    X = np.array(all_features)
    y_class = df[[f"{cat}_applied" for cat in categories]].values.astype(float)
    raw_scores = df[[f"{cat}_score" for cat in categories]].values.astype(float)
    # Normalised score (0..1) where the code is applied, 0 elsewhere
    y_score = np.where(y_class == 1, raw_scores / 4.0, 0.0)

    status_text.text("Training models...")

    # Split data
    X_train, X_test, y_class_train, y_class_test, y_score_train, y_score_test = train_test_split(
        X, y_class, y_score, test_size=0.2, random_state=42
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train classifiers and scorers
    classifiers = {}
    scorers = {}
    thresholds = {}
    ensemble = {}

    for i, cat in enumerate(categories):
        labels = y_class_train[:, i]
        n_positive = int(np.sum(labels))
        n_negative = len(labels) - n_positive
        # XGBoost needs enough positives AND at least one negative example
        use_xgb = n_positive >= 5 and n_negative > 0
        models = []

        # XGBoost classifier
        if use_xgb:
            xgb_clf = xgb.XGBClassifier(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.1,
                random_state=42,
                use_label_encoder=False,
                eval_metric='logloss'
            )
            xgb_clf.fit(X_train_scaled, labels)
            models.append(('xgb', xgb_clf))
            classifiers[cat] = xgb_clf

        # Random Forest as backup or ensemble member
        rf_clf = RandomForestClassifier(
            n_estimators=100,
            max_depth=6,
            class_weight='balanced',
            random_state=42
        )
        rf_clf.fit(X_train_scaled, labels)
        models.append(('rf', rf_clf))
        if not use_xgb:
            classifiers[cat] = rf_clf

        ensemble[cat] = models
        thresholds[cat] = 0.5

        # Train scorer on the rows where the category is applied
        mask = labels == 1
        if np.sum(mask) > 5:
            scorer = xgb.XGBRegressor(
                n_estimators=100,
                max_depth=4,
                random_state=42
            )
            scorer.fit(X_train_scaled[mask], y_score_train[mask, i])
        else:
            # Too few examples: always predict the mid-scale value
            scorer = DummyRegressor(strategy='constant', constant=0.5)
            scorer.fit(X_train_scaled, y_score_train[:, i])
        scorers[cat] = scorer

    # Calculate accuracies on the held-out split
    accuracies = [
        np.mean(classifiers[cat].predict(X_test_scaled) == y_class_test[:, i])
        for i, cat in enumerate(categories)
    ]

    status_text.empty()
    progress_bar.empty()

    return scaler, classifiers, scorers, thresholds, accuracies, ensemble
def _positive_probability(models, features_scaled):
    """Average the positive-class probability over models that can provide one."""
    probs = []
    for model in models:
        if not hasattr(model, 'predict_proba'):
            continue
        model_probs = model.predict_proba(features_scaled)
        # A model fitted on a single class returns one column; skip it
        if model_probs.shape[1] == 2:
            probs.append(model_probs[0, 1])
    return float(np.mean(probs)) if probs else 0.5


def classify_segment(text, embedder, scaler, classifiers, scorers, thresholds, ensemble=None):
    """Classify a segment of text and, if classified, score it 1-4.

    Args:
        text: Segment text.
        embedder: Sentence-embedding model passed to ``extract_features``.
        scaler: Fitted scaler applied to the feature vector.
        classifiers: Dict of category -> classifier (used when no ensemble
            entry exists for the category).
        scorers: Dict of category -> regressor predicting a 0..1 score.
        thresholds: Dict of category -> probability threshold (0.5 default).
        ensemble: Optional dict of category -> list of ``(name, model)``.

    Returns:
        dict: ``category``, ``score``, ``confidence``, ``text`` and
        ``all_probabilities``. ``category`` is ``'Unclassified'`` with
        ``score`` None when the best probability does not exceed its
        threshold.
    """
    category_results = {}
    scaled_features = {}

    for cat in CATEGORIES:
        features = extract_features(text, embedder, category_focus=cat)
        features_scaled = scaler.transform([features])
        # Keep the scaled vector so the winning category can be scored
        # without running the embedder again.
        scaled_features[cat] = features_scaled

        if ensemble and cat in ensemble:
            models = [model for _, model in ensemble[cat]]
        else:
            models = [classifiers[cat]]
        category_results[cat] = _positive_probability(models, features_scaled)

    best_category = max(category_results, key=category_results.get)
    best_prob = category_results[best_category]

    if best_prob <= thresholds.get(best_category, 0.5):
        return {
            'category': 'Unclassified',
            'score': None,
            'confidence': 0,
            'text': text,
            'all_probabilities': category_results
        }

    try:
        score_normalized = scorers[best_category].predict(scaled_features[best_category])[0]
        score = int(np.clip(np.round(score_normalized * 4), 1, 4))
    except Exception:
        # Best effort: fall back to a below-average score if scoring fails
        score = 2

    return {
        'category': best_category,
        'score': score,
        'confidence': float(best_prob),
        'text': text,
        'all_probabilities': category_results
    }
def analyze_statement(text, embedder, scaler, classifiers, scorers, thresholds, ensemble=None):
    """Segment a personal statement, classify each segment, and summarise per category.

    Returns:
        tuple: ``(segment_results, category_results)``. ``segment_results``
        is the list of per-segment dicts from ``classify_segment`` with a
        1-based ``segment_num`` added. ``category_results`` maps each
        category to ``detected``, ``score`` (rounded mean of its segments'
        scores), ``confidence`` (highest segment confidence),
        ``num_segments`` and ``segments``.
    """
    segment_results = []
    for number, chunk in enumerate(segment_text(text, embedder), start=1):
        outcome = classify_segment(chunk, embedder, scaler, classifiers, scorers, thresholds, ensemble)
        outcome['segment_num'] = number
        segment_results.append(outcome)

    # Aggregate results by category
    category_results = {}
    for cat in CATEGORIES.keys():
        matched = [entry for entry in segment_results if entry['category'] == cat]

        if not matched:
            category_results[cat] = {
                'detected': False,
                'score': None,
                'confidence': 0,
                'num_segments': 0,
                'segments': []
            }
            continue

        mean_score = np.mean([entry['score'] for entry in matched])
        top_confidence = max([entry['confidence'] for entry in matched])
        category_results[cat] = {
            'detected': True,
            'score': int(np.round(mean_score)),
            'confidence': top_confidence,
            'num_segments': len(matched),
            'segments': matched
        }

    return segment_results, category_results
def create_pdf_report(segment_results, category_results):
    """Render the executive-summary PDF for an analysis.

    Returns:
        io.BytesIO | None: Buffer positioned at the start of the PDF, or
        ``None`` when the PDF libraries are unavailable.
    """
    if not PDF_AVAILABLE:
        return None

    brand_blue = '#1f4788'
    buffer = BytesIO()
    doc = SimpleDocTemplate(
        buffer,
        pagesize=letter,
        rightMargin=72,
        leftMargin=72,
        topMargin=72,
        bottomMargin=18,
    )
    styles = getSampleStyleSheet()

    # Custom styles
    title_style = ParagraphStyle(
        'CustomTitle',
        parent=styles['Heading1'],
        fontSize=24,
        textColor=colors.HexColor(brand_blue),
        spaceAfter=30,
        alignment=TA_CENTER,
    )
    heading_style = ParagraphStyle(
        'CustomHeading',
        parent=styles['Heading2'],
        fontSize=14,
        textColor=colors.HexColor(brand_blue),
        spaceAfter=12,
        spaceBefore=12,
    )

    # Summary figures
    detected_cats = [name for name, info in category_results.items() if info['detected']]
    if detected_cats:
        avg_score = np.mean([category_results[name]['score'] for name in detected_cats])
    else:
        avg_score = 0

    if avg_score >= 3.5:
        assessment = 'Excellent'
    elif avg_score >= 2.5:
        assessment = 'Good'
    else:
        assessment = 'Needs Improvement'

    summary_table = Table(
        [
            ['Metric', 'Value'],
            ['Categories Found', f"{len(detected_cats)}/4"],
            ['Average Score', f"{avg_score:.2f}/4"],
            ['Total Segments', str(len(segment_results))],
            ['Overall Assessment', assessment],
        ],
        colWidths=[3*inch, 2*inch],
    )
    header_row = ((0, 0), (-1, 0))
    summary_table.setStyle(TableStyle([
        ('BACKGROUND', *header_row, colors.HexColor(brand_blue)),
        ('TEXTCOLOR', *header_row, colors.whitesmoke),
        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
        ('FONTNAME', *header_row, 'Helvetica-Bold'),
        ('FONTSIZE', *header_row, 12),
        ('BOTTOMPADDING', *header_row, 12),
        ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
        ('GRID', (0, 0), (-1, -1), 1, colors.black),
    ]))

    generated_on = datetime.now().strftime('%B %d, %Y at %I:%M %p')
    elements = [
        # Title
        Paragraph("Medical School Personal Statement Analysis", title_style),
        Spacer(1, 12),
        # Date
        Paragraph(f"Generated: {generated_on}", styles['Normal']),
        Spacer(1, 20),
        # Executive Summary
        Paragraph("EXECUTIVE SUMMARY", heading_style),
        summary_table,
    ]

    # Build PDF
    doc.build(elements)
    buffer.seek(0)
    return buffer
# Main Application
def _init_session_state():
    """Seed st.session_state with every key the app reads, if not already set."""
    defaults = {
        'model_trained': False,
        'embedder': None,
        'embedder_name': None,
        'scaler': None,
        'classifiers': None,
        'scorers': None,
        'thresholds': None,
        'ensemble': None,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


def _render_train_tab():
    """Render Step 1: load the training data, train the models, show metrics."""
    st.header("Step 1: Train the AI Model")
    st.markdown("""
### Instructions:
Click the 'Train Model' button to automatically train the AI using:
- Pre-loaded Excel training files
- State-of-the-art e5-large-v2 transformer model
- Ensemble classification algorithms
""")

    # Check if models already exist in session
    if st.session_state.model_trained:
        st.success("β Model is already trained and ready for analysis!")
        st.info("You can proceed to Step 2 to analyze statements, or retrain if needed.")
    st.markdown("---")

    # Train button
    if not st.button("π Train Model", type="primary", use_container_width=True):
        return

    # Load training data
    with st.spinner("Loading training data from Excel files..."):
        df = load_training_data_from_files()
    if df is None or df.empty:
        st.error("""
β Could not load training data. Please ensure these files are present:
- DedooseChartExcerpts_2025_8_5_1025.xlsx
- Personal Statements Coded.xlsx
""")
        # Return instead of st.stop() so the remaining tabs still render
        return
    st.success(f"β Loaded {len(df)} training samples")

    # Show data distribution
    st.subheader("Training Data Distribution:")
    dist_cols = st.columns(4)
    for idx, cat in enumerate(CATEGORIES.keys()):
        if f"{cat}_applied" in df.columns:
            count = df[f"{cat}_applied"].sum()
            with dist_cols[idx % 4]:
                st.metric(cat, f"{int(count)} samples")

    # Load transformer model
    with st.spinner("Loading e5-large-v2 transformer model..."):
        if st.session_state.embedder is None:
            embedder, embedder_name = load_sentence_transformer()
            st.session_state.embedder = embedder
            # Remember which model actually loaded (it may be the fallback)
            st.session_state.embedder_name = embedder_name
        else:
            embedder = st.session_state.embedder
            embedder_name = st.session_state.get('embedder_name') or 'intfloat/e5-large-v2'
    if embedder is None:
        st.error("Failed to load transformer model")
        return
    st.info(f"Using model: {embedder_name}")

    # Train models
    st.subheader("Training Progress:")
    scaler, classifiers, scorers, thresholds, accuracies, ensemble = train_models(df, embedder)

    # Save to session state
    st.session_state.scaler = scaler
    st.session_state.classifiers = classifiers
    st.session_state.scorers = scorers
    st.session_state.thresholds = thresholds
    st.session_state.ensemble = ensemble
    st.session_state.model_trained = True
    st.success("β Training Complete!")

    # Show performance metrics
    st.subheader("Model Performance:")
    metrics_cols = st.columns(4)
    for idx, (cat, acc) in enumerate(zip(CATEGORIES.keys(), accuracies)):
        with metrics_cols[idx % 4]:
            st.metric(cat, f"{acc:.1%} accuracy")
    avg_accuracy = np.mean(accuracies)
    st.metric("**Overall Model Accuracy**", f"{avg_accuracy:.1%}")
    st.balloons()


def _read_statement_input():
    """Render the input widgets and return the statement text (or None/empty)."""
    input_method = st.radio(
        "Choose input method:",
        ["Upload Text File (.txt)", "Paste Text Directly"],
        horizontal=True
    )

    if input_method == "Upload Text File (.txt)":
        uploaded_file = st.file_uploader(
            "Choose a text file",
            type=['txt'],
            help="Upload your personal statement as a .txt file"
        )
        if uploaded_file is None:
            return None
        raw_bytes = uploaded_file.read()
        try:
            statement_text = raw_bytes.decode('utf-8')
        except UnicodeDecodeError:
            # Do not crash on files saved in another encoding
            statement_text = raw_bytes.decode('utf-8', errors='replace')
            st.warning("File is not valid UTF-8; undecodable characters were replaced.")
        st.success(f"β File uploaded ({len(statement_text)} characters)")
        with st.expander("Preview Statement"):
            st.text(statement_text[:500] + "..." if len(statement_text) > 500 else statement_text)
        return statement_text

    # Paste Text Directly
    statement_text = st.text_area(
        "Paste your personal statement here:",
        height=400,
        placeholder="Enter your complete personal statement...",
        help="Paste your entire personal statement for analysis"
    )
    if statement_text:
        st.info(f"π Statement length: {len(statement_text)} characters, {len(statement_text.split())} words")
    return statement_text


def _render_analysis_results(segment_results, category_results):
    """Render summary metrics, per-category results, segments and recommendations."""
    st.markdown("---")
    st.subheader("π Overall Summary")

    # Metrics
    col1, col2, col3, col4 = st.columns(4)
    detected_cats = [cat for cat, res in category_results.items() if res['detected']]
    avg_score = (
        np.mean([category_results[cat]['score'] for cat in detected_cats])
        if detected_cats else None
    )
    with col1:
        st.metric("Categories Found", f"{len(detected_cats)}/4")
    with col2:
        st.metric("Average Score", f"{avg_score:.1f}/4" if detected_cats else "N/A")
    with col3:
        st.metric("Total Segments", len(segment_results))
    with col4:
        if detected_cats:
            quality = "Excellent" if avg_score >= 3.5 else "Good" if avg_score >= 2.5 else "Needs Work"
            st.metric("Overall Quality", quality)
        else:
            st.metric("Overall Quality", "N/A")

    # Category Analysis
    st.markdown("---")
    st.subheader("π Category Analysis")
    for cat in CATEGORIES.keys():
        res = category_results[cat]
        if res['detected']:
            icon = "β " if res['score'] >= 3 else "β οΈ" if res['score'] >= 2 else "β"
            st.write(f"{icon} **{cat}**: Score {res['score']}/4 (Confidence: {res['confidence']:.1%})")
            st.progress(res['score'] / 4)
        else:
            st.write(f"β **{cat}**: Not detected")
            st.progress(0)

    # Segment Details
    st.markdown("---")
    st.subheader("π Segment-by-Segment Analysis")
    for segment in segment_results:
        # Unclassified segments have score None; show "N/A" rather than "None/4"
        score_label = f"{segment['score']}/4" if segment['score'] else "N/A"
        with st.expander(f"Segment {segment['segment_num']}: {segment['category']} (Score: {score_label})"):
            col1, col2 = st.columns([1, 3])
            with col1:
                st.metric("Category", segment['category'])
                st.metric("Score", score_label)
                st.metric("Confidence", f"{segment['confidence']:.1%}")
            with col2:
                st.write("**Text:**")
                st.write(segment['text'][:500] + "..." if len(segment['text']) > 500 else segment['text'])
                if segment['category'] != 'Unclassified' and segment['score']:
                    st.write("**Rubric:**")
                    st.info(CATEGORIES[segment['category']]['rubric'][segment['score']])

    # Recommendations
    st.markdown("---")
    st.subheader("π‘ Recommendations")
    missing_cats = [cat for cat, res in category_results.items() if not res['detected']]
    low_score_cats = [cat for cat, res in category_results.items()
                      if res['detected'] and res['score'] and res['score'] < 3]
    if missing_cats:
        st.error("**Missing Categories - Must Add:**")
        for cat in missing_cats:
            st.write(f"**{cat}:** {CATEGORIES[cat]['description']}")
            st.write(f"Keywords: {', '.join(CATEGORIES[cat]['keywords'][:8])}")
    if low_score_cats:
        st.warning("**Low-Scoring Categories - Improve:**")
        for cat in low_score_cats:
            score = category_results[cat]['score']
            st.write(f"**{cat}** (Score: {score}/4)")
            st.write(f"Target: {CATEGORIES[cat]['rubric'][4]}")
    if not missing_cats and not low_score_cats:
        st.success("Excellent! All categories present with good scores.")


def _render_download(segment_results, category_results):
    """Offer the PDF report, or a CSV of segment results when PDF support is missing."""
    st.markdown("---")
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    if PDF_AVAILABLE:
        pdf_buffer = create_pdf_report(segment_results, category_results)
        if pdf_buffer:
            st.download_button(
                label="π₯ Download PDF Report",
                data=pdf_buffer,
                file_name=f"analysis_{timestamp}.pdf",
                mime="application/pdf",
                use_container_width=True
            )
        return

    # CSV fallback
    results_df = pd.DataFrame([
        {
            'Segment': seg['segment_num'],
            'Category': seg['category'],
            'Score': seg['score'],
            'Confidence': seg['confidence']
        }
        for seg in segment_results
    ])
    st.download_button(
        label="π₯ Download CSV Report",
        data=results_df.to_csv(index=False),
        file_name=f"analysis_{timestamp}.csv",
        mime="text/csv",
        use_container_width=True
    )


def _render_analyze_tab():
    """Render Step 2: collect a statement, analyze it, and show the results."""
    st.header("Step 2: Analyze Personal Statements")

    # Check if models are trained. Return instead of calling st.stop():
    # stopping here would halt the whole script and prevent the rubrics
    # tab from rendering until a model has been trained.
    if not st.session_state.model_trained:
        st.warning("β οΈ No trained models found. Please complete Step 1: Train Model first.")
        return

    st.success("β Models loaded successfully")
    st.markdown("""
### Instructions:
Upload or paste a personal statement to receive:
- Category detection and scoring (1-4)
- Segment-by-segment analysis
- Detailed recommendations
- Downloadable PDF report
""")

    statement_text = _read_statement_input()

    if not statement_text:
        st.info("π Please upload or paste your personal statement to begin analysis")
        return
    if len(statement_text) <= 100:
        st.warning("β οΈ Please enter a longer statement (minimum 100 characters)")
        return

    # Analyze button
    if not st.button("π¬ Analyze Statement", type="primary", use_container_width=True):
        return

    with st.spinner("Analyzing your personal statement..."):
        segment_results, category_results = analyze_statement(
            statement_text,
            st.session_state.embedder,
            st.session_state.scaler,
            st.session_state.classifiers,
            st.session_state.scorers,
            st.session_state.thresholds,
            st.session_state.ensemble
        )
    st.success("β Analysis Complete!")
    st.balloons()

    _render_analysis_results(segment_results, category_results)
    _render_download(segment_results, category_results)


def _render_rubrics_tab():
    """Render Step 3: the scoring rubric and indicators for every category."""
    st.header("Step 3: Understanding the Scoring Rubrics")
    st.markdown("""
The AI model evaluates personal statements based on **4 key categories**,
each scored on a scale of **1 (Poor) to 4 (Excellent)**.
""")

    quality_labels = ['Poor', 'Below Average', 'Good', 'Excellent']
    # Callout style per score; anything not listed falls back to st.error
    callouts = {4: st.success, 3: st.info, 2: st.warning}

    for category, info in CATEGORIES.items():
        with st.expander(f"**{category}** - {info['description']}", expanded=False):
            # Scoring Criteria
            st.subheader("Scoring Criteria:")
            for score in [4, 3, 2, 1]:
                quality = quality_labels[score-1]
                show = callouts.get(score, st.error)
                show(f"**Score {score} ({quality}):** {info['rubric'][score]}")
            st.markdown("---")

            # Keywords and indicators
            col1, col2 = st.columns(2)
            with col1:
                st.markdown("**Key Terms:**")
                st.write(', '.join(info['keywords'][:10]))
            with col2:
                st.markdown("**Quality Indicators:**")
                st.write(f"β Positive: {', '.join(info['rubric_features']['positive'][:5])}")
                st.write(f"β Avoid: {', '.join(info['rubric_features']['negative'][:5])}")

    st.markdown("---")
    st.info("""
### Tips for High Scores:
- **Spark (4/4):** Create an engaging opening that clearly connects to your medical journey
- **Healthcare Experience (4/4):** Show active participation with vivid, thoughtful descriptions
- **Doctor Qualities (4/4):** Demonstrate mature, realistic understanding with specific examples
- **Spin (4/4):** Make direct, logical connections between experiences and medical career
""")


def main():
    """Streamlit entry point: page header, session state, and the three step tabs."""
    st.title("π₯ Medical School Personal Statement Analyzer")
    st.markdown("*Faith Marie Kurtyka, Cole Krudwig, Sean Dore, Sara Avila, George (Guy) McHendry, Steven Fernandes*")
    st.markdown("---")

    # Initialize session state
    _init_session_state()

    # Create three tabs
    tab1, tab2, tab3 = st.tabs(["π Step 1: Train Model", "π Step 2: Analyze Statements", "π Step 3: View Rubrics"])

    # Each tab renders in its own helper; the helpers return early rather
    # than calling st.stop(), so one tab can never block the others.
    with tab1:
        _render_train_tab()
    with tab2:
        _render_analyze_tab()
    with tab3:
        _render_rubrics_tab()
# Run the application
# Call main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()