"""Streamlit app that trains an ensemble classifier/scorer over a four-category
rubric (Spark, Healthcare Experience, Showing Doctor Qualities, Spin) and uses
it to segment, classify, and score medical-school personal statements."""

import streamlit as st
import pandas as pd
import numpy as np
import pickle  # noqa: F401 -- imported in the original; kept so behavior/imports are unchanged
import os
import re
from datetime import datetime
from io import BytesIO
import warnings

warnings.filterwarnings('ignore')

# Page config MUST be the first Streamlit call in the script.
st.set_page_config(
    page_title="Medical School Personal Statement Analyzer",
    page_icon="🏥",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Import ML libraries
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import torch  # noqa: F401 -- backend for sentence-transformers tensors

# Import PDF generation libraries (optional: a CSV fallback is offered if absent).
try:
    from reportlab.lib import colors
    from reportlab.lib.pagesizes import letter
    from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak
    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
    from reportlab.lib.units import inch
    from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
    PDF_AVAILABLE = True
except ImportError:
    PDF_AVAILABLE = False

# Categories with detailed rubric alignment.
# Each entry supplies keyword/regex cues used as handcrafted features plus the
# 1-4 rubric text shown in the UI and the positive/negative indicator words.
CATEGORIES = {
    'Spark': {
        'description': 'Opening that spurs interest in medicine (typically in opening paragraph)',
        'keywords': ['growing up', 'childhood', 'family', 'realized', 'inspired',
                     'first', 'beginning', 'early', 'experience that', 'moment',
                     'when I was', 'journey began', 'sparked my interest',
                     'drew me to medicine', 'passion for medicine', 'calling',
                     'fascinated', 'curiosity'],
        'patterns': [
            r'when I was \d+', r'at age \d+', r'since I was', r'as a child',
            r'early in my life', r'growing up', r'my journey to medicine'
        ],
        'rubric': {
            1: 'disconnected from being a doctor or confusing/random',
            2: 'somewhat connected but unclear',
            3: 'connected and clear',
            4: 'engaging and logically flows into becoming a doctor'
        },
        'rubric_features': {
            'positive': ['engaging', 'logical', 'clear connection', 'compelling', 'authentic'],
            'negative': ['disconnected', 'confusing', 'random', 'unclear', 'generic']
        }
    },
    'Healthcare Experience': {
        'description': 'Watching/participating in healthcare - medical professional at work',
        'keywords': ['shadowed', 'clinical', 'hospital', 'patient', 'doctor',
                     'physician', 'medical', 'treatment', 'observed', 'volunteer',
                     'clinic', 'rounds', 'surgery', 'emergency', 'ICU',
                     'residency', 'internship', 'scrubs', 'stethoscope',
                     'diagnosis', 'prognosis', 'bedside', 'ward', 'unit'],
        'patterns': [
            r'\d+ hours', r'volunteered at', r'shadowing', r'clinical experience',
            r'medical mission', r'worked in .+ hospital', r'during my rotation'
        ],
        'rubric': {
            1: 'passive observation, uninteresting, irrelevant, negative tone',
            2: 'bland/boring but not problematic',
            3: 'interesting and relevant',
            4: 'vivid, active, thoughtful, relevant, memorable, positive'
        },
        'rubric_features': {
            'positive': ['vivid', 'active', 'thoughtful', 'memorable', 'optimistic', 'engaged'],
            'negative': ['passive', 'uninteresting', 'irrelevant', 'problematic', 'pessimistic']
        }
    },
    'Showing Doctor Qualities': {
        'description': 'Stories/examples portraying vision of doctor role and appealing aspects',
        'keywords': ['leadership', 'empathy', 'compassion', 'responsibility',
                     'communication', 'advocate', 'caring', 'helping', 'service',
                     'volunteer', 'president', 'led', 'organized', 'taught',
                     'mentored', 'integrity', 'ethical', 'professional',
                     'dedication', 'perseverance', 'resilience', 'humble'],
        'patterns': [
            r'as (president|leader|captain)', r'I organized', r'I founded',
            r'demonstrated .+ leadership', r'showed .+ compassion'
        ],
        'rubric': {
            1: 'arrogant, immature, overly confident, inaccurate understanding',
            2: 'bland/boring but not problematic',
            3: 'shows some understanding',
            4: 'realistic, self-aware, mature, humble, specific understanding'
        },
        'rubric_features': {
            'positive': ['realistic', 'self-aware', 'mature', 'humble', 'specific', 'clear'],
            'negative': ['arrogant', 'immature', 'overly confident', 'simplistic', 'inaccurate']
        }
    },
    'Spin': {
        'description': 'Explaining why experiences qualify them to be a doctor',
        'keywords': ['learned', 'taught me', 'showed me', 'realized', 'understood',
                     'because', 'therefore', 'this experience', 'through this',
                     'as a doctor', 'future physician', 'will help me', 'prepared me'],
        'patterns': [
            r'this .+ taught me', r'I learned that', r'prepared me for',
            r'qualified me to', r'because of this', r'therefore I'
        ],
        'rubric': {
            1: 'brief, vague, simplistic connection, generic',
            2: 'some connection but generic',
            3: 'clear connection',
            4: 'direct, logical, and specific argument'
        },
        'rubric_features': {
            'positive': ['direct', 'logical', 'specific', 'clear argument', 'compelling'],
            'negative': ['brief', 'vague', 'simplistic', 'generic', 'weak']
        }
    }
}


@st.cache_resource
def load_sentence_transformer():
    """Load the e5-large-v2 sentence transformer model.

    Returns:
        (model, model_name) on success; (None, None) if no model could load.
        Falls back to the lighter all-MiniLM-L6-v2 model if e5-large-v2 fails.
    """
    try:
        # Try to load the preferred model.
        model = SentenceTransformer('intfloat/e5-large-v2')
        return model, 'intfloat/e5-large-v2'
    except Exception:  # FIX: was a bare `except:` (also caught SystemExit/KeyboardInterrupt)
        # Fallback to lighter model if e5-large-v2 fails.
        try:
            model = SentenceTransformer('all-MiniLM-L6-v2')
            return model, 'all-MiniLM-L6-v2'
        except Exception as e:
            st.error(f"Failed to load transformer: {e}")
            return None, None


def load_training_data_from_files():
    """Load and combine training data from the two Excel files.

    Returns:
        DataFrame with one row per excerpt containing `text`, `media_title`,
        and `{category}_applied` / `{category}_score` columns, or None when
        the expected files are missing or loading fails.
    """
    try:
        # File paths for the Excel files.
        file1_path = "DedooseChartExcerpts_2025_8_5_1025.xlsx"
        file2_path = "Personal Statements Coded.xlsx"

        # Check if files exist.
        if not os.path.exists(file1_path) or not os.path.exists(file2_path):
            return None

        # Load Excel files.
        df1 = pd.read_excel(file1_path)
        df2 = pd.read_excel(file2_path)

        # Combine dataframes.
        combined_df = pd.concat([df1, df2], ignore_index=True)

        processed_data = []
        for _, row in combined_df.iterrows():
            text = None
            # Look for text columns (the two files use different headers).
            for col_name in ['Excerpt Copy', 'Excerpt', 'Text', 'Content']:
                if col_name in row and pd.notna(row[col_name]):
                    text = str(row[col_name])
                    break
            if not text or text.strip() == '':
                continue

            data_point = {
                'text': text.strip(),
                'media_title': row.get('Media Title', 'Unknown')
            }

            # Process categories: Dedoose exports "Code: X Applied" / "Code: X Weight".
            for category in CATEGORIES.keys():
                col_applied = f"Code: {category} Applied"
                col_weight = f"Code: {category} Weight"
                is_applied = False
                if col_applied in row:
                    applied_val = str(row[col_applied]).lower()
                    is_applied = applied_val in ['true', '1', 'yes', 't']
                data_point[f"{category}_applied"] = is_applied

                if is_applied and col_weight in row:
                    weight = row[col_weight]
                    if pd.isna(weight) or weight == '':
                        weight = 2  # default mid-low score when weight is missing
                    else:
                        try:
                            weight = int(float(weight))
                            weight = max(1, min(4, weight))  # clamp into rubric range
                        except (ValueError, TypeError):  # FIX: was a bare `except:`
                            weight = 2
                else:
                    weight = 0  # category not applied
                data_point[f"{category}_score"] = weight

            processed_data.append(data_point)

        return pd.DataFrame(processed_data)
    except Exception as e:
        st.error(f"Error loading training data: {str(e)}")
        return None


def segment_text(text, embedder):
    """Segment text using semantic similarity.

    Prefers blank-line paragraphs; when the text is one big paragraph, splits
    into sentences and greedily merges consecutive sentences whose cosine
    similarity to the running segment embedding stays >= 0.7 (capped ~500 chars).
    """
    paragraphs = re.split(r'\n\s*\n', text)
    paragraphs = [p.strip() for p in paragraphs if p.strip() and len(p.strip()) > 50]

    if len(paragraphs) <= 1:
        sentences = re.split(r'(?<=[.!?])\s+', text)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
        if len(sentences) < 3:
            return [text]

        # Use embeddings for semantic segmentation.
        embeddings = embedder.encode(sentences, convert_to_tensor=True)
        segments = []
        current_segment = [sentences[0]]
        current_embedding = embeddings[0]
        for i in range(1, len(sentences)):
            similarity = util.cos_sim(current_embedding, embeddings[i]).item()
            if similarity < 0.7 or len(' '.join(current_segment)) > 500:
                segments.append(' '.join(current_segment))
                current_segment = [sentences[i]]
                current_embedding = embeddings[i]
            else:
                current_segment.append(sentences[i])
                # Running mean keeps the segment centroid up to date (approximate).
                current_embedding = (current_embedding + embeddings[i]) / 2
        if current_segment:
            segments.append(' '.join(current_segment))
        return segments

    return paragraphs


def extract_features(text, embedder, category_focus=None):
    """Extract features for classification.

    Concatenates handcrafted features (text stats, per-category keyword/pattern
    densities, rubric-word counts, optional category similarity) with the
    sentence embedding truncated to 512 dims for memory efficiency.
    """
    features = []
    text_lower = text.lower()
    words = text.split()

    # Basic text statistics.
    features.extend([
        len(text),
        len(words),
        len(set(words)) / max(len(words), 1),
        len(re.findall(r'[.!?]', text)),
        text.count('I') / max(len(words), 1),
    ])

    # Process all categories.
    for cat_name, cat_info in CATEGORIES.items():
        keywords = cat_info['keywords']
        keyword_matches = sum(1 for kw in keywords if kw.lower() in text_lower)
        keyword_density = keyword_matches / max(len(keywords), 1)
        if category_focus == cat_name:
            keyword_density *= 2  # boost the category we are scoring against
        features.append(keyword_density * 10)

        pattern_matches = 0
        for pattern in cat_info.get('patterns', []):
            matches = re.findall(pattern, text_lower)
            pattern_matches += len(matches)
        features.append(pattern_matches)

        positive_count = sum(1 for word in cat_info['rubric_features']['positive'] if word in text_lower)
        negative_count = sum(1 for word in cat_info['rubric_features']['negative'] if word in text_lower)
        features.extend([
            positive_count / max(len(words), 1) * 100,
            negative_count / max(len(words), 1) * 100
        ])

    # Get embeddings.
    try:
        embedding = embedder.encode(text, convert_to_tensor=False, normalize_embeddings=True)
        if hasattr(embedding, 'cpu'):
            embedding = embedding.cpu().numpy()
        embedding = embedding.flatten()
        # Limit embedding size for memory efficiency.
        embedding = embedding[:512] if len(embedding) > 512 else embedding
    except Exception:  # FIX: was a bare `except:`
        embedding = np.zeros(512)

    # Category similarity: cosine similarity between the text and a synthetic
    # description+keywords sentence for the focused category.
    if category_focus and category_focus in CATEGORIES:
        category_text = (
            f"{CATEGORIES[category_focus]['description']} "
            f"{' '.join(CATEGORIES[category_focus]['keywords'][:10])}"
        )
        try:
            category_embedding = embedder.encode(category_text, normalize_embeddings=True)
            if hasattr(category_embedding, 'cpu'):
                category_embedding = category_embedding.cpu().numpy()
            category_embedding = category_embedding.flatten()[:512]
            similarity = cosine_similarity([embedding[:512]], [category_embedding])[0][0]
            features.append(similarity * 10)
        except Exception:  # FIX: was a bare `except:`
            features.append(0)
    else:
        features.append(0)

    features = np.array(features, dtype=np.float32)
    combined_features = np.concatenate([features, embedding])
    return combined_features


def train_models(df, embedder):
    """Train ensemble models.

    Returns:
        (scaler, classifiers, scorers, thresholds, accuracies, ensemble)
        where `ensemble[cat]` is the list of (name, model) voters per category
        and `scorers[cat]` regresses the normalized rubric score.
    """
    all_features = []
    progress_bar = st.progress(0)
    status_text = st.empty()
    status_text.text("Extracting features from training data...")

    for idx, row in df.iterrows():
        text = row['text']
        category_features = {}
        for cat in CATEGORIES.keys():
            features = extract_features(text, embedder, category_focus=cat)
            category_features[cat] = features

        true_categories = [cat for cat in CATEGORIES.keys() if row[f"{cat}_applied"]]
        if true_categories:
            # Use the first labeled category's focused features for this sample.
            features = category_features[true_categories[0]]
        else:
            features = np.mean(list(category_features.values()), axis=0)
        all_features.append(features)
        progress_bar.progress((idx + 1) / len(df))

    X = np.array(all_features)
    categories = list(CATEGORIES.keys())
    y_class = df[[f"{cat}_applied" for cat in categories]].values.astype(float)

    # Scores normalized to [0, 1] (rubric 1-4 divided by 4; 0 when not applied).
    y_score = []
    for _, row in df.iterrows():
        scores = []
        for cat in categories:
            if row[f"{cat}_applied"]:
                scores.append(row[f"{cat}_score"] / 4.0)
            else:
                scores.append(0)
        y_score.append(scores)
    y_score = np.array(y_score)

    status_text.text("Training models...")

    # Split data.
    X_train, X_test, y_class_train, y_class_test, y_score_train, y_score_test = train_test_split(
        X, y_class, y_score, test_size=0.2, random_state=42
    )

    # Scale features.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train classifiers and scorers.
    classifiers = {}
    scorers = {}
    thresholds = {}
    ensemble = {}

    for i, cat in enumerate(categories):
        n_positive = np.sum(y_class_train[:, i])
        models = []

        # XGBoost classifier -- only when there are enough positive samples.
        if n_positive >= 5:
            # FIX: dropped deprecated `use_label_encoder=False` (removed in xgboost >= 2.0).
            xgb_clf = xgb.XGBClassifier(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.1,
                random_state=42,
                eval_metric='logloss'
            )
            xgb_clf.fit(X_train_scaled, y_class_train[:, i])
            models.append(('xgb', xgb_clf))
            classifiers[cat] = xgb_clf

        # Random Forest as backup or ensemble member.
        rf_clf = RandomForestClassifier(
            n_estimators=100,
            max_depth=6,
            class_weight='balanced',
            random_state=42
        )
        rf_clf.fit(X_train_scaled, y_class_train[:, i])
        models.append(('rf', rf_clf))
        if n_positive < 5:
            classifiers[cat] = rf_clf

        ensemble[cat] = models
        thresholds[cat] = 0.5

        # Train scorer on positive samples only; fall back to a constant
        # regressor when positives are too scarce to learn from.
        mask = y_class_train[:, i] == 1
        if np.sum(mask) > 5:
            scorer = xgb.XGBRegressor(
                n_estimators=100,
                max_depth=4,
                random_state=42
            )
            scorer.fit(X_train_scaled[mask], y_score_train[mask, i])
        else:
            from sklearn.dummy import DummyRegressor
            scorer = DummyRegressor(strategy='constant', constant=0.5)
            scorer.fit(X_train_scaled, y_score_train[:, i])
        scorers[cat] = scorer

    # Calculate held-out accuracies per category.
    accuracies = []
    for i, cat in enumerate(categories):
        preds = classifiers[cat].predict(X_test_scaled)
        acc = np.mean(preds == y_class_test[:, i])
        accuracies.append(acc)

    status_text.empty()
    progress_bar.empty()
    return scaler, classifiers, scorers, thresholds, accuracies, ensemble


def classify_segment(text, embedder, scaler, classifiers, scorers, thresholds, ensemble=None):
    """Classify a segment of text.

    Averages positive-class probabilities over the per-category ensemble,
    picks the best category, and (if above threshold) predicts its 1-4 score.
    Returns a dict with category, score, confidence, text, all_probabilities.
    """
    categories = list(CATEGORIES.keys())
    category_results = {}

    for cat in categories:
        features = extract_features(text, embedder, category_focus=cat)
        features_scaled = scaler.transform([features])

        if ensemble and cat in ensemble:
            probs = []
            for name, model in ensemble[cat]:
                if hasattr(model, 'predict_proba'):
                    model_probs = model.predict_proba(features_scaled)
                    if model_probs.shape[1] == 2:
                        probs.append(model_probs[0, 1])
            avg_prob = np.mean(probs) if probs else 0.5
        else:
            if hasattr(classifiers[cat], 'predict_proba'):
                probs = classifiers[cat].predict_proba(features_scaled)
                if probs.shape[1] == 2:
                    avg_prob = probs[0, 1]
                else:
                    avg_prob = 0.5
            else:
                avg_prob = 0.5
        category_results[cat] = avg_prob

    best_category = max(category_results, key=category_results.get)
    best_prob = category_results[best_category]

    if best_prob > thresholds.get(best_category, 0.5):
        features = extract_features(text, embedder, category_focus=best_category)
        features_scaled = scaler.transform([features])
        try:
            score_normalized = scorers[best_category].predict(features_scaled)[0]
            # Scorer outputs [0, 1]; map back to the 1-4 rubric range.
            score = int(np.clip(np.round(score_normalized * 4), 1, 4))
        except Exception:  # FIX: was a bare `except:`
            score = 2
        return {
            'category': best_category,
            'score': score,
            'confidence': float(best_prob),
            'text': text,
            'all_probabilities': category_results
        }
    else:
        return {
            'category': 'Unclassified',
            'score': None,
            'confidence': 0,
            'text': text,
            'all_probabilities': category_results
        }


def analyze_statement(text, embedder, scaler, classifiers, scorers, thresholds, ensemble=None):
    """Analyze complete personal statement.

    Returns:
        (segment_results, category_results) -- per-segment classifications and
        the per-category aggregate (detected flag, rounded mean score, max
        confidence, matching segments).
    """
    segments = segment_text(text, embedder)

    segment_results = []
    for i, segment in enumerate(segments):
        result = classify_segment(segment, embedder, scaler, classifiers, scorers, thresholds, ensemble)
        result['segment_num'] = i + 1
        segment_results.append(result)

    # Aggregate results by category.
    category_results = {}
    for cat in CATEGORIES.keys():
        cat_segments = [r for r in segment_results if r['category'] == cat]
        if cat_segments:
            scores = [s['score'] for s in cat_segments]
            avg_score = np.mean(scores)
            max_confidence = max([s['confidence'] for s in cat_segments])
            category_results[cat] = {
                'detected': True,
                'score': int(np.round(avg_score)),
                'confidence': max_confidence,
                'num_segments': len(cat_segments),
                'segments': cat_segments
            }
        else:
            category_results[cat] = {
                'detected': False,
                'score': None,
                'confidence': 0,
                'num_segments': 0,
                'segments': []
            }
    return segment_results, category_results


def create_pdf_report(segment_results, category_results):
    """Create PDF report.

    Returns a BytesIO buffer holding the generated PDF, or None when
    reportlab is unavailable.
    """
    if not PDF_AVAILABLE:
        return None

    buffer = BytesIO()
    doc = SimpleDocTemplate(buffer, pagesize=letter, rightMargin=72,
                            leftMargin=72, topMargin=72, bottomMargin=18)
    elements = []
    styles = getSampleStyleSheet()

    # Custom styles.
    title_style = ParagraphStyle(
        'CustomTitle',
        parent=styles['Heading1'],
        fontSize=24,
        textColor=colors.HexColor('#1f4788'),
        spaceAfter=30,
        alignment=TA_CENTER
    )
    heading_style = ParagraphStyle(
        'CustomHeading',
        parent=styles['Heading2'],
        fontSize=14,
        textColor=colors.HexColor('#1f4788'),
        spaceAfter=12,
        spaceBefore=12
    )

    # Title.
    elements.append(Paragraph("Medical School Personal Statement Analysis", title_style))
    elements.append(Spacer(1, 12))

    # Date.
    elements.append(Paragraph(f"Generated: {datetime.now().strftime('%B %d, %Y at %I:%M %p')}",
                              styles['Normal']))
    elements.append(Spacer(1, 20))

    # Executive Summary.
    elements.append(Paragraph("EXECUTIVE SUMMARY", heading_style))
    detected_cats = [cat for cat, res in category_results.items() if res['detected']]
    avg_score = np.mean([category_results[cat]['score'] for cat in detected_cats]) if detected_cats else 0
    summary_data = [
        ['Metric', 'Value'],
        ['Categories Found', f"{len(detected_cats)}/4"],
        ['Average Score', f"{avg_score:.2f}/4"],
        ['Total Segments', str(len(segment_results))],
        ['Overall Assessment',
         'Excellent' if avg_score >= 3.5 else 'Good' if avg_score >= 2.5 else 'Needs Improvement']
    ]
    summary_table = Table(summary_data, colWidths=[3 * inch, 2 * inch])
    summary_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#1f4788')),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, 0), 12),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
        ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
        ('GRID', (0, 0), (-1, -1), 1, colors.black)
    ]))
    elements.append(summary_table)

    # Build PDF.
    doc.build(elements)
    buffer.seek(0)
    return buffer


# Main Application
def main():
    """Render the three-tab Streamlit UI: train, analyze, and view rubrics."""
    st.title("🏥 Medical School Personal Statement Analyzer")
    st.markdown("*Faith Marie Kurtyka, Cole Krudwig, Sean Dore, Sara Avila, George (Guy) McHendry, Steven Fernandes*")
    st.markdown("---")

    # Initialize session state (data-driven; same keys/defaults as before).
    session_defaults = {
        'model_trained': False,
        'embedder': None,
        'scaler': None,
        'classifiers': None,
        'scorers': None,
        'thresholds': None,
        'ensemble': None,
    }
    for key, default in session_defaults.items():
        if key not in st.session_state:
            st.session_state[key] = default

    # Create three tabs.
    tab1, tab2, tab3 = st.tabs(["📚 Step 1: Train Model",
                                "📝 Step 2: Analyze Statements",
                                "📊 Step 3: View Rubrics"])

    # STEP 1: TRAIN MODEL
    with tab1:
        st.header("Step 1: Train the AI Model")
        st.markdown("""
        ### Instructions:
        Click the 'Train Model' button to automatically train the AI using:
        - Pre-loaded Excel training files
        - State-of-the-art e5-large-v2 transformer model
        - Ensemble classification algorithms
        """)

        # Check if models already exist in session.
        if st.session_state.model_trained:
            st.success("✅ Model is already trained and ready for analysis!")
            st.info("You can proceed to Step 2 to analyze statements, or retrain if needed.")

        st.markdown("---")

        # Train button.
        if st.button("🚀 Train Model", type="primary", use_container_width=True):
            # Load training data.
            with st.spinner("Loading training data from Excel files..."):
                df = load_training_data_from_files()

            if df is None or df.empty:
                st.error("""
                ❌ Could not load training data. Please ensure these files are present:
                - DedooseChartExcerpts_2025_8_5_1025.xlsx
                - Personal Statements Coded.xlsx
                """)
                st.stop()

            st.success(f"✅ Loaded {len(df)} training samples")

            # Show data distribution.
            st.subheader("Training Data Distribution:")
            dist_cols = st.columns(4)
            for idx, cat in enumerate(CATEGORIES.keys()):
                if f"{cat}_applied" in df.columns:
                    count = df[f"{cat}_applied"].sum()
                    with dist_cols[idx % 4]:
                        st.metric(cat, f"{int(count)} samples")

            # Load transformer model.
            with st.spinner("Loading e5-large-v2 transformer model..."):
                if st.session_state.embedder is None:
                    embedder, embedder_name = load_sentence_transformer()
                    st.session_state.embedder = embedder
                else:
                    embedder = st.session_state.embedder
                    embedder_name = 'intfloat/e5-large-v2'
            if embedder is None:
                st.error("Failed to load transformer model")
                st.stop()
            st.info(f"Using model: {embedder_name}")

            # Train models.
            st.subheader("Training Progress:")
            scaler, classifiers, scorers, thresholds, accuracies, ensemble = train_models(df, embedder)

            # Save to session state.
            st.session_state.scaler = scaler
            st.session_state.classifiers = classifiers
            st.session_state.scorers = scorers
            st.session_state.thresholds = thresholds
            st.session_state.ensemble = ensemble
            st.session_state.model_trained = True

            st.success("✅ Training Complete!")

            # Show performance metrics.
            st.subheader("Model Performance:")
            metrics_cols = st.columns(4)
            for idx, (cat, acc) in enumerate(zip(CATEGORIES.keys(), accuracies)):
                with metrics_cols[idx % 4]:
                    st.metric(cat, f"{acc:.1%} accuracy")
            avg_accuracy = np.mean(accuracies)
            st.metric("**Overall Model Accuracy**", f"{avg_accuracy:.1%}")
            st.balloons()

    # STEP 2: ANALYZE STATEMENTS
    with tab2:
        st.header("Step 2: Analyze Personal Statements")

        # Check if models are trained.
        if not st.session_state.model_trained:
            st.warning("⚠️ No trained models found. Please complete Step 1: Train Model first.")
            st.stop()

        st.success("✅ Models loaded successfully")
        st.markdown("""
        ### Instructions:
        Upload or paste a personal statement to receive:
        - Category detection and scoring (1-4)
        - Segment-by-segment analysis
        - Detailed recommendations
        - Downloadable PDF report
        """)

        # Input method selection.
        input_method = st.radio(
            "Choose input method:",
            ["Upload Text File (.txt)", "Paste Text Directly"],
            horizontal=True
        )

        statement_text = None
        if input_method == "Upload Text File (.txt)":
            uploaded_file = st.file_uploader(
                "Choose a text file",
                type=['txt'],
                help="Upload your personal statement as a .txt file"
            )
            if uploaded_file is not None:
                # FIX: errors='replace' so a non-UTF-8 upload cannot crash the app.
                statement_text = uploaded_file.read().decode('utf-8', errors='replace')
                st.success(f"✅ File uploaded ({len(statement_text)} characters)")
                with st.expander("Preview Statement"):
                    st.text(statement_text[:500] + "..." if len(statement_text) > 500 else statement_text)
        else:  # Paste Text Directly
            statement_text = st.text_area(
                "Paste your personal statement here:",
                height=400,
                placeholder="Enter your complete personal statement...",
                help="Paste your entire personal statement for analysis"
            )
            if statement_text:
                st.info(f"📊 Statement length: {len(statement_text)} characters, "
                        f"{len(statement_text.split())} words")

        # Analyze button.
        if statement_text and len(statement_text) > 100:
            if st.button("🔬 Analyze Statement", type="primary", use_container_width=True):
                with st.spinner("Analyzing your personal statement..."):
                    segment_results, category_results = analyze_statement(
                        statement_text,
                        st.session_state.embedder,
                        st.session_state.scaler,
                        st.session_state.classifiers,
                        st.session_state.scorers,
                        st.session_state.thresholds,
                        st.session_state.ensemble
                    )
                st.success("✅ Analysis Complete!")
                st.balloons()

                # Display results.
                st.markdown("---")
                st.subheader("📊 Overall Summary")

                # Metrics.
                col1, col2, col3, col4 = st.columns(4)
                detected_cats = [cat for cat, res in category_results.items() if res['detected']]
                with col1:
                    st.metric("Categories Found", f"{len(detected_cats)}/4")
                with col2:
                    if detected_cats:
                        avg_score = np.mean([category_results[cat]['score'] for cat in detected_cats])
                        st.metric("Average Score", f"{avg_score:.1f}/4")
                    else:
                        st.metric("Average Score", "N/A")
                with col3:
                    st.metric("Total Segments", len(segment_results))
                with col4:
                    if detected_cats:
                        avg_score = np.mean([category_results[cat]['score'] for cat in detected_cats])
                        quality = "Excellent" if avg_score >= 3.5 else "Good" if avg_score >= 2.5 else "Needs Work"
                        st.metric("Overall Quality", quality)
                    else:
                        st.metric("Overall Quality", "N/A")

                # Category Analysis.
                st.markdown("---")
                st.subheader("📋 Category Analysis")
                for cat in CATEGORIES.keys():
                    res = category_results[cat]
                    if res['detected']:
                        icon = "✅" if res['score'] >= 3 else "⚠️" if res['score'] >= 2 else "❌"
                        st.write(f"{icon} **{cat}**: Score {res['score']}/4 "
                                 f"(Confidence: {res['confidence']:.1%})")
                        st.progress(res['score'] / 4)
                    else:
                        st.write(f"❌ **{cat}**: Not detected")
                        st.progress(0)

                # Segment Details.
                st.markdown("---")
                st.subheader("📝 Segment-by-Segment Analysis")
                for segment in segment_results:
                    # FIX: unclassified segments used to render "Score: None/4".
                    score_label = f"{segment['score']}/4" if segment['score'] else "N/A"
                    with st.expander(f"Segment {segment['segment_num']}: "
                                     f"{segment['category']} (Score: {score_label})"):
                        col1, col2 = st.columns([1, 3])
                        with col1:
                            st.metric("Category", segment['category'])
                            st.metric("Score", score_label)
                            st.metric("Confidence", f"{segment['confidence']:.1%}")
                        with col2:
                            st.write("**Text:**")
                            st.write(segment['text'][:500] + "..."
                                     if len(segment['text']) > 500 else segment['text'])
                            if segment['category'] != 'Unclassified' and segment['score']:
                                st.write("**Rubric:**")
                                st.info(CATEGORIES[segment['category']]['rubric'][segment['score']])

                # Recommendations.
                st.markdown("---")
                st.subheader("💡 Recommendations")
                missing_cats = [cat for cat, res in category_results.items() if not res['detected']]
                low_score_cats = [cat for cat, res in category_results.items()
                                  if res['detected'] and res['score'] and res['score'] < 3]

                if missing_cats:
                    st.error("**Missing Categories - Must Add:**")
                    for cat in missing_cats:
                        st.write(f"**{cat}:** {CATEGORIES[cat]['description']}")
                        st.write(f"Keywords: {', '.join(CATEGORIES[cat]['keywords'][:8])}")

                if low_score_cats:
                    st.warning("**Low-Scoring Categories - Improve:**")
                    for cat in low_score_cats:
                        score = category_results[cat]['score']
                        st.write(f"**{cat}** (Score: {score}/4)")
                        st.write(f"Target: {CATEGORIES[cat]['rubric'][4]}")

                if not missing_cats and not low_score_cats:
                    st.success("Excellent! All categories present with good scores.")

                # Download Report.
                st.markdown("---")
                if PDF_AVAILABLE:
                    pdf_buffer = create_pdf_report(segment_results, category_results)
                    if pdf_buffer:
                        st.download_button(
                            label="📥 Download PDF Report",
                            data=pdf_buffer,
                            file_name=f"analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf",
                            mime="application/pdf",
                            use_container_width=True
                        )
                else:
                    # CSV fallback when reportlab is not installed.
                    results_data = []
                    for seg in segment_results:
                        results_data.append({
                            'Segment': seg['segment_num'],
                            'Category': seg['category'],
                            'Score': seg['score'],
                            'Confidence': seg['confidence']
                        })
                    results_df = pd.DataFrame(results_data)
                    csv = results_df.to_csv(index=False)
                    st.download_button(
                        label="📥 Download CSV Report",
                        data=csv,
                        file_name=f"analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                        mime="text/csv",
                        use_container_width=True
                    )
        elif statement_text and len(statement_text) <= 100:
            st.warning("⚠️ Please enter a longer statement (minimum 100 characters)")
        else:
            st.info("👆 Please upload or paste your personal statement to begin analysis")

    # STEP 3: VIEW RUBRICS
    with tab3:
        st.header("Step 3: Understanding the Scoring Rubrics")
        st.markdown("""
        The AI model evaluates personal statements based on **4 key categories**,
        each scored on a scale of **1 (Poor) to 4 (Excellent)**.
        """)

        for category, info in CATEGORIES.items():
            with st.expander(f"**{category}** - {info['description']}", expanded=False):
                # Scoring Criteria.
                st.subheader("Scoring Criteria:")
                for score in [4, 3, 2, 1]:
                    quality = ['Poor', 'Below Average', 'Good', 'Excellent'][score - 1]
                    if score == 4:
                        st.success(f"**Score {score} ({quality}):** {info['rubric'][score]}")
                    elif score == 3:
                        st.info(f"**Score {score} ({quality}):** {info['rubric'][score]}")
                    elif score == 2:
                        st.warning(f"**Score {score} ({quality}):** {info['rubric'][score]}")
                    else:
                        st.error(f"**Score {score} ({quality}):** {info['rubric'][score]}")

                st.markdown("---")

                # Keywords and indicators.
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("**Key Terms:**")
                    st.write(', '.join(info['keywords'][:10]))
                with col2:
                    st.markdown("**Quality Indicators:**")
                    st.write(f"✅ Positive: {', '.join(info['rubric_features']['positive'][:5])}")
                    st.write(f"❌ Avoid: {', '.join(info['rubric_features']['negative'][:5])}")

        st.markdown("---")
        st.info("""
        ### Tips for High Scores:
        - **Spark (4/4):** Create an engaging opening that clearly connects to your medical journey
        - **Healthcare Experience (4/4):** Show active participation with vivid, thoughtful descriptions
        - **Doctor Qualities (4/4):** Demonstrate mature, realistic understanding with specific examples
        - **Spin (4/4):** Make direct, logical connections between experiences and medical career
        """)


# Run the application
if __name__ == "__main__":
    main()