Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,674 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pickle
|
| 5 |
+
import os
|
| 6 |
+
import torch
|
| 7 |
+
from sentence_transformers import SentenceTransformer, util
|
| 8 |
+
from sklearn.model_selection import train_test_split
|
| 9 |
+
from sklearn.preprocessing import StandardScaler
|
| 10 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 11 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 12 |
+
import xgboost as xgb
|
| 13 |
+
import re
|
| 14 |
+
import warnings
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
import base64
|
| 17 |
+
from io import BytesIO
|
| 18 |
+
warnings.filterwarnings('ignore')
|
| 19 |
+
|
| 20 |
+
# Set page config
# NOTE: st.set_page_config must be the first Streamlit call executed in the script.
st.set_page_config(
    page_title="Medical School Personal Statement Analyzer",
    page_icon="π₯",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Categories with detailed rubric alignment.
# Each category drives three things in this file:
#   * extract_features(): 'keywords', 'patterns', and 'rubric_features'
#     produce per-category feature values,
#   * classify_segment()/analyze_statement(): category names are the label set,
#   * the "View Rubrics" tab in main(): 'description' and 'rubric' are displayed.
# NOTE(review): several string literals contain mojibake-looking glyphs
# (e.g. the page icon); confirm the intended emoji against the deployed app.
CATEGORIES = {
    'Spark': {
        'description': 'Opening that spurs interest in medicine (typically in opening paragraph)',
        'keywords': ['growing up', 'childhood', 'family', 'realized', 'inspired', 'first',
                     'beginning', 'early', 'experience that', 'moment', 'when I was',
                     'journey began', 'sparked my interest', 'drew me to medicine',
                     'passion for medicine', 'calling', 'fascinated', 'curiosity'],
        'patterns': [
            r'when I was \d+', r'at age \d+', r'since I was', r'as a child',
            r'early in my life', r'growing up', r'my journey to medicine'
        ],
        # Rubric text keyed by score 1 (worst) .. 4 (best); shown in the UI.
        'rubric': {
            1: 'disconnected from being a doctor or confusing/random',
            2: 'somewhat connected but unclear',
            3: 'connected and clear',
            4: 'engaging and logically flows into becoming a doctor'
        },
        # Words counted as positive/negative signals by extract_features().
        'rubric_features': {
            'positive': ['engaging', 'logical', 'clear connection', 'compelling', 'authentic'],
            'negative': ['disconnected', 'confusing', 'random', 'unclear', 'generic']
        }
    },
    'Healthcare Experience': {
        'description': 'Watching/participating in healthcare - medical professional at work',
        'keywords': ['shadowed', 'clinical', 'hospital', 'patient', 'doctor', 'physician',
                     'medical', 'treatment', 'observed', 'volunteer', 'clinic', 'rounds',
                     'surgery', 'emergency', 'ICU', 'residency', 'internship', 'scrubs',
                     'stethoscope', 'diagnosis', 'prognosis', 'bedside', 'ward', 'unit',
                     'healthcare', 'care team', 'medical team', 'attending', 'resident'],
        'patterns': [
            r'\d+ hours', r'volunteered at', r'shadowing', r'clinical experience',
            r'medical mission', r'worked in .+ hospital', r'during my rotation'
        ],
        'rubric': {
            1: 'passive observation, uninteresting, irrelevant, problematic, negative tone',
            2: 'bland/boring but not problematic',
            3: 'interesting and relevant',
            4: 'vivid, active, thoughtful, relevant, memorable, positive and optimistic'
        },
        'rubric_features': {
            'positive': ['vivid', 'active', 'thoughtful', 'memorable', 'optimistic', 'engaged'],
            'negative': ['passive', 'uninteresting', 'irrelevant', 'problematic', 'pessimistic']
        }
    },
    'Showing Doctor Qualities': {
        'description': 'Stories/examples portraying vision of doctor role and appealing aspects',
        'keywords': ['leadership', 'empathy', 'compassion', 'responsibility', 'communication',
                     'advocate', 'caring', 'helping', 'service', 'volunteer', 'president',
                     'led', 'organized', 'taught', 'mentored', 'integrity', 'ethical',
                     'professional', 'dedication', 'perseverance', 'resilience', 'humble',
                     'self-aware', 'mature', 'understanding', 'patient-centered', 'holistic'],
        'patterns': [
            r'as (president|leader|captain)', r'I organized', r'I founded',
            r'demonstrated .+ leadership', r'showed .+ compassion'
        ],
        'rubric': {
            1: 'arrogant, immature, overly confident, inaccurate understanding, negative tone',
            2: 'bland/boring but not problematic',
            3: 'shows some understanding',
            4: 'realistic, self-aware, mature, humble, specific and clear understanding, positive'
        },
        'rubric_features': {
            'positive': ['realistic', 'self-aware', 'mature', 'humble', 'specific', 'clear'],
            'negative': ['arrogant', 'immature', 'overly confident', 'simplistic', 'inaccurate']
        }
    },
    'Spin': {
        'description': 'Explaining why experiences qualify them to be a doctor',
        'keywords': ['learned', 'taught me', 'showed me', 'realized', 'understood',
                     'because', 'therefore', 'this experience', 'through this',
                     'as a doctor', 'future physician', 'will help me', 'prepared me',
                     'equipped me', 'qualified', 'ready', 'capable', 'competent',
                     'skills necessary', 'attributes required', 'prepared for'],
        'patterns': [
            r'this .+ taught me', r'I learned that', r'prepared me for',
            r'qualified me to', r'because of this', r'therefore I'
        ],
        'rubric': {
            1: 'brief, vague, simplistic connection to being a doctor, generic',
            2: 'some connection but generic',
            3: 'clear connection',
            4: 'direct, logical, and specific argument connecting experience to profession'
        },
        'rubric_features': {
            'positive': ['direct', 'logical', 'specific', 'clear argument', 'compelling connection'],
            'negative': ['brief', 'vague', 'simplistic', 'generic', 'weak connection']
        }
    }
}

# Model paths — artifacts written by save_models() and read by load_saved_models().
MODEL_DIR = "trained_models"
EMBEDDER_PATH = os.path.join(MODEL_DIR, "embedder_name.txt")   # name of the sentence-transformer used
CLASSIFIER_PATH = os.path.join(MODEL_DIR, "classifier.pkl")    # dict of per-category classifiers
SCORER_PATH = os.path.join(MODEL_DIR, "scorer.pkl")            # dict of per-category score regressors
SCALER_PATH = os.path.join(MODEL_DIR, "scaler.pkl")            # StandardScaler fitted during training
THRESHOLD_PATH = os.path.join(MODEL_DIR, "thresholds.pkl")     # dict of per-category probability thresholds
ENSEMBLE_PATH = os.path.join(MODEL_DIR, "ensemble.pkl")        # NOTE(review): never read or written in this file — appears unused
| 128 |
+
@st.cache_resource
def load_sentence_transformer():
    """Load a sentence-transformer embedding model.

    Tries a short list of known-good model names in order and returns the
    first one that loads. The name is returned alongside the model so it can
    be persisted (see save_models) and the same embedder rebuilt later by
    load_saved_models().

    Returns:
        tuple: (SentenceTransformer model, str model_name)
    """
    models_to_try = [
        'all-MiniLM-L6-v2',   # Lightweight and reliable
        'all-mpnet-base-v2'   # Good alternative
    ]

    for model_name in models_to_try:
        try:
            return SentenceTransformer(model_name), model_name
        except Exception:
            # Loading can fail for ordinary reasons (no network, corrupted
            # cache); fall through to the next candidate. A bare `except:`
            # here would also swallow KeyboardInterrupt/SystemExit.
            continue

    # Last resort: retry the default model and let any error propagate so the
    # user sees a real traceback instead of a silent failure.
    return SentenceTransformer('all-MiniLM-L6-v2'), 'all-MiniLM-L6-v2'
|
| 144 |
+
|
| 145 |
+
def segment_text(text, embedder):
    """Split a statement into analyzable chunks.

    Preference order: paragraphs (blank-line separated, > 50 chars each);
    failing that, groups of sentences accumulated until a group exceeds
    ~300 characters; failing that, the whole text as a single chunk.

    Note: `embedder` is accepted for interface symmetry with the other
    analysis helpers but is not used by this function.
    """
    stripped = (part.strip() for part in re.split(r'\n\s*\n', text))
    chunks = [part for part in stripped if part and len(part) > 50]

    # Multiple substantial paragraphs: use them directly.
    if len(chunks) > 1:
        return chunks

    # Fall back to sentence grouping.
    pieces = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text)]
    pieces = [s for s in pieces if len(s) > 20]

    if len(pieces) < 3:
        # Too little material to group — treat the whole text as one segment.
        return [text]

    grouped, buffer = [], []
    for sentence in pieces:
        buffer.append(sentence)
        if len(' '.join(buffer)) > 300:
            grouped.append(' '.join(buffer))
            buffer = []
    if buffer:
        grouped.append(' '.join(buffer))
    return grouped
|
| 170 |
+
|
| 171 |
+
def extract_features(text, embedder, category_focus=None):
    """Extract a fixed-length feature vector for classification/scoring.

    Layout: 5 basic text statistics, then 4 values per category (boosted
    keyword density, regex pattern hits, positive/negative rubric-term
    densities), one category-similarity value, then the first 256 dims of
    the sentence embedding.

    Args:
        text: segment text to featurize.
        embedder: sentence-transformer exposing .encode().
        category_focus: optional category name; doubles that category's
            keyword density and enables the similarity feature.

    Returns:
        np.ndarray of float32 features concatenated with the embedding slice.
    """
    features = []
    text_lower = text.lower()
    words = text.split()

    # Basic text statistics
    features.extend([
        len(text),
        len(words),
        len(set(words)) / max(len(words), 1),   # lexical diversity
        len(re.findall(r'[.!?]', text)),        # rough sentence count
        text.count('I') / max(len(words), 1),   # first-person density
    ])

    # Process all categories (order follows CATEGORIES insertion order, so the
    # feature layout is stable across calls).
    for cat_name, cat_info in CATEGORIES.items():
        keywords = cat_info['keywords']
        keyword_matches = sum(1 for kw in keywords if kw.lower() in text_lower)
        keyword_density = keyword_matches / max(len(keywords), 1)

        if category_focus == cat_name:
            keyword_density *= 2  # emphasize the category being evaluated

        features.append(keyword_density * 10)

        pattern_matches = 0
        for pattern in cat_info.get('patterns', []):
            pattern_matches += len(re.findall(pattern, text_lower))
        features.append(pattern_matches)

        positive_count = sum(1 for word in cat_info['rubric_features']['positive']
                             if word in text_lower)
        negative_count = sum(1 for word in cat_info['rubric_features']['negative']
                             if word in text_lower)

        features.extend([
            positive_count / max(len(words), 1) * 100,
            negative_count / max(len(words), 1) * 100
        ])

    # Get embeddings. Older sentence-transformers versions may not accept the
    # keyword arguments, so retry with the plain call on failure.
    try:
        embedding = embedder.encode(text, convert_to_tensor=False, normalize_embeddings=True)
    except Exception:
        embedding = embedder.encode(text)

    # Category similarity: cosine similarity between the segment embedding and
    # an embedding of the focused category's description + top keywords.
    if category_focus and category_focus in CATEGORIES:
        category_text = f"{CATEGORIES[category_focus]['description']} {' '.join(CATEGORIES[category_focus]['keywords'][:10])}"
        try:
            category_embedding = embedder.encode(category_text, normalize_embeddings=True)
            similarity = cosine_similarity([embedding], [category_embedding])[0][0]
            features.append(similarity * 10)
        except Exception:
            # Keep the vector length fixed even when encoding fails.
            features.append(0)
    else:
        features.append(0)

    features = np.array(features, dtype=np.float32)
    combined_features = np.concatenate([features, embedding[:256]])  # Limit embedding size

    return combined_features
|
| 235 |
+
|
| 236 |
+
def load_training_data(file1, file2):
    """Load and combine training data from two Excel files.

    Each row must carry excerpt text in one of several known columns and,
    per category, optional "Code: <cat> Applied" / "Code: <cat> Weight"
    columns produced by the coding workflow.

    Args:
        file1, file2: paths or file-like objects accepted by pd.read_excel.

    Returns:
        DataFrame with columns 'text', '<cat>_applied' (bool) and
        '<cat>_score' (0 when not applied, else an int clamped to 1-4).
        Empty DataFrame when either file cannot be read.
    """
    try:
        df1 = pd.read_excel(file1)
        df2 = pd.read_excel(file2)
    except Exception as e:
        st.error(f"Error reading Excel files: {str(e)}")
        return pd.DataFrame()

    combined_df = pd.concat([df1, df2], ignore_index=True)
    processed_data = []

    for _, row in combined_df.iterrows():
        # Accept any of the known excerpt column names; first non-empty wins.
        text = None
        for col_name in ['Excerpt Copy', 'Excerpt', 'Text', 'Content']:
            if col_name in row and pd.notna(row[col_name]):
                text = str(row[col_name])
                break

        if not text or text.strip() == '':
            continue

        data_point = {'text': text.strip()}

        for category in CATEGORIES.keys():
            col_applied = f"Code: {category} Applied"
            col_weight = f"Code: {category} Weight"

            is_applied = False
            if col_applied in row:
                applied_val = str(row[col_applied]).lower()
                is_applied = applied_val in ['true', '1', 'yes', 't']

            data_point[f"{category}_applied"] = is_applied

            if is_applied and col_weight in row:
                weight = row[col_weight]
                if pd.isna(weight) or weight == '':
                    weight = 2  # blank weight cell -> default mid score
                else:
                    try:
                        weight = int(float(weight))
                        weight = max(1, min(4, weight))  # clamp to rubric range 1-4
                    except (ValueError, TypeError):
                        # Unparseable weight value -> default mid score.
                        weight = 2
            else:
                weight = 0  # 0 means "category not applied"

            data_point[f"{category}_score"] = weight

        processed_data.append(data_point)

    return pd.DataFrame(processed_data)
|
| 289 |
+
|
| 290 |
+
def train_models(df, embedder):
    """Train per-category classifiers and score regressors.

    Args:
        df: output of load_training_data() — columns 'text',
            '<cat>_applied' and '<cat>_score' for each category.
        embedder: sentence-transformer used by extract_features().

    Returns:
        tuple: (scaler, classifiers, scorers, thresholds) where the last
        three are dicts keyed by category name.
    """
    all_features = []

    # Streamlit progress widgets; both are cleared before returning.
    progress_bar = st.progress(0)
    status_text = st.empty()

    status_text.text("Extracting features from training data...")

    for idx, row in df.iterrows():
        text = row['text']

        # One feature vector per category focus (focus doubles that
        # category's keyword-density feature inside extract_features()).
        category_features = {}
        for cat in CATEGORIES.keys():
            features = extract_features(text, embedder, category_focus=cat)
            category_features[cat] = features

        true_categories = [cat for cat in CATEGORIES.keys() if row[f"{cat}_applied"]]

        if true_categories:
            # Labeled sample: use the vector focused on its first true category.
            features = category_features[true_categories[0]]
        else:
            # Unlabeled sample: average the per-category vectors.
            features = np.mean(list(category_features.values()), axis=0)

        all_features.append(features)
        # NOTE: assumes df has a 0-based RangeIndex (true for the DataFrame
        # built by load_training_data) so idx + 1 tracks progress correctly.
        progress_bar.progress((idx + 1) / len(df))

    X = np.array(all_features)

    categories = list(CATEGORIES.keys())
    # Multi-label classification targets: one 0/1 column per category.
    y_class = df[[f"{cat}_applied" for cat in categories]].values.astype(float)

    # Regression targets: rubric weight normalized to [0, 1]; 0 if not applied.
    y_score = []
    for _, row in df.iterrows():
        scores = []
        for cat in categories:
            if row[f"{cat}_applied"]:
                scores.append(row[f"{cat}_score"] / 4.0)
            else:
                scores.append(0)
        y_score.append(scores)
    y_score = np.array(y_score)

    status_text.text("Training models...")

    # Split data
    X_train, X_test, y_class_train, y_class_test, y_score_train, y_score_test = train_test_split(
        X, y_class, y_score, test_size=0.2, random_state=42
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    # NOTE(review): X_test_scaled and the *_test splits are computed but never
    # evaluated — no held-out metrics are reported anywhere in this function.
    X_test_scaled = scaler.transform(X_test)

    # Train classifiers
    classifiers = {}
    scorers = {}
    thresholds = {}

    for i, cat in enumerate(categories):
        # Train classifier: one binary RandomForest per category.
        clf = RandomForestClassifier(
            n_estimators=100,
            max_depth=6,
            class_weight='balanced',
            random_state=42
        )
        clf.fit(X_train_scaled, y_class_train[:, i])
        classifiers[cat] = clf

        # Train scorer only on rows where the category actually applies.
        mask = y_class_train[:, i] == 1
        if np.sum(mask) > 5:
            scorer = xgb.XGBRegressor(
                n_estimators=100,
                max_depth=4,
                random_state=42
            )
            scorer.fit(X_train_scaled[mask], y_score_train[mask, i])
        else:
            # Too few positive samples: fall back to a constant mid predictor.
            from sklearn.dummy import DummyRegressor
            scorer = DummyRegressor(strategy='constant', constant=0.5)
            scorer.fit(X_train_scaled, y_score_train[:, i])

        scorers[cat] = scorer
        # Fixed decision threshold; not tuned per category.
        thresholds[cat] = 0.5

    status_text.empty()
    progress_bar.empty()

    return scaler, classifiers, scorers, thresholds
|
| 382 |
+
|
| 383 |
+
def save_models(embedder_name, scaler, classifiers, scorers, thresholds):
    """Persist all trained artifacts under MODEL_DIR.

    The embedder itself is not pickled — only its model name is written, so
    load_saved_models() can rebuild it from the hub/cache later.
    """
    os.makedirs(MODEL_DIR, exist_ok=True)

    with open(EMBEDDER_PATH, 'w') as handle:
        handle.write(embedder_name)

    # Pickle the remaining artifacts, one file each.
    artifacts = [
        (SCALER_PATH, scaler),
        (CLASSIFIER_PATH, classifiers),
        (SCORER_PATH, scorers),
        (THRESHOLD_PATH, thresholds),
    ]
    for path, obj in artifacts:
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)
|
| 401 |
+
|
| 402 |
+
def load_saved_models():
    """Load all artifacts previously written by save_models().

    Returns:
        tuple: (embedder, scaler, classifiers, scorers, thresholds), or
        (None, None, None, None, None) when any artifact is missing or
        fails to load — callers treat that as "model not trained yet".
    """
    try:
        with open(EMBEDDER_PATH, 'r') as f:
            embedder_name = f.read().strip()

        # Rebuild the embedder from its saved name (may download on first use).
        embedder = SentenceTransformer(embedder_name)

        with open(SCALER_PATH, 'rb') as f:
            scaler = pickle.load(f)

        with open(CLASSIFIER_PATH, 'rb') as f:
            classifiers = pickle.load(f)

        with open(SCORER_PATH, 'rb') as f:
            scorers = pickle.load(f)

        with open(THRESHOLD_PATH, 'rb') as f:
            thresholds = pickle.load(f)

        return embedder, scaler, classifiers, scorers, thresholds
    except Exception:
        # Missing files, pickle/version mismatches, and embedder download
        # failures all degrade to "not trained". Deliberately broad, but no
        # longer a bare `except:` (which would also catch SystemExit).
        return None, None, None, None, None
|
| 425 |
+
|
| 426 |
+
def classify_segment(text, embedder, scaler, classifiers, scorers, thresholds):
    """Classify one text segment into its best-matching rubric category.

    Picks the category whose classifier gives the highest probability; if
    that probability clears the category's threshold, a 1-4 score is
    predicted with the matching regressor.

    Returns:
        dict with keys 'category' (name or 'Unclassified'), 'score'
        (int 1-4 or None), 'confidence' (float), 'text'.
    """
    categories = list(CATEGORIES.keys())
    category_results = {}
    features_by_cat = {}

    for cat in categories:
        features = extract_features(text, embedder, category_focus=cat)
        features_by_cat[cat] = features  # cache: reused below for scoring
        features_scaled = scaler.transform([features])

        clf = classifiers[cat]
        prob = 0.0
        if hasattr(clf, 'predict_proba'):
            proba = clf.predict_proba(features_scaled)[0]
            classes = list(clf.classes_)
            # Guard: a classifier fitted on a single class returns one
            # probability column, so unconditional [0, 1] indexing would
            # raise IndexError. Look up the positive class explicitly.
            if 1 in classes:
                prob = float(proba[classes.index(1)])
        category_results[cat] = prob

    best_category = max(category_results, key=category_results.get)
    best_prob = category_results[best_category]

    if best_prob > thresholds.get(best_category, 0.5):
        # Reuse the already-computed feature vector instead of re-encoding
        # the text a second time.
        features_scaled = scaler.transform([features_by_cat[best_category]])

        try:
            score_normalized = scorers[best_category].predict(features_scaled)[0]
            # Regressor output is in [0, 1]; map back to the 1-4 rubric scale.
            score = int(np.clip(np.round(score_normalized * 4), 1, 4))
        except Exception:
            score = 2  # fall back to the mid score if prediction fails

        return {
            'category': best_category,
            'score': score,
            'confidence': float(best_prob),
            'text': text
        }
    else:
        return {
            'category': 'Unclassified',
            'score': None,
            'confidence': 0,
            'text': text
        }
|
| 464 |
+
|
| 465 |
+
def analyze_statement(text, embedder, scaler, classifiers, scorers, thresholds):
    """Analyze a complete personal statement.

    Segments the text, classifies each segment, then aggregates per-category
    results (rounded average score, max confidence, segment count).

    Returns:
        tuple: (segment_results, category_results) — a list of per-segment
        dicts and a dict keyed by category name.
    """
    segment_results = []
    for position, chunk in enumerate(segment_text(text, embedder), start=1):
        outcome = classify_segment(chunk, embedder, scaler, classifiers, scorers, thresholds)
        outcome['segment_num'] = position
        segment_results.append(outcome)

    category_results = {}
    for cat in CATEGORIES.keys():
        matching = [r for r in segment_results if r['category'] == cat]

        if not matching:
            category_results[cat] = {
                'detected': False,
                'score': None,
                'confidence': 0,
                'num_segments': 0
            }
            continue

        mean_score = np.mean([m['score'] for m in matching])
        category_results[cat] = {
            'detected': True,
            'score': int(np.round(mean_score)),
            'confidence': max(m['confidence'] for m in matching),
            'num_segments': len(matching)
        }

    return segment_results, category_results
|
| 498 |
+
|
| 499 |
+
# Main application
def main():
    """Render the Streamlit UI: a train tab, an analyze tab, and a rubric
    reference tab.

    NOTE(review): several glyph string literals below look mojibake-encoded
    (likely originally emoji such as checkmarks/warnings); confirm against
    the deployed file before editing them.
    """
    st.title("π₯ Medical School Personal Statement Analyzer")
    st.markdown("*AI-powered analysis based on medical school admission rubrics*")
    st.markdown("---")

    # Sidebar
    with st.sidebar:
        st.header("βΉοΈ About")
        st.markdown("""
        This tool analyzes personal statements based on 4 key categories:
        - **Spark**: Opening that shows interest in medicine
        - **Healthcare Experience**: Clinical/medical experiences
        - **Doctor Qualities**: Leadership and character traits
        - **Spin**: Connecting experiences to medical career

        Each category is scored 1-4 (Poor to Excellent)
        """)

    # Create tabs
    tab1, tab2, tab3 = st.tabs(["π Train Model", "π Analyze Statement", "π View Rubrics"])

    # Train Model Tab
    with tab1:
        st.header("Train the AI Model")

        # Presence of the pickled artifacts is used as the "already trained" signal.
        if all(os.path.exists(p) for p in [CLASSIFIER_PATH, SCORER_PATH, SCALER_PATH]):
            st.success("β Models already trained. You can analyze statements or retrain.")

        st.markdown("Upload training data files (Excel format with coded excerpts)")

        col1, col2 = st.columns(2)
        with col1:
            file1 = st.file_uploader("Training File 1", type=['xlsx'], key="file1")
        with col2:
            file2 = st.file_uploader("Training File 2", type=['xlsx'], key="file2")

        if file1 and file2:
            if st.button("Start Training", type="primary"):
                try:
                    # Load data
                    with st.spinner("Loading training data..."):
                        df = load_training_data(file1, file2)

                    if df.empty:
                        st.error("No valid training data found.")
                        # NOTE(review): this returns from main(), so tab3's
                        # rubric content is not rendered on this run.
                        return

                    st.success(f"β Loaded {len(df)} training samples")

                    # Load embedder
                    with st.spinner("Loading transformer model..."):
                        embedder, embedder_name = load_sentence_transformer()

                    # Train
                    scaler, classifiers, scorers, thresholds = train_models(df, embedder)

                    # Save
                    save_models(embedder_name, scaler, classifiers, scorers, thresholds)
                    st.success("β Training complete! Models saved.")

                except Exception as e:
                    st.error(f"Training failed: {str(e)}")

    # Analyze Statement Tab
    with tab2:
        st.header("Analyze Personal Statement")

        if not all(os.path.exists(p) for p in [CLASSIFIER_PATH, SCORER_PATH, SCALER_PATH]):
            st.warning("β οΈ Please train the model first (Tab 1)")
            # NOTE(review): returning here also skips rendering tab3 below.
            return

        # Load models
        embedder, scaler, classifiers, scorers, thresholds = load_saved_models()

        if embedder is None:
            st.error("Failed to load models. Please retrain.")
            return

        # Input method
        input_method = st.radio("Choose input method:", ["Paste Text", "Upload File"])

        text_to_analyze = None

        if input_method == "Paste Text":
            text_to_analyze = st.text_area(
                "Paste your personal statement here:",
                height=300,
                placeholder="Enter your personal statement..."
            )
        else:
            uploaded_file = st.file_uploader("Upload statement (.txt)", type=['txt'])
            if uploaded_file:
                # Uploaded file bytes are decoded as UTF-8.
                text_to_analyze = str(uploaded_file.read(), 'utf-8')
                st.success("File uploaded successfully!")

        if text_to_analyze and st.button("Analyze Statement", type="primary"):
            with st.spinner("Analyzing..."):
                segment_results, category_results = analyze_statement(
                    text_to_analyze, embedder, scaler, classifiers, scorers, thresholds
                )

            # Display results
            st.success("β Analysis complete!")

            # Summary
            st.subheader("π Overall Summary")
            cols = st.columns(4)

            detected = [cat for cat, res in category_results.items() if res['detected']]

            with cols[0]:
                st.metric("Categories Found", f"{len(detected)}/4")
            with cols[1]:
                if detected:
                    avg_score = np.mean([category_results[cat]['score'] for cat in detected])
                    st.metric("Average Score", f"{avg_score:.1f}/4")
                else:
                    st.metric("Average Score", "N/A")
            with cols[2]:
                st.metric("Total Segments", len(segment_results))
            with cols[3]:
                # avg_score is only evaluated when len(detected) == 4, in which
                # case it was assigned above — `and` short-circuits otherwise.
                quality = "Excellent" if len(detected) == 4 and avg_score >= 3.5 else "Good" if len(detected) >= 3 else "Needs Work"
                st.metric("Overall", quality)

            # Category breakdown
            st.subheader("π Category Analysis")
            for cat in CATEGORIES.keys():
                res = category_results[cat]
                if res['detected']:
                    # NOTE(review): this literal was split across lines in the
                    # rendered source; the glyphs are presumed mojibake emoji.
                    icon = "β" if res['score'] >= 3 else "β οΈ" if res['score'] >= 2 else "β"
                    st.write(f"{icon} **{cat}**: Score {res['score']}/4 (Confidence: {res['confidence']:.1%})")
                else:
                    st.write(f"β **{cat}**: Not detected")

            # Segment details
            st.subheader("π Segment Details")
            for seg in segment_results:
                with st.expander(f"Segment {seg['segment_num']}: {seg['category']}"):
                    # NOTE(review): when score is None this writes the bare
                    # string "N/A" without the "**Score:**" label.
                    st.write(f"**Score:** {seg['score']}/4" if seg['score'] else "N/A")
                    st.write(f"**Confidence:** {seg['confidence']:.1%}")
                    st.write(f"**Text:** {seg['text'][:300]}...")

            # Recommendations
            st.subheader("π‘ Recommendations")
            missing = [cat for cat, res in category_results.items() if not res['detected']]
            low_score = [cat for cat, res in category_results.items()
                         if res['detected'] and res['score'] and res['score'] < 3]

            if missing:
                st.warning("**Missing Categories:**")
                for cat in missing:
                    st.write(f"β’ Add content for **{cat}**: {CATEGORIES[cat]['description']}")

            if low_score:
                st.info("**Areas to Improve:**")
                for cat in low_score:
                    st.write(f"β’ Strengthen **{cat}** (current score: {category_results[cat]['score']}/4)")

            if not missing and not low_score:
                st.success("Excellent work! All categories present with good scores.")

    # View Rubrics Tab
    with tab3:
        st.header("Scoring Rubrics")

        for category, info in CATEGORIES.items():
            with st.expander(f"**{category}**"):
                st.write(f"**Description:** {info['description']}")
                st.write("**Scoring Criteria:**")
                # Display best-to-worst (4 down to 1).
                for score in [4, 3, 2, 1]:
                    st.write(f"β’ **Score {score}:** {info['rubric'][score]}")
                st.write(f"**Key Terms:** {', '.join(info['keywords'][:8])}")
| 672 |
+
|
| 673 |
+
# Script entry point. Streamlit executes the module top-to-bottom on each
# rerun, so this guard also fires when the app is run via `streamlit run`.
if __name__ == "__main__":
    main()
|