stevafernandes committed on
Commit
6ddadb4
·
verified ·
1 Parent(s): e96c629

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +185 -550
app.py CHANGED
@@ -3,665 +3,300 @@ import pandas as pd
3
  import numpy as np
4
  import pickle
5
  import os
6
- import torch
7
- from sentence_transformers import SentenceTransformer, util
8
  from sklearn.model_selection import train_test_split
9
  from sklearn.preprocessing import StandardScaler
10
  from sklearn.metrics.pairwise import cosine_similarity
11
  from sklearn.ensemble import RandomForestClassifier
 
12
  import xgboost as xgb
13
  import re
14
  import warnings
15
- from datetime import datetime
16
- import base64
17
- from io import BytesIO
18
  warnings.filterwarnings('ignore')
19
 
20
- # Set page config
21
  st.set_page_config(
22
  page_title="Medical School Personal Statement Analyzer",
23
  page_icon="🏥",
24
- layout="wide",
25
- initial_sidebar_state="expanded"
26
  )
27
 
28
- # Categories with detailed rubric alignment
29
  CATEGORIES = {
30
  'Spark': {
31
- 'description': 'Opening that spurs interest in medicine (typically in opening paragraph)',
32
  'keywords': ['growing up', 'childhood', 'family', 'realized', 'inspired', 'first',
33
- 'beginning', 'early', 'experience that', 'moment', 'when I was',
34
- 'journey began', 'sparked my interest', 'drew me to medicine',
35
- 'passion for medicine', 'calling', 'fascinated', 'curiosity'],
36
- 'patterns': [
37
- r'when I was \d+', r'at age \d+', r'since I was', r'as a child',
38
- r'early in my life', r'growing up', r'my journey to medicine'
39
- ],
40
  'rubric': {
41
- 1: 'disconnected from being a doctor or confusing/random',
42
  2: 'somewhat connected but unclear',
43
  3: 'connected and clear',
44
- 4: 'engaging and logically flows into becoming a doctor'
45
  },
46
  'rubric_features': {
47
- 'positive': ['engaging', 'logical', 'clear connection', 'compelling', 'authentic'],
48
  'negative': ['disconnected', 'confusing', 'random', 'unclear', 'generic']
49
  }
50
  },
51
  'Healthcare Experience': {
52
- 'description': 'Watching/participating in healthcare - medical professional at work',
53
  'keywords': ['shadowed', 'clinical', 'hospital', 'patient', 'doctor', 'physician',
54
- 'medical', 'treatment', 'observed', 'volunteer', 'clinic', 'rounds',
55
- 'surgery', 'emergency', 'ICU', 'residency', 'internship', 'scrubs',
56
- 'stethoscope', 'diagnosis', 'prognosis', 'bedside', 'ward', 'unit',
57
- 'healthcare', 'care team', 'medical team', 'attending', 'resident'],
58
- 'patterns': [
59
- r'\d+ hours', r'volunteered at', r'shadowing', r'clinical experience',
60
- r'medical mission', r'worked in .+ hospital', r'during my rotation'
61
- ],
62
  'rubric': {
63
- 1: 'passive observation, uninteresting, irrelevant, problematic, negative tone',
64
- 2: 'bland/boring but not problematic',
65
  3: 'interesting and relevant',
66
- 4: 'vivid, active, thoughtful, relevant, memorable, positive and optimistic'
67
  },
68
  'rubric_features': {
69
- 'positive': ['vivid', 'active', 'thoughtful', 'memorable', 'optimistic', 'engaged'],
70
- 'negative': ['passive', 'uninteresting', 'irrelevant', 'problematic', 'pessimistic']
71
  }
72
  },
73
  'Showing Doctor Qualities': {
74
- 'description': 'Stories/examples portraying vision of doctor role and appealing aspects',
75
  'keywords': ['leadership', 'empathy', 'compassion', 'responsibility', 'communication',
76
- 'advocate', 'caring', 'helping', 'service', 'volunteer', 'president',
77
- 'led', 'organized', 'taught', 'mentored', 'integrity', 'ethical',
78
- 'professional', 'dedication', 'perseverance', 'resilience', 'humble',
79
- 'self-aware', 'mature', 'understanding', 'patient-centered', 'holistic'],
80
- 'patterns': [
81
- r'as (president|leader|captain)', r'I organized', r'I founded',
82
- r'demonstrated .+ leadership', r'showed .+ compassion'
83
- ],
84
  'rubric': {
85
- 1: 'arrogant, immature, overly confident, inaccurate understanding, negative tone',
86
- 2: 'bland/boring but not problematic',
87
  3: 'shows some understanding',
88
- 4: 'realistic, self-aware, mature, humble, specific and clear understanding, positive'
89
  },
90
  'rubric_features': {
91
- 'positive': ['realistic', 'self-aware', 'mature', 'humble', 'specific', 'clear'],
92
- 'negative': ['arrogant', 'immature', 'overly confident', 'simplistic', 'inaccurate']
93
  }
94
  },
95
  'Spin': {
96
- 'description': 'Explaining why experiences qualify them to be a doctor',
97
  'keywords': ['learned', 'taught me', 'showed me', 'realized', 'understood',
98
- 'because', 'therefore', 'this experience', 'through this',
99
- 'as a doctor', 'future physician', 'will help me', 'prepared me',
100
- 'equipped me', 'qualified', 'ready', 'capable', 'competent',
101
- 'skills necessary', 'attributes required', 'prepared for'],
102
- 'patterns': [
103
- r'this .+ taught me', r'I learned that', r'prepared me for',
104
- r'qualified me to', r'because of this', r'therefore I'
105
- ],
106
  'rubric': {
107
- 1: 'brief, vague, simplistic connection to being a doctor, generic',
108
  2: 'some connection but generic',
109
  3: 'clear connection',
110
- 4: 'direct, logical, and specific argument connecting experience to profession'
111
  },
112
  'rubric_features': {
113
- 'positive': ['direct', 'logical', 'specific', 'clear argument', 'compelling connection'],
114
- 'negative': ['brief', 'vague', 'simplistic', 'generic', 'weak connection']
115
  }
116
  }
117
  }
118
 
119
  # Model paths
120
  MODEL_DIR = "trained_models"
121
- EMBEDDER_PATH = os.path.join(MODEL_DIR, "embedder_name.txt")
122
- CLASSIFIER_PATH = os.path.join(MODEL_DIR, "classifier.pkl")
123
- SCORER_PATH = os.path.join(MODEL_DIR, "scorer.pkl")
124
- SCALER_PATH = os.path.join(MODEL_DIR, "scaler.pkl")
125
- THRESHOLD_PATH = os.path.join(MODEL_DIR, "thresholds.pkl")
126
 
 
127
  @st.cache_resource
128
- def load_sentence_transformer():
129
- """Load sentence transformer model"""
130
  try:
131
- model = SentenceTransformer('all-MiniLM-L6-v2')
132
- return model, 'all-MiniLM-L6-v2'
133
  except:
134
- st.error("Failed to load sentence transformer model")
135
- return None, None
136
 
137
- def segment_text(text, embedder):
138
- """Segment text into meaningful chunks"""
139
- paragraphs = re.split(r'\n\s*\n', text)
140
- paragraphs = [p.strip() for p in paragraphs if p.strip() and len(p.strip()) > 50]
141
-
142
- if len(paragraphs) <= 1:
143
- sentences = re.split(r'(?<=[.!?])\s+', text)
144
- sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
145
-
146
- if len(sentences) < 3:
147
- return [text]
148
-
149
- segments = []
150
- current_segment = []
151
- for sent in sentences:
152
- current_segment.append(sent)
153
- if len(' '.join(current_segment)) > 300:
154
- segments.append(' '.join(current_segment))
155
- current_segment = []
156
- if current_segment:
157
- segments.append(' '.join(current_segment))
158
- return segments
159
-
160
- return paragraphs
161
-
162
- def extract_features(text, embedder, category_focus=None):
163
- """Extract features for classification"""
164
  features = []
165
  text_lower = text.lower()
166
  words = text.split()
167
 
168
- # Basic text statistics
169
  features.extend([
170
  len(text),
171
  len(words),
172
- len(set(words)) / max(len(words), 1),
173
- len(re.findall(r'[.!?]', text)),
174
- text.count('I') / max(len(words), 1),
175
  ])
176
 
177
- # Process all categories
178
  for cat_name, cat_info in CATEGORIES.items():
179
- keywords = cat_info['keywords']
180
- keyword_matches = sum(1 for kw in keywords if kw.lower() in text_lower)
181
- keyword_density = keyword_matches / max(len(keywords), 1)
182
-
183
- if category_focus == cat_name:
184
- keyword_density *= 2
185
-
186
- features.append(keyword_density * 10)
187
-
188
- pattern_matches = 0
189
- for pattern in cat_info.get('patterns', []):
190
- matches = re.findall(pattern, text_lower)
191
- pattern_matches += len(matches)
192
- features.append(pattern_matches)
193
-
194
- positive_count = sum(1 for word in cat_info['rubric_features']['positive']
195
- if word in text_lower)
196
- negative_count = sum(1 for word in cat_info['rubric_features']['negative']
197
- if word in text_lower)
198
-
199
- features.extend([
200
- positive_count / max(len(words), 1) * 100,
201
- negative_count / max(len(words), 1) * 100
202
- ])
203
 
204
- # Get embeddings
205
  try:
206
- embedding = embedder.encode(text, convert_to_tensor=False)
207
  if hasattr(embedding, 'cpu'):
208
  embedding = embedding.cpu().numpy()
209
- embedding = embedding.flatten()[:256] # Limit size
210
  except:
211
- embedding = np.zeros(256)
212
-
213
- # Category similarity
214
- if category_focus and category_focus in CATEGORIES:
215
- category_text = f"{CATEGORIES[category_focus]['description']} {' '.join(CATEGORIES[category_focus]['keywords'][:10])}"
216
- try:
217
- category_embedding = embedder.encode(category_text)
218
- if hasattr(category_embedding, 'cpu'):
219
- category_embedding = category_embedding.cpu().numpy()
220
- category_embedding = category_embedding.flatten()
221
- similarity = cosine_similarity([embedding], [category_embedding[:256]])[0][0]
222
- features.append(similarity * 10)
223
- except:
224
- features.append(0)
225
- else:
226
- features.append(0)
227
 
228
- features = np.array(features, dtype=np.float32)
229
- combined_features = np.concatenate([features, embedding])
230
-
231
- return combined_features
232
 
233
- def load_training_data(file1, file2):
234
- """Load and combine training data from Excel files"""
235
- try:
236
- df1 = pd.read_excel(file1)
237
- df2 = pd.read_excel(file2)
238
- except Exception as e:
239
- st.error(f"Error reading Excel files: {str(e)}")
240
- return pd.DataFrame()
241
-
242
- combined_df = pd.concat([df1, df2], ignore_index=True)
243
- processed_data = []
244
 
245
- for _, row in combined_df.iterrows():
246
- text = None
247
- for col_name in ['Excerpt Copy', 'Excerpt', 'Text', 'Content']:
248
- if col_name in row and pd.notna(row[col_name]):
249
- text = str(row[col_name])
250
- break
251
-
252
- if not text or text.strip() == '':
253
- continue
254
-
255
- data_point = {'text': text.strip()}
256
-
257
- for category in CATEGORIES.keys():
258
- col_applied = f"Code: {category} Applied"
259
- col_weight = f"Code: {category} Weight"
260
-
261
- is_applied = False
262
- if col_applied in row:
263
- applied_val = str(row[col_applied]).lower()
264
- is_applied = applied_val in ['true', '1', 'yes', 't']
265
-
266
- data_point[f"{category}_applied"] = is_applied
267
-
268
- if is_applied and col_weight in row:
269
- weight = row[col_weight]
270
- if pd.isna(weight) or weight == '':
271
- weight = 2
272
- else:
273
- try:
274
- weight = int(float(weight))
275
- weight = max(1, min(4, weight))
276
- except:
277
- weight = 2
278
- else:
279
- weight = 0
280
-
281
- data_point[f"{category}_score"] = weight
282
-
283
- processed_data.append(data_point)
284
-
285
- return pd.DataFrame(processed_data)
286
-
287
- def train_models(df, embedder):
288
- """Train classification and scoring models"""
289
- all_features = []
290
-
291
- progress_bar = st.progress(0)
292
- status_text = st.empty()
293
-
294
- status_text.text("Extracting features from training data...")
295
-
296
- for idx, row in df.iterrows():
297
- text = row['text']
298
-
299
- category_features = {}
300
- for cat in CATEGORIES.keys():
301
- features = extract_features(text, embedder, category_focus=cat)
302
- category_features[cat] = features
303
-
304
- true_categories = [cat for cat in CATEGORIES.keys() if row[f"{cat}_applied"]]
305
-
306
- if true_categories:
307
- features = category_features[true_categories[0]]
308
- else:
309
- features = np.mean(list(category_features.values()), axis=0)
310
-
311
- all_features.append(features)
312
- progress_bar.progress((idx + 1) / len(df))
313
-
314
- X = np.array(all_features)
315
-
316
- categories = list(CATEGORIES.keys())
317
- y_class = df[[f"{cat}_applied" for cat in categories]].values.astype(float)
318
-
319
- y_score = []
320
  for _, row in df.iterrows():
321
- scores = []
322
- for cat in categories:
323
- if row[f"{cat}_applied"]:
324
- scores.append(row[f"{cat}_score"] / 4.0)
325
- else:
326
- scores.append(0)
327
- y_score.append(scores)
328
- y_score = np.array(y_score)
329
-
330
- status_text.text("Training models...")
331
-
332
- # Split data
333
- X_train, X_test, y_class_train, y_class_test, y_score_train, y_score_test = train_test_split(
334
- X, y_class, y_score, test_size=0.2, random_state=42
335
- )
336
-
337
- # Scale features
338
  scaler = StandardScaler()
339
- X_train_scaled = scaler.fit_transform(X_train)
340
- X_test_scaled = scaler.transform(X_test)
341
-
342
- # Train classifiers
343
- classifiers = {}
344
- scorers = {}
345
- thresholds = {}
346
-
347
- for i, cat in enumerate(categories):
348
- # Train classifier
349
- clf = RandomForestClassifier(
350
- n_estimators=100,
351
- max_depth=6,
352
- class_weight='balanced',
353
- random_state=42
354
- )
355
- clf.fit(X_train_scaled, y_class_train[:, i])
356
- classifiers[cat] = clf
357
-
358
- # Train scorer
359
- mask = y_class_train[:, i] == 1
360
- if np.sum(mask) > 5:
361
- scorer = xgb.XGBRegressor(
362
- n_estimators=100,
363
- max_depth=4,
364
- random_state=42
365
- )
366
- scorer.fit(X_train_scaled[mask], y_score_train[mask, i])
367
- else:
368
- from sklearn.dummy import DummyRegressor
369
- scorer = DummyRegressor(strategy='constant', constant=0.5)
370
- scorer.fit(X_train_scaled, y_score_train[:, i])
371
-
372
- scorers[cat] = scorer
373
- thresholds[cat] = 0.5
374
-
375
- status_text.empty()
376
- progress_bar.empty()
377
-
378
- return scaler, classifiers, scorers, thresholds
379
-
380
- def save_models(embedder_name, scaler, classifiers, scorers, thresholds):
381
- """Save all trained models"""
382
- os.makedirs(MODEL_DIR, exist_ok=True)
383
 
384
- with open(EMBEDDER_PATH, 'w') as f:
385
- f.write(embedder_name)
386
 
387
- with open(SCALER_PATH, 'wb') as f:
388
- pickle.dump(scaler, f)
389
-
390
- with open(CLASSIFIER_PATH, 'wb') as f:
391
- pickle.dump(classifiers, f)
392
-
393
- with open(SCORER_PATH, 'wb') as f:
394
- pickle.dump(scorers, f)
395
-
396
- with open(THRESHOLD_PATH, 'wb') as f:
397
- pickle.dump(thresholds, f)
398
 
399
- def load_saved_models():
400
- """Load all saved models"""
401
- try:
402
- with open(EMBEDDER_PATH, 'r') as f:
403
- embedder_name = f.read().strip()
404
-
405
- embedder = SentenceTransformer(embedder_name)
406
-
407
- with open(SCALER_PATH, 'rb') as f:
408
- scaler = pickle.load(f)
409
-
410
- with open(CLASSIFIER_PATH, 'rb') as f:
411
- classifiers = pickle.load(f)
412
-
413
- with open(SCORER_PATH, 'rb') as f:
414
- scorers = pickle.load(f)
415
-
416
- with open(THRESHOLD_PATH, 'rb') as f:
417
- thresholds = pickle.load(f)
418
-
419
- return embedder, scaler, classifiers, scorers, thresholds
420
- except:
421
- return None, None, None, None, None
422
-
423
- def classify_segment(text, embedder, scaler, classifiers, scorers, thresholds):
424
- """Classify a segment of text"""
425
- categories = list(CATEGORIES.keys())
426
- category_results = {}
427
 
428
- for cat in categories:
429
- features = extract_features(text, embedder, category_focus=cat)
430
- features_scaled = scaler.transform([features])
431
-
432
- prob = classifiers[cat].predict_proba(features_scaled)[0, 1] if hasattr(classifiers[cat], 'predict_proba') else 0
433
- category_results[cat] = prob
434
-
435
- best_category = max(category_results, key=category_results.get)
436
- best_prob = category_results[best_category]
437
 
438
- if best_prob > thresholds.get(best_category, 0.5):
439
- features = extract_features(text, embedder, category_focus=best_category)
 
440
  features_scaled = scaler.transform([features])
441
 
442
- try:
443
- score_normalized = scorers[best_category].predict(features_scaled)[0]
444
- score = int(np.clip(np.round(score_normalized * 4), 1, 4))
445
- except:
446
- score = 2
447
 
448
- return {
449
- 'category': best_category,
450
- 'score': score,
451
- 'confidence': float(best_prob),
452
- 'text': text
453
- }
454
- else:
455
- return {
456
- 'category': 'Unclassified',
457
- 'score': None,
458
- 'confidence': 0,
459
- 'text': text
460
- }
461
 
462
- def analyze_statement(text, embedder, scaler, classifiers, scorers, thresholds):
463
- """Analyze complete personal statement"""
464
- segments = segment_text(text, embedder)
465
-
466
- segment_results = []
467
- for i, segment in enumerate(segments):
468
- result = classify_segment(segment, embedder, scaler, classifiers, scorers, thresholds)
469
- result['segment_num'] = i + 1
470
- segment_results.append(result)
471
-
472
- category_results = {}
473
- for cat in CATEGORIES.keys():
474
- cat_segments = [r for r in segment_results if r['category'] == cat]
475
- if cat_segments:
476
- scores = [s['score'] for s in cat_segments]
477
- avg_score = np.mean(scores)
478
- max_confidence = max([s['confidence'] for s in cat_segments])
479
-
480
- category_results[cat] = {
481
- 'detected': True,
482
- 'score': int(np.round(avg_score)),
483
- 'confidence': max_confidence,
484
- 'num_segments': len(cat_segments)
485
- }
486
- else:
487
- category_results[cat] = {
488
- 'detected': False,
489
- 'score': None,
490
- 'confidence': 0,
491
- 'num_segments': 0
492
- }
493
-
494
- return segment_results, category_results
495
-
496
- # Main UI Code
497
  st.title("🏥 Medical School Personal Statement Analyzer")
498
- st.markdown("*AI-powered analysis based on medical school admission rubrics*")
499
- st.markdown("---")
500
 
501
- # Sidebar
502
- with st.sidebar:
503
- st.header("ℹ️ About")
504
- st.markdown("""
505
- This tool analyzes personal statements based on 4 key categories:
506
- - **Spark**: Opening that shows interest in medicine
507
- - **Healthcare Experience**: Clinical/medical experiences
508
- - **Doctor Qualities**: Leadership and character traits
509
- - **Spin**: Connecting experiences to medical career
510
-
511
- Each category is scored 1-4 (Poor to Excellent)
512
- """)
513
 
514
- # Create tabs
515
- tab1, tab2, tab3 = st.tabs(["📚 Train Model", "📝 Analyze Statement", "📊 View Rubrics"])
 
 
 
 
 
 
516
 
517
- # Train Model Tab
518
  with tab1:
519
- st.header("Train the AI Model")
520
-
521
- if all(os.path.exists(p) for p in [CLASSIFIER_PATH, SCORER_PATH, SCALER_PATH]):
522
- st.success("✓ Models already trained. You can analyze statements or retrain.")
523
 
524
- st.markdown("Upload training data files (Excel format with coded excerpts)")
525
 
526
- col1, col2 = st.columns(2)
527
- with col1:
528
- file1 = st.file_uploader("Training File 1", type=['xlsx'], key="file1")
529
- with col2:
530
- file2 = st.file_uploader("Training File 2", type=['xlsx'], key="file2")
531
 
532
- if file1 and file2:
533
- if st.button("Start Training", type="primary"):
534
- try:
535
- # Load data
536
- with st.spinner("Loading training data..."):
537
- df = load_training_data(file1, file2)
 
 
 
 
 
 
 
 
 
 
 
538
 
539
- if df.empty:
540
- st.error("No valid training data found.")
541
- else:
542
- st.success(f"✓ Loaded {len(df)} training samples")
543
-
544
- # Load embedder
545
- with st.spinner("Loading transformer model..."):
546
- embedder, embedder_name = load_sentence_transformer()
547
-
548
- if embedder is not None:
549
- # Train
550
- scaler, classifiers, scorers, thresholds = train_models(df, embedder)
551
-
552
- # Save
553
- save_models(embedder_name, scaler, classifiers, scorers, thresholds)
554
- st.success(" Training complete! Models saved.")
555
- else:
556
- st.error("Failed to load transformer model")
557
-
558
- except Exception as e:
559
- st.error(f"Training failed: {str(e)}")
560
 
561
- # Analyze Statement Tab
562
  with tab2:
563
- st.header("Analyze Personal Statement")
564
 
565
- if not all(os.path.exists(p) for p in [CLASSIFIER_PATH, SCORER_PATH, SCALER_PATH]):
566
- st.warning("⚠️ Please train the model first (Tab 1)")
567
  else:
568
- # Load models
569
- embedder, scaler, classifiers, scorers, thresholds = load_saved_models()
570
 
571
- if embedder is None:
572
- st.error("Failed to load models. Please retrain.")
573
- else:
574
- # Input method
575
- input_method = st.radio("Choose input method:", ["Paste Text", "Upload File"])
 
 
 
576
 
577
- text_to_analyze = None
578
 
579
- if input_method == "Paste Text":
580
- text_to_analyze = st.text_area(
581
- "Paste your personal statement here:",
582
- height=300,
583
- placeholder="Enter your personal statement..."
584
- )
585
- else:
586
- uploaded_file = st.file_uploader("Upload statement (.txt)", type=['txt'])
587
- if uploaded_file:
588
- text_to_analyze = str(uploaded_file.read(), 'utf-8')
589
- st.success("File uploaded successfully!")
590
 
591
- if text_to_analyze and st.button("Analyze Statement", type="primary"):
592
- with st.spinner("Analyzing..."):
593
- segment_results, category_results = analyze_statement(
594
- text_to_analyze, embedder, scaler, classifiers, scorers, thresholds
595
- )
596
-
597
- # Display results
598
- st.success("✓ Analysis complete!")
599
-
600
- # Summary
601
- st.subheader("📊 Overall Summary")
602
- cols = st.columns(4)
603
-
604
- detected = [cat for cat, res in category_results.items() if res['detected']]
605
-
606
- with cols[0]:
607
- st.metric("Categories Found", f"{len(detected)}/4")
608
- with cols[1]:
609
- if detected:
610
- avg_score = np.mean([category_results[cat]['score'] for cat in detected])
611
- st.metric("Average Score", f"{avg_score:.1f}/4")
612
- else:
613
- st.metric("Average Score", "N/A")
614
- with cols[2]:
615
- st.metric("Total Segments", len(segment_results))
616
- with cols[3]:
617
- quality = "Excellent" if len(detected) == 4 and avg_score >= 3.5 else "Good" if len(detected) >= 3 else "Needs Work"
618
- st.metric("Overall", quality)
619
-
620
- # Category breakdown
621
- st.subheader("📋 Category Analysis")
622
- for cat in CATEGORIES.keys():
623
- res = category_results[cat]
624
- if res['detected']:
625
- icon = "✅" if res['score'] >= 3 else "⚠️" if res['score'] >= 2 else "❌"
626
- st.write(f"{icon} **{cat}**: Score {res['score']}/4 (Confidence: {res['confidence']:.1%})")
627
- else:
628
- st.write(f"❌ **{cat}**: Not detected")
629
-
630
- # Segment details
631
- st.subheader("📝 Segment Details")
632
- for seg in segment_results:
633
- with st.expander(f"Segment {seg['segment_num']}: {seg['category']}"):
634
- st.write(f"**Score:** {seg['score']}/4" if seg['score'] else "N/A")
635
- st.write(f"**Confidence:** {seg['confidence']:.1%}")
636
- st.write(f"**Text:** {seg['text'][:300]}...")
637
-
638
- # Recommendations
639
- st.subheader("💡 Recommendations")
640
- missing = [cat for cat, res in category_results.items() if not res['detected']]
641
- low_score = [cat for cat, res in category_results.items()
642
- if res['detected'] and res['score'] and res['score'] < 3]
643
-
644
- if missing:
645
- st.warning("**Missing Categories:**")
646
- for cat in missing:
647
- st.write(f"• Add content for **{cat}**: {CATEGORIES[cat]['description']}")
648
-
649
- if low_score:
650
- st.info("**Areas to Improve:**")
651
- for cat in low_score:
652
- st.write(f"• Strengthen **{cat}** (current score: {category_results[cat]['score']}/4)")
653
-
654
- if not missing and not low_score:
655
- st.success("Excellent work! All categories present with good scores.")
656
 
657
- # View Rubrics Tab
658
  with tab3:
659
  st.header("Scoring Rubrics")
660
 
661
  for category, info in CATEGORIES.items():
662
- with st.expander(f"**{category}**"):
663
  st.write(f"**Description:** {info['description']}")
664
- st.write("**Scoring Criteria:**")
665
  for score in [4, 3, 2, 1]:
666
- st.write(f"• **Score {score}:** {info['rubric'][score]}")
667
- st.write(f"**Key Terms:** {', '.join(info['keywords'][:8])}")
 
3
  import numpy as np
4
  import pickle
5
  import os
6
+ from sentence_transformers import SentenceTransformer
 
7
  from sklearn.model_selection import train_test_split
8
  from sklearn.preprocessing import StandardScaler
9
  from sklearn.metrics.pairwise import cosine_similarity
10
  from sklearn.ensemble import RandomForestClassifier
11
+ from sklearn.dummy import DummyRegressor
12
  import xgboost as xgb
13
  import re
14
  import warnings
 
 
 
15
  warnings.filterwarnings('ignore')
16
 
17
+ # Initialize Streamlit - MUST BE AT THE TOP
18
  st.set_page_config(
19
  page_title="Medical School Personal Statement Analyzer",
20
  page_icon="🏥",
21
+ layout="wide"
 
22
  )
23
 
24
+ # Categories definition
25
  CATEGORIES = {
26
  'Spark': {
27
+ 'description': 'Opening that spurs interest in medicine',
28
  'keywords': ['growing up', 'childhood', 'family', 'realized', 'inspired', 'first',
29
+ 'beginning', 'early', 'experience that', 'moment', 'when I was'],
30
+ 'patterns': [r'when I was \d+', r'at age \d+', r'since I was', r'as a child'],
 
 
 
 
 
31
  'rubric': {
32
+ 1: 'disconnected or confusing',
33
  2: 'somewhat connected but unclear',
34
  3: 'connected and clear',
35
+ 4: 'engaging and logical flow'
36
  },
37
  'rubric_features': {
38
+ 'positive': ['engaging', 'logical', 'clear', 'compelling', 'authentic'],
39
  'negative': ['disconnected', 'confusing', 'random', 'unclear', 'generic']
40
  }
41
  },
42
  'Healthcare Experience': {
43
+ 'description': 'Clinical/medical experiences',
44
  'keywords': ['shadowed', 'clinical', 'hospital', 'patient', 'doctor', 'physician',
45
+ 'medical', 'treatment', 'observed', 'volunteer', 'clinic'],
46
+ 'patterns': [r'\d+ hours', r'volunteered at', r'shadowing', r'clinical experience'],
 
 
 
 
 
 
47
  'rubric': {
48
+ 1: 'passive, uninteresting, negative',
49
+ 2: 'bland but not problematic',
50
  3: 'interesting and relevant',
51
+ 4: 'vivid, active, thoughtful, memorable'
52
  },
53
  'rubric_features': {
54
+ 'positive': ['vivid', 'active', 'thoughtful', 'memorable', 'optimistic'],
55
+ 'negative': ['passive', 'uninteresting', 'irrelevant', 'problematic']
56
  }
57
  },
58
  'Showing Doctor Qualities': {
59
+ 'description': 'Leadership and doctor qualities',
60
  'keywords': ['leadership', 'empathy', 'compassion', 'responsibility', 'communication',
61
+ 'advocate', 'caring', 'helping', 'service', 'volunteer'],
62
+ 'patterns': [r'as (president|leader|captain)', r'I organized', r'I founded'],
 
 
 
 
 
 
63
  'rubric': {
64
+ 1: 'arrogant, immature, inaccurate',
65
+ 2: 'bland but not problematic',
66
  3: 'shows some understanding',
67
+ 4: 'realistic, mature, humble, clear'
68
  },
69
  'rubric_features': {
70
+ 'positive': ['realistic', 'self-aware', 'mature', 'humble', 'specific'],
71
+ 'negative': ['arrogant', 'immature', 'overly confident', 'simplistic']
72
  }
73
  },
74
  'Spin': {
75
+ 'description': 'Connecting experiences to medical career',
76
  'keywords': ['learned', 'taught me', 'showed me', 'realized', 'understood',
77
+ 'because', 'therefore', 'this experience', 'prepared me'],
78
+ 'patterns': [r'this .+ taught me', r'I learned that', r'prepared me for'],
 
 
 
 
 
 
79
  'rubric': {
80
+ 1: 'vague, simplistic, generic',
81
  2: 'some connection but generic',
82
  3: 'clear connection',
83
+ 4: 'direct, logical, specific argument'
84
  },
85
  'rubric_features': {
86
+ 'positive': ['direct', 'logical', 'specific', 'clear argument'],
87
+ 'negative': ['brief', 'vague', 'simplistic', 'generic']
88
  }
89
  }
90
  }
91
 
92
  # Model paths
93
  MODEL_DIR = "trained_models"
 
 
 
 
 
94
 
95
+ # Helper functions
96
  @st.cache_resource
97
+ def load_transformer():
 
98
  try:
99
+ return SentenceTransformer('all-MiniLM-L6-v2')
 
100
  except:
101
+ return None
 
102
 
103
+ def extract_features(text, embedder):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  features = []
105
  text_lower = text.lower()
106
  words = text.split()
107
 
108
+ # Basic stats
109
  features.extend([
110
  len(text),
111
  len(words),
112
+ len(set(words)) / max(len(words), 1)
 
 
113
  ])
114
 
115
+ # Category features
116
  for cat_name, cat_info in CATEGORIES.items():
117
+ keyword_count = sum(1 for kw in cat_info['keywords'] if kw.lower() in text_lower)
118
+ features.append(keyword_count / len(cat_info['keywords']))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
+ # Get embedding
121
  try:
122
+ embedding = embedder.encode(text)
123
  if hasattr(embedding, 'cpu'):
124
  embedding = embedding.cpu().numpy()
125
+ embedding = embedding.flatten()[:128] # Reduced size
126
  except:
127
+ embedding = np.zeros(128)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
+ return np.concatenate([features, embedding])
 
 
 
130
 
131
+ def train_simple_model(df, embedder):
132
+ X = []
133
+ y_labels = []
 
 
 
 
 
 
 
 
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  for _, row in df.iterrows():
136
+ if 'text' in row:
137
+ text = str(row['text'])
138
+ features = extract_features(text, embedder)
139
+ X.append(features)
140
+
141
+ # Find category
142
+ label = 'Unknown'
143
+ for cat in CATEGORIES.keys():
144
+ if f"Code: {cat} Applied" in row:
145
+ if row[f"Code: {cat} Applied"] in [True, 1, '1', 'true', 'True']:
146
+ label = cat
147
+ break
148
+ y_labels.append(label)
149
+
150
+ X = np.array(X)
151
+
152
+ # Train classifier
153
  scaler = StandardScaler()
154
+ X_scaled = scaler.fit_transform(X)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
+ clf = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
157
+ clf.fit(X_scaled, y_labels)
158
 
159
+ return scaler, clf
 
 
 
 
 
 
 
 
 
 
160
 
161
+ def analyze_text(text, embedder, scaler, clf):
162
+ # Split into paragraphs
163
+ paragraphs = text.split('\n\n')
164
+ paragraphs = [p.strip() for p in paragraphs if len(p.strip()) > 50]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
+ if not paragraphs:
167
+ paragraphs = [text]
 
 
 
 
 
 
 
168
 
169
+ results = []
170
+ for i, para in enumerate(paragraphs):
171
+ features = extract_features(para, embedder)
172
  features_scaled = scaler.transform([features])
173
 
174
+ pred = clf.predict(features_scaled)[0]
175
+ prob = max(clf.predict_proba(features_scaled)[0])
 
 
 
176
 
177
+ results.append({
178
+ 'segment': i + 1,
179
+ 'category': pred,
180
+ 'confidence': prob,
181
+ 'text': para[:200] + '...' if len(para) > 200 else para
182
+ })
183
+
184
+ return results
 
 
 
 
 
185
 
186
+ # MAIN APP STARTS HERE
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  st.title("🏥 Medical School Personal Statement Analyzer")
188
+ st.markdown("Analyze personal statements based on medical school rubrics")
 
189
 
190
+ # Initialize session state
191
+ if 'model_trained' not in st.session_state:
192
+ st.session_state.model_trained = False
193
+ if 'scaler' not in st.session_state:
194
+ st.session_state.scaler = None
195
+ if 'clf' not in st.session_state:
196
+ st.session_state.clf = None
 
 
 
 
 
197
 
198
+ # Load transformer
199
+ embedder = load_transformer()
200
+ if embedder is None:
201
+ st.error("Failed to load model. Please refresh the page.")
202
+ st.stop()
203
+
204
+ # Tabs
205
+ tab1, tab2, tab3 = st.tabs(["Train Model", "Analyze Statement", "View Rubrics"])
206
 
 
207
  with tab1:
208
+ st.header("Step 1: Train the Model")
 
 
 
209
 
210
+ st.markdown("Upload Excel files with coded personal statement excerpts")
211
 
212
+ uploaded_file = st.file_uploader("Upload Training Data", type=['xlsx', 'csv'])
 
 
 
 
213
 
214
+ if uploaded_file:
215
+ try:
216
+ if uploaded_file.name.endswith('.csv'):
217
+ df = pd.read_csv(uploaded_file)
218
+ else:
219
+ df = pd.read_excel(uploaded_file)
220
+
221
+ st.success(f"Loaded {len(df)} rows")
222
+
223
+ # Process data
224
+ processed_data = []
225
+ for _, row in df.iterrows():
226
+ text_col = None
227
+ for col in ['Excerpt Copy', 'Excerpt', 'Text', 'Content']:
228
+ if col in row and pd.notna(row[col]):
229
+ text_col = col
230
+ break
231
 
232
+ if text_col:
233
+ processed_data.append({
234
+ 'text': str(row[text_col]),
235
+ **{col: row[col] for col in row.index if 'Code:' in col}
236
+ })
237
+
238
+ if processed_data:
239
+ train_df = pd.DataFrame(processed_data)
240
+
241
+ if st.button("Train Model"):
242
+ with st.spinner("Training..."):
243
+ scaler, clf = train_simple_model(train_df, embedder)
244
+ st.session_state.scaler = scaler
245
+ st.session_state.clf = clf
246
+ st.session_state.model_trained = True
247
+ st.success("Model trained successfully!")
248
+ else:
249
+ st.error("No valid text data found")
250
+
251
+ except Exception as e:
252
+ st.error(f"Error: {str(e)}")
253
 
 
254
  with tab2:
255
+ st.header("Step 2: Analyze Personal Statement")
256
 
257
+ if not st.session_state.model_trained:
258
+ st.warning("Please train the model first in Step 1")
259
  else:
260
+ text_input = st.text_area("Paste your personal statement:", height=300)
 
261
 
262
+ if text_input and st.button("Analyze"):
263
+ with st.spinner("Analyzing..."):
264
+ results = analyze_text(
265
+ text_input,
266
+ embedder,
267
+ st.session_state.scaler,
268
+ st.session_state.clf
269
+ )
270
 
271
+ st.success("Analysis Complete!")
272
 
273
+ # Summary
274
+ st.subheader("Summary")
275
+ categories_found = list(set([r['category'] for r in results if r['category'] != 'Unknown']))
276
+ st.metric("Categories Found", f"{len(categories_found)}/4")
 
 
 
 
 
 
 
277
 
278
+ # Details
279
+ st.subheader("Segment Analysis")
280
+ for result in results:
281
+ with st.expander(f"Segment {result['segment']}: {result['category']}"):
282
+ st.write(f"**Confidence:** {result['confidence']:.1%}")
283
+ st.write(f"**Text:** {result['text']}")
284
+
285
+ # Recommendations
286
+ st.subheader("Recommendations")
287
+ missing = [cat for cat in CATEGORIES.keys() if cat not in categories_found]
288
+ if missing:
289
+ st.warning("Missing categories:")
290
+ for cat in missing:
291
+ st.write(f"• Add {cat}: {CATEGORIES[cat]['description']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
 
293
  with tab3:
294
  st.header("Scoring Rubrics")
295
 
296
  for category, info in CATEGORIES.items():
297
+ with st.expander(category):
298
  st.write(f"**Description:** {info['description']}")
299
+ st.write("**Scoring:**")
300
  for score in [4, 3, 2, 1]:
301
+ st.write(f"• Score {score}: {info['rubric'][score]}")
302
+ st.write(f"**Keywords:** {', '.join(info['keywords'][:5])}...")