Spaces:

stevafernandes
/

personal_statement

Sleeping

App Files Files Community

stevafernandes committed on Aug 25, 2025

Commit

dc544bb

verified ·

1 Parent(s): a289e39

Update app.py

Browse files

Files changed (1) hide show

app.py +240 -151

app.py CHANGED Viewed

@@ -3,104 +3,121 @@ import pandas as pd
 import numpy as np
 import pickle
 import os
-from sentence_transformers import SentenceTransformer
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
-from sklearn.metrics.pairwise import cosine_similarity
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.dummy import DummyRegressor
-import xgboost as xgb
 import re
-import warnings
-warnings.filterwarnings('ignore')
-# Initialize Streamlit - MUST BE AT THE TOP
 st.set_page_config(
     page_title="Medical School Personal Statement Analyzer",
     page_icon="🏥",
     layout="wide"
 )
 # Categories definition
 CATEGORIES = {
     'Spark': {
         'description': 'Opening that spurs interest in medicine',
         'keywords': ['growing up', 'childhood', 'family', 'realized', 'inspired', 'first',
                     'beginning', 'early', 'experience that', 'moment', 'when I was'],
-        'patterns': [r'when I was \d+', r'at age \d+', r'since I was', r'as a child'],
         'rubric': {
             1: 'disconnected or confusing',
-            2: 'somewhat connected but unclear',
             3: 'connected and clear',
             4: 'engaging and logical flow'
-        },
-        'rubric_features': {
-            'positive': ['engaging', 'logical', 'clear', 'compelling', 'authentic'],
-            'negative': ['disconnected', 'confusing', 'random', 'unclear', 'generic']
         }
     },
     'Healthcare Experience': {
         'description': 'Clinical/medical experiences',
         'keywords': ['shadowed', 'clinical', 'hospital', 'patient', 'doctor', 'physician',
                     'medical', 'treatment', 'observed', 'volunteer', 'clinic'],
-        'patterns': [r'\d+ hours', r'volunteered at', r'shadowing', r'clinical experience'],
         'rubric': {
             1: 'passive, uninteresting, negative',
             2: 'bland but not problematic',
             3: 'interesting and relevant',
             4: 'vivid, active, thoughtful, memorable'
-        },
-        'rubric_features': {
-            'positive': ['vivid', 'active', 'thoughtful', 'memorable', 'optimistic'],
-            'negative': ['passive', 'uninteresting', 'irrelevant', 'problematic']
         }
     },
     'Showing Doctor Qualities': {
         'description': 'Leadership and doctor qualities',
         'keywords': ['leadership', 'empathy', 'compassion', 'responsibility', 'communication',
                     'advocate', 'caring', 'helping', 'service', 'volunteer'],
-        'patterns': [r'as (president|leader|captain)', r'I organized', r'I founded'],
         'rubric': {
             1: 'arrogant, immature, inaccurate',
             2: 'bland but not problematic',
             3: 'shows some understanding',
             4: 'realistic, mature, humble, clear'
-        },
-        'rubric_features': {
-            'positive': ['realistic', 'self-aware', 'mature', 'humble', 'specific'],
-            'negative': ['arrogant', 'immature', 'overly confident', 'simplistic']
         }
     },
     'Spin': {
         'description': 'Connecting experiences to medical career',
         'keywords': ['learned', 'taught me', 'showed me', 'realized', 'understood',
                     'because', 'therefore', 'this experience', 'prepared me'],
-        'patterns': [r'this .+ taught me', r'I learned that', r'prepared me for'],
         'rubric': {
             1: 'vague, simplistic, generic',
             2: 'some connection but generic',
             3: 'clear connection',
             4: 'direct, logical, specific argument'
-        },
-        'rubric_features': {
-            'positive': ['direct', 'logical', 'specific', 'clear argument'],
-            'negative': ['brief', 'vague', 'simplistic', 'generic']
         }
     }
 }
-# Model paths
-MODEL_DIR = "trained_models"
-# Helper functions
-@st.cache_resource
-def load_transformer():
     try:
-        return SentenceTransformer('all-MiniLM-L6-v2')
-    except:
         return None
 def extract_features(text, embedder):
     features = []
     text_lower = text.lower()
     words = text.split()
@@ -112,25 +129,32 @@ def extract_features(text, embedder):
         len(set(words)) / max(len(words), 1)
     ])
-    # Category features
     for cat_name, cat_info in CATEGORIES.items():
         keyword_count = sum(1 for kw in cat_info['keywords'] if kw.lower() in text_lower)
         features.append(keyword_count / len(cat_info['keywords']))
     # Get embedding
-    try:
-        embedding = embedder.encode(text)
-        if hasattr(embedding, 'cpu'):
-            embedding = embedding.cpu().numpy()
-        embedding = embedding.flatten()[:128]  # Reduced size
-    except:
         embedding = np.zeros(128)
     return np.concatenate([features, embedding])
-def train_simple_model(df, embedder):
     X = []
-    y_labels = []
     for _, row in df.iterrows():
         if 'text' in row:
@@ -138,28 +162,36 @@ def train_simple_model(df, embedder):
             features = extract_features(text, embedder)
             X.append(features)
-            # Find category
             label = 'Unknown'
             for cat in CATEGORIES.keys():
-                if f"Code: {cat} Applied" in row:
-                    if row[f"Code: {cat} Applied"] in [True, 1, '1', 'true', 'True']:
                         label = cat
                         break
-            y_labels.append(label)
     X = np.array(X)
-    # Train classifier
     scaler = StandardScaler()
     X_scaled = scaler.fit_transform(X)
     clf = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
-    clf.fit(X_scaled, y_labels)
     return scaler, clf
-def analyze_text(text, embedder, scaler, clf):
-    # Split into paragraphs
     paragraphs = text.split('\n\n')
     paragraphs = [p.strip() for p in paragraphs if len(p.strip()) > 50]
@@ -183,120 +215,177 @@ def analyze_text(text, embedder, scaler, clf):
     return results
-# MAIN APP STARTS HERE
-st.title("🏥 Medical School Personal Statement Analyzer")
-st.markdown("Analyze personal statements based on medical school rubrics")
-# Initialize session state
-if 'model_trained' not in st.session_state:
-    st.session_state.model_trained = False
-if 'scaler' not in st.session_state:
-    st.session_state.scaler = None
-if 'clf' not in st.session_state:
-    st.session_state.clf = None
-# Load transformer
-embedder = load_transformer()
-if embedder is None:
-    st.error("Failed to load model. Please refresh the page.")
-    st.stop()
-# Tabs
-tab1, tab2, tab3 = st.tabs(["Train Model", "Analyze Statement", "View Rubrics"])
-with tab1:
-    st.header("Step 1: Train the Model")
-    st.markdown("Upload Excel files with coded personal statement excerpts")
-    uploaded_file = st.file_uploader("Upload Training Data", type=['xlsx', 'csv'])
-    if uploaded_file:
-        try:
-            if uploaded_file.name.endswith('.csv'):
-                df = pd.read_csv(uploaded_file)
-            else:
-                df = pd.read_excel(uploaded_file)
-            st.success(f"Loaded {len(df)} rows")
-            # Process data
-            processed_data = []
-            for _, row in df.iterrows():
-                text_col = None
-                for col in ['Excerpt Copy', 'Excerpt', 'Text', 'Content']:
-                    if col in row and pd.notna(row[col]):
-                        text_col = col
-                        break
-                if text_col:
-                    processed_data.append({
-                        'text': str(row[text_col]),
-                        **{col: row[col] for col in row.index if 'Code:' in col}
-                    })
-            if processed_data:
-                train_df = pd.DataFrame(processed_data)
-                if st.button("Train Model"):
-                    with st.spinner("Training..."):
-                        scaler, clf = train_simple_model(train_df, embedder)
-                        st.session_state.scaler = scaler
-                        st.session_state.clf = clf
-                        st.session_state.model_trained = True
-                        st.success("Model trained successfully!")
-            else:
-                st.error("No valid text data found")
-        except Exception as e:
-            st.error(f"Error: {str(e)}")
-with tab2:
-    st.header("Step 2: Analyze Personal Statement")
-    if not st.session_state.model_trained:
-        st.warning("Please train the model first in Step 1")
-    else:
-        text_input = st.text_area("Paste your personal statement:", height=300)
-        if text_input and st.button("Analyze"):
             with st.spinner("Analyzing..."):
-                results = analyze_text(
-                    text_input,
-                    embedder,
-                    st.session_state.scaler,
-                    st.session_state.clf
-                )
             st.success("Analysis Complete!")
             # Summary
-            st.subheader("Summary")
             categories_found = list(set([r['category'] for r in results if r['category'] != 'Unknown']))
-            st.metric("Categories Found", f"{len(categories_found)}/4")
-            # Details
-            st.subheader("Segment Analysis")
             for result in results:
                 with st.expander(f"Segment {result['segment']}: {result['category']}"):
-                    st.write(f"**Confidence:** {result['confidence']:.1%}")
-                    st.write(f"**Text:** {result['text']}")
             # Recommendations
-            st.subheader("Recommendations")
             missing = [cat for cat in CATEGORIES.keys() if cat not in categories_found]
             if missing:
-                st.warning("Missing categories:")
                 for cat in missing:
-                    st.write(f"• Add {cat}: {CATEGORIES[cat]['description']}")
-with tab3:
-    st.header("Scoring Rubrics")
-    for category, info in CATEGORIES.items():
-        with st.expander(category):
-            st.write(f"**Description:** {info['description']}")
-            st.write("**Scoring:**")
-            for score in [4, 3, 2, 1]:
-                st.write(f"• Score {score}: {info['rubric'][score]}")
-            st.write(f"**Keywords:** {', '.join(info['keywords'][:5])}...")

 import numpy as np
 import pickle
 import os
 import re
+from io import BytesIO
+# Page config MUST be first
 st.set_page_config(
     page_title="Medical School Personal Statement Analyzer",
     page_icon="🏥",
     layout="wide"
 )
+# Import ML libraries after streamlit
+try:
+    from sentence_transformers import SentenceTransformer
+    from sklearn.preprocessing import StandardScaler
+    from sklearn.ensemble import RandomForestClassifier
+    from sklearn.metrics.pairwise import cosine_similarity
+    import xgboost as xgb
+    ML_AVAILABLE = True
+except ImportError as e:
+    ML_AVAILABLE = False
+    st.error(f"ML libraries not loaded: {e}")
 # Categories definition
 CATEGORIES = {
     'Spark': {
         'description': 'Opening that spurs interest in medicine',
         'keywords': ['growing up', 'childhood', 'family', 'realized', 'inspired', 'first',
                     'beginning', 'early', 'experience that', 'moment', 'when I was'],
         'rubric': {
             1: 'disconnected or confusing',
+            2: 'somewhat connected but unclear',
             3: 'connected and clear',
             4: 'engaging and logical flow'
         }
     },
     'Healthcare Experience': {
         'description': 'Clinical/medical experiences',
         'keywords': ['shadowed', 'clinical', 'hospital', 'patient', 'doctor', 'physician',
                     'medical', 'treatment', 'observed', 'volunteer', 'clinic'],
         'rubric': {
             1: 'passive, uninteresting, negative',
             2: 'bland but not problematic',
             3: 'interesting and relevant',
             4: 'vivid, active, thoughtful, memorable'
         }
     },
     'Showing Doctor Qualities': {
         'description': 'Leadership and doctor qualities',
         'keywords': ['leadership', 'empathy', 'compassion', 'responsibility', 'communication',
                     'advocate', 'caring', 'helping', 'service', 'volunteer'],
         'rubric': {
             1: 'arrogant, immature, inaccurate',
             2: 'bland but not problematic',
             3: 'shows some understanding',
             4: 'realistic, mature, humble, clear'
         }
     },
     'Spin': {
         'description': 'Connecting experiences to medical career',
         'keywords': ['learned', 'taught me', 'showed me', 'realized', 'understood',
                     'because', 'therefore', 'this experience', 'prepared me'],
         'rubric': {
             1: 'vague, simplistic, generic',
             2: 'some connection but generic',
             3: 'clear connection',
             4: 'direct, logical, specific argument'
         }
     }
 }
def load_model():
    """Return the sentence-transformer embedder, or ``None`` if unavailable.

    Gives up immediately when the optional ML imports failed
    (``ML_AVAILABLE`` is false). Otherwise the ``all-MiniLM-L6-v2`` model is
    constructed while a Streamlit spinner is shown; any exception raised
    during construction is reported with ``st.error`` and ``None`` is
    returned instead.
    """
    if ML_AVAILABLE:
        try:
            with st.spinner("Loading AI model..."):
                embedder = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception as exc:
            # Surface the failure in the UI rather than crashing the page.
            st.error(f"Failed to load model: {exc}")
        else:
            return embedder
    return None
def analyze_text_simple(text, categories=None):
    """Classify each paragraph of *text* by counting category keywords.

    The text is split into paragraphs on blank lines (CRLF line endings and
    whitespace-only separator lines are accepted). Paragraphs of 50
    characters or fewer are ignored; if none is longer than that, the whole
    stripped text is analysed as a single segment.

    Args:
        text: The statement to analyse.
        categories: Optional mapping of category name to a dict holding a
            ``'keywords'`` list. Defaults to the module-level ``CATEGORIES``.

    Returns:
        A list with one dict per segment, holding ``'segment'`` (1-based
        position), ``'category'`` (the category with the most keyword hits,
        the earliest one on ties, or ``'Unknown'`` when nothing matches),
        ``'keyword_matches'`` (hit count of that category) and ``'text'``
        (the segment, truncated to 200 characters plus ``'...'``).
        Empty or whitespace-only input yields an empty list.
    """
    if categories is None:
        categories = CATEGORIES
    if not text or not text.strip():
        # Nothing to analyse; avoid reporting a bogus "Unknown" segment.
        return []

    # A paragraph break is a newline, optional whitespace (which also covers
    # the "\r" of CRLF input), and another newline.
    chunks = (chunk.strip() for chunk in re.split(r'\n\s*\n', text))
    paragraphs = [chunk for chunk in chunks if len(chunk) > 50]
    if not paragraphs:
        paragraphs = [text.strip()]

    results = []
    for number, para in enumerate(paragraphs, start=1):
        lowered = para.lower()
        best_category, best_score = 'Unknown', 0
        for name, info in categories.items():
            hits = sum(1 for kw in info['keywords'] if kw.lower() in lowered)
            # Strict ">" keeps the first category on ties.
            if hits > best_score:
                best_category, best_score = name, hits
        preview = para[:200] + '...' if len(para) > 200 else para
        results.append({
            'segment': number,
            'category': best_category,
            'keyword_matches': best_score,
            'text': preview,
        })
    return results
 def extract_features(text, embedder):
+    """Extract features for ML analysis"""
     features = []
     text_lower = text.lower()
     words = text.split()
         len(set(words)) / max(len(words), 1)
     ])
+    # Category keyword features
     for cat_name, cat_info in CATEGORIES.items():
         keyword_count = sum(1 for kw in cat_info['keywords'] if kw.lower() in text_lower)
         features.append(keyword_count / len(cat_info['keywords']))
     # Get embedding
+    if embedder:
+        try:
+            embedding = embedder.encode(text)
+            if hasattr(embedding, 'cpu'):
+                embedding = embedding.cpu().numpy()
+            embedding = embedding.flatten()[:128]
+        except:
+            embedding = np.zeros(128)
+    else:
         embedding = np.zeros(128)
     return np.concatenate([features, embedding])
+def train_model(df, embedder):
+    """Train a simple classifier"""
+    if not ML_AVAILABLE:
+        return None, None
     X = []
+    y = []
     for _, row in df.iterrows():
         if 'text' in row:
             features = extract_features(text, embedder)
             X.append(features)
+            # Find category label
             label = 'Unknown'
             for cat in CATEGORIES.keys():
+                col_name = f"Code: {cat} Applied"
+                if col_name in row:
+                    if row[col_name] in [True, 1, '1', 'true', 'True', 'yes', 'Yes']:
                         label = cat
                         break
+            y.append(label)
+    if not X:
+        return None, None
     X = np.array(X)
+    # Scale features
     scaler = StandardScaler()
     X_scaled = scaler.fit_transform(X)
+    # Train classifier
     clf = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
+    clf.fit(X_scaled, y)
     return scaler, clf
+def analyze_with_model(text, embedder, scaler, clf):
+    """Analyze text using trained model"""
+    if not ML_AVAILABLE or not all([embedder, scaler, clf]):
+        return analyze_text_simple(text)
     paragraphs = text.split('\n\n')
     paragraphs = [p.strip() for p in paragraphs if len(p.strip()) > 50]
     return results
+# Main App
def _read_training_file(uploaded_file):
    """Parse an uploaded training file into a DataFrame.

    Files whose name ends in ``.csv`` (compared case-insensitively) are read
    with ``pd.read_csv``; anything else is handed to ``pd.read_excel``.
    """
    if uploaded_file.name.lower().endswith('.csv'):
        return pd.read_csv(uploaded_file)
    return pd.read_excel(uploaded_file)


def _collect_training_rows(df):
    """Return one training record per row of *df* that has usable text.

    The text comes from the first of the columns ``'Excerpt Copy'``,
    ``'Excerpt'``, ``'Text'``, ``'Content'`` that exists and is not NaN for
    the row; every column whose name contains ``'Code:'`` is copied along.
    """
    text_candidates = [
        col for col in ('Excerpt Copy', 'Excerpt', 'Text', 'Content')
        if col in df.columns
    ]
    # Column labels may be non-strings (e.g. ints); those cannot be codes.
    code_columns = [
        col for col in df.columns if isinstance(col, str) and 'Code:' in col
    ]

    records = []
    for _, row in df.iterrows():
        text_col = next(
            (col for col in text_candidates if pd.notna(row[col])), None
        )
        if text_col is None:
            continue
        record = {'text': str(row[text_col])}
        for col in code_columns:
            record[col] = row[col]
        records.append(record)
    return records


def _train_and_store(train_df):
    """Fit the classifier on *train_df* and keep the result in session state."""
    # Load embedder if needed
    if st.session_state['embedder'] is None:
        st.session_state['embedder'] = load_model()
    embedder = st.session_state['embedder']
    if embedder is None:
        st.error("Could not load the AI model.")
        return

    with st.spinner("Training model..."):
        scaler, clf = train_model(train_df, embedder)

    # Compare against None explicitly: estimator objects may define __len__,
    # so their truthiness is not a reliable "training succeeded" signal.
    if scaler is None or clf is None:
        st.error("Training failed. Check your data format.")
        return
    st.session_state['scaler'] = scaler
    st.session_state['clf'] = clf
    st.session_state['model_trained'] = True
    st.success("✅ Model trained successfully!")


def _render_train_tab():
    """Render the training tab: upload data, preview it, and fit the model."""
    st.header("Train the AI Model")
    if not ML_AVAILABLE:
        st.warning("ML libraries not available. Using keyword-based analysis only.")
        return

    st.info("Upload an Excel file with coded personal statement excerpts to train the model.")
    uploaded_file = st.file_uploader("Upload Training Data", type=['xlsx', 'csv'])
    if not uploaded_file:
        return

    # The whole upload/training flow shares one handler, so any failure is
    # reported in the UI instead of aborting the script run.
    try:
        df = _read_training_file(uploaded_file)
        st.success(f"Loaded {len(df)} rows")

        # Show sample of data
        st.write("Sample of data:")
        st.dataframe(df.head())

        records = _collect_training_rows(df)
        if not records:
            st.error("No valid text data found in the file.")
            return

        train_df = pd.DataFrame(records)
        st.write(f"Found {len(train_df)} valid training samples")
        if st.button("Train Model", type="primary"):
            _train_and_store(train_df)
    except Exception as e:
        st.error(f"Error reading file: {e}")


def _render_analyze_tab():
    """Render the analysis tab: take a statement and report category coverage."""
    st.header("Analyze Personal Statement")
    use_model = st.session_state['model_trained']
    analysis_method = "ML" if use_model else "Keyword"
    st.info(f"Using {analysis_method}-based analysis")

    text_input = st.text_area(
        "Paste your personal statement here:",
        height=300,
        placeholder="Enter your personal statement text..."
    )
    # Short-circuit on purpose: the button is only drawn once text exists.
    if not (text_input and st.button("Analyze", type="primary")):
        return

    with st.spinner("Analyzing..."):
        if use_model:
            results = analyze_with_model(
                text_input,
                st.session_state['embedder'],
                st.session_state['scaler'],
                st.session_state['clf']
            )
        else:
            results = analyze_text_simple(text_input)
    st.success("Analysis Complete!")

    # Summary
    st.subheader("📊 Summary")
    categories_found = {
        r['category'] for r in results if r['category'] != 'Unknown'
    }
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Categories Found", f"{len(categories_found)}/{len(CATEGORIES)}")
    with col2:
        st.metric("Segments Analyzed", len(results))
    with col3:
        quality = "Good" if len(categories_found) >= 3 else "Needs Work"
        st.metric("Overall", quality)

    # Category presence
    st.subheader("📋 Category Coverage")
    for cat in CATEGORIES:
        if cat in categories_found:
            st.write(f"✅ **{cat}**: Found")
        else:
            st.write(f"❌ **{cat}**: Not detected")

    # Segment details
    st.subheader("📝 Segment Analysis")
    for result in results:
        with st.expander(f"Segment {result['segment']}: {result['category']}"):
            # Model results carry a confidence; keyword results a hit count.
            if 'confidence' in result:
                st.write(f"**Confidence:** {result['confidence']:.1%}")
            elif 'keyword_matches' in result:
                st.write(f"**Keyword Matches:** {result['keyword_matches']}")
            st.write(f"**Text Preview:** {result['text']}")

    # Recommendations
    st.subheader("💡 Recommendations")
    missing = [cat for cat in CATEGORIES if cat not in categories_found]
    if missing:
        st.warning("**Missing Categories - Add content for:**")
        for cat in missing:
            st.write(f"• **{cat}**: {CATEGORIES[cat]['description']}")
            st.write(f"  Keywords: {', '.join(CATEGORIES[cat]['keywords'][:5])}...")
    else:
        st.success("Great! All categories are represented in your statement.")


def _render_rubrics_tab():
    """Render the rubric tab: one expander per category with its 1-4 criteria."""
    st.header("Scoring Rubrics")
    st.info("Understanding how each category is evaluated")
    quality_labels = ['Poor', 'Below Average', 'Good', 'Excellent']
    for category, info in CATEGORIES.items():
        with st.expander(f"**{category}** - {info['description']}"):
            st.write("**Scoring Criteria:**")
            for score in [4, 3, 2, 1]:
                quality = quality_labels[score - 1]
                st.write(f"• **Score {score} ({quality}):** {info['rubric'][score]}")
            st.write(f"\n**Key Terms:** {', '.join(info['keywords'])}")


def main():
    """Build the Streamlit page: title, session defaults, and the three tabs."""
    st.title("🏥 Medical School Personal Statement Analyzer")
    st.markdown("Analyze personal statements based on medical school admission rubrics")

    # Initialize session state (only keys that are not set yet)
    session_defaults = {
        'model_trained': False,
        'embedder': None,
        'scaler': None,
        'clf': None,
    }
    for key, value in session_defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value

    # Tabs
    tab1, tab2, tab3 = st.tabs(["📚 Train Model", "📝 Analyze Statement", "📊 View Rubrics"])
    with tab1:
        _render_train_tab()
    with tab2:
        _render_analyze_tab()
    with tab3:
        _render_rubrics_tab()
+# Run the app
# The app is started both when this file runs as a script and when it is
# imported, so main() is invoked unconditionally.
main()