stevafernandes committed on
Commit
e96c629
·
verified ·
1 Parent(s): 25a20f2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +171 -178
app.py CHANGED
@@ -123,24 +123,16 @@ CLASSIFIER_PATH = os.path.join(MODEL_DIR, "classifier.pkl")
123
  SCORER_PATH = os.path.join(MODEL_DIR, "scorer.pkl")
124
  SCALER_PATH = os.path.join(MODEL_DIR, "scaler.pkl")
125
  THRESHOLD_PATH = os.path.join(MODEL_DIR, "thresholds.pkl")
126
- ENSEMBLE_PATH = os.path.join(MODEL_DIR, "ensemble.pkl")
127
 
128
  @st.cache_resource
129
  def load_sentence_transformer():
130
  """Load sentence transformer model"""
131
- models_to_try = [
132
- 'all-MiniLM-L6-v2', # Lightweight and reliable
133
- 'all-mpnet-base-v2' # Good alternative
134
- ]
135
-
136
- for model_name in models_to_try:
137
- try:
138
- model = SentenceTransformer(model_name)
139
- return model, model_name
140
- except:
141
- continue
142
-
143
- return SentenceTransformer('all-MiniLM-L6-v2'), 'all-MiniLM-L6-v2'
144
 
145
  def segment_text(text, embedder):
146
  """Segment text into meaningful chunks"""
@@ -154,7 +146,6 @@ def segment_text(text, embedder):
154
  if len(sentences) < 3:
155
  return [text]
156
 
157
- # Group sentences into segments
158
  segments = []
159
  current_segment = []
160
  for sent in sentences:
@@ -212,16 +203,22 @@ def extract_features(text, embedder, category_focus=None):
212
 
213
  # Get embeddings
214
  try:
215
- embedding = embedder.encode(text, convert_to_tensor=False, normalize_embeddings=True)
 
 
 
216
  except:
217
- embedding = embedder.encode(text)
218
 
219
  # Category similarity
220
  if category_focus and category_focus in CATEGORIES:
221
  category_text = f"{CATEGORIES[category_focus]['description']} {' '.join(CATEGORIES[category_focus]['keywords'][:10])}"
222
  try:
223
- category_embedding = embedder.encode(category_text, normalize_embeddings=True)
224
- similarity = cosine_similarity([embedding], [category_embedding])[0][0]
 
 
 
225
  features.append(similarity * 10)
226
  except:
227
  features.append(0)
@@ -229,7 +226,7 @@ def extract_features(text, embedder, category_focus=None):
229
  features.append(0)
230
 
231
  features = np.array(features, dtype=np.float32)
232
- combined_features = np.concatenate([features, embedding[:256]]) # Limit embedding size
233
 
234
  return combined_features
235
 
@@ -496,179 +493,175 @@ def analyze_statement(text, embedder, scaler, classifiers, scorers, thresholds):
496
 
497
  return segment_results, category_results
498
 
499
- # Main application
500
- def main():
501
- st.title("πŸ₯ Medical School Personal Statement Analyzer")
502
- st.markdown("*AI-powered analysis based on medical school admission rubrics*")
503
- st.markdown("---")
504
-
505
- # Sidebar
506
- with st.sidebar:
507
- st.header("ℹ️ About")
508
- st.markdown("""
509
- This tool analyzes personal statements based on 4 key categories:
510
- - **Spark**: Opening that shows interest in medicine
511
- - **Healthcare Experience**: Clinical/medical experiences
512
- - **Doctor Qualities**: Leadership and character traits
513
- - **Spin**: Connecting experiences to medical career
514
-
515
- Each category is scored 1-4 (Poor to Excellent)
516
- """)
517
-
518
- # Create tabs
519
- tab1, tab2, tab3 = st.tabs(["πŸ“š Train Model", "πŸ“ Analyze Statement", "πŸ“Š View Rubrics"])
520
-
521
- # Train Model Tab
522
- with tab1:
523
- st.header("Train the AI Model")
524
-
525
- if all(os.path.exists(p) for p in [CLASSIFIER_PATH, SCORER_PATH, SCALER_PATH]):
526
- st.success("βœ“ Models already trained. You can analyze statements or retrain.")
527
-
528
- st.markdown("Upload training data files (Excel format with coded excerpts)")
529
-
530
- col1, col2 = st.columns(2)
531
- with col1:
532
- file1 = st.file_uploader("Training File 1", type=['xlsx'], key="file1")
533
- with col2:
534
- file2 = st.file_uploader("Training File 2", type=['xlsx'], key="file2")
535
-
536
- if file1 and file2:
537
- if st.button("Start Training", type="primary"):
538
- try:
539
- # Load data
540
- with st.spinner("Loading training data..."):
541
- df = load_training_data(file1, file2)
542
-
543
- if df.empty:
544
- st.error("No valid training data found.")
545
- return
546
-
547
  st.success(f"βœ“ Loaded {len(df)} training samples")
548
 
549
  # Load embedder
550
  with st.spinner("Loading transformer model..."):
551
  embedder, embedder_name = load_sentence_transformer()
552
 
553
- # Train
554
- scaler, classifiers, scorers, thresholds = train_models(df, embedder)
 
 
 
 
 
 
 
555
 
556
- # Save
557
- save_models(embedder_name, scaler, classifiers, scorers, thresholds)
558
- st.success("βœ“ Training complete! Models saved.")
559
-
560
- except Exception as e:
561
- st.error(f"Training failed: {str(e)}")
562
 
563
- # Analyze Statement Tab
564
- with tab2:
565
- st.header("Analyze Personal Statement")
566
-
567
- if not all(os.path.exists(p) for p in [CLASSIFIER_PATH, SCORER_PATH, SCALER_PATH]):
568
- st.warning("⚠️ Please train the model first (Tab 1)")
569
- return
570
-
571
  # Load models
572
  embedder, scaler, classifiers, scorers, thresholds = load_saved_models()
573
 
574
  if embedder is None:
575
  st.error("Failed to load models. Please retrain.")
576
- return
577
-
578
- # Input method
579
- input_method = st.radio("Choose input method:", ["Paste Text", "Upload File"])
580
-
581
- text_to_analyze = None
582
-
583
- if input_method == "Paste Text":
584
- text_to_analyze = st.text_area(
585
- "Paste your personal statement here:",
586
- height=300,
587
- placeholder="Enter your personal statement..."
588
- )
589
  else:
590
- uploaded_file = st.file_uploader("Upload statement (.txt)", type=['txt'])
591
- if uploaded_file:
592
- text_to_analyze = str(uploaded_file.read(), 'utf-8')
593
- st.success("File uploaded successfully!")
594
-
595
- if text_to_analyze and st.button("Analyze Statement", type="primary"):
596
- with st.spinner("Analyzing..."):
597
- segment_results, category_results = analyze_statement(
598
- text_to_analyze, embedder, scaler, classifiers, scorers, thresholds
599
- )
600
-
601
- # Display results
602
- st.success("βœ“ Analysis complete!")
603
-
604
- # Summary
605
- st.subheader("πŸ“Š Overall Summary")
606
- cols = st.columns(4)
607
 
608
- detected = [cat for cat, res in category_results.items() if res['detected']]
609
 
610
- with cols[0]:
611
- st.metric("Categories Found", f"{len(detected)}/4")
612
- with cols[1]:
613
- if detected:
614
- avg_score = np.mean([category_results[cat]['score'] for cat in detected])
615
- st.metric("Average Score", f"{avg_score:.1f}/4")
616
- else:
617
- st.metric("Average Score", "N/A")
618
- with cols[2]:
619
- st.metric("Total Segments", len(segment_results))
620
- with cols[3]:
621
- quality = "Excellent" if len(detected) == 4 and avg_score >= 3.5 else "Good" if len(detected) >= 3 else "Needs Work"
622
- st.metric("Overall", quality)
623
-
624
- # Category breakdown
625
- st.subheader("πŸ“‹ Category Analysis")
626
- for cat in CATEGORIES.keys():
627
- res = category_results[cat]
628
- if res['detected']:
629
- icon = "βœ…" if res['score'] >= 3 else "⚠️" if res['score'] >= 2 else "❌"
630
- st.write(f"{icon} **{cat}**: Score {res['score']}/4 (Confidence: {res['confidence']:.1%})")
631
- else:
632
- st.write(f"❌ **{cat}**: Not detected")
633
-
634
- # Segment details
635
- st.subheader("πŸ“ Segment Details")
636
- for seg in segment_results:
637
- with st.expander(f"Segment {seg['segment_num']}: {seg['category']}"):
638
- st.write(f"**Score:** {seg['score']}/4" if seg['score'] else "N/A")
639
- st.write(f"**Confidence:** {seg['confidence']:.1%}")
640
- st.write(f"**Text:** {seg['text'][:300]}...")
641
-
642
- # Recommendations
643
- st.subheader("πŸ’‘ Recommendations")
644
- missing = [cat for cat, res in category_results.items() if not res['detected']]
645
- low_score = [cat for cat, res in category_results.items()
646
- if res['detected'] and res['score'] and res['score'] < 3]
647
-
648
- if missing:
649
- st.warning("**Missing Categories:**")
650
- for cat in missing:
651
- st.write(f"β€’ Add content for **{cat}**: {CATEGORIES[cat]['description']}")
652
-
653
- if low_score:
654
- st.info("**Areas to Improve:**")
655
- for cat in low_score:
656
- st.write(f"β€’ Strengthen **{cat}** (current score: {category_results[cat]['score']}/4)")
657
 
658
- if not missing and not low_score:
659
- st.success("Excellent work! All categories present with good scores.")
660
-
661
- # View Rubrics Tab
662
- with tab3:
663
- st.header("Scoring Rubrics")
664
-
665
- for category, info in CATEGORIES.items():
666
- with st.expander(f"**{category}**"):
667
- st.write(f"**Description:** {info['description']}")
668
- st.write("**Scoring Criteria:**")
669
- for score in [4, 3, 2, 1]:
670
- st.write(f"β€’ **Score {score}:** {info['rubric'][score]}")
671
- st.write(f"**Key Terms:** {', '.join(info['keywords'][:8])}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
672
 
673
- if __name__ == "__main__":
674
- main()
 
 
 
 
 
 
 
 
 
 
123
  SCORER_PATH = os.path.join(MODEL_DIR, "scorer.pkl")
124
  SCALER_PATH = os.path.join(MODEL_DIR, "scaler.pkl")
125
  THRESHOLD_PATH = os.path.join(MODEL_DIR, "thresholds.pkl")
 
126
 
127
  @st.cache_resource
128
  def load_sentence_transformer():
129
  """Load sentence transformer model"""
130
+ try:
131
+ model = SentenceTransformer('all-MiniLM-L6-v2')
132
+ return model, 'all-MiniLM-L6-v2'
133
+ except:
134
+ st.error("Failed to load sentence transformer model")
135
+ return None, None
 
 
 
 
 
 
 
136
 
137
  def segment_text(text, embedder):
138
  """Segment text into meaningful chunks"""
 
146
  if len(sentences) < 3:
147
  return [text]
148
 
 
149
  segments = []
150
  current_segment = []
151
  for sent in sentences:
 
203
 
204
  # Get embeddings
205
  try:
206
+ embedding = embedder.encode(text, convert_to_tensor=False)
207
+ if hasattr(embedding, 'cpu'):
208
+ embedding = embedding.cpu().numpy()
209
+ embedding = embedding.flatten()[:256] # Limit size
210
  except:
211
+ embedding = np.zeros(256)
212
 
213
  # Category similarity
214
  if category_focus and category_focus in CATEGORIES:
215
  category_text = f"{CATEGORIES[category_focus]['description']} {' '.join(CATEGORIES[category_focus]['keywords'][:10])}"
216
  try:
217
+ category_embedding = embedder.encode(category_text)
218
+ if hasattr(category_embedding, 'cpu'):
219
+ category_embedding = category_embedding.cpu().numpy()
220
+ category_embedding = category_embedding.flatten()
221
+ similarity = cosine_similarity([embedding], [category_embedding[:256]])[0][0]
222
  features.append(similarity * 10)
223
  except:
224
  features.append(0)
 
226
  features.append(0)
227
 
228
  features = np.array(features, dtype=np.float32)
229
+ combined_features = np.concatenate([features, embedding])
230
 
231
  return combined_features
232
 
 
493
 
494
  return segment_results, category_results
495
 
496
+ # Main UI Code
497
+ st.title("πŸ₯ Medical School Personal Statement Analyzer")
498
+ st.markdown("*AI-powered analysis based on medical school admission rubrics*")
499
+ st.markdown("---")
500
+
501
+ # Sidebar
502
+ with st.sidebar:
503
+ st.header("ℹ️ About")
504
+ st.markdown("""
505
+ This tool analyzes personal statements based on 4 key categories:
506
+ - **Spark**: Opening that shows interest in medicine
507
+ - **Healthcare Experience**: Clinical/medical experiences
508
+ - **Doctor Qualities**: Leadership and character traits
509
+ - **Spin**: Connecting experiences to medical career
510
+
511
+ Each category is scored 1-4 (Poor to Excellent)
512
+ """)
513
+
514
+ # Create tabs
515
+ tab1, tab2, tab3 = st.tabs(["πŸ“š Train Model", "πŸ“ Analyze Statement", "πŸ“Š View Rubrics"])
516
+
517
+ # Train Model Tab
518
+ with tab1:
519
+ st.header("Train the AI Model")
520
+
521
+ if all(os.path.exists(p) for p in [CLASSIFIER_PATH, SCORER_PATH, SCALER_PATH]):
522
+ st.success("βœ“ Models already trained. You can analyze statements or retrain.")
523
+
524
+ st.markdown("Upload training data files (Excel format with coded excerpts)")
525
+
526
+ col1, col2 = st.columns(2)
527
+ with col1:
528
+ file1 = st.file_uploader("Training File 1", type=['xlsx'], key="file1")
529
+ with col2:
530
+ file2 = st.file_uploader("Training File 2", type=['xlsx'], key="file2")
531
+
532
+ if file1 and file2:
533
+ if st.button("Start Training", type="primary"):
534
+ try:
535
+ # Load data
536
+ with st.spinner("Loading training data..."):
537
+ df = load_training_data(file1, file2)
538
+
539
+ if df.empty:
540
+ st.error("No valid training data found.")
541
+ else:
 
 
542
  st.success(f"βœ“ Loaded {len(df)} training samples")
543
 
544
  # Load embedder
545
  with st.spinner("Loading transformer model..."):
546
  embedder, embedder_name = load_sentence_transformer()
547
 
548
+ if embedder is not None:
549
+ # Train
550
+ scaler, classifiers, scorers, thresholds = train_models(df, embedder)
551
+
552
+ # Save
553
+ save_models(embedder_name, scaler, classifiers, scorers, thresholds)
554
+ st.success("βœ“ Training complete! Models saved.")
555
+ else:
556
+ st.error("Failed to load transformer model")
557
 
558
+ except Exception as e:
559
+ st.error(f"Training failed: {str(e)}")
560
+
561
+ # Analyze Statement Tab
562
+ with tab2:
563
+ st.header("Analyze Personal Statement")
564
 
565
+ if not all(os.path.exists(p) for p in [CLASSIFIER_PATH, SCORER_PATH, SCALER_PATH]):
566
+ st.warning("⚠️ Please train the model first (Tab 1)")
567
+ else:
 
 
 
 
 
568
  # Load models
569
  embedder, scaler, classifiers, scorers, thresholds = load_saved_models()
570
 
571
  if embedder is None:
572
  st.error("Failed to load models. Please retrain.")
 
 
 
 
 
 
 
 
 
 
 
 
 
573
  else:
574
+ # Input method
575
+ input_method = st.radio("Choose input method:", ["Paste Text", "Upload File"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
576
 
577
+ text_to_analyze = None
578
 
579
+ if input_method == "Paste Text":
580
+ text_to_analyze = st.text_area(
581
+ "Paste your personal statement here:",
582
+ height=300,
583
+ placeholder="Enter your personal statement..."
584
+ )
585
+ else:
586
+ uploaded_file = st.file_uploader("Upload statement (.txt)", type=['txt'])
587
+ if uploaded_file:
588
+ text_to_analyze = str(uploaded_file.read(), 'utf-8')
589
+ st.success("File uploaded successfully!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
590
 
591
+ if text_to_analyze and st.button("Analyze Statement", type="primary"):
592
+ with st.spinner("Analyzing..."):
593
+ segment_results, category_results = analyze_statement(
594
+ text_to_analyze, embedder, scaler, classifiers, scorers, thresholds
595
+ )
596
+
597
+ # Display results
598
+ st.success("βœ“ Analysis complete!")
599
+
600
+ # Summary
601
+ st.subheader("πŸ“Š Overall Summary")
602
+ cols = st.columns(4)
603
+
604
+ detected = [cat for cat, res in category_results.items() if res['detected']]
605
+
606
+ with cols[0]:
607
+ st.metric("Categories Found", f"{len(detected)}/4")
608
+ with cols[1]:
609
+ if detected:
610
+ avg_score = np.mean([category_results[cat]['score'] for cat in detected])
611
+ st.metric("Average Score", f"{avg_score:.1f}/4")
612
+ else:
613
+ st.metric("Average Score", "N/A")
614
+ with cols[2]:
615
+ st.metric("Total Segments", len(segment_results))
616
+ with cols[3]:
617
+ quality = "Excellent" if len(detected) == 4 and avg_score >= 3.5 else "Good" if len(detected) >= 3 else "Needs Work"
618
+ st.metric("Overall", quality)
619
+
620
+ # Category breakdown
621
+ st.subheader("πŸ“‹ Category Analysis")
622
+ for cat in CATEGORIES.keys():
623
+ res = category_results[cat]
624
+ if res['detected']:
625
+ icon = "βœ…" if res['score'] >= 3 else "⚠️" if res['score'] >= 2 else "❌"
626
+ st.write(f"{icon} **{cat}**: Score {res['score']}/4 (Confidence: {res['confidence']:.1%})")
627
+ else:
628
+ st.write(f"❌ **{cat}**: Not detected")
629
+
630
+ # Segment details
631
+ st.subheader("πŸ“ Segment Details")
632
+ for seg in segment_results:
633
+ with st.expander(f"Segment {seg['segment_num']}: {seg['category']}"):
634
+ st.write(f"**Score:** {seg['score']}/4" if seg['score'] else "N/A")
635
+ st.write(f"**Confidence:** {seg['confidence']:.1%}")
636
+ st.write(f"**Text:** {seg['text'][:300]}...")
637
+
638
+ # Recommendations
639
+ st.subheader("πŸ’‘ Recommendations")
640
+ missing = [cat for cat, res in category_results.items() if not res['detected']]
641
+ low_score = [cat for cat, res in category_results.items()
642
+ if res['detected'] and res['score'] and res['score'] < 3]
643
+
644
+ if missing:
645
+ st.warning("**Missing Categories:**")
646
+ for cat in missing:
647
+ st.write(f"β€’ Add content for **{cat}**: {CATEGORIES[cat]['description']}")
648
+
649
+ if low_score:
650
+ st.info("**Areas to Improve:**")
651
+ for cat in low_score:
652
+ st.write(f"β€’ Strengthen **{cat}** (current score: {category_results[cat]['score']}/4)")
653
+
654
+ if not missing and not low_score:
655
+ st.success("Excellent work! All categories present with good scores.")
656
 
657
+ # View Rubrics Tab
658
+ with tab3:
659
+ st.header("Scoring Rubrics")
660
+
661
+ for category, info in CATEGORIES.items():
662
+ with st.expander(f"**{category}**"):
663
+ st.write(f"**Description:** {info['description']}")
664
+ st.write("**Scoring Criteria:**")
665
+ for score in [4, 3, 2, 1]:
666
+ st.write(f"β€’ **Score {score}:** {info['rubric'][score]}")
667
+ st.write(f"**Key Terms:** {', '.join(info['keywords'][:8])}")