stevafernandes committed on
Commit
e8445d0
·
verified ·
1 Parent(s): 7b1f193

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +678 -343
app.py CHANGED
@@ -6,31 +6,39 @@ import os
6
  import re
7
  from datetime import datetime
8
  from io import BytesIO
9
- import base64
 
10
 
11
  # Page config MUST be first
12
  st.set_page_config(
13
  page_title="Medical School Personal Statement Analyzer",
14
  page_icon="πŸ₯",
15
- layout="wide"
 
16
  )
17
 
18
  # Import ML libraries
19
- from sentence_transformers import SentenceTransformer
 
20
  from sklearn.preprocessing import StandardScaler
21
- from sklearn.ensemble import RandomForestClassifier
22
  from sklearn.metrics.pairwise import cosine_similarity
 
23
  import xgboost as xgb
 
24
 
25
  # Import PDF generation libraries
26
- from reportlab.lib import colors
27
- from reportlab.lib.pagesizes import letter
28
- from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak
29
- from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
30
- from reportlab.lib.units import inch
31
- from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
 
 
 
 
32
 
33
- # Categories definition with rubrics
34
  CATEGORIES = {
35
  'Spark': {
36
  'description': 'Opening that spurs interest in medicine (typically in opening paragraph)',
@@ -47,6 +55,10 @@ CATEGORIES = {
47
  2: 'somewhat connected but unclear',
48
  3: 'connected and clear',
49
  4: 'engaging and logically flows into becoming a doctor'
 
 
 
 
50
  }
51
  },
52
  'Healthcare Experience': {
@@ -64,6 +76,10 @@ CATEGORIES = {
64
  2: 'bland/boring but not problematic',
65
  3: 'interesting and relevant',
66
  4: 'vivid, active, thoughtful, relevant, memorable, positive'
 
 
 
 
67
  }
68
  },
69
  'Showing Doctor Qualities': {
@@ -81,6 +97,10 @@ CATEGORIES = {
81
  2: 'bland/boring but not problematic',
82
  3: 'shows some understanding',
83
  4: 'realistic, self-aware, mature, humble, specific understanding'
 
 
 
 
84
  }
85
  },
86
  'Spin': {
@@ -97,123 +117,404 @@ CATEGORIES = {
97
  2: 'some connection but generic',
98
  3: 'clear connection',
99
  4: 'direct, logical, and specific argument'
 
 
 
 
100
  }
101
  }
102
  }
103
 
104
  @st.cache_resource
105
- def load_pretrained_model():
106
- """Load the pre-trained sentence transformer model"""
107
- model = SentenceTransformer('all-MiniLM-L6-v2')
108
- return model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
- def segment_text(text):
111
- """Segment text into meaningful paragraphs/chunks"""
112
- # Try to split by double newlines first
113
  paragraphs = re.split(r'\n\s*\n', text)
114
  paragraphs = [p.strip() for p in paragraphs if p.strip() and len(p.strip()) > 50]
115
 
116
- # If only one paragraph, try to split by sentences
117
  if len(paragraphs) <= 1:
118
  sentences = re.split(r'(?<=[.!?])\s+', text)
119
  sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
120
 
121
- # Group sentences into segments of ~300 words
 
 
 
 
 
122
  segments = []
123
- current_segment = []
124
- current_length = 0
125
 
126
- for sent in sentences:
127
- current_segment.append(sent)
128
- current_length += len(sent.split())
129
 
130
- if current_length > 100: # About 300-400 characters
131
  segments.append(' '.join(current_segment))
132
- current_segment = []
133
- current_length = 0
 
 
 
134
 
135
  if current_segment:
136
  segments.append(' '.join(current_segment))
137
 
138
- return segments if segments else [text]
139
 
140
  return paragraphs
141
 
142
- def analyze_segment(text, embedder):
143
- """Analyze a single segment to determine category and score"""
 
144
  text_lower = text.lower()
145
  words = text.split()
146
 
147
- # Calculate scores for each category
148
- category_scores = {}
 
 
 
 
 
 
149
 
 
150
  for cat_name, cat_info in CATEGORIES.items():
151
- # Keyword matching
152
- keyword_matches = sum(1 for kw in cat_info['keywords'] if kw.lower() in text_lower)
153
- keyword_score = keyword_matches / len(cat_info['keywords'])
 
 
 
 
 
154
 
155
- # Pattern matching
156
  pattern_matches = 0
157
- for pattern in cat_info['patterns']:
158
- if re.search(pattern, text_lower):
159
- pattern_matches += 1
160
- pattern_score = pattern_matches / len(cat_info['patterns']) if cat_info['patterns'] else 0
161
-
162
- # Semantic similarity using embeddings
163
- category_text = f"{cat_info['description']} {' '.join(cat_info['keywords'][:10])}"
164
- text_embedding = embedder.encode(text)
165
- category_embedding = embedder.encode(category_text)
166
-
167
- if hasattr(text_embedding, 'cpu'):
168
- text_embedding = text_embedding.cpu().numpy()
169
- category_embedding = category_embedding.cpu().numpy()
170
-
171
- similarity = cosine_similarity([text_embedding], [category_embedding])[0][0]
172
-
173
- # Combined score
174
- combined_score = (keyword_score * 0.3 + pattern_score * 0.2 + similarity * 0.5)
175
- category_scores[cat_name] = combined_score
176
-
177
- # Select best category
178
- best_category = max(category_scores, key=category_scores.get)
179
- confidence = category_scores[best_category]
180
-
181
- # Determine quality score (1-4) based on rubric
182
- if confidence > 0.7:
183
- score = 4
184
- elif confidence > 0.5:
185
- score = 3
186
- elif confidence > 0.3:
187
- score = 2
 
 
 
 
 
 
 
188
  else:
189
- score = 1
190
 
191
- # Adjust score based on text quality indicators
192
- positive_indicators = ['vivid', 'thoughtful', 'specific', 'logical', 'mature']
193
- negative_indicators = ['vague', 'generic', 'unclear', 'disconnected', 'simplistic']
194
 
195
- positive_count = sum(1 for ind in positive_indicators if ind in text_lower)
196
- negative_count = sum(1 for ind in negative_indicators if ind in text_lower)
 
 
 
197
 
198
- if positive_count > negative_count and score < 4:
199
- score = min(score + 1, 4)
200
- elif negative_count > positive_count and score > 1:
201
- score = max(score - 1, 1)
202
 
203
- return {
204
- 'category': best_category,
205
- 'score': score,
206
- 'confidence': confidence,
207
- 'text': text
208
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
- def analyze_full_statement(text, embedder):
211
  """Analyze complete personal statement"""
212
- segments = segment_text(text)
213
 
214
  segment_results = []
215
  for i, segment in enumerate(segments):
216
- result = analyze_segment(segment, embedder)
217
  result['segment_num'] = i + 1
218
  segment_results.append(result)
219
 
@@ -244,8 +545,11 @@ def analyze_full_statement(text, embedder):
244
 
245
  return segment_results, category_results
246
 
247
- def create_pdf_report(segment_results, category_results, statement_text):
248
- """Create a professional PDF report"""
 
 
 
249
  buffer = BytesIO()
250
  doc = SimpleDocTemplate(buffer, pagesize=letter, rightMargin=72, leftMargin=72,
251
  topMargin=72, bottomMargin=18)
@@ -307,92 +611,6 @@ def create_pdf_report(segment_results, category_results, statement_text):
307
  ]))
308
 
309
  elements.append(summary_table)
310
- elements.append(Spacer(1, 30))
311
-
312
- # Category Analysis
313
- elements.append(Paragraph("CATEGORY ANALYSIS", heading_style))
314
-
315
- category_data = [['Category', 'Status', 'Score', 'Confidence', 'Segments']]
316
- for cat in CATEGORIES.keys():
317
- if category_results[cat]['detected']:
318
- status = "βœ“ Detected"
319
- score = f"{category_results[cat]['score']}/4"
320
- confidence = f"{category_results[cat]['confidence']:.1%}"
321
- segments = str(category_results[cat]['num_segments'])
322
- else:
323
- status = "βœ— Not Found"
324
- score = "N/A"
325
- confidence = "N/A"
326
- segments = "0"
327
- category_data.append([cat, status, score, confidence, segments])
328
-
329
- category_table = Table(category_data, colWidths=[2*inch, 1.2*inch, 0.8*inch, 1*inch, 1*inch])
330
- category_table.setStyle(TableStyle([
331
- ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#1f4788')),
332
- ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
333
- ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
334
- ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
335
- ('FONTSIZE', (0, 0), (-1, 0), 11),
336
- ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
337
- ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
338
- ('GRID', (0, 0), (-1, -1), 1, colors.black)
339
- ]))
340
-
341
- elements.append(category_table)
342
- elements.append(PageBreak())
343
-
344
- # Detailed Recommendations
345
- elements.append(Paragraph("RECOMMENDATIONS", heading_style))
346
-
347
- missing_cats = [cat for cat, res in category_results.items() if not res['detected']]
348
- low_score_cats = [cat for cat, res in category_results.items()
349
- if res['detected'] and res['score'] and res['score'] < 3]
350
-
351
- if missing_cats:
352
- elements.append(Paragraph("<b>Missing Categories:</b>", styles['Heading3']))
353
- for cat in missing_cats:
354
- elements.append(Paragraph(f"β€’ Add content for {cat}: {CATEGORIES[cat]['description']}", styles['Normal']))
355
- elements.append(Paragraph(f" Include keywords: {', '.join(CATEGORIES[cat]['keywords'][:5])}...", styles['Normal']))
356
- elements.append(Spacer(1, 12))
357
-
358
- if low_score_cats:
359
- elements.append(Paragraph("<b>Areas for Improvement:</b>", styles['Heading3']))
360
- for cat in low_score_cats:
361
- score = category_results[cat]['score']
362
- elements.append(Paragraph(f"β€’ Improve {cat} (current score: {score}/4)", styles['Normal']))
363
- elements.append(Paragraph(f" Target: {CATEGORIES[cat]['rubric'][4]}", styles['Normal']))
364
- elements.append(Spacer(1, 12))
365
-
366
- if not missing_cats and not low_score_cats:
367
- elements.append(Paragraph("Excellent work! All categories are present with good scores.", styles['Normal']))
368
-
369
- # Segment Analysis Summary
370
- elements.append(PageBreak())
371
- elements.append(Paragraph("SEGMENT ANALYSIS", heading_style))
372
-
373
- for segment in segment_results[:10]: # Limit to first 10 segments
374
- elements.append(Paragraph(f"<b>Segment {segment['segment_num']}</b>", styles['Heading3']))
375
-
376
- detail_data = [
377
- ['Category', segment['category']],
378
- ['Score', f"{segment['score']}/4"],
379
- ['Confidence', f"{segment['confidence']:.1%}"]
380
- ]
381
-
382
- detail_table = Table(detail_data, colWidths=[1.5*inch, 4*inch])
383
- detail_table.setStyle(TableStyle([
384
- ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
385
- ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
386
- ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
387
- ('GRID', (0, 0), (-1, -1), 1, colors.black)
388
- ]))
389
-
390
- elements.append(detail_table)
391
- elements.append(Spacer(1, 6))
392
-
393
- text_preview = segment['text'][:200] + "..." if len(segment['text']) > 200 else segment['text']
394
- elements.append(Paragraph(f"<i>{text_preview}</i>", styles['Normal']))
395
- elements.append(Spacer(1, 12))
396
 
397
  # Build PDF
398
  doc.build(elements)
@@ -402,99 +620,187 @@ def create_pdf_report(segment_results, category_results, statement_text):
402
  # Main Application
403
  def main():
404
  st.title("πŸ₯ Medical School Personal Statement Analyzer")
405
- st.markdown("Faith Marie Kurtyka, Cole Krudwig, Sean Dore, Sara Avila, George (Guy) McHendry, Steven Fernandes")
406
  st.markdown("---")
407
 
408
- # Sidebar with information
409
- with st.sidebar:
410
- st.header("ℹ️ About This Tool")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  st.markdown("""
412
- This analyzer evaluates personal statements across 4 key categories:
 
 
 
 
 
413
 
414
- **πŸ“Œ Spark**
415
- Opening that shows your interest in medicine
 
 
416
 
417
- **πŸ₯ Healthcare Experience**
418
- Clinical and medical experiences
419
 
420
- **πŸ’ͺ Doctor Qualities**
421
- Leadership and character traits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
 
423
- **πŸ”— Spin**
424
- Connecting experiences to medical career
 
 
425
 
426
- Each category is scored 1-4:
427
- - 4 = Excellent
428
- - 3 = Good
429
- - 2 = Below Average
430
- - 1 = Poor
431
- """)
432
 
433
- st.markdown("---")
434
- st.markdown("### πŸ“Š Scoring Rubrics")
 
 
 
 
 
 
435
 
436
- for cat_name, cat_info in CATEGORIES.items():
437
- with st.expander(cat_name):
438
- for score in [4, 3, 2, 1]:
439
- st.write(f"**Score {score}:** {cat_info['rubric'][score]}")
440
-
441
- # Load model
442
- embedder = load_pretrained_model()
443
-
444
- # Main content area
445
- st.header("πŸ“ Upload Your Personal Statement")
446
-
447
- # Input method selection
448
- input_method = st.radio(
449
- "Choose input method:",
450
- ["Upload Text File (.txt)", "Paste Text Directly"],
451
- horizontal=True
452
- )
453
-
454
- statement_text = None
455
-
456
- if input_method == "Upload Text File (.txt)":
457
- uploaded_file = st.file_uploader(
458
- "Choose a text file",
459
- type=['txt'],
460
- help="Upload your personal statement as a .txt file"
461
  )
462
 
463
- if uploaded_file is not None:
464
- statement_text = str(uploaded_file.read(), 'utf-8')
465
- st.success(f"βœ… File uploaded successfully ({len(statement_text)} characters)")
466
-
467
- # Show preview
468
- with st.expander("Preview Statement"):
469
- st.text(statement_text[:500] + "..." if len(statement_text) > 500 else statement_text)
470
-
471
- else: # Paste Text Directly
472
- statement_text = st.text_area(
473
- "Paste your personal statement here:",
474
- height=400,
475
- placeholder="Enter your complete personal statement...",
476
- help="Paste your entire personal statement for analysis"
477
- )
478
 
479
- if statement_text:
480
- st.info(f"πŸ“Š Statement length: {len(statement_text)} characters, {len(statement_text.split())} words")
481
-
482
- # Analyze button
483
- if statement_text and len(statement_text) > 100:
484
- if st.button("πŸ”¬ Analyze Statement", type="primary", use_container_width=True):
485
-
486
- with st.spinner("Analyzing your personal statement..."):
487
- # Perform analysis
488
- segment_results, category_results = analyze_full_statement(statement_text, embedder)
489
-
490
- st.success("βœ… Analysis Complete!")
491
- st.balloons()
492
 
493
- # Display results in tabs
494
- tab1, tab2, tab3, tab4 = st.tabs(["πŸ“Š Summary", "πŸ“ Segments", "πŸ’‘ Recommendations", "πŸ“₯ Download Report"])
 
 
 
 
 
 
 
 
 
 
 
 
495
 
496
- with tab1:
497
- st.header("Overall Summary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
 
499
  # Metrics
500
  col1, col2, col3, col4 = st.columns(4)
@@ -502,11 +808,7 @@ def main():
502
  detected_cats = [cat for cat, res in category_results.items() if res['detected']]
503
 
504
  with col1:
505
- st.metric(
506
- "Categories Found",
507
- f"{len(detected_cats)}/4",
508
- delta=f"{len(detected_cats)-4}" if len(detected_cats) < 4 else "Complete"
509
- )
510
 
511
  with col2:
512
  if detected_cats:
@@ -521,126 +823,159 @@ def main():
521
  with col4:
522
  if detected_cats:
523
  avg_score = np.mean([category_results[cat]['score'] for cat in detected_cats])
524
- if avg_score >= 3.5:
525
- quality = "Excellent"
526
- color = "🟒"
527
- elif avg_score >= 2.5:
528
- quality = "Good"
529
- color = "🟑"
530
- else:
531
- quality = "Needs Work"
532
- color = "πŸ”΄"
533
- st.metric("Overall Quality", f"{color} {quality}")
534
  else:
535
  st.metric("Overall Quality", "N/A")
536
 
537
- # Category breakdown
538
- st.subheader("Category Analysis")
 
539
 
540
  for cat in CATEGORIES.keys():
541
  res = category_results[cat]
542
- col1, col2, col3, col4 = st.columns([3, 1, 1, 1])
543
-
544
- with col1:
545
- if res['detected']:
546
- st.write(f"βœ… **{cat}**")
547
- else:
548
- st.write(f"❌ **{cat}** *(Not detected)*")
549
-
550
- with col2:
551
- if res['detected']:
552
- st.write(f"Score: {res['score']}/4")
553
- else:
554
- st.write("Score: -")
555
-
556
- with col3:
557
- if res['detected']:
558
- st.write(f"Confidence: {res['confidence']:.1%}")
559
- else:
560
- st.write("Confidence: -")
561
-
562
- with col4:
563
- if res['detected']:
564
- st.write(f"Segments: {res['num_segments']}")
565
- else:
566
- st.write("Segments: 0")
567
-
568
  if res['detected']:
569
- # Progress bar for score
 
570
  st.progress(res['score'] / 4)
571
-
572
- with tab2:
573
- st.header("Segment-by-Segment Analysis")
 
 
 
 
574
 
575
  for segment in segment_results:
 
 
 
576
  with st.expander(f"Segment {segment['segment_num']}: {segment['category']} (Score: {segment['score']}/4)"):
577
  col1, col2 = st.columns([1, 3])
578
 
579
  with col1:
580
  st.metric("Category", segment['category'])
581
- st.metric("Score", f"{segment['score']}/4")
582
  st.metric("Confidence", f"{segment['confidence']:.1%}")
583
 
584
  with col2:
585
  st.write("**Text:**")
586
- st.write(segment['text'])
587
 
588
- # Show rubric for this score
589
- st.write("**Rubric for this score:**")
590
- st.info(CATEGORIES[segment['category']]['rubric'][segment['score']])
591
-
592
- with tab3:
593
- st.header("Recommendations for Improvement")
 
594
 
595
  missing_cats = [cat for cat, res in category_results.items() if not res['detected']]
596
  low_score_cats = [cat for cat, res in category_results.items()
597
  if res['detected'] and res['score'] and res['score'] < 3]
598
 
599
  if missing_cats:
600
- st.error("🚨 **Missing Categories - Must Add:**")
601
  for cat in missing_cats:
602
- st.write(f"### {cat}")
603
- st.write(f"**Description:** {CATEGORIES[cat]['description']}")
604
- st.write(f"**Keywords to include:** {', '.join(CATEGORIES[cat]['keywords'][:8])}")
605
- st.write(f"**Target Quality:** {CATEGORIES[cat]['rubric'][4]}")
606
- st.write("---")
607
 
608
  if low_score_cats:
609
- st.warning("⚠️ **Low-Scoring Categories - Should Improve:**")
610
  for cat in low_score_cats:
611
- current_score = category_results[cat]['score']
612
- st.write(f"### {cat}")
613
- st.write(f"**Current Score:** {current_score}/4")
614
- st.write(f"**Current Level:** {CATEGORIES[cat]['rubric'][current_score]}")
615
- st.write(f"**Target Level:** {CATEGORIES[cat]['rubric'][4]}")
616
- st.write(f"**Improvement Tips:** Add more {', '.join(CATEGORIES[cat]['keywords'][:5])}")
617
- st.write("---")
618
 
619
  if not missing_cats and not low_score_cats:
620
- st.success("πŸŽ‰ Excellent work! All categories are present with good scores.")
621
- st.write("Your personal statement effectively covers all required elements.")
622
-
623
- with tab4:
624
- st.header("Download Analysis Report")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
625
 
626
- # Generate PDF
627
- pdf_buffer = create_pdf_report(segment_results, category_results, statement_text)
 
 
 
 
 
 
 
 
 
 
628
 
629
- # Download button
630
- st.download_button(
631
- label="πŸ“₯ Download PDF Report",
632
- data=pdf_buffer,
633
- file_name=f"personal_statement_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf",
634
- mime="application/pdf",
635
- use_container_width=True
636
- )
637
 
638
- st.info("The PDF report includes detailed analysis, scores, and recommendations for your personal statement.")
639
-
640
- elif statement_text and len(statement_text) <= 100:
641
- st.warning("⚠️ Please enter a longer statement (minimum 100 characters) for meaningful analysis.")
642
- else:
643
- st.info("πŸ‘† Please upload or paste your personal statement to begin analysis.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
644
 
645
  # Run the application
646
  if __name__ == "__main__":
 
6
  import re
7
  from datetime import datetime
8
  from io import BytesIO
9
+ import warnings
10
+ warnings.filterwarnings('ignore')
11
 
12
  # Page config MUST be first
13
  st.set_page_config(
14
  page_title="Medical School Personal Statement Analyzer",
15
  page_icon="πŸ₯",
16
+ layout="wide",
17
+ initial_sidebar_state="expanded"
18
  )
19
 
20
  # Import ML libraries
21
+ from sentence_transformers import SentenceTransformer, util
22
+ from sklearn.model_selection import train_test_split
23
  from sklearn.preprocessing import StandardScaler
 
24
  from sklearn.metrics.pairwise import cosine_similarity
25
+ from sklearn.ensemble import RandomForestClassifier
26
  import xgboost as xgb
27
+ import torch
28
 
29
  # Import PDF generation libraries
30
+ try:
31
+ from reportlab.lib import colors
32
+ from reportlab.lib.pagesizes import letter
33
+ from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak
34
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
35
+ from reportlab.lib.units import inch
36
+ from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
37
+ PDF_AVAILABLE = True
38
+ except ImportError:
39
+ PDF_AVAILABLE = False
40
 
41
+ # Categories with detailed rubric alignment
42
  CATEGORIES = {
43
  'Spark': {
44
  'description': 'Opening that spurs interest in medicine (typically in opening paragraph)',
 
55
  2: 'somewhat connected but unclear',
56
  3: 'connected and clear',
57
  4: 'engaging and logically flows into becoming a doctor'
58
+ },
59
+ 'rubric_features': {
60
+ 'positive': ['engaging', 'logical', 'clear connection', 'compelling', 'authentic'],
61
+ 'negative': ['disconnected', 'confusing', 'random', 'unclear', 'generic']
62
  }
63
  },
64
  'Healthcare Experience': {
 
76
  2: 'bland/boring but not problematic',
77
  3: 'interesting and relevant',
78
  4: 'vivid, active, thoughtful, relevant, memorable, positive'
79
+ },
80
+ 'rubric_features': {
81
+ 'positive': ['vivid', 'active', 'thoughtful', 'memorable', 'optimistic', 'engaged'],
82
+ 'negative': ['passive', 'uninteresting', 'irrelevant', 'problematic', 'pessimistic']
83
  }
84
  },
85
  'Showing Doctor Qualities': {
 
97
  2: 'bland/boring but not problematic',
98
  3: 'shows some understanding',
99
  4: 'realistic, self-aware, mature, humble, specific understanding'
100
+ },
101
+ 'rubric_features': {
102
+ 'positive': ['realistic', 'self-aware', 'mature', 'humble', 'specific', 'clear'],
103
+ 'negative': ['arrogant', 'immature', 'overly confident', 'simplistic', 'inaccurate']
104
  }
105
  },
106
  'Spin': {
 
117
  2: 'some connection but generic',
118
  3: 'clear connection',
119
  4: 'direct, logical, and specific argument'
120
+ },
121
+ 'rubric_features': {
122
+ 'positive': ['direct', 'logical', 'specific', 'clear argument', 'compelling'],
123
+ 'negative': ['brief', 'vague', 'simplistic', 'generic', 'weak']
124
  }
125
  }
126
  }
127
 
128
@st.cache_resource
def load_sentence_transformer():
    """Load a sentence-transformer embedding model, cached across reruns.

    Tries the higher-quality 'intfloat/e5-large-v2' model first and falls
    back to the lighter 'all-MiniLM-L6-v2' if that load fails (e.g. download
    or memory error).

    Returns:
        tuple: (model, model_name) on success, or (None, None) if neither
        model could be loaded; in that case the error is surfaced in the
        Streamlit UI via st.error.
    """
    try:
        # Try to load the preferred (larger, higher-quality) model.
        model = SentenceTransformer('intfloat/e5-large-v2')
        return model, 'intfloat/e5-large-v2'
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
        # propagate. Fall back to a lighter model if e5-large-v2 fails.
        try:
            model = SentenceTransformer('all-MiniLM-L6-v2')
            return model, 'all-MiniLM-L6-v2'
        except Exception as e:
            st.error(f"Failed to load transformer: {e}")
            return None, None
143
+
144
def load_training_data_from_files():
    """Load and combine labeled training data from the two Excel files.

    Reads the Dedoose export and the hand-coded statements workbook, merges
    them, and normalizes each row into a flat record with the excerpt text
    plus, per category, an `<cat>_applied` boolean and an `<cat>_score`
    integer (0 when not applied, otherwise clamped to 1-4).

    Returns:
        pandas.DataFrame: One row per usable excerpt, or None when the
        files are missing or any unexpected error occurs (the error is
        shown via st.error).
    """
    try:
        # File paths for the Excel files (expected next to the app).
        file1_path = "DedooseChartExcerpts_2025_8_5_1025.xlsx"
        file2_path = "Personal Statements Coded.xlsx"

        # Both files are required; silently signal "no data" otherwise.
        if not os.path.exists(file1_path) or not os.path.exists(file2_path):
            return None

        df1 = pd.read_excel(file1_path)
        df2 = pd.read_excel(file2_path)

        # Stack the two sources; ignore_index gives a clean RangeIndex.
        combined_df = pd.concat([df1, df2], ignore_index=True)

        processed_data = []

        for _, row in combined_df.iterrows():
            text = None
            # The two workbooks use different column names for the excerpt
            # text; take the first non-empty candidate.
            for col_name in ['Excerpt Copy', 'Excerpt', 'Text', 'Content']:
                if col_name in row and pd.notna(row[col_name]):
                    text = str(row[col_name])
                    break

            if not text or text.strip() == '':
                continue

            data_point = {
                'text': text.strip(),
                'media_title': row.get('Media Title', 'Unknown')
            }

            # Per-category label columns follow the Dedoose export naming.
            for category in CATEGORIES.keys():
                col_applied = f"Code: {category} Applied"
                col_weight = f"Code: {category} Weight"

                is_applied = False
                if col_applied in row:
                    # Accept several truthy spellings from the export.
                    applied_val = str(row[col_applied]).lower()
                    is_applied = applied_val in ['true', '1', 'yes', 't']

                data_point[f"{category}_applied"] = is_applied

                if is_applied and col_weight in row:
                    weight = row[col_weight]
                    if pd.isna(weight) or weight == '':
                        # Applied but unweighted: default to a middling 2.
                        weight = 2
                    else:
                        try:
                            weight = int(float(weight))
                            weight = max(1, min(4, weight))  # clamp to rubric 1-4
                        except (ValueError, TypeError):
                            # Narrowed from a bare `except:`; these are the
                            # only errors int(float(...)) raises on bad cells.
                            weight = 2
                else:
                    # Not applied -> score 0 (distinct from rubric 1-4).
                    weight = 0

                data_point[f"{category}_score"] = weight

            processed_data.append(data_point)

        return pd.DataFrame(processed_data)

    except Exception as e:
        st.error(f"Error loading training data: {str(e)}")
        return None
214
 
215
def segment_text(text, embedder):
    """Segment text using semantic similarity.

    Strategy: split on blank lines first; if that yields at most one usable
    paragraph, fall back to sentence-level splitting and merge consecutive
    sentences into segments while their embeddings remain similar
    (cosine >= 0.7) and the running segment stays under ~500 characters.

    Args:
        text: The full personal-statement text.
        embedder: Sentence-transformer model providing .encode(); its
            output is used with util.cos_sim, so tensors are expected.

    Returns:
        list[str]: Non-empty text segments (falls back to [text] when
        there are fewer than 3 sentences to work with).
    """
    # Primary split: paragraphs separated by blank lines, dropping
    # fragments of 50 characters or fewer.
    paragraphs = re.split(r'\n\s*\n', text)
    paragraphs = [p.strip() for p in paragraphs if p.strip() and len(p.strip()) > 50]

    if len(paragraphs) <= 1:
        # Fallback: split on sentence-terminating punctuation, keeping
        # only sentences longer than 20 characters.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 20]

        if len(sentences) < 3:
            # Too little material to segment meaningfully.
            return [text]

        # Use embeddings for semantic segmentation
        embeddings = embedder.encode(sentences, convert_to_tensor=True)

        segments = []
        current_segment = [sentences[0]]
        current_embedding = embeddings[0]

        for i in range(1, len(sentences)):
            similarity = util.cos_sim(current_embedding, embeddings[i]).item()

            # Break on a topic shift (low similarity) or when the segment
            # grows past ~500 characters.
            if similarity < 0.7 or len(' '.join(current_segment)) > 500:
                segments.append(' '.join(current_segment))
                current_segment = [sentences[i]]
                current_embedding = embeddings[i]
            else:
                current_segment.append(sentences[i])
                # Averaging pairwise each step weights recent sentences
                # exponentially more than early ones — intentional-looking
                # heuristic, but NOTE(review): confirm this drift is desired.
                current_embedding = (current_embedding + embeddings[i]) / 2

        # Flush the trailing segment.
        if current_segment:
            segments.append(' '.join(current_segment))

        return segments

    return paragraphs
251
 
252
def extract_features(text, embedder, category_focus=None):
    """Extract a numeric feature vector for classification.

    The vector concatenates: basic text statistics, per-category keyword /
    regex-pattern / rubric-word counts (keyword density doubled for the
    focused category), an optional text-vs-category embedding similarity,
    and the (truncated) sentence embedding itself.

    Args:
        text: Segment text to featurize.
        embedder: Sentence-transformer model providing .encode().
        category_focus: Optional category name from CATEGORIES whose
            keyword density is boosted and whose description-similarity
            feature is computed; otherwise that feature is 0.

    Returns:
        numpy.ndarray: float32 hand-crafted features concatenated with the
        embedding (embedding capped at 512 dims, or zeros on failure).
    """
    features = []
    text_lower = text.lower()
    words = text.split()

    # Basic text statistics (max(..., 1) guards division on empty text).
    features.extend([
        len(text),
        len(words),
        len(set(words)) / max(len(words), 1),     # lexical diversity
        len(re.findall(r'[.!?]', text)),          # sentence-ish count
        text.count('I') / max(len(words), 1),     # first-person density
    ])

    # Per-category hand-crafted features, in CATEGORIES order.
    for cat_name, cat_info in CATEGORIES.items():
        keywords = cat_info['keywords']
        keyword_matches = sum(1 for kw in keywords if kw.lower() in text_lower)
        keyword_density = keyword_matches / max(len(keywords), 1)

        # Boost the category this feature vector is being built for.
        if category_focus == cat_name:
            keyword_density *= 2

        features.append(keyword_density * 10)

        pattern_matches = 0
        for pattern in cat_info.get('patterns', []):
            matches = re.findall(pattern, text_lower)
            pattern_matches += len(matches)
        features.append(pattern_matches)

        positive_count = sum(1 for word in cat_info['rubric_features']['positive']
                             if word in text_lower)
        negative_count = sum(1 for word in cat_info['rubric_features']['negative']
                             if word in text_lower)

        # Rubric-word rates per 100 words.
        features.extend([
            positive_count / max(len(words), 1) * 100,
            negative_count / max(len(words), 1) * 100
        ])

    # Text embedding; zeros fallback keeps the vector length stable.
    # Narrowed from a bare `except:` so interrupts still propagate.
    try:
        embedding = embedder.encode(text, convert_to_tensor=False, normalize_embeddings=True)
        if hasattr(embedding, 'cpu'):
            embedding = embedding.cpu().numpy()
        embedding = embedding.flatten()
        # Limit embedding size for memory efficiency
        embedding = embedding[:512] if len(embedding) > 512 else embedding
    except Exception:
        embedding = np.zeros(512)

    # Similarity between the text and the focused category's description.
    if category_focus and category_focus in CATEGORIES:
        category_text = f"{CATEGORIES[category_focus]['description']} {' '.join(CATEGORIES[category_focus]['keywords'][:10])}"
        try:
            category_embedding = embedder.encode(category_text, normalize_embeddings=True)
            if hasattr(category_embedding, 'cpu'):
                category_embedding = category_embedding.cpu().numpy()
            category_embedding = category_embedding.flatten()[:512]
            similarity = cosine_similarity([embedding[:512]], [category_embedding])[0][0]
            features.append(similarity * 10)
        except Exception:
            # Narrowed from a bare `except:`; missing similarity -> 0.
            features.append(0)
    else:
        features.append(0)

    features = np.array(features, dtype=np.float32)
    combined_features = np.concatenate([features, embedding])

    return combined_features
324
+
325
def train_models(df, embedder):
    """Train per-category ensemble classifiers and score regressors.

    For each row, builds a feature vector via extract_features() focused on
    the row's first applied category (or the mean over all category-focused
    vectors when none applied), then trains, per category: an XGBoost
    classifier when >= 5 positives exist, always a balanced RandomForest,
    and an XGBoost regressor (or constant DummyRegressor) for the 1-4
    rubric score normalized to [0, 1].

    Args:
        df: Training DataFrame from load_training_data_from_files(); must
            have 'text', '<cat>_applied', '<cat>_score' columns.
            # assumes a clean RangeIndex — `idx` below is the index label,
            # so progress would misbehave on a non-default index; TODO confirm
        embedder: Sentence-transformer model passed to extract_features().

    Returns:
        tuple: (scaler, classifiers, scorers, thresholds, accuracies,
        ensemble) — StandardScaler fit on train split; dicts keyed by
        category; per-category 0.5 thresholds; held-out accuracies in
        CATEGORIES order; and {cat: [(name, model), ...]} ensembles.
    """
    all_features = []

    # Streamlit UI feedback while features are extracted / models trained.
    progress_bar = st.progress(0)
    status_text = st.empty()

    status_text.text("Extracting features from training data...")

    for idx, row in df.iterrows():
        text = row['text']

        # One focused feature vector per category for this text.
        category_features = {}
        for cat in CATEGORIES.keys():
            features = extract_features(text, embedder, category_focus=cat)
            category_features[cat] = features

        true_categories = [cat for cat in CATEGORIES.keys() if row[f"{cat}_applied"]]

        if true_categories:
            # Use the vector focused on the first applied category.
            features = category_features[true_categories[0]]
        else:
            # No label: average the category-focused vectors.
            features = np.mean(list(category_features.values()), axis=0)

        all_features.append(features)
        progress_bar.progress((idx + 1) / len(df))

    X = np.array(all_features)

    categories = list(CATEGORIES.keys())
    # Multi-label target matrix: one 0/1 column per category.
    y_class = df[[f"{cat}_applied" for cat in categories]].values.astype(float)

    # Score targets normalized from the 1-4 rubric to [0.25, 1]; 0 when
    # the category is not applied.
    y_score = []
    for _, row in df.iterrows():
        scores = []
        for cat in categories:
            if row[f"{cat}_applied"]:
                scores.append(row[f"{cat}_score"] / 4.0)
            else:
                scores.append(0)
        y_score.append(scores)
    y_score = np.array(y_score)

    status_text.text("Training models...")

    # Split data
    X_train, X_test, y_class_train, y_class_test, y_score_train, y_score_test = train_test_split(
        X, y_class, y_score, test_size=0.2, random_state=42
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train classifiers and scorers
    classifiers = {}
    scorers = {}
    thresholds = {}
    ensemble = {}

    for i, cat in enumerate(categories):
        n_positive = np.sum(y_class_train[:, i])

        models = []

        # XGBoost classifier — only when enough positive examples exist.
        if n_positive >= 5:
            xgb_clf = xgb.XGBClassifier(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.1,
                random_state=42,
                # NOTE(review): use_label_encoder is deprecated/removed in
                # newer xgboost releases — confirm the pinned version.
                use_label_encoder=False,
                eval_metric='logloss'
            )
            xgb_clf.fit(X_train_scaled, y_class_train[:, i])
            models.append(('xgb', xgb_clf))
            classifiers[cat] = xgb_clf

        # Random Forest as backup or ensemble member
        rf_clf = RandomForestClassifier(
            n_estimators=100,
            max_depth=6,
            class_weight='balanced',
            random_state=42
        )
        rf_clf.fit(X_train_scaled, y_class_train[:, i])
        models.append(('rf', rf_clf))

        # With too few positives, RF alone is the primary classifier.
        if n_positive < 5:
            classifiers[cat] = rf_clf

        ensemble[cat] = models
        thresholds[cat] = 0.5

        # Train scorer on positive examples only (where a rubric score exists).
        mask = y_class_train[:, i] == 1
        if np.sum(mask) > 5:
            scorer = xgb.XGBRegressor(
                n_estimators=100,
                max_depth=4,
                random_state=42
            )
            scorer.fit(X_train_scaled[mask], y_score_train[mask, i])
        else:
            # Too few positives: constant mid-range prediction (0.5).
            from sklearn.dummy import DummyRegressor
            scorer = DummyRegressor(strategy='constant', constant=0.5)
            scorer.fit(X_train_scaled, y_score_train[:, i])

        scorers[cat] = scorer

    # Calculate accuracies on the held-out split, per category.
    accuracies = []
    for i, cat in enumerate(categories):
        preds = classifiers[cat].predict(X_test_scaled)
        acc = np.mean(preds == y_class_test[:, i])
        accuracies.append(acc)

    # Clear the transient UI widgets.
    status_text.empty()
    progress_bar.empty()

    return scaler, classifiers, scorers, thresholds, accuracies, ensemble
448
+
449
def classify_segment(text, embedder, scaler, classifiers, scorers, thresholds, ensemble=None):
    """Classify a text segment into one rubric category and score it.

    Args:
        text: The segment text to classify.
        embedder: Sentence-transformer model used by ``extract_features``.
        scaler: Fitted ``StandardScaler`` for the feature vectors.
        classifiers: Per-category classifiers (dict keyed by category name).
        scorers: Per-category regressors predicting a normalized score in [0, 1].
        thresholds: Per-category detection thresholds (probability cutoffs).
        ensemble: Optional dict mapping category -> list of (name, model)
            pairs; when present for a category, member probabilities are
            averaged instead of using the single classifier.

    Returns:
        dict with keys ``category``, ``score`` (1-4 or None), ``confidence``,
        ``text`` and ``all_probabilities``. A segment whose best probability
        does not clear its threshold is returned as 'Unclassified' with
        ``score`` None and ``confidence`` 0.
    """
    categories = list(CATEGORIES.keys())
    category_results = {}
    # Cache the scaled feature vector per category so the winning category's
    # scorer can reuse it instead of re-running the (expensive) embedding a
    # second time, as the original implementation did.
    scaled_features = {}

    for cat in categories:
        features = extract_features(text, embedder, category_focus=cat)
        features_scaled = scaler.transform([features])
        scaled_features[cat] = features_scaled

        if ensemble and cat in ensemble:
            # Average the positive-class probability over ensemble members
            # that expose predict_proba and were trained on both classes.
            probs = []
            for _name, model in ensemble[cat]:
                if hasattr(model, 'predict_proba'):
                    model_probs = model.predict_proba(features_scaled)
                    if model_probs.shape[1] == 2:
                        probs.append(model_probs[0, 1])
            avg_prob = np.mean(probs) if probs else 0.5
        elif hasattr(classifiers[cat], 'predict_proba'):
            probs = classifiers[cat].predict_proba(features_scaled)
            # A single proba column means the model saw only one class during
            # training; fall back to a neutral probability in that case.
            avg_prob = probs[0, 1] if probs.shape[1] == 2 else 0.5
        else:
            avg_prob = 0.5

        category_results[cat] = avg_prob

    best_category = max(category_results, key=category_results.get)
    best_prob = category_results[best_category]

    # Below threshold: report the segment as unclassified (guard clause).
    if best_prob <= thresholds.get(best_category, 0.5):
        return {
            'category': 'Unclassified',
            'score': None,
            'confidence': 0,
            'text': text,
            'all_probabilities': category_results
        }

    try:
        # Scorers predict a normalized value in [0, 1]; map it onto the
        # 1-4 rubric scale, clamping to the valid range.
        score_normalized = scorers[best_category].predict(scaled_features[best_category])[0]
        score = int(np.clip(np.round(score_normalized * 4), 1, 4))
    except Exception:
        # Narrowed from a bare `except:` — only swallow genuine prediction
        # failures, not KeyboardInterrupt/SystemExit. Default to mid-range.
        score = 2

    return {
        'category': best_category,
        'score': score,
        'confidence': float(best_prob),
        'text': text,
        'all_probabilities': category_results
    }
510
 
511
+ def analyze_statement(text, embedder, scaler, classifiers, scorers, thresholds, ensemble=None):
512
  """Analyze complete personal statement"""
513
+ segments = segment_text(text, embedder)
514
 
515
  segment_results = []
516
  for i, segment in enumerate(segments):
517
+ result = classify_segment(segment, embedder, scaler, classifiers, scorers, thresholds, ensemble)
518
  result['segment_num'] = i + 1
519
  segment_results.append(result)
520
 
 
545
 
546
  return segment_results, category_results
547
 
548
+ def create_pdf_report(segment_results, category_results):
549
+ """Create PDF report"""
550
+ if not PDF_AVAILABLE:
551
+ return None
552
+
553
  buffer = BytesIO()
554
  doc = SimpleDocTemplate(buffer, pagesize=letter, rightMargin=72, leftMargin=72,
555
  topMargin=72, bottomMargin=18)
 
611
  ]))
612
 
613
  elements.append(summary_table)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
614
 
615
  # Build PDF
616
  doc.build(elements)
 
620
  # Main Application
621
  def main():
622
  st.title("πŸ₯ Medical School Personal Statement Analyzer")
623
+ st.markdown("*AI-powered analysis based on medical school admission rubrics*")
624
  st.markdown("---")
625
 
626
+ # Initialize session state
627
+ if 'model_trained' not in st.session_state:
628
+ st.session_state.model_trained = False
629
+ if 'embedder' not in st.session_state:
630
+ st.session_state.embedder = None
631
+ if 'scaler' not in st.session_state:
632
+ st.session_state.scaler = None
633
+ if 'classifiers' not in st.session_state:
634
+ st.session_state.classifiers = None
635
+ if 'scorers' not in st.session_state:
636
+ st.session_state.scorers = None
637
+ if 'thresholds' not in st.session_state:
638
+ st.session_state.thresholds = None
639
+ if 'ensemble' not in st.session_state:
640
+ st.session_state.ensemble = None
641
+
642
+ # Create three tabs
643
+ tab1, tab2, tab3 = st.tabs(["πŸ“š Step 1: Train Model", "πŸ“ Step 2: Analyze Statements", "πŸ“Š Step 3: View Rubrics"])
644
+
645
+ # STEP 1: TRAIN MODEL
646
+ with tab1:
647
+ st.header("Step 1: Train the AI Model")
648
  st.markdown("""
649
+ ### Instructions:
650
+ Click the 'Train Model' button to automatically train the AI using:
651
+ - Pre-loaded Excel training files
652
+ - State-of-the-art e5-large-v2 transformer model
653
+ - Ensemble classification algorithms
654
+ """)
655
 
656
+ # Check if models already exist in session
657
+ if st.session_state.model_trained:
658
+ st.success("βœ… Model is already trained and ready for analysis!")
659
+ st.info("You can proceed to Step 2 to analyze statements, or retrain if needed.")
660
 
661
+ st.markdown("---")
 
662
 
663
+ # Train button
664
+ if st.button("πŸš€ Train Model", type="primary", use_container_width=True):
665
+ # Load training data
666
+ with st.spinner("Loading training data from Excel files..."):
667
+ df = load_training_data_from_files()
668
+
669
+ if df is None or df.empty:
670
+ st.error("""
671
+ ❌ Could not load training data. Please ensure these files are present:
672
+ - DedooseChartExcerpts_2025_8_5_1025.xlsx
673
+ - Personal Statements Coded.xlsx
674
+ """)
675
+ st.stop()
676
+
677
+ st.success(f"βœ… Loaded {len(df)} training samples")
678
+
679
+ # Show data distribution
680
+ st.subheader("Training Data Distribution:")
681
+ dist_cols = st.columns(4)
682
+ for idx, cat in enumerate(CATEGORIES.keys()):
683
+ if f"{cat}_applied" in df.columns:
684
+ count = df[f"{cat}_applied"].sum()
685
+ with dist_cols[idx % 4]:
686
+ st.metric(cat, f"{int(count)} samples")
687
+
688
+ # Load transformer model
689
+ with st.spinner("Loading e5-large-v2 transformer model..."):
690
+ if st.session_state.embedder is None:
691
+ embedder, embedder_name = load_sentence_transformer()
692
+ st.session_state.embedder = embedder
693
+ else:
694
+ embedder = st.session_state.embedder
695
+ embedder_name = 'intfloat/e5-large-v2'
696
+
697
+ if embedder is None:
698
+ st.error("Failed to load transformer model")
699
+ st.stop()
700
+
701
+ st.info(f"Using model: {embedder_name}")
702
+
703
+ # Train models
704
+ st.subheader("Training Progress:")
705
+ scaler, classifiers, scorers, thresholds, accuracies, ensemble = train_models(df, embedder)
706
+
707
+ # Save to session state
708
+ st.session_state.scaler = scaler
709
+ st.session_state.classifiers = classifiers
710
+ st.session_state.scorers = scorers
711
+ st.session_state.thresholds = thresholds
712
+ st.session_state.ensemble = ensemble
713
+ st.session_state.model_trained = True
714
+
715
+ st.success("βœ… Training Complete!")
716
+
717
+ # Show performance metrics
718
+ st.subheader("Model Performance:")
719
+ metrics_cols = st.columns(4)
720
+ for idx, (cat, acc) in enumerate(zip(CATEGORIES.keys(), accuracies)):
721
+ with metrics_cols[idx % 4]:
722
+ st.metric(cat, f"{acc:.1%} accuracy")
723
+
724
+ avg_accuracy = np.mean(accuracies)
725
+ st.metric("**Overall Model Accuracy**", f"{avg_accuracy:.1%}")
726
+
727
+ st.balloons()
728
+
729
+ # STEP 2: ANALYZE STATEMENTS
730
+ with tab2:
731
+ st.header("Step 2: Analyze Personal Statements")
732
 
733
+ # Check if models are trained
734
+ if not st.session_state.model_trained:
735
+ st.warning("⚠️ No trained models found. Please complete Step 1: Train Model first.")
736
+ st.stop()
737
 
738
+ st.success("βœ… Models loaded successfully")
 
 
 
 
 
739
 
740
+ st.markdown("""
741
+ ### Instructions:
742
+ Upload or paste a personal statement to receive:
743
+ - Category detection and scoring (1-4)
744
+ - Segment-by-segment analysis
745
+ - Detailed recommendations
746
+ - Downloadable PDF report
747
+ """)
748
 
749
+ # Input method selection
750
+ input_method = st.radio(
751
+ "Choose input method:",
752
+ ["Upload Text File (.txt)", "Paste Text Directly"],
753
+ horizontal=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
754
  )
755
 
756
+ statement_text = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
757
 
758
+ if input_method == "Upload Text File (.txt)":
759
+ uploaded_file = st.file_uploader(
760
+ "Choose a text file",
761
+ type=['txt'],
762
+ help="Upload your personal statement as a .txt file"
763
+ )
 
 
 
 
 
 
 
764
 
765
+ if uploaded_file is not None:
766
+ statement_text = str(uploaded_file.read(), 'utf-8')
767
+ st.success(f"βœ… File uploaded ({len(statement_text)} characters)")
768
+
769
+ with st.expander("Preview Statement"):
770
+ st.text(statement_text[:500] + "..." if len(statement_text) > 500 else statement_text)
771
+
772
+ else: # Paste Text Directly
773
+ statement_text = st.text_area(
774
+ "Paste your personal statement here:",
775
+ height=400,
776
+ placeholder="Enter your complete personal statement...",
777
+ help="Paste your entire personal statement for analysis"
778
+ )
779
 
780
+ if statement_text:
781
+ st.info(f"πŸ“Š Statement length: {len(statement_text)} characters, {len(statement_text.split())} words")
782
+
783
+ # Analyze button
784
+ if statement_text and len(statement_text) > 100:
785
+ if st.button("πŸ”¬ Analyze Statement", type="primary", use_container_width=True):
786
+
787
+ with st.spinner("Analyzing your personal statement..."):
788
+ segment_results, category_results = analyze_statement(
789
+ statement_text,
790
+ st.session_state.embedder,
791
+ st.session_state.scaler,
792
+ st.session_state.classifiers,
793
+ st.session_state.scorers,
794
+ st.session_state.thresholds,
795
+ st.session_state.ensemble
796
+ )
797
+
798
+ st.success("βœ… Analysis Complete!")
799
+ st.balloons()
800
+
801
+ # Display results
802
+ st.markdown("---")
803
+ st.subheader("πŸ“Š Overall Summary")
804
 
805
  # Metrics
806
  col1, col2, col3, col4 = st.columns(4)
 
808
  detected_cats = [cat for cat, res in category_results.items() if res['detected']]
809
 
810
  with col1:
811
+ st.metric("Categories Found", f"{len(detected_cats)}/4")
 
 
 
 
812
 
813
  with col2:
814
  if detected_cats:
 
823
  with col4:
824
  if detected_cats:
825
  avg_score = np.mean([category_results[cat]['score'] for cat in detected_cats])
826
+ quality = "Excellent" if avg_score >= 3.5 else "Good" if avg_score >= 2.5 else "Needs Work"
827
+ st.metric("Overall Quality", quality)
 
 
 
 
 
 
 
 
828
  else:
829
  st.metric("Overall Quality", "N/A")
830
 
831
+ # Category Analysis
832
+ st.markdown("---")
833
+ st.subheader("πŸ“‹ Category Analysis")
834
 
835
  for cat in CATEGORIES.keys():
836
  res = category_results[cat]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
837
  if res['detected']:
838
+ icon = "βœ…" if res['score'] >= 3 else "⚠️" if res['score'] >= 2 else "❌"
839
+ st.write(f"{icon} **{cat}**: Score {res['score']}/4 (Confidence: {res['confidence']:.1%})")
840
  st.progress(res['score'] / 4)
841
+ else:
842
+ st.write(f"❌ **{cat}**: Not detected")
843
+ st.progress(0)
844
+
845
+ # Segment Details
846
+ st.markdown("---")
847
+ st.subheader("πŸ“ Segment-by-Segment Analysis")
848
 
849
  for segment in segment_results:
850
+ quality_map = {1: "Poor", 2: "Below Average", 3: "Good", 4: "Excellent", None: "N/A"}
851
+ quality = quality_map.get(segment['score'], "N/A")
852
+
853
  with st.expander(f"Segment {segment['segment_num']}: {segment['category']} (Score: {segment['score']}/4)"):
854
  col1, col2 = st.columns([1, 3])
855
 
856
  with col1:
857
  st.metric("Category", segment['category'])
858
+ st.metric("Score", f"{segment['score']}/4" if segment['score'] else "N/A")
859
  st.metric("Confidence", f"{segment['confidence']:.1%}")
860
 
861
  with col2:
862
  st.write("**Text:**")
863
+ st.write(segment['text'][:500] + "..." if len(segment['text']) > 500 else segment['text'])
864
 
865
+ if segment['category'] != 'Unclassified' and segment['score']:
866
+ st.write("**Rubric:**")
867
+ st.info(CATEGORIES[segment['category']]['rubric'][segment['score']])
868
+
869
+ # Recommendations
870
+ st.markdown("---")
871
+ st.subheader("πŸ’‘ Recommendations")
872
 
873
  missing_cats = [cat for cat, res in category_results.items() if not res['detected']]
874
  low_score_cats = [cat for cat, res in category_results.items()
875
  if res['detected'] and res['score'] and res['score'] < 3]
876
 
877
  if missing_cats:
878
+ st.error("**Missing Categories - Must Add:**")
879
  for cat in missing_cats:
880
+ st.write(f"**{cat}:** {CATEGORIES[cat]['description']}")
881
+ st.write(f"Keywords: {', '.join(CATEGORIES[cat]['keywords'][:8])}")
 
 
 
882
 
883
  if low_score_cats:
884
+ st.warning("**Low-Scoring Categories - Improve:**")
885
  for cat in low_score_cats:
886
+ score = category_results[cat]['score']
887
+ st.write(f"**{cat}** (Score: {score}/4)")
888
+ st.write(f"Target: {CATEGORIES[cat]['rubric'][4]}")
 
 
 
 
889
 
890
  if not missing_cats and not low_score_cats:
891
+ st.success("Excellent! All categories present with good scores.")
892
+
893
+ # Download Report
894
+ st.markdown("---")
895
+ if PDF_AVAILABLE:
896
+ pdf_buffer = create_pdf_report(segment_results, category_results)
897
+ if pdf_buffer:
898
+ st.download_button(
899
+ label="πŸ“₯ Download PDF Report",
900
+ data=pdf_buffer,
901
+ file_name=f"analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf",
902
+ mime="application/pdf",
903
+ use_container_width=True
904
+ )
905
+ else:
906
+ # CSV fallback
907
+ results_data = []
908
+ for seg in segment_results:
909
+ results_data.append({
910
+ 'Segment': seg['segment_num'],
911
+ 'Category': seg['category'],
912
+ 'Score': seg['score'],
913
+ 'Confidence': seg['confidence']
914
+ })
915
+
916
+ results_df = pd.DataFrame(results_data)
917
+ csv = results_df.to_csv(index=False)
918
+
919
+ st.download_button(
920
+ label="πŸ“₯ Download CSV Report",
921
+ data=csv,
922
+ file_name=f"analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
923
+ mime="text/csv",
924
+ use_container_width=True
925
+ )
926
+
927
+ elif statement_text and len(statement_text) <= 100:
928
+ st.warning("⚠️ Please enter a longer statement (minimum 100 characters)")
929
+ else:
930
+ st.info("πŸ‘† Please upload or paste your personal statement to begin analysis")
931
+
932
+ # STEP 3: VIEW RUBRICS
933
+ with tab3:
934
+ st.header("Step 3: Understanding the Scoring Rubrics")
935
+
936
+ st.markdown("""
937
+ The AI model evaluates personal statements based on **4 key categories**,
938
+ each scored on a scale of **1 (Poor) to 4 (Excellent)**.
939
+ """)
940
+
941
+ for category, info in CATEGORIES.items():
942
+ with st.expander(f"**{category}** - {info['description']}", expanded=False):
943
 
944
+ # Scoring Criteria
945
+ st.subheader("Scoring Criteria:")
946
+ for score in [4, 3, 2, 1]:
947
+ quality = ['Poor', 'Below Average', 'Good', 'Excellent'][score-1]
948
+ if score == 4:
949
+ st.success(f"**Score {score} ({quality}):** {info['rubric'][score]}")
950
+ elif score == 3:
951
+ st.info(f"**Score {score} ({quality}):** {info['rubric'][score]}")
952
+ elif score == 2:
953
+ st.warning(f"**Score {score} ({quality}):** {info['rubric'][score]}")
954
+ else:
955
+ st.error(f"**Score {score} ({quality}):** {info['rubric'][score]}")
956
 
957
+ st.markdown("---")
 
 
 
 
 
 
 
958
 
959
+ # Keywords and indicators
960
+ col1, col2 = st.columns(2)
961
+
962
+ with col1:
963
+ st.markdown("**Key Terms:**")
964
+ st.write(', '.join(info['keywords'][:10]))
965
+
966
+ with col2:
967
+ st.markdown("**Quality Indicators:**")
968
+ st.write(f"βœ… Positive: {', '.join(info['rubric_features']['positive'][:5])}")
969
+ st.write(f"❌ Avoid: {', '.join(info['rubric_features']['negative'][:5])}")
970
+
971
+ st.markdown("---")
972
+ st.info("""
973
+ ### Tips for High Scores:
974
+ - **Spark (4/4):** Create an engaging opening that clearly connects to your medical journey
975
+ - **Healthcare Experience (4/4):** Show active participation with vivid, thoughtful descriptions
976
+ - **Doctor Qualities (4/4):** Demonstrate mature, realistic understanding with specific examples
977
+ - **Spin (4/4):** Make direct, logical connections between experiences and medical career
978
+ """)
979
 
980
  # Run the application
981
  if __name__ == "__main__":