import streamlit as st
import spacy
import textdescriptives as td
import pandas as pd
import math
import numpy as np
import joblib
import os
from pathlib import Path
# Set page config
# Wide layout gives the batch-processing result tables room to render.
st.set_page_config(page_title="Text Grade Level Assignment", page_icon="π", layout="wide")
def check_password():
    """Returns `True` if the user had the correct password.

    Renders a password input (via Streamlit's on_change callback pattern) and
    stores the outcome in ``st.session_state["password_correct"]`` so the check
    survives Streamlit reruns.
    """
    import hmac  # local import: only needed for the constant-time comparison

    def password_entered():
        """Checks whether a password entered by the user is correct."""
        # compare_digest avoids the timing side-channel of a plain `==`
        # string comparison on a secret.
        if hmac.compare_digest(st.session_state["password"], "gradelevel"):
            st.session_state["password_correct"] = True
            del st.session_state["password"]  # don't store password
        else:
            st.session_state["password_correct"] = False

    if "password_correct" not in st.session_state:
        # First run, show input for password.
        st.text_input(
            "Password", type="password", on_change=password_entered, key="password"
        )
        return False
    elif not st.session_state["password_correct"]:
        # Password not correct, show input + error.
        st.text_input(
            "Password", type="password", on_change=password_entered, key="password"
        )
        st.error("π Password incorrect")
        return False
    else:
        # Password correct.
        return True
# Gate the whole app behind the password prompt; st.stop() halts this rerun
# so nothing below renders until the password is correct.
if not check_password():
    st.stop()

st.title("π Text Grade Level Assignment")
st.markdown("Assign the grade level complexity of your text using quantitative metrics.")
# Cache the heavy model loading
@st.cache_resource
def load_spacy_model():
    """Load spaCy's en_core_web_sm with the textdescriptives pipeline attached.

    Downloads the model first if it is not installed (the warning is only
    shown when a download actually happens). Returns the nlp pipeline, or
    None if loading fails.
    """
    try:
        if not spacy.util.is_package("en_core_web_sm"):
            st.warning("Downloading spacy model 'en_core_web_sm'... this might take a while.")
            spacy.cli.download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm")
        # "textdescriptives/all" adds every readability/complexity extension
        # that analyze_text() reads via td.extract_dict().
        nlp.add_pipe("textdescriptives/all")
        return nlp
    except Exception as e:
        st.error(f"Error loading Spacy model: {e}")
        return None
# Loaded once per session thanks to st.cache_resource; may be None on failure.
nlp = load_spacy_model()
# Grade band mapping: ordinal encoding of the grade bands, ordered from
# lowest to highest text complexity. The regression model predicts the
# ordinal; REVERSE_MAPPING turns it back into the band label.
_BANDS = ("K-1", "2-3", "4-5", "6-8", "9-10", "11-CCR", "CCR+")
GRADE_BAND_ORDER = {band: rank for rank, band in enumerate(_BANDS)}
REVERSE_MAPPING = {rank: band for band, rank in GRADE_BAND_ORDER.items()}
def get_grade_level(predicted_order):
    """Turns model predicted grade band order into the grade level string."""
    # Round to the nearest ordinal, then clamp into the valid 0-6 band range.
    clamped = min(6, max(0, round(predicted_order)))
    return REVERSE_MAPPING.get(clamped, "Unknown")
# Load the regression model
# The pickle lives in <project-root>/models/, one level above this script's folder.
MODEL_PATH = Path(__file__).parent.parent / "models" / "grade_level_quant_regression_model.pkl"
@st.cache_resource
def load_regression_model():
    """Load the pickled grade-level regression model from MODEL_PATH.

    Returns the model object, or None when the file is absent or unreadable.
    """
    if not os.path.exists(MODEL_PATH):
        return None
    try:
        loaded = joblib.load(MODEL_PATH)
    except Exception as e:
        st.error(f"Error loading model file: {e}")
        return None
    return loaded
# Cached per session; None signals "model file missing" to the UI below.
model = load_regression_model()
def clean_value(val, default=0.0):
    """Returns the default value if val is NaN or None, otherwise returns val.

    Robust to non-numeric inputs: math.isnan raises TypeError for values it
    cannot interpret (e.g. strings), in which case val is passed through
    unchanged instead of crashing the caller.
    """
    if val is None:
        return default
    try:
        if math.isnan(val):
            return default
    except TypeError:
        # Not a number at all — leave it untouched.
        return val
    return val
def analyze_text(text, nlp_model, regression_model):
    """
    Analyzes text and returns metrics and predicted grade level.

    Args:
        text: the raw text to analyze.
        nlp_model: spaCy pipeline with the textdescriptives components attached.
        regression_model: fitted regressor exposing .predict(DataFrame).

    Returns: (grade_level, metrics_dict)
        (None, None) for empty/non-string input; ("Error", {}) when the
        analysis itself fails.
    """
    if not text or not isinstance(text, str) or not text.strip():
        return None, None
    try:
        # Process text
        doc = nlp_model(text)
        doc_stats = td.extract_dict(doc)[0]
        # Extract key metrics. clean_value runs BEFORE round so a None/NaN
        # stat falls back to 0.0 instead of raising inside round() and
        # masking the whole row as "Error".
        metrics = {
            "FK_score": round(clean_value(doc_stats['flesch_kincaid_grade']), 2),
            "Gunning_fog": round(clean_value(doc_stats['gunning_fog']), 2),
            "Smog": round(clean_value(doc_stats['smog']), 2),
            "Lix": round(clean_value(doc_stats['lix']), 2),
            "Rix": round(clean_value(doc_stats['rix']), 2),
            "complexity_score_entropy": round(clean_value(doc_stats['entropy']), 2),
            "Sentence_Length": round(clean_value(doc_stats['sentence_length_mean']), 2)
        }
        # Prepare for prediction: feature order must match model training.
        selected_var = ['FK_score', 'Gunning_fog', 'Smog', 'Lix', 'Rix', 'complexity_score_entropy', 'Sentence_Length']
        # Create DataFrame with single row
        input_data = [[metrics[col] for col in selected_var]]
        new_data_processed = pd.DataFrame(input_data, columns=selected_var)
        # Predict
        raw_prediction = regression_model.predict(new_data_processed)[0]
        grade_band = get_grade_level(raw_prediction)
        return grade_band, metrics
    except Exception:
        # textdescriptives can raise (e.g. division by zero) on degenerate
        # text; signal failure with a sentinel rather than crashing the app.
        return "Error", {}
# --- Sidebar for Batch Processing ---
# NOTE(review): reconstructed layout assumes the batch results render inside
# the sidebar (the "Main Application Area" marker comes after) — confirm.
with st.sidebar:
    st.title("Upload your csv file for batch processing")
    st.markdown("*!!! The CSV file must contain a column named **text**.*")
    uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

    # Processing only runs once a file is uploaded AND both models loaded.
    if uploaded_file is not None and model is not None and nlp is not None:
        st.divider()
        st.header("Batch Processing Results")
        try:
            df = pd.read_csv(uploaded_file)
            if "text" not in df.columns:
                st.error("The CSV file must contain a column named 'text'.")
            else:
                if st.button("Process CSV"):
                    progress_bar = st.progress(0, text="Processing rows...")
                    results = []
                    total_rows = len(df)
                    # enumerate gives a guaranteed 0-based position even if the
                    # DataFrame index is not a clean 0..n-1 RangeIndex, keeping
                    # the progress fraction in [0, 1].
                    for pos, (_, row) in enumerate(df.iterrows()):
                        text = str(row["text"])
                        grade, metrics = analyze_text(text, nlp, model)
                        row_result = row.to_dict()
                        row_result["predicted_grade_level"] = grade if grade else "N/A"
                        # Always store a dict here: a stray "N/A" string would
                        # corrupt the apply(pd.Series) expansion below with a
                        # bogus unnamed column. An empty dict expands to NaNs.
                        row_result["metrics"] = metrics if metrics else {}
                        results.append(row_result)
                        # Update progress
                        progress_bar.progress((pos + 1) / total_rows, text=f"Processing row {pos+1}/{total_rows}")
                    progress_bar.empty()
                    # Create result DF and expand per-row metrics into columns.
                    result_df = pd.DataFrame(results)
                    expanded_df = result_df['metrics'].apply(pd.Series)
                    final_df = pd.concat([result_df.drop('metrics', axis=1), expanded_df], axis=1)
                    # Show first 5 rows
                    st.subheader("Preview (First 5 Rows)")
                    st.dataframe(final_df.head(5))
                    # Download button
                    csv = final_df.to_csv(index=False).encode('utf-8')
                    st.download_button(
                        label="Download results as CSV",
                        data=csv,
                        file_name='grade_level_predictions.csv',
                        mime='text/csv',
                    )
        except Exception as e:
            st.error(f"Error processing CSV: {e}")
# --- Main Application Area ---
if model is None:
    # No regression model on disk: show setup guidance instead of the UI.
    st.warning(f"β οΈ Model file not found at `{MODEL_PATH}`.")
    st.info("Please place your `grade_level_quant_regression_model.pkl` file in the `models` directory at the root of your project.")
else:
    # Input Area
    st.subheader("Single Text Analysis")
    text_input = st.text_area("Enter text to analyze:", height=200, placeholder="Paste your text here...")
    if st.button("Grade Level Prediction", type="primary"):
        if not text_input.strip():
            st.warning("Please enter some text first.")
        elif nlp is None:
            st.error("Text processing model (Spacy) is not available.")
        else:
            with st.spinner("Analyzing text complexity..."):
                grade_band, metrics = analyze_text(text_input, nlp, model)
                # analyze_text signals failure with the sentinel "Error".
                if grade_band == "Error":
                    st.error("An error occurred during analysis. Please check your input text.")
                elif grade_band:
                    # Output
                    st.success(f"### Assigned Grade band based on Quant Metrics: **{grade_band}**")
                    with st.expander("View Detailed Metrics"):
                        st.json(metrics)