"""Streamlit app that assigns a grade band to text from quantitative metrics.

The app gates access behind a shared password, loads a spaCy pipeline with the
``textdescriptives/all`` component plus a pickled regression model, and offers
two workflows: single-text analysis in the main area and CSV batch processing.
"""
import hmac
import logging
import math
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import spacy
import streamlit as st
import textdescriptives as td

logger = logging.getLogger(__name__)

# Set page config
st.set_page_config(page_title="Text Grade Level Assignment", page_icon="📚", layout="wide")

# SECURITY NOTE(review): the shared password is hard-coded in source control.
# Consider moving it to deployment secrets; it is kept here so existing
# deployments continue to work unchanged.
APP_PASSWORD = "gradelevel"


def check_password() -> bool:
    """Return ``True`` once the user has entered the correct password.

    While the password is missing or wrong, a password input is rendered (with
    an error message after a failed attempt) and ``False`` is returned.
    """

    def password_entered() -> None:
        """Record in session state whether the submitted password matches."""
        entered = st.session_state.get("password", "")
        # Compare as bytes: compare_digest rejects non-ASCII str arguments.
        if hmac.compare_digest(entered.encode("utf-8"), APP_PASSWORD.encode("utf-8")):
            st.session_state["password_correct"] = True
            del st.session_state["password"]  # don't store password
        else:
            st.session_state["password_correct"] = False

    if st.session_state.get("password_correct", False):
        return True

    st.text_input("Password", type="password", on_change=password_entered, key="password")
    # The flag only exists after at least one attempt, so its presence here
    # (with a falsy value) means the last attempt failed.
    if "password_correct" in st.session_state:
        st.error("😕 Password incorrect")
    return False


if not check_password():
    st.stop()

st.title("📚 Text Grade Level Assignment")
st.markdown("Assign the grade level complexity of your text using quantitative metrics.")


# Cache the heavy model loading
@st.cache_resource
def load_spacy_model():
    """Load ``en_core_web_sm`` with the ``textdescriptives/all`` pipe added.

    Returns:
        The spaCy pipeline, or ``None`` (after showing an error) when loading
        fails.
    """
    try:
        # Automatic download of the model is disabled, so "en_core_web_sm" must
        # already be installed. No "downloading" notice is shown because no
        # download is attempted here.
        nlp_pipeline = spacy.load("en_core_web_sm")
        nlp_pipeline.add_pipe("textdescriptives/all")
        return nlp_pipeline
    except Exception as e:
        st.error(f"Error loading Spacy model: {e}")
        return None


nlp = load_spacy_model()

# Grade band mapping
GRADE_BAND_ORDER = {
    "K-1": 0,
    "2-3": 1,
    "4-5": 2,
    "6-8": 3,
    "9-10": 4,
    "11-CCR": 5,
    "CCR+": 6,
}
REVERSE_MAPPING = {v: k for k, v in GRADE_BAND_ORDER.items()}


def get_grade_level(predicted_order) -> str:
    """Turn the model's predicted grade band order into the grade band string.

    The prediction is rounded with the built-in ``round`` and clamped to the
    range of orders defined in ``GRADE_BAND_ORDER``.
    """
    highest_order = max(REVERSE_MAPPING)
    band_index = max(0, min(highest_order, int(round(predicted_order))))
    return REVERSE_MAPPING.get(band_index, "Unknown")


# Load the regression model
MODEL_PATH = Path(__file__).parent.parent / "models" / "grade_level_quant_regression_model.pkl"


@st.cache_resource
def load_regression_model():
    """Load the pickled regression model from ``MODEL_PATH``.

    Returns:
        The loaded model, or ``None`` when the file is missing or cannot be
        loaded (an error is shown in the latter case).
    """
    if not MODEL_PATH.is_file():
        return None
    try:
        # SECURITY NOTE: joblib.load unpickles the file, which can execute
        # arbitrary code; only ever point MODEL_PATH at a trusted file.
        return joblib.load(MODEL_PATH)
    except Exception as e:
        st.error(f"Error loading model file: {e}")
        return None


model = load_regression_model()

# Model feature name -> key in the textdescriptives output. The insertion
# order is the column order handed to the regression model.
_METRIC_SOURCES = {
    "FK_score": "flesch_kincaid_grade",
    "Gunning_fog": "gunning_fog",
    "Smog": "smog",
    "Lix": "lix",
    "Rix": "rix",
    "complexity_score_entropy": "entropy",
    "Sentence_Length": "sentence_length_mean",
}
FEATURE_COLUMNS = list(_METRIC_SOURCES)


def clean_value(val, default=0.0):
    """Return ``default`` if ``val`` is ``None`` or NaN, otherwise ``val``."""
    if val is None or math.isnan(val):
        return default
    return val


def analyze_text(text, nlp_model, regression_model):
    """Analyze text and return its predicted grade band and metrics.

    Args:
        text: The text to analyze.
        nlp_model: spaCy pipeline that includes the textdescriptives component.
        regression_model: Fitted model exposing ``predict`` on a DataFrame
            whose columns are ``FEATURE_COLUMNS``.

    Returns:
        ``(grade_level, metrics_dict)`` on success, ``(None, None)`` when
        ``text`` is empty, blank or not a string, and ``("Error", {})`` when
        processing or prediction raises.
    """
    if not text or not isinstance(text, str) or not text.strip():
        return None, None

    try:
        # Process text
        doc = nlp_model(text)
        doc_stats = td.extract_dict(doc)[0]

        # Clean first, then round: round(None) raises, so rounding the raw
        # value would turn one missing metric into a failure of the whole text.
        metrics = {
            feature: round(clean_value(doc_stats[source_key]), 2)
            for feature, source_key in _METRIC_SOURCES.items()
        }

        # Single-row frame with the feature columns in model order.
        features = pd.DataFrame(
            [[metrics[col] for col in FEATURE_COLUMNS]], columns=FEATURE_COLUMNS
        )

        # Predict
        raw_prediction = regression_model.predict(features)[0]
        return get_grade_level(raw_prediction), metrics
    except Exception:
        # UI boundary: report a generic failure to the caller, but keep the
        # traceback in the logs instead of discarding it.
        logger.exception("Text analysis failed")
        return "Error", {}


def _grade_dataframe(df, nlp_model, regression_model, on_progress=None):
    """Grade every row of ``df["text"]`` and return the augmented frame.

    Args:
        df: Input frame containing a ``text`` column.
        nlp_model: Passed through to ``analyze_text``.
        regression_model: Passed through to ``analyze_text``.
        on_progress: Optional callable ``(rows_done, total_rows)`` invoked
            after each row.

    Returns:
        A new frame with the original columns, ``predicted_grade_level`` and
        one column per entry in ``FEATURE_COLUMNS``. Rows that could not be
        analyzed get ``"N/A"`` as grade and empty metric cells.
    """
    total_rows = len(df)
    grades = []
    metric_rows = []
    for rows_done, value in enumerate(df["text"], start=1):
        # A missing cell must not be graded as the literal string "nan".
        text = "" if pd.isna(value) else str(value)
        grade, metrics = analyze_text(text, nlp_model, regression_model)
        grades.append(grade if grade else "N/A")
        metric_rows.append(metrics if metrics else {})
        if on_progress is not None:
            on_progress(rows_done, total_rows)

    result_df = df.reset_index(drop=True)
    result_df["predicted_grade_level"] = grades
    # Fixed columns keep the output schema stable even when rows fail.
    metrics_df = pd.DataFrame(metric_rows, columns=FEATURE_COLUMNS)
    return pd.concat([result_df, metrics_df], axis=1)


# --- Sidebar for Batch Processing ---
# NOTE(review): the source's indentation was lost; the batch results are
# assumed to render inside the sidebar -- confirm against the deployed app.
with st.sidebar:
    st.title("Upload your csv file for batch processing")
    st.markdown("*!!! The CSV file must contain a column named **text**.*")

    uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

    # Processing is triggered explicitly with a button rather than on upload.
    if uploaded_file is not None and model is not None and nlp is not None:
        st.divider()
        st.header("Batch Processing Results")

        try:
            df = pd.read_csv(uploaded_file)

            if "text" not in df.columns:
                st.error("The CSV file must contain a column named 'text'.")
            elif df.empty:
                st.warning("The CSV file contains no rows to process.")
            elif st.button("Process CSV"):
                progress_bar = st.progress(0, text="Processing rows...")

                def report_progress(rows_done, total_rows):
                    """Advance the progress bar after each processed row."""
                    progress_bar.progress(
                        rows_done / total_rows,
                        text=f"Processing row {rows_done}/{total_rows}",
                    )

                final_df = _grade_dataframe(df, nlp, model, on_progress=report_progress)
                progress_bar.empty()

                # Show first 5 rows
                st.subheader("Preview (First 5 Rows)")
                st.dataframe(final_df.head(5))

                # Download button
                csv = final_df.to_csv(index=False).encode('utf-8')
                st.download_button(
                    label="Download results as CSV",
                    data=csv,
                    file_name='grade_level_predictions.csv',
                    mime='text/csv',
                )
        except Exception as e:
            st.error(f"Error processing CSV: {e}")

# --- Main Application Area ---
if model is None:
    st.warning(f"⚠️ Model file not found at `{MODEL_PATH}`.")
    st.info("Please place your `grade_level_quant_regression_model.pkl` file in the `models` directory at the root of your project.")
else:
    # Input Area
    st.subheader("Single Text Analysis")
    text_input = st.text_area("Enter text to analyze:", height=200, placeholder="Paste your text here...")

    if st.button("Grade Level Prediction", type="primary"):
        if not text_input.strip():
            st.warning("Please enter some text first.")
        elif nlp is None:
            st.error("Text processing model (Spacy) is not available.")
        else:
            with st.spinner("Analyzing text complexity..."):
                grade_band, metrics = analyze_text(text_input, nlp, model)

            if grade_band == "Error":
                st.error("An error occurred during analysis. Please check your input text.")
            elif grade_band:
                # Output
                st.success(f"### Assigned Grade band based on Quant Metrics: **{grade_band}**")

                with st.expander("View Detailed Metrics"):
                    st.json(metrics)