import streamlit as st
import spacy
import textdescriptives as td
import pandas as pd
import math
import numpy as np
import joblib
import os
from pathlib import Path
# Set page config
# Wide layout gives the batch-processing result tables room to render.
st.set_page_config(page_title="Text Grade Level Assignment", page_icon="π", layout="wide")
def check_password():
    """Returns `True` if the user had the correct password.

    Renders a password input (via Streamlit's on_change callback pattern) and
    stores the outcome in ``st.session_state["password_correct"]`` so the check
    survives Streamlit reruns.
    """
    import hmac  # local import: only needed for the constant-time comparison

    def password_entered():
        """Checks whether a password entered by the user is correct."""
        # compare_digest avoids the timing side-channel of a plain `==`
        # string comparison on a secret.
        if hmac.compare_digest(st.session_state["password"], "gradelevel"):
            st.session_state["password_correct"] = True
            del st.session_state["password"]  # don't store password
        else:
            st.session_state["password_correct"] = False

    if "password_correct" not in st.session_state:
        # First run, show input for password.
        st.text_input(
            "Password", type="password", on_change=password_entered, key="password"
        )
        return False
    elif not st.session_state["password_correct"]:
        # Password not correct, show input + error.
        st.text_input(
            "Password", type="password", on_change=password_entered, key="password"
        )
        st.error("π Password incorrect")
        return False
    else:
        # Password correct.
        return True
# Gate the whole app behind the password prompt; st.stop() halts this rerun
# so nothing below renders until the password is correct.
if not check_password():
    st.stop()

st.title("π Text Grade Level Assignment")
st.markdown("Assign the grade level complexity of your text using quantitative metrics.")
# Cache the heavy model loading
@st.cache_resource
def load_spacy_model():
    """Load spaCy's en_core_web_sm with the textdescriptives pipeline attached.

    Downloads the model first if it is not installed (the warning is only
    shown when a download actually happens). Returns the nlp pipeline, or
    None if loading fails.
    """
    try:
        if not spacy.util.is_package("en_core_web_sm"):
            st.warning("Downloading spacy model 'en_core_web_sm'... this might take a while.")
            spacy.cli.download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm")
        # "textdescriptives/all" adds every readability/complexity extension
        # that analyze_text() reads via td.extract_dict().
        nlp.add_pipe("textdescriptives/all")
        return nlp
    except Exception as e:
        st.error(f"Error loading Spacy model: {e}")
        return None
# Loaded once per session thanks to st.cache_resource; may be None on failure.
nlp = load_spacy_model()
# Grade band mapping: ordinal encoding of the grade bands, ordered from
# lowest to highest text complexity. The regression model predicts the
# ordinal; REVERSE_MAPPING turns it back into the band label.
_BANDS = ("K-1", "2-3", "4-5", "6-8", "9-10", "11-CCR", "CCR+")
GRADE_BAND_ORDER = {band: rank for rank, band in enumerate(_BANDS)}
REVERSE_MAPPING = {rank: band for band, rank in GRADE_BAND_ORDER.items()}
def get_grade_level(predicted_order):
    """Turns model predicted grade band order into the grade level string."""
    # Round to the nearest ordinal, then clamp into the valid 0-6 band range.
    clamped = min(6, max(0, round(predicted_order)))
    return REVERSE_MAPPING.get(clamped, "Unknown")
# Load the regression model
# The pickle lives in <project-root>/models/, one level above this script's folder.
MODEL_PATH = Path(__file__).parent.parent / "models" / "grade_level_quant_regression_model.pkl"
@st.cache_resource
def load_regression_model():
    """Load the pickled grade-level regression model from MODEL_PATH.

    Returns the model object, or None when the file is absent or unreadable.
    """
    if not os.path.exists(MODEL_PATH):
        return None
    try:
        loaded = joblib.load(MODEL_PATH)
    except Exception as e:
        st.error(f"Error loading model file: {e}")
        return None
    return loaded
# Cached per session; None signals "model file missing" to the UI below.
model = load_regression_model()
def clean_value(val, default=0.0):
    """Returns the default value if val is NaN or None, otherwise returns val.

    Robust to non-numeric inputs: math.isnan raises TypeError for values it
    cannot interpret (e.g. strings), in which case val is passed through
    unchanged instead of crashing the caller.
    """
    if val is None:
        return default
    try:
        if math.isnan(val):
            return default
    except TypeError:
        # Not a number at all — leave it untouched.
        return val
    return val
def analyze_text(text, nlp_model, regression_model):
    """
    Analyzes text and returns metrics and predicted grade level.

    Args:
        text: the raw text to analyze.
        nlp_model: spaCy pipeline with the textdescriptives components attached.
        regression_model: fitted regressor exposing .predict(DataFrame).

    Returns: (grade_level, metrics_dict)
        (None, None) for empty/non-string input; ("Error", {}) when the
        analysis itself fails.
    """
    if not text or not isinstance(text, str) or not text.strip():
        return None, None
    try:
        # Process text
        doc = nlp_model(text)
        doc_stats = td.extract_dict(doc)[0]
        # Extract key metrics. clean_value runs BEFORE round so a None/NaN
        # stat falls back to 0.0 instead of raising inside round() and
        # masking the whole row as "Error".
        metrics = {
            "FK_score": round(clean_value(doc_stats['flesch_kincaid_grade']), 2),
            "Gunning_fog": round(clean_value(doc_stats['gunning_fog']), 2),
            "Smog": round(clean_value(doc_stats['smog']), 2),
            "Lix": round(clean_value(doc_stats['lix']), 2),
            "Rix": round(clean_value(doc_stats['rix']), 2),
            "complexity_score_entropy": round(clean_value(doc_stats['entropy']), 2),
            "Sentence_Length": round(clean_value(doc_stats['sentence_length_mean']), 2)
        }
        # Prepare for prediction: feature order must match model training.
        selected_var = ['FK_score', 'Gunning_fog', 'Smog', 'Lix', 'Rix', 'complexity_score_entropy', 'Sentence_Length']
        # Create DataFrame with single row
        input_data = [[metrics[col] for col in selected_var]]
        new_data_processed = pd.DataFrame(input_data, columns=selected_var)
        # Predict
        raw_prediction = regression_model.predict(new_data_processed)[0]
        grade_band = get_grade_level(raw_prediction)
        return grade_band, metrics
    except Exception:
        # textdescriptives can raise (e.g. division by zero) on degenerate
        # text; signal failure with a sentinel rather than crashing the app.
        return "Error", {}
# --- Sidebar for Batch Processing ---
# NOTE(review): reconstructed layout assumes the batch results render inside
# the sidebar (the "Main Application Area" marker comes after) — confirm.
with st.sidebar:
    st.title("Upload your csv file for batch processing")
    st.markdown("*!!! The CSV file must contain a column named **text**.*")
    uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

    # Processing only runs once a file is uploaded AND both models loaded.
    if uploaded_file is not None and model is not None and nlp is not None:
        st.divider()
        st.header("Batch Processing Results")
        try:
            df = pd.read_csv(uploaded_file)
            if "text" not in df.columns:
                st.error("The CSV file must contain a column named 'text'.")
            else:
                if st.button("Process CSV"):
                    progress_bar = st.progress(0, text="Processing rows...")
                    results = []
                    total_rows = len(df)
                    # enumerate gives a guaranteed 0-based position even if the
                    # DataFrame index is not a clean 0..n-1 RangeIndex, keeping
                    # the progress fraction in [0, 1].
                    for pos, (_, row) in enumerate(df.iterrows()):
                        text = str(row["text"])
                        grade, metrics = analyze_text(text, nlp, model)
                        row_result = row.to_dict()
                        row_result["predicted_grade_level"] = grade if grade else "N/A"
                        # Always store a dict here: a stray "N/A" string would
                        # corrupt the apply(pd.Series) expansion below with a
                        # bogus unnamed column. An empty dict expands to NaNs.
                        row_result["metrics"] = metrics if metrics else {}
                        results.append(row_result)
                        # Update progress
                        progress_bar.progress((pos + 1) / total_rows, text=f"Processing row {pos+1}/{total_rows}")
                    progress_bar.empty()
                    # Create result DF and expand per-row metrics into columns.
                    result_df = pd.DataFrame(results)
                    expanded_df = result_df['metrics'].apply(pd.Series)
                    final_df = pd.concat([result_df.drop('metrics', axis=1), expanded_df], axis=1)
                    # Show first 5 rows
                    st.subheader("Preview (First 5 Rows)")
                    st.dataframe(final_df.head(5))
                    # Download button
                    csv = final_df.to_csv(index=False).encode('utf-8')
                    st.download_button(
                        label="Download results as CSV",
                        data=csv,
                        file_name='grade_level_predictions.csv',
                        mime='text/csv',
                    )
        except Exception as e:
            st.error(f"Error processing CSV: {e}")
# --- Main Application Area ---
if model is None:
    # No regression model on disk: show setup guidance instead of the UI.
    st.warning(f"β οΈ Model file not found at `{MODEL_PATH}`.")
    st.info("Please place your `grade_level_quant_regression_model.pkl` file in the `models` directory at the root of your project.")
else:
    # Input Area
    st.subheader("Single Text Analysis")
    text_input = st.text_area("Enter text to analyze:", height=200, placeholder="Paste your text here...")
    if st.button("Grade Level Prediction", type="primary"):
        if not text_input.strip():
            st.warning("Please enter some text first.")
        elif nlp is None:
            st.error("Text processing model (Spacy) is not available.")
        else:
            with st.spinner("Analyzing text complexity..."):
                grade_band, metrics = analyze_text(text_input, nlp, model)
                # analyze_text signals failure with the sentinel "Error".
                if grade_band == "Error":
                    st.error("An error occurred during analysis. Please check your input text.")
                elif grade_band:
                    # Output
                    st.success(f"### Assigned Grade band based on Quant Metrics: **{grade_band}**")
                    with st.expander("View Detailed Metrics"):
                        st.json(metrics)