# gls / src / streamlit_app.py
# Gary Mu
# add textdescriptive app
# d8aafab
import hmac
import logging
import math
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import spacy
import streamlit as st
import textdescriptives as td
# Set page config: browser-tab title, tab icon and wide layout.
st.set_page_config(page_title="Text Grade Level Assignment", page_icon="📚", layout="wide")
def check_password():
    """Gate the app behind a shared password.

    Renders a password input until the correct password has been submitted
    in the current session. The outcome of the last attempt is kept in
    ``st.session_state["password_correct"]``.

    Returns:
        bool: ``True`` once the correct password has been entered; ``False``
        while the prompt (and, after a failed attempt, an error) is shown.
    """

    def password_entered():
        """Check the submitted password and record the result in session state."""
        # SECURITY(review): the expected password is hard-coded in source.
        # Consider moving it to st.secrets or an environment variable.
        expected = "gradelevel"
        submitted = st.session_state["password"]
        # Compare as bytes in constant time: compare_digest rejects non-ASCII
        # str arguments, and a plain == leaks timing information.
        if hmac.compare_digest(submitted.encode("utf-8"), expected.encode("utf-8")):
            st.session_state["password_correct"] = True
            del st.session_state["password"]  # don't store password
        else:
            st.session_state["password_correct"] = False

    if st.session_state.get("password_correct", False):
        # Password correct.
        return True

    # First run or failed attempt: show the password input.
    st.text_input(
        "Password", type="password", on_change=password_entered, key="password"
    )
    if "password_correct" in st.session_state:
        # The key only exists after an attempt, so this attempt was wrong.
        st.error("😕 Password incorrect")
    return False
# Halt the script here until the password check passes.
if not check_password():
    st.stop()

st.title("📚 Text Grade Level Assignment")
st.markdown("Assign the grade level complexity of your text using quantitative metrics.")
# Cache the heavy model loading
@st.cache_resource(show_spinner="Loading spaCy model 'en_core_web_sm'...")
def load_spacy_model():
    """Load the spaCy English pipeline with the TextDescriptives components.

    No download is attempted here: ``en_core_web_sm`` must already be
    installed. Progress is reported through the cache spinner rather than a
    ``st.warning`` call, because elements emitted inside a cached function
    are replayed on later reruns.

    Returns:
        The ``en_core_web_sm`` pipeline with the ``textdescriptives/all``
        pipe added, or ``None`` if loading failed (the error is shown in
        the UI).
    """
    try:
        pipeline = spacy.load("en_core_web_sm")
        pipeline.add_pipe("textdescriptives/all")
        return pipeline
    except Exception as e:
        # UI boundary: report any loading failure to the user instead of
        # crashing the app; callers check for None.
        st.error(f"Error loading Spacy model: {e}")
        return None


nlp = load_spacy_model()
# Grade band mapping: band label -> ordinal position used by the model.
GRADE_BAND_ORDER = {
    "K-1": 0,
    "2-3": 1,
    "4-5": 2,
    "6-8": 3,
    "9-10": 4,
    "11-CCR": 5,
    "CCR+": 6,
}
# Ordinal position -> band label.
REVERSE_MAPPING = {v: k for k, v in GRADE_BAND_ORDER.items()}


def get_grade_level(predicted_order):
    """Turn a model-predicted grade band order into the grade band string.

    The prediction is rounded to the nearest integer and clamped to the
    range of orders present in ``REVERSE_MAPPING``.

    Args:
        predicted_order: Numeric prediction (int, float or NumPy scalar).

    Returns:
        str: The grade band label, or ``"Unknown"`` when the prediction is
        NaN or infinite and therefore cannot be rounded.
    """
    # round() raises on NaN/inf, so reject non-finite predictions first.
    if not math.isfinite(predicted_order):
        return "Unknown"
    lowest, highest = min(REVERSE_MAPPING), max(REVERSE_MAPPING)
    # Clamp the rounded prediction to the valid range of band orders.
    clamped = max(lowest, min(highest, round(predicted_order)))
    return REVERSE_MAPPING.get(clamped, "Unknown")
# Location of the pickled regression model: <two directories above this file>/models/
MODEL_PATH = Path(__file__).parents[1].joinpath("models", "grade_level_quant_regression_model.pkl")


@st.cache_resource
def load_regression_model():
    """Load the pickled regression model from ``MODEL_PATH``.

    Returns:
        The object stored in the model file, or ``None`` when the file does
        not exist or cannot be loaded (a load failure is shown in the UI).
    """
    model_file_present = os.path.exists(MODEL_PATH)
    if model_file_present:
        try:
            # NOTE(review): joblib.load unpickles the file; only load model
            # files from a trusted source.
            loaded_model = joblib.load(MODEL_PATH)
        except Exception as load_error:
            st.error(f"Error loading model file: {load_error}")
        else:
            return loaded_model
    return None


model = load_regression_model()
def clean_value(val, default=0.0):
    """Substitute ``default`` for a missing value.

    A value counts as missing when it is ``None`` or NaN; any other value
    is handed back unchanged.
    """
    is_missing = val is None
    if not is_missing and not math.isnan(val):
        return val
    return default
def analyze_text(text, nlp_model, regression_model):
    """
    Analyze text and return the predicted grade band with its metrics.

    Args:
        text: The text to analyze.
        nlp_model: Callable pipeline producing a document that
            ``td.extract_dict`` can read.
        regression_model: Object whose ``predict`` accepts a one-row
            DataFrame of the metrics.

    Returns: (grade_level, metrics_dict)
        - ``(None, None)`` when ``text`` is not a non-blank string.
        - ``("Error", {})`` when any step of the analysis raises.
        - ``(grade band label, metrics dict)`` otherwise.
    """
    if not isinstance(text, str) or not text.strip():
        return None, None

    # Output metric name -> key in the TextDescriptives stats dict. The dict
    # order is also the feature column order handed to the model
    # (presumably matching the columns it was fitted on; verify).
    metric_sources = {
        "FK_score": "flesch_kincaid_grade",
        "Gunning_fog": "gunning_fog",
        "Smog": "smog",
        "Lix": "lix",
        "Rix": "rix",
        "complexity_score_entropy": "entropy",
        "Sentence_Length": "sentence_length_mean",
    }

    try:
        # Process text
        doc = nlp_model(text)
        doc_stats = td.extract_dict(doc)[0]

        # Replace None/NaN before rounding: round(None, 2) raises TypeError.
        metrics = {
            name: round(clean_value(doc_stats[source]), 2)
            for name, source in metric_sources.items()
        }

        # Single-row frame with the metrics as named feature columns.
        features = pd.DataFrame([metrics], columns=list(metrics))

        # Predict
        raw_prediction = regression_model.predict(features)[0]
        return get_grade_level(raw_prediction), metrics
    except Exception:
        # Best-effort boundary: callers rely on the ("Error", {}) sentinel,
        # but the cause is logged rather than silently discarded.
        logging.getLogger(__name__).exception("Text analysis failed")
        return "Error", {}
# --- Sidebar for Batch Processing ---
with st.sidebar:
    st.title("Upload your csv file for batch processing")
    st.markdown("*!!! The CSV file must contain a column named **text**.*")
    uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

    # Batch processing needs an uploaded file plus both loaded models.
    if uploaded_file is not None and model is not None and nlp is not None:
        st.divider()
        st.header("Batch Processing Results")
        try:
            df = pd.read_csv(uploaded_file)
            if "text" not in df.columns:
                st.error("The CSV file must contain a column named 'text'.")
            elif df.empty:
                st.warning("The CSV file contains no rows to process.")
            elif st.button("Process CSV"):
                total_rows = len(df)
                progress_bar = st.progress(0, text="Processing rows...")
                records = []
                metric_rows = []

                # Count rows by position, not by index label, so the
                # progress fraction always stays within 0..1.
                for position, (_, row) in enumerate(df.iterrows(), start=1):
                    raw_text = row["text"]
                    if pd.isna(raw_text):
                        # Missing cell: do not score the literal string "nan".
                        grade, metrics = None, None
                    else:
                        grade, metrics = analyze_text(str(raw_text), nlp, model)

                    record = row.to_dict()
                    record["predicted_grade_level"] = grade if grade else "N/A"
                    records.append(record)
                    # An empty dict leaves this row's metric columns blank.
                    metric_rows.append(metrics if metrics else {})

                    # Update progress
                    progress_bar.progress(
                        position / total_rows,
                        text=f"Processing row {position}/{total_rows}",
                    )
                progress_bar.empty()

                # Create result DF: original columns + prediction + one
                # column per metric.
                result_df = pd.DataFrame(records)
                metrics_df = pd.DataFrame(metric_rows, index=result_df.index)
                final_df = pd.concat([result_df, metrics_df], axis=1)

                # Show first 5 rows
                st.subheader("Preview (First 5 Rows)")
                st.dataframe(final_df.head(5))

                # Download button
                csv = final_df.to_csv(index=False).encode('utf-8')
                st.download_button(
                    label="Download results as CSV",
                    data=csv,
                    file_name='grade_level_predictions.csv',
                    mime='text/csv',
                )
        except Exception as e:
            # UI boundary: surface parsing/processing failures to the user.
            st.error(f"Error processing CSV: {e}")
# --- Main Application Area ---
if model is None:
    if os.path.exists(MODEL_PATH):
        # The file is present, so the loader failed on it; do not claim
        # that it is missing.
        st.warning(f"⚠️ Model file at `{MODEL_PATH}` could not be loaded.")
    else:
        st.warning(f"⚠️ Model file not found at `{MODEL_PATH}`.")
        st.info("Please place your `grade_level_quant_regression_model.pkl` file in the `models` directory at the root of your project.")
else:
    # Input Area
    st.subheader("Single Text Analysis")
    text_input = st.text_area("Enter text to analyze:", height=200, placeholder="Paste your text here...")

    if st.button("Grade Level Prediction", type="primary"):
        if not text_input.strip():
            st.warning("Please enter some text first.")
        elif nlp is None:
            st.error("Text processing model (Spacy) is not available.")
        else:
            with st.spinner("Analyzing text complexity..."):
                grade_band, metrics = analyze_text(text_input, nlp, model)

            if grade_band == "Error":
                st.error("An error occurred during analysis. Please check your input text.")
            elif grade_band:
                # Output
                st.success(f"### Assigned Grade band based on Quant Metrics: **{grade_band}**")
                with st.expander("View Detailed Metrics"):
                    st.json(metrics)