Spaces:

gmu-czi
/

gls

Build error

gls / src /streamlit_app.py

Gary Mu

add textdescriptive app

d8aafab about 1 month ago

8.44 kB

	import streamlit as st
	import spacy
	import textdescriptives as td
	import pandas as pd
	import math
	import numpy as np
	import joblib
	import os
	from pathlib import Path

	# Page-level configuration: browser-tab title, icon, and wide layout.
	st.set_page_config(
	    page_title="Text Grade Level Assignment",
	    page_icon="📚",
	    layout="wide",
	)

	def check_password():
	    """Gate the app behind a shared password.

	    Renders a password input and validates what the user typed against the
	    expected password. The expected password is read from the
	    ``APP_PASSWORD`` environment variable and falls back to the value that
	    was previously hard-coded, so existing deployments keep working.

	    Returns:
	        bool: ``True`` once the correct password has been entered in this
	        session; ``False`` otherwise (the prompt, and an error after a
	        failed attempt, are rendered).
	    """
	    import hmac  # local import: only this function needs it

	    # NOTE(security): a password committed to source control is not a
	    # secret. Set APP_PASSWORD in the deployment environment; the literal
	    # below is kept only as a backward-compatible fallback.
	    expected = os.environ.get("APP_PASSWORD", "gradelevel")

	    def password_entered():
	        """Checks whether a password entered by the user is correct."""
	        entered = st.session_state["password"]
	        # Constant-time comparison; compare bytes because compare_digest
	        # rejects non-ASCII str arguments.
	        if hmac.compare_digest(entered.encode("utf-8"), expected.encode("utf-8")):
	            st.session_state["password_correct"] = True
	            del st.session_state["password"]  # don't store password
	        else:
	            st.session_state["password_correct"] = False

	    # Password already validated in this session.
	    if st.session_state.get("password_correct", False):
	        return True

	    # First run or failed attempt: show the prompt (once, for both cases).
	    st.text_input(
	        "Password", type="password", on_change=password_entered, key="password"
	    )
	    # The flag only exists after at least one attempt, and it is False here.
	    if "password_correct" in st.session_state:
	        st.error("😕 Password incorrect")
	    return False

	# Halt the script run here until the password check passes, so nothing
	# below is rendered for unauthenticated sessions.
	if not check_password():
	    st.stop()

	# Page heading and one-line description shown above the tools.
	st.title("📚 Text Grade Level Assignment")
	st.markdown("Assign the grade level complexity of your text using quantitative metrics.")

	# Cache the heavy model loading
	@st.cache_resource
	def load_spacy_model():
	    """Load the spaCy pipeline used for metric extraction.

	    Loads ``en_core_web_sm`` and appends the ``textdescriptives/all``
	    component. No download is attempted here; the model package must
	    already be installed in the environment.

	    Returns:
	        The configured pipeline, or ``None`` when loading fails (the error
	        is reported in the UI instead of being raised).
	    """
	    try:
	        pipeline = spacy.load("en_core_web_sm")
	        pipeline.add_pipe("textdescriptives/all")
	    except Exception as e:
	        # Broad on purpose: this is the UI boundary, and callers check for
	        # None rather than handling exceptions.
	        st.error(f"Error loading Spacy model: {e}")
	        return None
	    return pipeline


	nlp = load_spacy_model()

	# Grade band mapping: band label -> ordinal position (lowest band first).
	GRADE_BAND_ORDER = {
	    "K-1": 0,
	    "2-3": 1,
	    "4-5": 2,
	    "6-8": 3,
	    "9-10": 4,
	    "11-CCR": 5,
	    "CCR+": 6,
	}

	# Inverse lookup: ordinal position -> band label.
	REVERSE_MAPPING = {v: k for k, v in GRADE_BAND_ORDER.items()}


	def get_grade_level(predicted_order):
	    """Turn a predicted grade band order into the grade band label.

	    The prediction is clamped to the range of known band orders and then
	    rounded to the nearest integer with the built-in ``round``.

	    Args:
	        predicted_order: Numeric model output (int or float).

	    Returns:
	        str: The matching band label, or ``"Unknown"`` when the prediction
	        is ``None`` or NaN.
	    """
	    # round() raises on NaN, so report it as "Unknown" instead of crashing.
	    if predicted_order is None or math.isnan(predicted_order):
	        return "Unknown"

	    lowest = min(REVERSE_MAPPING)
	    highest = max(REVERSE_MAPPING)
	    # Clamp before rounding so +/-inf maps to the end bands rather than
	    # raising OverflowError inside round().
	    clamped = max(lowest, min(highest, predicted_order))
	    return REVERSE_MAPPING.get(round(clamped), "Unknown")

	# Location of the pickled regression model: <two levels above this file>/models/
	MODEL_PATH = Path(__file__).parents[1] / "models" / "grade_level_quant_regression_model.pkl"


	@st.cache_resource
	def load_regression_model():
	    """Load the grade-level regression model from ``MODEL_PATH``.

	    Returns:
	        The object deserialized by ``joblib.load``, or ``None`` when the
	        file does not exist or cannot be loaded (a load failure is also
	        reported in the UI).
	    """
	    if os.path.exists(MODEL_PATH):
	        try:
	            return joblib.load(MODEL_PATH)
	        except Exception as exc:
	            st.error(f"Error loading model file: {exc}")
	    return None


	model = load_regression_model()

	def clean_value(val, default=0.0):
	    """Substitute ``default`` for a missing value.

	    A value counts as missing when it is ``None`` or NaN; anything else is
	    returned unchanged.
	    """
	    is_missing = val is None or math.isnan(val)
	    return default if is_missing else val

	def analyze_text(text, nlp_model, regression_model):
	    """
	    Analyzes text and returns metrics and predicted grade level.

	    Args:
	        text: The text to analyze.
	        nlp_model: Callable pipeline whose result is passed to
	            ``td.extract_dict``.
	        regression_model: Object with a ``predict`` method that accepts a
	            one-row DataFrame of the metrics.

	    Returns: (grade_level, metrics_dict)
	        ``(None, None)`` when ``text`` is not a non-blank string;
	        ``("Error", {})`` when processing or prediction raises;
	        otherwise the grade band label and the rounded metrics.
	    """
	    import logging  # local import: only the error path needs it

	    if not isinstance(text, str) or not text.strip():
	        return None, None

	    # Metric name used by the model -> key in the textdescriptives output.
	    # The dict order is the feature column order passed to the model
	    # (presumably the order it was trained with; verify against training).
	    metric_sources = {
	        "FK_score": "flesch_kincaid_grade",
	        "Gunning_fog": "gunning_fog",
	        "Smog": "smog",
	        "Lix": "lix",
	        "Rix": "rix",
	        "complexity_score_entropy": "entropy",
	        "Sentence_Length": "sentence_length_mean",
	    }

	    try:
	        # Process text
	        doc = nlp_model(text)
	        doc_stats = td.extract_dict(doc)[0]

	        # Clean first, then round: round(None, 2) raises, so rounding first
	        # would bypass the None handling in clean_value.
	        metrics = {
	            name: round(clean_value(doc_stats[source]), 2)
	            for name, source in metric_sources.items()
	        }

	        # Single-row frame with the feature columns in a fixed order.
	        features = pd.DataFrame([metrics], columns=list(metric_sources))

	        # Predict
	        raw_prediction = regression_model.predict(features)[0]
	        return get_grade_level(raw_prediction), metrics

	    except Exception:
	        # Best-effort by design: callers get the "Error" sentinel, but the
	        # cause is logged instead of being silently discarded.
	        logging.getLogger(__name__).exception("Text analysis failed")
	        return "Error", {}

	# --- Sidebar for Batch Processing ---
	# NOTE(review): the original nesting of this section was lost; the results
	# are kept inside the sidebar here. Confirm against the intended layout.
	with st.sidebar:
	    st.title("Upload your csv file for batch processing")
	    st.markdown("!!! The CSV file must contain a column named text.")
	    uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

	    # Batch processing needs an upload plus both models.
	    if uploaded_file is not None and model is not None and nlp is not None:
	        st.divider()
	        st.header("Batch Processing Results")
	        try:
	            df = pd.read_csv(uploaded_file)
	            if "text" not in df.columns:
	                st.error("The CSV file must contain a column named 'text'.")
	            elif st.button("Process CSV"):
	                progress_bar = st.progress(0, text="Processing rows...")
	                total_rows = len(df)
	                records = []      # original columns + predicted grade
	                metric_rows = []  # one metrics dict per row ({} if none)

	                # Count positions with enumerate: the index label is not
	                # guaranteed to be a 0-based position.
	                for position, (_, row) in enumerate(df.iterrows(), start=1):
	                    raw_text = row["text"]
	                    if pd.isna(raw_text):
	                        # Empty cell: str(NaN) would be graded as the
	                        # literal text "nan", so treat it as missing.
	                        grade, metrics = None, None
	                    else:
	                        grade, metrics = analyze_text(str(raw_text), nlp, model)

	                    record = row.to_dict()
	                    record["predicted_grade_level"] = grade if grade else "N/A"
	                    records.append(record)
	                    metric_rows.append(metrics if metrics else {})

	                    # Update progress
	                    progress_bar.progress(
	                        position / total_rows,
	                        text=f"Processing row {position}/{total_rows}",
	                    )

	                progress_bar.empty()

	                # Build the metric columns straight from the dicts; rows
	                # without metrics get NaN instead of a stray "0" column.
	                final_df = pd.concat(
	                    [pd.DataFrame(records), pd.DataFrame(metric_rows)],
	                    axis=1,
	                )

	                # Show first 5 rows
	                st.subheader("Preview (First 5 Rows)")
	                st.dataframe(final_df.head(5))

	                # Download button
	                csv = final_df.to_csv(index=False).encode('utf-8')
	                st.download_button(
	                    label="Download results as CSV",
	                    data=csv,
	                    file_name='grade_level_predictions.csv',
	                    mime='text/csv',
	                )
	        except Exception as e:
	            st.error(f"Error processing CSV: {e}")


	# --- Main Application Area ---

	if model is not None:
	    # Single-text workflow: text box plus a button that triggers analysis.
	    st.subheader("Single Text Analysis")
	    text_input = st.text_area("Enter text to analyze:", height=200, placeholder="Paste your text here...")
	    predict_clicked = st.button("Grade Level Prediction", type="primary")

	    if predict_clicked:
	        if not text_input.strip():
	            st.warning("Please enter some text first.")
	        elif nlp is None:
	            st.error("Text processing model (Spacy) is not available.")
	        else:
	            with st.spinner("Analyzing text complexity..."):
	                grade_band, metrics = analyze_text(text_input, nlp, model)

	            if grade_band == "Error":
	                st.error("An error occurred during analysis. Please check your input text.")
	            elif grade_band:
	                # Show the assigned band, with the raw metrics on demand.
	                st.success(f"### Assigned Grade band based on Quant Metrics: {grade_band}")

	                with st.expander("View Detailed Metrics"):
	                    st.json(metrics)
	else:
	    # No regression model available: explain where the file is expected.
	    st.warning(f"⚠️ Model file not found at `{MODEL_PATH}`.")
	    st.info("Please place your `grade_level_quant_regression_model.pkl` file in the `models` directory at the root of your project.")