Spaces:

gmu-czi
/

gls

Build error

App Files Files Community

Gary Mu committed on Jan 26

Commit

d8aafab

1 Parent(s): 43da35e

add textdescriptive app

Browse files

Files changed (4) hide show

.DS_Store +0 -0
models/grade_level_quant_regression_model.pkl +0 -0
requirements.txt +10 -2
src/streamlit_app.py +230 -38

.DS_Store ADDED Viewed

Binary file (8.2 kB). View file

models/grade_level_quant_regression_model.pkl ADDED Viewed

Binary file (1.11 kB). View file

requirements.txt CHANGED Viewed

@@ -1,3 +1,11 @@
-altair
 pandas
-streamlit

+altair<5
+joblib
+matplotlib
+numpy<2
 pandas
+python-dotenv
+scikit-learn==1.2.2
+spacy
+streamlit
+textdescriptives
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,232 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+import spacy
+import textdescriptives as td
+import pandas as pd
+import math
+import numpy as np
+import joblib
+import os
+from pathlib import Path
+# Configure the browser tab title, icon and layout; this is the first Streamlit call in the script.
+st.set_page_config(
+    page_title="Text Grade Level Assignment",
+    page_icon="📚",
+    layout="wide",
+)
+def check_password():
+    """Gate the app behind a shared password.
+    Shows a password input until a correct password has been submitted in
+    this session. The expected password is read from the ``APP_PASSWORD``
+    environment variable; the previously hard-coded value is kept as the
+    fallback so existing deployments keep working.
+    Returns:
+        bool: ``True`` once the session holds an accepted password, ``False``
+        while the prompt (and, after a failed attempt, an error) is shown.
+    """
+    import hmac  # local import: only needed for the constant-time comparison
+    # NOTE(review): the fallback below is readable by anyone with access to the
+    # source; set APP_PASSWORD in the deployment and remove the fallback.
+    expected_password = os.environ.get("APP_PASSWORD", "gradelevel")
+    def password_entered():
+        """Check the submitted password and record the outcome in the session."""
+        submitted = st.session_state["password"]
+        # compare_digest does not short-circuit on the first mismatching byte.
+        if hmac.compare_digest(submitted.encode("utf-8"), expected_password.encode("utf-8")):
+            st.session_state["password_correct"] = True
+            del st.session_state["password"]  # don't store password
+        else:
+            st.session_state["password_correct"] = False
+    if st.session_state.get("password_correct", False):
+        # Password correct.
+        return True
+    # First run or failed attempt: show the prompt (again).
+    st.text_input(
+        "Password", type="password", on_change=password_entered, key="password"
+    )
+    if "password_correct" in st.session_state:
+        # A previous attempt was recorded and it was wrong.
+        st.error("😕 Password incorrect")
+    return False
+# Halt this script run here until check_password() reports success.
+if not check_password():
+    st.stop()
+# Page heading and a one-line description of what the app does.
+st.title("📚 Text Grade Level Assignment")
+st.markdown("Assign the grade level complexity of your text using quantitative metrics.")
+# Cache the heavy model loading
+@st.cache_resource
+def load_spacy_model():
+    """Load the spaCy English pipeline with the TextDescriptives component.
+    Returns:
+        The ``en_core_web_sm`` pipeline with ``textdescriptives/all`` added,
+        or ``None`` when loading fails (the error is shown in the UI).
+    """
+    try:
+        # The model is only loaded here, never downloaded at runtime, so no
+        # "downloading" notice is shown to the user.
+        pipeline = spacy.load("en_core_web_sm")
+        pipeline.add_pipe("textdescriptives/all")
+        return pipeline
+    except Exception as e:
+        # UI boundary: report the failure instead of crashing the page.
+        st.error(f"Error loading Spacy model: {e}")
+        return None
+nlp = load_spacy_model()
+# Grade band mapping: band label -> ordinal position used by the regression model.
+GRADE_BAND_ORDER = {
+    "K-1": 0,
+    "2-3": 1,
+    "4-5": 2,
+    "6-8": 3,
+    "9-10": 4,
+    "11-CCR": 5,
+    "CCR+": 6,
+}
+# Ordinal position -> band label.
+REVERSE_MAPPING = {v: k for k, v in GRADE_BAND_ORDER.items()}
+def get_grade_level(predicted_order):
+    """Turn the model's predicted grade band order into the grade band string.
+    Args:
+        predicted_order: Numeric prediction; values outside the valid range
+            are clamped to the lowest/highest band.
+    Returns:
+        str: The band label, or ``"Unknown"`` when the prediction is ``None``
+        or NaN.
+    """
+    if predicted_order is None or math.isnan(predicted_order):
+        return "Unknown"
+    highest_order = max(REVERSE_MAPPING)
+    # Clamp before rounding: round() raises OverflowError on +/-inf.
+    clamped = max(0, min(highest_order, predicted_order))
+    return REVERSE_MAPPING.get(round(clamped), "Unknown")
+# Location of the pickled regression model: <parent of this file's directory>/models/
+MODEL_PATH = Path(__file__).parent.parent / "models" / "grade_level_quant_regression_model.pkl"
+@st.cache_resource
+def load_regression_model():
+    """Load the regression model from ``MODEL_PATH``.
+    Returns:
+        The object deserialised by joblib, or ``None`` when the file does not
+        exist or cannot be loaded (a load failure is shown in the UI).
+    """
+    model_file_present = os.path.exists(MODEL_PATH)
+    if model_file_present:
+        try:
+            loaded_model = joblib.load(MODEL_PATH)
+        except Exception as load_error:
+            st.error(f"Error loading model file: {load_error}")
+        else:
+            return loaded_model
+    return None
+model = load_regression_model()
+def clean_value(val, default=0.0):
+    """Substitute ``default`` for a missing value (``None`` or NaN); pass anything else through."""
+    is_missing = val is None or math.isnan(val)
+    return default if is_missing else val
+def analyze_text(text, nlp_model, regression_model):
+    """Compute readability metrics for a text and predict its grade band.
+    Args:
+        text: The text to analyse.
+        nlp_model: Callable pipeline whose output ``td.extract_dict`` can read.
+        regression_model: Object with a ``predict`` method that accepts a
+            one-row DataFrame of the metrics.
+    Returns:
+        tuple: ``(grade_band, metrics)``. ``(None, None)`` for empty,
+        whitespace-only or non-string input; ``("Error", {})`` when
+        processing or prediction fails.
+    """
+    import logging  # local import: used only to record failures
+    if not isinstance(text, str) or not text.strip():
+        return None, None
+    # Model feature name -> TextDescriptives key. The dict order is the
+    # column order of the frame handed to the model.
+    feature_sources = {
+        "FK_score": "flesch_kincaid_grade",
+        "Gunning_fog": "gunning_fog",
+        "Smog": "smog",
+        "Lix": "lix",
+        "Rix": "rix",
+        "complexity_score_entropy": "entropy",
+        "Sentence_Length": "sentence_length_mean",
+    }
+    try:
+        # Process text
+        doc = nlp_model(text)
+        doc_stats = td.extract_dict(doc)[0]
+        # Replace missing values before rounding: round(None, 2) raises
+        # TypeError, which used to turn the whole text into an "Error".
+        metrics = {
+            name: round(clean_value(doc_stats[key]), 2)
+            for name, key in feature_sources.items()
+        }
+        # Single-row frame with the columns in model feature order.
+        new_data_processed = pd.DataFrame([metrics], columns=list(feature_sources))
+        raw_prediction = regression_model.predict(new_data_processed)[0]
+        return get_grade_level(raw_prediction), metrics
+    except Exception:
+        # Callers rely on the "Error" sentinel, so do not re-raise; keep the
+        # traceback in the log instead of discarding it.
+        logging.getLogger(__name__).exception("Text analysis failed")
+        return "Error", {}
+# --- Sidebar for Batch Processing ---
+with st.sidebar:
+    st.title("Upload your csv file for batch processing")
+    st.markdown("*!!! The CSV file must contain a column named **text**.*")
+    uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
+# Batch results are rendered only when a file is uploaded and both models loaded.
+if uploaded_file is not None and model is not None and nlp is not None:
+    st.divider()
+    st.header("Batch Processing Results")
+    try:
+        df = pd.read_csv(uploaded_file)
+        if "text" not in df.columns:
+            st.error("The CSV file must contain a column named 'text'.")
+        elif st.button("Process CSV"):
+            progress_bar = st.progress(0, text="Processing rows...")
+            total_rows = len(df)
+            input_rows = []
+            metric_rows = []
+            # Count rows positionally: index labels need not be 0..n-1.
+            for position, (_, row) in enumerate(df.iterrows(), start=1):
+                raw_text = row["text"]
+                if pd.isna(raw_text):
+                    # str(NaN) would otherwise be graded as the literal text "nan".
+                    grade, metrics = None, None
+                else:
+                    grade, metrics = analyze_text(str(raw_text), nlp, model)
+                row_result = row.to_dict()
+                row_result["predicted_grade_level"] = grade if grade else "N/A"
+                input_rows.append(row_result)
+                # Always a dict, so rows without metrics get empty cells rather
+                # than a stray extra column.
+                metric_rows.append(metrics if metrics else {})
+                # Update progress
+                progress_bar.progress(position / total_rows, text=f"Processing row {position}/{total_rows}")
+            progress_bar.empty()
+            # Original columns + prediction, followed by one column per metric.
+            final_df = pd.concat(
+                [pd.DataFrame(input_rows), pd.DataFrame(metric_rows)],
+                axis=1,
+            )
+            # Show first 5 rows
+            st.subheader("Preview (First 5 Rows)")
+            st.dataframe(final_df.head(5))
+            # Download button
+            csv = final_df.to_csv(index=False).encode('utf-8')
+            st.download_button(
+                label="Download results as CSV",
+                data=csv,
+                file_name='grade_level_predictions.csv',
+                mime='text/csv',
+            )
+    except Exception as e:
+        # UI boundary: surface read/processing failures to the user.
+        st.error(f"Error processing CSV: {e}")
+# --- Main Application Area ---
+if model is not None:
+    # Single-text workflow: text box plus a button that triggers the analysis.
+    st.subheader("Single Text Analysis")
+    text_input = st.text_area(
+        "Enter text to analyze:",
+        height=200,
+        placeholder="Paste your text here...",
+    )
+    if st.button("Grade Level Prediction", type="primary"):
+        if text_input.strip() and nlp is not None:
+            with st.spinner("Analyzing text complexity..."):
+                grade_band, metrics = analyze_text(text_input, nlp, model)
+                if grade_band == "Error":
+                    st.error("An error occurred during analysis. Please check your input text.")
+                elif grade_band:
+                    # Show the predicted band, with the raw metrics behind an expander.
+                    st.success(f"### Assigned Grade band based on Quant Metrics: **{grade_band}**")
+                    with st.expander("View Detailed Metrics"):
+                        st.json(metrics)
+        elif not text_input.strip():
+            # Empty input is reported first, whether or not the NLP model loaded.
+            st.warning("Please enter some text first.")
+        else:
+            st.error("Text processing model (Spacy) is not available.")
+else:
+    # No regression model was loaded: tell the user where the file is expected.
+    st.warning(f"⚠️ Model file not found at `{MODEL_PATH}`.")
+    st.info("Please place your `grade_level_quant_regression_model.pkl` file in the `models` directory at the root of your project.")