Gary Mu committed on
Commit
d8aafab
·
1 Parent(s): 43da35e

add textdescriptive app

Browse files
.DS_Store ADDED
Binary file (8.2 kB). View file
 
models/grade_level_quant_regression_model.pkl ADDED
Binary file (1.11 kB). View file
 
requirements.txt CHANGED
@@ -1,3 +1,11 @@
1
- altair
 
 
 
2
  pandas
3
- streamlit
 
 
 
 
 
 
1
+ altair<5
2
+ joblib
3
+ matplotlib
4
+ numpy<2
5
  pandas
6
+ python-dotenv
7
+ scikit-learn==1.2.2
8
+ spacy
9
+ streamlit
10
+ textdescriptives
11
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
src/streamlit_app.py CHANGED
@@ -1,40 +1,232 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
1
  import streamlit as st
2
+ import spacy
3
+ import textdescriptives as td
4
+ import pandas as pd
5
+ import math
6
+ import numpy as np
7
+ import joblib
8
+ import os
9
+ from pathlib import Path
10
+
11
# Browser-tab title/icon and wide layout for the whole app.
st.set_page_config(
    layout="wide",
    page_icon="📚",
    page_title="Text Grade Level Assignment",
)
13
+
14
def check_password():
    """Gate the app behind a shared password.

    Renders a password field until the correct password has been entered
    during the current session.

    The expected password is read from the ``APP_PASSWORD`` environment
    variable when it is set and non-empty, so the secret does not have to
    live in source control; otherwise the legacy built-in value is used so
    existing deployments keep working.

    Returns:
        bool: ``True`` once the correct password has been entered,
        ``False`` while the prompt is still being shown.
    """
    import hmac  # stdlib; imported locally because only this check needs it

    expected_password = os.environ.get("APP_PASSWORD") or "gradelevel"

    def password_entered():
        """Record in session state whether the submitted password matches."""
        submitted = st.session_state["password"]
        # compare_digest avoids leaking how many leading characters matched;
        # bytes are compared because the str form only accepts ASCII.
        matches = hmac.compare_digest(
            submitted.encode("utf-8"), expected_password.encode("utf-8")
        )
        st.session_state["password_correct"] = matches
        if matches:
            del st.session_state["password"]  # don't store password

    if st.session_state.get("password_correct", False):
        # Password correct.
        return True

    # Not authenticated yet: show the prompt (a single definition serves both
    # the first visit and the retry after a wrong attempt).
    st.text_input(
        "Password", type="password", on_change=password_entered, key="password"
    )
    if "password_correct" in st.session_state:
        # The key only exists after an attempt, so this is a failed attempt.
        st.error("😕 Password incorrect")
    return False
41
+
42
# Halt this script run until the visitor has entered the password.
is_authenticated = check_password()
if not is_authenticated:
    st.stop()

# Page heading and one-line description of the tool.
st.title("📚 Text Grade Level Assignment")
st.markdown(
    "Assign the grade level complexity of your text using quantitative metrics."
)
47
+
48
# Cache the heavy model loading so the pipeline is not rebuilt on every rerun.
@st.cache_resource
def load_spacy_model():
    """Load the spaCy pipeline with the textdescriptives components attached.

    Returns:
        The ``en_core_web_sm`` pipeline with the ``"textdescriptives/all"``
        component added, or ``None`` if loading failed (in which case an
        error message is shown in the UI).
    """
    # spacy.load() only loads an already-installed package; nothing is
    # downloaded here, so no "Downloading..." notice is shown.
    try:
        pipeline = spacy.load("en_core_web_sm")
        pipeline.add_pipe("textdescriptives/all")
    except Exception as e:
        # UI boundary: report the failure and let callers handle ``None``
        # instead of crashing the whole app.
        st.error(f"Error loading Spacy model: {e}")
        return None
    return pipeline


nlp = load_spacy_model()
63
+
64
# Grade band mapping: band label -> ordinal position of the band.
GRADE_BAND_ORDER = {
    "K-1": 0,
    "2-3": 1,
    "4-5": 2,
    "6-8": 3,
    "9-10": 4,
    "11-CCR": 5,
    "CCR+": 6,
}

# Inverse lookup: ordinal position -> band label.
REVERSE_MAPPING = {v: k for k, v in GRADE_BAND_ORDER.items()}


def get_grade_level(predicted_order):
    """Turn a model-predicted grade band order into the grade level string.

    Args:
        predicted_order: Numeric prediction; it is rounded to the nearest
            integer (built-in ``round``) and clamped to the range of known
            band positions.

    Returns:
        str: The matching band label, or ``"Unknown"`` when the prediction
        is not a finite number (NaN or infinity).
    """
    value = float(predicted_order)
    if not math.isfinite(value):
        # round() raises on NaN/inf; report it as unknown instead.
        return "Unknown"

    # Derive the clamp bounds from the mapping so they cannot drift apart
    # from the band table above.
    lowest, highest = min(REVERSE_MAPPING), max(REVERSE_MAPPING)
    nearest = max(lowest, min(highest, round(value)))
    return REVERSE_MAPPING.get(nearest, "Unknown")
82
+
83
# Pickled regression model, located in the `models` directory two levels
# above this file.
MODEL_PATH = Path(__file__).parent.parent.joinpath(
    "models", "grade_level_quant_regression_model.pkl"
)


@st.cache_resource
def load_regression_model():
    """Load the regression model stored at ``MODEL_PATH``.

    Returns:
        The object deserialized by ``joblib.load``, or ``None`` when the
        file does not exist or cannot be loaded (a load failure is also
        reported in the UI).
    """
    model_file_present = os.path.exists(MODEL_PATH)
    if model_file_present:
        try:
            # joblib.load unpickles the file, so it must come from a
            # trusted source.
            loaded_model = joblib.load(MODEL_PATH)
        except Exception as e:
            st.error(f"Error loading model file: {e}")
            loaded_model = None
        return loaded_model
    return None


model = load_regression_model()
97
+
98
def clean_value(val, default=0.0):
    """Return ``default`` if ``val`` is ``None`` or NaN, otherwise ``val``.

    Args:
        val: The value to check; may be any object.
        default: Replacement returned for missing values.

    Returns:
        ``default`` for ``None``/NaN input, otherwise ``val`` unchanged.
        Values that are not real numbers (for example strings) cannot be
        NaN and are therefore returned as-is instead of raising.
    """
    if val is None:
        return default
    try:
        is_nan = math.isnan(val)
    except (TypeError, OverflowError):
        # Not convertible to a float, hence not NaN: keep the value.
        return val
    return default if is_nan else val
103
+
104
def analyze_text(text, nlp_model, regression_model):
    """
    Analyzes text and returns metrics and predicted grade level.

    Args:
        text: The text to analyze.
        nlp_model: Callable that turns ``text`` into a document accepted by
            ``td.extract_dict``.
        regression_model: Object whose ``predict`` accepts a one-row
            DataFrame of the metrics and returns a sequence of predictions.

    Returns: (grade_level, metrics_dict)
        ``(None, None)`` when ``text`` is not a string or is blank;
        ``("Error", {})`` when analysis or prediction fails.
    """
    if not isinstance(text, str) or not text.strip():
        return None, None

    # Output metric name -> key in the extracted stats dict. The insertion
    # order is also the feature column order passed to the model.
    metric_sources = {
        "FK_score": "flesch_kincaid_grade",
        "Gunning_fog": "gunning_fog",
        "Smog": "smog",
        "Lix": "lix",
        "Rix": "rix",
        "complexity_score_entropy": "entropy",
        "Sentence_Length": "sentence_length_mean",
    }

    try:
        # Process text
        doc = nlp_model(text)
        doc_stats = td.extract_dict(doc)[0]

        # Replace missing values before rounding: round(None, 2) raises
        # TypeError, which would turn one missing stat into a failed analysis.
        metrics = {
            name: round(clean_value(doc_stats[key]), 2)
            for name, key in metric_sources.items()
        }

        # Single-row frame whose columns follow the metric order above.
        features = pd.DataFrame([metrics], columns=list(metrics))

        # Predict
        raw_prediction = regression_model.predict(features)[0]
        return get_grade_level(raw_prediction), metrics

    except Exception:
        # Best-effort boundary: callers rely on the ("Error", {}) sentinel,
        # but the cause is logged so failures can be diagnosed.
        import logging

        logging.getLogger(__name__).exception("Text analysis failed")
        return "Error", {}
144
+
145
# --- Sidebar for Batch Processing ---
# Upload widget for the batch workflow; the CSV needs a `text` column.
with st.sidebar:
    st.title("Upload your csv file for batch processing")
    st.markdown(
        "*!!! The CSV file must contain a column named **text**.*"
    )
    uploaded_file = st.file_uploader(
        "Upload CSV",
        type=["csv"],
    )
150
+
151
# Batch processing: available once a CSV is uploaded and both models loaded.
# Rows are only processed after the user presses "Process CSV".
# NOTE(review): the indentation of this section was lost in the view under
# review, so it is unclear whether it was nested under `with st.sidebar:`;
# confirm where the results are meant to render.
if uploaded_file is not None and model is not None and nlp is not None:
    st.divider()
    st.header("Batch Processing Results")
    try:
        df = pd.read_csv(uploaded_file)
        if "text" not in df.columns:
            st.error("The CSV file must contain a column named 'text'.")
        elif df.empty:
            # Nothing to analyse; say so instead of failing further down.
            st.warning("The CSV file contains no rows.")
        elif st.button("Process CSV"):
            progress_bar = st.progress(0, text="Processing rows...")
            results = []
            total_rows = len(df)

            # enumerate() yields a 1-based position that does not depend on
            # the DataFrame's index labels.
            for position, (_, row) in enumerate(df.iterrows(), start=1):
                raw_text = row["text"]
                # Empty cells are read as NaN; str(NaN) would be analysed as
                # the literal word "nan", so treat them as missing text.
                text = None if pd.isna(raw_text) else str(raw_text)
                grade, metrics = analyze_text(text, nlp, model)

                row_result = row.to_dict()
                row_result["predicted_grade_level"] = grade if grade else "N/A"
                # Flatten the metrics into the row. Rows without metrics
                # simply get empty cells in the metric columns.
                row_result.update(metrics or {})
                results.append(row_result)

                # Update progress
                progress_bar.progress(
                    position / total_rows,
                    text=f"Processing row {position}/{total_rows}",
                )

            progress_bar.empty()

            # Create result DF: input columns, prediction, then metrics.
            final_df = pd.DataFrame(results)

            # Show first 5 rows
            st.subheader("Preview (First 5 Rows)")
            st.dataframe(final_df.head(5))

            # Download button
            csv = final_df.to_csv(index=False).encode('utf-8')
            st.download_button(
                label="Download results as CSV",
                data=csv,
                file_name='grade_level_predictions.csv',
                mime='text/csv',
            )
    except Exception as e:
        # UI boundary: surface any read/processing failure to the user.
        st.error(f"Error processing CSV: {e}")
203
+
204
+
205
# --- Main Application Area ---

if model is not None:
    # Single-text workflow: a text box plus a button that runs the analysis.
    st.subheader("Single Text Analysis")
    text_input = st.text_area(
        "Enter text to analyze:",
        height=200,
        placeholder="Paste your text here...",
    )
    predict_clicked = st.button("Grade Level Prediction", type="primary")

    if predict_clicked:
        if not text_input.strip():
            st.warning("Please enter some text first.")
        elif nlp is None:
            st.error("Text processing model (Spacy) is not available.")
        else:
            with st.spinner("Analyzing text complexity..."):
                grade_band, metrics = analyze_text(text_input, nlp, model)

                if grade_band == "Error":
                    st.error("An error occurred during analysis. Please check your input text.")
                elif grade_band:
                    # Headline result, with the raw metrics tucked away in
                    # an expander.
                    st.success(f"### Assigned Grade band based on Quant Metrics: **{grade_band}**")

                    with st.expander("View Detailed Metrics"):
                        st.json(metrics)
else:
    # Without the regression model nothing can be predicted; explain how to
    # provide it.
    st.warning(f"⚠️ Model file not found at `{MODEL_PATH}`.")
    st.info("Please place your `grade_level_quant_regression_model.pkl` file in the `models` directory at the root of your project.")