lynn-twinkl
committed on
Commit
·
164201a
1
Parent(s):
6c1f317
Added min max scaler to index and explainer for index
Browse files
app.py
CHANGED
|
@@ -6,27 +6,29 @@ import streamlit as st
|
|
| 6 |
import pandas as pd
|
| 7 |
from io import BytesIO
|
| 8 |
|
| 9 |
-
#
|
| 10 |
|
| 11 |
from functions.extract_usage import extract_usage
|
| 12 |
-
from functions.necessity_index import compute_necessity
|
| 13 |
from functions.column_detection import detect_freeform_answer_col
|
| 14 |
import typing
|
| 15 |
|
| 16 |
-
#
|
|
|
|
| 17 |
@st.cache_data
|
| 18 |
def load_and_process(raw_csv: bytes) -> typing.Tuple[pd.DataFrame, str]:
|
| 19 |
"""
|
| 20 |
Load CSV from raw bytes, detect freeform column, compute necessity scores,
|
| 21 |
and extract usage items. Returns processed DataFrame and freeform column name.
|
| 22 |
"""
|
| 23 |
-
# Read
|
| 24 |
df_orig = pd.read_csv(BytesIO(raw_csv))
|
| 25 |
-
# Detect
|
| 26 |
freeform_col = detect_freeform_answer_col(df_orig)
|
| 27 |
# Compute necessity scores
|
| 28 |
scored = df_orig.join(df_orig[freeform_col].apply(compute_necessity))
|
| 29 |
-
|
|
|
|
| 30 |
docs = df_orig[freeform_col].to_list()
|
| 31 |
usage = extract_usage(docs)
|
| 32 |
scored['Usage'] = usage
|
|
@@ -41,10 +43,9 @@ st.title("Grant Applications Helper")
|
|
| 41 |
uploaded_file = st.file_uploader("Upload grant applications file for analysis", type='csv')
|
| 42 |
|
| 43 |
if uploaded_file is not None:
|
| 44 |
-
# Read raw bytes for caching and repeated use
|
| 45 |
raw = uploaded_file.read()
|
| 46 |
|
| 47 |
-
# --- Original Data Preview ---
|
| 48 |
st.markdown("""
|
| 49 |
### Data Preview
|
| 50 |
Here's the data you uploaded!
|
|
@@ -53,12 +54,15 @@ if uploaded_file is not None:
|
|
| 53 |
df_orig = pd.read_csv(BytesIO(raw))
|
| 54 |
st.dataframe(df_orig)
|
| 55 |
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
| 57 |
df, freeform_col = load_and_process(raw)
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
min_idx = float(df['necessity_index'].min())
|
| 63 |
max_idx = float(df['necessity_index'].max())
|
| 64 |
filter_range = st.sidebar.slider(
|
|
@@ -70,14 +74,47 @@ if uploaded_file is not None:
|
|
| 70 |
st.sidebar.markdown(f"**Total Applications:** {len(df)}")
|
| 71 |
st.sidebar.markdown(f"**Filtered Applications:** {len(filtered_df)}")
|
| 72 |
|
| 73 |
-
|
|
|
|
|
|
|
| 74 |
st.subheader("Necessity Index Distribution")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
st.bar_chart(df['necessity_index'])
|
| 76 |
|
| 77 |
# Review applications
|
| 78 |
st.subheader("Applications")
|
| 79 |
for idx, row in filtered_df.iterrows():
|
| 80 |
-
with st.expander(f"Application {idx} | Necessity: {row['necessity_index']:.1f}"):
|
| 81 |
col1, col2 = st.columns((1, 3))
|
| 82 |
col1.metric("Necessity", f"{row['necessity_index']:.1f}")
|
| 83 |
col1.metric("Urgency", f"{row['urgency_score']}")
|
|
@@ -85,13 +122,13 @@ if uploaded_file is not None:
|
|
| 85 |
col1.metric("Vulnerability", f"{row['vulnerability_score']}")
|
| 86 |
# Clean usage items
|
| 87 |
usage_items = [item for item in row['Usage'] if item and item.lower() != 'none']
|
|
|
|
|
|
|
| 88 |
if usage_items:
|
| 89 |
-
col2.markdown("**
|
| 90 |
col2.write(", ".join(usage_items))
|
| 91 |
else:
|
| 92 |
col2.markdown("*No specific usage items extracted.*")
|
| 93 |
-
col2.markdown("**Excerpt:**")
|
| 94 |
-
col2.write(row[freeform_col])
|
| 95 |
# Shortlist checkbox
|
| 96 |
st.checkbox(
|
| 97 |
"Shortlist this application",
|
|
|
|
| 6 |
import pandas as pd
|
| 7 |
from io import BytesIO
|
| 8 |
|
| 9 |
+
# ---- FUNCTIONS ----
|
| 10 |
|
| 11 |
from functions.extract_usage import extract_usage
|
| 12 |
+
from functions.necessity_index import compute_necessity, index_scaler
|
| 13 |
from functions.column_detection import detect_freeform_answer_col
|
| 14 |
import typing
|
| 15 |
|
| 16 |
+
# ---- CACHEABLE PROCESSING ----
|
| 17 |
+
|
| 18 |
@st.cache_data
|
| 19 |
def load_and_process(raw_csv: bytes) -> typing.Tuple[pd.DataFrame, str]:
|
| 20 |
"""
|
| 21 |
Load CSV from raw bytes, detect freeform column, compute necessity scores,
|
| 22 |
and extract usage items. Returns processed DataFrame and freeform column name.
|
| 23 |
"""
|
| 24 |
+
# Read uploaded data
|
| 25 |
df_orig = pd.read_csv(BytesIO(raw_csv))
|
| 26 |
+
# Detect freeform column
|
| 27 |
freeform_col = detect_freeform_answer_col(df_orig)
|
| 28 |
# Compute necessity scores
|
| 29 |
scored = df_orig.join(df_orig[freeform_col].apply(compute_necessity))
|
| 30 |
+
scored['necessity_index'] = index_scaler(scored['necessity_index'].values)
|
| 31 |
+
# LangChain function for extracting usage
|
| 32 |
docs = df_orig[freeform_col].to_list()
|
| 33 |
usage = extract_usage(docs)
|
| 34 |
scored['Usage'] = usage
|
|
|
|
| 43 |
uploaded_file = st.file_uploader("Upload grant applications file for analysis", type='csv')
|
| 44 |
|
| 45 |
if uploaded_file is not None:
|
| 46 |
+
# Read raw bytes for caching and repeated use --> this ensures all the processing isn't repeated when a user changes the filters
|
| 47 |
raw = uploaded_file.read()
|
| 48 |
|
|
|
|
| 49 |
st.markdown("""
|
| 50 |
### Data Preview
|
| 51 |
Here's the data you uploaded!
|
|
|
|
| 54 |
df_orig = pd.read_csv(BytesIO(raw))
|
| 55 |
st.dataframe(df_orig)
|
| 56 |
|
| 57 |
+
st.divider()
|
| 58 |
+
|
| 59 |
+
## ---- PROCESSED DATA (CACHED) ----
|
| 60 |
+
|
| 61 |
df, freeform_col = load_and_process(raw)
|
| 62 |
|
| 63 |
+
## ---- INTERACTIVE FILTERING & REVIEW INTERFACE ----
|
| 64 |
+
|
| 65 |
+
st.sidebar.title("Filters")
|
| 66 |
min_idx = float(df['necessity_index'].min())
|
| 67 |
max_idx = float(df['necessity_index'].max())
|
| 68 |
filter_range = st.sidebar.slider(
|
|
|
|
| 74 |
st.sidebar.markdown(f"**Total Applications:** {len(df)}")
|
| 75 |
st.sidebar.markdown(f"**Filtered Applications:** {len(filtered_df)}")
|
| 76 |
|
| 77 |
+
## ---- NECESSITY INDEX CHART ----
|
| 78 |
+
|
| 79 |
+
st.header("Processed Applications")
|
| 80 |
st.subheader("Necessity Index Distribution")
|
| 81 |
+
|
| 82 |
+
with st.expander("Learn about our indexing algorithm", icon='🌱'):
|
| 83 |
+
st.markdown(
|
| 84 |
+
"""
|
| 85 |
+
This algorithm is designed to help us evaluate grant applications by assigning a "necessity score" to each application. Here's how it works:
|
| 86 |
+
|
| 87 |
+
#### Lexicon
|
| 88 |
+
|
| 89 |
+
**Keywords:** The algorithm looks for specific keywords in the application text. These keywords are grouped into five main categories:
|
| 90 |
+
- **Urgency:** Words like "urgent" and "immediate" that express pressing needs.
|
| 91 |
+
- **Severity:** Words like "severe" and "desperate" that indicate serious issues.
|
| 92 |
+
- **Vulnerability:** Words like "disability", "SEND", "underserved" that represent at-risk groups.
|
| 93 |
+
- **Emotional Appeal:** Words like "hope", "committed", "passionate", and "love" that communicate an emotional connection to the mission of each application.
|
| 94 |
+
- **Superlatives:** Words like "completely", "massive", and "significantly" that can serve as proxies for passionate writing.
|
| 95 |
+
|
| 96 |
+
#### Weights
|
| 97 |
+
|
| 98 |
+
Each category is **assigned a weight according to its importance** with **urgency and vulnerability markers** having the highest weights, followed by **severity markers**, and lastly, **superlatives and emotional appeal markers**.
|
| 99 |
+
|
| 100 |
+
#### Scoring Process
|
| 101 |
+
|
| 102 |
+
- **Need Identification:** The algorithm scans through the application text to count
|
| 103 |
+
how many times each keyword appears.
|
| 104 |
+
- **Weighted Scoring:** It multiplies these counts by the corresponding category weight to compute a total "necessity index", and then normalises the values to produce a range between 0 and 1.
|
| 105 |
+
|
| 106 |
+
#### What It Means
|
| 107 |
+
|
| 108 |
+
Together, all these scores can help us rank the applications based on how critical the prize is to each school, with a **higher necessity index suggesting a more compelling case** and a lower score suggesting a "want" rather than a "need".
|
| 109 |
+
"""
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
st.bar_chart(df['necessity_index'])
|
| 113 |
|
| 114 |
# Review applications
|
| 115 |
st.subheader("Applications")
|
| 116 |
for idx, row in filtered_df.iterrows():
|
| 117 |
+
with st.expander(f"Application \#{idx} | Necessity: {row['necessity_index']:.1f}"):
|
| 118 |
col1, col2 = st.columns((1, 3))
|
| 119 |
col1.metric("Necessity", f"{row['necessity_index']:.1f}")
|
| 120 |
col1.metric("Urgency", f"{row['urgency_score']}")
|
|
|
|
| 122 |
col1.metric("Vulnerability", f"{row['vulnerability_score']}")
|
| 123 |
# Clean usage items
|
| 124 |
usage_items = [item for item in row['Usage'] if item and item.lower() != 'none']
|
| 125 |
+
col2.markdown("**EXCERPT**")
|
| 126 |
+
col2.write(row[freeform_col])
|
| 127 |
if usage_items:
|
| 128 |
+
col2.markdown("**EXTRACTED USAGE ITEMS:**")
|
| 129 |
col2.write(", ".join(usage_items))
|
| 130 |
else:
|
| 131 |
col2.markdown("*No specific usage items extracted.*")
|
|
|
|
|
|
|
| 132 |
# Shortlist checkbox
|
| 133 |
st.checkbox(
|
| 134 |
"Shortlist this application",
|