lynn-twinkl
committed on
Commit
·
3475989
1
Parent(s):
147f63f
Implemented auto shortlisting
Browse files- app.py +27 -6
- functions/shortlist.py +72 -0
app.py
CHANGED
|
@@ -13,6 +13,7 @@ from streamlit_extras.metric_cards import style_metric_cards
|
|
| 13 |
from functions.extract_usage import extract_usage
|
| 14 |
from functions.necessity_index import compute_necessity, index_scaler, qcut_labels
|
| 15 |
from functions.column_detection import detect_freeform_answer_col
|
|
|
|
| 16 |
import typing
|
| 17 |
|
| 18 |
# ---- CACHEABLE PROCESSING ----
|
|
@@ -105,27 +106,47 @@ if uploaded_file is not None:
|
|
| 105 |
key=f"shortlist_{idx}"
|
| 106 |
)
|
| 107 |
|
| 108 |
-
# Shortlist summary and download
|
| 109 |
shortlisted = [
|
| 110 |
i for i in filtered_df.index
|
| 111 |
if st.session_state.get(f"shortlist_{i}", False)
|
| 112 |
]
|
| 113 |
-
st.sidebar.markdown(f"**Shortlisted:** {len(shortlisted)}")
|
| 114 |
if shortlisted:
|
| 115 |
csv = df.loc[shortlisted].to_csv(index=False).encode('utf-8')
|
| 116 |
st.sidebar.download_button(
|
| 117 |
-
"Download Shortlist", csv, "shortlist.csv", "text/csv"
|
| 118 |
)
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
with tab2:
|
| 121 |
st.write("")
|
| 122 |
|
| 123 |
-
col1, col2 = st.columns(
|
| 124 |
col1.metric("Avg. Word Count", f"{df['word_count'].mean().round(1)}")
|
| 125 |
-
col2.metric("
|
|
|
|
| 126 |
st.html("<br>")
|
| 127 |
|
| 128 |
-
st.subheader("Necessity Index Distribution")
|
| 129 |
st.write("")
|
| 130 |
st.write("")
|
| 131 |
# Histogram of necessity index colored by priority labels
|
|
|
|
| 13 |
from functions.extract_usage import extract_usage
|
| 14 |
from functions.necessity_index import compute_necessity, index_scaler, qcut_labels
|
| 15 |
from functions.column_detection import detect_freeform_answer_col
|
| 16 |
+
from functions.shortlist import shortlist_applications
|
| 17 |
import typing
|
| 18 |
|
| 19 |
# ---- CACHEABLE PROCESSING ----
|
|
|
|
| 106 |
key=f"shortlist_{idx}"
|
| 107 |
)
|
| 108 |
|
| 109 |
+
# Shortlist summary and download (manual)
|
| 110 |
shortlisted = [
|
| 111 |
i for i in filtered_df.index
|
| 112 |
if st.session_state.get(f"shortlist_{i}", False)
|
| 113 |
]
|
| 114 |
+
st.sidebar.markdown(f"**Manual Shortlisted:** {len(shortlisted)}")
|
| 115 |
if shortlisted:
|
| 116 |
csv = df.loc[shortlisted].to_csv(index=False).encode('utf-8')
|
| 117 |
st.sidebar.download_button(
|
| 118 |
+
"Download Manual Shortlist", csv, "shortlist.csv", "text/csv"
|
| 119 |
)
|
| 120 |
|
| 121 |
+
# Automatic Shortlisting
|
| 122 |
+
st.sidebar.header("Automatic Shortlisting")
|
| 123 |
+
max_k = len(filtered_df)
|
| 124 |
+
default_k = min(5, max_k)
|
| 125 |
+
num_auto = st.sidebar.number_input(
|
| 126 |
+
"Number to shortlist automatically",
|
| 127 |
+
min_value=1, max_value=max_k,
|
| 128 |
+
value=default_k, step=1
|
| 129 |
+
)
|
| 130 |
+
if st.sidebar.button("Generate Auto Shortlist"):
|
| 131 |
+
auto_short = shortlist_applications(filtered_df, k=num_auto)
|
| 132 |
+
st.sidebar.markdown(f"**Auto Shortlisted:** {len(auto_short)}")
|
| 133 |
+
csv_auto = auto_short.to_csv(index=False).encode('utf-8')
|
| 134 |
+
st.sidebar.download_button(
|
| 135 |
+
"Download Auto Shortlist", csv_auto, "auto_shortlist.csv", "text/csv"
|
| 136 |
+
)
|
| 137 |
+
st.subheader("Auto Shortlist Results")
|
| 138 |
+
st.dataframe(auto_short, hide_index=True)
|
| 139 |
+
|
| 140 |
with tab2:
|
| 141 |
st.write("")
|
| 142 |
|
| 143 |
+
col1, col2, col3 = st.columns(3)
|
| 144 |
col1.metric("Avg. Word Count", f"{df['word_count'].mean().round(1)}")
|
| 145 |
+
col2.metric("Median N.I", df['necessity_index'].median())
|
| 146 |
+
col3.metric("Total Applications", len(df))
|
| 147 |
st.html("<br>")
|
| 148 |
|
| 149 |
+
st.subheader("Necessity Index (NI) Distribution")
|
| 150 |
st.write("")
|
| 151 |
st.write("")
|
| 152 |
# Histogram of necessity index colored by priority labels
|
functions/shortlist.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional

import pandas as pd
| 2 |
+
|
| 3 |
+
def shortlist_applications(
    df: pd.DataFrame,
    k: Optional[int] = None,
    threshold: Optional[float] = None,
    weight_necessity: float = 0.5,
    weight_length: float = 0.3,
    weight_usage: float = 0.2
) -> pd.DataFrame:
    """
    Automatically shortlist grant applications by a weighted combined score.

    The score blends the necessity index, application length (longer
    submissions score higher), and whether any usage was specified.

    Args:
        df: Processed DataFrame with columns 'necessity_index', 'word_count',
            and 'Usage'. Each 'Usage' entry is presumably a list of usage
            strings — TODO confirm against the upstream extractor.
        k: Number of top applications to select. Mutually exclusive with
            threshold.
        threshold: Minimum combined score for selection. Mutually exclusive
            with k.
        weight_necessity: Relative weight for 'necessity_index'.
        weight_length: Relative weight for the length score.
        weight_usage: Relative weight for usage inclusion.

    Returns:
        A copy of the selected rows, sorted by descending
        'auto_shortlist_score' (added as a new column). The input DataFrame
        is never mutated.

    Raises:
        ValueError: If neither or both of k/threshold are given, or if the
            weights sum to a non-positive value.
    """
    # Exactly one selection mode must be chosen.
    if (k is None) == (threshold is None):
        raise ValueError("Provide exactly one of k or threshold")

    total_weight = weight_necessity + weight_length + weight_usage
    if total_weight <= 0:
        # Guards the weight normalization below against division by zero.
        raise ValueError("Weights must sum to a positive value")

    # necessity_index is assumed already scaled to [0, 1] upstream
    # (see index_scaler in functions/necessity_index) — TODO confirm.
    necessity = df['necessity_index']

    # Min-max scale word counts so longer applications score higher.
    word_counts = df['word_count']
    min_wc, max_wc = word_counts.min(), word_counts.max()
    if max_wc != min_wc:
        length_score = (word_counts - min_wc) / (max_wc - min_wc)
    else:
        # Every application is the same length: neutral score for all rows.
        length_score = pd.Series([0.5] * len(df), index=df.index)

    # Usage score: 1.0 if any meaningful usage item is present, else 0.0.
    def _has_usage(items) -> bool:
        if isinstance(items, str):
            # A bare string entry would otherwise be iterated character by
            # character, making the literal string 'none' count as usage.
            text = items.strip().lower()
            return bool(text) and text != 'none'
        try:
            return any(
                item and isinstance(item, str) and item.strip().lower() != 'none'
                for item in items
            )
        except TypeError:
            # NaN / non-iterable entries (missing data) mean "no usage".
            return False

    usage_score = df['Usage'].apply(_has_usage).astype(float)

    # Normalize weights so callers need not pass values summing to 1.
    combined = (
        (weight_necessity / total_weight) * necessity
        + (weight_length / total_weight) * length_score
        + (weight_usage / total_weight) * usage_score
    )

    # Score a copy so the caller's DataFrame is never mutated.
    scored = df.copy()
    scored['auto_shortlist_score'] = combined

    ranked = scored.sort_values('auto_shortlist_score', ascending=False)
    if k is not None:
        return ranked.head(k)
    return ranked[ranked['auto_shortlist_score'] >= threshold]
|