lynn-twinkl committed on
Commit
3475989
·
1 Parent(s): 147f63f

Implemented auto shortlisting

Browse files
Files changed (2) hide show
  1. app.py +27 -6
  2. functions/shortlist.py +72 -0
app.py CHANGED
@@ -13,6 +13,7 @@ from streamlit_extras.metric_cards import style_metric_cards
13
  from functions.extract_usage import extract_usage
14
  from functions.necessity_index import compute_necessity, index_scaler, qcut_labels
15
  from functions.column_detection import detect_freeform_answer_col
 
16
  import typing
17
 
18
  # ---- CACHEABLE PROCESSING ----
@@ -105,27 +106,47 @@ if uploaded_file is not None:
105
  key=f"shortlist_{idx}"
106
  )
107
 
108
- # Shortlist summary and download
109
  shortlisted = [
110
  i for i in filtered_df.index
111
  if st.session_state.get(f"shortlist_{i}", False)
112
  ]
113
- st.sidebar.markdown(f"**Shortlisted:** {len(shortlisted)}")
114
  if shortlisted:
115
  csv = df.loc[shortlisted].to_csv(index=False).encode('utf-8')
116
  st.sidebar.download_button(
117
- "Download Shortlist", csv, "shortlist.csv", "text/csv"
118
  )
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  with tab2:
121
  st.write("")
122
 
123
- col1, col2 = st.columns(2)
124
  col1.metric("Avg. Word Count", f"{df['word_count'].mean().round(1)}")
125
- col2.metric("Total Applications", len(df))
 
126
  st.html("<br>")
127
 
128
- st.subheader("Necessity Index Distribution")
129
  st.write("")
130
  st.write("")
131
  # Histogram of necessity index colored by priority labels
 
13
  from functions.extract_usage import extract_usage
14
  from functions.necessity_index import compute_necessity, index_scaler, qcut_labels
15
  from functions.column_detection import detect_freeform_answer_col
16
+ from functions.shortlist import shortlist_applications
17
  import typing
18
 
19
  # ---- CACHEABLE PROCESSING ----
 
106
  key=f"shortlist_{idx}"
107
  )
108
 
109
+ # Shortlist summary and download (manual)
110
  shortlisted = [
111
  i for i in filtered_df.index
112
  if st.session_state.get(f"shortlist_{i}", False)
113
  ]
114
+ st.sidebar.markdown(f"**Manual Shortlisted:** {len(shortlisted)}")
115
  if shortlisted:
116
  csv = df.loc[shortlisted].to_csv(index=False).encode('utf-8')
117
  st.sidebar.download_button(
118
+ "Download Manual Shortlist", csv, "shortlist.csv", "text/csv"
119
  )
120
 
121
+ # Automatic Shortlisting
122
+ st.sidebar.header("Automatic Shortlisting")
123
+ max_k = len(filtered_df)
124
+ default_k = min(5, max_k)
125
+ num_auto = st.sidebar.number_input(
126
+ "Number to shortlist automatically",
127
+ min_value=1, max_value=max_k,
128
+ value=default_k, step=1
129
+ )
130
+ if st.sidebar.button("Generate Auto Shortlist"):
131
+ auto_short = shortlist_applications(filtered_df, k=num_auto)
132
+ st.sidebar.markdown(f"**Auto Shortlisted:** {len(auto_short)}")
133
+ csv_auto = auto_short.to_csv(index=False).encode('utf-8')
134
+ st.sidebar.download_button(
135
+ "Download Auto Shortlist", csv_auto, "auto_shortlist.csv", "text/csv"
136
+ )
137
+ st.subheader("Auto Shortlist Results")
138
+ st.dataframe(auto_short, hide_index=True)
139
+
140
  with tab2:
141
  st.write("")
142
 
143
+ col1, col2, col3 = st.columns(3)
144
  col1.metric("Avg. Word Count", f"{df['word_count'].mean().round(1)}")
145
+ col2.metric("Median N.I", df['necessity_index'].median())
146
+ col3.metric("Total Applications", len(df))
147
  st.html("<br>")
148
 
149
+ st.subheader("Necessity Index (NI) Distribution")
150
  st.write("")
151
  st.write("")
152
  # Histogram of necessity index colored by priority labels
functions/shortlist.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def shortlist_applications(
    df: pd.DataFrame,
    k: int = None,
    threshold: float = None,
    weight_necessity: float = 0.5,
    weight_length: float = 0.3,
    weight_usage: float = 0.2
) -> pd.DataFrame:
    """
    Automatically shortlist grant applications.

    Each application gets a combined score built from three components:
    its necessity index, a min-max scaled word count (longer submissions
    score higher), and a 0/1 flag for whether any usage item was specified.
    The three weights are rescaled so that they sum to 1 before use.

    Args:
        df: DataFrame with columns 'necessity_index', 'word_count' and 'Usage'.
            The input is not modified; the result is built from a copy.
        k: Number of top applications to select (non-negative).
            Mutually exclusive with threshold.
        threshold: Minimum combined score (inclusive) an application needs to
            be selected. Mutually exclusive with k.
        weight_necessity: Relative weight of necessity_index.
        weight_length: Relative weight of the length score.
        weight_usage: Relative weight of the usage flag.

    Returns:
        DataFrame of shortlisted applications with an added
        'auto_shortlist_score' column, sorted by descending score. Rows with
        equal scores keep their original relative order.

    Raises:
        ValueError: If neither or both of k and threshold are given, if k is
            negative, or if the weights do not sum to a positive number.
        KeyError: If a required column is missing from df.
    """
    # Ensure exactly one of k or threshold is provided
    if (k is None) == (threshold is None):
        raise ValueError("Provide exactly one of k or threshold")

    # DataFrame.head(-n) means "all rows except the last n", which is never
    # what a caller asking for the top k wants, so reject it explicitly.
    if k is not None and k < 0:
        raise ValueError("k must be non-negative")

    total_weight = weight_necessity + weight_length + weight_usage
    if total_weight <= 0:
        raise ValueError("Weights must sum to a positive number")

    required = ('necessity_index', 'word_count', 'Usage')
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # necessity_index is used as-is; it is assumed to already be scaled to
    # the 0-1 range by the caller (TODO confirm against the upstream scaler).
    necessity = df['necessity_index']

    # Length score: min-max scale word_count so longer applications score higher.
    word_counts = df['word_count']
    min_wc, max_wc = word_counts.min(), word_counts.max()
    if max_wc != min_wc:
        length_score = (word_counts - min_wc) / (max_wc - min_wc)
    else:
        # All applications have the same length: give everyone a neutral score.
        length_score = pd.Series(0.5, index=df.index)

    def has_usage(items) -> bool:
        """Return True if `items` holds at least one meaningful usage entry."""
        # A bare string is one item; iterating it would yield characters.
        if isinstance(items, str):
            items = (items,)
        try:
            iterator = iter(items)
        except TypeError:
            # None, NaN and other scalars mean no usage was extracted.
            return False
        return any(
            isinstance(item, str) and item.strip().lower() not in ('', 'none')
            for item in iterator
        )

    # Usage score: 1.0 if any usage item is specified, else 0.0
    usage_score = df['Usage'].apply(has_usage).astype(float)

    # Combine the component scores using weights normalised to sum to 1.
    combined = (
        (weight_necessity / total_weight) * necessity
        + (weight_length / total_weight) * length_score
        + (weight_usage / total_weight) * usage_score
    )
    scored = df.copy()
    scored['auto_shortlist_score'] = combined

    # A stable sort keeps tied applications in their original order, so the
    # same input always produces the same shortlist.
    ranked = scored.sort_values(
        'auto_shortlist_score', ascending=False, kind='mergesort'
    )

    # Select applications based on k or threshold
    if k is not None:
        return ranked.head(k)
    return ranked[ranked['auto_shortlist_score'] >= threshold]