lynn-twinkl
committed on
Commit
·
3475989
1
Parent(s):
147f63f
Implemented auto shortlisting
Browse files- app.py +27 -6
- functions/shortlist.py +72 -0
app.py
CHANGED
|
@@ -13,6 +13,7 @@ from streamlit_extras.metric_cards import style_metric_cards
|
|
| 13 |
from functions.extract_usage import extract_usage
|
| 14 |
from functions.necessity_index import compute_necessity, index_scaler, qcut_labels
|
| 15 |
from functions.column_detection import detect_freeform_answer_col
|
|
|
|
| 16 |
import typing
|
| 17 |
|
| 18 |
# ---- CACHEABLE PROCESSING ----
|
|
@@ -105,27 +106,47 @@ if uploaded_file is not None:
|
|
| 105 |
key=f"shortlist_{idx}"
|
| 106 |
)
|
| 107 |
|
| 108 |
-
# Shortlist summary and download
|
| 109 |
shortlisted = [
|
| 110 |
i for i in filtered_df.index
|
| 111 |
if st.session_state.get(f"shortlist_{i}", False)
|
| 112 |
]
|
| 113 |
-
st.sidebar.markdown(f"**Shortlisted:** {len(shortlisted)}")
|
| 114 |
if shortlisted:
|
| 115 |
csv = df.loc[shortlisted].to_csv(index=False).encode('utf-8')
|
| 116 |
st.sidebar.download_button(
|
| 117 |
-
"Download Shortlist", csv, "shortlist.csv", "text/csv"
|
| 118 |
)
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
with tab2:
|
| 121 |
st.write("")
|
| 122 |
|
| 123 |
-
col1, col2 = st.columns(
|
| 124 |
col1.metric("Avg. Word Count", f"{df['word_count'].mean().round(1)}")
|
| 125 |
-
col2.metric("
|
|
|
|
| 126 |
st.html("<br>")
|
| 127 |
|
| 128 |
-
st.subheader("Necessity Index Distribution")
|
| 129 |
st.write("")
|
| 130 |
st.write("")
|
| 131 |
# Histogram of necessity index colored by priority labels
|
|
|
|
| 13 |
from functions.extract_usage import extract_usage
|
| 14 |
from functions.necessity_index import compute_necessity, index_scaler, qcut_labels
|
| 15 |
from functions.column_detection import detect_freeform_answer_col
|
| 16 |
+
from functions.shortlist import shortlist_applications
|
| 17 |
import typing
|
| 18 |
|
| 19 |
# ---- CACHEABLE PROCESSING ----
|
|
|
|
| 106 |
key=f"shortlist_{idx}"
|
| 107 |
)
|
| 108 |
|
| 109 |
+
# Shortlist summary and download (manual)
|
| 110 |
shortlisted = [
|
| 111 |
i for i in filtered_df.index
|
| 112 |
if st.session_state.get(f"shortlist_{i}", False)
|
| 113 |
]
|
| 114 |
+
st.sidebar.markdown(f"**Manual Shortlisted:** {len(shortlisted)}")
|
| 115 |
if shortlisted:
|
| 116 |
csv = df.loc[shortlisted].to_csv(index=False).encode('utf-8')
|
| 117 |
st.sidebar.download_button(
|
| 118 |
+
"Download Manual Shortlist", csv, "shortlist.csv", "text/csv"
|
| 119 |
)
|
| 120 |
|
| 121 |
+
# Automatic Shortlisting
|
| 122 |
+
st.sidebar.header("Automatic Shortlisting")
|
| 123 |
+
max_k = len(filtered_df)
|
| 124 |
+
default_k = min(5, max_k)
|
| 125 |
+
num_auto = st.sidebar.number_input(
|
| 126 |
+
"Number to shortlist automatically",
|
| 127 |
+
min_value=1, max_value=max_k,
|
| 128 |
+
value=default_k, step=1
|
| 129 |
+
)
|
| 130 |
+
if st.sidebar.button("Generate Auto Shortlist"):
|
| 131 |
+
auto_short = shortlist_applications(filtered_df, k=num_auto)
|
| 132 |
+
st.sidebar.markdown(f"**Auto Shortlisted:** {len(auto_short)}")
|
| 133 |
+
csv_auto = auto_short.to_csv(index=False).encode('utf-8')
|
| 134 |
+
st.sidebar.download_button(
|
| 135 |
+
"Download Auto Shortlist", csv_auto, "auto_shortlist.csv", "text/csv"
|
| 136 |
+
)
|
| 137 |
+
st.subheader("Auto Shortlist Results")
|
| 138 |
+
st.dataframe(auto_short, hide_index=True)
|
| 139 |
+
|
| 140 |
with tab2:
|
| 141 |
st.write("")
|
| 142 |
|
| 143 |
+
col1, col2, col3 = st.columns(3)
|
| 144 |
col1.metric("Avg. Word Count", f"{df['word_count'].mean().round(1)}")
|
| 145 |
+
col2.metric("Median N.I", df['necessity_index'].median())
|
| 146 |
+
col3.metric("Total Applications", len(df))
|
| 147 |
st.html("<br>")
|
| 148 |
|
| 149 |
+
st.subheader("Necessity Index (NI) Distribution")
|
| 150 |
st.write("")
|
| 151 |
st.write("")
|
| 152 |
# Histogram of necessity index colored by priority labels
|
functions/shortlist.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional

import pandas as pd
| 2 |
+
|
| 3 |
+
def shortlist_applications(
    df: pd.DataFrame,
    k: Optional[int] = None,
    threshold: Optional[float] = None,
    weight_necessity: float = 0.5,
    weight_length: float = 0.3,
    weight_usage: float = 0.2
) -> pd.DataFrame:
    """
    Automatically shortlist grant applications by a weighted combined score.

    The score blends the necessity index, application length (longer
    submissions score higher), and whether any usage was specified.

    Args:
        df: Processed DataFrame with columns 'necessity_index', 'word_count',
            and 'Usage'. Each 'Usage' entry is presumably a list of usage
            strings — TODO confirm against the upstream extractor.
        k: Number of top applications to select. Mutually exclusive with
            threshold.
        threshold: Minimum combined score for selection. Mutually exclusive
            with k.
        weight_necessity: Relative weight for 'necessity_index'.
        weight_length: Relative weight for the length score.
        weight_usage: Relative weight for usage inclusion.

    Returns:
        A copy of the selected rows, sorted by descending
        'auto_shortlist_score' (added as a new column). The input DataFrame
        is never mutated.

    Raises:
        ValueError: If neither or both of k/threshold are given, or if the
            weights sum to a non-positive value.
    """
    # Exactly one selection mode must be chosen.
    if (k is None) == (threshold is None):
        raise ValueError("Provide exactly one of k or threshold")

    total_weight = weight_necessity + weight_length + weight_usage
    if total_weight <= 0:
        # Guards the weight normalization below against division by zero.
        raise ValueError("Weights must sum to a positive value")

    # necessity_index is assumed already scaled to [0, 1] upstream
    # (see index_scaler in functions/necessity_index) — TODO confirm.
    necessity = df['necessity_index']

    # Min-max scale word counts so longer applications score higher.
    word_counts = df['word_count']
    min_wc, max_wc = word_counts.min(), word_counts.max()
    if max_wc != min_wc:
        length_score = (word_counts - min_wc) / (max_wc - min_wc)
    else:
        # Every application is the same length: neutral score for all rows.
        length_score = pd.Series([0.5] * len(df), index=df.index)

    # Usage score: 1.0 if any meaningful usage item is present, else 0.0.
    def _has_usage(items) -> bool:
        if isinstance(items, str):
            # A bare string entry would otherwise be iterated character by
            # character, making the literal string 'none' count as usage.
            text = items.strip().lower()
            return bool(text) and text != 'none'
        try:
            return any(
                item and isinstance(item, str) and item.strip().lower() != 'none'
                for item in items
            )
        except TypeError:
            # NaN / non-iterable entries (missing data) mean "no usage".
            return False

    usage_score = df['Usage'].apply(_has_usage).astype(float)

    # Normalize weights so callers need not pass values summing to 1.
    combined = (
        (weight_necessity / total_weight) * necessity
        + (weight_length / total_weight) * length_score
        + (weight_usage / total_weight) * usage_score
    )

    # Score a copy so the caller's DataFrame is never mutated.
    scored = df.copy()
    scored['auto_shortlist_score'] = combined

    ranked = scored.sort_values('auto_shortlist_score', ascending=False)
    if k is not None:
        return ranked.head(k)
    return ranked[ranked['auto_shortlist_score'] >= threshold]
|