|
|
import pandas as pd |
|
|
|
|
|
def _count_valid_usage(items) -> int:
    """Return the number of meaningful usage entries in *items*.

    Only ``list``, ``tuple`` and ``set`` containers are inspected; any other
    value (including ``None``, NaN, or a bare string) counts as zero entries.
    Within a container, an entry is counted when it is a string that is
    non-empty after stripping whitespace and is not the literal "none"
    (case-insensitive).
    """
    if not isinstance(items, (list, tuple, set)):
        return 0
    return sum(
        1
        for item in items
        if isinstance(item, str) and item.strip() and item.strip().lower() != "none"
    )


def _min_max_scale(values: pd.Series) -> pd.Series:
    """Scale *values* linearly onto [0, 1] using their own min and max.

    When every value is identical there is no spread to scale by, so each row
    receives the neutral score 0.5 instead of dividing by zero.
    """
    low, high = values.min(), values.max()
    if high != low:
        return (values - low) / (high - low)
    return pd.Series(0.5, index=values.index)


def _usage_column_name(df: pd.DataFrame) -> str:
    """Return the name of the usage column present in *df*.

    The lowercase name ``'usage'`` is preferred; ``'Usage'`` is accepted as a
    fallback so frames using either spelling work.

    Raises:
        KeyError: If neither spelling is a column of *df*.
    """
    for candidate in ("usage", "Usage"):
        if candidate in df.columns:
            return candidate
    raise KeyError("usage")


def shortlist_applications(
    df: pd.DataFrame,
    k: int = None,
    threshold: float = None,
    weight_necessity: float = 0.55,
    weight_length: float = 0.1,
    weight_usage: float = 0.35
) -> pd.DataFrame:
    """Shortlist grant applications by a weighted combined score.

    The score combines three components:

    * ``necessity_index``, used as-is (it is not rescaled here; presumably it
      is already on a 0-1 scale -- TODO confirm against the producer).
    * a length score: ``word_count`` min-max scaled over *df*, so longer
      submissions score higher.
    * a usage score: the number of meaningful entries in each row's usage
      list, min-max scaled over *df*.

    The three weights are normalised to sum to 1 before being applied, so only
    their relative sizes matter.

    Args:
        df: DataFrame with columns 'necessity_index', 'word_count', and a
            usage column named 'usage' (or 'Usage'). It is not modified.
        k: Number of top applications to select. Mutually exclusive with
            threshold.
        threshold: Minimum combined score (inclusive) for an application to
            be selected. Mutually exclusive with k.
        weight_necessity: Non-negative weight for necessity_index.
        weight_length: Non-negative weight for the length score.
        weight_usage: Non-negative weight for the usage score.

    Returns:
        A new DataFrame holding the shortlisted rows plus a
        'shortlist_score' column, sorted by descending score. Rows with equal
        scores keep their original relative order.

    Raises:
        ValueError: If not exactly one of k / threshold is given, if k is
            negative, if any weight is negative, or if all weights are zero.
        KeyError: If a required column is missing from df.
    """
    if (k is None) == (threshold is None):
        raise ValueError("Provide exactly one of k or threshold")
    if k is not None and k < 0:
        # DataFrame.head(-n) means "all but the last n rows", which is never
        # a sensible shortlist.
        raise ValueError("k must be non-negative")

    weights = {
        'necessity': weight_necessity,
        'length': weight_length,
        'usage': weight_usage,
    }
    if any(value < 0 for value in weights.values()):
        raise ValueError("Weights must be non-negative")
    total_weight = sum(weights.values())
    if total_weight <= 0:
        raise ValueError("At least one weight must be positive")
    weights = {name: value / total_weight for name, value in weights.items()}

    necessity = df['necessity_index']
    length_score = _min_max_scale(df['word_count'])

    usage_counts = pd.Series(
        [_count_valid_usage(items) for items in df[_usage_column_name(df)]],
        index=df.index,
        dtype="float64",
    )
    usage_score = _min_max_scale(usage_counts)

    combined = (
        weights['necessity'] * necessity
        + weights['length'] * length_score
        + weights['usage'] * usage_score
    )

    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    scored = df.assign(shortlist_score=combined)
    # A stable sort keeps tied applications in their input order, making the
    # shortlist deterministic.
    ranked = scored.sort_values(
        'shortlist_score', ascending=False, kind='mergesort'
    )

    if k is not None:
        return ranked.head(k)
    return ranked[ranked['shortlist_score'] >= threshold]
|
|
|