Spaces:

TwinklData
/

Community_Collections_App

Sleeping

lynn-twinkl

Implemented heartfelt prediction model

7d6f84e 11 months ago

9.82 kB

	################################
	# CONFIGURATION
	################################

	import html
	import os
	from io import BytesIO
	from typing import Tuple

	import altair as alt
	import joblib
	import pandas as pd
	import streamlit as st
	from streamlit_extras.metric_cards import style_metric_cards

	# ---- FUNCTIONS ----

	from src.column_detection import detect_freeform_col
	from src.extract_usage import extract_usage
	from src.necessity_index import compute_necessity, index_scaler, qcut_labels
	from src.preprocess_text import normalise_text
	from src.shortlist import shortlist_applications
	from src.twinkl_originals import find_book_candidates

	##################################
	# CACHED PROCESSING FUNCTION
	##################################

	# -----------------------------------------------------------------------------
	# Heavy processing (IO + NLP) is cached to avoid re‑executing when the UI state
	# changes. The function only re‑runs if the file contents change.
	# -----------------------------------------------------------------------------

	@st.cache_resource
	def load_heartfelt_predictor():
	    """Deserialize the heartfelt pipeline stored under ``src/models``.

	    The path is relative, so it is resolved against the process working
	    directory. The function is wrapped in ``st.cache_resource``.

	    Returns:
	        Whatever object ``joblib.load`` reconstructs from the file; the
	        caller uses its ``predict`` method.
	    """
	    path_parts = ("src", "models", "heartfelt_pipeline.joblib")
	    pipeline_file = os.path.join(*path_parts)
	    predictor = joblib.load(pipeline_file)
	    return predictor

	@st.cache_data(show_spinner=True)
	def load_and_process(raw_csv: bytes) -> Tuple[pd.DataFrame, str]:
	    """
	    Parse an uploaded CSV and enrich each application with derived columns.

	    Rows whose free-text answer is missing are dropped. The remaining rows
	    gain ``word_count``, the columns produced by ``compute_necessity``
	    (with ``necessity_index`` rescaled by ``index_scaler``), ``priority``,
	    ``book_candidates``, ``clean_text`` and ``is_heartfelt``.

	    Args:
	        raw_csv: Raw bytes of the uploaded CSV file. Passing bytes (rather
	            than the upload object) lets ``st.cache_data`` key the cache on
	            the file contents.

	    Returns:
	        A ``(scored, freeform_col)`` tuple: the enriched DataFrame and the
	        name of the detected free-text column.
	    """
	    # Read uploaded data
	    uploaded_df = pd.read_csv(BytesIO(raw_csv))

	    # Detect freeform column
	    freeform_col = detect_freeform_col(uploaded_df)

	    # Keep only rows with a free-text answer. The explicit copy makes the
	    # column assignment below write to an independent frame instead of a
	    # slice of the parsed CSV (which triggers SettingWithCopyWarning).
	    df_orig = uploaded_df[uploaded_df[freeform_col].notna()].copy()

	    # Word count (no fillna needed: missing values were filtered out above)
	    df_orig['word_count'] = df_orig[freeform_col].str.split().str.len()

	    # Compute necessity scores; the joined result must expose a
	    # 'necessity_index' column, which is then rescaled and bucketed.
	    necessity_parts = df_orig[freeform_col].apply(compute_necessity)
	    scored = df_orig.join(necessity_parts)
	    scored['necessity_index'] = index_scaler(scored['necessity_index'].values)
	    scored['priority'] = qcut_labels(scored['necessity_index'])

	    # Find Twinkl Originals candidates
	    scored['book_candidates'] = find_book_candidates(scored, freeform_col)

	    # Label heartfelt applications using the cached prediction pipeline
	    scored['clean_text'] = scored[freeform_col].map(normalise_text)
	    model = load_heartfelt_predictor()
	    scored['is_heartfelt'] = model.predict(scored['clean_text'].astype(str))

	    # NOTE: usage extraction (extract_usage) is currently disabled, so this
	    # function does not produce a 'Usage' column.

	    return scored, freeform_col

	# -----------------------------------------------------------------------------
	# Derivative computations that rely only on the processed DataFrame are also
	# cached. These are lightweight but still benefit from caching because this
	# function might be called multiple times during widget interaction.
	# -----------------------------------------------------------------------------


	@st.cache_data(show_spinner=True)
	def compute_shortlist(df: pd.DataFrame) -> pd.DataFrame:
	    """Run ``shortlist_applications`` with ``k`` equal to the number of rows.

	    Requesting as many entries as the frame holds pre-computes the
	    shortlist score for all rows, so both shortlist modes can reuse it.
	    """
	    row_count = len(df)
	    scored_all = shortlist_applications(df, k=row_count)
	    return scored_all

	################################
	# APP SCRIPT
	################################

	# Page-level script: styles the metric cards, takes a CSV upload, builds the
	# automatic shortlist, and renders the "Shortlist Manager" and "Insights" tabs.
	style_metric_cards(
	    box_shadow=False,
	    border_left_color='#E7F4FF',
	    background_color='#E7F4FF',
	    border_size_px=0,
	    border_radius_px=6,
	)

	st.title("Community Collections Helper")

	uploaded_file = st.file_uploader("Upload grant applications file for analysis", type='csv')

	if uploaded_file is not None:
	    # Read the file as raw bytes so the cached loader is keyed on the file
	    # contents; this ensures the heavy processing isn't repeated when a user
	    # changes the filters.
	    raw = uploaded_file.read()

	    ## ---- PROCESSED DATA (CACHED) ----

	    df, freeform_col = load_and_process(raw)

	    st.dataframe(df)

	    ## ---- INTERACTIVE FILTERING & REVIEW INTERFACE ----

	    with st.sidebar:
	        st.title("Shortlist Mode")

	        quantile_map = {"strict": 0.75, "generous": 0.5}
	        default_mode = "strict"
	        selected_mode = st.segmented_control(
	            "Select one option",
	            options=["strict", "generous"],
	            default=default_mode,
	        )
	        # The control yields None when the user deselects the active option;
	        # fall back to the default instead of failing the quantile lookup.
	        mode = selected_mode if selected_mode is not None else default_mode

	        scored_full = compute_shortlist(df)
	        threshold_score = scored_full["shortlist_score"].quantile(quantile_map[mode])
	        auto_short_df = scored_full[scored_full["shortlist_score"] >= threshold_score]

	        st.title("Filters")
	        min_idx = float(df['necessity_index'].min())
	        max_idx = float(df['necessity_index'].max())
	        filter_range = st.sidebar.slider(
	            "Necessity Index Range",
	            min_value=min_idx,
	            max_value=max_idx,
	            value=(min_idx, max_idx),
	        )
	        # Manual review covers applications that are NOT already in the
	        # automatic shortlist and fall inside the selected index range.
	        not_auto_shortlisted = ~df.index.isin(auto_short_df.index)
	        within_range = df['necessity_index'].between(filter_range[0], filter_range[1])
	        filtered_df = df[not_auto_shortlisted & within_range]

	        st.markdown(f"Total Applications: {len(df)}")
	        st.markdown(f"Filtered Applications: {len(filtered_df)}")

	    ## ----------------- MAIN PANEL ----------------

	    tab1, tab2 = st.tabs(["Shortlist Manager", "Insights"])

	    ## ---------- SHORTLIST MANAGER TAB -----------

	    with tab1:

	        st.header("✨ Automatic Shortlist")
	        st.markdown("Here's your automatically generated shortlist! If you'd like to manually add additional applications, you may do so on the section below!")

	        csv_auto = auto_short_df.to_csv(index=False).encode("utf-8")
	        st.download_button(
	            label="Download Shortlist",
	            data=csv_auto,
	            file_name="shortlist.csv",
	            mime="text/csv",
	            icon='⬇️'
	        )
	        st.markdown("#### Shortlist Preview")
	        st.write("")
	        total_col, shortlistCounter_col, mode_col = st.columns(3)

	        total_col.metric("Applications Submitted", len(df))
	        shortlistCounter_col.metric("Shortlist Length", len(auto_short_df))
	        mode_col.metric("Mode", mode)

	        shortlist_cols_to_show = [
	            'Id',
	            freeform_col,
	            'Usage',
	            'necessity_index',
	            'urgency_score',
	            'severity_score',
	            'vulnerability_score',
	            'shortlist_score',
	            'book_candidates',
	        ]
	        # Show only the columns that are actually present: optional ones
	        # (e.g. 'Usage' while usage extraction is disabled) would otherwise
	        # raise a KeyError in .loc.
	        available_cols = [
	            col for col in shortlist_cols_to_show if col in auto_short_df.columns
	        ]

	        st.dataframe(auto_short_df.loc[:, available_cols], hide_index=True)

	        ## REVIEW APPLICATIONS

	        st.header("🌸 Manual Filtering")
	        st.markdown(
	            """
	            Below you'll find applications that did not make it into the shortlist for you to manually review or append to the shortlist if desired.

	            You may use the side panel filters to more easily sort through applications that you'd like to review.
	            """
	        )

	        st.markdown("#### Filtered Applications")
	        st.write("")
	        for idx, row in filtered_df.iterrows():
	            # "\\#" keeps the literal backslash that escapes '#' in markdown
	            # without relying on an invalid Python escape sequence.
	            with st.expander(f"Application \\#{idx}"):
	                st.write("")
	                col1, col2, col3, col4 = st.columns(4)
	                col1.metric("Necessity", f"{row['necessity_index']:.1f}")
	                col2.metric("Urgency", f"{int(row['urgency_score'])}")
	                col3.metric("Severity", f"{int(row['severity_score'])}")
	                col4.metric("Vulnerability", f"{int(row['vulnerability_score'])}")

	                # Clean usage items; a missing column or a non-collection
	                # value (None/NaN) is treated as "no usage".
	                raw_usage = row.get('Usage')
	                if not isinstance(raw_usage, (list, tuple, set)):
	                    raw_usage = []
	                usage_items = [
	                    item for item in raw_usage
	                    if isinstance(item, str) and item and item.lower() != 'none'
	                ]
	                st.markdown("##### Excerpt")
	                st.write(row[freeform_col])
	                if usage_items:
	                    st.markdown("##### Usage")
	                    # Items derive from uploaded text and are rendered with
	                    # unsafe_allow_html, so escape them before interpolation.
	                    pills_html = "".join(
	                        f"<span style='display:inline-block;background-color:#E7F4FF;color:#125E9E;border-radius:20px;padding:4px 10px;margin:2px;font-size:0.95rem;'>{html.escape(item)}</span>"
	                        for item in usage_items
	                    )
	                    st.markdown(pills_html, unsafe_allow_html=True)
	                else:
	                    st.caption("No usage found")
	                st.write("")

	                st.checkbox(
	                    "Add to shortlist",
	                    key=f"shortlist_{idx}"
	                )

	        # Shortlist summary and download (manual): collect the rows whose
	        # "Add to shortlist" checkbox is ticked in session state.
	        shortlisted = [
	            i for i in filtered_df.index
	            if st.session_state.get(f"shortlist_{i}", False)
	        ]
	        st.sidebar.markdown(f"Manual Shortlisted: {len(shortlisted)}")
	        if shortlisted:
	            csv_manual = df.loc[shortlisted].to_csv(index=False).encode('utf-8')
	            st.sidebar.download_button(
	                "Download Manual Shortlist", csv_manual, "shortlist.csv", "text/csv"
	            )

	    ## ------------ INSIGHTS TAB -----------

	    with tab2:
	        st.write("")

	        col1, col2, col3 = st.columns(3)
	        col1.metric("Avg. Word Count", f"{df['word_count'].mean().round(1)}")
	        col2.metric("Median N.I", df['necessity_index'].median().round(2))
	        col3.metric("Total Applications", len(df))
	        st.html("<br>")

	        st.subheader("Necessity Index (NI) Distribution")
	        st.write("")
	        st.write("")
	        # Histogram of necessity index colored by priority labels
	        chart = alt.Chart(df).mark_bar().encode(
	            x=alt.X('necessity_index:Q', bin=alt.Bin(maxbins=20), title='Necessity Index'),
	            y='count()',
	            color=alt.Color(
	                'priority:N',
	                scale=alt.Scale(
	                    domain=['low', 'medium', 'high', 'priority'],
	                    range=['#a7d6fd', '#FFA500', '#FF5733', '#FF0000']
	                ),
	                legend=alt.Legend(title='Priority')
	            )
	        )
	        st.altair_chart(chart, use_container_width=True)
	        st.dataframe(df, hide_index=True)