lynn-twinkl committed on
Commit
164201a
·
1 Parent(s): 6c1f317

Added min max scaler to index and explainer for index

Browse files
Files changed (1) hide show
  1. app.py +54 -17
app.py CHANGED
@@ -6,27 +6,29 @@ import streamlit as st
6
  import pandas as pd
7
  from io import BytesIO
8
 
9
- # -- FUNCTIONS --
10
 
11
  from functions.extract_usage import extract_usage
12
- from functions.necessity_index import compute_necessity
13
  from functions.column_detection import detect_freeform_answer_col
14
  import typing
15
 
16
- # -- CACHEABLE PROCESSING --
 
17
  @st.cache_data
18
  def load_and_process(raw_csv: bytes) -> typing.Tuple[pd.DataFrame, str]:
19
  """
20
  Load CSV from raw bytes, detect freeform column, compute necessity scores,
21
  and extract usage items. Returns processed DataFrame and freeform column name.
22
  """
23
- # Read original data
24
  df_orig = pd.read_csv(BytesIO(raw_csv))
25
- # Detect narrative column
26
  freeform_col = detect_freeform_answer_col(df_orig)
27
  # Compute necessity scores
28
  scored = df_orig.join(df_orig[freeform_col].apply(compute_necessity))
29
- # Extract usage via AI
 
30
  docs = df_orig[freeform_col].to_list()
31
  usage = extract_usage(docs)
32
  scored['Usage'] = usage
@@ -41,10 +43,9 @@ st.title("Grant Applications Helper")
41
  uploaded_file = st.file_uploader("Upload grant applications file for analysis", type='csv')
42
 
43
  if uploaded_file is not None:
44
- # Read raw bytes for caching and repeated use
45
  raw = uploaded_file.read()
46
 
47
- # --- Original Data Preview ---
48
  st.markdown("""
49
  ### Data Preview
50
  Here's the data you uploaded!
@@ -53,12 +54,15 @@ if uploaded_file is not None:
53
  df_orig = pd.read_csv(BytesIO(raw))
54
  st.dataframe(df_orig)
55
 
56
- # --- Processed Data (cached): add scores & extracted usage ---
 
 
 
57
  df, freeform_col = load_and_process(raw)
58
 
59
- # -- Interactive Filtering & Review Interface --
60
- st.sidebar.header("Filters")
61
- # Filter by necessity index
62
  min_idx = float(df['necessity_index'].min())
63
  max_idx = float(df['necessity_index'].max())
64
  filter_range = st.sidebar.slider(
@@ -70,14 +74,47 @@ if uploaded_file is not None:
70
  st.sidebar.markdown(f"**Total Applications:** {len(df)}")
71
  st.sidebar.markdown(f"**Filtered Applications:** {len(filtered_df)}")
72
 
73
- # Distribution chart
 
 
74
  st.subheader("Necessity Index Distribution")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  st.bar_chart(df['necessity_index'])
76
 
77
  # Review applications
78
  st.subheader("Applications")
79
  for idx, row in filtered_df.iterrows():
80
- with st.expander(f"Application {idx} | Necessity: {row['necessity_index']:.1f}"):
81
  col1, col2 = st.columns((1, 3))
82
  col1.metric("Necessity", f"{row['necessity_index']:.1f}")
83
  col1.metric("Urgency", f"{row['urgency_score']}")
@@ -85,13 +122,13 @@ if uploaded_file is not None:
85
  col1.metric("Vulnerability", f"{row['vulnerability_score']}")
86
  # Clean usage items
87
  usage_items = [item for item in row['Usage'] if item and item.lower() != 'none']
 
 
88
  if usage_items:
89
- col2.markdown("**Extracted Usage Items:**")
90
  col2.write(", ".join(usage_items))
91
  else:
92
  col2.markdown("*No specific usage items extracted.*")
93
- col2.markdown("**Excerpt:**")
94
- col2.write(row[freeform_col])
95
  # Shortlist checkbox
96
  st.checkbox(
97
  "Shortlist this application",
 
6
  import pandas as pd
7
  from io import BytesIO
8
 
9
+ # ---- FUNCTIONS ----
10
 
11
  from functions.extract_usage import extract_usage
12
+ from functions.necessity_index import compute_necessity, index_scaler
13
  from functions.column_detection import detect_freeform_answer_col
14
  import typing
15
 
16
+ # ---- CACHEABLE PROCESSING ----
17
+
18
  @st.cache_data
19
  def load_and_process(raw_csv: bytes) -> typing.Tuple[pd.DataFrame, str]:
20
  """
21
  Load CSV from raw bytes, detect freeform column, compute necessity scores,
22
  and extract usage items. Returns processed DataFrame and freeform column name.
23
  """
24
+ # Read uploaded data
25
  df_orig = pd.read_csv(BytesIO(raw_csv))
26
+ # Detect freeform column
27
  freeform_col = detect_freeform_answer_col(df_orig)
28
  # Compute necessity scores
29
  scored = df_orig.join(df_orig[freeform_col].apply(compute_necessity))
30
+ scored['necessity_index'] = index_scaler(scored['necessity_index'].values)
31
+ # LangChain function for extracting usage
32
  docs = df_orig[freeform_col].to_list()
33
  usage = extract_usage(docs)
34
  scored['Usage'] = usage
 
43
  uploaded_file = st.file_uploader("Upload grant applications file for analysis", type='csv')
44
 
45
  if uploaded_file is not None:
46
+ # Read raw bytes for caching and repeated use --> this ensures all the processing isn't repeated when a user changes the filters
47
  raw = uploaded_file.read()
48
 
 
49
  st.markdown("""
50
  ### Data Preview
51
  Here's the data you uploaded!
 
54
  df_orig = pd.read_csv(BytesIO(raw))
55
  st.dataframe(df_orig)
56
 
57
+ st.divider()
58
+
59
+ ## ---- PROCESSED DATA (CACHED) ----
60
+
61
  df, freeform_col = load_and_process(raw)
62
 
63
+ ## ---- INTERACTIVE FILTERING & REVIEW INTERFACE ----
64
+
65
+ st.sidebar.title("Filters")
66
  min_idx = float(df['necessity_index'].min())
67
  max_idx = float(df['necessity_index'].max())
68
  filter_range = st.sidebar.slider(
 
74
  st.sidebar.markdown(f"**Total Applications:** {len(df)}")
75
  st.sidebar.markdown(f"**Filtered Applications:** {len(filtered_df)}")
76
 
77
+ ## ---- NECESSITY INDEX CHART ----
78
+
79
+ st.header("Processed Applications")
80
  st.subheader("Necessity Index Distribution")
81
+
82
+ with st.expander("Learn about our indexing algorithm", icon='🌱'):
83
+ st.markdown(
84
+ """
85
+ This algorithm is designed to help us evaluate grant applications by assigning a "necessity score" to each application. Here's how it works:
86
+
87
+ #### Lexicon
88
+
89
+ **Keywords:** The algorithm looks for specific keywords in the application text. These keywords are grouped into five main categories:
90
+ - **Urgency:** Words like "urgent" and "immediate" that express pressing needs.
91
+ - **Severity:** Words like "severe" and "desperate" that indicate serious issues.
92
+ - **Vulnerability:** Words like "disability", "SEND", "underserved" that represent at-risk groups.
93
+ - **Emotional Appeal:** Words like "hope", "committed", "passionate", and "love" that communicate an emotional connection to the mission of each application.
94
+ - **Superlatives:** Words like "completely", "massive", and "significantly" that can serve as proxies for passionate writing.
95
+
96
+ #### Weights
97
+
98
+ Each category is **assigned a weight according to its importance** with **urgency and vulnerability markers** having the highest weights, followed by **severity markers**, and lastly, **superlatives and emotional appeal markers**.
99
+
100
+ #### Scoring Process
101
+
102
+ - **Need Identification:** The algorithm scans through the application text to count
103
+ how many times each keyword appears.
104
+ - **Weighted Scoring:** It multiplies these counts by the corresponding category weight to compute a total "necessity index", and then normalises the values to produce a range between 0 and 1.
105
+
106
+ #### What It Means
107
+
108
+ Together, all these scores can help us rank the applications based on how critical the prize is to each school, with a **higher necessity index suggesting a more compelling case** and a lower score suggesting a "want" rather than a "need".
109
+ """
110
+ )
111
+
112
  st.bar_chart(df['necessity_index'])
113
 
114
  # Review applications
115
  st.subheader("Applications")
116
  for idx, row in filtered_df.iterrows():
117
+ with st.expander(f"Application \#{idx} | Necessity: {row['necessity_index']:.1f}"):
118
  col1, col2 = st.columns((1, 3))
119
  col1.metric("Necessity", f"{row['necessity_index']:.1f}")
120
  col1.metric("Urgency", f"{row['urgency_score']}")
 
122
  col1.metric("Vulnerability", f"{row['vulnerability_score']}")
123
  # Clean usage items
124
  usage_items = [item for item in row['Usage'] if item and item.lower() != 'none']
125
+ col2.markdown("**EXCERPT**")
126
+ col2.write(row[freeform_col])
127
  if usage_items:
128
+ col2.markdown("**EXTRACTED USAGE ITEMS:**")
129
  col2.write(", ".join(usage_items))
130
  else:
131
  col2.markdown("*No specific usage items extracted.*")
 
 
132
  # Shortlist checkbox
133
  st.checkbox(
134
  "Shortlist this application",