ofermend committed on
Commit
0e2da72
·
1 Parent(s): 7cd85bf
Files changed (2) hide show
  1. app/app.py +22 -5
  2. app/app_utils.py +75 -0
app/app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import pandas as pd
3
  import plotly.graph_objects as go
4
 
5
- from app_utils import load_results, visualize_leaderboard
6
 
7
  results_df = load_results()
8
 
@@ -20,11 +20,15 @@ def leaderboard(
20
  filter_models_by_name: str = "",
21
  high_ar_only: bool = False,
22
  size_filter: str = "all",
23
- access_filter: str = "all"
 
24
  ):
25
  """Filter and display the leaderboard."""
26
  df = results_df.copy()
27
 
 
 
 
28
  # Filter by answer rate if toggle is on
29
  if high_ar_only:
30
  df = df[df["Answer %"] >= 95]
@@ -76,6 +80,11 @@ with gr.Blocks(
76
  }
77
  footer { display: none !important; }
78
  .modebar { display: none !important; }
 
 
 
 
 
79
  """
80
  ) as demo:
81
  gr.HTML(
@@ -101,12 +110,19 @@ with gr.Blocks(
101
  size_filter = gr.Radio(
102
  choices=["all", "small", "large"],
103
  value="all",
104
- label="Model size"
 
105
  )
106
  access_filter = gr.Radio(
107
  choices=["all", "commercial", "open"],
108
  value="all",
109
- label="Model type"
 
 
 
 
 
 
110
  )
111
 
112
  with gr.Row():
@@ -116,7 +132,7 @@ with gr.Blocks(
116
  max_height=500
117
  )
118
 
119
- inputs = [filter_input, high_ar_toggle, size_filter, access_filter]
120
  outputs = [plot_output, table_output]
121
 
122
  # Load initial data on page load
@@ -127,6 +143,7 @@ with gr.Blocks(
127
  high_ar_toggle.change(fn=leaderboard, inputs=inputs, outputs=outputs)
128
  size_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
129
  access_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
 
130
 
131
 
132
  if __name__ == "__main__":
 
2
  import pandas as pd
3
  import plotly.graph_objects as go
4
 
5
+ from app_utils import load_results, visualize_leaderboard, apply_data_slice, DATA_SLICE_MAP
6
 
7
  results_df = load_results()
8
 
 
20
  filter_models_by_name: str = "",
21
  high_ar_only: bool = False,
22
  size_filter: str = "all",
23
+ access_filter: str = "all",
24
+ data_slice: str = "Overall"
25
  ):
26
  """Filter and display the leaderboard."""
27
  df = results_df.copy()
28
 
29
+ # Apply data slice first (recalculates metrics and re-sorts)
30
+ df = apply_data_slice(df, data_slice)
31
+
32
  # Filter by answer rate if toggle is on
33
  if high_ar_only:
34
  df = df[df["Answer %"] >= 95]
 
80
  }
81
  footer { display: none !important; }
82
  .modebar { display: none !important; }
83
+ .horizontal-radio .wrap {
84
+ display: flex !important;
85
+ flex-direction: row !important;
86
+ gap: 8px !important;
87
+ }
88
  """
89
  ) as demo:
90
  gr.HTML(
 
110
  size_filter = gr.Radio(
111
  choices=["all", "small", "large"],
112
  value="all",
113
+ label="Model size",
114
+ elem_classes=["horizontal-radio"]
115
  )
116
  access_filter = gr.Radio(
117
  choices=["all", "commercial", "open"],
118
  value="all",
119
+ label="Model type",
120
+ elem_classes=["horizontal-radio"]
121
+ )
122
+ data_slice = gr.Dropdown(
123
+ choices=list(DATA_SLICE_MAP.keys()),
124
+ value="Overall",
125
+ label="Data Slice"
126
  )
127
 
128
  with gr.Row():
 
132
  max_height=500
133
  )
134
 
135
+ inputs = [filter_input, high_ar_toggle, size_filter, access_filter, data_slice]
136
  outputs = [plot_output, table_output]
137
 
138
  # Load initial data on page load
 
143
  high_ar_toggle.change(fn=leaderboard, inputs=inputs, outputs=outputs)
144
  size_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
145
  access_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
146
+ data_slice.change(fn=leaderboard, inputs=inputs, outputs=outputs)
147
 
148
 
149
  if __name__ == "__main__":
app/app_utils.py CHANGED
@@ -57,6 +57,8 @@ def extract_info_from_result_file(result_file):
57
  "Avg Summary Words": info["results"]["average_summary_length"]["average_summary_length"],
58
  "Model Size": model_size,
59
  "Accessibility": accessibility,
 
 
60
  }
61
  return result
62
 
@@ -110,6 +112,79 @@ def load_results(results_dir: str = "/tmp/hhem_results"):
110
 
111
  return results_df
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  # %%
114
  def determine_font_size(LLM: str, hallucination_percent: float) -> int:
115
  # based on both hallucination percent and LLM name, determine font size
 
57
  "Avg Summary Words": info["results"]["average_summary_length"]["average_summary_length"],
58
  "Model Size": model_size,
59
  "Accessibility": accessibility,
60
+ "category_results": info.get("category_results", {}),
61
+ "text_complexity_results": info.get("text_complexity_results", {}),
62
  }
63
  return result
64
 
 
112
 
113
  return results_df
114
 
115
+
116
# Mapping from dropdown display values to internal keys.
# Each value is a (slice_type, slice_key) pair: slice_type selects which
# per-model results column is consulted, slice_key is the entry inside it.
DATA_SLICE_MAP = {
    "Overall": ("overall", None),
    "Low Complexity": ("complexity", "low_complexity_text"),
    "High Complexity": ("complexity", "high_complexity_text"),
    "Business": ("category", "business"),
    "Education": ("category", "education"),
    "Finance": ("category", "finance"),
    "Law": ("category", "law"),
    "Medicine": ("category", "medicine"),
    "Politics": ("category", "politics"),
    "Science": ("category", "science"),
    "Sports": ("category", "sports"),
    "Stocks": ("category", "stocks"),
    "Technology": ("category", "technology"),
}

# (leaderboard column, key inside the per-slice result dict) pairs that are
# overwritten when a slice is applied.
_SLICE_METRICS = (
    ("Hallucination %", "hallucination_rate"),
    ("Answer %", "answer_rate"),
    ("Avg Summary Words", "average_summary_length"),
)


def apply_data_slice(df: pd.DataFrame, slice_name: str) -> pd.DataFrame:
    """Apply a data slice filter to recalculate metrics.

    Args:
        df: DataFrame with category_results and text_complexity_results
            columns, each cell holding a dict keyed by slice key.
        slice_name: Display name of the slice (e.g., "Overall",
            "Low Complexity", "Business").

    Returns:
        ``df`` itself (unchanged, not copied) when ``slice_name`` is unknown
        or maps to the overall slice. Otherwise a new DataFrame containing
        only the rows that have a non-empty result dict for the slice, with
        "Hallucination %", "Answer %" and "Avg Summary Words" replaced by the
        slice values (rounded to 3 decimals) and sorted by "Hallucination %"
        ascending. The input frame is never modified.
    """
    if slice_name not in DATA_SLICE_MAP:
        return df

    slice_type, slice_key = DATA_SLICE_MAP[slice_name]
    if slice_type == "overall":
        return df

    # Any non-complexity slice type reads from the category results.
    if slice_type == "complexity":
        source_column = "text_complexity_results"
    else:
        source_column = "category_results"

    # A missing source column means no row has data for the slice.
    if source_column in df.columns:
        cells = df[source_column].tolist()
    else:
        cells = [None] * len(df)

    # Work by position rather than by index label so that frames with
    # duplicate index labels are filtered and updated row-for-row.
    keep_positions = []
    metric_values = {column: [] for column, _ in _SLICE_METRICS}
    for position, cell in enumerate(cells):
        # Cells may be NaN/None (not a dict) when a result file had no
        # per-slice data; treat those the same as an empty dict.
        slice_data = cell.get(slice_key) if isinstance(cell, dict) else None
        if not isinstance(slice_data, dict) or not slice_data:
            continue

        keep_positions.append(position)
        for column, key in _SLICE_METRICS:
            # NOTE: a metric absent from the slice data falls back to 0,
            # matching the previous behaviour.
            metric_values[column].append(round(slice_data.get(key, 0), 3))

    # Filter to only rows with data for this slice, then overwrite metrics.
    result_df = df.iloc[keep_positions].copy()
    for column, values in metric_values.items():
        result_df[column] = values

    # Re-sort by hallucination rate; a stable sort keeps tied rows in their
    # original relative order so the leaderboard is deterministic.
    return result_df.sort_values(
        by="Hallucination %", ascending=True, kind="mergesort"
    )
187
+
188
  # %%
189
  def determine_font_size(LLM: str, hallucination_percent: float) -> int:
190
  # based on both hallucination percent and LLM name, determine font size