danielrosehill Claude committed on
Commit
2bedd25
·
1 Parent(s): b7f99ba

Fix country selector by loading data from local JSON file

Browse files

- Replace datasets library loading with direct JSON file loading
- Add data.json (32 MB) containing combined GVFD data with 104,564 records
- Update all functions to use correct column names (country, topic, value, iso_code)
- Simplify data loading and remove dependency on datasets library
- Fix get_countries() and get_categories() to properly extract from loaded data
- Update all visualization functions with correct column references
- Country selector now shows all 268 available countries/locations
- Enable Git LFS for JSON files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (4) hide show
  1. .gitattributes +1 -0
  2. app.py +65 -124
  3. data.json +3 -0
  4. requirements.txt +0 -1
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.json filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -2,16 +2,20 @@ import gradio as gr
2
  import pandas as pd
3
  import plotly.express as px
4
  import plotly.graph_objects as go
5
- from datasets import load_dataset
 
6
  import numpy as np
7
 
8
  # Load the dataset
9
  def load_data():
10
- """Load the GVFD dataset from Hugging Face"""
11
  try:
12
- dataset = load_dataset("danielrosehill/Global-Value-Factor-Database-Refactor-V2")
13
- # Assuming the main data is in the 'train' split - adjust if needed
14
- df = pd.DataFrame(dataset['train'])
 
 
 
15
  return df
16
  except Exception as e:
17
  print(f"Error loading dataset: {e}")
@@ -25,22 +29,19 @@ def get_countries():
25
  """Get sorted list of unique countries from the dataset"""
26
  if df.empty:
27
  return []
28
- # Assuming there's a 'country' column - adjust based on actual column name
29
- country_col = [col for col in df.columns if 'country' in col.lower()]
30
- if country_col:
31
- return sorted(df[country_col[0]].dropna().unique().tolist())
32
  return []
33
 
34
  def get_categories():
35
  """Get available categories from the dataset"""
36
- categories = [
37
- "Air Pollution",
38
- "Land Use and Conservation",
39
- "Waste Generation",
40
- "Water Consumption",
41
- "Water Pollution"
42
- ]
43
- return categories
44
 
45
  def filter_data(countries, categories, min_value=None, max_value=None):
46
  """Filter dataset based on user selections"""
@@ -51,22 +52,18 @@ def filter_data(countries, categories, min_value=None, max_value=None):
51
 
52
  # Filter by countries
53
  if countries and len(countries) > 0:
54
- country_col = [col for col in df.columns if 'country' in col.lower()][0]
55
- filtered_df = filtered_df[filtered_df[country_col].isin(countries)]
56
 
57
- # Filter by categories
58
  if categories and len(categories) > 0:
59
- category_col = [col for col in df.columns if 'category' in col.lower()]
60
- if category_col:
61
- filtered_df = filtered_df[filtered_df[category_col[0]].isin(categories)]
62
 
63
  # Filter by value range
64
- value_col = [col for col in df.columns if 'value' in col.lower() or 'factor' in col.lower()]
65
- if value_col and (min_value is not None or max_value is not None):
66
  if min_value is not None:
67
- filtered_df = filtered_df[filtered_df[value_col[0]] >= min_value]
68
  if max_value is not None:
69
- filtered_df = filtered_df[filtered_df[value_col[0]] <= max_value]
70
 
71
  return filtered_df
72
 
@@ -83,32 +80,17 @@ def create_bar_chart(countries, categories):
83
  )
84
  return fig
85
 
86
- # Identify columns
87
- country_col = [col for col in filtered_df.columns if 'country' in col.lower()][0]
88
- category_col = [col for col in filtered_df.columns if 'category' in col.lower()]
89
- value_col = [col for col in filtered_df.columns if 'value' in col.lower() or 'factor' in col.lower()][0]
90
-
91
- # Group by country and category
92
- if category_col:
93
- grouped = filtered_df.groupby([country_col, category_col[0]])[value_col].mean().reset_index()
94
- fig = px.bar(
95
- grouped,
96
- x=country_col,
97
- y=value_col,
98
- color=category_col[0],
99
- title="Value Factors by Country and Category",
100
- labels={value_col: "Value Factor (USD)", country_col: "Country"},
101
- barmode='group'
102
- )
103
- else:
104
- grouped = filtered_df.groupby(country_col)[value_col].mean().reset_index()
105
- fig = px.bar(
106
- grouped,
107
- x=country_col,
108
- y=value_col,
109
- title="Value Factors by Country",
110
- labels={value_col: "Value Factor (USD)", country_col: "Country"}
111
- )
112
 
113
  fig.update_layout(xaxis_tickangle=-45, height=600)
114
  return fig
@@ -126,32 +108,21 @@ def create_map_visualization(countries, categories):
126
  )
127
  return fig
128
 
129
- # Identify columns
130
- country_col = [col for col in filtered_df.columns if 'country' in col.lower()][0]
131
- value_col = [col for col in filtered_df.columns if 'value' in col.lower() or 'factor' in col.lower()][0]
132
- iso_col = [col for col in filtered_df.columns if 'iso' in col.lower() or 'code' in col.lower()]
133
-
134
  # Aggregate by country
135
- country_data = filtered_df.groupby(country_col)[value_col].mean().reset_index()
136
-
137
- # Try to use ISO codes if available
138
- if iso_col:
139
- iso_data = filtered_df.groupby(country_col)[iso_col[0]].first().reset_index()
140
- country_data = country_data.merge(iso_data, on=country_col)
141
- location_col = iso_col[0]
142
- locationmode = 'ISO-3'
143
- else:
144
- location_col = country_col
145
- locationmode = 'country names'
146
 
147
  fig = px.choropleth(
148
  country_data,
149
- locations=location_col,
150
- locationmode=locationmode,
151
- color=value_col,
152
- hover_name=country_col,
153
  title="Global Value Factors by Country",
154
- labels={value_col: "Avg Value Factor (USD)"},
155
  color_continuous_scale="Viridis"
156
  )
157
 
@@ -171,31 +142,17 @@ def create_comparison_chart(countries, categories):
171
  )
172
  return fig
173
 
174
- # Identify columns
175
- country_col = [col for col in filtered_df.columns if 'country' in col.lower()][0]
176
- category_col = [col for col in filtered_df.columns if 'category' in col.lower()]
177
- value_col = [col for col in filtered_df.columns if 'value' in col.lower() or 'factor' in col.lower()][0]
178
-
179
- if category_col:
180
- grouped = filtered_df.groupby([category_col[0], country_col])[value_col].mean().reset_index()
181
- fig = px.bar(
182
- grouped,
183
- x=category_col[0],
184
- y=value_col,
185
- color=country_col,
186
- title="Category Comparison Across Countries",
187
- labels={value_col: "Value Factor (USD)", category_col[0]: "Category"},
188
- barmode='group'
189
- )
190
- else:
191
- grouped = filtered_df.groupby(country_col)[value_col].mean().reset_index()
192
- fig = px.bar(
193
- grouped,
194
- x=country_col,
195
- y=value_col,
196
- title="Value Factors by Country",
197
- labels={value_col: "Value Factor (USD)", country_col: "Country"}
198
- )
199
 
200
  fig.update_layout(xaxis_tickangle=-45, height=600)
201
  return fig
@@ -213,28 +170,14 @@ def create_box_plot(countries, categories):
213
  )
214
  return fig
215
 
216
- # Identify columns
217
- country_col = [col for col in filtered_df.columns if 'country' in col.lower()][0]
218
- category_col = [col for col in filtered_df.columns if 'category' in col.lower()]
219
- value_col = [col for col in filtered_df.columns if 'value' in col.lower() or 'factor' in col.lower()][0]
220
-
221
- if category_col:
222
- fig = px.box(
223
- filtered_df,
224
- x=category_col[0],
225
- y=value_col,
226
- color=country_col,
227
- title="Distribution of Value Factors",
228
- labels={value_col: "Value Factor (USD)", category_col[0]: "Category"}
229
- )
230
- else:
231
- fig = px.box(
232
- filtered_df,
233
- x=country_col,
234
- y=value_col,
235
- title="Distribution of Value Factors by Country",
236
- labels={value_col: "Value Factor (USD)", country_col: "Country"}
237
- )
238
 
239
  fig.update_layout(xaxis_tickangle=-45, height=600)
240
  return fig
@@ -246,9 +189,7 @@ def get_summary_stats(countries, categories):
246
  if filtered_df.empty:
247
  return "No data available for the selected filters"
248
 
249
- value_col = [col for col in filtered_df.columns if 'value' in col.lower() or 'factor' in col.lower()][0]
250
-
251
- stats = filtered_df[value_col].describe()
252
 
253
  summary = f"""
254
  ### Summary Statistics
 
2
  import pandas as pd
3
  import plotly.express as px
4
  import plotly.graph_objects as go
5
+ import json
6
+ import os
7
  import numpy as np
8
 
9
  # Load the dataset
10
  def load_data():
11
+ """Load the GVFD dataset from local JSON file"""
12
  try:
13
+ json_path = os.path.join(os.path.dirname(__file__), 'data.json')
14
+ with open(json_path, 'r') as f:
15
+ data = json.load(f)
16
+ # Extract records from the JSON structure
17
+ records = data.get('records', [])
18
+ df = pd.DataFrame(records)
19
  return df
20
  except Exception as e:
21
  print(f"Error loading dataset: {e}")
 
29
  """Get sorted list of unique countries from the dataset"""
30
  if df.empty:
31
  return []
32
+ # The column is named 'country' in the JSON data
33
+ if 'country' in df.columns:
34
+ return sorted(df['country'].dropna().unique().tolist())
 
35
  return []
36
 
37
  def get_categories():
38
  """Get available categories from the dataset"""
39
+ if df.empty:
40
+ return []
41
+ # Get unique topics from the data (topic column contains the categories)
42
+ if 'topic' in df.columns:
43
+ return sorted(df['topic'].dropna().unique().tolist())
44
+ return []
 
 
45
 
46
  def filter_data(countries, categories, min_value=None, max_value=None):
47
  """Filter dataset based on user selections"""
 
52
 
53
  # Filter by countries
54
  if countries and len(countries) > 0:
55
+ filtered_df = filtered_df[filtered_df['country'].isin(countries)]
 
56
 
57
+ # Filter by categories (using 'topic' column)
58
  if categories and len(categories) > 0:
59
+ filtered_df = filtered_df[filtered_df['topic'].isin(categories)]
 
 
60
 
61
  # Filter by value range
62
+ if min_value is not None or max_value is not None:
 
63
  if min_value is not None:
64
+ filtered_df = filtered_df[filtered_df['value'] >= min_value]
65
  if max_value is not None:
66
+ filtered_df = filtered_df[filtered_df['value'] <= max_value]
67
 
68
  return filtered_df
69
 
 
80
  )
81
  return fig
82
 
83
+ # Group by country and topic (category)
84
+ grouped = filtered_df.groupby(['country', 'topic'])['value'].mean().reset_index()
85
+ fig = px.bar(
86
+ grouped,
87
+ x='country',
88
+ y='value',
89
+ color='topic',
90
+ title="Value Factors by Country and Category",
91
+ labels={'value': "Value Factor (USD)", 'country': "Country", 'topic': "Category"},
92
+ barmode='group'
93
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  fig.update_layout(xaxis_tickangle=-45, height=600)
96
  return fig
 
108
  )
109
  return fig
110
 
 
 
 
 
 
111
  # Aggregate by country
112
+ country_data = filtered_df.groupby('country')['value'].mean().reset_index()
113
+
114
+ # Get ISO codes for the map
115
+ iso_data = filtered_df.groupby('country')['iso_code'].first().reset_index()
116
+ country_data = country_data.merge(iso_data, on='country')
 
 
 
 
 
 
117
 
118
  fig = px.choropleth(
119
  country_data,
120
+ locations='iso_code',
121
+ locationmode='ISO-3',
122
+ color='value',
123
+ hover_name='country',
124
  title="Global Value Factors by Country",
125
+ labels={'value': "Avg Value Factor (USD)"},
126
  color_continuous_scale="Viridis"
127
  )
128
 
 
142
  )
143
  return fig
144
 
145
+ # Group by topic (category) and country
146
+ grouped = filtered_df.groupby(['topic', 'country'])['value'].mean().reset_index()
147
+ fig = px.bar(
148
+ grouped,
149
+ x='topic',
150
+ y='value',
151
+ color='country',
152
+ title="Category Comparison Across Countries",
153
+ labels={'value': "Value Factor (USD)", 'topic': "Category"},
154
+ barmode='group'
155
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  fig.update_layout(xaxis_tickangle=-45, height=600)
158
  return fig
 
170
  )
171
  return fig
172
 
173
+ fig = px.box(
174
+ filtered_df,
175
+ x='topic',
176
+ y='value',
177
+ color='country',
178
+ title="Distribution of Value Factors",
179
+ labels={'value': "Value Factor (USD)", 'topic': "Category"}
180
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
  fig.update_layout(xaxis_tickangle=-45, height=600)
183
  return fig
 
189
  if filtered_df.empty:
190
  return "No data available for the selected filters"
191
 
192
+ stats = filtered_df['value'].describe()
 
 
193
 
194
  summary = f"""
195
  ### Summary Statistics
data.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ee86343b07d7781ed0d742c5fed758df71ef6ba8cfd28a0686ed2bf7be2c815
3
+ size 33633568
requirements.txt CHANGED
@@ -1,5 +1,4 @@
1
  gradio==5.49.1
2
  pandas>=2.0.0
3
  plotly>=5.18.0
4
- datasets>=2.14.0
5
  numpy>=1.24.0
 
1
  gradio==5.49.1
2
  pandas>=2.0.0
3
  plotly>=5.18.0
 
4
  numpy>=1.24.0