19arjun89 committed on
Commit
7f8a9be
·
verified ·
1 Parent(s): 8b9d4ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -107
app.py CHANGED
@@ -1,29 +1,39 @@
1
-
2
  import os
 
3
  from collections import Counter
4
 
5
  import gradio as gr
6
  import pandas as pd
7
  import plotly.express as px
8
- import plotly.graph_objects as go
9
  import pycountry
10
  from datasets import load_dataset
11
 
12
- # === Config ===
 
 
 
13
  VISITS_URL = os.getenv(
14
  "VISITS_URL",
15
  "https://huggingface.co/datasets/19arjun89/ai_recruiting_agent_usage/resolve/main/usage/visits_enriched.jsonl",
16
  )
17
 
18
- # Optional: You can keep this env var, but this version uses Plotly Geo (no Mapbox needed)
19
  MAPBOX_TOKEN = os.getenv("MAPBOX_TOKEN", "").strip()
20
 
21
- # Safety cap for very large jsonl files
 
 
 
 
 
 
22
  MAX_ROWS = int(os.getenv("MAX_ROWS", "500000"))
23
 
24
 
 
 
 
25
  def normalize_country_name(country: str | None) -> str | None:
26
- """Normalize country field; return None for empty/Unknown."""
27
  if not country or not isinstance(country, str):
28
  return None
29
  c = country.strip()
@@ -33,6 +43,7 @@ def normalize_country_name(country: str | None) -> str | None:
33
 
34
 
35
  def iso2_to_iso3(country_code: str | None) -> str | None:
 
36
  if not country_code or not isinstance(country_code, str):
37
  return None
38
  c2 = country_code.strip().upper()
@@ -46,7 +57,6 @@ def iso2_to_iso3(country_code: str | None) -> str | None:
46
 
47
 
48
  def load_rows_streaming():
49
- """Stream rows from visits.jsonl without loading the entire file into memory."""
50
  ds = load_dataset(
51
  "json",
52
  data_files=VISITS_URL,
@@ -59,152 +69,122 @@ def load_rows_streaming():
59
  break
60
 
61
 
62
- def build_report():
63
- """
64
- Aggregate usage events by country and render:
65
- - Choropleth map with labels (country + usage events)
66
- - Table with country name + usage events
67
- """
68
 
69
- # Count by country name (table)
70
- country_counts = Counter()
71
 
72
- # Count by ISO3 (map), also store a display name per ISO3
73
- iso3_counts = Counter()
74
- iso3_to_name = {}
 
 
 
 
 
75
 
 
 
 
76
  scanned = 0
77
- mappable = 0
78
  skipped_session_start = 0
79
  missing_country = 0
80
  invalid_country_code = 0
81
 
 
 
 
 
 
82
  for row in load_rows_streaming():
83
  scanned += 1
84
-
85
- # 1) Skip session starts
86
  event_type = str(row.get("event", "") or "").strip().lower()
87
  if event_type == "session_start":
88
  skipped_session_start += 1
89
  continue
90
-
91
- # 2) Missing country
92
  country = normalize_country_name(row.get("final_country"))
93
  if not country:
94
  missing_country += 1
95
  continue
96
-
97
- # 3) Invalid / missing country code
98
  iso3 = iso2_to_iso3(row.get("final_country_code"))
99
  if not iso3:
100
  invalid_country_code += 1
101
  continue
102
 
 
103
  country_counts[country] += 1
104
-
105
  iso3_counts[iso3] += 1
106
  iso3_to_name.setdefault(iso3, country)
107
- mappable += 1
108
-
109
 
110
- # Table dataframe (country name + usage events)
111
  table_df = (
112
  pd.DataFrame([{"country": k, "usage events": v} for k, v in country_counts.items()])
113
  .sort_values("usage events", ascending=False)
114
  .reset_index(drop=True)
115
  )
116
 
117
- # Map dataframe
118
  map_df = (
119
  pd.DataFrame(
120
  [
121
- {"iso3": iso3, "country": iso3_to_name.get(iso3, iso3), "usage events": count}
122
- for iso3, count in iso3_counts.items()
123
  ]
124
  )
125
  .sort_values("usage events", ascending=False)
126
  .reset_index(drop=True)
127
  )
128
 
 
 
 
 
 
 
 
 
 
129
  if map_df.empty:
130
  fig = px.scatter(title="No mappable data found")
131
- fig.update_layout(height=720, margin=dict(l=0, r=0, t=40, b=0))
132
  summary = (
133
- f"Rows scanned: {scanned:,} • Countries (table): {len(table_df):,} • "
 
 
 
 
 
134
  f"Total usage events: {int(table_df['usage events'].sum()) if len(table_df) else 0:,}"
135
  )
136
  return fig, table_df.head(50), summary
137
 
138
- # Choropleth (built-in polygons; reliable)
139
- fig = px.choropleth(
 
 
140
  map_df,
 
 
141
  locations="iso3",
142
  color="usage events",
143
- projection="natural earth",
144
- title=None,
 
 
 
 
145
  )
146
 
 
147
  fig.update_layout(
148
- height=720,
149
  margin=dict(l=0, r=0, t=0, b=0),
150
- paper_bgcolor="white",
151
- )
152
-
153
- fig.update_geos(
154
- showframe=False,
155
- showcoastlines=False,
156
- showcountries=True,
157
- countrycolor="rgba(0,0,0,0.25)",
158
- bgcolor="rgba(0,0,0,0)",
159
- domain=dict(x=[0, 1], y=[0, 1]),
160
- fitbounds="locations",
161
- )
162
-
163
- # Labels overlay (always visible)
164
- # Tip: keep labels to top N to avoid clutter if you grow beyond ~30 countries
165
- labels_df = map_df.copy()
166
- labels_df["label"] = labels_df["country"] + "<br>" + labels_df["usage events"].astype(str)
167
-
168
- # ===============================
169
- # Label shadow (dark background)
170
- # ===============================
171
- fig.add_trace(
172
- go.Scattergeo(
173
- locations=labels_df["iso3"],
174
- locationmode="ISO-3",
175
- text=labels_df["label"],
176
- mode="text",
177
- textfont=dict(
178
- size=13, # slightly bigger
179
- color="black",
180
- family="Arial",
181
- ),
182
- hoverinfo="skip",
183
- showlegend=False,
184
- )
185
- )
186
-
187
- # ===============================
188
- # Main label (white foreground)
189
- # ===============================
190
- fig.add_trace(
191
- go.Scattergeo(
192
- locations=labels_df["iso3"],
193
- locationmode="ISO-3",
194
- text=labels_df["label"],
195
- mode="text",
196
- textfont=dict(
197
- size=11,
198
- color="white",
199
- family="Arial",
200
- ),
201
- hoverinfo="skip",
202
- showlegend=False,
203
- )
204
  )
205
 
206
-
207
- # Title
208
  fig.add_annotation(
209
  text="Usage Events by Country",
210
  x=0.01,
@@ -217,32 +197,34 @@ def build_report():
217
  font=dict(size=20),
218
  )
219
 
220
- accounted = (
221
- skipped_session_start
222
- + missing_country
223
- + invalid_country_code
224
- + mappable
225
- )
226
-
227
  summary = (
228
  f"Rows scanned: {scanned:,}\n"
229
  f"- Session starts skipped: {skipped_session_start:,}\n"
230
  f"- Missing country: {missing_country:,}\n"
231
  f"- Invalid country code: {invalid_country_code:,}\n"
232
- f"- Rows mappable: {mappable:,}\n\n"
233
  f"Accounted rows: {accounted:,} / {scanned:,}\n"
234
  f"Countries (table): {len(table_df):,}\n"
235
- f"Countries (map): {len(map_df):,}\n"
236
  f"Total usage events: {int(table_df['usage events'].sum()) if len(table_df) else 0:,}"
237
  )
238
 
239
  return fig, table_df.head(50), summary
240
 
241
 
 
 
 
242
  with gr.Blocks(title="AI Recruiting Agent — Usage Map") as demo:
243
  gr.Markdown(
244
- "# AI Recruiting Agent — Usage by Country\n"
245
- "This Space reads **only** `usage/visits.jsonl` and plots **usage events** by country."
 
 
 
 
 
246
  )
247
 
248
  run = gr.Button("Generate map")
@@ -257,4 +239,3 @@ with gr.Blocks(title="AI Recruiting Agent — Usage Map") as demo:
257
  )
258
 
259
  demo.launch()
260
-
 
 
1
  import os
2
+ import json
3
  from collections import Counter
4
 
5
  import gradio as gr
6
  import pandas as pd
7
  import plotly.express as px
 
8
  import pycountry
9
  from datasets import load_dataset
10
 
11
+
12
+ # =========================
13
+ # Config
14
+ # =========================
15
  VISITS_URL = os.getenv(
16
  "VISITS_URL",
17
  "https://huggingface.co/datasets/19arjun89/ai_recruiting_agent_usage/resolve/main/usage/visits_enriched.jsonl",
18
  )
19
 
20
+ # Set this as a HF Space SECRET named MAPBOX_TOKEN
21
  MAPBOX_TOKEN = os.getenv("MAPBOX_TOKEN", "").strip()
22
 
23
+ # Path to your GeoJSON (commit into the Space repo)
24
+ GEOJSON_PATH = os.getenv("GEOJSON_PATH", "countries.geojson")
25
+
26
+ # IMPORTANT: Set this to match the property name inside your GeoJSON features.
27
+ # Common values: "properties.ISO_A3" or "properties.ADM0_A3"
28
+ GEOJSON_FEATURE_ID_KEY = os.getenv("GEOJSON_FEATURE_ID_KEY", "properties.ISO_A3")
29
+
30
  MAX_ROWS = int(os.getenv("MAX_ROWS", "500000"))
31
 
32
 
33
+ # =========================
34
+ # Helpers
35
+ # =========================
36
  def normalize_country_name(country: str | None) -> str | None:
 
37
  if not country or not isinstance(country, str):
38
  return None
39
  c = country.strip()
 
43
 
44
 
45
  def iso2_to_iso3(country_code: str | None) -> str | None:
46
+ """Convert ISO-2 -> ISO-3 for map matching."""
47
  if not country_code or not isinstance(country_code, str):
48
  return None
49
  c2 = country_code.strip().upper()
 
57
 
58
 
59
  def load_rows_streaming():
 
60
  ds = load_dataset(
61
  "json",
62
  data_files=VISITS_URL,
 
69
  break
70
 
71
 
72
+ def load_geojson(path: str) -> dict:
73
+ with open(path, "r", encoding="utf-8") as f:
74
+ return json.load(f)
 
 
 
75
 
 
 
76
 
77
+ # =========================
78
+ # Main report builder
79
+ # =========================
80
+ def build_report():
81
+ if not MAPBOX_TOKEN:
82
+ # A missing token is not fatal: the "carto-positron" style used below is one of
83
+ # Plotly's token-free base maps, so the figure is still built without MAPBOX_TOKEN.
84
+ pass
85
 
86
+ countries_geojson = load_geojson(GEOJSON_PATH)
87
+
88
+ # Counters for clean reconciliation
89
  scanned = 0
 
90
  skipped_session_start = 0
91
  missing_country = 0
92
  invalid_country_code = 0
93
 
94
+ # Table (country name) and map (iso3)
95
+ country_counts = Counter()
96
+ iso3_counts = Counter()
97
+ iso3_to_name = {}
98
+
99
  for row in load_rows_streaming():
100
  scanned += 1
101
+
 
102
  event_type = str(row.get("event", "") or "").strip().lower()
103
  if event_type == "session_start":
104
  skipped_session_start += 1
105
  continue
106
+
 
107
  country = normalize_country_name(row.get("final_country"))
108
  if not country:
109
  missing_country += 1
110
  continue
111
+
 
112
  iso3 = iso2_to_iso3(row.get("final_country_code"))
113
  if not iso3:
114
  invalid_country_code += 1
115
  continue
116
 
117
+ # Count it
118
  country_counts[country] += 1
 
119
  iso3_counts[iso3] += 1
120
  iso3_to_name.setdefault(iso3, country)
 
 
121
 
122
+ # Build table dataframe
123
  table_df = (
124
  pd.DataFrame([{"country": k, "usage events": v} for k, v in country_counts.items()])
125
  .sort_values("usage events", ascending=False)
126
  .reset_index(drop=True)
127
  )
128
 
129
+ # Build map dataframe
130
  map_df = (
131
  pd.DataFrame(
132
  [
133
+ {"iso3": iso3, "country": iso3_to_name.get(iso3, iso3), "usage events": cnt}
134
+ for iso3, cnt in iso3_counts.items()
135
  ]
136
  )
137
  .sort_values("usage events", ascending=False)
138
  .reset_index(drop=True)
139
  )
140
 
141
+ # Reconciliation
142
+ rows_mappable = int(map_df["usage events"].sum()) # note: this is TOTAL events, not rows
143
+ mappable_rows_count = int(sum(iso3_counts.values())) # count of rows after filters (events counted)
144
+ accounted = skipped_session_start + missing_country + invalid_country_code + mappable_rows_count
145
+
146
+ # Every row that passes the filters counts as exactly one usage event, so
147
+ # mappable_rows_count, rows_mappable, and the table_df "usage events" sum are the same number.
148
+
149
+ # Map figure
150
  if map_df.empty:
151
  fig = px.scatter(title="No mappable data found")
152
+ fig.update_layout(height=740, margin=dict(l=0, r=0, t=40, b=0))
153
  summary = (
154
+ f"Rows scanned: {scanned:,}\n"
155
+ f"- Session starts skipped: {skipped_session_start:,}\n"
156
+ f"- Missing country: {missing_country:,}\n"
157
+ f"- Invalid country code: {invalid_country_code:,}\n\n"
158
+ f"Accounted rows: {accounted:,} / {scanned:,}\n"
159
+ f"Countries (table): {len(table_df):,}\n"
160
  f"Total usage events: {int(table_df['usage events'].sum()) if len(table_df) else 0:,}"
161
  )
162
  return fig, table_df.head(50), summary
163
 
164
+ # Mapbox choropleth using GeoJSON
165
+ px.set_mapbox_access_token(MAPBOX_TOKEN)
166
+
167
+ fig = px.choropleth_mapbox(
168
  map_df,
169
+ geojson=countries_geojson,
170
+ featureidkey=GEOJSON_FEATURE_ID_KEY,
171
  locations="iso3",
172
  color="usage events",
173
+ hover_name="country",
174
+ hover_data={"usage events": True, "iso3": True},
175
+ mapbox_style="carto-positron", # clean, modern
176
+ opacity=0.75,
177
+ zoom=0.75,
178
+ center={"lat": 15, "lon": 0},
179
  )
180
 
181
+ # Full-bleed layout
182
  fig.update_layout(
183
+ height=740,
184
  margin=dict(l=0, r=0, t=0, b=0),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  )
186
 
187
+ # Dashboard title
 
188
  fig.add_annotation(
189
  text="Usage Events by Country",
190
  x=0.01,
 
197
  font=dict(size=20),
198
  )
199
 
200
+ # Summary text (clean math)
 
 
 
 
 
 
201
  summary = (
202
  f"Rows scanned: {scanned:,}\n"
203
  f"- Session starts skipped: {skipped_session_start:,}\n"
204
  f"- Missing country: {missing_country:,}\n"
205
  f"- Invalid country code: {invalid_country_code:,}\n"
206
+ f"- Rows mapped: {mappable_rows_count:,}\n\n"
207
  f"Accounted rows: {accounted:,} / {scanned:,}\n"
208
  f"Countries (table): {len(table_df):,}\n"
209
+ f"Countries (map): {map_df['iso3'].nunique():,}\n"
210
  f"Total usage events: {int(table_df['usage events'].sum()) if len(table_df) else 0:,}"
211
  )
212
 
213
  return fig, table_df.head(50), summary
214
 
215
 
216
+ # =========================
217
+ # UI
218
+ # =========================
219
  with gr.Blocks(title="AI Recruiting Agent — Usage Map") as demo:
220
  gr.Markdown(
221
+ "# AI Recruiting Agent — Usage by Country (Mapbox)\n"
222
+ "This Space reads **only** `visits_enriched.jsonl`, excludes `event=session_start`, "
223
+ "and plots **usage events** by country.\n\n"
224
+ "**Setup:**\n"
225
+ "- Add Space Secret `MAPBOX_TOKEN`\n"
226
+ "- Commit `countries.geojson`\n"
227
+ "- If your GeoJSON ISO3 field isn’t `ISO_A3`, set env var `GEOJSON_FEATURE_ID_KEY`\n"
228
  )
229
 
230
  run = gr.Button("Generate map")
 
239
  )
240
 
241
  demo.launch()