19arjun89 committed on
Commit
c081289
·
verified ·
1 Parent(s): 08fd623

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -46
app.py CHANGED
@@ -7,20 +7,21 @@ import plotly.express as px
7
  import pycountry
8
  from datasets import load_dataset
9
 
10
-
11
  VISITS_URL = os.getenv(
12
  "VISITS_URL",
13
  "https://huggingface.co/datasets/19arjun89/ai_recruiting_agent_usage/resolve/main/usage/visits.jsonl",
14
  )
15
 
16
- # Optional: set MAPBOX_TOKEN in Space Secrets for best-looking map
17
  MAPBOX_TOKEN = os.getenv("MAPBOX_TOKEN", "").strip()
18
 
19
- # Safety cap in case jsonl grows huge
20
  MAX_ROWS = int(os.getenv("MAX_ROWS", "500000"))
21
 
22
 
23
  def normalize_country_name(country: str | None) -> str | None:
 
24
  if not country or not isinstance(country, str):
25
  return None
26
  c = country.strip()
@@ -29,7 +30,7 @@ def normalize_country_name(country: str | None) -> str | None:
29
  return c
30
 
31
 
32
- def country_to_iso3(country_name: str) -> str | None:
33
  """Convert country name -> ISO3 for mapping."""
34
  try:
35
  rec = pycountry.countries.search_fuzzy(country_name)[0]
@@ -39,6 +40,7 @@ def country_to_iso3(country_name: str) -> str | None:
39
 
40
 
41
  def load_rows_streaming():
 
42
  ds = load_dataset(
43
  "json",
44
  data_files=VISITS_URL,
@@ -52,14 +54,23 @@ def load_rows_streaming():
52
 
53
 
54
  def build_report(url_contains: str):
 
 
 
 
 
55
  url_contains = (url_contains or "").strip().lower()
56
 
57
- # Count by ISO3 for map + by country name for table
58
- iso3_counts = Counter()
59
  country_counts = Counter()
60
 
 
 
 
 
61
  scanned = 0
62
- mapped = 0
 
63
 
64
  for row in load_rows_streaming():
65
  scanned += 1
@@ -67,104 +78,135 @@ def build_report(url_contains: str):
67
  space_url = str(row.get("space_url", "") or "")
68
  if url_contains and url_contains not in space_url.lower():
69
  continue
 
70
 
71
  country = normalize_country_name(row.get("country"))
72
  if not country:
73
  continue
74
 
75
- iso3 = country_to_iso3(country)
 
 
 
 
76
  if not iso3:
77
- # If pycountry can't resolve (e.g., odd strings), skip for map,
78
- # but still keep it in the table if you want. Here we keep it.
79
- country_counts[country] += 1
80
  continue
81
 
82
- mapped += 1
83
  iso3_counts[iso3] += 1
84
- country_counts[country] += 1
 
85
 
86
- # Build table (country name, hits)
87
  table_df = (
88
  pd.DataFrame([{"country": k, "hits": v} for k, v in country_counts.items()])
89
  .sort_values("hits", ascending=False)
90
  .reset_index(drop=True)
91
  )
92
 
93
- # Build map dataframe (ISO3, hits)
94
  map_df = (
95
- pd.DataFrame([{"iso3": k, "hits": v} for k, v in iso3_counts.items()])
 
 
 
 
 
96
  .sort_values("hits", ascending=False)
97
  .reset_index(drop=True)
98
  )
99
 
100
- # Choose best map option
101
- if len(map_df) == 0:
102
- fig = px.choropleth(
103
- pd.DataFrame({"iso3": [], "hits": []}),
104
- locations="iso3",
105
- color="hits",
106
- projection="natural earth",
107
- title="Hits by Country",
108
  )
109
- summary = f"No mappable rows found. Rows scanned: {scanned:,}"
110
  return fig, table_df.head(50), summary
111
 
112
  if MAPBOX_TOKEN:
113
- # Higher-quality choropleth with Mapbox
114
  px.set_mapbox_access_token(MAPBOX_TOKEN)
 
115
  fig = px.choropleth_mapbox(
116
  map_df,
117
  locations="iso3",
118
  color="hits",
119
- hover_name="iso3",
 
120
  color_continuous_scale="Viridis",
121
  mapbox_style="carto-positron",
122
- zoom=0.6,
123
  center={"lat": 15, "lon": 0},
124
- opacity=0.65,
125
- title="Hits by Country (Mapbox)",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  )
127
- fig.update_layout(margin={"r": 0, "t": 50, "l": 0, "b": 0})
128
  else:
129
- # Fallback: built-in world map (no token needed)
130
  fig = px.choropleth(
131
  map_df,
132
  locations="iso3",
133
  color="hits",
134
- projection="natural earth",
135
  title="Hits by Country",
136
- hover_name="iso3",
 
 
 
 
 
 
 
 
 
 
 
 
137
  )
138
 
139
  summary = (
140
- f"Rows scanned: {scanned:,} • "
141
- f"Rows mapped: {mapped:,} • "
142
- f"Countries (table): {len(table_df):,} • "
143
- f"Countries (map): {len(map_df):,} • "
144
- f"Total hits: {int(table_df['hits'].sum()) if len(table_df) else 0:,}"
145
  )
146
 
147
  return fig, table_df.head(50), summary
148
 
149
 
150
- with gr.Blocks(title="AI Recruiting Agent Usage Map") as demo:
151
  gr.Markdown(
152
  "# AI Recruiting Agent — Usage by Country\n"
153
- "Loads **only** `usage/visits.jsonl` and visualizes hits by country.\n\n"
154
- "- Table uses **country names**\n"
155
- "- Map uses ISO3 internally for plotting"
156
  )
157
 
158
  url_contains = gr.Textbox(
159
  label="Space URL contains (optional)",
160
- placeholder="AI_Recruiting_Agent",
161
  value="AI_Recruiting_Agent",
 
162
  )
163
 
164
  run = gr.Button("Generate map")
165
  summary = gr.Markdown()
166
- plot = gr.Plot()
167
- table = gr.Dataframe(label="Top countries (by name)", interactive=False)
168
 
169
  run.click(
170
  fn=build_report,
 
7
  import pycountry
8
  from datasets import load_dataset
9
 
10
+ # === Config ===
11
  VISITS_URL = os.getenv(
12
  "VISITS_URL",
13
  "https://huggingface.co/datasets/19arjun89/ai_recruiting_agent_usage/resolve/main/usage/visits.jsonl",
14
  )
15
 
16
+ # Add this as a Hugging Face Space SECRET named MAPBOX_TOKEN
17
  MAPBOX_TOKEN = os.getenv("MAPBOX_TOKEN", "").strip()
18
 
19
+ # Safety cap for very large jsonl files
20
  MAX_ROWS = int(os.getenv("MAX_ROWS", "500000"))
21
 
22
 
23
  def normalize_country_name(country: str | None) -> str | None:
24
+ """Normalize country field; return None for empty/Unknown."""
25
  if not country or not isinstance(country, str):
26
  return None
27
  c = country.strip()
 
30
  return c
31
 
32
 
33
+ def country_name_to_iso3(country_name: str) -> str | None:
34
  """Convert country name -> ISO3 for mapping."""
35
  try:
36
  rec = pycountry.countries.search_fuzzy(country_name)[0]
 
40
 
41
 
42
  def load_rows_streaming():
43
+ """Stream rows from visits.jsonl without loading the entire file into memory."""
44
  ds = load_dataset(
45
  "json",
46
  data_files=VISITS_URL,
 
54
 
55
 
56
  def build_report(url_contains: str):
57
+ """
58
+ Aggregate hits by country and render:
59
+ - Mapbox choropleth (ISO3 internally, country name on hover)
60
+ - Table with country name + hits
61
+ """
62
  url_contains = (url_contains or "").strip().lower()
63
 
64
+ # Count by country name
 
65
  country_counts = Counter()
66
 
67
+ # For map: count by iso3, also remember a "display name" per iso3
68
+ iso3_counts = Counter()
69
+ iso3_to_name = {}
70
+
71
  scanned = 0
72
+ matched_url = 0
73
+ mappable = 0
74
 
75
  for row in load_rows_streaming():
76
  scanned += 1
 
78
  space_url = str(row.get("space_url", "") or "")
79
  if url_contains and url_contains not in space_url.lower():
80
  continue
81
+ matched_url += 1
82
 
83
  country = normalize_country_name(row.get("country"))
84
  if not country:
85
  continue
86
 
87
+ # Table count uses raw country field (normalized)
88
+ country_counts[country] += 1
89
+
90
+ # Map count uses ISO3 (skip if we can't resolve)
91
+ iso3 = country_name_to_iso3(country)
92
  if not iso3:
 
 
 
93
  continue
94
 
 
95
  iso3_counts[iso3] += 1
96
+ iso3_to_name.setdefault(iso3, country)
97
+ mappable += 1
98
 
99
+ # Table dataframe
100
  table_df = (
101
  pd.DataFrame([{"country": k, "hits": v} for k, v in country_counts.items()])
102
  .sort_values("hits", ascending=False)
103
  .reset_index(drop=True)
104
  )
105
 
106
+ # Map dataframe
107
  map_df = (
108
+ pd.DataFrame(
109
+ [
110
+ {"iso3": iso3, "country": iso3_to_name.get(iso3, iso3), "hits": hits}
111
+ for iso3, hits in iso3_counts.items()
112
+ ]
113
+ )
114
  .sort_values("hits", ascending=False)
115
  .reset_index(drop=True)
116
  )
117
 
118
+ # Build figure
119
+ if map_df.empty:
120
+ fig = px.scatter(title="No mappable data found")
121
+ fig.update_layout(height=720, margin=dict(l=0, r=0, t=40, b=0))
122
+ summary = (
123
+ f"Rows scanned: {scanned:,} • Rows after URL filter: {matched_url:,} • "
124
+ f"Countries (table): {len(table_df):,} • Total hits: {int(table_df['hits'].sum()) if len(table_df) else 0:,}"
 
125
  )
 
126
  return fig, table_df.head(50), summary
127
 
128
  if MAPBOX_TOKEN:
 
129
  px.set_mapbox_access_token(MAPBOX_TOKEN)
130
+
131
  fig = px.choropleth_mapbox(
132
  map_df,
133
  locations="iso3",
134
  color="hits",
135
+ hover_name="country",
136
+ hover_data={"iso3": True, "hits": True, "country": False},
137
  color_continuous_scale="Viridis",
138
  mapbox_style="carto-positron",
139
+ zoom=0.7,
140
  center={"lat": 15, "lon": 0},
141
+ opacity=0.75,
142
+ title=None, # We'll add a custom title annotation instead
143
+ )
144
+ fig.update_layout(
145
+ height=720,
146
+ margin=dict(l=0, r=0, t=0, b=0),
147
+ )
148
+ # Add a simple dashboard-style title in the corner
149
+ fig.add_annotation(
150
+ text="Hits by Country",
151
+ x=0.01,
152
+ y=0.99,
153
+ xref="paper",
154
+ yref="paper",
155
+ xanchor="left",
156
+ yanchor="top",
157
+ showarrow=False,
158
+ font=dict(size=20),
159
  )
 
160
  else:
161
+ # Fallback to non-Mapbox choropleth if token is missing
162
  fig = px.choropleth(
163
  map_df,
164
  locations="iso3",
165
  color="hits",
166
+ hover_name="country",
167
  title="Hits by Country",
168
+ )
169
+ fig.update_layout(
170
+ height=720,
171
+ margin=dict(l=0, r=0, t=40, b=0),
172
+ )
173
+ fig.update_geos(
174
+ showframe=False,
175
+ showcoastlines=False,
176
+ showcountries=True,
177
+ countrycolor="rgba(0,0,0,0.15)",
178
+ bgcolor="rgba(0,0,0,0)",
179
+ domain=dict(x=[0, 1], y=[0, 1]),
180
+ fitbounds="locations",
181
  )
182
 
183
  summary = (
184
+ f"Rows scanned: {scanned:,} • Rows after URL filter: {matched_url:,} • "
185
+ f"Rows mappable: {mappable:,} • Countries (table): {len(table_df):,} • "
186
+ f"Countries (map): {len(map_df):,} • Total hits: {int(table_df['hits'].sum()) if len(table_df) else 0:,}"
 
 
187
  )
188
 
189
  return fig, table_df.head(50), summary
190
 
191
 
192
+ with gr.Blocks(title="AI Recruiting Agent Usage Map") as demo:
193
  gr.Markdown(
194
  "# AI Recruiting Agent — Usage by Country\n"
195
+ "This Space reads **only** `usage/visits.jsonl` and plots hits by country.\n\n"
196
+ "- Set **MAPBOX_TOKEN** as a Space *Secret* for the best-looking map.\n"
197
+ "- (Optional) Filter by `space_url` substring if you ever log multiple spaces."
198
  )
199
 
200
  url_contains = gr.Textbox(
201
  label="Space URL contains (optional)",
 
202
  value="AI_Recruiting_Agent",
203
+ placeholder="AI_Recruiting_Agent",
204
  )
205
 
206
  run = gr.Button("Generate map")
207
  summary = gr.Markdown()
208
+ plot = gr.Plot(height=720)
209
+ table = gr.Dataframe(label="Top countries", interactive=False)
210
 
211
  run.click(
212
  fn=build_report,