19arjun89 committed on
Commit
d2449b1
·
verified ·
1 Parent(s): 83599d2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +171 -0
app.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from collections import Counter
3
+ from datetime import datetime
4
+ from dateutil import parser as dateparser
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+ import plotly.express as px
9
+ import pycountry
10
+ from datasets import load_dataset
11
+
12
+
13
# Where the visits log (JSON Lines) is fetched from. Override with the
# VISITS_URL environment variable to point at a different file.
VISITS_URL = os.environ.get(
    "VISITS_URL",
    "https://huggingface.co/datasets/19arjun89/ai_recruiting_agent_usage/resolve/main/usage/visits.jsonl",
)

# Access token used when loading the dataset. Leave unset for public data;
# for a private dataset, provide HF_TOKEN (e.g. as a Space secret).
HF_TOKEN = os.environ.get("HF_TOKEN")

# Upper bound on how many rows are read from the log, so an unexpectedly
# large file cannot stall the app. Raise it via the MAX_ROWS env variable.
MAX_ROWS = int(os.environ.get("MAX_ROWS", "500000"))
23
+
24
+
25
+ def to_iso3(country: str | None, country_code: str | None) -> str | None:
26
+ """Map country name / ISO2 -> ISO3 (needed for Plotly choropleth)."""
27
+ # ISO2 present?
28
+ if country_code and isinstance(country_code, str) and len(country_code.strip()) == 2:
29
+ try:
30
+ c2 = country_code.strip().upper()
31
+ rec = pycountry.countries.get(alpha_2=c2)
32
+ return rec.alpha_3 if rec else None
33
+ except Exception:
34
+ pass
35
+
36
+ # ISO3 already?
37
+ if country and isinstance(country, str):
38
+ c = country.strip()
39
+ if len(c) == 3 and c.isalpha():
40
+ return c.upper()
41
+
42
+ # Fuzzy match country name
43
+ try:
44
+ rec = pycountry.countries.search_fuzzy(c)[0]
45
+ return rec.alpha_3
46
+ except Exception:
47
+ return None
48
+
49
+ return None
50
+
51
+
52
+ def parse_ts(ts_val) -> datetime | None:
53
+ if not ts_val:
54
+ return None
55
+ try:
56
+ # Handles ISO strings like "2026-02-01T12:34:56Z"
57
+ return dateparser.parse(str(ts_val))
58
+ except Exception:
59
+ return None
60
+
61
+
62
def load_streaming_rows():
    """Yield rows of the visits log one at a time, up to ``MAX_ROWS``.

    The JSON Lines file at ``VISITS_URL`` is opened in streaming mode via
    ``datasets.load_dataset`` (passing ``HF_TOKEN`` as the token), and
    iteration stops once ``MAX_ROWS`` rows have been yielded.
    """
    stream = load_dataset(
        "json",
        data_files=VISITS_URL,
        split="train",
        streaming=True,
        token=HF_TOKEN,
    )
    for seen, record in enumerate(stream, start=1):
        yield record
        # The cap is checked after yielding, matching a count-then-compare loop.
        if seen >= MAX_ROWS:
            return
76
+
77
+
78
def _parse_date_filter(text, label):
    """Turn an optional date typed into the UI into a ``date`` (or ``None``).

    Blank/None input means "no bound". Unparseable input raises ``gr.Error``
    so the user sees a readable message instead of a raw parser exception.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return None
    try:
        return dateparser.parse(cleaned).date()
    except (ValueError, OverflowError) as err:
        raise gr.Error(
            f"Could not parse {label} {cleaned!r}; use a format like 2026-01-31."
        ) from err


def build_report(start_date: str, end_date: str, url_contains: str, include_unknown: bool):
    """Aggregate logged visits by country and build the map, table and summary.

    Args:
        start_date: Optional inclusive lower bound on the row's ``ts_utc``
            date; blank disables it.
        end_date: Optional inclusive upper bound on the row's ``ts_utc``
            date; blank disables it.
        url_contains: Optional case-insensitive substring that the row's
            ``space_url`` must contain.
        include_unknown: When False, rows whose ``country`` is empty or
            "unknown" are dropped up front. When True they are still only
            counted if ``to_iso3`` can resolve them (e.g. from
            ``country_code``), because the map can only show rows that have
            an ISO3 code.

    Returns:
        A ``(figure, dataframe, summary_text)`` tuple: the choropleth, the
        top-30 countries by hits, and a one-line summary string.

    Raises:
        gr.Error: If ``start_date`` or ``end_date`` cannot be parsed.
    """
    sd = _parse_date_filter(start_date, "start date")
    ed = _parse_date_filter(end_date, "end date")
    url_contains = (url_contains or "").strip().lower()
    date_filter_active = sd is not None or ed is not None

    counts = Counter()
    # Memoize country -> ISO3 resolution: the log repeats the same few
    # (country, code) pairs, and name resolution may involve a fuzzy search.
    iso3_cache = {}
    scanned = 0
    matched = 0

    for row in load_streaming_rows():
        scanned += 1

        # Optional URL filter (useful if several Space URLs are logged).
        space_url = str(row.get("space_url", "") or "")
        if url_contains and url_contains not in space_url.lower():
            continue

        # Optional date filter. Timestamps are only parsed when a bound is
        # set, and rows whose timestamp is missing or unparseable are
        # excluded because they cannot be shown to fall inside the range.
        if date_filter_active:
            ts = parse_ts(row.get("ts_utc"))
            if ts is None:
                continue
            d = ts.date()
            if sd and d < sd:
                continue
            if ed and d > ed:
                continue

        country = row.get("country")
        country_code = row.get("country_code")

        if not include_unknown and (not country or str(country).strip().lower() == "unknown"):
            continue

        # to_iso3 ignores non-string values, so normalizing them to None
        # keeps the result the same while making the cache key hashable.
        name = country if isinstance(country, str) else None
        code = country_code if isinstance(country_code, str) else None
        key = (name, code)
        if key not in iso3_cache:
            iso3_cache[key] = to_iso3(name, code)
        iso3 = iso3_cache[key]
        if not iso3:
            continue

        matched += 1
        counts[iso3] += 1

    if not counts:
        empty_fig = px.choropleth(
            pd.DataFrame({"iso3": [], "hits": []}),
            locations="iso3",
            color="hits",
            projection="natural earth",
            title="Hits by Country",
        )
        return empty_fig, pd.DataFrame(columns=["iso3", "hits"]), f"No rows matched. Rows scanned: {scanned:,}"

    # most_common() already yields (iso3, hits) pairs sorted by hits, descending.
    agg = pd.DataFrame(counts.most_common(), columns=["iso3", "hits"])

    fig = px.choropleth(
        agg,
        locations="iso3",
        color="hits",
        projection="natural earth",
        title="Hits by Country",
        hover_name="iso3",
    )

    top = agg.head(30).reset_index(drop=True)
    summary = f"Rows scanned: {scanned:,} • Rows mapped: {matched:,} • Countries: {len(agg):,} • Total hits: {int(agg['hits'].sum()):,}"

    return fig, top, summary
145
+
146
+
147
# Gradio UI: three optional filters, a checkbox, a button, and the outputs
# (summary line, choropleth, top-countries table) produced by build_report.
# NOTE(review): indentation was lost in the copy this was reconstructed from;
# the three text boxes are assumed to share one row, with everything else
# stacked directly inside the Blocks container.
with gr.Blocks(title="AI Recruiting Agent Usage Map") as demo:
    gr.Markdown(
        "# AI Recruiting Agent — Usage by Country\n"
        "Loads **only** `usage/visits.jsonl` and visualizes hits by country."
    )

    with gr.Row():
        start_date = gr.Textbox(label="Start date (optional)", placeholder="2026-01-01")
        end_date = gr.Textbox(label="End date (optional)", placeholder="2026-02-05")
        url_contains = gr.Textbox(label="Space URL contains (optional)", placeholder="AI_Recruiting_Agent")

    include_unknown = gr.Checkbox(label="Include 'Unknown' country rows", value=False)

    run = gr.Button("Generate map")
    summary = gr.Markdown()
    plot = gr.Plot()
    table = gr.Dataframe(label="Top countries", interactive=False)

    # The outputs order must match build_report's return order:
    # (figure, dataframe, summary text).
    run.click(
        fn=build_report,
        inputs=[start_date, end_date, url_contains, include_unknown],
        outputs=[plot, table, summary],
    )

# Only start the server when executed as a script, so importing this module
# (for tests or tooling) does not launch the app as a side effect.
if __name__ == "__main__":
    demo.launch()