Spaces:

19arjun89
/

AI_Recruiting_Agent_Usage

Running

App Files Files Community

19arjun89 committed on Feb 8

Commit

7f8a9be

verified ·

1 Parent(s): 8b9d4ec

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -107

app.py CHANGED Viewed

@@ -1,29 +1,39 @@
 import os
 from collections import Counter
 import gradio as gr
 import pandas as pd
 import plotly.express as px
-import plotly.graph_objects as go
 import pycountry
 from datasets import load_dataset
-# === Config ===
 VISITS_URL = os.getenv(
     "VISITS_URL",
     "https://huggingface.co/datasets/19arjun89/ai_recruiting_agent_usage/resolve/main/usage/visits_enriched.jsonl",
 )
-# Optional: You can keep this env var, but this version uses Plotly Geo (no Mapbox needed)
 MAPBOX_TOKEN = os.getenv("MAPBOX_TOKEN", "").strip()
-# Safety cap for very large jsonl files
 MAX_ROWS = int(os.getenv("MAX_ROWS", "500000"))
 def normalize_country_name(country: str | None) -> str | None:
-    """Normalize country field; return None for empty/Unknown."""
     if not country or not isinstance(country, str):
         return None
     c = country.strip()
@@ -33,6 +43,7 @@ def normalize_country_name(country: str | None) -> str | None:
 def iso2_to_iso3(country_code: str | None) -> str | None:
     if not country_code or not isinstance(country_code, str):
         return None
     c2 = country_code.strip().upper()
@@ -46,7 +57,6 @@ def iso2_to_iso3(country_code: str | None) -> str | None:
 def load_rows_streaming():
-    """Stream rows from visits.jsonl without loading the entire file into memory."""
     ds = load_dataset(
         "json",
         data_files=VISITS_URL,
@@ -59,152 +69,122 @@ def load_rows_streaming():
             break
-def build_report():
-    """
-    Aggregate usage events by country and render:
-    - Choropleth map with labels (country + usage events)
-    - Table with country name + usage events
-    """
-    # Count by country name (table)
-    country_counts = Counter()
-    # Count by ISO3 (map), also store a display name per ISO3
-    iso3_counts = Counter()
-    iso3_to_name = {}
     scanned = 0
-    mappable = 0
     skipped_session_start = 0
     missing_country = 0
     invalid_country_code = 0
     for row in load_rows_streaming():
         scanned += 1
-        # 1) Skip session starts
         event_type = str(row.get("event", "") or "").strip().lower()
         if event_type == "session_start":
             skipped_session_start += 1
             continue
-        # 2) Missing country
         country = normalize_country_name(row.get("final_country"))
         if not country:
             missing_country += 1
             continue
-        # 3) Invalid / missing country code
         iso3 = iso2_to_iso3(row.get("final_country_code"))
         if not iso3:
             invalid_country_code += 1
             continue
         country_counts[country] += 1
         iso3_counts[iso3] += 1
         iso3_to_name.setdefault(iso3, country)
-        mappable += 1
-    # Table dataframe (country name + usage events)
     table_df = (
         pd.DataFrame([{"country": k, "usage events": v} for k, v in country_counts.items()])
         .sort_values("usage events", ascending=False)
         .reset_index(drop=True)
     )
-    # Map dataframe
     map_df = (
         pd.DataFrame(
             [
-                {"iso3": iso3, "country": iso3_to_name.get(iso3, iso3), "usage events": count}
-                for iso3, count in iso3_counts.items()
             ]
         )
         .sort_values("usage events", ascending=False)
         .reset_index(drop=True)
     )
     if map_df.empty:
         fig = px.scatter(title="No mappable data found")
-        fig.update_layout(height=720, margin=dict(l=0, r=0, t=40, b=0))
         summary = (
-            f"Rows scanned: {scanned:,} • Countries (table): {len(table_df):,} • "
             f"Total usage events: {int(table_df['usage events'].sum()) if len(table_df) else 0:,}"
         )
         return fig, table_df.head(50), summary
-    # Choropleth (built-in polygons; reliable)
-    fig = px.choropleth(
         map_df,
         locations="iso3",
         color="usage events",
-        projection="natural earth",
-        title=None,
     )
     fig.update_layout(
-        height=720,
         margin=dict(l=0, r=0, t=0, b=0),
-        paper_bgcolor="white",
-    )
-    fig.update_geos(
-        showframe=False,
-        showcoastlines=False,
-        showcountries=True,
-        countrycolor="rgba(0,0,0,0.25)",
-        bgcolor="rgba(0,0,0,0)",
-        domain=dict(x=[0, 1], y=[0, 1]),
-        fitbounds="locations",
-    )
-    # Labels overlay (always visible)
-    # Tip: keep labels to top N to avoid clutter if you grow beyond ~30 countries
-    labels_df = map_df.copy()
-    labels_df["label"] = labels_df["country"] + "<br>" + labels_df["usage events"].astype(str)
-    # ===============================
-    # Label shadow (dark background)
-    # ===============================
-    fig.add_trace(
-        go.Scattergeo(
-            locations=labels_df["iso3"],
-            locationmode="ISO-3",
-            text=labels_df["label"],
-            mode="text",
-            textfont=dict(
-                size=13,          # slightly bigger
-                color="black",
-                family="Arial",
-            ),
-            hoverinfo="skip",
-            showlegend=False,
-        )
-    )
-    # ===============================
-    # Main label (white foreground)
-    # ===============================
-    fig.add_trace(
-        go.Scattergeo(
-            locations=labels_df["iso3"],
-            locationmode="ISO-3",
-            text=labels_df["label"],
-            mode="text",
-            textfont=dict(
-                size=11,
-                color="white",
-                family="Arial",
-            ),
-            hoverinfo="skip",
-            showlegend=False,
-        )
     )
-    # Title
     fig.add_annotation(
         text="Usage Events by Country",
         x=0.01,
@@ -217,32 +197,34 @@ def build_report():
         font=dict(size=20),
     )
-    accounted = (
-        skipped_session_start
-        + missing_country
-        + invalid_country_code
-        + mappable
-    )
     summary = (
         f"Rows scanned: {scanned:,}\n"
         f"- Session starts skipped: {skipped_session_start:,}\n"
         f"- Missing country: {missing_country:,}\n"
         f"- Invalid country code: {invalid_country_code:,}\n"
-        f"- Rows mappable: {mappable:,}\n\n"
         f"Accounted rows: {accounted:,} / {scanned:,}\n"
         f"Countries (table): {len(table_df):,}\n"
-        f"Countries (map): {len(map_df):,}\n"
         f"Total usage events: {int(table_df['usage events'].sum()) if len(table_df) else 0:,}"
     )
     return fig, table_df.head(50), summary
 with gr.Blocks(title="AI Recruiting Agent — Usage Map") as demo:
     gr.Markdown(
-        "# AI Recruiting Agent — Usage by Country\n"
-        "This Space reads **only** `usage/visits.jsonl` and plots **usage events** by country."
     )
     run = gr.Button("Generate map")
@@ -257,4 +239,3 @@ with gr.Blocks(title="AI Recruiting Agent — Usage Map") as demo:
     )
 demo.launch()

 import os
+import json
 from collections import Counter
 import gradio as gr
 import pandas as pd
 import plotly.express as px
 import pycountry
 from datasets import load_dataset
+# =========================
+# Config
+# =========================
 VISITS_URL = os.getenv(
     "VISITS_URL",
     "https://huggingface.co/datasets/19arjun89/ai_recruiting_agent_usage/resolve/main/usage/visits_enriched.jsonl",
 )
+# Set this as a HF Space SECRET named MAPBOX_TOKEN
 MAPBOX_TOKEN = os.getenv("MAPBOX_TOKEN", "").strip()
+# Path to your GeoJSON (commit into the Space repo)
+GEOJSON_PATH = os.getenv("GEOJSON_PATH", "countries.geojson")
+# IMPORTANT: Set this to match the property name inside your GeoJSON features.
+# Common values: "properties.ISO_A3" or "properties.ADM0_A3"
+GEOJSON_FEATURE_ID_KEY = os.getenv("GEOJSON_FEATURE_ID_KEY", "properties.ISO_A3")
 MAX_ROWS = int(os.getenv("MAX_ROWS", "500000"))
+# =========================
+# Helpers
+# =========================
 def normalize_country_name(country: str | None) -> str | None:
     if not country or not isinstance(country, str):
         return None
     c = country.strip()
 def iso2_to_iso3(country_code: str | None) -> str | None:
+    """Convert ISO-2 -> ISO-3 for map matching."""
     if not country_code or not isinstance(country_code, str):
         return None
     c2 = country_code.strip().upper()
 def load_rows_streaming():
     ds = load_dataset(
         "json",
         data_files=VISITS_URL,
             break
+# Read the file at `path` as UTF-8 text and return the parsed JSON.
+# Nothing is validated here: whatever json.load() yields is handed back as-is.
+def load_geojson(path: str) -> dict:
+    with open(path, "r", encoding="utf-8") as f:
+        return json.load(f)
+# =========================
+# Main report builder
+# =========================
+def build_report():
+    if not MAPBOX_TOKEN:
+        # No token configured: continue anyway instead of failing.
+        # NOTE(review): the "carto-positron" style used below is listed by Plotly as a
+        # token-free base map, so the figure should still render — confirm.
+        pass
+    countries_geojson = load_geojson(GEOJSON_PATH)
+    # Counters for clean reconciliation
     scanned = 0
     skipped_session_start = 0
     missing_country = 0
     invalid_country_code = 0
+    # Table (country name) and map (iso3)
+    country_counts = Counter()
+    iso3_counts = Counter()
+    iso3_to_name = {}
     for row in load_rows_streaming():
         scanned += 1
         event_type = str(row.get("event", "") or "").strip().lower()
         if event_type == "session_start":
             skipped_session_start += 1
             continue
         country = normalize_country_name(row.get("final_country"))
         if not country:
             missing_country += 1
             continue
         iso3 = iso2_to_iso3(row.get("final_country_code"))
         if not iso3:
             invalid_country_code += 1
             continue
+        # Count it
         country_counts[country] += 1
         iso3_counts[iso3] += 1
         iso3_to_name.setdefault(iso3, country)
+    # Build table dataframe
     table_df = (
         pd.DataFrame([{"country": k, "usage events": v} for k, v in country_counts.items()])
         .sort_values("usage events", ascending=False)
         .reset_index(drop=True)
     )
+    # Build map dataframe
     map_df = (
         pd.DataFrame(
             [
+                {"iso3": iso3, "country": iso3_to_name.get(iso3, iso3), "usage events": cnt}
+                for iso3, cnt in iso3_counts.items()
             ]
         )
         .sort_values("usage events", ascending=False)
         .reset_index(drop=True)
     )
+    # Reconciliation
+    rows_mappable = int(map_df["usage events"].sum())  # note: this is TOTAL events, not rows
+    mappable_rows_count = int(sum(iso3_counts.values()))  # count of rows after filters (events counted)
+    accounted = skipped_session_start + missing_country + invalid_country_code + mappable_rows_count
+    # Every row that passes the filters adds exactly one event to iso3_counts, so
+    # rows_mappable and mappable_rows_count are always the same number, and both
+    # equal the table_df "usage events" total. The summary uses mappable_rows_count.
+    # Map figure
     if map_df.empty:
         fig = px.scatter(title="No mappable data found")
+        fig.update_layout(height=740, margin=dict(l=0, r=0, t=40, b=0))
         summary = (
+            f"Rows scanned: {scanned:,}\n"
+            f"- Session starts skipped: {skipped_session_start:,}\n"
+            f"- Missing country: {missing_country:,}\n"
+            f"- Invalid country code: {invalid_country_code:,}\n\n"
+            f"Accounted rows: {accounted:,} / {scanned:,}\n"
+            f"Countries (table): {len(table_df):,}\n"
             f"Total usage events: {int(table_df['usage events'].sum()) if len(table_df) else 0:,}"
         )
         return fig, table_df.head(50), summary
+    # Mapbox choropleth using GeoJSON
+    px.set_mapbox_access_token(MAPBOX_TOKEN)
+    fig = px.choropleth_mapbox(
         map_df,
+        geojson=countries_geojson,
+        featureidkey=GEOJSON_FEATURE_ID_KEY,
         locations="iso3",
         color="usage events",
+        hover_name="country",
+        hover_data={"usage events": True, "iso3": True},
+        mapbox_style="carto-positron",  # clean, modern
+        opacity=0.75,
+        zoom=0.75,
+        center={"lat": 15, "lon": 0},
     )
+    # Full-bleed layout
     fig.update_layout(
+        height=740,
         margin=dict(l=0, r=0, t=0, b=0),
     )
+    # Dashboard title
     fig.add_annotation(
         text="Usage Events by Country",
         x=0.01,
         font=dict(size=20),
     )
+    # Summary text (clean math)
     summary = (
         f"Rows scanned: {scanned:,}\n"
         f"- Session starts skipped: {skipped_session_start:,}\n"
         f"- Missing country: {missing_country:,}\n"
         f"- Invalid country code: {invalid_country_code:,}\n"
+        f"- Rows mapped: {mappable_rows_count:,}\n\n"
         f"Accounted rows: {accounted:,} / {scanned:,}\n"
         f"Countries (table): {len(table_df):,}\n"
+        f"Countries (map): {map_df['iso3'].nunique():,}\n"
         f"Total usage events: {int(table_df['usage events'].sum()) if len(table_df) else 0:,}"
     )
     return fig, table_df.head(50), summary
+# =========================
+# UI
+# =========================
 with gr.Blocks(title="AI Recruiting Agent — Usage Map") as demo:
     gr.Markdown(
+        "# AI Recruiting Agent — Usage by Country (Mapbox)\n"
+        "This Space reads **only** `visits_enriched.jsonl`, excludes `event=session_start`, "
+        "and plots **usage events** by country.\n\n"
+        "**Setup:**\n"
+        "- Add Space Secret `MAPBOX_TOKEN`\n"
+        "- Commit `countries.geojson`\n"
+        "- If your GeoJSON ISO3 field isn’t `ISO_A3`, set env var `GEOJSON_FEATURE_ID_KEY`\n"
     )
     run = gr.Button("Generate map")
     )
 demo.launch()