Spaces:

19arjun89
/

AI_Recruiting_Agent_Usage

Sleeping

App Files Community

19arjun89 committed on Feb 5

Commit

c081289

verified ·

1 Parent(s): 08fd623

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -46

app.py CHANGED Viewed

@@ -7,20 +7,21 @@ import plotly.express as px
 import pycountry
 from datasets import load_dataset
 VISITS_URL = os.getenv(
     "VISITS_URL",
     "https://huggingface.co/datasets/19arjun89/ai_recruiting_agent_usage/resolve/main/usage/visits.jsonl",
 )
-# Optional: set MAPBOX_TOKEN in Space Secrets for best-looking map
 MAPBOX_TOKEN = os.getenv("MAPBOX_TOKEN", "").strip()
-# Safety cap in case jsonl grows huge
 MAX_ROWS = int(os.getenv("MAX_ROWS", "500000"))
 def normalize_country_name(country: str | None) -> str | None:
     if not country or not isinstance(country, str):
         return None
     c = country.strip()
@@ -29,7 +30,7 @@ def normalize_country_name(country: str | None) -> str | None:
     return c
-def country_to_iso3(country_name: str) -> str | None:
     """Convert country name -> ISO3 for mapping."""
     try:
         rec = pycountry.countries.search_fuzzy(country_name)[0]
@@ -39,6 +40,7 @@ def country_to_iso3(country_name: str) -> str | None:
 def load_rows_streaming():
     ds = load_dataset(
         "json",
         data_files=VISITS_URL,
@@ -52,14 +54,23 @@ def load_rows_streaming():
 def build_report(url_contains: str):
     url_contains = (url_contains or "").strip().lower()
-    # Count by ISO3 for map + by country name for table
-    iso3_counts = Counter()
     country_counts = Counter()
     scanned = 0
-    mapped = 0
     for row in load_rows_streaming():
         scanned += 1
@@ -67,104 +78,135 @@ def build_report(url_contains: str):
         space_url = str(row.get("space_url", "") or "")
         if url_contains and url_contains not in space_url.lower():
             continue
         country = normalize_country_name(row.get("country"))
         if not country:
             continue
-        iso3 = country_to_iso3(country)
         if not iso3:
-            # If pycountry can't resolve (e.g., odd strings), skip for map,
-            # but still keep it in the table if you want. Here we keep it.
-            country_counts[country] += 1
             continue
-        mapped += 1
         iso3_counts[iso3] += 1
-        country_counts[country] += 1
-    # Build table (country name, hits)
     table_df = (
         pd.DataFrame([{"country": k, "hits": v} for k, v in country_counts.items()])
         .sort_values("hits", ascending=False)
         .reset_index(drop=True)
     )
-    # Build map dataframe (ISO3, hits)
     map_df = (
-        pd.DataFrame([{"iso3": k, "hits": v} for k, v in iso3_counts.items()])
         .sort_values("hits", ascending=False)
         .reset_index(drop=True)
     )
-    # Choose best map option
-    if len(map_df) == 0:
-        fig = px.choropleth(
-            pd.DataFrame({"iso3": [], "hits": []}),
-            locations="iso3",
-            color="hits",
-            projection="natural earth",
-            title="Hits by Country",
         )
-        summary = f"No mappable rows found. Rows scanned: {scanned:,}"
         return fig, table_df.head(50), summary
     if MAPBOX_TOKEN:
-        # Higher-quality choropleth with Mapbox
         px.set_mapbox_access_token(MAPBOX_TOKEN)
         fig = px.choropleth_mapbox(
             map_df,
             locations="iso3",
             color="hits",
-            hover_name="iso3",
             color_continuous_scale="Viridis",
             mapbox_style="carto-positron",
-            zoom=0.6,
             center={"lat": 15, "lon": 0},
-            opacity=0.65,
-            title="Hits by Country (Mapbox)",
         )
-        fig.update_layout(margin={"r": 0, "t": 50, "l": 0, "b": 0})
     else:
-        # Fallback: built-in world map (no token needed)
         fig = px.choropleth(
             map_df,
             locations="iso3",
             color="hits",
-            projection="natural earth",
             title="Hits by Country",
-            hover_name="iso3",
         )
     summary = (
-        f"Rows scanned: {scanned:,} • "
-        f"Rows mapped: {mapped:,} • "
-        f"Countries (table): {len(table_df):,} • "
-        f"Countries (map): {len(map_df):,} • "
-        f"Total hits: {int(table_df['hits'].sum()) if len(table_df) else 0:,}"
     )
     return fig, table_df.head(50), summary
-with gr.Blocks(title="AI Recruiting Agent Usage Map") as demo:
     gr.Markdown(
         "# AI Recruiting Agent — Usage by Country\n"
-        "Loads **only** `usage/visits.jsonl` and visualizes hits by country.\n\n"
-        "- Table uses **country names**\n"
-        "- Map uses ISO3 internally for plotting"
     )
     url_contains = gr.Textbox(
         label="Space URL contains (optional)",
-        placeholder="AI_Recruiting_Agent",
         value="AI_Recruiting_Agent",
     )
     run = gr.Button("Generate map")
     summary = gr.Markdown()
-    plot = gr.Plot()
-    table = gr.Dataframe(label="Top countries (by name)", interactive=False)
     run.click(
         fn=build_report,

 import pycountry
 from datasets import load_dataset
+# === Config ===
 VISITS_URL = os.getenv(
     "VISITS_URL",
     "https://huggingface.co/datasets/19arjun89/ai_recruiting_agent_usage/resolve/main/usage/visits.jsonl",
 )
+# Add this as a Hugging Face Space SECRET named MAPBOX_TOKEN
 MAPBOX_TOKEN = os.getenv("MAPBOX_TOKEN", "").strip()
+# Safety cap for very large jsonl files
 MAX_ROWS = int(os.getenv("MAX_ROWS", "500000"))
 def normalize_country_name(country: str | None) -> str | None:
+    """Normalize country field; return None for empty/Unknown."""
     if not country or not isinstance(country, str):
         return None
     c = country.strip()
     return c
+def country_name_to_iso3(country_name: str) -> str | None:
     """Convert country name -> ISO3 for mapping."""
     try:
         rec = pycountry.countries.search_fuzzy(country_name)[0]
 def load_rows_streaming():
+    """Stream rows from visits.jsonl without loading the entire file into memory."""
     ds = load_dataset(
         "json",
         data_files=VISITS_URL,
 def build_report(url_contains: str):
+    """
+    Aggregate hits by country and render:
+    - Mapbox choropleth (ISO3 internally, country name on hover)
+    - Table with country name + hits
+    """
     url_contains = (url_contains or "").strip().lower()
+    # Count by country name
     country_counts = Counter()
+    # For map: count by iso3, also remember a "display name" per iso3
+    iso3_counts = Counter()
+    iso3_to_name = {}
     scanned = 0
+    matched_url = 0
+    mappable = 0
     for row in load_rows_streaming():
         scanned += 1
         space_url = str(row.get("space_url", "") or "")
         if url_contains and url_contains not in space_url.lower():
             continue
+        matched_url += 1
         country = normalize_country_name(row.get("country"))
         if not country:
             continue
+        # Table count uses raw country field (normalized)
+        country_counts[country] += 1
+        # Map count uses ISO3 (skip if we can't resolve)
+        iso3 = country_name_to_iso3(country)
         if not iso3:
             continue
         iso3_counts[iso3] += 1
+        iso3_to_name.setdefault(iso3, country)
+        mappable += 1
+    # Table dataframe
     table_df = (
         pd.DataFrame([{"country": k, "hits": v} for k, v in country_counts.items()])
         .sort_values("hits", ascending=False)
         .reset_index(drop=True)
     )
+    # Map dataframe
     map_df = (
+        pd.DataFrame(
+            [
+                {"iso3": iso3, "country": iso3_to_name.get(iso3, iso3), "hits": hits}
+                for iso3, hits in iso3_counts.items()
+            ]
+        )
         .sort_values("hits", ascending=False)
         .reset_index(drop=True)
     )
+    # Build figure
+    if map_df.empty:
+        fig = px.scatter(title="No mappable data found")
+        fig.update_layout(height=720, margin=dict(l=0, r=0, t=40, b=0))
+        summary = (
+            f"Rows scanned: {scanned:,} • Rows after URL filter: {matched_url:,} • "
+            f"Countries (table): {len(table_df):,} • Total hits: {int(table_df['hits'].sum()) if len(table_df) else 0:,}"
         )
         return fig, table_df.head(50), summary
     if MAPBOX_TOKEN:
         px.set_mapbox_access_token(MAPBOX_TOKEN)
         fig = px.choropleth_mapbox(
             map_df,
             locations="iso3",
             color="hits",
+            hover_name="country",
+            hover_data={"iso3": True, "hits": True, "country": False},
             color_continuous_scale="Viridis",
             mapbox_style="carto-positron",
+            zoom=0.7,
             center={"lat": 15, "lon": 0},
+            opacity=0.75,
+            title=None,  # We'll add a custom title annotation instead
+        )
+        fig.update_layout(
+            height=720,
+            margin=dict(l=0, r=0, t=0, b=0),
+        )
+        # Add a simple dashboard-style title in the corner
+        fig.add_annotation(
+            text="Hits by Country",
+            x=0.01,
+            y=0.99,
+            xref="paper",
+            yref="paper",
+            xanchor="left",
+            yanchor="top",
+            showarrow=False,
+            font=dict(size=20),
         )
     else:
+        # Fallback to non-Mapbox choropleth if token is missing
         fig = px.choropleth(
             map_df,
             locations="iso3",
             color="hits",
+            hover_name="country",
             title="Hits by Country",
+        )
+        fig.update_layout(
+            height=720,
+            margin=dict(l=0, r=0, t=40, b=0),
+        )
+        fig.update_geos(
+            showframe=False,
+            showcoastlines=False,
+            showcountries=True,
+            countrycolor="rgba(0,0,0,0.15)",
+            bgcolor="rgba(0,0,0,0)",
+            domain=dict(x=[0, 1], y=[0, 1]),
+            fitbounds="locations",
         )
     summary = (
+        f"Rows scanned: {scanned:,} • Rows after URL filter: {matched_url:,} • "
+        f"Rows mappable: {mappable:,} • Countries (table): {len(table_df):,} • "
+        f"Countries (map): {len(map_df):,} • Total hits: {int(table_df['hits'].sum()) if len(table_df) else 0:,}"
     )
     return fig, table_df.head(50), summary
+with gr.Blocks(title="AI Recruiting Agent — Usage Map") as demo:
     gr.Markdown(
         "# AI Recruiting Agent — Usage by Country\n"
+        "This Space reads **only** `usage/visits.jsonl` and plots hits by country.\n\n"
+        "- Set **MAPBOX_TOKEN** as a Space *Secret* for the best-looking map.\n"
+        "- (Optional) Filter by `space_url` substring if you ever log multiple spaces."
     )
     url_contains = gr.Textbox(
         label="Space URL contains (optional)",
         value="AI_Recruiting_Agent",
+        placeholder="AI_Recruiting_Agent",
     )
     run = gr.Button("Generate map")
     summary = gr.Markdown()
+    plot = gr.Plot(height=720)
+    table = gr.Dataframe(label="Top countries", interactive=False)
     run.click(
         fn=build_report,