Spaces:

19arjun89
/

AI_Recruiting_Agent_Usage

Sleeping

App Files Files Community

19arjun89 committed on Feb 5

Commit

d2449b1

verified ·

1 Parent(s): 83599d2

Create app.py

Browse files

Files changed (1) hide show

app.py +171 -0

app.py ADDED Viewed

	@@ -0,0 +1,171 @@

+import os
+from collections import Counter
+from datetime import datetime
+from dateutil import parser as dateparser
+import gradio as gr
+import pandas as pd
+import plotly.express as px
+import pycountry
+from datasets import load_dataset
+VISITS_URL = os.getenv(
+    "VISITS_URL",
+    "https://huggingface.co/datasets/19arjun89/ai_recruiting_agent_usage/resolve/main/usage/visits.jsonl",
+)
+# If your dataset is private, set HF_TOKEN as a Space secret and pass it below.
+HF_TOKEN = os.getenv("HF_TOKEN", None)
+# Safety cap in case the jsonl explodes in size; set higher later if you want
+MAX_ROWS = int(os.getenv("MAX_ROWS", "500000"))
+def to_iso3(country: str | None, country_code: str | None) -> str | None:
+    """Map country name / ISO2 -> ISO3 (needed for Plotly choropleth)."""
+    # ISO2 present?
+    if country_code and isinstance(country_code, str) and len(country_code.strip()) == 2:
+        try:
+            c2 = country_code.strip().upper()
+            rec = pycountry.countries.get(alpha_2=c2)
+            return rec.alpha_3 if rec else None
+        except Exception:
+            pass
+    # ISO3 already?
+    if country and isinstance(country, str):
+        c = country.strip()
+        if len(c) == 3 and c.isalpha():
+            return c.upper()
+        # Fuzzy match country name
+        try:
+            rec = pycountry.countries.search_fuzzy(c)[0]
+            return rec.alpha_3
+        except Exception:
+            return None
+    return None
+def parse_ts(ts_val) -> datetime | None:
+    if not ts_val:
+        return None
+    try:
+        # Handles ISO strings like "2026-02-01T12:34:56Z"
+        return dateparser.parse(str(ts_val))
+    except Exception:
+        return None
+def load_streaming_rows():
+    ds = load_dataset(
+        "json",
+        data_files=VISITS_URL,
+        split="train",
+        streaming=True,
+        token=HF_TOKEN,
+    )
+    n = 0
+    for row in ds:
+        yield row
+        n += 1
+        if n >= MAX_ROWS:
+            break
+def build_report(start_date: str, end_date: str, url_contains: str, include_unknown: bool):
+    # Parse filters
+    sd = dateparser.parse(start_date).date() if start_date.strip() else None
+    ed = dateparser.parse(end_date).date() if end_date.strip() else None
+    url_contains = url_contains.strip().lower()
+    counts = Counter()
+    raw_country_counts = Counter()
+    scanned = 0
+    matched = 0
+    for row in load_streaming_rows():
+        scanned += 1
+        # optional URL filter (if you ever log multiple space URLs)
+        space_url = str(row.get("space_url", "") or "")
+        if url_contains and url_contains not in space_url.lower():
+            continue
+        # optional date filter
+        ts = parse_ts(row.get("ts_utc"))
+        if ts:
+            d = ts.date()
+            if sd and d < sd:
+                continue
+            if ed and d > ed:
+                continue
+        country = row.get("country")
+        country_code = row.get("country_code")
+        if not include_unknown and (not country or str(country).strip().lower() == "unknown"):
+            continue
+        iso3 = to_iso3(country, country_code)
+        if not iso3:
+            continue
+        matched += 1
+        counts[iso3] += 1
+        raw_country_counts[str(country)] += 1
+    if not counts:
+        empty_fig = px.choropleth(
+            pd.DataFrame({"iso3": [], "hits": []}),
+            locations="iso3",
+            color="hits",
+            projection="natural earth",
+            title="Hits by Country",
+        )
+        return empty_fig, pd.DataFrame(columns=["iso3", "hits"]), f"No rows matched. Rows scanned: {scanned:,}"
+    agg = pd.DataFrame([{"iso3": k, "hits": v} for k, v in counts.items()]).sort_values("hits", ascending=False)
+    fig = px.choropleth(
+        agg,
+        locations="iso3",
+        color="hits",
+        projection="natural earth",
+        title="Hits by Country",
+        hover_name="iso3",
+    )
+    top = agg.head(30).reset_index(drop=True)
+    summary = f"Rows scanned: {scanned:,} • Rows mapped: {matched:,} • Countries: {len(agg):,} • Total hits: {int(agg['hits'].sum()):,}"
+    return fig, top, summary
+# Gradio UI: optional filter inputs, a trigger button, and three outputs
+# (summary text, choropleth plot, top-countries table) filled by build_report.
+with gr.Blocks(title="AI Recruiting Agent Usage Map") as demo:
+    gr.Markdown(
+        "# AI Recruiting Agent — Usage by Country\n"
+        "Loads **only** `usage/visits.jsonl` and visualizes hits by country."
+    )
+    # Free-text filters laid out side by side; an empty box disables that filter.
+    with gr.Row():
+        start_date = gr.Textbox(label="Start date (optional)", placeholder="2026-01-01")
+        end_date = gr.Textbox(label="End date (optional)", placeholder="2026-02-05")
+        url_contains = gr.Textbox(label="Space URL contains (optional)", placeholder="AI_Recruiting_Agent")
+    include_unknown = gr.Checkbox(label="Include 'Unknown' country rows", value=False)
+    run = gr.Button("Generate map")
+    # Output components, in the order they are created here.
+    summary = gr.Markdown()
+    plot = gr.Plot()
+    table = gr.Dataframe(label="Top countries", interactive=False)
+    # The outputs list follows build_report's return order (figure, table,
+    # summary), which differs from the creation order of the components above.
+    run.click(
+        fn=build_report,
+        inputs=[start_date, end_date, url_contains, include_unknown],
+        outputs=[plot, table, summary],
+    )
+# Start the app as soon as this module is executed.
+demo.launch()