Spaces:

SWE-Arena
/

SWE-Community

Sleeping

App Files Files Community

zhimin-z commited on Apr 4

Commit

d277f5a

0 Parent(s):

first commit

Browse files

Files changed (7) hide show

.gitattributes +35 -0
.github/workflows/hf_sync.yml +35 -0
.gitignore +6 -0
README.md +66 -0
app.py +661 -0
msr.py +715 -0
requirements.txt +10 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.github/workflows/hf_sync.yml ADDED Viewed

	@@ -0,0 +1,35 @@

+name: Sync to Hugging Face Space
+on:
+  push:
+    branches:
+      - main  # only pushes to main trigger the sync
+jobs:
+  sync:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout GitHub Repository
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0  # Fetch the entire history to avoid shallow clone issues
+      - name: Install Git LFS  # .gitattributes declares LFS filters for many binary patterns
+        run: |
+          curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
+          sudo apt-get install git-lfs
+          git lfs install
+      - name: Configure Git  # identity used by git for this job
+        run: |
+          git config --global user.name "GitHub Actions Bot"
+          git config --global user.email "actions@github.com"
+      - name: Push to Hugging Face  # force-pushes main, so the Space branch is overwritten with GitHub's history
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}  # repository secret; embedded in the remote URL below for authentication
+        run: |
+          git remote add huggingface https://user:${HF_TOKEN}@huggingface.co/spaces/SWE-Arena/SWE-Community
+          git fetch huggingface
+          git push huggingface main --force

.gitignore ADDED Viewed

	@@ -0,0 +1,6 @@

+*.claude
+*.env
+*.venv
+*.ipynb
+*.pyc
+*.duckdb

README.md ADDED Viewed

	@@ -0,0 +1,66 @@

+---
+title: SWE-Community
+emoji: 🌐
+colorFrom: green
+colorTo: blue
+sdk: gradio
+sdk_version: 5.50.0
+app_file: app.py
+hf_oauth: true
+pinned: false
+short_description: Track GitHub community statistics for SWE assistants
+---
+# SWE Assistant Community Leaderboard
+SWE-Community ranks software engineering assistants by their real-world GitHub community activity: wiki edits and team membership events.
+No benchmarks. No sandboxes. Just real community activity tracked from public repositories.
+## Why This Exists
+Most AI coding assistant benchmarks use synthetic tasks and simulated environments. This leaderboard measures real-world activity: how many wiki pages is the assistant editing? How many membership events is it generating? Is the assistant's community engagement growing?
+If an assistant is consistently active across different projects, that tells you something no benchmark can.
+## What We Track
+Key metrics from the last 180 days:
+**Leaderboard Table**
+- **Assistant Name**: Display name of the assistant
+- **Website**: Link to the assistant's homepage or documentation
+- **Total Wiki Edits**: Total number of wiki pages edited by the assistant
+- **Total Membership Events**: Number of team membership changes performed by the assistant
+**Monthly Trends**
+- Wiki edit volume over time (bar charts)
+- Membership event volume over time (bar charts)
+- Activity patterns across months
+We focus on 180 days to highlight current capabilities and active assistants.
+## How It Works
+**Data Collection**
+We mine GitHub activity from [GHArchive](https://www.gharchive.org/), tracking:
+- Wiki pages edited by the assistant (`GollumEvent` data)
+- Membership events by the assistant (`MemberEvent` data)
+**Regular Updates**
+The leaderboard refreshes daily.
+**Community Submissions**
+Anyone can submit an assistant. We store assistant metadata in `SWE-Arena/bot_data` and leaderboard results in `SWE-Arena/leaderboard_data`. Each submission's GitHub identifier is validated via the GitHub API.
+## What's Next
+Planned improvements:
+- Repository-based analysis (which repos are assistants active in)
+- Extended metrics (wiki page types, membership roles, access levels)
+- Organization and team breakdown
+- Activity patterns (page creations, updates, invitations, removals)
+## Questions or Issues?
+[Open an issue](https://github.com/SWE-Arena/SWE-Community/issues) for bugs, feature requests, or data concerns.

app.py ADDED Viewed

	@@ -0,0 +1,661 @@

+import gradio as gr
+from gradio_leaderboard import Leaderboard
+import json
+import os
+import time
+import requests
+from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub.errors import HfHubHTTPError
+import backoff
+from dotenv import load_dotenv
+import pandas as pd
+import random
+import plotly.graph_objects as go
+from apscheduler.schedulers.background import BackgroundScheduler
+from apscheduler.triggers.cron import CronTrigger
+# Load environment variables
+load_dotenv(override=True)
+# =============================================================================
+# CONFIGURATION
+# =============================================================================
+AGENTS_REPO = "SWE-Arena/bot_data"  # HuggingFace dataset for assistant metadata
+LEADERBOARD_FILENAME = f"{os.getenv('COMPOSE_PROJECT_NAME')}.json"  # NOTE(review): becomes "None.json" when COMPOSE_PROJECT_NAME is unset
+LEADERBOARD_REPO = "SWE-Arena/leaderboard_data"  # HuggingFace dataset for leaderboard data
+MAX_RETRIES = 5  # max_tries handed to the backoff decorators in this module
+LEADERBOARD_COLUMNS = [  # (column name, datatype) pairs; order must match the rows built in get_leaderboard_dataframe
+    ("Assistant", "string"),
+    ("Website", "string"),
+    ("Total Wiki Edits", "number"),
+    ("Total Membership Events", "number"),
+]
+# =============================================================================
+# HUGGINGFACE API WRAPPERS WITH BACKOFF
+# =============================================================================
+def is_rate_limit_error(e):
+    """Check if exception is a HuggingFace rate limit error (429)."""
+    if isinstance(e, HfHubHTTPError):
+        return e.response.status_code == 429
+    return False
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    max_tries=MAX_RETRIES,
+    base=300,
+    max_value=3600,
+    giveup=lambda e: not is_rate_limit_error(e),
+    on_backoff=lambda details: print(
+        f"Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
+    )
+)
+def list_repo_files_with_backoff(api, **kwargs):
+    """Wrapper for api.list_repo_files() with exponential backoff for rate limits."""
+    return api.list_repo_files(**kwargs)
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    max_tries=MAX_RETRIES,
+    base=300,
+    max_value=3600,
+    giveup=lambda e: not is_rate_limit_error(e),
+    on_backoff=lambda details: print(
+        f"Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
+    )
+)
+def hf_hub_download_with_backoff(**kwargs):
+    """Wrapper for hf_hub_download() with exponential backoff for rate limits."""
+    return hf_hub_download(**kwargs)
+# =============================================================================
+# GITHUB USERNAME VALIDATION
+# =============================================================================
+def validate_github_username(identifier):
+    """Verify that a GitHub identifier exists."""
+    try:
+        response = requests.get(f'https://api.github.com/users/{identifier}', timeout=10)
+        return (True, "Username is valid") if response.status_code == 200 else (False, "GitHub identifier not found" if response.status_code == 404 else f"Validation error: HTTP {response.status_code}")
+    except Exception as e:
+        return False, f"Validation error: {str(e)}"
+# =============================================================================
+# HUGGINGFACE DATASET OPERATIONS
+# =============================================================================
+def load_agents_from_hf():
+    """Load all assistant metadata JSON files from HuggingFace dataset."""
+    try:
+        api = HfApi()
+        assistants = []
+        # List all files in the repository
+        files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")
+        # Filter for JSON files only
+        json_files = [f for f in files if f.endswith('.json')]
+        # Download and parse each JSON file
+        for json_file in json_files:
+            try:
+                file_path = hf_hub_download_with_backoff(
+                    repo_id=AGENTS_REPO,
+                    filename=json_file,
+                    repo_type="dataset"
+                )
+                with open(file_path, 'r') as f:
+                    agent_data = json.load(f)
+                    # Only process assistants with status == "active"
+                    if agent_data.get('status') != 'active':
+                        continue
+                    # Extract github_identifier from filename (e.g., "assistant[bot].json" -> "assistant[bot]")
+                    filename_identifier = json_file.replace('.json', '')
+                    # Add or override github_identifier to match filename
+                    agent_data['github_identifier'] = filename_identifier
+                    assistants.append(agent_data)
+            except Exception as e:
+                print(f"Warning: Could not load {json_file}: {str(e)}")
+                continue
+        print(f"Loaded {len(assistants)} assistants from HuggingFace")
+        return assistants
+    except Exception as e:
+        print(f"Could not load assistants from HuggingFace: {str(e)}")
+        return None
+def get_hf_token():
+    """Get HuggingFace token from environment variables."""
+    token = os.getenv('HF_TOKEN')
+    if not token:
+        print("Warning: HF_TOKEN not found in environment variables")
+    return token
+def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, token, max_retries=5):
+    """Upload file to HuggingFace with exponential backoff retry logic."""
+    delay = 2.0
+    for attempt in range(max_retries):
+        try:
+            api.upload_file(
+                path_or_fileobj=path_or_fileobj,
+                path_in_repo=path_in_repo,
+                repo_id=repo_id,
+                repo_type=repo_type,
+                token=token
+            )
+            if attempt > 0:
+                print(f"   Upload succeeded on attempt {attempt + 1}/{max_retries}")
+            return True
+        except Exception as e:
+            if attempt < max_retries - 1:
+                wait_time = delay + random.uniform(0, 1.0)
+                print(f"   Upload failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
+                print(f"   Retrying in {wait_time:.1f} seconds...")
+                time.sleep(wait_time)
+                delay = min(delay * 2, 60.0)
+            else:
+                print(f"   Upload failed after {max_retries} attempts: {str(e)}")
+                raise
+def save_agent_to_hf(data):
+    """Save a new assistant to HuggingFace dataset as {identifier}.json in root."""
+    try:
+        api = HfApi()
+        token = get_hf_token()
+        if not token:
+            raise Exception("No HuggingFace token found. Please set HF_TOKEN in your Space settings.")
+        identifier = data['github_identifier']
+        filename = f"{identifier}.json"
+        # Save locally first
+        with open(filename, 'w') as f:
+            json.dump(data, f, indent=2)
+        try:
+            # Upload to HuggingFace (root directory)
+            upload_with_retry(
+                api=api,
+                path_or_fileobj=filename,
+                path_in_repo=filename,
+                repo_id=AGENTS_REPO,
+                repo_type="dataset",
+                token=token
+            )
+            print(f"Saved assistant to HuggingFace: {filename}")
+            return True
+        finally:
+            # Always clean up local file, even if upload fails
+            if os.path.exists(filename):
+                os.remove(filename)
+    except Exception as e:
+        print(f"Error saving assistant: {str(e)}")
+        return False
+def load_leaderboard_data_from_hf():
+    """Load leaderboard data and monthly metrics from HuggingFace dataset."""
+    try:
+        token = get_hf_token()
+        # Download file
+        file_path = hf_hub_download_with_backoff(
+            repo_id=LEADERBOARD_REPO,
+            filename=LEADERBOARD_FILENAME,
+            repo_type="dataset",
+            token=token
+        )
+        # Load JSON data
+        with open(file_path, 'r') as f:
+            data = json.load(f)
+        last_updated = data.get('metadata', {}).get('last_updated', 'Unknown')
+        print(f"Loaded leaderboard data from HuggingFace (last updated: {last_updated})")
+        return data
+    except Exception as e:
+        print(f"Could not load leaderboard data from HuggingFace: {str(e)}")
+        return None
+# =============================================================================
+# UI FUNCTIONS
+# =============================================================================
+def _empty_plot(message="No data available for visualization"):
+    """Return an empty Plotly figure with a message."""
+    fig = go.Figure()
+    fig.add_annotation(
+        text=message,
+        xref="paper", yref="paper",
+        x=0.5, y=0.5, showarrow=False,
+        font=dict(size=16)
+    )
+    fig.update_layout(title=None, xaxis_title=None, height=500)
+    return fig
+def _generate_color(index, total):
+    """Generate distinct colors using HSL color space for better distribution."""
+    hue = (index * 360 / total) % 360
+    saturation = 70 + (index % 3) * 10
+    lightness = 45 + (index % 2) * 10
+    return f'hsl({hue}, {saturation}%, {lightness}%)'
+def create_monthly_wiki_plot(top_n=5):
+    """Create a Plotly figure showing monthly wiki edits as bar charts."""
+    saved_data = load_leaderboard_data_from_hf()
+    if not saved_data or 'monthly_metrics' not in saved_data:
+        return _empty_plot()
+    metrics = saved_data['monthly_metrics']
+    # Apply top_n filter
+    if top_n is not None and top_n > 0 and metrics.get('assistants'):
+        agent_totals = []
+        for agent_name in metrics['assistants']:
+            agent_data = metrics['data'].get(agent_name, {})
+            wiki_edits = sum(agent_data.get('total_wiki_edits', []))
+            agent_totals.append((agent_name, wiki_edits))
+        agent_totals.sort(key=lambda x: x[1], reverse=True)
+        top_agents = [name for name, _ in agent_totals[:top_n]]
+        metrics = {
+            'assistants': top_agents,
+            'months': metrics['months'],
+            'data': {a: metrics['data'][a] for a in top_agents if a in metrics['data']}
+        }
+    if not metrics['assistants'] or not metrics['months']:
+        return _empty_plot()
+    fig = go.Figure()
+    assistants = metrics['assistants']
+    months = metrics['months']
+    data = metrics['data']
+    for idx, agent_name in enumerate(assistants):
+        color = _generate_color(idx, len(assistants))
+        agent_data = data[agent_name]
+        x_bars = []
+        y_bars = []
+        for month, count in zip(months, agent_data.get('total_wiki_edits', [])):
+            if count > 0:
+                x_bars.append(month)
+                y_bars.append(count)
+        if x_bars and y_bars:
+            fig.add_trace(
+                go.Bar(
+                    x=x_bars, y=y_bars, name=agent_name,
+                    marker=dict(color=color, opacity=0.7),
+                    hovertemplate='<b>%{fullData.name}</b><br>Month: %{x}<br>Wiki Edits: %{y}<extra></extra>',
+                    offsetgroup=agent_name
+                )
+            )
+    fig.update_xaxes(title_text=None)
+    fig.update_yaxes(title_text="<b>Wiki Edits</b>")
+    show_legend = (top_n is not None and top_n <= 10)
+    fig.update_layout(
+        title=None, hovermode='closest', barmode='group', height=600,
+        showlegend=show_legend,
+        margin=dict(l=50, r=150 if show_legend else 50, t=50, b=50)
+    )
+    return fig
+def create_monthly_members_plot(top_n=5):
+    """Create a Plotly figure showing monthly membership events as bar charts."""
+    saved_data = load_leaderboard_data_from_hf()
+    if not saved_data or 'monthly_metrics' not in saved_data:
+        return _empty_plot()
+    metrics = saved_data['monthly_metrics']
+    # Apply top_n filter
+    if top_n is not None and top_n > 0 and metrics.get('assistants'):
+        agent_totals = []
+        for agent_name in metrics['assistants']:
+            agent_data = metrics['data'].get(agent_name, {})
+            total_members = sum(agent_data.get('total_members', []))
+            agent_totals.append((agent_name, total_members))
+        agent_totals.sort(key=lambda x: x[1], reverse=True)
+        top_agents = [name for name, _ in agent_totals[:top_n]]
+        metrics = {
+            'assistants': top_agents,
+            'months': metrics['months'],
+            'data': {a: metrics['data'][a] for a in top_agents if a in metrics['data']}
+        }
+    if not metrics['assistants'] or not metrics['months']:
+        return _empty_plot()
+    fig = go.Figure()
+    assistants = metrics['assistants']
+    months = metrics['months']
+    data = metrics['data']
+    for idx, agent_name in enumerate(assistants):
+        color = _generate_color(idx, len(assistants))
+        agent_data = data[agent_name]
+        x_bars = []
+        y_bars = []
+        for month, count in zip(months, agent_data.get('total_members', [])):
+            if count > 0:
+                x_bars.append(month)
+                y_bars.append(count)
+        if x_bars and y_bars:
+            fig.add_trace(
+                go.Bar(
+                    x=x_bars, y=y_bars, name=agent_name,
+                    marker=dict(color=color, opacity=0.7),
+                    hovertemplate='<b>%{fullData.name}</b><br>Month: %{x}<br>Membership Events: %{y}<extra></extra>',
+                    offsetgroup=agent_name
+                )
+            )
+    fig.update_xaxes(title_text=None)
+    fig.update_yaxes(title_text="<b>Membership Events</b>")
+    show_legend = (top_n is not None and top_n <= 10)
+    fig.update_layout(
+        title=None, hovermode='closest', barmode='group', height=600,
+        showlegend=show_legend,
+        margin=dict(l=50, r=150 if show_legend else 50, t=50, b=50)
+    )
+    return fig
+def get_leaderboard_dataframe():
+    """Load leaderboard from saved dataset and convert to pandas DataFrame for display."""
+    saved_data = load_leaderboard_data_from_hf()
+    if not saved_data or 'leaderboard' not in saved_data:
+        print(f"No leaderboard data available")
+        column_names = [col[0] for col in LEADERBOARD_COLUMNS]
+        return pd.DataFrame(columns=column_names)
+    cache_dict = saved_data['leaderboard']
+    last_updated = saved_data.get('metadata', {}).get('last_updated', 'Unknown')
+    print(f"Loaded leaderboard from saved dataset (last updated: {last_updated})")
+    print(f"Cache dict size: {len(cache_dict)}")
+    if not cache_dict:
+        print("WARNING: cache_dict is empty!")
+        column_names = [col[0] for col in LEADERBOARD_COLUMNS]
+        return pd.DataFrame(columns=column_names)
+    rows = []
+    filtered_count = 0
+    for identifier, data in cache_dict.items():
+        wiki_edits = data.get('total_wiki_edits', 0)
+        total_members = data.get('total_members', 0)
+        # Filter out assistants with zero activity across both metrics
+        if wiki_edits == 0 and total_members == 0:
+            filtered_count += 1
+            continue
+        rows.append([
+            data.get('name', 'Unknown'),
+            data.get('website', 'N/A'),
+            wiki_edits,
+            total_members,
+        ])
+    print(f"Filtered out {filtered_count} assistants with 0 activity")
+    print(f"Leaderboard will show {len(rows)} assistants")
+    # Create DataFrame
+    column_names = [col[0] for col in LEADERBOARD_COLUMNS]
+    df = pd.DataFrame(rows, columns=column_names)
+    # Ensure numeric types
+    numeric_cols = ["Total Wiki Edits", "Total Membership Events"]
+    for col in numeric_cols:
+        if col in df.columns:
+            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
+    # Sort by combined activity descending
+    if not df.empty:
+        df = df.sort_values(by=["Total Wiki Edits", "Total Membership Events"], ascending=False).reset_index(drop=True)
+    # Workaround for gradio_leaderboard bug: single-row tables don't render properly
+    if len(df) == 1:
+        placeholder_row = pd.DataFrame([[
+            "Submit yours to join!", "—", 0, 0
+        ]], columns=df.columns)
+        df = pd.concat([df, placeholder_row], ignore_index=True)
+        print("Added placeholder row for single-record workaround")
+    print(f"Final DataFrame shape: {df.shape}")
+    print("="*60 + "\n")
+    return df
+def submit_agent(identifier, agent_name, organization, website):
+    """Submit a new assistant to the leaderboard."""
+    # Validate required fields
+    if not identifier or not identifier.strip():
+        return "ERROR: GitHub identifier is required", gr.update()
+    if not agent_name or not agent_name.strip():
+        return "ERROR: Assistant name is required", gr.update()
+    if not organization or not organization.strip():
+        return "ERROR: Organization name is required", gr.update()
+    if not website or not website.strip():
+        return "ERROR: Website URL is required", gr.update()
+    # Clean inputs
+    identifier = identifier.strip()
+    agent_name = agent_name.strip()
+    organization = organization.strip()
+    website = website.strip()
+    # Validate GitHub identifier
+    is_valid, message = validate_github_username(identifier)
+    if not is_valid:
+        return f"ERROR: {message}", gr.update()
+    # Check for duplicates by loading assistants from HuggingFace
+    assistants = load_agents_from_hf()
+    if assistants:
+        existing_names = {assistant['github_identifier'] for assistant in assistants}
+        if identifier in existing_names:
+            return f"WARNING: Assistant with identifier '{identifier}' already exists", gr.update()
+    # Create submission
+    submission = {
+        'name': agent_name,
+        'organization': organization,
+        'github_identifier': identifier,
+        'website': website,
+        'status': 'active'
+    }
+    # Save to HuggingFace
+    if not save_agent_to_hf(submission):
+        return "ERROR: Failed to save submission", gr.update()
+    return f"SUCCESS: Successfully submitted {agent_name}! Community data will be automatically populated by the backend system via the maintainers.", gr.update()
+# =============================================================================
+# DATA RELOAD FUNCTION
+# =============================================================================
+def reload_leaderboard_data():
+    """Reload leaderboard data from HuggingFace. Called by scheduler daily."""
+    print(f"\n{'='*80}")
+    print(f"Reloading leaderboard data from HuggingFace...")
+    print(f"{'='*80}\n")
+    try:
+        data = load_leaderboard_data_from_hf()
+        if data:
+            print(f"Successfully reloaded leaderboard data")
+            print(f"   Last updated: {data.get('metadata', {}).get('last_updated', 'Unknown')}")
+            print(f"   Agents: {len(data.get('leaderboard', {}))}")
+        else:
+            print(f"No data available")
+    except Exception as e:
+        print(f"Error reloading leaderboard data: {str(e)}")
+    print(f"{'='*80}\n")
+# =============================================================================
+# GRADIO APPLICATION
+# =============================================================================
+print(f"\nStarting SWE Assistant Community Leaderboard")
+print(f"   Data source: {LEADERBOARD_REPO}")
+print(f"   Reload frequency: Daily at 12:00 AM UTC\n")
+# Start APScheduler for daily data reload at 12:00 AM UTC
+scheduler = BackgroundScheduler(timezone="UTC")  # module-level: created on import, not only when run as a script
+scheduler.add_job(
+    reload_leaderboard_data,
+    trigger=CronTrigger(hour=0, minute=0),  # NOTE(review): no timezone passed to the trigger itself; presumably it uses the scheduler's UTC — confirm
+    id='daily_data_reload',
+    name='Daily Data Reload',
+    replace_existing=True
+)
+scheduler.start()  # starts the scheduler as a side effect of importing this module
+print(f"\n{'='*80}")
+print(f"Scheduler initialized successfully")
+print(f"Reload schedule: Daily at 12:00 AM UTC")
+print(f"On startup: Loads cached data from HuggingFace on demand")
+print(f"{'='*80}\n")
+# Create Gradio interface
+with gr.Blocks(title="SWE Assistant Community Leaderboard", theme=gr.themes.Soft()) as app:  # UI definition; `app` is launched in the __main__ guard at the bottom
+    gr.Markdown("# SWE Assistant Community Leaderboard")
+    gr.Markdown(f"Track and compare community activity (wiki edits & membership events) by SWE assistants")
+    with gr.Tabs():
+        # Leaderboard Tab
+        with gr.Tab("Leaderboard"):
+            gr.Markdown("*Statistics are based on wiki edits and membership events by assistants*")
+            leaderboard_table = Leaderboard(
+                value=pd.DataFrame(columns=[col[0] for col in LEADERBOARD_COLUMNS]),  # empty frame with headers only; filled by app.load below
+                datatype=LEADERBOARD_COLUMNS,  # NOTE(review): passes (name, type) tuples rather than bare type strings — confirm gradio_leaderboard accepts this
+                search_columns=["Assistant", "Website"],
+                filter_columns=[]
+            )
+            # Load leaderboard data when app starts
+            app.load(  # each of the three app.load callbacks calls load_leaderboard_data_from_hf() on its own
+                fn=get_leaderboard_dataframe,
+                inputs=[],
+                outputs=[leaderboard_table]
+            )
+            # Monthly Performance Metrics
+            gr.Markdown("---")
+            gr.Markdown("## Monthly Performance Metrics - Top 5 Assistants")
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("*Wiki edit volume over time*")
+                    wiki_plot = gr.Plot()
+                with gr.Column():
+                    gr.Markdown("*Membership event volume over time*")
+                    members_plot = gr.Plot()
+            app.load(
+                fn=lambda: create_monthly_wiki_plot(),  # uses the default top_n=5, matching the "Top 5" heading
+                inputs=[],
+                outputs=[wiki_plot]
+            )
+            app.load(
+                fn=lambda: create_monthly_members_plot(),  # uses the default top_n=5, matching the "Top 5" heading
+                inputs=[],
+                outputs=[members_plot]
+            )
+        # Submit Assistant Tab
+        with gr.Tab("Submit Your Assistant"):
+            gr.Markdown("Fill in the details below to add your assistant to the leaderboard.")
+            with gr.Row():
+                with gr.Column():
+                    github_input = gr.Textbox(
+                        label="GitHub Identifier*",
+                        placeholder="Your assistant username (e.g., my-assistant[bot])"
+                    )
+                    name_input = gr.Textbox(
+                        label="Assistant Name*",
+                        placeholder="Your assistant's display name"
+                    )
+                with gr.Column():
+                    organization_input = gr.Textbox(
+                        label="Organization*",
+                        placeholder="Your organization or team name"
+                    )
+                    website_input = gr.Textbox(
+                        label="Website*",
+                        placeholder="https://your-assistant-website.com"
+                    )
+            submit_button = gr.Button(
+                "Submit Assistant",
+                variant="primary"
+            )
+            submission_status = gr.Textbox(
+                label="Submission Status",
+                interactive=False
+            )
+            # Event handler
+            submit_button.click(
+                fn=submit_agent,
+                inputs=[github_input, name_input, organization_input, website_input],  # order matches submit_agent's parameters
+                outputs=[submission_status, leaderboard_table]  # submit_agent returns gr.update() for the table, leaving it unchanged
+            )
+# Launch application
+if __name__ == "__main__":
+    app.launch()

msr.py ADDED Viewed

	@@ -0,0 +1,715 @@

+import json
+import os
+from datetime import datetime, timezone, timedelta
+from collections import defaultdict
+from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub.errors import HfHubHTTPError
+from dotenv import load_dotenv
+import duckdb
+import backoff
+import requests
+import requests.exceptions
+import traceback
+import re
+# Load environment variables
+load_dotenv(override=True)
+# =============================================================================
+# CONFIGURATION
+# =============================================================================
+# Get script directory for relative paths
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
+BASE_DIR = os.path.dirname(SCRIPT_DIR)  # Parent directory
+AGENTS_REPO = "SWE-Arena/bot_data"  # same dataset id as AGENTS_REPO in app.py
+AGENTS_REPO_LOCAL_PATH = os.path.join(BASE_DIR, "bot_data")  # Local git clone path
+DUCKDB_CACHE_FILE = os.path.join(SCRIPT_DIR, "cache.duckdb")  # matches the *.duckdb pattern in .gitignore
+GHARCHIVE_DATA_LOCAL_PATH = os.path.join(BASE_DIR, "gharchive/data")  # download target used by download_file()
+LEADERBOARD_FILENAME = f"{os.getenv('COMPOSE_PROJECT_NAME')}.json"  # NOTE(review): becomes "None.json" when COMPOSE_PROJECT_NAME is unset
+LEADERBOARD_REPO = "SWE-Arena/leaderboard_data"
+LEADERBOARD_TIME_FRAME_DAYS = 180  # look-back window, in days
+# Git sync configuration (mandatory to get latest bot data)
+GIT_SYNC_TIMEOUT = 300  # 5 minutes timeout for git pull
+# Streaming batch configuration
+BATCH_SIZE_DAYS = 1  # Process 1 day at a time (~24 hourly files)
+# Retry configuration
+MAX_RETRIES = 5
+# =============================================================================
+# UTILITY FUNCTIONS
+# =============================================================================
+def load_jsonl(filename):
+    """Load JSONL file and return list of dictionaries."""
+    if not os.path.exists(filename):
+        return []
+    data = []
+    with open(filename, 'r', encoding='utf-8') as f:
+        for line in f:
+            line = line.strip()
+            if line:
+                try:
+                    data.append(json.loads(line))
+                except json.JSONDecodeError as e:
+                    print(f"Warning: Skipping invalid JSON line: {e}")
+    return data
+def save_jsonl(filename, data):
+    """Save list of dictionaries to JSONL file."""
+    with open(filename, 'w', encoding='utf-8') as f:
+        for item in data:
+            f.write(json.dumps(item) + '\n')
def normalize_date_format(date_string):
    """Convert a date string or datetime to ``YYYY-MM-DDTHH:MM:SSZ`` expressed in UTC.

    Args:
        date_string: A ``datetime`` instance or an ISO 8601-like string. The
            date and time may be separated by ``T`` or by whitespace, and the
            string may end in ``Z`` or a UTC offset (``+HH`` or ``+HH:MM``).

    Returns:
        The normalized timestamp string. Falsy input and ``'N/A'`` yield
        ``'N/A'``. Input that cannot be parsed is returned unchanged after a
        warning is printed.

    Offset-aware values are converted to UTC before formatting; naive values
    are taken to already be in UTC.
    """
    if not date_string or date_string == 'N/A':
        return 'N/A'
    try:
        if isinstance(date_string, datetime):
            dt = date_string
        else:
            parts = date_string.split()
            if not parts:
                raise ValueError("empty date string")
            # Join date and time with 'T'; glue any detached offset ("... +00:00") back on.
            text = parts[0]
            if len(parts) > 1:
                text += 'T' + ''.join(parts[1:])
            if text.endswith(('Z', 'z')):
                text = text[:-1] + '+00:00'
            # Expand a short "+HH" offset to "+HH:MM". Only the time part is inspected so
            # that a date-only value such as "2024-01-15" is not mistaken for an offset.
            _, separator, time_part = text.partition('T')
            if separator and re.search(r'[+-]\d{2}$', time_part):
                text += ':00'
            dt = datetime.fromisoformat(text)
        if dt.tzinfo is not None:
            # The 'Z' suffix claims UTC, so the wall-clock time must actually be UTC.
            dt = dt.astimezone(timezone.utc)
        return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
    except Exception as e:
        print(f"Warning: Could not parse date '{date_string}': {e}")
        return date_string
def get_hf_token():
    """Return the ``HF_TOKEN`` environment variable, warning on stdout when it is unset or empty."""
    token = os.environ.get('HF_TOKEN')
    if token:
        return token
    print("Warning: HF_TOKEN not found in environment variables")
    return token
+# =============================================================================
+# GHARCHIVE DOWNLOAD FUNCTIONS
+# =============================================================================
def download_file(url):
    """Download one GHArchive file into GHARCHIVE_DATA_LOCAL_PATH, retrying transient failures.

    The response is streamed to a ``.part`` file and renamed into place only
    once complete, so an interrupted download never leaves a truncated archive
    that a later run would mistake for a finished one.

    Args:
        url: URL of the archive; its last path segment is used as the file name.

    Returns:
        True if the file already exists or was downloaded, False otherwise.
    """
    import time  # local import: only needed for the delay between retries

    filename = url.split("/")[-1]
    filepath = os.path.join(GHARCHIVE_DATA_LOCAL_PATH, filename)
    if os.path.exists(filepath):
        return True
    partial_path = filepath + ".part"
    last_error = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            with requests.get(url, timeout=30, stream=True) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    # Stream in 1 MiB chunks instead of holding the whole archive in memory.
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            os.replace(partial_path, filepath)
            return True
        except Exception as e:
            last_error = e
            try:
                os.remove(partial_path)
            except OSError:
                pass  # best-effort cleanup; the partial file may never have been created
            status = getattr(getattr(e, "response", None), "status_code", None)
            # Client errors (e.g. 404 for an hour that is not published yet) will not
            # succeed on retry; 429 is the exception because it is a rate limit.
            if status is not None and 400 <= status < 500 and status != 429:
                break
            if attempt < MAX_RETRIES:
                time.sleep(min(2 ** attempt, 30))
    print(f"   ⚠ {filename}: {last_error}")
    return False
def download_all_gharchive_data():
    """Fetch every hourly GHArchive file for the last LEADERBOARD_TIME_FRAME_DAYS days.

    Returns True only if every individual download reported success; all URLs
    are attempted regardless of earlier failures.
    """
    os.makedirs(GHARCHIVE_DATA_LOCAL_PATH, exist_ok=True)
    window_end = datetime.now(timezone.utc)
    day = window_end - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
    urls = []
    while day <= window_end:
        prefix = day.strftime("%Y-%m-%d")
        urls.extend(
            f"https://data.gharchive.org/{prefix}-{hour}.json.gz" for hour in range(24)
        )
        day += timedelta(days=1)
    # Materialize the results first so that a failure never short-circuits later downloads.
    outcomes = [download_file(url) for url in urls]
    return all(outcomes)
+# =============================================================================
+# HUGGINGFACE API WRAPPERS
+# =============================================================================
def is_retryable_error(e):
    """Return True when ``e`` is an HTTP 429 from the Hub or looks like a timeout."""
    rate_limited = isinstance(e, HfHubHTTPError) and e.response.status_code == 429
    if rate_limited:
        return True
    timeout_types = (
        requests.exceptions.Timeout,
        requests.exceptions.ReadTimeout,
        requests.exceptions.ConnectTimeout,
    )
    if isinstance(e, timeout_types):
        return True
    if not isinstance(e, Exception):
        return False
    # Fall back to the message text for timeouts raised as other exception types.
    message = str(e).lower()
    return 'timeout' in message or 'timed out' in message
def _log_hf_backoff(details):
    """Print the exception, the upcoming wait and the attempt counter before a retry."""
    wait = details['wait']
    print(
        f"   {details['exception']} error. Retrying in {wait/60:.1f} minutes ({wait:.0f}s) - attempt {details['tries']}/{MAX_RETRIES}..."
    )


# One retry policy shared by every Hub wrapper below, so the policy and its log message
# cannot drift apart. Only errors accepted by is_retryable_error() are retried; anything
# else is re-raised immediately via ``giveup``.
# NOTE(review): backoff.expo computes factor * base ** n, so base=300 presumably yields
# waits of roughly 1s, 300s and then max_value (before jitter) — confirm this is intended.
_hf_backoff = backoff.on_exception(
    backoff.expo,
    (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
    max_tries=MAX_RETRIES,
    base=300,
    max_value=3600,
    giveup=lambda e: not is_retryable_error(e),
    on_backoff=_log_hf_backoff,
)


@_hf_backoff
def list_repo_files_with_backoff(api, **kwargs):
    """Call ``api.list_repo_files(**kwargs)`` under the shared retry policy."""
    return api.list_repo_files(**kwargs)


@_hf_backoff
def hf_hub_download_with_backoff(**kwargs):
    """Call ``hf_hub_download(**kwargs)`` under the shared retry policy."""
    return hf_hub_download(**kwargs)


@_hf_backoff
def upload_file_with_backoff(api, **kwargs):
    """Call ``api.upload_file(**kwargs)`` under the shared retry policy."""
    return api.upload_file(**kwargs)


@_hf_backoff
def upload_folder_with_backoff(api, **kwargs):
    """Call ``api.upload_folder(**kwargs)`` under the shared retry policy."""
    return api.upload_folder(**kwargs)
def get_duckdb_connection():
    """Open the persistent DuckDB database at DUCKDB_CACHE_FILE and apply session settings.

    If connecting fails with a message mentioning a lock or a conflict, the
    cache file is deleted and the connection is attempted once more; any other
    connection error is re-raised unchanged.

    Returns:
        A DuckDB connection configured with 4 threads, a 50GB memory limit,
        a spill directory under /tmp, insertion-order preservation disabled
        and the object cache enabled.
    """
    try:
        conn = duckdb.connect(DUCKDB_CACHE_FILE)
    except Exception as e:
        error_msg = str(e).lower()
        if "lock" not in error_msg and "conflicting" not in error_msg:
            # Not a locking problem: let the caller see the original error.
            raise
        print(f"   ⚠ Lock conflict detected, removing {DUCKDB_CACHE_FILE}...")
        if os.path.exists(DUCKDB_CACHE_FILE):
            os.remove(DUCKDB_CACHE_FILE)
            print("   ✓ Cache file removed, retrying connection...")
        # Also drop a leftover write-ahead log so it is not applied to the fresh database.
        # NOTE(review): assumes DuckDB names the WAL "<database file>.wal" — confirm.
        wal_file = DUCKDB_CACHE_FILE + ".wal"
        if os.path.exists(wal_file):
            os.remove(wal_file)
        conn = duckdb.connect(DUCKDB_CACHE_FILE)
    settings = (
        # Core memory and threading settings
        "SET threads TO 4;",
        "SET max_memory = '50GB';",
        "SET temp_directory = '/tmp/duckdb_temp';",
        # Performance: skip ordering work and cache repeatedly read files
        "SET preserve_insertion_order = false;",
        "SET enable_object_cache = true;",
    )
    try:
        for statement in settings:
            conn.execute(statement)
    except Exception:
        # Do not leak the handle (and its file lock) when configuration fails.
        conn.close()
        raise
    return conn
def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_LOCAL_PATH):
    """List the hourly GHArchive files that exist on disk for each day in the range.

    Both bounds are truncated to midnight and the range is inclusive. Days for
    which none of the 24 hourly files exist are counted and reported once.
    """
    midnight = dict(hour=0, minute=0, second=0, microsecond=0)
    day = start_date.replace(**midnight)
    last_day = end_date.replace(**midnight)
    existing_paths = []
    empty_day_count = 0
    while day <= last_day:
        stamp = day.strftime('%Y-%m-%d')
        candidates = (
            os.path.join(data_dir, f"{stamp}-{hour}.json.gz") for hour in range(24)
        )
        found = [path for path in candidates if os.path.exists(path)]
        if found:
            existing_paths.extend(found)
        else:
            empty_day_count += 1
        day += timedelta(days=1)
    if empty_day_count:
        print(f"   ○ Skipping {empty_day_count} date(s) with no data")
    return existing_paths
+# =============================================================================
+# STREAMING BATCH PROCESSING
+# =============================================================================
+def fetch_all_community_metadata_streaming(conn, identifiers, start_date, end_date):
+    """
+    QUERY: Fetch community metadata (wiki edits + member events) using streaming batch processing.
+    Args:
+        conn: DuckDB connection instance
+        identifiers: List of GitHub usernames/bot identifiers
+        start_date: Start datetime (timezone-aware)
+        end_date: End datetime (timezone-aware)
+    Returns:
+        Tuple of (wiki_metadata_by_agent, member_metadata_by_agent)
+    """
+    identifier_list = ', '.join([f"'{id}'" for id in identifiers])
+    wiki_metadata_by_agent = defaultdict(list)
+    member_metadata_by_agent = defaultdict(list)
+    # Calculate total batches
+    total_days = (end_date - start_date).days
+    total_batches = (total_days // BATCH_SIZE_DAYS) + 1
+    # Process in configurable batches
+    current_date = start_date
+    batch_num = 0
+    total_wiki_edits = 0
+    total_members = 0
+    print(f"   Streaming {total_batches} batches of {BATCH_SIZE_DAYS}-day intervals...")
+    while current_date <= end_date:
+        batch_num += 1
+        batch_end = min(current_date + timedelta(days=BATCH_SIZE_DAYS - 1), end_date)
+        # Get file patterns for THIS BATCH ONLY
+        file_patterns = generate_file_path_patterns(current_date, batch_end)
+        if not file_patterns:
+            print(f"   Batch {batch_num}/{total_batches}: {current_date.date()} to {batch_end.date()} - NO DATA")
+            current_date = batch_end + timedelta(days=1)
+            continue
+        print(f"   Batch {batch_num}/{total_batches}: {current_date.date()} to {batch_end.date()} ({len(file_patterns)} files)... ", end="", flush=True)
+        file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
+        # --- Wiki query (GollumEvent) ---
+        wiki_query = f"""
+        SELECT
+            TRY_CAST(json_extract_string(to_json(actor), '$.login') AS VARCHAR) as assistant,
+            TRY_CAST(json_array_length(json_extract(to_json(payload), '$.pages')) AS INTEGER) as page_count,
+            created_at
+        FROM read_json(
+            {file_patterns_sql},
+            union_by_name=true,
+            filename=true,
+            compression='gzip',
+            format='newline_delimited',
+            ignore_errors=true
+        )
+        WHERE type = 'GollumEvent'
+            AND json_extract(to_json(payload), '$.pages') IS NOT NULL
+            AND TRY_CAST(json_extract_string(to_json(actor), '$.login') AS VARCHAR) IN ({identifier_list})
+        """
+        # --- Member query (MemberEvent) ---
+        member_query = f"""
+        SELECT DISTINCT
+            actor.login as assistant,
+            TRY_CAST(json_extract_string(to_json(payload), '$.member.login') AS VARCHAR) as member_login,
+            TRY_CAST(json_extract_string(to_json(payload), '$.action') AS VARCHAR) as action,
+            created_at
+        FROM read_json(
+            {file_patterns_sql},
+            union_by_name=true,
+            filename=true,
+            compression='gzip',
+            format='newline_delimited',
+            ignore_errors=true
+        )
+        WHERE type = 'MemberEvent'
+            AND TRY_CAST(json_extract_string(to_json(payload), '$.member.login') AS VARCHAR) IS NOT NULL
+            AND TRY_CAST(json_extract_string(to_json(actor), '$.login') AS VARCHAR) IN ({identifier_list})
+        """
+        try:
+            # Wiki results
+            batch_wiki_edits = 0
+            results = conn.execute(wiki_query).fetchall()
+            for row in results:
+                assistant = row[0]
+                page_count = row[1] if row[1] is not None else 0
+                created_at = normalize_date_format(row[2]) if row[2] else None
+                if not assistant or page_count == 0:
+                    continue
+                wiki_metadata_by_agent[assistant].append({
+                    'page_count': page_count,
+                    'created_at': created_at,
+                })
+                batch_wiki_edits += page_count
+                total_wiki_edits += page_count
+            # Member results
+            batch_members = 0
+            results = conn.execute(member_query).fetchall()
+            for row in results:
+                assistant = row[0]
+                member_login = row[1]
+                action = row[2]
+                created_at = normalize_date_format(row[3]) if row[3] else None
+                if not assistant or not member_login:
+                    continue
+                member_metadata_by_agent[assistant].append({
+                    'member_login': member_login,
+                    'action': action,
+                    'created_at': created_at,
+                })
+                batch_members += 1
+                total_members += 1
+            print(f"✓ {batch_wiki_edits} wiki edits, {batch_members} members")
+        except Exception as e:
+            print(f"\n   ✗ Batch {batch_num} error: {str(e)}")
+            traceback.print_exc()
+        current_date = batch_end + timedelta(days=1)
+    # Final summary
+    wiki_agents = sum(1 for v in wiki_metadata_by_agent.values() if v)
+    member_agents = sum(1 for v in member_metadata_by_agent.values() if v)
+    print(f"\n   ✓ Complete: {total_wiki_edits} wiki edits ({wiki_agents} assistants), {total_members} members ({member_agents} assistants)")
+    return dict(wiki_metadata_by_agent), dict(member_metadata_by_agent)
def load_agents_from_hf():
    """
    Load all active assistant metadata JSON files from the local git repository.

    Walks AGENTS_REPO_LOCAL_PATH (skipping the ``.git`` directory), parses
    every ``*.json`` file and keeps the entries whose ``status`` is
    ``'active'``. The file name without its ``.json`` suffix is stored on each
    entry as ``github_identifier``. Files that cannot be read or parsed are
    reported and skipped.

    Returns:
        List of assistant dicts, in sorted directory/file order.

    Raises:
        FileNotFoundError: If AGENTS_REPO_LOCAL_PATH does not exist.
    """
    if not os.path.exists(AGENTS_REPO_LOCAL_PATH):
        raise FileNotFoundError(f"Local repository not found at {AGENTS_REPO_LOCAL_PATH}")
    assistants = []
    files_processed = 0
    suffix = '.json'
    print(f"   Loading assistant metadata from {AGENTS_REPO_LOCAL_PATH}...")
    for root, dirs, files in os.walk(AGENTS_REPO_LOCAL_PATH):
        # Prune exactly the .git directory in place. A substring test on ``root`` would
        # also skip unrelated paths (e.g. a parent directory named "project.git").
        # Sorting makes the traversal order, and therefore the result, deterministic.
        dirs[:] = sorted(d for d in dirs if d != '.git')
        for filename in sorted(files):
            if not filename.endswith(suffix):
                continue
            files_processed += 1
            file_path = os.path.join(root, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    agent_data = json.load(f)
                # Only include active assistants
                if agent_data.get('status') != 'active':
                    continue
                # Strip only the trailing extension; the identifier itself may contain ".json".
                agent_data['github_identifier'] = filename[:-len(suffix)]
                assistants.append(agent_data)
            except Exception as e:
                print(f"   ○ Error loading {filename}: {str(e)}")
                continue
    print(f"   ✓ Loaded {len(assistants)} active assistants (from {files_processed} total files)")
    return assistants
def calculate_community_stats(wiki_metadata, member_metadata):
    """Summarize one assistant's activity: summed wiki page counts and number of member events."""
    wiki_edit_total = 0
    for event in wiki_metadata:
        wiki_edit_total += event.get('page_count', 0)
    return {
        'total_wiki_edits': wiki_edit_total,
        'total_members': len(member_metadata),
    }
def _group_events_by_agent_month(metadata_dict, identifier_to_name):
    """Bucket events as {display name: {'YYYY-MM': [events]}}, skipping undated or unparsable ones."""
    buckets = defaultdict(lambda: defaultdict(list))
    for agent_identifier, metadata_list in metadata_dict.items():
        agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
        for event in metadata_list:
            created_at = event.get('created_at')
            if not created_at:
                continue
            try:
                dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
            except Exception as e:
                print(f"Warning: Could not parse date '{created_at}': {e}")
                continue
            buckets[agent_name][f"{dt.year}-{dt.month:02d}"].append(event)
    return buckets


def calculate_monthly_metrics_by_agent(wiki_metadata_dict, member_metadata_dict, assistants):
    """Calculate monthly metrics for all assistants for visualization.

    Args:
        wiki_metadata_dict: Maps identifier -> list of wiki events ('page_count', 'created_at').
        member_metadata_dict: Maps identifier -> list of member events (with 'created_at').
        assistants: Assistant dicts providing 'github_identifier' and optionally 'name'.

    Returns:
        Dict with 'assistants' (sorted display names), 'months' (sorted
        'YYYY-MM' keys) and 'data', which maps each display name to
        'total_wiki_edits' and 'total_members' lists aligned with 'months'.
    """
    # Fall back to the identifier when an assistant has no name, so display names are
    # always strings and can be sorted together.
    identifier_to_name = {
        assistant.get('github_identifier'): assistant.get('name') or assistant.get('github_identifier')
        for assistant in assistants
        if assistant.get('github_identifier')
    }
    if not wiki_metadata_dict and not member_metadata_dict:
        return {'assistants': [], 'months': [], 'data': {}}
    agent_month_wiki = _group_events_by_agent_month(wiki_metadata_dict, identifier_to_name)
    agent_month_member = _group_events_by_agent_month(member_metadata_dict, identifier_to_name)
    # Collect every month and display name seen in either event type
    all_months = set()
    for per_month in list(agent_month_wiki.values()) + list(agent_month_member.values()):
        all_months.update(per_month)
    all_agent_names = set(agent_month_wiki) | set(agent_month_member)
    months = sorted(all_months)
    result_data = {}
    for agent_name in all_agent_names:
        wiki_by_month = agent_month_wiki.get(agent_name, {})
        member_by_month = agent_month_member.get(agent_name, {})
        result_data[agent_name] = {
            'total_wiki_edits': [
                sum(item.get('page_count', 0) for item in wiki_by_month.get(month, []))
                for month in months
            ],
            'total_members': [len(member_by_month.get(month, [])) for month in months],
        }
    return {
        'assistants': sorted(all_agent_names),
        'months': months,
        'data': result_data
    }
def construct_leaderboard_from_metadata(wiki_metadata_dict, member_metadata_dict, assistants):
    """Build the leaderboard mapping (identifier -> profile fields plus community stats).

    Returns an empty dict, after printing an error, when ``assistants`` is empty.
    """
    if not assistants:
        print("Error: No assistants found")
        return {}
    leaderboard = {}
    for entry in assistants:
        key = entry.get('github_identifier')
        row = {
            'name': entry.get('name', 'Unknown'),
            'website': entry.get('website', 'N/A'),
            'github_identifier': key,
        }
        # Stats are appended after the profile fields, matching the stored key order.
        row.update(
            calculate_community_stats(
                wiki_metadata_dict.get(key, []),
                member_metadata_dict.get(key, []),
            )
        )
        leaderboard[key] = row
    return leaderboard
def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
    """Save leaderboard data and monthly metrics to HuggingFace dataset.

    The combined document is serialized in memory and uploaded as
    LEADERBOARD_FILENAME to LEADERBOARD_REPO, so no temporary file is written
    to the working directory.

    Args:
        leaderboard_dict: Per-assistant leaderboard entries.
        monthly_metrics: Monthly metrics structure stored alongside the leaderboard.

    Returns:
        True on success; False if the token is missing or the upload fails
        (the error and traceback are printed).
    """
    try:
        token = get_hf_token()
        if not token:
            raise Exception("No HuggingFace token found")
        api = HfApi(token=token)
        combined_data = {
            'last_updated': datetime.now(timezone.utc).isoformat(),
            'leaderboard': leaderboard_dict,
            'monthly_metrics': monthly_metrics,
            'metadata': {
                'leaderboard_time_frame_days': LEADERBOARD_TIME_FRAME_DAYS
            }
        }
        # upload_file accepts raw bytes for path_or_fileobj. Serializing first also means
        # a serialization error surfaces before any upload starts.
        payload = json.dumps(combined_data, indent=2).encode('utf-8')
        upload_file_with_backoff(
            api=api,
            path_or_fileobj=payload,
            path_in_repo=LEADERBOARD_FILENAME,
            repo_id=LEADERBOARD_REPO,
            repo_type="dataset"
        )
        return True
    except Exception as e:
        print(f"Error saving leaderboard data: {str(e)}")
        traceback.print_exc()
        return False
+# =============================================================================
+# MINING FUNCTION
+# =============================================================================
def mine_all_agents():
    """
    Mine community metadata (wiki + members) for all assistants using STREAMING batch processing.

    Steps: download GHArchive data, load the assistant metadata, run the
    batch-based DuckDB queries over the last LEADERBOARD_TIME_FRAME_DAYS days,
    then build and upload the leaderboard. Each failure is printed and ends
    the run early; nothing is raised to the caller. Once a DuckDB connection
    has been opened, the cache file is removed on every exit path.
    """
    print("\n[1/4] Downloading GHArchive data...")
    if not download_all_gharchive_data():
        print("Warning: Download had errors, continuing with available data...")
    print("\n[2/4] Loading assistant metadata...")
    assistants = load_agents_from_hf()
    if not assistants:
        print("Error: No assistants found")
        return
    identifiers = [assistant['github_identifier'] for assistant in assistants if assistant.get('github_identifier')]
    if not identifiers:
        print("Error: No valid assistant identifiers found")
        return
    print(f"\n[3/4] Mining community metadata ({len(identifiers)} assistants, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
    try:
        conn = get_duckdb_connection()
    except Exception as e:
        print(f"Failed to initialize DuckDB connection: {str(e)}")
        return
    # The window ends at today's midnight (UTC) and spans the configured number of days.
    current_time = datetime.now(timezone.utc)
    end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
    start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
    try:
        try:
            wiki_metadata, member_metadata = fetch_all_community_metadata_streaming(
                conn, identifiers, start_date, end_date
            )
        except Exception as e:
            print(f"Error during DuckDB fetch: {str(e)}")
            traceback.print_exc()
            return
        finally:
            conn.close()
        print("\n[4/4] Saving leaderboard...")
        try:
            leaderboard_dict = construct_leaderboard_from_metadata(wiki_metadata, member_metadata, assistants)
            monthly_metrics = calculate_monthly_metrics_by_agent(wiki_metadata, member_metadata, assistants)
            save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics)
        except Exception as e:
            print(f"Error saving leaderboard: {str(e)}")
            traceback.print_exc()
    finally:
        # Clean up DuckDB cache file to save storage. This outer finally also runs on the
        # early return after a failed fetch, so a stale cache never survives a run.
        if os.path.exists(DUCKDB_CACHE_FILE):
            try:
                os.remove(DUCKDB_CACHE_FILE)
                print(f"   ✓ Cache file removed: {DUCKDB_CACHE_FILE}")
            except Exception as e:
                print(f"   ⚠ Failed to remove cache file: {str(e)}")
+# =============================================================================
+# ENTRY POINT
+# =============================================================================
# Run one full mining pass when the module is executed as a script (not on import).
if __name__ == "__main__":
    mine_all_agents()

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+APScheduler
+backoff
+duckdb[all]
+gradio
+gradio_leaderboard
+huggingface_hub
+pandas
+plotly
+python-dotenv
+requests