assafvayner HF Staff Claude Sonnet 4.5 committed on
Commit
41d63c9
·
1 Parent(s): 643b3a1

Add HuggingFace webhook processor Gradio app

Browse files

- Create Gradio app with webhook endpoint at /webhooks/hub
- Filter webhooks by repo and repo.content scopes
- Store webhook messages in memory with thread-safe locking
- Batch and save to dataset every 10,000 messages as parquet files
- Display real-time status dashboard with message counts
- Show latest batch file saved
- Add collapsible view of first 10 webhook messages with JSON payloads
- Include query example script for analyzing saved data
- Add project dependencies and gitignore

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (5) hide show
  1. .gitignore +6 -0
  2. README.md +50 -0
  3. app.py +272 -0
  4. query_example.py +57 -0
  5. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ venv/
2
+ __pycache__/
3
+ *.pyc
4
+ .env
5
+ .DS_Store
6
+ /tmp/
README.md CHANGED
@@ -9,4 +9,54 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
9
  pinned: false
10
  ---
11
 
12
+ # HuggingFace Webhook Processor
13
+
14
+ A Gradio Space that receives and processes HuggingFace Hub webhooks, storing them for later analysis.
15
+
16
+ ## Features
17
+
18
+ - ✅ Receives HuggingFace Hub webhooks via `/webhooks/hub` endpoint
19
+ - ✅ Filters webhooks by scope (`repo` and `repo.content` only)
20
+ - ✅ Stores webhook payloads in memory
21
+ - ✅ Automatically batches and saves to dataset every 10,000 messages
22
+ - ✅ Saves as efficient parquet files for easy querying
23
+ - ✅ Real-time status dashboard
24
+
25
+ ## Webhook Setup
26
+
27
+ Configure your HuggingFace Hub webhooks to point to:
28
+
29
+ ```
30
+ https://[your-space-name].hf.space/webhooks/hub
31
+ ```
32
+
33
+ ## Dataset Output
34
+
35
+ Webhooks are saved to: `assafvayner/webhook-messages`
36
+
37
+ Each batch is saved as a separate parquet file with:
38
+ - Timestamp
39
+ - Event type
40
+ - Scope
41
+ - Full JSON payload
42
+
43
+ ## Environment Variables
44
+
45
+ Requires `HF_TOKEN` with write access to the dataset repository.
46
+
47
+ ## API Endpoints
48
+
49
+ - `POST /webhooks/hub` - Receive webhooks
50
+ - `GET /webhooks/health` - Health check and stats
51
+
52
+ ## Local Development
53
+
54
+ ```bash
55
+ python -m venv venv
56
+ source venv/bin/activate # or `venv\Scripts\activate` on Windows
57
+ pip install -r requirements.txt
58
+ export HF_TOKEN=your_token_here
59
+ python app.py
60
+ ```
61
+
62
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from fastapi import Request, HTTPException
3
+ from fastapi.responses import JSONResponse
4
+ import os
5
+ import json
6
+ from datetime import datetime
7
+ from threading import Lock
8
+ from datasets import Dataset
9
+ from huggingface_hub import HfApi
10
+ import pandas as pd
11
+
12
# --- Configuration -----------------------------------------------------------
# Destination dataset repo that receives the batched webhook payloads.
DATASET_REPO = "assafvayner/webhook-messages"
# Messages accumulated in memory before a batch is flushed to the dataset.
BATCH_SIZE = 10_000
# Only webhooks whose `event.scope` is in this set are stored.
ALLOWED_SCOPES = {"repo", "repo.content"}

# --- In-memory state ---------------------------------------------------------
webhook_messages = []     # pending messages; guarded by message_lock
message_lock = Lock()     # protects webhook_messages and the counters below
batch_counter = 0         # number of batches flushed so far
latest_batch_file = None  # repo path of the most recently saved parquet file

# Hub client; HF_TOKEN must grant write access to DATASET_REPO.
hf_api = HfApi(token=os.environ.get("HF_TOKEN"))
25
+
26
+
27
def save_batch_to_dataset(messages, batch_num):
    """Save a batch of webhook messages to the HuggingFace dataset as a parquet file.

    Args:
        messages: list of message dicts (timestamp / event_type / scope / payload).
        batch_num: monotonically increasing batch index, embedded in the filename.

    Returns:
        True if the batch was uploaded, False on any failure (error is printed).
    """
    global latest_batch_file
    local_path = None
    try:
        # Create DataFrame from messages
        df = pd.DataFrame(messages)

        # Unique, sortable filename: zero-padded batch number + UTC timestamp.
        timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
        filename = f"batch_{batch_num:06d}_{timestamp}.parquet"
        local_path = f"/tmp/{filename}"
        repo_path = f"data/{filename}"

        # Write parquet locally, then push it into the dataset repo.
        # (Fix: the original wrote to a garbled constant path instead of
        # using the computed `filename`, so every batch collided.)
        dataset = Dataset.from_pandas(df)
        dataset.to_parquet(local_path)

        hf_api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=repo_path,
            repo_id=DATASET_REPO,
            repo_type="dataset",
        )

        print(f"✅ Saved batch {batch_num} with {len(messages)} messages to {DATASET_REPO}")

        # Update latest batch file info shown on the dashboard.
        latest_batch_file = repo_path
        return True
    except Exception as e:
        print(f"❌ Error saving batch {batch_num}: {str(e)}")
        return False
    finally:
        # Best-effort cleanup of the temp file, even when the upload failed.
        if local_path is not None and os.path.exists(local_path):
            os.remove(local_path)
63
+
64
+
65
def process_webhook(payload: dict, event_type: str):
    """Store a webhook payload if its scope is allowed.

    Appends the message to the in-memory buffer and, once BATCH_SIZE messages
    have accumulated, flushes the batch to the dataset repo.

    Args:
        payload: parsed webhook JSON body.
        event_type: value of the X-Event-Type request header.

    Returns:
        True if the message was stored, False if it was filtered out by scope.
    """
    global batch_counter

    # Filter by the scope embedded in the payload.
    scope = payload.get("event", {}).get("scope")
    if scope not in ALLOWED_SCOPES:
        return False

    message = {
        "timestamp": datetime.utcnow().isoformat(),
        "event_type": event_type,
        "scope": scope,
        "payload": json.dumps(payload)  # Store full payload as JSON string
    }

    # Snapshot the batch under the lock, but do NOT upload while holding it:
    # the original called save_batch_to_dataset() inside the lock (despite a
    # "non-blocking" comment), stalling every concurrent webhook request for
    # the duration of the network upload.
    messages_to_save = None
    batch_num = None
    with message_lock:
        webhook_messages.append(message)
        if len(webhook_messages) >= BATCH_SIZE:
            batch_counter += 1
            batch_num = batch_counter
            messages_to_save = webhook_messages.copy()
            webhook_messages.clear()

    if messages_to_save is not None:
        save_batch_to_dataset(messages_to_save, batch_num)

    return True
98
+
99
+
100
# Create Gradio interface: a status dashboard plus a peek at recent messages.
with gr.Blocks(title="HuggingFace Webhook Processor") as demo:
    gr.Markdown("""
    # 🌖 HuggingFace Webhook Processor

    This app receives HuggingFace Hub webhooks and stores them for analysis.

    ## Webhook Endpoint

    Send POST requests to: `/webhooks/hub`

    ## Configuration
    - **Filtered Scopes**: `repo`, `repo.content`
    - **Batch Size**: 10,000 messages
    - **Dataset**: `assafvayner/webhook-messages`

    ## Status
    """)

    with gr.Row():
        with gr.Column():
            status_text = gr.Textbox(
                label="Current Status",
                value="Waiting for webhooks...",
                interactive=False
            )

            message_count = gr.Number(
                label="Messages in Memory",
                value=0,
                interactive=False
            )

        with gr.Column():
            batch_count = gr.Number(
                label="Batches Saved",
                value=0,
                interactive=False
            )

            latest_batch = gr.Textbox(
                label="Latest Batch File",
                value="No batches saved yet",
                interactive=False
            )

    def get_status():
        """Snapshot the in-memory counters for the dashboard widgets."""
        with message_lock:
            batch_file = latest_batch_file if latest_batch_file else "No batches saved yet"
            return (
                f"Active - Ready to receive webhooks",
                len(webhook_messages),
                batch_counter,
                batch_file
            )

    def get_recent_messages():
        """Render up to the first 10 buffered messages as markdown."""
        with message_lock:
            if not webhook_messages:
                return "No messages in memory yet"

            # Get first 10 messages (or fewer if less than 10)
            messages_to_show = webhook_messages[:10]

            # Format messages nicely
            output = []
            for i, msg in enumerate(messages_to_show, 1):
                output.append(f"### Message {i}")
                output.append(f"**Timestamp:** {msg['timestamp']}")
                output.append(f"**Event Type:** {msg['event_type']}")
                output.append(f"**Scope:** {msg['scope']}")
                output.append(f"**Payload:**")

                # Pretty-print the stored JSON; fall back to raw text if it
                # isn't valid JSON. (Fix: was a bare `except:`, which also
                # swallows KeyboardInterrupt/SystemExit.)
                try:
                    payload = json.loads(msg['payload'])
                    output.append(f"```json\n{json.dumps(payload, indent=2)}\n```")
                except (json.JSONDecodeError, TypeError):
                    output.append(f"```\n{msg['payload']}\n```")

                output.append("\n---\n")

            return "\n".join(output)

    refresh_btn = gr.Button("🔄 Refresh Status")
    refresh_btn.click(
        fn=get_status,
        outputs=[status_text, message_count, batch_count, latest_batch]
    )

    with gr.Accordion("📋 Recent Messages (First 10)", open=False):
        recent_messages = gr.Markdown(
            value="Click 'Refresh Messages' to load recent messages"
        )

        refresh_messages_btn = gr.Button("🔄 Refresh Messages")
        refresh_messages_btn.click(
            fn=get_recent_messages,
            outputs=[recent_messages]
        )

    # Auto-refresh every 5 seconds
    demo.load(get_status, outputs=[status_text, message_count, batch_count, latest_batch], every=5)
203
+
204
+
205
# Webhook receiver mounted on the app's underlying FastAPI instance.
@demo.fastapi_app.post("/webhooks/hub")
async def webhook_endpoint(request: Request):
    """
    Webhook endpoint for HuggingFace Hub events.

    Supports all webhook events documented at:
    https://huggingface.co/docs/hub/webhooks
    """
    try:
        # Event type travels in a header; body is the JSON payload.
        event_type = request.headers.get("X-Event-Type", "unknown")
        payload = await request.json()

        accepted = process_webhook(payload, event_type)
        scope = payload.get("event", {}).get("scope")

        # Both outcomes answer 200 so the Hub does not retry filtered events.
        if accepted:
            body = {
                "status": "success",
                "message": "Webhook received and queued",
                "scope": scope
            }
        else:
            body = {
                "status": "ignored",
                "message": "Webhook scope not in allowed list",
                "scope": scope
            }
        return JSONResponse(content=body, status_code=200)

    except Exception as e:
        print(f"Error processing webhook: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
246
+
247
+
248
@demo.fastapi_app.get("/webhooks/health")
async def health_check():
    """Health check endpoint."""
    # Read the shared counters under the lock, then respond.
    with message_lock:
        stats = {
            "status": "healthy",
            "messages_in_memory": len(webhook_messages),
            "batches_saved": batch_counter,
            "allowed_scopes": list(ALLOWED_SCOPES)
        }
    return stats
258
+
259
+
260
if __name__ == "__main__":
    # Make sure the destination dataset repo exists before serving traffic;
    # failure is logged as a warning rather than aborting startup.
    try:
        hf_api.create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)
    except Exception as e:
        print(f"⚠️ Warning: Could not create/verify dataset repo: {str(e)}")
    else:
        print(f"✅ Dataset repository ready: {DATASET_REPO}")

    demo.launch(server_name="0.0.0.0", server_port=7860)
query_example.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Example script to query webhook messages from the dataset.
3
+
4
+ This demonstrates how to load and analyze the batched parquet files.
5
+ """
6
+ from datasets import load_dataset
7
+ import json
8
+ import pandas as pd
9
+
10
+ # Load the dataset
11
+ print("Loading webhook messages dataset...")
12
+ dataset = load_dataset("assafvayner/webhook-messages", split="train")
13
+
14
+ print(f"Total messages: {len(dataset)}")
15
+ print(f"\nFirst message:")
16
+ print("-" * 50)
17
+
18
+ # Convert to pandas for easier querying
19
+ df = dataset.to_pandas()
20
+
21
+ # Display first message
22
+ first_msg = df.iloc[0]
23
+ print(f"Timestamp: {first_msg['timestamp']}")
24
+ print(f"Event Type: {first_msg['event_type']}")
25
+ print(f"Scope: {first_msg['scope']}")
26
+ print(f"\nPayload:")
27
+ payload = json.loads(first_msg['payload'])
28
+ print(json.dumps(payload, indent=2))
29
+
30
+ print("\n" + "=" * 50)
31
+ print("Summary Statistics:")
32
+ print("=" * 50)
33
+
34
+ # Event type distribution
35
+ print("\nEvent Types:")
36
+ print(df['event_type'].value_counts())
37
+
38
+ print("\nScope Distribution:")
39
+ print(df['scope'].value_counts())
40
+
41
+ # Time range
42
+ print(f"\nTime Range:")
43
+ print(f" First message: {df['timestamp'].min()}")
44
+ print(f" Last message: {df['timestamp'].max()}")
45
+
46
+ # Example: Filter for specific event type
47
+ print("\n" + "=" * 50)
48
+ print("Example Query: Find all 'repo' scope events")
49
+ print("=" * 50)
50
+ repo_events = df[df['scope'] == 'repo']
51
+ print(f"Found {len(repo_events)} events")
52
+
53
+ # Show sample payloads
54
+ if len(repo_events) > 0:
55
+ print("\nSample payload:")
56
+ sample_payload = json.loads(repo_events.iloc[0]['payload'])
57
+ print(json.dumps(sample_payload, indent=2)[:500] + "...")
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio==6.5.1
2
+ huggingface-hub
3
+ datasets
4
+ pandas
5
+ pyarrow