assafvayner HF Staff Claude Sonnet 4.5 committed on
Commit
41d63c9
·
1 Parent(s): 643b3a1

Add HuggingFace webhook processor Gradio app

Browse files

- Create Gradio app with webhook endpoint at /webhooks/hub
- Filter webhooks by repo and repo.content scopes
- Store webhook messages in memory with thread-safe locking
- Batch and save to dataset every 10,000 messages as parquet files
- Display real-time status dashboard with message counts
- Show latest batch file saved
- Add collapsible view of first 10 webhook messages with JSON payloads
- Include query example script for analyzing saved data
- Add project dependencies and gitignore

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (5) hide show
  1. .gitignore +6 -0
  2. README.md +50 -0
  3. app.py +272 -0
  4. query_example.py +57 -0
  5. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ venv/
2
+ __pycache__/
3
+ *.pyc
4
+ .env
5
+ .DS_Store
6
+ /tmp/
README.md CHANGED
@@ -9,4 +9,54 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
9
  pinned: false
10
  ---
11
 
12
+ # HuggingFace Webhook Processor
13
+
14
+ A Gradio Space that receives and processes HuggingFace Hub webhooks, storing them for later analysis.
15
+
16
+ ## Features
17
+
18
+ - ✅ Receives HuggingFace Hub webhooks via `/webhooks/hub` endpoint
19
+ - ✅ Filters webhooks by scope (`repo` and `repo.content` only)
20
+ - ✅ Stores webhook payloads in memory
21
+ - ✅ Automatically batches and saves to dataset every 10,000 messages
22
+ - ✅ Saves as efficient parquet files for easy querying
23
+ - ✅ Real-time status dashboard
24
+
25
+ ## Webhook Setup
26
+
27
+ Configure your HuggingFace Hub webhooks to point to:
28
+
29
+ ```
30
+ https://[your-space-name].hf.space/webhooks/hub
31
+ ```
32
+
33
+ ## Dataset Output
34
+
35
+ Webhooks are saved to: `assafvayner/webhook-messages`
36
+
37
+ Each batch is saved as a separate parquet file with:
38
+ - Timestamp
39
+ - Event type
40
+ - Scope
41
+ - Full JSON payload
42
+
43
+ ## Environment Variables
44
+
45
+ Requires `HF_TOKEN` with write access to the dataset repository.
46
+
47
+ ## API Endpoints
48
+
49
+ - `POST /webhooks/hub` - Receive webhooks
50
+ - `GET /webhooks/health` - Health check and stats
51
+
52
+ ## Local Development
53
+
54
+ ```bash
55
+ python -m venv venv
56
+ source venv/bin/activate # or `venv\Scripts\activate` on Windows
57
+ pip install -r requirements.txt
58
+ export HF_TOKEN=your_token_here
59
+ python app.py
60
+ ```
61
+
62
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from fastapi import Request, HTTPException
3
+ from fastapi.responses import JSONResponse
4
+ import os
5
+ import json
6
+ from datetime import datetime
7
+ from threading import Lock
8
+ from datasets import Dataset
9
+ from huggingface_hub import HfApi
10
+ import pandas as pd
11
+
12
# --- Configuration -----------------------------------------------------------
# Destination dataset repo that receives the batched webhook payloads.
DATASET_REPO = "assafvayner/webhook-messages"
# Messages accumulated in memory before a batch is flushed to the dataset.
BATCH_SIZE = 10_000
# Only webhooks whose `event.scope` is in this set are stored.
ALLOWED_SCOPES = {"repo", "repo.content"}

# --- In-memory state ---------------------------------------------------------
webhook_messages = []     # pending messages; guarded by message_lock
message_lock = Lock()     # protects webhook_messages and the counters below
batch_counter = 0         # number of batches flushed so far
latest_batch_file = None  # repo path of the most recently saved parquet file

# Hub client; HF_TOKEN must grant write access to DATASET_REPO.
hf_api = HfApi(token=os.environ.get("HF_TOKEN"))
25
+
26
+
27
def save_batch_to_dataset(messages, batch_num):
    """Save a batch of webhook messages to the HuggingFace dataset as a parquet file.

    Args:
        messages: list of message dicts (timestamp / event_type / scope / payload).
        batch_num: monotonically increasing batch index, embedded in the filename.

    Returns:
        True if the batch was uploaded, False on any failure (error is printed).
    """
    global latest_batch_file
    local_path = None
    try:
        # Create DataFrame from messages
        df = pd.DataFrame(messages)

        # Unique, sortable filename: zero-padded batch number + UTC timestamp.
        timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
        filename = f"batch_{batch_num:06d}_{timestamp}.parquet"
        local_path = f"/tmp/{filename}"
        repo_path = f"data/{filename}"

        # Write parquet locally, then push it into the dataset repo.
        # (Fix: the original wrote to a garbled constant path instead of
        # using the computed `filename`, so every batch collided.)
        dataset = Dataset.from_pandas(df)
        dataset.to_parquet(local_path)

        hf_api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=repo_path,
            repo_id=DATASET_REPO,
            repo_type="dataset",
        )

        print(f"✅ Saved batch {batch_num} with {len(messages)} messages to {DATASET_REPO}")

        # Update latest batch file info shown on the dashboard.
        latest_batch_file = repo_path
        return True
    except Exception as e:
        print(f"❌ Error saving batch {batch_num}: {str(e)}")
        return False
    finally:
        # Best-effort cleanup of the temp file, even when the upload failed.
        if local_path is not None and os.path.exists(local_path):
            os.remove(local_path)
63
+
64
+
65
def process_webhook(payload: dict, event_type: str):
    """Store a webhook payload if its scope is allowed.

    Appends the message to the in-memory buffer and, once BATCH_SIZE messages
    have accumulated, flushes the batch to the dataset repo.

    Args:
        payload: parsed webhook JSON body.
        event_type: value of the X-Event-Type request header.

    Returns:
        True if the message was stored, False if it was filtered out by scope.
    """
    global batch_counter

    # Filter by the scope embedded in the payload.
    scope = payload.get("event", {}).get("scope")
    if scope not in ALLOWED_SCOPES:
        return False

    message = {
        "timestamp": datetime.utcnow().isoformat(),
        "event_type": event_type,
        "scope": scope,
        "payload": json.dumps(payload)  # Store full payload as JSON string
    }

    # Snapshot the batch under the lock, but do NOT upload while holding it:
    # the original called save_batch_to_dataset() inside the lock (despite a
    # "non-blocking" comment), stalling every concurrent webhook request for
    # the duration of the network upload.
    messages_to_save = None
    batch_num = None
    with message_lock:
        webhook_messages.append(message)
        if len(webhook_messages) >= BATCH_SIZE:
            batch_counter += 1
            batch_num = batch_counter
            messages_to_save = webhook_messages.copy()
            webhook_messages.clear()

    if messages_to_save is not None:
        save_batch_to_dataset(messages_to_save, batch_num)

    return True
98
+
99
+
100
# Create Gradio interface: a status dashboard plus a peek at recent messages.
with gr.Blocks(title="HuggingFace Webhook Processor") as demo:
    gr.Markdown("""
    # 🌖 HuggingFace Webhook Processor

    This app receives HuggingFace Hub webhooks and stores them for analysis.

    ## Webhook Endpoint

    Send POST requests to: `/webhooks/hub`

    ## Configuration
    - **Filtered Scopes**: `repo`, `repo.content`
    - **Batch Size**: 10,000 messages
    - **Dataset**: `assafvayner/webhook-messages`

    ## Status
    """)

    with gr.Row():
        with gr.Column():
            status_text = gr.Textbox(
                label="Current Status",
                value="Waiting for webhooks...",
                interactive=False
            )

            message_count = gr.Number(
                label="Messages in Memory",
                value=0,
                interactive=False
            )

        with gr.Column():
            batch_count = gr.Number(
                label="Batches Saved",
                value=0,
                interactive=False
            )

            latest_batch = gr.Textbox(
                label="Latest Batch File",
                value="No batches saved yet",
                interactive=False
            )

    def get_status():
        """Snapshot the in-memory counters for the dashboard widgets."""
        with message_lock:
            batch_file = latest_batch_file if latest_batch_file else "No batches saved yet"
            return (
                f"Active - Ready to receive webhooks",
                len(webhook_messages),
                batch_counter,
                batch_file
            )

    def get_recent_messages():
        """Render up to the first 10 buffered messages as markdown."""
        with message_lock:
            if not webhook_messages:
                return "No messages in memory yet"

            # Get first 10 messages (or fewer if less than 10)
            messages_to_show = webhook_messages[:10]

            # Format messages nicely
            output = []
            for i, msg in enumerate(messages_to_show, 1):
                output.append(f"### Message {i}")
                output.append(f"**Timestamp:** {msg['timestamp']}")
                output.append(f"**Event Type:** {msg['event_type']}")
                output.append(f"**Scope:** {msg['scope']}")
                output.append(f"**Payload:**")

                # Pretty-print the stored JSON; fall back to raw text if it
                # isn't valid JSON. (Fix: was a bare `except:`, which also
                # swallows KeyboardInterrupt/SystemExit.)
                try:
                    payload = json.loads(msg['payload'])
                    output.append(f"```json\n{json.dumps(payload, indent=2)}\n```")
                except (json.JSONDecodeError, TypeError):
                    output.append(f"```\n{msg['payload']}\n```")

                output.append("\n---\n")

            return "\n".join(output)

    refresh_btn = gr.Button("🔄 Refresh Status")
    refresh_btn.click(
        fn=get_status,
        outputs=[status_text, message_count, batch_count, latest_batch]
    )

    with gr.Accordion("📋 Recent Messages (First 10)", open=False):
        recent_messages = gr.Markdown(
            value="Click 'Refresh Messages' to load recent messages"
        )

        refresh_messages_btn = gr.Button("🔄 Refresh Messages")
        refresh_messages_btn.click(
            fn=get_recent_messages,
            outputs=[recent_messages]
        )

    # Auto-refresh every 5 seconds
    demo.load(get_status, outputs=[status_text, message_count, batch_count, latest_batch], every=5)
203
+
204
+
205
# Webhook receiver mounted on the app's underlying FastAPI instance.
@demo.fastapi_app.post("/webhooks/hub")
async def webhook_endpoint(request: Request):
    """
    Webhook endpoint for HuggingFace Hub events.

    Supports all webhook events documented at:
    https://huggingface.co/docs/hub/webhooks
    """
    try:
        # Event type travels in a header; body is the JSON payload.
        event_type = request.headers.get("X-Event-Type", "unknown")
        payload = await request.json()

        accepted = process_webhook(payload, event_type)
        scope = payload.get("event", {}).get("scope")

        # Both outcomes answer 200 so the Hub does not retry filtered events.
        if accepted:
            body = {
                "status": "success",
                "message": "Webhook received and queued",
                "scope": scope
            }
        else:
            body = {
                "status": "ignored",
                "message": "Webhook scope not in allowed list",
                "scope": scope
            }
        return JSONResponse(content=body, status_code=200)

    except Exception as e:
        print(f"Error processing webhook: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
246
+
247
+
248
@demo.fastapi_app.get("/webhooks/health")
async def health_check():
    """Health check endpoint."""
    # Read the shared counters under the lock, then respond.
    with message_lock:
        stats = {
            "status": "healthy",
            "messages_in_memory": len(webhook_messages),
            "batches_saved": batch_counter,
            "allowed_scopes": list(ALLOWED_SCOPES)
        }
    return stats
258
+
259
+
260
if __name__ == "__main__":
    # Make sure the destination dataset repo exists before serving traffic;
    # failure is logged as a warning rather than aborting startup.
    try:
        hf_api.create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)
    except Exception as e:
        print(f"⚠️ Warning: Could not create/verify dataset repo: {str(e)}")
    else:
        print(f"✅ Dataset repository ready: {DATASET_REPO}")

    demo.launch(server_name="0.0.0.0", server_port=7860)
query_example.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Example script to query webhook messages from the dataset.
3
+
4
+ This demonstrates how to load and analyze the batched parquet files.
5
+ """
6
+ from datasets import load_dataset
7
+ import json
8
+ import pandas as pd
9
+
10
+ # Load the dataset
11
+ print("Loading webhook messages dataset...")
12
+ dataset = load_dataset("assafvayner/webhook-messages", split="train")
13
+
14
+ print(f"Total messages: {len(dataset)}")
15
+ print(f"\nFirst message:")
16
+ print("-" * 50)
17
+
18
+ # Convert to pandas for easier querying
19
+ df = dataset.to_pandas()
20
+
21
+ # Display first message
22
+ first_msg = df.iloc[0]
23
+ print(f"Timestamp: {first_msg['timestamp']}")
24
+ print(f"Event Type: {first_msg['event_type']}")
25
+ print(f"Scope: {first_msg['scope']}")
26
+ print(f"\nPayload:")
27
+ payload = json.loads(first_msg['payload'])
28
+ print(json.dumps(payload, indent=2))
29
+
30
+ print("\n" + "=" * 50)
31
+ print("Summary Statistics:")
32
+ print("=" * 50)
33
+
34
+ # Event type distribution
35
+ print("\nEvent Types:")
36
+ print(df['event_type'].value_counts())
37
+
38
+ print("\nScope Distribution:")
39
+ print(df['scope'].value_counts())
40
+
41
+ # Time range
42
+ print(f"\nTime Range:")
43
+ print(f" First message: {df['timestamp'].min()}")
44
+ print(f" Last message: {df['timestamp'].max()}")
45
+
46
+ # Example: Filter for specific event type
47
+ print("\n" + "=" * 50)
48
+ print("Example Query: Find all 'repo' scope events")
49
+ print("=" * 50)
50
+ repo_events = df[df['scope'] == 'repo']
51
+ print(f"Found {len(repo_events)} events")
52
+
53
+ # Show sample payloads
54
+ if len(repo_events) > 0:
55
+ print("\nSample payload:")
56
+ sample_payload = json.loads(repo_events.iloc[0]['payload'])
57
+ print(json.dumps(sample_payload, indent=2)[:500] + "...")
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio==6.5.1
2
+ huggingface-hub
3
+ datasets
4
+ pandas
5
+ pyarrow