Spaces:
Sleeping
Sleeping
cdborinstein committed on
Commit ·
8971251
1
Parent(s): ab773e9
Custom Argilla with auto-backup to HF Dataset
Browse files- Dockerfile +16 -7
- README.md +3 -15
- backup_to_hub.py +88 -0
- start.sh +7 -0
Dockerfile
CHANGED
|
@@ -1,10 +1,19 @@
|
|
| 1 |
-
FROM argilla/argilla-
|
| 2 |
|
| 3 |
-
#
|
| 4 |
-
|
|
|
|
| 5 |
|
| 6 |
-
#
|
| 7 |
-
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM argilla/argilla-server:latest

# Install additional dependencies (root is needed to write to site-packages)
USER root
# --no-cache-dir keeps pip's download cache out of the image layer.
# NOTE(review): backup_to_hub.py also imports `argilla`, which is not installed
# here - confirm the base image already ships the Argilla client SDK.
RUN pip install --no-cache-dir huggingface_hub datasets

# Create exports directory; world-writable so the unprivileged runtime user
# can write export files into it
RUN mkdir -p /exports && chmod 777 /exports

# Copy backup script and the launcher that starts it next to the server
COPY backup_to_hub.py /app/backup_to_hub.py
COPY start.sh /app/start.sh
RUN chmod +x /app/start.sh

# Drop privileges again for runtime
USER argilla
WORKDIR /home/argilla

# Start both Argilla and backup script
CMD ["/app/start.sh"]
|
README.md
CHANGED
|
@@ -1,20 +1,8 @@
|
|
| 1 |
---
|
| 2 |
title: Vuln Detection Preferences
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
app_port: 6900
|
| 8 |
-
fullWidth: true
|
| 9 |
-
hf_oauth: true
|
| 10 |
-
tags:
|
| 11 |
-
- argilla
|
| 12 |
-
pinned: true
|
| 13 |
-
license: mit
|
| 14 |
---
|
| 15 |
-
Argilla is a free and open source tool to build and iterate on datasets for AI. It can be deployed on the Hub with a few clicks and Hugging Face OAuth enabled, perfect for running community annotation initiatives!
|
| 16 |
-
|
| 17 |
-
This is the Argilla Space for:
|
| 18 |
-
|
| 19 |
-
- Creating your own Argilla Spaces, check the [quickstart guide](http://docs.argilla.io/latest/getting_started/quickstart/) and the [Hugging Face Spaces configuration](http://docs.argilla.io/latest/getting_started/how-to-configure-argilla-on-huggingface/) for more details.
|
| 20 |
-
- Discovering the Argilla UI, sign in with your Hugging Face account!
|
|
|
|
| 1 |
---
|
| 2 |
title: Vuln Detection Preferences
|
| 3 |
+
emoji: 🔒
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
app_port: 6900
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backup_to_hub.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Automatic backup of Argilla annotations to HF Dataset.
|
| 3 |
+
Runs as background process, syncs every 10 minutes.
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
import time
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from huggingface_hub import CommitScheduler, login
|
| 11 |
+
import argilla as rg
|
| 12 |
+
|
| 13 |
+
# Configuration: where exports are staged, where they are pushed, how often.
EXPORT_DIR = Path("/exports")
EXPORT_DIR.mkdir(exist_ok=True)
BACKUP_REPO = os.getenv("BACKUP_DATASET_REPO", "argus-systems/vuln-preferences-data")
# Interval in minutes; also converted to seconds for the export loop's sleep.
SYNC_INTERVAL = int(os.getenv("SYNC_INTERVAL_MINUTES", "10"))
ARGILLA_API_URL = "http://localhost:6900"

# Authenticate against the Hub only when an HF_TOKEN secret is present.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)

# Background scheduler that periodically commits EXPORT_DIR to the dataset repo.
scheduler = CommitScheduler(
    folder_path=EXPORT_DIR,
    repo_id=BACKUP_REPO,
    repo_type="dataset",
    every=SYNC_INTERVAL,
)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def wait_for_argilla(max_retries=30, delay=2, url=None, timeout=5):
    """Poll the Argilla status endpoint until it answers.

    Args:
        max_retries: Maximum number of probes before giving up.
        delay: Seconds to sleep between two failed probes.
        url: Endpoint to probe. Defaults to
            ``{ARGILLA_API_URL}/api/v1/status``.
        timeout: Per-request timeout in seconds, so a connection that
            never answers cannot block the retry loop forever.

    Returns:
        True as soon as one probe succeeds, False if every probe failed.
    """
    import http.client
    import urllib.request

    if url is None:
        url = f"{ARGILLA_API_URL}/api/v1/status"

    for attempt in range(1, max_retries + 1):
        try:
            # Use the response as a context manager so the connection is
            # closed instead of being left for the garbage collector.
            with urllib.request.urlopen(url, timeout=timeout):
                pass
        except (OSError, http.client.HTTPException):
            # URLError/HTTPError/timeouts are all OSError subclasses.
            print(f"Waiting for Argilla... ({attempt}/{max_retries})")
            # No point sleeping once the last attempt has failed.
            if attempt < max_retries:
                time.sleep(delay)
        else:
            print("Argilla server is ready")
            return True
    return False
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _export_dataset(dataset):
    """Write one dataset's annotated records to a timestamped JSON file."""
    # Filter while iterating so un-annotated records are never accumulated.
    annotated = [record.to_dict() for record in dataset.records if record.responses]
    if not annotated:
        return

    # Take the timestamp once so the file name and the payload always agree.
    exported_at = datetime.now()
    stamp = exported_at.strftime("%Y%m%d_%H%M%S")
    export_file = EXPORT_DIR / f"{dataset.name}_{stamp}.json"
    payload = {
        "dataset": dataset.name,
        "exported_at": exported_at.isoformat(),
        "num_annotations": len(annotated),
        "records": annotated,
    }

    # Hold the scheduler lock while writing so a commit cannot pick up a
    # half-written file.
    with scheduler.lock, open(export_file, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot serialize natively.
        json.dump(payload, f, indent=2, default=str)
    print(f"Exported {len(annotated)} annotations from {dataset.name}")


def export_annotations():
    """Export all annotations from Argilla to JSON files.

    Connects to the local Argilla server and, for every dataset, writes the
    records that have responses to a new JSON file in ``EXPORT_DIR``.
    Errors are reported on stdout and never raised: a connection failure
    skips the whole run, while a failure in one dataset only skips that
    dataset so the remaining ones are still exported.
    """
    try:
        api_key = os.environ.get("ARGILLA_API_KEY", "argilla.apikey")
        client = rg.Argilla(api_url=ARGILLA_API_URL, api_key=api_key)
        datasets = list(client.datasets)
    except Exception as e:
        print(f"Export error: {e}")
        return

    for dataset in datasets:
        try:
            _export_dataset(dataset)
        except Exception as e:
            # Best-effort backup: keep going with the other datasets.
            print(f"Export error: {e}")
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def main():
    """Run the backup service: wait for the server, then export in a loop."""
    print("Starting Argilla backup service...")

    server_ready = wait_for_argilla()
    if not server_ready:
        # Not fatal: the export loop below keeps trying on every cycle.
        print("Warning: Argilla not responding, will retry exports")

    # SYNC_INTERVAL is expressed in minutes; sleep() wants seconds.
    pause_seconds = SYNC_INTERVAL * 60
    while True:
        export_annotations()
        time.sleep(pause_seconds)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
if __name__ == "__main__":
|
| 88 |
+
main()
|
start.sh
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash

# Start backup script in background
python /app/backup_to_hub.py &

# Start Argilla server (original command).
# `exec` replaces this shell with the server process so that container
# signals (e.g. SIGTERM on shutdown) reach the server directly instead of
# stopping at an intermediate bash process.
exec python -m argilla server start --host 0.0.0.0 --port 6900
|