cdborinstein committed on
Commit
8971251
·
1 Parent(s): ab773e9

Custom Argilla with auto-backup to HF Dataset

Browse files
Files changed (4) hide show
  1. Dockerfile +16 -7
  2. README.md +3 -15
  3. backup_to_hub.py +88 -0
  4. start.sh +7 -0
Dockerfile CHANGED
@@ -1,10 +1,19 @@
1
- FROM argilla/argilla-hf-spaces:v2.8.0
2
 
3
- # Copy the auth config section
4
- COPY .oauth.yaml /home/argilla/
 
5
 
6
- # Comment this line to disable annotation progress sharing feature
7
- ENV ARGILLA_ENABLE_SHARE_YOUR_PROGRESS=1
8
 
9
- # Uncoment this line to remove the persistence storage warning
10
- #ENV ARGILLA_SHOW_HUGGINGFACE_SPACE_PERSISTENT_STORAGE_WARNING=false
 
 
 
 
 
 
 
 
 
1
+ FROM argilla/argilla-server:latest
2
 
3
+ # Install additional dependencies
4
+ USER root
5
+ RUN pip install huggingface_hub datasets
6
 
7
+ # Create exports directory
8
+ RUN mkdir -p /exports && chmod 777 /exports
9
 
10
+ # Copy backup script
11
+ COPY backup_to_hub.py /app/backup_to_hub.py
12
+ COPY start.sh /app/start.sh
13
+ RUN chmod +x /app/start.sh
14
+
15
+ USER argilla
16
+ WORKDIR /home/argilla
17
+
18
+ # Start both Argilla and backup script
19
+ CMD ["/app/start.sh"]
README.md CHANGED
@@ -1,20 +1,8 @@
1
  ---
2
  title: Vuln Detection Preferences
3
- emoji:
4
- colorFrom: pink
5
- colorTo: pink
6
  sdk: docker
7
  app_port: 6900
8
- fullWidth: true
9
- hf_oauth: true
10
- tags:
11
- - argilla
12
- pinned: true
13
- license: mit
14
  ---
15
- Argilla is a free and open source tool to build and iterate on datasets for AI. It can be deployed on the Hub with a few clicks and Hugging Face OAuth enabled, perfect for running community annotation initiatives!
16
-
17
- This is the Argilla Space for:
18
-
19
- - Creating your own Argilla Spaces, check the [quickstart guide](http://docs.argilla.io/latest/getting_started/quickstart/) and the [Hugging Face Spaces configuration](http://docs.argilla.io/latest/getting_started/how-to-configure-argilla-on-huggingface/) for more details.
20
- - Discovering the Argilla UI, sign in with your Hugging Face account!
 
1
  ---
2
  title: Vuln Detection Preferences
3
+ emoji: 🔒
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: docker
7
  app_port: 6900
 
 
 
 
 
 
8
  ---
 
 
 
 
 
 
backup_to_hub.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Automatic backup of Argilla annotations to HF Dataset.
3
+ Runs as background process, syncs every 10 minutes.
4
+ """
5
+ import os
6
+ import json
7
+ import time
8
+ from pathlib import Path
9
+ from datetime import datetime
10
+ from huggingface_hub import CommitScheduler, login
11
+ import argilla as rg
12
+
13
+ # Configuration
14
+ EXPORT_DIR = Path("/exports")
15
+ EXPORT_DIR.mkdir(exist_ok=True)
16
+ BACKUP_REPO = os.environ.get("BACKUP_DATASET_REPO", "argus-systems/vuln-preferences-data")
17
+ SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL_MINUTES", "10"))
18
+ ARGILLA_API_URL = "http://localhost:6900"
19
+
20
+ # Login to HF (uses HF_TOKEN secret)
21
+ hf_token = os.environ.get("HF_TOKEN")
22
+ if hf_token:
23
+ login(token=hf_token)
24
+
25
+ # Setup CommitScheduler
26
+ scheduler = CommitScheduler(
27
+ repo_id=BACKUP_REPO,
28
+ repo_type="dataset",
29
+ folder_path=EXPORT_DIR,
30
+ every=SYNC_INTERVAL,
31
+ )
32
+
33
+
34
def wait_for_argilla(max_retries=30, delay=2.0, url=None):
    """Block until the Argilla server's status endpoint answers.

    Polls ``GET /api/v1/status`` up to ``max_retries`` times, sleeping
    ``delay`` seconds between failed attempts. Backward compatible with the
    original zero-argument call (30 tries, 2 s apart, local server).

    Args:
        max_retries: Maximum number of polling attempts (was hard-coded 30).
        delay: Seconds to sleep after each failed attempt (was hard-coded 2).
        url: Status endpoint to poll; defaults to the local Argilla API.

    Returns:
        True as soon as the endpoint responds, False if every attempt fails.
    """
    import urllib.request

    if url is None:
        url = f"{ARGILLA_API_URL}/api/v1/status"

    for attempt in range(max_retries):
        try:
            # Use the response as a context manager: the original bare
            # urlopen() never closed the HTTP connection on success (leak).
            with urllib.request.urlopen(url):
                pass
            print("Argilla server is ready")
            return True
        except Exception:
            print(f"Waiting for Argilla... ({attempt + 1}/{max_retries})")
            time.sleep(delay)
    return False
47
+
48
+
49
def export_annotations():
    """Dump annotated records from every Argilla dataset to timestamped JSON files.

    Files land in EXPORT_DIR, where the CommitScheduler picks them up on its
    next sync. Errors are logged and swallowed so the backup loop keeps running.
    """
    try:
        client = rg.Argilla(
            api_url=ARGILLA_API_URL,
            api_key=os.environ.get("ARGILLA_API_KEY", "argilla.apikey"),
        )

        for ds in client.datasets:
            # Keep only records that carry at least one response (annotation).
            annotated = [rec.to_dict() for rec in list(ds.records) if rec.responses]
            if not annotated:
                continue

            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            out_path = EXPORT_DIR / f"{ds.name}_{stamp}.json"
            payload = {
                "dataset": ds.name,
                "exported_at": datetime.now().isoformat(),
                "num_annotations": len(annotated),
                "records": annotated,
            }
            # Hold the scheduler lock so a background sync never uploads a
            # half-written file.
            with scheduler.lock:
                with open(out_path, "w") as f:
                    json.dump(payload, f, indent=2, default=str)
            print(f"Exported {len(annotated)} annotations from {ds.name}")
    except Exception as e:
        print(f"Export error: {e}")
74
+
75
+
76
def main():
    """Entry point: wait for the Argilla server, then export annotations forever."""
    print("Starting Argilla backup service...")

    # Startup timeout is not fatal — the export loop below retries anyway.
    ready = wait_for_argilla()
    if not ready:
        print("Warning: Argilla not responding, will retry exports")

    interval_seconds = SYNC_INTERVAL * 60
    while True:
        export_annotations()
        time.sleep(interval_seconds)


if __name__ == "__main__":
    main()
start.sh ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Container entrypoint: run the backup daemon alongside the Argilla server.

# Start backup script in background
python /app/backup_to_hub.py &

# Start Argilla server (original command)
# `exec` replaces this shell with the server process, so the container's
# SIGTERM reaches the server directly on shutdown (a bash PID 1 does not
# forward signals to children, causing a forced SIGKILL after the grace period).
exec python -m argilla server start --host 0.0.0.0 --port 6900