Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
cdborinstein committed on
Commit ·
f3ea89b
1
Parent(s): 9a7efe4
Add backup process via honcho Procfile
Browse files- Dockerfile +5 -2
- Procfile +3 -0
- backup_to_hub.py +29 -17
Dockerfile
CHANGED
|
@@ -1,4 +1,7 @@
|
|
| 1 |
FROM argilla/argilla-quickstart:latest
|
| 2 |
|
| 3 |
-
#
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
FROM argilla/argilla-quickstart:latest
|
| 2 |
|
| 3 |
+
# Copy backup script to a location honcho can find
|
| 4 |
+
COPY backup_to_hub.py /home/argilla/backup_to_hub.py
|
| 5 |
+
|
| 6 |
+
# Copy custom Procfile that adds backup process
|
| 7 |
+
COPY Procfile /home/argilla/Procfile
|
Procfile
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
elastic: /usr/share/elasticsearch/bin/elasticsearch
|
| 2 |
+
argilla: sleep 15 && /home/argilla/start_argilla_server.sh
|
| 3 |
+
backup: sleep 60 && python /home/argilla/backup_to_hub.py
|
backup_to_hub.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
"""
|
| 2 |
Automatic backup of Argilla annotations to HF Dataset.
|
| 3 |
-
Runs as
|
| 4 |
"""
|
| 5 |
import os
|
| 6 |
import sys
|
|
@@ -9,8 +9,8 @@ import time
|
|
| 9 |
from pathlib import Path
|
| 10 |
from datetime import datetime
|
| 11 |
|
| 12 |
-
# Configuration
|
| 13 |
-
EXPORT_DIR = Path("/exports")
|
| 14 |
BACKUP_REPO = os.environ.get("BACKUP_DATASET_REPO", "argus-systems/vuln-preferences-data")
|
| 15 |
SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL_MINUTES", "10"))
|
| 16 |
ARGILLA_API_URL = "http://localhost:6900"
|
|
@@ -18,14 +18,14 @@ ARGILLA_API_URL = "http://localhost:6900"
|
|
| 18 |
# Check for HF_TOKEN before importing heavy dependencies
|
| 19 |
hf_token = os.environ.get("HF_TOKEN")
|
| 20 |
if not hf_token:
|
| 21 |
-
print("ERROR: HF_TOKEN not set, backup script exiting")
|
| 22 |
sys.exit(1)
|
| 23 |
|
| 24 |
from huggingface_hub import CommitScheduler, login
|
| 25 |
import argilla as rg
|
| 26 |
|
| 27 |
# Setup
|
| 28 |
-
EXPORT_DIR.mkdir(exist_ok=True)
|
| 29 |
login(token=hf_token)
|
| 30 |
|
| 31 |
# Setup CommitScheduler
|
|
@@ -38,25 +38,28 @@ scheduler = CommitScheduler(
|
|
| 38 |
|
| 39 |
|
| 40 |
def wait_for_argilla():
|
| 41 |
-
"""Wait for Argilla server to be ready."""
|
| 42 |
import urllib.request
|
| 43 |
-
max_retries =
|
| 44 |
for i in range(max_retries):
|
| 45 |
try:
|
| 46 |
-
urllib.request.urlopen(f"{ARGILLA_API_URL}/api/v1/status")
|
| 47 |
-
print("Argilla server is ready")
|
| 48 |
return True
|
| 49 |
-
except Exception:
|
| 50 |
-
print(f"Waiting for Argilla... ({i+1}/{max_retries})")
|
| 51 |
-
time.sleep(
|
|
|
|
| 52 |
return False
|
| 53 |
|
| 54 |
|
| 55 |
def export_annotations():
|
| 56 |
"""Export all annotations from Argilla to JSON files."""
|
| 57 |
try:
|
| 58 |
-
|
|
|
|
| 59 |
client = rg.Argilla(api_url=ARGILLA_API_URL, api_key=api_key)
|
|
|
|
| 60 |
|
| 61 |
for dataset in client.datasets:
|
| 62 |
records = list(dataset.records)
|
|
@@ -74,19 +77,28 @@ def export_annotations():
|
|
| 74 |
"num_annotations": len(annotated),
|
| 75 |
"records": annotated
|
| 76 |
}, f, indent=2, default=str)
|
| 77 |
-
print(f"Exported {len(annotated)} annotations from {dataset.name}")
|
|
|
|
|
|
|
| 78 |
except Exception as e:
|
| 79 |
-
print(f"Export error: {e}")
|
|
|
|
|
|
|
| 80 |
|
| 81 |
|
| 82 |
def main():
|
| 83 |
-
print("Starting Argilla backup service...")
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
if not wait_for_argilla():
|
| 86 |
-
print("Warning: Argilla not responding, will retry exports")
|
| 87 |
|
|
|
|
| 88 |
while True:
|
| 89 |
export_annotations()
|
|
|
|
| 90 |
time.sleep(SYNC_INTERVAL * 60) # Convert to seconds
|
| 91 |
|
| 92 |
|
|
|
|
| 1 |
"""
|
| 2 |
Automatic backup of Argilla annotations to HF Dataset.
|
| 3 |
+
Runs as a honcho-managed process; syncs every SYNC_INTERVAL_MINUTES minutes (default: 10).
|
| 4 |
"""
|
| 5 |
import os
|
| 6 |
import sys
|
|
|
|
| 9 |
from pathlib import Path
|
| 10 |
from datetime import datetime
|
| 11 |
|
| 12 |
+
# Configuration - use writable directory under /home/argilla
|
| 13 |
+
EXPORT_DIR = Path("/home/argilla/exports")
|
| 14 |
BACKUP_REPO = os.environ.get("BACKUP_DATASET_REPO", "argus-systems/vuln-preferences-data")
|
| 15 |
SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL_MINUTES", "10"))
|
| 16 |
ARGILLA_API_URL = "http://localhost:6900"
|
|
|
|
| 18 |
# Check for HF_TOKEN before importing heavy dependencies
|
| 19 |
hf_token = os.environ.get("HF_TOKEN")
|
| 20 |
if not hf_token:
|
| 21 |
+
print("ERROR: HF_TOKEN not set, backup script exiting", flush=True)
|
| 22 |
sys.exit(1)
|
| 23 |
|
| 24 |
from huggingface_hub import CommitScheduler, login
|
| 25 |
import argilla as rg
|
| 26 |
|
| 27 |
# Setup
|
| 28 |
+
EXPORT_DIR.mkdir(parents=True, exist_ok=True)
|
| 29 |
login(token=hf_token)
|
| 30 |
|
| 31 |
# Setup CommitScheduler
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
def wait_for_argilla():
    """Poll the Argilla status endpoint until it answers or retries run out.

    Sends a request to ``{ARGILLA_API_URL}/api/v1/status`` up to 60 times,
    with a 5-second timeout per request and a 5-second pause between failed
    attempts.

    Returns:
        bool: True as soon as one request succeeds, False if every attempt
        fails.
    """
    import urllib.request

    max_retries = 60  # Extended retries since we're starting with Argilla
    status_url = f"{ARGILLA_API_URL}/api/v1/status"
    for i in range(max_retries):
        try:
            # Use the response as a context manager so the underlying socket
            # is closed instead of being left for the garbage collector.
            with urllib.request.urlopen(status_url, timeout=5):
                pass
        except Exception as e:
            # Deliberately broad: any failure here just means "not ready yet".
            print(f"Waiting for Argilla... ({i+1}/{max_retries}) - {e}", flush=True)
            # Skip the pause after the final attempt; we are about to give up.
            if i + 1 < max_retries:
                time.sleep(5)
        else:
            print("Argilla server is ready", flush=True)
            return True
    print("ERROR: Argilla server not responding after maximum retries", flush=True)
    return False
|
| 54 |
|
| 55 |
|
| 56 |
def export_annotations():
|
| 57 |
"""Export all annotations from Argilla to JSON files."""
|
| 58 |
try:
|
| 59 |
+
# Default API key matches what argilla-quickstart creates for the owner user
|
| 60 |
+
api_key = os.environ.get("ARGILLA_API_KEY", "owner.apikey")
|
| 61 |
client = rg.Argilla(api_url=ARGILLA_API_URL, api_key=api_key)
|
| 62 |
+
print(f"Connected to Argilla, checking datasets...", flush=True)
|
| 63 |
|
| 64 |
for dataset in client.datasets:
|
| 65 |
records = list(dataset.records)
|
|
|
|
| 77 |
"num_annotations": len(annotated),
|
| 78 |
"records": annotated
|
| 79 |
}, f, indent=2, default=str)
|
| 80 |
+
print(f"Exported {len(annotated)} annotations from {dataset.name}", flush=True)
|
| 81 |
+
else:
|
| 82 |
+
print(f"No new annotations in {dataset.name}", flush=True)
|
| 83 |
except Exception as e:
|
| 84 |
+
print(f"Export error: {e}", flush=True)
|
| 85 |
+
import traceback
|
| 86 |
+
traceback.print_exc()
|
| 87 |
|
| 88 |
|
| 89 |
def main():
    """Run the backup service: print the config, wait for Argilla, then export forever.

    Prints the backup repo, export directory and sync interval, waits for the
    Argilla server via ``wait_for_argilla()`` (continuing with a warning if it
    never responds), and then calls ``export_annotations()`` in an endless
    loop, sleeping ``SYNC_INTERVAL`` minutes between runs. Never returns.
    """
    startup_banner = (
        "Starting Argilla backup service...",
        f" Backup repo: {BACKUP_REPO}",
        f" Export dir: {EXPORT_DIR}",
        f" Sync interval: {SYNC_INTERVAL} minutes",
    )
    for banner_line in startup_banner:
        print(banner_line, flush=True)

    server_ready = wait_for_argilla()
    if not server_ready:
        # Keep going: each export attempt handles its own connection errors.
        print("Warning: Argilla not responding, will retry exports anyway", flush=True)

    print("Starting backup loop...", flush=True)
    pause_seconds = SYNC_INTERVAL * 60  # Convert to seconds
    while True:
        export_annotations()
        print(f"Sleeping for {SYNC_INTERVAL} minutes until next export...", flush=True)
        time.sleep(pause_seconds)
|
| 103 |
|
| 104 |
|