cdborinstein committed on
Commit
f3ea89b
·
1 Parent(s): 9a7efe4

Add backup process via honcho Procfile

Browse files
Files changed (3) hide show
  1. Dockerfile +5 -2
  2. Procfile +3 -0
  3. backup_to_hub.py +29 -17
Dockerfile CHANGED
@@ -1,4 +1,7 @@
1
  FROM argilla/argilla-quickstart:latest
2
 
3
- # The quickstart image includes Elasticsearch + Argilla
4
- # Uses default entrypoint - no customization needed for basic setup
 
 
 
 
1
  FROM argilla/argilla-quickstart:latest
2
 
3
+ # Copy backup script to a location honcho can find
4
+ COPY backup_to_hub.py /home/argilla/backup_to_hub.py
5
+
6
+ # Copy custom Procfile that adds backup process
7
+ COPY Procfile /home/argilla/Procfile
Procfile ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ elastic: /usr/share/elasticsearch/bin/elasticsearch
2
+ argilla: sleep 15 && /home/argilla/start_argilla_server.sh
3
+ backup: sleep 60 && python /home/argilla/backup_to_hub.py
backup_to_hub.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  Automatic backup of Argilla annotations to HF Dataset.
3
- Runs as background process, syncs every 10 minutes.
4
  """
5
  import os
6
  import sys
@@ -9,8 +9,8 @@ import time
9
  from pathlib import Path
10
  from datetime import datetime
11
 
12
- # Configuration
13
- EXPORT_DIR = Path("/exports")
14
  BACKUP_REPO = os.environ.get("BACKUP_DATASET_REPO", "argus-systems/vuln-preferences-data")
15
  SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL_MINUTES", "10"))
16
  ARGILLA_API_URL = "http://localhost:6900"
@@ -18,14 +18,14 @@ ARGILLA_API_URL = "http://localhost:6900"
18
  # Check for HF_TOKEN before importing heavy dependencies
19
  hf_token = os.environ.get("HF_TOKEN")
20
  if not hf_token:
21
- print("ERROR: HF_TOKEN not set, backup script exiting")
22
  sys.exit(1)
23
 
24
  from huggingface_hub import CommitScheduler, login
25
  import argilla as rg
26
 
27
  # Setup
28
- EXPORT_DIR.mkdir(exist_ok=True)
29
  login(token=hf_token)
30
 
31
  # Setup CommitScheduler
@@ -38,25 +38,28 @@ scheduler = CommitScheduler(
38
 
39
 
40
  def wait_for_argilla():
41
- """Wait for Argilla server to be ready."""
42
  import urllib.request
43
- max_retries = 30
44
  for i in range(max_retries):
45
  try:
46
- urllib.request.urlopen(f"{ARGILLA_API_URL}/api/v1/status")
47
- print("Argilla server is ready")
48
  return True
49
- except Exception:
50
- print(f"Waiting for Argilla... ({i+1}/{max_retries})")
51
- time.sleep(2)
 
52
  return False
53
 
54
 
55
  def export_annotations():
56
  """Export all annotations from Argilla to JSON files."""
57
  try:
58
- api_key = os.environ.get("ARGILLA_API_KEY", "argilla.apikey")
 
59
  client = rg.Argilla(api_url=ARGILLA_API_URL, api_key=api_key)
 
60
 
61
  for dataset in client.datasets:
62
  records = list(dataset.records)
@@ -74,19 +77,28 @@ def export_annotations():
74
  "num_annotations": len(annotated),
75
  "records": annotated
76
  }, f, indent=2, default=str)
77
- print(f"Exported {len(annotated)} annotations from {dataset.name}")
 
 
78
  except Exception as e:
79
- print(f"Export error: {e}")
 
 
80
 
81
 
82
  def main():
83
- print("Starting Argilla backup service...")
 
 
 
84
 
85
  if not wait_for_argilla():
86
- print("Warning: Argilla not responding, will retry exports")
87
 
 
88
  while True:
89
  export_annotations()
 
90
  time.sleep(SYNC_INTERVAL * 60) # Convert to seconds
91
 
92
 
 
1
  """
2
  Automatic backup of Argilla annotations to HF Dataset.
3
+ Runs as a honcho-managed process, syncs every 10 minutes.
4
  """
5
  import os
6
  import sys
 
9
  from pathlib import Path
10
  from datetime import datetime
11
 
12
+ # Configuration - use writable directory under /home/argilla
13
+ EXPORT_DIR = Path("/home/argilla/exports")
14
  BACKUP_REPO = os.environ.get("BACKUP_DATASET_REPO", "argus-systems/vuln-preferences-data")
15
  SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL_MINUTES", "10"))
16
  ARGILLA_API_URL = "http://localhost:6900"
 
18
  # Check for HF_TOKEN before importing heavy dependencies
19
  hf_token = os.environ.get("HF_TOKEN")
20
  if not hf_token:
21
+ print("ERROR: HF_TOKEN not set, backup script exiting", flush=True)
22
  sys.exit(1)
23
 
24
  from huggingface_hub import CommitScheduler, login
25
  import argilla as rg
26
 
27
  # Setup
28
+ EXPORT_DIR.mkdir(parents=True, exist_ok=True)
29
  login(token=hf_token)
30
 
31
  # Setup CommitScheduler
 
38
 
39
 
40
  def wait_for_argilla():
41
+ """Wait for Argilla server to be ready with extended retries."""
42
  import urllib.request
43
+ max_retries = 60 # Extended retries since we're starting with Argilla
44
  for i in range(max_retries):
45
  try:
46
+ urllib.request.urlopen(f"{ARGILLA_API_URL}/api/v1/status", timeout=5)
47
+ print("Argilla server is ready", flush=True)
48
  return True
49
+ except Exception as e:
50
+ print(f"Waiting for Argilla... ({i+1}/{max_retries}) - {e}", flush=True)
51
+ time.sleep(5)
52
+ print("ERROR: Argilla server not responding after maximum retries", flush=True)
53
  return False
54
 
55
 
56
  def export_annotations():
57
  """Export all annotations from Argilla to JSON files."""
58
  try:
59
+ # Default API key matches what argilla-quickstart creates for the owner user
60
+ api_key = os.environ.get("ARGILLA_API_KEY", "owner.apikey")
61
  client = rg.Argilla(api_url=ARGILLA_API_URL, api_key=api_key)
62
+ print(f"Connected to Argilla, checking datasets...", flush=True)
63
 
64
  for dataset in client.datasets:
65
  records = list(dataset.records)
 
77
  "num_annotations": len(annotated),
78
  "records": annotated
79
  }, f, indent=2, default=str)
80
+ print(f"Exported {len(annotated)} annotations from {dataset.name}", flush=True)
81
+ else:
82
+ print(f"No new annotations in {dataset.name}", flush=True)
83
  except Exception as e:
84
+ print(f"Export error: {e}", flush=True)
85
+ import traceback
86
+ traceback.print_exc()
87
 
88
 
89
  def main():
90
+ print("Starting Argilla backup service...", flush=True)
91
+ print(f" Backup repo: {BACKUP_REPO}", flush=True)
92
+ print(f" Export dir: {EXPORT_DIR}", flush=True)
93
+ print(f" Sync interval: {SYNC_INTERVAL} minutes", flush=True)
94
 
95
  if not wait_for_argilla():
96
+ print("Warning: Argilla not responding, will retry exports anyway", flush=True)
97
 
98
+ print("Starting backup loop...", flush=True)
99
  while True:
100
  export_annotations()
101
+ print(f"Sleeping for {SYNC_INTERVAL} minutes until next export...", flush=True)
102
  time.sleep(SYNC_INTERVAL * 60) # Convert to seconds
103
 
104