wwforonce committed on
Commit
15d23da
·
1 Parent(s): aa83347

add dataset

Browse files
Files changed (3) hide show
  1. Dockerfile +6 -2
  2. start_with_sync.sh +97 -0
  3. sync_storage.py +188 -0
Dockerfile CHANGED
@@ -22,6 +22,10 @@ RUN chmod 666 /app/readeck/config.toml
22
  # can create and write files (like db.sqlite3) within this directory.
23
  RUN mkdir -p /app/readeck/data && chmod 777 /app/readeck/data
24
 
 
 
 
25
 
26
-
27
- CMD ["/bin/readeck","serve", "-config", "/app/readeck/config.toml"]
 
 
22
  # can create and write files (like db.sqlite3) within this directory.
23
  RUN mkdir -p /app/readeck/data && chmod 777 /app/readeck/data
24
 
25
+ # Copy sync scripts
26
+ COPY sync_storage.py /app/sync_storage.py
27
+ COPY start_with_sync.sh /start.sh
28
 
29
+ # Start with sync
30
+ ENTRYPOINT ["/start.sh"]
31
+ # CMD ["/bin/readeck","serve", "-config", "/app/readeck/config.toml"]
start_with_sync.sh ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# Container entrypoint: restore persisted state from a Hugging Face dataset
# repository, run Readeck, and periodically upload the data directory back.
#
# Environment:
#   HF_TOKEN         - Hugging Face access token (sync is skipped without it)
#   HF_STORAGE_REPO  - dataset repo used for persistence
#   SYNC_INTERVAL    - seconds between periodic uploads (default 300)

set -e

# Data lives in the app data dir; caches go to /tmp (writable in container).
export DATA_DIR="/app/readeck/data"
export HF_STORAGE_REPO="${HF_STORAGE_REPO:-nxdev-org/open-webui-storage}"
export SYNC_INTERVAL="${SYNC_INTERVAL:-300}"

# Point all HuggingFace and cache directories at /tmp
export HF_HOME="/tmp/hf_cache"
export HUGGINGFACE_HUB_CACHE="/tmp/hf_cache"
export TRANSFORMERS_CACHE="/tmp/hf_cache"
export SENTENCE_TRANSFORMERS_HOME="/tmp/hf_cache"

# NOTE(review): STATIC_DIR/UPLOAD_DIR look like Open WebUI variables carried
# over from another deployment; Readeck may ignore them — confirm.
export STATIC_DIR="/tmp/static"
export UPLOAD_DIR="/tmp/uploads"

echo "Starting Open WebUI with HF Dataset persistence..."
echo "Data directory: $DATA_DIR"
echo "HF Repository: $HF_STORAGE_REPO"
echo "HF Cache: $HF_HOME"

# Create all necessary directories up front.
mkdir -p "$DATA_DIR" "$HF_HOME" "$STATIC_DIR" "$UPLOAD_DIR"

# Verify the data directory is writable before starting anything.
if touch "$DATA_DIR/test" 2>/dev/null; then
    rm "$DATA_DIR/test"
    echo "Data directory is writable"
else
    echo "Warning: Data directory may not be writable"
fi

# Without HF_TOKEN, sync_storage.py degrades to no-ops internally.
if [ -z "$HF_TOKEN" ]; then
    echo "Warning: HF_TOKEN not set. Sync functionality will be limited."
else
    echo "HF_TOKEN is set, proceeding with sync..."
fi

# Restore any previously uploaded data before the server starts.
# (sync_storage.py traps its own errors, so a missing archive does not
# trip `set -e` here.)
echo "Syncing data from Hugging Face..."
python3 /app/sync_storage.py download

# On SIGTERM/SIGINT: upload the final data state, then stop children.
cleanup() {
    echo "Shutting down gracefully..."

    if [ -n "$HF_TOKEN" ]; then
        echo "Uploading final data state..."
        python3 /app/sync_storage.py upload
    fi

    # Best-effort kill; the PIDs may already be gone.
    kill "$SYNC_PID" 2>/dev/null || true
    kill "$WEBUI_PID" 2>/dev/null || true

    exit 0
}

# Set up signal handlers
trap cleanup SIGTERM SIGINT

# Upload the data directory every $SYNC_INTERVAL seconds. Without a token,
# idle instead so a background job still exists for cleanup() to kill.
background_sync() {
    if [ -n "$HF_TOKEN" ]; then
        while true; do
            sleep "$SYNC_INTERVAL"
            echo "Periodic sync to Hugging Face..."
            python3 /app/sync_storage.py upload
        done
    else
        echo "Skipping background sync - no HF_TOKEN"
        while true; do
            sleep 3600
        done
    fi
}

# Start background sync
background_sync &
SYNC_PID=$!

# Start Readeck in the background so this shell keeps handling signals.
echo "Starting Readeck..."
/bin/readeck serve -config /app/readeck/config.toml &
WEBUI_PID=$!

# Block until the server process exits.
wait "$WEBUI_PID"
sync_storage.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import shutil
4
+ import json
5
+ from pathlib import Path
6
+ from huggingface_hub import HfApi, create_repo
7
+ import tarfile
8
+ import tempfile
9
+
10
+ class HFStorageSync:
11
+ def __init__(self, repo_id, token=None, data_dir="/tmp/open-webui-data"):
12
+ self.repo_id = repo_id
13
+ self.data_dir = Path(data_dir)
14
+ self.token = token
15
+
16
+ # Initialize API with token directly
17
+ self.api = HfApi(token=token) if token else HfApi()
18
+
19
+ def ensure_repo_exists(self):
20
+ """Create repository if it doesn't exist"""
21
+ if not self.token:
22
+ print("No token provided, cannot create repository")
23
+ return False
24
+
25
+ try:
26
+ # Check if repo exists
27
+ repo_info = self.api.repo_info(repo_id=self.repo_id, repo_type="dataset")
28
+ print(f"Repository {self.repo_id} exists")
29
+ return True
30
+ except Exception as e:
31
+ print(f"Repository {self.repo_id} not found, attempting to create...")
32
+ try:
33
+ create_repo(
34
+ repo_id=self.repo_id,
35
+ repo_type="dataset",
36
+ token=self.token,
37
+ private=True, # Make it private by default
38
+ exist_ok=True
39
+ )
40
+ print(f"Created repository {self.repo_id}")
41
+
42
+ # Create initial README
43
+ readme_content = """# Open WebUI Storage
44
+
45
+ This dataset stores persistent data for Open WebUI deployment.
46
+
47
+ ## Contents
48
+
49
+ - `data.tar.gz`: Compressed archive containing all Open WebUI data including:
50
+ - User configurations
51
+ - Chat histories
52
+ - Uploaded files
53
+ - Database files
54
+
55
+ This repository is automatically managed by the Open WebUI sync system.
56
+ """
57
+
58
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as tmp:
59
+ tmp.write(readme_content)
60
+ tmp.flush()
61
+
62
+ self.api.upload_file(
63
+ path_or_fileobj=tmp.name,
64
+ path_in_repo="README.md",
65
+ repo_id=self.repo_id,
66
+ repo_type="dataset",
67
+ commit_message="Initial repository setup",
68
+ token=self.token
69
+ )
70
+
71
+ os.unlink(tmp.name)
72
+
73
+ return True
74
+ except Exception as create_error:
75
+ print(f"Failed to create repository: {create_error}")
76
+ return False
77
+
78
+ def download_data(self):
79
+ """Download and extract data from HF dataset repo"""
80
+ try:
81
+ print("Downloading data from Hugging Face...")
82
+
83
+ # Ensure data directory exists and is writable
84
+ self.data_dir.mkdir(parents=True, exist_ok=True)
85
+
86
+ # Test write permissions
87
+ test_file = self.data_dir / "test_write"
88
+ try:
89
+ test_file.touch()
90
+ test_file.unlink()
91
+ print(f"Data directory {self.data_dir} is writable")
92
+ except Exception as e:
93
+ print(f"Warning: Data directory may not be writable: {e}")
94
+ return
95
+
96
+ if not self.token:
97
+ print("No HF_TOKEN provided, skipping download")
98
+ return
99
+
100
+ # Ensure repository exists
101
+ if not self.ensure_repo_exists():
102
+ print("Could not access or create repository")
103
+ return
104
+
105
+ # Try to download the data archive
106
+ try:
107
+ file_path = self.api.hf_hub_download(
108
+ repo_id=self.repo_id,
109
+ filename="data.tar.gz",
110
+ repo_type="dataset",
111
+ token=self.token
112
+ )
113
+
114
+ with tarfile.open(file_path, 'r:gz') as tar:
115
+ tar.extractall(self.data_dir)
116
+
117
+ print(f"Data extracted to {self.data_dir}")
118
+
119
+ except Exception as e:
120
+ print(f"No existing data found (this is normal for first run): {e}")
121
+
122
+ except Exception as e:
123
+ print(f"Error during download: {e}")
124
+
125
+ def upload_data(self):
126
+ """Compress and upload data to HF dataset repo"""
127
+ try:
128
+ if not self.token:
129
+ print("No HF_TOKEN provided, skipping upload")
130
+ return
131
+
132
+ print("Uploading data to Hugging Face...")
133
+
134
+ if not self.data_dir.exists() or not any(self.data_dir.iterdir()):
135
+ print("No data to upload")
136
+ return
137
+
138
+ # Ensure repository exists
139
+ if not self.ensure_repo_exists():
140
+ print("Could not access or create repository")
141
+ return
142
+
143
+ # Create temporary archive
144
+ with tempfile.NamedTemporaryFile(suffix='.tar.gz', delete=False) as tmp:
145
+ with tarfile.open(tmp.name, 'w:gz') as tar:
146
+ for item in self.data_dir.iterdir():
147
+ if item.name not in ["test_write", ".gitkeep"]: # Skip test files
148
+ tar.add(item, arcname=item.name)
149
+
150
+ # Upload to HF
151
+ self.api.upload_file(
152
+ path_or_fileobj=tmp.name,
153
+ path_in_repo="data.tar.gz",
154
+ repo_id=self.repo_id,
155
+ repo_type="dataset",
156
+ commit_message="Update Open WebUI data",
157
+ token=self.token
158
+ )
159
+
160
+ # Clean up
161
+ os.unlink(tmp.name)
162
+
163
+ print("Data uploaded successfully")
164
+
165
+ except Exception as e:
166
+ print(f"Error uploading data: {e}")
167
+
def main():
    """CLI entry point: ``sync_storage.py download`` or ``sync_storage.py upload``.

    Configuration comes from the environment: HF_STORAGE_REPO, HF_TOKEN,
    and DATA_DIR. Any other (or missing) argument prints a usage line.
    """
    import sys

    repo_id = os.getenv("HF_STORAGE_REPO", "nxdev-org/open-webui-storage")
    token = os.getenv("HF_TOKEN")
    data_dir = os.getenv("DATA_DIR", "/tmp/open-webui-data")

    sync = HFStorageSync(repo_id, token, data_dir)

    command = sys.argv[1] if len(sys.argv) > 1 else None
    if command == "download":
        sync.download_data()
    elif command == "upload":
        sync.upload_data()
    else:
        # Single usage message for both "no argument" and "unknown argument"
        # (previously duplicated in two branches).
        print("Usage: sync_storage.py [download|upload]")


if __name__ == "__main__":
    main()