Spaces:

flzta
/

data

Paused

App Files Files Community

data / sync_data.sh

flzta

Update sync_data.sh

c61036b verified 9 months ago

raw

history blame contribute delete

13.7 kB

	#!/bin/bash

	# 检查 Hugging Face Token 和 Dataset ID 环境变量
	if [[ -z "$HF_TOKEN" ]] \|\| [[ -z "$DATASET_ID" ]]; then
	echo "Starting Cloudreve without backup/restore functionality - missing HF_TOKEN or DATASET_ID"
	# 直接启动 Cloudreve 作为主进程
	echo "Starting Cloudreve directly..."
	exec /opt/cloudreve/cloudreve -c /opt/cloudreve/config.ini
	exit 0 # exec 通常不会返回，但加上 exit 0 以防万一
	fi

	# 激活 Python 虚拟环境
	echo "Activating Python venv..."
	source /opt/venv/bin/activate

	# 定义 Cloudreve 主程序目录和备份文件前缀
	CLOUDREVE_DIR="/opt/cloudreve"
	BACKUP_PREFIX="cloudreve_backup"
	CONFIG_FILE_PATH="/opt/cloudreve/config.ini"
	DB_FILE_PATH="/opt/cloudreve/cloudreve.db"
	EXECUTABLE_PATH="/opt/cloudreve/cloudreve"

	# --- Python 函数定义 ---
	# (Python 函数 upload_backup 和 download_latest_backup 保持不变，这里省略以减少篇幅)
	# --- 请将你原始脚本中的 Python 函数 upload_backup 和 download_latest_backup 复制到这里 ---
	# Python 函数: 上传备份
	upload_backup() {
	file_path="$1"
	file_name="$2"
	token="$HF_TOKEN"
	repo_id="$DATASET_ID"

	echo "Preparing to upload backup file: $file_path as $file_name to Dataset: $repo_id"

	python3 -c "
	from huggingface_hub import HfApi
	import sys
	import os
	print(f'HF_TOKEN is set: {os.environ.get(\"HF_TOKEN\") is not None}')
	print(f'DATASET_ID is set: {os.environ.get(\"DATASET_ID\") is not None}')
	def manage_backups(api, repo_id_val, max_files=5):
	print('Managing old backups...')
	files = api.list_repo_files(repo_id=repo_id_val, repo_type='dataset')
	backup_files = [f for f in files if f.startswith('$BACKUP_PREFIX') and f.endswith('.tar.gz')]
	backup_files.sort()
	if len(backup_files) >= max_files:
	print(f'Found {len(backup_files)} backup files, maximum allowed is {max_files}.')
	files_to_delete = backup_files[:(len(backup_files) - max_files + 1)]
	for file_to_delete in files_to_delete:
	try:
	print(f'Deleting old backup: {file_to_delete}')
	api.delete_file(path_in_repo=file_to_delete, repo_id=repo_id_val, repo_type='dataset')
	print(f'Successfully deleted: {file_to_delete}')
	except Exception as e:
	print(f'Error deleting {file_to_delete}: {str(e)}')
	else:
	print('Number of backup files is within the limit.')
	api = HfApi(token='$token')
	try:
	repo_id_val = os.environ.get('DATASET_ID') # 从环境变量中获取 repo_id
	if not repo_id_val:
	raise ValueError('DATASET_ID environment variable is not set.')
	print(f'Uploading file: $file_path to {repo_id_val} as $file_name')
	api.upload_file(
	path_or_fileobj='$file_path',
	path_in_repo='$file_name',
	repo_id=repo_id_val,
	repo_type='dataset'
	)
	print(f'Successfully uploaded $file_name')
	manage_backups(api, repo_id_val)
	except Exception as e:
	print(f'Error uploading file: {str(e)}')
	sys.exit(1) # Exit if upload fails
	"
	}

	# Python 函数: 下载最新备份
	download_latest_backup() {
	token="$HF_TOKEN"
	repo_id="$DATASET_ID"

	echo "Preparing to download the latest backup from Dataset: $repo_id"

	python3 -c "
	from huggingface_hub import HfApi, hf_hub_download
	import sys
	import os
	import tarfile
	import tempfile
	import shutil
	import subprocess

	print(f'HF_TOKEN is set: {os.environ.get(\"HF_TOKEN\") is not None}')
	print(f'DATASET_ID is set: {os.environ.get(\"DATASET_ID\") is not None}')

	api = HfApi(token='$token')
	try:
	repo_id_val = os.environ.get('DATASET_ID') # 从环境变量中获取 repo_id
	if not repo_id_val:
	raise ValueError('DATASET_ID environment variable is not set.')

	print(f'Listing files in Dataset: {repo_id_val}')
	files = api.list_repo_files(repo_id=repo_id_val, repo_type='dataset')
	backup_files = [f for f in files if f.startswith('$BACKUP_PREFIX') and f.endswith('.tar.gz')]

	if not backup_files:
	print('No backup files found in the Dataset. Skipping restore.')
	sys.exit(0) # Exit successfully if no backups to restore

	latest_backup = sorted(backup_files)[-1]
	print(f'Latest backup file found: {latest_backup}')

	with tempfile.TemporaryDirectory() as temp_dir:
	print(f'Downloading {latest_backup} to temporary directory {temp_dir}...')
	try:
	filepath = hf_hub_download(
	repo_id=repo_id_val,
	filename=latest_backup,
	repo_type='dataset',
	local_dir=temp_dir,
	token=os.environ.get('HF_TOKEN') # Pass token explicitly if needed
	)
	except Exception as download_error:
	print(f'Error during hf_hub_download: {download_error}')
	# Attempt to list files again for debugging
	try:
	print('Attempting to list repo files again for debugging...')
	files_debug = api.list_repo_files(repo_id=repo_id_val, repo_type='dataset')
	print(f'Files found (debug): {files_debug}')
	except Exception as list_error:
	print(f'Error listing files during debug: {list_error}')
	sys.exit(1)


	if filepath and os.path.exists(filepath):
	print(f'Successfully downloaded backup to temporary directory: {filepath}')

	# Files/Dirs to restore (relative paths within CLOUDREVE_DIR)
	items_to_restore = ['cloudreve', 'cloudreve.db', 'config.ini']

	# Ensure target directory exists
	os.makedirs(\"$CLOUDREVE_DIR\", exist_ok=True)

	print('Listing contents before restore:')
	subprocess.run(['ls', '-lA', \"$CLOUDREVE_DIR\"], check=False) # Use -A to show hidden files

	# --- Safer Restore Logic ---
	# 1. Extract backup to a temporary location first
	extract_temp_dir = os.path.join(temp_dir, 'extracted_backup')
	os.makedirs(extract_temp_dir, exist_ok=True)
	print(f'Extracting backup archive: {filepath} to {extract_temp_dir}')
	try:
	with tarfile.open(filepath, 'r:gz') as tar:
	tar.extractall(extract_temp_dir)
	print('Extraction complete.')
	except tarfile.ReadError as tar_err:
	print(f'Error reading tar file: {tar_err}')
	sys.exit(1)
	except Exception as extract_err:
	print(f'Error during extraction: {extract_err}')
	sys.exit(1)


	# 2. Check if essential files exist in the extracted backup
	essential_files_present = True
	for item in items_to_restore:
	extracted_item_path = os.path.join(extract_temp_dir, item)
	if not os.path.exists(extracted_item_path):
	print(f'Error: Essential item "{item}" not found in extracted backup at {extracted_item_path}. Aborting restore.')
	essential_files_present = False
	break # Stop checking

	if not essential_files_present:
	sys.exit(1) # Abort if essential files are missing

	# 3. Delete existing items in the target directory
	print(f'Deleting existing items in $CLOUDREVE_DIR before restoring...')
	for item in items_to_restore:
	target_path = os.path.join(\"$CLOUDREVE_DIR\", item)
	if os.path.exists(target_path):
	try:
	if os.path.isdir(target_path) and not os.path.islink(target_path):
	print(f'Deleting directory: {target_path}')
	shutil.rmtree(target_path)
	else:
	print(f'Deleting file/link: {target_path}')
	os.remove(target_path)
	except OSError as e:
	print(f'Error deleting {target_path}: {e}. Continuing...')


	# 4. Move extracted items to the target directory
	print(f'Moving extracted items from {extract_temp_dir} to $CLOUDREVE_DIR...')
	for item in items_to_restore:
	source_path = os.path.join(extract_temp_dir, item)
	target_path = os.path.join(\"$CLOUDREVE_DIR\", item)
	try:
	print(f'Moving {source_path} to {target_path}')
	shutil.move(source_path, target_path)
	except Exception as move_err:
	print(f'Error moving {item}: {move_err}')
	# Decide if this is critical, maybe exit? For now, print and continue.


	print(f'Successfully restored backup from {latest_backup}')
	print('Listing contents after restore:')
	subprocess.run(['ls', '-lA', \"$CLOUDREVE_DIR\"], check=False) # Use -A
	else:
	print(f'Error: Downloaded file path "{filepath}" does not exist or download failed.')
	sys.exit(1) # Exit if download path invalid

	except ValueError as ve:
	print(f'Configuration Error: {ve}')
	sys.exit(1)
	except Exception as e:
	print(f'Error during backup download/restore: {str(e)}')
	# Print traceback for more details
	import traceback
	traceback.print_exc()
	sys.exit(1) # Exit on error
	"
	}


	# --- Sync Function ---
	sync_data() {
	echo "Background Sync Process Started"
	while true; do
	# Wait for initial Cloudreve setup potentially creating db/config if first run
	# Also wait if essential files are missing before attempting backup
	while [ ! -f "$CONFIG_FILE_PATH" ] \|\| [ ! -f "$DB_FILE_PATH" ] \|\| [ ! -f "$EXECUTABLE_PATH" ]; do
	echo "Waiting for essential Cloudreve files (config.ini, cloudreve.db, cloudreve) to exist before backup attempt..."
	sleep 15
	done

	echo "Starting sync cycle at $(date)"

	# Define backup path and name
	timestamp=$(date +%Y%m%d_%H%M%S)
	backup_file="${BACKUP_PREFIX}_${timestamp}.tar.gz"
	backup_path="/tmp/${backup_file}" # Use /tmp for temporary files

	echo "Compressing Cloudreve data (executable, db, config) to: $backup_path"
	# Use -C to change directory, ensuring archive paths are relative
	# Only include the executable, db, and config file
	tar -czf "$backup_path" -C "$CLOUDREVE_DIR" \
	$(basename "$EXECUTABLE_PATH") \
	$(basename "$DB_FILE_PATH") \
	$(basename "$CONFIG_FILE_PATH")

	# Check if compression was successful (file exists and is not empty)
	if [ -s "$backup_path" ]; then
	echo "Compression complete. File size: $(ls -lh "$backup_path" \| awk '{print $5}')"
	echo "Uploading backup to HuggingFace..."
	upload_backup "$backup_path" "${backup_file}"
	# Check exit status of upload_backup? The python script should exit non-zero on failure.
	if [ $? -ne 0 ]; then
	echo "Backup upload failed. Keeping local archive: $backup_path"
	else
	echo "Upload successful. Removing local archive."
	rm -f "$backup_path"
	fi
	else
	echo "Compression failed or created an empty file. Skipping upload."
	rm -f "$backup_path" # Remove potentially empty/corrupt file
	fi

	# Define sync interval (use environment variable or default to 3600 seconds = 1 hour)
	SYNC_INTERVAL=${SYNC_INTERVAL:-3600}
	echo "Next sync in ${SYNC_INTERVAL} seconds..."
	sleep $SYNC_INTERVAL
	done
	}

	# --- Main Execution ---

	# 1. Attempt to restore from the latest backup on startup
	echo "Attempting to restore latest backup from HuggingFace..."
	download_latest_backup
	# Check exit code? If restore fails critically, maybe don't start?
	# The python script now exits non-zero on critical errors.
	if [ $? -ne 0 ]; then
	echo "CRITICAL: Backup restoration failed. Exiting."
	exit 1
	fi
	echo "Backup restore process finished."


	# 2. Check if config file exists after potential restore. If not, Cloudreve needs to run once to create it.
	if [ ! -f "$CONFIG_FILE_PATH" ]; then
	echo "Config file ($CONFIG_FILE_PATH) not found. Running Cloudreve once to generate initial config."
	/opt/cloudreve/cloudreve -c "$CONFIG_FILE_PATH"
	# Cloudreve will print initial password and exit (or wait for setup if web setup enabled)
	# Need to check if it actually created the config...
	if [ ! -f "$CONFIG_FILE_PATH" ]; then
	echo "CRITICAL: Cloudreve failed to create initial config file. Exiting."
	exit 1
	else
	echo "Initial config file created. Please check logs for admin credentials if needed."
	# Consider stopping here or adding a pause? For automated deployment, continue.
	fi
	fi


	# 3. Start the background sync process
	echo "Starting background data sync..."
	sync_data & # Run sync_data function in the background
	sync_pid=$! # Get PID of background sync process

	# 4. Start Cloudreve in the foreground using exec
	# 'exec' replaces the current shell process with the Cloudreve process.
	# This makes Cloudreve the main process of the container.
	echo "Starting Cloudreve application as the main process..."
	exec /opt/cloudreve/cloudreve -c "$CONFIG_FILE_PATH"

	# If exec fails, the script continues here.
	exec_failed_code=$?
	echo "CRITICAL: Failed to execute Cloudreve. Exit code: $exec_failed_code"
	# Attempt to kill the background sync process if exec failed
	kill $sync_pid 2>/dev/null
	exit $exec_failed_code