Spaces:

mametarow
/

booth-pic-api

Running

booth-pic-api / backend /scraper /sync_to_r2.py

github-actions

Deploy to HF (clean history with LFS)

e666301 about 1 month ago

4.62 kB

	import os
	import json
	import logging
	import time
	from tqdm import tqdm
	from booth_scraper import BoothScraper
	from PIL import Image
	import io

	# Logging setup
	logging.basicConfig(
	level=logging.INFO,
	format='%(asctime)s - %(levelname)s - %(message)s',
	handlers=[
	logging.FileHandler("sync_r2_final.log", encoding='utf-8'),
	logging.StreamHandler()
	]
	)

	def sync_to_r2():
	scraper = BoothScraper()
	if not scraper.r2_enabled:
	logging.error("R2 is not enabled in .env. Please check your configuration.")
	return

	metadata_path = os.path.join("data", "metadata.jsonl")
	temp_metadata_path = os.path.join("data", "metadata_synced_final.jsonl")

	if not os.path.exists(metadata_path):
	logging.error(f"Metadata file not found: {metadata_path}")
	return

	# Count lines for progress bar
	total_items = 0
	with open(metadata_path, 'r', encoding='utf-8') as f:
	for _ in f: total_items += 1

	logging.info(f"Starting FINAL synchronization of {total_items} items to R2 with FORCED RESIZING...")

	with open(metadata_path, 'r', encoding='utf-8') as f_in, \
	open(temp_metadata_path, 'w', encoding='utf-8') as f_out:

	for line in tqdm(f_in, total=total_items, desc="Syncing items"):
	try:
	item = json.loads(line.strip())
	updated_images = []

	# Each item may have multiple images
	for img_val in item.get("images", []):
	# 1. Determine local path even if it's already an R2 URL in metadata
	local_path = None
	if img_val.startswith("http") and scraper.r2_public_url in img_val:
	# Reconstruct local path from R2 URL
	# URL: https://.../data/raw_images/123_0.jpg -> PATH: data/raw_images/123_0.jpg
	# We need to handle os.sep correctly
	relative_path = img_val.replace(f"{scraper.r2_public_url}/", "").replace("/", os.sep)
	local_path = relative_path
	elif not img_val.startswith("http"):
	local_path = img_val

	# 2. Process local file with compression
	if local_path and os.path.exists(local_path):
	try:
	with Image.open(local_path) as img:
	if img.mode != 'RGB':
	img = img.convert('RGB')
	# Resize to max 800x800
	img.set_format = 'JPEG'
	img.thumbnail((800, 800), Image.Resampling.LANCZOS)

	img_byte_arr = io.BytesIO()
	img.save(img_byte_arr, format='JPEG', quality=85)
	compressed_bytes = img_byte_arr.getvalue()

	r2_key = local_path.replace(os.sep, '/')
	scraper.s3_client.put_object(
	Bucket=scraper.r2_bucket,
	Key=r2_key,
	Body=compressed_bytes,
	ContentType='image/jpeg'
	)
	r2_url = f"{scraper.r2_public_url}/{r2_key}"
	updated_images.append(r2_url)
	# logging.info(f"OK: {local_path} ({len(compressed_bytes)/1024:.1f} KB)")
	except Exception as e:
	logging.error(f"Failed to process {local_path}: {e}")
	updated_images.append(img_val)
	else:
	# If image is just a BOOTH URL (should not happen with existing metadata items usually)
	# or if local file is missing, keep original
	updated_images.append(img_val)

	# Update the item and save
	item["images"] = updated_images
	f_out.write(json.dumps(item, ensure_ascii=False) + "\n")

	except Exception as e:
	logging.error(f"Error processing item: {e}")
	f_out.write(line)

	# Replace old metadata with new one
	os.replace(temp_metadata_path, metadata_path)
	logging.info("FINAL Synchronization complete. All images resized and uploaded.")

	if __name__ == "__main__":
	sync_to_r2()