Spaces:

Penguindrum920
/

aniverse

Sleeping

App Files Files Community

aniverse / setup_data.py

Penguindrum920

Upload 57 files

59eb043 verified 13 days ago

raw

history blame contribute delete

6.51 kB

	"""Download datasets and embeddings from Google Drive on startup"""
	import os
	import shutil
	import subprocess
	import sys
	from pathlib import Path

	# ID of the shared Google Drive folder that is downloaded on startup
	# (the identifier part of the folder's share link, not the full URL).
	GDRIVE_FOLDER_ID = "1tvoY4Ks3elgRgC81uRsZRDhDcclmu5hO"

	def install_gdown():
	    """Return the ``gdown`` module, installing it with pip if it is missing.

	    Returns:
	        The imported ``gdown`` module object.

	    Raises:
	        subprocess.CalledProcessError: If the ``pip install`` command fails.
	        ImportError: If ``gdown`` still cannot be imported after installation.
	    """
	    try:
	        import gdown
	    except ImportError:
	        import importlib

	        print("Installing gdown...")
	        # Install into the interpreter that is running this script.
	        subprocess.check_call([sys.executable, "-m", "pip", "install", "gdown", "-q"])
	        # The package was created while this process was already running, so
	        # the import finders may hold stale directory listings; refresh them
	        # before retrying the import.
	        importlib.invalidate_caches()
	        import gdown
	    return gdown

	def download_folder_from_gdrive(folder_id: str, output_dir: str):
	    """Download an entire Google Drive folder into ``output_dir``.

	    Args:
	        folder_id: ID of the Google Drive folder; it is appended to
	            ``https://drive.google.com/drive/folders/`` to build the URL.
	        output_dir: Local directory to download into. It is created
	            (including parents) if it does not exist yet.

	    Returns:
	        True if the download finished and gdown reported success, False if
	        gdown raised an exception or reported a failure.
	    """
	    gdown = install_gdown()

	    output_path = Path(output_dir)
	    output_path.mkdir(parents=True, exist_ok=True)

	    print(f"Downloading from GDrive folder {folder_id} to {output_dir}...")

	    try:
	        url = f"https://drive.google.com/drive/folders/{folder_id}"
	        downloaded = gdown.download_folder(url, output=output_dir, quiet=False, remaining_ok=True)
	    except Exception as e:
	        print(f"Error downloading: {e}")
	        import traceback
	        traceback.print_exc()
	        return False

	    # gdown reports some failures (e.g. folder listing could not be
	    # retrieved, a file download ended unsuccessfully) by returning None
	    # instead of raising, so the result must be checked explicitly.
	    if downloaded is None:
	        print("Error downloading: gdown reported an unsuccessful download")
	        return False

	    print(f"Download complete: {output_dir}")
	    return True

	def organize_downloaded_files(download_dir: Path, backend_dir: Path):
	    """Move downloaded files from ``download_dir`` into ``backend_dir``.

	    Every direct child of ``download_dir`` is handled as follows:

	    * items with a ``.csv`` suffix are moved to ``backend_dir / "dataset"``;
	    * directories named ``chroma_db`` or ``manga_chroma_db`` replace the
	      directory of the same name in ``backend_dir``;
	    * any other directory is processed recursively;
	    * anything else is left where it is.

	    A failure on one item is printed and does not stop the remaining items
	    from being processed.

	    Args:
	        download_dir: Directory containing the downloaded items.
	        backend_dir: Directory that receives the organized data.

	    Returns:
	        False if ``download_dir`` does not exist or if any item (including
	        items in nested directories) could not be processed; True otherwise.
	    """
	    print(f"\n{'='*50}")
	    print("ORGANIZING DOWNLOADED FILES")
	    print(f"{'='*50}")
	    print(f"Source: {download_dir}")
	    print(f"Destination: {backend_dir}")

	    if not download_dir.exists():
	        print(f"ERROR: Download directory {download_dir} does not exist!")
	        return False

	    # Snapshot the listing first: items are moved away while we iterate.
	    items = list(download_dir.iterdir())
	    print(f"Found {len(items)} items in download folder:")

	    # Vector-store folders that are moved as a whole, replacing any old copy.
	    db_folder_names = ("chroma_db", "manga_chroma_db")
	    all_ok = True

	    for item in items:
	        print(f"\n Processing: {item.name} (is_dir={item.is_dir()})")

	        try:
	            # Move CSV files to dataset folder
	            if item.suffix.lower() == '.csv':
	                dataset_dir = backend_dir / "dataset"
	                dataset_dir.mkdir(parents=True, exist_ok=True)
	                dest = dataset_dir / item.name
	                print(f" Moving CSV to {dest}")
	                shutil.move(str(item), str(dest))
	                print(" ✓ Moved successfully")

	            # Move chroma_db / manga_chroma_db folders
	            elif item.name in db_folder_names and item.is_dir():
	                dest = backend_dir / item.name
	                if dest.exists():
	                    # Remove the old copy so the move replaces it instead of
	                    # nesting the new folder inside the existing one.
	                    print(f" Removing existing {dest}")
	                    shutil.rmtree(dest)
	                print(f" Moving {item.name} to {dest}")
	                shutil.move(str(item), str(dest))
	                print(" ✓ Moved successfully")

	            # Handle nested directories (GDrive sometimes creates nested folders)
	            elif item.is_dir():
	                print(f" Recursively processing subdirectory: {item.name}")
	                if not organize_downloaded_files(item, backend_dir):
	                    all_ok = False
	        except Exception as e:
	            # Best effort: report the failure, remember it, keep going.
	            print(f" ERROR moving {item.name}: {e}")
	            import traceback
	            traceback.print_exc()
	            all_ok = False

	    return all_ok

	def verify_data(backend_dir: Path):
	    """Report which data files exist under ``backend_dir``.

	    Prints the presence of ``dataset/anime.csv``, ``chroma_db`` and
	    ``manga_chroma_db``, plus the directory listing of each store that
	    exists.

	    Args:
	        backend_dir: Directory expected to contain the data.

	    Returns:
	        True when both ``dataset/anime.csv`` and ``chroma_db`` exist.
	        ``manga_chroma_db`` is reported but not required.
	    """
	    print(f"\n{'='*50}")
	    print("VERIFYING DATA FILES")
	    print(f"{'='*50}")

	    anime_csv = backend_dir / "dataset" / "anime.csv"
	    anime_store = backend_dir / "chroma_db"
	    manga_store = backend_dir / "manga_chroma_db"

	    # Probe each location once and reuse the answers below.
	    has_csv = anime_csv.exists()
	    has_anime_store = anime_store.exists()
	    has_manga_store = manga_store.exists()

	    print(f"Dataset (anime.csv): {has_csv} - {anime_csv}")
	    print(f"ChromaDB: {has_anime_store} - {anime_store}")
	    print(f"Manga ChromaDB: {has_manga_store} - {manga_store}")

	    if has_anime_store:
	        print(f" ChromaDB contents: {list(anime_store.iterdir())}")
	    if has_manga_store:
	        print(f" Manga ChromaDB contents: {list(manga_store.iterdir())}")

	    return has_csv and has_anime_store

	def setup_data():
	    """Make sure the required data files are present, downloading if needed.

	    The data lives next to this file. If ``dataset/anime.csv`` exists and
	    ``chroma_db`` is a non-empty directory, nothing is downloaded.
	    Otherwise the Google Drive folder ``GDRIVE_FOLDER_ID`` is downloaded
	    into a temporary ``data_download`` directory, its contents are moved
	    into place, the temporary directory is removed, and the result is
	    verified.

	    Returns:
	        True if the data was already present, or if it was downloaded and
	        the required files are in place afterwards. False if the download
	        failed or the required files are still missing after it.
	    """
	    backend_dir = Path(__file__).parent

	    print(f"\n{'='*50}")
	    print("ANIVERSE DATA SETUP")
	    print(f"{'='*50}")
	    print(f"Backend directory: {backend_dir}")

	    # Check if data already exists
	    dataset_dir = backend_dir / "dataset"
	    chroma_dir = backend_dir / "chroma_db"

	    dataset_exists = (dataset_dir / "anime.csv").exists()
	    # An empty chroma_db directory counts as missing.
	    chroma_exists = chroma_dir.exists() and any(chroma_dir.iterdir())

	    print(f"Dataset exists: {dataset_exists}")
	    print(f"ChromaDB exists: {chroma_exists}")

	    if dataset_exists and chroma_exists:
	        print("All data files present, skipping download.")
	        verify_data(backend_dir)
	        return True

	    # Download from GDrive
	    print(f"\n{'='*50}")
	    print("DOWNLOADING DATA FROM GOOGLE DRIVE")
	    print(f"{'='*50}")

	    download_dir = backend_dir / "data_download"
	    success = download_folder_from_gdrive(GDRIVE_FOLDER_ID, str(download_dir))

	    if not success:
	        print("\n" + "!"*50)
	        print("WARNING: Failed to download data from GDrive!")
	        print("The server may not function correctly without data files.")
	        print("!"*50)
	        return False

	    organize_downloaded_files(download_dir, backend_dir)

	    # Cleanup download folder
	    if download_dir.exists():
	        print(f"\nCleaning up download folder: {download_dir}")
	        try:
	            shutil.rmtree(download_dir)
	            print("✓ Cleanup complete")
	        except Exception as e:
	            # Leftover temporary files are not fatal for the application.
	            print(f"Warning: Could not cleanup: {e}")

	    # A finished download does not guarantee the files ended up where the
	    # application expects them, so the outcome is what verification says.
	    data_ok = verify_data(backend_dir)
	    if not data_ok:
	        print("\n" + "!"*50)
	        print("WARNING: Download finished but required data files are missing!")
	        print("The server may not function correctly without data files.")
	        print("!"*50)

	    return data_ok

	# Run the data setup when this file is executed directly as a script;
	# importing the module does not trigger any download.
	if __name__ == "__main__":
	    setup_data()