#!/usr/bin/env python3
"""
Comprehensive artifact manager for Dressify.

Handles packaging, downloading, and organizing all system artifacts.
"""

import os
import json
import shutil
import zipfile
import tarfile
from datetime import datetime
from typing import Dict, List, Any, Optional
from pathlib import Path


class ArtifactManager:
    """Manages all system artifacts for easy download and upload."""

    def __init__(self, base_dir: str = "/home/user/app"):
        self.base_dir = base_dir
        self.data_dir = os.path.join(base_dir, "data/Polyvore")
        self.splits_dir = os.path.join(self.data_dir, "splits")
        self.export_dir = os.getenv("EXPORT_DIR", "models/exports")

        # Default HF repositories for each artifact category
        self.default_repos = {
            "splits": "Stylique/Dressify-Helper",
            "models": "Stylique/dressify-models",
            "metadata": "Stylique/Dressify-Helper",
        }

        # Repository organization structure
        self.repo_structure = {
            "Stylique/dressify-models": {
                "description": "Dressify trained models and checkpoints",
                "files": {
                    "resnet_item_embedder_best.pth": "ResNet50 item embedder (best checkpoint)",
                    "vit_outfit_model_best.pth": "ViT outfit compatibility model (best checkpoint)",
                    "resnet_metrics.json": "ResNet training metrics and history",
                    "vit_metrics.json": "ViT training metrics and history",
                    "model_cards/": "Model documentation and cards",
                },
            },
            "Stylique/Dressify-Helper": {
                "description": "Dressify dataset splits, metadata, and helper files",
                "files": {
                    "splits/": "Dataset splits (train/valid/test)",
                    "metadata/": "Item metadata and outfit information",
                    "configs/": "Training configurations",
                    "packages/": "Pre-packaged downloads",
                },
            },
        }

    def get_artifact_summary(self) -> Dict[str, Any]:
        """Get a comprehensive summary of all available artifacts."""
        summary = {
            "timestamp": datetime.now().isoformat(),
            "datasets": self._get_dataset_info(),
            "splits": self._get_splits_info(),
            "models": self._get_models_info(),
            "configs": self._get_configs_info(),
            "metadata": self._get_metadata_info(),
            "hf_repos": self.repo_structure,
            "total_size_mb": 0,
        }

        # Calculate total size across all categories that report a size
        total_size = 0
        for category in summary.values():
            if isinstance(category, dict) and "size_mb" in category:
                total_size += category["size_mb"]
        summary["total_size_mb"] = round(total_size, 2)

        return summary

    def _get_dataset_info(self) -> Dict[str, Any]:
        """Get information about the Polyvore dataset."""
        info = {
            "status": "not_found",
            "size_mb": 0,
            "files": [],
            "images_count": 0,
        }

        if os.path.exists(self.data_dir):
            info["status"] = "available"

            # Count images
            images_dir = os.path.join(self.data_dir, "images")
            if os.path.exists(images_dir):
                try:
                    # Support all major image formats
                    from utils.image_utils import get_supported_extensions

                    supported_exts = tuple(ext.lower() for ext in get_supported_extensions())
                    image_files = [
                        f for f in os.listdir(images_dir)
                        if f.lower().endswith(supported_exts)
                    ]
                    info["images_count"] = len(image_files)
                except Exception:
                    pass

            # Calculate total dataset size on disk
            try:
                total_size = sum(
                    os.path.getsize(os.path.join(dirpath, filename))
                    for dirpath, dirnames, filenames in os.walk(self.data_dir)
                    for filename in filenames
                )
                info["size_mb"] = round(total_size / (1024 * 1024), 2)
            except OSError:
                pass

            # List key files
            key_files = [
                "images.zip",
                "polyvore_item_metadata.json",
                "polyvore_outfit_titles.json",
                "categories.csv",
            ]
            for file in key_files:
                file_path = os.path.join(self.data_dir, file)
                if os.path.exists(file_path):
                    info["files"].append({
                        "name": file,
                        "size_mb": round(os.path.getsize(file_path) / (1024 * 1024), 2),
                        "path": file_path,
                    })

        return info

    def _get_splits_info(self) -> Dict[str, Any]:
"""Get information about dataset splits.""" info = { "status": "not_found", "size_mb": 0, "files": [], "splits_available": [] } if os.path.exists(self.splits_dir): info["status"] = "available" split_files = [ "train.json", "valid.json", "test.json", "outfits_train.json", "outfits_valid.json", "outfits_test.json", "outfit_triplets_train.json", "outfit_triplets_valid.json", "outfit_triplets_test.json" ] total_size = 0 for file in split_files: file_path = os.path.join(self.splits_dir, file) if os.path.exists(file_path): size_mb = round(os.path.getsize(file_path) / (1024 * 1024), 2) total_size += size_mb info["files"].append({ "name": file, "size_mb": size_mb, "path": file_path }) info["splits_available"].append(file.replace(".json", "")) info["size_mb"] = round(total_size, 2) return info def _get_models_info(self) -> Dict[str, Any]: """Get information about trained models.""" info = { "status": "not_found", "size_mb": 0, "files": [], "models_available": [] } if os.path.exists(self.export_dir): info["status"] = "available" model_files = [ "resnet_item_embedder.pth", "resnet_item_embedder_best.pth", "vit_outfit_model.pth", "vit_outfit_model_best.pth", "resnet_metrics.json", "vit_metrics.json" ] total_size = 0 for file in model_files: file_path = os.path.join(self.export_dir, file) if os.path.exists(file_path): size_mb = round(os.path.getsize(file_path) / (1024 * 1024), 2) total_size += size_mb info["files"].append({ "name": file, "size_mb": size_mb, "path": file_path, "type": "checkpoint" if file.endswith(".pth") else "metrics" }) if file.endswith(".pth"): info["models_available"].append(file.replace(".pth", "")) info["size_mb"] = round(total_size, 2) return info def _get_configs_info(self) -> Dict[str, Any]: """Get information about configuration files.""" info = { "status": "not_found", "size_mb": 0, "files": [] } config_files = [ "resnet_config_custom.json", "vit_config_custom.json", "item.yaml", "outfit.yaml", "default.yaml" ] total_size = 0 for file in config_files: # Check export dir first, then configs dir file_path = os.path.join(self.export_dir, file) if not os.path.exists(file_path): file_path = os.path.join("configs", file) if os.path.exists(file_path): size_mb = round(os.path.getsize(file_path) / (1024 * 1024), 2) total_size += size_mb info["files"].append({ "name": file, "size_mb": size_mb, "path": file_path }) if info["files"]: info["status"] = "available" info["size_mb"] = round(total_size, 2) return info def _get_metadata_info(self) -> Dict[str, Any]: """Get information about metadata files.""" info = { "status": "not_found", "size_mb": 0, "files": [] } metadata_files = [ "polyvore_item_metadata.json", "polyvore_outfit_titles.json", "categories.csv" ] total_size = 0 for file in metadata_files: file_path = os.path.join(self.data_dir, file) if os.path.exists(file_path): size_mb = round(os.path.getsize(file_path) / (1024 * 1024), 2) total_size += size_mb info["files"].append({ "name": file, "size_mb": size_mb, "path": file_path }) if info["files"]: info["status"] = "available" info["size_mb"] = round(total_size, 2) return info def create_download_package(self, package_type: str = "complete") -> str: """Create a downloadable package of artifacts.""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if package_type == "complete": # Complete package with everything package_name = f"dressify_complete_{timestamp}" package_path = os.path.join(self.export_dir, f"{package_name}.tar.gz") with tarfile.open(package_path, "w:gz") as tar: # Add splits if os.path.exists(self.splits_dir): 
                    tar.add(self.splits_dir, arcname="splits")

                # Add models, metrics, and exported configs
                if os.path.exists(self.export_dir):
                    for file in os.listdir(self.export_dir):
                        if file.endswith((".pth", ".json", ".yaml")):
                            tar.add(os.path.join(self.export_dir, file), arcname=f"models/{file}")

                # Add metadata
                metadata_files = [
                    "polyvore_item_metadata.json",
                    "polyvore_outfit_titles.json",
                    "categories.csv",
                ]
                for file in metadata_files:
                    file_path = os.path.join(self.data_dir, file)
                    if os.path.exists(file_path):
                        tar.add(file_path, arcname=f"metadata/{file}")

                # Add configs
                configs_dir = "configs"
                if os.path.exists(configs_dir):
                    tar.add(configs_dir, arcname="configs")

        elif package_type == "splits_only":
            # Only splits (lightweight)
            package_name = f"dressify_splits_{timestamp}"
            package_path = os.path.join(self.export_dir, f"{package_name}.tar.gz")

            with tarfile.open(package_path, "w:gz") as tar:
                if os.path.exists(self.splits_dir):
                    tar.add(self.splits_dir, arcname="splits")

        elif package_type == "models_only":
            # Only trained models and their metrics
            package_name = f"dressify_models_{timestamp}"
            package_path = os.path.join(self.export_dir, f"{package_name}.tar.gz")

            with tarfile.open(package_path, "w:gz") as tar:
                if os.path.exists(self.export_dir):
                    for file in os.listdir(self.export_dir):
                        if file.endswith((".pth", ".json")):
                            tar.add(os.path.join(self.export_dir, file), arcname=f"models/{file}")

        else:
            raise ValueError(f"Unknown package type: {package_type}")

        return package_path

    def get_downloadable_files(self) -> List[Dict[str, Any]]:
        """Get a list of all downloadable files."""
        files = []

        # Add splits
        if os.path.exists(self.splits_dir):
            for file in os.listdir(self.splits_dir):
                if file.endswith(".json"):
                    file_path = os.path.join(self.splits_dir, file)
                    files.append({
                        "name": f"splits/{file}",
                        "size_mb": round(os.path.getsize(file_path) / (1024 * 1024), 2),
                        "path": file_path,
                        "category": "splits",
                        "description": f"Dataset split: {file.replace('.json', '')}",
                    })

        # Add models
        if os.path.exists(self.export_dir):
            for file in os.listdir(self.export_dir):
                if file.endswith((".pth", ".json")):
                    file_path = os.path.join(self.export_dir, file)
                    files.append({
                        "name": f"models/{file}",
                        "size_mb": round(os.path.getsize(file_path) / (1024 * 1024), 2),
                        "path": file_path,
                        "category": "models",
                        "description": "Trained model or metrics",
                    })

        # Add metadata
        metadata_files = [
            "polyvore_item_metadata.json",
            "polyvore_outfit_titles.json",
            "categories.csv",
        ]
        for file in metadata_files:
            file_path = os.path.join(self.data_dir, file)
            if os.path.exists(file_path):
                files.append({
                    "name": f"metadata/{file}",
                    "size_mb": round(os.path.getsize(file_path) / (1024 * 1024), 2),
                    "path": file_path,
                    "category": "metadata",
                    "description": "Dataset metadata",
                })

        return files

    def create_hf_upload_plan(self) -> Dict[str, Any]:
        """Create a plan for uploading artifacts to the HF Hub."""
        plan = {
            "Stylique/dressify-models": {
                "description": "Upload trained models and checkpoints",
                "files_to_upload": [],
                "estimated_size_mb": 0,
            },
            "Stylique/Dressify-Helper": {
                "description": "Upload dataset splits and metadata",
                "files_to_upload": [],
                "estimated_size_mb": 0,
            },
        }

        # Plan for the models repo
        if os.path.exists(self.export_dir):
            for file in os.listdir(self.export_dir):
                if file.endswith((".pth", ".json")):
                    file_path = os.path.join(self.export_dir, file)
                    size_mb = round(os.path.getsize(file_path) / (1024 * 1024), 2)
                    plan["Stylique/dressify-models"]["files_to_upload"].append({
                        "name": file,
                        "path": file_path,
                        "size_mb": size_mb,
                    })
                    plan["Stylique/dressify-models"]["estimated_size_mb"] += size_mb

        # Plan for the helper repo: splits
        if os.path.exists(self.splits_dir):
            for file in os.listdir(self.splits_dir):
                if file.endswith(".json"):
                    file_path = os.path.join(self.splits_dir, file)
                    size_mb = round(os.path.getsize(file_path) / (1024 * 1024), 2)
                    plan["Stylique/Dressify-Helper"]["files_to_upload"].append({
                        "name": f"splits/{file}",
                        "path": file_path,
                        "size_mb": size_mb,
                    })
                    plan["Stylique/Dressify-Helper"]["estimated_size_mb"] += size_mb

        # Plan for the helper repo: metadata files
        metadata_files = [
            "polyvore_item_metadata.json",
            "polyvore_outfit_titles.json",
            "categories.csv",
        ]
        for file in metadata_files:
            file_path = os.path.join(self.data_dir, file)
            if os.path.exists(file_path):
                size_mb = round(os.path.getsize(file_path) / (1024 * 1024), 2)
                plan["Stylique/Dressify-Helper"]["files_to_upload"].append({
                    "name": f"metadata/{file}",
                    "path": file_path,
                    "size_mb": size_mb,
                })
                plan["Stylique/Dressify-Helper"]["estimated_size_mb"] += size_mb

        return plan


def create_artifact_manager() -> ArtifactManager:
    """Create an artifact manager instance."""
    return ArtifactManager()
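

# Minimal usage sketch, assuming the process runs from the repo root so the default
# EXPORT_DIR ("models/exports") and the relative "configs" directory resolve. It only
# exercises the read-only helpers plus the lightweight splits-only package; actually
# pushing the upload plan would sit on top of huggingface_hub (e.g. HfApi.upload_file)
# and is intentionally left out here.
if __name__ == "__main__":
    manager = create_artifact_manager()

    # Per-category status overview
    summary = manager.get_artifact_summary()
    print(f"Total artifact size: {summary['total_size_mb']} MB")
    for name, category in summary.items():
        if isinstance(category, dict) and "status" in category:
            print(f"  {name}: {category['status']} ({category.get('size_mb', 0)} MB)")

    # Individual files that could be offered for download
    for entry in manager.get_downloadable_files():
        print(f"  {entry['category']}: {entry['name']} ({entry['size_mb']} MB)")

    # Preview what would be uploaded to each HF repo (no upload happens here)
    for repo_id, repo_plan in manager.create_hf_upload_plan().items():
        print(f"  {repo_id}: {len(repo_plan['files_to_upload'])} files, "
              f"~{repo_plan['estimated_size_mb']:.2f} MB")

    # Package only the splits (the smallest archive) if they are present
    if os.path.exists(manager.splits_dir):
        print("Created package:", manager.create_download_package("splits_only"))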