Spaces:
Sleeping
Sleeping
| """ | |
| Script to prepare images for HuggingFace deployment | |
| This will copy/organize images and write the updated paths to image_paths_deployment.pkl | |
| """ | |
| import pickle | |
| import shutil | |
| from pathlib import Path | |
| import os | |
| from tqdm import tqdm | |
def prepare_images_for_deployment(
    image_paths_file='image_paths.pkl',
    output_dir='images',
    max_images_per_category=None,
    create_examples=True
):
    """
    Prepare images for deployment by organizing them in a clean structure.

    Every existing image listed in ``image_paths_file`` is copied to
    ``<output_dir>/<category>/<filename>``, where the category is the name of
    the image's parent directory. The rewritten path list is pickled to
    ``image_paths_deployment.pkl`` in the current working directory, and
    deployment instructions plus a size estimate are printed.

    The rewritten list has exactly one entry per input entry, in the same
    order. Entries that were not copied (source file missing, or over the
    per-category limit) keep their original path so that positions do not
    shift.
    NOTE(review): presumably positions must match the vectors stored in
    faiss_index.bin, which the printed layout ships next to the pickle --
    confirm against the app that consumes both files.

    Args:
        image_paths_file: Path to your image_paths.pkl file
        output_dir: Directory where images will be copied (created,
            including missing parent directories, if needed)
        max_images_per_category: Limit images per category
            (None or 0 = all)
        create_examples: Whether to create example images folder

    Returns:
        Tuple ``(new_paths, categories)``: the rewritten list of path strings
        and a dict mapping each category name to the number of images kept
        for it.
    """
    print("Loading image paths...")
    # SECURITY: pickle.load can execute arbitrary code while unpickling.
    # Only point this at a pickle file you generated yourself.
    with open(image_paths_file, 'rb') as f:
        image_paths = pickle.load(f)
    print(f"Total images in index: {len(image_paths)}")

    # Create output directory (parents=True so nested targets such as
    # "deploy/images" work on a fresh checkout)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Track statistics
    copied_count = 0
    missing_count = 0
    skipped_count = 0
    new_paths = []
    categories = {}

    print("\nCopying images...")
    for img_path in tqdm(image_paths):
        img_path = Path(img_path)

        # Check if source image exists
        if not img_path.exists():
            print(f"Warning: Image not found: {img_path}")
            missing_count += 1
            new_paths.append(str(img_path))  # Keep original path
            continue

        # Get category and filename
        category = img_path.parent.name
        filename = img_path.name

        # Track categories
        categories.setdefault(category, 0)

        # Check if we should skip due to max_images_per_category
        if max_images_per_category and categories[category] >= max_images_per_category:
            # Keep a placeholder entry instead of dropping it: removing the
            # entry would shift every later position in the saved list.
            skipped_count += 1
            new_paths.append(str(img_path))
            continue
        categories[category] += 1

        # Create category folder
        category_path = output_path / category
        category_path.mkdir(exist_ok=True)

        # Copy image unless a file with that name is already in place
        dest_path = category_path / filename
        if not dest_path.exists():
            shutil.copy2(img_path, dest_path)

        # Store relative path
        new_paths.append(f"./{output_dir}/{category}/{filename}")
        copied_count += 1

    print(f"\nβ Copied {copied_count} images")
    print(f"β Missing {missing_count} images")
    if skipped_count:
        print(f"Skipped {skipped_count} images over the per-category limit "
              "(original paths kept)")
    print(f"π Categories: {len(categories)}")

    # Save updated paths
    new_pkl_file = 'image_paths_deployment.pkl'
    with open(new_pkl_file, 'wb') as f:
        pickle.dump(new_paths, f)
    print(f"\nβ Saved updated paths to: {new_pkl_file}")

    # Create examples folder
    if create_examples and copied_count > 0:
        examples_path = output_path / 'examples'
        examples_path.mkdir(exist_ok=True)

        # Match common image types case-insensitively; a plain '*.jpg' glob
        # would yield no example for PNG/JPEG/uppercase-extension categories.
        image_suffixes = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'}

        # Copy one image from each of first 5 categories as examples
        example_count = 0
        for category in list(categories)[:5]:
            category_path = output_path / category
            if not category_path.is_dir():
                continue
            # Sorted so the chosen example is the same on every run.
            images = sorted(
                p for p in category_path.iterdir()
                if p.is_file() and p.suffix.lower() in image_suffixes
            )
            if images:
                example_dest = examples_path / f"{category}_{images[0].name}"
                shutil.copy2(images[0], example_dest)
                example_count += 1
        print(f"β Created {example_count} example images in {examples_path}")

    # Print deployment instructions
    print("\n" + "="*60)
    print("DEPLOYMENT INSTRUCTIONS")
    print("="*60)
    print("\n1. Replace 'image_paths.pkl' with 'image_paths_deployment.pkl':")
    print(" > mv image_paths_deployment.pkl image_paths.pkl")
    print("\n2. Your directory structure should look like:")
    print(" βββ app.py")
    print(" βββ requirements.txt")
    print(" βββ README.md")
    print(" βββ faiss_index.bin")
    print(" βββ image_paths.pkl")
    print(" βββ images/")
    print(" βββ examples/")
    print(" βββ category1/")
    print(" βββ category2/")
    print(" βββ ...")
    print("\n3. Commit and push to HuggingFace:")
    print(" > git lfs track '*.bin' '*.pkl'")
    print(" > git add .")
    print(" > git commit -m 'Add images and deployment files'")
    print(" > git push")
    print("\n4. Total size estimate:")

    # Estimate size: sum of every file under output_dir (examples included)
    total_size = 0
    for root, _dirs, files in os.walk(output_dir):
        for file in files:
            total_size += os.path.getsize(os.path.join(root, file))
    size_mb = total_size / (1024 * 1024)
    size_gb = size_mb / 1024
    if size_gb > 1:
        print(f" ~{size_gb:.2f} GB")
    else:
        print(f" ~{size_mb:.2f} MB")
    if size_gb > 10:
        print("\nβ οΈ WARNING: Your dataset is larger than 10GB!")
        print(" Consider using HuggingFace Datasets or reducing images.")

    return new_paths, categories
if __name__ == "__main__":
    # Command-line entry point: translate CLI flags into keyword arguments
    # for prepare_images_for_deployment().
    import argparse

    cli = argparse.ArgumentParser(
        description='Prepare images for HuggingFace deployment'
    )
    cli.add_argument(
        '--pkl-file',
        default='image_paths.pkl',
        help='Path to image_paths.pkl file',
    )
    cli.add_argument(
        '--output-dir',
        default='images',
        help='Output directory for images',
    )
    cli.add_argument(
        '--max-per-category',
        type=int,
        default=None,
        help='Maximum images per category (for size reduction)',
    )
    cli.add_argument(
        '--no-examples',
        action='store_true',
        help='Skip creating example images',
    )
    options = cli.parse_args()

    # --no-examples is a negative flag, so invert it for create_examples.
    want_examples = not options.no_examples

    prepare_images_for_deployment(
        image_paths_file=options.pkl_file,
        output_dir=options.output_dir,
        max_images_per_category=options.max_per_category,
        create_examples=want_examples,
    )