Spaces:

PavaniKadari
/

AdkraftAI

Sleeping

File size: 7,062 Bytes

24cd5a5

import json
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

class MAdVerseDataLoaderOptimized:
    """Join MAdVerse annotation JSON files with the image files found on disk.

    Expected layout under ``data_path``::

        annotations/*.json     one JSON file per annotation source
        images/<folder>/...    image files, scanned recursively

    Typical call order: ``build_file_index()`` -> ``load_annotations()`` ->
    ``create_metadata_df()`` -> ``save_metadata()``.
    """

    # Sub-folders of ``images/`` that are scanned when building the index.
    SOURCE_FOLDERS = ('Advert_Gallery', 'OnlineAds', 'Epaper1', 'Epaper2')

    # File extensions (compared lower-cased) that count as images.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    # Annotation source name -> image folder name.
    SOURCE_TO_FOLDER = {
        'adgal': 'Advert_Gallery',
        'epaper1': 'Epaper1',
        'epaper2': 'Epaper2',
        'web': 'OnlineAds',
    }

    # Column order of the rows produced by _extract_row_from_madverse_item.
    # Passing it to the DataFrame constructor keeps the columns present even
    # when there are no rows at all.
    METADATA_COLUMNS = (
        'source',
        'source_folder',
        'image_id',
        'image_filename',
        'category',
        'subcategory',
        'brand',
        'language',
        'ad_type',
        'original_path',
    )

    def __init__(self, data_path="./data"):
        """Remember the dataset root; no file system access happens here."""
        self.data_path = Path(data_path)
        self.annotations_path = self.data_path / "annotations"
        self.images_path = self.data_path / "images"
        self.file_index = {}  # Fast lookup: lower-cased filename -> full path

    def build_file_index(self):
        """Walk every source folder once and fill ``self.file_index``.

        Keys are lower-cased file names, values are full paths as strings.
        When the same file name occurs at more than one path the path seen
        last wins; the number of such overwrites is reported.
        """
        print("\n" + "="*80)
        print("Building file index (one-time scan)...")
        print("="*80)

        file_count = 0
        duplicate_count = 0
        for source_folder in self.SOURCE_FOLDERS:
            search_dir = self.images_path / source_folder

            if not search_dir.exists():
                print(f"⚠️  Folder not found: {search_dir}")
                continue

            print(f"\nScanning {source_folder}...")

            for root, _dirs, files in os.walk(search_dir):
                for filename in files:
                    key = filename.lower()
                    if not key.endswith(self.IMAGE_EXTENSIONS):
                        continue

                    full_path = str(Path(root) / filename)
                    previous = self.file_index.get(key)
                    if previous is not None and previous != full_path:
                        duplicate_count += 1
                    self.file_index[key] = full_path
                    file_count += 1

            # The counter is cumulative across folders, hence "so far".
            print(f"  ✓ Found {file_count} images so far")

        if duplicate_count:
            print(f"⚠️  {duplicate_count} duplicate file name(s) overwritten (last path wins)")

        print(f"\n✓ Total files indexed: {len(self.file_index)}")
        print("="*80)

    def load_annotations(self):
        """Load every ``*.json`` file in the annotations folder.

        Returns:
            dict mapping source name (file stem with the ``_annotation`` and
            ``_annot_j`` suffixes removed) to the parsed JSON content. Empty
            when the folder is missing or holds no JSON files.
        """
        annotations = {}

        print("\n" + "="*80)
        print("Loading annotations...")
        print("="*80 + "\n")

        if not self.annotations_path.exists():
            print(f"⚠️  Folder not found: {self.annotations_path}")
            return annotations

        # Sorted so the row order of the resulting metadata does not depend
        # on the order in which the file system lists the directory.
        for json_file in sorted(self.annotations_path.glob("*.json")):
            source_name = json_file.stem.replace("_annotation", "").replace("_annot_j", "")
            print(f"Loading {source_name}...", end=" ")

            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            annotations[source_name] = data
            print(f"✓ ({len(data)} items)")

        return annotations

    def create_metadata_df(self, annotations):
        """Convert annotations to a DataFrame of images that exist on disk.

        Args:
            annotations: dict as returned by ``load_annotations``; each value
                is iterated and every element is passed to
                ``_extract_row_from_madverse_item``.

        Returns:
            DataFrame with the ``METADATA_COLUMNS`` plus ``image_path``. Rows
            whose file name is not in ``self.file_index`` are dropped. With
            no annotation items the result is an empty frame that still has
            all columns.
        """
        rows = []

        print("\n" + "="*80)
        print("Processing annotations...")
        print("="*80 + "\n")

        for source, data in annotations.items():
            print(f"Processing {source} ({len(data)} items)...")

            for item in tqdm(data, desc=f"  {source}", ncols=80):
                row = self._extract_row_from_madverse_item(item, source)
                if row:
                    rows.append(row)

        # Explicit columns: without them an empty ``rows`` list yields a frame
        # with no columns and the 'image_filename' lookup below raises KeyError.
        df = pd.DataFrame(rows, columns=list(self.METADATA_COLUMNS))
        print(f"\n✓ Created DataFrame with {len(df)} rows")

        # Fast image path lookup using pre-built index
        print("\nLinking image paths (using index)...")
        df['image_path'] = [
            self._fast_find_image(name) for name in df['image_filename']
        ]

        # Remove rows where images don't exist. copy() detaches the result
        # from the unfiltered frame so later column assignments by callers do
        # not trigger pandas' chained-assignment warning.
        before_count = len(df)
        df = df[df['image_path'].notna()].copy()
        found_count = len(df)
        missing_count = before_count - found_count
        # Guard the percentage: there may be no rows at all.
        missing_pct = missing_count / before_count * 100 if before_count else 0.0

        print(f"✓ Found {found_count:,} images")
        print(f"✓ Missing {missing_count:,} images ({missing_pct:.1f}%)")

        return df

    def _extract_row_from_madverse_item(self, item, source):
        """Build one metadata row from a MAdVerse annotation item.

        Args:
            item: mapping read with ``.get``; the keys used are ``img_path``,
                ``hier_annot``, ``language`` and ``ad_type``.
            source: annotation source name the item came from.

        Returns:
            dict with the keys listed in ``METADATA_COLUMNS``.
        """
        # ``or ''`` also covers an explicit JSON null, not just a missing key.
        img_path = item.get('img_path') or ''

        # Handle both forward and backward slashes
        img_path = img_path.replace('\\\\', '/').replace('\\', '/')
        filename = Path(img_path).name

        # Hierarchical annotation, read positionally as
        # [category, subcategory, brand]; missing levels get a default.
        hier_annot = item.get('hier_annot') or []
        category = hier_annot[0] if len(hier_annot) > 0 else 'unknown'
        subcategory = hier_annot[1] if len(hier_annot) > 1 else ''
        brand = hier_annot[2] if len(hier_annot) > 2 else ''

        return {
            'source': source,
            'source_folder': self._map_source_to_folder(source),
            # image_id is the file name without its extension
            'image_id': Path(filename).stem,
            'image_filename': filename,
            'category': category,
            'subcategory': subcategory,
            'brand': brand,
            'language': item.get('language', 'unknown'),
            'ad_type': item.get('ad_type', 'unknown'),
            'original_path': img_path,
        }

    def _map_source_to_folder(self, annotation_source):
        """Map an annotation source to its image folder.

        Sources missing from ``SOURCE_TO_FOLDER`` are returned unchanged.
        """
        return self.SOURCE_TO_FOLDER.get(annotation_source, annotation_source)

    def _fast_find_image(self, filename):
        """Return the indexed path for ``filename`` (case-insensitive).

        Returns ``None`` when the name is not indexed or is not a string
        (for example a missing value in a DataFrame column).
        """
        if not isinstance(filename, str):
            return None
        return self.file_index.get(filename.lower())

    def save_metadata(self, df, output_path="./processed/metadata/madverse_metadata.csv"):
        """Write ``df`` to CSV and print summary breakdowns.

        Parent directories of ``output_path`` are created when missing.

        Returns:
            The DataFrame that was passed in, unchanged.
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(output_path, index=False)

        print("\n" + "="*80)
        print("RESULTS")
        print("="*80)
        print(f"\n✓ Metadata saved to: {output_path}")
        print(f"✓ Total images processed: {len(df):,}")

        if len(df) > 0:
            print("\n📊 Breakdown by source:")
            print(df['source'].value_counts())

            print("\n📊 Breakdown by category (top 10):")
            print(df['category'].value_counts().head(10))

            print("\n📊 Breakdown by language:")
            print(df['language'].value_counts())

            print("\n📊 Breakdown by ad_type:")
            print(df['ad_type'].value_counts())

        return df

# Main execution
if __name__ == "__main__":
    # Script entry point: index images, load annotations, build the metadata
    # table, save it, and report how long the whole run took.
    import time

    banner = "=" * 80
    started_at = time.time()

    print("\n" + banner)
    print("MAdVerse Dataset Loader (OPTIMIZED)")
    print(banner)

    loader = MAdVerseDataLoaderOptimized()

    # Steps 1-3: file index, annotations, metadata DataFrame.
    loader.build_file_index()
    annotations = loader.load_annotations()
    df = loader.create_metadata_df(annotations)

    # Step 4: persist and preview, unless nothing could be matched.
    if len(df) == 0:
        print("\n⚠️  No images found. Check your image folders.")
    else:
        loader.save_metadata(df)

        preview_columns = ['source', 'category', 'brand', 'language', 'image_filename']
        print("\n📋 Sample metadata:")
        print(df[preview_columns].head(15))

        print(f"\n✅ Columns: {list(df.columns)}")
        print(f"✅ Shape: {df.shape}")

    elapsed = time.time() - started_at
    elapsed_minutes = elapsed / 60
    print(f"\n⏱️  Total execution time: {elapsed:.2f} seconds ({elapsed_minutes:.2f} minutes)")
    print("\n" + banner)