import json
import os
import random


def create_visual_grounding_dataset(num_examples=1000,
                                    output_path="phase8/visual_grounding_dataset.json"):
    """Generate a synthetic caption dataset for visual-grounding training.

    Fills adjective/object/spatial-relation templates to build caption
    records, writes them as JSON to *output_path*, prints summary
    statistics plus a few example captions, and returns the records.

    Args:
        num_examples: number of caption records to generate (default 1000).
        output_path: destination JSON file (parent directory is created).

    Returns:
        list[dict]: records with keys ``caption``, ``objects``,
        ``adjectives``, ``has_spatial_relations``, ``training_focus``.
    """
    print("🔄 CREATING VISUAL GROUNDING DATASET")
    print("=" * 50)

    # Adjectives that plausibly describe each object category.
    object_adjective_mapping = {
        "person": ["elegant", "expressive", "dynamic", "poised", "animated"],
        "car": ["sleek", "streamlined", "powerful", "gleaming", "modern"],
        "building": ["majestic", "imposing", "architectural", "towering", "historic"],
        "tree": ["lush", "verdant", "towering", "ancient", "leafy"],
        "sky": ["dramatic", "expansive", "atmospheric", "luminous", "cloudy"],
        "water": ["glistening", "tranquil", "rippling", "crystal", "flowing"],
        "animal": ["graceful", "majestic", "wild", "poised", "curious"],
    }

    spatial_relations = [
        "in front of", "behind", "next to", "above", "below",
        "between", "surrounded by", "near", "far from", "across from",
    ]

    # Caption templates covering different improvement areas.
    templates = [
        # Object-specific descriptions
        "a {adjective1} {object1} in a {adjective2} {scene_type}",
        "a {adjective1} {object1} {spatial_relation} a {adjective2} {object2}",
        "multiple {adjective1} {object1}s in a {adjective2} {scene_type}",
        # Spatial relationships
        "a {adjective1} {object1} positioned {spatial_relation} the {adjective2} {object2}",
        "the {adjective1} {object1} stands {spatial_relation} the {adjective2} {object2}",
        # Complex scenes
        "a {scene_type} featuring a {adjective1} {object1} and a {adjective2} {object2}",
        "multiple objects including a {adjective1} {object1} and {adjective2} {object2}",
    ]

    objects = list(object_adjective_mapping.keys())
    scene_types = ["urban landscape", "natural setting",
                   "indoor environment", "street scene"]

    enhanced_dataset = []
    for _ in range(num_examples):
        template = random.choice(templates)

        if "{object1}" in template and "{object2}" in template:
            # Two-object caption, usually with a spatial relationship.
            obj1 = random.choice(objects)
            obj2 = random.choice([o for o in objects if o != obj1])
            adj1 = random.choice(object_adjective_mapping[obj1])
            adj2 = random.choice(object_adjective_mapping[obj2])
            caption = template.format(
                adjective1=adj1,
                adjective2=adj2,
                object1=obj1,
                object2=obj2,
                spatial_relation=random.choice(spatial_relations),
                scene_type=random.choice(scene_types),
            )
            # BUGFIX: record the objects/adjectives actually used in THIS
            # caption (the old code leaked stale values via locals()).
            record_objects = [obj1, obj2]
            record_adjectives = [adj1, adj2]
        else:
            # Single-object description.
            obj = random.choice(objects)
            adj = random.choice(object_adjective_mapping[obj])
            caption = template.format(
                adjective1=adj,
                adjective2=random.choice(["vivid", "dramatic", "serene"]),
                object1=obj,
                scene_type=random.choice(scene_types),
            )
            record_objects = [obj]
            record_adjectives = [adj]

        enhanced_dataset.append({
            "caption": caption,
            "objects": record_objects,
            "adjectives": record_adjectives,
            # True only when the template itself carries a spatial slot.
            "has_spatial_relations": "{spatial_relation}" in template,
            "training_focus": "visual_grounding",
        })

    # Persist the dataset, creating the parent directory if needed.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(enhanced_dataset, f, indent=2)

    print(f"✅ Enhanced dataset created: {output_path}")
    print("📊 Dataset Statistics:")
    print(f"   - Total examples: {len(enhanced_dataset)}")
    print(f"   - Object types: {len(objects)}")
    print(f"   - Spatial relations: {len(spatial_relations)}")
    avg_adjs = (sum(len(item['adjectives']) for item in enhanced_dataset)
                / len(enhanced_dataset))
    print(f"   - Average adjectives per caption: {avg_adjs:.2f}")

    # Show a few example captions (guarded for small datasets).
    print("📝 Example Enhanced Captions:")
    for i in range(min(5, len(enhanced_dataset))):
        print(f"   {i + 1}. {enhanced_dataset[i]['caption']}")

    return enhanced_dataset


if __name__ == "__main__":
    create_visual_grounding_dataset()