| import json |
| import os |
|
|
def diagnose_dataset(dataset_path="phase7/synth_train_enhanced.json"):
    """Run sanity checks on an enhanced caption dataset and print a report.

    Loads the JSON list at *dataset_path* and reports: total sample count,
    how many referenced image files exist on disk, the average
    ``adjective_count`` per sample, a preview of the first five captions,
    and the expected number of training steps at a fixed batch size of 8.

    Args:
        dataset_path: Path to the JSON dataset file. Each item is expected
            to be a dict with an ``"image"`` key (file path) and a
            ``"caption"`` key; ``"adjective_count"`` is optional and
            treated as 0 when absent.

    Returns:
        A tuple ``(total_samples, existing_images, avg_adjectives)`` on
        success, or ``None`` when the dataset file does not exist (kept
        explicit for callers that check for ``None``).
    """
    if not os.path.exists(dataset_path):
        # NOTE(review): the symbols in these messages look mojibake'd
        # (emoji decoded with the wrong codec); preserved byte-for-byte.
        print("β Enhanced dataset not found!")
        return None

    # Explicit encoding keeps behavior stable across platforms/locales.
    with open(dataset_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print("π DATASET DIAGNOSIS")
    print("=" * 60)
    print(f"π Total samples: {len(data)}")

    # Count how many referenced image files actually exist on disk.
    existing_images = 0
    missing_images = 0
    for item in data:
        if os.path.exists(item["image"]):
            existing_images += 1
        else:
            missing_images += 1
            # Only show the first few misses to keep the report short.
            if missing_images <= 3:
                print(f"β Missing image: {item['image']}")

    print(f"πΌοΈ Images: {existing_images} exist, {missing_images} missing")

    # Average adjective density; guard against an empty dataset.
    total_adjectives = sum(item.get("adjective_count", 0) for item in data)
    avg_adjectives = total_adjectives / len(data) if data else 0

    print(f"π― Adjective density: {avg_adjectives:.2f} (target: β₯3.0)")

    # Preview the first few captions with their adjective counts.
    print(f"π Sample captions (first 5):")
    for i in range(min(5, len(data))):
        adj_count = data[i].get("adjective_count", 0)
        print(f" {i+1}. [{adj_count} adj] {data[i]['caption']}")

    # Floor division: a partial final batch is dropped by the trainer.
    batch_size = 8
    expected_steps = len(data) // batch_size
    print(f"π Expected training: {expected_steps} steps (batch_size={batch_size})")

    return len(data), existing_images, avg_adjectives
|
|
# Allow running this file directly as a standalone diagnostic script.
if __name__ == "__main__":
    diagnose_dataset()
|
|