Spaces:
Sleeping
Sleeping
Upload 96 files
Browse files. This view is limited to 50 files because it contains too many changes. See the raw diff.
- app.py +486 -0
- outputs/captioning/swin-transformer_final_best.pt +3 -0
- outputs/classification/cls_swin-t_base_cls_raw-20260525-v2_lr-0005_bs-32_adamw_none_wdc-0.05_ls-0.0_best.pth +3 -0
- params.yaml +242 -0
- requirements.txt +342 -0
- src/caption/check_clip_score.py +440 -0
- src/caption/generate_captions_blip.py +220 -0
- src/caption/generate_captions_florence2.py +345 -0
- src/caption/generate_captions_git.py +600 -0
- src/caption/generate_captions_vit_gpt2.py +457 -0
- src/collection/check_class_counts.py +81 -0
- src/collection/collect_filtering_images.py +228 -0
- src/collection/count_label_hf.py +115 -0
- src/collection/download_dataset_hf.py +187 -0
- src/collection/download_dataset_kg.py +56 -0
- src/collection/download_dataset_us.py +233 -0
- src/collection/get_label_list_hf.py +98 -0
- src/collection/select_60_images.py +115 -0
- src/collection/unzip_data_kg.py +45 -0
- src/dataset/.ipynb_checkpoints/captioning_dataset-checkpoint.py +124 -0
- src/dataset/.ipynb_checkpoints/train_sub_tokenizer-checkpoint.py +55 -0
- src/dataset/__pycache__/build_vocab.cpython-310.pyc +0 -0
- src/dataset/__pycache__/captioning_dataset.cpython-310.pyc +0 -0
- src/dataset/__pycache__/classification_dataset.cpython-310.pyc +0 -0
- src/dataset/__pycache__/collate_caption.cpython-310.pyc +0 -0
- src/dataset/build_vocab.py +62 -0
- src/dataset/captioning_dataset.py +124 -0
- src/dataset/classification_dataset.py +69 -0
- src/dataset/collate_caption.py +14 -0
- src/dataset/sub_tokenizer1000.model +3 -0
- src/dataset/sub_tokenizer1000.vocab +1000 -0
- src/dataset/sub_tokenizer1500.model +3 -0
- src/dataset/sub_tokenizer1500.vocab +1500 -0
- src/dataset/sub_tokenizer2000.model +3 -0
- src/dataset/sub_tokenizer2000.vocab +2000 -0
- src/dataset/sub_tokenizing_captions.txt +0 -0
- src/dataset/train_sub_tokenizer.py +55 -0
- src/debug/test_forward.py +208 -0
- src/engines/__pycache__/captioning_trainer.cpython-310.pyc +0 -0
- src/engines/__pycache__/captioning_validator.cpython-310.pyc +0 -0
- src/engines/__pycache__/classification_trainer.cpython-310.pyc +0 -0
- src/engines/__pycache__/classification_validator.cpython-310.pyc +0 -0
- src/engines/__pycache__/resnet18_decoder_trainer.cpython-310.pyc +0 -0
- src/engines/__pycache__/resnet18_decoder_validator.cpython-310.pyc +0 -0
- src/engines/captioning_trainer.py +43 -0
- src/engines/captioning_validator.py +36 -0
- src/engines/classification_trainer.py +70 -0
- src/engines/classification_validator.py +90 -0
- src/metrics/.ipynb_checkpoints/evaluate_caption-checkpoint.py +35 -0
- src/metrics/.ipynb_checkpoints/make_show_all_caption-checkpoint.py +105 -0
app.py
ADDED
|
@@ -0,0 +1,486 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import tempfile
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch
|
| 9 |
+
import yaml
|
| 10 |
+
from PIL import Image
|
| 11 |
+
from pytorch_grad_cam import GradCAM
|
| 12 |
+
from pytorch_grad_cam.utils.image import show_cam_on_image
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Directory that contains this script.
_APP_DIR = Path(__file__).resolve().parent


def _default_workspace_root():
    """Choose the workspace root used when ``WORKSPACE_ROOT`` is not set.

    The historical default (the parent of the script directory) is probed
    first so existing layouts keep working. The script directory itself is
    probed next, which covers layouts where this file sits next to
    ``params.yaml`` and ``src/``. If neither contains ``params.yaml`` the
    historical default is returned unchanged.
    """
    for candidate in (_APP_DIR.parent, _APP_DIR):
        if (candidate / "params.yaml").is_file():
            return candidate
    return _APP_DIR.parent


# An explicit WORKSPACE_ROOT environment variable always wins.
WORKSPACE_ROOT = Path(
    os.environ.get("WORKSPACE_ROOT", _default_workspace_root())
)
SRC_DIR = WORKSPACE_ROOT / "src"
# Make the project packages under src/ importable by the imports below.
sys.path.insert(0, str(SRC_DIR))
|
| 20 |
+
|
| 21 |
+
from models.swin import EncoderSwinTiny
|
| 22 |
+
from transforms.image_transform import get_classification_valid_transform
|
| 23 |
+
from utils.captioning_inference import build_caption_runtime, decode_tokens
|
| 24 |
+
from visualization.generate_gradcam import (
|
| 25 |
+
SwinClassifierWrapper,
|
| 26 |
+
reshape_transform,
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Module-level caches for the two runtimes. Both start empty and are filled
# on first use by get_classification_runtime() / get_captioning_runtime().
CLASSIFICATION_STATE, CAPTIONING_STATE = None, None
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def load_params():
    """Parse ``params.yaml`` from the workspace root and return its content.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file; the rest of this
        module indexes it as a nested mapping (demo, model and checkpoint
        settings).
    """
    params_path = WORKSPACE_ROOT / "params.yaml"
    with params_path.open("r", encoding="utf-8") as stream:
        loaded = yaml.safe_load(stream)
    return loaded
|
| 38 |
+
|
| 39 |
+
# params.yaml의 demo.class_names에서 학습 당시 클래스 목록을 가져온다.
|
| 40 |
+
def load_class_names(params):
    """Return the class-name list stored under ``demo.class_names``.

    Args:
        params: Mapping parsed from ``params.yaml``.

    Returns:
        The configured list of class-name strings, in config order.

    Raises:
        ValueError: If the ``demo`` section is not a mapping, if
            ``class_names`` is not a list of strings, or if the list is
            empty or missing.
    """
    demo_config = params.get("demo")
    if demo_config is None:
        # A bare ``demo:`` key parses to None; treat it like a missing section
        # so the caller gets the "No class names found" error below.
        demo_config = {}
    if not isinstance(demo_config, dict):
        raise ValueError("demo section in params.yaml must be a mapping.")

    class_names = demo_config.get("class_names", [])

    is_string_list = isinstance(class_names, list) and all(
        isinstance(class_name, str) for class_name in class_names
    )
    if not is_string_list:
        raise ValueError("demo.class_names must be a list of class name strings.")

    if not class_names:
        raise ValueError("No class names found in params.yaml demo.class_names.")

    return class_names
|
| 53 |
+
|
| 54 |
+
# CUDA 사용 가능 여부를 기준으로 장치를 선택
|
| 55 |
+
def get_device(params):
    """Select the torch device requested by ``train.device``.

    Args:
        params: Mapping parsed from ``params.yaml``. A missing or null
            ``train`` section is treated as "no preference", which defaults
            to ``"cuda"``.

    Returns:
        The requested CUDA device (``"cuda"`` or an indexed ``"cuda:N"``) when
        CUDA is available, otherwise ``torch.device("cpu")``.
    """
    train_config = params.get("train") or {}
    device_name = train_config.get("device", "cuda")

    wants_cuda = isinstance(device_name, str) and (
        device_name == "cuda" or device_name.startswith("cuda:")
    )

    # Use the GPU only when it is both requested and actually present.
    if wants_cuda and torch.cuda.is_available():
        try:
            return torch.device(device_name)
        except RuntimeError:
            # Malformed index such as "cuda:abc": fall back to the default
            # CUDA device instead of failing at startup.
            return torch.device("cuda")

    return torch.device("cpu")
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def load_classification_checkpoint(model, checkpoint_path, device):
    """Load classifier weights from ``checkpoint_path`` into ``model``.

    Args:
        model: Module whose ``load_state_dict`` receives the weights.
        checkpoint_path: Path of the checkpoint file read by ``torch.load``.
        device: ``map_location`` used while loading.

    The file may hold a bare state dict or a dict that wraps it under the
    ``"model_state_dict"`` key; in the latter case only the wrapped weights
    are used.
    """
    loaded = torch.load(checkpoint_path, map_location=device)

    is_wrapped = isinstance(loaded, dict) and "model_state_dict" in loaded
    state_dict = loaded["model_state_dict"] if is_wrapped else loaded

    model.load_state_dict(state_dict)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def build_classification_runtime():
    """Assemble everything the classification tab needs into one dict.

    Returns:
        A dict with the keys ``params``, ``model``, ``model_name``,
        ``device``, ``class_names``, ``transform`` and ``checkpoint_path``.

    Raises:
        ValueError: If ``classification.model_name`` is not ``"swin_t"``.
    """
    params = load_params()
    classification_config = params["classification"]
    model_name = classification_config["model_name"]

    # Model construction and the Grad-CAM wrapper are Swin-T specific, so any
    # other architecture is rejected up front.
    if model_name != "swin_t":
        raise ValueError(
            "The combined Gradio demo currently supports only swin_t "
            f"for classification, got: {model_name}"
        )

    class_names = load_class_names(params)
    device = get_device(params)

    model = EncoderSwinTiny(num_classes=len(class_names))
    model = model.to(device)

    checkpoint_path = WORKSPACE_ROOT / classification_config["final_checkpoint"]
    load_classification_checkpoint(model, checkpoint_path, device)
    model.eval()

    transform = get_classification_valid_transform()
    runtime = {
        "params": params,
        "model": model,
        "model_name": model_name,
        "device": device,
        "class_names": class_names,
        "transform": transform,
        "checkpoint_path": checkpoint_path,
    }
    return runtime
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def get_classification_runtime():
    """Return the cached classification runtime, building it on first use."""
    global CLASSIFICATION_STATE

    if CLASSIFICATION_STATE is not None:
        return CLASSIFICATION_STATE

    # Nothing is loaded until the first prediction request reaches this point.
    CLASSIFICATION_STATE = build_classification_runtime()
    return CLASSIFICATION_STATE
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def get_caption_checkpoint_path(params):
    """Resolve the captioning checkpoint path from the parsed config.

    Args:
        params: Mapping parsed from ``params.yaml``.

    Returns:
        ``WORKSPACE_ROOT / save_dir / file_name``. ``file_name`` is
        ``captioning.checkpoint.final_checkpoint`` when that value is set,
        otherwise ``<encoder>-<decoder>_<version>_best.pt`` built from the
        ``captioning`` section.
    """
    checkpoint_config = params["captioning"]["checkpoint"]
    file_name = checkpoint_config.get("final_checkpoint")

    if not file_name:
        # No explicit file configured: derive the name from the model setup.
        encoder_name = params["captioning"]["encoder"]
        decoder_name = params["captioning"]["decoder"]
        version = params["captioning"]["version"]
        file_name = f"{encoder_name}-{decoder_name}_{version}_best.pt"

    return WORKSPACE_ROOT / checkpoint_config["save_dir"] / file_name
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def get_captioning_runtime():
    """Return the cached captioning runtime, building it on first use."""
    global CAPTIONING_STATE

    if CAPTIONING_STATE is not None:
        return CAPTIONING_STATE

    # Building is deferred until the captioning tab is actually used.
    checkpoint_path = get_caption_checkpoint_path(load_params())
    CAPTIONING_STATE = build_caption_runtime(
        WORKSPACE_ROOT,
        checkpoint_path=checkpoint_path,
    )
    return CAPTIONING_STATE
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def make_gradcam_overlay(model, image, tensor, device):
    """Render a Grad-CAM heat map blended over the input image.

    Args:
        model: Classifier exposing ``backbone`` and ``classifier`` sub-modules;
            the CAM target is ``backbone.features[-1][-1].norm2``.
        image: PIL image used as the background of the overlay.
        tensor: Input batch handed to Grad-CAM.
        device: Device the Grad-CAM wrapper module is moved to.

    Returns:
        A PIL image built from the overlay drawn on ``image`` resized to
        224x224.
    """
    # Grad-CAM needs gradients, so frozen parameters are switched on for the
    # duration of the computation and restored afterwards; the model object
    # is shared with the plain inference path.
    tracked_params = [
        *model.backbone.parameters(),
        *model.classifier.parameters(),
    ]
    previous_flags = [param.requires_grad for param in tracked_params]
    for param in tracked_params:
        param.requires_grad = True

    try:
        gradcam_model = SwinClassifierWrapper(model).to(device)
        gradcam_model.eval()

        # NOTE(review): 224x224 is hard-coded and presumably matches the
        # spatial size of ``tensor`` — confirm against the transform.
        resized_image = image.resize((224, 224))
        image_np = np.array(resized_image).astype(np.float32) / 255.0
        target_layer = model.backbone.features[-1][-1].norm2

        with GradCAM(
            model=gradcam_model,
            target_layers=[target_layer],
            reshape_transform=reshape_transform,
        ) as cam:
            grayscale_cam = cam(input_tensor=tensor)[0]
    finally:
        for param, flag in zip(tracked_params, previous_flags):
            param.requires_grad = flag

    overlay = show_cam_on_image(
        image_np,
        grayscale_cam,
        use_rgb=True,
    )

    return Image.fromarray(overlay)
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def predict_classification(image, show_gradcam):
    """Classify an uploaded image and optionally attach a Grad-CAM overlay.

    Args:
        image: PIL image from the Gradio input, or ``None`` when nothing was
            uploaded.
        show_gradcam: When truthy, a Grad-CAM overlay is generated as well.

    Returns:
        A 4-tuple ``(gradcam_image, summary, confidences, table)`` matching
        the four Gradio outputs: the overlay (or ``None``), a one-line
        prediction summary, a ``{class_name: probability}`` dict for the
        top-k classes, and ``[rank, class_name, "xx.xx%"]`` rows.
    """
    # Without an image, return placeholders for all four outputs.
    if image is None:
        return None, "Please upload an image.", {}, []

    runtime = get_classification_runtime()
    params = runtime["params"]
    model = runtime["model"]
    device = runtime["device"]
    class_names = runtime["class_names"]
    transform = runtime["transform"]

    image = image.convert("RGB")
    tensor = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        probs = torch.softmax(model(tensor), dim=1)[0]

    # Clamp the configured top_k to the range [1, number of classes].
    requested_k = int(params["demo"].get("top_k", 5))
    top_k = max(1, min(requested_k, len(class_names)))

    top_probs, top_indices = torch.topk(probs, k=top_k)
    top_probs = top_probs.detach().cpu().tolist()
    top_indices = top_indices.detach().cpu().tolist()

    confidences = {}
    table = []
    for rank, (idx, prob) in enumerate(zip(top_indices, top_probs), start=1):
        confidences[class_names[idx]] = float(prob)
        table.append([rank, class_names[idx], f"{prob * 100:.2f}%"])

    predicted_label = class_names[top_indices[0]]
    predicted_confidence = top_probs[0]
    summary = (
        f"Prediction: {predicted_label} "
        f"({predicted_confidence * 100:.2f}%)"
    )

    # Grad-CAM is only computed when the user asked for it.
    gradcam_image = (
        make_gradcam_overlay(model, image, tensor, device)
        if show_gradcam
        else None
    )

    return gradcam_image, summary, confidences, table
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def caption_token_labels(generated_tokens, runtime, caption):
    """Build the per-token labels used as titles of the attention heat maps.

    Args:
        generated_tokens: Iterable of generated token ids. Elements may be
            plain ints or scalar objects exposing ``.item()`` (for example
            elements of a 1-D tensor).
        runtime: Dict providing the ``"w2i"`` (word to id) and ``"i2w"``
            (id to word) vocabularies.
        caption: Decoded caption string, used only as a fallback.

    Returns:
        The vocabulary words for every token that is not ``<pad>``, ``<sos>``
        or ``<eos>`` (unknown ids map to ``"<unk>"``). If that list is empty,
        ``caption.split()`` is returned instead.
    """
    w2i = runtime["w2i"]
    special_ids = {w2i.get(name) for name in ("<pad>", "<sos>", "<eos>")}

    labels = []
    for token in generated_tokens:
        # A tensor element does not hash like the int it holds, so it would
        # never match ``special_ids`` or the ``i2w`` keys; unwrap it first.
        token_id = token.item() if hasattr(token, "item") else token
        if token_id not in special_ids:
            labels.append(runtime["i2w"].get(token_id, "<unk>"))

    # Id-based labels are preferred because they line up with the attention
    # steps; splitting the caption is a last resort.
    return labels or caption.split()
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
@torch.no_grad()
def predict_captioning(image):
    """Generate a caption and per-layer cross-attention heat maps.

    Args:
        image: PIL image from the Gradio input, or ``None`` when nothing was
            uploaded.

    Returns:
        A 2-tuple ``(caption, heatmap_images)``. ``heatmap_images`` is a list
        of ``(PIL image, "Layer N")`` pairs, one per decoder layer, suitable
        for a Gradio Gallery.
    """
    # Without an image, return placeholders for both outputs.
    if image is None:
        return "Please upload an image.", []

    runtime = get_captioning_runtime()
    params = runtime["params"]
    decoder = runtime["decoder"]
    eos_id = runtime["w2i"]["<eos>"]

    image = image.convert("RGB")
    image_tensor = runtime["transform"](image)
    image_tensor = image_tensor.unsqueeze(0).to(runtime["device"])

    features = runtime["encoder"](
        image_tensor,
        return_features=True,
    )
    start_token = torch.full(
        (features.size(0),),
        runtime["w2i"]["<sos>"],
        dtype=torch.long,
        device=runtime["device"],
    )

    beam_config = params["captioning"]["beam_search"]
    use_beam_search = beam_config.get("use_beam_search", True)
    beam_size = beam_config.get("beam_size", 3)

    if use_beam_search:
        # Beam search explores several candidate captions.
        generated_tokens, _, enc_dec_atten = decoder.generate_beam(
            features,
            start_token,
            eos_id,
            beam_size,
        )
    else:
        # With beam search disabled, use the decoder's plain generate().
        generated_tokens, _, enc_dec_atten = decoder.generate(
            features,
            start_token,
            eos_id,
        )

    caption = decode_tokens(
        generated_tokens[0],
        runtime["w2i"],
        runtime["i2w"],
        params["captioning"]["tokenizer"]["use_subword"],
        sp_model_path=runtime["sp_model_path"],
    )

    caption_tokens = caption_token_labels(
        generated_tokens[0],
        runtime,
        caption,
    )

    heatmap_images = []
    n_layers = len(decoder.layers)
    display_image = image_tensor.squeeze(0).detach().cpu()

    # The decoder writes each heat map to a file. Read the files back into
    # memory and let the context manager delete the directory, so requests
    # do not pile up temporary directories on disk.
    with tempfile.TemporaryDirectory(
        prefix="combined_captioning_gradio_"
    ) as tmp_dir:
        for layer in range(1, n_layers + 1):
            cross_atten_path = Path(tmp_dir) / f"cross_attention_layer_{layer}.jpg"
            decoder.show_cross_atten(
                enc_dec_atten[0],
                caption_tokens,
                layer,
                display_image,
                str(cross_atten_path),
            )
            with Image.open(cross_atten_path) as heatmap:
                # copy() loads the pixels so the file can be removed safely.
                heatmap_images.append((heatmap.copy(), f"Layer {layer}"))

    return caption, heatmap_images
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
def create_demo():
    """Build the Gradio Blocks app with a Classification and a Captioning tab.

    The "Predict" button is wired to ``predict_classification`` and the
    "Generate Caption" button to ``predict_captioning``. Only ``params.yaml``
    is read here.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    params = load_params()
    top_k = max(1, int(params["demo"].get("top_k", 5)))
    caption_checkpoint = get_caption_checkpoint_path(params)

    with gr.Blocks(title="ImageNet Classification and Captioning Demo") as demo:
        gr.Markdown("# ImageNet Classification and Captioning Demo")

        with gr.Tabs():
            # ---- Classification tab ----
            with gr.Tab("Classification"):
                gr.Markdown("Upload an image and classify it with the final checkpoint.")
                gr.Markdown(
                    f"checkpoint: {WORKSPACE_ROOT / params['classification']['final_checkpoint']}"
                )

                with gr.Row():
                    with gr.Column():
                        cls_image = gr.Image(type="pil", label="Input Image")
                        cls_gradcam_toggle = gr.Checkbox(
                            value=bool(params["demo"].get("show_gradcam", True)),
                            label="Show Grad-CAM",
                        )
                        cls_button = gr.Button("Predict", variant="primary")

                    with gr.Column():
                        cls_gradcam = gr.Image(type="pil", label="Grad-CAM")
                        cls_summary = gr.Textbox(label="Prediction")
                        cls_scores = gr.Label(
                            label="Top Prediction Scores",
                            num_top_classes=top_k,
                        )
                        cls_table = gr.Dataframe(
                            headers=["Rank", "Class", "Confidence"],
                            datatype=["number", "str", "str"],
                            label=f"Top-{top_k}",
                            interactive=False,
                        )

                # Output order must match predict_classification's return tuple.
                cls_button.click(
                    fn=predict_classification,
                    inputs=[cls_image, cls_gradcam_toggle],
                    outputs=[cls_gradcam, cls_summary, cls_scores, cls_table],
                )

            # ---- Captioning tab ----
            with gr.Tab("Captioning"):
                gr.Markdown(
                    "Upload an image and generate a caption with cross-attention heatmaps."
                )
                gr.Markdown(f"checkpoint: {caption_checkpoint}")

                with gr.Row():
                    with gr.Column():
                        cap_image = gr.Image(type="pil", label="Input Image")
                        cap_button = gr.Button("Generate Caption", variant="primary")

                    with gr.Column():
                        cap_text = gr.Textbox(label="Generated Caption", lines=4)
                        cap_gallery = gr.Gallery(
                            label="Cross Attention Heatmaps",
                            columns=2,
                            object_fit="contain",
                            height="auto",
                        )

                # Output order must match predict_captioning's return tuple.
                cap_button.click(
                    fn=predict_captioning,
                    inputs=[cap_image],
                    outputs=[cap_text, cap_gallery],
                )

    return demo
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it with the host, port and
    # share settings from the demo section of params.yaml.
    params = load_params()
    demo = create_demo()

    launch_options = {
        "server_name": params["demo"]["host"],
        "server_port": params["demo"]["port"],
        "share": params["demo"]["share"],
    }
    demo.launch(**launch_options)
|
outputs/captioning/swin-transformer_final_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9897bcdee87a9b20241c9c742c27feb0f1204cf2d65456f3d892300a23b59adc
|
| 3 |
+
size 468449515
|
outputs/classification/cls_swin-t_base_cls_raw-20260525-v2_lr-0005_bs-32_adamw_none_wdc-0.05_ls-0.0_best.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ec30dab609101cc6a58722968771d66a81d44cd3098e79975c4c4ab59141b1b2
|
| 3 |
+
size 112163475
|
params.yaml
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
project:
|
| 2 |
+
name: imagenet-project
|
| 3 |
+
|
| 4 |
+
data:
|
| 5 |
+
raw_dir: ./data/raw/
|
| 6 |
+
captions_file: ./data/captioning/annotations/train.json
|
| 7 |
+
dataset_version: cls_raw-20260525-v2
|
| 8 |
+
# dataset_version: raw-20260509-v1
|
| 9 |
+
|
| 10 |
+
split:
|
| 11 |
+
train_ratio: 0.7
|
| 12 |
+
val_ratio: 0.15
|
| 13 |
+
test_ratio: 0.15
|
| 14 |
+
|
| 15 |
+
train:
|
| 16 |
+
seed: 42
|
| 17 |
+
# repeated experiment
|
| 18 |
+
# seed: 7
|
| 19 |
+
# seed: 21
|
| 20 |
+
epochs: 20
|
| 21 |
+
batch_size: 32
|
| 22 |
+
num_workers: 4
|
| 23 |
+
device: cuda
|
| 24 |
+
optimizer: adam
|
| 25 |
+
|
| 26 |
+
preprocess:
|
| 27 |
+
image_size: 224
|
| 28 |
+
normalize: true
|
| 29 |
+
|
| 30 |
+
loss:
|
| 31 |
+
name: cross_entropy
|
| 32 |
+
ignore_index: pad_token
|
| 33 |
+
|
| 34 |
+
evaluate:
|
| 35 |
+
batch_size: 32
|
| 36 |
+
metrics:
|
| 37 |
+
- bleu
|
| 38 |
+
- rouge_l
|
| 39 |
+
- meteor
|
| 40 |
+
|
| 41 |
+
logging:
|
| 42 |
+
use_wandb: true
|
| 43 |
+
project_name: imagenet-project
|
| 44 |
+
log_interval: 10
|
| 45 |
+
|
| 46 |
+
outputs:
|
| 47 |
+
base_dir: outputs
|
| 48 |
+
|
| 49 |
+
demo:
|
| 50 |
+
host: 0.0.0.0
|
| 51 |
+
port: 7860
|
| 52 |
+
share: false
|
| 53 |
+
top_k: 5
|
| 54 |
+
show_gradcam: true
|
| 55 |
+
class_names : [airplane, apple, aster, banana, bicycle, bracelet, bulldog, bus, butterfly, car, carrot, cucumber, cup-cake, daisy, dandelion, dumpling, earrings, elephant, glasses, golden-retriever, hamburger, horse, iris, lavender, lily, marigold, motorcycle, necklace, orange, orchid, pants, pasta, penguin, persian-cat, pizza, rose, salad, sandwich, sheep, siamese-cat, sneakers, squirrel, steak, strawberry, sunflower, sushi, tomato, t-shirt, tulip, waffle]
|
| 56 |
+
|
| 57 |
+
cnn:
|
| 58 |
+
backbone: resnet18
|
| 59 |
+
pretrained: true
|
| 60 |
+
freeze: true
|
| 61 |
+
output_dim: 512
|
| 62 |
+
dropout: 0.3
|
| 63 |
+
pooling: avg
|
| 64 |
+
|
| 65 |
+
captioning:
|
| 66 |
+
|
| 67 |
+
# encoder: resnet18
|
| 68 |
+
encoder: swin
|
| 69 |
+
# encoder: vit
|
| 70 |
+
decoder: transformer
|
| 71 |
+
# decoder: lstm
|
| 72 |
+
# decoder: gru
|
| 73 |
+
version: final
|
| 74 |
+
|
| 75 |
+
epochs: 25
|
| 76 |
+
learning_rate: 0.0001
|
| 77 |
+
batch_size: 32
|
| 78 |
+
optimizer: adamw
|
| 79 |
+
max_caption_length: 30
|
| 80 |
+
train_num_caption: 2
|
| 81 |
+
|
| 82 |
+
debug: False
|
| 83 |
+
|
| 84 |
+
lstm:
|
| 85 |
+
embed_dim: 256
|
| 86 |
+
hidden_dim: 512
|
| 87 |
+
num_layers: 1
|
| 88 |
+
|
| 89 |
+
gru:
|
| 90 |
+
embed_dim: 256
|
| 91 |
+
hidden_dim: 512
|
| 92 |
+
num_layers: 1
|
| 93 |
+
|
| 94 |
+
transformer:
|
| 95 |
+
n_layers: 6
|
| 96 |
+
nhead: 8
|
| 97 |
+
d_model: 512
|
| 98 |
+
drop_p: 0.3
|
| 99 |
+
label_smoothing: 0
|
| 100 |
+
weight_decay: 0.001
|
| 101 |
+
|
| 102 |
+
data:
|
| 103 |
+
dataset_version: cap_raw-20260524-v1
|
| 104 |
+
train_img: ./data/captioning/raw/train/
|
| 105 |
+
train_caption: ./data/captioning/annotations/train.json
|
| 106 |
+
val_img: ./data/captioning/raw/val/
|
| 107 |
+
val_caption: ./data/captioning/annotations/val.json
|
| 108 |
+
test_img: ./data/captioning/raw/test/
|
| 109 |
+
test_caption: ./data/captioning/annotations/test.json
|
| 110 |
+
|
| 111 |
+
tokenizer:
|
| 112 |
+
min_freq: 3
|
| 113 |
+
max_vocab_size: 10000
|
| 114 |
+
sp_vocab_size: 2000
|
| 115 |
+
use_subword: False
|
| 116 |
+
sp_model_path: ./src/dataset/sub_tokenizer2000.model
|
| 117 |
+
|
| 118 |
+
checkpoint:
|
| 119 |
+
save_dir: ./outputs/captioning
|
| 120 |
+
final_checkpoint: swin-transformer_final_best.pt
|
| 121 |
+
resume: False
|
| 122 |
+
|
| 123 |
+
heatmap:
|
| 124 |
+
dec_atten_dir: /workspace/outputs/captioning/heatmap/
|
| 125 |
+
enc_dec_atten_dir: /workspace/outputs/captioning/heatmap/
|
| 126 |
+
layer: 6 # 몇번째 층
|
| 127 |
+
sample: [0, 410, 820, 1230, 1640] # caption & heatmap 몇번째 샘플(batch)
|
| 128 |
+
|
| 129 |
+
scheduler:
|
| 130 |
+
use_scheduler: False
|
| 131 |
+
warmup_step: 500
|
| 132 |
+
lr_scale: 0.5
|
| 133 |
+
|
| 134 |
+
beam_search:
|
| 135 |
+
use_beam_search: True
|
| 136 |
+
beam_size: 3
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
classification:
|
| 141 |
+
|
| 142 |
+
# model_name: resnet18
|
| 143 |
+
# model_name: efficientnet_b0
|
| 144 |
+
# model_name: convnext_tiny
|
| 145 |
+
# model_name: mobilenet_v3_small
|
| 146 |
+
# model_name: vit_b_16
|
| 147 |
+
model_name: swin_t
|
| 148 |
+
# model_name: deit_tiny_patch16_224
|
| 149 |
+
|
| 150 |
+
final_checkpoint: ./outputs/classification/cls_swin-t_base_cls_raw-20260525-v2_lr-0005_bs-32_adamw_none_wdc-0.05_ls-0.0_best.pth
|
| 151 |
+
|
| 152 |
+
epochs: 50
|
| 153 |
+
|
| 154 |
+
learning_rate:
|
| 155 |
+
|
| 156 |
+
# baseline
|
| 157 |
+
cnn: 0.001
|
| 158 |
+
transformer: 0.0005
|
| 159 |
+
|
| 160 |
+
# hyperparameter tuning
|
| 161 |
+
# cnn: 0.0005
|
| 162 |
+
# transformer: 0.0001
|
| 163 |
+
|
| 164 |
+
# optimizer: adam
|
| 165 |
+
# optimizer: sgd
|
| 166 |
+
optimizer: adamw
|
| 167 |
+
|
| 168 |
+
# default
|
| 169 |
+
# weight_decay: 0.01
|
| 170 |
+
|
| 171 |
+
# tuning
|
| 172 |
+
weight_decay: 0.05
|
| 173 |
+
|
| 174 |
+
scheduler:
|
| 175 |
+
use: false
|
| 176 |
+
|
| 177 |
+
# use: true
|
| 178 |
+
# name: cosineannealinglr
|
| 179 |
+
|
| 180 |
+
augmentation:
|
| 181 |
+
|
| 182 |
+
# baseline
|
| 183 |
+
use_aug: false
|
| 184 |
+
type: none
|
| 185 |
+
|
| 186 |
+
# mixup
|
| 187 |
+
# use_aug: true
|
| 188 |
+
# type: mixup
|
| 189 |
+
|
| 190 |
+
# cutmix
|
| 191 |
+
# use_aug: true
|
| 192 |
+
# type: cutmix
|
| 193 |
+
|
| 194 |
+
label_smoothing: 0.0
|
| 195 |
+
|
| 196 |
+
# label smoothing experiment
|
| 197 |
+
# label_smoothing: 0.05
|
| 198 |
+
# label_smoothing: 0.1
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
metrics:
|
| 202 |
+
|
| 203 |
+
train:
|
| 204 |
+
- loss
|
| 205 |
+
- accuracy
|
| 206 |
+
|
| 207 |
+
validation:
|
| 208 |
+
- loss
|
| 209 |
+
- accuracy
|
| 210 |
+
- macro_f1
|
| 211 |
+
|
| 212 |
+
final_test:
|
| 213 |
+
- accuracy
|
| 214 |
+
- macro_f1
|
| 215 |
+
- precision
|
| 216 |
+
- recall
|
| 217 |
+
- confusion_matrix
|
| 218 |
+
|
| 219 |
+
checkpoint:
|
| 220 |
+
save_dir: /workspace/outputs/classification
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
latent_space:
|
| 224 |
+
data_dir: /workspace/data/raw
|
| 225 |
+
checkpoint: /workspace/outputs/classification/cls_swin-t_base_cls_raw-20260525-v2_lr-0005_bs-32_adamw_none_wdc-0.05_ls-0.0_best.pth
|
| 226 |
+
output_dir: /workspace/outputs/latent_space
|
| 227 |
+
output_umap_npy: cls_swin-t_best_umap_2d_test_nb10_md05
|
| 228 |
+
output_umap_png: cls_swin-t_best_umap_plt_test_nb10_md05
|
| 229 |
+
output_meta_csv: cls_swin-t_best_metadata_test_nb10_md05
|
| 230 |
+
split: test
|
| 231 |
+
batch_size: 32
|
| 232 |
+
num_workers: 4
|
| 233 |
+
device: cuda
|
| 234 |
+
seed: 42
|
| 235 |
+
save_meta: true
|
| 236 |
+
use_wandb: true
|
| 237 |
+
wandb_name: latent_space_umap
|
| 238 |
+
|
| 239 |
+
umap:
|
| 240 |
+
n_neighbors: 10
|
| 241 |
+
min_dist: 0.5
|
| 242 |
+
metric: cosine
|
requirements.txt
ADDED
|
@@ -0,0 +1,342 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
absl-py==2.4.0
|
| 2 |
+
accelerate==0.27.2
|
| 3 |
+
aiobotocore==3.5.0
|
| 4 |
+
aiohappyeyeballs==2.6.1
|
| 5 |
+
aiohttp==3.13.5
|
| 6 |
+
aiohttp-retry==2.9.1
|
| 7 |
+
aioitertools==0.13.0
|
| 8 |
+
aiosignal==1.4.0
|
| 9 |
+
albumentations==1.4.7
|
| 10 |
+
amqp==5.3.1
|
| 11 |
+
annotated-doc==0.0.4
|
| 12 |
+
annotated-types==0.7.0
|
| 13 |
+
antlr4-python3-runtime==4.9.3
|
| 14 |
+
anyio==4.13.0
|
| 15 |
+
appdirs==1.4.4
|
| 16 |
+
archspec @ file:///croot/archspec_1697725767277/work
|
| 17 |
+
argon2-cffi==25.1.0
|
| 18 |
+
argon2-cffi-bindings==25.1.0
|
| 19 |
+
arrow==1.4.0
|
| 20 |
+
asttokens @ file:///opt/conda/conda-bld/asttokens_1646925590279/work
|
| 21 |
+
astunparse==1.6.3
|
| 22 |
+
async-lru==2.3.0
|
| 23 |
+
async-timeout==5.0.1
|
| 24 |
+
asyncssh==2.22.0
|
| 25 |
+
atpublic==7.0.0
|
| 26 |
+
attrs==26.1.0
|
| 27 |
+
babel==2.18.0
|
| 28 |
+
backoff==2.2.1
|
| 29 |
+
beautifulsoup4 @ file:///croot/beautifulsoup4-split_1681493039619/work
|
| 30 |
+
billiard==4.2.4
|
| 31 |
+
bleach==6.3.0
|
| 32 |
+
boltons @ file:///croot/boltons_1677628692245/work
|
| 33 |
+
boto3==1.42.91
|
| 34 |
+
botocore==1.42.91
|
| 35 |
+
Brotli @ file:///tmp/abs_ecyw11_7ze/croots/recipe/brotli-split_1659616059936/work
|
| 36 |
+
celery==5.6.3
|
| 37 |
+
certifi==2026.4.22
|
| 38 |
+
cffi==2.0.0
|
| 39 |
+
chardet @ file:///home/builder/ci_310/chardet_1640804867535/work
|
| 40 |
+
charset-normalizer==3.4.7
|
| 41 |
+
click==8.3.3
|
| 42 |
+
click-didyoumean==0.3.1
|
| 43 |
+
click-plugins==1.1.1.2
|
| 44 |
+
click-repl==0.3.0
|
| 45 |
+
colorama==0.4.6
|
| 46 |
+
comm==0.2.3
|
| 47 |
+
conda @ file:///croot/conda_1696257509808/work
|
| 48 |
+
conda-build @ file:///croot/conda-build_1708025865815/work
|
| 49 |
+
conda-content-trust @ file:///croot/conda-content-trust_1693490622020/work
|
| 50 |
+
conda-libmamba-solver @ file:///croot/conda-libmamba-solver_1691418897561/work/src
|
| 51 |
+
conda-package-handling @ file:///croot/conda-package-handling_1690999929514/work
|
| 52 |
+
conda_index @ file:///croot/conda-index_1706633791028/work
|
| 53 |
+
conda_package_streaming @ file:///croot/conda-package-streaming_1690987966409/work
|
| 54 |
+
configobj==5.0.9
|
| 55 |
+
contourpy==1.3.2
|
| 56 |
+
cryptography @ file:///croot/cryptography_1707523700518/work
|
| 57 |
+
cuda-bindings==12.9.4
|
| 58 |
+
cuda-pathfinder==1.5.4
|
| 59 |
+
cuda-toolkit==12.8.1
|
| 60 |
+
cycler==0.12.1
|
| 61 |
+
dacite==1.6.0
|
| 62 |
+
dagshub==0.7.0
|
| 63 |
+
dagshub-annotation-converter==0.2.0
|
| 64 |
+
dataclasses-json==0.6.7
|
| 65 |
+
datasets==2.18.0
|
| 66 |
+
debugpy==1.8.20
|
| 67 |
+
decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work
|
| 68 |
+
defusedxml==0.7.1
|
| 69 |
+
dictdiffer==0.9.0
|
| 70 |
+
dill==0.3.8
|
| 71 |
+
diskcache==5.6.3
|
| 72 |
+
distro @ file:///croot/distro_1701455004953/work
|
| 73 |
+
dnspython==2.6.1
|
| 74 |
+
docker-pycreds==0.4.0
|
| 75 |
+
dpath==2.2.0
|
| 76 |
+
dulwich==1.2.0
|
| 77 |
+
dvc==3.67.1
|
| 78 |
+
dvc-data==3.18.3
|
| 79 |
+
dvc-http==2.32.0
|
| 80 |
+
dvc-objects==5.2.0
|
| 81 |
+
dvc-render==1.0.2
|
| 82 |
+
dvc-s3==3.3.0
|
| 83 |
+
dvc-studio-client==0.22.0
|
| 84 |
+
dvc-task==0.40.2
|
| 85 |
+
entrypoints==0.4
|
| 86 |
+
exceptiongroup==1.3.1
|
| 87 |
+
executing @ file:///opt/conda/conda-bld/executing_1646925071911/work
|
| 88 |
+
expecttest==0.2.1
|
| 89 |
+
fastjsonschema==2.21.2
|
| 90 |
+
filelock==3.29.0
|
| 91 |
+
flatten-dict==0.4.2
|
| 92 |
+
flufl.lock==9.0.0
|
| 93 |
+
fonttools==4.62.1
|
| 94 |
+
fqdn==1.5.1
|
| 95 |
+
frozenlist==1.8.0
|
| 96 |
+
fsspec==2024.2.0
|
| 97 |
+
ftfy==6.3.1
|
| 98 |
+
funcy==2.0
|
| 99 |
+
gitdb==4.0.12
|
| 100 |
+
GitPython==3.1.49
|
| 101 |
+
gmpy2 @ file:///tmp/build/80754af9/gmpy2_1645455533097/work
|
| 102 |
+
gql==4.0.0
|
| 103 |
+
grandalf==0.8
|
| 104 |
+
graphql-core==3.2.8
|
| 105 |
+
gto==1.9.0
|
| 106 |
+
h11==0.16.0
|
| 107 |
+
hf-xet==1.4.3
|
| 108 |
+
httpcore==1.0.9
|
| 109 |
+
httpx==0.28.1
|
| 110 |
+
huggingface-hub==0.20.3
|
| 111 |
+
hydra-core==1.3.2
|
| 112 |
+
hypothesis==6.98.10
|
| 113 |
+
idna==3.13
|
| 114 |
+
ImageHash==4.3.1
|
| 115 |
+
ImageIO==2.37.3
|
| 116 |
+
ipykernel==7.2.0
|
| 117 |
+
ipython @ file:///croot/ipython_1704833016303/work
|
| 118 |
+
isoduration==20.11.0
|
| 119 |
+
iterative-telemetry==0.0.10
|
| 120 |
+
jedi @ file:///tmp/build/80754af9/jedi_1644315229345/work
|
| 121 |
+
Jinja2==3.1.6
|
| 122 |
+
jmespath==1.1.0
|
| 123 |
+
joblib==1.5.3
|
| 124 |
+
json5==0.14.0
|
| 125 |
+
jsonpatch @ file:///tmp/build/80754af9/jsonpatch_1615747632069/work
|
| 126 |
+
jsonpointer==2.1
|
| 127 |
+
jsonschema @ file:///croot/jsonschema_1699041609003/work
|
| 128 |
+
jsonschema-specifications @ file:///croot/jsonschema-specifications_1699032386549/work
|
| 129 |
+
jupyter-events==0.12.1
|
| 130 |
+
jupyter-lsp==2.3.1
|
| 131 |
+
jupyter_client==8.8.0
|
| 132 |
+
jupyter_core==5.9.1
|
| 133 |
+
jupyter_server==2.18.2
|
| 134 |
+
jupyter_server_terminals==0.5.4
|
| 135 |
+
jupyterlab==4.5.6
|
| 136 |
+
jupyterlab_pygments==0.3.0
|
| 137 |
+
jupyterlab_server==2.28.0
|
| 138 |
+
kiwisolver==1.5.0
|
| 139 |
+
kombu==5.6.2
|
| 140 |
+
lazy-loader==0.5
|
| 141 |
+
libarchive-c @ file:///tmp/build/80754af9/python-libarchive-c_1617780486945/work
|
| 142 |
+
libmambapy @ file:///croot/mamba-split_1698782620632/work/libmambapy
|
| 143 |
+
lightning-utilities==0.15.3
|
| 144 |
+
lxml==6.1.0
|
| 145 |
+
markdown-it-py==4.0.0
|
| 146 |
+
MarkupSafe==3.0.3
|
| 147 |
+
marshmallow==3.26.2
|
| 148 |
+
matplotlib==3.8.3
|
| 149 |
+
matplotlib-inline @ file:///opt/conda/conda-bld/matplotlib-inline_1662014470464/work
|
| 150 |
+
mdurl==0.1.2
|
| 151 |
+
menuinst @ file:///croot/menuinst_1706732933928/work
|
| 152 |
+
mistune==3.2.1
|
| 153 |
+
mkl-fft @ file:///croot/mkl_fft_1695058164594/work
|
| 154 |
+
mkl-random @ file:///croot/mkl_random_1695059800811/work
|
| 155 |
+
mkl-service==2.4.0
|
| 156 |
+
more-itertools @ file:///croot/more-itertools_1700662129964/work
|
| 157 |
+
mpmath==1.3.0
|
| 158 |
+
multidict==6.7.1
|
| 159 |
+
multiprocess==0.70.16
|
| 160 |
+
mypy_extensions==1.1.0
|
| 161 |
+
nbclient==0.10.4
|
| 162 |
+
nbconvert==7.17.1
|
| 163 |
+
nbformat==5.10.4
|
| 164 |
+
nest-asyncio==1.6.0
|
| 165 |
+
networkx==3.3
|
| 166 |
+
nltk==3.8.1
|
| 167 |
+
notebook_shim==0.2.4
|
| 168 |
+
numpy==1.26.4
|
| 169 |
+
nvidia-cublas==13.1.0.3
|
| 170 |
+
nvidia-cublas-cu12==12.8.4.1
|
| 171 |
+
nvidia-cuda-cupti==13.0.85
|
| 172 |
+
nvidia-cuda-cupti-cu12==12.8.90
|
| 173 |
+
nvidia-cuda-nvrtc==13.0.88
|
| 174 |
+
nvidia-cuda-nvrtc-cu12==12.8.93
|
| 175 |
+
nvidia-cuda-runtime==13.0.96
|
| 176 |
+
nvidia-cuda-runtime-cu12==12.8.90
|
| 177 |
+
nvidia-cudnn-cu12==9.19.0.56
|
| 178 |
+
nvidia-cudnn-cu13==9.19.0.56
|
| 179 |
+
nvidia-cufft==12.0.0.61
|
| 180 |
+
nvidia-cufft-cu12==11.3.3.83
|
| 181 |
+
nvidia-cufile==1.15.1.6
|
| 182 |
+
nvidia-cufile-cu12==1.13.1.3
|
| 183 |
+
nvidia-curand==10.4.0.35
|
| 184 |
+
nvidia-curand-cu12==10.3.9.90
|
| 185 |
+
nvidia-cusolver==12.0.4.66
|
| 186 |
+
nvidia-cusolver-cu12==11.7.3.90
|
| 187 |
+
nvidia-cusparse==12.6.3.3
|
| 188 |
+
nvidia-cusparse-cu12==12.5.8.93
|
| 189 |
+
nvidia-cusparselt-cu12==0.7.1
|
| 190 |
+
nvidia-cusparselt-cu13==0.8.0
|
| 191 |
+
nvidia-nccl-cu12==2.28.9
|
| 192 |
+
nvidia-nccl-cu13==2.28.9
|
| 193 |
+
nvidia-nvjitlink==13.0.88
|
| 194 |
+
nvidia-nvjitlink-cu12==12.8.93
|
| 195 |
+
nvidia-nvshmem-cu12==3.4.5
|
| 196 |
+
nvidia-nvshmem-cu13==3.4.5
|
| 197 |
+
nvidia-nvtx==13.0.85
|
| 198 |
+
nvidia-nvtx-cu12==12.8.90
|
| 199 |
+
omegaconf==2.3.0
|
| 200 |
+
open-clip-torch==2.24.0
|
| 201 |
+
opencv-python-headless==4.9.0.80
|
| 202 |
+
optree==0.10.0
|
| 203 |
+
orjson==3.11.8
|
| 204 |
+
overrides==7.7.0
|
| 205 |
+
packaging==26.2
|
| 206 |
+
pandas==2.1.4
|
| 207 |
+
pandocfilters==1.5.1
|
| 208 |
+
parso @ file:///opt/conda/conda-bld/parso_1641458642106/work
|
| 209 |
+
pathspec==1.1.1
|
| 210 |
+
pathvalidate==3.3.1
|
| 211 |
+
pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work
|
| 212 |
+
pillow==10.3.0
|
| 213 |
+
pkginfo @ file:///croot/pkginfo_1679431160147/work
|
| 214 |
+
platformdirs @ file:///croot/platformdirs_1692205439124/work
|
| 215 |
+
pluggy @ file:///tmp/build/80754af9/pluggy_1648024709248/work
|
| 216 |
+
portalocker==3.2.0
|
| 217 |
+
prometheus_client==0.25.0
|
| 218 |
+
prompt-toolkit @ file:///croot/prompt-toolkit_1704404351921/work
|
| 219 |
+
propcache==0.4.1
|
| 220 |
+
protobuf==4.25.9
|
| 221 |
+
psutil==7.2.2
|
| 222 |
+
ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
|
| 223 |
+
pure-eval @ file:///opt/conda/conda-bld/pure_eval_1646925070566/work
|
| 224 |
+
pyarrow==24.0.0
|
| 225 |
+
pyarrow-hotfix==0.7
|
| 226 |
+
pycocoevalcap==1.2
|
| 227 |
+
pycocotools==2.0.11
|
| 228 |
+
pycosat @ file:///croot/pycosat_1696536503704/work
|
| 229 |
+
pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
|
| 230 |
+
pydantic==2.9.2
|
| 231 |
+
pydantic-settings==2.14.0
|
| 232 |
+
pydantic_core==2.23.4
|
| 233 |
+
pydot==4.0.1
|
| 234 |
+
pygit2==1.18.2
|
| 235 |
+
Pygments @ file:///croot/pygments_1684279966437/work
|
| 236 |
+
pygtrie==2.5.0
|
| 237 |
+
pyOpenSSL @ file:///croot/pyopenssl_1708380408460/work
|
| 238 |
+
pyparsing==3.3.2
|
| 239 |
+
PySocks @ file:///home/builder/ci_310/pysocks_1640793678128/work
|
| 240 |
+
python-dateutil==2.9.0.post0
|
| 241 |
+
python-dotenv==1.2.2
|
| 242 |
+
python-etcd==0.4.5
|
| 243 |
+
python-json-logger==4.1.0
|
| 244 |
+
pytz==2026.1.post1
|
| 245 |
+
PyWavelets==1.8.0
|
| 246 |
+
PyYAML==6.0.1
|
| 247 |
+
pyzmq==27.1.0
|
| 248 |
+
referencing @ file:///croot/referencing_1699012038513/work
|
| 249 |
+
regex==2026.4.4
|
| 250 |
+
requests==2.32.3
|
| 251 |
+
requests-toolbelt==1.0.0
|
| 252 |
+
rfc3339-validator==0.1.4
|
| 253 |
+
rfc3986-validator==0.1.1
|
| 254 |
+
rich==15.0.0
|
| 255 |
+
rouge_score==0.1.2
|
| 256 |
+
rpds-py @ file:///croot/rpds-py_1698945930462/work
|
| 257 |
+
ruamel.yaml @ file:///croot/ruamel.yaml_1666304550667/work
|
| 258 |
+
ruamel.yaml.clib @ file:///croot/ruamel.yaml.clib_1666302247304/work
|
| 259 |
+
s3fs==2026.3.0
|
| 260 |
+
s3transfer==0.16.1
|
| 261 |
+
sacrebleu==2.4.0
|
| 262 |
+
safetensors==0.4.2
|
| 263 |
+
scikit-image==0.25.2
|
| 264 |
+
scikit-learn==1.7.2
|
| 265 |
+
scipy==1.15.3
|
| 266 |
+
scmrepo==3.6.2
|
| 267 |
+
semver==3.0.4
|
| 268 |
+
Send2Trash==2.1.0
|
| 269 |
+
sentence-transformers==2.7.0
|
| 270 |
+
sentencepiece==0.2.0
|
| 271 |
+
sentry-sdk==2.58.0
|
| 272 |
+
setproctitle==1.3.7
|
| 273 |
+
shellingham==1.5.4
|
| 274 |
+
shortuuid==1.0.13
|
| 275 |
+
shtab==1.8.0
|
| 276 |
+
six==1.17.0
|
| 277 |
+
smmap==5.0.3
|
| 278 |
+
sortedcontainers==2.4.0
|
| 279 |
+
soupsieve @ file:///croot/soupsieve_1696347547217/work
|
| 280 |
+
sqltrie==0.11.2
|
| 281 |
+
stack-data @ file:///opt/conda/conda-bld/stack_data_1646927590127/work
|
| 282 |
+
sympy==1.14.0
|
| 283 |
+
tabulate==0.10.0
|
| 284 |
+
tenacity==9.1.4
|
| 285 |
+
terminado==0.18.1
|
| 286 |
+
threadpoolctl==3.6.0
|
| 287 |
+
tifffile==2025.5.10
|
| 288 |
+
timm==0.9.12
|
| 289 |
+
tinycss2==1.4.0
|
| 290 |
+
tokenizers==0.15.2
|
| 291 |
+
tomli @ file:///opt/conda/conda-bld/tomli_1657175507142/work
|
| 292 |
+
tomlkit==0.14.0
|
| 293 |
+
toolz @ file:///croot/toolz_1667464077321/work
|
| 294 |
+
torch==2.11.0+cu128
|
| 295 |
+
torchaudio==2.11.0+cu128
|
| 296 |
+
torchelastic==0.2.2
|
| 297 |
+
torchmetrics==1.9.0
|
| 298 |
+
torchvision==0.26.0+cu128
|
| 299 |
+
tornado==6.5.5
|
| 300 |
+
tqdm==4.66.2
|
| 301 |
+
traitlets @ file:///croot/traitlets_1671143879854/work
|
| 302 |
+
transformers==4.38.2
|
| 303 |
+
treelib==1.8.0
|
| 304 |
+
triton==3.6.0
|
| 305 |
+
truststore @ file:///croot/truststore_1695244293384/work
|
| 306 |
+
typer==0.25.0
|
| 307 |
+
types-dataclasses==0.6.6
|
| 308 |
+
typing-inspect==0.9.0
|
| 309 |
+
typing-inspection==0.4.2
|
| 310 |
+
typing_extensions==4.10.0
|
| 311 |
+
tzdata==2026.2
|
| 312 |
+
tzlocal==5.3.1
|
| 313 |
+
uri-template==1.3.0
|
| 314 |
+
urllib3==2.6.3
|
| 315 |
+
vine==5.1.0
|
| 316 |
+
voluptuous==0.16.0
|
| 317 |
+
wandb==0.26.1
|
| 318 |
+
wcwidth==0.7.0
|
| 319 |
+
webcolors==25.10.0
|
| 320 |
+
webencodings==0.5.1
|
| 321 |
+
websocket-client==1.9.0
|
| 322 |
+
wrapt==2.1.2
|
| 323 |
+
xxhash==3.7.0
|
| 324 |
+
yarl==1.23.0
|
| 325 |
+
zc.lockfile==4.0
|
| 326 |
+
zstandard @ file:///croot/zstandard_1677013143055/work
|
| 327 |
+
|
| 328 |
+
# 5/25: additional packages required
|
| 329 |
+
notebook==7.5.0
|
| 330 |
+
einops==0.8.2
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
# 5/30: added
|
| 334 |
+
umap-learn==0.5.12
|
| 335 |
+
|
| 336 |
+
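# 5/31 (가인): additional packages required. NOTE(review): opencv-python is pinned here while opencv-python-headless is pinned earlier in this file; both ship the cv2 module - confirm which one is intended.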
|
| 337 |
+
grad-cam==1.5.5
|
| 338 |
+
opencv-python==4.11.0.86
|
| 339 |
+
ttach==0.0.3
|
| 340 |
+
gradio==4.44.1
|
| 341 |
+
fastapi==0.112.4
|
| 342 |
+
starlette==0.38.6
|
src/caption/check_clip_score.py
ADDED
|
@@ -0,0 +1,440 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
from PIL import Image
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
from transformers import CLIPModel, CLIPProcessor
|
| 11 |
+
import os
|
| 12 |
+
from dotenv import load_dotenv
|
| 13 |
+
|
| 14 |
+
# ============================================================
|
| 15 |
+
# Settings
|
| 16 |
+
# ============================================================
|
| 17 |
+
|
| 18 |
+
load_dotenv()
|
| 19 |
+
|
| 20 |
+
# Read HF_TOKEN from the .env file
|
| 21 |
+
hf_token = os.getenv("HF_TOKEN")
|
| 22 |
+
|
| 23 |
+
# Set to True to check every class
|
| 24 |
+
# Set to False to check only one specific class
|
| 25 |
+
CHECK_ALL_CLASSES = True
|
| 26 |
+
|
| 27 |
+
# Root of the raw data, used as the base directory when checking every class
|
| 28 |
+
DATA_RAW_ROOT_DIR = Path("data/raw")
|
| 29 |
+
|
| 30 |
+
# Class folder to use when checking only one specific class (expected form: data/raw/<class_name>, e.g. data/raw/airplane)
|
| 31 |
+
# Only used when CHECK_ALL_CLASSES = False
|
| 32 |
+
TARGET_CLASS_DIR = Path("data/raw")
|
| 33 |
+
|
| 34 |
+
# Input JSON file
|
| 35 |
+
INPUT_JSON_PATH = Path("data/annotations/captions_flo_all.json")
|
| 36 |
+
|
| 37 |
+
# Output JSON file
|
| 38 |
+
OUTPUT_JSON_PATH = Path("data/annotations/clip_checked_flo_all.json")
|
| 39 |
+
|
| 40 |
+
# CLIP model to use
|
| 41 |
+
MODEL_NAME = "openai/clip-vit-base-patch32"
|
| 42 |
+
|
| 43 |
+
# Number of image-caption pairs processed per batch
|
| 44 |
+
BATCH_SIZE = 32
|
| 45 |
+
|
| 46 |
+
# Bottom percentage of scores to mark as fail / review
|
| 47 |
+
FAIL_BOTTOM_PERCENT = 10
|
| 48 |
+
REVIEW_BOTTOM_PERCENT = 20
|
| 49 |
+
|
| 50 |
+
print("경로 : " , INPUT_JSON_PATH)
|
| 51 |
+
|
| 52 |
+
# ============================================================
|
| 53 |
+
# JSON input / output
|
| 54 |
+
# ============================================================
|
| 55 |
+
|
| 56 |
+
def load_json(path: Path) -> list[dict[str, Any]]:
    """Read a UTF-8 encoded JSON file and return its top-level array.

    Args:
        path: Location of the JSON document to read.

    Returns:
        The parsed top-level list.

    Raises:
        ValueError: If the top-level JSON value is not a list.
    """
    with path.open("r", encoding="utf-8") as handle:
        parsed = json.load(handle)

    # Guard clause: anything other than a JSON array is rejected.
    if isinstance(parsed, list):
        return parsed

    raise ValueError("입력 JSON은 반드시 배열 형태여야 합니다.")
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def save_json(data: list[dict[str, Any]], path: Path) -> None:
    """Write ``data`` to ``path`` as indented, UTF-8 encoded JSON.

    Missing parent directories are created first. Non-ASCII characters are
    written as-is rather than as ``\\u`` escapes.

    Args:
        data: Records to serialize.
        path: Destination file; overwritten if it already exists.
    """
    output_dir = path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    with path.open("w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4, ensure_ascii=False)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
# ============================================================
|
| 74 |
+
# Class / path handling
|
| 75 |
+
# ============================================================
|
| 76 |
+
|
| 77 |
+
def get_target_class_name() -> str:
    """Return the last path component of ``TARGET_CLASS_DIR``.

    Example: when ``TARGET_CLASS_DIR`` is ``data/raw/airplane`` the result
    is ``"airplane"``.
    """
    class_dir = TARGET_CLASS_DIR
    return class_dir.name
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def get_class_name_from_image_value(image_value: str) -> str:
    """Extract the leading folder name from a JSON ``image`` value.

    Backslashes are treated as path separators. For
    ``"airplane/hf_airplane_001.jpg"`` the result is ``"airplane"``.

    Args:
        image_value: Relative image path as stored in the JSON.

    Returns:
        The first path component, or ``""`` when the value has fewer than
        two components (i.e. no folder part).
    """
    components = Path(image_value.replace("\\", "/")).parts
    return components[0] if len(components) >= 2 else ""
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def is_target_item(item: dict[str, Any]) -> bool:
    """Decide whether a JSON record should be checked.

    With ``CHECK_ALL_CLASSES`` enabled every record is accepted. Otherwise a
    record is accepted only when the first folder of its ``image`` value
    equals the name of ``TARGET_CLASS_DIR``.
    """
    if not CHECK_ALL_CLASSES:
        record_class = get_class_name_from_image_value(str(item.get("image", "")))
        return record_class == get_target_class_name()

    return True
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def resolve_image_path(image_value: str) -> Path:
    """Map a JSON ``image`` value to a path on disk.

    Given ``"image": "airplane/hf_airplane_001.jpg"``:

    * all-class mode (``CHECK_ALL_CLASSES`` is true):
      ``DATA_RAW_ROOT_DIR / image`` -> ``data/raw/airplane/hf_airplane_001.jpg``
    * single-class mode: ``TARGET_CLASS_DIR / <file name>``
      -> ``data/raw/airplane/hf_airplane_001.jpg``

    Backslashes in ``image_value`` are treated as path separators.
    """
    relative = Path(image_value.replace("\\", "/"))

    if not CHECK_ALL_CLASSES:
        # Single-class mode keeps only the file name; the folder comes from
        # TARGET_CLASS_DIR.
        return TARGET_CLASS_DIR / relative.name

    return DATA_RAW_ROOT_DIR / relative
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def load_image(image_path: Path) -> Image.Image | None:
    """Open an image file and return an RGB copy of it.

    Args:
        image_path: File to open.

    Returns:
        The image converted to RGB, or ``None`` when opening or converting
        raises any exception (best-effort: the caller reports such files
        instead of aborting).
    """
    try:
        with Image.open(image_path) as opened:
            rgb_image = opened.convert("RGB")
            return rgb_image.copy()
    except Exception:
        # Deliberately broad: any failure is reported to the caller as None.
        return None
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# ============================================================
|
| 145 |
+
# Flatten captions
|
| 146 |
+
# ============================================================
|
| 147 |
+
|
| 148 |
+
def flatten_caption_items(data: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Expand every selected record into one entry per caption.

    A record with three captions yields three image-caption pairs.

    Args:
        data: Records loaded from the input JSON.

    Returns:
        A tuple ``(target_data, flat_items)``. ``target_data`` holds the
        records accepted by ``is_target_item`` in input order. ``flat_items``
        holds one dict per caption; its ``item_index`` is the record's
        position inside ``target_data``.
    """
    selected = []
    pairs = []

    for record in data:
        if not is_target_item(record):
            continue

        # Position the record is about to take inside `selected`.
        position = len(selected)
        selected.append(record)

        image_ref = str(record.get("image", ""))
        raw_captions = record.get("captions", [])
        # A non-list "captions" value is treated as "no captions".
        caption_list = raw_captions if isinstance(raw_captions, list) else []

        pairs.extend(
            {
                "item_index": position,
                "caption_index": caption_position,
                "image": image_ref,
                "class": record.get("class", ""),
                "split": record.get("split", ""),
                "caption": str(caption_text).strip(),
            }
            for caption_position, caption_text in enumerate(caption_list)
        )

    return selected, pairs
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
# ============================================================
|
| 183 |
+
# CLIP score computation
|
| 184 |
+
# ============================================================
|
| 185 |
+
|
| 186 |
+
@torch.no_grad()
def compute_clip_scores(
    flat_items: list[dict[str, Any]],
    model: CLIPModel,
    processor: CLIPProcessor,
    device: torch.device
) -> list[dict[str, Any]]:
    """Score every image-caption pair with CLIP, ``BATCH_SIZE`` pairs at a time.

    Args:
        flat_items: Pairs as produced by ``flatten_caption_items``; each needs
            an ``image`` and a ``caption`` entry.
        model: CLIP model used to embed images and captions.
        processor: Processor that tokenizes captions and preprocesses images.
        device: Device the processor outputs are moved to.

    Returns:
        One record per input pair: the pair's own fields plus
        ``resolved_image_path``, ``clip_cosine``, ``clip_score``,
        ``clip_status`` and ``clip_reason``. Pairs whose image cannot be
        loaded get status ``"missing_image"``, pairs with an empty caption
        get ``"empty_caption"`` (both with ``None`` scores); scored pairs get
        status ``"pending"``. Within a batch, unscorable pairs are appended
        before the scored ones.
    """

    def _unscored(pair: dict[str, Any], path_text: str, status: str, reason: str) -> dict[str, Any]:
        # Record for a pair that is skipped without running the model.
        return {
            **pair,
            "resolved_image_path": path_text,
            "clip_cosine": None,
            "clip_score": None,
            "clip_status": status,
            "clip_reason": reason,
        }

    results = []

    for offset in tqdm(range(0, len(flat_items), BATCH_SIZE), desc="computing CLIP scores"):
        scorable = []
        batch_images = []
        batch_captions = []

        for pair in flat_items[offset:offset + BATCH_SIZE]:
            image_path = resolve_image_path(pair["image"])
            path_text = str(image_path).replace("\\", "/")
            image = load_image(image_path)

            if image is None:
                results.append(
                    _unscored(
                        pair,
                        path_text,
                        "missing_image",
                        f"image file could not be opened: {image_path}",
                    )
                )
                continue

            caption = pair["caption"]

            if not caption:
                results.append(
                    _unscored(pair, path_text, "empty_caption", "caption is empty")
                )
                continue

            scorable.append({**pair, "resolved_image_path": path_text})
            batch_images.append(image)
            batch_captions.append(caption)

        # Nothing in this batch can be scored; skip the model call.
        if not scorable:
            continue

        encoded = processor(
            text=batch_captions,
            images=batch_images,
            return_tensors="pt",
            padding=True,
            truncation=True
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        outputs = model(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            pixel_values=encoded["pixel_values"]
        )

        # L2-normalize both embeddings so the row-wise dot product is the
        # cosine similarity of each image with its own caption.
        image_features = F.normalize(outputs.image_embeds, p=2, dim=1)
        text_features = F.normalize(outputs.text_embeds, p=2, dim=1)
        cosine_scores = (image_features * text_features).sum(dim=1)

        for pair, cosine in zip(scorable, cosine_scores):
            cosine_value = float(cosine.detach().cpu().item())
            # Negative similarities are clamped to 0 before scaling by 2.5.
            clip_score = 2.5 * max(cosine_value, 0.0)

            results.append({
                **pair,
                "clip_cosine": round(cosine_value, 6),
                "clip_score": round(clip_score, 6),
                "clip_status": "pending",
                "clip_reason": ""
            })

    return results
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
# ============================================================
|
| 284 |
+
# pass / review / fail classification
|
| 285 |
+
# ============================================================
|
| 286 |
+
|
| 287 |
+
def assign_clip_status(results: list[dict[str, Any]]) -> None:
|
| 288 |
+
valid_scores = [
|
| 289 |
+
result["clip_score"]
|
| 290 |
+
for result in results
|
| 291 |
+
if isinstance(result.get("clip_score"), float)
|
| 292 |
+
]
|
| 293 |
+
|
| 294 |
+
if not valid_scores:
|
| 295 |
+
return
|
| 296 |
+
|
| 297 |
+
fail_threshold = np.percentile(valid_scores, FAIL_BOTTOM_PERCENT)
|
| 298 |
+
review_threshold = np.percentile(valid_scores, REVIEW_BOTTOM_PERCENT)
|
| 299 |
+
|
| 300 |
+
for result in results:
|
| 301 |
+
clip_score = result.get("clip_score")
|
| 302 |
+
|
| 303 |
+
if clip_score is None:
|
| 304 |
+
continue
|
| 305 |
+
|
| 306 |
+
if clip_score <= fail_threshold:
|
| 307 |
+
result["clip_status"] = "fail"
|
| 308 |
+
result["clip_reason"] = f"clip score is in the bottom {FAIL_BOTTOM_PERCENT}%"
|
| 309 |
+
elif clip_score <= review_threshold:
|
| 310 |
+
result["clip_status"] = "review"
|
| 311 |
+
result["clip_reason"] = f"clip score is in the bottom {REVIEW_BOTTOM_PERCENT}%"
|
| 312 |
+
else:
|
| 313 |
+
result["clip_status"] = "pass"
|
| 314 |
+
result["clip_reason"] = "clip score is acceptable"
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
# ============================================================
|
| 318 |
+
# Attach results to the original JSON structure
|
| 319 |
+
# ============================================================
|
| 320 |
+
|
| 321 |
+
def attach_results_to_data(
    target_data: list[dict[str, Any]],
    results: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Attach per-caption check results to their source records.

    Every record in ``target_data`` gets a fresh ``caption_checks`` list,
    which is then filled in ``(item_index, caption_index)`` order.

    Args:
        target_data: Records to annotate; modified in place.
        results: Per-pair results; each needs ``item_index``,
            ``caption_index`` and ``caption``. Other fields are optional.

    Returns:
        The same ``target_data`` list that was passed in.
    """
    for entry in target_data:
        entry["caption_checks"] = []

    ordered = sorted(
        results,
        key=lambda scored: (scored["item_index"], scored["caption_index"])
    )

    for scored in ordered:
        owner = target_data[scored["item_index"]]
        owner["caption_checks"].append({
            "caption_index": scored["caption_index"],
            "caption": scored["caption"],
            "resolved_image_path": scored.get("resolved_image_path"),
            "clip_cosine": scored.get("clip_cosine"),
            "clip_score": scored.get("clip_score"),
            "clip_status": scored.get("clip_status"),
            "clip_reason": scored.get("clip_reason", "")
        })

    return target_data
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
# ============================================================
|
| 353 |
+
# Summary output
|
| 354 |
+
# ============================================================
|
| 355 |
+
|
| 356 |
+
def print_summary(
    target_data: list[dict[str, Any]],
    flat_items: list[dict[str, Any]],
    results: list[dict[str, Any]]
) -> None:
    """Print a run summary: mode, item counts, status counts, score statistics.

    Args:
        target_data: Records that were selected for checking.
        flat_items: Image-caption pairs derived from ``target_data``.
        results: Per-pair results carrying ``clip_status`` / ``clip_score``.
    """
    # Count statuses in first-seen order; a missing status counts as "unknown".
    status_count = {}
    for record in results:
        label = record.get("clip_status", "unknown")
        status_count[label] = status_count.get(label, 0) + 1

    # Only float scores take part in the statistics (unscored pairs hold None).
    valid_scores = [
        record["clip_score"]
        for record in results
        if isinstance(record.get("clip_score"), float)
    ]

    print("\n===== CLIP Score Summary =====")
    print(f"check all classes: {CHECK_ALL_CLASSES}")

    if not CHECK_ALL_CLASSES:
        print(f"target class dir: {TARGET_CLASS_DIR}")
        print(f"target class name: {get_target_class_name()}")
    else:
        print(f"data raw root dir: {DATA_RAW_ROOT_DIR}")

    print(f"target images: {len(target_data)}")
    print(f"target image-caption pairs: {len(flat_items)}")
    print(f"status count: {status_count}")

    if not valid_scores:
        return

    print(f"min score: {min(valid_scores):.4f}")
    print(f"max score: {max(valid_scores):.4f}")
    print(f"mean score: {np.mean(valid_scores):.4f}")
    print(f"bottom {FAIL_BOTTOM_PERCENT}% threshold: {np.percentile(valid_scores, FAIL_BOTTOM_PERCENT):.4f}")
    print(f"bottom {REVIEW_BOTTOM_PERCENT}% threshold: {np.percentile(valid_scores, REVIEW_BOTTOM_PERCENT):.4f}")
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
# ============================================================
|
| 394 |
+
# Entry point
|
| 395 |
+
# ============================================================
|
| 396 |
+
|
| 397 |
+
def main():
    """Run the CLIP-score check end to end.

    Validates the configured paths, loads the CLIP model and processor,
    scores every image-caption pair from the input JSON, assigns a status to
    each result, writes the annotated data to ``OUTPUT_JSON_PATH`` and prints
    a summary.

    Raises:
        FileNotFoundError: If the input JSON or the configured image
            directory does not exist.
        ValueError: If no records are selected for checking.
    """
    if not INPUT_JSON_PATH.exists():
        raise FileNotFoundError(f"input file not found: {INPUT_JSON_PATH}")

    # Only the directory that matches the current mode has to exist.
    if CHECK_ALL_CLASSES and not DATA_RAW_ROOT_DIR.exists():
        raise FileNotFoundError(f"data raw root directory not found: {DATA_RAW_ROOT_DIR}")
    if not CHECK_ALL_CLASSES and not TARGET_CLASS_DIR.exists():
        raise FileNotFoundError(f"target class directory not found: {TARGET_CLASS_DIR}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"device: {device}")

    print(f"loading model: {MODEL_NAME}")
    model = CLIPModel.from_pretrained(MODEL_NAME, token=hf_token).to(device)
    processor = CLIPProcessor.from_pretrained(MODEL_NAME, token=hf_token)
    model.eval()

    target_data, flat_items = flatten_caption_items(load_json(INPUT_JSON_PATH))

    if not target_data:
        raise ValueError("검수 대상 데이터가 없습니다. CHECK_ALL_CLASSES 또는 TARGET_CLASS_DIR 설정을 확인하세요.")

    results = compute_clip_scores(
        flat_items=flat_items,
        model=model,
        processor=processor,
        device=device
    )

    # assign_clip_status is called for its effect on ``results``; its return
    # value is not used here.
    assign_clip_status(results)

    save_json(attach_results_to_data(target_data, results), OUTPUT_JSON_PATH)

    print_summary(target_data, flat_items, results)
    print(f"\nsaved: {OUTPUT_JSON_PATH}")
|
| 437 |
+
|
| 438 |
+
|
| 439 |
+
if __name__ == "__main__":
|
| 440 |
+
main()
|
src/caption/generate_captions_blip.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os # 파일/폴더 탐색
|
| 2 |
+
import json # JSON 저장
|
| 3 |
+
import random # 데이터 섞기
|
| 4 |
+
import torch # GPU 사용
|
| 5 |
+
import re # 정규식 (문장 필터링)
|
| 6 |
+
from collections import defaultdict # 클래스별 그룹화
|
| 7 |
+
from PIL import Image # 이미지 로드
|
| 8 |
+
from transformers import BlipProcessor, BlipForConditionalGeneration # BLIP
|
| 9 |
+
from sentence_transformers import SentenceTransformer, util # SBERT
|
| 10 |
+
|
| 11 |
+
# ----------------------
|
| 12 |
+
# 1. 설정
|
| 13 |
+
# ----------------------
|
| 14 |
+
# Root folder of the images, laid out as raw/<class>/<image>.
ROOT_DIR = "data/raw"
# File name of the resulting annotation JSON.
OUTPUT_JSON = "annotation.json"

# Number of captions to keep per image (3 or 5 recommended).
TARGET_CAPTIONS = 3
# Cosine-similarity cutoff: a caption scoring above this against an already
# kept caption is dropped as a near-duplicate.
SIM_THRESHOLD = 0.85
# Minimum number of words; shorter captions are discarded.
MIN_WORDS = 3
# Maximum number of generate/filter rounds per image.
MAX_ATTEMPTS = 10

# Split proportions for train / val / test.
TRAIN_RATIO = 0.7
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# Use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device : ", device)
# ----------------------
# 2. Model loading
# ----------------------
# Processor: turns images into model inputs and decodes generated tokens.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

# Caption generation model.
blip_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)

# Sentence embedder used for caption-to-caption similarity.
embedder = SentenceTransformer("all-MiniLM-L6-v2", device=device)
|
| 41 |
+
|
| 42 |
+
# ----------------------
|
| 43 |
+
# 3. 캡션 생성 함수
|
| 44 |
+
# ----------------------
|
| 45 |
+
def generate_captions(image, n):
    """Sample candidate captions for ``image`` with the BLIP model.

    Args:
        image: Image passed to the BLIP processor.
        n: Number of sequences requested from ``generate``.

    Returns:
        A list of decoded captions, each stripped and lower-cased.
    """
    model_inputs = processor(images=image, return_tensors="pt").to(device)

    # Sampling (instead of deterministic decoding) is what gives varied captions.
    sampling_options = {
        "do_sample": True,
        "top_k": 50,
        "top_p": 0.95,
        "temperature": 0.9,
        "num_return_sequences": n,
        "max_length": 30,
    }
    token_sequences = blip_model.generate(**model_inputs, **sampling_options)

    # Decode each token sequence back into text.
    captions = []
    for sequence in token_sequences:
        decoded = processor.decode(sequence, skip_special_tokens=True)
        captions.append(decoded.strip().lower())
    return captions
|
| 63 |
+
|
| 64 |
+
# ----------------------
|
| 65 |
+
# 4. 기본 품질 필터
|
| 66 |
+
# ----------------------
|
| 67 |
+
def basic_filter(captions):
    """Drop captions that fail simple quality heuristics.

    A caption is kept only when it has at least ``MIN_WORDS`` words, at least
    60% of its words are distinct, and it contains nothing but lowercase
    ASCII letters, digits and whitespace.

    Args:
        captions: Iterable of caption strings.

    Returns:
        The accepted captions, in their original order.
    """

    def is_acceptable(caption):
        tokens = caption.split()
        long_enough = len(tokens) >= MIN_WORDS
        # Too many repeated words suggests a degenerate, looping generation.
        varied_enough = len(set(tokens)) >= len(tokens) * 0.6
        if not (long_enough and varied_enough):
            return False
        # Any character outside [a-z0-9] and whitespace rejects the caption.
        return re.search(r"[^a-z0-9\s]", caption) is None

    return [caption for caption in captions if is_acceptable(caption)]
|
| 85 |
+
|
| 86 |
+
# ----------------------
|
| 87 |
+
# 5. 키워드 추출
|
| 88 |
+
# ----------------------
|
| 89 |
+
def extract_keywords(caption):
    """Return the set of whitespace-separated words in ``caption`` that are not stopwords.

    Matching is case-sensitive and no punctuation is stripped.
    """
    stopwords = {"a","the","on","in","at","with","and","of","to","is","are"}
    return {token for token in caption.split() if token not in stopwords}
|
| 92 |
+
|
| 93 |
+
# ----------------------
|
| 94 |
+
# 6. 유사도 + 키워드 필터
|
| 95 |
+
# ----------------------
|
| 96 |
+
def advanced_filter(captions):
    """Greedily keep captions that differ enough from those already kept.

    Captions are visited in order. A caption is rejected as soon as, for some
    previously kept caption, either the embedding cosine similarity exceeds
    ``SIM_THRESHOLD`` or more than 70% of its own keywords also appear in
    that kept caption.

    Args:
        captions: List of caption strings.

    Returns:
        The kept captions, in their original order.
    """
    if not captions:
        return []

    # Embed every caption once up front.
    embeddings = embedder.encode(captions, convert_to_tensor=True)

    kept_indices = []
    for idx, caption in enumerate(captions):
        keywords = extract_keywords(caption)

        for kept in kept_indices:
            similarity = util.cos_sim(embeddings[idx], embeddings[kept]).item()
            if similarity > SIM_THRESHOLD:
                break  # semantically too close to a kept caption

            shared = keywords & extract_keywords(captions[kept])
            # max(..., 1) guards against division by zero when there are no keywords.
            if len(shared) / max(len(keywords), 1) > 0.7:
                break  # keyword overlap too high
        else:
            # No kept caption clashed with this one.
            kept_indices.append(idx)

    return [captions[idx] for idx in kept_indices]
|
| 128 |
+
|
| 129 |
+
# ----------------------
|
| 130 |
+
# 7. 캡션 생성 루프
|
| 131 |
+
# ----------------------
|
| 132 |
+
def get_captions(image):
    """Collect up to ``TARGET_CAPTIONS`` sufficiently distinct captions for ``image``.

    Repeats generate -> basic filter -> similarity filter until enough
    captions are collected or ``MAX_ATTEMPTS`` rounds have run.

    Args:
        image: Image passed on to ``generate_captions``.

    Returns:
        A list with at most ``TARGET_CAPTIONS`` captions; it can be shorter
        (or empty) when the attempts run out.
    """
    final_caps = []
    attempts = 0

    while len(final_caps) < TARGET_CAPTIONS and attempts < MAX_ATTEMPTS:
        needed = TARGET_CAPTIONS - len(final_caps)

        # Over-generate so that filtering still leaves enough candidates.
        new_caps = basic_filter(generate_captions(image, needed * 3))

        # De-duplicate while preserving order. advanced_filter is greedy in
        # list order, so keeping the already accepted captions first means
        # they are re-checked first and normally retained. The previous
        # list(set(...)) scrambled the order on every run, which could evict
        # accepted captions and made the output non-reproducible.
        combined = list(dict.fromkeys(final_caps + new_caps))
        filtered = advanced_filter(combined)

        final_caps = filtered[:TARGET_CAPTIONS]
        attempts += 1

    return final_caps
|
| 149 |
+
|
| 150 |
+
# ----------------------
|
| 151 |
+
# 8. 데이터 수집
|
| 152 |
+
# ----------------------
|
| 153 |
+
dataset = []

# Sorted listings make the processing order (and the output) deterministic;
# os.listdir returns entries in arbitrary order.
for class_name in sorted(os.listdir(ROOT_DIR)):
    class_path = os.path.join(ROOT_DIR, class_name)

    # Only class sub-folders are of interest.
    if not os.path.isdir(class_path):
        continue

    for filename in sorted(os.listdir(class_path)):
        if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
            continue

        path = os.path.join(class_path, filename)

        # Close the file handle deterministically, and skip unreadable files
        # instead of aborting the whole run: nothing is written to disk until
        # the very end, so a single bad image would otherwise lose all work.
        try:
            with Image.open(path) as source_image:
                image = source_image.convert("RGB")
        except OSError as error:
            print(f"\n[skip] could not read {path}: {error}")
            continue

        captions = get_captions(image)

        dataset.append({
            "image": f"{class_name}/{filename}",  # path relative to ROOT_DIR
            "class": class_name,                  # class label
            "captions": captions                  # list of captions
        })

        print(f"\n{class_name}/{filename}")
        for i, c in enumerate(captions):
            print(f"{i+1}. {c}")
|
| 179 |
+
|
| 180 |
+
# ----------------------
|
| 181 |
+
# 9. Stratified Split
|
| 182 |
+
# ----------------------
|
| 183 |
+
# Group the collected records by class so every class is split on its own.
class_groups = defaultdict(list)

for item in dataset:
    class_groups[item["class"]].append(item)

# A dedicated seeded generator makes the split reproducible across runs
# without touching the global random state. With the previous unseeded
# shuffle, re-running the script moved images between train/val/test.
split_rng = random.Random(42)

train_set, val_set, test_set = [], [], []

for class_name in sorted(class_groups):
    # Sort before shuffling so the result does not depend on collection order.
    items = sorted(class_groups[class_name], key=lambda entry: entry["image"])
    split_rng.shuffle(items)

    total = len(items)

    train_end = max(1, int(total * TRAIN_RATIO))  # at least one training item
    val_end = train_end + max(1, int(total * VAL_RATIO))

    train_set += items[:train_end]
    val_set += items[train_end:val_end]
    # The test split takes whatever remains; TEST_RATIO is not used directly.
    test_set += items[val_end:]

# Attach the split label to every record.
for split_name, members in (("train", train_set), ("val", val_set), ("test", test_set)):
    for item in members:
        item["split"] = split_name

# Merge back into a single list.
dataset = train_set + val_set + test_set

# ----------------------
# 10. Save JSON
# ----------------------
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=4, ensure_ascii=False)

print(f"\n완료: {OUTPUT_JSON} 생성됨")
|
src/caption/generate_captions_florence2.py
ADDED
|
@@ -0,0 +1,345 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import random
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
from PIL import Image
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
from transformers import AutoProcessor, Florence2ForConditionalGeneration
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# =========================================================
|
| 14 |
+
# 1. 설정값
|
| 15 |
+
# =========================================================
|
| 16 |
+
|
| 17 |
+
# Caption every class:    "data/raw"
# Caption a single class: "data/raw/apple"
INPUT_IMAGE_DIR = "data/raw"

# Base path used to build "image" values such as "pizza/hf_pizza_001.jpg".
DATA_RAW_ROOT = "data/raw"

# Where the resulting JSON is written.
OUTPUT_JSON_PATH = "data/annotations/captions_flo_all.json"

# The florence-community checkpoints are recommended for transformers 5.7.0.
#   base-ft:  lightweight, fine-tuned on downstream tasks
#   large-ft: heavier, but may give better quality
MODEL_ID = "florence-community/Florence-2-base-ft"
# MODEL_ID = "florence-community/Florence-2-large-ft"

# Name of the environment variable (read from .env) holding the Hugging Face token.
# Public models may work without it, but supplying a token is more reliable.
HF_TOKEN_ENV_NAME = "HF_TOKEN"

# Split proportions, 7 : 1.5 : 1.5 by default.
TRAIN_RATIO = 0.7
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# Seed that makes the split reproducible.
RANDOM_SEED = 42

# One caption is generated per task, i.e. three captions per image.
# These are caption tasks described in the Florence-2 documentation.
CAPTION_TASKS = [
    "<CAPTION>",
    "<DETAILED_CAPTION>",
    "<MORE_DETAILED_CAPTION>",
]

# Generation options.
NUM_BEAMS = 3
MAX_NEW_TOKENS = 64

# Number of newly captioned images between intermediate saves.
SAVE_EVERY = 220

# Whether images already present in the output JSON are skipped.
SKIP_ALREADY_DONE = True

# Accepted image file extensions.
IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp", ".bmp"]
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# =========================================================
|
| 68 |
+
# 2. 이미지 목록 가져오기
|
| 69 |
+
# =========================================================
|
| 70 |
+
|
| 71 |
+
def get_image_list():
    """Collect every image under ``INPUT_IMAGE_DIR``.

    Returns:
        A list of dicts sorted by path, each with ``path`` (the path found
        on disk), ``image`` (POSIX path relative to ``DATA_RAW_ROOT``) and
        ``class`` (first component of that relative path).

    Raises:
        FileNotFoundError: If ``INPUT_IMAGE_DIR`` does not exist.
        ValueError: Raised by ``Path.relative_to`` when an image does not
            live under ``DATA_RAW_ROOT``.
    """
    raw_root = Path(DATA_RAW_ROOT).resolve()
    input_dir = Path(INPUT_IMAGE_DIR).resolve()

    if not input_dir.exists():
        raise FileNotFoundError(f"입력 경로가 없습니다: {input_dir}")

    # Walk the tree recursively and keep only paths with an accepted extension.
    candidates = (
        candidate
        for candidate in sorted(input_dir.rglob("*"))
        if candidate.suffix.lower() in IMAGE_EXTENSIONS
    )

    image_list = []
    for image_path in candidates:
        # e.g. /workspace/data/raw/pizza/hf_pizza_001.jpg -> pizza/hf_pizza_001.jpg
        relative_image_path = image_path.resolve().relative_to(raw_root).as_posix()

        image_list.append({
            "path": image_path,
            "image": relative_image_path,
            # e.g. pizza/hf_pizza_001.jpg -> pizza
            "class": relative_image_path.split("/")[0],
        })

    return image_list
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# =========================================================
|
| 104 |
+
# 3. train / val / test 나누기
|
| 105 |
+
# =========================================================
|
| 106 |
+
|
| 107 |
+
def add_split(image_list):
    """Assign a per-class train/val/test split to every image record.

    The global ``random`` generator is re-seeded with ``RANDOM_SEED``, each
    class is shuffled, and the first ``train_count`` items become "train",
    the next ``val_count`` become "val" and the rest "test". The records are
    modified in place (a ``"split"`` key is added).

    Args:
        image_list: Records with at least a ``"class"`` key.

    Returns:
        The same records, grouped by class in first-seen class order.
    """
    random.seed(RANDOM_SEED)

    total_ratio = TRAIN_RATIO + VAL_RATIO + TEST_RATIO

    # Bucket the records by class.
    class_map = {}
    for record in image_list:
        class_map.setdefault(record["class"], []).append(record)

    result = []

    # Split every class independently.
    for members in class_map.values():
        random.shuffle(members)

        member_count = len(members)
        train_count = round(member_count * TRAIN_RATIO / total_ratio)
        val_count = round(member_count * VAL_RATIO / total_ratio)
        val_boundary = train_count + val_count

        for position, member in enumerate(members):
            if position >= val_boundary:
                member["split"] = "test"
            elif position >= train_count:
                member["split"] = "val"
            else:
                member["split"] = "train"

            result.append(member)

    return result
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# =========================================================
|
| 149 |
+
# 4. Florence-2 모델 준비
|
| 150 |
+
# =========================================================
|
| 151 |
+
|
| 152 |
+
def load_model():
    """Load the Florence-2 model and processor.

    Reads the Hugging Face token from the environment (after loading
    ``.env``), chooses the device and dtype, and puts the model in eval mode.

    Returns:
        A ``(model, processor, device, torch_dtype)`` tuple.
    """
    load_dotenv()
    hf_token = os.getenv(HF_TOKEN_ENV_NAME)

    # CPU -> float32; GPU -> bfloat16 when supported, float16 otherwise.
    if not torch.cuda.is_available():
        device, torch_dtype = "cpu", torch.float32
    elif torch.cuda.is_bf16_supported():
        device, torch_dtype = "cuda", torch.bfloat16
    else:
        device, torch_dtype = "cuda", torch.float16

    print(f"device: {device}")
    print(f"dtype: {torch_dtype}")
    print(f"model: {MODEL_ID}")

    processor = AutoProcessor.from_pretrained(MODEL_ID, token=hf_token)

    model = Florence2ForConditionalGeneration.from_pretrained(
        MODEL_ID,
        dtype=torch_dtype,
        token=hf_token,
    )
    model = model.to(device)
    model.eval()

    return model, processor, device, torch_dtype
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
# =========================================================
|
| 191 |
+
# 5. 이미지 1장 캡셔닝
|
| 192 |
+
# =========================================================
|
| 193 |
+
|
| 194 |
+
def make_caption(image, task, model, processor, device, torch_dtype):
    """Generate one caption for ``image`` using the given Florence-2 task prompt.

    Args:
        image: Image to caption; its ``size`` is passed to post-processing.
        task: Task prompt, e.g. ``"<CAPTION>"``.
        model: Model whose ``generate`` method is called.
        processor: Processor used for encoding, decoding and post-processing.
        device: Device the inputs are moved to.
        torch_dtype: Dtype passed along when moving the inputs.

    Returns:
        The stripped caption text; empty when the parsed result has no entry
        for ``task``.
    """
    encoded = processor(text=task, images=image, return_tensors="pt")
    encoded = encoded.to(device, torch_dtype)

    with torch.no_grad():
        token_ids = model.generate(
            **encoded,
            max_new_tokens=MAX_NEW_TOKENS,
            num_beams=NUM_BEAMS,
            do_sample=False,
        )

    # Special tokens are kept here and handed to post_process_generation as-is.
    decoded = processor.batch_decode(token_ids, skip_special_tokens=False)
    parsed = processor.post_process_generation(
        decoded[0],
        task=task,
        image_size=image.size,
    )

    caption = parsed.get(task, "")
    # Coerce any non-string result to text before stripping.
    text = caption if isinstance(caption, str) else str(caption)
    return text.strip()
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
def make_three_captions(image_path, model, processor, device, torch_dtype):
    """Generate one caption per entry of ``CAPTION_TASKS`` for a single image.

    Args:
        image_path: Path of the image file to open.
        model: Model forwarded to ``make_caption``.
        processor: Processor forwarded to ``make_caption``.
        device: Device forwarded to ``make_caption``.
        torch_dtype: Dtype forwarded to ``make_caption``.

    Returns:
        A list of captions in the same order as ``CAPTION_TASKS``.
    """
    # Open via a context manager so the file handle is released right away.
    # Without it, multi-frame formats keep the handle open until garbage
    # collection. convert() returns a separate image, so it stays usable
    # after the file is closed.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    return [
        make_caption(
            image=image,
            task=task,
            model=model,
            processor=processor,
            device=device,
            torch_dtype=torch_dtype,
        )
        for task in CAPTION_TASKS
    ]
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
# =========================================================
|
| 251 |
+
# 6. 기존 JSON 읽기 / 저장하기
|
| 252 |
+
# =========================================================
|
| 253 |
+
|
| 254 |
+
def load_existing_result():
    """Load previously saved results from ``OUTPUT_JSON_PATH``.

    Returns:
        A dict mapping each record's ``"image"`` value to the record, or an
        empty dict when the file does not exist.
    """
    output_path = Path(OUTPUT_JSON_PATH)
    if not output_path.exists():
        return {}

    saved_records = json.loads(output_path.read_text(encoding="utf-8"))

    # Index by image key so already-captioned images can be looked up quickly.
    return {record["image"]: record for record in saved_records}
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
def save_result(result_map):
    """Write the results to ``OUTPUT_JSON_PATH`` as a JSON list sorted by image.

    The data is written to a temporary sibling file and then moved over the
    target. This function doubles as the periodic checkpoint, and an
    interruption in the middle of an in-place write would leave truncated
    JSON that the next resume could not load.

    Args:
        result_map: Dict of result records, each with an ``"image"`` key.
    """
    output_path = Path(OUTPUT_JSON_PATH)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    result_list = sorted(result_map.values(), key=lambda record: record["image"])

    # Same directory as the target so the final rename stays on one filesystem.
    temp_path = output_path.with_name(output_path.name + ".tmp")

    with temp_path.open("w", encoding="utf-8") as f:
        json.dump(result_list, f, ensure_ascii=False, indent=4)

    temp_path.replace(output_path)
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
# =========================================================
|
| 283 |
+
# 7. 실행
|
| 284 |
+
# =========================================================
|
| 285 |
+
|
| 286 |
+
def main():
    """Caption every image and store the results as JSON.

    Builds the image list with split labels, loads earlier results, captions
    each remaining image, saves every ``SAVE_EVERY`` newly captioned images
    and once more at the end, then prints the counters. Any exception raised
    while handling an image is reported and counted, and the loop continues.
    """
    print("이미지 목록을 읽는 중입니다.")

    image_list = add_split(get_image_list())

    print(f"총 이미지 수: {len(image_list)}")

    result_map = load_existing_result()
    model, processor, device, torch_dtype = load_model()

    new_count = skip_count = fail_count = 0

    for item in tqdm(image_list):
        image_key = item["image"]

        already_done = SKIP_ALREADY_DONE and image_key in result_map
        if already_done:
            skip_count += 1
            continue

        try:
            captions = make_three_captions(
                image_path=item["path"],
                model=model,
                processor=processor,
                device=device,
                torch_dtype=torch_dtype,
            )

            result_map[image_key] = {
                "image": item["image"],
                "class": item["class"],
                "captions": captions,
                "split": item["split"],
            }
            new_count += 1

            # Periodic checkpoint.
            if new_count % SAVE_EVERY == 0:
                save_result(result_map)

        except Exception as e:
            # Best effort: report the failure and move on to the next image.
            fail_count += 1
            print(f"\n실패한 이미지: {item['path']}")
            print(f"에러 내용: {e}")

    save_result(result_map)

    print("\n캡셔닝 완료")
    print(f"새로 처리한 이미지 수: {new_count}")
    print(f"건너뛴 이미지 수: {skip_count}")
    print(f"실패한 이미지 수: {fail_count}")
    print(f"저장 위치: {OUTPUT_JSON_PATH}")
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
if __name__ == "__main__":
|
| 345 |
+
main()
|
src/caption/generate_captions_git.py
ADDED
|
@@ -0,0 +1,600 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import math
|
| 3 |
+
import random
|
| 4 |
+
import re
|
| 5 |
+
from collections import defaultdict
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Dict, List, Tuple
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import torch
|
| 12 |
+
from dotenv import load_dotenv
|
| 13 |
+
from PIL import Image
|
| 14 |
+
from tqdm import tqdm
|
| 15 |
+
from transformers import AutoModelForCausalLM, AutoProcessor
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# ============================================================
|
| 19 |
+
# 1. 설정값
|
| 20 |
+
# ============================================================
|
| 21 |
+
|
| 22 |
+
load_dotenv()

# Hugging Face token read from HF_TOKEN (e.g. defined in .env).
hf_token = os.getenv("HF_TOKEN")

# Root path of the source images.
#   - caption every class:    "data/raw"
#   - caption a single class: "data/raw/apple"
INPUT_IMAGE_PATH = "data/raw/airplane"

# Root used when building the "image" field;
# the JSON stores values such as "pizza/hf_pizza_001.jpg".
DATA_RAW_ROOT = "data/raw"

# Where the results are written.
OUTPUT_JSON_PATH = "data/annotations/captions_git.json"

# Where the list of failed images is written.
ERROR_JSON_PATH = "data/annotations/caption_git_errors.json"

# GIT checkpoint.
#   default recommendation: microsoft/git-base-coco
#   larger alternative:     microsoft/git-large-coco
MODEL_NAME = "microsoft/git-large-coco"

# Number of captions generated per image.
CAPTIONS_PER_IMAGE = 3

# Split proportions, 7 : 1.5 : 1.5 by default.
SPLIT_RATIO = {
    "train": 0.7,
    "val": 0.15,
    "test": 0.15,
}

# Seed that makes the split reproducible.
RANDOM_SEED = 42

# Inference batch size.
# If GPU memory runs out, lower it step by step: 8 -> 4 -> 2 -> 1.
BATCH_SIZE = 8

# Device selection.
#   "auto": GPU when CUDA is available, otherwise CPU
#   can also be set explicitly: "cuda", "cpu"
DEVICE = "auto"

# dtype selection.
#   "auto": float16 on CUDA, float32 on CPU
#   can also be set explicitly: "float32", "float16", "bfloat16"
TORCH_DTYPE = "auto"

# Interval between intermediate saves, so that partial results survive an
# error in the middle of a long run.
SAVE_EVERY_N_IMAGES = 100

# Whether images already captioned in an existing OUTPUT_JSON_PATH are skipped.
RESUME_FROM_EXISTING_JSON = True

# Supported image file extensions.
SUPPORTED_EXTENSIONS = {
    ".jpg", ".jpeg", ".png", ".webp", ".bmp"
}

# Caption generation settings.
# num_beams must be >= num_return_sequences.
# NOTE(review): the original note here said num_beam_groups is used to get
# slightly different captions, but no such key is set below - confirm intent.
GENERATION_CONFIG = {
    "max_length": 40,
    "num_beams": 5,
    "num_return_sequences": CAPTIONS_PER_IMAGE,
    "early_stopping": True,
    "no_repeat_ngram_size": 2,
}

# Whether to top up with sampled captions when beam search returns too many
# duplicates.
ENABLE_SAMPLING_FALLBACK = True

SAMPLING_FALLBACK_CONFIG = {
    "max_length": 40,
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.8,
    "num_return_sequences": CAPTIONS_PER_IMAGE,
    "no_repeat_ngram_size": 2,
}
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# ============================================================
|
| 113 |
+
# 2. 데이터 구조
|
| 114 |
+
# ============================================================
|
| 115 |
+
|
| 116 |
+
@dataclass
class ImageItem:
    """One image record handled by the captioning script."""

    # Location of the image file.
    path: Path
    # Value for the JSON "image" field - presumably the path relative to
    # DATA_RAW_ROOT such as "pizza/hf_pizza_001.jpg"; verify against the builder.
    image_field: str
    # Class label of the image.
    class_name: str
    # Split label; empty until one is assigned.
    split: str = ""
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
# ============================================================
|
| 125 |
+
# 3. 유틸 함수
|
| 126 |
+
# ============================================================
|
| 127 |
+
|
| 128 |
+
def resolve_device() -> torch.device:
    """Return the torch device selected by the ``DEVICE`` setting.

    ``"auto"`` picks CUDA when it is available and the CPU otherwise; any
    other value is passed to ``torch.device`` unchanged.
    """
    if DEVICE != "auto":
        return torch.device(DEVICE)

    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(device_name)
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def resolve_dtype(device: torch.device) -> torch.dtype:
    """Return the torch dtype selected by the ``TORCH_DTYPE`` setting.

    Args:
        device: Device the model will run on.

    Returns:
        For ``"auto"``: float16 on CUDA, float32 otherwise. For an explicit
        setting: the named dtype, except that half-precision requests on the
        CPU fall back to float32 with a warning.

    Raises:
        ValueError: If ``TORCH_DTYPE`` is not a supported name.
    """
    if TORCH_DTYPE == "auto":
        return torch.float32 if device.type != "cuda" else torch.float16

    dtype_map = {
        "float32": torch.float32,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
    }

    if TORCH_DTYPE not in dtype_map:
        raise ValueError(f"지원하지 않는 TORCH_DTYPE입니다: {TORCH_DTYPE}")

    requested = dtype_map[TORCH_DTYPE]

    # On the CPU anything other than float32 (i.e. float16/bfloat16) is downgraded.
    if device.type == "cpu" and requested is not torch.float32:
        print("[WARN] CPU에서는 float16/bfloat16이 불안정할 수 있어 float32로 변경합니다.")
        return torch.float32

    return requested
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def normalize_caption(text: str) -> str:
    """Trim *text* and collapse every whitespace run into a single space."""
    trimmed = text.strip()
    return re.sub(r"\s+", " ", trimmed)
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def deduplicate_captions(captions: List[str]) -> List[str]:
    """Normalize captions and drop empties and case-insensitive repeats.

    The first occurrence of each caption is kept, in input order, with the
    casing it had after normalization.
    """
    seen_keys = set()
    unique = []

    for raw in captions:
        cleaned = normalize_caption(raw)
        lowered = cleaned.lower()

        if cleaned and lowered not in seen_keys:
            seen_keys.add(lowered)
            unique.append(cleaned)

    return unique
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def ensure_caption_count(captions: List[str], target_count: int) -> List[str]:
    """Return ``target_count`` captions built from the deduplicated input.

    Surplus captions are cut off. A shortfall is filled by repeating the
    last unique caption, or with empty strings when nothing usable remains.
    """
    unique = deduplicate_captions(captions)

    if len(unique) >= target_count:
        return unique[:target_count]

    if not unique:
        return [""] * target_count

    shortfall = target_count - len(unique)
    return unique + [unique[-1]] * shortfall
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def save_json(path: Path, data: List[dict]) -> None:
    """Write *data* to *path* as pretty-printed UTF-8 JSON.

    The payload is first written to a sibling ``<name>.tmp`` file and then
    moved over *path*, so an interruption mid-write cannot leave a truncated
    JSON file behind. This matters because the output file is also read back
    when resuming a run.

    Args:
        path: Destination file; missing parent directories are created.
        data: JSON-serializable list of records.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    tmp_path = path.with_name(path.name + ".tmp")

    with tmp_path.open("w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    # Path.replace overwrites an existing destination in a single rename.
    tmp_path.replace(path)
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def load_existing_json(path: Path) -> Dict[str, dict]:
    """Load a previously saved result list and index it by its ``"image"`` value.

    Returns an empty dict when *path* does not exist. Records whose
    ``"image"`` entry is missing or falsy are skipped; when an image appears
    more than once, the later record wins.
    """
    if not path.exists():
        return {}

    with path.open("r", encoding="utf-8") as f:
        entries = json.load(f)

    return {
        entry["image"]: entry
        for entry in entries
        if entry.get("image")
    }
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
# ============================================================
|
| 222 |
+
# 4. 이미지 수집
|
| 223 |
+
# ============================================================
|
| 224 |
+
|
| 225 |
+
def collect_images(input_path: Path, data_raw_root: Path) -> List[ImageItem]:
    """Recursively gather supported image files under *input_path*.

    Args:
        input_path: Directory (or file) to scan with ``rglob``.
        data_raw_root: Root that every image must live under; the path
            relative to it becomes the JSON image key and its first component
            becomes the class name.

    Returns:
        One ``ImageItem`` per image, ordered by sorted path.

    Raises:
        FileNotFoundError: If *input_path* does not exist.
        RuntimeError: If no supported image is found.
        ValueError: If an image is outside *data_raw_root* or not inside a
            class folder.
    """
    if not input_path.exists():
        raise FileNotFoundError(f"입력 경로가 존재하지 않습니다: {input_path}")

    image_paths = sorted(
        candidate
        for candidate in input_path.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in SUPPORTED_EXTENSIONS
    )

    if not image_paths:
        raise RuntimeError(f"이미지를 찾지 못했습니다: {input_path}")

    items = []

    for image_path in image_paths:
        try:
            relative_path = image_path.relative_to(data_raw_root)
        except ValueError:
            raise ValueError(
                f"이미지 경로가 DATA_RAW_ROOT 하위에 있어야 합니다.\n"
                f"image_path={image_path}\n"
                f"DATA_RAW_ROOT={data_raw_root}"
            )

        # At least "<class>/<file>" is required so a class name can be derived.
        if len(relative_path.parts) < 2:
            raise ValueError(
                f"이미지는 클래스 폴더 하위에 있어야 합니다: {image_path}\n"
                f"예: data/raw/pizza/hf_pizza_001.jpg"
            )

        record = ImageItem(
            path=image_path,
            image_field=relative_path.as_posix(),
            class_name=relative_path.parts[0],
        )
        items.append(record)

    return items
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
# ============================================================
|
| 271 |
+
# 5. split 분리
|
| 272 |
+
# ============================================================
|
| 273 |
+
|
| 274 |
+
def calculate_split_counts(total_count: int) -> Dict[str, int]:
    """Divide *total_count* among the splits in ``SPLIT_RATIO``.

    Each split first receives the floor of its proportional share. The
    leftover items are then handed out one at a time to the splits with the
    largest fractional remainder, so the counts always sum to *total_count*.
    """
    ratio_sum = sum(SPLIT_RATIO.values())

    raw_counts = {}
    counts = {}
    for name, ratio in SPLIT_RATIO.items():
        exact_share = total_count * ratio / ratio_sum
        raw_counts[name] = exact_share
        counts[name] = int(math.floor(exact_share))

    remaining = total_count - sum(counts.values())

    # Splits with the largest fractional part receive the leftovers first.
    by_fraction = sorted(
        raw_counts,
        key=lambda name: raw_counts[name] - counts[name],
        reverse=True,
    )

    for name in by_fraction[:remaining]:
        counts[name] += 1

    return counts
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
def assign_splits(items: List[ImageItem]) -> List[ImageItem]:
    """Assign a train/val/test split to every item, stratified by class.

    Items are grouped by ``class_name``, each group is shuffled with an RNG
    seeded from ``RANDOM_SEED``, and consecutive slices sized by
    ``calculate_split_counts`` are labelled "train", "val" and "test".
    The items are modified in place and the same list is returned.
    """
    rng = random.Random(RANDOM_SEED)

    by_class = defaultdict(list)
    for item in items:
        by_class[item.class_name].append(item)

    for members in by_class.values():
        rng.shuffle(members)
        counts = calculate_split_counts(len(members))

        cursor = 0
        for split_name in ("train", "val", "test"):
            upper = cursor + counts.get(split_name, 0)

            for member in members[cursor:upper]:
                member.split = split_name

            cursor = upper

    return items
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
# ============================================================
|
| 329 |
+
# 6. 모델 로드
|
| 330 |
+
# ============================================================
|
| 331 |
+
|
| 332 |
+
def load_model():
    """Load the processor and causal-LM for ``MODEL_NAME`` and prepare it for inference.

    Returns:
        Tuple ``(model, processor, device, torch_dtype)``; the model has been
        moved to the resolved device and switched to eval mode.
    """
    device = resolve_device()
    torch_dtype = resolve_dtype(device)

    for line in (
        f"[INFO] device={device}",
        f"[INFO] dtype={torch_dtype}",
        f"[INFO] model={MODEL_NAME}",
    ):
        print(line)

    processor = AutoProcessor.from_pretrained(MODEL_NAME, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        dtype=torch_dtype,
        token=hf_token,
    )

    model.to(device)
    model.eval()

    return model, processor, device, torch_dtype
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
# ============================================================
|
| 354 |
+
# 7. 캡셔닝
|
| 355 |
+
# ============================================================
|
| 356 |
+
|
| 357 |
+
def load_batch_images(batch_items: List[ImageItem]) -> Tuple[List[Image.Image], List[ImageItem], List[dict]]:
    """Open every image of a batch as RGB, collecting failures instead of raising.

    Returns:
        ``(images, valid_items, errors)``. ``images`` and ``valid_items`` are
        index-aligned; ``errors`` holds one dict (image, class, split, error
        text) for each item that could not be opened or converted.
    """
    images = []
    valid_items = []
    errors = []

    for item in batch_items:
        try:
            with Image.open(item.path) as handle:
                rgb_image = handle.convert("RGB")
        except Exception as e:
            failure = {
                "image": item.image_field,
                "class": item.class_name,
                "split": item.split,
                "error": str(e),
            }
            errors.append(failure)
            continue

        images.append(rgb_image)
        valid_items.append(item)

    return images, valid_items, errors
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
@torch.inference_mode()
def generate_batch_captions(
    model,
    processor,
    device: torch.device,
    torch_dtype: torch.dtype,
    images: List[Image.Image],
) -> List[List[str]]:
    """Generate captions for a batch of images using ``GENERATION_CONFIG``.

    Returns:
        One deduplicated caption list per input image, in input order. The
        decoded outputs are split into consecutive groups of
        ``CAPTIONS_PER_IMAGE``, so a list may be shorter than that after
        duplicates are removed.
    """
    encoded = processor(images=images, return_tensors="pt")

    model_inputs = {}
    for name, tensor in encoded.items():
        tensor = tensor.to(device)
        if name == "pixel_values":
            # Pixel data is additionally cast to the model's dtype.
            tensor = tensor.to(dtype=torch_dtype)
        model_inputs[name] = tensor

    generated_ids = model.generate(
        **model_inputs,
        **GENERATION_CONFIG,
    )

    decoded = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )

    return [
        deduplicate_captions(
            decoded[index * CAPTIONS_PER_IMAGE:(index + 1) * CAPTIONS_PER_IMAGE]
        )
        for index in range(len(images))
    ]
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
@torch.inference_mode()
def generate_sampling_fallback_captions(
    model,
    processor,
    device: torch.device,
    torch_dtype: torch.dtype,
    image: Image.Image,
) -> List[str]:
    """Sample extra captions for one image using ``SAMPLING_FALLBACK_CONFIG``.

    Returns:
        The decoded captions after normalization and deduplication.
    """
    encoded = processor(images=[image], return_tensors="pt")

    model_inputs = {}
    for name, tensor in encoded.items():
        tensor = tensor.to(device)
        if name == "pixel_values":
            # Pixel data is additionally cast to the model's dtype.
            tensor = tensor.to(dtype=torch_dtype)
        model_inputs[name] = tensor

    generated_ids = model.generate(
        **model_inputs,
        **SAMPLING_FALLBACK_CONFIG,
    )

    sampled = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )

    return deduplicate_captions(sampled)
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
def make_result_item(item: ImageItem, captions: List[str]) -> dict:
    """Build the JSON record (image, class, captions, split) for one image."""
    record = {}
    record["image"] = item.image_field
    record["class"] = item.class_name
    record["captions"] = captions
    record["split"] = item.split
    return record
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
def caption_images(
    model,
    processor,
    device: torch.device,
    torch_dtype: torch.dtype,
    items: List[ImageItem],
    existing_result_map: Dict[str, dict],
) -> Tuple[Dict[str, dict], List[dict]]:
    """Caption every item that has no stored result yet, checkpointing as it goes.

    Args:
        model, processor, device, torch_dtype: Values returned by ``load_model``.
        items: All images of the run, in output order.
        existing_result_map: Results from a previous run keyed by image field;
            images found here are skipped.

    Returns:
        ``(result_map, error_list)``: results keyed by image field (existing
        plus new), and one error dict per image that could not be loaded or
        captioned.
    """
    result_map = dict(existing_result_map)
    error_list = []

    target_items = [
        item
        for item in items
        if item.image_field not in result_map
    ]

    print(f"[INFO] 전체 이미지 수: {len(items)}")
    print(f"[INFO] 기존 결과 수: {len(existing_result_map)}")
    print(f"[INFO] 새로 캡셔닝할 이미지 수: {len(target_items)}")

    processed_count = 0
    # Value of processed_count at the most recent checkpoint.
    last_saved_count = 0

    for batch_start in tqdm(range(0, len(target_items), BATCH_SIZE), desc="Captioning"):
        batch_items = target_items[batch_start:batch_start + BATCH_SIZE]

        images, valid_items, errors = load_batch_images(batch_items)
        error_list.extend(errors)

        if not images:
            continue

        try:
            batch_captions = generate_batch_captions(
                model=model,
                processor=processor,
                device=device,
                torch_dtype=torch_dtype,
                images=images,
            )

            for image, item, captions in zip(images, valid_items, batch_captions):
                if ENABLE_SAMPLING_FALLBACK and len(captions) < CAPTIONS_PER_IMAGE:
                    fallback_captions = generate_sampling_fallback_captions(
                        model=model,
                        processor=processor,
                        device=device,
                        torch_dtype=torch_dtype,
                        image=image,
                    )

                    captions = deduplicate_captions(captions + fallback_captions)

                captions = ensure_caption_count(
                    captions=captions,
                    target_count=CAPTIONS_PER_IMAGE,
                )

                result_map[item.image_field] = make_result_item(
                    item=item,
                    captions=captions,
                )

                processed_count += 1

        except Exception as e:
            print("[ERROR] 배치 캡셔닝 실패")
            print(f"[ERROR] {type(e).__name__}: {e}")

            for item in valid_items:
                # Items captioned before the failure already have a result;
                # listing them as errors too would double-count them.
                if item.image_field in result_map:
                    continue

                error_list.append({
                    "image": item.image_field,
                    "class": item.class_name,
                    "split": item.split,
                    "error": str(e),
                })

        # processed_count grows by a whole batch at a time, so an exact
        # "% SAVE_EVERY_N_IMAGES == 0" test would skip most checkpoints.
        # Save once at least N new images have accumulated since the last save.
        if SAVE_EVERY_N_IMAGES > 0 and processed_count - last_saved_count >= SAVE_EVERY_N_IMAGES:
            current_results = [
                result_map[item.image_field]
                for item in items
                if item.image_field in result_map
            ]
            save_json(Path(OUTPUT_JSON_PATH), current_results)
            save_json(Path(ERROR_JSON_PATH), error_list)
            last_saved_count = processed_count

    return result_map, error_list
|
| 548 |
+
|
| 549 |
+
|
| 550 |
+
# ============================================================
|
| 551 |
+
# 8. main
|
| 552 |
+
# ============================================================
|
| 553 |
+
|
| 554 |
+
def main():
    """Run the pipeline: collect images, assign splits, caption, save JSON."""
    output_json_path = Path(OUTPUT_JSON_PATH)
    error_json_path = Path(ERROR_JSON_PATH)

    items = assign_splits(
        collect_images(
            input_path=Path(INPUT_IMAGE_PATH).resolve(),
            data_raw_root=Path(DATA_RAW_ROOT).resolve(),
        )
    )

    # Reuse results from an earlier run only when resuming is enabled.
    if RESUME_FROM_EXISTING_JSON:
        existing_result_map = load_existing_json(output_json_path)
    else:
        existing_result_map = {}

    model, processor, device, torch_dtype = load_model()

    result_map, error_list = caption_images(
        model=model,
        processor=processor,
        device=device,
        torch_dtype=torch_dtype,
        items=items,
        existing_result_map=existing_result_map,
    )

    # Emit results in the same order as the collected items.
    final_results = []
    for item in items:
        if item.image_field in result_map:
            final_results.append(result_map[item.image_field])

    save_json(output_json_path, final_results)
    save_json(error_json_path, error_list)

    print("[DONE] 캡셔닝 완료")
    print(f"[DONE] 결과 저장: {output_json_path}")
    print(f"[DONE] 에러 저장: {error_json_path}")
    print(f"[DONE] 정상 결과 수: {len(final_results)}")
    print(f"[DONE] 에러 수: {len(error_list)}")


if __name__ == "__main__":
    main()
|
src/caption/generate_captions_vit_gpt2.py
ADDED
|
@@ -0,0 +1,457 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import random
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from collections import defaultdict
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
from PIL import Image, UnidentifiedImageError
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# =========================================================
|
| 13 |
+
# 1. 설정값
|
| 14 |
+
# =========================================================
|
| 15 |
+
|
| 16 |
+
# Caption every class:
#     INPUT_IMAGE_DIR = "/workspace/data/raw"
#
# Caption a single class only:
#     INPUT_IMAGE_DIR = "/workspace/data/raw/apple"
INPUT_IMAGE_DIR = "/workspace/data/raw/airplane"

OUTPUT_JSON_PATH = "/workspace/data/annotations/annotation.json"

MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"

CAPTIONS_PER_IMAGE = 3

SPLIT_RATIO = {"train": 0.7, "val": 0.15, "test": 0.15}

RANDOM_SEED = 42

BATCH_SIZE = 8

IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".webp", ".bmp"]

# "auto":  decide automatically (data/raw -> all classes,
#          data/raw/apple -> only the apple class)
# "raw":   treat everything under INPUT_IMAGE_DIR as the whole raw folder
# "class": treat INPUT_IMAGE_DIR itself as a single class folder
INPUT_MODE = "auto"

# Whether to strip the trailing period from caption sentences.
REMOVE_TRAILING_PERIOD = True

# Beam-search settings.
GENERATION_CONFIG = dict(
    max_new_tokens=32,
    num_beams=8,
    num_return_sequences=CAPTIONS_PER_IMAGE,
    early_stopping=True,
    no_repeat_ngram_size=2,
    repetition_penalty=1.1,
    length_penalty=0.8,
)

# Top up with sampling when the beam-search results contain duplicates.
ENABLE_SAMPLING_FALLBACK = True

SAMPLING_FALLBACK_CONFIG = dict(
    max_new_tokens=32,
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    num_return_sequences=CAPTIONS_PER_IMAGE * 2,
    no_repeat_ngram_size=2,
    repetition_penalty=1.1,
)

MAX_FALLBACK_ROUNDS = 3

# Whether to allow duplicates to reach the target count when it still
# cannot be filled with unique captions.
FILL_WITH_DUPLICATES_IF_NEEDED = True
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# =========================================================
|
| 80 |
+
# 2. 기본 유틸 함수
|
| 81 |
+
# =========================================================
|
| 82 |
+
|
| 83 |
+
def validate_config():
    """Check the module-level settings for consistency before any work starts.

    Raises:
        ValueError: If the split ratios do not sum to 1 (within 1e-6), if
            ``num_beams`` is smaller than ``CAPTIONS_PER_IMAGE``, or if
            ``num_return_sequences`` differs from ``CAPTIONS_PER_IMAGE``.
    """
    total_ratio = sum(SPLIT_RATIO.values())
    deviation = abs(total_ratio - 1.0)

    if deviation > 1e-6:
        raise ValueError(f"SPLIT_RATIO의 합은 1이어야 합니다. 현재 합: {total_ratio}")

    beam_count = GENERATION_CONFIG["num_beams"]
    if beam_count < CAPTIONS_PER_IMAGE:
        raise ValueError("num_beams는 CAPTIONS_PER_IMAGE보다 크거나 같아야 합니다.")

    returned_per_image = GENERATION_CONFIG["num_return_sequences"]
    if returned_per_image != CAPTIONS_PER_IMAGE:
        raise ValueError("GENERATION_CONFIG의 num_return_sequences는 CAPTIONS_PER_IMAGE와 같아야 합니다.")
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def is_image_file(path: Path) -> bool:
    """Return True when the lower-cased suffix of *path* is in ``IMAGE_EXTENSIONS``."""
    extension = path.suffix.lower()
    return extension in IMAGE_EXTENSIONS
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def clean_caption(text: str) -> str:
    """Collapse whitespace in *text* and optionally drop trailing periods.

    When ``REMOVE_TRAILING_PERIOD`` is true, trailing periods are removed
    together with any spaces around them. A caption that ends in " ." would
    otherwise keep a dangling space, which then survives into the JSON and
    defeats duplicate detection.
    """
    caption = " ".join(text.strip().split())

    if REMOVE_TRAILING_PERIOD:
        # Strip any trailing mix of periods and spaces, not just periods.
        caption = caption.rstrip(". ")

    return caption
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def unique_captions(captions):
    """Clean each caption and keep the first occurrence of every distinct one.

    Comparison is case-insensitive; captions that are empty after cleaning
    are dropped. Input order is preserved.
    """
    seen_keys = set()
    kept = []

    for raw in captions:
        cleaned = clean_caption(raw)
        if not cleaned:
            continue

        lowered = cleaned.lower()
        if lowered in seen_keys:
            continue

        seen_keys.add(lowered)
        kept.append(cleaned)

    return kept
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def load_image(image_path: Path):
    """Open *image_path* and return it as an RGB image, or None on failure.

    The file is opened inside a ``with`` block so its handle is closed as
    soon as the RGB copy exists, instead of staying open for every image of
    a long run.

    Returns:
        The converted image, or ``None`` (after printing a ``[SKIP]``
        message) when the file cannot be identified or read.
    """
    try:
        with Image.open(image_path) as source:
            return source.convert("RGB")
    except (UnidentifiedImageError, OSError) as e:
        print(f"[SKIP] 이미지를 열 수 없습니다: {image_path} / error: {e}")
        return None
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# =========================================================
|
| 133 |
+
# 3. 이미지 목록 수집
|
| 134 |
+
# =========================================================
|
| 135 |
+
|
| 136 |
+
def has_direct_images(input_dir: Path) -> bool:
    """Return True if *input_dir* directly contains at least one image file."""
    return any(
        entry.is_file() and is_image_file(entry)
        for entry in input_dir.iterdir()
    )
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def get_relative_base_dir(input_dir: Path) -> Path:
    """
    Choose the base path that makes the JSON image value look like
    'class_folder/image_name'.

    Example 1)
    INPUT_IMAGE_DIR = /workspace/data/raw
    image file      = /workspace/data/raw/pizza/hf_pizza_001.jpg
    relative base   = /workspace/data/raw
    result          = pizza/hf_pizza_001.jpg

    Example 2)
    INPUT_IMAGE_DIR = /workspace/data/raw/apple
    image file      = /workspace/data/raw/apple/hf_apple_001.jpg
    relative base   = /workspace/data/raw
    result          = apple/hf_apple_001.jpg

    Raises ValueError when INPUT_MODE is not 'auto', 'raw' or 'class'.
    """
    if INPUT_MODE == "raw":
        return input_dir

    if INPUT_MODE == "class":
        return input_dir.parent

    if INPUT_MODE != "auto":
        raise ValueError("INPUT_MODE은 'auto', 'raw', 'class' 중 하나여야 합니다.")

    # auto: images sitting directly in the folder mean it is a class folder.
    return input_dir.parent if has_direct_images(input_dir) else input_dir
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def collect_image_records(input_dir: str):
    """Recursively collect image files under *input_dir* as record dicts.

    Each record holds ``"path"`` (the file path), ``"image"`` (POSIX path
    relative to the base chosen by ``get_relative_base_dir``) and
    ``"class"`` (first component of that relative path).

    Raises:
        FileNotFoundError: If *input_dir* does not exist.
        ValueError: If no image file is found.
    """
    input_path = Path(input_dir)

    if not input_path.exists():
        # The first word of this message was mojibake; restored to "이미지".
        raise FileNotFoundError(f"이미지 경로가 존재하지 않습니다: {input_path}")

    relative_base_dir = get_relative_base_dir(input_path)

    records = []

    for image_path in sorted(input_path.rglob("*")):
        if not image_path.is_file():
            continue

        if not is_image_file(image_path):
            continue

        relative_path = image_path.relative_to(relative_base_dir)
        relative_path_str = relative_path.as_posix()

        # If the image value is apple/xxx.jpg, the class is apple.
        class_name = relative_path.parts[0]

        records.append({
            "path": image_path,
            "image": relative_path_str,
            "class": class_name,
        })

    if not records:
        raise ValueError(f"캡셔닝할 이미지가 없습니다: {input_path}")

    return records
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
# =========================================================
|
| 210 |
+
# 4. train / val / test split 배정
|
| 211 |
+
# =========================================================
|
| 212 |
+
|
| 213 |
+
def assign_split(records):
    """Assign "train" / "val" / "test" to every record, stratified by class.

    Records are grouped by ``record["class"]`` and each group is shuffled
    with a private RNG seeded from ``RANDOM_SEED``. A private
    ``random.Random`` produces the same shuffle as seeding the global
    ``random`` module, without resetting the process-wide RNG state.

    Returns:
        A new list of the same record dicts (each now carrying ``"split"``),
        sorted by ``record["image"]``.
    """
    rng = random.Random(RANDOM_SEED)

    class_map = defaultdict(list)

    for record in records:
        class_map[record["class"]].append(record)

    result = []

    for items in class_map.values():
        rng.shuffle(items)

        total = len(items)
        # NOTE(review): both counts are truncated, so every leftover item
        # falls into "test" (a class with one image goes entirely to test).
        train_count = int(total * SPLIT_RATIO["train"])
        val_count = int(total * SPLIT_RATIO["val"])

        for idx, item in enumerate(items):
            if idx < train_count:
                item["split"] = "train"
            elif idx < train_count + val_count:
                item["split"] = "val"
            else:
                item["split"] = "test"

            result.append(item)

    result.sort(key=lambda x: x["image"])

    return result
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
# =========================================================
|
| 246 |
+
# 5. 모델 로드
|
| 247 |
+
# =========================================================
|
| 248 |
+
|
| 249 |
+
def load_model():
    """Load the captioning model, image processor and tokenizer for ``MODEL_NAME``.

    Returns:
        Tuple ``(model, processor, tokenizer, device)``; the model is on the
        chosen device and in eval mode.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    print(f"[INFO] device: {device}")
    print(f"[INFO] model: {MODEL_NAME}")

    model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
    processor = ViTImageProcessor.from_pretrained(MODEL_NAME)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Fall back to the EOS token when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model.config.pad_token_id = tokenizer.pad_token_id

    model.to(device)
    model.eval()

    return model, processor, tokenizer, device
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
# =========================================================
|
| 270 |
+
# 6. 캡션 생성
|
| 271 |
+
# =========================================================
|
| 272 |
+
|
| 273 |
+
def decode_output_ids(output_ids, tokenizer):
    """Decode generated token ids to text and clean every caption."""
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    cleaned = []
    for text in decoded:
        cleaned.append(clean_caption(text))

    return cleaned
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
@torch.no_grad()
def generate_by_beam_search(images, model, processor, tokenizer, device):
    """Caption a batch of images with the beam-search ``GENERATION_CONFIG``.

    Returns:
        One list per image, in input order, taken as consecutive slices of
        ``CAPTIONS_PER_IMAGE`` cleaned captions.
    """
    encoded = processor(images=images, return_tensors="pt")
    pixel_values = encoded.pixel_values.to(device)

    output_ids = model.generate(pixel_values, **GENERATION_CONFIG)
    captions = decode_output_ids(output_ids, tokenizer)

    return [
        captions[index * CAPTIONS_PER_IMAGE:(index + 1) * CAPTIONS_PER_IMAGE]
        for index in range(len(images))
    ]
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
@torch.no_grad()
def generate_by_sampling(image, model, processor, tokenizer, device):
    """Sample captions for one image with ``SAMPLING_FALLBACK_CONFIG``.

    Returns:
        The cleaned captions; duplicates are not removed here.
    """
    encoded = processor(images=[image], return_tensors="pt")
    pixel_values = encoded.pixel_values.to(device)

    sampled_ids = model.generate(pixel_values, **SAMPLING_FALLBACK_CONFIG)

    return decode_output_ids(sampled_ids, tokenizer)
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
def complete_caption_count(captions, original_candidates):
    """
    Primary goal:
    - produce CAPTIONS_PER_IMAGE captions with as few duplicates as possible.

    If the model keeps producing the same sentences, the unique captions may
    not reach that count. In that case, when
    FILL_WITH_DUPLICATES_IF_NEEDED=True, duplicates taken from
    original_candidates are allowed so the count is still met.
    """
    padded = unique_captions(captions)

    if len(padded) >= CAPTIONS_PER_IMAGE:
        return padded[:CAPTIONS_PER_IMAGE]

    if not FILL_WITH_DUPLICATES_IF_NEEDED:
        return padded

    for candidate in original_candidates:
        if len(padded) >= CAPTIONS_PER_IMAGE:
            break

        cleaned = clean_caption(candidate)
        if cleaned:
            padded.append(cleaned)

    return padded[:CAPTIONS_PER_IMAGE]
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
def generate_captions_for_batch(batch_records, model, processor, tokenizer, device):
    """Caption one batch of records and return their JSON result dicts.

    Records whose image cannot be loaded are skipped. Each remaining image
    is captioned by beam search; while it has fewer than
    ``CAPTIONS_PER_IMAGE`` unique captions, up to ``MAX_FALLBACK_ROUNDS``
    sampling passes are added (if enabled) before the count is finalized by
    ``complete_caption_count``.
    """
    loaded = []
    for record in batch_records:
        image = load_image(record["path"])
        if image is not None:
            loaded.append((record, image))

    if not loaded:
        return []

    valid_records = [record for record, _ in loaded]
    images = [image for _, image in loaded]

    beam_caption_groups = generate_by_beam_search(
        images=images,
        model=model,
        processor=processor,
        tokenizer=tokenizer,
        device=device,
    )

    results = []

    for record, image, beam_captions in zip(valid_records, images, beam_caption_groups):
        # Every caption seen, duplicates included; used later as filler.
        all_candidates = list(beam_captions)
        captions = unique_captions(beam_captions)

        if ENABLE_SAMPLING_FALLBACK:
            for _ in range(MAX_FALLBACK_ROUNDS):
                if len(captions) >= CAPTIONS_PER_IMAGE:
                    break

                sampled_captions = generate_by_sampling(
                    image=image,
                    model=model,
                    processor=processor,
                    tokenizer=tokenizer,
                    device=device,
                )

                all_candidates.extend(sampled_captions)
                captions = unique_captions(captions + sampled_captions)

        final_captions = complete_caption_count(
            captions=captions,
            original_candidates=all_candidates,
        )

        results.append({
            "image": record["image"],
            "class": record["class"],
            "captions": final_captions,
            "split": record["split"],
        })

    return results
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
# =========================================================
|
| 409 |
+
# 7. JSON 저장
|
| 410 |
+
# =========================================================
|
| 411 |
+
|
| 412 |
+
def save_json(data, output_path: str):
    """Write *data* to *output_path* as indented UTF-8 JSON and print a summary.

    Missing parent directories are created first.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open("w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print(f"[DONE] JSON 저장 완료: {destination}")
    print(f"[DONE] 총 이미지 수: {len(data)}")
|
| 421 |
+
|
| 422 |
+
|
| 423 |
+
# =========================================================
|
| 424 |
+
# 8. 실행
|
| 425 |
+
# =========================================================
|
| 426 |
+
|
| 427 |
+
def main():
    """Caption every collected image in batches and save the results as JSON.

    Steps: validate the configuration, collect image records from
    ``INPUT_IMAGE_DIR``, assign splits, load the model, caption the records
    ``BATCH_SIZE`` at a time, then write everything to ``OUTPUT_JSON_PATH``.
    """
    validate_config()

    records = assign_split(collect_image_records(INPUT_IMAGE_DIR))
    print(f"[INFO] 캡셔닝 대상 이미지 수: {len(records)}")

    model, processor, tokenizer, device = load_model()

    results = []
    batch_offsets = range(0, len(records), BATCH_SIZE)

    for offset in tqdm(batch_offsets, desc="captioning"):
        # Slicing past the end is safe, so the last batch may be shorter
        chunk = records[offset:offset + BATCH_SIZE]
        results.extend(
            generate_captions_for_batch(
                batch_records=chunk,
                model=model,
                processor=processor,
                tokenizer=tokenizer,
                device=device,
            )
        )

    save_json(results, OUTPUT_JSON_PATH)
|
| 454 |
+
|
| 455 |
+
|
| 456 |
+
if __name__ == "__main__":
|
| 457 |
+
main()
|
src/collection/check_class_counts.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
# ================================
|
| 4 |
+
# 0. 설정
|
| 5 |
+
# ================================
|
| 6 |
+
TARGET_COUNT = 60
|
| 7 |
+
MIN_RES = 256 # 해상도 256
|
| 8 |
+
PREFIX = "kg"
|
| 9 |
+
BASE_DIR = "./data/raw"
|
| 10 |
+
|
| 11 |
+
# ================================
|
| 12 |
+
# 1. 경로
|
| 13 |
+
# ================================
|
| 14 |
+
HOME = os.path.expanduser("~")
|
| 15 |
+
|
| 16 |
+
DATA_DIR = os.path.join(
|
| 17 |
+
HOME,
|
| 18 |
+
"Desktop",
|
| 19 |
+
"raw_kg"
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
THRESHOLD = TARGET_COUNT
|
| 23 |
+
|
| 24 |
+
# ================================
|
| 25 |
+
# 2. 클래스 목록
|
| 26 |
+
# ================================
|
| 27 |
+
# Class folder names expected directly under DATA_DIR.
# "siamese-cat" and "persian-cat" are hyphenated to match the folder names that
# collect_filtering_images.py creates from its CLASS_MAP keys; with underscores
# these two classes were never found and always reported as 0 images.
CLASS_LIST = [
    # Food and ingredients
    "pizza", "hamburger", "sushi", "pasta", "salad",
    "steak", "cup_cake", "sandwich", "waffle", "dumpling",

    # Animals
    "golden-retriever", "bulldog", "siamese-cat",
    "persian-cat", "elephant", "sheep", "horse",
    "penguin", "butterfly", "squirrel",

    # Flowers
    "rose", "sunflower", "daisy", "tulip", "dandelion",
    "lily", "lavender", "orchid", "iris", "marigold", "aster",

    # Fruit
    "apple", "banana", "strawberry", "orange",
    "carrot", "tomato", "cucumber",

    # Vehicles
    "car", "bicycle", "motorcycle", "airplane", "bus",

    # Fashion and accessories
    "t-shirt", "sneakers", "earrings", "glasses",
    "pants", "bracelet", "necklace",
]
|
| 52 |
+
|
| 53 |
+
# The filter below is strict (count < THRESHOLD), so the heading uses "미만"
# (fewer than) to agree with both the condition and the summary line.
print(f"{THRESHOLD}장 미만 클래스 목록 (0장 포함)\n")

low_classes = []

# ================================
# 3. Per-class image count
# ================================
for cls in sorted(CLASS_LIST):
    cls_path = os.path.join(DATA_DIR, cls)

    if os.path.isdir(cls_path):
        # Count regular files only; sub-directories are ignored
        with os.scandir(cls_path) as entries:
            count = sum(1 for entry in entries if entry.is_file())
    else:
        # A missing class folder (or a stray file with that name) counts as zero
        count = 0

    if count < THRESHOLD:
        print(f"{cls}: {count}장")
        low_classes.append((cls, count))

# ================================
# 4. Summary
# ================================
print("\n요약")
print(f"{THRESHOLD}장 미만 클래스 수: {len(low_classes)}개")
|
src/collection/collect_filtering_images.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import hashlib
|
| 3 |
+
from PIL import Image
|
| 4 |
+
|
| 5 |
+
# ================================
|
| 6 |
+
# 0. 설정
|
| 7 |
+
# ================================
|
| 8 |
+
TARGET_COUNT = 60
|
| 9 |
+
MIN_RES = 128 # 해상도 128
|
| 10 |
+
PREFIX = "kg"
|
| 11 |
+
BASE_DIR = "./data/raw"
|
| 12 |
+
|
| 13 |
+
# ================================
|
| 14 |
+
# 1. 클래스 + 유사어 매핑
|
| 15 |
+
# ================================
|
| 16 |
+
CLASS_MAP = {
|
| 17 |
+
# 음식
|
| 18 |
+
"pizza": ["pizza"],
|
| 19 |
+
"hamburger": ["hamburger"],
|
| 20 |
+
"sushi": ["sushi"],
|
| 21 |
+
"pasta": ["pasta", "spaghetti"],
|
| 22 |
+
"salad": ["salad"],
|
| 23 |
+
"steak": ["steak"],
|
| 24 |
+
"cup_cake": ["cup_cake", "cup cake"],
|
| 25 |
+
"sandwich": ["sandwich"],
|
| 26 |
+
"waffle": ["waffle"],
|
| 27 |
+
"dumpling": ["dumpling"],
|
| 28 |
+
|
| 29 |
+
# 동물
|
| 30 |
+
"golden-retriever": ["golden retriever"],
|
| 31 |
+
"bulldog": ["bulldog"],
|
| 32 |
+
"siamese-cat": ["siamese"],
|
| 33 |
+
"persian-cat": ["persian"],
|
| 34 |
+
"elephant": ["elephant"],
|
| 35 |
+
"sheep": ["sheep"],
|
| 36 |
+
"horse": ["horse"],
|
| 37 |
+
"penguin": ["penguin"],
|
| 38 |
+
"butterfly": ["butterfly"],
|
| 39 |
+
"squirrel": ["squirrel"],
|
| 40 |
+
|
| 41 |
+
# 꽃
|
| 42 |
+
"rose": ["rose"],
|
| 43 |
+
"sunflower": ["sunflower"],
|
| 44 |
+
"daisy": ["daisy"],
|
| 45 |
+
"tulip": ["tulip"],
|
| 46 |
+
"dandelion": ["dandelion"],
|
| 47 |
+
"lily": ["lily"],
|
| 48 |
+
"lavender": ["lavender"],
|
| 49 |
+
"orchid": ["orchid"],
|
| 50 |
+
"iris": ["iris"],
|
| 51 |
+
"marigold": ["marigold"],
|
| 52 |
+
"aster": ["aster"],
|
| 53 |
+
|
| 54 |
+
# 과일
|
| 55 |
+
"apple": ["apple"],
|
| 56 |
+
"banana": ["banana"],
|
| 57 |
+
"strawberry": ["strawberry"],
|
| 58 |
+
"orange": ["orange"],
|
| 59 |
+
"carrot": ["carrot"],
|
| 60 |
+
"tomato": ["tomato"],
|
| 61 |
+
"cucumber": ["cucumber"],
|
| 62 |
+
|
| 63 |
+
# 탈것
|
| 64 |
+
"car": ["car"],
|
| 65 |
+
"bicycle": ["bicycle"],
|
| 66 |
+
"motorcycle": ["motorcycle"],
|
| 67 |
+
"airplane": ["airplane"],
|
| 68 |
+
"bus": ["bus"],
|
| 69 |
+
|
| 70 |
+
# 패션 및 잡화
|
| 71 |
+
"t-shirt": ["t shirt", "t-shirt"],
|
| 72 |
+
"sneakers": ["sneakers"],
|
| 73 |
+
"earrings": ["earring", "earrings"],
|
| 74 |
+
"glasses": ["glasses"],
|
| 75 |
+
"pants": ["pants"],
|
| 76 |
+
"bracelet": ["bracelet"],
|
| 77 |
+
"necklace": ["necklace"]
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
# ================================
|
| 81 |
+
# 2. 경로
|
| 82 |
+
# ================================
|
| 83 |
+
HOME = os.path.expanduser("~")
|
| 84 |
+
|
| 85 |
+
SRC_ROOT = os.path.join(
|
| 86 |
+
HOME,
|
| 87 |
+
"Desktop",
|
| 88 |
+
"raw_full_kg",
|
| 89 |
+
"extracted"
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
DST_ROOT = os.path.join(
|
| 93 |
+
HOME,
|
| 94 |
+
"Desktop",
|
| 95 |
+
"raw_kg"
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
os.makedirs(DST_ROOT, exist_ok=True)
|
| 99 |
+
|
| 100 |
+
# ================================
|
| 101 |
+
# 3. 해상도 필터
|
| 102 |
+
# ================================
|
| 103 |
+
def is_valid_image(path, min_size=MIN_RES):
    """Return True when the image at ``path`` is at least ``min_size`` px on both sides.

    Args:
        path: File path handed to ``Image.open``.
        min_size: Minimum allowed width and height in pixels.

    Returns:
        False when the file cannot be opened as an image or is too small,
        True otherwise.
    """
    try:
        with Image.open(path) as img:
            width, height = img.size
    except Exception:
        # Unreadable / non-image files are simply invalid. ``Exception`` rather
        # than a bare ``except`` so Ctrl-C and SystemExit still propagate.
        return False

    return width >= min_size and height >= min_size
|
| 110 |
+
|
| 111 |
+
# ================================
|
| 112 |
+
# 4. 중복 제거
|
| 113 |
+
# ================================
|
| 114 |
+
def get_hash(path):
    """Return the MD5 hex digest of the file at ``path``, or None if it cannot be read.

    The digest is only used to spot byte-identical duplicates (see
    ``seen_hashes``), not for anything security-related.
    """
    digest = hashlib.md5()
    try:
        with open(path, "rb") as f:
            # Hash in 1 MiB chunks so large files are never fully loaded in memory
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
    except OSError:
        # Missing, unreadable or non-regular file: the caller treats None as "skip"
        return None
    return digest.hexdigest()
|
| 120 |
+
|
| 121 |
+
seen_hashes = set()
|
| 122 |
+
|
| 123 |
+
# ================================
|
| 124 |
+
# 5. 클래스 매칭
|
| 125 |
+
# ================================
|
| 126 |
+
def match_class(folder_name):
    """Map a source folder name to a target class from ``CLASS_MAP``.

    Both the folder name and each keyword are lower-cased and split into words
    after turning ``-`` and ``_`` into spaces. A keyword matches when every one
    of its words appears among the folder's words (order is not checked). The
    first matching class in ``CLASS_MAP`` iteration order wins.

    Returns:
        The matching ``CLASS_MAP`` key, or None when nothing matches.
    """
    def _words(text):
        return text.lower().replace("-", " ").replace("_", " ").split()

    folder_words = set(_words(folder_name))

    for target, keywords in CLASS_MAP.items():
        for kw in keywords:
            # Normalise keywords like the folder name; otherwise entries such
            # as "cup_cake" or "t-shirt" could never match.
            kw_words = _words(kw)

            # An empty keyword must not match everything (all([]) is True)
            if kw_words and all(word in folder_words for word in kw_words):
                return target

    return None
|
| 140 |
+
|
| 141 |
+
# ================================
|
| 142 |
+
# 6. 메인 로직
|
| 143 |
+
# ================================
|
| 144 |
+
# Next file number per class; numbering restarts at 1 on every run.
class_counter = {cls: 1 for cls in CLASS_MAP}

copied = 0
skipped = 0
no_match = 0

for root, dirs, files in os.walk(SRC_ROOT):

    for d in dirs:
        matched_class = match_class(d)

        if matched_class is None:
            no_match += 1
            continue

        src_path = os.path.join(root, d)
        dst_path = os.path.join(DST_ROOT, matched_class)

        for img in os.listdir(src_path):

            src_file = os.path.join(src_path, img)

            if not os.path.isfile(src_file):
                continue

            # Integrity check: skip files Pillow cannot verify as images.
            # ``Exception`` (not a bare except) so Ctrl-C still stops the run.
            try:
                with Image.open(src_file) as im:
                    im.verify()
            except Exception:
                skipped += 1
                continue

            # Resolution filter
            if not is_valid_image(src_file):
                skipped += 1
                continue

            # Duplicate filter (byte-identical files across all classes)
            img_hash = get_hash(src_file)

            if img_hash is None or img_hash in seen_hashes:
                skipped += 1
                continue

            seen_hashes.add(img_hash)

            # exist_ok makes a separate existence check unnecessary
            os.makedirs(dst_path, exist_ok=True)

            number = str(class_counter[matched_class]).zfill(3)
            class_name_for_file = matched_class.replace("_", "-")
            new_name = f"{PREFIX}_{class_name_for_file}_{number}.jpg"
            dst_file = os.path.join(dst_path, new_name)

            # Re-open after verify() and re-encode as RGB JPEG
            try:
                with Image.open(src_file) as im:
                    im.convert("RGB").save(dst_file, "JPEG")
            except Exception:
                # The number is not consumed, so the next image reuses this name
                skipped += 1
                continue

            class_counter[matched_class] += 1
            copied += 1

            if copied % 100 == 0:
                print(f"{copied}장 처리 중...")

print("\n완료!")
print(f"복사: {copied}")
print(f"스킵: {skipped}")
print(f"매칭 실패 폴더: {no_match}")
|
src/collection/count_label_hf.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import load_dataset
|
| 2 |
+
from collections import Counter
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
# ============================================================
|
| 7 |
+
# [설정 부분]
|
| 8 |
+
# ============================================================
|
| 9 |
+
load_dotenv()
|
| 10 |
+
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 11 |
+
|
| 12 |
+
# 확인할 Hugging Face 데이터셋 이름
|
| 13 |
+
DATASET_NAME = "jbarat/plant_species" # 예: "uran66/animals"
|
| 14 |
+
|
| 15 |
+
# 확인할 split 이름
|
| 16 |
+
SPLIT_NAME = "train"
|
| 17 |
+
|
| 18 |
+
# 라벨 필드명
|
| 19 |
+
LABEL_FIELD_NAME = "label"
|
| 20 |
+
|
| 21 |
+
# streaming 사용 여부
|
| 22 |
+
# True : 전체 데이터셋을 미리 다운로드하지 않고 하나씩 읽으면서 확인
|
| 23 |
+
# False : 로컬 캐시에 데이터셋을 다운로드한 뒤 확인
|
| 24 |
+
USE_STREAMING = True
|
| 25 |
+
|
| 26 |
+
# 문자열 라벨 데이터셋일 경우 전체 데이터를 훑어야 정확한 개수를 알 수 있다.
|
| 27 |
+
# None이면 전체 확인, 숫자를 넣으면 일부 샘플만 확인한다.
|
| 28 |
+
MAX_SCAN_ITEMS = None
|
| 29 |
+
|
| 30 |
+
# ============================================================
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def get_label_name(dataset, label_value):
    """Return a printable label name for ``label_value``.

    When the dataset declares a feature for ``LABEL_FIELD_NAME`` that offers
    ``int2str`` and the value is an int, the int is converted to its label
    name. In every other case the value is returned as ``str(label_value)``.

    ``dataset.features`` may be None (e.g. a streaming dataset without a
    declared schema), so it is looked up defensively instead of indexed.
    """
    features = getattr(dataset, "features", None)
    label_feature = features.get(LABEL_FIELD_NAME) if features else None

    if isinstance(label_value, int) and hasattr(label_feature, "int2str"):
        return label_feature.int2str(label_value)

    return str(label_value)


def get_unique_labels_with_counts():
    """Load the configured dataset, count items per label, print and return the counts.

    Returns:
        A ``Counter`` mapping label name to number of items. Items whose label
        field is None are skipped; scanning stops after ``MAX_SCAN_ITEMS``
        items when that setting is not None.
    """
    print(f"[{DATASET_NAME}] 데이터셋 로드 중...")

    dataset = load_dataset(
        DATASET_NAME,
        split=SPLIT_NAME,
        streaming=USE_STREAMING,
        token=HF_TOKEN
    )

    # Declared label feature, if any; features can be None for streaming
    # datasets, in which case labels are treated as plain values.
    features = getattr(dataset, "features", None)
    label_feature = features.get(LABEL_FIELD_NAME) if features else None

    label_counter = Counter()

    print("\n클래스별 이미지 개수 집계 중...")

    # Works for both streaming and fully downloaded datasets
    for idx, item in enumerate(dataset):
        if MAX_SCAN_ITEMS is not None and idx >= MAX_SCAN_ITEMS:
            break

        label_value = item.get(LABEL_FIELD_NAME)

        # Skip items without a label
        if label_value is None:
            continue

        label_counter[get_label_name(dataset, label_value)] += 1

    print("\n라벨 목록 및 클래스별 이미지 개수")
    print("-" * 60)

    declared_names = getattr(label_feature, "names", None)

    if declared_names is not None:
        # Label feature declares its names: print in the dataset's own label
        # order, including labels that were never seen (count 0).
        for idx, label_name in enumerate(declared_names):
            count = label_counter.get(label_name, 0)
            print(f"{idx}: {label_name} - {count} 장")
    else:
        # Plain-value labels: print the names that were seen, sorted
        for idx, label_name in enumerate(sorted(label_counter)):
            count = label_counter[label_name]
            print(f"{idx}: {label_name} - {count} 장")

    print("-" * 60)
    print(f"총 라벨 개수: {len(label_counter)}")
    print(f"총 이미지 개수: {sum(label_counter.values())}")

    return label_counter
|
| 113 |
+
|
| 114 |
+
if __name__ == "__main__":
|
| 115 |
+
get_unique_labels_with_counts()
|
src/collection/download_dataset_hf.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import glob
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from PIL import Image
|
| 7 |
+
|
| 8 |
+
# =====================================================================
|
| 9 |
+
# [설정 부분]
|
| 10 |
+
# =====================================================================
|
| 11 |
+
# 토큰
|
| 12 |
+
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")

# Report only whether a token was found; never echo the secret itself, since
# stdout ends up in terminals, notebooks and CI logs.
print(f"HF_TOKEN 설정 여부: {'설정됨' if HF_TOKEN else '없음'}")
|
| 16 |
+
|
| 17 |
+
# 수집할 데이터셋
|
| 18 |
+
DATASET_NAME = "KrushiJethe/fashion_data"
|
| 19 |
+
# 데이터셋 내의 이미지 데이터가 있는 필드명
|
| 20 |
+
IMAGE_FIELD_NAME = "image"
|
| 21 |
+
# 데이터셋 내의 라벨 데이터가 있는 필드명
|
| 22 |
+
LABEL_FIELD_NAME = "articleType"
|
| 23 |
+
|
| 24 |
+
# 여러 라벨을 하나의 대표 클래스로 묶는 매핑 딕셔너리
|
| 25 |
+
CLASS_MAPPING = {
|
| 26 |
+
"t-shirt": ["Tshirts", "Tops"],
|
| 27 |
+
"sneakers":["Casual Shoes"],
|
| 28 |
+
#"umbrella":["Umbrellas"],
|
| 29 |
+
"glasses":["Sunglasses"],
|
| 30 |
+
"pants":["Jeans"],
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
# 클래스별로 수집할 이미지의 최대 개수
|
| 34 |
+
NUM_IMAGES_PER_CLASS = 100
|
| 35 |
+
# Minimum side length in pixels; images narrower or shorter than this are skipped
|
| 36 |
+
TARGET_RESOLUTION = 256
|
| 37 |
+
# 이미지를 저장할 최상위 디렉토리명
|
| 38 |
+
BASE_SAVE_DIR = "./dataset_output"
|
| 39 |
+
# 수집할 데이터셋의 split 이름 (예: "train", "validation", "test")
|
| 40 |
+
SPLIT_NAME = "train"
|
| 41 |
+
|
| 42 |
+
# 컨테이너를 실행한 상태에서는 컨테이너에 캐시 저장됨
|
| 43 |
+
# 캐시 확인 -> ls -lah ~/.cache/huggingface
|
| 44 |
+
# 캐시 삭제 -> rm -rf ~/.cache/huggingface
|
| 45 |
+
USE_STREAMING = False
|
| 46 |
+
# =====================================================================
|
| 47 |
+
|
| 48 |
+
# 클래스 명명 규칙 적용
|
| 49 |
+
def format_class_name(class_name: str) -> str:
    """Normalise a class name for folder and file names.

    The value is converted to ``str``, lower-cased, and every underscore and
    space is replaced with a hyphen.
    """
    normalised = str(class_name).lower()
    for separator in ("_", " "):
        normalised = normalised.replace(separator, "-")
    return normalised
|
| 54 |
+
|
| 55 |
+
# 마지막 이미지의 번호 + 1
|
| 56 |
+
def get_next_image_index(save_dir: str, formatted_class_name: str) -> int:
    """Return the next free image number for a class folder.

    Scans ``save_dir`` for files named ``hf_<class>_<number>.jpg`` or
    ``.jpeg`` and returns the highest number found plus one, so collection can
    resume without overwriting earlier files.

    Args:
        save_dir: Directory holding the images of one class.
        formatted_class_name: Class name exactly as it appears in the file names.

    Returns:
        1 when the directory is missing or holds no matching file, otherwise
        ``max(number) + 1``.
    """
    if not os.path.isdir(save_dir):
        return 1

    # Escape the class name: characters such as "+" or "." must match
    # literally, otherwise existing files are missed and later overwritten.
    pattern = re.compile(rf"hf_{re.escape(formatted_class_name)}_(\d+)\.jpe?g")

    max_idx = 0
    for entry in os.listdir(save_dir):
        match = pattern.fullmatch(entry)
        if match:
            max_idx = max(max_idx, int(match.group(1)))

    return max_idx + 1
|
| 83 |
+
|
| 84 |
+
def collect_hf_images():
    """Collect images from the configured Hugging Face dataset into class folders.

    Dataset labels are mapped to representative classes through
    ``CLASS_MAPPING``. For each representative class up to
    ``NUM_IMAGES_PER_CLASS`` images are saved in this run as
    ``hf_<class>_<NNN>.jpg`` under ``BASE_SAVE_DIR/<class>``, numbering on from
    the files already present. Images smaller than ``TARGET_RESOLUTION`` on
    either side are skipped; errors while converting or saving one image are
    printed and the loop goes on.
    """
    # dataset label -> representative class
    label_to_rep_class = {
        label: rep_class
        for rep_class, labels in CLASS_MAPPING.items()
        for label in labels
    }

    print(label_to_rep_class)
    print(f"[{DATASET_NAME}] 데이터셋 스트리밍 로드 시작...")
    dataset = load_dataset(DATASET_NAME, split=SPLIT_NAME, streaming=USE_STREAMING, token=HF_TOKEN)

    # Per representative class: folder/file name, save folder, images saved in
    # this run, and the next file number to use.
    class_info = {}
    for rep_class in CLASS_MAPPING:
        folder_name = format_class_name(rep_class)
        folder_path = os.path.join(BASE_SAVE_DIR, folder_name)
        os.makedirs(folder_path, exist_ok=True)

        class_info[rep_class] = {
            "formatted_name": folder_name,
            "save_path": folder_path,
            "collected_count": 0,
            "current_idx": get_next_image_index(folder_path, folder_name),
        }

    print("데이터 수집을 시작합니다...")

    for item in dataset:

        print("1. 데이터셋 로드 시작...")
        # Stop once every class has reached its quota
        if all(info["collected_count"] >= NUM_IMAGES_PER_CLASS for info in class_info.values()):
            print("모든 클래스의 이미지 수집이 완료되었습니다.")
            break

        print("2. 데이터셋 라벨 아이템 꺼내기...")
        current_label = item.get(LABEL_FIELD_NAME)

        print(current_label)
        # Labels outside the mapping are ignored
        if current_label not in label_to_rep_class:
            continue

        target_info = class_info[label_to_rep_class[current_label]]

        print("4. 이미지 유효성 검사...")
        # This class already has enough images
        if target_info["collected_count"] >= NUM_IMAGES_PER_CLASS:
            continue

        image = item.get(IMAGE_FIELD_NAME)
        if image is None:
            continue

        print("5. 이미지 변환...")
        try:
            # JPEG output needs RGB (drops alpha / palette modes)
            if image.mode != "RGB":
                image = image.convert("RGB")

            # Keep only images that are large enough on both sides
            too_small = image.width < TARGET_RESOLUTION or image.height < TARGET_RESOLUTION
            if too_small:
                continue

            print("6. 클래스 명명 규칙에 따라...")
            # hf_<class>_<3-digit zero-padded number>.jpg
            file_name = f"hf_{target_info['formatted_name']}_{target_info['current_idx']:03d}.jpg"
            file_path = os.path.join(target_info["save_path"], file_name)

            print("7. 이미지 저장...")
            image.save(file_path, "JPEG", quality=95)

            target_info["collected_count"] += 1
            target_info["current_idx"] += 1

            print(f"Saved: {file_path} ({target_info['collected_count']}/{NUM_IMAGES_PER_CLASS})")

        except Exception as e:
            # Keep going when a single image fails
            print(f"이미지 저장 중 오류 발생 (Label: {current_label}): {e}")
|
| 185 |
+
|
| 186 |
+
if __name__ == "__main__":
|
| 187 |
+
collect_hf_images()
|
src/collection/download_dataset_kg.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
# ================================
|
| 4 |
+
# 0. 설정
|
| 5 |
+
# ================================
|
| 6 |
+
TARGET_COUNT = 60
|
| 7 |
+
MIN_RES = 128 # 해상도 128
|
| 8 |
+
PREFIX = "kg"
|
| 9 |
+
BASE_DIR = "./data/raw"
|
| 10 |
+
|
| 11 |
+
# ================================
|
| 12 |
+
# 1. 다운로드 경로
|
| 13 |
+
# ================================
|
| 14 |
+
DOWNLOAD_PATH = "data/raw_full_kg"
|
| 15 |
+
|
| 16 |
+
os.makedirs(DOWNLOAD_PATH, exist_ok=True)
|
| 17 |
+
|
| 18 |
+
# ================================
|
| 19 |
+
# 2. 사용할 Kaggle 데이터셋 (slug 기준)
|
| 20 |
+
# ================================
|
| 21 |
+
DATASETS = [
|
| 22 |
+
# 음식 및 식재료
|
| 23 |
+
"kmader/food41",
|
| 24 |
+
|
| 25 |
+
# 동물
|
| 26 |
+
"alessiocorrado99/animals10",
|
| 27 |
+
"gpiosenka/100-bird-species",
|
| 28 |
+
|
| 29 |
+
# 꽃
|
| 30 |
+
"alxmamaev/flowers-recognition",
|
| 31 |
+
|
| 32 |
+
# 과일
|
| 33 |
+
"moltean/fruits",
|
| 34 |
+
"yihfeng/strawberry-maturity",
|
| 35 |
+
|
| 36 |
+
# 탈것
|
| 37 |
+
"sshikamaru/car-object-detection",
|
| 38 |
+
"jessicali9530/stanford-cars-dataset",
|
| 39 |
+
"dataclusterlabs/vehicle-detection-image-dataset",
|
| 40 |
+
"meowmeowmeowmeowmeow/vehicle-type-recognition",
|
| 41 |
+
|
| 42 |
+
# 패션 및 잡화
|
| 43 |
+
"promptcloudhq/jewelry-text-to-image-dataset",
|
| 44 |
+
"ashwingupta3012/glasses-dataset",
|
| 45 |
+
"agrigorev/clothing-dataset-full",
|
| 46 |
+
"paramaggarwal/fashion-product-images-small"
|
| 47 |
+
]
|
| 48 |
+
|
| 49 |
+
# ================================
|
| 50 |
+
# 3. 다운로드 실행
|
| 51 |
+
# ================================
|
| 52 |
+
import subprocess

# Datasets whose download did not succeed, reported at the end
failed = []

for ds in DATASETS:
    print(f"\nDownloading {ds} ...")
    try:
        # Argument list without a shell: nothing in the slug or path is
        # shell-interpreted, and the exit status is available for checking.
        result = subprocess.run(
            ["kaggle", "datasets", "download", "-d", ds, "-p", DOWNLOAD_PATH]
        )
    except OSError as exc:
        # The kaggle CLI is missing or cannot be executed
        print(f"kaggle 실행 실패: {exc}")
        failed.append(ds)
        continue

    if result.returncode != 0:
        failed.append(ds)

# Only claim success when every download actually succeeded
if failed:
    print(f"\n다운로드 실패 ({len(failed)}/{len(DATASETS)}): {', '.join(failed)}")
else:
    print("\n모든 데이터셋 다운로드 완료!")
|
src/collection/download_dataset_us.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import requests
|
| 4 |
+
from PIL import Image
|
| 5 |
+
from io import BytesIO
|
| 6 |
+
import time
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
# =========================
|
| 10 |
+
# 1. 설정 (여기만 수정하면 됨)
|
| 11 |
+
# =========================
|
| 12 |
+
load_dotenv()
US_TOKEN = os.environ.get("US_TOKEN")

# Use the token value read from the environment. The string literal "US_TOKEN"
# would be sent as the Client-ID and every API request would be rejected.
ACCESS_KEY = US_TOKEN
|
| 16 |
+
|
| 17 |
+
TARGET_COUNT = 100
|
| 18 |
+
MIN_WIDTH = 256
|
| 19 |
+
MIN_HEIGHT = 256
|
| 20 |
+
SLEEP_TIME = 2
|
| 21 |
+
|
| 22 |
+
BASE_DIR = "un_images"
|
| 23 |
+
|
| 24 |
+
MASTER_CLASSES = [
|
| 25 |
+
"pizza","hamburger","sushi","pasta","salad","steak","cake","sandwich","fried_chicken","bread",
|
| 26 |
+
"apple","banana","strawberry","orange","carrot",
|
| 27 |
+
"golden_retriever","bulldog","siamese_cat","persian_cat","eagle","owl","lion","elephant","zebra","giraffe",
|
| 28 |
+
"rose","sunflower","daisy","tulip","palm_tree","pine_tree","maple_tree","bamboo",
|
| 29 |
+
"laptop","watch","camera","chair","clock","microwave","refrigerator",
|
| 30 |
+
"car","bicycle","motorcycle","airplane","bus",
|
| 31 |
+
"backpack","sneakers","umbrella","glasses","hat"
|
| 32 |
+
]
|
| 33 |
+
|
| 34 |
+
SIMPLE_CLASSES = [
|
| 35 |
+
"pizza","burger","sushi","pasta","salad","steak","cake","sandwich","fried_chicken","bread",
|
| 36 |
+
"apple","banana","strawberry","orange","carrot",
|
| 37 |
+
"golden_retriever","bulldog","siamese_cat","persian_cat","eagle","owl","lion","elephant","zebra","giraffe",
|
| 38 |
+
"rose","sunflower","daisy","tulip","palm_tree","pine_tree","maple_tree","bamboo",
|
| 39 |
+
"laptop","wristwatch","camera","chair","wall_clock","microwave","refrigerator",
|
| 40 |
+
"car","bicycle","motorcycle","airplane","bus",
|
| 41 |
+
"backpack","sneakers","umbrella","glasses","hat"
|
| 42 |
+
]
|
| 43 |
+
|
| 44 |
+
os.makedirs(BASE_DIR, exist_ok=True)
|
| 45 |
+
|
| 46 |
+
# =========================
|
| 47 |
+
# 2. 유틸
|
| 48 |
+
# =========================
|
| 49 |
+
def format_name(name):
    """Return ``name`` with every underscore replaced by a hyphen."""
    return "-".join(name.split("_"))
|
| 51 |
+
|
| 52 |
+
def get_start_index(folder, simple_cls):
    """Return the next free image number for ``simple_cls`` inside ``folder``.

    Looks for files named exactly ``un_<simple_cls>_<number>.jpg`` and returns
    the highest number plus one (1 when there is none).
    """
    # re.escape: the class name must match literally. fullmatch: leftovers such
    # as "un_x_010.jpg.part" must not be counted as saved images.
    pattern = re.compile(rf"un_{re.escape(simple_cls)}_(\d+)\.jpg")
    max_idx = 0

    for f in os.listdir(folder):
        match = pattern.fullmatch(f)
        if match:
            max_idx = max(max_idx, int(match.group(1)))

    return max_idx + 1
|
| 63 |
+
|
| 64 |
+
# =========================
|
| 65 |
+
# 3. API
|
| 66 |
+
# =========================
|
| 67 |
+
def search_images(query, page):
    """Search Unsplash for ``query`` and return the "regular" image URLs of one page.

    Args:
        query: Search term; underscores are sent as spaces.
        page: Result page number (30 results are requested per page).

    Returns:
        A list of image URLs, or an empty list on a network error, a non-200
        response, or when the page has no results.
    """
    url = "https://api.unsplash.com/search/photos"
    headers = {"Authorization": f"Client-ID {ACCESS_KEY}"}
    params = {
        "query": query.replace("_", " "),
        "per_page": 30,
        "page": page,
    }

    # On HTTP 429 wait and retry once. Returning [] straight after the wait
    # would make the caller treat the class as exhausted and move on.
    max_attempts = 2
    for attempt in range(max_attempts):
        try:
            # Same timeout as download_image so a stalled request cannot hang the run
            res = requests.get(url, headers=headers, params=params, timeout=10)
        except requests.RequestException as exc:
            print("API ERROR:", exc)
            return []

        if res.status_code == 429 and attempt + 1 < max_attempts:
            print("⏳ Rate limit → 60초 대기")
            time.sleep(60)
            continue
        break

    if res.status_code != 200:
        print("API ERROR:", res.text)
        return []

    return [item["urls"]["regular"] for item in res.json().get("results", [])]
|
| 92 |
+
|
| 93 |
+
def download_image(url):
    """Download an image and return its bytes if it is large enough.

    Args:
        url: Direct URL of the image.

    Returns:
        bytes | None: The raw response body, or None when the download
        fails, the payload cannot be opened as an image, or the image is
        smaller than MIN_WIDTH x MIN_HEIGHT.
    """
    try:
        res = requests.get(url, timeout=10)
        if res.status_code != 200:
            return None

        # Only the size is needed; the context manager releases the
        # decoder resources right away.
        with Image.open(BytesIO(res.content)) as img:
            w, h = img.size
    except Exception as e:
        # Best-effort download: skip this URL, but no longer swallow
        # KeyboardInterrupt / SystemExit the way a bare except did.
        print(f"다운로드 실패: {url} ({e})")
        return None

    if w < MIN_WIDTH or h < MIN_HEIGHT:
        return None

    return res.content
|
| 108 |
+
|
| 109 |
+
# =========================
|
| 110 |
+
# 4. 메인
|
| 111 |
+
# =========================
|
| 112 |
+
# Collect images class by class until each class folder reaches TARGET_COUNT.
for idx, master_cls in enumerate(MASTER_CLASSES):
    simple_cls = format_name(SIMPLE_CLASSES[idx])

    print(f"\n[START] {master_cls}")

    class_dir = os.path.join(BASE_DIR, master_cls)
    os.makedirs(class_dir, exist_ok=True)

    # Continue numbering after the highest index already present on disk.
    start_idx = get_start_index(class_dir, simple_cls)
    count = start_idx - 1

    page = 1
    seen = set()

    while count < TARGET_COUNT:
        urls = search_images(simple_cls, page)

        if not urls:
            print("이미지 없음")
            break

        for url in urls:
            if count >= TARGET_COUNT:
                break

            # Each URL is attempted at most once per class.
            if url not in seen:
                seen.add(url)

                img_data = download_image(url)
                if img_data is not None:
                    count += 1

                    file_name = f"un_{simple_cls}_{count:03d}.jpg"
                    path = os.path.join(class_dir, file_name)

                    with open(path, "wb") as f:
                        f.write(img_data)

                    print(f"Saved: {path}")

        page += 1
        time.sleep(SLEEP_TIME)

    print(f"[DONE] {master_cls} -> {count}/{TARGET_COUNT}")
|
| 161 |
+
|
| 162 |
+
# =========================
|
| 163 |
+
# 5. 검증 + 부족분 자동 보충
|
| 164 |
+
# =========================
|
| 165 |
+
print("\n[검증 시작]\n")
|
| 166 |
+
|
| 167 |
+
def _image_meets_min_size(path):
    """Return True if *path* opens as an image of at least MIN_WIDTH x MIN_HEIGHT."""
    try:
        # The context manager closes the file before any later os.remove();
        # an open handle makes deletion fail on Windows.
        with Image.open(path) as img:
            w, h = img.size
    except Exception:
        # Unreadable or corrupt file. KeyboardInterrupt is deliberately not
        # caught, so Ctrl-C can no longer cause a valid file to be deleted.
        return False
    return w >= MIN_WIDTH and h >= MIN_HEIGHT


# Verify every class folder, drop invalid files, then download replacements.
for idx, master_cls in enumerate(MASTER_CLASSES):
    simple_cls = format_name(SIMPLE_CLASSES[idx])

    class_dir = os.path.join(BASE_DIR, master_cls)
    files = [f for f in os.listdir(class_dir) if f.endswith(".jpg")]

    valid_files = []

    for f in files:
        path = os.path.join(class_dir, f)

        if _image_meets_min_size(path):
            valid_files.append(f)
            continue

        try:
            os.remove(path)
        except OSError as e:
            print(f"삭제 실패: {path} ({e})")

    count = len(valid_files)

    print(f"{master_cls}: {count}/{TARGET_COUNT}")

    if count < TARGET_COUNT:
        print("→ 부족분 재수집 시작")

        page = 1
        seen = set()
        # New files are numbered after the highest index still on disk.
        start_idx = get_start_index(class_dir, simple_cls)

        while count < TARGET_COUNT:
            urls = search_images(simple_cls, page)

            if not urls:
                break

            for url in urls:
                if count >= TARGET_COUNT:
                    break

                if url in seen:
                    continue
                seen.add(url)

                img_data = download_image(url)
                if img_data is None:
                    continue

                file_name = f"un_{simple_cls}_{start_idx:03d}.jpg"
                path = os.path.join(class_dir, file_name)

                with open(path, "wb") as f:
                    f.write(img_data)

                print(f"ReSaved: {path}")

                start_idx += 1
                count += 1

            page += 1
            time.sleep(SLEEP_TIME)
|
| 232 |
+
|
| 233 |
+
print("\n[완료]")
|
src/collection/get_label_list_hf.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import load_dataset
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
# ============================================================
|
| 5 |
+
# [설정 부분]
|
| 6 |
+
# ============================================================
|
| 7 |
+
|
| 8 |
+
# Name of the Hugging Face dataset to inspect.
DATASET_NAME = "KrushiJethe/fashion_data"  # alternative: uran66/animals

# Split to inspect.
SPLIT_NAME = "train"

# Name of the field that holds the label.
LABEL_FIELD_NAME = "articleType"

# Whether to load the dataset in streaming mode.
# Streaming is sufficient when only the label structure is being checked.
USE_STREAMING = True

# Datasets with plain string labels may need a scan over the data.
# None scans everything; an integer limits the scan to that many items.
MAX_SCAN_ITEMS = None
|
| 24 |
+
|
| 25 |
+
# ============================================================
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _print_label_list(label_names):
    """Print an indexed listing of *label_names* followed by the total count."""
    print("\n라벨 목록")
    print("-" * 50)

    for idx, label_name in enumerate(label_names):
        print(f"{idx}: {label_name}")

    print("-" * 50)
    print(f"총 라벨 개수: {len(label_names)}")


def get_unique_labels():
    """Print and return the distinct labels of the configured dataset.

    Loads DATASET_NAME / SPLIT_NAME and looks at LABEL_FIELD_NAME. If the
    feature exposes a non-None ``names`` attribute, those names are used
    directly. Otherwise the data is iterated (up to MAX_SCAN_ITEMS items
    when that is not None) and the distinct values are collected as strings.

    Returns:
        list[str]: The label names, in feature order for the ``names`` case
        and sorted for the scanned case.

    Raises:
        KeyError: If feature metadata is available but has no
            LABEL_FIELD_NAME entry.
    """
    print(f"[{DATASET_NAME}] 데이터셋 로드 중...")

    dataset = load_dataset(
        DATASET_NAME,
        split=SPLIT_NAME,
        streaming=USE_STREAMING,
    )

    # Streaming datasets may not expose feature metadata at all; in that
    # case fall through to scanning instead of failing on None[...].
    features = dataset.features
    label_feature = features[LABEL_FIELD_NAME] if features is not None else None

    # 1. ClassLabel-style feature: the full label list is available
    #    without iterating over the data.
    if getattr(label_feature, "names", None) is not None:
        label_names = label_feature.names
        _print_label_list(label_names)
        return label_names

    # 2. Labels stored as plain values: iterate and de-duplicate.
    unique_labels = set()

    print("\n라벨 필드가 ClassLabel 타입이 아니므로 데이터를 순회합니다...")

    for idx, item in enumerate(dataset):
        if MAX_SCAN_ITEMS is not None and idx >= MAX_SCAN_ITEMS:
            break

        label_value = item.get(LABEL_FIELD_NAME)

        # Items without a label are skipped.
        if label_value is None:
            continue

        unique_labels.add(str(label_value))

    label_names = sorted(unique_labels)
    _print_label_list(label_names)

    return label_names
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
if __name__ == "__main__":
|
| 98 |
+
get_unique_labels()
|
src/collection/select_60_images.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import random
|
| 3 |
+
import shutil
|
| 4 |
+
|
| 5 |
+
# ================================
|
| 6 |
+
# 0. 설정
|
| 7 |
+
# ================================
|
| 8 |
+
TARGET_COUNT = 60
MIN_RES = 128  # resolution 128
PREFIX = "kg"
BASE_DIR = "./data/raw"

# ================================
# 1. Paths
# ================================
DATA_DIR = r"C:\Users\qud46\Desktop\raw_kg"

# ================================
# 2. Class list
# ================================
CLASS_LIST = [
    # Food and ingredients
    "pizza", "hamburger", "sushi", "pasta", "salad",
    "steak", "cup_cake", "sandwich", "waffle", "dumpling",

    # Animals
    "golden-retriever", "bulldog", "siamese-cat",
    "persian-cat", "elephant", "sheep", "horse",
    "penguin", "butterfly", "squirrel",

    # Flowers
    "rose", "sunflower", "daisy", "tulip", "dandelion",
    "lily", "lavender", "orchid", "iris", "marigold", "aster",

    # Fruit
    "apple", "banana", "strawberry", "orange",
    "carrot", "tomato", "cucumber",

    # Vehicles
    "car", "bicycle", "motorcycle", "airplane", "bus",

    # Fashion and accessories
    "t-shirt", "sneakers", "earrings", "glasses",
    "pants", "bracelet", "necklace",
]

print("클래스별 이미지 60장 맞추기 시작\n")

# ================================
# 3. Main logic
# ================================
# For every class folder: randomly delete surplus files above TARGET_COUNT,
# or report how many files are still missing.
for cls in CLASS_LIST:

    cls_path = os.path.join(DATA_DIR, cls)

    # isdir (not exists) so a stray regular file with the class name is
    # skipped instead of crashing os.listdir.
    if not os.path.isdir(cls_path):
        print(f"{cls}: 폴더 없음 (skip)")
        continue

    # Regular files directly inside the class folder.
    images = [
        f for f in os.listdir(cls_path)
        if os.path.isfile(os.path.join(cls_path, f))
    ]

    current_count = len(images)

    print(
        f"{cls}: 현재 {current_count}장 "
        f"→ 목표 {TARGET_COUNT}장"
    )

    # 1) More than the target: delete a random surplus.
    if current_count > TARGET_COUNT:

        delete_count = current_count - TARGET_COUNT
        to_delete = random.sample(images, delete_count)

        deleted = 0
        for file in to_delete:
            file_path = os.path.join(cls_path, file)

            try:
                os.remove(file_path)
            except OSError as e:
                # Keep going, but do not count a failed removal as done.
                print(f" → 삭제 실패: {file_path} ({e})")
            else:
                deleted += 1

        # Report what was actually removed, not what was planned.
        print(f" → {deleted}장 삭제 완료")

    # 2) Fewer than the target: report the shortfall.
    elif current_count < TARGET_COUNT:

        need_count = TARGET_COUNT - current_count

        print(
            f" → {need_count}장 부족 "
            f"(추가 수집 필요)"
        )

    # 3) Exactly the target.
    else:
        print(" → 이미 60장 완료")
|
| 114 |
+
|
| 115 |
+
print("\n전체 정리 완료!")
|
src/collection/unzip_data_kg.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import zipfile
|
| 3 |
+
|
| 4 |
+
# ================================
|
| 5 |
+
# 0. 설정
|
| 6 |
+
# ================================
|
| 7 |
+
TARGET_COUNT = 60
MIN_RES = 128  # resolution 128
PREFIX = "kg"
BASE_DIR = "./data/raw"

# ================================
# 1. Paths
# ================================
ZIP_DIR = "data/raw_full_kg"

# Archives are unpacked into a sub-folder of the zip directory.
EXTRACT_DIR = os.path.join(ZIP_DIR, "extracted")

os.makedirs(EXTRACT_DIR, exist_ok=True)

# ================================
# 2. Zip file list
# ================================
zip_files = []
for entry in os.listdir(ZIP_DIR):
    if entry.endswith(".zip"):
        zip_files.append(entry)

# ================================
# 3. Extraction
# ================================
# Every archive goes into the same EXTRACT_DIR; a failing archive is
# reported and the loop moves on to the next one.
for zip_file in zip_files:
    zip_path = os.path.join(ZIP_DIR, zip_file)

    print(f"{zip_file} 압축 해제 중...")

    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(EXTRACT_DIR)
    except Exception as e:
        print(f"오류 발생: {zip_file} → {e}")
|
| 44 |
+
|
| 45 |
+
print("모든 압축 해제 완료!")
|
src/dataset/.ipynb_checkpoints/captioning_dataset-checkpoint.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import random
|
| 4 |
+
from PIL import Image
|
| 5 |
+
from torch.utils.data import Dataset
|
| 6 |
+
import torch
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class CaptionDataset(Dataset):
    """Image-captioning dataset driven by a JSON annotation list.

    Each annotation entry is read through its "file_name" and "captions"
    keys. Validation samples return one encoded caption together with all
    reference captions; training samples return ``train_num_caption``
    image/caption pairs stacked along a new first dimension.
    """

    # Every sample is normalised to exactly this many reference captions.
    NUM_CAPTIONS = 5

    def __init__(
        self,
        json_path,
        image_dir,
        w2i,
        tokenizer: callable,
        split='train',
        transform=None,
        max_len=30,
        train_num_caption=1,
        debug=False,
        use_subword=False,
        sp_model_path="tokenizer.model"
    ):
        """Load the annotations and store the encoding configuration.

        Args:
            json_path: Path of the JSON annotation file.
            image_dir: Directory the "file_name" entries are relative to.
            w2i: Token-to-id mapping; must contain "<sos>", "<eos>",
                "<unk>" and "<pad>".
            tokenizer: Callable turning a caption into a token sequence;
                used when ``use_subword`` is False.
            split: "val" selects validation behaviour; anything else is
                treated as training.
            transform: Optional callable applied to the RGB PIL image.
            max_len: Fixed length of every encoded caption.
            train_num_caption: Captions sampled per image in training.
            debug: Keep only the first 10 annotation entries.
            use_subword: Tokenise with SentencePiece instead of ``tokenizer``.
            sp_model_path: SentencePiece model file, loaded only when
                ``use_subword`` is True.
        """
        # JSON is UTF-8; do not depend on the platform's default encoding.
        with open(json_path, 'r', encoding="utf-8") as f:
            self.data = json.load(f)

        # Debugging aid: work on a small subset.
        if debug:
            self.data = self.data[:10]

        self.is_val = split == "val"

        self.image_dir = image_dir
        self.w2i = w2i
        self.transform = transform
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.train_num_caption = train_num_caption
        self.use_subword = use_subword
        if self.use_subword:
            import sentencepiece as spm

            self.sp = spm.SentencePieceProcessor()
            self.sp.load(sp_model_path)

    def __len__(self):
        """Return the number of annotation entries."""
        return len(self.data)

    def encode_caption(self, caption):
        """Encode *caption* as a LongTensor of exactly ``max_len`` ids.

        The sequence is "<sos>", the token ids (unknown tokens map to
        "<unk>"), then "<eos>". Longer sequences are cut so that the last
        id is "<eos>"; shorter ones are right-padded with "<pad>".
        """
        if self.use_subword:
            # train_sub_tokenizer.py lower-cases captions before training
            # the SentencePiece model, so lower-case here as well; otherwise
            # upper-case characters are tokenised inconsistently.
            words = self.sp.encode(caption.lower(), out_type=str)
        else:
            words = self.tokenizer(caption)

        unk_id = self.w2i["<unk>"]
        eos_id = self.w2i["<eos>"]

        tokens = [self.w2i["<sos>"]]
        tokens.extend(self.w2i.get(w, unk_id) for w in words)
        tokens.append(eos_id)

        if len(tokens) > self.max_len:
            # Truncate, keeping an explicit end-of-sequence marker.
            tokens = tokens[:self.max_len - 1] + [eos_id]
        else:
            tokens.extend([self.w2i["<pad>"]] * (self.max_len - len(tokens)))

        return torch.tensor(tokens, dtype=torch.long)

    def __getitem__(self, index):
        """Return one sample.

        Validation: ``(image, tokens, captions, file_name)`` where tokens
        encode one randomly chosen reference caption.
        Training: ``(images, tokens)``, both stacked over the
        ``train_num_caption`` sampled captions. Stacking assumes
        ``transform`` returns a tensor — TODO confirm against callers.
        """
        data = self.data[index]
        file_name = data["file_name"]

        image_path = os.path.join(self.image_dir, file_name)

        # convert() returns an independent image, so the underlying file
        # can be closed immediately instead of leaking one handle per item.
        with Image.open(image_path) as img:
            image = img.convert('RGB')

        if self.transform:
            image = self.transform(image)

        # Use at most NUM_CAPTIONS captions (slicing copies the list) ...
        captions = data["captions"][:self.NUM_CAPTIONS]

        # ... and repeat the last one when there are fewer.
        while len(captions) < self.NUM_CAPTIONS:
            captions.append(captions[-1])

        # validation
        if self.is_val:
            caption = random.choice(captions)
            tokens = self.encode_caption(caption)

            return image, tokens, captions, file_name

        # train
        selected_captions = random.sample(captions, k=self.train_num_caption)

        # The same image is paired with every selected caption.
        images = torch.stack([image] * len(selected_captions))
        tokens = torch.stack(
            [self.encode_caption(caption) for caption in selected_captions]
        )

        return images, tokens
|
| 124 |
+
|
src/dataset/.ipynb_checkpoints/train_sub_tokenizer-checkpoint.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import sentencepiece as spm
|
| 3 |
+
import yaml
|
| 4 |
+
|
| 5 |
+
# params
|
| 6 |
+
with open("/workspace/params.yaml", "r", encoding="utf-8") as f:
|
| 7 |
+
params = yaml.safe_load(f)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def train_sentencepiece(
    json_path,
    model_prefix="sub_tokenizer",
    vocab_size=500,
    model_type="unigram",
    txt_path="/workspace/src/dataset/sub_tokenizing_captions.txt"
):
    """Train a SentencePiece model on the captions of an annotation file.

    All captions are lower-cased and written one per line to *txt_path*,
    which is then used as the training corpus.

    Args:
        json_path: JSON file holding a list of entries with a "captions" list.
        model_prefix: Output prefix passed to the SentencePiece trainer.
        vocab_size: Target vocabulary size.
        model_type: SentencePiece model type (e.g. "unigram").
        txt_path: Where the intermediate one-caption-per-line corpus is
            written; defaults to the previously hard-coded location.
    """
    # Read with the same explicit encoding that is used for writing, rather
    # than the platform default.
    with open(json_path, 'r', encoding="utf-8") as f:
        data = json.load(f)

    with open(txt_path, "w", encoding="utf-8") as f:
        for item in data:
            for caption in item["captions"]:
                f.write(caption.lower() + "\n")

    spm.SentencePieceTrainer.train(
        input=txt_path,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type=model_type,

        # Special pieces and their fixed ids.
        pad_piece="<pad>",
        bos_piece="<sos>",
        eos_piece="<eos>",
        unk_piece="<unk>",

        pad_id=0,
        bos_id=1,
        eos_id=2,
        unk_id=3
    )

    print("tokenizer training done")
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
if __name__ == "__main__":
|
| 50 |
+
train_sentencepiece(
|
| 51 |
+
json_path="/workspace/data/captioning/annotations/train.json",
|
| 52 |
+
model_prefix="/workspace/src/dataset/sub_tokenizer",
|
| 53 |
+
vocab_size=params["captioning"]["tokenizer"]["sp_vocab_size"],
|
| 54 |
+
model_type="unigram"
|
| 55 |
+
)
|
src/dataset/__pycache__/build_vocab.cpython-310.pyc
ADDED
|
Binary file (1.62 kB). View file
|
|
|
src/dataset/__pycache__/captioning_dataset.cpython-310.pyc
ADDED
|
Binary file (2.74 kB). View file
|
|
|
src/dataset/__pycache__/classification_dataset.cpython-310.pyc
ADDED
|
Binary file (1.53 kB). View file
|
|
|
src/dataset/__pycache__/collate_caption.cpython-310.pyc
ADDED
|
Binary file (410 Bytes). View file
|
|
|
src/dataset/build_vocab.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from collections import Counter
|
| 3 |
+
import re
|
| 4 |
+
import sentencepiece as spm
|
| 5 |
+
|
| 6 |
+
def tokenizer(captions):
    """Lower-case a caption and split it into whitespace-delimited tokens.

    The punctuation marks . , ! ? are not removed: each one is surrounded
    by spaces so that it becomes a token of its own.
    """
    spaced = re.sub(r"([.,!?])", r" \1 ", captions.lower())
    return spaced.split()
|
| 12 |
+
|
| 13 |
+
def sub_tokenizer(caption, sp):
    """Return the pieces produced by ``sp.encode`` for *caption* as strings."""
    return sp.encode(caption, out_type=str)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def build_vocab(json_path, min_freq=3, max_size=10000, use_subword=False, sp_model_path="/workspace/src/dataset/sub_tokenizer.model"):
    """Build token/id lookup tables for caption encoding.

    With ``use_subword=True`` the vocabulary is copied piece by piece from
    the SentencePiece model at *sp_model_path*; *json_path*, *min_freq* and
    *max_size* are not used in that case. Otherwise the captions in
    *json_path* are tokenised with ``tokenizer`` and the most frequent
    words with at least *min_freq* occurrences are kept, after the four
    special tokens "<pad>", "<sos>", "<eos>", "<unk>" (ids 0-3).

    Args:
        json_path: JSON file holding a list of entries with a "captions" list.
        min_freq: Minimum number of occurrences for a word to be kept.
        max_size: Upper bound on the word-level vocabulary size, special
            tokens included. The four special tokens are always present.
        use_subword: Take the vocabulary from the SentencePiece model.
        sp_model_path: Path of the SentencePiece model file.

    Returns:
        tuple[dict, dict, int]: ``(w2i, i2w, voca_size)``.
    """
    w2i = dict()
    i2w = dict()

    if use_subword:
        # SentencePiece vocabulary: ids are taken from the trained model.
        sp = spm.SentencePieceProcessor()
        sp.load(sp_model_path)

        voca_size = sp.get_piece_size()

        for i in range(voca_size):
            token = sp.id_to_piece(i)

            w2i[token] = i
            i2w[i] = token
    else:
        # JSON is UTF-8; do not depend on the platform's default encoding.
        with open(json_path, 'r', encoding="utf-8") as f:
            data = json.load(f)

        counter = Counter()

        for item in data:
            for caption in item["captions"]:
                counter.update(tokenizer(caption))

        words = [w for w, freq in counter.most_common() if freq >= min_freq]

        voca = ["<pad>", "<sos>", "<eos>", "<unk>"]
        # Clamp at 0: a negative slice bound would keep almost all words
        # instead of none when max_size is smaller than 4.
        voca.extend(words[:max(max_size - 4, 0)])
        voca_size = len(voca)

        for i, w in enumerate(voca):
            w2i[w] = i
            i2w[i] = w

    print(voca_size)

    return w2i, i2w, voca_size
|
src/dataset/captioning_dataset.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import random
|
| 4 |
+
from PIL import Image
|
| 5 |
+
from torch.utils.data import Dataset
|
| 6 |
+
import torch
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class CaptionDataset(Dataset):
    """Image-captioning dataset driven by a JSON annotation list.

    Each annotation entry is read through its "file_name" and "captions"
    keys. Validation samples return one encoded caption together with all
    reference captions; training samples return ``train_num_caption``
    image/caption pairs stacked along a new first dimension.
    """

    # Every sample is normalised to exactly this many reference captions.
    NUM_CAPTIONS = 5

    def __init__(
        self,
        json_path,
        image_dir,
        w2i,
        tokenizer: callable,
        split='train',
        transform=None,
        max_len=30,
        train_num_caption=1,
        debug=False,
        use_subword=False,
        sp_model_path="tokenizer.model"
    ):
        """Load the annotations and store the encoding configuration.

        Args:
            json_path: Path of the JSON annotation file.
            image_dir: Directory the "file_name" entries are relative to.
            w2i: Token-to-id mapping; must contain "<sos>", "<eos>",
                "<unk>" and "<pad>".
            tokenizer: Callable turning a caption into a token sequence;
                used when ``use_subword`` is False.
            split: "val" selects validation behaviour; anything else is
                treated as training.
            transform: Optional callable applied to the RGB PIL image.
            max_len: Fixed length of every encoded caption.
            train_num_caption: Captions sampled per image in training.
            debug: Keep only the first 10 annotation entries.
            use_subword: Tokenise with SentencePiece instead of ``tokenizer``.
            sp_model_path: SentencePiece model file, loaded only when
                ``use_subword`` is True.
        """
        # JSON is UTF-8; do not depend on the platform's default encoding.
        with open(json_path, 'r', encoding="utf-8") as f:
            self.data = json.load(f)

        # Debugging aid: work on a small subset.
        if debug:
            self.data = self.data[:10]

        self.is_val = split == "val"

        self.image_dir = image_dir
        self.w2i = w2i
        self.transform = transform
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.train_num_caption = train_num_caption
        self.use_subword = use_subword
        if self.use_subword:
            import sentencepiece as spm

            self.sp = spm.SentencePieceProcessor()
            self.sp.load(sp_model_path)

    def __len__(self):
        """Return the number of annotation entries."""
        return len(self.data)

    def encode_caption(self, caption):
        """Encode *caption* as a LongTensor of exactly ``max_len`` ids.

        The sequence is "<sos>", the token ids (unknown tokens map to
        "<unk>"), then "<eos>". Longer sequences are cut so that the last
        id is "<eos>"; shorter ones are right-padded with "<pad>".
        """
        if self.use_subword:
            # The caption is lower-cased before SentencePiece encoding.
            words = self.sp.encode(caption.lower(), out_type=str)
        else:
            words = self.tokenizer(caption)

        unk_id = self.w2i["<unk>"]
        eos_id = self.w2i["<eos>"]

        # Both tokenisation paths share the same id assembly.
        tokens = [self.w2i["<sos>"]]
        tokens.extend(self.w2i.get(w, unk_id) for w in words)
        tokens.append(eos_id)

        if len(tokens) > self.max_len:
            # Truncate, keeping an explicit end-of-sequence marker.
            tokens = tokens[:self.max_len - 1] + [eos_id]
        else:
            tokens.extend([self.w2i["<pad>"]] * (self.max_len - len(tokens)))

        return torch.tensor(tokens, dtype=torch.long)

    def __getitem__(self, index):
        """Return one sample.

        Validation: ``(image, tokens, captions, file_name)`` where tokens
        encode one randomly chosen reference caption.
        Training: ``(images, tokens)``, both stacked over the
        ``train_num_caption`` sampled captions. Stacking assumes
        ``transform`` returns a tensor — TODO confirm against callers.
        """
        data = self.data[index]
        file_name = data["file_name"]

        image_path = os.path.join(self.image_dir, file_name)

        # convert() returns an independent image, so the underlying file
        # can be closed immediately instead of leaking one handle per item.
        with Image.open(image_path) as img:
            image = img.convert('RGB')

        if self.transform:
            image = self.transform(image)

        # Use at most NUM_CAPTIONS captions (slicing copies the list) ...
        captions = data["captions"][:self.NUM_CAPTIONS]

        # ... and repeat the last one when there are fewer.
        while len(captions) < self.NUM_CAPTIONS:
            captions.append(captions[-1])

        # validation
        if self.is_val:
            caption = random.choice(captions)
            tokens = self.encode_caption(caption)

            return image, tokens, captions, file_name

        # train
        selected_captions = random.sample(captions, k=self.train_num_caption)

        # The same image is paired with every selected caption.
        images = torch.stack([image] * len(selected_captions))
        tokens = torch.stack(
            [self.encode_caption(caption) for caption in selected_captions]
        )

        return images, tokens
|
| 124 |
+
|
src/dataset/classification_dataset.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from PIL import Image
|
| 3 |
+
from torch.utils.data import Dataset
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ClassificationDataset(Dataset):
    """Folder-per-class image dataset with a deterministic per-class split.

    Every non-hidden sub-directory of ``root_dir`` is a class. Its files
    are sorted by name and cut into train / val / test slices according
    to ``split_ratio``.
    """

    def __init__(
        self,
        root_dir,
        class_to_idx,
        split="train",
        transform=None,
        split_ratio=(0.7, 0.15, 0.15)
    ):
        """Index the image files that belong to *split*.

        Args:
            root_dir: Directory containing one sub-directory per class.
            class_to_idx: Mapping from class directory name to integer label.
            split: "train" or "val"; any other value selects the remainder
                after the train and val slices.
            transform: Optional callable applied to the RGB PIL image.
            split_ratio: Fractions for train and val (first two entries);
                the remaining files form the last slice.

        Raises:
            KeyError: If a non-hidden class directory is missing from
                ``class_to_idx``.
        """
        self.transform = transform
        self.samples = []

        for class_name in sorted(os.listdir(root_dir)):
            # Hidden folders such as ".ipynb_checkpoints" are tooling
            # residue, not classes; they used to fail the label lookup.
            if class_name.startswith("."):
                continue

            class_path = os.path.join(root_dir, class_name)

            if not os.path.isdir(class_path):
                continue

            # Keep regular, non-hidden files only, so nested folders and
            # dotfiles neither shift the split nor break image loading.
            images = sorted(
                name for name in os.listdir(class_path)
                if not name.startswith(".")
                and os.path.isfile(os.path.join(class_path, name))
            )

            total = len(images)

            train_end = int(total * split_ratio[0])
            val_end = train_end + int(total * split_ratio[1])

            if split == "train":
                split_images = images[:train_end]
            elif split == "val":
                split_images = images[train_end:val_end]
            else:
                split_images = images[val_end:]

            label = class_to_idx[class_name]

            self.samples.extend(
                (os.path.join(class_path, image_name), label)
                for image_name in split_images
            )

    def __len__(self):
        """Return the number of indexed samples."""
        return len(self.samples)

    def __getitem__(self, index):
        """Return ``(image, label, image_path)`` for the sample at *index*."""
        image_path, label = self.samples[index]

        # convert() returns an independent image, so the file handle can be
        # closed right away instead of lingering until garbage collection.
        with Image.open(image_path) as img:
            image = img.convert("RGB")

        if self.transform:
            image = self.transform(image)

        return image, label, image_path
|
src/dataset/collate_caption.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
|
| 3 |
+
def collate_caption(batch):
    """Merge per-sample (images, tokens) pairs into two batch tensors.

    Every element of *batch* is unpacked as an ``(image, token)`` pair and
    the parts are concatenated along dimension 0.

    Returns:
        tuple: ``(images, tokens)``, the concatenated tensors.
    """
    pairs = [(sample_images, sample_tokens) for sample_images, sample_tokens in batch]

    merged_images = torch.cat([pair[0] for pair in pairs], dim=0)
    merged_tokens = torch.cat([pair[1] for pair in pairs], dim=0)

    return merged_images, merged_tokens
|
src/dataset/sub_tokenizer1000.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f24d6fdba624cc28650fc27f6ef8bd2002d3a3b652205a2e0e8af2aa21ea7be
|
| 3 |
+
size 254104
|
src/dataset/sub_tokenizer1000.vocab
ADDED
|
@@ -0,0 +1,1000 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<pad> 0
|
| 2 |
+
<sos> 0
|
| 3 |
+
<eos> 0
|
| 4 |
+
<unk> 0
|
| 5 |
+
▁a -1.98322
|
| 6 |
+
. -2.9407
|
| 7 |
+
s -3.42779
|
| 8 |
+
▁of -3.61005
|
| 9 |
+
▁on -3.6172
|
| 10 |
+
▁in -3.86704
|
| 11 |
+
▁the -3.87608
|
| 12 |
+
nd -3.93335
|
| 13 |
+
▁with -3.9459
|
| 14 |
+
ing -4.4554
|
| 15 |
+
▁is -4.4607
|
| 16 |
+
▁ -4.56396
|
| 17 |
+
▁to -4.59365
|
| 18 |
+
e -4.65398
|
| 19 |
+
▁man -4.85533
|
| 20 |
+
▁sitting -4.95263
|
| 21 |
+
, -5.03648
|
| 22 |
+
ed -5.04331
|
| 23 |
+
y -5.11805
|
| 24 |
+
▁an -5.13757
|
| 25 |
+
▁next -5.20493
|
| 26 |
+
▁two -5.22965
|
| 27 |
+
d -5.2952
|
| 28 |
+
▁white -5.36199
|
| 29 |
+
n -5.36722
|
| 30 |
+
a -5.37838
|
| 31 |
+
i -5.38911
|
| 32 |
+
▁are -5.39818
|
| 33 |
+
▁holding -5.40034
|
| 34 |
+
▁standing -5.40128
|
| 35 |
+
o -5.40533
|
| 36 |
+
▁table -5.46676
|
| 37 |
+
p -5.47577
|
| 38 |
+
t -5.48538
|
| 39 |
+
▁it -5.49508
|
| 40 |
+
r -5.51872
|
| 41 |
+
le -5.59946
|
| 42 |
+
▁plate -5.6039
|
| 43 |
+
▁woman -5.63334
|
| 44 |
+
▁that -5.6784
|
| 45 |
+
▁up -5.68785
|
| 46 |
+
▁at -5.71356
|
| 47 |
+
er -5.7243
|
| 48 |
+
▁top -5.76056
|
| 49 |
+
▁people -5.76236
|
| 50 |
+
▁some -5.79632
|
| 51 |
+
▁person -5.81136
|
| 52 |
+
m -5.8687
|
| 53 |
+
▁black -5.88285
|
| 54 |
+
▁large -5.88415
|
| 55 |
+
▁street -5.92398
|
| 56 |
+
es -5.93578
|
| 57 |
+
▁dog -5.93954
|
| 58 |
+
▁red -5.97105
|
| 59 |
+
▁s -6.00218
|
| 60 |
+
▁f -6.02117
|
| 61 |
+
▁his -6.02288
|
| 62 |
+
▁small -6.0281
|
| 63 |
+
▁near -6.04428
|
| 64 |
+
l -6.05039
|
| 65 |
+
u -6.10616
|
| 66 |
+
▁front -6.11685
|
| 67 |
+
g -6.13449
|
| 68 |
+
b -6.16149
|
| 69 |
+
▁sign -6.19078
|
| 70 |
+
f -6.22392
|
| 71 |
+
▁cat -6.23333
|
| 72 |
+
▁bear -6.2501
|
| 73 |
+
▁by -6.25248
|
| 74 |
+
▁group -6.27012
|
| 75 |
+
▁field -6.28847
|
| 76 |
+
▁blue -6.29729
|
| 77 |
+
▁has -6.30494
|
| 78 |
+
▁food -6.30817
|
| 79 |
+
▁down -6.34169
|
| 80 |
+
▁green -6.34463
|
| 81 |
+
ar -6.34581
|
| 82 |
+
▁young -6.36549
|
| 83 |
+
in -6.3685
|
| 84 |
+
▁tennis -6.38354
|
| 85 |
+
▁snow -6.3845
|
| 86 |
+
▁other -6.39216
|
| 87 |
+
▁close -6.39402
|
| 88 |
+
c -6.40273
|
| 89 |
+
▁whi -6.40908
|
| 90 |
+
k -6.41607
|
| 91 |
+
▁water -6.41957
|
| 92 |
+
▁grass -6.42378
|
| 93 |
+
▁there -6.42571
|
| 94 |
+
st -6.42804
|
| 95 |
+
al -6.42888
|
| 96 |
+
w -6.44482
|
| 97 |
+
h -6.46665
|
| 98 |
+
▁side -6.47196
|
| 99 |
+
ll -6.4794
|
| 100 |
+
▁train -6.51638
|
| 101 |
+
▁computer -6.52006
|
| 102 |
+
▁lay -6.53283
|
| 103 |
+
ch -6.54474
|
| 104 |
+
▁stop -6.56702
|
| 105 |
+
▁board -6.56877
|
| 106 |
+
▁for -6.56976
|
| 107 |
+
▁bus -6.57341
|
| 108 |
+
▁baseball -6.58612
|
| 109 |
+
▁phone -6.58992
|
| 110 |
+
▁parked -6.59899
|
| 111 |
+
▁walking -6.60393
|
| 112 |
+
▁her -6.614
|
| 113 |
+
▁b -6.6172
|
| 114 |
+
▁riding -6.62086
|
| 115 |
+
▁sits -6.62201
|
| 116 |
+
▁bowl -6.62631
|
| 117 |
+
▁clock -6.63455
|
| 118 |
+
▁parking -6.63607
|
| 119 |
+
▁kitchen -6.64843
|
| 120 |
+
▁light -6.65729
|
| 121 |
+
or -6.68298
|
| 122 |
+
▁wearing -6.69127
|
| 123 |
+
▁picture -6.69418
|
| 124 |
+
▁boy -6.69421
|
| 125 |
+
▁looking -6.69659
|
| 126 |
+
▁building -6.69858
|
| 127 |
+
en -6.71042
|
| 128 |
+
▁out -6.71159
|
| 129 |
+
▁girl -6.71339
|
| 130 |
+
▁vase -6.73144
|
| 131 |
+
▁fire -6.73145
|
| 132 |
+
▁hand -6.73375
|
| 133 |
+
▁pizza -6.7406
|
| 134 |
+
an -6.75261
|
| 135 |
+
' -6.75293
|
| 136 |
+
▁thre -6.76074
|
| 137 |
+
▁t -6.76468
|
| 138 |
+
▁couple -6.77331
|
| 139 |
+
at -6.7805
|
| 140 |
+
▁be -6.78194
|
| 141 |
+
▁yellow -6.78446
|
| 142 |
+
▁bathroom -6.78927
|
| 143 |
+
▁glass -6.79439
|
| 144 |
+
▁woode -6.79814
|
| 145 |
+
▁toilet -6.80878
|
| 146 |
+
▁meter -6.80878
|
| 147 |
+
▁from -6.82199
|
| 148 |
+
▁m -6.82553
|
| 149 |
+
▁several -6.82698
|
| 150 |
+
▁bat -6.82956
|
| 151 |
+
▁skateboard -6.83116
|
| 152 |
+
▁boat -6.84214
|
| 153 |
+
▁brown -6.84553
|
| 154 |
+
on -6.84954
|
| 155 |
+
▁vegetables -6.85327
|
| 156 |
+
▁hydrant -6.86097
|
| 157 |
+
▁over -6.87842
|
| 158 |
+
▁beach -6.88194
|
| 159 |
+
▁scissors -6.88724
|
| 160 |
+
▁sink -6.89616
|
| 161 |
+
▁their -6.89832
|
| 162 |
+
▁room -6.89974
|
| 163 |
+
▁filled -6.90334
|
| 164 |
+
el -6.9077
|
| 165 |
+
▁umbrella -6.93262
|
| 166 |
+
re -6.93408
|
| 167 |
+
v -6.93453
|
| 168 |
+
▁desk -6.93802
|
| 169 |
+
▁wine -6.94951
|
| 170 |
+
ly -6.95091
|
| 171 |
+
▁elephant -6.95316
|
| 172 |
+
▁horse -6.95723
|
| 173 |
+
▁motorcycle -6.95896
|
| 174 |
+
▁hot -6.96056
|
| 175 |
+
▁road -6.96278
|
| 176 |
+
▁open -6.96662
|
| 177 |
+
▁st -6.97948
|
| 178 |
+
▁pair -6.98018
|
| 179 |
+
ro -6.98102
|
| 180 |
+
▁d -6.98209
|
| 181 |
+
it -6.9932
|
| 182 |
+
▁tie -6.99493
|
| 183 |
+
▁bench -6.99587
|
| 184 |
+
ur -6.99703
|
| 185 |
+
▁g -7.0009
|
| 186 |
+
▁piece -7.0058
|
| 187 |
+
▁cake -7.01609
|
| 188 |
+
▁pa -7.02046
|
| 189 |
+
▁keyboard -7.02191
|
| 190 |
+
▁each -7.02191
|
| 191 |
+
▁thi -7.02415
|
| 192 |
+
▁wall -7.03008
|
| 193 |
+
▁sheep -7.03431
|
| 194 |
+
▁orange -7.03899
|
| 195 |
+
▁ball -7.03925
|
| 196 |
+
▁different -7.04036
|
| 197 |
+
▁frisbee -7.04036
|
| 198 |
+
▁counter -7.04036
|
| 199 |
+
▁flying -7.04119
|
| 200 |
+
▁flowers -7.04797
|
| 201 |
+
▁traffic -7.04866
|
| 202 |
+
▁giraffe -7.05405
|
| 203 |
+
▁laptop -7.05704
|
| 204 |
+
▁car -7.06113
|
| 205 |
+
▁tree -7.06714
|
| 206 |
+
▁eating -7.06818
|
| 207 |
+
▁teddy -7.07615
|
| 208 |
+
▁its -7.07656
|
| 209 |
+
▁bunch -7.07657
|
| 210 |
+
▁around -7.0783
|
| 211 |
+
▁mouse -7.08046
|
| 212 |
+
li -7.08325
|
| 213 |
+
▁covered -7.08612
|
| 214 |
+
il -7.09208
|
| 215 |
+
▁bike -7.10221
|
| 216 |
+
▁broccoli -7.10883
|
| 217 |
+
▁cell -7.11314
|
| 218 |
+
▁through -7.11997
|
| 219 |
+
▁inside -7.12178
|
| 220 |
+
la -7.12697
|
| 221 |
+
▁fruit -7.12824
|
| 222 |
+
▁one -7.13683
|
| 223 |
+
▁remote -7.14262
|
| 224 |
+
▁c -7.14674
|
| 225 |
+
▁zebra -7.14708
|
| 226 |
+
▁outside -7.15234
|
| 227 |
+
very -7.15784
|
| 228 |
+
▁little -7.15879
|
| 229 |
+
▁microwave -7.16345
|
| 230 |
+
▁truck -7.1658
|
| 231 |
+
▁area -7.16815
|
| 232 |
+
▁hold -7.16833
|
| 233 |
+
ting -7.16983
|
| 234 |
+
et -7.17782
|
| 235 |
+
▁city -7.18235
|
| 236 |
+
▁old -7.18386
|
| 237 |
+
ter -7.18462
|
| 238 |
+
▁lot -7.18947
|
| 239 |
+
▁pe -7.19085
|
| 240 |
+
▁back -7.19676
|
| 241 |
+
▁bed -7.19905
|
| 242 |
+
▁surfboard -7.19922
|
| 243 |
+
▁sandwich -7.20647
|
| 244 |
+
▁w -7.20741
|
| 245 |
+
▁together -7.21876
|
| 246 |
+
▁air -7.2197
|
| 247 |
+
▁co -7.22215
|
| 248 |
+
▁playing -7.22223
|
| 249 |
+
▁sit -7.24205
|
| 250 |
+
▁player -7.2433
|
| 251 |
+
▁book -7.25398
|
| 252 |
+
▁under -7.25907
|
| 253 |
+
▁head -7.25926
|
| 254 |
+
▁racket -7.25933
|
| 255 |
+
▁behind -7.26428
|
| 256 |
+
▁fence -7.26947
|
| 257 |
+
▁bananas -7.27123
|
| 258 |
+
pped -7.27309
|
| 259 |
+
▁la -7.27649
|
| 260 |
+
▁bird -7.27907
|
| 261 |
+
▁ground -7.27992
|
| 262 |
+
▁men -7.28063
|
| 263 |
+
▁po -7.28079
|
| 264 |
+
▁background -7.28255
|
| 265 |
+
▁snowboard -7.28561
|
| 266 |
+
▁glasses -7.2927
|
| 267 |
+
▁child -7.30655
|
| 268 |
+
ers -7.30667
|
| 269 |
+
▁do -7.31758
|
| 270 |
+
▁carrots -7.31859
|
| 271 |
+
▁display -7.31909
|
| 272 |
+
▁stuff -7.32014
|
| 273 |
+
▁bag -7.32028
|
| 274 |
+
ic -7.32392
|
| 275 |
+
ous -7.32735
|
| 276 |
+
ol -7.32786
|
| 277 |
+
▁cup -7.33429
|
| 278 |
+
th -7.33652
|
| 279 |
+
ck -7.33703
|
| 280 |
+
▁baby -7.34507
|
| 281 |
+
▁full -7.35071
|
| 282 |
+
▁cut -7.35128
|
| 283 |
+
▁stands -7.3625
|
| 284 |
+
▁into -7.37317
|
| 285 |
+
▁luggage -7.37353
|
| 286 |
+
id -7.37802
|
| 287 |
+
ion -7.38241
|
| 288 |
+
▁being -7.38631
|
| 289 |
+
▁oven -7.39099
|
| 290 |
+
▁re -7.39201
|
| 291 |
+
▁sh -7.39833
|
| 292 |
+
▁beside -7.40047
|
| 293 |
+
▁photo -7.40281
|
| 294 |
+
▁sidewalk -7.41477
|
| 295 |
+
ve -7.41626
|
| 296 |
+
▁shirt -7.42155
|
| 297 |
+
▁paper -7.42387
|
| 298 |
+
▁di -7.42522
|
| 299 |
+
▁floor -7.42687
|
| 300 |
+
▁window -7.42687
|
| 301 |
+
▁dirt -7.42994
|
| 302 |
+
▁bottle -7.43605
|
| 303 |
+
▁knife -7.43605
|
| 304 |
+
▁cutt -7.43621
|
| 305 |
+
▁pink -7.43913
|
| 306 |
+
▁ski -7.44157
|
| 307 |
+
ra -7.44446
|
| 308 |
+
z -7.44525
|
| 309 |
+
▁cow -7.44585
|
| 310 |
+
▁trees -7.45224
|
| 311 |
+
▁game -7.45465
|
| 312 |
+
▁bicycle -7.46093
|
| 313 |
+
▁wood -7.46571
|
| 314 |
+
▁look -7.47155
|
| 315 |
+
▁pole -7.47177
|
| 316 |
+
▁big -7.48325
|
| 317 |
+
▁skis -7.48476
|
| 318 |
+
way -7.48747
|
| 319 |
+
▁sp -7.49752
|
| 320 |
+
▁chair -7.50274
|
| 321 |
+
ct -7.50429
|
| 322 |
+
ut -7.50672
|
| 323 |
+
▁view -7.51264
|
| 324 |
+
▁kite -7.51347
|
| 325 |
+
▁pile -7.51817
|
| 326 |
+
▁suitcase -7.51929
|
| 327 |
+
▁face -7.52049
|
| 328 |
+
▁airplane -7.52599
|
| 329 |
+
▁walk -7.52786
|
| 330 |
+
▁oranges -7.53837
|
| 331 |
+
▁mouth -7.54637
|
| 332 |
+
▁toothbrush -7.54683
|
| 333 |
+
▁k -7.55424
|
| 334 |
+
ce -7.5561
|
| 335 |
+
▁image -7.55671
|
| 336 |
+
▁apples -7.55932
|
| 337 |
+
▁zebras -7.56039
|
| 338 |
+
▁n -7.56135
|
| 339 |
+
▁sky -7.56722
|
| 340 |
+
▁plane -7.57067
|
| 341 |
+
ti -7.574
|
| 342 |
+
▁box -7.5742
|
| 343 |
+
▁stand -7.57636
|
| 344 |
+
▁kites -7.5947
|
| 345 |
+
▁sa -7.59849
|
| 346 |
+
▁ta -7.60838
|
| 347 |
+
▁meat -7.62116
|
| 348 |
+
▁apple -7.62208
|
| 349 |
+
▁ca -7.62627
|
| 350 |
+
j -7.63227
|
| 351 |
+
▁another -7.63229
|
| 352 |
+
▁park -7.63289
|
| 353 |
+
▁refrigerator -7.65497
|
| 354 |
+
▁rock -7.65499
|
| 355 |
+
ent -7.66018
|
| 356 |
+
age -7.66522
|
| 357 |
+
▁vari -7.68129
|
| 358 |
+
▁cr -7.68348
|
| 359 |
+
▁he -7.68909
|
| 360 |
+
▁grassy -7.69146
|
| 361 |
+
▁ma -7.70065
|
| 362 |
+
▁four -7.70599
|
| 363 |
+
▁slice -7.7131
|
| 364 |
+
▁colorful -7.71405
|
| 365 |
+
▁bears -7.71567
|
| 366 |
+
▁line -7.72202
|
| 367 |
+
▁hat -7.72466
|
| 368 |
+
▁lo -7.72602
|
| 369 |
+
▁someone -7.72644
|
| 370 |
+
▁donuts -7.74847
|
| 371 |
+
▁tall -7.74884
|
| 372 |
+
▁ha -7.75025
|
| 373 |
+
▁women -7.75124
|
| 374 |
+
▁fork -7.75241
|
| 375 |
+
▁banana -7.75259
|
| 376 |
+
us -7.75503
|
| 377 |
+
▁cows -7.75741
|
| 378 |
+
▁cheese -7.7597
|
| 379 |
+
▁stove -7.75971
|
| 380 |
+
▁control -7.76392
|
| 381 |
+
▁h -7.76772
|
| 382 |
+
▁them -7.76867
|
| 383 |
+
um -7.76899
|
| 384 |
+
▁coffee -7.77252
|
| 385 |
+
▁ra -7.77705
|
| 386 |
+
▁station -7.77735
|
| 387 |
+
▁mirror -7.79426
|
| 388 |
+
▁along -7.79426
|
| 389 |
+
▁ready -7.79866
|
| 390 |
+
▁can -7.7987
|
| 391 |
+
▁we -7.80183
|
| 392 |
+
▁no -7.80262
|
| 393 |
+
▁com -7.80472
|
| 394 |
+
▁pan -7.80668
|
| 395 |
+
te -7.81235
|
| 396 |
+
▁herd -7.81529
|
| 397 |
+
▁pot -7.81626
|
| 398 |
+
▁tak -7.81771
|
| 399 |
+
▁camera -7.82099
|
| 400 |
+
▁dish -7.82169
|
| 401 |
+
▁pi -7.82775
|
| 402 |
+
▁hanging -7.83368
|
| 403 |
+
▁items -7.83463
|
| 404 |
+
▁off -7.83819
|
| 405 |
+
▁mo -7.84324
|
| 406 |
+
▁tray -7.84383
|
| 407 |
+
▁cook -7.84846
|
| 408 |
+
to -7.85812
|
| 409 |
+
▁above -7.86718
|
| 410 |
+
▁giraffes -7.87394
|
| 411 |
+
▁spoon -7.87669
|
| 412 |
+
▁middle -7.88628
|
| 413 |
+
▁long -7.88667
|
| 414 |
+
▁wii -7.88846
|
| 415 |
+
▁suit -7.89112
|
| 416 |
+
▁half -7.90085
|
| 417 |
+
▁elephants -7.90111
|
| 418 |
+
▁posing -7.90154
|
| 419 |
+
▁metal -7.90575
|
| 420 |
+
▁silver -7.90575
|
| 421 |
+
▁get -7.90787
|
| 422 |
+
▁ne -7.90954
|
| 423 |
+
▁past -7.91062
|
| 424 |
+
▁grazing -7.91067
|
| 425 |
+
un -7.91793
|
| 426 |
+
mp -7.92552
|
| 427 |
+
▁bar -7.92721
|
| 428 |
+
▁stick -7.93568
|
| 429 |
+
▁ru -7.94538
|
| 430 |
+
▁day -7.94592
|
| 431 |
+
▁sand -7.94793
|
| 432 |
+
▁smiling -7.95098
|
| 433 |
+
▁like -7.95098
|
| 434 |
+
gain -7.95197
|
| 435 |
+
▁mountain -7.95613
|
| 436 |
+
▁carry -7.95619
|
| 437 |
+
▁brush -7.96003
|
| 438 |
+
qu -7.96664
|
| 439 |
+
▁fruits -7.96821
|
| 440 |
+
▁de -7.96922
|
| 441 |
+
ot -7.97654
|
| 442 |
+
▁plant -7.97702
|
| 443 |
+
▁living -7.98231
|
| 444 |
+
ate -7.98572
|
| 445 |
+
▁court -7.98764
|
| 446 |
+
▁surf -7.9939
|
| 447 |
+
▁basket -7.99835
|
| 448 |
+
▁drink -8.00293
|
| 449 |
+
ive -8.0033
|
| 450 |
+
▁ho -8.00457
|
| 451 |
+
▁door -8.00632
|
| 452 |
+
▁hill -8.00921
|
| 453 |
+
▁animals -8.01801
|
| 454 |
+
side -8.02104
|
| 455 |
+
ies -8.02683
|
| 456 |
+
▁signs -8.03339
|
| 457 |
+
▁crowd -8.03682
|
| 458 |
+
▁hair -8.03684
|
| 459 |
+
▁pull -8.03822
|
| 460 |
+
▁cars -8.03923
|
| 461 |
+
x -8.05366
|
| 462 |
+
▁row -8.05382
|
| 463 |
+
ping -8.05595
|
| 464 |
+
ng -8.05924
|
| 465 |
+
▁airport -8.05948
|
| 466 |
+
▁across -8.05948
|
| 467 |
+
▁bread -8.05949
|
| 468 |
+
▁animal -8.06748
|
| 469 |
+
how -8.06865
|
| 470 |
+
▁lean -8.07115
|
| 471 |
+
▁swing -8.07118
|
| 472 |
+
▁plastic -8.07682
|
| 473 |
+
▁cross -8.07683
|
| 474 |
+
▁who -8.07684
|
| 475 |
+
▁couch -8.09448
|
| 476 |
+
▁trick -8.09464
|
| 477 |
+
▁tracks -8.11186
|
| 478 |
+
▁horses -8.1119
|
| 479 |
+
led -8.11552
|
| 480 |
+
▁donut -8.11644
|
| 481 |
+
▁screen -8.1185
|
| 482 |
+
▁set -8.11926
|
| 483 |
+
▁prepar -8.1246
|
| 484 |
+
▁ride -8.12466
|
| 485 |
+
▁bu -8.13179
|
| 486 |
+
up -8.13602
|
| 487 |
+
▁skate -8.13679
|
| 488 |
+
▁birds -8.1389
|
| 489 |
+
▁type -8.14312
|
| 490 |
+
▁store -8.14344
|
| 491 |
+
ment -8.14676
|
| 492 |
+
▁driving -8.14936
|
| 493 |
+
▁empty -8.14936
|
| 494 |
+
ard -8.15133
|
| 495 |
+
▁un -8.15811
|
| 496 |
+
lying -8.16103
|
| 497 |
+
ew -8.16374
|
| 498 |
+
▁pu -8.1642
|
| 499 |
+
▁chocolate -8.16835
|
| 500 |
+
▁enclosure -8.16835
|
| 501 |
+
▁color -8.16981
|
| 502 |
+
▁something -8.17484
|
| 503 |
+
▁hands -8.17591
|
| 504 |
+
▁pen -8.17919
|
| 505 |
+
▁market -8.1814
|
| 506 |
+
▁kid -8.18247
|
| 507 |
+
▁seat -8.19436
|
| 508 |
+
▁purple -8.20082
|
| 509 |
+
▁television -8.20082
|
| 510 |
+
▁using -8.20104
|
| 511 |
+
▁displayed -8.20337
|
| 512 |
+
▁snowboarder -8.20711
|
| 513 |
+
▁house -8.20748
|
| 514 |
+
ake -8.21209
|
| 515 |
+
▁includ -8.21411
|
| 516 |
+
▁slope -8.21411
|
| 517 |
+
▁video -8.21411
|
| 518 |
+
▁hit -8.21564
|
| 519 |
+
▁con -8.21607
|
| 520 |
+
▁or -8.21672
|
| 521 |
+
▁skier -8.21749
|
| 522 |
+
▁controller -8.22088
|
| 523 |
+
▁shown -8.22145
|
| 524 |
+
own -8.22178
|
| 525 |
+
▁container -8.22238
|
| 526 |
+
▁pro -8.23454
|
| 527 |
+
▁multi -8.24123
|
| 528 |
+
▁case -8.24262
|
| 529 |
+
▁tooth -8.24826
|
| 530 |
+
▁gra -8.25037
|
| 531 |
+
▁flower -8.25042
|
| 532 |
+
▁outdoor -8.25507
|
| 533 |
+
uring -8.2612
|
| 534 |
+
▁number -8.26206
|
| 535 |
+
▁jet -8.26208
|
| 536 |
+
co -8.26604
|
| 537 |
+
▁tv -8.26658
|
| 538 |
+
ish -8.27151
|
| 539 |
+
▁double -8.2762
|
| 540 |
+
▁hotdog -8.2762
|
| 541 |
+
▁monitor -8.2762
|
| 542 |
+
▁salad -8.27621
|
| 543 |
+
▁lady -8.27628
|
| 544 |
+
▁doughnuts -8.28027
|
| 545 |
+
▁attached -8.28334
|
| 546 |
+
▁contain -8.28887
|
| 547 |
+
▁teeth -8.29054
|
| 548 |
+
▁she -8.29109
|
| 549 |
+
▁jump -8.29778
|
| 550 |
+
▁resting -8.29782
|
| 551 |
+
▁guy -8.30513
|
| 552 |
+
▁made -8.30785
|
| 553 |
+
▁ocean -8.31244
|
| 554 |
+
▁work -8.31244
|
| 555 |
+
▁tower -8.31265
|
| 556 |
+
▁bun -8.31679
|
| 557 |
+
▁corner -8.31984
|
| 558 |
+
▁meal -8.31991
|
| 559 |
+
ling -8.32351
|
| 560 |
+
▁mak -8.32645
|
| 561 |
+
▁passenger -8.3273
|
| 562 |
+
▁take -8.33001
|
| 563 |
+
▁racquet -8.33482
|
| 564 |
+
▁pose -8.33792
|
| 565 |
+
▁backpack -8.35003
|
| 566 |
+
▁high -8.35773
|
| 567 |
+
▁surround -8.36548
|
| 568 |
+
▁office -8.36662
|
| 569 |
+
▁zoo -8.37329
|
| 570 |
+
▁brick -8.37353
|
| 571 |
+
ation -8.37577
|
| 572 |
+
▁neck -8.3822
|
| 573 |
+
light -8.38662
|
| 574 |
+
▁restaurant -8.3891
|
| 575 |
+
▁dry -8.3892
|
| 576 |
+
▁cellphone -8.38966
|
| 577 |
+
▁photograph -8.3971
|
| 578 |
+
▁fresh -8.3971
|
| 579 |
+
▁surface -8.3971
|
| 580 |
+
▁shelf -8.39711
|
| 581 |
+
ween -8.39718
|
| 582 |
+
▁jacket -8.40516
|
| 583 |
+
▁sauce -8.40517
|
| 584 |
+
▁wave -8.40517
|
| 585 |
+
▁adult -8.41329
|
| 586 |
+
▁statue -8.41329
|
| 587 |
+
▁sc -8.41573
|
| 588 |
+
▁waiting -8.41797
|
| 589 |
+
▁branch -8.42149
|
| 590 |
+
▁cabinet -8.42149
|
| 591 |
+
▁kind -8.42173
|
| 592 |
+
▁watching -8.42253
|
| 593 |
+
▁painted -8.42488
|
| 594 |
+
▁play -8.42802
|
| 595 |
+
▁post -8.42841
|
| 596 |
+
▁polar -8.42979
|
| 597 |
+
▁track -8.43055
|
| 598 |
+
▁cloth -8.43809
|
| 599 |
+
colored -8.44458
|
| 600 |
+
▁runway -8.4465
|
| 601 |
+
▁glove -8.45497
|
| 602 |
+
▁time -8.45497
|
| 603 |
+
▁clear -8.45499
|
| 604 |
+
▁showing -8.45502
|
| 605 |
+
▁sun -8.45814
|
| 606 |
+
top -8.45927
|
| 607 |
+
▁toothbrushes -8.46235
|
| 608 |
+
▁helmet -8.46351
|
| 609 |
+
▁sleeping -8.46351
|
| 610 |
+
▁chicken -8.46352
|
| 611 |
+
▁rice -8.46352
|
| 612 |
+
▁dark -8.46353
|
| 613 |
+
▁children -8.46355
|
| 614 |
+
▁but -8.46491
|
| 615 |
+
▁arm -8.4712
|
| 616 |
+
▁electronic -8.47214
|
| 617 |
+
▁body -8.47214
|
| 618 |
+
▁eaten -8.47334
|
| 619 |
+
▁dressed -8.4777
|
| 620 |
+
▁skiing -8.47879
|
| 621 |
+
▁bet -8.48197
|
| 622 |
+
▁going -8.48353
|
| 623 |
+
▁tile -8.48614
|
| 624 |
+
ight -8.48895
|
| 625 |
+
ead -8.503
|
| 626 |
+
▁use -8.50637
|
| 627 |
+
▁talking -8.50738
|
| 628 |
+
▁smile -8.51639
|
| 629 |
+
▁pack -8.51932
|
| 630 |
+
▁doughnut -8.52028
|
| 631 |
+
fri -8.52173
|
| 632 |
+
ant -8.52342
|
| 633 |
+
▁go -8.52519
|
| 634 |
+
▁bright -8.52549
|
| 635 |
+
▁grey -8.52551
|
| 636 |
+
▁dock -8.52768
|
| 637 |
+
▁sliced -8.52772
|
| 638 |
+
▁eat -8.53152
|
| 639 |
+
▁wire -8.53602
|
| 640 |
+
▁rail -8.53658
|
| 641 |
+
▁cart -8.54105
|
| 642 |
+
▁place -8.5424
|
| 643 |
+
▁about -8.54392
|
| 644 |
+
▁ex -8.54408
|
| 645 |
+
▁gray -8.552
|
| 646 |
+
▁left -8.55326
|
| 647 |
+
▁fries -8.55331
|
| 648 |
+
▁strip -8.5535
|
| 649 |
+
▁toaster -8.5552
|
| 650 |
+
▁carrot -8.55864
|
| 651 |
+
▁star -8.57154
|
| 652 |
+
▁intersection -8.57222
|
| 653 |
+
▁toward -8.58186
|
| 654 |
+
▁rack -8.5825
|
| 655 |
+
▁pick -8.58321
|
| 656 |
+
▁have -8.58323
|
| 657 |
+
▁drinking -8.59303
|
| 658 |
+
▁device -8.60135
|
| 659 |
+
▁soup -8.60137
|
| 660 |
+
▁tub -8.61128
|
| 661 |
+
▁brushing -8.61371
|
| 662 |
+
▁beer -8.61576
|
| 663 |
+
▁blanket -8.62125
|
| 664 |
+
▁shower -8.62126
|
| 665 |
+
▁rain -8.62579
|
| 666 |
+
▁skateboarder -8.62651
|
| 667 |
+
▁leaves -8.63135
|
| 668 |
+
▁trunk -8.63135
|
| 669 |
+
eep -8.63335
|
| 670 |
+
▁variet -8.63348
|
| 671 |
+
▁stra -8.63487
|
| 672 |
+
▁dress -8.63501
|
| 673 |
+
▁watch -8.65057
|
| 674 |
+
▁male -8.6518
|
| 675 |
+
▁potatoes -8.65186
|
| 676 |
+
▁cover -8.65653
|
| 677 |
+
▁wi -8.66218
|
| 678 |
+
▁toppings -8.6625
|
| 679 |
+
▁night -8.66297
|
| 680 |
+
ver -8.66416
|
| 681 |
+
▁home -8.67288
|
| 682 |
+
▁signal -8.68183
|
| 683 |
+
▁eye -8.68385
|
| 684 |
+
▁vehicle -8.69419
|
| 685 |
+
▁shop -8.69747
|
| 686 |
+
▁batter -8.69925
|
| 687 |
+
▁decker -8.70507
|
| 688 |
+
▁was -8.70587
|
| 689 |
+
▁closeup -8.70815
|
| 690 |
+
▁dessert -8.71605
|
| 691 |
+
▁graffiti -8.71605
|
| 692 |
+
▁older -8.72031
|
| 693 |
+
line -8.72433
|
| 694 |
+
▁decorated -8.72717
|
| 695 |
+
▁traveling -8.73226
|
| 696 |
+
▁uniform -8.7384
|
| 697 |
+
▁not -8.7476
|
| 698 |
+
▁chi -8.7524
|
| 699 |
+
▁stacked -8.75395
|
| 700 |
+
▁hug -8.76127
|
| 701 |
+
▁shaped -8.76411
|
| 702 |
+
phone -8.7734
|
| 703 |
+
▁rest -8.7802
|
| 704 |
+
▁river -8.78465
|
| 705 |
+
▁motor -8.78468
|
| 706 |
+
▁roll -8.78475
|
| 707 |
+
▁lit -8.7856
|
| 708 |
+
able -8.78876
|
| 709 |
+
▁egg -8.79656
|
| 710 |
+
ical -8.79692
|
| 711 |
+
placed -8.79851
|
| 712 |
+
▁shot -8.81013
|
| 713 |
+
▁setting -8.81027
|
| 714 |
+
▁cloud -8.8208
|
| 715 |
+
▁assortment -8.82108
|
| 716 |
+
lic -8.82399
|
| 717 |
+
▁served -8.83315
|
| 718 |
+
▁onions -8.83693
|
| 719 |
+
▁ice -8.85444
|
| 720 |
+
▁appliances -8.8583
|
| 721 |
+
▁giant -8.85831
|
| 722 |
+
lush -8.85832
|
| 723 |
+
▁underneath -8.85857
|
| 724 |
+
▁vegetable -8.86435
|
| 725 |
+
▁assort -8.87084
|
| 726 |
+
▁machine -8.87112
|
| 727 |
+
▁produc -8.87113
|
| 728 |
+
▁gear -8.87123
|
| 729 |
+
▁forest -8.8714
|
| 730 |
+
▁smart -8.88411
|
| 731 |
+
▁electric -8.88411
|
| 732 |
+
▁reading -8.88411
|
| 733 |
+
▁purse -8.88412
|
| 734 |
+
▁mitt -8.88414
|
| 735 |
+
▁moving -8.88414
|
| 736 |
+
▁seen -8.88435
|
| 737 |
+
▁van -8.88446
|
| 738 |
+
▁fish -8.88869
|
| 739 |
+
unny -8.89689
|
| 740 |
+
▁edge -8.8973
|
| 741 |
+
▁feet -8.89741
|
| 742 |
+
▁part -8.89746
|
| 743 |
+
▁surfer -8.90804
|
| 744 |
+
▁tea -8.91376
|
| 745 |
+
▁fridge -8.93781
|
| 746 |
+
▁perched -8.93781
|
| 747 |
+
▁running -8.93782
|
| 748 |
+
▁size -8.93785
|
| 749 |
+
▁yard -8.93796
|
| 750 |
+
▁mount -8.9517
|
| 751 |
+
▁female -8.9517
|
| 752 |
+
▁mustard -8.9517
|
| 753 |
+
▁curb -8.95172
|
| 754 |
+
▁turn -8.95174
|
| 755 |
+
▁him -8.95175
|
| 756 |
+
▁scene -8.95183
|
| 757 |
+
▁ramp -8.95295
|
| 758 |
+
▁log -8.95909
|
| 759 |
+
▁object -8.96579
|
| 760 |
+
▁draw -8.9658
|
| 761 |
+
▁drive -8.9658
|
| 762 |
+
▁cattle -8.96602
|
| 763 |
+
▁cla -8.9711
|
| 764 |
+
▁wheel -8.98007
|
| 765 |
+
▁towel -8.98022
|
| 766 |
+
▁point -8.9804
|
| 767 |
+
▁putt -8.98076
|
| 768 |
+
▁tomatoes -8.98212
|
| 769 |
+
▁beautiful -8.99457
|
| 770 |
+
▁mother -8.99457
|
| 771 |
+
▁among -8.99458
|
| 772 |
+
▁gold -8.99461
|
| 773 |
+
▁hang -8.99878
|
| 774 |
+
intend -9.00038
|
| 775 |
+
▁pasture -9.00931
|
| 776 |
+
▁cage -9.01343
|
| 777 |
+
▁bridge -9.0242
|
| 778 |
+
▁clean -9.02423
|
| 779 |
+
▁frost -9.02425
|
| 780 |
+
▁leg -9.0249
|
| 781 |
+
▁reflect -9.03935
|
| 782 |
+
▁sausage -9.03935
|
| 783 |
+
▁right -9.03942
|
| 784 |
+
▁stone -9.03967
|
| 785 |
+
▁candle -9.03972
|
| 786 |
+
▁wrappe -9.03975
|
| 787 |
+
▁single -9.04037
|
| 788 |
+
▁goat -9.04131
|
| 789 |
+
▁arranged -9.05473
|
| 790 |
+
▁writ -9.05476
|
| 791 |
+
▁blow -9.05488
|
| 792 |
+
▁desktop -9.05653
|
| 793 |
+
board -9.05754
|
| 794 |
+
▁equipment -9.07036
|
| 795 |
+
▁cream -9.07036
|
| 796 |
+
▁trail -9.07036
|
| 797 |
+
▁mug -9.0704
|
| 798 |
+
▁wild -9.0704
|
| 799 |
+
▁sub -9.07047
|
| 800 |
+
▁beans -9.07074
|
| 801 |
+
▁paint -9.08035
|
| 802 |
+
▁below -9.08623
|
| 803 |
+
▁featur -9.08623
|
| 804 |
+
▁pitch -9.08625
|
| 805 |
+
▁perform -9.10236
|
| 806 |
+
▁toothpaste -9.10236
|
| 807 |
+
▁held -9.10243
|
| 808 |
+
▁float -9.11875
|
| 809 |
+
▁broken -9.11875
|
| 810 |
+
▁glazed -9.11875
|
| 811 |
+
▁ketchup -9.11875
|
| 812 |
+
▁check -9.11876
|
| 813 |
+
▁dinner -9.11876
|
| 814 |
+
▁grill -9.11876
|
| 815 |
+
▁police -9.11877
|
| 816 |
+
▁pre -9.11932
|
| 817 |
+
▁design -9.13542
|
| 818 |
+
▁flock -9.13542
|
| 819 |
+
▁gather -9.13542
|
| 820 |
+
▁platform -9.13542
|
| 821 |
+
▁trash -9.13542
|
| 822 |
+
▁veggie -9.13542
|
| 823 |
+
▁short -9.13545
|
| 824 |
+
▁foot -9.13546
|
| 825 |
+
▁flip -9.14075
|
| 826 |
+
▁birthday -9.15237
|
| 827 |
+
▁pretty -9.15237
|
| 828 |
+
▁soda -9.15237
|
| 829 |
+
▁reach -9.15241
|
| 830 |
+
▁nice -9.15242
|
| 831 |
+
▁public -9.16961
|
| 832 |
+
▁round -9.16961
|
| 833 |
+
having -9.17397
|
| 834 |
+
▁alone -9.18719
|
| 835 |
+
▁bushes -9.18722
|
| 836 |
+
▁lie -9.19505
|
| 837 |
+
▁stack -9.19848
|
| 838 |
+
just -9.20508
|
| 839 |
+
▁lake -9.20519
|
| 840 |
+
▁stopped -9.20543
|
| 841 |
+
▁fac -9.23775
|
| 842 |
+
▁engine -9.24171
|
| 843 |
+
▁tarmac -9.24171
|
| 844 |
+
▁garden -9.24172
|
| 845 |
+
▁vin -9.2419
|
| 846 |
+
tuck -9.24618
|
| 847 |
+
▁toast -9.25758
|
| 848 |
+
▁flat -9.26074
|
| 849 |
+
▁try -9.26088
|
| 850 |
+
▁french -9.27981
|
| 851 |
+
▁beverage -9.27981
|
| 852 |
+
▁shore -9.27993
|
| 853 |
+
▁grow -9.28019
|
| 854 |
+
▁fall -9.28069
|
| 855 |
+
▁supplies -9.29942
|
| 856 |
+
▁farm -9.29955
|
| 857 |
+
▁mushroom -9.31942
|
| 858 |
+
▁mix -9.31942
|
| 859 |
+
▁chain -9.31942
|
| 860 |
+
▁throw -9.31942
|
| 861 |
+
▁good -9.31947
|
| 862 |
+
loaded -9.31949
|
| 863 |
+
▁steel -9.31954
|
| 864 |
+
▁wait -9.32815
|
| 865 |
+
▁shape -9.33477
|
| 866 |
+
▁advertis -9.33983
|
| 867 |
+
▁lunch -9.33983
|
| 868 |
+
▁modern -9.33983
|
| 869 |
+
▁square -9.33983
|
| 870 |
+
▁smo -9.34026
|
| 871 |
+
▁tomato -9.35768
|
| 872 |
+
▁enjoy -9.36066
|
| 873 |
+
▁shoulder -9.36066
|
| 874 |
+
▁lemon -9.36066
|
| 875 |
+
▁pastries -9.36067
|
| 876 |
+
▁milk -9.36068
|
| 877 |
+
▁lamb -9.36071
|
| 878 |
+
▁match -9.36071
|
| 879 |
+
▁chew -9.36072
|
| 880 |
+
▁rose -9.36084
|
| 881 |
+
▁style -9.36108
|
| 882 |
+
▁appear -9.38194
|
| 883 |
+
▁breakfast -9.38194
|
| 884 |
+
▁foreground -9.38194
|
| 885 |
+
▁napkin -9.38194
|
| 886 |
+
▁platter -9.38194
|
| 887 |
+
▁ripe -9.38194
|
| 888 |
+
▁strawberries -9.38194
|
| 889 |
+
▁sunglasses -9.38194
|
| 890 |
+
▁chopped -9.38194
|
| 891 |
+
▁desert -9.38194
|
| 892 |
+
▁blender -9.38194
|
| 893 |
+
▁leaf -9.38194
|
| 894 |
+
▁shade -9.38197
|
| 895 |
+
▁lawn -9.382
|
| 896 |
+
▁bri -9.39481
|
| 897 |
+
▁decoration -9.40368
|
| 898 |
+
▁pedestrian -9.40368
|
| 899 |
+
▁baked -9.4037
|
| 900 |
+
▁kneel -9.4259
|
| 901 |
+
▁bottom -9.4259
|
| 902 |
+
▁christmas -9.4259
|
| 903 |
+
▁country -9.4259
|
| 904 |
+
▁decorative -9.4259
|
| 905 |
+
▁scooter -9.4259
|
| 906 |
+
▁sculpture -9.4259
|
| 907 |
+
▁sprinkles -9.4259
|
| 908 |
+
▁things -9.42606
|
| 909 |
+
▁chips -9.42617
|
| 910 |
+
▁catcher -9.42642
|
| 911 |
+
ball -9.43836
|
| 912 |
+
▁serving -9.44863
|
| 913 |
+
▁cub -9.44867
|
| 914 |
+
▁horn -9.44874
|
| 915 |
+
▁bathtub -9.47188
|
| 916 |
+
▁concrete -9.47188
|
| 917 |
+
▁distance -9.47188
|
| 918 |
+
where -9.47193
|
| 919 |
+
▁balloon -9.47194
|
| 920 |
+
▁tri -9.47568
|
| 921 |
+
bow -9.50266
|
| 922 |
+
▁bacon -9.52008
|
| 923 |
+
▁bedroom -9.52008
|
| 924 |
+
▁carriage -9.52008
|
| 925 |
+
▁kitten -9.52008
|
| 926 |
+
▁stainless -9.52008
|
| 927 |
+
▁reads -9.52009
|
| 928 |
+
▁bite -9.52042
|
| 929 |
+
▁graze -9.52045
|
| 930 |
+
▁carrie -9.52051
|
| 931 |
+
▁juice -9.54508
|
| 932 |
+
▁lettuce -9.54508
|
| 933 |
+
▁partially -9.54508
|
| 934 |
+
▁swimming -9.54508
|
| 935 |
+
▁position -9.54508
|
| 936 |
+
▁carpet -9.54512
|
| 937 |
+
▁sort -9.54545
|
| 938 |
+
▁travel -9.55899
|
| 939 |
+
▁blurr -9.57072
|
| 940 |
+
▁pigeons -9.57072
|
| 941 |
+
▁what -9.57074
|
| 942 |
+
▁cement -9.57078
|
| 943 |
+
▁word -9.57079
|
| 944 |
+
▁same -9.57092
|
| 945 |
+
▁ship -9.58422
|
| 946 |
+
▁climb -9.59704
|
| 947 |
+
▁arrangement -9.59704
|
| 948 |
+
▁collecti -9.59704
|
| 949 |
+
▁shadow -9.59704
|
| 950 |
+
▁parade -9.59704
|
| 951 |
+
▁bucket -9.59704
|
| 952 |
+
▁lift -9.59704
|
| 953 |
+
▁center -9.59707
|
| 954 |
+
berry -9.5971
|
| 955 |
+
▁flag -9.5971
|
| 956 |
+
▁lead -9.59712
|
| 957 |
+
▁giv -9.61391
|
| 958 |
+
▁you -9.62387
|
| 959 |
+
▁family -9.62407
|
| 960 |
+
▁military -9.62407
|
| 961 |
+
▁picnic -9.62407
|
| 962 |
+
▁soccer -9.62407
|
| 963 |
+
▁pavement -9.62407
|
| 964 |
+
▁peanut -9.62407
|
| 965 |
+
▁space -9.62409
|
| 966 |
+
▁mark -9.65112
|
| 967 |
+
▁curtain -9.65184
|
| 968 |
+
▁himself -9.65184
|
| 969 |
+
▁railroad -9.65184
|
| 970 |
+
▁ledge -9.65184
|
| 971 |
+
ddler -9.65185
|
| 972 |
+
▁duck -9.65187
|
| 973 |
+
▁model -9.65188
|
| 974 |
+
rcial -9.65191
|
| 975 |
+
▁base -9.65213
|
| 976 |
+
arrow -9.65254
|
| 977 |
+
made -9.6699
|
| 978 |
+
▁propeller -9.68041
|
| 979 |
+
▁school -9.68041
|
| 980 |
+
▁puppy -9.68042
|
| 981 |
+
▁cupcake -9.68042
|
| 982 |
+
▁built -9.68043
|
| 983 |
+
▁block -9.70983
|
| 984 |
+
▁event -9.70983
|
| 985 |
+
▁spread -9.70983
|
| 986 |
+
▁winter -9.70985
|
| 987 |
+
▁sport -9.70997
|
| 988 |
+
▁antique -9.74013
|
| 989 |
+
▁pattern -9.74013
|
| 990 |
+
▁professional -9.74013
|
| 991 |
+
▁balanc -9.74013
|
| 992 |
+
▁consist -9.74013
|
| 993 |
+
▁spray -9.74016
|
| 994 |
+
ough -9.74019
|
| 995 |
+
▁figure -9.77138
|
| 996 |
+
▁furniture -9.77138
|
| 997 |
+
▁notebook -9.77138
|
| 998 |
+
▁parrot -9.77138
|
| 999 |
+
▁sofa -9.77138
|
| 1000 |
+
q -10.8674
|
src/dataset/sub_tokenizer1500.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:33febb00379c559196d197de44fa529e8e3cfe5853e9a778dee369f6f46aa4fe
|
| 3 |
+
size 262628
|
src/dataset/sub_tokenizer1500.vocab
ADDED
|
@@ -0,0 +1,1500 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<pad> 0
|
| 2 |
+
<sos> 0
|
| 3 |
+
<eos> 0
|
| 4 |
+
<unk> 0
|
| 5 |
+
▁a -1.93118
|
| 6 |
+
. -2.88169
|
| 7 |
+
▁of -3.55221
|
| 8 |
+
▁on -3.56188
|
| 9 |
+
s -3.62067
|
| 10 |
+
▁in -3.81444
|
| 11 |
+
▁the -3.81817
|
| 12 |
+
▁with -3.88689
|
| 13 |
+
nd -3.89226
|
| 14 |
+
▁is -4.40178
|
| 15 |
+
ing -4.52367
|
| 16 |
+
▁to -4.57599
|
| 17 |
+
▁ -4.76533
|
| 18 |
+
▁man -4.79839
|
| 19 |
+
▁sitting -4.89362
|
| 20 |
+
, -4.97747
|
| 21 |
+
▁an -5.06532
|
| 22 |
+
ed -5.14206
|
| 23 |
+
▁next -5.14591
|
| 24 |
+
e -5.16728
|
| 25 |
+
▁two -5.17064
|
| 26 |
+
▁white -5.30298
|
| 27 |
+
y -5.31383
|
| 28 |
+
▁are -5.32805
|
| 29 |
+
▁holding -5.34089
|
| 30 |
+
▁standing -5.34217
|
| 31 |
+
▁table -5.41202
|
| 32 |
+
▁it -5.45231
|
| 33 |
+
d -5.5279
|
| 34 |
+
▁plate -5.54488
|
| 35 |
+
▁woman -5.57433
|
| 36 |
+
▁at -5.59347
|
| 37 |
+
n -5.59711
|
| 38 |
+
▁that -5.61939
|
| 39 |
+
▁up -5.62864
|
| 40 |
+
▁top -5.69587
|
| 41 |
+
▁people -5.70335
|
| 42 |
+
▁some -5.73733
|
| 43 |
+
▁person -5.75235
|
| 44 |
+
▁black -5.82383
|
| 45 |
+
▁large -5.82513
|
| 46 |
+
▁street -5.86497
|
| 47 |
+
le -5.9101
|
| 48 |
+
▁red -5.91133
|
| 49 |
+
a -5.94493
|
| 50 |
+
▁his -5.96388
|
| 51 |
+
▁small -5.96909
|
| 52 |
+
t -5.98331
|
| 53 |
+
r -5.98939
|
| 54 |
+
▁dog -6.01085
|
| 55 |
+
▁near -6.01845
|
| 56 |
+
o -6.04639
|
| 57 |
+
▁front -6.05784
|
| 58 |
+
p -6.09572
|
| 59 |
+
▁sign -6.13869
|
| 60 |
+
er -6.16551
|
| 61 |
+
▁cat -6.17326
|
| 62 |
+
m -6.1807
|
| 63 |
+
▁by -6.19106
|
| 64 |
+
▁bear -6.19634
|
| 65 |
+
▁group -6.2111
|
| 66 |
+
▁field -6.22948
|
| 67 |
+
▁has -6.24171
|
| 68 |
+
▁food -6.24916
|
| 69 |
+
▁blue -6.25015
|
| 70 |
+
▁green -6.28562
|
| 71 |
+
▁down -6.295
|
| 72 |
+
▁young -6.30647
|
| 73 |
+
▁tennis -6.32453
|
| 74 |
+
▁snow -6.32548
|
| 75 |
+
▁other -6.33314
|
| 76 |
+
▁close -6.335
|
| 77 |
+
▁whi -6.35017
|
| 78 |
+
▁water -6.36056
|
| 79 |
+
▁there -6.36118
|
| 80 |
+
▁grass -6.36577
|
| 81 |
+
▁side -6.41189
|
| 82 |
+
▁train -6.45737
|
| 83 |
+
▁computer -6.46105
|
| 84 |
+
▁lay -6.47228
|
| 85 |
+
▁board -6.50962
|
| 86 |
+
▁baseball -6.52711
|
| 87 |
+
▁phone -6.53028
|
| 88 |
+
▁parked -6.53973
|
| 89 |
+
▁for -6.54057
|
| 90 |
+
▁walking -6.54314
|
| 91 |
+
b -6.54853
|
| 92 |
+
▁her -6.5544
|
| 93 |
+
g -6.55726
|
| 94 |
+
▁sits -6.55971
|
| 95 |
+
▁riding -6.56185
|
| 96 |
+
▁bowl -6.5673
|
| 97 |
+
i -6.56754
|
| 98 |
+
▁clock -6.57554
|
| 99 |
+
▁stop -6.57563
|
| 100 |
+
▁parking -6.57676
|
| 101 |
+
▁kitchen -6.58942
|
| 102 |
+
▁wearing -6.63272
|
| 103 |
+
▁picture -6.63517
|
| 104 |
+
▁boy -6.63583
|
| 105 |
+
▁looking -6.63721
|
| 106 |
+
▁bus -6.64201
|
| 107 |
+
▁girl -6.65437
|
| 108 |
+
▁vase -6.67243
|
| 109 |
+
▁pizza -6.68159
|
| 110 |
+
u -6.68776
|
| 111 |
+
es -6.68791
|
| 112 |
+
▁fire -6.68928
|
| 113 |
+
▁f -6.6895
|
| 114 |
+
' -6.69392
|
| 115 |
+
▁thre -6.70171
|
| 116 |
+
▁out -6.70226
|
| 117 |
+
▁woode -6.70691
|
| 118 |
+
▁couple -6.7143
|
| 119 |
+
▁yellow -6.72545
|
| 120 |
+
▁bathroom -6.73026
|
| 121 |
+
▁glass -6.73696
|
| 122 |
+
▁toilet -6.74976
|
| 123 |
+
al -6.75068
|
| 124 |
+
▁from -6.76297
|
| 125 |
+
▁several -6.76797
|
| 126 |
+
▁skateboard -6.77339
|
| 127 |
+
▁building -6.78257
|
| 128 |
+
▁brown -6.78652
|
| 129 |
+
▁vegetables -6.79345
|
| 130 |
+
▁light -6.79579
|
| 131 |
+
▁hydrant -6.80196
|
| 132 |
+
▁hand -6.80907
|
| 133 |
+
l -6.82187
|
| 134 |
+
▁beach -6.82292
|
| 135 |
+
▁scissors -6.82898
|
| 136 |
+
▁sink -6.83714
|
| 137 |
+
st -6.83938
|
| 138 |
+
▁room -6.84073
|
| 139 |
+
▁be -6.84238
|
| 140 |
+
▁their -6.84242
|
| 141 |
+
▁filled -6.84433
|
| 142 |
+
k -6.85956
|
| 143 |
+
▁over -6.86437
|
| 144 |
+
w -6.87453
|
| 145 |
+
▁desk -6.87899
|
| 146 |
+
in -6.88124
|
| 147 |
+
▁wine -6.89047
|
| 148 |
+
▁bat -6.89335
|
| 149 |
+
▁elephant -6.90038
|
| 150 |
+
▁hot -6.90048
|
| 151 |
+
▁road -6.90377
|
| 152 |
+
▁open -6.90761
|
| 153 |
+
▁meter -6.92037
|
| 154 |
+
▁pair -6.92121
|
| 155 |
+
▁tie -6.93415
|
| 156 |
+
▁bench -6.93685
|
| 157 |
+
▁horse -6.94242
|
| 158 |
+
▁one -6.95584
|
| 159 |
+
▁s -6.95612
|
| 160 |
+
▁cake -6.95696
|
| 161 |
+
▁thi -6.96189
|
| 162 |
+
▁each -6.9629
|
| 163 |
+
▁keyboard -6.96291
|
| 164 |
+
▁wall -6.97106
|
| 165 |
+
▁sheep -6.9752
|
| 166 |
+
▁ball -6.98006
|
| 167 |
+
▁counter -6.98135
|
| 168 |
+
▁different -6.98135
|
| 169 |
+
▁frisbee -6.98135
|
| 170 |
+
ch -6.98225
|
| 171 |
+
▁flying -6.98315
|
| 172 |
+
▁orange -6.98616
|
| 173 |
+
▁flowers -6.98714
|
| 174 |
+
▁traffic -6.98965
|
| 175 |
+
▁d -6.99022
|
| 176 |
+
c -6.99385
|
| 177 |
+
▁laptop -6.99806
|
| 178 |
+
▁giraffe -7.00126
|
| 179 |
+
▁eating -7.00898
|
| 180 |
+
en -7.0135
|
| 181 |
+
▁tree -7.01429
|
| 182 |
+
▁car -7.01644
|
| 183 |
+
▁teddy -7.01714
|
| 184 |
+
▁bunch -7.01741
|
| 185 |
+
▁around -7.01929
|
| 186 |
+
▁mouse -7.02144
|
| 187 |
+
▁covered -7.02696
|
| 188 |
+
▁its -7.04127
|
| 189 |
+
▁broccoli -7.04982
|
| 190 |
+
▁boat -7.05254
|
| 191 |
+
▁cell -7.05417
|
| 192 |
+
▁inside -7.05999
|
| 193 |
+
▁through -7.06096
|
| 194 |
+
▁fruit -7.07545
|
| 195 |
+
▁remote -7.08361
|
| 196 |
+
▁motorcycle -7.08501
|
| 197 |
+
▁outside -7.09315
|
| 198 |
+
on -7.09401
|
| 199 |
+
▁zebra -7.09424
|
| 200 |
+
▁area -7.0966
|
| 201 |
+
very -7.09841
|
| 202 |
+
▁little -7.09978
|
| 203 |
+
▁microwave -7.10444
|
| 204 |
+
▁truck -7.10678
|
| 205 |
+
ar -7.11999
|
| 206 |
+
▁city -7.12334
|
| 207 |
+
ly -7.12514
|
| 208 |
+
▁old -7.12601
|
| 209 |
+
▁lot -7.12874
|
| 210 |
+
v -7.13036
|
| 211 |
+
▁bed -7.13144
|
| 212 |
+
▁surfboard -7.14022
|
| 213 |
+
▁umbrella -7.15312
|
| 214 |
+
▁together -7.15975
|
| 215 |
+
▁playing -7.16295
|
| 216 |
+
or -7.17949
|
| 217 |
+
▁back -7.17972
|
| 218 |
+
▁player -7.18388
|
| 219 |
+
▁sit -7.18649
|
| 220 |
+
▁under -7.20005
|
| 221 |
+
▁racket -7.20024
|
| 222 |
+
h -7.20133
|
| 223 |
+
el -7.20277
|
| 224 |
+
▁behind -7.20527
|
| 225 |
+
▁bananas -7.20846
|
| 226 |
+
ting -7.21381
|
| 227 |
+
▁men -7.22024
|
| 228 |
+
▁ground -7.22091
|
| 229 |
+
▁background -7.22354
|
| 230 |
+
▁b -7.22428
|
| 231 |
+
▁bird -7.22629
|
| 232 |
+
▁snowboard -7.22802
|
| 233 |
+
▁bike -7.22918
|
| 234 |
+
▁glasses -7.23108
|
| 235 |
+
▁piece -7.23459
|
| 236 |
+
f -7.24452
|
| 237 |
+
▁child -7.24754
|
| 238 |
+
▁carrots -7.25782
|
| 239 |
+
▁air -7.25869
|
| 240 |
+
▁display -7.26076
|
| 241 |
+
▁stuff -7.26113
|
| 242 |
+
▁head -7.2612
|
| 243 |
+
▁sandwich -7.26248
|
| 244 |
+
▁cup -7.27509
|
| 245 |
+
pped -7.27733
|
| 246 |
+
▁baby -7.28606
|
| 247 |
+
▁full -7.2917
|
| 248 |
+
▁hold -7.2918
|
| 249 |
+
an -7.29489
|
| 250 |
+
▁stands -7.29853
|
| 251 |
+
ous -7.30242
|
| 252 |
+
▁luggage -7.31451
|
| 253 |
+
▁into -7.31494
|
| 254 |
+
▁being -7.32332
|
| 255 |
+
▁oven -7.33198
|
| 256 |
+
▁beside -7.3412
|
| 257 |
+
ll -7.3429
|
| 258 |
+
▁photo -7.3438
|
| 259 |
+
it -7.34923
|
| 260 |
+
▁sidewalk -7.35576
|
| 261 |
+
▁cutt -7.35913
|
| 262 |
+
▁shirt -7.36241
|
| 263 |
+
▁paper -7.36492
|
| 264 |
+
▁floor -7.36792
|
| 265 |
+
▁dirt -7.37091
|
| 266 |
+
▁knife -7.37703
|
| 267 |
+
▁g -7.37863
|
| 268 |
+
▁pink -7.38024
|
| 269 |
+
▁trees -7.38423
|
| 270 |
+
▁fence -7.38543
|
| 271 |
+
▁cow -7.3919
|
| 272 |
+
▁game -7.39564
|
| 273 |
+
▁bicycle -7.40192
|
| 274 |
+
▁window -7.40507
|
| 275 |
+
▁pole -7.41001
|
| 276 |
+
▁look -7.41334
|
| 277 |
+
▁skis -7.41535
|
| 278 |
+
re -7.42092
|
| 279 |
+
▁big -7.42447
|
| 280 |
+
▁t -7.42848
|
| 281 |
+
▁m -7.4504
|
| 282 |
+
▁pile -7.45319
|
| 283 |
+
▁view -7.45363
|
| 284 |
+
▁face -7.4605
|
| 285 |
+
▁kite -7.4606
|
| 286 |
+
il -7.46525
|
| 287 |
+
▁airplane -7.46698
|
| 288 |
+
▁oranges -7.46926
|
| 289 |
+
▁wood -7.47265
|
| 290 |
+
ro -7.47918
|
| 291 |
+
▁mouth -7.48736
|
| 292 |
+
▁do -7.48758
|
| 293 |
+
▁toothbrush -7.48941
|
| 294 |
+
▁zebras -7.49212
|
| 295 |
+
▁apples -7.49458
|
| 296 |
+
▁image -7.4977
|
| 297 |
+
▁plane -7.51166
|
| 298 |
+
ur -7.51727
|
| 299 |
+
▁stand -7.52431
|
| 300 |
+
ers -7.52482
|
| 301 |
+
▁kites -7.52906
|
| 302 |
+
▁sky -7.54382
|
| 303 |
+
▁ski -7.54406
|
| 304 |
+
▁meat -7.56227
|
| 305 |
+
▁cut -7.56842
|
| 306 |
+
▁apple -7.56932
|
| 307 |
+
▁another -7.57328
|
| 308 |
+
▁park -7.57555
|
| 309 |
+
▁refrigerator -7.59596
|
| 310 |
+
la -7.60542
|
| 311 |
+
▁vari -7.6226
|
| 312 |
+
▁st -7.62544
|
| 313 |
+
▁co -7.6269
|
| 314 |
+
▁grassy -7.62858
|
| 315 |
+
▁bears -7.62972
|
| 316 |
+
▁bag -7.63261
|
| 317 |
+
et -7.63472
|
| 318 |
+
▁four -7.64696
|
| 319 |
+
▁colorful -7.65507
|
| 320 |
+
▁hat -7.66174
|
| 321 |
+
▁someone -7.66735
|
| 322 |
+
▁them -7.66998
|
| 323 |
+
▁book -7.67032
|
| 324 |
+
▁donuts -7.68524
|
| 325 |
+
▁tall -7.68824
|
| 326 |
+
▁cows -7.68997
|
| 327 |
+
▁bottle -7.69113
|
| 328 |
+
▁fork -7.69134
|
| 329 |
+
▁women -7.69223
|
| 330 |
+
▁banana -7.69969
|
| 331 |
+
▁stove -7.70069
|
| 332 |
+
▁off -7.70246
|
| 333 |
+
▁n -7.70318
|
| 334 |
+
▁box -7.70362
|
| 335 |
+
▁control -7.70493
|
| 336 |
+
▁coffee -7.71351
|
| 337 |
+
▁station -7.71806
|
| 338 |
+
▁chair -7.72184
|
| 339 |
+
at -7.72473
|
| 340 |
+
id -7.73245
|
| 341 |
+
ter -7.73264
|
| 342 |
+
▁mirror -7.73525
|
| 343 |
+
▁along -7.73525
|
| 344 |
+
▁ready -7.73965
|
| 345 |
+
▁herd -7.75265
|
| 346 |
+
▁cr -7.75359
|
| 347 |
+
▁camera -7.76197
|
| 348 |
+
li -7.76254
|
| 349 |
+
▁suitcase -7.76274
|
| 350 |
+
▁w -7.7628
|
| 351 |
+
▁c -7.76326
|
| 352 |
+
ck -7.76521
|
| 353 |
+
to -7.76797
|
| 354 |
+
▁cheese -7.77104
|
| 355 |
+
▁hanging -7.77442
|
| 356 |
+
ic -7.77562
|
| 357 |
+
▁items -7.77855
|
| 358 |
+
▁la -7.78103
|
| 359 |
+
▁line -7.78226
|
| 360 |
+
▁tray -7.78482
|
| 361 |
+
▁giraffes -7.80095
|
| 362 |
+
▁above -7.80817
|
| 363 |
+
ent -7.80837
|
| 364 |
+
▁he -7.81318
|
| 365 |
+
▁spoon -7.81767
|
| 366 |
+
▁can -7.82528
|
| 367 |
+
▁elephants -7.82618
|
| 368 |
+
▁middle -7.82727
|
| 369 |
+
▁long -7.82739
|
| 370 |
+
▁wii -7.82816
|
| 371 |
+
ion -7.83194
|
| 372 |
+
▁suit -7.8321
|
| 373 |
+
▁re -7.83355
|
| 374 |
+
▁po -7.83624
|
| 375 |
+
▁half -7.84183
|
| 376 |
+
▁posing -7.84212
|
| 377 |
+
ra -7.84547
|
| 378 |
+
▁metal -7.84673
|
| 379 |
+
▁pa -7.84758
|
| 380 |
+
▁tak -7.84771
|
| 381 |
+
▁grazing -7.85166
|
| 382 |
+
▁get -7.85233
|
| 383 |
+
ve -7.85276
|
| 384 |
+
▁bar -7.86318
|
| 385 |
+
th -7.86829
|
| 386 |
+
▁sand -7.87105
|
| 387 |
+
way -7.87305
|
| 388 |
+
▁stick -7.87666
|
| 389 |
+
▁walk -7.8805
|
| 390 |
+
ut -7.88539
|
| 391 |
+
▁day -7.88687
|
| 392 |
+
▁smiling -7.89196
|
| 393 |
+
▁like -7.89197
|
| 394 |
+
gain -7.89252
|
| 395 |
+
▁fruits -7.89493
|
| 396 |
+
ce -7.89643
|
| 397 |
+
▁mountain -7.89712
|
| 398 |
+
▁carry -7.89715
|
| 399 |
+
▁sh -7.90562
|
| 400 |
+
▁living -7.9233
|
| 401 |
+
▁pe -7.92486
|
| 402 |
+
▁court -7.92862
|
| 403 |
+
▁all -7.92962
|
| 404 |
+
▁signs -7.93469
|
| 405 |
+
▁surf -7.93646
|
| 406 |
+
▁basket -7.93934
|
| 407 |
+
ol -7.94429
|
| 408 |
+
▁silver -7.94475
|
| 409 |
+
▁drink -7.94482
|
| 410 |
+
ct -7.94505
|
| 411 |
+
▁door -7.94597
|
| 412 |
+
▁animals -7.95318
|
| 413 |
+
▁ma -7.95935
|
| 414 |
+
▁cars -7.96219
|
| 415 |
+
▁hair -7.97801
|
| 416 |
+
▁pull -7.9781
|
| 417 |
+
▁pan -7.98197
|
| 418 |
+
▁dogs -7.98348
|
| 419 |
+
▁row -7.99477
|
| 420 |
+
▁h -7.99801
|
| 421 |
+
▁across -8.00047
|
| 422 |
+
▁airport -8.00047
|
| 423 |
+
▁bread -8.00047
|
| 424 |
+
▁lean -8.01205
|
| 425 |
+
▁animal -8.01461
|
| 426 |
+
▁plastic -8.01781
|
| 427 |
+
▁who -8.01782
|
| 428 |
+
te -8.02911
|
| 429 |
+
▁horses -8.03068
|
| 430 |
+
▁trick -8.03557
|
| 431 |
+
▁couch -8.03559
|
| 432 |
+
▁no -8.03695
|
| 433 |
+
▁com -8.03871
|
| 434 |
+
▁dish -8.04053
|
| 435 |
+
z -8.04088
|
| 436 |
+
▁tracks -8.04841
|
| 437 |
+
▁mo -8.05569
|
| 438 |
+
▁set -8.05572
|
| 439 |
+
▁ca -8.05817
|
| 440 |
+
▁screen -8.05949
|
| 441 |
+
▁slice -8.05954
|
| 442 |
+
▁donut -8.06354
|
| 443 |
+
▁rock -8.06384
|
| 444 |
+
▁birds -8.06533
|
| 445 |
+
▁hands -8.06726
|
| 446 |
+
▁ra -8.06906
|
| 447 |
+
▁skate -8.07777
|
| 448 |
+
▁store -8.0842
|
| 449 |
+
▁driving -8.09035
|
| 450 |
+
▁empty -8.09035
|
| 451 |
+
▁un -8.09227
|
| 452 |
+
lying -8.09266
|
| 453 |
+
▁pot -8.09951
|
| 454 |
+
up -8.1032
|
| 455 |
+
▁chocolate -8.10934
|
| 456 |
+
▁enclosure -8.10934
|
| 457 |
+
▁something -8.1158
|
| 458 |
+
▁market -8.12232
|
| 459 |
+
▁seat -8.14076
|
| 460 |
+
▁purple -8.14181
|
| 461 |
+
▁television -8.14181
|
| 462 |
+
▁using -8.14191
|
| 463 |
+
▁displayed -8.1427
|
| 464 |
+
▁snowboarder -8.1446
|
| 465 |
+
▁house -8.14844
|
| 466 |
+
▁skier -8.15227
|
| 467 |
+
▁slope -8.1551
|
| 468 |
+
▁video -8.1551
|
| 469 |
+
▁color -8.15566
|
| 470 |
+
▁hit -8.15587
|
| 471 |
+
▁controller -8.16183
|
| 472 |
+
▁container -8.16206
|
| 473 |
+
▁shown -8.16321
|
| 474 |
+
▁sp -8.16484
|
| 475 |
+
ies -8.16508
|
| 476 |
+
us -8.1671
|
| 477 |
+
▁hill -8.16834
|
| 478 |
+
▁di -8.18087
|
| 479 |
+
▁multi -8.18222
|
| 480 |
+
uring -8.1826
|
| 481 |
+
▁case -8.18262
|
| 482 |
+
▁tooth -8.18916
|
| 483 |
+
▁outdoor -8.19606
|
| 484 |
+
▁flower -8.1975
|
| 485 |
+
▁number -8.20305
|
| 486 |
+
▁tv -8.20468
|
| 487 |
+
▁brush -8.20488
|
| 488 |
+
ping -8.21358
|
| 489 |
+
▁doughnuts -8.21646
|
| 490 |
+
▁double -8.21719
|
| 491 |
+
▁hotdog -8.21719
|
| 492 |
+
▁monitor -8.21719
|
| 493 |
+
▁salad -8.21719
|
| 494 |
+
▁lady -8.21722
|
| 495 |
+
▁pen -8.22209
|
| 496 |
+
▁attached -8.22433
|
| 497 |
+
is -8.22731
|
| 498 |
+
▁boats -8.23018
|
| 499 |
+
um -8.23022
|
| 500 |
+
▁contain -8.23126
|
| 501 |
+
▁teeth -8.23161
|
| 502 |
+
▁k -8.23797
|
| 503 |
+
▁resting -8.23827
|
| 504 |
+
▁ha -8.24136
|
| 505 |
+
▁bun -8.24273
|
| 506 |
+
▁guy -8.24609
|
| 507 |
+
▁made -8.24831
|
| 508 |
+
ot -8.25082
|
| 509 |
+
▁de -8.25114
|
| 510 |
+
▁ocean -8.25342
|
| 511 |
+
▁tower -8.25351
|
| 512 |
+
ew -8.25769
|
| 513 |
+
▁meal -8.26085
|
| 514 |
+
▁corner -8.26138
|
| 515 |
+
▁passenger -8.26829
|
| 516 |
+
▁racquet -8.27581
|
| 517 |
+
▁crowd -8.27749
|
| 518 |
+
un -8.28344
|
| 519 |
+
▁umbrellas -8.28864
|
| 520 |
+
▁backpack -8.29102
|
| 521 |
+
▁high -8.29871
|
| 522 |
+
▁toy -8.30503
|
| 523 |
+
▁surround -8.30647
|
| 524 |
+
▁zoo -8.31428
|
| 525 |
+
▁brick -8.3144
|
| 526 |
+
▁lights -8.31599
|
| 527 |
+
colored -8.32935
|
| 528 |
+
▁restaurant -8.33009
|
| 529 |
+
▁cellphone -8.33042
|
| 530 |
+
ng -8.3362
|
| 531 |
+
▁pieces -8.33696
|
| 532 |
+
led -8.33796
|
| 533 |
+
▁fresh -8.33809
|
| 534 |
+
▁photograph -8.33809
|
| 535 |
+
▁shelf -8.33809
|
| 536 |
+
▁surface -8.33809
|
| 537 |
+
ween -8.33811
|
| 538 |
+
▁jacket -8.34615
|
| 539 |
+
▁sauce -8.34615
|
| 540 |
+
▁wave -8.34616
|
| 541 |
+
▁ride -8.35186
|
| 542 |
+
▁statue -8.35428
|
| 543 |
+
▁adult -8.35428
|
| 544 |
+
▁waiting -8.3586
|
| 545 |
+
▁watching -8.35954
|
| 546 |
+
▁plant -8.36065
|
| 547 |
+
▁branch -8.36248
|
| 548 |
+
▁jet -8.36291
|
| 549 |
+
▁post -8.36497
|
| 550 |
+
▁painted -8.3655
|
| 551 |
+
▁books -8.36582
|
| 552 |
+
ie -8.36742
|
| 553 |
+
▁past -8.37075
|
| 554 |
+
▁polar -8.37096
|
| 555 |
+
▁play -8.37126
|
| 556 |
+
▁so -8.37503
|
| 557 |
+
▁track -8.37767
|
| 558 |
+
▁runway -8.38762
|
| 559 |
+
side -8.38829
|
| 560 |
+
▁glove -8.39596
|
| 561 |
+
▁clear -8.39596
|
| 562 |
+
▁time -8.39597
|
| 563 |
+
▁toothbrushes -8.39938
|
| 564 |
+
▁showing -8.40062
|
| 565 |
+
▁chicken -8.4045
|
| 566 |
+
▁helmet -8.4045
|
| 567 |
+
▁dark -8.40451
|
| 568 |
+
▁children -8.40452
|
| 569 |
+
▁rice -8.40464
|
| 570 |
+
▁sleeping -8.4071
|
| 571 |
+
▁arm -8.40782
|
| 572 |
+
▁bet -8.40891
|
| 573 |
+
▁skiing -8.41273
|
| 574 |
+
▁electronic -8.41312
|
| 575 |
+
▁body -8.41314
|
| 576 |
+
▁eaten -8.41397
|
| 577 |
+
ation -8.41581
|
| 578 |
+
top -8.41755
|
| 579 |
+
▁dressed -8.4181
|
| 580 |
+
▁tile -8.42344
|
| 581 |
+
▁going -8.42404
|
| 582 |
+
▁prepar -8.43043
|
| 583 |
+
co -8.43223
|
| 584 |
+
ple -8.43542
|
| 585 |
+
if -8.43911
|
| 586 |
+
▁go -8.43936
|
| 587 |
+
▁talking -8.45039
|
| 588 |
+
ling -8.45079
|
| 589 |
+
ive -8.45333
|
| 590 |
+
▁or -8.46099
|
| 591 |
+
▁sliced -8.46327
|
| 592 |
+
▁bright -8.46647
|
| 593 |
+
▁grey -8.46648
|
| 594 |
+
▁doughnut -8.46741
|
| 595 |
+
▁dock -8.46803
|
| 596 |
+
▁lo -8.47278
|
| 597 |
+
▁wire -8.47628
|
| 598 |
+
▁rail -8.47653
|
| 599 |
+
▁eat -8.47835
|
| 600 |
+
▁sun -8.47862
|
| 601 |
+
▁types -8.47913
|
| 602 |
+
ig -8.48178
|
| 603 |
+
▁ex -8.48495
|
| 604 |
+
▁about -8.48501
|
| 605 |
+
▁gray -8.4893
|
| 606 |
+
▁left -8.49425
|
| 607 |
+
▁fries -8.49426
|
| 608 |
+
▁strip -8.49429
|
| 609 |
+
▁toaster -8.49558
|
| 610 |
+
age -8.49645
|
| 611 |
+
▁includ -8.50229
|
| 612 |
+
▁neck -8.5041
|
| 613 |
+
▁carrot -8.5057
|
| 614 |
+
▁star -8.50571
|
| 615 |
+
▁intersection -8.51321
|
| 616 |
+
▁ne -8.51381
|
| 617 |
+
mp -8.51572
|
| 618 |
+
▁toward -8.52283
|
| 619 |
+
▁rack -8.523
|
| 620 |
+
▁have -8.52335
|
| 621 |
+
ti -8.52431
|
| 622 |
+
▁drinking -8.53239
|
| 623 |
+
▁device -8.54233
|
| 624 |
+
▁soup -8.5427
|
| 625 |
+
▁beer -8.54704
|
| 626 |
+
▁brushing -8.54769
|
| 627 |
+
▁tub -8.55224
|
| 628 |
+
▁skateboarder -8.5601
|
| 629 |
+
▁blanket -8.56224
|
| 630 |
+
▁shower -8.56331
|
| 631 |
+
▁ho -8.56386
|
| 632 |
+
▁rain -8.56402
|
| 633 |
+
▁leaves -8.57234
|
| 634 |
+
▁trunk -8.57234
|
| 635 |
+
▁variet -8.57362
|
| 636 |
+
▁walks -8.57371
|
| 637 |
+
▁stra -8.57436
|
| 638 |
+
▁dress -8.57668
|
| 639 |
+
▁bags -8.57846
|
| 640 |
+
▁rocks -8.5855
|
| 641 |
+
▁male -8.58767
|
| 642 |
+
▁potatoes -8.59314
|
| 643 |
+
▁place -8.59391
|
| 644 |
+
▁take -8.597
|
| 645 |
+
▁cover -8.59788
|
| 646 |
+
▁gra -8.59976
|
| 647 |
+
▁swing -8.60294
|
| 648 |
+
x -8.60304
|
| 649 |
+
▁toppings -8.60342
|
| 650 |
+
▁night -8.60375
|
| 651 |
+
ight -8.60386
|
| 652 |
+
ian -8.60506
|
| 653 |
+
▁meters -8.60745
|
| 654 |
+
placed -8.61271
|
| 655 |
+
▁cross -8.6135
|
| 656 |
+
▁home -8.61381
|
| 657 |
+
ment -8.61483
|
| 658 |
+
▁cart -8.61607
|
| 659 |
+
▁signal -8.61711
|
| 660 |
+
▁bo -8.6237
|
| 661 |
+
▁shows -8.62811
|
| 662 |
+
▁kid -8.63083
|
| 663 |
+
▁vehicle -8.63518
|
| 664 |
+
▁shop -8.63771
|
| 665 |
+
▁batter -8.63776
|
| 666 |
+
▁office -8.64358
|
| 667 |
+
▁decker -8.64688
|
| 668 |
+
▁closeup -8.64917
|
| 669 |
+
▁sa -8.6532
|
| 670 |
+
▁she -8.6543
|
| 671 |
+
▁older -8.65575
|
| 672 |
+
ten -8.6566
|
| 673 |
+
▁dessert -8.65704
|
| 674 |
+
▁graffiti -8.65704
|
| 675 |
+
▁buildings -8.66063
|
| 676 |
+
▁decorated -8.66817
|
| 677 |
+
▁traveling -8.67289
|
| 678 |
+
▁uniform -8.67939
|
| 679 |
+
▁motorcycles -8.68378
|
| 680 |
+
▁not -8.68576
|
| 681 |
+
ke -8.68938
|
| 682 |
+
ca -8.68955
|
| 683 |
+
ish -8.69322
|
| 684 |
+
▁bottles -8.69376
|
| 685 |
+
▁stacked -8.6945
|
| 686 |
+
▁hug -8.70225
|
| 687 |
+
▁shaped -8.70448
|
| 688 |
+
ir -8.70499
|
| 689 |
+
ake -8.70533
|
| 690 |
+
▁work -8.71475
|
| 691 |
+
▁ru -8.71526
|
| 692 |
+
ate -8.71737
|
| 693 |
+
▁rest -8.71757
|
| 694 |
+
im -8.72285
|
| 695 |
+
▁se -8.72481
|
| 696 |
+
▁motor -8.72565
|
| 697 |
+
▁roll -8.72566
|
| 698 |
+
able -8.72568
|
| 699 |
+
▁lit -8.72596
|
| 700 |
+
▁river -8.72599
|
| 701 |
+
▁egg -8.73755
|
| 702 |
+
op -8.74301
|
| 703 |
+
ta -8.74331
|
| 704 |
+
▁cabinets -8.74989
|
| 705 |
+
▁shot -8.75016
|
| 706 |
+
▁setting -8.75117
|
| 707 |
+
▁dry -8.75973
|
| 708 |
+
▁cloud -8.76179
|
| 709 |
+
▁assortment -8.76196
|
| 710 |
+
ver -8.76214
|
| 711 |
+
▁slices -8.76601
|
| 712 |
+
▁cooking -8.77002
|
| 713 |
+
▁served -8.77589
|
| 714 |
+
▁onions -8.77609
|
| 715 |
+
▁way -8.78289
|
| 716 |
+
qu -8.78524
|
| 717 |
+
▁cute -8.78994
|
| 718 |
+
▁ice -8.7915
|
| 719 |
+
▁we -8.79685
|
| 720 |
+
▁cloth -8.7979
|
| 721 |
+
▁bu -8.79794
|
| 722 |
+
▁appliances -8.79929
|
| 723 |
+
lush -8.79929
|
| 724 |
+
▁giant -8.79929
|
| 725 |
+
▁underneath -8.79957
|
| 726 |
+
▁pu -8.80217
|
| 727 |
+
▁suitcases -8.80993
|
| 728 |
+
▁vegetable -8.81141
|
| 729 |
+
▁assort -8.81193
|
| 730 |
+
▁machine -8.81211
|
| 731 |
+
▁gear -8.81213
|
| 732 |
+
▁forest -8.81221
|
| 733 |
+
▁pack -8.813
|
| 734 |
+
ard -8.81466
|
| 735 |
+
▁mak -8.81679
|
| 736 |
+
▁jumping -8.82059
|
| 737 |
+
▁bikes -8.82337
|
| 738 |
+
▁electric -8.8251
|
| 739 |
+
▁purse -8.8251
|
| 740 |
+
▁reading -8.8251
|
| 741 |
+
▁smart -8.8251
|
| 742 |
+
▁mitt -8.82511
|
| 743 |
+
▁moving -8.82512
|
| 744 |
+
▁swinging -8.8256
|
| 745 |
+
▁fish -8.82654
|
| 746 |
+
▁seen -8.82973
|
| 747 |
+
▁edge -8.83827
|
| 748 |
+
▁feet -8.83828
|
| 749 |
+
unny -8.84257
|
| 750 |
+
▁surfer -8.84536
|
| 751 |
+
▁part -8.84568
|
| 752 |
+
ha -8.85162
|
| 753 |
+
▁tea -8.85311
|
| 754 |
+
▁ta -8.85313
|
| 755 |
+
ul -8.85659
|
| 756 |
+
▁pi -8.8584
|
| 757 |
+
▁con -8.86021
|
| 758 |
+
▁see -8.86344
|
| 759 |
+
▁chairs -8.86605
|
| 760 |
+
tra -8.86699
|
| 761 |
+
light -8.86717
|
| 762 |
+
ant -8.87635
|
| 763 |
+
ical -8.87771
|
| 764 |
+
▁fridge -8.8788
|
| 765 |
+
▁perched -8.8788
|
| 766 |
+
▁running -8.87883
|
| 767 |
+
▁yard -8.87885
|
| 768 |
+
▁din -8.87895
|
| 769 |
+
▁pro -8.88325
|
| 770 |
+
▁cooked -8.88379
|
| 771 |
+
▁but -8.88445
|
| 772 |
+
▁cap -8.88554
|
| 773 |
+
▁end -8.8862
|
| 774 |
+
▁hay -8.88776
|
| 775 |
+
▁mount -8.89269
|
| 776 |
+
▁female -8.89269
|
| 777 |
+
▁mustard -8.89269
|
| 778 |
+
▁curb -8.89269
|
| 779 |
+
▁turn -8.8927
|
| 780 |
+
▁scene -8.89274
|
| 781 |
+
▁him -8.89311
|
| 782 |
+
▁ramp -8.89321
|
| 783 |
+
▁log -8.89489
|
| 784 |
+
▁wi -8.90234
|
| 785 |
+
▁pie -8.90312
|
| 786 |
+
▁object -8.90678
|
| 787 |
+
▁draw -8.90678
|
| 788 |
+
▁drive -8.90678
|
| 789 |
+
▁cattle -8.90688
|
| 790 |
+
▁cla -8.91037
|
| 791 |
+
lic -8.91151
|
| 792 |
+
▁sale -8.91173
|
| 793 |
+
ead -8.91193
|
| 794 |
+
▁wheel -8.92106
|
| 795 |
+
▁towel -8.92112
|
| 796 |
+
▁point -8.92116
|
| 797 |
+
▁tomatoes -8.92201
|
| 798 |
+
▁size -8.92222
|
| 799 |
+
▁holder -8.92301
|
| 800 |
+
fri -8.92629
|
| 801 |
+
▁bath -8.92676
|
| 802 |
+
tage -8.92972
|
| 803 |
+
▁poses -8.93077
|
| 804 |
+
▁shoes -8.93101
|
| 805 |
+
per -8.93464
|
| 806 |
+
▁buses -8.93471
|
| 807 |
+
▁beautiful -8.93555
|
| 808 |
+
▁mother -8.93556
|
| 809 |
+
▁among -8.93556
|
| 810 |
+
▁gold -8.93557
|
| 811 |
+
▁hang -8.93983
|
| 812 |
+
intend -8.9417
|
| 813 |
+
▁pasture -8.95027
|
| 814 |
+
▁cage -8.9514
|
| 815 |
+
▁plants -8.95357
|
| 816 |
+
▁bridge -8.96519
|
| 817 |
+
▁clean -8.96519
|
| 818 |
+
▁frost -8.9652
|
| 819 |
+
▁leg -8.96545
|
| 820 |
+
▁smiles -8.96762
|
| 821 |
+
ff -8.96829
|
| 822 |
+
▁sc -8.96887
|
| 823 |
+
▁pick -8.97804
|
| 824 |
+
▁putt -8.97847
|
| 825 |
+
▁sausage -8.98034
|
| 826 |
+
▁right -8.98037
|
| 827 |
+
▁stone -8.98045
|
| 828 |
+
▁candle -8.98051
|
| 829 |
+
▁wrappe -8.98052
|
| 830 |
+
▁single -8.98062
|
| 831 |
+
▁crossing -8.98077
|
| 832 |
+
▁goat -8.98108
|
| 833 |
+
▁lap -8.98755
|
| 834 |
+
▁arranged -8.99572
|
| 835 |
+
▁writ -8.99573
|
| 836 |
+
▁blow -8.99574
|
| 837 |
+
▁van -8.99579
|
| 838 |
+
▁desktop -8.99754
|
| 839 |
+
▁war -8.99867
|
| 840 |
+
board -9.00012
|
| 841 |
+
▁handle -9.00077
|
| 842 |
+
ated -9.00415
|
| 843 |
+
▁cream -9.01135
|
| 844 |
+
▁equipment -9.01135
|
| 845 |
+
▁mug -9.01136
|
| 846 |
+
▁beans -9.01151
|
| 847 |
+
▁pose -9.01615
|
| 848 |
+
▁kinds -9.01994
|
| 849 |
+
▁paint -9.02154
|
| 850 |
+
▁below -9.02722
|
| 851 |
+
▁pitch -9.02722
|
| 852 |
+
▁fly -9.04102
|
| 853 |
+
▁cook -9.04289
|
| 854 |
+
▁toothpaste -9.04335
|
| 855 |
+
▁perform -9.04335
|
| 856 |
+
▁held -9.04337
|
| 857 |
+
eat -9.04504
|
| 858 |
+
▁fenced -9.0479
|
| 859 |
+
▁kids -9.05097
|
| 860 |
+
▁peppers -9.05207
|
| 861 |
+
▁make -9.05829
|
| 862 |
+
▁broken -9.05974
|
| 863 |
+
▁check -9.05974
|
| 864 |
+
▁glazed -9.05974
|
| 865 |
+
▁grill -9.05974
|
| 866 |
+
▁ketchup -9.05974
|
| 867 |
+
▁police -9.05975
|
| 868 |
+
▁dinner -9.05975
|
| 869 |
+
▁pre -9.05987
|
| 870 |
+
▁float -9.05998
|
| 871 |
+
lar -9.06452
|
| 872 |
+
▁jump -9.06548
|
| 873 |
+
▁coat -9.06712
|
| 874 |
+
▁flip -9.07263
|
| 875 |
+
▁eyes -9.07455
|
| 876 |
+
▁design -9.07641
|
| 877 |
+
▁platform -9.07641
|
| 878 |
+
▁trash -9.07641
|
| 879 |
+
▁veggie -9.07641
|
| 880 |
+
▁short -9.07642
|
| 881 |
+
▁foot -9.07642
|
| 882 |
+
▁gather -9.07647
|
| 883 |
+
▁flock -9.0766
|
| 884 |
+
less -9.07765
|
| 885 |
+
▁watch -9.08105
|
| 886 |
+
▁use -9.09076
|
| 887 |
+
▁birthday -9.09336
|
| 888 |
+
▁pretty -9.09336
|
| 889 |
+
▁reach -9.09337
|
| 890 |
+
▁nice -9.09337
|
| 891 |
+
▁soda -9.09338
|
| 892 |
+
▁five -9.09576
|
| 893 |
+
▁kind -9.10127
|
| 894 |
+
ial -9.10589
|
| 895 |
+
▁public -9.1106
|
| 896 |
+
▁round -9.1106
|
| 897 |
+
having -9.11272
|
| 898 |
+
▁alone -9.12815
|
| 899 |
+
▁bushes -9.12816
|
| 900 |
+
▁lie -9.13024
|
| 901 |
+
▁taken -9.13789
|
| 902 |
+
▁stack -9.14013
|
| 903 |
+
just -9.14601
|
| 904 |
+
▁stopped -9.14618
|
| 905 |
+
▁lake -9.14671
|
| 906 |
+
▁new -9.14915
|
| 907 |
+
▁sea -9.15017
|
| 908 |
+
out -9.15305
|
| 909 |
+
▁pet -9.15388
|
| 910 |
+
eep -9.16366
|
| 911 |
+
▁produce -9.16723
|
| 912 |
+
▁used -9.16863
|
| 913 |
+
tic -9.17655
|
| 914 |
+
▁fac -9.18216
|
| 915 |
+
▁engine -9.1827
|
| 916 |
+
▁tarmac -9.1827
|
| 917 |
+
▁garden -9.1827
|
| 918 |
+
▁wild -9.18272
|
| 919 |
+
▁vin -9.18277
|
| 920 |
+
tuck -9.18447
|
| 921 |
+
tro -9.18453
|
| 922 |
+
▁mid -9.18527
|
| 923 |
+
▁dishes -9.18707
|
| 924 |
+
▁toast -9.1992
|
| 925 |
+
▁flat -9.20159
|
| 926 |
+
▁try -9.20164
|
| 927 |
+
▁both -9.20226
|
| 928 |
+
▁jar -9.20273
|
| 929 |
+
own -9.20329
|
| 930 |
+
▁hole -9.20671
|
| 931 |
+
▁wet -9.20733
|
| 932 |
+
▁feeding -9.20948
|
| 933 |
+
▁type -9.2135
|
| 934 |
+
j -9.21763
|
| 935 |
+
▁french -9.2208
|
| 936 |
+
▁beverage -9.2208
|
| 937 |
+
▁shore -9.22082
|
| 938 |
+
▁grow -9.22087
|
| 939 |
+
▁fall -9.22102
|
| 940 |
+
▁put -9.22688
|
| 941 |
+
▁show -9.23742
|
| 942 |
+
▁supplies -9.24041
|
| 943 |
+
▁farm -9.24043
|
| 944 |
+
▁pool -9.24189
|
| 945 |
+
▁stoplight -9.24247
|
| 946 |
+
▁working -9.2589
|
| 947 |
+
▁mix -9.2604
|
| 948 |
+
▁mushroom -9.26041
|
| 949 |
+
▁chain -9.26041
|
| 950 |
+
▁throw -9.26041
|
| 951 |
+
loaded -9.26042
|
| 952 |
+
▁good -9.26043
|
| 953 |
+
▁steel -9.26043
|
| 954 |
+
▁prepare -9.26078
|
| 955 |
+
▁brushes -9.2644
|
| 956 |
+
▁wait -9.27001
|
| 957 |
+
▁lin -9.27581
|
| 958 |
+
▁shape -9.27684
|
| 959 |
+
▁lunch -9.28082
|
| 960 |
+
▁modern -9.28082
|
| 961 |
+
▁square -9.28082
|
| 962 |
+
▁smo -9.28088
|
| 963 |
+
▁ear -9.28125
|
| 964 |
+
▁wash -9.28147
|
| 965 |
+
▁boxes -9.28756
|
| 966 |
+
phone -9.29231
|
| 967 |
+
▁pad -9.29713
|
| 968 |
+
ock -9.29772
|
| 969 |
+
▁tomato -9.30026
|
| 970 |
+
▁enjoy -9.30165
|
| 971 |
+
▁shoulder -9.30165
|
| 972 |
+
▁lemon -9.30165
|
| 973 |
+
▁pastries -9.30165
|
| 974 |
+
▁milk -9.30165
|
| 975 |
+
▁match -9.30166
|
| 976 |
+
▁lamb -9.30166
|
| 977 |
+
▁chew -9.30167
|
| 978 |
+
▁rose -9.30169
|
| 979 |
+
▁style -9.30179
|
| 980 |
+
▁well -9.30274
|
| 981 |
+
▁appear -9.32293
|
| 982 |
+
▁breakfast -9.32293
|
| 983 |
+
▁desert -9.32293
|
| 984 |
+
▁foreground -9.32293
|
| 985 |
+
▁napkin -9.32293
|
| 986 |
+
▁platter -9.32293
|
| 987 |
+
▁strawberries -9.32293
|
| 988 |
+
▁sunglasses -9.32293
|
| 989 |
+
▁blender -9.32293
|
| 990 |
+
▁leaf -9.32293
|
| 991 |
+
▁chopped -9.32293
|
| 992 |
+
▁shade -9.32294
|
| 993 |
+
▁lawn -9.32294
|
| 994 |
+
▁ripe -9.32297
|
| 995 |
+
▁sail -9.32409
|
| 996 |
+
▁doll -9.32734
|
| 997 |
+
▁gate -9.33599
|
| 998 |
+
▁crowded -9.3398
|
| 999 |
+
▁pedestrian -9.34466
|
| 1000 |
+
▁baked -9.34467
|
| 1001 |
+
▁decoration -9.34474
|
| 1002 |
+
▁mess -9.34492
|
| 1003 |
+
▁pasta -9.3516
|
| 1004 |
+
line -9.3603
|
| 1005 |
+
▁bottom -9.36689
|
| 1006 |
+
▁christmas -9.36689
|
| 1007 |
+
▁country -9.36689
|
| 1008 |
+
▁decorative -9.36689
|
| 1009 |
+
▁kneel -9.36689
|
| 1010 |
+
▁scooter -9.36689
|
| 1011 |
+
▁sculpture -9.36689
|
| 1012 |
+
▁sprinkles -9.36689
|
| 1013 |
+
▁chips -9.36694
|
| 1014 |
+
▁things -9.36697
|
| 1015 |
+
▁catcher -9.36718
|
| 1016 |
+
▁butter -9.36763
|
| 1017 |
+
▁sandwiches -9.37834
|
| 1018 |
+
▁potte -9.3786
|
| 1019 |
+
uch -9.37978
|
| 1020 |
+
ball -9.38093
|
| 1021 |
+
▁smile -9.38591
|
| 1022 |
+
▁serving -9.38961
|
| 1023 |
+
▁horn -9.38965
|
| 1024 |
+
▁says -9.3899
|
| 1025 |
+
▁cub -9.39066
|
| 1026 |
+
▁includes -9.393
|
| 1027 |
+
▁dryer -9.39362
|
| 1028 |
+
▁skies -9.39838
|
| 1029 |
+
▁was -9.41263
|
| 1030 |
+
▁bathtub -9.41287
|
| 1031 |
+
▁concrete -9.41287
|
| 1032 |
+
▁distance -9.41287
|
| 1033 |
+
where -9.41288
|
| 1034 |
+
▁balloon -9.41289
|
| 1035 |
+
▁nearby -9.41308
|
| 1036 |
+
▁spot -9.41374
|
| 1037 |
+
▁lamp -9.41404
|
| 1038 |
+
▁tri -9.41457
|
| 1039 |
+
▁path -9.41472
|
| 1040 |
+
▁chi -9.42199
|
| 1041 |
+
▁par -9.42369
|
| 1042 |
+
▁ri -9.42577
|
| 1043 |
+
air -9.43144
|
| 1044 |
+
▁nose -9.43838
|
| 1045 |
+
ular -9.44005
|
| 1046 |
+
▁em -9.44072
|
| 1047 |
+
▁run -9.44797
|
| 1048 |
+
▁gl -9.45232
|
| 1049 |
+
▁bacon -9.46107
|
| 1050 |
+
▁bedroom -9.46107
|
| 1051 |
+
▁carriage -9.46107
|
| 1052 |
+
▁kitten -9.46107
|
| 1053 |
+
▁stainless -9.46107
|
| 1054 |
+
▁reads -9.46107
|
| 1055 |
+
▁graze -9.46125
|
| 1056 |
+
▁carrie -9.46131
|
| 1057 |
+
▁bull -9.46233
|
| 1058 |
+
▁race -9.46325
|
| 1059 |
+
▁clothes -9.46378
|
| 1060 |
+
▁low -9.46384
|
| 1061 |
+
▁rider -9.46863
|
| 1062 |
+
▁bite -9.4693
|
| 1063 |
+
▁juice -9.48607
|
| 1064 |
+
▁lettuce -9.48607
|
| 1065 |
+
▁partially -9.48607
|
| 1066 |
+
▁position -9.48607
|
| 1067 |
+
▁swimming -9.48607
|
| 1068 |
+
▁carpet -9.48608
|
| 1069 |
+
▁sort -9.48618
|
| 1070 |
+
▁plain -9.48756
|
| 1071 |
+
▁paw -9.49126
|
| 1072 |
+
▁travel -9.50079
|
| 1073 |
+
▁cabinet -9.51108
|
| 1074 |
+
▁blurr -9.51171
|
| 1075 |
+
▁fashion -9.51171
|
| 1076 |
+
▁pigeons -9.51171
|
| 1077 |
+
▁what -9.51171
|
| 1078 |
+
▁cement -9.51173
|
| 1079 |
+
▁word -9.51173
|
| 1080 |
+
▁same -9.51174
|
| 1081 |
+
▁reflection -9.51199
|
| 1082 |
+
▁after -9.51234
|
| 1083 |
+
▁tiny -9.51597
|
| 1084 |
+
▁pin -9.51823
|
| 1085 |
+
▁ship -9.51836
|
| 1086 |
+
▁feed -9.52763
|
| 1087 |
+
▁arrangement -9.53803
|
| 1088 |
+
▁bucket -9.53803
|
| 1089 |
+
▁climb -9.53803
|
| 1090 |
+
▁collecti -9.53803
|
| 1091 |
+
▁shadow -9.53803
|
| 1092 |
+
▁lift -9.53803
|
| 1093 |
+
▁parade -9.53803
|
| 1094 |
+
▁center -9.53804
|
| 1095 |
+
▁flag -9.53804
|
| 1096 |
+
berry -9.53804
|
| 1097 |
+
▁lead -9.53805
|
| 1098 |
+
▁pears -9.53813
|
| 1099 |
+
▁sheet -9.5393
|
| 1100 |
+
▁tape -9.54224
|
| 1101 |
+
▁pickle -9.5429
|
| 1102 |
+
▁giv -9.54341
|
| 1103 |
+
▁bri -9.55714
|
| 1104 |
+
▁you -9.56488
|
| 1105 |
+
▁cluttered -9.56505
|
| 1106 |
+
▁family -9.56505
|
| 1107 |
+
▁military -9.56505
|
| 1108 |
+
▁pavement -9.56505
|
| 1109 |
+
▁picnic -9.56505
|
| 1110 |
+
▁soccer -9.56505
|
| 1111 |
+
▁peanut -9.56505
|
| 1112 |
+
▁space -9.56506
|
| 1113 |
+
▁pastry -9.56509
|
| 1114 |
+
▁lone -9.56512
|
| 1115 |
+
▁finger -9.56707
|
| 1116 |
+
▁watches -9.56732
|
| 1117 |
+
most -9.56742
|
| 1118 |
+
bow -9.57068
|
| 1119 |
+
▁officer -9.57132
|
| 1120 |
+
ful -9.57246
|
| 1121 |
+
▁serve -9.58911
|
| 1122 |
+
▁mark -9.59236
|
| 1123 |
+
▁deck -9.59241
|
| 1124 |
+
▁curtain -9.59283
|
| 1125 |
+
▁himself -9.59283
|
| 1126 |
+
▁ledge -9.59283
|
| 1127 |
+
▁railroad -9.59283
|
| 1128 |
+
���duck -9.59284
|
| 1129 |
+
▁base -9.59284
|
| 1130 |
+
▁model -9.59285
|
| 1131 |
+
rcial -9.59286
|
| 1132 |
+
▁comme -9.59289
|
| 1133 |
+
arrow -9.59302
|
| 1134 |
+
▁hillside -9.59378
|
| 1135 |
+
▁tools -9.5943
|
| 1136 |
+
▁flor -9.59538
|
| 1137 |
+
ci -9.6108
|
| 1138 |
+
made -9.6127
|
| 1139 |
+
ddler -9.61507
|
| 1140 |
+
fore -9.61889
|
| 1141 |
+
▁puppy -9.6214
|
| 1142 |
+
▁school -9.6214
|
| 1143 |
+
▁propeller -9.6214
|
| 1144 |
+
▁cupcake -9.6214
|
| 1145 |
+
▁built -9.62141
|
| 1146 |
+
▁mini -9.62144
|
| 1147 |
+
▁step -9.62151
|
| 1148 |
+
▁string -9.6216
|
| 1149 |
+
▁panda -9.62177
|
| 1150 |
+
▁port -9.62248
|
| 1151 |
+
▁pipe -9.62495
|
| 1152 |
+
▁qui -9.62497
|
| 1153 |
+
▁spi -9.64844
|
| 1154 |
+
▁event -9.65082
|
| 1155 |
+
▁block -9.65082
|
| 1156 |
+
▁spread -9.65082
|
| 1157 |
+
▁winter -9.65082
|
| 1158 |
+
▁flies -9.65083
|
| 1159 |
+
▁still -9.65083
|
| 1160 |
+
▁sport -9.65085
|
| 1161 |
+
▁fried -9.65092
|
| 1162 |
+
▁direction -9.65107
|
| 1163 |
+
cal -9.65183
|
| 1164 |
+
▁landing -9.65229
|
| 1165 |
+
▁trailer -9.65376
|
| 1166 |
+
▁eye -9.65481
|
| 1167 |
+
▁bit -9.6791
|
| 1168 |
+
▁sub -9.68077
|
| 1169 |
+
▁balanc -9.68112
|
| 1170 |
+
▁pattern -9.68112
|
| 1171 |
+
▁professional -9.68112
|
| 1172 |
+
▁consist -9.68112
|
| 1173 |
+
▁grapes -9.68113
|
| 1174 |
+
▁spray -9.68113
|
| 1175 |
+
▁antique -9.68113
|
| 1176 |
+
ough -9.68114
|
| 1177 |
+
▁stall -9.68115
|
| 1178 |
+
▁package -9.68161
|
| 1179 |
+
▁corn -9.68243
|
| 1180 |
+
▁town -9.6827
|
| 1181 |
+
▁tag -9.68465
|
| 1182 |
+
▁tin -9.68496
|
| 1183 |
+
▁figure -9.71237
|
| 1184 |
+
▁furniture -9.71237
|
| 1185 |
+
▁notebook -9.71237
|
| 1186 |
+
▁lime -9.71237
|
| 1187 |
+
▁parrot -9.71237
|
| 1188 |
+
▁sofa -9.71237
|
| 1189 |
+
▁outfit -9.71238
|
| 1190 |
+
▁power -9.71239
|
| 1191 |
+
▁disc -9.71248
|
| 1192 |
+
▁fry -9.71262
|
| 1193 |
+
▁wide -9.71277
|
| 1194 |
+
▁chili -9.71294
|
| 1195 |
+
▁hard -9.71328
|
| 1196 |
+
▁features -9.71373
|
| 1197 |
+
▁tail -9.71384
|
| 1198 |
+
▁featur -9.74332
|
| 1199 |
+
▁contents -9.74463
|
| 1200 |
+
▁delicious -9.74463
|
| 1201 |
+
▁expired -9.74463
|
| 1202 |
+
▁guitar -9.74463
|
| 1203 |
+
▁leash -9.74463
|
| 1204 |
+
▁snack -9.74463
|
| 1205 |
+
▁steak -9.74463
|
| 1206 |
+
▁name -9.74463
|
| 1207 |
+
▁push -9.74468
|
| 1208 |
+
▁touching -9.74479
|
| 1209 |
+
▁subway -9.74504
|
| 1210 |
+
▁stir -9.74607
|
| 1211 |
+
▁wear -9.76934
|
| 1212 |
+
▁trail -9.77464
|
| 1213 |
+
neath -9.7773
|
| 1214 |
+
▁berries -9.77796
|
| 1215 |
+
▁ceramic -9.77796
|
| 1216 |
+
▁condiments -9.77796
|
| 1217 |
+
▁fabric -9.77796
|
| 1218 |
+
▁fancy -9.77796
|
| 1219 |
+
▁blond -9.77796
|
| 1220 |
+
▁stairs -9.77798
|
| 1221 |
+
▁pants -9.77835
|
| 1222 |
+
uzz -9.77851
|
| 1223 |
+
▁tee -9.79758
|
| 1224 |
+
▁craft -9.81244
|
| 1225 |
+
▁visible -9.81244
|
| 1226 |
+
▁chop -9.81246
|
| 1227 |
+
▁rope -9.81249
|
| 1228 |
+
▁beef -9.81252
|
| 1229 |
+
▁key -9.81259
|
| 1230 |
+
▁rais -9.8143
|
| 1231 |
+
▁sleep -9.83725
|
| 1232 |
+
bby -9.84671
|
| 1233 |
+
▁bouquet -9.84816
|
| 1234 |
+
▁museum -9.84816
|
| 1235 |
+
▁restroom -9.84816
|
| 1236 |
+
▁shelves -9.84816
|
| 1237 |
+
▁advertisement -9.84816
|
| 1238 |
+
▁flown -9.84816
|
| 1239 |
+
▁tank -9.84818
|
| 1240 |
+
▁vest -9.84824
|
| 1241 |
+
ature -9.84836
|
| 1242 |
+
soft -9.84883
|
| 1243 |
+
▁icing -9.84891
|
| 1244 |
+
▁flo -9.88084
|
| 1245 |
+
band -9.8828
|
| 1246 |
+
▁reflect -9.88479
|
| 1247 |
+
▁amount -9.88519
|
| 1248 |
+
▁owl -9.88519
|
| 1249 |
+
▁steam -9.88519
|
| 1250 |
+
▁tongue -9.88519
|
| 1251 |
+
▁business -9.88519
|
| 1252 |
+
▁costume -9.88519
|
| 1253 |
+
▁heart -9.88521
|
| 1254 |
+
▁calf -9.88521
|
| 1255 |
+
▁worn -9.88522
|
| 1256 |
+
▁sill -9.88598
|
| 1257 |
+
▁propp -9.91136
|
| 1258 |
+
ized -9.92239
|
| 1259 |
+
how -9.92279
|
| 1260 |
+
▁happy -9.92365
|
| 1261 |
+
▁harbor -9.92365
|
| 1262 |
+
▁pillow -9.92365
|
| 1263 |
+
▁roof -9.92365
|
| 1264 |
+
▁sugar -9.92365
|
| 1265 |
+
▁airliner -9.92366
|
| 1266 |
+
▁ornate -9.92366
|
| 1267 |
+
▁indoor -9.92366
|
| 1268 |
+
▁frame -9.92366
|
| 1269 |
+
▁itself -9.92367
|
| 1270 |
+
▁residen -9.92367
|
| 1271 |
+
▁rusted -9.92368
|
| 1272 |
+
▁selling -9.92368
|
| 1273 |
+
▁ja -9.96136
|
| 1274 |
+
▁figurine -9.96365
|
| 1275 |
+
▁freezer -9.96365
|
| 1276 |
+
▁garbage -9.96365
|
| 1277 |
+
▁goggles -9.96365
|
| 1278 |
+
▁waffle -9.96365
|
| 1279 |
+
▁overhead -9.96366
|
| 1280 |
+
▁section -9.96366
|
| 1281 |
+
▁patio -9.96366
|
| 1282 |
+
▁tasty -9.96367
|
| 1283 |
+
▁frog -9.96368
|
| 1284 |
+
▁mud -9.96372
|
| 1285 |
+
▁belt -9.96374
|
| 1286 |
+
▁fast -9.96384
|
| 1287 |
+
▁curl -9.96391
|
| 1288 |
+
▁item -9.97951
|
| 1289 |
+
shirt -9.99666
|
| 1290 |
+
▁celery -10.0053
|
| 1291 |
+
▁faucet -10.0053
|
| 1292 |
+
▁kept -10.0053
|
| 1293 |
+
▁leather -10.0053
|
| 1294 |
+
▁structure -10.0053
|
| 1295 |
+
▁loading -10.0053
|
| 1296 |
+
▁relax -10.0053
|
| 1297 |
+
▁scatter -10.0053
|
| 1298 |
+
▁numer -10.0053
|
| 1299 |
+
▁six -10.0053
|
| 1300 |
+
▁asleep -10.0053
|
| 1301 |
+
ixture -10.0053
|
| 1302 |
+
▁iron -10.0053
|
| 1303 |
+
▁hood -10.0054
|
| 1304 |
+
▁owner -10.0055
|
| 1305 |
+
▁also -10.0056
|
| 1306 |
+
rban -10.0135
|
| 1307 |
+
▁produc -10.0415
|
| 1308 |
+
thered -10.0476
|
| 1309 |
+
▁accessories -10.0488
|
| 1310 |
+
▁bakery -10.0488
|
| 1311 |
+
▁cereal -10.0488
|
| 1312 |
+
▁champagne -10.0488
|
| 1313 |
+
▁commuter -10.0488
|
| 1314 |
+
▁individual -10.0488
|
| 1315 |
+
▁practic -10.0488
|
| 1316 |
+
▁sweater -10.0488
|
| 1317 |
+
▁ceiling -10.0488
|
| 1318 |
+
▁text -10.0488
|
| 1319 |
+
▁neatly -10.0488
|
| 1320 |
+
▁rusty -10.0488
|
| 1321 |
+
▁headphones -10.0488
|
| 1322 |
+
▁cord -10.0489
|
| 1323 |
+
▁lid -10.0492
|
| 1324 |
+
▁je -10.0523
|
| 1325 |
+
▁direct -10.0939
|
| 1326 |
+
▁circle -10.0943
|
| 1327 |
+
▁crochet -10.0943
|
| 1328 |
+
▁magazine -10.0943
|
| 1329 |
+
▁marble -10.0943
|
| 1330 |
+
▁marina -10.0943
|
| 1331 |
+
▁measur -10.0943
|
| 1332 |
+
▁monkey -10.0943
|
| 1333 |
+
▁roman -10.0943
|
| 1334 |
+
▁urinal -10.0943
|
| 1335 |
+
▁garage -10.0943
|
| 1336 |
+
▁speak -10.0943
|
| 1337 |
+
▁shoe -10.109
|
| 1338 |
+
form -10.1219
|
| 1339 |
+
▁talk -10.131
|
| 1340 |
+
▁potato -10.1405
|
| 1341 |
+
▁advertis -10.1419
|
| 1342 |
+
▁approach -10.1419
|
| 1343 |
+
▁burger -10.1419
|
| 1344 |
+
▁character -10.1419
|
| 1345 |
+
▁depict -10.1419
|
| 1346 |
+
▁jockey -10.1419
|
| 1347 |
+
▁kiwi -10.1419
|
| 1348 |
+
▁stream -10.1419
|
| 1349 |
+
▁terminal -10.1419
|
| 1350 |
+
▁attempt -10.1419
|
| 1351 |
+
▁jetliner -10.1419
|
| 1352 |
+
▁vendor -10.1419
|
| 1353 |
+
▁stunt -10.1419
|
| 1354 |
+
▁collar -10.1419
|
| 1355 |
+
foam -10.1419
|
| 1356 |
+
▁palm -10.1419
|
| 1357 |
+
▁necktie -10.1419
|
| 1358 |
+
▁indicat -10.1419
|
| 1359 |
+
▁foil -10.1419
|
| 1360 |
+
▁than -10.1419
|
| 1361 |
+
▁burn -10.1419
|
| 1362 |
+
▁help -10.1421
|
| 1363 |
+
▁tire -10.1427
|
| 1364 |
+
▁jo -10.1432
|
| 1365 |
+
▁pepper -10.1649
|
| 1366 |
+
▁shak -10.1781
|
| 1367 |
+
▁laughing -10.1919
|
| 1368 |
+
▁ribbon -10.1919
|
| 1369 |
+
▁shrimp -10.1919
|
| 1370 |
+
▁sniff -10.1919
|
| 1371 |
+
▁custom -10.1919
|
| 1372 |
+
▁pepperoni -10.1919
|
| 1373 |
+
▁missing -10.1919
|
| 1374 |
+
▁rubb -10.1919
|
| 1375 |
+
▁wool -10.1919
|
| 1376 |
+
▁cartoon -10.1919
|
| 1377 |
+
ique -10.1919
|
| 1378 |
+
▁slid -10.1919
|
| 1379 |
+
▁needle -10.192
|
| 1380 |
+
▁canoe -10.1922
|
| 1381 |
+
▁paddle -10.2032
|
| 1382 |
+
▁bikini -10.2445
|
| 1383 |
+
▁connect -10.2445
|
| 1384 |
+
▁focus -10.2445
|
| 1385 |
+
▁furry -10.2445
|
| 1386 |
+
▁garnish -10.2445
|
| 1387 |
+
▁grizzl -10.2445
|
| 1388 |
+
▁horseback -10.2445
|
| 1389 |
+
▁jersey -10.2445
|
| 1390 |
+
▁liquid -10.2445
|
| 1391 |
+
▁motorbike -10.2445
|
| 1392 |
+
▁newspaper -10.2445
|
| 1393 |
+
▁opposite -10.2445
|
| 1394 |
+
▁ostrich -10.2445
|
| 1395 |
+
▁powder -10.2445
|
| 1396 |
+
▁selection -10.2445
|
| 1397 |
+
▁silverware -10.2445
|
| 1398 |
+
▁america -10.2445
|
| 1399 |
+
▁money -10.2445
|
| 1400 |
+
▁process -10.2445
|
| 1401 |
+
▁pocket -10.2445
|
| 1402 |
+
▁relish -10.2445
|
| 1403 |
+
▁jelly -10.2445
|
| 1404 |
+
▁odd -10.2445
|
| 1405 |
+
▁santa -10.2445
|
| 1406 |
+
▁fighter -10.2445
|
| 1407 |
+
▁patch -10.2445
|
| 1408 |
+
▁typing -10.2445
|
| 1409 |
+
▁cold -10.2446
|
| 1410 |
+
▁tasting -10.2446
|
| 1411 |
+
lumin -10.2456
|
| 1412 |
+
ador -10.2834
|
| 1413 |
+
▁aircraft -10.3001
|
| 1414 |
+
▁bookshelf -10.3001
|
| 1415 |
+
▁cigarette -10.3001
|
| 1416 |
+
▁digital -10.3001
|
| 1417 |
+
▁exhibit -10.3001
|
| 1418 |
+
▁interesting -10.3001
|
| 1419 |
+
▁meadow -10.3001
|
| 1420 |
+
▁muffin -10.3001
|
| 1421 |
+
▁natural -10.3001
|
| 1422 |
+
▁organiz -10.3001
|
| 1423 |
+
▁overlook -10.3001
|
| 1424 |
+
▁sweet -10.3001
|
| 1425 |
+
plug -10.3001
|
| 1426 |
+
▁baking -10.3001
|
| 1427 |
+
▁celebrat -10.3001
|
| 1428 |
+
▁remov -10.3001
|
| 1429 |
+
▁wedding -10.3001
|
| 1430 |
+
▁dozen -10.3001
|
| 1431 |
+
▁forward -10.3001
|
| 1432 |
+
▁jeans -10.3001
|
| 1433 |
+
▁dust -10.3002
|
| 1434 |
+
guard -10.3004
|
| 1435 |
+
▁beak -10.3005
|
| 1436 |
+
ield -10.3036
|
| 1437 |
+
book -10.3502
|
| 1438 |
+
▁avocado -10.3589
|
| 1439 |
+
▁construction -10.3589
|
| 1440 |
+
▁grapefruit -10.3589
|
| 1441 |
+
▁ingredients -10.3589
|
| 1442 |
+
▁instruction -10.3589
|
| 1443 |
+
▁ipod -10.3589
|
| 1444 |
+
▁knives -10.3589
|
| 1445 |
+
▁learning -10.3589
|
| 1446 |
+
▁liquor -10.3589
|
| 1447 |
+
▁ornament -10.3589
|
| 1448 |
+
▁pencils -10.3589
|
| 1449 |
+
▁pineapple -10.3589
|
| 1450 |
+
▁shallow -10.3589
|
| 1451 |
+
▁tourist -10.3589
|
| 1452 |
+
▁transport -10.3589
|
| 1453 |
+
▁trolley -10.3589
|
| 1454 |
+
▁tulips -10.3589
|
| 1455 |
+
▁knitt -10.3589
|
| 1456 |
+
▁magnet -10.3589
|
| 1457 |
+
▁clown -10.3589
|
| 1458 |
+
▁loung -10.3589
|
| 1459 |
+
▁finish -10.3589
|
| 1460 |
+
▁first -10.3589
|
| 1461 |
+
▁operat -10.3589
|
| 1462 |
+
▁backyard -10.3589
|
| 1463 |
+
▁semi -10.359
|
| 1464 |
+
▁medi -10.372
|
| 1465 |
+
▁scissor -10.3945
|
| 1466 |
+
▁plai -10.4176
|
| 1467 |
+
▁calculator -10.4214
|
| 1468 |
+
▁chrome -10.4214
|
| 1469 |
+
▁church -10.4214
|
| 1470 |
+
▁extreme -10.4214
|
| 1471 |
+
▁fixing -10.4214
|
| 1472 |
+
▁flavor -10.4214
|
| 1473 |
+
▁fluffy -10.4214
|
| 1474 |
+
▁motorcyclist -10.4214
|
| 1475 |
+
▁nokia -10.4214
|
| 1476 |
+
▁project -10.4214
|
| 1477 |
+
▁selfie -10.4214
|
| 1478 |
+
▁skating -10.4214
|
| 1479 |
+
▁sneakers -10.4214
|
| 1480 |
+
▁spinach -10.4214
|
| 1481 |
+
▁stretch -10.4214
|
| 1482 |
+
▁transit -10.4214
|
| 1483 |
+
▁vegetation -10.4214
|
| 1484 |
+
▁console -10.4214
|
| 1485 |
+
▁material -10.4214
|
| 1486 |
+
▁roast -10.4214
|
| 1487 |
+
▁cheesecake -10.4214
|
| 1488 |
+
▁crouch -10.4214
|
| 1489 |
+
▁hung -10.4214
|
| 1490 |
+
▁taxi -10.4214
|
| 1491 |
+
▁weather -10.4214
|
| 1492 |
+
▁swan -10.4214
|
| 1493 |
+
▁crib -10.4214
|
| 1494 |
+
▁safe -10.4221
|
| 1495 |
+
▁decor -10.4877
|
| 1496 |
+
▁antelope -10.4881
|
| 1497 |
+
▁bamboo -10.4881
|
| 1498 |
+
▁blood -10.4881
|
| 1499 |
+
▁circu -10.4881
|
| 1500 |
+
q -10.8094
|
src/dataset/sub_tokenizer2000.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c60fef05dfb996f1a074bc5a252a0bd87eba570e346493a778d47dd3ee929f95
|
| 3 |
+
size 271282
|
src/dataset/sub_tokenizer2000.vocab
ADDED
|
@@ -0,0 +1,2000 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<pad> 0
|
| 2 |
+
<sos> 0
|
| 3 |
+
<eos> 0
|
| 4 |
+
<unk> 0
|
| 5 |
+
▁a -1.90379
|
| 6 |
+
. -2.84988
|
| 7 |
+
▁of -3.52043
|
| 8 |
+
▁on -3.53193
|
| 9 |
+
▁in -3.7858
|
| 10 |
+
▁the -3.78741
|
| 11 |
+
s -3.81805
|
| 12 |
+
▁with -3.85508
|
| 13 |
+
nd -3.87567
|
| 14 |
+
▁is -4.37004
|
| 15 |
+
▁to -4.55105
|
| 16 |
+
ing -4.63649
|
| 17 |
+
▁man -4.76743
|
| 18 |
+
▁ -4.8183
|
| 19 |
+
▁sitting -4.86181
|
| 20 |
+
, -4.94566
|
| 21 |
+
▁an -5.02722
|
| 22 |
+
▁next -5.11411
|
| 23 |
+
▁two -5.13883
|
| 24 |
+
ed -5.23179
|
| 25 |
+
▁white -5.27115
|
| 26 |
+
▁are -5.29061
|
| 27 |
+
▁holding -5.3089
|
| 28 |
+
▁standing -5.31022
|
| 29 |
+
▁table -5.3802
|
| 30 |
+
▁it -5.42532
|
| 31 |
+
y -5.47436
|
| 32 |
+
e -5.47748
|
| 33 |
+
▁at -5.5327
|
| 34 |
+
▁woman -5.54252
|
| 35 |
+
▁plate -5.56616
|
| 36 |
+
▁that -5.58758
|
| 37 |
+
▁up -5.61664
|
| 38 |
+
▁top -5.64691
|
| 39 |
+
▁people -5.67154
|
| 40 |
+
d -5.70035
|
| 41 |
+
▁some -5.70552
|
| 42 |
+
n -5.71926
|
| 43 |
+
▁person -5.72054
|
| 44 |
+
▁black -5.79203
|
| 45 |
+
▁large -5.79333
|
| 46 |
+
▁street -5.83316
|
| 47 |
+
▁red -5.87919
|
| 48 |
+
▁his -5.93696
|
| 49 |
+
▁small -5.93728
|
| 50 |
+
▁dog -5.98374
|
| 51 |
+
▁near -5.98661
|
| 52 |
+
▁front -6.02607
|
| 53 |
+
le -6.03631
|
| 54 |
+
▁sign -6.112
|
| 55 |
+
▁by -6.15826
|
| 56 |
+
▁bear -6.16926
|
| 57 |
+
▁cat -6.16953
|
| 58 |
+
▁group -6.1793
|
| 59 |
+
▁field -6.19767
|
| 60 |
+
▁has -6.20777
|
| 61 |
+
▁food -6.21735
|
| 62 |
+
▁blue -6.21834
|
| 63 |
+
▁green -6.25381
|
| 64 |
+
▁down -6.26316
|
| 65 |
+
▁young -6.27466
|
| 66 |
+
▁tennis -6.29272
|
| 67 |
+
▁snow -6.29366
|
| 68 |
+
▁other -6.30134
|
| 69 |
+
▁close -6.3032
|
| 70 |
+
▁whi -6.31842
|
| 71 |
+
▁there -6.32654
|
| 72 |
+
▁grass -6.33462
|
| 73 |
+
▁water -6.33544
|
| 74 |
+
▁side -6.39473
|
| 75 |
+
▁train -6.42556
|
| 76 |
+
▁computer -6.42924
|
| 77 |
+
t -6.44999
|
| 78 |
+
a -6.46645
|
| 79 |
+
▁baseball -6.4953
|
| 80 |
+
▁parked -6.50776
|
| 81 |
+
▁walking -6.51064
|
| 82 |
+
▁board -6.51169
|
| 83 |
+
▁sits -6.52527
|
| 84 |
+
▁riding -6.53004
|
| 85 |
+
o -6.53714
|
| 86 |
+
▁stop -6.54352
|
| 87 |
+
▁parking -6.5445
|
| 88 |
+
▁for -6.55358
|
| 89 |
+
▁kitchen -6.55761
|
| 90 |
+
r -6.58109
|
| 91 |
+
▁phone -6.58369
|
| 92 |
+
▁her -6.59094
|
| 93 |
+
▁wearing -6.60086
|
| 94 |
+
▁boy -6.60371
|
| 95 |
+
▁looking -6.60483
|
| 96 |
+
er -6.60786
|
| 97 |
+
▁clock -6.61485
|
| 98 |
+
▁bowl -6.62845
|
| 99 |
+
▁bus -6.64378
|
| 100 |
+
▁pizza -6.64978
|
| 101 |
+
▁fire -6.6575
|
| 102 |
+
▁woode -6.65812
|
| 103 |
+
' -6.66211
|
| 104 |
+
▁out -6.6706
|
| 105 |
+
m -6.67177
|
| 106 |
+
▁couple -6.68249
|
| 107 |
+
p -6.68396
|
| 108 |
+
▁picture -6.68538
|
| 109 |
+
▁thre -6.69363
|
| 110 |
+
▁yellow -6.69364
|
| 111 |
+
▁bathroom -6.69845
|
| 112 |
+
▁glass -6.70559
|
| 113 |
+
▁girl -6.70816
|
| 114 |
+
▁from -6.73116
|
| 115 |
+
▁toilet -6.73449
|
| 116 |
+
▁several -6.73616
|
| 117 |
+
▁skateboard -6.74251
|
| 118 |
+
▁vase -6.74527
|
| 119 |
+
▁brown -6.75471
|
| 120 |
+
▁building -6.75554
|
| 121 |
+
▁vegetables -6.76098
|
| 122 |
+
▁light -6.76845
|
| 123 |
+
▁hydrant -6.77015
|
| 124 |
+
▁hand -6.79109
|
| 125 |
+
▁beach -6.79111
|
| 126 |
+
▁scissors -6.79703
|
| 127 |
+
▁sink -6.80533
|
| 128 |
+
▁room -6.80892
|
| 129 |
+
▁their -6.80978
|
| 130 |
+
▁filled -6.81271
|
| 131 |
+
g -6.8148
|
| 132 |
+
▁over -6.83258
|
| 133 |
+
▁desk -6.84733
|
| 134 |
+
▁wine -6.85868
|
| 135 |
+
▁bat -6.86169
|
| 136 |
+
▁road -6.87196
|
| 137 |
+
▁elephant -6.87345
|
| 138 |
+
▁open -6.8758
|
| 139 |
+
▁one -6.88385
|
| 140 |
+
▁pair -6.88936
|
| 141 |
+
▁meter -6.89336
|
| 142 |
+
▁be -6.91028
|
| 143 |
+
▁horse -6.91547
|
| 144 |
+
▁hot -6.92288
|
| 145 |
+
▁cake -6.92509
|
| 146 |
+
▁thi -6.92907
|
| 147 |
+
▁each -6.93109
|
| 148 |
+
▁keyboard -6.9311
|
| 149 |
+
▁wall -6.93925
|
| 150 |
+
▁sheep -6.94337
|
| 151 |
+
▁different -6.94954
|
| 152 |
+
▁frisbee -6.9496
|
| 153 |
+
▁flying -6.95108
|
| 154 |
+
▁flowers -6.95386
|
| 155 |
+
b -6.95499
|
| 156 |
+
▁traffic -6.95785
|
| 157 |
+
▁orange -6.95925
|
| 158 |
+
▁laptop -6.96623
|
| 159 |
+
▁giraffe -6.97434
|
| 160 |
+
▁eating -6.97688
|
| 161 |
+
▁bench -6.9801
|
| 162 |
+
▁counter -6.98527
|
| 163 |
+
▁teddy -6.98533
|
| 164 |
+
▁its -6.98622
|
| 165 |
+
▁f -6.98731
|
| 166 |
+
▁tree -6.98738
|
| 167 |
+
▁around -6.98749
|
| 168 |
+
▁mouse -6.98963
|
| 169 |
+
▁covered -6.99463
|
| 170 |
+
▁tie -7.00138
|
| 171 |
+
st -7.00458
|
| 172 |
+
▁lay -7.00734
|
| 173 |
+
u -7.01767
|
| 174 |
+
▁broccoli -7.01801
|
| 175 |
+
▁cell -7.02242
|
| 176 |
+
▁inside -7.0251
|
| 177 |
+
▁boat -7.02576
|
| 178 |
+
▁through -7.02915
|
| 179 |
+
▁fruit -7.04853
|
| 180 |
+
▁remote -7.05181
|
| 181 |
+
▁ball -7.0541
|
| 182 |
+
▁bunch -7.05562
|
| 183 |
+
▁motorcycle -7.05802
|
| 184 |
+
▁area -7.05888
|
| 185 |
+
▁outside -7.06116
|
| 186 |
+
very -7.06637
|
| 187 |
+
al -7.06708
|
| 188 |
+
▁zebra -7.06733
|
| 189 |
+
▁little -7.06797
|
| 190 |
+
▁microwave -7.07264
|
| 191 |
+
▁truck -7.07512
|
| 192 |
+
es -7.08244
|
| 193 |
+
▁city -7.09155
|
| 194 |
+
▁old -7.09496
|
| 195 |
+
▁lot -7.09663
|
| 196 |
+
▁bed -7.11485
|
| 197 |
+
▁car -7.11886
|
| 198 |
+
▁umbrella -7.12617
|
| 199 |
+
▁together -7.12794
|
| 200 |
+
▁playing -7.12974
|
| 201 |
+
▁back -7.14791
|
| 202 |
+
ly -7.14982
|
| 203 |
+
▁sit -7.15914
|
| 204 |
+
▁under -7.16824
|
| 205 |
+
▁behind -7.17346
|
| 206 |
+
▁bananas -7.17365
|
| 207 |
+
ting -7.18579
|
| 208 |
+
▁men -7.1879
|
| 209 |
+
en -7.18889
|
| 210 |
+
▁ground -7.18911
|
| 211 |
+
▁s -7.19022
|
| 212 |
+
▁background -7.19173
|
| 213 |
+
▁glasses -7.19856
|
| 214 |
+
▁bird -7.19888
|
| 215 |
+
▁bike -7.20221
|
| 216 |
+
l -7.20434
|
| 217 |
+
▁piece -7.20762
|
| 218 |
+
▁child -7.21573
|
| 219 |
+
▁carrots -7.2246
|
| 220 |
+
▁stuff -7.22932
|
| 221 |
+
▁head -7.22934
|
| 222 |
+
▁display -7.22943
|
| 223 |
+
▁sandwich -7.23111
|
| 224 |
+
▁air -7.23995
|
| 225 |
+
▁surfboard -7.24968
|
| 226 |
+
▁baby -7.25425
|
| 227 |
+
▁full -7.25989
|
| 228 |
+
▁stands -7.2628
|
| 229 |
+
▁hold -7.2628
|
| 230 |
+
▁into -7.2794
|
| 231 |
+
▁luggage -7.28271
|
| 232 |
+
▁being -7.28796
|
| 233 |
+
▁oven -7.30017
|
| 234 |
+
w -7.3046
|
| 235 |
+
▁player -7.30562
|
| 236 |
+
▁beside -7.30918
|
| 237 |
+
▁photo -7.31199
|
| 238 |
+
ous -7.31987
|
| 239 |
+
▁sidewalk -7.32395
|
| 240 |
+
▁shirt -7.33037
|
| 241 |
+
laying -7.33119
|
| 242 |
+
▁paper -7.33304
|
| 243 |
+
▁cutt -7.33384
|
| 244 |
+
▁racket -7.33603
|
| 245 |
+
▁floor -7.33609
|
| 246 |
+
▁dirt -7.3391
|
| 247 |
+
▁knife -7.34523
|
| 248 |
+
▁trees -7.34536
|
| 249 |
+
▁snowboard -7.34713
|
| 250 |
+
▁pink -7.34836
|
| 251 |
+
▁fence -7.35427
|
| 252 |
+
▁game -7.36383
|
| 253 |
+
▁cup -7.36772
|
| 254 |
+
pped -7.37284
|
| 255 |
+
▁window -7.37327
|
| 256 |
+
i -7.37392
|
| 257 |
+
▁skis -7.37715
|
| 258 |
+
▁look -7.38279
|
| 259 |
+
▁cow -7.3872
|
| 260 |
+
▁big -7.3926
|
| 261 |
+
▁pile -7.42014
|
| 262 |
+
▁view -7.42185
|
| 263 |
+
▁d -7.42569
|
| 264 |
+
▁face -7.4275
|
| 265 |
+
▁oranges -7.42956
|
| 266 |
+
on -7.43198
|
| 267 |
+
▁kite -7.4337
|
| 268 |
+
ch -7.44104
|
| 269 |
+
▁zebras -7.45306
|
| 270 |
+
▁mouth -7.45555
|
| 271 |
+
▁toothbrush -7.45804
|
| 272 |
+
▁apples -7.45822
|
| 273 |
+
▁image -7.46589
|
| 274 |
+
▁plane -7.47986
|
| 275 |
+
c -7.48139
|
| 276 |
+
▁kites -7.49201
|
| 277 |
+
▁stand -7.49869
|
| 278 |
+
▁sky -7.51201
|
| 279 |
+
▁cut -7.52986
|
| 280 |
+
▁meat -7.53038
|
| 281 |
+
h -7.53967
|
| 282 |
+
▁another -7.54147
|
| 283 |
+
▁apple -7.54237
|
| 284 |
+
▁park -7.54538
|
| 285 |
+
▁refrigerator -7.56415
|
| 286 |
+
▁do -7.57386
|
| 287 |
+
���bears -7.57774
|
| 288 |
+
▁airplane -7.58145
|
| 289 |
+
▁pole -7.58642
|
| 290 |
+
▁vari -7.59082
|
| 291 |
+
▁grassy -7.59426
|
| 292 |
+
k -7.6016
|
| 293 |
+
▁bicycle -7.6096
|
| 294 |
+
▁four -7.61515
|
| 295 |
+
▁them -7.62206
|
| 296 |
+
▁colorful -7.62325
|
| 297 |
+
f -7.62581
|
| 298 |
+
▁hat -7.62834
|
| 299 |
+
▁someone -7.63551
|
| 300 |
+
▁ski -7.63961
|
| 301 |
+
▁book -7.64343
|
| 302 |
+
ar -7.64588
|
| 303 |
+
▁donuts -7.65007
|
| 304 |
+
▁cows -7.65097
|
| 305 |
+
▁tall -7.65632
|
| 306 |
+
▁fork -7.65773
|
| 307 |
+
▁women -7.66042
|
| 308 |
+
▁bottle -7.6642
|
| 309 |
+
▁stove -7.66888
|
| 310 |
+
▁off -7.66944
|
| 311 |
+
▁box -7.67222
|
| 312 |
+
▁banana -7.67282
|
| 313 |
+
▁control -7.67313
|
| 314 |
+
or -7.67649
|
| 315 |
+
▁coffee -7.6817
|
| 316 |
+
▁station -7.68613
|
| 317 |
+
▁bag -7.6901
|
| 318 |
+
▁chair -7.69491
|
| 319 |
+
it -7.69954
|
| 320 |
+
▁wood -7.70204
|
| 321 |
+
▁mirror -7.70344
|
| 322 |
+
▁ready -7.70803
|
| 323 |
+
▁herd -7.71799
|
| 324 |
+
an -7.72101
|
| 325 |
+
ll -7.72672
|
| 326 |
+
▁camera -7.73017
|
| 327 |
+
▁suitcase -7.73581
|
| 328 |
+
▁cheese -7.73945
|
| 329 |
+
▁hanging -7.74151
|
| 330 |
+
▁items -7.74619
|
| 331 |
+
▁tray -7.75301
|
| 332 |
+
▁line -7.75759
|
| 333 |
+
▁giraffes -7.75836
|
| 334 |
+
▁above -7.77636
|
| 335 |
+
▁elephants -7.78217
|
| 336 |
+
▁middle -7.79546
|
| 337 |
+
▁long -7.79554
|
| 338 |
+
▁wii -7.79565
|
| 339 |
+
v -7.79936
|
| 340 |
+
▁suit -7.80029
|
| 341 |
+
▁half -7.81002
|
| 342 |
+
▁posing -7.81013
|
| 343 |
+
▁metal -7.81492
|
| 344 |
+
in -7.8172
|
| 345 |
+
▁grazing -7.81985
|
| 346 |
+
▁get -7.82
|
| 347 |
+
▁tak -7.82337
|
| 348 |
+
▁can -7.82451
|
| 349 |
+
▁along -7.82967
|
| 350 |
+
▁sand -7.83648
|
| 351 |
+
▁t -7.85072
|
| 352 |
+
▁fruits -7.85213
|
| 353 |
+
▁day -7.85504
|
| 354 |
+
▁smiling -7.86016
|
| 355 |
+
▁like -7.86016
|
| 356 |
+
gain -7.86042
|
| 357 |
+
▁carry -7.86556
|
| 358 |
+
▁b -7.8661
|
| 359 |
+
▁signs -7.87456
|
| 360 |
+
▁all -7.88738
|
| 361 |
+
▁living -7.89149
|
| 362 |
+
▁co -7.89325
|
| 363 |
+
▁court -7.89681
|
| 364 |
+
ur -7.90155
|
| 365 |
+
▁surf -7.90671
|
| 366 |
+
▁basket -7.90753
|
| 367 |
+
▁cars -7.91145
|
| 368 |
+
▁silver -7.91294
|
| 369 |
+
▁animals -7.91676
|
| 370 |
+
▁dogs -7.91824
|
| 371 |
+
▁n -7.92221
|
| 372 |
+
▁m -7.93004
|
| 373 |
+
▁cr -7.93076
|
| 374 |
+
▁la -7.93381
|
| 375 |
+
re -7.93666
|
| 376 |
+
▁he -7.9454
|
| 377 |
+
la -7.94666
|
| 378 |
+
▁pan -7.94721
|
| 379 |
+
▁row -7.96295
|
| 380 |
+
il -7.96838
|
| 381 |
+
▁across -7.96866
|
| 382 |
+
▁airport -7.96866
|
| 383 |
+
▁bread -7.96866
|
| 384 |
+
el -7.97626
|
| 385 |
+
et -7.97645
|
| 386 |
+
▁lean -7.98022
|
| 387 |
+
ers -7.9831
|
| 388 |
+
▁horses -7.98457
|
| 389 |
+
▁plastic -7.986
|
| 390 |
+
▁hair -7.98608
|
| 391 |
+
▁animal -7.98773
|
| 392 |
+
ck -8.00089
|
| 393 |
+
▁trick -8.00374
|
| 394 |
+
▁couch -8.00375
|
| 395 |
+
▁dish -8.00894
|
| 396 |
+
▁tracks -8.01307
|
| 397 |
+
▁walk -8.0142
|
| 398 |
+
▁hands -8.01734
|
| 399 |
+
▁spoon -8.01806
|
| 400 |
+
▁set -8.02221
|
| 401 |
+
▁birds -8.02343
|
| 402 |
+
to -8.02705
|
| 403 |
+
▁screen -8.02768
|
| 404 |
+
ve -8.02892
|
| 405 |
+
▁slice -8.03331
|
| 406 |
+
▁donut -8.03666
|
| 407 |
+
▁door -8.04045
|
| 408 |
+
▁skate -8.04596
|
| 409 |
+
▁store -8.05233
|
| 410 |
+
▁g -8.05489
|
| 411 |
+
lying -8.05641
|
| 412 |
+
▁driving -8.05854
|
| 413 |
+
▁empty -8.05854
|
| 414 |
+
id -8.06796
|
| 415 |
+
▁st -8.06843
|
| 416 |
+
▁chocolate -8.07753
|
| 417 |
+
▁enclosure -8.07754
|
| 418 |
+
▁something -8.08397
|
| 419 |
+
▁pot -8.08501
|
| 420 |
+
ion -8.08685
|
| 421 |
+
▁market -8.09047
|
| 422 |
+
ol -8.09804
|
| 423 |
+
▁snowboarder -8.10701
|
| 424 |
+
▁displayed -8.10975
|
| 425 |
+
▁purple -8.11
|
| 426 |
+
▁television -8.11
|
| 427 |
+
▁using -8.11007
|
| 428 |
+
up -8.11132
|
| 429 |
+
▁house -8.11663
|
| 430 |
+
te -8.12195
|
| 431 |
+
▁slope -8.12329
|
| 432 |
+
▁video -8.12329
|
| 433 |
+
▁hit -8.12384
|
| 434 |
+
▁controller -8.13001
|
| 435 |
+
▁shown -8.13113
|
| 436 |
+
ct -8.13323
|
| 437 |
+
▁hill -8.13673
|
| 438 |
+
ter -8.13878
|
| 439 |
+
uring -8.14849
|
| 440 |
+
▁mountain -8.14942
|
| 441 |
+
▁case -8.15073
|
| 442 |
+
▁tooth -8.15733
|
| 443 |
+
ce -8.16723
|
| 444 |
+
▁flower -8.17063
|
| 445 |
+
▁number -8.17125
|
| 446 |
+
▁tv -8.1718
|
| 447 |
+
▁brush -8.17476
|
| 448 |
+
th -8.17897
|
| 449 |
+
▁doughnuts -8.18083
|
| 450 |
+
▁boats -8.18276
|
| 451 |
+
▁who -8.18294
|
| 452 |
+
▁double -8.18538
|
| 453 |
+
▁monitor -8.18538
|
| 454 |
+
▁salad -8.18538
|
| 455 |
+
▁lady -8.1854
|
| 456 |
+
▁un -8.19178
|
| 457 |
+
▁attached -8.19252
|
| 458 |
+
▁w -8.19366
|
| 459 |
+
▁teeth -8.19978
|
| 460 |
+
▁contain -8.20528
|
| 461 |
+
▁resting -8.20567
|
| 462 |
+
▁c -8.21269
|
| 463 |
+
ic -8.21423
|
| 464 |
+
▁guy -8.21427
|
| 465 |
+
▁made -8.21638
|
| 466 |
+
▁ocean -8.22162
|
| 467 |
+
▁tower -8.22165
|
| 468 |
+
▁pen -8.22303
|
| 469 |
+
▁pull -8.22371
|
| 470 |
+
▁rock -8.22534
|
| 471 |
+
▁meal -8.22903
|
| 472 |
+
▁corner -8.22938
|
| 473 |
+
ut -8.23325
|
| 474 |
+
▁drink -8.23517
|
| 475 |
+
▁passenger -8.23649
|
| 476 |
+
▁bun -8.23926
|
| 477 |
+
▁umbrellas -8.24185
|
| 478 |
+
▁racquet -8.244
|
| 479 |
+
▁crowd -8.24616
|
| 480 |
+
▁bar -8.249
|
| 481 |
+
▁toy -8.24938
|
| 482 |
+
▁backpack -8.25921
|
| 483 |
+
▁lights -8.26244
|
| 484 |
+
us -8.27446
|
| 485 |
+
▁surround -8.27466
|
| 486 |
+
li -8.27745
|
| 487 |
+
▁zoo -8.28247
|
| 488 |
+
▁brick -8.28255
|
| 489 |
+
▁pieces -8.29064
|
| 490 |
+
▁multi -8.29803
|
| 491 |
+
▁restaurant -8.29828
|
| 492 |
+
▁cellphone -8.29839
|
| 493 |
+
▁fresh -8.30628
|
| 494 |
+
▁photograph -8.30628
|
| 495 |
+
▁shelf -8.30628
|
| 496 |
+
▁surface -8.30628
|
| 497 |
+
▁jacket -8.31435
|
| 498 |
+
▁sauce -8.31435
|
| 499 |
+
ra -8.31944
|
| 500 |
+
▁ride -8.32111
|
| 501 |
+
▁adult -8.32248
|
| 502 |
+
▁statue -8.32248
|
| 503 |
+
▁books -8.3242
|
| 504 |
+
▁waiting -8.32447
|
| 505 |
+
▁watching -8.32712
|
| 506 |
+
▁jet -8.33094
|
| 507 |
+
▁painted -8.33164
|
| 508 |
+
▁plant -8.33394
|
| 509 |
+
▁post -8.33454
|
| 510 |
+
ro -8.33538
|
| 511 |
+
▁seat -8.33698
|
| 512 |
+
▁polar -8.33905
|
| 513 |
+
▁re -8.34023
|
| 514 |
+
▁de -8.34346
|
| 515 |
+
ween -8.34728
|
| 516 |
+
▁track -8.3508
|
| 517 |
+
▁runway -8.35575
|
| 518 |
+
▁glove -8.36415
|
| 519 |
+
▁clear -8.36415
|
| 520 |
+
▁time -8.36415
|
| 521 |
+
▁toothbrushes -8.36648
|
| 522 |
+
colored -8.36734
|
| 523 |
+
▁showing -8.36824
|
| 524 |
+
▁bet -8.37115
|
| 525 |
+
▁chicken -8.37269
|
| 526 |
+
▁helmet -8.37269
|
| 527 |
+
▁dark -8.3727
|
| 528 |
+
▁children -8.3727
|
| 529 |
+
▁rice -8.3728
|
| 530 |
+
▁arm -8.37381
|
| 531 |
+
▁sleeping -8.37499
|
| 532 |
+
▁skiing -8.37539
|
| 533 |
+
▁electronic -8.38132
|
| 534 |
+
▁body -8.38132
|
| 535 |
+
▁eaten -8.38205
|
| 536 |
+
▁dressed -8.38589
|
| 537 |
+
▁po -8.38869
|
| 538 |
+
z -8.38961
|
| 539 |
+
▁going -8.39045
|
| 540 |
+
▁tile -8.39077
|
| 541 |
+
um -8.39819
|
| 542 |
+
▁high -8.40741
|
| 543 |
+
▁no -8.41272
|
| 544 |
+
▁ca -8.41279
|
| 545 |
+
▁container -8.41439
|
| 546 |
+
▁talking -8.41835
|
| 547 |
+
ping -8.4213
|
| 548 |
+
▁sliced -8.43047
|
| 549 |
+
▁grey -8.43467
|
| 550 |
+
▁doughnut -8.44054
|
| 551 |
+
▁rail -8.44409
|
| 552 |
+
▁wire -8.44417
|
| 553 |
+
▁types -8.44495
|
| 554 |
+
▁eat -8.44589
|
| 555 |
+
▁go -8.4494
|
| 556 |
+
▁outdoor -8.44991
|
| 557 |
+
way -8.45313
|
| 558 |
+
▁about -8.45317
|
| 559 |
+
▁gray -8.45481
|
| 560 |
+
▁stick -8.45768
|
| 561 |
+
▁left -8.46244
|
| 562 |
+
▁fries -8.46244
|
| 563 |
+
▁strip -8.46332
|
| 564 |
+
▁toaster -8.46334
|
| 565 |
+
ple -8.46803
|
| 566 |
+
▁includ -8.47093
|
| 567 |
+
▁neck -8.47215
|
| 568 |
+
ew -8.47727
|
| 569 |
+
▁carrot -8.47883
|
| 570 |
+
���plates -8.48014
|
| 571 |
+
▁intersection -8.4814
|
| 572 |
+
ies -8.48551
|
| 573 |
+
▁toward -8.49102
|
| 574 |
+
▁rack -8.49106
|
| 575 |
+
▁have -8.49121
|
| 576 |
+
▁drinking -8.49373
|
| 577 |
+
▁mo -8.49608
|
| 578 |
+
led -8.49823
|
| 579 |
+
at -8.49873
|
| 580 |
+
is -8.4997
|
| 581 |
+
▁h -8.50948
|
| 582 |
+
▁beer -8.50953
|
| 583 |
+
▁ma -8.51031
|
| 584 |
+
▁device -8.51053
|
| 585 |
+
▁soup -8.51077
|
| 586 |
+
▁brushing -8.51413
|
| 587 |
+
▁tub -8.52043
|
| 588 |
+
▁skateboarder -8.52274
|
| 589 |
+
▁walks -8.52647
|
| 590 |
+
▁bags -8.53018
|
| 591 |
+
▁blanket -8.53043
|
| 592 |
+
▁rain -8.53077
|
| 593 |
+
placed -8.53093
|
| 594 |
+
▁shower -8.53111
|
| 595 |
+
▁com -8.53531
|
| 596 |
+
▁rocks -8.53974
|
| 597 |
+
▁leaves -8.54053
|
| 598 |
+
▁trunk -8.54053
|
| 599 |
+
▁variet -8.54173
|
| 600 |
+
▁skier -8.54242
|
| 601 |
+
▁dress -8.54535
|
| 602 |
+
▁meters -8.55009
|
| 603 |
+
▁male -8.55324
|
| 604 |
+
se -8.55394
|
| 605 |
+
▁take -8.55728
|
| 606 |
+
▁potatoes -8.56124
|
| 607 |
+
▁branch -8.5705
|
| 608 |
+
▁toppings -8.57155
|
| 609 |
+
▁night -8.57165
|
| 610 |
+
▁swing -8.57239
|
| 611 |
+
▁color -8.57781
|
| 612 |
+
▁signal -8.58144
|
| 613 |
+
▁home -8.58199
|
| 614 |
+
▁cross -8.58294
|
| 615 |
+
▁she -8.58596
|
| 616 |
+
▁hotdog -8.59239
|
| 617 |
+
▁shows -8.59355
|
| 618 |
+
▁or -8.59559
|
| 619 |
+
ive -8.59638
|
| 620 |
+
▁buildings -8.59816
|
| 621 |
+
▁kid -8.60352
|
| 622 |
+
co -8.60403
|
| 623 |
+
▁batter -8.60504
|
| 624 |
+
▁sun -8.60763
|
| 625 |
+
▁office -8.61285
|
| 626 |
+
▁lo -8.61329
|
| 627 |
+
▁decker -8.61477
|
| 628 |
+
un -8.6149
|
| 629 |
+
▁closeup -8.61732
|
| 630 |
+
▁past -8.62
|
| 631 |
+
▁older -8.62047
|
| 632 |
+
▁dessert -8.62523
|
| 633 |
+
▁graffiti -8.62523
|
| 634 |
+
▁motorcycles -8.62848
|
| 635 |
+
▁decorated -8.63636
|
| 636 |
+
▁cart -8.63985
|
| 637 |
+
▁traveling -8.64054
|
| 638 |
+
▁bright -8.64687
|
| 639 |
+
▁uniform -8.64758
|
| 640 |
+
▁wave -8.64843
|
| 641 |
+
▁bottles -8.64876
|
| 642 |
+
▁sp -8.65093
|
| 643 |
+
▁stacked -8.66238
|
| 644 |
+
ation -8.66312
|
| 645 |
+
▁prepar -8.66412
|
| 646 |
+
▁hug -8.67044
|
| 647 |
+
▁shaped -8.67231
|
| 648 |
+
ie -8.67365
|
| 649 |
+
▁ne -8.68267
|
| 650 |
+
as -8.68526
|
| 651 |
+
▁rest -8.68555
|
| 652 |
+
ake -8.69332
|
| 653 |
+
▁motor -8.69384
|
| 654 |
+
▁roll -8.69384
|
| 655 |
+
▁lit -8.694
|
| 656 |
+
▁river -8.69433
|
| 657 |
+
ng -8.69758
|
| 658 |
+
▁di -8.70117
|
| 659 |
+
mp -8.7023
|
| 660 |
+
▁cabinets -8.71577
|
| 661 |
+
▁shot -8.71788
|
| 662 |
+
ian -8.71819
|
| 663 |
+
ped -8.71821
|
| 664 |
+
▁setting -8.71935
|
| 665 |
+
▁so -8.7209
|
| 666 |
+
▁slices -8.72431
|
| 667 |
+
▁cloud -8.72998
|
| 668 |
+
▁assortment -8.73008
|
| 669 |
+
▁dry -8.73142
|
| 670 |
+
▁cooking -8.7332
|
| 671 |
+
ca -8.73336
|
| 672 |
+
▁way -8.74097
|
| 673 |
+
▁onions -8.74355
|
| 674 |
+
▁served -8.7438
|
| 675 |
+
op -8.74553
|
| 676 |
+
ton -8.74844
|
| 677 |
+
▁cute -8.75342
|
| 678 |
+
▁place -8.75415
|
| 679 |
+
▁ice -8.75825
|
| 680 |
+
▁suitcases -8.76433
|
| 681 |
+
▁cloth -8.76653
|
| 682 |
+
▁giant -8.76748
|
| 683 |
+
▁ex -8.76751
|
| 684 |
+
lush -8.76765
|
| 685 |
+
▁underneath -8.76778
|
| 686 |
+
▁bikes -8.76815
|
| 687 |
+
▁appliances -8.77018
|
| 688 |
+
x -8.78016
|
| 689 |
+
▁assort -8.7802
|
| 690 |
+
▁machine -8.7803
|
| 691 |
+
▁gear -8.78031
|
| 692 |
+
▁forest -8.78036
|
| 693 |
+
ard -8.78049
|
| 694 |
+
▁ra -8.7816
|
| 695 |
+
if -8.78346
|
| 696 |
+
▁vegetable -8.78455
|
| 697 |
+
▁jumping -8.7878
|
| 698 |
+
ti -8.78849
|
| 699 |
+
▁swinging -8.79217
|
| 700 |
+
▁electric -8.79329
|
| 701 |
+
▁purse -8.79329
|
| 702 |
+
▁smart -8.79329
|
| 703 |
+
▁mitt -8.79329
|
| 704 |
+
▁moving -8.7933
|
| 705 |
+
▁fish -8.79417
|
| 706 |
+
▁reading -8.79453
|
| 707 |
+
▁mak -8.7957
|
| 708 |
+
▁seen -8.79576
|
| 709 |
+
▁play -8.79971
|
| 710 |
+
ent -8.80641
|
| 711 |
+
▁edge -8.80646
|
| 712 |
+
▁feet -8.80646
|
| 713 |
+
▁surfer -8.8085
|
| 714 |
+
unny -8.81131
|
| 715 |
+
▁ho -8.81569
|
| 716 |
+
able -8.81641
|
| 717 |
+
▁chairs -8.81908
|
| 718 |
+
ling -8.82076
|
| 719 |
+
▁part -8.82764
|
| 720 |
+
▁din -8.83845
|
| 721 |
+
▁fridge -8.847
|
| 722 |
+
▁running -8.84701
|
| 723 |
+
▁yard -8.84703
|
| 724 |
+
▁perched -8.84816
|
| 725 |
+
▁cooked -8.84902
|
| 726 |
+
▁but -8.85047
|
| 727 |
+
▁hay -8.8505
|
| 728 |
+
ish -8.85124
|
| 729 |
+
▁end -8.85212
|
| 730 |
+
▁female -8.86088
|
| 731 |
+
▁mount -8.86088
|
| 732 |
+
▁mustard -8.86088
|
| 733 |
+
▁curb -8.86088
|
| 734 |
+
▁scene -8.86089
|
| 735 |
+
▁him -8.86148
|
| 736 |
+
▁ramp -8.86157
|
| 737 |
+
▁log -8.86229
|
| 738 |
+
ir -8.86636
|
| 739 |
+
tage -8.87072
|
| 740 |
+
ant -8.87295
|
| 741 |
+
ha -8.87336
|
| 742 |
+
▁object -8.87497
|
| 743 |
+
▁draw -8.87497
|
| 744 |
+
▁cattle -8.87502
|
| 745 |
+
▁sale -8.87809
|
| 746 |
+
ver -8.87839
|
| 747 |
+
▁holder -8.88327
|
| 748 |
+
▁dock -8.8855
|
| 749 |
+
▁poses -8.88726
|
| 750 |
+
▁we -8.88797
|
| 751 |
+
▁wheel -8.88925
|
| 752 |
+
▁size -8.88927
|
| 753 |
+
▁towel -8.88927
|
| 754 |
+
▁tomatoes -8.8899
|
| 755 |
+
▁bath -8.89331
|
| 756 |
+
ten -8.89694
|
| 757 |
+
▁shoes -8.89819
|
| 758 |
+
▁buses -8.89825
|
| 759 |
+
▁atop -8.90093
|
| 760 |
+
▁beautiful -8.90375
|
| 761 |
+
▁among -8.90375
|
| 762 |
+
▁gold -8.90375
|
| 763 |
+
▁mother -8.90388
|
| 764 |
+
▁bu -8.90421
|
| 765 |
+
intend -8.90907
|
| 766 |
+
▁plants -8.9126
|
| 767 |
+
▁pasture -8.91856
|
| 768 |
+
▁cage -8.91891
|
| 769 |
+
▁pa -8.9214
|
| 770 |
+
▁bicycles -8.92412
|
| 771 |
+
▁k -8.92686
|
| 772 |
+
▁con -8.93124
|
| 773 |
+
▁smiles -8.93241
|
| 774 |
+
▁shop -8.93336
|
| 775 |
+
▁bridge -8.93338
|
| 776 |
+
▁clean -8.93338
|
| 777 |
+
▁o -8.94138
|
| 778 |
+
▁sa -8.94237
|
| 779 |
+
king -8.94412
|
| 780 |
+
ty -8.94564
|
| 781 |
+
▁pick -8.94631
|
| 782 |
+
▁crossing -8.94716
|
| 783 |
+
tra -8.94722
|
| 784 |
+
▁sausage -8.94853
|
| 785 |
+
▁wrappe -8.94861
|
| 786 |
+
▁stone -8.94862
|
| 787 |
+
▁candle -8.94864
|
| 788 |
+
▁single -8.9487
|
| 789 |
+
▁right -8.95142
|
| 790 |
+
▁lap -8.95192
|
| 791 |
+
ig -8.95197
|
| 792 |
+
lic -8.95305
|
| 793 |
+
▁putt -8.95306
|
| 794 |
+
▁vases -8.95733
|
| 795 |
+
▁handle -8.96353
|
| 796 |
+
▁arranged -8.96391
|
| 797 |
+
▁blow -8.96392
|
| 798 |
+
▁desktop -8.96441
|
| 799 |
+
▁bowls -8.96528
|
| 800 |
+
board -8.96861
|
| 801 |
+
▁work -8.97108
|
| 802 |
+
ta -8.97472
|
| 803 |
+
▁cream -8.97954
|
| 804 |
+
▁mug -8.97954
|
| 805 |
+
▁equipment -8.97955
|
| 806 |
+
▁beans -8.97962
|
| 807 |
+
▁sh -8.97982
|
| 808 |
+
▁kinds -8.98358
|
| 809 |
+
ate -8.98666
|
| 810 |
+
▁wi -8.98951
|
| 811 |
+
per -8.99501
|
| 812 |
+
▁below -8.99541
|
| 813 |
+
▁phones -9.00677
|
| 814 |
+
▁fly -9.00787
|
| 815 |
+
▁toothpaste -9.01154
|
| 816 |
+
▁perform -9.01154
|
| 817 |
+
▁held -9.01155
|
| 818 |
+
▁kids -9.01168
|
| 819 |
+
▁fenced -9.01268
|
| 820 |
+
▁ha -9.01676
|
| 821 |
+
▁peppers -9.01863
|
| 822 |
+
▁broken -9.02794
|
| 823 |
+
▁ketchup -9.02794
|
| 824 |
+
▁police -9.02794
|
| 825 |
+
▁grill -9.02794
|
| 826 |
+
▁dinner -9.02794
|
| 827 |
+
▁flip -9.02803
|
| 828 |
+
▁float -9.02807
|
| 829 |
+
▁glazed -9.0284
|
| 830 |
+
lar -9.0287
|
| 831 |
+
▁make -9.02938
|
| 832 |
+
▁coat -9.03253
|
| 833 |
+
▁jump -9.03492
|
| 834 |
+
less -9.03789
|
| 835 |
+
▁eyes -9.03995
|
| 836 |
+
▁pe -9.04206
|
| 837 |
+
▁platform -9.0446
|
| 838 |
+
▁trash -9.0446
|
| 839 |
+
▁foot -9.0446
|
| 840 |
+
▁design -9.0446
|
| 841 |
+
▁gather -9.04462
|
| 842 |
+
▁flock -9.04476
|
| 843 |
+
▁rackets -9.04532
|
| 844 |
+
▁cover -9.04694
|
| 845 |
+
ot -9.04859
|
| 846 |
+
▁watch -9.05093
|
| 847 |
+
▁pro -9.05256
|
| 848 |
+
▁poles -9.05323
|
| 849 |
+
▁not -9.05548
|
| 850 |
+
▁use -9.05927
|
| 851 |
+
▁birthday -9.06155
|
| 852 |
+
▁pretty -9.06155
|
| 853 |
+
▁reach -9.06155
|
| 854 |
+
▁nice -9.06156
|
| 855 |
+
▁soda -9.06156
|
| 856 |
+
▁five -9.06325
|
| 857 |
+
ry -9.06769
|
| 858 |
+
▁kind -9.07436
|
| 859 |
+
out -9.07513
|
| 860 |
+
im -9.07559
|
| 861 |
+
▁cap -9.07768
|
| 862 |
+
▁public -9.07879
|
| 863 |
+
▁round -9.0788
|
| 864 |
+
having -9.07927
|
| 865 |
+
ke -9.09157
|
| 866 |
+
▁woods -9.09593
|
| 867 |
+
▁alone -9.09634
|
| 868 |
+
▁bushes -9.09672
|
| 869 |
+
▁sea -9.09702
|
| 870 |
+
▁lie -9.0974
|
| 871 |
+
▁pie -9.09898
|
| 872 |
+
▁players -9.10129
|
| 873 |
+
▁taken -9.1048
|
| 874 |
+
ated -9.10616
|
| 875 |
+
▁ta -9.10644
|
| 876 |
+
▁stack -9.1088
|
| 877 |
+
▁lake -9.11458
|
| 878 |
+
▁tea -9.11498
|
| 879 |
+
▁new -9.11598
|
| 880 |
+
▁stopped -9.11657
|
| 881 |
+
▁bo -9.12561
|
| 882 |
+
▁girls -9.13226
|
| 883 |
+
▁produce -9.1347
|
| 884 |
+
▁used -9.13607
|
| 885 |
+
eep -9.13774
|
| 886 |
+
▁surfboards -9.14452
|
| 887 |
+
ul -9.14582
|
| 888 |
+
▁engine -9.15089
|
| 889 |
+
▁garden -9.15089
|
| 890 |
+
▁tarmac -9.15089
|
| 891 |
+
▁wild -9.1509
|
| 892 |
+
tuck -9.1517
|
| 893 |
+
▁mid -9.15203
|
| 894 |
+
▁vin -9.15343
|
| 895 |
+
▁dishes -9.15387
|
| 896 |
+
▁pictures -9.15426
|
| 897 |
+
▁skiers -9.15512
|
| 898 |
+
light -9.1598
|
| 899 |
+
side -9.16681
|
| 900 |
+
▁lin -9.16698
|
| 901 |
+
▁toast -9.16817
|
| 902 |
+
▁flat -9.16977
|
| 903 |
+
▁both -9.17007
|
| 904 |
+
▁try -9.17021
|
| 905 |
+
▁jar -9.17041
|
| 906 |
+
▁wet -9.17266
|
| 907 |
+
▁hole -9.17314
|
| 908 |
+
▁pet -9.17527
|
| 909 |
+
▁feeding -9.17672
|
| 910 |
+
▁hang -9.17911
|
| 911 |
+
▁put -9.18438
|
| 912 |
+
▁type -9.18665
|
| 913 |
+
▁snowboards -9.18809
|
| 914 |
+
▁beverage -9.18899
|
| 915 |
+
▁french -9.18899
|
| 916 |
+
▁shore -9.189
|
| 917 |
+
▁grow -9.19133
|
| 918 |
+
ment -9.1917
|
| 919 |
+
tro -9.20361
|
| 920 |
+
▁van -9.20857
|
| 921 |
+
▁supplies -9.2086
|
| 922 |
+
▁pre -9.20865
|
| 923 |
+
▁pool -9.20927
|
| 924 |
+
ical -9.20931
|
| 925 |
+
▁stoplight -9.21012
|
| 926 |
+
▁show -9.21323
|
| 927 |
+
▁working -9.22095
|
| 928 |
+
▁pack -9.22349
|
| 929 |
+
j -9.22633
|
| 930 |
+
▁drinks -9.22648
|
| 931 |
+
▁drive -9.22736
|
| 932 |
+
▁mix -9.22837
|
| 933 |
+
▁chain -9.2286
|
| 934 |
+
▁throw -9.2286
|
| 935 |
+
loaded -9.22861
|
| 936 |
+
▁steel -9.22861
|
| 937 |
+
▁good -9.22861
|
| 938 |
+
▁brushes -9.23132
|
| 939 |
+
▁clocks -9.2325
|
| 940 |
+
▁star -9.24495
|
| 941 |
+
▁shape -9.24568
|
| 942 |
+
qu -9.24648
|
| 943 |
+
▁egg -9.24781
|
| 944 |
+
tic -9.2479
|
| 945 |
+
age -9.24878
|
| 946 |
+
▁lunch -9.24901
|
| 947 |
+
▁modern -9.24901
|
| 948 |
+
▁square -9.24901
|
| 949 |
+
▁ear -9.24918
|
| 950 |
+
▁wash -9.24948
|
| 951 |
+
ther -9.2503
|
| 952 |
+
▁pad -9.25358
|
| 953 |
+
▁boxes -9.25361
|
| 954 |
+
▁balls -9.25453
|
| 955 |
+
▁ties -9.2653
|
| 956 |
+
▁sc -9.26645
|
| 957 |
+
ight -9.26724
|
| 958 |
+
▁tomato -9.2689
|
| 959 |
+
▁shoulder -9.26984
|
| 960 |
+
▁enjoy -9.26984
|
| 961 |
+
▁pastries -9.26984
|
| 962 |
+
▁milk -9.26984
|
| 963 |
+
▁lamb -9.26985
|
| 964 |
+
▁chew -9.26985
|
| 965 |
+
▁rose -9.26986
|
| 966 |
+
▁style -9.26992
|
| 967 |
+
▁match -9.27015
|
| 968 |
+
▁well -9.27046
|
| 969 |
+
▁mountains -9.27287
|
| 970 |
+
▁veggies -9.27527
|
| 971 |
+
ma -9.28982
|
| 972 |
+
▁vehicles -9.29095
|
| 973 |
+
▁appear -9.29112
|
| 974 |
+
▁blender -9.29112
|
| 975 |
+
▁breakfast -9.29112
|
| 976 |
+
▁desert -9.29112
|
| 977 |
+
▁foreground -9.29112
|
| 978 |
+
▁leaf -9.29112
|
| 979 |
+
▁napkin -9.29112
|
| 980 |
+
▁platter -9.29112
|
| 981 |
+
▁strawberries -9.29112
|
| 982 |
+
▁sunglasses -9.29112
|
| 983 |
+
▁shade -9.29112
|
| 984 |
+
▁lawn -9.29113
|
| 985 |
+
▁chopped -9.2912
|
| 986 |
+
▁hotdogs -9.29181
|
| 987 |
+
▁stra -9.2927
|
| 988 |
+
▁ripe -9.29287
|
| 989 |
+
▁doll -9.29379
|
| 990 |
+
▁cla -9.29632
|
| 991 |
+
▁dr -9.29828
|
| 992 |
+
▁potte -9.29836
|
| 993 |
+
▁gate -9.29845
|
| 994 |
+
▁see -9.29895
|
| 995 |
+
ff -9.30004
|
| 996 |
+
▁crowded -9.30661
|
| 997 |
+
▁ru -9.30863
|
| 998 |
+
▁pedestrian -9.31286
|
| 999 |
+
▁baked -9.31286
|
| 1000 |
+
▁decoration -9.31291
|
| 1001 |
+
▁mess -9.31294
|
| 1002 |
+
▁vehicle -9.31303
|
| 1003 |
+
ac -9.31455
|
| 1004 |
+
▁par -9.31519
|
| 1005 |
+
▁pasta -9.31593
|
| 1006 |
+
▁noodles -9.31606
|
| 1007 |
+
ock -9.32467
|
| 1008 |
+
▁bottom -9.33508
|
| 1009 |
+
▁christmas -9.33508
|
| 1010 |
+
▁country -9.33508
|
| 1011 |
+
▁decorative -9.33508
|
| 1012 |
+
▁scooter -9.33508
|
| 1013 |
+
▁sculpture -9.33508
|
| 1014 |
+
▁sprinkles -9.33508
|
| 1015 |
+
▁chips -9.3351
|
| 1016 |
+
▁things -9.33513
|
| 1017 |
+
▁butter -9.33556
|
| 1018 |
+
▁war -9.33657
|
| 1019 |
+
▁kneel -9.33963
|
| 1020 |
+
uch -9.3405
|
| 1021 |
+
▁catcher -9.34198
|
| 1022 |
+
▁sandwiches -9.34288
|
| 1023 |
+
▁pi -9.34733
|
| 1024 |
+
▁gra -9.35321
|
| 1025 |
+
▁se -9.35518
|
| 1026 |
+
▁serving -9.35781
|
| 1027 |
+
▁horn -9.35782
|
| 1028 |
+
▁says -9.35794
|
| 1029 |
+
▁smile -9.35928
|
| 1030 |
+
▁includes -9.36011
|
| 1031 |
+
▁dryer -9.36038
|
| 1032 |
+
▁skies -9.36308
|
| 1033 |
+
▁i -9.36614
|
| 1034 |
+
line -9.36833
|
| 1035 |
+
▁spoons -9.37146
|
| 1036 |
+
▁pulled -9.37477
|
| 1037 |
+
▁prepare -9.37951
|
| 1038 |
+
own -9.37978
|
| 1039 |
+
▁colors -9.38069
|
| 1040 |
+
▁bathtub -9.38106
|
| 1041 |
+
▁concrete -9.38106
|
| 1042 |
+
▁distance -9.38106
|
| 1043 |
+
where -9.38106
|
| 1044 |
+
▁balloon -9.38107
|
| 1045 |
+
▁nearby -9.38118
|
| 1046 |
+
▁was -9.38129
|
| 1047 |
+
▁path -9.38151
|
| 1048 |
+
▁tri -9.38157
|
| 1049 |
+
▁lamp -9.38182
|
| 1050 |
+
▁spot -9.38335
|
| 1051 |
+
▁chi -9.38405
|
| 1052 |
+
▁fac -9.38913
|
| 1053 |
+
▁cups -9.40016
|
| 1054 |
+
▁check -9.40265
|
| 1055 |
+
▁nose -9.40615
|
| 1056 |
+
ular -9.40686
|
| 1057 |
+
▁run -9.40878
|
| 1058 |
+
ger -9.41062
|
| 1059 |
+
▁passing -9.41115
|
| 1060 |
+
▁card -9.41367
|
| 1061 |
+
▁ri -9.42765
|
| 1062 |
+
▁bacon -9.42926
|
| 1063 |
+
▁bedroom -9.42926
|
| 1064 |
+
▁carriage -9.42926
|
| 1065 |
+
▁kitten -9.42926
|
| 1066 |
+
▁stainless -9.42926
|
| 1067 |
+
▁graze -9.42934
|
| 1068 |
+
▁carrie -9.42936
|
| 1069 |
+
▁bull -9.43
|
| 1070 |
+
▁low -9.43078
|
| 1071 |
+
▁clothes -9.43111
|
| 1072 |
+
▁fi -9.43248
|
| 1073 |
+
▁vi -9.4333
|
| 1074 |
+
▁rider -9.43351
|
| 1075 |
+
▁painting -9.4345
|
| 1076 |
+
▁reads -9.4346
|
| 1077 |
+
▁bite -9.43579
|
| 1078 |
+
▁pond -9.43614
|
| 1079 |
+
▁airplanes -9.44209
|
| 1080 |
+
▁legs -9.44344
|
| 1081 |
+
cal -9.44764
|
| 1082 |
+
▁turn -9.44882
|
| 1083 |
+
▁juice -9.45426
|
| 1084 |
+
▁lettuce -9.45426
|
| 1085 |
+
▁partially -9.45426
|
| 1086 |
+
▁position -9.45426
|
| 1087 |
+
▁carpet -9.45427
|
| 1088 |
+
▁swimming -9.45427
|
| 1089 |
+
▁sort -9.45431
|
| 1090 |
+
▁pose -9.45529
|
| 1091 |
+
▁paw -9.4554
|
| 1092 |
+
▁plain -9.45556
|
| 1093 |
+
▁docked -9.4639
|
| 1094 |
+
ine -9.46494
|
| 1095 |
+
eat -9.46557
|
| 1096 |
+
▁staring -9.46617
|
| 1097 |
+
▁travel -9.47023
|
| 1098 |
+
▁here -9.47823
|
| 1099 |
+
▁blurr -9.4799
|
| 1100 |
+
▁pigeons -9.4799
|
| 1101 |
+
▁what -9.4799
|
| 1102 |
+
▁cement -9.47991
|
| 1103 |
+
▁word -9.47991
|
| 1104 |
+
▁same -9.47991
|
| 1105 |
+
▁ship -9.47999
|
| 1106 |
+
▁reflection -9.48012
|
| 1107 |
+
▁fashion -9.48133
|
| 1108 |
+
▁tiny -9.48351
|
| 1109 |
+
▁pin -9.48409
|
| 1110 |
+
▁cabinet -9.48423
|
| 1111 |
+
▁spi -9.48493
|
| 1112 |
+
▁mushrooms -9.48603
|
| 1113 |
+
ci -9.49338
|
| 1114 |
+
▁point -9.49645
|
| 1115 |
+
▁feed -9.49684
|
| 1116 |
+
▁giv -9.49926
|
| 1117 |
+
just -9.50571
|
| 1118 |
+
▁arrangement -9.50622
|
| 1119 |
+
▁bucket -9.50622
|
| 1120 |
+
▁collecti -9.50622
|
| 1121 |
+
▁lift -9.50622
|
| 1122 |
+
▁parade -9.50622
|
| 1123 |
+
▁shadow -9.50622
|
| 1124 |
+
▁climb -9.50622
|
| 1125 |
+
▁flag -9.50622
|
| 1126 |
+
▁lead -9.50623
|
| 1127 |
+
▁center -9.50623
|
| 1128 |
+
▁pears -9.50624
|
| 1129 |
+
▁sheet -9.5071
|
| 1130 |
+
▁life -9.50858
|
| 1131 |
+
▁utensils -9.50952
|
| 1132 |
+
▁gl -9.50966
|
| 1133 |
+
▁pickle -9.51051
|
| 1134 |
+
▁writing -9.51316
|
| 1135 |
+
▁pu -9.51557
|
| 1136 |
+
▁containers -9.51904
|
| 1137 |
+
▁tape -9.52638
|
| 1138 |
+
▁race -9.53101
|
| 1139 |
+
▁pitch -9.53126
|
| 1140 |
+
▁you -9.53311
|
| 1141 |
+
▁family -9.53325
|
| 1142 |
+
▁military -9.53325
|
| 1143 |
+
▁pavement -9.53325
|
| 1144 |
+
▁peanut -9.53325
|
| 1145 |
+
▁picnic -9.53325
|
| 1146 |
+
▁soccer -9.53325
|
| 1147 |
+
▁space -9.53325
|
| 1148 |
+
▁lone -9.53328
|
| 1149 |
+
▁pastry -9.53355
|
| 1150 |
+
▁finger -9.53421
|
| 1151 |
+
▁cluttered -9.53437
|
| 1152 |
+
▁watches -9.53479
|
| 1153 |
+
▁officer -9.53677
|
| 1154 |
+
ful -9.53806
|
| 1155 |
+
▁upside -9.53856
|
| 1156 |
+
▁bell -9.54519
|
| 1157 |
+
▁benches -9.54552
|
| 1158 |
+
man -9.54553
|
| 1159 |
+
▁covering -9.549
|
| 1160 |
+
ure -9.54966
|
| 1161 |
+
▁goats -9.55088
|
| 1162 |
+
▁plays -9.55139
|
| 1163 |
+
▁serve -9.55787
|
| 1164 |
+
▁deck -9.56079
|
| 1165 |
+
▁curtain -9.56102
|
| 1166 |
+
▁himself -9.56102
|
| 1167 |
+
▁ledge -9.56102
|
| 1168 |
+
▁railroad -9.56102
|
| 1169 |
+
▁duck -9.56103
|
| 1170 |
+
▁model -9.56103
|
| 1171 |
+
▁comme -9.56104
|
| 1172 |
+
rcial -9.56105
|
| 1173 |
+
arrow -9.56106
|
| 1174 |
+
▁base -9.56112
|
| 1175 |
+
▁hillside -9.56146
|
| 1176 |
+
▁tools -9.56201
|
| 1177 |
+
▁mark -9.56362
|
| 1178 |
+
ial -9.56538
|
| 1179 |
+
▁frosting -9.56948
|
| 1180 |
+
▁sticks -9.57019
|
| 1181 |
+
▁outdoors -9.57072
|
| 1182 |
+
ral -9.57163
|
| 1183 |
+
cer -9.57499
|
| 1184 |
+
made -9.5813
|
| 1185 |
+
ddler -9.58536
|
| 1186 |
+
▁waves -9.58741
|
| 1187 |
+
▁propeller -9.58959
|
| 1188 |
+
▁puppy -9.58959
|
| 1189 |
+
▁school -9.58959
|
| 1190 |
+
▁built -9.5896
|
| 1191 |
+
▁cupcake -9.5896
|
| 1192 |
+
▁mini -9.58962
|
| 1193 |
+
▁step -9.58962
|
| 1194 |
+
▁string -9.58966
|
| 1195 |
+
▁panda -9.58968
|
| 1196 |
+
▁port -9.59003
|
| 1197 |
+
▁eggs -9.59129
|
| 1198 |
+
fore -9.59259
|
| 1199 |
+
▁pipe -9.59269
|
| 1200 |
+
ster -9.59617
|
| 1201 |
+
ead -9.5992
|
| 1202 |
+
▁flor -9.6181
|
| 1203 |
+
▁block -9.61901
|
| 1204 |
+
▁friend -9.61901
|
| 1205 |
+
▁spread -9.61901
|
| 1206 |
+
▁winter -9.61901
|
| 1207 |
+
▁still -9.61901
|
| 1208 |
+
▁flies -9.61901
|
| 1209 |
+
▁sport -9.61902
|
| 1210 |
+
▁fried -9.61904
|
| 1211 |
+
▁direction -9.61921
|
| 1212 |
+
▁event -9.61939
|
| 1213 |
+
▁landing -9.62011
|
| 1214 |
+
▁cub -9.62039
|
| 1215 |
+
▁trailer -9.62104
|
| 1216 |
+
▁cookie -9.62399
|
| 1217 |
+
▁bi -9.62462
|
| 1218 |
+
ary -9.62694
|
| 1219 |
+
▁eye -9.62776
|
| 1220 |
+
▁barn -9.63204
|
| 1221 |
+
▁prepared -9.63753
|
| 1222 |
+
ball -9.64097
|
| 1223 |
+
▁comp -9.64222
|
| 1224 |
+
▁tin -9.64718
|
| 1225 |
+
▁corn -9.64787
|
| 1226 |
+
▁dip -9.64922
|
| 1227 |
+
▁balanc -9.64931
|
| 1228 |
+
▁pattern -9.64931
|
| 1229 |
+
▁professional -9.64931
|
| 1230 |
+
▁consist -9.64931
|
| 1231 |
+
▁spray -9.64931
|
| 1232 |
+
▁antique -9.64932
|
| 1233 |
+
▁stall -9.64932
|
| 1234 |
+
▁package -9.64945
|
| 1235 |
+
▁town -9.65018
|
| 1236 |
+
▁tag -9.6513
|
| 1237 |
+
▁grapes -9.6552
|
| 1238 |
+
▁does -9.65696
|
| 1239 |
+
▁lemons -9.65937
|
| 1240 |
+
▁note -9.66422
|
| 1241 |
+
▁stickers -9.66544
|
| 1242 |
+
▁figure -9.68056
|
| 1243 |
+
▁furniture -9.68056
|
| 1244 |
+
▁lime -9.68056
|
| 1245 |
+
▁parrot -9.68056
|
| 1246 |
+
▁sofa -9.68056
|
| 1247 |
+
▁outfit -9.68056
|
| 1248 |
+
▁power -9.68058
|
| 1249 |
+
▁disc -9.68059
|
| 1250 |
+
▁notebook -9.68059
|
| 1251 |
+
▁wide -9.68085
|
| 1252 |
+
▁after -9.6809
|
| 1253 |
+
▁chili -9.68091
|
| 1254 |
+
▁hard -9.68108
|
| 1255 |
+
▁tail -9.68122
|
| 1256 |
+
▁features -9.68149
|
| 1257 |
+
▁fry -9.68236
|
| 1258 |
+
▁coin -9.6829
|
| 1259 |
+
▁bit -9.6863
|
| 1260 |
+
▁turned -9.68744
|
| 1261 |
+
▁pointing -9.69232
|
| 1262 |
+
▁bunches -9.69379
|
| 1263 |
+
▁seated -9.70181
|
| 1264 |
+
▁frost -9.70307
|
| 1265 |
+
▁featur -9.71192
|
| 1266 |
+
▁contents -9.71282
|
| 1267 |
+
▁delicious -9.71282
|
| 1268 |
+
▁expired -9.71282
|
| 1269 |
+
▁guitar -9.71282
|
| 1270 |
+
▁leash -9.71282
|
| 1271 |
+
▁snack -9.71282
|
| 1272 |
+
▁steak -9.71282
|
| 1273 |
+
▁name -9.71282
|
| 1274 |
+
▁push -9.71283
|
| 1275 |
+
▁touching -9.71297
|
| 1276 |
+
▁subway -9.71301
|
| 1277 |
+
▁stir -9.71352
|
| 1278 |
+
▁catch -9.72328
|
| 1279 |
+
▁short -9.72397
|
| 1280 |
+
▁whole -9.72414
|
| 1281 |
+
▁waits -9.72591
|
| 1282 |
+
▁wear -9.73765
|
| 1283 |
+
▁trail -9.74385
|
| 1284 |
+
neath -9.74541
|
| 1285 |
+
▁berries -9.74615
|
| 1286 |
+
▁blond -9.74615
|
| 1287 |
+
▁ceramic -9.74615
|
| 1288 |
+
▁condiments -9.74615
|
| 1289 |
+
▁fabric -9.74615
|
| 1290 |
+
▁fancy -9.74615
|
| 1291 |
+
▁stairs -9.74616
|
| 1292 |
+
▁pants -9.74635
|
| 1293 |
+
uzz -9.74642
|
| 1294 |
+
▁ham -9.7487
|
| 1295 |
+
▁peeled -9.7509
|
| 1296 |
+
▁tee -9.75839
|
| 1297 |
+
▁paste -9.75922
|
| 1298 |
+
▁goat -9.75928
|
| 1299 |
+
how -9.75951
|
| 1300 |
+
▁bri -9.76824
|
| 1301 |
+
▁shorts -9.76884
|
| 1302 |
+
top -9.77115
|
| 1303 |
+
ity -9.77287
|
| 1304 |
+
▁kn -9.77848
|
| 1305 |
+
cent -9.77932
|
| 1306 |
+
▁smo -9.78057
|
| 1307 |
+
▁key -9.78062
|
| 1308 |
+
▁craft -9.78063
|
| 1309 |
+
▁visible -9.78063
|
| 1310 |
+
▁beef -9.78067
|
| 1311 |
+
▁rope -9.78084
|
| 1312 |
+
▁flo -9.78113
|
| 1313 |
+
▁rais -9.78124
|
| 1314 |
+
▁sail -9.78141
|
| 1315 |
+
▁net -9.78173
|
| 1316 |
+
▁bro -9.78227
|
| 1317 |
+
▁ram -9.78314
|
| 1318 |
+
▁most -9.78953
|
| 1319 |
+
▁propp -9.79941
|
| 1320 |
+
▁rac -9.80259
|
| 1321 |
+
▁farm -9.80443
|
| 1322 |
+
▁sleep -9.80669
|
| 1323 |
+
▁fall -9.81572
|
| 1324 |
+
▁advertisement -9.81635
|
| 1325 |
+
▁bouquet -9.81635
|
| 1326 |
+
▁museum -9.81635
|
| 1327 |
+
▁restroom -9.81635
|
| 1328 |
+
▁shelves -9.81635
|
| 1329 |
+
▁flown -9.81635
|
| 1330 |
+
▁vest -9.8164
|
| 1331 |
+
▁icing -9.81676
|
| 1332 |
+
▁tank -9.81686
|
| 1333 |
+
▁walkway -9.81766
|
| 1334 |
+
▁seem -9.81848
|
| 1335 |
+
▁shiny -9.82033
|
| 1336 |
+
▁hid -9.8206
|
| 1337 |
+
▁pier -9.82143
|
| 1338 |
+
▁hotel -9.82262
|
| 1339 |
+
▁packed -9.82626
|
| 1340 |
+
band -9.82788
|
| 1341 |
+
vi -9.82908
|
| 1342 |
+
ature -9.8321
|
| 1343 |
+
▁cook -9.83224
|
| 1344 |
+
▁tan -9.83303
|
| 1345 |
+
▁rocky -9.83744
|
| 1346 |
+
▁writ -9.84364
|
| 1347 |
+
▁pass -9.84387
|
| 1348 |
+
bby -9.85172
|
| 1349 |
+
▁bin -9.8518
|
| 1350 |
+
▁reflect -9.85308
|
| 1351 |
+
▁sub -9.85318
|
| 1352 |
+
▁amount -9.85338
|
| 1353 |
+
▁steam -9.85338
|
| 1354 |
+
▁tongue -9.85338
|
| 1355 |
+
▁business -9.85339
|
| 1356 |
+
▁costume -9.85339
|
| 1357 |
+
▁owl -9.85339
|
| 1358 |
+
▁worn -9.85339
|
| 1359 |
+
▁heart -9.85339
|
| 1360 |
+
▁calf -9.8534
|
| 1361 |
+
▁sill -9.85379
|
| 1362 |
+
▁alongside -9.85419
|
| 1363 |
+
▁tent -9.85452
|
| 1364 |
+
▁gro -9.85528
|
| 1365 |
+
▁print -9.85611
|
| 1366 |
+
air -9.85803
|
| 1367 |
+
min -9.86074
|
| 1368 |
+
▁leg -9.87016
|
| 1369 |
+
▁peel -9.88722
|
| 1370 |
+
▁shin -9.89114
|
| 1371 |
+
▁happy -9.89185
|
| 1372 |
+
▁harbor -9.89185
|
| 1373 |
+
▁pillow -9.89185
|
| 1374 |
+
▁roof -9.89185
|
| 1375 |
+
▁sugar -9.89185
|
| 1376 |
+
▁airliner -9.89185
|
| 1377 |
+
▁indoor -9.89185
|
| 1378 |
+
▁ornate -9.89185
|
| 1379 |
+
▁frame -9.89185
|
| 1380 |
+
▁itself -9.89185
|
| 1381 |
+
▁residen -9.89185
|
| 1382 |
+
▁rusted -9.89186
|
| 1383 |
+
▁shopping -9.89246
|
| 1384 |
+
ized -9.89285
|
| 1385 |
+
▁gas -9.8932
|
| 1386 |
+
ator -9.89406
|
| 1387 |
+
▁pitcher -9.8947
|
| 1388 |
+
▁branches -9.89544
|
| 1389 |
+
▁selling -9.89747
|
| 1390 |
+
▁posed -9.89757
|
| 1391 |
+
▁boarder -9.8982
|
| 1392 |
+
▁em -9.90128
|
| 1393 |
+
▁gr -9.9121
|
| 1394 |
+
▁son -9.9131
|
| 1395 |
+
▁hi -9.91573
|
| 1396 |
+
▁ja -9.93084
|
| 1397 |
+
▁fun -9.93085
|
| 1398 |
+
▁figurine -9.93185
|
| 1399 |
+
▁freezer -9.93185
|
| 1400 |
+
▁garbage -9.93185
|
| 1401 |
+
▁goggles -9.93185
|
| 1402 |
+
▁overhead -9.93185
|
| 1403 |
+
▁patio -9.93185
|
| 1404 |
+
▁section -9.93185
|
| 1405 |
+
▁waffle -9.93185
|
| 1406 |
+
▁frog -9.93185
|
| 1407 |
+
▁tasty -9.93186
|
| 1408 |
+
▁mud -9.93187
|
| 1409 |
+
▁belt -9.93188
|
| 1410 |
+
ough -9.93191
|
| 1411 |
+
▁curl -9.93195
|
| 1412 |
+
▁fast -9.93216
|
| 1413 |
+
▁rug -9.93392
|
| 1414 |
+
▁ring -9.93443
|
| 1415 |
+
▁bra -9.93457
|
| 1416 |
+
▁mat -9.94022
|
| 1417 |
+
tial -9.94446
|
| 1418 |
+
▁item -9.95206
|
| 1419 |
+
fri -9.95832
|
| 1420 |
+
▁scatter -9.97351
|
| 1421 |
+
▁celery -9.97351
|
| 1422 |
+
▁faucet -9.97351
|
| 1423 |
+
▁kept -9.97351
|
| 1424 |
+
▁leather -9.97351
|
| 1425 |
+
▁loading -9.97351
|
| 1426 |
+
▁relax -9.97351
|
| 1427 |
+
▁structure -9.97351
|
| 1428 |
+
▁numer -9.97352
|
| 1429 |
+
▁six -9.97352
|
| 1430 |
+
▁asleep -9.97352
|
| 1431 |
+
▁hood -9.97353
|
| 1432 |
+
▁iron -9.97353
|
| 1433 |
+
▁owner -9.97359
|
| 1434 |
+
▁also -9.9736
|
| 1435 |
+
▁qui -9.97365
|
| 1436 |
+
▁let -9.97421
|
| 1437 |
+
▁fallen -9.97444
|
| 1438 |
+
▁class -9.97447
|
| 1439 |
+
▁straw -9.97482
|
| 1440 |
+
rban -9.97534
|
| 1441 |
+
▁bow -9.98064
|
| 1442 |
+
right -10.0112
|
| 1443 |
+
▁produc -10.0114
|
| 1444 |
+
▁accessories -10.017
|
| 1445 |
+
▁bakery -10.017
|
| 1446 |
+
▁ceiling -10.017
|
| 1447 |
+
▁cereal -10.017
|
| 1448 |
+
▁champagne -10.017
|
| 1449 |
+
▁commuter -10.017
|
| 1450 |
+
▁individual -10.017
|
| 1451 |
+
▁sweater -10.017
|
| 1452 |
+
▁neatly -10.017
|
| 1453 |
+
▁practic -10.017
|
| 1454 |
+
▁text -10.017
|
| 1455 |
+
▁headphones -10.017
|
| 1456 |
+
▁rusty -10.017
|
| 1457 |
+
▁cord -10.017
|
| 1458 |
+
▁shar -10.0172
|
| 1459 |
+
▁lid -10.0172
|
| 1460 |
+
▁passes -10.0181
|
| 1461 |
+
thered -10.0262
|
| 1462 |
+
ign -10.0263
|
| 1463 |
+
▁pay -10.0297
|
| 1464 |
+
▁farmer -10.0318
|
| 1465 |
+
▁bush -10.0319
|
| 1466 |
+
▁move -10.033
|
| 1467 |
+
ok -10.0372
|
| 1468 |
+
▁sticker -10.047
|
| 1469 |
+
▁paint -10.0473
|
| 1470 |
+
ador -10.0608
|
| 1471 |
+
▁direct -10.0621
|
| 1472 |
+
▁circle -10.0624
|
| 1473 |
+
▁crochet -10.0624
|
| 1474 |
+
▁garage -10.0624
|
| 1475 |
+
▁magazine -10.0624
|
| 1476 |
+
▁measur -10.0624
|
| 1477 |
+
▁monkey -10.0624
|
| 1478 |
+
▁urinal -10.0624
|
| 1479 |
+
▁marina -10.0624
|
| 1480 |
+
▁speak -10.0624
|
| 1481 |
+
▁roman -10.0625
|
| 1482 |
+
▁marble -10.0625
|
| 1483 |
+
▁crate -10.0636
|
| 1484 |
+
▁doorway -10.0641
|
| 1485 |
+
▁je -10.0645
|
| 1486 |
+
▁real -10.0673
|
| 1487 |
+
▁shoe -10.0804
|
| 1488 |
+
▁talk -10.1004
|
| 1489 |
+
▁potato -10.1091
|
| 1490 |
+
▁chop -10.1099
|
| 1491 |
+
▁advertis -10.1101
|
| 1492 |
+
▁approach -10.1101
|
| 1493 |
+
▁attempt -10.1101
|
| 1494 |
+
▁character -10.1101
|
| 1495 |
+
▁depict -10.1101
|
| 1496 |
+
▁jockey -10.1101
|
| 1497 |
+
▁stream -10.1101
|
| 1498 |
+
▁terminal -10.1101
|
| 1499 |
+
▁burger -10.1101
|
| 1500 |
+
▁jetliner -10.1101
|
| 1501 |
+
▁kiwi -10.1101
|
| 1502 |
+
▁vendor -10.1101
|
| 1503 |
+
▁stunt -10.1101
|
| 1504 |
+
▁collar -10.1101
|
| 1505 |
+
▁palm -10.1101
|
| 1506 |
+
foam -10.1101
|
| 1507 |
+
▁foil -10.1101
|
| 1508 |
+
▁necktie -10.1101
|
| 1509 |
+
▁indicat -10.1101
|
| 1510 |
+
▁burn -10.1101
|
| 1511 |
+
▁than -10.1101
|
| 1512 |
+
▁help -10.1102
|
| 1513 |
+
▁tire -10.1105
|
| 1514 |
+
▁workers -10.1113
|
| 1515 |
+
▁jo -10.1113
|
| 1516 |
+
▁grab -10.1122
|
| 1517 |
+
▁driver -10.1135
|
| 1518 |
+
▁brightly -10.1136
|
| 1519 |
+
▁mar -10.1161
|
| 1520 |
+
▁lab -10.1183
|
| 1521 |
+
▁shak -10.1276
|
| 1522 |
+
ound -10.1287
|
| 1523 |
+
▁pepper -10.138
|
| 1524 |
+
▁dea -10.1406
|
| 1525 |
+
▁fill -10.1557
|
| 1526 |
+
▁laughing -10.1601
|
| 1527 |
+
▁missing -10.1601
|
| 1528 |
+
▁pepperoni -10.1601
|
| 1529 |
+
▁ribbon -10.1601
|
| 1530 |
+
▁shrimp -10.1601
|
| 1531 |
+
▁custom -10.1601
|
| 1532 |
+
▁sniff -10.1601
|
| 1533 |
+
▁rubb -10.1601
|
| 1534 |
+
ique -10.1601
|
| 1535 |
+
▁wool -10.1601
|
| 1536 |
+
▁cartoon -10.1601
|
| 1537 |
+
▁slid -10.1601
|
| 1538 |
+
▁needle -10.1601
|
| 1539 |
+
▁iphone -10.1601
|
| 1540 |
+
▁speed -10.1602
|
| 1541 |
+
▁canoe -10.1602
|
| 1542 |
+
berry -10.1603
|
| 1543 |
+
▁multicolored -10.1616
|
| 1544 |
+
▁bagel -10.1642
|
| 1545 |
+
▁bent -10.1671
|
| 1546 |
+
▁paddle -10.1676
|
| 1547 |
+
▁sla -10.2031
|
| 1548 |
+
most -10.2043
|
| 1549 |
+
▁garnish -10.2127
|
| 1550 |
+
▁america -10.2127
|
| 1551 |
+
▁bikini -10.2127
|
| 1552 |
+
▁connect -10.2127
|
| 1553 |
+
▁focus -10.2127
|
| 1554 |
+
▁grizzl -10.2127
|
| 1555 |
+
▁horseback -10.2127
|
| 1556 |
+
▁jersey -10.2127
|
| 1557 |
+
▁liquid -10.2127
|
| 1558 |
+
▁money -10.2127
|
| 1559 |
+
▁motorbike -10.2127
|
| 1560 |
+
▁newspaper -10.2127
|
| 1561 |
+
▁opposite -10.2127
|
| 1562 |
+
▁ostrich -10.2127
|
| 1563 |
+
▁powder -10.2127
|
| 1564 |
+
▁process -10.2127
|
| 1565 |
+
▁selection -10.2127
|
| 1566 |
+
▁silverware -10.2127
|
| 1567 |
+
▁furry -10.2127
|
| 1568 |
+
▁pocket -10.2127
|
| 1569 |
+
▁relish -10.2127
|
| 1570 |
+
▁jelly -10.2127
|
| 1571 |
+
▁patch -10.2127
|
| 1572 |
+
▁fighter -10.2127
|
| 1573 |
+
▁santa -10.2127
|
| 1574 |
+
▁typing -10.2127
|
| 1575 |
+
▁cold -10.2127
|
| 1576 |
+
▁odd -10.2127
|
| 1577 |
+
▁tasting -10.2128
|
| 1578 |
+
lumin -10.2138
|
| 1579 |
+
▁sharp -10.2142
|
| 1580 |
+
▁wings -10.2143
|
| 1581 |
+
▁stew -10.2146
|
| 1582 |
+
▁team -10.2165
|
| 1583 |
+
▁checker -10.2177
|
| 1584 |
+
▁wait -10.2362
|
| 1585 |
+
form -10.2464
|
| 1586 |
+
bow -10.2598
|
| 1587 |
+
plug -10.2683
|
| 1588 |
+
▁aircraft -10.2683
|
| 1589 |
+
▁bookshelf -10.2683
|
| 1590 |
+
▁cigarette -10.2683
|
| 1591 |
+
▁digital -10.2683
|
| 1592 |
+
▁exhibit -10.2683
|
| 1593 |
+
▁interesting -10.2683
|
| 1594 |
+
▁meadow -10.2683
|
| 1595 |
+
▁muffin -10.2683
|
| 1596 |
+
▁natural -10.2683
|
| 1597 |
+
▁organiz -10.2683
|
| 1598 |
+
▁remov -10.2683
|
| 1599 |
+
▁sweet -10.2683
|
| 1600 |
+
▁celebrat -10.2683
|
| 1601 |
+
▁overlook -10.2683
|
| 1602 |
+
▁baking -10.2683
|
| 1603 |
+
▁wedding -10.2683
|
| 1604 |
+
▁dozen -10.2683
|
| 1605 |
+
▁forward -10.2683
|
| 1606 |
+
▁jeans -10.2683
|
| 1607 |
+
▁sailboat -10.2683
|
| 1608 |
+
guard -10.2684
|
| 1609 |
+
▁dust -10.2684
|
| 1610 |
+
▁hook -10.2685
|
| 1611 |
+
▁beak -10.2686
|
| 1612 |
+
▁action -10.2686
|
| 1613 |
+
▁adjust -10.2693
|
| 1614 |
+
ield -10.2699
|
| 1615 |
+
▁more -10.2703
|
| 1616 |
+
tain -10.275
|
| 1617 |
+
tted -10.2755
|
| 1618 |
+
▁layer -10.2867
|
| 1619 |
+
▁medi -10.2971
|
| 1620 |
+
▁start -10.3157
|
| 1621 |
+
book -10.3187
|
| 1622 |
+
▁avocado -10.3271
|
| 1623 |
+
▁construction -10.3271
|
| 1624 |
+
▁grapefruit -10.3271
|
| 1625 |
+
▁ingredients -10.3271
|
| 1626 |
+
▁instruction -10.3271
|
| 1627 |
+
▁ipod -10.3271
|
| 1628 |
+
▁learning -10.3271
|
| 1629 |
+
▁liquor -10.3271
|
| 1630 |
+
▁ornament -10.3271
|
| 1631 |
+
▁pencils -10.3271
|
| 1632 |
+
▁shallow -10.3271
|
| 1633 |
+
▁tourist -10.3271
|
| 1634 |
+
▁transport -10.3271
|
| 1635 |
+
▁trolley -10.3271
|
| 1636 |
+
▁tulips -10.3271
|
| 1637 |
+
▁clown -10.3271
|
| 1638 |
+
▁magnet -10.3271
|
| 1639 |
+
▁pineapple -10.3271
|
| 1640 |
+
▁loung -10.3271
|
| 1641 |
+
▁finish -10.3271
|
| 1642 |
+
▁first -10.3271
|
| 1643 |
+
▁knives -10.3271
|
| 1644 |
+
▁backyard -10.3271
|
| 1645 |
+
▁operat -10.3271
|
| 1646 |
+
▁knitt -10.3271
|
| 1647 |
+
▁skirt -10.3271
|
| 1648 |
+
▁semi -10.3271
|
| 1649 |
+
▁formal -10.3271
|
| 1650 |
+
▁apartment -10.3272
|
| 1651 |
+
▁mask -10.3274
|
| 1652 |
+
soft -10.3274
|
| 1653 |
+
▁peas -10.3278
|
| 1654 |
+
▁highway -10.3286
|
| 1655 |
+
▁countertop -10.3288
|
| 1656 |
+
▁flight -10.3301
|
| 1657 |
+
▁scissor -10.3677
|
| 1658 |
+
shirt -10.3813
|
| 1659 |
+
phone -10.3817
|
| 1660 |
+
▁perch -10.3842
|
| 1661 |
+
▁plai -10.3863
|
| 1662 |
+
▁calculator -10.3896
|
| 1663 |
+
▁chrome -10.3896
|
| 1664 |
+
▁church -10.3896
|
| 1665 |
+
▁console -10.3896
|
| 1666 |
+
▁flavor -10.3896
|
| 1667 |
+
▁fluffy -10.3896
|
| 1668 |
+
▁motorcyclist -10.3896
|
| 1669 |
+
▁nokia -10.3896
|
| 1670 |
+
▁project -10.3896
|
| 1671 |
+
▁selfie -10.3896
|
| 1672 |
+
▁skating -10.3896
|
| 1673 |
+
▁sneakers -10.3896
|
| 1674 |
+
▁spinach -10.3896
|
| 1675 |
+
▁stretch -10.3896
|
| 1676 |
+
▁transit -10.3896
|
| 1677 |
+
▁vegetation -10.3896
|
| 1678 |
+
▁cheesecake -10.3896
|
| 1679 |
+
▁extreme -10.3896
|
| 1680 |
+
▁material -10.3896
|
| 1681 |
+
▁roast -10.3896
|
| 1682 |
+
▁crouch -10.3896
|
| 1683 |
+
▁hung -10.3896
|
| 1684 |
+
▁fixing -10.3896
|
| 1685 |
+
▁taxi -10.3896
|
| 1686 |
+
▁crib -10.3896
|
| 1687 |
+
▁swan -10.3896
|
| 1688 |
+
▁boots -10.3896
|
| 1689 |
+
▁weather -10.3896
|
| 1690 |
+
▁safe -10.3896
|
| 1691 |
+
▁sunset -10.3896
|
| 1692 |
+
▁strawberry -10.3896
|
| 1693 |
+
▁disk -10.3896
|
| 1694 |
+
▁great -10.3897
|
| 1695 |
+
▁competition -10.3898
|
| 1696 |
+
▁alley -10.3899
|
| 1697 |
+
▁label -10.3901
|
| 1698 |
+
▁hangar -10.3912
|
| 1699 |
+
▁bundle -10.3928
|
| 1700 |
+
ither -10.3942
|
| 1701 |
+
ould -10.3944
|
| 1702 |
+
▁knee -10.395
|
| 1703 |
+
▁page -10.4205
|
| 1704 |
+
▁lemon -10.4341
|
| 1705 |
+
▁sell -10.4478
|
| 1706 |
+
amon -10.4558
|
| 1707 |
+
▁decor -10.456
|
| 1708 |
+
▁antelope -10.4562
|
| 1709 |
+
▁bamboo -10.4562
|
| 1710 |
+
▁blood -10.4562
|
| 1711 |
+
▁circu -10.4562
|
| 1712 |
+
▁dishwasher -10.4562
|
| 1713 |
+
▁habitat -10.4562
|
| 1714 |
+
▁library -10.4562
|
| 1715 |
+
▁located -10.4562
|
| 1716 |
+
▁pancakes -10.4562
|
| 1717 |
+
▁separate -10.4562
|
| 1718 |
+
▁special -10.4562
|
| 1719 |
+
▁tissue -10.4562
|
| 1720 |
+
▁wagon -10.4562
|
| 1721 |
+
▁basin -10.4562
|
| 1722 |
+
▁elaborate -10.4562
|
| 1723 |
+
▁follow -10.4562
|
| 1724 |
+
▁garlic -10.4562
|
| 1725 |
+
▁gravel -10.4562
|
| 1726 |
+
▁mobile -10.4562
|
| 1727 |
+
▁roam -10.4562
|
| 1728 |
+
▁baggage -10.4562
|
| 1729 |
+
▁bank -10.4562
|
| 1730 |
+
▁complete -10.4562
|
| 1731 |
+
▁gravy -10.4562
|
| 1732 |
+
▁attire -10.4563
|
| 1733 |
+
▁dried -10.4563
|
| 1734 |
+
▁ruler -10.4563
|
| 1735 |
+
▁cafe -10.4563
|
| 1736 |
+
▁nap -10.4563
|
| 1737 |
+
▁sold -10.4563
|
| 1738 |
+
▁saying -10.4564
|
| 1739 |
+
▁thread -10.4565
|
| 1740 |
+
▁smoke -10.4565
|
| 1741 |
+
▁candy -10.4571
|
| 1742 |
+
shells -10.4622
|
| 1743 |
+
▁enclos -10.5276
|
| 1744 |
+
▁artistic -10.5277
|
| 1745 |
+
▁cauliflower -10.5277
|
| 1746 |
+
▁charger -10.5277
|
| 1747 |
+
▁chinese -10.5277
|
| 1748 |
+
▁cushion -10.5277
|
| 1749 |
+
▁dispenser -10.5277
|
| 1750 |
+
▁grip -10.5277
|
| 1751 |
+
▁illuminated -10.5277
|
| 1752 |
+
▁industr -10.5277
|
| 1753 |
+
▁kissing -10.5277
|
| 1754 |
+
▁limb -10.5277
|
| 1755 |
+
▁locomotive -10.5277
|
| 1756 |
+
▁monument -10.5277
|
| 1757 |
+
▁price -10.5277
|
| 1758 |
+
▁splash -10.5277
|
| 1759 |
+
▁system -10.5277
|
| 1760 |
+
▁telephone -10.5277
|
| 1761 |
+
▁turkey -10.5277
|
| 1762 |
+
▁tusks -10.5277
|
| 1763 |
+
▁wetsuit -10.5277
|
| 1764 |
+
▁elder -10.5277
|
| 1765 |
+
▁repair -10.5277
|
| 1766 |
+
▁scarf -10.5277
|
| 1767 |
+
terior -10.5277
|
| 1768 |
+
▁wax -10.5277
|
| 1769 |
+
▁groom -10.5277
|
| 1770 |
+
▁punch -10.5277
|
| 1771 |
+
▁storm -10.5277
|
| 1772 |
+
▁barrel -10.5277
|
| 1773 |
+
▁pudd -10.5277
|
| 1774 |
+
▁extend -10.5277
|
| 1775 |
+
▁dump -10.5277
|
| 1776 |
+
▁trop -10.5277
|
| 1777 |
+
▁chas -10.5278
|
| 1778 |
+
▁enter -10.5278
|
| 1779 |
+
▁heav -10.5279
|
| 1780 |
+
▁vanity -10.5279
|
| 1781 |
+
▁lick -10.528
|
| 1782 |
+
▁wok -10.5282
|
| 1783 |
+
ixture -10.5285
|
| 1784 |
+
boat -10.5962
|
| 1785 |
+
work -10.5963
|
| 1786 |
+
▁apart -10.6045
|
| 1787 |
+
▁micro -10.6046
|
| 1788 |
+
▁donkey -10.6046
|
| 1789 |
+
▁honey -10.6046
|
| 1790 |
+
▁huddle -10.6046
|
| 1791 |
+
▁jumbo -10.6046
|
| 1792 |
+
▁pedestal -10.6046
|
| 1793 |
+
▁protect -10.6046
|
| 1794 |
+
▁symbol -10.6046
|
| 1795 |
+
▁wrapping -10.6046
|
| 1796 |
+
▁bloom -10.6046
|
| 1797 |
+
▁break -10.6046
|
| 1798 |
+
▁chest -10.6046
|
| 1799 |
+
▁cruise -10.6046
|
| 1800 |
+
▁gaze -10.6046
|
| 1801 |
+
▁polish -10.6046
|
| 1802 |
+
▁serious -10.6046
|
| 1803 |
+
▁wrist -10.6046
|
| 1804 |
+
▁booth -10.6046
|
| 1805 |
+
▁soldier -10.6046
|
| 1806 |
+
usual -10.6046
|
| 1807 |
+
▁nail -10.6046
|
| 1808 |
+
▁coach -10.6046
|
| 1809 |
+
▁formation -10.6046
|
| 1810 |
+
▁fighting -10.6046
|
| 1811 |
+
▁dvd -10.6046
|
| 1812 |
+
▁bald -10.6046
|
| 1813 |
+
▁sewing -10.6047
|
| 1814 |
+
▁warning -10.6047
|
| 1815 |
+
▁sideways -10.6049
|
| 1816 |
+
▁range -10.6062
|
| 1817 |
+
▁clip -10.6085
|
| 1818 |
+
▁veggie -10.6658
|
| 1819 |
+
▁swim -10.6879
|
| 1820 |
+
▁cluster -10.6879
|
| 1821 |
+
▁mattress -10.6879
|
| 1822 |
+
▁suspende -10.6879
|
| 1823 |
+
▁blueberries -10.6879
|
| 1824 |
+
▁brocolli -10.6879
|
| 1825 |
+
▁conveyor -10.6879
|
| 1826 |
+
▁cucumber -10.6879
|
| 1827 |
+
▁downhill -10.6879
|
| 1828 |
+
▁festival -10.6879
|
| 1829 |
+
▁fountain -10.6879
|
| 1830 |
+
▁frozen -10.6879
|
| 1831 |
+
▁handbag -10.6879
|
| 1832 |
+
▁kayak -10.6879
|
| 1833 |
+
▁language -10.6879
|
| 1834 |
+
▁league -10.6879
|
| 1835 |
+
▁mustache -10.6879
|
| 1836 |
+
▁sleek -10.6879
|
| 1837 |
+
▁vandalized -10.6879
|
| 1838 |
+
▁windowsill -10.6879
|
| 1839 |
+
▁yogurt -10.6879
|
| 1840 |
+
▁twig -10.6879
|
| 1841 |
+
▁strange -10.6879
|
| 1842 |
+
▁peach -10.6879
|
| 1843 |
+
▁drain -10.6879
|
| 1844 |
+
▁root -10.6879
|
| 1845 |
+
▁soap -10.6879
|
| 1846 |
+
▁grind -10.688
|
| 1847 |
+
▁microphone -10.688
|
| 1848 |
+
▁melt -10.6881
|
| 1849 |
+
▁peek -10.6882
|
| 1850 |
+
▁tip -10.7229
|
| 1851 |
+
mote -10.7542
|
| 1852 |
+
▁mushroom -10.7567
|
| 1853 |
+
dog -10.7698
|
| 1854 |
+
white -10.7706
|
| 1855 |
+
life -10.7736
|
| 1856 |
+
q -10.7779
|
| 1857 |
+
▁navy -10.7788
|
| 1858 |
+
tangerines -10.7788
|
| 1859 |
+
▁alcohol -10.7788
|
| 1860 |
+
▁artwork -10.7788
|
| 1861 |
+
▁asphalt -10.7788
|
| 1862 |
+
▁belong -10.7788
|
| 1863 |
+
▁bronze -10.7788
|
| 1864 |
+
▁casserole -10.7788
|
| 1865 |
+
▁charging -10.7788
|
| 1866 |
+
▁chef -10.7788
|
| 1867 |
+
▁cherries -10.7788
|
| 1868 |
+
▁citrus -10.7788
|
| 1869 |
+
▁crosswalk -10.7788
|
| 1870 |
+
▁factory -10.7788
|
| 1871 |
+
▁fireplace -10.7788
|
| 1872 |
+
▁gentlem -10.7788
|
| 1873 |
+
▁herself -10.7788
|
| 1874 |
+
▁junk -10.7788
|
| 1875 |
+
▁kettle -10.7788
|
| 1876 |
+
▁krisp -10.7788
|
| 1877 |
+
▁landscape -10.7788
|
| 1878 |
+
▁leaving -10.7788
|
| 1879 |
+
▁macaroni -10.7788
|
| 1880 |
+
▁squash -10.7788
|
| 1881 |
+
▁thumb -10.7788
|
| 1882 |
+
▁interact -10.7788
|
| 1883 |
+
▁softball -10.7788
|
| 1884 |
+
▁storage -10.7788
|
| 1885 |
+
▁depot -10.7788
|
| 1886 |
+
▁human -10.7788
|
| 1887 |
+
▁quilt -10.7788
|
| 1888 |
+
▁wicker -10.7788
|
| 1889 |
+
▁second -10.7789
|
| 1890 |
+
▁stalk -10.7789
|
| 1891 |
+
▁twin -10.7789
|
| 1892 |
+
▁order -10.7789
|
| 1893 |
+
▁steer -10.7789
|
| 1894 |
+
▁music -10.7789
|
| 1895 |
+
▁mash -10.779
|
| 1896 |
+
▁crane -10.779
|
| 1897 |
+
▁heels -10.779
|
| 1898 |
+
eagle -10.779
|
| 1899 |
+
▁carving -10.779
|
| 1900 |
+
▁appli -10.7793
|
| 1901 |
+
▁swis -10.7796
|
| 1902 |
+
▁ferr -10.78
|
| 1903 |
+
dual -10.7817
|
| 1904 |
+
pokes -10.7818
|
| 1905 |
+
▁lu -10.7963
|
| 1906 |
+
ello -10.7972
|
| 1907 |
+
▁read -10.8426
|
| 1908 |
+
▁appliance -10.8568
|
| 1909 |
+
ripe -10.8612
|
| 1910 |
+
wood -10.8706
|
| 1911 |
+
▁barrier -10.8788
|
| 1912 |
+
▁bicyclist -10.8788
|
| 1913 |
+
▁briefcase -10.8788
|
| 1914 |
+
▁change -10.8788
|
| 1915 |
+
▁convention -10.8788
|
| 1916 |
+
▁conversation -10.8788
|
| 1917 |
+
▁expos -10.8788
|
| 1918 |
+
▁kreme -10.8788
|
| 1919 |
+
▁mannequin -10.8788
|
| 1920 |
+
▁plunger -10.8788
|
| 1921 |
+
▁polka -10.8788
|
| 1922 |
+
▁resembl -10.8788
|
| 1923 |
+
▁savannah -10.8788
|
| 1924 |
+
▁seafood -10.8788
|
| 1925 |
+
▁service -10.8788
|
| 1926 |
+
▁shaggy -10.8788
|
| 1927 |
+
▁skyscraper -10.8788
|
| 1928 |
+
▁spectators -10.8788
|
| 1929 |
+
▁sunflower -10.8788
|
| 1930 |
+
▁tablecloth -10.8788
|
| 1931 |
+
▁wilderness -10.8788
|
| 1932 |
+
▁zone -10.8788
|
| 1933 |
+
▁elev -10.8788
|
| 1934 |
+
▁hallway -10.8788
|
| 1935 |
+
▁pepsi -10.8788
|
| 1936 |
+
▁retriev -10.8788
|
| 1937 |
+
▁sparse -10.8788
|
| 1938 |
+
▁squat -10.8788
|
| 1939 |
+
▁student -10.8788
|
| 1940 |
+
▁cubicle -10.8788
|
| 1941 |
+
▁freight -10.8788
|
| 1942 |
+
▁bubbl -10.8788
|
| 1943 |
+
▁duff -10.8788
|
| 1944 |
+
▁skillet -10.8788
|
| 1945 |
+
▁curve -10.8788
|
| 1946 |
+
▁german -10.8788
|
| 1947 |
+
▁frown -10.8788
|
| 1948 |
+
▁mural -10.8788
|
| 1949 |
+
▁mesh -10.8788
|
| 1950 |
+
▁porch -10.8788
|
| 1951 |
+
▁broad -10.8789
|
| 1952 |
+
▁present -10.8789
|
| 1953 |
+
▁toiletries -10.8789
|
| 1954 |
+
▁mice -10.8789
|
| 1955 |
+
▁fixtures -10.8789
|
| 1956 |
+
▁stem -10.8789
|
| 1957 |
+
were -10.9497
|
| 1958 |
+
▁grape -10.9678
|
| 1959 |
+
print -10.9815
|
| 1960 |
+
waiting -10.9816
|
| 1961 |
+
▁chees -10.9844
|
| 1962 |
+
▁glaze -10.9866
|
| 1963 |
+
▁even -10.9884
|
| 1964 |
+
ploye -10.9899
|
| 1965 |
+
▁audience -10.9899
|
| 1966 |
+
▁bristles -10.9899
|
| 1967 |
+
▁coleslaw -10.9899
|
| 1968 |
+
▁commode -10.9899
|
| 1969 |
+
▁consumption -10.9899
|
| 1970 |
+
▁damage -10.9899
|
| 1971 |
+
▁elegant -10.9899
|
| 1972 |
+
▁entree -10.9899
|
| 1973 |
+
▁environment -10.9899
|
| 1974 |
+
▁inspect -10.9899
|
| 1975 |
+
▁jungle -10.9899
|
| 1976 |
+
▁radio -10.9899
|
| 1977 |
+
▁receipt -10.9899
|
| 1978 |
+
▁return -10.9899
|
| 1979 |
+
▁scratch -10.9899
|
| 1980 |
+
▁slow -10.9899
|
| 1981 |
+
▁spaghetti -10.9899
|
| 1982 |
+
▁surprised -10.9899
|
| 1983 |
+
▁world -10.9899
|
| 1984 |
+
▁describ -10.9899
|
| 1985 |
+
▁pilot -10.9899
|
| 1986 |
+
▁roost -10.9899
|
| 1987 |
+
▁shoot -10.9899
|
| 1988 |
+
▁smell -10.9899
|
| 1989 |
+
▁wedge -10.9899
|
| 1990 |
+
▁hawk -10.99
|
| 1991 |
+
▁placemat -10.99
|
| 1992 |
+
▁sauerk -10.99
|
| 1993 |
+
▁peace -10.99
|
| 1994 |
+
▁loaf -10.99
|
| 1995 |
+
▁dairy -10.99
|
| 1996 |
+
▁worm -10.99
|
| 1997 |
+
▁carousel -10.99
|
| 1998 |
+
▁apron -10.99
|
| 1999 |
+
▁kick -10.99
|
| 2000 |
+
▁begin -10.99
|
src/dataset/sub_tokenizing_captions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/dataset/train_sub_tokenizer.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import sentencepiece as spm
|
| 4 |
+
import yaml
|
| 5 |
+
|
| 6 |
+
# Project-wide configuration, loaded once when this module is imported.
# The tokenizer vocabulary size is read from it in the __main__ guard.
with open("/workspace/params.yaml", mode="r", encoding="utf-8") as f:
    params = yaml.safe_load(f)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def train_sentencepiece(
    json_path,
    model_prefix="sub_tokenizer",
    vocab_size=500,
    model_type="unigram",
    txt_path="/workspace/src/dataset/sub_tokenizing_captions.txt",
):
    """Train a SentencePiece tokenizer on the captions stored in a JSON file.

    Every caption is lower-cased and written, one per line, to ``txt_path``.
    That text file is then handed to ``spm.SentencePieceTrainer.train`` as the
    training corpus.

    Args:
        json_path: Path to a JSON file containing an iterable of items, each
            of which has a ``"captions"`` entry holding a list of strings.
        model_prefix: Forwarded to SentencePiece as ``model_prefix`` (the
            prefix of the files it writes).
        vocab_size: Forwarded to SentencePiece as ``vocab_size``.
        model_type: Forwarded to SentencePiece as ``model_type``.
        txt_path: Location of the intermediate one-caption-per-line corpus.
            Defaults to the path that used to be hard-coded in this function,
            so existing callers behave exactly as before.
    """
    # Read explicitly as UTF-8: the corpus below is written as UTF-8, and
    # relying on the platform default encoding breaks non-ASCII captions on
    # systems whose locale is not UTF-8.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    with open(txt_path, "w", encoding="utf-8") as f:
        f.writelines(
            caption.lower() + "\n"
            for item in data
            for caption in item["captions"]
        )

    spm.SentencePieceTrainer.train(
        input=txt_path,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type=model_type,

        # Special pieces and their fixed ids.
        pad_piece="<pad>",
        bos_piece="<sos>",
        eos_piece="<eos>",
        unk_piece="<unk>",

        pad_id=0,
        bos_id=1,
        eos_id=2,
        unk_id=3,
    )

    print("tokenizer training done")
|
| 48 |
+
|
| 49 |
+
if __name__ == "__main__":
    # Derive the output prefix from the configured vocabulary size. The
    # prefix used to be fixed to "sub_tokenizer2000", so training with any
    # other configured size silently overwrote the "2000" files with a model
    # of a different size.
    sp_vocab_size = params["captioning"]["tokenizer"]["sp_vocab_size"]

    train_sentencepiece(
        json_path="/workspace/data/captioning/annotations/train.json",
        model_prefix=f"/workspace/src/dataset/sub_tokenizer{sp_vocab_size}",
        vocab_size=sp_vocab_size,
        model_type="unigram",
    )
|
src/debug/test_forward.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
sys.path.append("/workspace/src/models")
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# model imports
|
| 7 |
+
from lstm import DecoderLSTM
|
| 8 |
+
from gru import DecoderGRU
|
| 9 |
+
from transformer import DecoderTransformer
|
| 10 |
+
# from transformer_scratch import DecoderTransformer
|
| 11 |
+
|
| 12 |
+
from resnet18 import EncoderResnet18
|
| 13 |
+
from efficientnet import EncoderEfficientNetB0
|
| 14 |
+
from convnext import EncoderConvNextTiny
|
| 15 |
+
from mobilenet import EncoderMobileNetV3Small
|
| 16 |
+
from vit import EncoderViTB16
|
| 17 |
+
from swin import EncoderSwinTiny
|
| 18 |
+
from deit import EncoderDeiTTiny
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# device
device = torch.device(
    "cuda" if torch.cuda.is_available() else "cpu"
)

print(f"device: {device}")


# caption model dummy input
feature = torch.randn(1, 512).to(device)
# Alternative dummy input with an extra middle dimension, kept for reference:
# feature = torch.randn(1, 49, 512).to(device)

caption = torch.tensor(
    [[0, 1, 2, 3, 4]]
).to(device)


### LSTM Forward ###
lstm_model = DecoderLSTM().to(device)

lstm_out = lstm_model(
    feature,
    caption
)

print(f"LSTM: {lstm_out.shape}")


### GRU Forward ###
gru_model = DecoderGRU().to(device)

gru_out = gru_model(
    feature,
    caption
)

print(f"GRU: {gru_out.shape}")


### Transformer Forward ###
transformer_model = DecoderTransformer().to(device)

# The call returns three values and only the first is inspected here.
# The other two are discarded with `_`; they were previously both bound to
# the name `map`, which shadowed the builtin and repeated the same target.
transformer_out, _, _ = transformer_model(
    caption,
    feature,
    0
)

print(f"Transformer: {transformer_out.shape}")


### ResNet18 Forward ###
NUM_CLASSES = 50

resnet18_model = EncoderResnet18(
    num_classes=NUM_CLASSES
).to(device)

dummy_images = torch.randn(
    8, 3, 224, 224
).to(device)

# Unlike the encoders below, this call is unpacked into two values.
logits, features = resnet18_model(
    dummy_images
)

print(f"ResNet18 logits: {logits.shape}")
print(f"ResNet18 features: {features.shape}")


### Remaining encoders: one forward pass each ###
# These sections were six copies of the same build / forward / print
# sequence; the loop prints the same lines in the same order.
# expected for every entry:
# torch.Size([8, 50])
for label, encoder_cls in (
    ("EfficientNet-B0", EncoderEfficientNetB0),
    ("ConvNeXt-Tiny", EncoderConvNextTiny),
    ("MobileNetV3 Small", EncoderMobileNetV3Small),
    ("ViT-B/16", EncoderViTB16),
    ("Swin-T", EncoderSwinTiny),
    ("DeiT-Tiny", EncoderDeiTTiny),
):
    encoder_model = encoder_cls(
        num_classes=NUM_CLASSES
    ).to(device)

    encoder_out = encoder_model(
        dummy_images
    )

    print(f"{label}: {encoder_out.shape}")
|
src/engines/__pycache__/captioning_trainer.cpython-310.pyc
ADDED
|
Binary file (765 Bytes). View file
|
|
|
src/engines/__pycache__/captioning_validator.cpython-310.pyc
ADDED
|
Binary file (734 Bytes). View file
|
|
|
src/engines/__pycache__/classification_trainer.cpython-310.pyc
ADDED
|
Binary file (1.07 kB). View file
|
|
|
src/engines/__pycache__/classification_validator.cpython-310.pyc
ADDED
|
Binary file (972 Bytes). View file
|
|
|
src/engines/__pycache__/resnet18_decoder_trainer.cpython-310.pyc
ADDED
|
Binary file (731 Bytes). View file
|
|
|
src/engines/__pycache__/resnet18_decoder_validator.cpython-310.pyc
ADDED
|
Binary file (1.07 kB). View file
|
|
|
src/engines/captioning_trainer.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def train_one_epoch(
    encoder,
    decoder,
    loader,
    criterion,
    optimizer,
    device,
    scheduler=None
):
    """Run one training pass of the encoder/decoder captioning pair.

    Args:
        encoder: Image encoder; called as ``encoder(images, return_features=True)``.
        decoder: Caption decoder; called as ``decoder(feature, input_caption)``.
        loader: Iterable of ``(images, captions)`` batches that supports ``len()``.
        criterion: Loss callable taking flattened logits and flattened targets.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.
        device: Device the batch tensors are moved to.
        scheduler: Optional learning-rate scheduler, stepped once per batch.

    Returns:
        The sum of per-batch ``loss.item()`` divided by ``len(loader)``.
    """
    encoder.train()
    decoder.train()

    total_loss = 0
    for images, captions in loader:
        images = images.to(device)
        captions = captions.to(device)

        feature = encoder(images, return_features=True)

        # Shift by one position: the decoder is fed every token except the
        # last and is scored against every token except the first.
        input_caption = captions[:, :-1]
        target_caption = captions[:, 1:]

        outputs = decoder(feature, input_caption)

        loss = criterion(
            outputs.reshape(-1, outputs.shape[-1]),
            target_caption.reshape(-1)
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Advance the schedule only after the optimizer update. Stepping the
        # scheduler first makes PyTorch skip the first value of the schedule.
        if scheduler is not None:
            scheduler.step()

        total_loss += loss.item()

    return total_loss / len(loader)
|
src/engines/captioning_validator.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def validation_one_epoch(
    encoder,
    decoder,
    loader,
    criterion,
    device,
):
    """Compute the mean captioning loss over ``loader`` without gradients.

    Args:
        encoder: Image encoder; called as ``encoder(images, return_features=True)``.
        decoder: Caption decoder; called as ``decoder(feature, input_caption)``.
        loader: Iterable of 4-item batches that supports ``len()``; only the
            first two items (images, captions) are used here.
        criterion: Loss callable taking flattened logits and flattened targets.
        device: Device the batch tensors are moved to.

    Returns:
        The sum of per-batch ``loss.item()`` divided by ``len(loader)``.
    """
    for module in (encoder, decoder):
        module.eval()

    running_loss = 0
    with torch.no_grad():
        for images, captions, _unused_a, _unused_b in loader:
            images = images.to(device)
            captions = captions.to(device)

            feature = encoder(images, return_features=True)

            # Decoder input drops the last token; the target drops the first.
            decoder_input = captions[:, :-1]
            decoder_target = captions[:, 1:]

            logits = decoder(feature, decoder_input)
            vocab_dim = logits.shape[-1]

            batch_loss = criterion(
                logits.reshape(-1, vocab_dim),
                decoder_target.reshape(-1),
            )
            running_loss += batch_loss.item()

    return running_loss / len(loader)
|
src/engines/classification_trainer.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from torchmetrics.classification import (
|
| 2 |
+
MulticlassAccuracy
|
| 3 |
+
)
|
| 4 |
+
|
| 5 |
+
from transforms.mixup import mixup_data
|
| 6 |
+
from transforms.cutmix import cutmix_data
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def train_one_epoch(
    model,
    loader,
    criterion,
    optimizer,
    device,
    num_classes,
    augmentation=None
):
    """Train the classifier for one pass over ``loader``.

    Args:
        model: Classifier called as ``model(images)``.
        loader: Iterable of ``(images, labels)`` batches that supports ``len()``.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.
        device: Device the batch tensors and the metric are moved to.
        num_classes: Number of classes passed to ``MulticlassAccuracy``.
        augmentation: ``"mixup"`` or ``"cutmix"`` to mix each batch; any other
            value trains on the batch unchanged.

    Returns:
        ``(mean_loss, accuracy)``. Accuracy is always measured against the
        original, unmixed ``labels``, even when a mixing augmentation is active.
    """
    model.train()

    accuracy = MulticlassAccuracy(num_classes=num_classes).to(device)

    # Resolve the batch-mixing function once; None means no mixing.
    if augmentation == "mixup":
        mix_fn = mixup_data
    elif augmentation == "cutmix":
        mix_fn = cutmix_data
    else:
        mix_fn = None

    running_loss = 0

    for images, labels in loader:
        images = images.to(device)
        labels = labels.to(device)

        if mix_fn is None:
            outputs = model(images)
            loss = criterion(outputs, labels)
        else:
            images, labels_a, labels_b, lam = mix_fn(images, labels)
            outputs = model(images)
            # Blend the two losses with the same weight used to mix the batch.
            loss = (
                lam * criterion(outputs, labels_a)
                + (1 - lam) * criterion(outputs, labels_b)
            )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        accuracy.update(outputs.argmax(dim=1), labels)

    acc = accuracy.compute().item()

    return running_loss / len(loader), acc
|
src/engines/classification_validator.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
|
| 3 |
+
from torchmetrics.classification import (
|
| 4 |
+
MulticlassAccuracy,
|
| 5 |
+
MulticlassF1Score,
|
| 6 |
+
|
| 7 |
+
# precision / recall
|
| 8 |
+
MulticlassPrecision,
|
| 9 |
+
MulticlassRecall
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def validation_one_epoch(
    model,
    loader,
    criterion,
    device,
    num_classes
):
    """Evaluate the classifier over ``loader`` without gradients.

    Args:
        model: Classifier called as ``model(images)``.
        loader: Iterable of ``(images, labels)`` batches that supports ``len()``.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device the batch tensors and the metrics are moved to.
        num_classes: Number of classes passed to the metrics.

    Returns:
        ``(mean_loss, accuracy, macro_f1)``.
    """
    model.eval()

    accuracy = MulticlassAccuracy(num_classes=num_classes).to(device)
    macro_f1 = MulticlassF1Score(
        num_classes=num_classes,
        average="macro",
    ).to(device)
    # NOTE: MulticlassPrecision / MulticlassRecall are imported by this module
    # but not tracked. To report them, build them like `macro_f1`, add them to
    # the tuple updated below, and extend the returned tuple.

    running_loss = 0

    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            running_loss += criterion(outputs, labels).item()

            preds = outputs.argmax(dim=1)
            for tracker in (accuracy, macro_f1):
                tracker.update(preds, labels)

    acc = accuracy.compute().item()
    f1 = macro_f1.compute().item()

    return (
        running_loss / len(loader),
        acc,
        f1,
    )
|
src/metrics/.ipynb_checkpoints/evaluate_caption-checkpoint.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from pycocoevalcap.bleu.bleu import Bleu
|
| 3 |
+
from pycocoevalcap.cider.cider import Cider
|
| 4 |
+
|
| 5 |
+
def evaluate_caption(
    all_generated_sentence,
    all_references
):
    """Score generated captions against their references with BLEU-1..4 and CIDEr.

    Args:
        all_generated_sentence: Iterable of generated captions, one per sample.
        all_references: Iterable of reference-caption collections, aligned by
            position with ``all_generated_sentence``.

    Returns:
        Dict with keys ``bleu1``..``bleu4``, ``cider``, ``generated`` and
        ``references``; the last two are the index-keyed dicts handed to the
        scorers.
    """
    # Both scorers receive dicts keyed by sample index, each mapping to a list
    # of sentences.
    references_dict = {}
    for idx, sentences in enumerate(all_references):
        references_dict[idx] = list(sentences)

    generated_dict = {}
    for idx, sentence in enumerate(all_generated_sentence):
        generated_dict[idx] = [sentence]

    bleu_score, _ = Bleu(4).compute_score(references_dict, generated_dict)
    cider_score, _ = Cider().compute_score(references_dict, generated_dict)

    metric_result = {
        f"bleu{order}": bleu_score[order - 1] for order in range(1, 5)
    }
    metric_result["cider"] = cider_score
    metric_result["generated"] = generated_dict
    metric_result["references"] = references_dict

    return metric_result
|
src/metrics/.ipynb_checkpoints/make_show_all_caption-checkpoint.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
###### Print and return every caption generated at the best-val-loss checkpoint, and save attention heatmaps #####
|
| 2 |
+
import torch
|
| 3 |
+
import random
|
| 4 |
+
from utils.checkpoint_manager import load_checkpoint
|
| 5 |
+
|
| 6 |
+
def make_show_all_caption(
    loader,
    encoder,
    decoder,
    optimizer,
    w2i,
    i2w,
    best_path,
    dec_atten_dir,
    enc_dec_atten_dir,
    SEED,
    heatmap_sample,
    layer,
    device,
    use_subword,
    sp_model_path
):
    """Generate captions for every sample in ``loader`` from the best checkpoint.

    Loads the checkpoint at ``best_path``, generates a caption per image,
    renders the attention heatmaps for the sample at index ``heatmap_sample``,
    prints up to five randomly chosen captions with their references, and
    prints the best validation loss.

    Args:
        loader: Iterable of ``(images, _, batch_references, file_name)`` batches.
        encoder: Image encoder; called as ``encoder(images, return_features=True)``.
        decoder: Decoder exposing ``generate``, ``show_dec_atten`` and
            ``show_cross_atten``.
        optimizer: Passed through to ``load_checkpoint``.
        w2i: Token-to-id mapping containing ``<pad>``, ``<sos>`` and ``<eos>``.
        i2w: Id-to-token mapping; used only when ``use_subword`` is false.
        best_path: Checkpoint path handed to ``load_checkpoint``.
        dec_atten_dir: Output location passed to ``decoder.show_dec_atten``.
        enc_dec_atten_dir: Output location passed to ``decoder.show_cross_atten``.
        SEED: Seed applied to the global ``random`` module before sampling.
        heatmap_sample: Index of the sample whose attention maps are rendered.
        layer: Layer selector forwarded to the heatmap helpers.
        device: Device the images and start tokens are placed on.
        use_subword: Decode ids with SentencePiece instead of ``i2w``.
        sp_model_path: SentencePiece model path; used only when ``use_subword``.

    Returns:
        ``(all_generated_sentence, all_references)``.
    """
    _, best_val_loss = load_checkpoint(
        best_path,
        encoder,
        decoder,
        optimizer,
        device
    )

    sos_id = w2i["<sos>"]
    eos_id = w2i["<eos>"]

    all_references = []
    all_generated_token = []
    all_dec_atten = []
    all_enc_dec_atten = []
    all_images = []
    all_file_name = []

    # Generation is pure inference: run in eval mode with autograd disabled,
    # then put both modules back in the mode the caller left them in.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            for images, _, batch_references, file_name in loader:
                images = images.to(device)

                features = encoder(images, return_features=True)

                generated_token, dec_atten, enc_dec_atten = decoder.generate(
                    features,  # B, 49, 512 (shape per the original author's note)
                    torch.full((features.size(0),), sos_id, device=device),  # B,
                    eos_id,
                )
                # Shape notes below are the original author's.
                all_dec_atten.extend(dec_atten)  # all_B, layers, nhead, seq_len, seq_len
                all_enc_dec_atten.extend(enc_dec_atten)  # all_B, layers, nhead, seq_len, 49
                all_images.extend(images.cpu())
                # Transpose the batched references into one tuple per sample.
                all_references.extend(list(zip(*batch_references)))
                all_generated_token.extend(generated_token)  # all_B, seq_len-1
                all_file_name.extend(file_name)
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    # ==================================
    # SentencePiece tokenizer
    # ==================================
    # Load the model once; it does not change between sentences.
    sp = None
    if use_subword:
        import sentencepiece as spm

        sp = spm.SentencePieceProcessor()
        sp.load(sp_model_path)
    special_ids = (w2i["<pad>"], sos_id, eos_id)

    all_generated_sentence = []
    for sentence_token in all_generated_token:
        # Keep only the tokens before the first <eos>.
        if eos_id in sentence_token:
            end_inx = sentence_token.index(eos_id)
            sentence_token = sentence_token[:end_inx]

        if use_subword:
            # Remove special tokens before decoding.
            sentence_token = [
                token for token in sentence_token
                if token not in special_ids
            ]
            sentence = sp.decode(sentence_token)
        else:
            sentence = ' '.join(i2w[i] for i in sentence_token)

        all_generated_sentence.append(sentence)  # one sentence per sample

    decoder.show_dec_atten(all_dec_atten[heatmap_sample], all_generated_sentence[heatmap_sample].split(), layer, dec_atten_dir)
    decoder.show_cross_atten(all_enc_dec_atten[heatmap_sample], all_generated_sentence[heatmap_sample].split(), layer, all_images[heatmap_sample], enc_dec_atten_dir)

    random.seed(SEED)
    all_B = len(all_generated_sentence)
    # random.sample raises ValueError when asked for more items than exist.
    sample = random.sample(range(0, all_B), min(5, all_B))
    for i in sample:
        print("-" * 60)
        print(f' {all_file_name[i]}: {all_generated_sentence[i]}')
        print("-" * 60)

        for inx, reference in enumerate(all_references[i], start=1):
            print(f'Reference {inx}: {reference}')
        print("=" * 60)

    print(f'Best Val Loss: {best_val_loss}')

    return all_generated_sentence, all_references
|
| 105 |
+
|