Spaces:

ImAMJayKIM
/

Mini-ImageNet

Sleeping

App Files Files Community

Mini-ImageNet / src /caption /check_clip_score.py

ImAMJayKIM

Upload 96 files

c1596ac verified 26 days ago

Raw

History Blame Contribute Delete

13.2 kB

	import json
	from pathlib import Path
	from typing import Any

	import numpy as np
	import torch
	import torch.nn.functional as F
	from PIL import Image
	from tqdm import tqdm
	from transformers import CLIPModel, CLIPProcessor
	import os
	from dotenv import load_dotenv

	# ============================================================
	# 설정값
	# ============================================================

	load_dotenv()

	# .env 안의 HF_TOKEN 읽기
	hf_token = os.getenv("HF_TOKEN")

	# 전체 클래스를 검수하려면 True
	# 특정 클래스만 검수하려면 False
	CHECK_ALL_CLASSES = True

	# 전체 클래스 검수 시 기준이 되는 raw 데이터 루트
	DATA_RAW_ROOT_DIR = Path("data/raw")

	# 특정 클래스만 검수할 때 사용할 클래스 폴더 경로
	# CHECK_ALL_CLASSES = False 일 때만 사용됨
	TARGET_CLASS_DIR = Path("data/raw")

	# 입력 JSON 파일
	INPUT_JSON_PATH = Path("data/annotations/captions_flo_all.json")

	# 출력 JSON 파일
	OUTPUT_JSON_PATH = Path("data/annotations/clip_checked_flo_all.json")

	# 사용할 CLIP 모델
	MODEL_NAME = "openai/clip-vit-base-patch32"

	# 한 번에 처리할 이미지-캡션 쌍 개수
	BATCH_SIZE = 32

	# 하위 몇 %를 fail / review로 볼지
	FAIL_BOTTOM_PERCENT = 10
	REVIEW_BOTTOM_PERCENT = 20

	print("경로 : " , INPUT_JSON_PATH)

	# ============================================================
	# JSON 입출력
	# ============================================================

	def load_json(path: Path) -> list[dict[str, Any]]:
	with path.open("r", encoding="utf-8") as f:
	data = json.load(f)

	if not isinstance(data, list):
	raise ValueError("입력 JSON은 반드시 배열 형태여야 합니다.")

	return data


	def save_json(data: list[dict[str, Any]], path: Path) -> None:
	path.parent.mkdir(parents=True, exist_ok=True)

	with path.open("w", encoding="utf-8") as f:
	json.dump(data, f, ensure_ascii=False, indent=4)


	# ============================================================
	# 클래스 / 경로 처리
	# ============================================================

	def get_target_class_name() -> str:
	"""
	TARGET_CLASS_DIR = data/raw/airplane 이면 airplane 반환
	"""
	return TARGET_CLASS_DIR.name


	def get_class_name_from_image_value(image_value: str) -> str:
	"""
	JSON의 image 값이 airplane/hf_airplane_001.jpg 라면 airplane 반환
	"""
	image_value = image_value.replace("\\", "/")
	image_path = Path(image_value)

	if len(image_path.parts) < 2:
	return ""

	return image_path.parts[0]


	def is_target_item(item: dict[str, Any]) -> bool:
	"""
	CHECK_ALL_CLASSES = True:
	모든 item 처리

	CHECK_ALL_CLASSES = False:
	TARGET_CLASS_DIR.name과 JSON image의 첫 번째 폴더명이 같은 item만 처리
	"""
	if CHECK_ALL_CLASSES:
	return True

	image_value = str(item.get("image", ""))
	image_class_name = get_class_name_from_image_value(image_value)

	return image_class_name == get_target_class_name()


	def resolve_image_path(image_value: str) -> Path:
	"""
	JSON:
	"image": "airplane/hf_airplane_001.jpg"

	전체 클래스 검수:
	DATA_RAW_ROOT_DIR / image
	→ data/raw/airplane/hf_airplane_001.jpg

	특정 클래스 검수:
	TARGET_CLASS_DIR / 파일명
	→ data/raw/airplane/hf_airplane_001.jpg
	"""
	image_value = image_value.replace("\\", "/")
	image_path = Path(image_value)

	if CHECK_ALL_CLASSES:
	return DATA_RAW_ROOT_DIR / image_path

	return TARGET_CLASS_DIR / image_path.name


	def load_image(image_path: Path) -> Image.Image \| None:
	try:
	with Image.open(image_path) as img:
	return img.convert("RGB").copy()
	except Exception:
	return None


	# ============================================================
	# 캡션 펼치기
	# ============================================================

	def flatten_caption_items(data: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
	"""
	이미지 1장에 caption 3개가 있으면
	이미지-캡션 쌍 3개로 펼친다.
	"""
	target_data = []
	flat_items = []

	for item in data:
	if not is_target_item(item):
	continue

	target_item_index = len(target_data)
	target_data.append(item)

	image_value = str(item.get("image", ""))
	captions = item.get("captions", [])

	if not isinstance(captions, list):
	captions = []

	for caption_index, caption in enumerate(captions):
	flat_items.append({
	"item_index": target_item_index,
	"caption_index": caption_index,
	"image": image_value,
	"class": item.get("class", ""),
	"split": item.get("split", ""),
	"caption": str(caption).strip()
	})

	return target_data, flat_items


	# ============================================================
	# CLIP Score 계산
	# ============================================================

	@torch.no_grad()
	def compute_clip_scores(
	flat_items: list[dict[str, Any]],
	model: CLIPModel,
	processor: CLIPProcessor,
	device: torch.device
	) -> list[dict[str, Any]]:

	results = []

	for start in tqdm(range(0, len(flat_items), BATCH_SIZE), desc="computing CLIP scores"):
	batch_items = flat_items[start:start + BATCH_SIZE]

	valid_items = []
	images = []
	texts = []

	for item in batch_items:
	image_path = resolve_image_path(item["image"])
	image = load_image(image_path)

	if image is None:
	results.append({
	**item,
	"resolved_image_path": str(image_path).replace("\\", "/"),
	"clip_cosine": None,
	"clip_score": None,
	"clip_status": "missing_image",
	"clip_reason": f"image file could not be opened: {image_path}"
	})
	continue

	caption = item["caption"]

	if not caption:
	results.append({
	**item,
	"resolved_image_path": str(image_path).replace("\\", "/"),
	"clip_cosine": None,
	"clip_score": None,
	"clip_status": "empty_caption",
	"clip_reason": "caption is empty"
	})
	continue

	valid_items.append({
	**item,
	"resolved_image_path": str(image_path).replace("\\", "/")
	})
	images.append(image)
	texts.append(caption)

	if not valid_items:
	continue

	inputs = processor(
	text=texts,
	images=images,
	return_tensors="pt",
	padding=True,
	truncation=True
	)

	inputs = {
	key: value.to(device)
	for key, value in inputs.items()
	}

	outputs = model(
	input_ids=inputs["input_ids"],
	attention_mask=inputs["attention_mask"],
	pixel_values=inputs["pixel_values"]
	)

	image_features = outputs.image_embeds
	text_features = outputs.text_embeds

	image_features = F.normalize(image_features, p=2, dim=1)
	text_features = F.normalize(text_features, p=2, dim=1)

	cosine_scores = (image_features * text_features).sum(dim=1)

	for item, cosine in zip(valid_items, cosine_scores):
	cosine_value = float(cosine.detach().cpu().item())
	clip_score = 2.5 * max(cosine_value, 0.0)

	results.append({
	**item,
	"clip_cosine": round(cosine_value, 6),
	"clip_score": round(clip_score, 6),
	"clip_status": "pending",
	"clip_reason": ""
	})

	return results


	# ============================================================
	# pass / review / fail 판정
	# ============================================================

	def assign_clip_status(results: list[dict[str, Any]]) -> None:
	valid_scores = [
	result["clip_score"]
	for result in results
	if isinstance(result.get("clip_score"), float)
	]

	if not valid_scores:
	return

	fail_threshold = np.percentile(valid_scores, FAIL_BOTTOM_PERCENT)
	review_threshold = np.percentile(valid_scores, REVIEW_BOTTOM_PERCENT)

	for result in results:
	clip_score = result.get("clip_score")

	if clip_score is None:
	continue

	if clip_score <= fail_threshold:
	result["clip_status"] = "fail"
	result["clip_reason"] = f"clip score is in the bottom {FAIL_BOTTOM_PERCENT}%"
	elif clip_score <= review_threshold:
	result["clip_status"] = "review"
	result["clip_reason"] = f"clip score is in the bottom {REVIEW_BOTTOM_PERCENT}%"
	else:
	result["clip_status"] = "pass"
	result["clip_reason"] = "clip score is acceptable"


	# ============================================================
	# 결과를 원래 JSON 구조에 붙이기
	# ============================================================

	def attach_results_to_data(
	target_data: list[dict[str, Any]],
	results: list[dict[str, Any]]
	) -> list[dict[str, Any]]:

	for item in target_data:
	item["caption_checks"] = []

	results = sorted(
	results,
	key=lambda x: (x["item_index"], x["caption_index"])
	)

	for result in results:
	item_index = result["item_index"]

	check = {
	"caption_index": result["caption_index"],
	"caption": result["caption"],
	"resolved_image_path": result.get("resolved_image_path"),
	"clip_cosine": result.get("clip_cosine"),
	"clip_score": result.get("clip_score"),
	"clip_status": result.get("clip_status"),
	"clip_reason": result.get("clip_reason", "")
	}

	target_data[item_index]["caption_checks"].append(check)

	return target_data


	# ============================================================
	# 요약 출력
	# ============================================================

	def print_summary(
	target_data: list[dict[str, Any]],
	flat_items: list[dict[str, Any]],
	results: list[dict[str, Any]]
	) -> None:

	status_count = {}
	valid_scores = []

	for result in results:
	status = result.get("clip_status", "unknown")
	status_count[status] = status_count.get(status, 0) + 1

	if isinstance(result.get("clip_score"), float):
	valid_scores.append(result["clip_score"])

	print("\n===== CLIP Score Summary =====")
	print(f"check all classes: {CHECK_ALL_CLASSES}")

	if CHECK_ALL_CLASSES:
	print(f"data raw root dir: {DATA_RAW_ROOT_DIR}")
	else:
	print(f"target class dir: {TARGET_CLASS_DIR}")
	print(f"target class name: {get_target_class_name()}")

	print(f"target images: {len(target_data)}")
	print(f"target image-caption pairs: {len(flat_items)}")
	print(f"status count: {status_count}")

	if valid_scores:
	print(f"min score: {min(valid_scores):.4f}")
	print(f"max score: {max(valid_scores):.4f}")
	print(f"mean score: {np.mean(valid_scores):.4f}")
	print(f"bottom {FAIL_BOTTOM_PERCENT}% threshold: {np.percentile(valid_scores, FAIL_BOTTOM_PERCENT):.4f}")
	print(f"bottom {REVIEW_BOTTOM_PERCENT}% threshold: {np.percentile(valid_scores, REVIEW_BOTTOM_PERCENT):.4f}")


	# ============================================================
	# 실행
	# ============================================================

	def main():
	if not INPUT_JSON_PATH.exists():
	raise FileNotFoundError(f"input file not found: {INPUT_JSON_PATH}")

	if CHECK_ALL_CLASSES:
	if not DATA_RAW_ROOT_DIR.exists():
	raise FileNotFoundError(f"data raw root directory not found: {DATA_RAW_ROOT_DIR}")
	else:
	if not TARGET_CLASS_DIR.exists():
	raise FileNotFoundError(f"target class directory not found: {TARGET_CLASS_DIR}")

	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	print(f"device: {device}")

	print(f"loading model: {MODEL_NAME}")
	model = CLIPModel.from_pretrained(MODEL_NAME, token=hf_token).to(device)
	processor = CLIPProcessor.from_pretrained(MODEL_NAME, token=hf_token)
	model.eval()

	data = load_json(INPUT_JSON_PATH)
	target_data, flat_items = flatten_caption_items(data)

	if not target_data:
	raise ValueError("검수 대상 데이터가 없습니다. CHECK_ALL_CLASSES 또는 TARGET_CLASS_DIR 설정을 확인하세요.")

	results = compute_clip_scores(
	flat_items=flat_items,
	model=model,
	processor=processor,
	device=device
	)

	assign_clip_status(results)

	checked_data = attach_results_to_data(target_data, results)

	save_json(checked_data, OUTPUT_JSON_PATH)

	print_summary(target_data, flat_items, results)
	print(f"\nsaved: {OUTPUT_JSON_PATH}")


	if __name__ == "__main__":
	main()