# Scraped from a Hugging Face Space file viewer
# (Space status: Sleeping; file size: 4,091 bytes; commit c1596ac).
from datasets import load_dataset
from collections import Counter
from dotenv import load_dotenv
import os
# ============================================================
# [Configuration]
# ============================================================
# Load variables from a local .env file (if present) into the environment
# before the token is read.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# Name of the Hugging Face dataset to inspect.
DATASET_NAME = "jbarat/plant_species"  # e.g. "uran66/animals"

# Name of the split to inspect.
SPLIT_NAME = "train"

# Name of the field that holds the label.
LABEL_FIELD_NAME = "label"

# Whether to use streaming.
#   True  : read examples one at a time without downloading the whole
#           dataset up front.
#   False : download the dataset into the local cache first, then inspect it.
USE_STREAMING = True

# For datasets whose labels are plain strings, the whole dataset has to be
# scanned to get exact counts.
# None scans everything; an integer limits the scan to that many examples.
MAX_SCAN_ITEMS = None
# ============================================================
def get_label_name(dataset, label_value):
    """Return the display name for a single raw label value.

    Args:
        dataset: Object exposing a ``features`` attribute (a mapping from
            field name to feature, or ``None`` when no schema is available).
        label_value: Raw label taken from one example.

    Returns:
        The class name produced by the label feature's ``int2str`` when the
        feature offers that method and ``label_value`` is an ``int``;
        otherwise ``str(label_value)``.

    Raises:
        KeyError: If ``features`` is available but has no entry for
            ``LABEL_FIELD_NAME``.
    """
    features = dataset.features

    # A streaming dataset can report ``features`` as None when its schema is
    # not known up front; indexing None would raise TypeError, so fall back
    # to the plain string form of the label.
    if features is None:
        return str(label_value)

    label_feature = features[LABEL_FIELD_NAME]

    # ClassLabel-style features map integer ids to class names.
    if hasattr(label_feature, "int2str") and isinstance(label_value, int):
        return label_feature.int2str(label_value)

    # Labels that are already strings (or anything else) are used as-is.
    return str(label_value)
def get_unique_labels_with_counts():
    """Count the examples per label in the configured dataset split.

    Loads ``DATASET_NAME`` / ``SPLIT_NAME`` through ``load_dataset`` (using
    ``USE_STREAMING`` and ``HF_TOKEN``), iterates over the examples, tallies
    the value found under ``LABEL_FIELD_NAME`` and prints a per-label report
    followed by the totals.

    Returns:
        collections.Counter: Maps label name (``str``) to the number of
        scanned examples carrying that label.

    Raises:
        KeyError: If the split reports its features but has no
            ``LABEL_FIELD_NAME`` entry.
    """
    print(f"[{DATASET_NAME}] 데이터셋 로드 중...")

    dataset = load_dataset(
        DATASET_NAME,
        split=SPLIT_NAME,
        streaming=USE_STREAMING,
        token=HF_TOKEN,
    )

    # Fetch the label feature from the dataset schema. A streaming dataset
    # may report ``features`` as None when the schema is not known up front;
    # in that case labels are treated as plain strings.
    features = dataset.features
    label_feature = None if features is None else features[LABEL_FIELD_NAME]

    # Per-class example counts.
    label_counter = Counter()

    print("\n클래스별 이미지 개수 집계 중...")

    # Iterating works for both streaming and fully downloaded datasets.
    for idx, item in enumerate(dataset):
        # Stop once the optional scan limit has been reached.
        if MAX_SCAN_ITEMS is not None and idx >= MAX_SCAN_ITEMS:
            break

        label_value = item.get(LABEL_FIELD_NAME)

        # Skip examples that carry no label.
        if label_value is None:
            continue

        # Integer labels are mapped to their class name; string labels are
        # kept as they are. Without a schema there is nothing to map with.
        if features is None:
            label_name = str(label_value)
        else:
            label_name = get_label_name(dataset, label_value)

        label_counter[label_name] += 1

    print("\n라벨 목록 및 클래스별 이미지 개수")
    print("-" * 60)

    # When the label feature declares its class names (ClassLabel), report in
    # the dataset's own label order, including classes that were never seen.
    # Otherwise report the labels that were actually observed, sorted by name.
    declared_names = getattr(label_feature, "names", None)
    if declared_names is not None:
        label_names = declared_names
    else:
        label_names = sorted(label_counter)

    for idx, label_name in enumerate(label_names):
        # Counter returns 0 for a missing key without inserting it, so the
        # totals printed below are not affected by unseen classes.
        count = label_counter[label_name]
        print(f"{idx}: {label_name} - {count} 장")

    print("-" * 60)
    print(f"총 라벨 개수: {len(label_counter)}")
    print(f"총 이미지 개수: {sum(label_counter.values())}")

    return label_counter
# Run the label report only when this file is executed as a script.
if __name__ == "__main__":
    get_unique_labels_with_counts()