Spaces:
Sleeping
Sleeping
import os
from collections import Counter
from itertools import islice

from datasets import load_dataset
from dotenv import load_dotenv
# ============================================================
# [Settings]
# ============================================================
# Load variables from a local .env file into the process environment so the
# token lookup below can see them.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# Name of the Hugging Face dataset to inspect.
DATASET_NAME = "jbarat/plant_species"  # e.g. "uran66/animals"

# Name of the split to inspect.
SPLIT_NAME = "train"

# Name of the field that holds the label.
LABEL_FIELD_NAME = "label"

# Whether to use streaming.
#   True  : read examples one at a time without downloading the whole
#           dataset in advance.
#   False : download the dataset into the local cache first, then inspect it.
USE_STREAMING = True

# For datasets with plain string labels, exact counts are only known after
# scanning all of the data.
# None scans everything; a number scans only that many examples.
MAX_SCAN_ITEMS = None
# ============================================================
def get_label_name(dataset, label_value):
    """Return the display name for one raw label value.

    Args:
        dataset: Object exposing a ``features`` mapping that describes the
            ``LABEL_FIELD_NAME`` column. ``features`` may be ``None``; the
            ``datasets`` library allows this for streaming datasets that do
            not declare a schema.
        label_value: Raw label value taken from an example.

    Returns:
        str: ``label_feature.int2str(label_value)`` when the label feature
        offers ``int2str`` and the value is an ``int``; otherwise
        ``str(label_value)``.
    """
    features = dataset.features
    # Without schema information there is nothing to map through, so the
    # value is stringified instead of failing on ``None[...]``.
    if features is None:
        return str(label_value)

    label_feature = features[LABEL_FIELD_NAME]

    # Integer labels backed by a feature with ``int2str`` are converted to
    # their class names.
    if hasattr(label_feature, "int2str") and isinstance(label_value, int):
        return label_feature.int2str(label_value)

    # Anything else (e.g. labels that are already strings) is used as text.
    return str(label_value)
def get_unique_labels_with_counts():
    """Tally and print how many scanned examples carry each label.

    Loads ``DATASET_NAME`` / ``SPLIT_NAME`` with ``load_dataset`` (streaming
    according to ``USE_STREAMING``, authenticated with ``HF_TOKEN``), counts
    the ``LABEL_FIELD_NAME`` value of each scanned example, prints a
    per-label report followed by totals, and returns the tally.

    At most ``MAX_SCAN_ITEMS`` examples are read; ``None`` scans the whole
    split. ``MAX_SCAN_ITEMS`` must be ``None`` or a non-negative integer.
    Examples whose label is ``None`` are skipped.

    Returns:
        Counter: label name -> number of scanned examples with that label.
    """
    # NOTE(review): the message literals in this function look mis-encoded
    # (probably Korean text read with the wrong codec). They are kept
    # verbatim here -- confirm the file's encoding before changing them.
    print(f"[{DATASET_NAME}] ๋ฐ์ดํฐ์ ๋ก๋ ์ค...")
    dataset = load_dataset(
        DATASET_NAME,
        split=SPLIT_NAME,
        streaming=USE_STREAMING,
        token=HF_TOKEN,
    )

    # ``features`` can be None when no schema is available; in that case the
    # labels are treated as plain values and reported in sorted order.
    features = dataset.features
    label_feature = features[LABEL_FIELD_NAME] if features is not None else None

    label_counter = Counter()
    print("\nํด๋์ค๋ณ ์ด๋ฏธ์ง ๊ฐ์ ์ง๊ณ ์ค...")

    # islice stops after MAX_SCAN_ITEMS examples without pulling an extra one
    # from the (possibly streamed) dataset; a stop of None means "no limit".
    for item in islice(dataset, MAX_SCAN_ITEMS):
        label_value = item.get(LABEL_FIELD_NAME)

        # Examples without a label value are not counted.
        if label_value is None:
            continue

        if label_feature is None:
            label_name = str(label_value)
        else:
            # Integer class ids become class names; other values become text.
            label_name = get_label_name(dataset, label_value)

        label_counter[label_name] += 1

    print("\n๋ผ๋ฒจ ๋ชฉ๋ก ๋ฐ ํด๋์ค๋ณ ์ด๋ฏธ์ง ๊ฐ์")
    print("-" * 60)

    # When the label feature declares its class names, report every declared
    # name in that order (labels never seen show a count of 0). Otherwise
    # report the labels that were seen, sorted by name.
    declared_names = getattr(label_feature, "names", None)
    if declared_names is not None:
        label_names = declared_names
    else:
        label_names = sorted(label_counter)

    for idx, label_name in enumerate(label_names):
        count = label_counter.get(label_name, 0)
        print(f"{idx}: {label_name} - {count} ์ฅ")

    print("-" * 60)
    print(f"์ด ๋ผ๋ฒจ ๊ฐ์: {len(label_counter)}")
    print(f"์ด ์ด๋ฏธ์ง ๊ฐ์: {sum(label_counter.values())}")
    return label_counter
# Run the label report only when this file is executed as a script.
if __name__ == "__main__":
    get_unique_labels_with_counts()