Spaces:
Sleeping
Sleeping
| from datasets import load_dataset | |
# ============================================================
# [Settings]
# ============================================================
# Name of the Hugging Face dataset to inspect.
DATASET_NAME = "KrushiJethe/fashion_data"  # (author's note: uran66/animals)
# Name of the split to inspect.
SPLIT_NAME = "train"
# Name of the field (column) that holds the label.
LABEL_FIELD_NAME = "articleType"
# Whether to load the dataset in streaming mode.
# streaming=True is fine when only the label structure is being inspected.
USE_STREAMING = True
# For datasets whose labels are plain strings, the whole dataset may have to
# be scanned. None scans everything; an integer limits the scan to that many
# samples.
MAX_SCAN_ITEMS = None
# ============================================================
def _print_label_report(label_names):
    """Print an indexed listing of ``label_names`` followed by the total count."""
    print("\n๋ผ๋ฒจ ๋ชฉ๋ก")
    print("-" * 50)
    for idx, label_name in enumerate(label_names):
        print(f"{idx}: {label_name}")
    print("-" * 50)
    print(f"์ด ๋ผ๋ฒจ ๊ฐ์: {len(label_names)}")


def get_unique_labels():
    """Print and return the distinct labels of the configured dataset.

    The dataset, split, label field, streaming mode and scan limit are read
    from the module-level settings (``DATASET_NAME``, ``SPLIT_NAME``,
    ``LABEL_FIELD_NAME``, ``USE_STREAMING``, ``MAX_SCAN_ITEMS``).

    Two strategies are used:

    1. If the label field's feature exposes a non-``None`` ``names``
       attribute (e.g. a ``ClassLabel``, as in Food101), the names are taken
       from the feature metadata without iterating over any rows.
    2. Otherwise the rows are iterated (up to ``MAX_SCAN_ITEMS`` when it is
       not ``None``), every non-``None`` label value is converted to ``str``,
       and the distinct values are returned sorted.

    Returns:
        The label names: the feature's own ``names`` object in case 1, or a
        sorted ``list[str]`` in case 2.

    Raises:
        KeyError: If feature metadata is available but does not contain
            ``LABEL_FIELD_NAME``.
    """
    print(f"[{DATASET_NAME}] ๋ฐ์ดํฐ์ ๋ก๋ ์ค...")
    dataset = load_dataset(
        DATASET_NAME,
        split=SPLIT_NAME,
        streaming=USE_STREAMING,
    )

    # Feature metadata is not guaranteed to exist: a streamed dataset can
    # report ``features`` as None when its schema is not declared up front.
    # In that case skip the metadata lookup and fall back to scanning rows
    # instead of crashing on ``None[...]``.
    features = getattr(dataset, "features", None)
    label_feature = None
    if features is not None:
        if LABEL_FIELD_NAME not in features:
            raise KeyError(
                f"Label field {LABEL_FIELD_NAME!r} not found in dataset "
                f"features; available fields: {list(features)}"
            )
        label_feature = features[LABEL_FIELD_NAME]

    # ------------------------------------------------------------
    # 1. The label is a ClassLabel-style feature (as in Food101).
    # ------------------------------------------------------------
    # The full list of label names is available from the feature metadata,
    # so there is no need to iterate over the data.
    class_names = getattr(label_feature, "names", None)
    if class_names is not None:
        _print_label_report(class_names)
        return class_names

    # ------------------------------------------------------------
    # 2. The label is stored directly as a value in every row.
    # ------------------------------------------------------------
    # The rows have to be iterated and the values de-duplicated by hand.
    unique_labels = set()
    print("\n๋ผ๋ฒจ ํ๋๊ฐ ClassLabel ํ์ ์ด ์๋๋ฏ๋ก ๋ฐ์ดํฐ๋ฅผ ์ํํฉ๋๋ค...")
    for idx, item in enumerate(dataset):
        # Honour the optional scan limit.
        if MAX_SCAN_ITEMS is not None and idx >= MAX_SCAN_ITEMS:
            break
        label_value = item.get(LABEL_FIELD_NAME)
        if label_value is None:
            # Rows without a label contribute nothing.
            continue
        unique_labels.add(str(label_value))

    label_names = sorted(unique_labels)
    _print_label_report(label_names)
    return label_names
# Script entry point: print the label listing when run directly.
if __name__ == "__main__":
    get_unique_labels()