Spaces:
No application file
No application file
def inspect_tokenized_dataset(dataset, num_samples: int = 3) -> None:
    """Print the first ``num_samples`` samples of a tokenized dataset for debugging.

    For every inspected sample the ``input_ids``, ``attention_mask`` and
    ``labels`` fields are printed, followed by a check that ``labels`` is a
    ``list`` holding only ``int`` values.  After the per-sample dump, the
    lengths of ``input_ids`` and ``labels`` of those same samples are printed.

    Args:
        dataset: Sized collection that supports integer indexing and whose
            items expose ``.get(key)`` (for example a list of dicts).
        num_samples: Maximum number of leading samples to inspect.

    Returns:
        None. Everything is written to stdout.
    """

    def _length_or_none(value):
        # A missing field (None) or an unsized value must not abort a
        # diagnostic dump; report its length as None instead.
        try:
            return len(value)
        except TypeError:
            return None

    print(f"\n๐ Inspecting first {num_samples} samples in tokenized dataset...")

    input_lens = []
    label_lens = []
    for i in range(min(num_samples, len(dataset))):
        sample = dataset[i]
        input_ids = sample.get("input_ids")
        labels = sample.get("labels")

        print(f"\n=== Sample {i} ===")
        print("input_ids:", input_ids)
        print("attention_mask:", sample.get("attention_mask"))
        print("labels:", labels)

        # Type check: labels must be a list made up of ints only.
        if not isinstance(labels, list):
            print(f"โ Sample {i}: labels๊ฐ list๊ฐ ์๋ โ {type(labels)}")
        elif not all(isinstance(x, int) for x in labels):
            print(f"โ Sample {i}: labels ๋ด๋ถ์ int๊ฐ ์๋ ๊ฐ ์กด์ฌ")
        else:
            print(f"โ Sample {i}: labels ๊ตฌ์กฐ ์ ์")

        # Lengths are gathered from the integer-indexed sample rather than
        # from ``dataset[:num_samples]``: slicing a column-oriented dataset
        # yields a dict of columns, and iterating that walks column names
        # instead of samples.
        input_lens.append(_length_or_none(input_ids))
        label_lens.append(_length_or_none(labels))

    # Length comparison across the inspected samples.
    print("\n๐ input_ids ๊ธธ์ด:", input_lens)
    print("๐ labels ๊ธธ์ด: ", label_lens)
def print_label_lengths(dataset):
    """Print min / max / mean of the per-sample ``labels`` lengths.

    A second line lists the ``labels`` lengths of the first ten samples.
    Each item of ``dataset`` is indexed with the ``"labels"`` key.
    """
    lengths = []
    for sample in dataset:
        lengths.append(len(sample["labels"]))

    shortest = min(lengths)
    longest = max(lengths)
    average = sum(lengths) / len(lengths)
    print(f"[๋๋ฒ๊น ] labels ๊ธธ์ด - min: {shortest}, max: {longest}, mean: {average:.2f}")

    # Example: show the actual lengths of the first 10 samples.
    head = lengths[:10]
    print("[๋๋ฒ๊น ] ์ํ๋ณ labels ๊ธธ์ด (์์ 10๊ฐ):", head)
def print_field_lengths(dataset, n=10, stage=""):
    """
    Show the length distribution of input_ids, attention_mask and labels in
    the dataset, together with the lengths of the first n samples.

    ``stage`` is a free-form tag echoed in the header line.  A field that
    cannot be measured produces a one-line notice instead of an exception.
    """
    print(f"\n[๋๋ฒ๊น ][{stage}] ๊ธธ์ด ํต๊ณ ================================")

    for field in ("input_ids", "attention_mask", "labels"):
        try:
            sizes = []
            for row in dataset:
                sizes.append(len(row[field]))
            low = min(sizes)
            high = max(sizes)
            avg = sum(sizes) / len(sizes)
            print(f"{field} โ min: {low}, max: {high}, mean: {avg:.2f}")
            print(f"{field} ์ํ (์์ {n}๊ฐ):", sizes[:n])
        except Exception as err:
            # Best-effort diagnostics: report the problem and move on to
            # the next field.
            print(f"{field}: (์กด์ฌํ์ง ์๊ฑฐ๋ ์๋ฌ) {err}")

    print("====================================================\n")