| from __future__ import annotations |
|
|
| import sys |
| from pathlib import Path |
|
|
# Resolve the directory two levels above this file and put its ``src``
# subdirectory first on the import path. Presumably this is what lets the
# ``hf_processor_practice`` import below resolve when the script is run
# directly -- confirm against the repository layout.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
SRC_DIR = PROJECT_ROOT / "src"
sys.path.insert(0, str(SRC_DIR))
|
|
| from transformers import AutoTokenizer |
|
|
| from hf_processor_practice.utils import SAVED_PROCESSOR_DIR, ensure_dirs, load_tokenizer_with_fallback, print_title |
|
|
|
|
def main() -> None:
    """Run the AutoTokenizer walkthrough and print each step's result.

    Steps, in order:

    1. Call ``ensure_dirs()`` and print the section title.
    2. Load a tokenizer via ``load_tokenizer_with_fallback()`` and report its
       type and its ``is_fast`` attribute (``None`` when the attribute is
       missing).
    3. Encode two sample sentences with padding and truncation, requesting
       ``"pt"`` tensors, then print the returned keys and each value's shape.
    4. Decode the ``input_ids`` twice: keeping and skipping special tokens.
    5. Save the tokenizer under ``SAVED_PROCESSOR_DIR / "tmp_tok"``, reload it
       with ``AutoTokenizer.from_pretrained``, encode one sentence with the
       reloaded instance, and list the files written to the save directory.
    """
    ensure_dirs()
    print_title("01. AutoTokenizer Practice")

    # --- load -------------------------------------------------------------
    tokenizer = load_tokenizer_with_fallback()
    print("Tokenizer type:", type(tokenizer))
    print("Using fast tokenizer:", getattr(tokenizer, "is_fast", None))

    # --- encode a small batch ---------------------------------------------
    sample_texts = ["hello world", "this is a test"]
    batch = tokenizer(
        sample_texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

    print("\nBatch keys:", [*batch.keys()])
    for key, value in batch.items():
        print(f"{key}: shape={tuple(value.shape)}")

    # --- decode back to text ----------------------------------------------
    input_ids = batch["input_ids"]
    with_special_tokens = tokenizer.batch_decode(input_ids, skip_special_tokens=False)
    without_special_tokens = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
    print("\nDecoded with special tokens:", with_special_tokens)
    print("Decoded clean:", without_special_tokens)

    # --- save / reload round trip -----------------------------------------
    save_dir = SAVED_PROCESSOR_DIR / "tmp_tok"
    tokenizer.save_pretrained(save_dir)
    reloaded_tokenizer = AutoTokenizer.from_pretrained(save_dir)

    reloaded_batch = reloaded_tokenizer(["hello world"], return_tensors="pt")
    reloaded_shape = tuple(reloaded_batch["input_ids"].shape)
    print("\nReloaded tokenizer type:", type(reloaded_tokenizer))
    print("Reloaded vocab size:", reloaded_tokenizer.vocab_size)
    print("Reloaded input_ids shape:", reloaded_shape)

    saved_file_names = sorted(entry.name for entry in save_dir.iterdir())
    print("Saved files:", saved_file_names)
|
|
|
|
# Only run the walkthrough when this file is executed as a script; importing
# the module must not trigger tokenizer loading or file writes.
if __name__ == "__main__":
    main()
|
|