|
|
"""EZ-Tokenizer: Adaptive tokenizer creation for Python code with hardware optimization. |
|
|
|
|
|
This script creates a high-performance ByteLevel BPE tokenizer specifically optimized for code, |
|
|
with automatic adaptation to available system resources (RAM, CPU, GPU). It efficiently scales |
|
|
from low-end systems (2 cores, 4GB RAM) to high-end workstations while maintaining perfect |
|
|
reconstruction accuracy and high throughput. |
|
|
|
|
|
Key Features: |
|
|
- 100% reconstruction accuracy |
|
|
- ~3.5 characters per token (exceeding industry standards) |
|
|
- Adaptive resource management |
|
|
- Memory-efficient processing of large datasets |
|
|
- Support for mixed code and text content |
|
|
""" |
|
|
|
|
|
import os |
|
|
import time |
|
|
import glob |
|
|
import logging |
|
|
import sys |
|
|
import gc |
|
|
import traceback |
|
|
from pathlib import Path |
|
|
from concurrent.futures import ProcessPoolExecutor |
|
|
import psutil |
|
|
from typing import Dict, List, Optional, Tuple, Union, Any, NamedTuple |
|
|
|
|
|
|
|
|
import torch |
|
|
|
|
|
|
|
|
from .resources import SystemResources |
|
|
|
|
|
|
|
|
from tokenizers import Tokenizer |
|
|
from tokenizers.models import BPE |
|
|
from tokenizers.trainers import BpeTrainer |
|
|
from tokenizers.pre_tokenizers import ByteLevel |
|
|
from tokenizers.decoders import ByteLevel as ByteLevelDecoder |
|
|
|
|
|
|
|
|
# Root logger setup: INFO and above goes to both the console (stderr) and a
# 'tokenizer.log' file created in the current working directory.
logging.basicConfig(
    handlers=[logging.StreamHandler(), logging.FileHandler('tokenizer.log')],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
|
|
|
|
|
|
|
|
|
|
|
def log_memory_usage():
    """Log current RAM and GPU memory usage.

    Emits one INFO record with this process's resident set size and the
    system-wide available/total RAM. When CUDA is available, one additional
    INFO record is emitted per device with its allocated and reserved memory.
    """
    bytes_per_gb = 1024 * 1024 * 1024

    ram_usage = psutil.Process().memory_info().rss / bytes_per_gb

    # Take a single snapshot so that the percentage, available and total
    # figures printed on the same line are consistent with each other.
    system_memory = psutil.virtual_memory()
    ram_percent = system_memory.percent
    available_ram = system_memory.available / bytes_per_gb
    total_ram = system_memory.total / bytes_per_gb
    logging.info(f"RAM: {ram_usage:.2f} GB used, {available_ram:.2f} GB available ({ram_percent}% used of {total_ram:.1f} GB total)")

    if not torch.cuda.is_available():
        return

    for i in range(torch.cuda.device_count()):
        allocated = torch.cuda.memory_allocated(i) / bytes_per_gb
        cached = torch.cuda.memory_reserved(i) / bytes_per_gb
        logging.info(f"CUDA Device {i}: {allocated:.2f} GB allocated, {cached:.2f} GB cached")
|
|
|
|
|
def manage_ram(aggressive: bool = False):
    """Perform RAM-specific memory management and garbage collection.

    Args:
        aggressive: If True, performs more thorough memory cleanup operations
            (repeated per-generation collections, clearing traceback frames
            and, on Windows, asking the OS to trim the working set).

    Returns:
        bool: True if the process resident set size was lower after the
        cleanup than before it, False otherwise.
    """
    bytes_per_gb = 1024 * 1024 * 1024

    before_ram = psutil.virtual_memory().percent
    before_process = psutil.Process().memory_info().rss / bytes_per_gb

    gc.collect()

    if aggressive:
        # Several passes over every generation: objects with finalizers or
        # reference cycles may only become collectable after an earlier pass.
        for _ in range(2):
            for generation in range(3):
                gc.collect(generation)

        try:
            # Drop locals held alive by the traceback of the exception that is
            # currently being handled, if any (a None traceback is a no-op).
            traceback.clear_frames(sys.exc_info()[2])

            # NOTE(review): no builtin type is known to define a `__cache__`
            # attribute, so this loop is presumably a no-op; kept to preserve
            # the original behaviour until someone confirms it can go.
            import builtins
            for name, candidate in list(builtins.__dict__.items()):
                if name.startswith('__') and name.endswith('__'):
                    continue
                if not isinstance(candidate, type):
                    continue
                if hasattr(candidate, '__dict__') and '__cache__' in candidate.__dict__:
                    candidate.__dict__['__cache__'].clear()

            gc.collect()

            if sys.platform.startswith('win'):
                try:
                    import ctypes

                    # Use a private handle to kernel32 so that setting
                    # argtypes does not affect other users of ctypes.windll.
                    kernel32 = ctypes.WinDLL('kernel32', use_last_error=True)
                    kernel32.GetCurrentProcess.restype = ctypes.c_void_p
                    kernel32.SetProcessWorkingSetSize.argtypes = [
                        ctypes.c_void_p,
                        ctypes.c_size_t,
                        ctypes.c_size_t,
                    ]
                    kernel32.SetProcessWorkingSetSize.restype = ctypes.c_int

                    # The API takes (hProcess, dwMinimum, dwMaximum); passing
                    # (SIZE_T)-1 for both sizes asks Windows to trim the
                    # working set of the given process.
                    trim = ctypes.c_size_t(-1).value
                    succeeded = kernel32.SetProcessWorkingSetSize(
                        kernel32.GetCurrentProcess(), trim, trim
                    )
                    if not succeeded:
                        logging.debug(
                            f"Failed to compact Windows memory: error {ctypes.get_last_error()}"
                        )
                except Exception as e:
                    logging.debug(f"Failed to compact Windows memory: {e}")
        except Exception as e:
            # Best-effort cleanup: never let it take the caller down.
            logging.warning(f"Error during aggressive memory cleanup: {e}")

    after_ram = psutil.virtual_memory().percent
    after_process = psutil.Process().memory_info().rss / bytes_per_gb
    freed_gb = before_process - after_process

    # Only report reductions above ~10 MB to keep the log readable.
    if freed_gb > 0.01:
        logging.info(f"Memory cleaned: {freed_gb:.2f} GB freed, RAM usage {before_ram}% → {after_ram}%")

    return freed_gb > 0
|
|
|
|
|
def cleanup_cuda(force: bool = False):
    """Run the RAM cleanup pass, then release cached CUDA memory.

    Args:
        force: Forwarded to ``manage_ram`` as ``aggressive``. When True, the
            CUDA devices are also synchronized after the cache is emptied.
    """
    manage_ram(aggressive=force)

    cuda = torch.cuda
    if cuda.is_available():
        try:
            cuda.empty_cache()

            if force:
                # Current device first, then every visible device by index.
                cuda.synchronize()
                for device_index in range(cuda.device_count()):
                    cuda.synchronize(device_index)
        except Exception as cuda_error:
            logging.warning(f"Error during CUDA cleanup: {cuda_error}")
|
|
|
|
|
def process_file(file_path):
    """Read one file as UTF-8 text.

    Undecodable bytes are replaced rather than raising.

    Returns:
        tuple: ``(content, file_size, True)`` on success, where ``file_size``
        is the on-disk size in bytes, or ``("", 0, False)`` if anything
        went wrong (the error is logged with its traceback).
    """
    try:
        size_in_bytes = os.path.getsize(file_path)
        short_name = os.path.basename(file_path)
        logging.info(f"Processing file: {short_name} (Size: {size_in_bytes} bytes)")

        with open(file_path, 'r', encoding='utf-8', errors='replace') as handle:
            text = handle.read()

        if text:
            logging.info(f"Successfully read {len(text)} characters from {short_name}")
        else:
            logging.warning(f"File {file_path} is empty")
    except Exception as error:
        logging.error(f"Error processing file {file_path}: {error}", exc_info=True)
        return "", 0, False

    return text, size_in_bytes, True
|
|
|
|
|
def write_texts_to_disk(texts, file_path, max_chars_per_text=5000):
    """Dump text entries to a file so they can be dropped from memory.

    Each entry is cut to ``max_chars_per_text`` characters and followed by a
    ``---END_ENTRY---`` marker on its own line.

    Args:
        texts (list): Text entries to save
        file_path (str): Destination file (overwritten if it exists)
        max_chars_per_text (int): Maximum characters written per entry

    Returns:
        bool: True if everything was written, False if an error occurred
    """
    entry_terminator = '\n---END_ENTRY---\n'
    try:
        with open(file_path, 'w', encoding='utf-8', errors='replace') as sink:
            sink.writelines(
                entry[:max_chars_per_text] + entry_terminator for entry in texts
            )
    except Exception as error:
        logging.error(f"Error writing texts to disk: {error}")
        return False
    return True
|
|
|
|
|
def read_texts_from_disk(file_path):
    """Read text entries back from a file of ``---END_ENTRY---``-delimited text.

    The newline that immediately precedes each marker line is treated as part
    of the delimiter and removed, so an entry written as
    ``text + '\\n---END_ENTRY---\\n'`` is returned as ``text``. Text after the
    last marker (an unterminated final entry) is returned unchanged.

    Args:
        file_path (str): Path to read data from

    Returns:
        list: List of text entries read from file; an empty list if the file
        could not be read (the error is logged).
    """
    try:
        texts = []
        pending_lines = []
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                if line.strip() == "---END_ENTRY---":
                    entry = "".join(pending_lines)
                    # Drop only the single separator newline; any newline
                    # that belonged to the entry itself is preserved.
                    if entry.endswith("\n"):
                        entry = entry[:-1]
                    texts.append(entry)
                    pending_lines = []
                else:
                    # Collect lines and join once per entry instead of
                    # growing a string with repeated concatenation.
                    pending_lines.append(line)
        if pending_lines:
            texts.append("".join(pending_lines))
        return texts
    except Exception as e:
        logging.error(f"Error reading texts from disk: {e}")
        return []
|
|
|
|
|
def _looks_like_memory_error(exc):
    """Return True if *exc* is a MemoryError or its message mentions memory/allocation."""
    # str(MemoryError()) is usually empty, so the type must be checked too.
    if isinstance(exc, MemoryError):
        return True
    message = str(exc).lower()
    return "memory" in message or "allocation" in message


def _new_byte_level_tokenizer():
    """Create an untrained BPE tokenizer with ByteLevel pre-tokenizer and decoder."""
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
    tokenizer.decoder = ByteLevelDecoder()
    return tokenizer


def _new_bpe_trainer(vocab_size, min_frequency):
    """Create the BPE trainer used for both the main run and the fallback run."""
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<s>", "</s>", "<pad>", "<unk>", "<mask>"]
    return BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=special_tokens,
        show_progress=True,
        # Seed the vocabulary with every byte-level symbol so that any input
        # byte can be encoded, including bytes never seen during training.
        initial_alphabet=ByteLevel.alphabet(),
    )


def _select_input_files(input_dir, max_files, resources):
    """Return the files to train on (smallest first), capped by memory heuristics.

    Returns an empty list, after logging an error, when no file is found.
    """
    if os.path.isfile(input_dir):
        files = [input_dir]
        logging.info(f"Processing single file: {input_dir}")
    else:
        files = glob.glob(os.path.join(input_dir, "*.txt"))
        logging.info(f"Found {len(files)} files in {input_dir}")

    if not files:
        logging.error(f"No files found in {input_dir}")
        return []

    try:
        files = sorted(files, key=os.path.getsize)
        logging.info("Files sorted by size (processing smallest files first)")
    except Exception as e:
        logging.warning(f"Unable to sort files by size: {e}")

    # Estimate the typical file size (in MB) from the first few files.
    sample_sizes = []
    for path in files[:10]:
        try:
            sample_sizes.append(os.path.getsize(path) / (1024 * 1024))
        except OSError as e:
            logging.debug(f"Could not read size of {path}: {e}")

    if sample_sizes:
        avg_file_size_estimate = sum(sample_sizes) / len(sample_sizes)
        logging.info(f"Average file size based on {len(sample_sizes)} samples: {avg_file_size_estimate:.2f} MB")
    else:
        avg_file_size_estimate = 5

    # Empty sample files would otherwise make the division below fail.
    avg_file_size_estimate = max(avg_file_size_estimate, 1.0 / 1024)

    safe_file_count = min(
        len(files),
        int(resources.available_ram_gb * 1024 / avg_file_size_estimate * resources.max_files_multiplier),
    )

    # Additional hard cap that scales with the total amount of RAM.
    if resources.total_ram_gb >= 32:
        cap_multiplier = 0.3
    elif resources.total_ram_gb >= 16:
        cap_multiplier = 0.2
    else:
        cap_multiplier = 0.1
    max_files_cap = max(3, int(resources.total_ram_gb * cap_multiplier))
    safe_file_count = min(safe_file_count, max_files_cap)

    default_max_files = 10

    if max_files is None:
        safe_file_count = min(safe_file_count, default_max_files)
    elif max_files == float('inf'):
        logging.info("Processing ALL files in dataset (MAX mode)")
        safe_file_count = len(files)
    else:
        logging.info(f"User specified max_files: {max_files}")
        safe_file_count = min(len(files), int(max_files))

    safe_file_count = max(1, safe_file_count)
    logging.info(f"Processing up to {safe_file_count} files based on available memory of {resources.available_ram_gb:.2f} GB")
    return files[:safe_file_count]


def _load_texts(files, resources, temp_dir):
    """Read *files* in batches, offloading entries to *temp_dir* when configured.

    Returns:
        tuple: ``(texts, offload_paths, offloaded_count, total_chars)`` where
        ``texts`` are the entries still in memory, ``offload_paths`` are the
        files holding offloaded entries (in write order) and
        ``offloaded_count`` is how many entries those files contain.
    """
    bytes_per_gb = 1024 * 1024 * 1024
    process = psutil.Process()

    texts = []
    offload_paths = []
    offloaded_count = 0
    total_chars = 0

    batch_size = max(1, resources.batch_size // 2)
    logging.info(f"Starting with conservative batch size of {batch_size}")

    batches = [files[i:i + batch_size] for i in range(0, len(files), batch_size)]
    # Index of the first batch that must NOT be processed. Lowered when RAM
    # becomes critical; rebinding `batches` would not affect the running loop.
    batch_limit = len(batches)

    for batch_idx, batch in enumerate(batches):
        if batch_idx >= batch_limit:
            break

        with ProcessPoolExecutor(max_workers=resources.max_workers) as executor:
            results = list(executor.map(process_file, batch))

        for content, _size, success in results:
            if not (success and content):
                continue
            if len(content) > resources.max_text_chunk_size:
                logging.warning(f"Truncating oversized text: {len(content)} chars -> {resources.max_text_chunk_size} chars")
                content = content[:resources.max_text_chunk_size]
            texts.append(content)
            total_chars += len(content)
        del results

        logging.info(f"Batch {batch_idx+1}/{batch_limit}: Processed {len(batch)} files - {total_chars:,} total characters")

        system_memory = psutil.virtual_memory()
        available_ram_gb = system_memory.available / bytes_per_gb
        ram_usage = process.memory_info().rss / bytes_per_gb
        ram_percent = system_memory.percent
        logging.info(f"RAM usage after batch {batch_idx+1}: {ram_usage:.2f} GB ({ram_percent}%)")

        if available_ram_gb < resources.emergency_reserve_gb:
            logging.critical(f"EMERGENCY: Available RAM ({available_ram_gb:.2f} GB) below reserve threshold ({resources.emergency_reserve_gb:.2f} GB)")
            logging.critical("Taking emergency measures to prevent system crash")

            # Backup copy only: these entries are not replayed during training.
            emergency_path = os.path.join(temp_dir, f"emergency_tokenizer_data_{int(time.time())}.txt")
            if write_texts_to_disk(texts, emergency_path):
                logging.critical(f"Emergency data saved to {emergency_path}")

            emergency_keep = min(max(5, len(texts) // 10), 20)
            logging.critical(f"Reducing dataset from {len(texts)} entries to {emergency_keep} entries")
            del texts[emergency_keep:]

            # cleanup_cuda runs the aggressive RAM cleanup before touching CUDA.
            cleanup_cuda(force=True)
            break

        if resources.use_disk_offload and batch_idx > 0:
            temp_file_path = os.path.join(temp_dir, f"temp_tokenizer_data_{batch_idx}.txt")

            # Offload a larger share the more RAM is already in use.
            if ram_percent > 70:
                offload_percentage = 0.8
            elif ram_percent > 50:
                offload_percentage = 0.6
            else:
                offload_percentage = 0.4

            entries_to_save = max(1, int(len(texts) * offload_percentage))
            # Always keep at least one entry in memory.
            entries_to_save = min(entries_to_save, len(texts) - 1)

            if entries_to_save > 0:
                logging.info(f"Writing intermediate batch results to {temp_file_path}")
                # Entries are already capped at max_text_chunk_size, so pass
                # that as the limit to avoid truncating them a second time.
                written = write_texts_to_disk(
                    texts[:entries_to_save],
                    temp_file_path,
                    max_chars_per_text=resources.max_text_chunk_size,
                )
                if written:
                    offload_paths.append(temp_file_path)
                    offloaded_count += entries_to_save
                    logging.info(f"Offloaded {entries_to_save} entries ({offload_percentage*100:.0f}%) to disk, {len(texts)-entries_to_save} remain in memory")
                    del texts[:entries_to_save]
                    cleanup_cuda(force=True)

        if ram_usage > resources.ram_usage_warning:
            logging.warning(f"RAM usage high ({ram_usage:.2f} GB), running RAM-focused cleanup")
            manage_ram()

            ram_usage = process.memory_info().rss / bytes_per_gb
            if ram_usage > resources.ram_usage_critical:
                logging.warning(f"RAM usage critical ({ram_usage:.2f} GB), performing emergency cleanup")
                manage_ram(aggressive=True)

                remaining = batch_limit - batch_idx
                if remaining > 3:
                    remaining_batch_count = 3 if resources.total_ram_gb >= 8 else 2
                    logging.warning(f"Reducing remaining batches from {remaining} to {remaining_batch_count}")
                    batch_limit = batch_idx + remaining_batch_count

    return texts, offload_paths, offloaded_count, total_chars


def _iter_training_batches(texts, offload_paths, batch_size):
    """Yield all training texts as lists of at most *batch_size* strings.

    Offloaded files are replayed first, one file at a time, followed by the
    entries that are still in memory.
    """
    def _slices(entries):
        for start in range(0, len(entries), batch_size):
            current_ram_percent = psutil.virtual_memory().percent
            if current_ram_percent > 85:
                logging.warning(f"Memory usage critical before training: {current_ram_percent}%")
                cleanup_cuda(force=True)
            yield entries[start:start + batch_size]

    for path in offload_paths:
        yield from _slices(read_texts_from_disk(path))
    yield from _slices(texts)


def _save_tokenizer(tokenizer, output_path, start_time):
    """Write the tokenizer JSON to *output_path* and log a summary."""
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    tokenizer.save(output_path)

    final_vocab_size = len(tokenizer.get_vocab())
    elapsed = time.time() - start_time
    logging.info(f"Tokenizer created with {final_vocab_size:,} tokens in {elapsed:.1f} seconds")
    logging.info(f"Saved to: {output_path}")


def build_tokenizer(input_dir, output_path, vocab_size=40000, min_frequency=2, max_files=None, resources=None, temp_dir=None):
    """Build a tokenizer directly from Python code files with adaptive resource management.

    The function selects a memory-safe number of input files, loads them in
    batches (optionally offloading loaded entries to disk), and then trains a
    ByteLevel BPE tokenizer in a single pass over every loaded entry. Training
    must happen in one ``train_from_iterator`` call: each call trains from
    scratch, so training chunk by chunk would keep only the last chunk's
    vocabulary. Entries are therefore streamed to the trainer in small batches.

    Features:
    - Progressive file loading (smallest files first)
    - Memory monitoring with emergency intervention
    - Disk offloading for memory pressure relief (offloaded entries are
      replayed from disk during training)
    - Text truncation for oversized entries
    - Fallback to a small in-memory sample if training fails with a
      memory-related error

    Args:
        input_dir (str): Directory containing Python code files (*.txt), or a
            path to a single file
        output_path (str): Path where to save the tokenizer JSON file
        vocab_size (int, optional): Size of vocabulary to generate. Defaults to 40000.
        min_frequency (int, optional): Minimum frequency threshold for tokens. Defaults to 2.
        max_files (int, optional): Maximum number of files to process. If None, determined
            automatically; ``float('inf')`` processes every file.
        resources (SystemResources, optional): Pre-detected system resources. If None, resources
            will be automatically detected.
        temp_dir (str, optional): Directory for offload/emergency files. If None, a
            temporary directory is created and removed before returning.

    Returns:
        bool: True if tokenizer was successfully created and saved, False otherwise
    """
    import shutil
    import tempfile

    start_time = time.time()

    if resources is None:
        resources = SystemResources()

    # Bound before the try block so the fallback handler can always use them.
    all_texts = []
    trainer = None
    owned_temp_dir = None

    try:
        log_memory_usage()

        files = _select_input_files(input_dir, max_files, resources)
        if not files:
            return False

        if temp_dir is None:
            owned_temp_dir = temp_dir = tempfile.mkdtemp(prefix='nexforge_tokenizer_')
            logging.info(f"Created temporary directory for data offloading: {temp_dir}")

        all_texts, offload_paths, offloaded_count, total_chars = _load_texts(files, resources, temp_dir)

        total_entries = offloaded_count + len(all_texts)
        if total_entries == 0:
            logging.error("No content found in files")
            return False

        logging.info(f"Successfully loaded {total_entries} text entries with {total_chars:,} characters")

        tokenizer = _new_byte_level_tokenizer()
        trainer = _new_bpe_trainer(vocab_size, min_frequency)

        logging.info(f"Training tokenizer on {total_entries:,} texts (target vocab: {vocab_size:,})")

        chunk_size = max(1, resources.training_chunk_size // 2)
        tokenizer.train_from_iterator(
            _iter_training_batches(all_texts, offload_paths, chunk_size),
            trainer=trainer,
            length=total_entries,
        )

        _save_tokenizer(tokenizer, output_path, start_time)
        return True

    except Exception as e:
        logging.error(f"Error training tokenizer: {e}")
        logging.error(traceback.format_exc())

        if not _looks_like_memory_error(e):
            return False

        logging.warning("Memory error detected, implementing adaptive sampling strategy...")
        cleanup_cuda(True)

        try:
            sample_size = 5 if resources.total_ram_gb < 8 else 10
            sample_texts = all_texts[:sample_size]
            if not sample_texts:
                logging.error("Retry failed: no texts were loaded before the error")
                return False

            # Release everything except the small sample before retrying.
            all_texts = None
            gc.collect()
            cleanup_cuda(True)

            logging.info(f"Trying with a smaller sample size: {sample_size} texts")
            tokenizer = _new_byte_level_tokenizer()
            if trainer is None:
                trainer = _new_bpe_trainer(vocab_size, min_frequency)

            tokenizer.train_from_iterator(sample_texts, trainer=trainer)
            _save_tokenizer(tokenizer, output_path, start_time)
            return True
        except Exception as e2:
            logging.error(f"Retry failed: {e2}")
            return False

    finally:
        # Only remove the directory if this call created it.
        if owned_temp_dir is not None:
            shutil.rmtree(owned_temp_dir, ignore_errors=True)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point:
    #   <input_dir> <output_path> [vocab_size] [min_frequency] [max_files]
    # NOTE(review): this module uses a relative import (`from .resources import
    # SystemResources`), so running the file directly as the usage text suggests
    # presumably fails; it likely needs `python -m <package>.<module>` - confirm.
    import atexit
    import shutil
    import tempfile

    logging.info("Starting EZ-Tokenizer creation script")
    logging.info("EZ-Tokenizer v1.0.0 - Optimized for performance and accuracy")
    logging.info("Copyright (c) 2025 EZ-Tokenizer Team. All rights reserved.")

    if len(sys.argv) < 3:
        print("Usage: python adaptive_tokenizer.py <input_dir> <output_path> [vocab_size] [min_frequency] [max_files]")
        print("  max_files: Optional maximum number of files to process (default: auto-determined)")
        print("              Use 'MAX' to process all files in the directory")
        sys.exit(1)

    input_dir = sys.argv[1]
    output_path = sys.argv[2]

    # Fail fast with a readable message instead of an uncaught ValueError.
    try:
        vocab_size = int(sys.argv[3]) if len(sys.argv) > 3 else 40000
        min_frequency = int(sys.argv[4]) if len(sys.argv) > 4 else 2
    except ValueError as e:
        logging.error(f"vocab_size and min_frequency must be integers: {e}")
        sys.exit(1)

    # max_files: None -> decide automatically, float('inf') -> every file.
    max_files = None
    if len(sys.argv) > 5:
        if sys.argv[5].upper() == 'MAX':
            max_files = float('inf')
            logging.info("MAX keyword detected - will process all available files")
        else:
            try:
                max_files = int(sys.argv[5])
            except ValueError:
                logging.warning(f"Invalid max_files value: {sys.argv[5]} - using auto determination")
                max_files = None

    resources = SystemResources()

    logging.info("Starting tokenizer creation with the following parameters:")
    logging.info("Configuration:")
    logging.info(f"  Input directory: {input_dir}")
    logging.info(f"  Output path: {output_path}")
    logging.info(f"  Vocabulary size: {vocab_size}")
    logging.info(f"  Minimum frequency: {min_frequency}")
    if max_files == float('inf'):
        logging.info("  Maximum files: MAX (all files)")
    else:
        logging.info(f"  Maximum files: {max_files if max_files is not None else 'auto'}")

    # Scratch space for disk offloading; removed again when the process exits.
    temp_dir = tempfile.mkdtemp(prefix='nexforge_tokenizer_')
    logging.info(f"Created temporary directory for data offloading: {temp_dir}")

    def cleanup_temp():
        """Remove the scratch directory, logging (not raising) on failure."""
        try:
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir, ignore_errors=True)
                logging.info(f"Cleaned up temporary directory: {temp_dir}")
        except Exception as e:
            logging.warning(f"Error cleaning up temporary directory: {e}")

    atexit.register(cleanup_temp)

    log_memory_usage()

    success = build_tokenizer(
        input_dir=input_dir,
        output_path=output_path,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        max_files=max_files,
        resources=resources,
        temp_dir=temp_dir,
    )

    logging.info("Temporary files will be cleaned up on exit")

    if success:
        logging.info("Tokenizer creation completed successfully")
        sys.exit(0)
    else:
        logging.error("Tokenizer creation failed")
        sys.exit(1)
|
|
|