EZ-Tokenizer / src /nexforgetokenizer /adaptive_tokenizer.py

Upload 38 files

4265aea verified 9 months ago

31.8 kB

	"""EZ-Tokenizer: Adaptive tokenizer creation for Python code with hardware optimization.

	This script creates a high-performance ByteLevel BPE tokenizer specifically optimized for code,
	with automatic adaptation to available system resources (RAM, CPU, GPU). It efficiently scales
	from low-end systems (2 cores, 4GB RAM) to high-end workstations while maintaining perfect
	reconstruction accuracy and high throughput.

	Key Features:
	- 100% reconstruction accuracy
	- ~3.5 characters per token (exceeding industry standards)
	- Adaptive resource management
	- Memory-efficient processing of large datasets
	- Support for mixed code and text content
	"""

	import os
	import time
	import glob
	import logging
	import sys
	import gc
	import traceback
	from pathlib import Path
	from concurrent.futures import ProcessPoolExecutor
	import psutil
	from typing import Dict, List, Optional, Tuple, Union, Any, NamedTuple

	# Try to use CUDA if available
	import torch

	# Local imports
	from .resources import SystemResources

	# Third-party tokenizer dependencies
	from tokenizers import Tokenizer
	from tokenizers.models import BPE
	from tokenizers.trainers import BpeTrainer
	from tokenizers.pre_tokenizers import ByteLevel
	from tokenizers.decoders import ByteLevel as ByteLevelDecoder

	# Configure root logging once at import time: every record is written both
	# to the console stream and to the file 'tokenizer.log'.
	logging.basicConfig(
	    level=logging.INFO,
	    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
	    handlers=[logging.StreamHandler(), logging.FileHandler('tokenizer.log')],
	)

	# SystemResources class moved to resources.py to fix circular import warning

	def log_memory_usage():
	    """Log current RAM and GPU memory usage.

	    Emits one INFO record with this process's resident set size together with
	    the system-wide available/total RAM, then one INFO record per CUDA device
	    (allocated and reserved memory) when CUDA is available.
	    """
	    bytes_per_gb = 1024 * 1024 * 1024

	    # Take a single snapshot so that the percentage, available and total
	    # figures printed on the same line all describe the same instant.
	    vm = psutil.virtual_memory()
	    ram_usage = psutil.Process().memory_info().rss / bytes_per_gb
	    available_ram = vm.available / bytes_per_gb
	    total_ram = vm.total / bytes_per_gb
	    logging.info(f"RAM: {ram_usage:.2f} GB used, {available_ram:.2f} GB available ({vm.percent}% used of {total_ram:.1f} GB total)")

	    if not torch.cuda.is_available():
	        return

	    for i in range(torch.cuda.device_count()):
	        allocated = torch.cuda.memory_allocated(i) / bytes_per_gb
	        cached = torch.cuda.memory_reserved(i) / bytes_per_gb
	        logging.info(f"CUDA Device {i}: {allocated:.2f} GB allocated, {cached:.2f} GB cached")

	def manage_ram(aggressive: bool = False):
	    """Perform RAM-specific memory management and garbage collection.

	    Args:
	        aggressive: If True, additionally runs repeated per-generation
	            collections, clears the frames of the exception currently being
	            handled (if any) and, on Windows, asks the OS to trim this
	            process's working set.

	    Returns:
	        bool: True if the process's resident set size shrank during the call.
	    """
	    bytes_per_gb = 1024 * 1024 * 1024
	    process = psutil.Process()

	    # Record memory before cleanup
	    before_ram = psutil.virtual_memory().percent
	    before_process = process.memory_info().rss / bytes_per_gb

	    # Run standard garbage collection first
	    gc.collect()

	    if aggressive:
	        # Multiple passes over every generation (0, 1, 2)
	        for _ in range(2):
	            for generation in range(3):
	                gc.collect(generation)

	        try:
	            # Traceback frames keep their local variables alive; drop them.
	            # sys.exc_info()[2] is None outside an except block, in which
	            # case clear_frames() has nothing to walk.
	            traceback.clear_frames(sys.exc_info()[2])
	            gc.collect()

	            if sys.platform.startswith('win'):
	                try:
	                    import ctypes

	                    # SetProcessWorkingSetSize takes (hProcess, min, max).
	                    # Passing (SIZE_T)-1 for both sizes asks Windows to trim
	                    # the working set. A private WinDLL instance is used so
	                    # the argtypes set here do not leak into other users of
	                    # ctypes.windll.kernel32.
	                    kernel32 = ctypes.WinDLL('kernel32', use_last_error=True)
	                    kernel32.GetCurrentProcess.restype = ctypes.c_void_p
	                    kernel32.SetProcessWorkingSetSize.argtypes = [
	                        ctypes.c_void_p,
	                        ctypes.c_size_t,
	                        ctypes.c_size_t,
	                    ]
	                    kernel32.SetProcessWorkingSetSize.restype = ctypes.c_int
	                    trim_request = ctypes.c_size_t(-1).value
	                    trimmed = kernel32.SetProcessWorkingSetSize(
	                        kernel32.GetCurrentProcess(), trim_request, trim_request
	                    )
	                    if not trimmed:
	                        logging.debug(f"Failed to compact Windows memory: error code {ctypes.get_last_error()}")
	                except Exception as e:
	                    logging.debug(f"Failed to compact Windows memory: {e}")
	        except Exception as e:
	            # Cleanup is best-effort; never let it abort the caller
	            logging.warning(f"Error during aggressive memory cleanup: {e}")

	    # Calculate and log memory freed
	    after_ram = psutil.virtual_memory().percent
	    after_process = process.memory_info().rss / bytes_per_gb
	    freed_gb = before_process - after_process

	    if freed_gb > 0.01:  # Only report a noticeable amount
	        logging.info(f"Memory cleaned: {freed_gb:.2f} GB freed, RAM usage {before_ram}% → {after_ram}%")

	    return freed_gb > 0

	def cleanup_cuda(force: bool = False):
	    """Run the RAM cleanup, then release cached CUDA memory if CUDA is present.

	    Args:
	        force: Passed to manage_ram() as ``aggressive``. When True, the
	            current device and then every visible device are synchronized
	            after the CUDA cache has been emptied.
	    """
	    # RAM is always handled first, whether or not a GPU exists
	    manage_ram(aggressive=force)

	    if torch.cuda.is_available():
	        try:
	            torch.cuda.empty_cache()

	            if force:
	                torch.cuda.synchronize()
	                device_total = torch.cuda.device_count()
	                for device_index in range(device_total):
	                    torch.cuda.synchronize(device_index)
	        except Exception as err:
	            # Best-effort: a failing CUDA call must not abort the caller
	            logging.warning(f"Error during CUDA cleanup: {err}")

	def process_file(file_path):
	    """Read one file and return its text together with its on-disk size.

	    Args:
	        file_path: Path of the file to read. It is decoded as UTF-8 with
	            undecodable bytes replaced.

	    Returns:
	        tuple: ``(content, file_size, True)`` on success, or ``("", 0, False)``
	        if any exception was raised (the error is logged with its traceback).
	    """
	    try:
	        size_in_bytes = os.path.getsize(file_path)
	        short_name = os.path.basename(file_path)
	        logging.info(f"Processing file: {short_name} (Size: {size_in_bytes} bytes)")

	        with open(file_path, 'r', encoding='utf-8', errors='replace') as handle:
	            text = handle.read()

	        # An empty file is still a successful read; it is only flagged in the log
	        if text:
	            logging.info(f"Successfully read {len(text)} characters from {short_name}")
	        else:
	            logging.warning(f"File {file_path} is empty")

	        return text, size_in_bytes, True
	    except Exception as err:
	        logging.error(f"Error processing file {file_path}: {err}", exc_info=True)
	        return "", 0, False

	def write_texts_to_disk(texts, file_path, max_chars_per_text=5000):
	    """Write text entries to a file so they can be dropped from memory.

	    Each entry is cut to ``max_chars_per_text`` characters and followed by a
	    newline and a line containing ``---END_ENTRY---``.

	    Args:
	        texts (list): List of text entries to save
	        file_path (str): Path to save the data
	        max_chars_per_text (int): Maximum characters to save per text entry

	    Returns:
	        bool: True if successful, False otherwise (the error is logged)
	    """
	    entry_terminator = '\n---END_ENTRY---\n'
	    try:
	        with open(file_path, 'w', encoding='utf-8', errors='replace') as sink:
	            # One truncated entry plus its terminator per written item
	            sink.writelines(
	                entry[:max_chars_per_text] + entry_terminator for entry in texts
	            )
	    except Exception as err:
	        logging.error(f"Error writing texts to disk: {err}")
	        return False
	    return True

	def read_texts_from_disk(file_path):
	    """Read text entries from a file in the ``---END_ENTRY---`` format.

	    An entry ends at a line that, once stripped, equals ``---END_ENTRY---``.
	    The single newline that precedes each marker line is a separator, not
	    part of the entry, so it is removed; an entry therefore comes back
	    without an extra trailing newline. Text after the last marker is returned
	    unchanged as a final entry.

	    Limitation: an entry that itself contains a line equal to the marker is
	    split at that line.

	    Args:
	        file_path (str): Path to read data from

	    Returns:
	        list: List of text entries read from file, or an empty list if the
	        file could not be read (the error is logged)
	    """
	    end_marker = "---END_ENTRY---"
	    try:
	        texts = []
	        pending_lines = []
	        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
	            for line in f:
	                if line.strip() == end_marker:
	                    entry = "".join(pending_lines)
	                    # Drop exactly one newline: the separator before the marker
	                    if entry.endswith('\n'):
	                        entry = entry[:-1]
	                    texts.append(entry)
	                    pending_lines = []
	                else:
	                    pending_lines.append(line)
	        if pending_lines:  # Add the last entry if file doesn't end with marker
	            texts.append("".join(pending_lines))
	        return texts
	    except Exception as e:
	        logging.error(f"Error reading texts from disk: {e}")
	        return []

	def build_tokenizer(input_dir, output_path, vocab_size=40000, min_frequency=2, max_files=None, resources=None, temp_dir=None):
	    """Build a tokenizer directly from Python code files with adaptive resource management.

	    The input files are read in batches (smallest first) while RAM is
	    monitored. When ``resources.use_disk_offload`` is set, part of the loaded
	    text is moved to files in ``temp_dir`` after each batch and streamed back
	    in during training. The BPE model is trained in a single pass over every
	    retained text and saved as JSON.

	    Features:
	    - Progressive file loading (smallest files first)
	    - Memory monitoring with emergency intervention
	    - Disk offloading for memory pressure relief
	    - Retry on a small in-memory sample if training fails with a memory error
	    - Text truncation for oversized entries

	    Args:
	        input_dir (str): Directory containing the training files (*.txt), or
	            the path of a single file.
	        output_path (str): Path where to save the tokenizer JSON file
	        vocab_size (int, optional): Size of vocabulary to generate. Defaults to 40000.
	        min_frequency (int, optional): Minimum frequency threshold for tokens. Defaults to 2.
	        max_files (int, optional): Maximum number of files to process.
	            ``float('inf')`` processes every file. If None, determined automatically.
	        resources (SystemResources, optional): Pre-detected system resources. If None,
	            resources will be automatically detected.
	        temp_dir (str, optional): Existing directory for offloaded and
	            emergency data. If None, a temporary directory is created for
	            the duration of the call and removed before returning.

	    Returns:
	        bool: True if tokenizer was successfully created and saved, False otherwise
	    """
	    import shutil
	    import tempfile

	    start_time = time.time()

	    # Detect system resources if not provided
	    if resources is None:
	        resources = SystemResources()

	    # Bound before the try block so the recovery path can always refer to them
	    all_texts = []
	    offload_paths = []
	    created_temp_dir = None

	    try:
	        log_memory_usage()  # Initial memory benchmark

	        files = _select_files(input_dir, max_files, resources)
	        if not files:
	            return False

	        if temp_dir is None:
	            # The offload and emergency paths need a real directory
	            created_temp_dir = temp_dir = tempfile.mkdtemp(prefix='nexforge_tokenizer_')
	            logging.info(f"Created temporary directory for data offloading: {temp_dir}")

	        total_chars, offloaded_entries = _load_texts(files, resources, temp_dir, all_texts, offload_paths)

	        if not all_texts and not offload_paths:
	            logging.error("No content found in files")
	            return False

	        total_entries = offloaded_entries + len(all_texts)
	        logging.info(f"Successfully loaded {total_entries} text entries with {total_chars:,} characters")

	        tokenizer = _new_tokenizer()
	        trainer = _new_trainer(vocab_size, min_frequency)

	        logging.info(f"Training tokenizer on {total_entries:,} texts (target vocab: {vocab_size:,})")

	        # Train ONCE over all data. Every train_from_iterator() call rebuilds
	        # the model from scratch, so calling it per chunk would leave a
	        # tokenizer that only reflects the last chunk. Memory is bounded by
	        # feeding the trainer small batches from a generator instead.
	        training_batch_size = max(1, resources.training_chunk_size // 2)
	        tokenizer.train_from_iterator(
	            _iter_training_batches(offload_paths, all_texts, training_batch_size),
	            trainer=trainer,
	            length=total_entries,  # only used for progress reporting
	        )

	        _save_tokenizer(tokenizer, output_path, start_time)
	        return True

	    except Exception as e:
	        logging.error(f"Error training tokenizer: {e}")
	        logging.error(traceback.format_exc())

	        if not _is_memory_error(e):
	            return False

	        # Adaptive retry strategy for memory errors
	        logging.warning("Memory error detected, implementing adaptive sampling strategy...")

	        # Clear as much memory as possible
	        cleanup_cuda(True)

	        try:
	            # For very low memory systems, use even smaller sample
	            sample_size = 5 if resources.total_ram_gb < 8 else 10
	            sample_texts = all_texts[:sample_size]
	            all_texts.clear()
	            gc.collect()
	            cleanup_cuda(True)

	            if not sample_texts:
	                logging.error("Retry failed: no texts were loaded before the memory error")
	                return False

	            logging.info(f"Trying with a smaller sample size: {sample_size} texts")
	            tokenizer = _new_tokenizer()
	            tokenizer.train_from_iterator(sample_texts, trainer=_new_trainer(vocab_size, min_frequency))
	            _save_tokenizer(tokenizer, output_path, start_time)
	            return True
	        except Exception as e2:
	            logging.error(f"Retry failed: {e2}")
	            return False

	    finally:
	        # Only remove a directory this call created; a caller-supplied
	        # temp_dir is the caller's to clean up.
	        if created_temp_dir is not None:
	            shutil.rmtree(created_temp_dir, ignore_errors=True)


	def _is_memory_error(exc):
	    """Return True if ``exc`` is a MemoryError or its message mentions memory/allocation."""
	    # str(MemoryError()) is usually empty, so the type has to be checked too
	    if isinstance(exc, MemoryError):
	        return True
	    message = str(exc).lower()
	    return "memory" in message or "allocation" in message


	def _new_tokenizer():
	    """Create an untrained ByteLevel BPE tokenizer with its matching decoder."""
	    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
	    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
	    tokenizer.decoder = ByteLevelDecoder()
	    return tokenizer


	def _new_trainer(vocab_size, min_frequency):
	    """Create the BPE trainer used for both the main run and the recovery run."""
	    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<s>", "</s>", "<pad>", "<unk>", "<mask>"]
	    return BpeTrainer(
	        vocab_size=vocab_size,
	        min_frequency=min_frequency,
	        special_tokens=special_tokens,
	        show_progress=True,
	        # Seed the vocabulary with the full byte-level alphabet so that bytes
	        # absent from the training data can still be encoded and decoded
	        # instead of becoming [UNK].
	        initial_alphabet=ByteLevel.alphabet(),
	        # NOTE(review): BpeTrainer has no option for seeding whole tokens
	        # (e.g. Python keywords); frequent keywords are expected to be
	        # learned as merges from the training data.
	    )


	def _save_tokenizer(tokenizer, output_path, start_time):
	    """Create the output directory if needed, save the tokenizer and log a summary."""
	    output_dir = os.path.dirname(output_path) or '.'
	    os.makedirs(output_dir, exist_ok=True)

	    tokenizer.save(output_path)

	    final_vocab_size = len(tokenizer.get_vocab())
	    elapsed = time.time() - start_time
	    logging.info(f"Tokenizer created with {final_vocab_size:,} tokens in {elapsed:.1f} seconds")
	    logging.info(f"Saved to: {output_path}")


	def _select_files(input_dir, max_files, resources):
	    """Return the input files to process (smallest first), or [] if none were found."""
	    if os.path.isfile(input_dir):
	        # If input is a single file, use it directly
	        files = [input_dir]
	        logging.info(f"Processing single file: {input_dir}")
	    else:
	        # If input is a directory, get all .txt files
	        files = glob.glob(os.path.join(input_dir, "*.txt"))
	        logging.info(f"Found {len(files)} files in {input_dir}")

	    if not files:
	        logging.error(f"No files found in {input_dir}")
	        return []

	    # Sort files by size (smallest first) to allow progressive loading
	    try:
	        files = sorted(files, key=os.path.getsize)
	        logging.info("Files sorted by size (processing smallest files first)")
	    except Exception as e:
	        logging.warning(f"Unable to sort files by size: {e}")

	    # Estimate the average file size (MB) from the first few files
	    sample_sizes = []
	    for path in files[:10]:
	        try:
	            sample_sizes.append(os.path.getsize(path) / (1024 * 1024))
	        except OSError as e:
	            # An unreadable file is simply left out of the estimate
	            logging.debug(f"Skipping {path} in size estimate: {e}")

	    avg_file_size_estimate = 5  # Default fallback value in MB
	    if sample_sizes:
	        avg_file_size_estimate = sum(sample_sizes) / len(sample_sizes)
	        logging.info(f"Average file size based on {len(sample_sizes)} samples: {avg_file_size_estimate:.2f} MB")

	    # Calculate safe file count based on resources. A zero average (all
	    # sampled files empty) means size imposes no limit; it must not divide.
	    if avg_file_size_estimate > 0:
	        ram_budget_count = int(resources.available_ram_gb * 1024 / avg_file_size_estimate * resources.max_files_multiplier)
	    else:
	        ram_budget_count = len(files)
	    safe_file_count = min(len(files), ram_budget_count)

	    # Conservative cap that scales with total RAM
	    if resources.total_ram_gb >= 32:
	        max_files_multiplier = 0.3
	    elif resources.total_ram_gb >= 16:
	        max_files_multiplier = 0.2
	    else:
	        max_files_multiplier = 0.1

	    max_files_cap = max(3, int(resources.total_ram_gb * max_files_multiplier))
	    safe_file_count = min(safe_file_count, max_files_cap)

	    # Hard limit applied only when the caller did not specify max_files
	    default_max_files = 10

	    if max_files is None:
	        safe_file_count = min(safe_file_count, default_max_files)
	    elif max_files == float('inf'):
	        logging.info("Processing ALL files in dataset (MAX mode)")
	        safe_file_count = len(files)
	    else:
	        logging.info(f"User specified max_files: {max_files}")
	        safe_file_count = min(len(files), max_files)

	    # Ensure we process at least one file
	    safe_file_count = max(1, safe_file_count)

	    logging.info(f"Processing up to {safe_file_count} files based on available memory of {resources.available_ram_gb:.2f} GB")
	    return files[:safe_file_count]


	def _load_texts(files, resources, temp_dir, all_texts, offload_paths):
	    """Read ``files`` in batches into ``all_texts``, offloading to disk under memory pressure.

	    ``all_texts`` and ``offload_paths`` are filled in place so that the
	    caller still holds whatever was loaded if this function raises.

	    Returns:
	        tuple: ``(total_chars, offloaded_entries)`` - characters read in
	        total, and the number of entries currently stored in the files
	        listed in ``offload_paths``.
	    """
	    bytes_per_gb = 1024 * 1024 * 1024
	    process = psutil.Process()
	    total_chars = 0
	    offloaded_entries = 0

	    # Use smaller batches for initial processing to gauge memory impact
	    batch_size = max(1, resources.batch_size // 2)
	    logging.info(f"Starting with conservative batch size of {batch_size}")
	    batches = [files[i:i + batch_size] for i in range(0, len(files), batch_size)]

	    # Lowered when RAM becomes critical. Rebinding `batches` inside the loop
	    # would not affect the iteration, so an explicit limit is checked instead.
	    batch_limit = len(batches)

	    for batch_idx, batch in enumerate(batches):
	        if batch_idx >= batch_limit:
	            break

	        with ProcessPoolExecutor(max_workers=resources.max_workers) as executor:
	            results = list(executor.map(process_file, batch))

	        for content, _size, success in results:
	            if not (success and content):
	                continue
	            # MEMORY PROTECTION: limit the size of any individual text entry
	            if len(content) > resources.max_text_chunk_size:
	                logging.warning(f"Truncating oversized text: {len(content)} chars -> {resources.max_text_chunk_size} chars")
	                content = content[:resources.max_text_chunk_size]
	            all_texts.append(content)
	            total_chars += len(content)
	        del results

	        logging.info(f"Batch {batch_idx+1}/{batch_limit}: Processed {len(batch)} files - {total_chars:,} total characters")

	        available_ram_gb = psutil.virtual_memory().available / bytes_per_gb
	        ram_usage = process.memory_info().rss / bytes_per_gb
	        ram_percent = psutil.virtual_memory().percent
	        logging.info(f"RAM usage after batch {batch_idx+1}: {ram_usage:.2f} GB ({ram_percent}%)")

	        # Emergency intervention if available RAM drops below the reserve
	        if available_ram_gb < resources.emergency_reserve_gb:
	            logging.critical(f"EMERGENCY: Available RAM ({available_ram_gb:.2f} GB) below reserve threshold ({resources.emergency_reserve_gb:.2f} GB)")
	            logging.critical("Taking emergency measures to prevent system crash")

	            emergency_path = os.path.join(temp_dir, f"emergency_tokenizer_data_{int(time.time())}.txt")
	            write_texts_to_disk(all_texts, emergency_path)
	            logging.critical(f"Emergency data saved to {emergency_path}")

	            # Keep 10% of the entries, but at least 5 and at most 20
	            emergency_keep = min(max(5, len(all_texts) // 10), 20)
	            logging.critical(f"Reducing dataset from {len(all_texts)} entries to {emergency_keep} entries")
	            del all_texts[emergency_keep:]

	            # Emergency mode trains on the small in-memory sample only, so
	            # previously offloaded data is not streamed back in.
	            offload_paths.clear()
	            offloaded_entries = 0

	            # cleanup_cuda() runs the aggressive RAM cleanup as its first step
	            cleanup_cuda(force=True)

	            # Stop processing more files
	            break

	        # Write intermediate results to disk to reduce memory pressure
	        if resources.use_disk_offload and batch_idx > 0:
	            temp_file_path = os.path.join(temp_dir, f"temp_tokenizer_data_{batch_idx}.txt")
	            logging.info(f"Writing intermediate batch results to {temp_file_path}")

	            # More aggressive offloading at higher memory pressure
	            current_ram_percent = psutil.virtual_memory().percent
	            if current_ram_percent > 70:
	                offload_percentage = 0.8
	            elif current_ram_percent > 50:
	                offload_percentage = 0.6
	            else:
	                offload_percentage = 0.4

	            entries_to_save = max(1, int(len(all_texts) * offload_percentage))
	            entries_to_save = min(entries_to_save, len(all_texts) - 1)  # Keep at least 1 entry

	            # Entries were already capped at max_text_chunk_size, so passing
	            # the same cap here stores them without further truncation.
	            if entries_to_save > 0 and write_texts_to_disk(
	                all_texts[:entries_to_save],
	                temp_file_path,
	                max_chars_per_text=resources.max_text_chunk_size,
	            ):
	                offload_paths.append(temp_file_path)
	                offloaded_entries += entries_to_save
	                logging.info(f"Offloaded {entries_to_save} entries ({offload_percentage*100:.0f}%) to disk, {len(all_texts)-entries_to_save} remain in memory")
	                del all_texts[:entries_to_save]

	            # Force RAM (and CUDA) cleanup after the file write
	            cleanup_cuda(force=True)

	        # Check against adaptive memory thresholds
	        if ram_usage > resources.ram_usage_warning:
	            logging.warning(f"RAM usage high ({ram_usage:.2f} GB), running RAM-focused cleanup")
	            manage_ram()

	            # If still high after cleanup, take more aggressive measures
	            ram_usage = process.memory_info().rss / bytes_per_gb
	            if ram_usage > resources.ram_usage_critical:
	                logging.warning(f"RAM usage critical ({ram_usage:.2f} GB), performing emergency cleanup")
	                manage_ram(aggressive=True)

	                # Adaptive batch reduction: cap how many more batches are read
	                if batch_limit - batch_idx > 3:
	                    remaining_batch_count = 3 if resources.total_ram_gb >= 8 else 2
	                    logging.warning(f"Reducing remaining batches from {batch_limit - batch_idx} to {remaining_batch_count}")
	                    batch_limit = batch_idx + remaining_batch_count

	    return total_chars, offloaded_entries


	def _iter_training_batches(offload_paths, in_memory_texts, batch_size):
	    """Yield lists of at most ``batch_size`` texts: offloaded files first, then in-memory texts."""

	    def batches_of(texts):
	        for start in range(0, len(texts), batch_size):
	            # Relieve memory pressure between batches before handing out more data
	            current_ram_percent = psutil.virtual_memory().percent
	            if current_ram_percent > 85:
	                logging.warning(f"Memory usage critical before training: {current_ram_percent}%")
	                cleanup_cuda(force=True)
	            yield texts[start:start + batch_size]

	    # Only one offload file is held in memory at a time
	    for path in offload_paths:
	        yield from batches_of(read_texts_from_disk(path))
	    yield from batches_of(in_memory_texts)


	if __name__ == "__main__":
	    # Command-line entry point: read the positional arguments, build the
	    # tokenizer, and exit with status 0 on success or 1 on failure.
	    logging.info("Starting EZ-Tokenizer creation script")
	    logging.info("EZ-Tokenizer v1.0.0 - Optimized for performance and accuracy")
	    logging.info("Copyright (c) 2025 EZ-Tokenizer Team. All rights reserved.")

	    argc = len(sys.argv)
	    if argc < 3:
	        print("Usage: python adaptive_tokenizer.py <input_dir> <output_path> [vocab_size] [min_frequency] [max_files]")
	        print(" max_files: Optional maximum number of files to process (default: auto-determined)")
	        print(" Use 'MAX' to process all files in the directory")
	        sys.exit(1)

	    input_dir, output_path = sys.argv[1], sys.argv[2]

	    # Optional numeric arguments fall back to their defaults when absent
	    vocab_size = 40000 if argc <= 3 else int(sys.argv[3])
	    min_frequency = 2 if argc <= 4 else int(sys.argv[4])

	    # max_files accepts an integer or the keyword 'MAX' (case-insensitive);
	    # anything else falls back to automatic determination.
	    max_files = None
	    if argc > 5:
	        raw_max_files = sys.argv[5]
	        if raw_max_files.upper() == 'MAX':
	            max_files = float('inf')  # Effectively no limit
	            logging.info("MAX keyword detected - will process all available files")
	        else:
	            try:
	                max_files = int(raw_max_files)
	            except ValueError:
	                logging.warning(f"Invalid max_files value: {raw_max_files} - using auto determination")
	                max_files = None

	    # Detect system resources automatically
	    resources = SystemResources()

	    if max_files == float('inf'):
	        max_files_label = "MAX (all files)"
	    elif max_files is not None:
	        max_files_label = max_files
	    else:
	        max_files_label = 'auto'

	    logging.info("Starting tokenizer creation with the following parameters:")
	    logging.info("Configuration:")
	    logging.info(f" Input directory: {input_dir}")
	    logging.info(f" Output path: {output_path}")
	    logging.info(f" Vocabulary size: {vocab_size}")
	    logging.info(f" Minimum frequency: {min_frequency}")
	    logging.info(f" Maximum files: {max_files_label}")

	    # Imported here because only the script path needs them
	    import tempfile
	    import atexit
	    import shutil

	    # Directory that receives offloaded data during the run
	    temp_dir = tempfile.mkdtemp(prefix='nexforge_tokenizer_')
	    logging.info(f"Created temporary directory for data offloading: {temp_dir}")

	    def cleanup_temp():
	        # Runs at interpreter exit; removes the offload directory if it still exists
	        try:
	            if os.path.exists(temp_dir):
	                shutil.rmtree(temp_dir, ignore_errors=True)
	                logging.info(f"Cleaned up temporary directory: {temp_dir}")
	        except Exception as e:
	            logging.warning(f"Error cleaning up temporary directory: {e}")

	    atexit.register(cleanup_temp)

	    # Initial memory check
	    log_memory_usage()

	    success = build_tokenizer(
	        input_dir=input_dir,
	        output_path=output_path,
	        vocab_size=vocab_size,
	        min_frequency=min_frequency,
	        max_files=max_files,
	        resources=resources,
	        temp_dir=temp_dir,
	    )

	    # The temp directory itself is removed by the atexit handler
	    logging.info("Temporary files will be cleaned up on exit")

	    # Final status
	    if not success:
	        logging.error("Tokenizer creation failed")
	        sys.exit(1)

	    logging.info("Tokenizer creation completed successfully")
	    sys.exit(0)