:: EZ-Tokenizer / run_ez_tokenizer.bat
:: Source: Johnnyman1100's upload ("Upload 38 files", commit 4265aea, verified)
:: NOTE(review): these header lines were raw web-page text above @echo off and
:: would have executed as (failing) commands; converted to batch comments.
@echo off
:: Set up directory variables first
:: %~dp0 = drive+path of this script, always with a trailing backslash.
set "SCRIPT_DIR=%~dp0"
:: Strip the trailing backslash so paths can be built as "%SCRIPT_DIR%\sub".
set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%"
:: Remember where the user launched from; restored at :exit.
set "CURRENT_DIR=%CD%"
:: Work from the script's own directory so relative paths (pip install -e .) resolve.
cd /d "%SCRIPT_DIR%"
:: EZ-Tokenizer Launcher with Banner
:: This script must be run as administrator
:: Previous versions were known as NexForge Tokenizer
:: All functionality remains the same, only the name has been updated
cls
echo.
echo =======================================================
echo EZ-TOKENIZER v1.0.0
echo (CodeGen-NF Model Pre-Release)
echo =======================================================
echo Script running from: %SCRIPT_DIR%
:check_admin
:: 'net session' requires elevation, so its exit code doubles as an admin probe.
net session >nul 2>&1
if %errorLevel% == 0 (
echo Running with administrator privileges...
) else (
echo ###########################################################
echo # #
echo # EZ-TOKENIZER REQUIRES ADMINISTRATOR PRIVILEGES #
echo # Please right-click and select 'Run as administrator' #
echo # #
echo ###########################################################
echo.
echo Please right-click on this file and select "Run as administrator"
pause
:: Fix: return a non-zero code so callers can detect the failed elevation
:: check (bare 'exit /b' propagated errorlevel 0 = success).
exit /b 1
)
:menu
cls
:: Display banner (NexForge is the project's former name; see header notes)
echo N N EEEEE X X FFFFF OOOOO RRRR GGGG EEEEE
echo NN N E X X F O O R R G E
echo N N N EEEE X FFFF O O RRRR G GG EEEE
echo N NN E X X F O O R R G G E
echo N N EEEEE X X F OOOOO R R GGGG EEEEE
echo.
echo PRESENTS:
echo =======================================================
echo EZ-TOKENIZER v1.0.0
echo =======================================================
:: Display current directory with error checking.
:: Fix: "Current directory" previously printed %~dp0 (the script dir) for
:: both lines; show the real %CD% and the normalized %SCRIPT_DIR% instead.
if defined SCRIPT_DIR (
echo Current directory: %CD%
echo Script directory: %SCRIPT_DIR%
) else (
echo [WARNING] SCRIPT_DIR not defined. Using current directory: %CD%
set "SCRIPT_DIR=%CD%"
)
echo.
echo MINIMUM REQUIREMENTS:
echo - Python 3.8 or higher
echo - 4GB RAM minimum (8GB+ recommended)
echo - 1GB free disk space
echo.
echo DATASET INFORMATION:
echo - Dataset location: %SCRIPT_DIR%\Dataset\
echo - Please add your dataset files to the directory or use 4. Open Dataset Directory and insert your files.
echo.
echo MENU:
echo 1. Install Dependencies
echo 2. Create Tokenizer (50k vocab, min_freq=2)
echo 3. Test Tokenizer (10,000 samples)
echo 4. Open Dataset Directory
echo 5. Exit
echo.
:: Fix: clear any previous answer first -- 'set /p' keeps the old value when
:: the user just presses Enter, which would silently repeat the last action.
set "choice="
set /p choice=Enter your choice (1-5):
echo.
if "%choice%"=="1" goto install_deps
if "%choice%"=="2" goto create_tokenizer
if "%choice%"=="3" goto test_tokenizer
if "%choice%"=="4" goto open_dataset
if "%choice%"=="5" goto exit
echo Invalid choice. Please enter a number between 1 and 5.
pause
goto menu
:install_deps
:: Create the venv (if needed), activate it, and install all Python dependencies.
:: Every pip step falls back to a softer variant before giving up to the menu.
echo Installing dependencies...
echo This may take a few minutes...
echo.
:: Create virtual environment if it doesn't exist
if not exist "%SCRIPT_DIR%\venv" (
echo Creating virtual environment...
python -m venv "%SCRIPT_DIR%\venv"
if errorlevel 1 (
echo Failed to create virtual environment
pause
goto menu
)
)
:: Activate virtual environment and install dependencies.
:: Fix: verify activation succeeded -- otherwise every install below would
:: silently target the global Python instead of the venv.
call "%SCRIPT_DIR%\venv\Scripts\activate"
if errorlevel 1 (
echo [ERROR] Failed to activate virtual environment
pause
goto menu
)
:: Upgrade pip first
echo [INFO] Upgrading pip...
python -m pip install --upgrade pip
if errorlevel 1 (
echo [ERROR] Failed to upgrade pip
pause
goto menu
)
:: Install PyTorch CPU version (pinned first, latest-compatible as fallback)
echo [INFO] Installing PyTorch CPU version...
pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu
if errorlevel 1 (
echo [WARNING] Failed to install specific PyTorch version, trying latest compatible version...
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
if errorlevel 1 (
echo [ERROR] Failed to install PyTorch
echo [INFO] Please check your internet connection and try again
pause
goto menu
)
)
:: Install other dependencies (retry without the wheel cache on failure)
echo [INFO] Installing additional dependencies...
pip install tqdm==4.65.0 psutil==5.9.5 python-dateutil==2.8.2 python-Levenshtein
if errorlevel 1 (
echo [WARNING] Failed to install some dependencies, trying with --no-cache-dir...
pip install --no-cache-dir tqdm==4.65.0 psutil==5.9.5 python-dateutil==2.8.2 python-Levenshtein
if errorlevel 1 (
echo [ERROR] Failed to install additional dependencies
pause
goto menu
)
)
:: Install tokenizers with pre-built wheel (it needs a Rust/C++ toolchain to
:: build from source, which most users will not have)
echo [INFO] Installing tokenizers...
pip install tokenizers==0.21.1 --only-binary :all:
if errorlevel 1 (
echo [WARNING] Could not install tokenizers with pre-built wheel
echo [INFO] Trying alternative installation method...
pip install tokenizers==0.21.1 --no-deps
if errorlevel 1 (
echo [ERROR] Failed to install tokenizers
echo Note: This package requires a C++ build toolchain or a pre-built wheel.
echo On Windows, you may need to install Visual Studio Build Tools with C++ workload.
pause
goto menu
)
)
echo.
echo [INFO] All dependencies installed successfully!
:: Editable install of this repo ('.') -- requires the cd /d done at startup.
echo [INFO] Installing nexforgetokenizer in development mode...
python -m pip install -e .
if errorlevel 1 (
echo [ERROR] Failed to install nexforgetokenizer in development mode
pause
goto menu
)
echo [INFO] Package installation complete!
pause
goto menu
:create_tokenizer
:: Train the tokenizer over Dataset\ and write output\tokenizer.json.
if not exist "%SCRIPT_DIR%\venv" (
echo Virtual environment not found. Please install dependencies first.
pause
goto menu
)
:: Fix: verify activation succeeded so the module runs inside the venv.
call "%SCRIPT_DIR%\venv\Scripts\activate"
if errorlevel 1 (
echo [ERROR] Failed to activate virtual environment
pause
goto menu
)
:: Create output directory if it doesn't exist
if not exist "%SCRIPT_DIR%\output" mkdir "%SCRIPT_DIR%\output"
:: Check if dataset directory exists; create it and open it for the user if not
if not exist "%SCRIPT_DIR%\Dataset" (
echo Creating Dataset directory...
mkdir "%SCRIPT_DIR%\Dataset"
echo Please add your dataset files to: %SCRIPT_DIR%\Dataset
pause
start "" "%SCRIPT_DIR%\Dataset"
goto menu
)
:: Check if there are any files in the Dataset directory
:: (dir /b exits non-zero when the directory is empty)
dir /b "%SCRIPT_DIR%\Dataset\*.*" >nul 2>&1
if errorlevel 1 (
echo No files found in: %SCRIPT_DIR%\Dataset
echo Please add your dataset files to this directory.
pause
start "" "%SCRIPT_DIR%\Dataset"
goto menu
)
:: Args: <input dir> <output json> <vocab size> <min freq> <file limit>
echo Creating EZ-Tokenizer with 50k vocabulary and min_freq=2 (all files)...
python -m nexforgetokenizer.adaptive_tokenizer "%SCRIPT_DIR%\Dataset" "%SCRIPT_DIR%\output\tokenizer.json" 50000 2 MAX
if errorlevel 1 (
echo Failed to create tokenizer
pause
goto menu
)
echo.
echo EZ-Tokenizer created successfully at: %SCRIPT_DIR%\output\tokenizer.json
echo Vocabulary size: 50,000
echo Minimum frequency: 2
echo Processed all available files in the dataset
echo.
echo You can now use this tokenizer in your projects by loading: output\tokenizer.json
pause
goto menu
:test_tokenizer
:: Run the tokenizer test script against Dataset\ with a 10,000-sample budget.
if not exist "%SCRIPT_DIR%\venv" (
echo Virtual environment not found. Please install dependencies first.
pause
goto menu
)
:: Fix: verify activation succeeded so the test runs inside the venv.
call "%SCRIPT_DIR%\venv\Scripts\activate"
if errorlevel 1 (
echo [ERROR] Failed to activate virtual environment
pause
goto menu
)
:: Create test_result directory if it doesn't exist
if not exist "%SCRIPT_DIR%\test_result" mkdir "%SCRIPT_DIR%\test_result"
:: Check if tokenizer exists
if not exist "%SCRIPT_DIR%\output\tokenizer.json" (
echo EZ-Tokenizer not found. Please create a tokenizer first.
echo Looking for: %SCRIPT_DIR%\output\tokenizer.json
pause
goto menu
)
echo Testing EZ-Tokenizer with 10,000 samples...
python "%SCRIPT_DIR%\Test_tokenizer\test_tokenizer.py" --tokenizer "%SCRIPT_DIR%\output\tokenizer.json" --input "%SCRIPT_DIR%\Dataset" --sample 10000 --output "%SCRIPT_DIR%\test_result\test_run.txt"
if errorlevel 1 (
echo Test run failed
pause
goto menu
)
echo.
:: Fix: only ONE run is executed; the old message claimed "Both test runs".
echo Test run completed successfully!
echo Results saved to: %SCRIPT_DIR%\test_result\
:: Open the test results directory
if exist "%SCRIPT_DIR%\test_result\" (
start "" "%SCRIPT_DIR%\test_result\"
) else (
echo Warning: Test results directory not found.
)
pause
goto menu
:open_dataset
:: Make sure the Dataset folder exists, then open it in Windows Explorer.
if not exist "%SCRIPT_DIR%\Dataset" mkdir "%SCRIPT_DIR%\Dataset"
start "" "%SCRIPT_DIR%\Dataset"
goto menu
:exit
:: Restore the caller's original working directory before leaving.
cd /d "%CURRENT_DIR%"
:: Fix: the product was renamed from NexForge Tokenizer to EZ-Tokenizer
:: (see header notes); the farewell message still used the old name.
echo Exiting EZ-Tokenizer Manager...
timeout /t 2 >nul
exit