:: EZ-Tokenizer / run_ez_tokenizer.bat
:: Source: Johnnyman1100's upload ("Upload 38 files", commit 4265aea, verified)
:: NOTE(review): these header lines were raw web-page text above @echo off and
:: would have executed as (failing) commands; converted to batch comments.
@echo off
:: Set up directory variables first
:: %~dp0 = drive+path of this script, always with a trailing backslash.
set "SCRIPT_DIR=%~dp0"
:: Strip the trailing backslash so paths can be built as "%SCRIPT_DIR%\sub".
set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%"
:: Remember where the user launched from; restored at :exit.
set "CURRENT_DIR=%CD%"
:: Work from the script's own directory so relative paths (pip install -e .) resolve.
cd /d "%SCRIPT_DIR%"
:: EZ-Tokenizer Launcher with Banner
:: This script must be run as administrator
:: Previous versions were known as NexForge Tokenizer
:: All functionality remains the same, only the name has been updated
cls
echo.
echo =======================================================
echo EZ-TOKENIZER v1.0.0
echo (CodeGen-NF Model Pre-Release)
echo =======================================================
echo Script running from: %SCRIPT_DIR%
:check_admin
:: 'net session' requires elevation, so its exit code doubles as an admin probe.
net session >nul 2>&1
if %errorLevel% == 0 (
echo Running with administrator privileges...
) else (
echo ###########################################################
echo # #
echo # EZ-TOKENIZER REQUIRES ADMINISTRATOR PRIVILEGES #
echo # Please right-click and select 'Run as administrator' #
echo # #
echo ###########################################################
echo.
echo Please right-click on this file and select "Run as administrator"
pause
:: Fix: return a non-zero code so callers can detect the failed elevation
:: check (bare 'exit /b' propagated errorlevel 0 = success).
exit /b 1
)
:menu
cls
:: Display banner (NexForge is the project's former name; see header notes)
echo N N EEEEE X X FFFFF OOOOO RRRR GGGG EEEEE
echo NN N E X X F O O R R G E
echo N N N EEEE X FFFF O O RRRR G GG EEEE
echo N NN E X X F O O R R G G E
echo N N EEEEE X X F OOOOO R R GGGG EEEEE
echo.
echo PRESENTS:
echo =======================================================
echo EZ-TOKENIZER v1.0.0
echo =======================================================
:: Display current directory with error checking.
:: Fix: "Current directory" previously printed %~dp0 (the script dir) for
:: both lines; show the real %CD% and the normalized %SCRIPT_DIR% instead.
if defined SCRIPT_DIR (
echo Current directory: %CD%
echo Script directory: %SCRIPT_DIR%
) else (
echo [WARNING] SCRIPT_DIR not defined. Using current directory: %CD%
set "SCRIPT_DIR=%CD%"
)
echo.
echo MINIMUM REQUIREMENTS:
echo - Python 3.8 or higher
echo - 4GB RAM minimum (8GB+ recommended)
echo - 1GB free disk space
echo.
echo DATASET INFORMATION:
echo - Dataset location: %SCRIPT_DIR%\Dataset\
echo - Please add your dataset files to the directory or use 4. Open Dataset Directory and insert your files.
echo.
echo MENU:
echo 1. Install Dependencies
echo 2. Create Tokenizer (50k vocab, min_freq=2)
echo 3. Test Tokenizer (10,000 samples)
echo 4. Open Dataset Directory
echo 5. Exit
echo.
:: Fix: clear any previous answer first -- 'set /p' keeps the old value when
:: the user just presses Enter, which would silently repeat the last action.
set "choice="
set /p choice=Enter your choice (1-5):
echo.
if "%choice%"=="1" goto install_deps
if "%choice%"=="2" goto create_tokenizer
if "%choice%"=="3" goto test_tokenizer
if "%choice%"=="4" goto open_dataset
if "%choice%"=="5" goto exit
echo Invalid choice. Please enter a number between 1 and 5.
pause
goto menu
:install_deps
:: Create the venv (if needed), activate it, and install all Python dependencies.
:: Every pip step falls back to a softer variant before giving up to the menu.
echo Installing dependencies...
echo This may take a few minutes...
echo.
:: Create virtual environment if it doesn't exist
if not exist "%SCRIPT_DIR%\venv" (
echo Creating virtual environment...
python -m venv "%SCRIPT_DIR%\venv"
if errorlevel 1 (
echo Failed to create virtual environment
pause
goto menu
)
)
:: Activate virtual environment and install dependencies.
:: Fix: verify activation succeeded -- otherwise every install below would
:: silently target the global Python instead of the venv.
call "%SCRIPT_DIR%\venv\Scripts\activate"
if errorlevel 1 (
echo [ERROR] Failed to activate virtual environment
pause
goto menu
)
:: Upgrade pip first
echo [INFO] Upgrading pip...
python -m pip install --upgrade pip
if errorlevel 1 (
echo [ERROR] Failed to upgrade pip
pause
goto menu
)
:: Install PyTorch CPU version (pinned first, latest-compatible as fallback)
echo [INFO] Installing PyTorch CPU version...
pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu
if errorlevel 1 (
echo [WARNING] Failed to install specific PyTorch version, trying latest compatible version...
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
if errorlevel 1 (
echo [ERROR] Failed to install PyTorch
echo [INFO] Please check your internet connection and try again
pause
goto menu
)
)
:: Install other dependencies (retry without the wheel cache on failure)
echo [INFO] Installing additional dependencies...
pip install tqdm==4.65.0 psutil==5.9.5 python-dateutil==2.8.2 python-Levenshtein
if errorlevel 1 (
echo [WARNING] Failed to install some dependencies, trying with --no-cache-dir...
pip install --no-cache-dir tqdm==4.65.0 psutil==5.9.5 python-dateutil==2.8.2 python-Levenshtein
if errorlevel 1 (
echo [ERROR] Failed to install additional dependencies
pause
goto menu
)
)
:: Install tokenizers with pre-built wheel (it needs a Rust/C++ toolchain to
:: build from source, which most users will not have)
echo [INFO] Installing tokenizers...
pip install tokenizers==0.21.1 --only-binary :all:
if errorlevel 1 (
echo [WARNING] Could not install tokenizers with pre-built wheel
echo [INFO] Trying alternative installation method...
pip install tokenizers==0.21.1 --no-deps
if errorlevel 1 (
echo [ERROR] Failed to install tokenizers
echo Note: This package requires a C++ build toolchain or a pre-built wheel.
echo On Windows, you may need to install Visual Studio Build Tools with C++ workload.
pause
goto menu
)
)
echo.
echo [INFO] All dependencies installed successfully!
:: Editable install of this repo ('.') -- requires the cd /d done at startup.
echo [INFO] Installing nexforgetokenizer in development mode...
python -m pip install -e .
if errorlevel 1 (
echo [ERROR] Failed to install nexforgetokenizer in development mode
pause
goto menu
)
echo [INFO] Package installation complete!
pause
goto menu
:create_tokenizer
:: Train the tokenizer over Dataset\ and write output\tokenizer.json.
if not exist "%SCRIPT_DIR%\venv" (
echo Virtual environment not found. Please install dependencies first.
pause
goto menu
)
:: Fix: verify activation succeeded so the module runs inside the venv.
call "%SCRIPT_DIR%\venv\Scripts\activate"
if errorlevel 1 (
echo [ERROR] Failed to activate virtual environment
pause
goto menu
)
:: Create output directory if it doesn't exist
if not exist "%SCRIPT_DIR%\output" mkdir "%SCRIPT_DIR%\output"
:: Check if dataset directory exists; create it and open it for the user if not
if not exist "%SCRIPT_DIR%\Dataset" (
echo Creating Dataset directory...
mkdir "%SCRIPT_DIR%\Dataset"
echo Please add your dataset files to: %SCRIPT_DIR%\Dataset
pause
start "" "%SCRIPT_DIR%\Dataset"
goto menu
)
:: Check if there are any files in the Dataset directory
:: (dir /b exits non-zero when the directory is empty)
dir /b "%SCRIPT_DIR%\Dataset\*.*" >nul 2>&1
if errorlevel 1 (
echo No files found in: %SCRIPT_DIR%\Dataset
echo Please add your dataset files to this directory.
pause
start "" "%SCRIPT_DIR%\Dataset"
goto menu
)
:: Args: <input dir> <output json> <vocab size> <min freq> <file limit>
echo Creating EZ-Tokenizer with 50k vocabulary and min_freq=2 (all files)...
python -m nexforgetokenizer.adaptive_tokenizer "%SCRIPT_DIR%\Dataset" "%SCRIPT_DIR%\output\tokenizer.json" 50000 2 MAX
if errorlevel 1 (
echo Failed to create tokenizer
pause
goto menu
)
echo.
echo EZ-Tokenizer created successfully at: %SCRIPT_DIR%\output\tokenizer.json
echo Vocabulary size: 50,000
echo Minimum frequency: 2
echo Processed all available files in the dataset
echo.
echo You can now use this tokenizer in your projects by loading: output\tokenizer.json
pause
goto menu
:test_tokenizer
:: Run the tokenizer test script against Dataset\ with a 10,000-sample budget.
if not exist "%SCRIPT_DIR%\venv" (
echo Virtual environment not found. Please install dependencies first.
pause
goto menu
)
:: Fix: verify activation succeeded so the test runs inside the venv.
call "%SCRIPT_DIR%\venv\Scripts\activate"
if errorlevel 1 (
echo [ERROR] Failed to activate virtual environment
pause
goto menu
)
:: Create test_result directory if it doesn't exist
if not exist "%SCRIPT_DIR%\test_result" mkdir "%SCRIPT_DIR%\test_result"
:: Check if tokenizer exists
if not exist "%SCRIPT_DIR%\output\tokenizer.json" (
echo EZ-Tokenizer not found. Please create a tokenizer first.
echo Looking for: %SCRIPT_DIR%\output\tokenizer.json
pause
goto menu
)
echo Testing EZ-Tokenizer with 10,000 samples...
python "%SCRIPT_DIR%\Test_tokenizer\test_tokenizer.py" --tokenizer "%SCRIPT_DIR%\output\tokenizer.json" --input "%SCRIPT_DIR%\Dataset" --sample 10000 --output "%SCRIPT_DIR%\test_result\test_run.txt"
if errorlevel 1 (
echo Test run failed
pause
goto menu
)
echo.
:: Fix: only ONE run is executed; the old message claimed "Both test runs".
echo Test run completed successfully!
echo Results saved to: %SCRIPT_DIR%\test_result\
:: Open the test results directory
if exist "%SCRIPT_DIR%\test_result\" (
start "" "%SCRIPT_DIR%\test_result\"
) else (
echo Warning: Test results directory not found.
)
pause
goto menu
:open_dataset
:: Make sure the Dataset folder exists, then open it in Windows Explorer.
if not exist "%SCRIPT_DIR%\Dataset" mkdir "%SCRIPT_DIR%\Dataset"
start "" "%SCRIPT_DIR%\Dataset"
goto menu
:exit
:: Restore the caller's original working directory before leaving.
cd /d "%CURRENT_DIR%"
:: Fix: the product was renamed from NexForge Tokenizer to EZ-Tokenizer
:: (see header notes); the farewell message still used the old name.
echo Exiting EZ-Tokenizer Manager...
timeout /t 2 >nul
exit