| @echo off |
| :: Keep every variable this launcher sets (SCRIPT_DIR, CURRENT_DIR, choice, and |
| :: anything the venv activation script changes) local to this run, so nothing |
| :: leaks into the cmd session that started the script. |
| setlocal |
|
| :: SCRIPT_DIR  - folder containing this script, with the trailing backslash that |
| ::               %~dp0 carries removed so "%SCRIPT_DIR%\name" joins cleanly. |
| :: CURRENT_DIR - the caller's working directory, captured before the cd below so |
| ::               it can be restored when the script ends. |
| set "SCRIPT_DIR=%~dp0" |
| set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%" |
| set "CURRENT_DIR=%CD%" |
|
| :: Work from the script folder; later steps such as "pip install -e ." rely on it. |
| :: "||" reacts to the failure of this cd itself, independent of any errorlevel |
| :: left over from an earlier command. |
| cd /d "%SCRIPT_DIR%" || ( |
|     echo [ERROR] Could not change to the script directory: "%SCRIPT_DIR%" |
|     pause |
|     exit /b 1 |
| ) |
|
|
| :: EZ-Tokenizer launcher: prints a startup banner, then shows an interactive menu. |
| :: Administrator rights are required; the :check_admin section stops the script without them. |
| :: Earlier releases were named NexForge Tokenizer. Per the original note, only the |
| :: name changed; the functionality is the same. |
|
|
| :: Start from a clean console. |
| cls |
|
|
| :: Startup banner: product name, release tag, and the folder the script runs from. |
| :: The :menu section clears the screen again, so this stays visible only when the |
| :: administrator check stops the script first. |
| echo. |
| echo ======================================================= |
| echo EZ-TOKENIZER v1.0.0 |
| echo (CodeGen-NF Model Pre-Release) |
| echo ======================================================= |
| echo Script running from: %SCRIPT_DIR% |
|
|
| :check_admin |
| :: Verify that the script is running elevated. "net session" is used purely as a |
| :: probe: its output is discarded and only its exit status is examined. |
| :: NOTE(review): the probe presumably also fails when the Windows "Server" service |
| :: is stopped, even in an elevated prompt - confirm this is acceptable. |
| net session >nul 2>&1 |
| if %errorLevel% == 0 ( |
|     echo Running with administrator privileges... |
| ) else ( |
|     echo ########################################################### |
|     echo # # |
|     echo # EZ-TOKENIZER REQUIRES ADMINISTRATOR PRIVILEGES # |
|     echo # Please right-click and select 'Run as administrator' # |
|     echo # # |
|     echo ########################################################### |
|     echo. |
|     echo Please right-click on this file and select "Run as administrator" |
|     pause |
|     rem Put the caller back in its original directory and report failure; |
|     rem a bare "exit /b" would return the status of PAUSE, i.e. success. |
|     cd /d "%CURRENT_DIR%" |
|     exit /b 1 |
| ) |
|
|
| :menu |
| :: Main menu loop: redraw the banner and information text, read one choice and |
| :: jump to the matching section. Every section returns here with "goto menu". |
| cls |
| :: Display banner |
| echo N N EEEEE X X FFFFF OOOOO RRRR GGGG EEEEE |
| echo NN N E X X F O O R R G E |
| echo N N N EEEE X FFFF O O RRRR G GG EEEE |
| echo N NN E X X F O O R R G G E |
| echo N N EEEEE X X F OOOOO R R GGGG EEEEE |
| echo. |
| echo PRESENTS: |
| echo ======================================================= |
| echo EZ-TOKENIZER v1.0.0 |
| echo ======================================================= |
| :: Fall back to the current directory if SCRIPT_DIR is not defined. |
| :: Single-line IFs and top-level ECHOs are used on purpose: an unquoted path that |
| :: contains ")" (for example "Program Files (x86)") would terminate a parenthesised |
| :: block early and abort the script with a syntax error. |
| if not defined SCRIPT_DIR echo [WARNING] SCRIPT_DIR not defined. Using current directory: %CD% |
| if not defined SCRIPT_DIR set "SCRIPT_DIR=%CD%" |
| :: Show the actual working directory and the folder the script was started from. |
| echo Current directory: %CD% |
| echo Script directory: %~dp0 |
| echo. |
| echo MINIMUM REQUIREMENTS: |
| echo - Python 3.8 or higher |
| echo - 4GB RAM minimum (8GB+ recommended) |
| echo - 1GB free disk space |
|
|
| echo. |
| echo DATASET INFORMATION: |
| echo - Dataset location: %SCRIPT_DIR%\Dataset\ |
| echo - Please add your dataset files to the directory or use 4. Open Dataset Directory and insert your files. |
|
|
| echo. |
| echo MENU: |
| echo 1. Install Dependencies |
| echo 2. Create Tokenizer (50k vocab, min_freq=2) |
| echo 3. Test Tokenizer (2 runs with 10,000 samples) |
| echo 4. Open Dataset Directory |
| echo 5. Exit |
| echo. |
| :: SET /P leaves the variable unchanged when the user just presses Enter, so clear |
| :: it first; otherwise the previous selection would silently run again. |
| set "choice=" |
| set /p "choice=Enter your choice (1-5): " |
| :: Remove double quotes from the answer so it cannot break out of the quoted |
| :: comparisons below. |
| if defined choice set "choice=%choice:"=%" |
|
|
| echo. |
|
|
| if "%choice%"=="1" goto install_deps |
| if "%choice%"=="2" goto create_tokenizer |
| if "%choice%"=="3" goto test_tokenizer |
| if "%choice%"=="4" goto open_dataset |
| if "%choice%"=="5" goto exit |
|
|
| :: Anything else (including an empty answer) is rejected and the menu is shown again. |
| echo Invalid choice. Please enter a number between 1 and 5. |
| pause |
| goto menu |
|
|
| :install_deps |
| :: Menu option 1: create the virtual environment if needed, then install the |
| :: Python dependencies and this project into it. A step that fails for good |
| :: pauses so the message can be read and then returns to the menu. |
| :: pip is always run as "python -m pip" so it uses the same interpreter as the |
| :: "python" command that is active after the environment has been activated. |
| echo Installing dependencies... |
| echo This may take a few minutes... |
| echo. |
|
|
| :: Create the virtual environment when its activation script is missing. Testing |
| :: for activate.bat instead of the bare folder means an incomplete venv folder is |
| :: set up again rather than skipped, which would leave nothing to activate and |
| :: send the installs below to whatever "python" is first on PATH. |
| if not exist "%SCRIPT_DIR%\venv\Scripts\activate.bat" ( |
|     echo Creating virtual environment... |
|     python -m venv "%SCRIPT_DIR%\venv" |
|     if errorlevel 1 ( |
|         echo Failed to create virtual environment |
|         pause |
|         goto menu |
|     ) |
| ) |
|
|
| :: Activate virtual environment and install dependencies |
| call "%SCRIPT_DIR%\venv\Scripts\activate.bat" |
|
|
| :: Upgrade pip first |
| echo [INFO] Upgrading pip... |
| python -m pip install --upgrade pip |
| if errorlevel 1 ( |
|     echo [ERROR] Failed to upgrade pip |
|     pause |
|     goto menu |
| ) |
|
|
| :: Install the CPU build of PyTorch: pinned versions first, unpinned as a fallback. |
| echo [INFO] Installing PyTorch CPU version... |
| python -m pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu |
| if errorlevel 1 ( |
|     echo [WARNING] Failed to install specific PyTorch version, trying latest compatible version... |
|     python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu |
|     if errorlevel 1 ( |
|         echo [ERROR] Failed to install PyTorch |
|         echo [INFO] Please check your internet connection and try again |
|         pause |
|         goto menu |
|     ) |
| ) |
|
|
| :: Install the remaining dependencies in one pip call; retry once without the |
| :: pip cache if that fails. |
| echo [INFO] Installing additional dependencies... |
| python -m pip install tqdm==4.65.0 psutil==5.9.5 python-dateutil==2.8.2 python-Levenshtein |
| if errorlevel 1 ( |
|     echo [WARNING] Failed to install some dependencies, trying with --no-cache-dir... |
|     python -m pip install --no-cache-dir tqdm==4.65.0 psutil==5.9.5 python-dateutil==2.8.2 python-Levenshtein |
|     if errorlevel 1 ( |
|         echo [ERROR] Failed to install additional dependencies |
|         pause |
|         goto menu |
|     ) |
| ) |
|
|
| :: Install tokenizers from a pre-built wheel only; fall back to a plain install |
| :: without dependency resolution. |
| echo [INFO] Installing tokenizers... |
| python -m pip install tokenizers==0.21.1 --only-binary :all: |
| if errorlevel 1 ( |
|     echo [WARNING] Could not install tokenizers with pre-built wheel |
|     echo [INFO] Trying alternative installation method... |
|     python -m pip install tokenizers==0.21.1 --no-deps |
|     if errorlevel 1 ( |
|         echo [ERROR] Failed to install tokenizers |
|         echo Note: This package requires a C++ build toolchain or a pre-built wheel. |
|         echo On Windows, you may need to install Visual Studio Build Tools with C++ workload. |
|         pause |
|         goto menu |
|     ) |
| ) |
|
|
| echo. |
| echo [INFO] All dependencies installed successfully! |
|
|
| :: Editable install of the project itself. The script folder is passed explicitly |
| :: so the step does not depend on the current working directory. |
| echo [INFO] Installing nexforgetokenizer in development mode... |
| python -m pip install -e "%SCRIPT_DIR%" |
| if errorlevel 1 ( |
|     echo [ERROR] Failed to install nexforgetokenizer in development mode |
|     pause |
|     goto menu |
| ) |
|
|
| echo [INFO] Package installation complete! |
| pause |
| goto menu |
|
|
| :create_tokenizer |
| :: Menu option 2: build a tokenizer from the files in Dataset\ and write it to |
| :: output\tokenizer.json. |
| :: Paths echoed inside parenthesised blocks are quoted: an unquoted ")" in the |
| :: install path (for example "Program Files (x86)") would close the block early |
| :: and abort the script with a syntax error. |
|
| :: The environment is usable only if its activation script exists. |
| if not exist "%SCRIPT_DIR%\venv\Scripts\activate.bat" ( |
|     echo Virtual environment not found. Please install dependencies first. |
|     pause |
|     goto menu |
| ) |
|
|
| call "%SCRIPT_DIR%\venv\Scripts\activate.bat" |
|
|
| :: Create output directory if it doesn't exist |
| if not exist "%SCRIPT_DIR%\output" mkdir "%SCRIPT_DIR%\output" |
|
|
| :: First run: create the Dataset folder, open it for the user and return to the menu. |
| if not exist "%SCRIPT_DIR%\Dataset" ( |
|     echo Creating Dataset directory... |
|     mkdir "%SCRIPT_DIR%\Dataset" |
|     echo Please add your dataset files to: "%SCRIPT_DIR%\Dataset" |
|     pause |
|     start "" "%SCRIPT_DIR%\Dataset" |
|     goto menu |
| ) |
|
|
| :: Stop when the Dataset folder is empty; DIR exits non-zero when nothing matches. |
| dir /b "%SCRIPT_DIR%\Dataset\*.*" >nul 2>&1 |
| if errorlevel 1 ( |
|     echo No files found in: "%SCRIPT_DIR%\Dataset" |
|     echo Please add your dataset files to this directory. |
|     pause |
|     start "" "%SCRIPT_DIR%\Dataset" |
|     goto menu |
| ) |
|
|
| :: Arguments: dataset folder, output file, then 50000 2 MAX - presumably the |
| :: vocabulary size, minimum frequency and "all files", matching the messages |
| :: printed here (confirm against nexforgetokenizer.adaptive_tokenizer). |
| echo Creating EZ-Tokenizer with 50k vocabulary and min_freq=2 (all files)... |
| python -m nexforgetokenizer.adaptive_tokenizer "%SCRIPT_DIR%\Dataset" "%SCRIPT_DIR%\output\tokenizer.json" 50000 2 MAX |
|
|
| if errorlevel 1 ( |
|     echo Failed to create tokenizer |
|     pause |
|     goto menu |
| ) |
|
|
| echo. |
| echo EZ-Tokenizer created successfully at: %SCRIPT_DIR%\output\tokenizer.json |
| echo Vocabulary size: 50,000 |
| echo Minimum frequency: 2 |
| echo Processed all available files in the dataset |
| echo. |
| echo You can now use this tokenizer in your projects by loading: output\tokenizer.json |
| pause |
| goto menu |
|
|
| :test_tokenizer |
| :: Menu option 3: run Test_tokenizer\test_tokenizer.py against output\tokenizer.json |
| :: using the Dataset folder as input, write the report to test_result\test_run.txt |
| :: and open the results folder. |
| :: Paths echoed inside parenthesised blocks are quoted: an unquoted ")" in the |
| :: install path would close the block early and abort the script. |
|
| :: The environment is usable only if its activation script exists. |
| if not exist "%SCRIPT_DIR%\venv\Scripts\activate.bat" ( |
|     echo Virtual environment not found. Please install dependencies first. |
|     pause |
|     goto menu |
| ) |
|
|
| call "%SCRIPT_DIR%\venv\Scripts\activate.bat" |
|
|
| :: Create test_result directory if it doesn't exist |
| if not exist "%SCRIPT_DIR%\test_result" mkdir "%SCRIPT_DIR%\test_result" |
|
|
| :: Check if tokenizer exists |
| if not exist "%SCRIPT_DIR%\output\tokenizer.json" ( |
|     echo EZ-Tokenizer not found. Please create a tokenizer first. |
|     echo Looking for: "%SCRIPT_DIR%\output\tokenizer.json" |
|     pause |
|     goto menu |
| ) |
|
|
| echo Running test with 10,000 samples... |
| echo Testing EZ-Tokenizer with 10,000 samples... |
| python "%SCRIPT_DIR%\Test_tokenizer\test_tokenizer.py" --tokenizer "%SCRIPT_DIR%\output\tokenizer.json" --input "%SCRIPT_DIR%\Dataset" --sample 10000 --output "%SCRIPT_DIR%\test_result\test_run.txt" |
|
|
| if errorlevel 1 ( |
|     echo Test run failed |
|     pause |
|     goto menu |
| ) |
|
|
| :: NOTE(review): the message below says "Both test runs", but this section starts |
| :: test_tokenizer.py only once - confirm whether the script itself performs two runs. |
| echo. |
| echo Both test runs completed successfully! |
| echo Results saved to: %SCRIPT_DIR%\test_result\ |
|
|
| :: Open the test results directory |
| if exist "%SCRIPT_DIR%\test_result\" ( |
|     start "" "%SCRIPT_DIR%\test_result\" |
| ) else ( |
|     echo Warning: Test results directory not found. |
| ) |
|
|
| pause |
| goto menu |
|
|
| :open_dataset |
| :: Menu option 4: make sure the Dataset folder exists, open it in a new window |
| :: and go straight back to the menu. |
| if not exist "%SCRIPT_DIR%\Dataset" mkdir "%SCRIPT_DIR%\Dataset" |
| start "" "%SCRIPT_DIR%\Dataset" |
| goto menu |
|
|
| :exit |
| :: Menu option 5: restore the caller's working directory and end the script. |
| :: "exit /b" ends only this batch file; a bare "exit" would also close the cmd |
| :: session that started it, which would make the directory restore pointless. |
| cd /d "%CURRENT_DIR%" |
| echo Exiting EZ-Tokenizer... |
| :: Short delay so the message can be read before a double-clicked window closes. |
| timeout /t 2 >nul |
| exit /b 0 |
|
|