change up setups

Browse files

Didn't test on Linux, but it should work.

Files changed (6) hide show

auto-exl2-upload/auto-exl2-upload.zip +2 -2
auto-exl2-upload/linux-setup.sh +11 -9
auto-exl2-upload/windows-setup.bat +11 -9
exl2-multi-quant-local/exl2-multi-quant-local.zip +2 -2
exl2-multi-quant-local/linux-setup.sh +11 -9
exl2-multi-quant-local/windows-setup.bat +11 -9

auto-exl2-upload/auto-exl2-upload.zip CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:64bc897ec0699349f1f1d6e6a9cd9f2e4c8e94d4de6e45453603afe8f93f6803
-size 8403

 version https://git-lfs.github.com/spec/v1
+oid sha256:d112e7bf1d8f4f6f42c961edb46f89f8356ec5265798b493f3d6b55e2c994376
+size 8585

auto-exl2-upload/linux-setup.sh CHANGED Viewed

@@ -40,7 +40,7 @@ fi
 read -p "Please enter your GPU compute version, CUDA 11/12 or AMD ROCm (11, 12, rocm): " pytorch_version
 # ask to install flash attention
-echo "Flash attention is a feature that could fix overflow issues on some more broken models."
 read -p "Would you like to install flash-attention? (rarely needed and optional) (y/n) " flash_attention
 if [ "$flash_attention" != "y" ] && [ "$flash_attention" != "n" ]; then
     echo "Invalid input. Please enter y or n."
@@ -69,7 +69,6 @@ rm download-model.py
 rm -rf exllamav2
 rm start-quant.sh
 rm enter-venv.sh
-rm -rf flash-attention
 # download stuff
 echo "Downloading files"
@@ -87,13 +86,7 @@ venv/bin/python -m pip install -r exllamav2/requirements.txt
 venv/bin/python -m pip install huggingface-hub transformers accelerate
 venv/bin/python -m pip install ./exllamav2
-if [ "$flash_attention" = "y" ]; then
-    echo "Installing flash-attention..."
-    echo "If failed, retry without flash-attention."
-    git clone https://github.com/Dao-AILab/flash-attention
-    venv/bin/python -m pip install ./flash-attention
-    rm -rf flash-attention
-fi
 # create start-quant.sh
 echo "#!/bin/bash" > start-quant.sh
@@ -107,6 +100,15 @@ echo "#!/bin/bash" > enter-venv.sh
 echo "bash --init-file venv/bin/activate" >> enter-venv.sh
 chmod +x enter-venv.sh
 echo "If you use ctrl+c to stop, you may need to also use 'pkill python' to stop running scripts."
 echo "Environment setup complete. run start-quant.sh to start the quantization process."
 read -p "Press enter to exit"

 read -p "Please enter your GPU compute version, CUDA 11/12 or AMD ROCm (11, 12, rocm): " pytorch_version
 # ask to install flash attention
+echo "Flash attention is a feature that could fix overflow issues on some more broken models, however, it will increase install time by a few hours."
 read -p "Would you like to install flash-attention? (rarely needed and optional) (y/n) " flash_attention
 if [ "$flash_attention" != "y" ] && [ "$flash_attention" != "n" ]; then
     echo "Invalid input. Please enter y or n."
 rm -rf exllamav2
 rm start-quant.sh
 rm enter-venv.sh
 # download stuff
 echo "Downloading files"
 venv/bin/python -m pip install huggingface-hub transformers accelerate
 venv/bin/python -m pip install ./exllamav2
+echo "Writing shell files..."
 # create start-quant.sh
 echo "#!/bin/bash" > start-quant.sh
 echo "bash --init-file venv/bin/activate" >> enter-venv.sh
 chmod +x enter-venv.sh
+if [ "$flash_attention" = "y" ]; then
+    echo "Going to attempt to install flash attention but it isn't required."
+    echo "You may close now if you'd like and continue without flash attention."
+    read -p "Press enter to continue and install flash attention"
+    echo "Get some popcorn and watch a movie, this will take a while."
+    echo "Installing flash-attn..."
+    venv/bin/python -m pip install git+https://github.com/Dao-AILab/flash-attention.git
+fi
 echo "If you use ctrl+c to stop, you may need to also use 'pkill python' to stop running scripts."
 echo "Environment setup complete. run start-quant.sh to start the quantization process."
 read -p "Press enter to exit"

auto-exl2-upload/windows-setup.bat CHANGED Viewed

@@ -43,7 +43,7 @@ where nvcc
 set /p cuda_version="Please enter your CUDA version (11 or 12): "
 REM ask to install flash attention
-echo Flash attention is a feature that could fix overflow issues on some more broken models. However it will increase install time by a few hours.
 set /p flash_attention="Would you like to install flash-attention? (rarely needed and optional) (y/n) "
 if not "%flash_attention%"=="y" if not "%flash_attention%"=="n" (
     echo Invalid input. Please enter y or n.
@@ -69,7 +69,6 @@ del download-model.py
 rmdir /s /q exllamav2
 del start-quant.bat
 del enter-venv.bat
-rmdir /s /q flash-attention
 REM download stuff
 echo Downloading files...
@@ -87,13 +86,7 @@ venv\scripts\python.exe -m pip install -r exllamav2/requirements.txt
 venv\scripts\python.exe -m pip install huggingface-hub transformers accelerate
 venv\scripts\python.exe -m pip install .\exllamav2
-if "%flash_attention%"=="y" (
-    echo Installing flash-attention. Go watch some movies, this will take a while...
-    echo If failed, retry without flash-attention.
-    git clone https://github.com/Dao-AILab/flash-attention
-    venv\scripts\python.exe -m pip install .\flash-attention
-    rmdir /s /q flash-attention
-)
 REM create start-quant-windows.bat
 echo @echo off > start-quant.bat
@@ -106,6 +99,15 @@ REM create enter-venv.bat
 echo @echo off > enter-venv.bat
 echo cmd /k call venv\scripts\activate.bat >> enter-venv.bat
 powershell -c (New-Object Media.SoundPlayer "C:\Windows\Media\tada.wav").PlaySync();
 echo Environment setup complete. run start-quant.bat to start the quantization process.
 pause

 set /p cuda_version="Please enter your CUDA version (11 or 12): "
 REM ask to install flash attention
+echo Flash attention is a feature that could fix overflow issues on some more broken models, however, it will increase install time by a few hours.
 set /p flash_attention="Would you like to install flash-attention? (rarely needed and optional) (y/n) "
 if not "%flash_attention%"=="y" if not "%flash_attention%"=="n" (
     echo Invalid input. Please enter y or n.
 rmdir /s /q exllamav2
 del start-quant.bat
 del enter-venv.bat
 REM download stuff
 echo Downloading files...
 venv\scripts\python.exe -m pip install huggingface-hub transformers accelerate
 venv\scripts\python.exe -m pip install .\exllamav2
+echo Writing batch files...
 REM create start-quant-windows.bat
 echo @echo off > start-quant.bat
 echo @echo off > enter-venv.bat
 echo cmd /k call venv\scripts\activate.bat >> enter-venv.bat
+if "%flash_attention%"=="y" (
+    echo Going to attempt to install flash attention but it isn't required.
+    echo You may close now if you'd like and continue without flash attention.
+    pause
+    echo Get some popcorn and watch a movie. This will take a while.
+    echo Installing flash-attn...
+    venv\scripts\python.exe -m pip install git+https://github.com/Dao-AILab/flash-attention.git
+)
 powershell -c (New-Object Media.SoundPlayer "C:\Windows\Media\tada.wav").PlaySync();
 echo Environment setup complete. run start-quant.bat to start the quantization process.
 pause

exl2-multi-quant-local/exl2-multi-quant-local.zip CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5bd8f0bfff817ece26c0fe1a0886c2851f761386b61fe5d53e69b080341a634a
-size 7226

 version https://git-lfs.github.com/spec/v1
+oid sha256:96d89522925670652ab7ea1d6152a4e64c15302a940c9753a37345f2e9a06e58
+size 7408

exl2-multi-quant-local/linux-setup.sh CHANGED Viewed

@@ -40,7 +40,7 @@ fi
 read -p "Please enter your GPU compute version, CUDA 11/12 or AMD ROCm (11, 12, rocm): " pytorch_version
 # ask to install flash attention
-echo "Flash attention is a feature that could fix overflow issues on some more broken models."
 read -p "Would you like to install flash-attention? (rarely needed and optional) (y/n) " flash_attention
 if [ "$flash_attention" != "y" ] && [ "$flash_attention" != "n" ]; then
     echo "Invalid input. Please enter y or n."
@@ -69,7 +69,6 @@ rm download-model.py
 rm -rf exllamav2
 rm start-quant.sh
 rm enter-venv.sh
-rm -rf flash-attention
 # download stuff
 echo "Downloading files"
@@ -87,13 +86,7 @@ venv/bin/python -m pip install -r exllamav2/requirements.txt
 venv/bin/python -m pip install huggingface-hub transformers accelerate
 venv/bin/python -m pip install ./exllamav2
-if [ "$flash_attention" = "y" ]; then
-    echo "Installing flash-attention..."
-    echo "If failed, retry without flash-attention."
-    git clone https://github.com/Dao-AILab/flash-attention
-    venv/bin/python -m pip install ./flash-attention
-    rm -rf flash-attention
-fi
 # create start-quant.sh
 echo "#!/bin/bash" > start-quant.sh
@@ -107,6 +100,15 @@ echo "#!/bin/bash" > enter-venv.sh
 echo "bash --init-file venv/bin/activate" >> enter-venv.sh
 chmod +x enter-venv.sh
 echo "If you use ctrl+c to stop, you may need to also use 'pkill python' to stop running scripts."
 echo "Environment setup complete. run start-quant.sh to start the quantization process."
 read -p "Press enter to exit"

 read -p "Please enter your GPU compute version, CUDA 11/12 or AMD ROCm (11, 12, rocm): " pytorch_version
 # ask to install flash attention
+echo "Flash attention is a feature that could fix overflow issues on some more broken models, however, it will increase install time by a few hours."
 read -p "Would you like to install flash-attention? (rarely needed and optional) (y/n) " flash_attention
 if [ "$flash_attention" != "y" ] && [ "$flash_attention" != "n" ]; then
     echo "Invalid input. Please enter y or n."
 rm -rf exllamav2
 rm start-quant.sh
 rm enter-venv.sh
 # download stuff
 echo "Downloading files"
 venv/bin/python -m pip install huggingface-hub transformers accelerate
 venv/bin/python -m pip install ./exllamav2
+echo "Writing shell files..."
 # create start-quant.sh
 echo "#!/bin/bash" > start-quant.sh
 echo "bash --init-file venv/bin/activate" >> enter-venv.sh
 chmod +x enter-venv.sh
+if [ "$flash_attention" = "y" ]; then
+    echo "Going to attempt to install flash attention but it isn't required."
+    echo "You may close now if you'd like and continue without flash attention."
+    read -p "Press enter to continue and install flash attention"
+    echo "Get some popcorn and watch a movie, this will take a while."
+    echo "Installing flash-attn..."
+    venv/bin/python -m pip install git+https://github.com/Dao-AILab/flash-attention.git
+fi
 echo "If you use ctrl+c to stop, you may need to also use 'pkill python' to stop running scripts."
 echo "Environment setup complete. run start-quant.sh to start the quantization process."
 read -p "Press enter to exit"

exl2-multi-quant-local/windows-setup.bat CHANGED Viewed

@@ -43,7 +43,7 @@ where nvcc
 set /p cuda_version="Please enter your CUDA version (11 or 12): "
 REM ask to install flash attention
-echo Flash attention is a feature that could fix overflow issues on some more broken models. However it will increase install time by a few hours.
 set /p flash_attention="Would you like to install flash-attention? (rarely needed and optional) (y/n) "
 if not "%flash_attention%"=="y" if not "%flash_attention%"=="n" (
     echo Invalid input. Please enter y or n.
@@ -69,7 +69,6 @@ del download-model.py
 rmdir /s /q exllamav2
 del start-quant.bat
 del enter-venv.bat
-rmdir /s /q flash-attention
 REM download stuff
 echo Downloading files...
@@ -87,13 +86,7 @@ venv\scripts\python.exe -m pip install -r exllamav2/requirements.txt
 venv\scripts\python.exe -m pip install huggingface-hub transformers accelerate
 venv\scripts\python.exe -m pip install .\exllamav2
-if "%flash_attention%"=="y" (
-    echo Installing flash-attention. Go watch some movies, this will take a while...
-    echo If failed, retry without flash-attention.
-    git clone https://github.com/Dao-AILab/flash-attention
-    venv\scripts\python.exe -m pip install .\flash-attention
-    rmdir /s /q flash-attention
-)
 REM create start-quant-windows.bat
 echo @echo off > start-quant.bat
@@ -106,6 +99,15 @@ REM create enter-venv.bat
 echo @echo off > enter-venv.bat
 echo cmd /k call venv\scripts\activate.bat >> enter-venv.bat
 powershell -c (New-Object Media.SoundPlayer "C:\Windows\Media\tada.wav").PlaySync();
 echo Environment setup complete. run start-quant.bat to start the quantization process.
 pause

 set /p cuda_version="Please enter your CUDA version (11 or 12): "
 REM ask to install flash attention
+echo Flash attention is a feature that could fix overflow issues on some more broken models, however, it will increase install time by a few hours.
 set /p flash_attention="Would you like to install flash-attention? (rarely needed and optional) (y/n) "
 if not "%flash_attention%"=="y" if not "%flash_attention%"=="n" (
     echo Invalid input. Please enter y or n.
 rmdir /s /q exllamav2
 del start-quant.bat
 del enter-venv.bat
 REM download stuff
 echo Downloading files...
 venv\scripts\python.exe -m pip install huggingface-hub transformers accelerate
 venv\scripts\python.exe -m pip install .\exllamav2
+echo Writing batch files...
 REM create start-quant-windows.bat
 echo @echo off > start-quant.bat
 echo @echo off > enter-venv.bat
 echo cmd /k call venv\scripts\activate.bat >> enter-venv.bat
+if "%flash_attention%"=="y" (
+    echo Going to attempt to install flash attention but it isn't required.
+    echo You may close now if you'd like and continue without flash attention.
+    pause
+    echo Get some popcorn and watch a movie. This will take a while.
+    echo Installing flash-attn...
+    venv\scripts\python.exe -m pip install git+https://github.com/Dao-AILab/flash-attention.git
+)
 powershell -c (New-Object Media.SoundPlayer "C:\Windows\Media\tada.wav").PlaySync();
 echo Environment setup complete. run start-quant.bat to start the quantization process.
 pause