Commit
·
f3cac53
1
Parent(s):
8effb79
change up setups
Browse files
didn't test linux but should work
auto-exl2-upload/auto-exl2-upload.zip
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d112e7bf1d8f4f6f42c961edb46f89f8356ec5265798b493f3d6b55e2c994376
|
| 3 |
+
size 8585
|
auto-exl2-upload/linux-setup.sh
CHANGED
|
@@ -40,7 +40,7 @@ fi
|
|
| 40 |
read -p "Please enter your GPU compute version, CUDA 11/12 or AMD ROCm (11, 12, rocm): " pytorch_version
|
| 41 |
|
| 42 |
# ask to install flash attention
|
| 43 |
-
echo "Flash attention is a feature that could fix overflow issues on some more broken models."
|
| 44 |
read -p "Would you like to install flash-attention? (rarely needed and optional) (y/n) " flash_attention
|
| 45 |
if [ "$flash_attention" != "y" ] && [ "$flash_attention" != "n" ]; then
|
| 46 |
echo "Invalid input. Please enter y or n."
|
|
@@ -69,7 +69,6 @@ rm download-model.py
|
|
| 69 |
rm -rf exllamav2
|
| 70 |
rm start-quant.sh
|
| 71 |
rm enter-venv.sh
|
| 72 |
-
rm -rf flash-attention
|
| 73 |
|
| 74 |
# download stuff
|
| 75 |
echo "Downloading files"
|
|
@@ -87,13 +86,7 @@ venv/bin/python -m pip install -r exllamav2/requirements.txt
|
|
| 87 |
venv/bin/python -m pip install huggingface-hub transformers accelerate
|
| 88 |
venv/bin/python -m pip install ./exllamav2
|
| 89 |
|
| 90 |
-
|
| 91 |
-
echo "Installing flash-attention..."
|
| 92 |
-
echo "If failed, retry without flash-attention."
|
| 93 |
-
git clone https://github.com/Dao-AILab/flash-attention
|
| 94 |
-
venv/bin/python -m pip install ./flash-attention
|
| 95 |
-
rm -rf flash-attention
|
| 96 |
-
fi
|
| 97 |
|
| 98 |
# create start-quant.sh
|
| 99 |
echo "#!/bin/bash" > start-quant.sh
|
|
@@ -107,6 +100,15 @@ echo "#!/bin/bash" > enter-venv.sh
|
|
| 107 |
echo "bash --init-file venv/bin/activate" >> enter-venv.sh
|
| 108 |
chmod +x enter-venv.sh
|
| 109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
echo "If you use ctrl+c to stop, you may need to also use 'pkill python' to stop running scripts."
|
| 111 |
echo "Environment setup complete. run start-quant.sh to start the quantization process."
|
| 112 |
read -p "Press enter to exit"
|
|
|
|
| 40 |
read -p "Please enter your GPU compute version, CUDA 11/12 or AMD ROCm (11, 12, rocm): " pytorch_version
|
| 41 |
|
| 42 |
# ask to install flash attention
|
| 43 |
+
echo "Flash attention is a feature that could fix overflow issues on some more broken models, however, it will increase install time by a few hours."
|
| 44 |
read -p "Would you like to install flash-attention? (rarely needed and optional) (y/n) " flash_attention
|
| 45 |
if [ "$flash_attention" != "y" ] && [ "$flash_attention" != "n" ]; then
|
| 46 |
echo "Invalid input. Please enter y or n."
|
|
|
|
| 69 |
rm -rf exllamav2
|
| 70 |
rm start-quant.sh
|
| 71 |
rm enter-venv.sh
|
|
|
|
| 72 |
|
| 73 |
# download stuff
|
| 74 |
echo "Downloading files"
|
|
|
|
| 86 |
venv/bin/python -m pip install huggingface-hub transformers accelerate
|
| 87 |
venv/bin/python -m pip install ./exllamav2
|
| 88 |
|
| 89 |
+
echo "Writing shell files..."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
# create start-quant.sh
|
| 92 |
echo "#!/bin/bash" > start-quant.sh
|
|
|
|
| 100 |
echo "bash --init-file venv/bin/activate" >> enter-venv.sh
|
| 101 |
chmod +x enter-venv.sh
|
| 102 |
|
| 103 |
+
if [ "$flash_attention" = "y" ]; then
|
| 104 |
+
echo "Going to attempt to install flash attention but it isn't required."
|
| 105 |
+
echo "You may close now if you'd like and continue without flash attention."
|
| 106 |
+
read -p "Press enter to continue and install flash attention"
|
| 107 |
+
echo "Get some popcorn and watch a movie, this will take a while."
|
| 108 |
+
echo "Installing flash-attn..."
|
| 109 |
+
venv/bin/python -m pip install git+https://github.com/Dao-AILab/flash-attention.git
|
| 110 |
+
fi
|
| 111 |
+
|
| 112 |
echo "If you use ctrl+c to stop, you may need to also use 'pkill python' to stop running scripts."
|
| 113 |
echo "Environment setup complete. run start-quant.sh to start the quantization process."
|
| 114 |
read -p "Press enter to exit"
|
auto-exl2-upload/windows-setup.bat
CHANGED
|
@@ -43,7 +43,7 @@ where nvcc
|
|
| 43 |
set /p cuda_version="Please enter your CUDA version (11 or 12): "
|
| 44 |
|
| 45 |
REM ask to install flash attention
|
| 46 |
-
echo Flash attention is a feature that could fix overflow issues on some more broken models
|
| 47 |
set /p flash_attention="Would you like to install flash-attention? (rarely needed and optional) (y/n) "
|
| 48 |
if not "%flash_attention%"=="y" if not "%flash_attention%"=="n" (
|
| 49 |
echo Invalid input. Please enter y or n.
|
|
@@ -69,7 +69,6 @@ del download-model.py
|
|
| 69 |
rmdir /s /q exllamav2
|
| 70 |
del start-quant.bat
|
| 71 |
del enter-venv.bat
|
| 72 |
-
rmdir /s /q flash-attention
|
| 73 |
|
| 74 |
REM download stuff
|
| 75 |
echo Downloading files...
|
|
@@ -87,13 +86,7 @@ venv\scripts\python.exe -m pip install -r exllamav2/requirements.txt
|
|
| 87 |
venv\scripts\python.exe -m pip install huggingface-hub transformers accelerate
|
| 88 |
venv\scripts\python.exe -m pip install .\exllamav2
|
| 89 |
|
| 90 |
-
|
| 91 |
-
echo Installing flash-attention. Go watch some movies, this will take a while...
|
| 92 |
-
echo If failed, retry without flash-attention.
|
| 93 |
-
git clone https://github.com/Dao-AILab/flash-attention
|
| 94 |
-
venv\scripts\python.exe -m pip install .\flash-attention
|
| 95 |
-
rmdir /s /q flash-attention
|
| 96 |
-
)
|
| 97 |
|
| 98 |
REM create start-quant-windows.bat
|
| 99 |
echo @echo off > start-quant.bat
|
|
@@ -106,6 +99,15 @@ REM create enter-venv.bat
|
|
| 106 |
echo @echo off > enter-venv.bat
|
| 107 |
echo cmd /k call venv\scripts\activate.bat >> enter-venv.bat
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
powershell -c (New-Object Media.SoundPlayer "C:\Windows\Media\tada.wav").PlaySync();
|
| 110 |
echo Environment setup complete. run start-quant.bat to start the quantization process.
|
| 111 |
pause
|
|
|
|
| 43 |
set /p cuda_version="Please enter your CUDA version (11 or 12): "
|
| 44 |
|
| 45 |
REM ask to install flash attention
|
| 46 |
+
echo Flash attention is a feature that could fix overflow issues on some more broken models, however, it will increase install time by a few hours.
|
| 47 |
set /p flash_attention="Would you like to install flash-attention? (rarely needed and optional) (y/n) "
|
| 48 |
if not "%flash_attention%"=="y" if not "%flash_attention%"=="n" (
|
| 49 |
echo Invalid input. Please enter y or n.
|
|
|
|
| 69 |
rmdir /s /q exllamav2
|
| 70 |
del start-quant.bat
|
| 71 |
del enter-venv.bat
|
|
|
|
| 72 |
|
| 73 |
REM download stuff
|
| 74 |
echo Downloading files...
|
|
|
|
| 86 |
venv\scripts\python.exe -m pip install huggingface-hub transformers accelerate
|
| 87 |
venv\scripts\python.exe -m pip install .\exllamav2
|
| 88 |
|
| 89 |
+
echo Writing batch files...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
REM create start-quant-windows.bat
|
| 92 |
echo @echo off > start-quant.bat
|
|
|
|
| 99 |
echo @echo off > enter-venv.bat
|
| 100 |
echo cmd /k call venv\scripts\activate.bat >> enter-venv.bat
|
| 101 |
|
| 102 |
+
if "%flash_attention%"=="y" (
|
| 103 |
+
echo Going to attempt to install flash attention but it isn't required.
|
| 104 |
+
echo You may close now if you'd like and continue without flash attention.
|
| 105 |
+
pause
|
| 106 |
+
echo Get some popcorn and watch a movie. This will take a while.
|
| 107 |
+
echo Installing flash-attn...
|
| 108 |
+
venv\scripts\python.exe -m pip install git+https://github.com/Dao-AILab/flash-attention.git
|
| 109 |
+
)
|
| 110 |
+
|
| 111 |
powershell -c (New-Object Media.SoundPlayer "C:\Windows\Media\tada.wav").PlaySync();
|
| 112 |
echo Environment setup complete. run start-quant.bat to start the quantization process.
|
| 113 |
pause
|
exl2-multi-quant-local/exl2-multi-quant-local.zip
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:96d89522925670652ab7ea1d6152a4e64c15302a940c9753a37345f2e9a06e58
|
| 3 |
+
size 7408
|
exl2-multi-quant-local/linux-setup.sh
CHANGED
|
@@ -40,7 +40,7 @@ fi
|
|
| 40 |
read -p "Please enter your GPU compute version, CUDA 11/12 or AMD ROCm (11, 12, rocm): " pytorch_version
|
| 41 |
|
| 42 |
# ask to install flash attention
|
| 43 |
-
echo "Flash attention is a feature that could fix overflow issues on some more broken models."
|
| 44 |
read -p "Would you like to install flash-attention? (rarely needed and optional) (y/n) " flash_attention
|
| 45 |
if [ "$flash_attention" != "y" ] && [ "$flash_attention" != "n" ]; then
|
| 46 |
echo "Invalid input. Please enter y or n."
|
|
@@ -69,7 +69,6 @@ rm download-model.py
|
|
| 69 |
rm -rf exllamav2
|
| 70 |
rm start-quant.sh
|
| 71 |
rm enter-venv.sh
|
| 72 |
-
rm -rf flash-attention
|
| 73 |
|
| 74 |
# download stuff
|
| 75 |
echo "Downloading files"
|
|
@@ -87,13 +86,7 @@ venv/bin/python -m pip install -r exllamav2/requirements.txt
|
|
| 87 |
venv/bin/python -m pip install huggingface-hub transformers accelerate
|
| 88 |
venv/bin/python -m pip install ./exllamav2
|
| 89 |
|
| 90 |
-
|
| 91 |
-
echo "Installing flash-attention..."
|
| 92 |
-
echo "If failed, retry without flash-attention."
|
| 93 |
-
git clone https://github.com/Dao-AILab/flash-attention
|
| 94 |
-
venv/bin/python -m pip install ./flash-attention
|
| 95 |
-
rm -rf flash-attention
|
| 96 |
-
fi
|
| 97 |
|
| 98 |
# create start-quant.sh
|
| 99 |
echo "#!/bin/bash" > start-quant.sh
|
|
@@ -107,6 +100,15 @@ echo "#!/bin/bash" > enter-venv.sh
|
|
| 107 |
echo "bash --init-file venv/bin/activate" >> enter-venv.sh
|
| 108 |
chmod +x enter-venv.sh
|
| 109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
echo "If you use ctrl+c to stop, you may need to also use 'pkill python' to stop running scripts."
|
| 111 |
echo "Environment setup complete. run start-quant.sh to start the quantization process."
|
| 112 |
read -p "Press enter to exit"
|
|
|
|
| 40 |
read -p "Please enter your GPU compute version, CUDA 11/12 or AMD ROCm (11, 12, rocm): " pytorch_version
|
| 41 |
|
| 42 |
# ask to install flash attention
|
| 43 |
+
echo "Flash attention is a feature that could fix overflow issues on some more broken models, however, it will increase install time by a few hours."
|
| 44 |
read -p "Would you like to install flash-attention? (rarely needed and optional) (y/n) " flash_attention
|
| 45 |
if [ "$flash_attention" != "y" ] && [ "$flash_attention" != "n" ]; then
|
| 46 |
echo "Invalid input. Please enter y or n."
|
|
|
|
| 69 |
rm -rf exllamav2
|
| 70 |
rm start-quant.sh
|
| 71 |
rm enter-venv.sh
|
|
|
|
| 72 |
|
| 73 |
# download stuff
|
| 74 |
echo "Downloading files"
|
|
|
|
| 86 |
venv/bin/python -m pip install huggingface-hub transformers accelerate
|
| 87 |
venv/bin/python -m pip install ./exllamav2
|
| 88 |
|
| 89 |
+
echo "Writing shell files..."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
# create start-quant.sh
|
| 92 |
echo "#!/bin/bash" > start-quant.sh
|
|
|
|
| 100 |
echo "bash --init-file venv/bin/activate" >> enter-venv.sh
|
| 101 |
chmod +x enter-venv.sh
|
| 102 |
|
| 103 |
+
if [ "$flash_attention" = "y" ]; then
|
| 104 |
+
echo "Going to attempt to install flash attention but it isn't required."
|
| 105 |
+
echo "You may close now if you'd like and continue without flash attention."
|
| 106 |
+
read -p "Press enter to continue and install flash attention"
|
| 107 |
+
echo "Get some popcorn and watch a movie, this will take a while."
|
| 108 |
+
echo "Installing flash-attn..."
|
| 109 |
+
venv/bin/python -m pip install git+https://github.com/Dao-AILab/flash-attention.git
|
| 110 |
+
fi
|
| 111 |
+
|
| 112 |
echo "If you use ctrl+c to stop, you may need to also use 'pkill python' to stop running scripts."
|
| 113 |
echo "Environment setup complete. run start-quant.sh to start the quantization process."
|
| 114 |
read -p "Press enter to exit"
|
exl2-multi-quant-local/windows-setup.bat
CHANGED
|
@@ -43,7 +43,7 @@ where nvcc
|
|
| 43 |
set /p cuda_version="Please enter your CUDA version (11 or 12): "
|
| 44 |
|
| 45 |
REM ask to install flash attention
|
| 46 |
-
echo Flash attention is a feature that could fix overflow issues on some more broken models
|
| 47 |
set /p flash_attention="Would you like to install flash-attention? (rarely needed and optional) (y/n) "
|
| 48 |
if not "%flash_attention%"=="y" if not "%flash_attention%"=="n" (
|
| 49 |
echo Invalid input. Please enter y or n.
|
|
@@ -69,7 +69,6 @@ del download-model.py
|
|
| 69 |
rmdir /s /q exllamav2
|
| 70 |
del start-quant.bat
|
| 71 |
del enter-venv.bat
|
| 72 |
-
rmdir /s /q flash-attention
|
| 73 |
|
| 74 |
REM download stuff
|
| 75 |
echo Downloading files...
|
|
@@ -87,13 +86,7 @@ venv\scripts\python.exe -m pip install -r exllamav2/requirements.txt
|
|
| 87 |
venv\scripts\python.exe -m pip install huggingface-hub transformers accelerate
|
| 88 |
venv\scripts\python.exe -m pip install .\exllamav2
|
| 89 |
|
| 90 |
-
|
| 91 |
-
echo Installing flash-attention. Go watch some movies, this will take a while...
|
| 92 |
-
echo If failed, retry without flash-attention.
|
| 93 |
-
git clone https://github.com/Dao-AILab/flash-attention
|
| 94 |
-
venv\scripts\python.exe -m pip install .\flash-attention
|
| 95 |
-
rmdir /s /q flash-attention
|
| 96 |
-
)
|
| 97 |
|
| 98 |
REM create start-quant-windows.bat
|
| 99 |
echo @echo off > start-quant.bat
|
|
@@ -106,6 +99,15 @@ REM create enter-venv.bat
|
|
| 106 |
echo @echo off > enter-venv.bat
|
| 107 |
echo cmd /k call venv\scripts\activate.bat >> enter-venv.bat
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
powershell -c (New-Object Media.SoundPlayer "C:\Windows\Media\tada.wav").PlaySync();
|
| 110 |
echo Environment setup complete. run start-quant.bat to start the quantization process.
|
| 111 |
pause
|
|
|
|
| 43 |
set /p cuda_version="Please enter your CUDA version (11 or 12): "
|
| 44 |
|
| 45 |
REM ask to install flash attention
|
| 46 |
+
echo Flash attention is a feature that could fix overflow issues on some more broken models, however, it will increase install time by a few hours.
|
| 47 |
set /p flash_attention="Would you like to install flash-attention? (rarely needed and optional) (y/n) "
|
| 48 |
if not "%flash_attention%"=="y" if not "%flash_attention%"=="n" (
|
| 49 |
echo Invalid input. Please enter y or n.
|
|
|
|
| 69 |
rmdir /s /q exllamav2
|
| 70 |
del start-quant.bat
|
| 71 |
del enter-venv.bat
|
|
|
|
| 72 |
|
| 73 |
REM download stuff
|
| 74 |
echo Downloading files...
|
|
|
|
| 86 |
venv\scripts\python.exe -m pip install huggingface-hub transformers accelerate
|
| 87 |
venv\scripts\python.exe -m pip install .\exllamav2
|
| 88 |
|
| 89 |
+
echo Writing batch files...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
REM create start-quant-windows.bat
|
| 92 |
echo @echo off > start-quant.bat
|
|
|
|
| 99 |
echo @echo off > enter-venv.bat
|
| 100 |
echo cmd /k call venv\scripts\activate.bat >> enter-venv.bat
|
| 101 |
|
| 102 |
+
if "%flash_attention%"=="y" (
|
| 103 |
+
echo Going to attempt to install flash attention but it isn't required.
|
| 104 |
+
echo You may close now if you'd like and continue without flash attention.
|
| 105 |
+
pause
|
| 106 |
+
echo Get some popcorn and watch a movie. This will take a while.
|
| 107 |
+
echo Installing flash-attn...
|
| 108 |
+
venv\scripts\python.exe -m pip install git+https://github.com/Dao-AILab/flash-attention.git
|
| 109 |
+
)
|
| 110 |
+
|
| 111 |
powershell -c (New-Object Media.SoundPlayer "C:\Windows\Media\tada.wav").PlaySync();
|
| 112 |
echo Environment setup complete. run start-quant.bat to start the quantization process.
|
| 113 |
pause
|