Spaces:
Running
on
Zero
Running
on
Zero
zero
Browse files
app.py
CHANGED
|
@@ -5,11 +5,21 @@ import time
|
|
| 5 |
from threading import Thread
|
| 6 |
import sys
|
| 7 |
import os
|
| 8 |
-
os.environ["BNB_CUDA_VERSION"] = "0" # Forces bitsandbytes to recognize no GPU
|
| 9 |
os.environ["OMP_NUM_THREADS"] = "1" # Prevents race conditions in custom CPU kernels
|
| 10 |
os.environ["VECLIB_MAXIMUM_ISA"] = "AVX2"
|
| 11 |
os.environ["MKL_DEBUG_CPU_TYPE"] = "5" # Forces MKL to use AVX2
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
# Model configuration
|
| 14 |
if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
|
| 15 |
MODEL_NAME = sys.argv[1]
|
|
@@ -67,6 +77,7 @@ def load_model():
|
|
| 67 |
except Exception as e:
|
| 68 |
return f"❌ Error loading model: {str(e)}"
|
| 69 |
|
|
|
|
| 70 |
def chat_predict(message, history, max_length, temperature, top_p, repetition_penalty, system_prompt):
|
| 71 |
"""Generate text using the loaded model with streaming and history"""
|
| 72 |
global model, tokenizer
|
|
@@ -76,6 +87,9 @@ def chat_predict(message, history, max_length, temperature, top_p, repetition_pe
|
|
| 76 |
return
|
| 77 |
|
| 78 |
try:
|
|
|
|
|
|
|
|
|
|
| 79 |
# Prepare messages for chat template
|
| 80 |
messages = []
|
| 81 |
if system_prompt:
|
|
|
|
| 5 |
from threading import Thread
|
| 6 |
import sys
|
| 7 |
import os
|
| 8 |
+
# os.environ["BNB_CUDA_VERSION"] = "0" # Forces bitsandbytes to recognize no GPU
|
| 9 |
os.environ["OMP_NUM_THREADS"] = "1" # Prevents race conditions in custom CPU kernels
|
| 10 |
os.environ["VECLIB_MAXIMUM_ISA"] = "AVX2"
|
| 11 |
os.environ["MKL_DEBUG_CPU_TYPE"] = "5" # Forces MKL to use AVX2
|
| 12 |
|
| 13 |
+
try:
    import spaces
except ImportError:
    # The `spaces` package is optional; without it the decorator below is a no-op.
    spaces = None


def gpu_decorator(func=None, **gpu_options):
    """Wrap ``func`` with ``spaces.GPU`` when the ``spaces`` package is importable.

    Supports both decorator spellings:

        @gpu_decorator
        def f(...): ...

        @gpu_decorator(duration=120)
        def g(...): ...

    Args:
        func: The callable to wrap. Omitted (``None``) when the decorator is
            used with parentheses / keyword options.
        **gpu_options: Keyword options forwarded verbatim to ``spaces.GPU(...)``
            (e.g. ``duration``). Ignored when ``spaces`` is unavailable.

    Returns:
        The wrapped callable, or ``func`` itself unchanged when ``spaces``
        could not be imported. When called without ``func``, returns a
        decorator that performs the wrapping.
    """

    def decorate(target):
        # No `spaces` module: leave the function untouched.
        if not spaces:
            return target
        # Parameterized form: spaces.GPU(**opts) yields the real decorator.
        if gpu_options:
            return spaces.GPU(**gpu_options)(target)
        # Bare form: identical to the original behaviour.
        return spaces.GPU(target)

    if func is None:
        return decorate
    return decorate(func)
|
| 22 |
+
|
| 23 |
# Model configuration
|
| 24 |
if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
|
| 25 |
MODEL_NAME = sys.argv[1]
|
|
|
|
| 77 |
except Exception as e:
|
| 78 |
return f"❌ Error loading model: {str(e)}"
|
| 79 |
|
| 80 |
+
@gpu_decorator
|
| 81 |
def chat_predict(message, history, max_length, temperature, top_p, repetition_penalty, system_prompt):
|
| 82 |
"""Generate text using the loaded model with streaming and history"""
|
| 83 |
global model, tokenizer
|
|
|
|
| 87 |
return
|
| 88 |
|
| 89 |
try:
|
| 90 |
+
if torch.cuda.is_available():
|
| 91 |
+
model.to("cuda")
|
| 92 |
+
|
| 93 |
# Prepare messages for chat template
|
| 94 |
messages = []
|
| 95 |
if system_prompt:
|