now using TheBloke/Falcon-180B-Chat-GPTQ
app.py CHANGED

@@ -10,7 +10,7 @@ print("Loading optimized Mistral model...")
 try:
     # First try: AWQ quantized model (best performance)
     print("Attempting to load AWQ model...")
-    tokenizer = AutoTokenizer.from_pretrained("TheBloke/
+    tokenizer = AutoTokenizer.from_pretrained("TheBloke/Falcon-180B-Chat-GPTQ")
     model = AutoModelForCausalLM.from_pretrained(
         "TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
         device_map="auto",
@@ -25,9 +25,9 @@ except Exception as e:
 try:
     # Second try: Use a smaller, more compatible model
     print("Falling back to Mistral-7B-Instruct-v0.1 (more compatible)...")
-    tokenizer = AutoTokenizer.from_pretrained("TheBloke/
+    tokenizer = AutoTokenizer.from_pretrained("TheBloke/Falcon-180B-Chat-GPTQ")
     model = AutoModelForCausalLM.from_pretrained(
-        "TheBloke/
+        "TheBloke/Falcon-180B-Chat-GPTQ",
         device_map="auto",
         torch_dtype=torch.float16,
         low_cpu_mem_usage=True,
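For context, the two hunks above edit a try/except loading chain in app.py. The sketch below shows what that chain plausibly looks like after this commit; only the diffed and context lines are confirmed, so the imports, the exception-handler bodies, and the trailing arguments of the first from_pretrained call are assumptions. Note that the first hunk updates only the tokenizer line, so the first attempt now pairs the Falcon-180B-Chat-GPTQ tokenizer with the unchanged Mistral-7B-Instruct-v0.2-AWQ weights.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

try:
    # First try: AWQ quantized model (best performance). Only the tokenizer
    # repo was changed by this commit; the model repo below was not.
    print("Attempting to load AWQ model...")
    tokenizer = AutoTokenizer.from_pretrained("TheBloke/Falcon-180B-Chat-GPTQ")
    model = AutoModelForCausalLM.from_pretrained(
        "TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
        device_map="auto",
        torch_dtype=torch.float16,   # assumed: hunk 1 is cut off after device_map
        low_cpu_mem_usage=True,      # assumed
    )
except Exception as e:
    print(f"AWQ load failed: {e}")   # assumed handler body
    try:
        # Second try: after this commit both calls point at the GPTQ repo.
        # Loading GPTQ weights through AutoModelForCausalLM requires the
        # optimum / auto-gptq backends to be installed.
        print("Falling back to Mistral-7B-Instruct-v0.1 (more compatible)...")
        tokenizer = AutoTokenizer.from_pretrained("TheBloke/Falcon-180B-Chat-GPTQ")
        model = AutoModelForCausalLM.from_pretrained(
            "TheBloke/Falcon-180B-Chat-GPTQ",
            device_map="auto",
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
        )
    except Exception as e2:
        raise RuntimeError(f"Both load attempts failed: {e2}")  # assumed

Two things stand out in the resulting code: the fallback print still names Mistral-7B-Instruct-v0.1 even though the repo it loads is now Falcon-180B-Chat-GPTQ, and the first attempt mixes a Falcon tokenizer with Mistral weights, whose vocabularies do not match, so only the fallback branch pairs consistent components.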