Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import torch
|
| 3 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
| 4 |
from transformers.cache_utils import DynamicCache
|
| 5 |
import os
|
| 6 |
from time import time
|
|
@@ -84,17 +84,11 @@ def calculate_cache_size(cache):
|
|
| 84 |
@st.cache_resource
|
| 85 |
def load_model_and_tokenizer(doc_text_count):
|
| 86 |
model_name = "google/gemma-3-4b-it" # Configure quantization for 4-bit loading
|
| 87 |
-
quantization_config = BitsAndBytesConfig(
|
| 88 |
-
load_in_4bit=True, # Enable 4-bit quantization
|
| 89 |
-
bnb_4bit_compute_dtype=torch.float16, # Set computation precision
|
| 90 |
-
bnb_4bit_quant_type="nf4", # Use Normal Float 4 (NF4) quantization
|
| 91 |
-
bnb_4bit_use_double_quant=True, # Enable double quantization
|
| 92 |
-
)
|
| 93 |
# Load the pre-trained model with quantization
|
| 94 |
model = AutoModelForCausalLM.from_pretrained(
|
| 95 |
model_name,
|
| 96 |
device_map="auto", # Automatically allocate model to devices
|
| 97 |
-
quantization_config=quantization_config,
|
| 98 |
model_max_length=1.3*round(doc_text_count * 0.3 + 1),
|
| 99 |
token=HF_TOKEN
|
| 100 |
)
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import torch
|
| 3 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 4 |
from transformers.cache_utils import DynamicCache
|
| 5 |
import os
|
| 6 |
from time import time
|
|
|
|
| 84 |
@st.cache_resource
|
| 85 |
def load_model_and_tokenizer(doc_text_count):
|
| 86 |
model_name = "google/gemma-3-4b-it" # Configure quantization for 4-bit loading
|
| 87 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
# Load the pre-trained model with quantization
|
| 89 |
model = AutoModelForCausalLM.from_pretrained(
|
| 90 |
model_name,
|
| 91 |
device_map="auto", # Automatically allocate model to devices
|
|
|
|
| 92 |
model_max_length=1.3*round(doc_text_count * 0.3 + 1),
|
| 93 |
token=HF_TOKEN
|
| 94 |
)
|