Wenye He committed on
Commit
0ffa0b9
·
verified ·
1 Parent(s): 2c277a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -3
app.py CHANGED
@@ -17,7 +17,7 @@ MODEL_CONFIG = {
17
  }
18
  }
19
 
20
- # Quantization config for 4-bit loading
21
  bnb_config = BitsAndBytesConfig(
22
  load_in_4bit=True,
23
  bnb_4bit_quant_type="nf4",
@@ -39,9 +39,9 @@ class ChatModel:
39
 
40
  model = AutoModelForCausalLM.from_pretrained(
41
  config["model_name"],
 
42
  device_map="auto",
43
  torch_dtype=torch.float16,
44
- attn_implementation="flash_attention_2" if "phi-3" in model_name else "eager",
45
  low_cpu_mem_usage=True
46
  )
47
 
@@ -60,7 +60,7 @@ class ChatModel:
60
  "text-generation",
61
  model=self.models[model_name],
62
  tokenizer=self.tokenizers[model_name],
63
- max_new_tokens=512,
64
  temperature=0.7,
65
  top_p=0.9,
66
  repetition_penalty=1.1,
 
17
  }
18
  }
19
 
20
+ # Quantization config (4-bit)
21
  bnb_config = BitsAndBytesConfig(
22
  load_in_4bit=True,
23
  bnb_4bit_quant_type="nf4",
 
39
 
40
  model = AutoModelForCausalLM.from_pretrained(
41
  config["model_name"],
42
+ quantization_config=bnb_config,
43
  device_map="auto",
44
  torch_dtype=torch.float16,
 
45
  low_cpu_mem_usage=True
46
  )
47
 
 
60
  "text-generation",
61
  model=self.models[model_name],
62
  tokenizer=self.tokenizers[model_name],
63
+ max_new_tokens=384,
64
  temperature=0.7,
65
  top_p=0.9,
66
  repetition_penalty=1.1,