Spaces:

udituen
/

agriquery

Sleeping

udituen committed on Nov 1, 2025

Commit

0f68754

verified ·

1 Parent(s): ee4e258

Update src/streamlit_app.py

Files changed (1) hide show

src/streamlit_app.py CHANGED Viewed

@@ -67,11 +67,19 @@ def load_retriever():
 def load_llm():
     # Build and return the LLM wrapper: load the tokenizer and causal-LM
     # weights for the Llama 3 8B Instruct checkpoint, wrap them in a
     # "text-generation" pipeline limited to 256 new tokens, and return that
     # pipeline wrapped in a HuggingFacePipeline.
     # pipe = pipeline("text-generation", model="google/flan-t5-small", max_new_tokens=256)
     # load the tokenizer and model on cpu/gpu
     # NOTE(review): this 8-bit config is constructed but never passed to
     # from_pretrained() below, so it is unused in this version of the function.
-    quantization_config = BitsAndBytesConfig(load_in_8bit=True,llm_int8_enable_fp32_cpu_offload=True)
     model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
     # model_name = "meta-llama/Llama-2-7b-chat-hf"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     # The model is loaded here without any quantization_config argument.
-    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True)
     # max_new_tokens=256 caps the length of each generated completion.
     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
     return HuggingFacePipeline(pipeline=pipe)

 def load_llm():
     # Build and return the LLM wrapper: load the tokenizer and a 4-bit
     # quantized causal-LM for the Llama 3 8B Instruct checkpoint, wrap them in
     # a "text-generation" pipeline limited to 256 new tokens, and return that
     # pipeline wrapped in a HuggingFacePipeline.
     # pipe = pipeline("text-generation", model="google/flan-t5-small", max_new_tokens=256)
     # load the tokenizer and model on cpu/gpu
     # Previous 8-bit configuration, kept commented out for reference.
+    # quantization_config = BitsAndBytesConfig(load_in_8bit=True,llm_int8_enable_fp32_cpu_offload=True)
     # 4-bit quantization settings: NF4 quant type, double quantization
     # enabled, and float16 as the compute dtype.
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=torch.float16,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True
+    )
     model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
     # model_name = "meta-llama/Llama-2-7b-chat-hf"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     # The quantization config above is passed to from_pretrained() here.
+    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config, low_cpu_mem_usage=True)
     # max_new_tokens=256 caps the length of each generated completion.
     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
     return HuggingFacePipeline(pipeline=pipe)