Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -23,16 +23,32 @@ print(f"Loading tokenizer from: {BASE_MODEL}")
|
|
| 23 |
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True, token=HF_TOKEN)
|
| 24 |
|
| 25 |
print(f"Loading base model: {BASE_MODEL}")
|
|
|
|
|
|
|
| 26 |
bnb_config = BitsAndBytesConfig(
|
| 27 |
load_in_4bit=True,
|
| 28 |
bnb_4bit_compute_dtype=torch.float16,
|
| 29 |
bnb_4bit_use_double_quant=True,
|
| 30 |
bnb_4bit_quant_type="nf4",
|
|
|
|
| 31 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
base_model = AutoModelForCausalLM.from_pretrained(
|
| 33 |
BASE_MODEL,
|
| 34 |
torch_dtype=torch.float16,
|
| 35 |
device_map="auto",
|
|
|
|
| 36 |
trust_remote_code=True,
|
| 37 |
quantization_config=bnb_config,
|
| 38 |
token=HF_TOKEN,
|
|
@@ -56,7 +72,7 @@ def answer_question(question: str, history: list):
|
|
| 56 |
output = model.generate(
|
| 57 |
**inputs,
|
| 58 |
max_new_tokens=MAX_NEW_TOKENS,
|
| 59 |
-
do_sample=False,
|
| 60 |
pad_token_id=tokenizer.eos_token_id,
|
| 61 |
eos_token_id=tokenizer.eos_token_id,
|
| 62 |
)
|
|
@@ -214,7 +230,6 @@ body, .gradio-container {
|
|
| 214 |
"""
|
| 215 |
|
| 216 |
# ── UI ─────────────────────────────────────────────────────────────────────────
|
| 217 |
-
# CSS is passed to gr.Blocks(), NOT to demo.launch() — this was the main input bug
|
| 218 |
with gr.Blocks(css=css, title="FAQ Agent") as demo:
|
| 219 |
|
| 220 |
gr.HTML("""
|
|
|
|
| 23 |
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True, token=HF_TOKEN)
|
| 24 |
|
| 25 |
print(f"Loading base model: {BASE_MODEL}")
|
| 26 |
+
|
| 27 |
+
# llm_int8_enable_fp32_cpu_offload lets layers spill to CPU RAM when VRAM is full
|
| 28 |
bnb_config = BitsAndBytesConfig(
|
| 29 |
load_in_4bit=True,
|
| 30 |
bnb_4bit_compute_dtype=torch.float16,
|
| 31 |
bnb_4bit_use_double_quant=True,
|
| 32 |
bnb_4bit_quant_type="nf4",
|
| 33 |
+
llm_int8_enable_fp32_cpu_offload=True,
|
| 34 |
)
|
| 35 |
+
|
| 36 |
+
# Give GPU as much VRAM as possible, spill the rest to CPU RAM
|
| 37 |
+
max_memory = {}
|
| 38 |
+
if torch.cuda.is_available():
|
| 39 |
+
vram_bytes = torch.cuda.get_device_properties(0).total_memory
|
| 40 |
+
usable_mib = int((vram_bytes - 500 * 1024 ** 2) / 1024 ** 2) # reserve 500 MB
|
| 41 |
+
max_memory[0] = f"{usable_mib}MiB"
|
| 42 |
+
print(f"GPU detected — allocating {usable_mib} MiB")
|
| 43 |
+
else:
|
| 44 |
+
print("No GPU — running on CPU (slow)")
|
| 45 |
+
max_memory["cpu"] = "12GiB"
|
| 46 |
+
|
| 47 |
base_model = AutoModelForCausalLM.from_pretrained(
|
| 48 |
BASE_MODEL,
|
| 49 |
torch_dtype=torch.float16,
|
| 50 |
device_map="auto",
|
| 51 |
+
max_memory=max_memory,
|
| 52 |
trust_remote_code=True,
|
| 53 |
quantization_config=bnb_config,
|
| 54 |
token=HF_TOKEN,
|
|
|
|
| 72 |
output = model.generate(
|
| 73 |
**inputs,
|
| 74 |
max_new_tokens=MAX_NEW_TOKENS,
|
| 75 |
+
do_sample=False,
|
| 76 |
pad_token_id=tokenizer.eos_token_id,
|
| 77 |
eos_token_id=tokenizer.eos_token_id,
|
| 78 |
)
|
|
|
|
| 230 |
"""
|
| 231 |
|
| 232 |
# ── UI ─────────────────────────────────────────────────────────────────────────
|
|
|
|
| 233 |
with gr.Blocks(css=css, title="FAQ Agent") as demo:
|
| 234 |
|
| 235 |
gr.HTML("""
|