TechAvenger committed on
Commit
a00bcdc
·
verified ·
1 Parent(s): ae4e7b1

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -2
app.py CHANGED
@@ -23,16 +23,32 @@ print(f"Loading tokenizer from: {BASE_MODEL}")
23
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True, token=HF_TOKEN)
24
 
25
  print(f"Loading base model: {BASE_MODEL}")
 
 
26
  bnb_config = BitsAndBytesConfig(
27
  load_in_4bit=True,
28
  bnb_4bit_compute_dtype=torch.float16,
29
  bnb_4bit_use_double_quant=True,
30
  bnb_4bit_quant_type="nf4",
 
31
  )
 
 
 
 
 
 
 
 
 
 
 
 
32
  base_model = AutoModelForCausalLM.from_pretrained(
33
  BASE_MODEL,
34
  torch_dtype=torch.float16,
35
  device_map="auto",
 
36
  trust_remote_code=True,
37
  quantization_config=bnb_config,
38
  token=HF_TOKEN,
@@ -56,7 +72,7 @@ def answer_question(question: str, history: list):
56
  output = model.generate(
57
  **inputs,
58
  max_new_tokens=MAX_NEW_TOKENS,
59
- do_sample=False, # greedy — no temperature needed
60
  pad_token_id=tokenizer.eos_token_id,
61
  eos_token_id=tokenizer.eos_token_id,
62
  )
@@ -214,7 +230,6 @@ body, .gradio-container {
214
  """
215
 
216
  # ── UI ─────────────────────────────────────────────────────────────────────────
217
- # CSS is passed to gr.Blocks(), NOT to demo.launch() — this was the main input bug
218
  with gr.Blocks(css=css, title="FAQ Agent") as demo:
219
 
220
  gr.HTML("""
 
23
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True, token=HF_TOKEN)
24
 
25
  print(f"Loading base model: {BASE_MODEL}")
26
+
27
+ # llm_int8_enable_fp32_cpu_offload lets layers spill to CPU RAM when VRAM is full
28
  bnb_config = BitsAndBytesConfig(
29
  load_in_4bit=True,
30
  bnb_4bit_compute_dtype=torch.float16,
31
  bnb_4bit_use_double_quant=True,
32
  bnb_4bit_quant_type="nf4",
33
+ llm_int8_enable_fp32_cpu_offload=True,
34
  )
35
+
36
+ # Give GPU as much VRAM as possible, spill the rest to CPU RAM
37
+ max_memory = {}
38
+ if torch.cuda.is_available():
39
+ vram_bytes = torch.cuda.get_device_properties(0).total_memory
40
+ usable_mib = int((vram_bytes - 500 * 1024 ** 2) / 1024 ** 2) # reserve 500 MB
41
+ max_memory[0] = f"{usable_mib}MiB"
42
+ print(f"GPU detected — allocating {usable_mib} MiB")
43
+ else:
44
+ print("No GPU — running on CPU (slow)")
45
+ max_memory["cpu"] = "12GiB"
46
+
47
  base_model = AutoModelForCausalLM.from_pretrained(
48
  BASE_MODEL,
49
  torch_dtype=torch.float16,
50
  device_map="auto",
51
+ max_memory=max_memory,
52
  trust_remote_code=True,
53
  quantization_config=bnb_config,
54
  token=HF_TOKEN,
 
72
  output = model.generate(
73
  **inputs,
74
  max_new_tokens=MAX_NEW_TOKENS,
75
+ do_sample=False,
76
  pad_token_id=tokenizer.eos_token_id,
77
  eos_token_id=tokenizer.eos_token_id,
78
  )
 
230
  """
231
 
232
  # ── UI ─────────────────────────────────────────────────────────────────────────
 
233
  with gr.Blocks(css=css, title="FAQ Agent") as demo:
234
 
235
  gr.HTML("""