Spaces:

TechAvenger
/

ICT_FAQ_SOLVER

Sleeping

App Files Files Community

TechAvenger committed on Apr 16

Commit

a00bcdc

verified ·

1 Parent(s): ae4e7b1

Upload app.py

Browse files

Files changed (1) hide show

app.py +17 -2

app.py CHANGED Viewed

@@ -23,16 +23,32 @@ print(f"Loading tokenizer from: {BASE_MODEL}")
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True, token=HF_TOKEN)
 print(f"Loading base model: {BASE_MODEL}")
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.float16,
     bnb_4bit_use_double_quant=True,
     bnb_4bit_quant_type="nf4",
 )
 base_model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL,
     torch_dtype=torch.float16,
     device_map="auto",
     trust_remote_code=True,
     quantization_config=bnb_config,
     token=HF_TOKEN,
@@ -56,7 +72,7 @@ def answer_question(question: str, history: list):
         output = model.generate(
             **inputs,
             max_new_tokens=MAX_NEW_TOKENS,
-            do_sample=False,                       # greedy — no temperature needed
             pad_token_id=tokenizer.eos_token_id,
             eos_token_id=tokenizer.eos_token_id,
         )
@@ -214,7 +230,6 @@ body, .gradio-container {
 """
 # ── UI ─────────────────────────────────────────────────────────────────────────
-# CSS is passed to gr.Blocks(), NOT to demo.launch() — this was the main input bug
 with gr.Blocks(css=css, title="FAQ Agent") as demo:
     gr.HTML("""

 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True, token=HF_TOKEN)
 print(f"Loading base model: {BASE_MODEL}")
+# llm_int8_enable_fp32_cpu_offload lets layers spill to CPU RAM when VRAM is full
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.float16,
     bnb_4bit_use_double_quant=True,
     bnb_4bit_quant_type="nf4",
+    llm_int8_enable_fp32_cpu_offload=True,
 )
+# Cap GPU 0 at its total VRAM minus a 500 MiB reserve; the rest spills to CPU RAM (capped at 12 GiB)
+max_memory = {}
+if torch.cuda.is_available():
+    vram_bytes = torch.cuda.get_device_properties(0).total_memory
+    usable_mib = int((vram_bytes - 500 * 1024 ** 2) / 1024 ** 2)  # reserve 500 MiB of headroom
+    max_memory[0] = f"{usable_mib}MiB"
+    print(f"GPU detected — allocating {usable_mib} MiB")
+else:
+    print("No GPU — running on CPU (slow)")
+max_memory["cpu"] = "12GiB"
 base_model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL,
     torch_dtype=torch.float16,
     device_map="auto",
+    max_memory=max_memory,
     trust_remote_code=True,
     quantization_config=bnb_config,
     token=HF_TOKEN,
         output = model.generate(
             **inputs,
             max_new_tokens=MAX_NEW_TOKENS,
+            do_sample=False,
             pad_token_id=tokenizer.eos_token_id,
             eos_token_id=tokenizer.eos_token_id,
         )
 """
 # ── UI ─────────────────────────────────────────────────────────────────────────
 with gr.Blocks(css=css, title="FAQ Agent") as demo:
     gr.HTML("""