import os

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from huggingface_hub import login

# ── Credentials from HF Secrets ───────────────────────────────────────────────
# The token is looked up under the "FAQ" secret first; the conventional
# HF_TOKEN variable is accepted as a fallback so the log message below matches
# what is actually checked. An empty value is normalised to None so that an
# empty string is never forwarded to the Hub as a token.
HF_TOKEN = os.environ.get("FAQ") or os.environ.get("HF_TOKEN") or None
BASE_MODEL = os.environ.get("BASE_MODEL", "unsloth/Phi-3-mini-4k-instruct-bnb-4bit")
LORA_ADAPTER = os.environ.get("LORA_ADAPTER", "TechAvenger/MyFaqSolver")
MAX_NEW_TOKENS = 512  # upper bound on tokens generated per answer

if HF_TOKEN:
    login(token=HF_TOKEN)
    print("Logged in to HuggingFace Hub ✅")
else:
    print("No HF_TOKEN found — assuming public model")

# ── Load model ─────────────────────────────────────────────────────────────────
# NOTE(review): trust_remote_code=True lets the model repository execute its
# own Python code at load time; only point BASE_MODEL at repositories you trust.
print(f"Loading tokenizer from: {BASE_MODEL}")
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
    token=HF_TOKEN,
)

print(f"Loading base model: {BASE_MODEL}")
# llm_int8_enable_fp32_cpu_offload lets layers spill to CPU RAM when VRAM is full
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    llm_int8_enable_fp32_cpu_offload=True,
)

# Give GPU as much VRAM as possible, spill the rest to CPU RAM
max_memory = {}
if torch.cuda.is_available():
    vram_bytes = torch.cuda.get_device_properties(0).total_memory
    # Reserve 500 MiB of headroom; clamp at zero so a very small device can
    # never produce a negative memory budget.
    usable_mib = max(vram_bytes // 1024 ** 2 - 500, 0)
    max_memory[0] = f"{usable_mib}MiB"
    print(f"GPU detected — allocating {usable_mib} MiB")
else:
    print("No GPU — running on CPU (slow)")
max_memory["cpu"] = "12GiB"

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    max_memory=max_memory,
    trust_remote_code=True,
    quantization_config=bnb_config,
    token=HF_TOKEN,
)

print(f"Loading LoRA adapter: {LORA_ADAPTER}")
model = PeftModel.from_pretrained(base_model, LORA_ADAPTER, token=HF_TOKEN)
model.eval()  # inference only: switch off training-time behaviour
print("Model ready ✅")

# ── Inference
────────────────────────────────────────────────────────────────── def answer_question(question: str, history: list): if not question.strip(): return history, "" prompt = f"### Question:\n{question.strip()}\n\n### Answer:\n" inputs = tokenizer(prompt, return_tensors="pt").to(model.device) with torch.no_grad(): output = model.generate( **inputs, max_new_tokens=MAX_NEW_TOKENS, do_sample=False, pad_token_id=tokenizer.eos_token_id, eos_token_id=tokenizer.eos_token_id, ) decoded = tokenizer.decode(output[0], skip_special_tokens=True) answer = decoded[len(prompt):].strip() history.append((question, answer)) return history, "" def clear_chat(): return [], "" # ── CSS ──────────────────────────────────────────────────────────────────────── css = """ @import url('https://fonts.googleapis.com/css2?family=Sora:wght@300;400;600;700&family=JetBrains+Mono:wght@400;500&display=swap'); *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; } body, .gradio-container { font-family: 'Sora', sans-serif !important; background: #080a10 !important; color: #e2e8f0 !important; } .gradio-container { max-width: 800px !important; margin: 0 auto !important; padding: 2rem 1.25rem 3rem !important; } #header { text-align: center; padding: 2.5rem 1rem 2rem; border-bottom: 1px solid #16192a; margin-bottom: 1.75rem; } #header .badge { display: inline-block; background: #0f1220; border: 1px solid #252d4a; border-radius: 100px; padding: 0.3rem 1rem; font-size: 0.72rem; color: #6b82c0; margin-bottom: 1rem; font-family: 'JetBrains Mono', monospace; letter-spacing: 0.05em; } #header h1 { font-size: 2.1rem; font-weight: 700; letter-spacing: -0.04em; color: #fff; margin-bottom: 0.4rem; } #header p { font-size: 0.88rem; color: #4b5675; } #chatbot { background: #0b0d15 !important; border: 1px solid #16192a !important; border-radius: 16px !important; min-height: 400px !important; padding: 1rem !important; } #chatbot .message.user > div, #chatbot [data-testid="user"] { background: #172044 
!important; border: 1px solid #233060 !important; border-radius: 14px 14px 4px 14px !important; color: #c5d3f0 !important; font-size: 0.91rem !important; line-height: 1.55 !important; padding: 0.75rem 1rem !important; } #chatbot .message.bot > div, #chatbot [data-testid="bot"] { background: #0f1118 !important; border: 1px solid #1a1e2e !important; border-radius: 14px 14px 14px 4px !important; color: #dde4f5 !important; font-size: 0.91rem !important; line-height: 1.6 !important; padding: 0.75rem 1rem !important; } #input-row { display: flex !important; flex-direction: row !important; align-items: flex-end !important; gap: 0.6rem !important; margin-top: 1rem !important; width: 100% !important; } #question-input { flex: 1 1 auto !important; min-width: 0 !important; } #question-input textarea { background: #0f1118 !important; border: 1px solid #1e2338 !important; border-radius: 12px !important; color: #e2e8f0 !important; font-family: 'Sora', sans-serif !important; font-size: 0.91rem !important; padding: 0.8rem 1rem !important; resize: none !important; width: 100% !important; transition: border-color 0.2s !important; } #question-input textarea:focus { border-color: #3b5bdb !important; box-shadow: 0 0 0 3px rgba(59,91,219,0.12) !important; outline: none !important; } #question-input label { display: none !important; } #btn-ask, #btn-clear { flex: 0 0 auto !important; align-self: flex-end !important; } #btn-ask button { background: #3b5bdb !important; border: none !important; border-radius: 12px !important; color: #fff !important; font-family: 'Sora', sans-serif !important; font-weight: 600 !important; font-size: 0.88rem !important; padding: 0.8rem 1.4rem !important; cursor: pointer !important; white-space: nowrap !important; height: 44px !important; transition: background 0.2s, transform 0.1s !important; } #btn-ask button:hover { background: #2f4ac4 !important; } #btn-ask button:active { transform: scale(0.97) !important; } #btn-clear button { background: #0f1118 
!important; border: 1px solid #1e2338 !important; border-radius: 12px !important; color: #5a6a9a !important; font-family: 'Sora', sans-serif !important; font-size: 0.85rem !important; padding: 0.8rem 1.1rem !important; cursor: pointer !important; white-space: nowrap !important; height: 44px !important; transition: background 0.2s, color 0.2s !important; } #btn-clear button:hover { background: #151a28 !important; color: #8fa0d0 !important; } #footer { text-align: center; color: #252d45; font-size: 0.74rem; margin-top: 2rem; font-family: 'JetBrains Mono', monospace; letter-spacing: 0.03em; } """ # ── UI ───────────────────────────────────────────────────────────────────────── with gr.Blocks(css=css, title="FAQ Agent") as demo: gr.HTML("""
Ask any question — powered by your fine-tuned model