#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
ERAv4S19_project — app.py (CPU-safe version)
Loads Phi‑2 in 4‑bit when GPU available, otherwise safely on CPU.
Applies LoRA adapters from:
- local folder ./adapters
- OR a Hub repo: username/repo
"""
import os
import traceback

# --- Disable audio to avoid pydub/audioop errors on Python 3.13 ---
# This must be set before gradio is imported, otherwise it has no effect.
os.environ["GRADIO_DISABLE_AUDIO"] = "True"

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
BASE_MODEL_ID = os.environ.get("BASE_MODEL_ID", "microsoft/phi-2")
ADAPTER_PATH = os.environ.get("ADAPTER_PATH", "adapters")
HF_TOKEN = os.environ.get("HF_TOKEN")
DEF_MAX_NEW = int(os.environ.get("MAX_NEW_TOKENS", 256))
DEF_TEMP = float(os.environ.get("TEMPERATURE", 0.7))
DEF_TOP_P = float(os.environ.get("TOP_P", 0.95))
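
# Example configuration via environment variables (illustrative values only; the
# adapter repo name below is hypothetical, not a published repo):
#   export BASE_MODEL_ID="microsoft/phi-2"
#   export ADAPTER_PATH="username/phi2-qlora-grpo-adapters"
#   export HF_TOKEN="hf_..."        # only needed for private/gated adapter repos
#   export MAX_NEW_TOKENS=256 TEMPERATURE=0.7 TOP_P=0.95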

def _is_local_adapter_dir(path):
    return os.path.isdir(path) and os.path.exists(os.path.join(path, "adapter_config.json"))

def _load_tokenizer():
    tok = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    return tok

def _load_base_model():
    print(f"[INIT] Loading base model: {BASE_MODEL_ID}")
    if torch.cuda.is_available():
        # 4-bit NF4 quantization requires a CUDA-enabled bitsandbytes build,
        # so the quantization config is only used when a GPU is present.
        bnb_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )
        quantization_config = bnb_cfg
        device_map = "auto"
        dtype = torch.bfloat16
    else:
        # ---------- CPU SAFE BLOCK ----------
        # No quantization on CPU: load full-precision float32 weights instead.
        quantization_config = None
        device_map = {"": "cpu"}
        dtype = torch.float32
        # ------------------------------------
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID,
        quantization_config=quantization_config,
        device_map=device_map,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,  # avoids accelerate sharding
        trust_remote_code=False,
    )
    base.eval()
    return base
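
# Back-of-envelope sizing (rough assumption, not measured here): phi-2 has ~2.7B
# parameters, so 4-bit NF4 weights come to roughly 1.5-2 GB of GPU memory, while
# the float32 CPU fallback needs on the order of 11 GB of RAM.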

def _apply_adapters_if_any(base):
    print(f"[ADAPTER] ADAPTER_PATH={ADAPTER_PATH}")
    try:
        if _is_local_adapter_dir(ADAPTER_PATH):
            print(f"[ADAPTER] Loading local adapters from './{ADAPTER_PATH}'")
            model = PeftModel.from_pretrained(base, ADAPTER_PATH)
        else:
            print(f"[ADAPTER] Loading adapters from Hub repo: {ADAPTER_PATH}")
            model = PeftModel.from_pretrained(base, ADAPTER_PATH, token=HF_TOKEN)
        print("[ADAPTER] Adapters applied successfully.")
        return model
    except Exception as e:
        print(f"[WARN] Could not load adapters ({e}). Using base model only.")
        traceback.print_exc()
        return base
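
# Optional follow-up (sketch, not done by this app): with a full-precision base the
# adapters could be folded into the weights for slightly faster inference via the
# PEFT API `model.merge_and_unload()`; merging into a 4-bit quantized base is lossy
# (rounding error), so the adapters are kept separate here.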

def _build_prompt(tokenizer, user_text):
    user_text = (user_text or "").strip()
    # Phi-2's tokenizer does not normally ship a chat template, so check for one
    # explicitly; hasattr(tokenizer, "apply_chat_template") is True on every recent
    # tokenizer and would not fall back correctly.
    if getattr(tokenizer, "chat_template", None):
        msg = [{"role": "user", "content": user_text}]
        return tokenizer.apply_chat_template(msg, add_generation_prompt=True, tokenize=False)
    return f"User: {user_text}\nAssistant:"

def generate(text, max_new_tokens, temperature, top_p):
    if not text or not text.strip():
        return "Please enter a prompt."
    prompt = _build_prompt(tokenizer, text)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    temperature = float(temperature)
    # temperature == 0 is not a valid sampling value; fall back to greedy decoding.
    do_sample = temperature > 0
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            do_sample=do_sample,
            temperature=temperature if do_sample else 1.0,
            top_p=float(top_p),
            pad_token_id=tokenizer.eos_token_id,
        )
    out = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Keep only the assistant turn when the plain "User:/Assistant:" prompt was used.
    if "Assistant:" in out:
        out = out.split("Assistant:")[-1].strip()
    return out
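
# Quick smoke test (illustrative; only valid after the INIT PIPELINE below has run):
#   print(generate("Explain LoRA in one sentence.", max_new_tokens=64,
#                  temperature=0.7, top_p=0.95))
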
# ------ INIT PIPELINE ------
tokenizer = _load_tokenizer()
base_model = _load_base_model()
model = _apply_adapters_if_any(base_model)

def health():
    dev = "cuda" if torch.cuda.is_available() else "cpu"
    msg = f"OK — device={dev}; BASE_MODEL_ID={BASE_MODEL_ID}; ADAPTER_PATH={ADAPTER_PATH}"
    if _is_local_adapter_dir(ADAPTER_PATH):
        msg += " (local adapters)"
    return msg

# ------ GRADIO UI ------
with gr.Blocks(title="Phi‑2 (QLoRA + GRPO) — ERAv4S19") as demo:
    gr.Markdown("## Phi‑2 (4‑bit) + optional LoRA adapters\nUpload `adapters/` or set `ADAPTER_PATH` to a Hub repo.")
    with gr.Row():
        inp = gr.Textbox(lines=6, label="Your prompt")
    with gr.Row():
        max_new = gr.Slider(32, 1024, DEF_MAX_NEW, step=8, label="max_new_tokens")
        temp = gr.Slider(0.0, 1.5, DEF_TEMP, step=0.05, label="temperature")
        topp = gr.Slider(0.1, 1.0, DEF_TOP_P, step=0.05, label="top_p")
    out = gr.Textbox(label="Assistant")
    btn = gr.Button("Generate")
    btn.click(generate, inputs=[inp, max_new, temp, topp], outputs=out)
    with gr.Row():
        health_btn = gr.Button("Health check")
        health_out = gr.Textbox(label="Status")
    health_btn.click(fn=health, outputs=health_out)

if __name__ == "__main__":
    # On HF Spaces: keep share=False always.
    demo.launch()