#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
ERAv4S19_project — app.py (CPU-safe version)

Loads Phi‑2 in 4‑bit when a GPU is available, otherwise safely on CPU.
Applies LoRA adapters from:
  - a local folder ./adapters
  - OR a Hub repo: username/repo
"""
import os
import traceback

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# --- Disable audio to avoid pydub/audioop errors on Python 3.13 ---
os.environ["GRADIO_DISABLE_AUDIO"] = "True"

BASE_MODEL_ID = os.environ.get("BASE_MODEL_ID", "microsoft/phi-2")
ADAPTER_PATH = os.environ.get("ADAPTER_PATH", "adapters")
HF_TOKEN = os.environ.get("HF_TOKEN")

DEF_MAX_NEW = int(os.environ.get("MAX_NEW_TOKENS", 256))
DEF_TEMP = float(os.environ.get("TEMPERATURE", 0.7))
DEF_TOP_P = float(os.environ.get("TOP_P", 0.95))
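# Example configuration (a sketch; the repo id below is hypothetical, not part of
# this project). To pull adapters from the Hub instead of ./adapters, set Space
# variables/secrets along these lines:
#   ADAPTER_PATH=your-username/phi2-qlora-adapters   # hypothetical Hub repo id
#   HF_TOKEN=hf_xxx                                  # only needed for private repos
#   MAX_NEW_TOKENS=256  TEMPERATURE=0.7  TOP_P=0.95  # optional generation defaults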
def _is_local_adapter_dir(path):
    return os.path.isdir(path) and os.path.exists(os.path.join(path, "adapter_config.json"))


def _load_tokenizer():
    tok = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    return tok
def _load_base_model():
    print(f"[INIT] Loading base model: {BASE_MODEL_ID}")
    if torch.cuda.is_available():
        # 4-bit NF4 quantization on GPU (requires bitsandbytes)
        quant_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )
        device_map = "auto"
        dtype = torch.bfloat16
    else:
        # ---------- CPU SAFE BLOCK ----------
        # bitsandbytes 4-bit is not supported on CPU: load unquantized in float32
        quant_cfg = None
        device_map = {"": "cpu"}
        dtype = torch.float32
        # ------------------------------------
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID,
        quantization_config=quant_cfg,
        device_map=device_map,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,  # stream weights to keep peak RAM low during load
        trust_remote_code=False,
    )
    base.eval()
    return base
def _apply_adapters_if_any(base):
    print(f"[ADAPTER] ADAPTER_PATH={ADAPTER_PATH}")
    try:
        if _is_local_adapter_dir(ADAPTER_PATH):
            print(f"[ADAPTER] Loading local adapters from './{ADAPTER_PATH}'")
            model = PeftModel.from_pretrained(base, ADAPTER_PATH)
        else:
            print(f"[ADAPTER] Loading adapters from Hub repo: {ADAPTER_PATH}")
            model = PeftModel.from_pretrained(base, ADAPTER_PATH, token=HF_TOKEN)
        print("[ADAPTER] Adapters applied successfully.")
        return model
    except Exception as e:
        print(f"[WARN] Could not load adapters ({e}). Using base model only.")
        traceback.print_exc()
        return base
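# Note: a local "adapters" folder is expected to look like the output of
# PeftModel.save_pretrained(), i.e. adapter_config.json plus
# adapter_model.safetensors (or adapter_model.bin); exact filenames depend on
# the peft version used for training.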
def _build_prompt(tokenizer, user_text):
    user_text = (user_text or "").strip()
    # Use the chat template only if the tokenizer actually defines one
    # (the stock microsoft/phi-2 tokenizer does not ship a chat template).
    if getattr(tokenizer, "chat_template", None):
        msg = [{"role": "user", "content": user_text}]
        return tokenizer.apply_chat_template(msg, add_generation_prompt=True, tokenize=False)
    return f"User: {user_text}\nAssistant:"
def generate(text, max_new_tokens, temperature, top_p):
    if not (text or "").strip():
        return "Please enter a prompt."
    prompt = _build_prompt(tokenizer, text)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            do_sample=True,
            temperature=float(temperature),
            top_p=float(top_p),
            pad_token_id=tokenizer.eos_token_id,
        )
    out = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Keep only the assistant's reply when the plain "User:/Assistant:" prompt is used.
    if "Assistant:" in out:
        out = out.split("Assistant:")[-1].strip()
    return out
# ------ INIT PIPELINE ------
tokenizer = _load_tokenizer()
base_model = _load_base_model()
model = _apply_adapters_if_any(base_model)
def health():
    dev = "cuda" if torch.cuda.is_available() else "cpu"
    msg = f"OK — device={dev}; BASE_MODEL_ID={BASE_MODEL_ID}; ADAPTER_PATH={ADAPTER_PATH}"
    if _is_local_adapter_dir(ADAPTER_PATH):
        msg += " (local adapters)"
    return msg
# ------ GRADIO UI ------
with gr.Blocks(title="Phi‑2 (QLoRA + GRPO) — ERAv4S19") as demo:
    gr.Markdown("## Phi‑2 (4‑bit) + optional LoRA adapters\nUpload `adapters/` or set `ADAPTER_PATH` to a Hub repo.")
    with gr.Row():
        inp = gr.Textbox(lines=6, label="Your prompt")
    with gr.Row():
        max_new = gr.Slider(32, 1024, value=DEF_MAX_NEW, step=8, label="max_new_tokens")
        temp = gr.Slider(0.0, 1.5, value=DEF_TEMP, step=0.05, label="temperature")
        topp = gr.Slider(0.1, 1.0, value=DEF_TOP_P, step=0.05, label="top_p")
    out = gr.Textbox(label="Assistant")
    btn = gr.Button("Generate")
    btn.click(generate, inputs=[inp, max_new, temp, topp], outputs=out)
    with gr.Row():
        health_btn = gr.Button("Health check")
        health_out = gr.Textbox(label="Status")
    health_btn.click(fn=health, outputs=health_out)
if __name__ == "__main__":
    # On HF Spaces keep share=False (the default); the Space itself exposes the public URL.
    demo.launch()
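# Running locally (a sketch, not an official setup; the package list is an assumption
# inferred from the imports above, with no pinned versions):
#   pip install torch transformers peft bitsandbytes accelerate gradio
#   ADAPTER_PATH=adapters python app.py
# On CPU-only machines the model loads unquantized in float32, so expect a large
# RAM footprint and slow generation.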