import os

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils.quantization_config import Mxfp4Config
from peft import PeftModel

BASE_MODEL = "openai/gpt-oss-20b"
ADAPTER_ID = "AnalyticsIntelligence/pidgin_oss"

SYSTEM_PROMPT = (
    "You be helpful customer service AI. "
    "You must answer only in Nigerian Pidgin. "
    "No use English unless person ask am."
)

# Hugging Face Spaces-safe writable dir for disk offload
OFFLOAD_DIR = os.getenv("OFFLOAD_DIR", "/tmp/offload")


def load_model():
    # Tokenizer (base)
    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL,
        trust_remote_code=True,
    )

    if not torch.cuda.is_available():
        raise RuntimeError("CUDA GPU not detected. gpt-oss-20b needs a GPU for this demo.")

    # Make sure offload folder exists (required when device_map triggers disk offload)
    os.makedirs(OFFLOAD_DIR, exist_ok=True)

    # MXFP4 model: do NOT use BitsAndBytes.
    # dequantize=True allows running on non-H100 GPUs too (L4/A10/T4 etc).
    qconfig = Mxfp4Config(dequantize=True)

    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        quantization_config=qconfig,
        trust_remote_code=True,
        offload_folder=OFFLOAD_DIR,
        offload_state_dict=True,
    )

    # Attach the LoRA adapter on top of the dequantized base model
    model = PeftModel.from_pretrained(base, ADAPTER_ID)
    model.eval()
    return tokenizer, model


tokenizer, model = load_model()


def build_prompt(message, history, max_turns=8):
    # Flatten the last `max_turns` (user, assistant) pairs into a plain-text prompt
    history = (history or [])[-max_turns:]
    lines = [SYSTEM_PROMPT, ""]
    for u, a in history:
        lines.append(f"User: {u}")
        lines.append(f"Assistant: {a}")
    lines.append(f"User: {message}")
    lines.append("Assistant:")
    return "\n".join(lines)


@torch.inference_mode()
def chat(message, history, max_new_tokens, temperature, top_p):
    prompt = build_prompt(message, history)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
    ).to(model.device)

    out = model.generate(
        **inputs,
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
        temperature=float(temperature),
        top_p=float(top_p),
        pad_token_id=tokenizer.eos_token_id,
    )

    # Keep only the newly generated assistant turn
    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    reply = decoded.split("Assistant:")[-1].strip()
    return reply


demo = gr.ChatInterface(
    fn=chat,
    additional_inputs=[
        gr.Slider(16, 512, value=192, step=1, label="max_new_tokens"),
        gr.Slider(0.1, 1.5, value=0.7, step=0.05, label="temperature"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p"),
    ],
    title="Pidgin OSS – gpt-oss-20b + LoRA",
    description=f"Base: {BASE_MODEL} | Adapter: {ADAPTER_ID}",
)

demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))