# pidgin_test / app.py
import os
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils.quantization_config import Mxfp4Config
from peft import PeftModel
BASE_MODEL = "openai/gpt-oss-20b"
ADAPTER_ID = "AnalyticsIntelligence/pidgin_oss"
SYSTEM_PROMPT = (
    "You be helpful customer service AI. "
    "You must answer only in Nigerian Pidgin. "
    "No use English unless person ask am."
)
# Hugging Face Spaces-safe writable dir for disk offload
OFFLOAD_DIR = os.getenv("OFFLOAD_DIR", "/tmp/offload")
def load_model():
    # Tokenizer (base)
    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL,
        trust_remote_code=True,
    )

    if not torch.cuda.is_available():
        raise RuntimeError("CUDA GPU not detected. gpt-oss-20b needs a GPU for this demo.")

    # Make sure offload folder exists (required when device_map triggers disk offload)
    os.makedirs(OFFLOAD_DIR, exist_ok=True)

    # MXFP4 model: do NOT use BitsAndBytes.
    # dequantize=True allows running on non-H100 GPUs too (L4/A10/T4, etc.).
    qconfig = Mxfp4Config(dequantize=True)
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        quantization_config=qconfig,
        trust_remote_code=True,
        offload_folder=OFFLOAD_DIR,
        offload_state_dict=True,
    )

    model = PeftModel.from_pretrained(base, ADAPTER_ID)
    model.eval()
    return tokenizer, model
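
# Load once at import time so every Gradio request reuses the same tokenizer/model.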
tokenizer, model = load_model()
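
# Build a plain-text prompt: the system prompt, then the last `max_turns`
# User/Assistant exchanges, ending with "Assistant:" for the model to complete.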
def build_prompt(message, history, max_turns=8):
    history = (history or [])[-max_turns:]
    lines = [SYSTEM_PROMPT, ""]
    for u, a in history:
        lines.append(f"User: {u}")
        lines.append(f"Assistant: {a}")
    lines.append(f"User: {message}")
    lines.append("Assistant:")
    return "\n".join(lines)
@torch.inference_mode()
def chat(message, history, max_new_tokens, temperature, top_p):
    prompt = build_prompt(message, history)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
    ).to(model.device)
    out = model.generate(
        **inputs,
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
        temperature=float(temperature),
        top_p=float(top_p),
        pad_token_id=tokenizer.eos_token_id,
    )
    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    reply = decoded.split("Assistant:")[-1].strip()
    return reply
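
# Chat UI with the generation parameters exposed as adjustable sliders.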
demo = gr.ChatInterface(
    fn=chat,
    additional_inputs=[
        gr.Slider(16, 512, value=192, step=1, label="max_new_tokens"),
        gr.Slider(0.1, 1.5, value=0.7, step=0.05, label="temperature"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p"),
    ],
    title="Pidgin OSS – gpt-oss-20b + LoRA",
    description=f"Base: {BASE_MODEL} | Adapter: {ADAPTER_ID}",
)
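
# Bind to all interfaces; the PORT env var (default 7860) sets the listening port.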
demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))