import os

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils.quantization_config import Mxfp4Config
from peft import PeftModel

BASE_MODEL = "openai/gpt-oss-20b"
ADAPTER_ID = "AnalyticsIntelligence/pidgin_oss"

SYSTEM_PROMPT = (
    "You be helpful customer service AI. "
    "You must answer only in Nigerian Pidgin. "
    "No use English unless person ask am."
)
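# English gloss of SYSTEM_PROMPT (added for readers; the model itself is prompted in Pidgin):
# "You are a helpful customer service AI. You must answer only in Nigerian Pidgin.
#  Do not use English unless the person asks for it."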

# Hugging Face Spaces-safe writable dir for disk offload
OFFLOAD_DIR = os.getenv("OFFLOAD_DIR", "/tmp/offload")


def load_model():
    # Tokenizer from the base model
    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL,
        trust_remote_code=True,
    )

    if not torch.cuda.is_available():
        raise RuntimeError("CUDA GPU not detected. gpt-oss-20b needs a GPU for this demo.")

    # Make sure the offload folder exists (required when device_map triggers disk offload)
    os.makedirs(OFFLOAD_DIR, exist_ok=True)

    # MXFP4 model: do NOT use BitsAndBytes.
    # dequantize=True allows running on non-H100 GPUs too (L4/A10/T4, etc.).
    qconfig = Mxfp4Config(dequantize=True)

    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        quantization_config=qconfig,
        trust_remote_code=True,
        offload_folder=OFFLOAD_DIR,
        offload_state_dict=True,
    )
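    # Note (added): with dequantize=True the MXFP4 weights are materialised in bf16,
    # so the ~21B-parameter model needs on the order of 40 GB of weight memory.
    # On smaller GPUs, accelerate spills layers to CPU RAM and to OFFLOAD_DIR on disk,
    # which keeps the demo running but slows generation considerably.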
    model = PeftModel.from_pretrained(base, ADAPTER_ID)
    model.eval()
    return tokenizer, model


tokenizer, model = load_model()
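# Optional tweak (not in the original Space): if the adapter is a plain LoRA and the
# merged weights fit in memory, merging it into the base model avoids the PEFT
# forward-pass overhead:
#
#     model = model.merge_and_unload()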


def build_prompt(message, history, max_turns=8):
    history = (history or [])[-max_turns:]
    lines = [SYSTEM_PROMPT, ""]
    for u, a in history:
        lines.append(f"User: {u}")
        lines.append(f"Assistant: {a}")
    lines.append(f"User: {message}")
    lines.append("Assistant:")
    return "\n".join(lines)
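# For reference, with one prior turn in history the prompt looks like:
#
#     You be helpful customer service AI. You must answer only in Nigerian Pidgin. No use English unless person ask am.
#
#     User: <previous user message>
#     Assistant: <previous assistant reply>
#     User: <current message>
#     Assistant: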


def chat(message, history, max_new_tokens, temperature, top_p):
    prompt = build_prompt(message, history)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
    ).to(model.device)

    out = model.generate(
        **inputs,
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
        temperature=float(temperature),
        top_p=float(top_p),
        pad_token_id=tokenizer.eos_token_id,
    )
    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    reply = decoded.split("Assistant:")[-1].strip()
    return reply
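# Quick local smoke test (hypothetical call, not part of the Space; `history` is a
# list of (user, assistant) pairs, matching build_prompt above):
#
#     print(chat("How I fit track my order?", [], 128, 0.7, 0.9))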


demo = gr.ChatInterface(
    fn=chat,
    additional_inputs=[
        gr.Slider(16, 512, value=192, step=1, label="max_new_tokens"),
        gr.Slider(0.1, 1.5, value=0.7, step=0.05, label="temperature"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p"),
    ],
    title="Pidgin OSS – gpt-oss-20b + LoRA",
    description=f"Base: {BASE_MODEL} | Adapter: {ADAPTER_ID}",
)

demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
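# Dependencies (a plausible, unpinned requirements.txt; exact versions are an
# assumption, not something the original code specifies):
#     torch
#     transformers
#     peft
#     accelerate   # required for device_map="auto" and the disk-offload options
#     gradio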