import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

# Load tokenizer, base model, and PEFT adapter once at startup
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/gpt-oss-20b-bf16")

print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    "togethercomputer/gpt-oss-20b-bf16",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

print("Loading PEFT adapter...")
model = PeftModel.from_pretrained(base_model, "oki0ki/gptoss")
model.eval()
print("Model ready.")


def generate(
    message: str,
    history: list,
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
):
    # Build the conversation; history arrives as (user, assistant) pairs
    conversation = []
    if system_prompt.strip():
        conversation.append({"role": "system", "content": system_prompt.strip()})
    for user_msg, assistant_msg in history:
        conversation.append({"role": "user", "content": user_msg})
        if assistant_msg:
            conversation.append({"role": "assistant", "content": assistant_msg})
    conversation.append({"role": "user", "content": message})

    # Tokenize with the chat template when available, otherwise fall back
    # to a plain role-prefixed prompt
    if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template:
        input_ids = tokenizer.apply_chat_template(
            conversation,
            return_tensors="pt",
            add_generation_prompt=True,
        ).to(model.device)
    else:
        prompt = ""
        for turn in conversation:
            role = turn["role"].capitalize()
            prompt += f"{role}: {turn['content']}\n"
        prompt += "Assistant:"
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    # Greedy decoding when temperature is 0, sampling otherwise
    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0,
        temperature=temperature if temperature > 0 else 1.0,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Run generation in a background thread so tokens can be streamed as they arrive
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial = ""
    for token in streamer:
        partial += token
        yield partial
    thread.join()


with gr.Blocks(
    title="oki0ki/gptoss — PEFT Chat",
    theme=gr.themes.Default(
        primary_hue="slate",
        secondary_hue="zinc",
        font=gr.themes.GoogleFont("IBM Plex Mono"),
    ),
    css="""
    body { background: #0a0a0a; }
    .gradio-container { max-width: 860px !important; margin: 0 auto; }
    #header { text-align: center; padding: 2rem 0 1rem; }
    #header h1 { font-size: 1.6rem; color: #e2e2e2; letter-spacing: 0.05em; }
    #header p { color: #666; font-size: 0.85rem; margin-top: 0.25rem; }
    """,
) as demo:
    with gr.Column(elem_id="header"):
        gr.Markdown("# oki0ki/gptoss")
        gr.Markdown("togethercomputer/gpt-oss-20b-bf16 + PEFT adapter · streaming")

    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.ChatInterface(
                fn=generate,
                additional_inputs=[
                    gr.Textbox(
                        label="System prompt",
                        value="You are a helpful assistant.",
                        lines=2,
                    ),
                    gr.Slider(
                        label="Max new tokens",
                        minimum=64,
                        maximum=2048,
                        value=512,
                        step=64,
                    ),
                    gr.Slider(
                        label="Temperature",
                        minimum=0.0,
                        maximum=2.0,
                        value=0.7,
                        step=0.05,
                    ),
                    gr.Slider(
                        label="Top-p",
                        minimum=0.1,
                        maximum=1.0,
                        value=0.95,
                        step=0.05,
                    ),
                    gr.Slider(
                        label="Repetition penalty",
                        minimum=1.0,
                        maximum=1.5,
                        value=1.1,
                        step=0.05,
                    ),
                ],
                additional_inputs_accordion=gr.Accordion(
                    label="⚙ Generation parameters", open=False
                ),
                submit_btn="Send",
                retry_btn="↺ Retry",
                undo_btn="↩ Undo",
                clear_btn="✕ Clear",
            )

if __name__ == "__main__":
    demo.queue().launch()