import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/gpt-oss-20b-bf16")

print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    "togethercomputer/gpt-oss-20b-bf16",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

print("Loading PEFT adapter...")
model = PeftModel.from_pretrained(base_model, "oki0ki/gptoss")
model.eval()
print("Model ready.")
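

# Streaming chat handler for gr.ChatInterface: rebuilds the conversation from
# the UI state, tokenizes it, and yields the assistant reply as it is generated.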
def generate(
    message: str,
    history: list,
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
):
    # history arrives as (user, assistant) pairs, the tuple-style format
    # that gr.ChatInterface passes to its fn here.
    conversation = []
    if system_prompt.strip():
        conversation.append({"role": "system", "content": system_prompt.strip()})
    for user_msg, assistant_msg in history:
        conversation.append({"role": "user", "content": user_msg})
        if assistant_msg:
            conversation.append({"role": "assistant", "content": assistant_msg})
    conversation.append({"role": "user", "content": message})
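
    # Prefer the tokenizer's built-in chat template; otherwise fall back to a
    # plain "Role: content" transcript.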
    if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template:
        input_ids = tokenizer.apply_chat_template(
            conversation,
            return_tensors="pt",
            add_generation_prompt=True,
        ).to(model.device)
    else:
        prompt = ""
        for turn in conversation:
            role = turn["role"].capitalize()
            prompt += f"{role}: {turn['content']}\n"
        prompt += "Assistant:"
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
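
    # Stream decoded text as it is produced; skip_prompt hides the echoed input.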
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generation_kwargs = dict(
        input_ids=input_ids,
        # Explicit all-ones mask: the prompt is a single unpadded sequence, and
        # passing it avoids the missing-attention-mask warning from generate().
        attention_mask=torch.ones_like(input_ids),
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0,  # temperature == 0 falls back to greedy decoding
        temperature=temperature if temperature > 0 else 1.0,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Run generation in a background thread so tokens can be yielded
    # to the UI as soon as the streamer produces them.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial = ""
    for token in streamer:
        partial += token
        yield partial

    thread.join()
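

# Gradio UI: a ChatInterface wired to generate(), with the sampling controls
# exposed in a collapsible accordion.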
with gr.Blocks(
    title="oki0ki/gptoss — PEFT Chat",
    theme=gr.themes.Default(
        primary_hue="slate",
        secondary_hue="zinc",
        font=gr.themes.GoogleFont("IBM Plex Mono"),
    ),
    css="""
    body { background: #0a0a0a; }
    .gradio-container { max-width: 860px !important; margin: 0 auto; }
    #header { text-align: center; padding: 2rem 0 1rem; }
    #header h1 { font-size: 1.6rem; color: #e2e2e2; letter-spacing: 0.05em; }
    #header p { color: #666; font-size: 0.85rem; margin-top: 0.25rem; }
    """,
) as demo:
    with gr.Column(elem_id="header"):
        gr.Markdown("# oki0ki/gptoss")
        gr.Markdown("togethercomputer/gpt-oss-20b-bf16 + PEFT adapter · streaming")

    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.ChatInterface(
                fn=generate,
                additional_inputs=[
                    gr.Textbox(
                        label="System prompt",
                        value="You are a helpful assistant.",
                        lines=2,
                    ),
                    gr.Slider(
                        label="Max new tokens",
                        minimum=64,
                        maximum=2048,
                        value=512,
                        step=64,
                    ),
                    gr.Slider(
                        label="Temperature",
                        minimum=0.0,
                        maximum=2.0,
                        value=0.7,
                        step=0.05,
                    ),
                    gr.Slider(
                        label="Top-p",
                        minimum=0.1,
                        maximum=1.0,
                        value=0.95,
                        step=0.05,
                    ),
                    gr.Slider(
                        label="Repetition penalty",
                        minimum=1.0,
                        maximum=1.5,
                        value=1.1,
                        step=0.05,
                    ),
                ],
                additional_inputs_accordion=gr.Accordion(
                    label="⚙ Generation parameters", open=False
                ),
                submit_btn="Send",
                retry_btn="↺ Retry",
                undo_btn="↩ Undo",
                clear_btn="✕ Clear",
            )


if __name__ == "__main__":
    # queue() enables request queuing, which Gradio's streaming
    # (generator) responses rely on.
    demo.queue().launch()
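
# To try it: run this script (e.g. `python app.py`; filename assumed) and open
# the local URL Gradio prints. launch(share=True) would expose a public link.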