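"""Gradio chat demo for oki0ki/gptoss.

Loads the togethercomputer/gpt-oss-20b-bf16 base model, applies the
oki0ki/gptoss PEFT adapter, and serves a streaming chat UI.
"""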
import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
# Load model
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/gpt-oss-20b-bf16")
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
"togethercomputer/gpt-oss-20b-bf16",
torch_dtype=torch.bfloat16,
device_map="auto",
)
print("Loading PEFT adapter...")
model = PeftModel.from_pretrained(base_model, "oki0ki/gptoss")
model.eval()
print("Model ready.")
def generate(
    message: str,
    history: list,
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
):
    # Build the conversation in chat-message format
    conversation = []
    if system_prompt.strip():
        conversation.append({"role": "system", "content": system_prompt.strip()})
    for user_msg, assistant_msg in history:
        conversation.append({"role": "user", "content": user_msg})
        if assistant_msg:
            conversation.append({"role": "assistant", "content": assistant_msg})
    conversation.append({"role": "user", "content": message})

    # Tokenize with the model's chat template when one is available
    if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template:
        input_ids = tokenizer.apply_chat_template(
            conversation,
            return_tensors="pt",
            add_generation_prompt=True,
        ).to(model.device)
    else:
        # Fall back to a plain role-prefixed prompt
        prompt = ""
        for turn in conversation:
            role = turn["role"].capitalize()
            prompt += f"{role}: {turn['content']}\n"
        prompt += "Assistant:"
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
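
    # model.generate() blocks, so run it in a background thread and stream
    # tokens back through TextIteratorStreamer as they arrive.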
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0,
        temperature=temperature if temperature > 0 else 1.0,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        pad_token_id=tokenizer.eos_token_id,
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    partial = ""
    for token in streamer:
        partial += token
        yield partial
    thread.join()
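

# Gradio UI: header plus a ChatInterface bound to generate().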
with gr.Blocks(
    title="oki0ki/gptoss — PEFT Chat",
    theme=gr.themes.Default(
        primary_hue="slate",
        secondary_hue="zinc",
        font=gr.themes.GoogleFont("IBM Plex Mono"),
    ),
    css="""
    body { background: #0a0a0a; }
    .gradio-container { max-width: 860px !important; margin: 0 auto; }
    #header { text-align: center; padding: 2rem 0 1rem; }
    #header h1 { font-size: 1.6rem; color: #e2e2e2; letter-spacing: 0.05em; }
    #header p { color: #666; font-size: 0.85rem; margin-top: 0.25rem; }
    """,
) as demo:
    with gr.Column(elem_id="header"):
        gr.Markdown("# oki0ki/gptoss")
        gr.Markdown("togethercomputer/gpt-oss-20b-bf16 + PEFT adapter · streaming")
    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.ChatInterface(
                fn=generate,
                additional_inputs=[
                    gr.Textbox(
                        label="System prompt",
                        value="You are a helpful assistant.",
                        lines=2,
                    ),
                    gr.Slider(
                        label="Max new tokens",
                        minimum=64,
                        maximum=2048,
                        value=512,
                        step=64,
                    ),
                    gr.Slider(
                        label="Temperature",
                        minimum=0.0,
                        maximum=2.0,
                        value=0.7,
                        step=0.05,
                    ),
                    gr.Slider(
                        label="Top-p",
                        minimum=0.1,
                        maximum=1.0,
                        value=0.95,
                        step=0.05,
                    ),
                    gr.Slider(
                        label="Repetition penalty",
                        minimum=1.0,
                        maximum=1.5,
                        value=1.1,
                        step=0.05,
                    ),
                ],
                additional_inputs_accordion=gr.Accordion(
                    label="⚙ Generation parameters", open=False
                ),
                submit_btn="Send",
                retry_btn="↺ Retry",
                undo_btn="↩ Undo",
                clear_btn="✕ Clear",
            )


if __name__ == "__main__":
    demo.queue().launch()