File size: 2,040 Bytes
be16a0b
 
 
 
 
 
 
1c325f0
be16a0b
 
 
 
 
 
 
 
 
 
2f54331
 
 
 
be16a0b
2f54331
ef2bd77
2f54331
ef2bd77
be16a0b
ef2bd77
be16a0b
 
 
ef2bd77
be16a0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f5f63c4
be16a0b
 
f5f63c4
be16a0b
 
 
 
 
 
2f54331
be16a0b
 
 
 
 
2f54331
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import spaces
import gradio as gr
from transformers import pipeline, TextIteratorStreamer
import torch
import threading

# Sampling settings applied to every generation request
MAX_TOKENS = 3000
TEMPERATURE = 0.1
TOP_P = 0.9

# Build the text-generation pipeline once at import time (device index 0)
# and keep direct handles to its tokenizer and model for manual streaming.
model_name = "krish10/Qwen3_14B_16bit_Sleep"
pipe = pipeline(
    task="text-generation",
    model=model_name,
    device=0,
)
tokenizer = pipe.tokenizer
model = pipe.model

@spaces.GPU
def respond_stream(user_input):
    """Stream the model's reply to ``user_input`` as a growing string.

    The input is wrapped in an "Instruction:" prompt, rendered through the
    tokenizer's chat template, and handed to ``model.generate`` on a
    background thread. This generator yields the accumulated response text
    every time the streamer produces a new chunk.

    Args:
        user_input: Raw text from the UI textbox. ``None`` is treated like
            an empty string.

    Yields:
        The response text generated so far, or a single error message when
        the input is empty or whitespace-only.

    Raises:
        Exception: Whatever ``model.generate`` raised on the worker thread,
            re-raised here once the stream has been drained.
    """
    # Treat None like an empty textbox instead of failing on `.strip()`.
    cleaned_input = (user_input or "").strip()
    if not cleaned_input:
        # This function is a generator: a plain `return "<msg>"` would only
        # set StopIteration.value and never be delivered to the caller, so
        # the message has to be yielded.
        yield "❌ Error: Input text is required."
        return

    # Use the entire input directly in the prompt
    prompt = f"Instruction: \n\n{cleaned_input}"

    # Wrap into message for chat template
    messages = [{"role": "user", "content": prompt}]
    prompt_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Tokenize and prepare streamer
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # Forward everything the tokenizer produced (input_ids plus the
    # attention mask) rather than input_ids alone; pad_token_id is set to
    # EOS below, so the mask should be supplied explicitly.
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    worker_errors = []

    def _run_generation():
        """Run generation; on failure record the error and close the stream."""
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:  # boundary of the worker thread
            worker_errors.append(exc)
            # Without an end signal the consumer loop below would wait on
            # the streamer indefinitely.
            streamer.end()

    thread = threading.Thread(target=_run_generation)
    thread.start()

    partial_text = ""
    for token in streamer:
        partial_text += token
        yield partial_text

    # The stream is exhausted, so generation is finished (or has failed).
    thread.join()
    if worker_errors:
        raise worker_errors[0]

# Assemble the UI: a title, an input box inside a column, a read-only output
# box, and a button that streams respond_stream's output into that box.
with gr.Blocks() as demo:
    gr.Markdown("## 🤖 Sleep trained Qwen3-14b")

    with gr.Column():
        user_input = gr.Textbox(
            label="Input Text",
            lines=15,
            placeholder="Paste your full input here",
        )

    output_box = gr.Textbox(
        label="Model Response",
        lines=15,
        interactive=False,
    )
    generate_btn = gr.Button("Generate")

    # Wire the button to the streaming generator
    generate_btn.click(fn=respond_stream, inputs=[user_input], outputs=[output_box])

# Start the server only when this file is run as a script
if __name__ == "__main__":
    demo.launch()