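"""Gradio chat demo that streams responses from a Phi-3-mini LoRA fine-tune hosted on the Hugging Face Hub."""
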
import os
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)

# Hub access token for the model and tokenizer repos (set HF_TOKEN in the environment).
token = os.environ["HF_TOKEN"]


# Load the fine-tuned Phi-3-mini model and its tokenizer from the Hub.
model = AutoModelForCausalLM.from_pretrained(
    "rajj0/autotrain-phi3-mini-4k-abstract-lora-full",
    token=token,
    trust_remote_code=True,
)
tok = AutoTokenizer.from_pretrained(
    "rajj0/autotrain-phi3-mini-4k-abstract-lora-sft", token=token
)
# Stop generation at the end-of-sequence token.
terminators = [
    tok.eos_token_id,
]

# Pick the device: GPU when available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(device)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

# Moving the model explicitly avoids device-dispatch errors during generation.
model = model.to(device)


# On Hugging Face Spaces, request a GPU for up to 60 seconds per call.
@spaces.GPU(duration=60)
def chat(message, history, temperature, do_sample, max_tokens):
    # Rebuild the conversation in the role/content format the chat template expects.
    conversation = []
    for user_msg, assistant_msg in history:
        conversation.append({"role": "user", "content": user_msg})
        if assistant_msg is not None:
            conversation.append({"role": "assistant", "content": assistant_msg})
    conversation.append({"role": "user", "content": message})
    prompt = tok.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tok([prompt], return_tensors="pt").to(device)
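    # Stream decoded tokens as they are produced instead of waiting for the full reply.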
    streamer = TextIteratorStreamer(
        tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=do_sample,
        temperature=temperature,
        eos_token_id=terminators,
    )

    # Treat a temperature of 0 as greedy decoding.
    if temperature == 0:
        generate_kwargs["do_sample"] = False

    # Generate in a background thread so partial text can be yielded to the UI.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text

    yield partial_text


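# Assemble the streaming chat UI; the extra decoding controls live in a collapsible accordion.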
demo = gr.ChatInterface(
    fn=chat,
    examples=[["Write me a poem about Machine Learning."]],
    additional_inputs_accordion=gr.Accordion(
        label="⚙️ Parameters", open=False, render=False
    ),
    additional_inputs=[
        gr.Slider(
            minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
        ),
        gr.Checkbox(label="Sampling", value=True, render=False),
        gr.Slider(
            minimum=128,
            maximum=4096,
            step=1,
            value=512,
            label="Max new tokens",
            render=False,
        ),
    ],
    stop_btn="Stop Generation",
    title="Chat With LLMs",
    description="Now running [rajj0/autotrain-phi3-mini-4k-abstract-lora-full](https://huggingface.co/rajj0/autotrain-phi3-mini-4k-abstract-lora-full), a Phi-3-mini fine-tune.",
)
demo.launch()