File size: 2,396 Bytes
8cfe629
c1cccf2
0195be0
 
 
 
 
c1cccf2
8cfe629
0195be0
 
 
 
 
 
 
 
 
c1cccf2
0195be0
8cfe629
a7e0131
0195be0
d28e427
 
 
0195be0
c1cccf2
eec20e0
0195be0
 
 
 
 
 
 
 
 
 
eec20e0
c1cccf2
eec20e0
c1cccf2
d28e427
8cfe629
0195be0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8cfe629
bd093f8
ff8bdc6
 
31ab39c
93f6e7c
bd093f8
0195be0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import gradio as gr
import spaces
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
import torch

# Chat model: generates the assistant responses. bfloat16 + device_map="auto"
# lets accelerate place the 7B weights on the available GPU(s).
chat_model_name = "sapienzanlp/Minerva-7B-instruct-v1.0"
chat_model = AutoModelForCausalLM.from_pretrained(
    chat_model_name, torch_dtype=torch.bfloat16, device_map="auto"
)
chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)

# Moderation model: a sequence classifier scoring (prompt, response) pairs.
moderator_model_name = "saiteki-kai/QA-DeBERTa-v3-large"
moderator_model = AutoModelForSequenceClassification.from_pretrained(
    moderator_model_name, device_map="auto"
)
moderator_tokenizer = AutoTokenizer.from_pretrained(moderator_model_name)


def generate_responses(model, tokenizer, prompts):
    """Generate one greedy assistant reply per prompt with the chat model.

    Args:
        model: a causal LM exposing ``.generate`` (e.g. AutoModelForCausalLM).
        tokenizer: the matching tokenizer; must provide a chat template.
        prompts: list of user messages (plain strings).

    Returns:
        list[str]: decoded responses, one per prompt, with the prompt and
        special tokens stripped.
    """
    messages = [[{"role": "user", "content": message}] for message in prompts]

    texts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Decoder-only models must be LEFT-padded for batched generation so every
    # prompt ends right where the continuation begins; many chat tokenizers
    # also ship without a pad token, so fall back to EOS.
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    with torch.inference_mode():
        model_inputs = tokenizer(texts, padding=True, return_tensors="pt").to(model.device)
        generated_ids = model.generate(
            **model_inputs,
            do_sample=False,  # greedy decoding; temperature would be ignored here
            repetition_penalty=1.0,
            max_new_tokens=512,
        )

    # Every output row starts with the full (padded) prompt, so the generated
    # continuation begins at the shared padded prompt length. Slicing by the
    # per-example non-pad count (as before) leaked prompt tokens into the
    # response for any prompt shorter than the batch maximum.
    prompt_length = model_inputs.input_ids.shape[1]
    generated_ids = [output_ids[prompt_length:] for output_ids in generated_ids]
    responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    return responses


def classify_pairs(model, tokenizer, prompts, responses):
    """Score each (prompt, response) pair with the moderation classifier.

    Args:
        model: a sequence-classification model returning per-pair logits.
        tokenizer: the matching tokenizer.
        prompts: list of user prompts.
        responses: list of generated responses, parallel to ``prompts``.

    Returns:
        list[list[float]]: per-pair class probabilities, one inner list per
        (prompt, response) pair — directly zip-able by the caller.
    """
    texts = [
        prompt + "[SEP]" + response for prompt, response in zip(prompts, responses)
    ]

    with torch.inference_mode():
        # return_tensors="pt" was missing: without it the BatchEncoding holds
        # plain Python lists, so `.to(device)` and the model call both fail.
        # truncation=True is required for max_length to have any effect.
        inputs = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to(model.device)
        outputs = model(**inputs)

    # Return per-pair scores rather than the raw ModelOutput — iterating a
    # ModelOutput yields its field names, which broke the caller's zip().
    # NOTE(review): assumes a multi-label moderation head (independent sigmoid
    # per class); confirm against the checkpoint's problem_type and switch to
    # softmax if it is single-label.
    return torch.sigmoid(outputs.logits).tolist()


@spaces.GPU()
def generate(prompts: list[str]) -> list[dict[str, str | float]]:
    """Answer each prompt with the chat model and attach its moderation score.

    Args:
        prompts: batch of user prompts.

    Returns:
        One dict per prompt with keys ``prompt``, ``response`` and ``score``.
    """
    answers = generate_responses(chat_model, chat_tokenizer, prompts)
    ratings = classify_pairs(moderator_model, moderator_tokenizer, prompts, answers)

    triples = zip(prompts, answers, ratings)
    return [dict(prompt=p, response=r, score=s) for p, r, s in triples]


# Minimal UI: the Space exists to expose `generate` as an HTTP API endpoint
# ("/scores"); batch=False means Gradio passes the list argument through as-is.
with gr.Blocks() as demo:
    gr.Markdown("Welcome")
    gr.api(generate, api_name="scores", batch=False)

# Queueing serializes GPU requests; `demo` is the name HF Spaces looks for.
demo.queue()
demo.launch()