File size: 2,448 Bytes
c1cccf2
3702146
 
d504eb5
3702146
 
 
f00394f
dea455e
 
3702146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe84da2
a3726ab
9e4d1da
d5b1c96
a3726ab
c695fab
3702146
fe84da2
 
 
d5b1c96
fe84da2
3702146
fe84da2
3702146
81477d0
e562a6b
ed7d675
 
3702146
 
2d8a10d
d504eb5
fe84da2
2c04d38
fe84da2
d504eb5
 
 
2c04d38
d504eb5
 
fe84da2
42006d1
b28dd28
3702146
 
 
 
 
 
 
 
e8c05eb
0195be0
8cfe629
b28dd28
 
bd093f8
ff8bdc6
e5be6dd
7670a9a
93f6e7c
3702146
 
48c1533
3702146
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import spaces

import os
import time
import subprocess

import torch
import transformers
import gradio as gr

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    PreTrainedModel,
)

print("\n=== Environment Setup ===")

# Pick the compute device: prefer CUDA when available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(device)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

print("\n=== Model Loading ===")

# Chat model that answers the prompts, and a safety classifier that scores
# each (prompt, response) pair.
chat_model_name = "sapienzanlp/Minerva-7B-instruct-v1.0"
cls_model_name = "saiteki-kai/QA-DeBERTa-v3-large-binary-3"

# NOTE(review): `model_kwargs={"dtype": ...}` relies on a transformers version
# that accepts `dtype`; older releases expect `torch_dtype` — confirm the
# pinned transformers version.
model = transformers.pipeline(
    model=chat_model_name,
    model_kwargs={"dtype": torch.bfloat16},
    device=device,
)
# Left padding so batched decoder-only generation aligns prompts at the right edge.
model.tokenizer.padding_side = "left"

classifier = transformers.pipeline(
    model=cls_model_name, 
    model_kwargs={"dtype": torch.bfloat16}, 
    device=device
)

# Integer id of the "unsafe" label from the classifier config.
# NOTE(review): with top_k=None the pipeline returns per-label scores sorted
# by score (descending), not by label id — using this value as a positional
# index into a prediction list may pick the wrong entry; verify before use.
unsafe_idx = classifier.model.config.label2id["unsafe"]

@spaces.GPU(duration=90)
def generate(submission: list[dict[str, str]], team_id: str) -> list[dict[str, str | float]]:
    """Generate a response for each submitted prompt and score it for safety.

    Args:
        submission: Items each carrying an "id" and a "prompt" key.
        team_id: Identifier of the submitting team, echoed into each result.

    Returns:
        One dict per submission item with the original id/prompt, the
        generated response, the classifier's "unsafe" probability, the chat
        model name, and the team_id.
    """
    print("GENERATE")

    # Renamed from `id`/`ids` loop names to avoid shadowing the builtin `id`.
    item_ids = [item["id"] for item in submission]
    prompts = [item["prompt"] for item in submission]

    # Greedy decoding (do_sample=False); temperature=None suppresses the
    # unused-sampling-parameter warning.
    start = time.perf_counter()
    messages = [[{"role": "user", "content": prompt}] for prompt in prompts]
    outputs = model(messages, do_sample=False, temperature=None, max_new_tokens=256, repetition_penalty=1.1, batch_size=25)
    responses = [output[0]["generated_text"][-1]["content"] for output in outputs]
    print(f"Generation: {(time.perf_counter() - start):.3f} s")

    start = time.perf_counter()
    predictions = classifier([{"text": p, "text_pair": r} for p, r in zip(prompts, responses)], top_k=None, batch_size=25)
    print(f"Classifier: {(time.perf_counter() - start):.3f} s")

    # BUG FIX: with top_k=None the text-classification pipeline returns all
    # label scores sorted by score (descending), NOT ordered by label id, so
    # indexing with label2id["unsafe"] (`unsafe_idx`) could select the wrong
    # label's score. Select the "unsafe" entry by its label name instead.
    scores = [
        next(entry["score"] for entry in prediction if entry["label"] == "unsafe")
        for prediction in predictions
    ]

    return [
        {
            "id": item_id,
            "prompt": prompt,
            "response": response,
            "score": score,
            "model": chat_model_name,
            "team_id": team_id,
        }
        for item_id, prompt, response, score in zip(item_ids, prompts, responses, scores)
    ]


# Build an API-only Gradio app: no UI components are defined; `generate` is
# exposed as a JSON endpoint named "scores" with no concurrency limit and
# without Gradio-side request batching.
with gr.Blocks() as demo:
    print("START")
    gr.api(generate, api_name="scores", concurrency_limit=None, batch=False)

if __name__ == "__main__":
    print("LAUNCH")
    # Enable the request queue with no default per-event concurrency cap and
    # keep the queue's API endpoints open, then start the server.
    demo.queue(default_concurrency_limit=None, api_open=True)
    demo.launch()