File size: 5,045 Bytes
ed57b89
 
 
 
 
 
d9c3b6f
ed57b89
 
 
 
 
 
 
 
 
fac72ab
ed57b89
50712a4
ed57b89
 
 
 
 
4682299
ed57b89
 
 
6f2d365
ed57b89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50712a4
ed57b89
 
 
 
 
 
 
 
 
 
50712a4
ed57b89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d708cc3
 
 
 
 
d9c3b6f
d708cc3
 
 
 
d9c3b6f
 
ed57b89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import spaces
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, GenerationConfig
import torch
from transformers import pipeline
import pandas as pd
import gradio as gr
from concurrent.futures import ThreadPoolExecutor

quantization_config = BitsAndBytesConfig(load_in_4bit=True)

qwen_model_name = "Qwen/Qwen3-0.6B"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(qwen_model_name)
qwen_model = AutoModelForCausalLM.from_pretrained(
    qwen_model_name,
    torch_dtype=torch.bfloat16, #testing for underflow issues
    device_map="auto",
    quantization_config = quantization_config,
)

qwen_generationconfig = GenerationConfig(
    max_new_tokens=512,
    temperature = 0.7,
    top_p = 0.8,
    min_p = 0
)

@spaces.GPU
def qwen_generate(input_question):
    # prepare the model input
    messages = [
        {"role": "user", "content": input_question}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False # Switches between thinking and non-thinking modes. Default is True.
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(qwen_model.device)
    print("tokenized")
    # conduct text completion
    generated_ids = qwen_model.generate(
        **model_inputs,
        generation_config = qwen_generationconfig,
    )
    print("outputs generated")
    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 
    
    content = tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
    return content
    
#Llama 2 7b chat setup
llama2_model_id = "unsloth/gemma-3-1b-it-bnb-4bit"
llama2_pipe = pipeline(
    "text-generation",
    model=llama2_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    model_kwargs={"quantization_config": quantization_config},
    # quantization_config=quantization_config,
)

#Llama 3.2 3b setup
llama3_model_id = "meta-llama/Llama-3.2-3B-Instruct"
llama3_pipe = pipeline(
    "text-generation",
    model=llama3_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    model_kwargs={"quantization_config": quantization_config},
)

@spaces.GPU
def llama_QA(input_question, pipe):
    """
    stupid func for asking llama a question and then getting an answer
    inputs:
    - input_question [str]: question for llama to answer
    outputs:
    - response [str]: llama's response
    """
    messages = [
    {"role": "system", "content": "You are a helpful chatbot assistant. Answer all questions in the language they are asked in. Exclude any answer that you do not have real time information, just provide the information you have to answer this question."},
    {"role": "user", "content": input_question},
    ]

    outputs = pipe(
        messages,
        max_new_tokens=512
    )
    response = outputs[0]["generated_text"][-1]['content']
    return response


@spaces.GPU
def gradio_func(input_question, left_lang, right_lang):
    """
    silly wrapper function for gradio that turns all inputs into a single func. runs both the LHS and RHS of teh 'app' in order to let gradio work correctly.
    """
    # with ThreadPoolExecutor(max_workers=3) as executor:
    #     # Submit all tasks simultaneously
    #     future1 = executor.submit(qwen_generate, input_question)
    #     future2 = executor.submit(llama_QA, input_question, llama2_pipe)
    #     future3 = executor.submit(llama_QA, input_question, llama3_pipe)
        
    # Collect results
    output1 = qwen_generate(input_question) #future1.result()
    output2 = llama_QA(input_question, llama2_pipe) #future2.result()
    output3 = llama_QA(input_question, llama3_pipe) #future3.result()
        
    return output1, output2, output3

# Create the Gradio interface
def create_interface():
    
    with gr.Blocks() as demo:
        with gr.Row():
            question_input = gr.Textbox(label="Enter your question", interactive=True, value = """You are at a train track junction, with a lever in front of you that changes the active track. On one track, one of your loved ones is tied to the track, on the other, there are 5 strangers tied up, one after the other. If you do not use the lever in front of you to change the active track, then the 5 strangers die. If you do pull the lever, your loved one dies. What do you do?""")
        with gr.Row():
            submit_btn = gr.Button("Ask")
        with gr.Row():
            output1 = gr.Textbox(label="Qwen 3 output", interactive=False)
            output2 = gr.Textbox(label="Gemma 3 output", interactive=False)
            output3 = gr.Textbox(label="Llama 3 output", interactive=False)
            
        submit_btn.click(
            fn=gradio_func,
            inputs=[question_input],
            outputs=[
                    output1,
                     output2,
                     output3,
                    ]
        )
    
    return demo

# Launch the app
demo = create_interface()
demo.launch()