File size: 3,536 Bytes
87d8688
3e11881
b8b5a68
b8bda1a
3e11881
 
87d8688
be2cd9b
66c504a
3e11881
 
66c504a
3e11881
b8b5a68
 
 
66c504a
 
b8b5a68
87d8688
716de19
1b19c76
2873048
1b19c76
3e11881
 
92849b7
1b19c76
 
 
b96433e
2cbd116
b96433e
 
 
 
2cbd116
 
 
 
 
b96433e
563331b
b96433e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bfdc52b
b96433e
 
 
 
 
 
4de4cdd
3e11881
ff787d3
3e11881
1b19c76
 
 
563331b
1b19c76
716de19
 
 
 
 
 
563331b
1b19c76
3e11881
 
563331b
 
1b19c76
 
 
576c828
563331b
1b19c76
7a523b6
1b19c76
3e11881
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import gradio as gr
import spaces
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
from datetime import datetime

# Hugging Face model repo: SalamandraTA 7B instruction-tuned for Aranese.
model_id = "BSC-LT/SalamandraTA-7B-instructed-Aranese"
# HF access token read from the "AudreyVM" environment variable.
# NOTE(review): presumably the repo is gated/private and needs this token — confirm.
token = os.getenv("AudreyVM")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # let accelerate place layers on available devices
    torch_dtype=torch.bfloat16,   # bf16 halves memory vs fp32
    token=token
)

# Languages offered in the UI dropdowns, shown in alphabetical order.
languages = sorted([ 'Aragonese', 'Asturian', 'Basque', 'Bulgarian', 'Catalan', 'Catalan_Valencian', 'Croatian', 'Czech', 'Danish', 'Dutch', 'English', 'Estonian',
             'Finnish', 'French', 'Galician', 'German', 'Greek', 'Hungarian', 'Irish', 'Italian', 'Latvian', 'Lithuanian', 'Maltese', 'Norwegian Bokmål',
             'Norwegian Nynorsk', 'Occitan', 'Aranese', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Serbian_Cyrillic', 'Slovak', 'Slovenian', 'Spanish', 'Swedish',
             'Ukrainian', 'Welsh' ])

@spaces.GPU(duration=120)
def generate_output(source, target, input_text, mt_text=None):
    """Translate ``input_text`` line by line from ``source`` into ``target``.

    Parameters
    ----------
    source, target : str
        Language names as selected in the UI dropdowns. The UI value
        ``'Catalan_Valencian'`` is remapped to ``'Valencian'``, the name
        used in the model prompt.
    input_text : str
        Text to translate. Each line is translated as an independent
        sentence; empty lines are preserved in the output.
    mt_text : str, optional
        Unused; kept for backward-compatible signature.

    Returns
    -------
    tuple[str, str]
        The translated lines rejoined with newlines, plus an empty
        string used to clear the UI info label.
    """
    # Today's date is passed to the chat template.
    date_string = datetime.today().strftime('%Y-%m-%d')

    # The dropdowns expose 'Catalan_Valencian', but the prompt format
    # uses 'Valencian'. Remap here, at call time, where the actual
    # string values are available.
    if source == 'Catalan_Valencian':
        source = 'Valencian'
    if target == 'Catalan_Valencian':
        target = 'Valencian'

    generated_text = []
    for sentence in input_text.split('\n'):
        sentence = sentence.strip()

        if not sentence:
            # Preserve empty lines so output layout mirrors the input.
            generated_text.append('')
            continue

        prompt = f"Translate the following text from {source} into {target}.\n{source}: {sentence} \n{target}:"

        messages = [{"role": "user", "content": prompt}]
        final_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            date_string=date_string
        )

        inputs = tokenizer(final_prompt, return_tensors="pt", add_special_tokens=False).to(model.device)
        input_length = inputs.input_ids.shape[1]

        # Greedy decoding. `early_stopping=True` was removed: it only
        # applies to beam search (num_beams > 1) and merely raised a
        # transformers warning here.
        output = model.generate(
            input_ids=inputs.input_ids,
            max_new_tokens=4000,
            num_beams=1
        )

        # Decode only the newly generated tokens, skipping the prompt.
        decoded = tokenizer.decode(output[0, input_length:], skip_special_tokens=True).strip()
        generated_text.append(decoded)

    return '\n'.join(generated_text), ""

with gr.Blocks() as demo:
    gr.Markdown("# 🦎 SalamandraTA 7B - Aranese Demo")

    # Language selectors. The components (not their values) are wired to
    # the click handler; values are resolved at click time. The previous
    # `if source_lang == 'Catalan_Valencian'` checks compared Dropdown
    # *component objects* to strings — always False — so they were dead
    # code and have been removed; any per-value remapping belongs inside
    # the handler function, where the string values exist.
    with gr.Row():
        source_lang = gr.Dropdown(choices=languages, value="Catalan", label="Source Language")
        target_lang = gr.Dropdown(choices=languages, value="Aranese", label="Target Language")

    input_textbox = gr.Textbox(lines=6, placeholder="Enter source text here", label="Input Text")
    output_textbox = gr.Textbox(lines=6, label="Output")

    # Empty HTML label cleared/written by the handler's second output.
    info_label = gr.HTML("")
    translate_btn = gr.Button("Translate")
    translate_btn.click(generate_output, inputs=[source_lang, target_lang, input_textbox], outputs=[output_textbox, info_label])

    # Each example row must have exactly as many values as `inputs` has
    # components (3); the stray trailing "" fourth element was removed.
    gr.Examples(
        examples=[
            ["Catalan", "Aranese", "Als antics egipcis del període de l'Imperi Nou els fascinaven els monuments dels seus predecessors, que llavors tenien més de mil anys."],
        ],
        inputs=[source_lang, target_lang, input_textbox]
    )

# Start the Gradio server only when this file is executed as a script
# (not when imported, e.g. by the Spaces runtime).
if __name__ == "__main__":
    demo.launch()