# Hugging Face Space: Running on Zero (ZeroGPU)
# File size: 3,536 Bytes
# (Scraped page residue — commit-hash and line-number gutters — removed.)
import gradio as gr
import spaces
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
from datetime import datetime
# Gated Aranese translation model; access token is read from the Space
# secret named "AudreyVM" (None if the secret is not set).
model_id = "BSC-LT/SalamandraTA-7B-instructed-Aranese"
token = os.getenv("AudreyVM")
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # shard/place automatically on the available device(s)
    torch_dtype=torch.bfloat16,
    token=token
)
# Display names offered in both language dropdowns, alphabetically sorted.
# NOTE(review): 'Catalan_Valencian' is a UI display name; the model-facing
# name appears to be 'Valencian' (see the mapping attempted later in the file).
languages = sorted([ 'Aragonese', 'Asturian', 'Basque', 'Bulgarian', 'Catalan', 'Catalan_Valencian', 'Croatian', 'Czech', 'Danish', 'Dutch', 'English', 'Estonian',
    'Finnish', 'French', 'Galician', 'German', 'Greek', 'Hungarian', 'Irish', 'Italian', 'Latvian', 'Lithuanian', 'Maltese', 'Norwegian Bokmål',
    'Norwegian Nynorsk', 'Occitan', 'Aranese', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Serbian_Cyrillic', 'Slovak', 'Slovenian', 'Spanish', 'Swedish',
    'Ukrainian', 'Welsh' ])
@spaces.GPU(duration=120)
def generate_output(source, target, input_text, mt_text=None):
    """Translate ``input_text`` line by line from ``source`` into ``target``.

    Each non-empty line is translated independently with a chat-template
    prompt; empty lines are preserved so the output keeps the input's shape.

    Args:
        source: Source language display name (as shown in the dropdown).
        target: Target language display name.
        input_text: Text to translate; lines are split on ``\\n``.
        mt_text: Unused; kept for interface compatibility with existing callers.

    Returns:
        A 2-tuple ``(translated_text, "")`` — the second element clears the
        info label in the UI.
    """
    # The UI exposes the display name 'Catalan_Valencian'; the model expects
    # 'Valencian'. (The original attempted this mapping at module level on the
    # Dropdown components themselves, which never fired.)
    if source == 'Catalan_Valencian':
        source = 'Valencian'
    if target == 'Catalan_Valencian':
        target = 'Valencian'
    # Chat template requires a date string for the system prompt.
    date_string = datetime.today().strftime('%Y-%m-%d')
    generated_text = []
    for sentence in input_text.split('\n'):
        sentence = sentence.strip()
        if not sentence:
            # Preserve empty lines
            generated_text.append('')
            continue
        prompt = f"Translate the following text from {source} into {target}.\n{source}: {sentence} \n{target}:"
        messages = [{"role": "user", "content": prompt}]
        final_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            date_string=date_string
        )
        # The template already added special tokens; don't add them twice.
        inputs = tokenizer(final_prompt, return_tensors="pt", add_special_tokens=False).to(model.device)
        input_length = inputs.input_ids.shape[1]
        # Greedy decoding; `early_stopping` was dropped because it only
        # applies to beam search (num_beams > 1) and triggered a warning.
        output = model.generate(
            input_ids=inputs.input_ids,
            max_new_tokens=4000,
            num_beams=1
        )
        # Decode only the newly generated tokens, skipping the prompt.
        decoded = tokenizer.decode(output[0, input_length:], skip_special_tokens=True).strip()
        generated_text.append(decoded)
    return '\n'.join(generated_text), ""
# Gradio UI: two language dropdowns, input/output textboxes, and a button
# wired to generate_output (which returns (translation, info-label-html)).
with gr.Blocks() as demo:
    gr.Markdown("# 🦎 SalamandraTA 7B - Aranese Demo")
    with gr.Row():
        source_lang = gr.Dropdown(choices=languages, value="Catalan", label="Source Language")
        target_lang = gr.Dropdown(choices=languages, value="Aranese", label="Target Language")
    # NOTE(review): removed dead code that compared the Dropdown *components*
    # (not their selected values) to 'Catalan_Valencian' — that comparison is
    # always False at build time. Display-name mapping must be done on the
    # event-handler side, where the selected string values are available.
    input_textbox = gr.Textbox(lines=6, placeholder="Enter source text here", label="Input Text")
    output_textbox = gr.Textbox(lines=6, label="Output")
    info_label = gr.HTML("")
    translate_btn = gr.Button("Translate")
    translate_btn.click(
        generate_output,
        inputs=[source_lang, target_lang, input_textbox],
        outputs=[output_textbox, info_label],
    )
    # Example row must have exactly as many values as `inputs` (3); the
    # original carried a stray 4th value, which gradio rejects.
    gr.Examples(
        examples=[
            ["Catalan", "Aranese", "Als antics egipcis del període de l'Imperi Nou els fascinaven els monuments dels seus predecessors, que llavors tenien més de mil anys."],
        ],
        inputs=[source_lang, target_lang, input_textbox],
    )

if __name__ == "__main__":
    demo.launch()