SalamandraTA-7B-Aranese-Optimised

Sleeping

App Files Community

javi8979 committed on Mar 21, 2025

Commit

b96433e

verified ·

1 Parent(s): 4de4cdd

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -50

app.py CHANGED Viewed

@@ -24,61 +24,52 @@ languages = sorted([ 'Aragonese', 'Asturian', 'Basque', 'Bulgarian', 'Catalan',
 def generate_output(task, source, target, input_text, mt_text=None):
     date_string = datetime.today().strftime('%Y-%m-%d')
-    if task == "Translation":
-        prompt = f"Translate the following text from {source} into {target}.\n{source}: {input_text.strip()} \n{target}:"
-    elif task == "Post-editing":
-        if not mt_text:
-            return "Please provide machine translation (MT) for post-editing.", ""
-        prompt = f"Please fix any mistakes in the following {source}-{target} machine translation or keep it unedited if it's correct.\nSource: {input_text.strip()} \nMT: {mt_text.strip()} \nCorrected:"
-    elif task == "Document translation":
-        prompt = f"Please translate this text from {source} into {target}.\n{source}: {input_text.strip()}\n{target}:"
-    elif task == "Grammar checker":
-        prompt = f"Please fix any mistakes in the following {source} sentence or keep it unedited if it's correct.\nSentence: {input_text.strip()} \nCorrected:"
-    elif task == "Named-entity recognition":
-        prompt = """Analyse the following tokenized text and mark the tokens containing named entities.
-Use the following annotation guidelines with these tags for named entities:
-- ORG (Refers to named groups or organizations)
-- PER (Refers to individual people or named groups of people)
-- LOC (Refers to physical places or natural landmarks)
-- MISC (Refers to entities that don't fit into standard categories).
-Prepend B- to the first token of a given entity and I- to the remaining ones if they exist.
-If a token is not a named entity, label it as O.
-Input: """ + str(input_text.strip()) + "\nMarked:"
-    messages = [{"role": "user", "content": prompt}]
-    final_prompt = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True,
-        date_string=date_string
-    )
-    inputs = tokenizer(final_prompt, return_tensors="pt", add_special_tokens=False).to(model.device)
-    input_length = inputs.input_ids.shape[1]
-    output = model.generate(
-        input_ids=inputs.input_ids,
-        max_new_tokens=512,
-        early_stopping=True,
-        num_beams=5
-    )
-    decoded = tokenizer.decode(output[0, input_length:], skip_special_tokens=True).strip()
-    return decoded, ""
-doc_level_example = """President Donald Trump, who campaigned on promises to crack down on illegal immigration, has raised alarms in the U.S. dairy industry with his threat to impose 25% tariffs on Mexico and Canada by February 2025. This move is part of a broader strategy to declare a national emergency at the southern border to halt illegal migration completely.
-However, the implications for the agriculture sector, particularly dairy, are significant. Approximately half of the U.S. dairy industry's workforce consists of immigrant labor, many of whom are undocumented. The National Milk Producers Federation estimates that removing immigrant workers could decimate the dairy herd by 2.1 million cows and slash milk production by nearly 50 billion pounds, leading to a dramatic 90.4% increase in milk prices.
-The complex perspectives of Americans on undocumented workers were highlighted in a Pew Research Center study. While 64% of U.S. adults support legal pathways for undocumented immigrants, 35% oppose it—a gap that has been narrowing recently. Factors influencing public opinion include the belief that immigrants should have jobs and pass security checks, contrasted by concerns about lawbreakers being rewarded, fairness for legal migrants, and resource allocation.
-According to Zach Rutledge, an agricultural economist at Michigan State University, as nations grow wealthier, their labor forces transition away from agriculture toward sectors like services and manufacturing. This shift has led to the U.S. relying heavily on immigrant labor for agricultural work. Domestic workers, even with employment taxes, may cost $15 to $25 an hour, while H-2A visa program workers might cost $25 to $30 an hour, accounting for additional housing expenses.
-The National Milk Producers Federation has been vocal in advocating for changes to the H-2A visa program, which outside of its current seasonal limitations, does not support the dairy industry's year-round labor needs. Executive vice-president Jaime Castaneda reiterated the need for legislative clarity to address the undocumented workforce issues in dairy farming.
-The Farm Workforce Modernization Act of 2023, which could grant legal status to certain undocumented farmworkers, has been stalled in Congress, despite acknowledgment of the sector's importance to feeding America. The need for coordinated legislative efforts to ensure both border security and labor market stability is imperative moving forward."""
 with gr.Blocks() as demo:
     gr.Markdown("# 🦎 SalamandraTA 7B - Multitask Demo")
     gr.Markdown("Explore the translation, grammar correction, NER and post-editing capabilities of the SalamandraTA 7B model.")
     with gr.Row():
-        task_selector = gr.Radio(["Translation", "Document translation", "Post-editing", "Grammar checker", "Named-entity recognition"], value="Translation", label="Select Task")
     with gr.Row():
         source_lang = gr.Dropdown(choices=languages, value="Catalan", label="Source Language")
@@ -97,8 +88,6 @@ with gr.Blocks() as demo:
             ["Translation", "Catalan", "Galician", "Als antics egipcis del període de l'Imperi Nou els fascinaven els monuments dels seus predecessors, que llavors tenien més de mil anys.", ""],
             ["Post-editing", "Catalan", "English", "Rafael Nadal i Maria Magdalena van inspirar a una generació sencera.", "Rafael Christmas and Maria the Muffin inspired an entire generation each in their own way."],
             ["Grammar checker", "Catalan", "", "Entonses, el meu jefe m’ha dit que he de treballar els fins de setmana.", ""],
-            ["Named-entity recognition", "", "", "['La', 'defensa', 'del', 'antiguo', 'responsable', 'de', 'la', 'RFEF', 'confirma', 'que', 'interpondrá', 'un', 'recurso.']", ""],
-            ["Document translation", "English", "Asturian", doc_level_example, ""]
         ],
         inputs=[task_selector, source_lang, target_lang, input_textbox, mt_textbox]
     )

 def generate_output(task, source, target, input_text, mt_text=None):
     date_string = datetime.today().strftime('%Y-%m-%d')
+    sentences = input_text.split('\n')
+    sentences = [s for s in sentences if len(s.strip()) > 0]
+    generated_text = []
+    for sentence in sentences:
+        sentence = sentence.strip()
+        if task == "Translation":
+            prompt = f"Translate the following text from {source} into {target}.\n{source}: {sentence.strip()} \n{target}:"
+        elif task == "Post-editing":
+            if not mt_text:
+                return "Please provide machine translation (MT) for post-editing.", ""
+            prompt = f"Please fix any mistakes in the following {source}-{target} machine translation or keep it unedited if it's correct.\nSource: {sentence.strip()} \nMT: {mt_text.strip()} \nCorrected:"
+        elif task == "Grammar checker":
+            prompt = f"Please fix any mistakes in the following {source} sentence or keep it unedited if it's correct.\nSentence: {sentence.strip()} \nCorrected:"
+        messages = [{"role": "user", "content": prompt}]
+        final_prompt = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+            date_string=date_string
+        )
+        inputs = tokenizer(final_prompt, return_tensors="pt", add_special_tokens=False).to(model.device)
+        input_length = inputs.input_ids.shape[1]
+        output = model.generate(
+            input_ids=inputs.input_ids,
+            max_new_tokens=4000,
+            early_stopping=True,
+            num_beams=5
+        )
+        decoded = tokenizer.decode(output[0, input_length:], skip_special_tokens=True).strip()
+        generated_text.append(decoded)
+    return '\n'.join(generated_text), ""
 with gr.Blocks() as demo:
     gr.Markdown("# 🦎 SalamandraTA 7B - Multitask Demo")
     gr.Markdown("Explore the translation, grammar correction, NER and post-editing capabilities of the SalamandraTA 7B model.")
     with gr.Row():
+        task_selector = gr.Radio(["Translation", "Post-editing", "Grammar checker"], value="Translation", label="Select Task")
     with gr.Row():
         source_lang = gr.Dropdown(choices=languages, value="Catalan", label="Source Language")
             ["Translation", "Catalan", "Galician", "Als antics egipcis del període de l'Imperi Nou els fascinaven els monuments dels seus predecessors, que llavors tenien més de mil anys.", ""],
             ["Post-editing", "Catalan", "English", "Rafael Nadal i Maria Magdalena van inspirar a una generació sencera.", "Rafael Christmas and Maria the Muffin inspired an entire generation each in their own way."],
             ["Grammar checker", "Catalan", "", "Entonses, el meu jefe m’ha dit que he de treballar els fins de setmana.", ""],
         ],
         inputs=[task_selector, source_lang, target_lang, input_textbox, mt_textbox]
     )