Spaces:

dinalzein
/

LanguageIdentifier

Runtime error

App Files Files Community

dinalzein committed on May 24, 2022

Commit

409b791

1 Parent(s): a00b2b5

add app file

Browse files

Files changed (1) hide show

app.py +84 -0

app.py ADDED Viewed

	@@ -0,0 +1,84 @@

# Third-party libraries.
import gradio as gr
import numpy as np
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
)

# Local lookup tables used to turn a predicted class index into a language.
from mapping_labels import id2label, languages_map

# Hub identifier of the fine-tuned checkpoint; the tokenizer and the model
# weights are both loaded from it.
model_checkpoint = "dinalzein/xlm-roberta-base-finetuned-language-identification"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

# In this file the Trainer is only used for its predict() method (inference).
trainer = Trainer(model=model)
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a sequence holding one entry
            per example. It must contain the key ``"input_ids"``, which
            defines the dataset length.
        labels: Optional sequence of per-example labels. When it is ``None``
            or empty, items are returned without a ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict mapping feature name to tensor."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Check for None / emptiness explicitly: truth-testing a NumPy array
        # or tensor with more than one element raises instead of returning
        # a bool, so a plain `if self.labels:` breaks for array-like labels.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (length of ``input_ids``)."""
        return len(self.encodings["input_ids"])
def identify_language(txt):
    """Predict the language of a single piece of text.

    The text is tokenized as a one-element batch (truncated to at most 20
    tokens), run through the module-level ``trainer``, and the highest-scoring
    class index is mapped through ``id2label`` and then ``languages_map``.

    Args:
        txt: The text whose language should be identified.

    Returns:
        The ``languages_map`` entry for the predicted label (presumably a
        human-readable language name; confirm against ``mapping_labels``).
    """
    encoded = tokenizer([txt], truncation=True, max_length=20)
    prediction = trainer.predict(Dataset(encoded))
    # One row of scores per input; take the best-scoring class of each row.
    best_class_ids = np.argmax(prediction.predictions, axis=1)
    label = id2label[str(best_class_ids[0])]
    return languages_map[label]
# Sample sentences passed to gr.Interface as clickable examples.
examples = [
    "C'est La Vie",
    "So ist das Leben",
    "That is life",
    "هذه هي الحياة",
]

# NOTE(review): gr.inputs / gr.outputs are legacy Gradio namespaces; newer
# Gradio releases expose gr.Textbox / gr.Label directly. Confirm the Gradio
# version pinned for this Space before migrating.
inputs = gr.inputs.Textbox(
    placeholder="Enter your text here",
    label="Text content",
    lines=5,
)
outputs = gr.outputs.Label(label="Language Identified:")

# Markdown passed to gr.Interface as the `article` text.
article = '''## Supported Languages \n
* Arabic (ar)
* Bulgarian (bg)
* German (de)
* Modern Greek (el)
* English (en)
* Spanish (es)
* French (fr)
* Hindi (hi)
* Italian (it)
* Japanese (ja)
* Dutch (nl)
* Polish (pl)
* Portuguese (pt)
* Russian (ru)
* Swahili (sw)
* Thai (th)
* Turkish (tr)
* Urdu (ur)
* Vietnamese (vi)
* Chinese (zh)
'''
# Build the web UI around identify_language and start serving it.
gr.Interface(
    fn=identify_language,
    inputs=inputs,
    outputs=outputs,
    verbose=True,
    examples=examples,
    title="Language Identifier",
    description=(
        "It aims at identifying the language a document is written in. "
        "It supports 20 different languages."
    ),
    article=article,
    theme="huggingface",
).launch()