# kbd-pos-tagger / app.py
# panagoa
# Add application file
# e5faaae
import string
import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
# Tokenizer and token-classification model are both fetched from the same hub
# repository, once, at import time.
_MODEL_ID = 'panagoa/xlm-roberta-base-kbd-pos-tagger'
tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(_MODEL_ID)
def predict_pos_tags(text):
    """
    Tag every word of ``text`` with its part of speech.

    The text is split on whitespace, ASCII punctuation is stripped from both
    ends of each piece, and pieces that end up empty are dropped. The
    remaining words are tokenized as a pre-split sequence and passed through
    the module-level ``model``; each word receives the label predicted for
    its first sub-token.

    Args:
        text: Raw input string. ``None`` is treated like an empty string.

    Returns:
        One ``"word: TAG"`` line per tagged word, joined with newlines.
        An empty string is returned when the input contains no words
        (empty, whitespace only, or punctuation only). Words for which the
        encoding holds no sub-token (for example, words cut off by
        truncation) are left out instead of being paired with a tag that
        belongs to a different token.
    """
    # Split text into words, removing punctuation
    stripped = (piece.strip(string.punctuation) for piece in (text or "").split())
    words = [word for word in stripped if word]
    if not words:
        # Nothing to tag: skip the tokenizer and the model call entirely.
        return ""

    # Set up device (CPU/GPU)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    # Tokenize input data
    encoded_input = tokenizer(
        words,
        truncation=True,
        is_split_into_words=True,
        return_tensors='pt'
    )
    inputs = {k: v.to(device) for k, v in encoded_input.items()}

    # Get predictions from the model
    with torch.no_grad():
        outputs = model(**inputs)
    # Single sequence in the batch: one predicted label id per token.
    label_ids = torch.argmax(outputs.logits, dim=2)[0].tolist()

    # Map tokens to words. Tags are keyed by word index rather than collected
    # positionally: special tokens carry a word id of None, and pairing by
    # position would hand the tag of a trailing special token to the first
    # word that did not make it into the encoding.
    tag_by_word = {}
    for position, word_idx in enumerate(encoded_input.word_ids()):
        if word_idx is None or word_idx in tag_by_word:
            # Skip special tokens and every sub-token after a word's first.
            continue
        tag_by_word[word_idx] = model.config.id2label[label_ids[position]]

    # Format the result
    return "\n".join(
        f"{word}: {tag_by_word[index]}"
        for index, word in enumerate(words)
        if index in tag_by_word
    )
# Example sentences for demonstration
# Kabardian sample inputs; the list is passed to gr.Examples in the interface
# so each sentence can be loaded into the input box.
examples = [
    "Iуащхьэмахуэ лъапэ щаухуащ хьэщIэщхэмрэ кIапсэ гъуэгухэмрэ.",
    "Арати, зы жэщым щIалэм псори фIэкIуэдащ.",
    "Мадинэ и пэшым дэкIуеижри, хущхъуэхэм ефащ.",
    "Апхуэдэ цIыху къабзэ куэди бгъуэтынукъым.",
]
# Create Gradio interface
# NOTE(review): the nesting of the layout contexts below is reconstructed from
# flattened text — confirm it against the deployed app.
# The Markdown bodies are kept flush-left so the string literals carry no
# leading indentation.
with gr.Blocks(title="XLM-RoBERTa POS Tagger for Kabardian") as demo:
    # Page heading followed by a short description of the model and dataset.
    gr.Markdown("# 🏷️ XLM-RoBERTa POS Tagger for Kabardian")
    gr.Markdown(
        """
This application identifies Parts of Speech (POS) in text using the
[panagoa/xlm-roberta-base-kbd-pos-tagger](https://huggingface.co/panagoa/xlm-roberta-base-kbd-pos-tagger) model.
The model is specifically fine-tuned for Kabardian language (адыгэбзэ) but also works with other languages.
It was trained on the [panagoa/kbd-pos-tags](https://huggingface.co/datasets/panagoa/kbd-pos-tags) dataset
containing 82,925 tagged sentences in Kabardian.
"""
    )
    with gr.Row():
        # Input side: text box, the Analyze button and the example sentences.
        with gr.Column(scale=6):
            input_text = gr.Textbox(
                label="Text for analysis",
                placeholder="Enter text in Kabardian or another language...",
                lines=3
            )
            with gr.Row():
                submit_btn = gr.Button("Analyze", variant="primary")
            gr.Examples(
                examples,
                inputs=[input_text],
                label="Example Kabardian sentences"
            )
        # Output side: text box that receives the string returned by
        # predict_pos_tags.
        with gr.Column(scale=4):
            output_text = gr.Textbox(
                label="POS Tagging Results",
                lines=12
            )
    # Reference table of the tag set; the accordion starts closed (open=False).
    with gr.Accordion("About POS Tags", open=False):
        gr.Markdown("""
## POS Tags Supported
The model identifies 17 different POS tags:
| Tag | Description | Examples |
|-----|-------------|----------|
| `ADJ` | Adjective | хужь (white), къабзэ (clean) |
| `ADP` | Adposition | щхьэкIэ (for), папщIэ (because of) |
| `ADV` | Adverb | псынщIэу (quickly), жыжьэу (far) |
| `AUX` | Auxiliary | хъунщ (will be), щытащ (was) |
| `CCONJ` | Coordinating conjunction | икIи (and), ауэ (but) |
| `DET` | Determiner | мо (that), мыпхуэдэ (this kind) |
| `INTJ` | Interjection | уэлэхьи (by God), зиунагъуэрэ (oh my) |
| `NOUN` | Noun | унэ (house), щIалэ (boy) |
| `NUM` | Numeral | зы (one), тIу (two) |
| `PART` | Particle | мы (this), а (that) |
| `PRON` | Pronoun | сэ (I), уэ (you) |
| `PROPN` | Proper noun | Мурат (Murat), Налшык (Nalchik) |
| `PUNCT` | Punctuation | . (period), , (comma) |
| `SCONJ` | Subordinating conjunction | щхьэкIэ (because), щыгъуэ (when) |
| `SYM` | Symbol | % (percent), $ (dollar) |
| `VERB` | Verb | мэкIуэ (goes), матхэ (writes) |
| `X` | Other | - |
""")
    # Event handlers
    # Clicking Analyze sends the input text through predict_pos_tags and
    # writes the returned string into the results box.
    submit_btn.click(fn=predict_pos_tags, inputs=[input_text], outputs=[output_text])
# Launch the app
# demo.launch() is called only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    demo.launch()