"""Gradio demo: POS tagging for Kabardian with an XLM-RoBERTa token classifier."""

import string

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

MODEL_NAME = 'panagoa/xlm-roberta-base-kbd-pos-tagger'

# Load tokenizer and model once at import time.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# Pick the device once at startup instead of on every request, and switch the
# model to inference mode (disables dropout etc., if the checkpoint has any).
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(DEVICE)
model.eval()


def predict_pos_tags(text):
    """Tag each word of *text* with its part of speech.

    Splits the input on whitespace, strips surrounding punctuation, runs the
    token-classification model, and returns one ``word: TAG`` line per word.

    Args:
        text: Free-form input string (Kabardian or another language).

    Returns:
        A newline-joined string of ``word: TAG`` pairs; the empty string when
        the input contains no words after punctuation stripping.
    """
    # Strip surrounding punctuation and drop tokens that were pure punctuation.
    words = [word.strip(string.punctuation) for word in text.split()]
    words = [word for word in words if word]
    if not words:
        # Guard: the tokenizer raises on an empty pre-split batch.
        return ""

    # Tokenize the pre-split words (each word may become several sub-tokens).
    encoded_input = tokenizer(
        words,
        truncation=True,
        is_split_into_words=True,
        return_tensors='pt'
    )
    inputs = {k: v.to(DEVICE) for k, v in encoded_input.items()}

    # Forward pass without gradient tracking; argmax over the label dimension.
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

    # Take the prediction of the FIRST sub-token of each word. Special tokens
    # ([CLS]/[SEP]) have word_id None and are skipped explicitly, so the
    # trailing [SEP] no longer appends a spurious extra tag.
    word_ids = encoded_input.word_ids()
    previous_word_idx = None
    predicted_tags = []
    for idx, word_idx in enumerate(word_ids):
        if word_idx is not None and word_idx != previous_word_idx:
            predicted_tags.append(model.config.id2label[predictions[0][idx].item()])
        previous_word_idx = word_idx

    # One "word: TAG" line per word.
    return "\n".join(f"{word}: {tag}" for word, tag in zip(words, predicted_tags))


# Example sentences for demonstration
examples = [
    "Iуащхьэмахуэ лъапэ щаухуащ хьэщIэщхэмрэ кIапсэ гъуэгухэмрэ.",
    "Арати, зы жэщым щIалэм псори фIэкIуэдащ.",
    "Мадинэ и пэшым дэкIуеижри, хущхъуэхэм ефащ.",
    "Апхуэдэ цIыху къабзэ куэди бгъуэтынукъым.",
]

# Create Gradio interface
with gr.Blocks(title="XLM-RoBERTa POS Tagger for Kabardian") as demo:
    gr.Markdown("# 🏷️ XLM-RoBERTa POS Tagger for Kabardian")
    gr.Markdown(
        """ This application identifies Parts of Speech (POS) in text using the [panagoa/xlm-roberta-base-kbd-pos-tagger](https://huggingface.co/panagoa/xlm-roberta-base-kbd-pos-tagger) model. The model is specifically fine-tuned for Kabardian language (адыгэбзэ) but also works with other languages. It was trained on the [panagoa/kbd-pos-tags](https://huggingface.co/datasets/panagoa/kbd-pos-tags) dataset containing 82,925 tagged sentences in Kabardian. """
    )

    with gr.Row():
        with gr.Column(scale=6):
            input_text = gr.Textbox(
                label="Text for analysis",
                placeholder="Enter text in Kabardian or another language...",
                lines=3
            )
            with gr.Row():
                submit_btn = gr.Button("Analyze", variant="primary")
            gr.Examples(
                examples,
                inputs=[input_text],
                label="Example Kabardian sentences"
            )
        with gr.Column(scale=4):
            output_text = gr.Textbox(
                label="POS Tagging Results",
                lines=12
            )

    with gr.Accordion("About POS Tags", open=False):
        gr.Markdown(""" ## POS Tags Supported The model identifies 17 different POS tags: | Tag | Description | Examples | |-----|-------------|----------| | `ADJ` | Adjective | хужь (white), къабзэ (clean) | | `ADP` | Adposition | щхьэкIэ (for), папщIэ (because of) | | `ADV` | Adverb | псынщIэу (quickly), жыжьэу (far) | | `AUX` | Auxiliary | хъунщ (will be), щытащ (was) | | `CCONJ` | Coordinating conjunction | икIи (and), ауэ (but) | | `DET` | Determiner | мо (that), мыпхуэдэ (this kind) | | `INTJ` | Interjection | уэлэхьи (by God), зиунагъуэрэ (oh my) | | `NOUN` | Noun | унэ (house), щIалэ (boy) | | `NUM` | Numeral | зы (one), тIу (two) | | `PART` | Particle | мы (this), а (that) | | `PRON` | Pronoun | сэ (I), уэ (you) | | `PROPN` | Proper noun | Мурат (Murat), Налшык (Nalchik) | | `PUNCT` | Punctuation | . (period), , (comma) | | `SCONJ` | Subordinating conjunction | щхьэкIэ (because), щыгъуэ (when) | | `SYM` | Symbol | % (percent), $ (dollar) | | `VERB` | Verb | мэкIуэ (goes), матхэ (writes) | | `X` | Other | - | """)

    # Event handlers
    submit_btn.click(fn=predict_pos_tags, inputs=[input_text], outputs=[output_text])

# Launch the app
if __name__ == "__main__":
    demo.launch()