Spaces:
Sleeping
Sleeping
| import string | |
| import gradio as gr | |
| from transformers import AutoTokenizer, AutoModelForTokenClassification | |
| import torch | |
# Hugging Face checkpoint fine-tuned for Kabardian POS tagging; a single
# constant keeps the tokenizer and model guaranteed to come from the same repo.
_CHECKPOINT = 'panagoa/xlm-roberta-base-kbd-pos-tagger'

# Load tokenizer and model once at import time so requests reuse them.
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(_CHECKPOINT)
def predict_pos_tags(text: str) -> str:
    """
    Perform POS tagging on the input text.

    Splits the text into words (stripping surrounding punctuation),
    tokenizes them with ``is_split_into_words=True``, runs the token
    classification model, and maps sub-token predictions back to words
    by taking the tag of each word's first sub-token.

    Args:
        text: Input sentence (Kabardian or another language).

    Returns:
        Newline-separated ``"word: TAG"`` lines, one per word; an empty
        string when the text contains no words after punctuation stripping.
    """
    # Split text into words, removing surrounding punctuation and
    # dropping tokens that were pure punctuation.
    words = [word.strip(string.punctuation) for word in text.split()]
    words = [word for word in words if word]

    # Guard: calling the tokenizer with an empty word list fails downstream,
    # so answer empty input with an empty result instead of raising.
    if not words:
        return ""

    # Set up device (CPU/GPU) and move the shared model accordingly.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    # Tokenize the pre-split words; truncation caps over-long inputs at
    # the model's maximum sequence length.
    encoded_input = tokenizer(
        words,
        truncation=True,
        is_split_into_words=True,
        return_tensors='pt'
    )
    inputs = {k: v.to(device) for k, v in encoded_input.items()}

    # Get predictions from the model (inference only — no gradients).
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

    # Map sub-tokens back to words. Special tokens (CLS/SEP) have a word id
    # of None and must be skipped explicitly — the original code relied on
    # list slicing to discard a spurious tag emitted for the trailing SEP.
    word_ids = encoded_input.word_ids()
    previous_word_idx = None
    predicted_tags = []
    for idx, word_idx in enumerate(word_ids):
        # Only the first sub-token of each real word contributes a tag.
        if word_idx is not None and word_idx != previous_word_idx:
            predicted_tags.append(model.config.id2label[predictions[0][idx].item()])
        previous_word_idx = word_idx

    # Format the result as one "word: TAG" line per word. Truncation may
    # leave fewer tags than words; zip stops at the shorter sequence.
    return "\n".join(f"{word}: {tag}" for word, tag in zip(words, predicted_tags))
# Example Kabardian sentences for demonstration; wired into the Gradio
# "Examples" widget so users can try the tagger with one click.
examples = [
    "Iуащхьэмахуэ лъапэ щаухуащ хьэщIэщхэмрэ кIапсэ гъуэгухэмрэ.",
    "Арати, зы жэщым щIалэм псори фIэкIуэдащ.",
    "Мадинэ и пэшым дэкIуеижри, хущхъуэхэм ефащ.",
    "Апхуэдэ цIыху къабзэ куэди бгъуэтынукъым.",
]
# Create Gradio interface. Layout: a two-column row (input + controls on the
# left, results on the right) followed by a collapsible tag-reference table.
# NOTE(review): the original dump had lost all indentation; the nesting below
# is the natural reconstruction — confirm against the deployed Space.
with gr.Blocks(title="XLM-RoBERTa POS Tagger for Kabardian") as demo:
    gr.Markdown("# 🏷️ XLM-RoBERTa POS Tagger for Kabardian")
    gr.Markdown(
        """
        This application identifies Parts of Speech (POS) in text using the
        [panagoa/xlm-roberta-base-kbd-pos-tagger](https://huggingface.co/panagoa/xlm-roberta-base-kbd-pos-tagger) model.
        The model is specifically fine-tuned for Kabardian language (адыгэбзэ) but also works with other languages.
        It was trained on the [panagoa/kbd-pos-tags](https://huggingface.co/datasets/panagoa/kbd-pos-tags) dataset
        containing 82,925 tagged sentences in Kabardian.
        """
    )
    with gr.Row():
        # Left column: text input, submit button, and clickable examples.
        with gr.Column(scale=6):
            input_text = gr.Textbox(
                label="Text for analysis",
                placeholder="Enter text in Kabardian or another language...",
                lines=3
            )
            with gr.Row():
                submit_btn = gr.Button("Analyze", variant="primary")
            gr.Examples(
                examples,
                inputs=[input_text],
                label="Example Kabardian sentences"
            )
        # Right column: read-only results box.
        with gr.Column(scale=4):
            output_text = gr.Textbox(
                label="POS Tagging Results",
                lines=12
            )
    # Collapsed reference table describing the tag set the model emits.
    with gr.Accordion("About POS Tags", open=False):
        gr.Markdown("""
        ## POS Tags Supported
        The model identifies 17 different POS tags:
        | Tag | Description | Examples |
        |-----|-------------|----------|
        | `ADJ` | Adjective | хужь (white), къабзэ (clean) |
        | `ADP` | Adposition | щхьэкIэ (for), папщIэ (because of) |
        | `ADV` | Adverb | псынщIэу (quickly), жыжьэу (far) |
        | `AUX` | Auxiliary | хъунщ (will be), щытащ (was) |
        | `CCONJ` | Coordinating conjunction | икIи (and), ауэ (but) |
        | `DET` | Determiner | мо (that), мыпхуэдэ (this kind) |
        | `INTJ` | Interjection | уэлэхьи (by God), зиунагъуэрэ (oh my) |
        | `NOUN` | Noun | унэ (house), щIалэ (boy) |
        | `NUM` | Numeral | зы (one), тIу (two) |
        | `PART` | Particle | мы (this), а (that) |
        | `PRON` | Pronoun | сэ (I), уэ (you) |
        | `PROPN` | Proper noun | Мурат (Murat), Налшык (Nalchik) |
        | `PUNCT` | Punctuation | . (period), , (comma) |
        | `SCONJ` | Subordinating conjunction | щхьэкIэ (because), щыгъуэ (when) |
        | `SYM` | Symbol | % (percent), $ (dollar) |
        | `VERB` | Verb | мэкIуэ (goes), матхэ (writes) |
        | `X` | Other | - |
        """)
    # Event handlers: clicking "Analyze" runs the tagger on the input box
    # and writes the formatted result to the output box.
    submit_btn.click(fn=predict_pos_tags, inputs=[input_text], outputs=[output_text])
# Launch the app only when run as a script (not when imported, e.g. by a
# Hugging Face Spaces runner that serves `demo` itself).
if __name__ == "__main__":
    demo.launch()