"""Gradio demo: POS tagging for Kabardian with an XLM-RoBERTa token classifier."""

import string

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

MODEL_NAME = 'panagoa/xlm-roberta-base-kbd-pos-tagger'

# Load tokenizer and model once at import time.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# Pick the device once at startup instead of on every request, and switch the
# model to inference mode (disables dropout etc., if the checkpoint has any).
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(DEVICE)
model.eval()


def predict_pos_tags(text):
    """Tag each word of *text* with its part of speech.

    Splits the input on whitespace, strips surrounding punctuation, runs the
    token-classification model, and returns one ``word: TAG`` line per word.

    Args:
        text: Free-form input string (Kabardian or another language).

    Returns:
        A newline-joined string of ``word: TAG`` pairs; the empty string when
        the input contains no words after punctuation stripping.
    """
    # Strip surrounding punctuation and drop tokens that were pure punctuation.
    words = [word.strip(string.punctuation) for word in text.split()]
    words = [word for word in words if word]
    if not words:
        # Guard: the tokenizer raises on an empty pre-split batch.
        return ""

    # Tokenize the pre-split words (each word may become several sub-tokens).
    encoded_input = tokenizer(
        words,
        truncation=True,
        is_split_into_words=True,
        return_tensors='pt'
    )
    inputs = {k: v.to(DEVICE) for k, v in encoded_input.items()}

    # Forward pass without gradient tracking; argmax over the label dimension.
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

    # Take the prediction of the FIRST sub-token of each word. Special tokens
    # ([CLS]/[SEP]) have word_id None and are skipped explicitly, so the
    # trailing [SEP] no longer appends a spurious extra tag.
    word_ids = encoded_input.word_ids()
    previous_word_idx = None
    predicted_tags = []
    for idx, word_idx in enumerate(word_ids):
        if word_idx is not None and word_idx != previous_word_idx:
            predicted_tags.append(model.config.id2label[predictions[0][idx].item()])
        previous_word_idx = word_idx

    # One "word: TAG" line per word.
    return "\n".join(f"{word}: {tag}" for word, tag in zip(words, predicted_tags))


# Example sentences for demonstration
examples = [
    "Iуащхьэмахуэ лъапэ щаухуащ хьэщIэщхэмрэ кIапсэ гъуэгухэмрэ.",
    "Арати, зы жэщым щIалэм псори фIэкIуэдащ.",
    "Мадинэ и пэшым дэкIуеижри, хущхъуэхэм ефащ.",
    "Апхуэдэ цIыху къабзэ куэди бгъуэтынукъым.",
]

# Create Gradio interface
with gr.Blocks(title="XLM-RoBERTa POS Tagger for Kabardian") as demo:
    gr.Markdown("# 🏷️ XLM-RoBERTa POS Tagger for Kabardian")
    gr.Markdown(
        """ This application identifies Parts of Speech (POS) in text using the [panagoa/xlm-roberta-base-kbd-pos-tagger](https://huggingface.co/panagoa/xlm-roberta-base-kbd-pos-tagger) model. The model is specifically fine-tuned for Kabardian language (адыгэбзэ) but also works with other languages. It was trained on the [panagoa/kbd-pos-tags](https://huggingface.co/datasets/panagoa/kbd-pos-tags) dataset containing 82,925 tagged sentences in Kabardian. """
    )

    with gr.Row():
        with gr.Column(scale=6):
            input_text = gr.Textbox(
                label="Text for analysis",
                placeholder="Enter text in Kabardian or another language...",
                lines=3
            )
            with gr.Row():
                submit_btn = gr.Button("Analyze", variant="primary")
            gr.Examples(
                examples,
                inputs=[input_text],
                label="Example Kabardian sentences"
            )
        with gr.Column(scale=4):
            output_text = gr.Textbox(
                label="POS Tagging Results",
                lines=12
            )

    with gr.Accordion("About POS Tags", open=False):
        gr.Markdown(""" ## POS Tags Supported The model identifies 17 different POS tags: | Tag | Description | Examples | |-----|-------------|----------| | `ADJ` | Adjective | хужь (white), къабзэ (clean) | | `ADP` | Adposition | щхьэкIэ (for), папщIэ (because of) | | `ADV` | Adverb | псынщIэу (quickly), жыжьэу (far) | | `AUX` | Auxiliary | хъунщ (will be), щытащ (was) | | `CCONJ` | Coordinating conjunction | икIи (and), ауэ (but) | | `DET` | Determiner | мо (that), мыпхуэдэ (this kind) | | `INTJ` | Interjection | уэлэхьи (by God), зиунагъуэрэ (oh my) | | `NOUN` | Noun | унэ (house), щIалэ (boy) | | `NUM` | Numeral | зы (one), тIу (two) | | `PART` | Particle | мы (this), а (that) | | `PRON` | Pronoun | сэ (I), уэ (you) | | `PROPN` | Proper noun | Мурат (Murat), Налшык (Nalchik) | | `PUNCT` | Punctuation | . (period), , (comma) | | `SCONJ` | Subordinating conjunction | щхьэкIэ (because), щыгъуэ (when) | | `SYM` | Symbol | % (percent), $ (dollar) | | `VERB` | Verb | мэкIуэ (goes), матхэ (writes) | | `X` | Other | - | """)

    # Event handlers
    submit_btn.click(fn=predict_pos_tags, inputs=[input_text], outputs=[output_text])

# Launch the app
if __name__ == "__main__":
    demo.launch()