# quick_demo.py
import gradio as gr
from huggingface_hub import hf_hub_download
from dytr import DynamicTransformer
import json
import pyarabic.araby as araby
import torch
# Download the checkpoint and its JSON config from the same Hub repository.
_REPO_ID = "alsubari/bert-base-multilingual-cased-dytr"
model_path = hf_hub_download(repo_id=_REPO_ID, filename="dytr.pt")
config_path = hf_hub_download(repo_id=_REPO_ID, filename="config.json")

# Restore the model, place it on the device named in its own config, and
# switch it to evaluation mode for inference.
model = DynamicTransformer.load_model(model_path)
device = model.config.device
model.to(device)
tokenizer = model.tokenizer
model.eval()

# The downloaded config carries the label maps used to decode predictions.
with open(config_path, 'r', encoding='utf-8') as f:
    ar_char_tagging_config = json.load(f)
id2label = ar_char_tagging_config["id2label"]
label2id = ar_char_tagging_config["label2id"]


def _tokenize_chars(text, language='ar'):
    """Split *text* into per-character pieces with WordPiece-style prefixes.

    For every whitespace-separated word, the first character is emitted as-is
    and each following character is emitted with a ``##`` prefix.

    When ``language == 'ar'`` the word is first passed through pyarabic's
    ``strip_tashkeel`` and ``normalize_hamza`` and then reduced to its
    alphabetic characters; a word left empty by that cleanup contributes no
    pieces.

    Args:
        text: Text to tokenize (may contain several words).
        language: ``'ar'`` enables the Arabic cleanup; any other value keeps
            the word unchanged.

    Returns:
        A flat list of character pieces.
    """
    pieces = []
    for word in text.split():
        if language == 'ar':
            cleaned = araby.normalize_hamza(araby.strip_tashkeel(word))
            cleaned = ''.join(c for c in cleaned if c.isalpha())
        else:
            cleaned = word
        pieces.extend(
            char if position == 0 else f'##{char}'
            for position, char in enumerate(cleaned)
        )
    return pieces


def _group_word_tags(word_chars, word_char_tags):
    """Merge one word's per-character tags into (tags, segments).

    A ``B-`` tag, or an ``I-`` tag seen before any tag was collected, opens a
    new segment and records the tag name without its two-character prefix.
    Every other character is appended to the segment currently open.

    Args:
        word_chars: Character pieces of one word (``##`` prefixes allowed).
        word_char_tags: Predicted tag for each piece; may be shorter than
            ``word_chars``, in which case the extra pieces are ignored.

    Returns:
        A ``(tags, segments)`` tuple of lists.
    """
    tags = []
    segments = []
    for piece, tag in zip(word_chars, word_char_tags):
        char = piece.replace('##', '')
        if tag.startswith('B-') or (tag.startswith('I-') and not tags):
            tags.append(tag[2:])
            segments.append(char)
        elif segments:
            segments[-1] += char
        else:
            # The word starts with a character that has no B-/I- tag (e.g.
            # 'O'). There is no open segment to extend yet, so start an
            # untagged one instead of failing on a missing previous segment.
            segments.append(char)
    return tags, segments


def process(text, task='ar_pos_tagging'):
    """Run the selected task on *text* and return the result as a string.

    For any task other than ``'ar_pos_tagging'`` the text is handed to
    ``model.generate`` and the stringified result is returned.

    For ``'ar_pos_tagging'`` the text is split into character pieces, wrapped
    in ``[CLS]``/``[SEP]`` ids, run through the model, and the per-character
    predictions are grouped back into words.

    Args:
        text: Input text from the UI. ``None``, empty, or whitespace-only
            input yields the prompt message.
        task: Task name forwarded to the model.

    Returns:
        ``"Please enter text"`` for blank input; otherwise ``str()`` of the
        model result, or of a dict with keys ``'text'``, ``'words'`` (one
        entry per whitespace-separated word with ``'word'``, ``'tag'`` and
        ``'token'``) and ``'annotated'`` (``word/tag`` pairs joined by
        spaces).
    """
    if not text or not text.strip():
        return "Please enter text"
    if task != 'ar_pos_tagging':
        result = model.generate(text, task_name=task)
        return str(result)

    # Tokenize each word once; the flat list feeds the model and the per-word
    # lists are reused below to slice the predictions back into words.
    words = text.split()
    chars_per_word = [_tokenize_chars(word) for word in words]
    chars = [piece for word_chars in chars_per_word for piece in word_chars]

    if not chars:
        return str({'text': text, 'words': [], 'annotated': text})

    # Build the id sequence: [CLS] + one id per piece + [SEP]; pieces missing
    # from the vocabulary fall back to the [UNK] id.
    vocab = tokenizer.vocab
    cls_id = vocab.get('[CLS]', 0)
    sep_id = vocab.get('[SEP]', 0)
    unk_id = vocab.get('[UNK]', 0)
    input_ids = [cls_id]
    input_ids.extend(vocab.get(piece, unk_id) for piece in chars)
    input_ids.append(sep_id)
    attention_mask = [1] * len(input_ids)

    # NOTE(review): no truncation is applied here; confirm how the model
    # behaves for inputs longer than its maximum sequence length.
    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)
    mask_tensor = torch.tensor([attention_mask], dtype=torch.long).to(device)

    with torch.no_grad():
        outputs = model.forward(
            input_ids=input_tensor,
            attention_mask=mask_tensor,
            task_name=task
        )

    logits = outputs['logits']
    predictions = logits.argmax(dim=-1).squeeze().tolist()

    # Position 0 is [CLS]; the next len(chars) positions are the characters.
    # Label ids are looked up as strings; unknown ids fall back to 'O'.
    char_tags = [
        id2label.get(str(pred_id), 'O')
        for pred_id in predictions[1:len(chars) + 1]
    ]

    word_results = []
    offset = 0
    for word, word_chars in zip(words, chars_per_word):
        word_length = len(word_chars)
        word_char_tags = char_tags[offset:offset + word_length]
        tags, segments = _group_word_tags(word_chars, word_char_tags)
        word_results.append({
            'word': word,
            'tag': '+'.join(tags) if tags else 'O',
            'token': segments,
        })
        offset += word_length

    return str({
        'text': text,
        'words': word_results,
        'annotated': ' '.join(f"{w['word']}/{w['tag']}" for w in word_results)
    })
# Input widgets: free-text box plus a task selector.
_text_input = gr.Textbox(
    label="Text",
    lines=3,
    placeholder="Enter Arabic text here...",
)
_task_input = gr.Dropdown(
    # Only POS tagging is offered; the other task names ("sentiment",
    # "ner_detection", "error_detection", "generation") are left out of the
    # choices for now.
    choices=["ar_pos_tagging"],
    label="Task",
    value="ar_pos_tagging",
)

# The result string returned by process() is shown in a Markdown component.
_result_output = gr.Markdown(label="Result")

_description = """
    ## ⚠️ Important Note

    This is a **demonstration model** trained for educational purposes only.

    **Model Limitations:**
    - Limited training epochs (10 epochs only)
    - Small dataset size (demonstration only)

    **Expected Performance:**
    - Results may not be accurate for production use
    - The model may produce incorrect predictions
    - This demo only shows that dytr can successfully fine-tune BERT for multi-task learning

    **Purpose:**
    This demo proves that the dytr library can:
    - Load pretrained BERT models
    - Fine-tune them on multiple tasks simultaneously
    - Handle different task types (classification, NER, generation, error detection)

    """

# Clickable sample rows: [text, task]. Samples for the tasks not offered in
# the dropdown are kept here for reference only:
#   ["أحمد يعيش في القاهرة", "ner_detection"]
#   ["الكتاب هذا جدا ممتاز", "error_detection"]
#   ["المصفوفات", "generation"]
_examples = [
    ["بسم الله الرحمن الرحيم", "ar_pos_tagging"],
]

demo = gr.Interface(
    fn=process,
    inputs=[_text_input, _task_input],
    outputs=_result_output,
    title="dytr - Multi-Task Transformer Demo",
    description=_description,
    examples=_examples,
)

# share=True asks Gradio for a public share link in addition to the local server.
demo.launch(share=True)