| |
| import gradio as gr |
| from huggingface_hub import hf_hub_download |
| from dytr import DynamicTransformer |
| import json |
| import pyarabic.araby as araby |
| import torch |
| |
# Hub repository that hosts both the checkpoint and its label configuration.
REPO_ID = "alsubari/bert-base-multilingual-cased-dytr"

# Resolve local paths for the checkpoint and the config file.
model_path = hf_hub_download(repo_id=REPO_ID, filename="dytr.pt")
config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")

# Load the model, place it on the device named in its own config, and
# switch it to eval mode before serving requests.
model = DynamicTransformer.load_model(model_path)
device = model.config.device
model.to(device)
tokenizer = model.tokenizer
model.eval()

# Label maps used to turn predicted ids into tag strings (and back).
with open(config_path, "r", encoding="utf-8") as config_file:
    ar_char_tagging_config = json.load(config_file)
id2label = ar_char_tagging_config["id2label"]
label2id = ar_char_tagging_config["label2id"]
|
|
|
|
def _tokenize_chars(text, language='ar'):
    """Split text into per-character pieces with WordPiece-style prefixes.

    The first character of each whitespace-separated word is emitted as-is
    and every following character is prefixed with ``##``.  For Arabic (the
    default) each word is first passed through ``araby.strip_tashkeel`` and
    ``araby.normalize_hamza`` and then reduced to its alphabetic characters;
    words that end up empty are skipped.
    """
    pieces = []
    for word in text.split():
        if language == 'ar':
            token = araby.normalize_hamza(araby.strip_tashkeel(word))
            token = ''.join(c for c in token if c.isalpha())
        else:
            token = word
        if not token:
            continue
        pieces.append(token[0])
        pieces.extend(f'##{char}' for char in token[1:])
    return pieces


def _merge_char_tags(word_chars, word_char_tags):
    """Merge one word's character tags into segment tags and segment texts.

    A ``B-`` tag always opens a new segment; an ``I-`` tag opens one only
    when no tagged segment exists yet.  Every other character is appended
    to the current segment.

    Returns:
        ``(tags, segments)`` where ``tags`` holds the label of each tagged
        segment and ``segments`` holds the text of every segment.
    """
    tags = []
    segments = []
    for piece, tag in zip(word_chars, word_char_tags):
        char = piece.replace('##', '')
        if tag.startswith('B-') or (tag.startswith('I-') and not tags):
            tags.append(tag[2:])
            segments.append(char)
        elif segments:
            segments[-1] += char
        else:
            # The word starts with a character that has no B-/I- tag (for
            # example 'O'): there is no segment to extend yet, so open an
            # untagged one instead of failing on a missing previous segment.
            segments.append(char)
    return tags, segments


def process(text, task='ar_pos_tagging'):
    """Run ``task`` on ``text`` and return the result rendered as a string.

    Any task other than ``'ar_pos_tagging'`` is delegated to
    ``model.generate``.  For POS tagging the text is split into characters,
    the model predicts one tag per character, and the character tags are
    merged into one ``+``-joined tag per whitespace-separated word.

    Args:
        text: Input text.  ``None``, empty or whitespace-only input returns
            the prompt ``"Please enter text"``.
        task: Task name forwarded to the model.

    Returns:
        ``str`` of the generation result, or ``str`` of a dict with the keys
        ``'text'``, ``'words'`` (per-word ``word``/``tag``/``token`` entries)
        and ``'annotated'`` (``word/tag`` pairs joined by spaces).
    """
    if not text or not text.strip():
        return "Please enter text"
    if task != 'ar_pos_tagging':
        return str(model.generate(text, task_name=task))

    # Tokenize each word once; the flat sequence feeds the model and the
    # per-word lists are reused below to map predictions back onto words.
    words = text.split()
    chars_per_word = [_tokenize_chars(word) for word in words]
    chars = [piece for word_chars in chars_per_word for piece in word_chars]
    if not chars:
        return str({'text': text, 'words': [], 'annotated': text})

    vocab = tokenizer.vocab
    cls_id = vocab.get('[CLS]', 0)
    sep_id = vocab.get('[SEP]', 0)
    unk_id = vocab.get('[UNK]', 0)
    input_ids = [cls_id, *(vocab.get(piece, unk_id) for piece in chars), sep_id]
    attention_mask = [1] * len(input_ids)

    # NOTE(review): no truncation is applied here; very long inputs may
    # exceed the model's maximum sequence length -- confirm against dytr.
    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)
    mask_tensor = torch.tensor([attention_mask], dtype=torch.long).to(device)

    with torch.no_grad():
        outputs = model.forward(
            input_ids=input_tensor,
            attention_mask=mask_tensor,
            task_name=task,
        )

    predictions = outputs['logits'].argmax(dim=-1).squeeze().tolist()
    # Position 0 is [CLS]; the slice keeps exactly one prediction per
    # character and leaves out the trailing [SEP].
    char_tags = [
        id2label.get(str(pred_id), 'O')
        for pred_id in predictions[1:len(chars) + 1]
    ]

    word_results = []
    offset = 0
    for word, word_chars in zip(words, chars_per_word):
        word_char_tags = char_tags[offset:offset + len(word_chars)]
        offset += len(word_chars)
        tags, segments = _merge_char_tags(word_chars, word_char_tags)
        word_results.append({
            'word': word,
            'tag': '+'.join(tags) if tags else 'O',
            'token': segments,
        })

    return str({
        'text': text,
        'words': word_results,
        'annotated': ' '.join(f"{w['word']}/{w['tag']}" for w in word_results),
    })
# Markdown shown under the title of the demo page.
DEMO_DESCRIPTION = """
## ⚠️ Important Note

This is a **demonstration model** trained for educational purposes only.

**Model Limitations:**
- Limited training epochs (10 epochs only)
- Small dataset size (demonstration only)

**Expected Performance:**
- Results may not be accurate for production use
- The model may produce incorrect predictions
- This demo only shows that dytr can successfully fine-tune BERT for multi-task learning

**Purpose:**
This demo proves that the dytr library can:
- Load pretrained BERT models
- Fine-tune them on multiple tasks simultaneously
- Handle different task types (classification, NER, generation, error detection)

"""

# Input widgets, in the order of ``process``'s parameters (text, task).
text_input = gr.Textbox(
    label="Text",
    lines=3,
    placeholder="Enter Arabic text here...",
)
task_input = gr.Dropdown(
    choices=["ar_pos_tagging"],
    label="Task",
    value="ar_pos_tagging",
)

# Each example row supplies one value per input widget.
demo_examples = [
    ["بسم الله الرحمن الرحيم", "ar_pos_tagging"],
]

demo = gr.Interface(
    fn=process,
    inputs=[text_input, task_input],
    outputs=gr.Markdown(label="Result"),
    title="dytr - Multi-Task Transformer Demo",
    description=DEMO_DESCRIPTION,
    examples=demo_examples,
)

# Start the web UI; share=True additionally requests a public share link.
demo.launch(share=True)