# quick_demo.py
"""Gradio demo for a dytr ``DynamicTransformer`` checkpoint.

The script downloads the model weights and label config from the Hugging Face
Hub, exposes a character-level Arabic POS tagger through ``process`` and
launches a Gradio interface around it.
"""

import json

import gradio as gr
import pyarabic.araby as araby
import torch
from dytr import DynamicTransformer
from huggingface_hub import hf_hub_download

REPO_ID = "alsubari/bert-base-multilingual-cased-dytr"
POS_TASK = "ar_pos_tagging"

DESCRIPTION = """
## ⚠️ Important Note
This is a **demonstration model** trained for educational purposes only.

**Model Limitations:**
- Limited training epochs (10 epochs only)
- Small dataset size (demonstration only)

**Expected Performance:**
- Results may not be accurate for production use
- The model may produce incorrect predictions
- This demo only shows that dytr can successfully fine-tune BERT for multi-task learning

**Purpose:** This demo proves that the dytr library can:
- Load pretrained BERT models
- Fine-tune them on multiple tasks simultaneously
- Handle different task types (classification, NER, generation, error detection)
"""

# Load model
model_path = hf_hub_download(repo_id=REPO_ID, filename="dytr.pt")
config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")

model = DynamicTransformer.load_model(model_path)
device = model.config.device
model.to(device)
tokenizer = model.tokenizer
model.eval()

with open(config_path, "r", encoding="utf-8") as f:
    ar_char_tagging_config = json.load(f)

# id2label is looked up with stringified ids below (JSON object keys are strings).
id2label = ar_char_tagging_config["id2label"]
label2id = ar_char_tagging_config["label2id"]


def tokenize_chars(text, language="ar"):
    """Split ``text`` into per-character pieces with WordPiece-style prefixes.

    The text is split on whitespace. For ``language == 'ar'`` each word is
    passed through ``araby.strip_tashkeel`` and ``araby.normalize_hamza`` and
    then reduced to its alphabetic characters; for any other language the word
    is only stripped. The first character of a word is emitted as-is, every
    following character is prefixed with ``##``.

    Args:
        text: Raw input text.
        language: Language code; only ``'ar'`` triggers Arabic normalization.

    Returns:
        A flat list of character pieces. Words that end up empty after
        normalization contribute nothing.
    """
    pieces = []
    for word in text.split():
        if language == "ar":
            token = araby.normalize_hamza(araby.strip_tashkeel(word.strip()))
            token = "".join(c for c in token if c.isalpha())
        else:
            token = word.strip()
        for char_idx, char in enumerate(token):
            pieces.append(char if char_idx == 0 else f"##{char}")
    return pieces


def _split_word_by_tags(word_chars, word_char_tags):
    """Group one word's characters into segments according to their tags.

    A ``B-`` tag always opens a new segment. An ``I-`` tag opens a new segment
    only while no tag has been recorded for the word yet. Any other tag
    extends the current segment.

    Args:
        word_chars: Character pieces of a single word (``##`` prefixes allowed).
        word_char_tags: Predicted tag per character; may be shorter than
            ``word_chars``, in which case the extra characters are ignored.

    Returns:
        ``(tags, segments)``: the tag names with their two-character prefix
        removed, and the character segments.
    """
    tags = []
    segments = []
    for piece, tag in zip(word_chars, word_char_tags):
        char = piece.replace("##", "")
        opens_segment = tag.startswith("B-") or (tag.startswith("I-") and not tags)
        if opens_segment:
            tags.append(tag[2:])
            segments.append(char)
        elif segments:
            segments[-1] += char
        else:
            # A leading character whose tag is neither B- nor I- (e.g. 'O') has
            # no segment to extend yet; start an untagged one instead of
            # failing on a lookup into an empty container.
            segments.append(char)
    return tags, segments


def process(text, task='ar_pos_tagging'):
    """Run the selected task on ``text`` and return the result as a string.

    For any task other than ``'ar_pos_tagging'`` the text is passed to
    ``model.generate`` and the result is stringified. For POS tagging the text
    is tokenized per character, run through the model, and the per-character
    predictions are grouped back into whitespace-separated words.

    Args:
        text: Input text from the UI.
        task: Task name forwarded to the model.

    Returns:
        ``"Please enter text"`` for empty input; otherwise ``str()`` of the
        generation result, or of a dict with keys ``'text'``, ``'words'``
        (one entry per word with ``'word'``, ``'tag'``, ``'token'``) and
        ``'annotated'`` (``word/tag`` pairs joined by spaces).
    """
    if not text or not text.strip():
        return "Please enter text"

    if task != POS_TASK:
        result = model.generate(text, task_name=task)
        return str(result)

    chars = tokenize_chars(text)
    if not chars:
        return str({'text': text, 'words': [], 'annotated': text})

    # Prepare input
    vocab = tokenizer.vocab
    cls_id = vocab.get('[CLS]', 0)
    sep_id = vocab.get('[SEP]', 0)
    unk_id = vocab.get('[UNK]', 0)

    # Convert to IDs; characters missing from the vocabulary map to [UNK].
    input_ids = [cls_id, *(vocab.get(char, unk_id) for char in chars), sep_id]
    attention_mask = [1] * len(input_ids)

    # Convert to tensors
    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)
    mask_tensor = torch.tensor([attention_mask], dtype=torch.long).to(device)

    # Inference
    with torch.no_grad():
        outputs = model.forward(
            input_ids=input_tensor,
            attention_mask=mask_tensor,
            task_name=task,
        )

    # Get predictions; flatten so a single-sequence batch yields a plain list.
    logits = outputs['logits']
    predictions = logits.argmax(dim=-1).reshape(-1).tolist()

    # Map to tags (skip CLS and SEP); unknown ids fall back to 'O'.
    char_tags = [
        id2label.get(str(pred_id), 'O')
        for pred_id in predictions[1:len(chars) + 1]
    ]

    # Group into words. Each word is re-tokenized on its own so its length
    # matches the slice it occupies in the flat character sequence.
    word_results = []
    char_idx = 0
    for word in text.split():
        word_chars = tokenize_chars(word)
        word_length = len(word_chars)
        word_char_tags = char_tags[char_idx:char_idx + word_length]

        tags, segments = _split_word_by_tags(word_chars, word_char_tags)
        word_results.append({
            'word': word,
            'tag': '+'.join(tags) if tags else 'O',
            'token': segments,
        })
        char_idx += word_length

    return str({
        'text': text,
        'words': word_results,
        'annotated': ' '.join(f"{w['word']}/{w['tag']}" for w in word_results),
    })


demo = gr.Interface(
    fn=process,
    inputs=[
        gr.Textbox(label="Text", lines=3, placeholder="Enter Arabic text here..."),
        gr.Dropdown(
            # Other tasks ("sentiment", "ner_detection", "error_detection",
            # "generation") are currently disabled in the UI.
            choices=["ar_pos_tagging"],
            label="Task",
            value="ar_pos_tagging",
        ),
    ],
    outputs=gr.Markdown(label="Result"),
    title="dytr - Multi-Task Transformer Demo",
    description=DESCRIPTION,
    examples=[
        ["بسم الله الرحمن الرحيم", "ar_pos_tagging"],
        # ["أحمد يعيش في القاهرة", "ner_detection"],
        # ["الكتاب هذا جدا ممتاز", "error_detection"],
        # ["المصفوفات", "generation"],
    ],
)

demo.launch(share=True)