# dytr / app.py
# Hugging Face Space by alsubari
# Commit 547049a (verified): "Update app.py"
# quick_demo.py
import gradio as gr
from huggingface_hub import hf_hub_download
from dytr import DynamicTransformer
import json
import pyarabic.araby as araby
import torch
# Download the checkpoint and its JSON configuration from the Hugging Face Hub
# (the checkpoint first, then the config), both from the same repository.
model_path, config_path = (
    hf_hub_download(
        repo_id="alsubari/bert-base-multilingual-cased-dytr",
        filename=hub_filename,
    )
    for hub_filename in ("dytr.pt", "config.json")
)

# Restore the model, move it to the device recorded in its own config, and
# switch it to evaluation mode before serving requests.
model = DynamicTransformer.load_model(model_path)
device = model.config.device
model.to(device)
tokenizer = model.tokenizer
model.eval()

# The config file supplies the label mappings for the character tagger.
# Keys of a JSON object are always strings once parsed, so id2label is
# keyed by strings such as "0", "1", ...
with open(config_path, encoding='utf-8') as config_file:
    ar_char_tagging_config = json.loads(config_file.read())
id2label = ar_char_tagging_config["id2label"]
label2id = ar_char_tagging_config["label2id"]
def _tokenize_chars(text, language='ar'):
    """Split text into per-character tokens with WordPiece-style prefixes.

    Each whitespace-separated word is broken into characters; the first
    character of a word is emitted as-is and every following character is
    prefixed with ``##``. For ``language == 'ar'`` the word is first passed
    through ``araby.strip_tashkeel`` and ``araby.normalize_hamza`` and then
    reduced to its alphabetic characters, so a word made only of digits or
    punctuation contributes no tokens.

    Args:
        text: Input string.
        language: ``'ar'`` applies the Arabic cleanup above; any other value
            keeps the word unchanged apart from stripping whitespace.

    Returns:
        A flat list of character tokens for the whole text.
    """
    pieces = []
    for word in text.split():
        if language == 'ar':
            token = araby.normalize_hamza(araby.strip_tashkeel(word.strip()))
            token = ''.join(c for c in token if c.isalpha())
        else:
            token = word.strip()
        for position, char in enumerate(token):
            pieces.append(char if position == 0 else f'##{char}')
    return pieces


def _group_word_tags(word_chars, word_char_tags):
    """Merge one word's per-character tags into segment tags and segments.

    A ``B-`` tag opens a new segment, as does an ``I-`` tag when no tagged
    segment has been opened yet. Any other tag extends the current segment.
    If there is no current segment (the word starts with a tag that is
    neither ``B-`` nor ``I-``), an untagged segment is opened instead of
    failing on a missing previous segment.

    Args:
        word_chars: Character tokens of a single word (``##`` prefixes allowed).
        word_char_tags: Predicted tags aligned with ``word_chars``; may be
            shorter, in which case the trailing characters are ignored.

    Returns:
        A ``(tags, segments)`` tuple: the tag names with their two-character
        prefix removed, and the text of each segment.
    """
    tags = []
    segments = []
    for piece, tag in zip(word_chars, word_char_tags):
        char = piece.replace('##', '')
        opens_segment = tag.startswith('B-') or (tag.startswith('I-') and not tags)
        if opens_segment:
            tags.append(tag[2:])
            segments.append(char)
        elif segments:
            segments[-1] += char
        else:
            # First character carries no B-/I- tag: keep the text, add no tag.
            segments.append(char)
    return tags, segments


def process(text, task='ar_pos_tagging'):
    """Run the selected task on ``text`` and return the result as a string.

    For any task other than ``'ar_pos_tagging'`` the text is handed to
    ``model.generate`` and the result is stringified. For
    ``'ar_pos_tagging'`` the text is tokenised into characters, tagged by the
    model character by character, and the character tags are merged back into
    one entry per whitespace-separated word.

    Args:
        text: Raw user input. ``None``, empty, or whitespace-only input yields
            a prompt message instead of a prediction.
        task: Task name forwarded to the model.

    Returns:
        ``"Please enter text"`` for blank input; otherwise the ``str()`` of
        the generation result, or of a dict with the keys ``'text'``,
        ``'words'`` (a list of dicts with ``'word'``, ``'tag'``, ``'token'``)
        and ``'annotated'`` (``word/tag`` pairs joined by spaces).
    """
    if not text or not text.strip():
        return "Please enter text"
    if task != 'ar_pos_tagging':
        result = model.generate(text, task_name=task)
        return str(result)

    # Tokenise each word once and reuse the pieces for both the model input
    # and the regrouping step, so the two can never drift out of alignment.
    word_pieces = [(word, _tokenize_chars(word)) for word in text.split()]
    chars = [piece for _, pieces in word_pieces for piece in pieces]
    if not chars:
        return str({'text': text, 'words': [], 'annotated': text})

    # Build "[CLS] chars... [SEP]"; characters missing from the vocabulary
    # fall back to the [UNK] id.
    vocab = tokenizer.vocab
    cls_id = vocab.get('[CLS]', 0)
    sep_id = vocab.get('[SEP]', 0)
    unk_id = vocab.get('[UNK]', 0)
    input_ids = [cls_id, *(vocab.get(piece, unk_id) for piece in chars), sep_id]
    attention_mask = [1] * len(input_ids)

    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)
    mask_tensor = torch.tensor([attention_mask], dtype=torch.long).to(device)

    with torch.no_grad():
        outputs = model.forward(
            input_ids=input_tensor,
            attention_mask=mask_tensor,
            task_name=task,
        )

    # Flatten to a 1-D list of label ids. reshape(-1) always yields a list,
    # whereas squeeze() depends on the exact shape of the logits.
    # NOTE(review): assumes a single-sequence batch of logits — confirm
    # against the dytr forward() contract.
    predictions = outputs['logits'].argmax(dim=-1).reshape(-1).tolist()

    # Skip the [CLS] position and stop before [SEP]; label ids are looked up
    # as strings because id2label comes from JSON.
    char_tags = [
        id2label.get(str(pred_id), 'O')
        for pred_id in predictions[1:len(chars) + 1]
    ]

    word_results = []
    offset = 0
    for word, pieces in word_pieces:
        tags, segments = _group_word_tags(
            pieces, char_tags[offset:offset + len(pieces)]
        )
        word_results.append({
            'word': word,
            'tag': '+'.join(tags) if tags else 'O',
            'token': segments,
        })
        offset += len(pieces)

    return str({
        'text': text,
        'words': word_results,
        'annotated': ' '.join([f"{w['word']}/{w['tag']}" for w in word_results])
    })
# Input widgets: a free-text box and a task selector. Only POS tagging is
# currently offered; the other tasks ("sentiment", "ner_detection",
# "error_detection", "generation") are disabled in this demo.
_text_input = gr.Textbox(label="Text", lines=3, placeholder="Enter Arabic text here...")
_task_input = gr.Dropdown(
    choices=["ar_pos_tagging"],
    label="Task",
    value="ar_pos_tagging",
)

# The result string produced by process() is shown in a Markdown component.
_result_output = gr.Markdown(label="Result")

# Markdown shown under the title, explaining the limits of the demo model.
_DESCRIPTION = """
## ⚠️ Important Note
This is a **demonstration model** trained for educational purposes only.
**Model Limitations:**
- Limited training epochs (10 epochs only)
- Small dataset size (demonstration only)
**Expected Performance:**
- Results may not be accurate for production use
- The model may produce incorrect predictions
- This demo only shows that dytr can successfully fine-tune BERT for multi-task learning
**Purpose:**
This demo proves that the dytr library can:
- Load pretrained BERT models
- Fine-tune them on multiple tasks simultaneously
- Handle different task types (classification, NER, generation, error detection)
"""

# One clickable example per enabled task: [text, task].
_EXAMPLES = [
    ["بسم الله الرحمن الرحيم", "ar_pos_tagging"],
]

demo = gr.Interface(
    fn=process,
    inputs=[_text_input, _task_input],
    outputs=_result_output,
    title="dytr - Multi-Task Transformer Demo",
    description=_DESCRIPTION,
    examples=_EXAMPLES,
)

# Start the web server and request a public share link.
demo.launch(share=True)