Spaces:

alsubari
/

dytr

Sleeping

App Files Files Community

dytr / app.py

alsubari

Update app.py

547049a verified about 1 month ago

raw

history blame contribute delete

5.52 kB

	# app.py - quick demo of the dytr multi-task model
	import gradio as gr
	from huggingface_hub import hf_hub_download
	from dytr import DynamicTransformer
	import json
	import pyarabic.araby as araby
	import torch
	# Resolve local paths for the checkpoint and its JSON label configuration
	# through hf_hub_download (both live in the same Hub repository).
	model_path = hf_hub_download(
	    repo_id="alsubari/bert-base-multilingual-cased-dytr",
	    filename="dytr.pt",
	)
	config_path = hf_hub_download(
	    repo_id="alsubari/bert-base-multilingual-cased-dytr",
	    filename="config.json",
	)

	# Restore the model, move it to the device named in its own config and
	# switch it to eval mode before serving requests.
	model = DynamicTransformer.load_model(model_path)
	device = model.config.device
	model.to(device)
	model.eval()
	tokenizer = model.tokenizer

	# Label maps for the character-tagging task. They come from JSON, so the
	# keys of id2label are strings.
	with open(config_path, encoding='utf-8') as config_file:
	    ar_char_tagging_config = json.load(config_file)
	id2label = ar_char_tagging_config["id2label"]
	label2id = ar_char_tagging_config["label2id"]


	def _tokenize_chars(text, language='ar'):
	    """Split ``text`` into character pieces with WordPiece-style prefixes.

	    Each whitespace-separated word contributes its first character as-is and
	    every following character prefixed with ``##``.  When ``language`` is
	    ``'ar'`` the word is first stripped of tashkeel and hamza-normalized
	    (both via ``pyarabic.araby``), then reduced to its alphabetic characters.
	    Words that end up empty are skipped.

	    Args:
	        text: Raw input string.
	        language: Language code; only ``'ar'`` triggers the normalization.

	    Returns:
	        A flat list of character pieces for all words, in order.
	    """
	    pieces = []
	    for word in text.split():
	        token = word.strip()
	        if language == 'ar':
	            token = araby.normalize_hamza(araby.strip_tashkeel(token))
	            token = ''.join(c for c in token if c.isalpha())
	        if not token:
	            continue
	        pieces.append(token[0])
	        pieces.extend(f'##{c}' for c in token[1:])
	    return pieces


	def _encode_chars(chars):
	    """Map character pieces to vocabulary ids, wrapped in [CLS] ... [SEP].

	    Pieces missing from ``tokenizer.vocab`` map to the ``[UNK]`` id; each
	    special token falls back to id 0 when absent from the vocabulary.
	    """
	    vocab = tokenizer.vocab
	    unk_id = vocab.get('[UNK]', 0)
	    ids = [vocab.get('[CLS]', 0)]
	    ids.extend(vocab.get(piece, unk_id) for piece in chars)
	    ids.append(vocab.get('[SEP]', 0))
	    return ids


	def _segment_word(word_chars, word_char_tags):
	    """Merge the per-character tags of one word into tags and segments.

	    A ``B-`` tag always opens a new segment and records its label.  An ``I-``
	    tag does the same only while no label has been recorded for the word yet.
	    Every other character is appended to the current segment.

	    Returns:
	        ``(tags, segments)``: the recorded labels (prefix removed) and the
	        character strings of the segments.
	    """
	    tags = []
	    segments = []
	    for piece, tag in zip(word_chars, word_char_tags):
	        char = piece.replace('##', '')
	        if tag.startswith('B-') or (tag.startswith('I-') and not tags):
	            tags.append(tag[2:])
	            segments.append(char)
	        elif segments:
	            segments[-1] += char
	        else:
	            # The word starts with a tag that opens nothing (e.g. 'O'), so
	            # there is no previous segment to extend.  Start an unlabeled
	            # segment instead of failing on a missing one.
	            segments.append(char)
	    return tags, segments


	def process(text, task='ar_pos_tagging'):
	    """Run ``task`` on ``text`` and return the result as a string.

	    For any task other than ``'ar_pos_tagging'`` the text is passed to
	    ``model.generate`` and the stringified result is returned.  For POS
	    tagging the text is tagged character by character and the tags are
	    merged back into one ``+``-joined tag per whitespace-separated word.

	    Args:
	        text: Input text from the UI; ``None`` is treated like blank input.
	        task: Task name forwarded to the model.

	    Returns:
	        ``"Please enter text"`` for missing or blank input; otherwise ``str``
	        of the model output (generic tasks) or of a dict with the keys
	        ``'text'``, ``'words'`` and ``'annotated'`` (POS tagging).
	    """
	    if not text or not text.strip():
	        return "Please enter text"
	    if task != 'ar_pos_tagging':
	        return str(model.generate(text, task_name=task))

	    chars = _tokenize_chars(text)
	    if not chars:
	        return str({'text': text, 'words': [], 'annotated': text})

	    # Build a single-sequence batch with a full attention mask.
	    input_ids = _encode_chars(chars)
	    attention_mask = [1] * len(input_ids)
	    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)
	    mask_tensor = torch.tensor([attention_mask], dtype=torch.long).to(device)

	    with torch.no_grad():
	        outputs = model.forward(
	            input_ids=input_tensor,
	            attention_mask=mask_tensor,
	            task_name=task,
	        )

	    # Flatten the batch of one into a plain list of label ids.
	    # NOTE(review): assumes logits are (1, seq_len, num_labels) - confirm.
	    predictions = outputs['logits'].argmax(dim=-1).reshape(-1).tolist()

	    # Position 0 is [CLS]; the next len(chars) positions are the characters.
	    # id2label is looked up with string keys, hence str(pred_id).
	    char_tags = [
	        id2label.get(str(pred_id), 'O')
	        for pred_id in predictions[1:len(chars) + 1]
	    ]

	    # Walk the words in order, consuming as many character tags as each word
	    # produced pieces (possibly zero, e.g. for punctuation-only words).
	    word_results = []
	    cursor = 0
	    for word in text.split():
	        word_chars = _tokenize_chars(word)
	        span = len(word_chars)
	        tags, segments = _segment_word(
	            word_chars, char_tags[cursor:cursor + span]
	        )
	        word_results.append({
	            'word': word,
	            'tag': '+'.join(tags) if tags else 'O',
	            'token': segments,
	        })
	        cursor += span

	    return str({
	        'text': text,
	        'words': word_results,
	        'annotated': ' '.join([f"{w['word']}/{w['tag']}" for w in word_results])
	    })
	# Input widgets. Only POS tagging is offered in the task dropdown; the other
	# task names (sentiment, ner_detection, error_detection, generation) are
	# left out of this demo.
	text_input = gr.Textbox(
	    label="Text",
	    lines=3,
	    placeholder="Enter Arabic text here...",
	)
	task_input = gr.Dropdown(
	    choices=["ar_pos_tagging"],
	    label="Task",
	    value="ar_pos_tagging",
	)

	# Wire the widgets to `process`; its string result is shown as Markdown.
	demo = gr.Interface(
	    fn=process,
	    inputs=[text_input, task_input],
	    outputs=gr.Markdown(label="Result"),
	    title="dytr - Multi-Task Transformer Demo",
	    description="""
	## ⚠️ Important Note

	This is a demonstration model trained for educational purposes only.

	Model Limitations:
	- Limited training epochs (10 epochs only)
	- Small dataset size (demonstration only)

	Expected Performance:
	- Results may not be accurate for production use
	- The model may produce incorrect predictions
	- This demo only shows that dytr can successfully fine-tune BERT for multi-task learning

	Purpose:
	This demo proves that the dytr library can:
	- Load pretrained BERT models
	- Fine-tune them on multiple tasks simultaneously
	- Handle different task types (classification, NER, generation, error detection)

	""",
	    examples=[
	        ["بسم الله الرحمن الرحيم", "ar_pos_tagging"],
	    ],
	)

	# Start the app, requesting a public share link.
	demo.launch(share=True)