Spaces:

panagoa
/

kbd-pos-tagger

Sleeping

kbd-pos-tagger / app.py

panagoa

Add application file

e5faaae 11 months ago

5.09 kB

	import string
	import gradio as gr
	from transformers import AutoTokenizer, AutoModelForTokenClassification
	import torch

	# Hub repository id shared by the tokenizer and the token-classification model
	MODEL_ID = 'panagoa/xlm-roberta-base-kbd-pos-tagger'

	# Both objects are created once at module level and reused by every request
	tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
	model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)

	def predict_pos_tags(text):
	    """Tag every word of *text* with its part of speech.

	    The text is split on whitespace, ASCII punctuation is stripped from both
	    ends of each piece, and the remaining words are passed to the
	    token-classification model as a pre-split sentence. A word receives the
	    label predicted for its first sub-token.

	    Args:
	        text: Raw input string. Empty strings and ``None`` are accepted.

	    Returns:
	        One ``"word: TAG"`` line per word, joined with newlines. An empty
	        string is returned when the input contains no words. Words for which
	        the tokenizer produced no sub-token (for example because the input
	        was truncated) are left out instead of receiving a misaligned tag.
	    """
	    # Split text into words, removing surrounding punctuation and empty leftovers
	    stripped = (piece.strip(string.punctuation) for piece in (text or "").split())
	    words = [word for word in stripped if word]
	    if not words:
	        # Nothing to tag: skip the tokenizer and the model entirely
	        return ""

	    # Set up device (CPU/GPU); moving an already-placed model is cheap
	    device = 'cuda' if torch.cuda.is_available() else 'cpu'
	    model.to(device)

	    # Tokenize the pre-split words so sub-tokens can be traced back to them
	    encoded_input = tokenizer(
	        words,
	        truncation=True,
	        is_split_into_words=True,
	        return_tensors='pt',
	    )
	    inputs = {k: v.to(device) for k, v in encoded_input.items()}

	    # Get predictions from the model (inference only, no gradients needed)
	    with torch.no_grad():
	        outputs = model(**inputs)
	    label_ids = torch.argmax(outputs.logits, dim=2)[0].tolist()

	    # Key the tags by word index instead of collecting them positionally.
	    # Special tokens report a word id of None and must not contribute a tag,
	    # and a word without sub-tokens must not shift the tags of later words.
	    tag_by_word = {}
	    for token_pos, word_idx in enumerate(encoded_input.word_ids()):
	        if word_idx is None or word_idx in tag_by_word:
	            continue
	        tag_by_word[word_idx] = model.config.id2label[label_ids[token_pos]]

	    # Format the result, one "word: TAG" pair per line
	    return "\n".join(
	        f"{word}: {tag_by_word[index]}"
	        for index, word in enumerate(words)
	        if index in tag_by_word
	    )

	# Sample Kabardian sentences offered as ready-made inputs in the demo
	examples = [
	    "Iуащхьэмахуэ лъапэ щаухуащ хьэщIэщхэмрэ кIапсэ гъуэгухэмрэ.",
	    "Арати, зы жэщым щIалэм псори фIэкIуэдащ.",
	    "Мадинэ и пэшым дэкIуеижри, хущхъуэхэм ефащ.",
	    "Апхуэдэ цIыху къабзэ куэди бгъуэтынукъым.",
	]

	# Create Gradio interface
	with gr.Blocks(title="XLM-RoBERTa POS Tagger for Kabardian") as demo:
	    # Page heading and short description of the model behind the demo
	    gr.Markdown("# 🏷️ XLM-RoBERTa POS Tagger for Kabardian")
	    gr.Markdown(
	        """
	This application identifies Parts of Speech (POS) in text using the
	[panagoa/xlm-roberta-base-kbd-pos-tagger](https://huggingface.co/panagoa/xlm-roberta-base-kbd-pos-tagger) model.

	The model is specifically fine-tuned for Kabardian language (адыгэбзэ) but also works with other languages.
	It was trained on the [panagoa/kbd-pos-tags](https://huggingface.co/datasets/panagoa/kbd-pos-tags) dataset
	containing 82,925 tagged sentences in Kabardian.
	"""
	    )

	    # Two-column layout: input controls on the left, results on the right
	    with gr.Row():
	        with gr.Column(scale=6):
	            input_text = gr.Textbox(
	                label="Text for analysis",
	                placeholder="Enter text in Kabardian or another language...",
	                lines=3,
	            )

	            with gr.Row():
	                submit_btn = gr.Button("Analyze", variant="primary")

	            # Selecting an example fills the input textbox
	            gr.Examples(
	                examples,
	                inputs=[input_text],
	                label="Example Kabardian sentences",
	            )

	        with gr.Column(scale=4):
	            output_text = gr.Textbox(
	                label="POS Tagging Results",
	                lines=12,
	            )

	    # Collapsible reference table of the tag set.
	    # The column separators are plain "|" characters: a backslash before the
	    # pipe is an invalid Python escape sequence and would leave an escaped,
	    # literal pipe in the Markdown, so the table would not render as a table.
	    with gr.Accordion("About POS Tags", open=False):
	        gr.Markdown("""
	## POS Tags Supported

	The model identifies 17 different POS tags:

	| Tag | Description | Examples |
	|-----|-------------|----------|
	| `ADJ` | Adjective | хужь (white), къабзэ (clean) |
	| `ADP` | Adposition | щхьэкIэ (for), папщIэ (because of) |
	| `ADV` | Adverb | псынщIэу (quickly), жыжьэу (far) |
	| `AUX` | Auxiliary | хъунщ (will be), щытащ (was) |
	| `CCONJ` | Coordinating conjunction | икIи (and), ауэ (but) |
	| `DET` | Determiner | мо (that), мыпхуэдэ (this kind) |
	| `INTJ` | Interjection | уэлэхьи (by God), зиунагъуэрэ (oh my) |
	| `NOUN` | Noun | унэ (house), щIалэ (boy) |
	| `NUM` | Numeral | зы (one), тIу (two) |
	| `PART` | Particle | мы (this), а (that) |
	| `PRON` | Pronoun | сэ (I), уэ (you) |
	| `PROPN` | Proper noun | Мурат (Murat), Налшык (Nalchik) |
	| `PUNCT` | Punctuation | . (period), , (comma) |
	| `SCONJ` | Subordinating conjunction | щхьэкIэ (because), щыгъуэ (when) |
	| `SYM` | Symbol | % (percent), $ (dollar) |
	| `VERB` | Verb | мэкIуэ (goes), матхэ (writes) |
	| `X` | Other | - |
	""")

	    # Event handlers: registered inside the Blocks context so the click is
	    # attached to this app
	    submit_btn.click(fn=predict_pos_tags, inputs=[input_text], outputs=[output_text])

	# Launch the app only when the file is executed as a script
	if __name__ == "__main__":
	    demo.launch()