# Hugging Face page/commit residue (not code), preserved as comments:
# bbeattie612's picture
# undo mistake with demo.launch()
# 4c5c3e8
import spacy
import gradio as gr
import os
import csv
import xml.etree.ElementTree as ET
from xml.dom import minidom
import tempfile
from collections import Counter
import pandas as pd
# Directory of the bundled spaCy model trained for Older Scots (CLAWS7 tagset),
# shipped alongside this script.
MODEL_DIR = os.path.join(os.path.dirname(__file__), "en_osc_claws7")
nlp = spacy.load(MODEL_DIR)
# Sample Older Scots passage shown by the "Try Sample Text" button.
SAMPLE_TEXT = """Gif ze, throw curiositie of novationis, hes forzet our auld plane Scottis quhilk zour mother learnit zou in times coming, I sall write to zou my mind in Latin, for I am nocht acquaintit with zour Southern."""
# Global variable to store current tagged data: list of (token, tag) pairs
# written by tag_text/process_file and read by the statistics functions.
current_data = []
def tag_text(text):
    """Tag typed text with the Older Scots CLAWS model and export the result.

    Also stores the tagged tokens in the module-level ``current_data`` so the
    Statistics tab can analyse the most recent tagging run.

    Args:
        text: Raw Older Scots text entered by the user.

    Returns:
        Tuple of (table rows, CSV file path, XML file path), where each table
        row is a ``(token, tag)`` pair.
    """
    global current_data
    doc = nlp(text)

    # Punctuation keeps its literal character as the "tag" — computed once so
    # the in-browser table, CSV and XML all agree (previously the table showed
    # the raw model tag for punctuation while the files substituted the glyph).
    tagged = [
        (token.text, token.text if token.is_punct else token.tag_)
        for token in doc
    ]
    current_data = tagged  # Store for statistics

    # Write CSV (delete=False so the file survives for Gradio to serve).
    csv_file = tempfile.NamedTemporaryFile(
        delete=False, suffix=".csv", mode="w", encoding="utf-8", newline=""
    )
    with csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Token", "Tag"])
        writer.writerows(tagged)

    # Write XML: one <text> root holding a single <s> with every token.
    root = ET.Element("text", id="typed-input")
    sentence_el = ET.SubElement(root, "s")
    for token_text, tag in tagged:
        word_el = ET.SubElement(sentence_el, "w")
        word_el.set("pos", tag)
        word_el.text = token_text
    xml_str = ET.tostring(root, encoding="unicode")
    xml_pretty = minidom.parseString(xml_str).toprettyxml(indent=" ")
    xml_file = tempfile.NamedTemporaryFile(
        delete=False, suffix=".xml", mode="w", encoding="utf-8"
    )
    with xml_file:
        xml_file.write(xml_pretty)

    return tagged, csv_file.name, xml_file.name
def process_file(file_obj):
    """Tag an uploaded .txt file line by line and export the results.

    Each non-empty line of the file becomes one ``<s>`` element in the XML
    output. Stores the tagged tokens in the module-level ``current_data``.

    Args:
        file_obj: Either a filesystem path (Gradio's default ``type="filepath"``
            for ``gr.File``) or a file-like object opened by older Gradio
            versions; both text and binary file objects are handled.

    Returns:
        Tuple of (table rows, CSV file path, XML file path), where each table
        row is a ``(token, tag)`` pair.
    """
    global current_data

    # gr.File may hand us a path or an open file object depending on the
    # Gradio version/configuration — support both.
    if isinstance(file_obj, (str, os.PathLike)):
        with open(file_obj, encoding="utf-8") as fh:
            raw_text = fh.read()
    else:
        raw_text = file_obj.read()
        if isinstance(raw_text, bytes):
            raw_text = raw_text.decode("utf-8")

    csv_rows = [("Token", "Tag")]
    table_data = []
    root = ET.Element("text", id="uploaded")
    for line in raw_text.splitlines():
        line = line.strip()
        if not line:
            continue  # skip blank lines; they carry no tokens
        doc = nlp(line)
        sentence_el = ET.SubElement(root, "s")
        for token in doc:
            # Punctuation keeps its literal character as the "tag".
            tag = token.text if token.is_punct else token.tag_
            word_el = ET.SubElement(sentence_el, "w")
            word_el.set("pos", tag)
            word_el.text = token.text
            csv_rows.append((token.text, tag))
            table_data.append((token.text, tag))
    current_data = table_data  # Store for statistics

    # Write CSV (delete=False so the file survives for Gradio to serve).
    csv_file = tempfile.NamedTemporaryFile(
        delete=False, suffix=".csv", mode="w", encoding="utf-8", newline=""
    )
    with csv_file:
        writer = csv.writer(csv_file)
        writer.writerows(csv_rows)

    # Write XML, pretty-printed for readability.
    xml_str = ET.tostring(root, encoding="unicode")
    xml_pretty = minidom.parseString(xml_str).toprettyxml(indent=" ")
    xml_file = tempfile.NamedTemporaryFile(
        delete=False, suffix=".xml", mode="w", encoding="utf-8"
    )
    with xml_file:
        xml_file.write(xml_pretty)

    return table_data, csv_file.name, xml_file.name
def generate_statistics(data=None):
    """Build a tag-frequency table for tagged text.

    Args:
        data: Optional list of ``(token, tag)`` pairs. Defaults to the
            module-level ``current_data`` filled by the tagging functions.

    Returns:
        List of ``[tag, raw_count, per-10,000-words, percentage]`` rows sorted
        by descending frequency, or a message string when there is no data.
    """
    rows = current_data if data is None else data
    if not rows:
        return "No data to analyse. Please tag some text first."
    tags = [tag for _, tag in rows]
    tag_counts = Counter(tags)
    total_tokens = len(tags)
    # Relative frequency is normalised per 10,000 words so differently sized
    # texts can be compared.
    return [
        [tag, count, f"{(count/total_tokens)*10000:.2f}", f"{count/total_tokens*100:.2f}%"]
        for tag, count in tag_counts.most_common()
    ]
def filter_nouns(data=None):
    """Build a frequency table restricted to noun tags (CLAWS tags N*).

    Args:
        data: Optional list of ``(token, tag)`` pairs. Defaults to the
            module-level ``current_data`` filled by the tagging functions.

    Returns:
        List of ``[tag, raw_count, per-10,000-words, percentage]`` rows for
        tags starting with 'N', or a message string when there is no data.
        Percentages are relative to ALL tags, not just nouns.
    """
    rows = current_data if data is None else data
    if not rows:
        return "No data to analyse. Please tag some text first."
    tags = [tag for _, tag in rows]
    tag_counts = Counter(tags)
    total_tokens = len(tags)
    # Keep only tags starting with N (nouns in the CLAWS7 tagset).
    return [
        [tag, count, f"{(count/total_tokens)*10000:.2f}", f"{count/total_tokens*100:.2f}%"]
        for tag, count in tag_counts.most_common()
        if tag.startswith('N')
    ]
def filter_verbs(data=None):
    """Build a frequency table restricted to verb tags (CLAWS tags V*).

    Args:
        data: Optional list of ``(token, tag)`` pairs. Defaults to the
            module-level ``current_data`` filled by the tagging functions.

    Returns:
        List of ``[tag, raw_count, per-10,000-words, percentage]`` rows for
        tags starting with 'V', or a message string when there is no data.
        Percentages are relative to ALL tags, not just verbs.
    """
    rows = current_data if data is None else data
    if not rows:
        return "No data to analyse. Please tag some text first."
    tags = [tag for _, tag in rows]
    tag_counts = Counter(tags)
    total_tokens = len(tags)
    # Keep only tags starting with V (verbs in the CLAWS7 tagset).
    return [
        [tag, count, f"{(count/total_tokens)*10000:.2f}", f"{count/total_tokens*100:.2f}%"]
        for tag, count in tag_counts.most_common()
        if tag.startswith('V')
    ]
def filter_open_class(data=None):
    """Build a frequency table restricted to open-class word tags.

    Open-class here means CLAWS tags starting with N (nouns), V (verbs),
    J (adjectives) or R (adverbs).

    Args:
        data: Optional list of ``(token, tag)`` pairs. Defaults to the
            module-level ``current_data`` filled by the tagging functions.

    Returns:
        List of ``[tag, raw_count, per-10,000-words, percentage]`` rows, or a
        message string when there is no data. Percentages are relative to ALL
        tags, not just the open-class ones.
    """
    rows = current_data if data is None else data
    if not rows:
        return "No data to analyse. Please tag some text first."
    tags = [tag for _, tag in rows]
    tag_counts = Counter(tags)
    total_tokens = len(tags)
    # startswith with a tuple replaces tag[0] so an empty tag cannot raise
    # IndexError; behaviour is identical for non-empty tags.
    return [
        [tag, count, f"{(count/total_tokens)*10000:.2f}", f"{count/total_tokens*100:.2f}%"]
        for tag, count in tag_counts.most_common()
        if tag.startswith(('N', 'V', 'J', 'R'))
    ]
# User Interface: four tabs — tagging, statistics, about, and tagset reference.
# (Indentation restored: the source had been flattened and was not valid Python.)
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Older Scots CLAWS PoS Tagger")

    with gr.Tab("Tag Text"):
        gr.Markdown("Enter text or upload a file to tag and download outputs in .csv and .xml formats.")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Type Text")
                txt_input = gr.Textbox(lines=5, label="Enter text in Older Scots")
                with gr.Row():
                    tag_button = gr.Button("Tag Text", variant="primary")
                    sample_button = gr.Button("Try Sample Text")
            with gr.Column():
                gr.Markdown("### Upload File")
                file_input = gr.File(label="Upload a .txt file", file_types=[".txt"])
                process_button = gr.Button("Process File")
        output_table = gr.Dataframe(headers=["Token", "CLAWS Tag"], interactive=True)
        with gr.Row():
            csv_download = gr.DownloadButton(label="Download CSV")
            xml_download = gr.DownloadButton(label="Download XML")
        # Wire up the actions: fill sample text, tag typed text, tag a file.
        sample_button.click(fn=lambda: SAMPLE_TEXT, inputs=None, outputs=txt_input)
        tag_button.click(fn=tag_text, inputs=txt_input, outputs=[output_table, csv_download, xml_download])
        process_button.click(fn=process_file, inputs=file_input, outputs=[output_table, csv_download, xml_download])

    with gr.Tab("Statistics"):
        gr.Markdown("View statistical analysis of your tagged text.")
        analyze_button = gr.Button("Generate Statistics", variant="primary")
        stats_table = gr.Dataframe(
            headers=["Tag", "Raw Frequency", "Relative Frequency (per 10,000 words)", "Percentage of total tags"],
            interactive=True
        )
        # Filter buttons restrict the frequency table to broad PoS classes.
        gr.Markdown("### Filter by Category")
        with gr.Row():
            nouns_button = gr.Button("Nouns")
            verbs_button = gr.Button("Verbs")
            open_class_button = gr.Button("Open-class words")
            all_button = gr.Button("Show All")
        analyze_button.click(fn=generate_statistics, inputs=None, outputs=stats_table)
        nouns_button.click(fn=filter_nouns, inputs=None, outputs=stats_table)
        verbs_button.click(fn=filter_verbs, inputs=None, outputs=stats_table)
        open_class_button.click(fn=filter_open_class, inputs=None, outputs=stats_table)
        all_button.click(fn=generate_statistics, inputs=None, outputs=stats_table)

    with gr.Tab("About the Tagger"):
        gr.Markdown("""
## What is this tagger?
This is a Part-of-Speech (PoS) tagger trained specifically for Older Scots texts. It uses the CLAWS7 tagset to identify grammatical categories (Garside et al. 1997).
## How does it work?
The tagger uses a [spaCy](https://spacy.io/) model trained on a pre-tagged corpus of Older Scots literature (Bushnell 2021). The model builds on existing English training to be able to recognise and tag Older Scots language.
## Output Formats
- **In-browser table**: Within the browser, you can view the original text alongside each PoS tag
- **CSV**: Spreadsheet format with columns for words and tags (like what you see in the browser)
- **XML**: Holds the same data as the table and csv files, but in a format that can be uploaded pre-tagged to LancsBox for further analysis
## Citation
If you use this tagger in your research, please cite:
Beattie, Beth. 2026. 'Older Scots Tagger'. Hugging Face. https://huggingface.co/spaces/bbeattie612/OlderScotsTagger.
## Feedback and Questions
This tagger is very much a work in progress. If you have any feedback or ideas for future developments, please fill out [this form](https://forms.cloud.microsoft/r/s7zZpDeDAn).
## References
- Bushnell, Megan. 2021. 'Equivalency, Page Design, and Corpus Linguistics: An Interdisciplinary Approach to Gavin Douglas's "Eneados"'. PhD, University of Oxford. https://ora.ox.ac.uk/objects/uuid:1ee08a4e-8a00-4641-b368-1d568b97ac31.
- Garside, Roger, and Nicholas Smith. 1997. 'A Hybrid Grammatical Tagger: CLAWS4'. In _Corpus Annotation: Linguistic Information from Computer Text Corpora_, edited by Roger Garside, Geoffery N. Leech, and Anthony McEnery. Longman.
""")

    with gr.Tab("CLAWS7 Tagset"):
        gr.Markdown("""
## CLAWS7 Tagset
This table explains the tags used in the CLAWS7 tagset, as found [here](https://ucrel.lancs.ac.uk/claws7tags.html).""")
        tag_definitions = gr.Dataframe(
            value=[
                ["APPGE", "possessive pronoun, pre-nominal (e.g. my, your, our)"],
                ["AT", "article (e.g. the, no)"],
                ["AT1", "singular article (e.g. a, an, every)"],
                ["BCL", "before-clause marker (e.g. in order (that),in order (to))"],
                ["CC", "coordinating conjunction (e.g. and, or)"],
                ["CCB", "adversative coordinating conjunction (but)"],
                ["CS", "subordinating conjunction (e.g. if, because, unless, so, for)"],
                ["CSA", "as (as conjunction)"],
                ["CSN", "than (as conjunction)"],
                ["CST", "that (as conjunction)"],
                ["CSW", "whether (as conjunction)"],
                ["DA", "after-determiner or post-determiner capable of pronominal function (e.g. such, former, same)"],
                ["DA1", "singular after-determiner (e.g. little, much)"],
                ["DA2", "plural after-determiner (e.g. few, several, many)"],
                ["DAR", "comparative after-determiner (e.g. more, less, fewer)"],
                ["DAT", "superlative after-determiner (e.g. most, least, fewest)"],
                ["DB", "before determiner or pre-determiner capable of pronominal function (all, half)"],
                ["DB2", "plural before-determiner ( both)"],
                ["DD", "determiner (capable of pronominal function) (e.g any, some)"],
                ["DD1", "singular determiner (e.g. this, that, another)"],
                ["DD2", "plural determiner ( these,those)"],
                ["DDQ", "wh-determiner (which, what)"],
                ["DDQGE", "wh-determiner, genitive (whose)"],
                ["DDQV", "wh-ever determiner, (whichever, whatever)"],
                ["EX", "existential there"],
                ["FO", "formula"],
                ["FU", "unclassified word"],
                ["FW", "foreign word"],
                ["GE", "germanic genitive marker - (' or 's)"],
                ["IF", "for (as preposition)"],
                ["II", "general preposition"],
                ["IO", "of (as preposition)"],
                ["IW", "with, without (as prepositions)"],
                ["JJ", "general adjective"],
                ["JJR", "general comparative adjective (e.g. older, better, stronger)"],
                ["JJT", "general superlative adjective (e.g. oldest, best, strongest)"],
                ["JK", "catenative adjective (able in be able to, willing in be willing to)"],
                ["MC", "cardinal number,neutral for number (two, three..)"],
                ["MC1", "singular cardinal number (one)"],
                ["MC2", "plural cardinal number (e.g. sixes, sevens)"],
                ["MCGE", "genitive cardinal number, neutral for number (two's, 100's)"],
                ["MCMC", "hyphenated number (40-50, 1770-1827)"],
                ["MD", "ordinal number (e.g. first, second, next, last)"],
                ["MF", "fraction,neutral for number (e.g. quarters, two-thirds)"],
                ["ND1", "singular noun of direction (e.g. north, southeast)"],
                ["NN", "common noun, neutral for number (e.g. sheep, cod, headquarters)"],
                ["NN1", "singular common noun (e.g. book, girl)"],
                ["NN2", "plural common noun (e.g. books, girls)"],
                ["NNA", "following noun of title (e.g. M.A.)"],
                ["NNB", "preceding noun of title (e.g. Mr., Prof.)"],
                ["NNL1", "singular locative noun (e.g. Island, Street)"],
                ["NNL2", "plural locative noun (e.g. Islands, Streets)"],
                ["NNO", "numeral noun, neutral for number (e.g. dozen, hundred)"],
                ["NNO2", "numeral noun, plural (e.g. hundreds, thousands)"],
                ["NNT1", "temporal noun, singular (e.g. day, week, year)"],
                ["NNT2", "temporal noun, plural (e.g. days, weeks, years)"],
                ["NNU", "unit of measurement, neutral for number (e.g. in, cc)"],
                ["NNU1", "singular unit of measurement (e.g. inch, centimetre)"],
                ["NNU2", "plural unit of measurement (e.g. ins., feet)"],
                ["NP", "proper noun, neutral for number (e.g. IBM, Andes)"],
                ["NP1", "singular proper noun (e.g. London, Jane, Frederick)"],
                ["NP2", "plural proper noun (e.g. Browns, Reagans, Koreas)"],
                ["NPD1", "singular weekday noun (e.g. Sunday)"],
                ["NPD2", "plural weekday noun (e.g. Sundays)"],
                ["NPM1", "singular month noun (e.g. October)"],
                ["NPM2", "plural month noun (e.g. Octobers)"],
                ["PN", "indefinite pronoun, neutral for number (none)"],
                ["PN1", "indefinite pronoun, singular (e.g. anyone, everything, nobody, one)"],
                ["PNQO", "objective wh-pronoun (whom)"],
                ["PNQS", "subjective wh-pronoun (who)"],
                ["PNQV", "wh-ever pronoun (whoever)"],
                ["PNX1", "reflexive indefinite pronoun (oneself)"],
                ["PPGE", "nominal possessive personal pronoun (e.g. mine, yours)"],
                ["PPH1", "3rd person sing. neuter personal pronoun (it)"],
                ["PPHO1", "3rd person sing. objective personal pronoun (him, her)"],
                ["PPHO2", "3rd person plural objective personal pronoun (them)"],
                ["PPHS1", "3rd person sing. subjective personal pronoun (he, she)"],
                ["PPHS2", "3rd person plural subjective personal pronoun (they)"],
                ["PPIO1", "1st person sing. objective personal pronoun (me)"],
                ["PPIO2", "1st person plural objective personal pronoun (us)"],
                ["PPIS1", "1st person sing. subjective personal pronoun (I)"],
                ["PPIS2", "1st person plural subjective personal pronoun (we)"],
                ["PPX1", "singular reflexive personal pronoun (e.g. yourself, itself)"],
                ["PPX2", "plural reflexive personal pronoun (e.g. yourselves, themselves)"],
                ["PPY", "2nd person personal pronoun (you)"],
                ["RA", "adverb, after nominal head (e.g. else, galore)"],
                ["REX", "adverb introducing appositional constructions (namely, e.g.)"],
                ["RG", "degree adverb (very, so, too)"],
                ["RGQ", "wh- degree adverb (how)"],
                ["RGQV", "wh-ever degree adverb (however)"],
                ["RGR", "comparative degree adverb (more, less)"],
                ["RGT", "superlative degree adverb (most, least)"],
                ["RL", "locative adverb (e.g. alongside, forward)"],
                ["RP", "prep. adverb, particle (e.g about, in)"],
                ["RPK", "prep. adv., catenative (about in be about to)"],
                ["RR", "general adverb"],
                ["RRQ", "wh- general adverb (where, when, why, how)"],
                ["RRQV", "wh-ever general adverb (wherever, whenever)"],
                ["RRR", "comparative general adverb (e.g. better, longer)"],
                ["RRT", "superlative general adverb (e.g. best, longest)"],
                ["RT", "quasi-nominal adverb of time (e.g. now, tomorrow)"],
                ["TO", "infinitive marker (to)"],
                ["UH", "interjection (e.g. oh, yes, um)"],
                ["VB0", "be, base form (finite i.e. imperative, subjunctive)"],
                ["VBDR", "were"],
                ["VBDZ", "was"],
                ["VBG", "being"],
                ["VBI", "be, infinitive (To be or not... It will be ..)"],
                ["VBM", "am"],
                ["VBN", "been"],
                ["VBR", "are"],
                ["VBZ", "is"],
                ["VD0", "do, base form (finite)"],
                ["VDD", "did"],
                ["VDG", "doing"],
                ["VDI", "do, infinitive (I may do... To do...)"],
                ["VDN", "done"],
                ["VDZ", "does"],
                ["VH0", "have, base form (finite)"],
                ["VHD", "had (past tense)"],
                ["VHG", "having"],
                ["VHI", "have, infinitive"],
                ["VHN", "had (past participle)"],
                ["VHZ", "has"],
                ["VM", "modal auxiliary (can, will, would, etc.)"],
                ["VMK", "modal catenative (ought, used)"],
                ["VV0", "base form of lexical verb (e.g. give, work)"],
                ["VVD", "past tense of lexical verb (e.g. gave, worked)"],
                ["VVG", "-ing participle of lexical verb (e.g. giving, working)"],
                ["VVGK", "-ing participle catenative (going in be going to)"],
                ["VVI", "infinitive (e.g. to give... It will work...)"],
                ["VVN", "past participle of lexical verb (e.g. given, worked)"],
                ["VVNK", "past participle catenative (e.g. bound in be bound to)"],
                ["VVZ", "-s form of lexical verb (e.g. gives, works)"],
                ["XX", "not, n't"],
                ["ZZ1", "singular letter of the alphabet (e.g. A, b)"],
                ["ZZ2", "plural letter of the alphabet (e.g. A's, b's)"]
            ],
            headers=["Tag", "Definition"],
            interactive=True,  # Makes it searchable/filterable
            wrap=True
        )
# Launch the app when run as a script (indentation restored — the flattened
# source put demo.launch() at top level, outside the guard).
if __name__ == "__main__":
    demo.launch()