# Spaces status banner (not part of the program): Runtime error
import csv
import os
import tempfile
import xml.etree.ElementTree as ET
from collections import Counter
from pathlib import Path
from xml.dom import minidom

import gradio as gr
import pandas as pd
import spacy
# The trained pipeline is loaded from a folder that sits next to this file.
MODEL_DIR = os.path.join(
    os.path.dirname(__file__),
    "en_osc_claws7",
)

# Loaded once at import time and reused by every tagging request.
nlp = spacy.load(MODEL_DIR)

# Text placed in the input box by the "Try Sample Text" button.
SAMPLE_TEXT = (
    "Gif ze, throw curiositie of novationis, hes forzet our auld plane "
    "Scottis quhilk zour mother learnit zou in times coming, I sall write "
    "to zou my mind in Latin, for I am nocht acquaintit with zour Southern."
)

# Most recent (token, tag) pairs; rebound by the tagging functions and read
# by the statistics functions.
current_data = []
def tag_text(text):
    """Tag typed text and export the result as CSV and XML files.

    Runs the module-level ``nlp`` pipeline over ``text``, stores the
    resulting ``(token, tag)`` pairs in the module-level ``current_data``
    (read later by the statistics functions) and writes the same pairs to
    two temporary files.

    Punctuation tokens are labelled with their own surface form instead of
    the model's tag. The returned table follows the same rule as the files,
    so the on-screen table, the downloads and the statistics all agree.

    Args:
        text: The raw text to tag. ``None`` is treated as an empty string.

    Returns:
        A 3-tuple ``(table_data, csv_path, xml_path)``: the list of
        ``(token, tag)`` tuples, and the paths of the temporary CSV and XML
        files. Both files are created with ``delete=False`` and are therefore
        not removed automatically.
    """
    global current_data

    doc = nlp(text or "")

    # One rule for every output: punctuation is tagged as itself.
    table_data = [
        (token.text, token.text if token.is_punct else token.tag_)
        for token in doc
    ]
    current_data = table_data  # Store for statistics

    # Write csv; the context manager closes the file even if a write fails.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".csv", mode="w", encoding="utf-8", newline=""
    ) as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Token", "Tag"])
        writer.writerows(table_data)

    # Write XML: typed input is exported as a single <s> element.
    root = ET.Element("text", id="typed-input")
    sentence_el = ET.SubElement(root, "s")
    for word, tag in table_data:
        word_el = ET.SubElement(sentence_el, "w")
        word_el.set("pos", tag)
        word_el.text = word

    xml_str = ET.tostring(root, encoding="unicode")
    xml_pretty = minidom.parseString(xml_str).toprettyxml(indent=" ")
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".xml", mode="w", encoding="utf-8"
    ) as xml_file:
        xml_file.write(xml_pretty)

    return table_data, csv_file.name, xml_file.name
def _read_uploaded_text(file_obj):
    """Return the text of an upload, whatever form the upload arrives in.

    Accepts raw bytes, an object with a ``read()`` method, a path string /
    path-like object, or a wrapper that exposes the path as ``.name``.
    Bytes are decoded as UTF-8; a leading byte-order mark is dropped so it
    does not end up glued to the first token.
    """
    if isinstance(file_obj, (bytes, bytearray)):
        payload = bytes(file_obj)
    elif hasattr(file_obj, "read"):
        payload = file_obj.read()
    else:
        # Either the path itself or an object carrying the path in ``.name``.
        payload = Path(getattr(file_obj, "name", file_obj)).read_bytes()

    if isinstance(payload, str):
        return payload.lstrip("\ufeff")
    return payload.decode("utf-8-sig")


def process_file(file_obj):
    """Tag an uploaded text file line by line and export CSV and XML files.

    Every non-blank line of the upload is stripped, run through the
    module-level ``nlp`` pipeline and written as one ``<s>`` element in the
    XML output. Punctuation tokens are labelled with their own surface form
    instead of the model's tag. The collected ``(token, tag)`` pairs are
    stored in the module-level ``current_data`` for the statistics functions.

    Args:
        file_obj: The value delivered by the upload component. Depending on
            the Gradio version and the component's ``type`` setting this may
            be a path string, raw bytes or a file-like object; all of these
            are accepted.

    Returns:
        A 3-tuple ``(table_data, csv_path, xml_path)``: the list of
        ``(token, tag)`` tuples, and the paths of the temporary CSV and XML
        files. Both files are created with ``delete=False`` and are therefore
        not removed automatically.

    Raises:
        gr.Error: If no file was uploaded (``file_obj`` is ``None``).
    """
    global current_data

    if file_obj is None:
        # Surface a readable message instead of an AttributeError traceback.
        raise gr.Error("Please upload a .txt file before processing.")

    raw_text = _read_uploaded_text(file_obj)

    table_data = []
    root = ET.Element("text", id="uploaded")
    for line in raw_text.splitlines():
        line = line.strip()
        if not line:
            continue
        sentence_el = ET.SubElement(root, "s")
        for token in nlp(line):
            tag = token.text if token.is_punct else token.tag_
            word_el = ET.SubElement(sentence_el, "w")
            word_el.set("pos", tag)
            word_el.text = token.text
            table_data.append((token.text, tag))

    current_data = table_data  # Store for statistics

    # Write csv; the context manager closes the file even if a write fails.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".csv", mode="w", encoding="utf-8", newline=""
    ) as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(("Token", "Tag"))
        writer.writerows(table_data)

    # Write xml
    xml_str = ET.tostring(root, encoding="unicode")
    xml_pretty = minidom.parseString(xml_str).toprettyxml(indent=" ")
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".xml", mode="w", encoding="utf-8"
    ) as xml_file:
        xml_file.write(xml_pretty)

    return table_data, csv_file.name, xml_file.name
# Shown in the table when nothing has been tagged yet.
_NO_DATA_MESSAGE = "No data to analyse. Please tag some text first."


def _frequency_rows(prefixes=None):
    """Build tag-frequency rows from the module-level ``current_data``.

    Args:
        prefixes: ``None`` to keep every tag, or a string / tuple of strings;
            only tags starting with one of them are kept.

    Returns:
        A list of ``[tag, raw count, per-10,000 rate, percentage]`` rows,
        most frequent tag first. Rates and percentages are always relative
        to the total number of tagged tokens, also when a filter is applied.
        When there is no tagged data, a single row carrying the "no data"
        message is returned: the result is rendered in a table component,
        and a bare string there is interpreted as a CSV path rather than
        displayed.
    """
    if not current_data:
        return [[_NO_DATA_MESSAGE, "", "", ""]]

    tags = [tag for _, tag in current_data]
    tag_counts = Counter(tags)
    total_tokens = len(tags)

    return [
        [tag, count, f"{(count/total_tokens)*10000:.2f}", f"{count/total_tokens*100:.2f}%"]
        for tag, count in tag_counts.most_common()
        # str.startswith is safe on empty tags, unlike indexing tag[0].
        if prefixes is None or tag.startswith(prefixes)
    ]


def generate_statistics():
    """Return frequency rows for every tag in the current tagged data."""
    return _frequency_rows()


def filter_nouns():
    """Return frequency rows for tags starting with ``N`` only."""
    return _frequency_rows("N")


def filter_verbs():
    """Return frequency rows for tags starting with ``V`` only."""
    return _frequency_rows("V")


def filter_open_class():
    """Return frequency rows for tags starting with ``N``, ``V``, ``J`` or ``R``."""
    return _frequency_rows(("N", "V", "J", "R"))
# User interface: four tabs (tagging, statistics, about, tagset reference).
# NOTE(review): the nesting below is reconstructed from statement order and
# the surviving comments, because the source had lost its indentation.
# Confirm the layout against the running app.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Older Scots CLAWS PoS Tagger")

    # Tab 1: tag typed or uploaded text and offer the results for download.
    with gr.Tab("Tag Text"):
        gr.Markdown("Enter text or upload a file to tag and download outputs in .csv and .xml formats.")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Type Text")
                txt_input = gr.Textbox(lines=5, label="Enter text in Older Scots")
                # Add buttons in a row
                with gr.Row():
                    tag_button = gr.Button("Tag Text", variant="primary")
                    sample_button = gr.Button("Try Sample Text")
            with gr.Column():
                gr.Markdown("### Upload File")
                file_input = gr.File(label="Upload a .txt file", file_types=[".txt"])
                process_button = gr.Button("Process File")
        # Shared result area: both tagging routes write to the same table and
        # the same pair of download buttons.
        output_table = gr.Dataframe(headers=["Token", "CLAWS Tag"], interactive=True)
        with gr.Row():
            csv_download = gr.DownloadButton(label="Download CSV")
            xml_download = gr.DownloadButton(label="Download XML")
        # The sample button only fills the text box; tagging still needs a
        # click on "Tag Text".
        sample_button.click(fn=lambda: SAMPLE_TEXT, inputs=None, outputs=txt_input)
        tag_button.click(fn=tag_text, inputs=txt_input, outputs=[output_table, csv_download, xml_download])
        process_button.click(fn=process_file, inputs=file_input, outputs=[output_table, csv_download, xml_download])

    # Tab 2: tag-frequency statistics for the most recently tagged text.
    with gr.Tab("Statistics"):
        gr.Markdown("View statistical analysis of your tagged text.")
        analyze_button = gr.Button("Generate Statistics", variant="primary")
        stats_table = gr.Dataframe(
            headers=["Tag", "Raw Frequency", "Relative Frequency (per 10,000 words)", "Percentage of total tags"],
            interactive=True
        )
        # Add filter buttons (INDENTED to be inside the Statistics tab)
        gr.Markdown("### Filter by Category")
        with gr.Row():
            nouns_button = gr.Button("Nouns")
            verbs_button = gr.Button("Verbs")
            open_class_button = gr.Button("Open-class words")
            all_button = gr.Button("Show All")
        # These handlers take no inputs: each function reads the module-level
        # current_data. "Show All" reuses the unfiltered statistics function.
        analyze_button.click(fn=generate_statistics, inputs=None, outputs=stats_table)
        nouns_button.click(fn=filter_nouns, inputs=None, outputs=stats_table)
        verbs_button.click(fn=filter_verbs, inputs=None, outputs=stats_table)
        open_class_button.click(fn=filter_open_class, inputs=None, outputs=stats_table)
        all_button.click(fn=generate_statistics, inputs=None, outputs=stats_table)

    # Tab 3: static background, citation and reference information.
    with gr.Tab("About the Tagger"):
        gr.Markdown("""
## What is this tagger?
This is a Part-of-Speech (PoS) tagger trained specifically for Older Scots texts. It uses the CLAWS7 tagset to identify grammatical categories (Garside et al. 1997).
## How does it work?
The tagger uses a [spaCy](https://spacy.io/) model trained on a pre-tagged corpus of Older Scots literature (Bushnell 2021). The model builds on existing English training to be able to recognise and tag Older Scots language.
## Output Formats
- **In-browser table**: Within the browser, you can view the original text alongside each PoS tag
- **CSV**: Spreadsheet format with columns for words and tags (like what you see in the browser)
- **XML**: Holds the same data as the table and csv files, but in a format that can be uploaded pre-tagged to LancsBox for further analysis
## Citation
If you use this tagger in your research, please cite:
Beattie, Beth. 2026. 'Older Scots Tagger'. Hugging Face. https://huggingface.co/spaces/bbeattie612/OlderScotsTagger.
## Feedback and Questions
This tagger is very much a work in progress. If you have any feedback or ideas for future developments, please fill out [this form](https://forms.cloud.microsoft/r/s7zZpDeDAn).
## References
- Bushnell, Megan. 2021. 'Equivalency, Page Design, and Corpus Linguistics: An Interdisciplinary Approach to Gavin Douglas's "Eneados"'. PhD, University of Oxford. https://ora.ox.ac.uk/objects/uuid:1ee08a4e-8a00-4641-b368-1d568b97ac31.
- Garside, Roger, and Nicholas Smith. 1997. 'A Hybrid Grammatical Tagger: CLAWS4'. In _Corpus Annotation: Linguistic Information from Computer Text Corpora_, edited by Roger Garside, Geoffery N. Leech, and Anthony McEnery. Longman.
""")

    # Tab 4: static lookup table of CLAWS7 tags and their definitions.
    with gr.Tab("CLAWS7 Tagset"):
        gr.Markdown("""
## CLAWS7 Tagset
This table explains the tags used in the CLAWS7 tagset, as found [here](https://ucrel.lancs.ac.uk/claws7tags.html).""")
        tag_definitions = gr.Dataframe(
            value=[
                ["APPGE", "possessive pronoun, pre-nominal (e.g. my, your, our)"],
                ["AT", "article (e.g. the, no)"],
                ["AT1", "singular article (e.g. a, an, every)"],
                ["BCL", "before-clause marker (e.g. in order (that),in order (to))"],
                ["CC", "coordinating conjunction (e.g. and, or)"],
                ["CCB", "adversative coordinating conjunction (but)"],
                ["CS", "subordinating conjunction (e.g. if, because, unless, so, for)"],
                ["CSA", "as (as conjunction)"],
                ["CSN", "than (as conjunction)"],
                ["CST", "that (as conjunction)"],
                ["CSW", "whether (as conjunction)"],
                ["DA", "after-determiner or post-determiner capable of pronominal function (e.g. such, former, same)"],
                ["DA1", "singular after-determiner (e.g. little, much)"],
                ["DA2", "plural after-determiner (e.g. few, several, many)"],
                ["DAR", "comparative after-determiner (e.g. more, less, fewer)"],
                ["DAT", "superlative after-determiner (e.g. most, least, fewest)"],
                ["DB", "before determiner or pre-determiner capable of pronominal function (all, half)"],
                ["DB2", "plural before-determiner ( both)"],
                ["DD", "determiner (capable of pronominal function) (e.g any, some)"],
                ["DD1", "singular determiner (e.g. this, that, another)"],
                ["DD2", "plural determiner ( these,those)"],
                ["DDQ", "wh-determiner (which, what)"],
                ["DDQGE", "wh-determiner, genitive (whose)"],
                ["DDQV", "wh-ever determiner, (whichever, whatever)"],
                ["EX", "existential there"],
                ["FO", "formula"],
                ["FU", "unclassified word"],
                ["FW", "foreign word"],
                ["GE", "germanic genitive marker - (' or 's)"],
                ["IF", "for (as preposition)"],
                ["II", "general preposition"],
                ["IO", "of (as preposition)"],
                ["IW", "with, without (as prepositions)"],
                ["JJ", "general adjective"],
                ["JJR", "general comparative adjective (e.g. older, better, stronger)"],
                ["JJT", "general superlative adjective (e.g. oldest, best, strongest)"],
                ["JK", "catenative adjective (able in be able to, willing in be willing to)"],
                ["MC", "cardinal number,neutral for number (two, three..)"],
                ["MC1", "singular cardinal number (one)"],
                ["MC2", "plural cardinal number (e.g. sixes, sevens)"],
                ["MCGE", "genitive cardinal number, neutral for number (two's, 100's)"],
                ["MCMC", "hyphenated number (40-50, 1770-1827)"],
                ["MD", "ordinal number (e.g. first, second, next, last)"],
                ["MF", "fraction,neutral for number (e.g. quarters, two-thirds)"],
                ["ND1", "singular noun of direction (e.g. north, southeast)"],
                ["NN", "common noun, neutral for number (e.g. sheep, cod, headquarters)"],
                ["NN1", "singular common noun (e.g. book, girl)"],
                ["NN2", "plural common noun (e.g. books, girls)"],
                ["NNA", "following noun of title (e.g. M.A.)"],
                ["NNB", "preceding noun of title (e.g. Mr., Prof.)"],
                ["NNL1", "singular locative noun (e.g. Island, Street)"],
                ["NNL2", "plural locative noun (e.g. Islands, Streets)"],
                ["NNO", "numeral noun, neutral for number (e.g. dozen, hundred)"],
                ["NNO2", "numeral noun, plural (e.g. hundreds, thousands)"],
                ["NNT1", "temporal noun, singular (e.g. day, week, year)"],
                ["NNT2", "temporal noun, plural (e.g. days, weeks, years)"],
                ["NNU", "unit of measurement, neutral for number (e.g. in, cc)"],
                ["NNU1", "singular unit of measurement (e.g. inch, centimetre)"],
                ["NNU2", "plural unit of measurement (e.g. ins., feet)"],
                ["NP", "proper noun, neutral for number (e.g. IBM, Andes)"],
                ["NP1", "singular proper noun (e.g. London, Jane, Frederick)"],
                ["NP2", "plural proper noun (e.g. Browns, Reagans, Koreas)"],
                ["NPD1", "singular weekday noun (e.g. Sunday)"],
                ["NPD2", "plural weekday noun (e.g. Sundays)"],
                ["NPM1", "singular month noun (e.g. October)"],
                ["NPM2", "plural month noun (e.g. Octobers)"],
                ["PN", "indefinite pronoun, neutral for number (none)"],
                ["PN1", "indefinite pronoun, singular (e.g. anyone, everything, nobody, one)"],
                ["PNQO", "objective wh-pronoun (whom)"],
                ["PNQS", "subjective wh-pronoun (who)"],
                ["PNQV", "wh-ever pronoun (whoever)"],
                ["PNX1", "reflexive indefinite pronoun (oneself)"],
                ["PPGE", "nominal possessive personal pronoun (e.g. mine, yours)"],
                ["PPH1", "3rd person sing. neuter personal pronoun (it)"],
                ["PPHO1", "3rd person sing. objective personal pronoun (him, her)"],
                ["PPHO2", "3rd person plural objective personal pronoun (them)"],
                ["PPHS1", "3rd person sing. subjective personal pronoun (he, she)"],
                ["PPHS2", "3rd person plural subjective personal pronoun (they)"],
                ["PPIO1", "1st person sing. objective personal pronoun (me)"],
                ["PPIO2", "1st person plural objective personal pronoun (us)"],
                ["PPIS1", "1st person sing. subjective personal pronoun (I)"],
                ["PPIS2", "1st person plural subjective personal pronoun (we)"],
                ["PPX1", "singular reflexive personal pronoun (e.g. yourself, itself)"],
                ["PPX2", "plural reflexive personal pronoun (e.g. yourselves, themselves)"],
                ["PPY", "2nd person personal pronoun (you)"],
                ["RA", "adverb, after nominal head (e.g. else, galore)"],
                ["REX", "adverb introducing appositional constructions (namely, e.g.)"],
                ["RG", "degree adverb (very, so, too)"],
                ["RGQ", "wh- degree adverb (how)"],
                ["RGQV", "wh-ever degree adverb (however)"],
                ["RGR", "comparative degree adverb (more, less)"],
                ["RGT", "superlative degree adverb (most, least)"],
                ["RL", "locative adverb (e.g. alongside, forward)"],
                ["RP", "prep. adverb, particle (e.g about, in)"],
                ["RPK", "prep. adv., catenative (about in be about to)"],
                ["RR", "general adverb"],
                ["RRQ", "wh- general adverb (where, when, why, how)"],
                ["RRQV", "wh-ever general adverb (wherever, whenever)"],
                ["RRR", "comparative general adverb (e.g. better, longer)"],
                ["RRT", "superlative general adverb (e.g. best, longest)"],
                ["RT", "quasi-nominal adverb of time (e.g. now, tomorrow)"],
                ["TO", "infinitive marker (to)"],
                ["UH", "interjection (e.g. oh, yes, um)"],
                ["VB0", "be, base form (finite i.e. imperative, subjunctive)"],
                ["VBDR", "were"],
                ["VBDZ", "was"],
                ["VBG", "being"],
                ["VBI", "be, infinitive (To be or not... It will be ..)"],
                ["VBM", "am"],
                ["VBN", "been"],
                ["VBR", "are"],
                ["VBZ", "is"],
                ["VD0", "do, base form (finite)"],
                ["VDD", "did"],
                ["VDG", "doing"],
                ["VDI", "do, infinitive (I may do... To do...)"],
                ["VDN", "done"],
                ["VDZ", "does"],
                ["VH0", "have, base form (finite)"],
                ["VHD", "had (past tense)"],
                ["VHG", "having"],
                ["VHI", "have, infinitive"],
                ["VHN", "had (past participle)"],
                ["VHZ", "has"],
                ["VM", "modal auxiliary (can, will, would, etc.)"],
                ["VMK", "modal catenative (ought, used)"],
                ["VV0", "base form of lexical verb (e.g. give, work)"],
                ["VVD", "past tense of lexical verb (e.g. gave, worked)"],
                ["VVG", "-ing participle of lexical verb (e.g. giving, working)"],
                ["VVGK", "-ing participle catenative (going in be going to)"],
                ["VVI", "infinitive (e.g. to give... It will work...)"],
                ["VVN", "past participle of lexical verb (e.g. given, worked)"],
                ["VVNK", "past participle catenative (e.g. bound in be bound to)"],
                ["VVZ", "-s form of lexical verb (e.g. gives, works)"],
                ["XX", "not, n't"],
                ["ZZ1", "singular letter of the alphabet (e.g. A, b)"],
                ["ZZ2", "plural letter of the alphabet (e.g. A's, b's)"]
            ],
            headers=["Tag", "Definition"],
            # NOTE(review): the original comment said this makes the table
            # searchable/filterable; `interactive` presumably controls whether
            # cells are editable instead - confirm the intent for a reference table.
            interactive=True,
            wrap=True
        )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()