DanielGallagherIRE committed
Commit 586650a · 1 Parent(s): 698e6dd

upload files
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
.requirements.txt.un~ ADDED
Binary file (947 Bytes).
app.py CHANGED
@@ -1,7 +1,199 @@
  import gradio as gr
-
- def greet(name):
-     return "Hello " + name + "!!"
-
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import pandas as pd
+ import tempfile
+ import ast
+ import sys
+ import os
+ from PIL import Image
+
+ # sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+ from treetse.pipeline import Grewtse
+
+ grewtse = Grewtse()
+ treebank_path = None
+
+ def parse_treebank(path: str, treebank_selection: str) -> pd.DataFrame:
+     # module-level state so later steps know which treebank is active
+     global treebank_path
+     if treebank_selection == "None":
+         successful_treebank_parse = grewtse.parse_treebank(path)
+         treebank_path = path
+     else:
+         successful_treebank_parse = grewtse.parse_treebank(treebank_selection)
+         treebank_path = treebank_selection
+
+     return grewtse.get_morphological_features().head()
+
+ def to_masked_dataset(query, node) -> pd.DataFrame:
+     df = grewtse.generate_masked_dataset(query, node)
+     return df
+
+ def safe_str_to_dict(s):
+     try:
+         return ast.literal_eval(s)
+     except (ValueError, SyntaxError):
+         return None
+
+ def generate_minimal_pairs(query: str, node: str, alt_features: str):
+     if not grewtse.is_treebank_loaded():
+         raise ValueError("Please parse a treebank first.")
+
+     # mask each sentence matched by the query
+     resulting_dataset = to_masked_dataset(query, node)
+
+     # determine whether an alternative lexical item should be found
+     alt_features_as_dict = safe_str_to_dict(alt_features)
+     if alt_features_as_dict is not None:
+         resulting_dataset = grewtse.generate_minimal_pairs(alt_features_as_dict, {})
+
+     # save to a temporary CSV file for the download widget
+     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
+     resulting_dataset.to_csv(temp_file.name, index=False)
+     return resulting_dataset, temp_file.name
+
+ def evaluate_model(model_repo: str, target_x_label: str, alt_x_label: str, x_axis_label: str, title: str):
+     if not grewtse.are_minimal_pairs_generated():
+         raise ValueError("Please parse a treebank, mask a dataset and generate minimal pairs first.")
+
+     mp_with_eval_dataset = grewtse.evaluate_bert_mlm(model_repo)
+     vis_filename = "vis.png"
+
+     grewtse.visualise_syntactic_performance(
+         vis_filename,
+         mp_with_eval_dataset,
+         target_x_label,
+         alt_x_label,
+         x_axis_label,
+         "Confidence",
+         title,
+     )
+
+     # save to a temporary CSV file for the download widget
+     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
+     mp_with_eval_dataset.to_csv(temp_file.name, index=False)
+     return mp_with_eval_dataset, temp_file.name, vis_filename
+
+ def show_df():
+     return gr.update(visible=True)
+
+ with gr.Blocks(theme=gr.themes.Ocean()) as demo:
+     with gr.Row():
+         gr.Markdown("# GREW-TSE: A Pipeline for Query-based Targeted Syntactic Evaluation")
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("""
+             #### Load a Treebank
+             Begin by loading the treebank you'd like to work with.<br>
+             You can either select a treebank from the pre-loaded options below, or upload your own.<br>
+             """)
+
+         with gr.Column():
+             with gr.Tabs():
+                 with gr.TabItem("Choose Treebank"):
+                     treebank_selection = gr.Dropdown(
+                         choices=["None", "spanish-test-sm.conllu", "polish-test-lg.conllu"],
+                         label="Select a treebank",
+                         value="spanish-test-sm.conllu"
+                     )
+
+                 with gr.TabItem("Upload Your Own"):
+                     gr.Markdown("## Upload a .conllu File")
+                     file_input = gr.File(
+                         label="Upload .conllu file",
+                         file_types=[".conllu"],
+                         type="filepath"
+                     )
+             parse_file_button = gr.Button("Parse Treebank", size='sm', scale=1)
+
+     gr.Markdown("## Isolate A Syntactic Phenomenon")
+     morph_table = gr.Dataframe(interactive=False, visible=False)
+
+     parse_file_button.click(
+         fn=parse_treebank,
+         inputs=[file_input, treebank_selection],
+         outputs=[morph_table]
+     )
+     parse_file_button.click(
+         fn=show_df,
+         outputs=morph_table
+     )
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("""
+             **GREW (Graph Rewriting for Universal Dependencies)** is a query and transformation language used to search within and manipulate dependency treebanks. A GREW query allows linguists and NLP researchers to find specific syntactic patterns in parsed linguistic data (such as Universal Dependencies treebanks).
+             Queries are expressed as graph constraints using a concise pattern-matching syntax.
+
+             #### Example
+             The following short GREW query will target any verbs. Try it with one of the sample treebanks above.
+             Make sure to include the variable V as the target that we're trying to isolate.
+
+             ```grew
+             V [upos=\"VERB\"];
+             ```
+             """)
+         with gr.Column():
+             query_input = gr.Textbox(label="GREW Query", lines=5, placeholder="Enter your GREW query here...", value="V [upos=\"VERB\"];")
+             node_input = gr.Textbox(label="Node", placeholder="The variable in your GREW query to isolate, e.g., N", value="V")
+             feature_input = gr.Textbox(
+                 label="Enter Alternative Feature Values for Minimal Pair as a Dictionary",
+                 placeholder='e.g. {"case": "Acc", "number": "Sing"}',
+                 value="{\"mood\": \"Sub\"}",
+                 lines=3
+             )
+             run_button = gr.Button("Run Query", size='sm', scale=3)
+
+     output_table = gr.Dataframe(label="Output Table", visible=False)
+     download_file = gr.File(label="Download CSV")
+     run_button.click(
+         fn=generate_minimal_pairs,
+         inputs=[query_input, node_input, feature_input],
+         outputs=[output_table, download_file]
+     )
+     run_button.click(
+         fn=show_df,
+         outputs=output_table
+     )
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("""
+             ## Evaluate A Model
+             You can evaluate any BERT-style masked language model by providing the name of its model repository.
+             """)
+         with gr.Column():
+             repository_input = gr.Textbox(label="Model Repository", lines=1, placeholder="Enter the model repository here...", value="dccuchile/distilbert-base-spanish-uncased")
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("""
+             ## Choose Visualisation Settings
+             The results will be displayed as a visualisation which you can edit using the following settings.
+             """)
+         with gr.Column():
+             target_x_label_textbox = gr.Textbox(label="Original Label Name, i.e. the type of the 'right' token", lines=1, placeholder="Genitive Version")
+             alt_x_label_textbox = gr.Textbox(label="Alternative Label Name, i.e. the type of the 'wrong' token", lines=1, placeholder="Accusative Version")
+             x_axis_label_textbox = gr.Textbox(label="X Axis Title, i.e. what features are you comparing?", lines=1, placeholder="Case of Nouns in Transitive Verbs")
+             title_textbox = gr.Textbox(label="Visualisation Title", lines=1, placeholder="Syntactic Performance of BERT on English Transitive Noun Case")
+
+     evaluate_button = gr.Button("Evaluate Model", size='sm', scale=3)
+
+     mp_with_eval_output_dataset = gr.Dataframe(label="Output Table", visible=False)
+     mp_with_eval_output_download = gr.File(label="Download CSV")
+     visualisation_widget = gr.Image(type="pil", label="Loaded Image")
+
+     evaluate_button.click(
+         fn=evaluate_model,
+         inputs=[repository_input, target_x_label_textbox, alt_x_label_textbox, x_axis_label_textbox, title_textbox],
+         outputs=[mp_with_eval_output_dataset, mp_with_eval_output_download, visualisation_widget]
+     )
+     evaluate_button.click(
+         fn=show_df,
+         outputs=[mp_with_eval_output_dataset]
+     )
+
+ if __name__ == "__main__":
+     demo.launch(share=True)
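
Review note: the alternative-feature textbox above is parsed with `ast.literal_eval` via `safe_str_to_dict`, so malformed input quietly disables minimal-pair generation rather than raising. A quick sketch of that behaviour:

```python
import ast

def safe_str_to_dict(s):
    # mirrors the helper in app.py
    try:
        return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        return None

print(safe_str_to_dict('{"mood": "Sub"}'))  # {'mood': 'Sub'}
print(safe_str_to_dict("not a dict"))       # None -> query runs without pairs
```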
polish-test-lg.conllu.conllu ADDED
The diff for this file is too large to render.
requirements.txt ADDED
@@ -0,0 +1,107 @@
+ aiofiles==24.1.0
+ annotated-types==0.7.0
+ anyio==4.9.0
+ black==25.1.0
+ Brotli==1.1.0
+ certifi==2025.4.26
+ charset-normalizer==3.4.2
+ click==8.2.0
+ colorama==0.4.6
+ conllu==6.0.0
+ contourpy==1.3.2
+ coverage==7.8.0
+ cycler==0.12.1
+ fastapi==0.116.1
+ ffmpy==0.6.0
+ filelock==3.18.0
+ fonttools==4.58.4
+ fsspec==2025.5.1
+ gradio==5.37.0
+ gradio_client==1.10.4
+ grewpy==0.6.0
+ groovy==0.1.2
+ h11==0.16.0
+ hf-xet==1.1.5
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.32.2
+ idna==3.10
+ iniconfig==2.1.0
+ isort==6.0.1
+ Jinja2==3.1.6
+ kiwisolver==1.4.8
+ lark==1.2.2
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ matplotlib==3.10.3
+ mdurl==0.1.2
+ mizani==0.13.5
+ mpmath==1.3.0
+ mypy==1.15.0
+ mypy_extensions==1.1.0
+ networkx==3.4.2
+ numpy==2.2.5
+ nvidia-cublas-cu12==12.6.4.1
+ nvidia-cuda-cupti-cu12==12.6.80
+ nvidia-cuda-nvrtc-cu12==12.6.77
+ nvidia-cuda-runtime-cu12==12.6.77
+ nvidia-cudnn-cu12==9.5.1.17
+ nvidia-cufft-cu12==11.3.0.4
+ nvidia-cufile-cu12==1.11.1.6
+ nvidia-curand-cu12==10.3.7.77
+ nvidia-cusolver-cu12==11.7.1.2
+ nvidia-cusparse-cu12==12.5.4.2
+ nvidia-cusparselt-cu12==0.6.3
+ nvidia-nccl-cu12==2.26.2
+ nvidia-nvjitlink-cu12==12.6.85
+ nvidia-nvtx-cu12==12.6.77
+ orjson==3.10.18
+ packaging==25.0
+ pandas==2.2.3
+ pandas-stubs==2.2.3.250527
+ pathspec==0.12.1
+ patsy==1.0.1
+ pillow==11.2.1
+ platformdirs==4.3.8
+ plotnine==0.14.5
+ pluggy==1.5.0
+ pydantic==2.11.7
+ pydantic_core==2.33.2
+ pydub==0.25.1
+ Pygments==2.19.2
+ pyparsing==3.2.3
+ pytest==8.3.5
+ pytest-cov==6.1.1
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.20
+ pytz==2025.2
+ PyYAML==6.0.2
+ regex==2024.11.6
+ requests==2.32.3
+ rich==14.0.0
+ ruff==0.11.9
+ safehttpx==0.1.6
+ safetensors==0.5.3
+ scipy==1.15.3
+ semantic-version==2.10.0
+ setuptools==80.4.0
+ shellingham==1.5.4
+ six==1.17.0
+ sniffio==1.3.1
+ starlette==0.47.1
+ statsmodels==0.14.4
+ sympy==1.14.0
+ tokenizers==0.21.1
+ tomlkit==0.13.3
+ torch==2.7.0
+ tqdm==4.67.1
+ transformers==4.52.3
+ triton==3.3.0
+ typer==0.16.0
+ types-pytz==2025.2.0.20250516
+ typing-inspection==0.4.1
+ typing_extensions==4.13.2
+ tzdata==2025.2
+ urllib3==2.4.0
+ uvicorn==0.35.0
+ websockets==15.0.1
spanish-test-sm.conllu ADDED
@@ -0,0 +1,100 @@
+ # global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
+ # sent_id = 3LB-CAST-c2-2-s18
+ # text = Sea enhorabuena.
+ # orig_file_sentence 011#21
+ 1 Sea ser AUX vssp3s0 Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _
+ 2 enhorabuena enhorabuena NOUN ncfs000 Gender=Fem|Number=Sing 1 nsubj 1:nsubj ArgTem=arg1:tem|SpaceAfter=No
+ 3 . . PUNCT fp PunctType=Peri 1 punct 1:punct _
+
+ # sent_id = 3LB-CAST-d2-12-s5
+ # text = Esperemos que muy pronto.
+ # orig_file_sentence 002#8
+ 1 Esperemos esperar VERB vmsp1p0 Mood=Sub|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin 0 root 0:root _
+ 2 que que SCONJ cs _ 4 mark 4:mark _
+ 3 muy mucho ADV rg _ 4 advmod 4:advmod _
+ 4 pronto pronto ADV rg _ 1 obj 1:obj ArgTem=arg1:pat|SpaceAfter=No
+ 5 . . PUNCT fp PunctType=Peri 1 punct 1:punct _
+
+ # sent_id = CESS-CAST-P-20000701-69-s8
+ # text = Mujer hubiera mostrado arrepentimiento.
+ # orig_file_sentence 131#40
+ 1 Mujer mujer NOUN ncfs000 Gender=Fem|Number=Sing 3 nsubj 3:nsubj ArgTem=arg0:agt|Entity=(NOCOREF:Gen--1-gstype:gen)
+ 2 hubiera haber AUX vasi3s0 Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin 3 aux 3:aux _
+ 3 mostrado mostrar VERB vmp00sm Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part 0 root 0:root _
+ 4 arrepentimiento arrepentimiento NOUN ncms000 Gender=Masc|Number=Sing 3 obj 3:obj ArgTem=arg1:pat|Entity=(NOCOREF:Gen--1-gstype:gen)|SpaceAfter=No
+ 5 . . PUNCT fp PunctType=Peri 3 punct 3:punct _
+
+ # sent_id = CESS-CAST-P-20000601-41-s10
+ # text = Que por ritmo no quede.
+ # orig_file_sentence 119#88
+ 1 Que que SCONJ cs _ 5 mark 5:mark _
+ 2 por por ADP sps00 _ 3 case 3:case _
+ 3 ritmo ritmo NOUN ncms000 Gender=Masc|Number=Sing 5 obl 5:obl ArgTem=argM:adv
+ 4 no no ADV rn Polarity=Neg 5 advmod 5:advmod _
+ 5 quede quedar VERB vmsp3s0 Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root SpaceAfter=No
+ 6 . . PUNCT fp PunctType=Peri 5 punct 5:punct _
+
+ # sent_id = 3LB-CAST-t3-4-s23
+ # text = Yo esperar cuanto haga falta.
+ # orig_file_sentence 027#95
+ 1 Yo yo PRON pp1csn00 Case=Nom|Number=Sing|Person=1|PronType=Prs 2 obj 2:obj _
+ 2 esperar esperar VERB vmn0000 VerbForm=Inf 0 root 0:root _
+ 3 cuanto cuanto PRON pr0ms000 Gender=Masc|Number=Sing|NumType=Card|PronType=Int,Rel 4 nsubj 4:nsubj ArgTem=arg1:tem
+ 4 haga hacer VERB vmsp3s0 Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 2 advcl 2:advcl MWE=haga_falta|MWEPOS=VERB
+ 5 falta falta NOUN _ Gender=Fem|Number=Sing 4 compound 4:compound SpaceAfter=No
+ 6 . . PUNCT fp PunctType=Peri 2 punct 2:punct _
+
+ # sent_id = 3LB-CAST-c1-4-s5
+ # text = Un soldador que me meta fuego.
+ # orig_file_sentence 010#41
+ 1 Un uno DET di0ms0 Definite=Ind|Gender=Masc|Number=Sing|PronType=Art 2 det 2:det _
+ 2 soldador soldador NOUN ncms000 Gender=Masc|Number=Sing 0 root 0:root _
+ 3 que que PRON pr0cn000 PronType=Rel 5 nsubj 5:nsubj ArgTem=arg0:agt
+ 4 me yo PRON pp1cs000 Case=Dat|Number=Sing|Person=1|PrepCase=Npr|PronType=Prs 5 obl:arg 5:obl:arg ArgTem=arg2:ben
+ 5 meta meter VERB vmsp3s0 Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 2 acl 2:acl _
+ 6 fuego fuego NOUN ncms000 Gender=Masc|Number=Sing 5 obj 5:obj ArgTem=arg1:pat|SpaceAfter=No
+ 7 . . PUNCT fp PunctType=Peri 2 punct 2:punct _
+
+ # sent_id = CESS-CAST-P-20010701-38-s94
+ # text = Por mucho que me lo pidan.
+ # orig_file_sentence 117#2
+ 1 Por por ADP cs _ 6 mark 6:mark MWE=Por_mucho_que|MWEPOS=SCONJ
+ 2 mucho mucho ADV _ _ 1 fixed 1:fixed _
+ 3 que que SCONJ _ _ 1 fixed 1:fixed _
+ 4 me yo PRON pp1cs000 Case=Dat|Number=Sing|Person=1|PrepCase=Npr|PronType=Prs 6 obl:arg 6:obl:arg ArgTem=arg2:ben|Entity=(CESSCASTP2001070138c1-person-1-CorefType:ident,gstype:spec)
+ 5 lo él PRON pp3msa00 Case=Acc|Gender=Masc|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs 6 obj 6:obj ArgTem=arg1:pat|Entity=(CESSCASTP2001070138c51--1-CorefType:dx.prop)
+ 6 pidan pedir VERB vmsp3p0 Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root SpaceAfter=No
+ 7 . . PUNCT fp PunctType=Peri 6 punct 6:punct _
+
+ # sent_id = 3LB-CAST-t6-3-s5
+ # text = Tal vez eso fuera lo peor.
+ # orig_file_sentence 030#40
+ 1 Tal tal NOUN rg _ 6 advmod 6:advmod MWE=Tal_vez|MWEPOS=ADV
+ 2 vez vez NOUN _ _ 1 fixed 1:fixed _
+ 3 eso ese PRON pd0ns000 Number=Sing|PronType=Dem 6 nsubj 6:nsubj ArgTem=arg1:tem
+ 4 fuera ser AUX vssi3s0 Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin 6 cop 6:cop _
+ 5 lo él PRON da0ns0 Case=Acc|Definite=Def|Gender=Masc|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs 6 det 6:det _
+ 6 peor peor ADJ aq0cs0 Degree=Cmp|Number=Sing 0 root 0:root ArgTem=arg2:atr|SpaceAfter=No
+ 7 . . PUNCT fp PunctType=Peri 6 punct 6:punct _
+
+ # sent_id = 3LB-CAST-n1-3-s13
+ # text = La bandera, que no falte.
+ # orig_file_sentence 024#5
+ 1 La el DET da0fs0 Definite=Def|Gender=Fem|Number=Sing|PronType=Art 2 det 2:det _
+ 2 bandera bandera NOUN ncfs000 Gender=Fem|Number=Sing 6 nsubj 6:nsubj ArgTem=arg1:tem|SpaceAfter=No
+ 3 , , PUNCT fc PunctType=Comm 2 punct 2:punct _
+ 4 que que SCONJ cs _ 6 mark 6:mark _
+ 5 no no ADV rn Polarity=Neg 6 advmod 6:advmod _
+ 6 falte faltar VERB vmsp3s0 Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root SpaceAfter=No
+ 7 . . PUNCT fp PunctType=Peri 6 punct 6:punct _
+
+ # sent_id = CESS-CAST-P-20020202-102-s11
+ # text = Diga lo que diga.
+ # orig_file_sentence 076#95
+ 1 Diga decir VERB vmm03s0 Mood=Imp|Number=Sing|Person=3|VerbForm=Fin 0 root 0:root _
+ 1.1 _ _ PRON p _ _ _ 1:nsubj ArgTem=arg0:agt|Entity=(CESSCASTP20020202102c2--1-CorefType:ident)|wordform=__EMPTY__
+ 2 lo él PRON da0ns0 Case=Acc|Definite=Def|Gender=Masc|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs 4 det 4:det _
+ 3 que que PRON pr0cn000 PronType=Rel 4 obj 4:obj ArgTem=arg1:pat
+ 4 diga decir VERB vmsp3s0 Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 1 ccomp 1:ccomp ArgTem=arg1:pat|SpaceAfter=No
+ 4.1 _ _ PRON p _ _ _ 4:nsubj ArgTem=arg0:agt|Entity=(CESSCASTP20020202102c2--1-CorefType:ident)|wordform=__EMPTY__
+ 5 . . PUNCT fp PunctType=Peri 1 punct 1:punct _
treetse/__init__.py ADDED
File without changes
treetse/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (165 Bytes).
treetse/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (161 Bytes).
treetse/__pycache__/pipeline.cpython-312.pyc ADDED
Binary file (8.34 kB).
treetse/evaluators/__pycache__/evaluator.cpython-312.pyc ADDED
Binary file (5.25 kB).
treetse/evaluators/__pycache__/evaluator.cpython-313.pyc ADDED
Binary file (5.35 kB).
treetse/evaluators/evaluator.py ADDED
@@ -0,0 +1,93 @@
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
+ from typing import Any, Tuple
+ import torch
+ import torch.nn.functional as F
+
+
+ class Evaluator:
+     def __init__(self):
+         self.mask_token_index: int = -1
+         self.mask_probs: torch.Tensor | None = None
+         self.tokeniser: Any = None
+         self.model: Any = None
+         self.logits: torch.Tensor | None = None
+
+     def setup_parameters(self, model_name: str) -> Tuple[Any, Any]:
+         # Q: what sort of tokenisers are being used?
+         self.tokeniser = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModelForMaskedLM.from_pretrained(model_name)
+
+         # set to eval mode, disabling things like dropout
+         self.model.eval()
+
+         return self.model, self.tokeniser
+
+     def run_masked_prediction(
+         self, model: Any, tokeniser: Any, sentence: str, target_token: str
+     ) -> Tuple[Any, Any]:
+         # check before rewriting "[MASK]" to the model-specific mask token;
+         # counting afterwards would always see zero "[MASK]" occurrences
+         # for models whose mask token differs (e.g. RoBERTa's <mask>)
+         if sentence.count("[MASK]") != 1:
+             raise ValueError("Only single-mask sentences are supported.")
+
+         mask_token = tokeniser.mask_token
+         sentence_masked = sentence.replace("[MASK]", mask_token)
+
+         inputs = tokeniser(sentence_masked, return_tensors="pt")
+
+         # get logits from the model
+         with torch.no_grad():
+             outputs = model(**inputs)
+             logits = outputs.logits
+             self.logits = logits
+
+         self.mask_token_index = self._get_mask_index(inputs, tokeniser)
+         self.mask_probs = self._get_mask_probabilities(
+             self.mask_token_index, self.logits
+         )
+
+         return self.mask_token_index, self.mask_probs
+
+     def get_token_prob(self, token: str) -> float:
+         target_id = self.tokeniser.convert_tokens_to_ids(token)
+         prob = self.get_prob_by_id(target_id)
+         return prob
+
+     def get_top_pred(self) -> Tuple[str, float]:
+         top_pred_id = int(torch.argmax(self.mask_probs, dim=-1).item())
+         top_pred_token = self.tokeniser.convert_ids_to_tokens(top_pred_id)
+         top_token_prob = self.get_prob_by_id(top_pred_id)
+         return top_pred_token, top_token_prob
+
+     def get_prob_by_id(self, token_id: int) -> float:
+         if self.mask_probs is not None:
+             return self.mask_probs[token_id].item()
+         else:
+             raise KeyError("Please evaluate a dataset first. Results empty")
+
+     def _get_mask_index(self, inputs: Any, tokeniser: Any) -> int:
+         if "input_ids" not in inputs:
+             raise ValueError("Missing 'input_ids' in inputs.")
+
+         if tokeniser.mask_token_id is None:
+             raise ValueError("The tokeniser does not have a defined mask_token_id.")
+
+         input_ids = inputs["input_ids"]
+         mask_positions = torch.where(input_ids == tokeniser.mask_token_id)
+
+         if len(mask_positions[0]) == 0:
+             raise ValueError("No mask token found in input_ids.")
+
+         if len(mask_positions[0]) > 1:
+             raise ValueError("Multiple mask tokens found; expected only one.")
+
+         # input_ids is (batch, seq), so torch.where returns
+         # (batch indices, sequence indices); the sequence index is wanted
+         return int(mask_positions[1].item())
+
+     def _get_mask_probabilities(
+         self, mask_token_index: int, logits: Any
+     ) -> torch.Tensor:
+         mask_logits = logits[0, mask_token_index, :]  # shape: (vocab_size,)
+         probs = F.softmax(mask_logits, dim=-1)  # shape: (vocab_size,)
+         return probs
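
Review note: a minimal usage sketch of the `Evaluator` class above. The model repo here is illustrative, and the label is only meaningful if it exists as a single token in the model's vocabulary:

```python
from treetse.evaluators.evaluator import Evaluator

evaluator = Evaluator()
model, tokeniser = evaluator.setup_parameters("bert-base-multilingual-cased")

# compute the distribution over the vocabulary at the masked position
evaluator.run_masked_prediction(
    model, tokeniser, "La bandera, que no [MASK].", "falte"
)
print(evaluator.get_token_prob("falte"))  # P(label | context)
print(evaluator.get_top_pred())           # (most likely token, its probability)
```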
treetse/evaluators/perplexity.py ADDED
@@ -0,0 +1,32 @@
+ class PerplexityEvaluator:
+     def __init__(self) -> None:
+         pass
+
+     def compute_perplexity(self, logits: list) -> list:
+         pass
+
+     def compute_classic_tse(self) -> None:
+         """
+         -- Classic TSE --
+         Evaluates based on minimal pairs, where a particular feature
+         is chosen and two values of that feature are compared.
+
+         1. Accepts the inputs, logits, feature name, and feature values as input.
+            Finds the lexical items which are the same except for these values of
+            this feature, including in UPOS and lemma.
+         2. Computes the perplexity scores for the correct value and the
+            alternative syntactic option.
+         """
+         pass
+
+     def compute_generalised_tse(self) -> None:
+         """
+         -- Generalised TSE --
+         Evaluates based on minimal syntactic pairs, that is, a candidate set is
+         created for the correct token as well as for the alternate values of
+         that particular feature.
+         """
+         pass
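
Review note: `compute_perplexity` is still a stub, so as a reference point, here is a minimal sketch of the pseudo-perplexity usually computed for masked LMs; this is an assumption about the intended semantics, not this repo's API:

```python
import math

def pseudo_perplexity(token_log_probs: list[float]) -> float:
    # exponentiated mean negative log-probability over the tokens
    if not token_log_probs:
        raise ValueError("Need at least one token log-probability.")
    avg_nll = -sum(token_log_probs) / len(token_log_probs)
    return math.exp(avg_nll)

print(pseudo_perplexity([math.log(0.2), math.log(0.05)]))  # ~10.0
```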
treetse/pipeline.py ADDED
@@ -0,0 +1,200 @@
+ import pandas as pd
+ import logging
+ from pathlib import Path
+
+ from treetse.preprocessing.conllu_parser import ConlluParser
+ from treetse.evaluators.evaluator import Evaluator
+ from treetse.visualise.visualiser import Visualiser
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] %(message)s",
+     handlers=[logging.FileHandler("app.log"), logging.StreamHandler()],
+ )
+
+ class Grewtse:
+     def __init__(self):
+         self.parser = ConlluParser()
+         self.evaluator = Evaluator()
+         self.visualiser = Visualiser()
+
+         self.treebank_path: str | None = None
+         self.lexical_items: pd.DataFrame | None = None
+         self.masked_dataset: pd.DataFrame | None = None
+         self.exception_dataset: pd.DataFrame | None = None
+         self.evaluation_results: pd.DataFrame | None = None
+
+     def parse_treebank(self, filepath: str) -> bool:
+         try:
+             self.treebank_path = filepath
+             self.lexical_items = self.parser._build_lexical_item_dataset(filepath)
+             return True
+         except Exception as e:
+             logging.error(f"Failed to parse treebank: {e}")
+             self.treebank_path = None
+             self.lexical_items = None
+             return False
+
+     def is_treebank_loaded(self) -> bool:
+         return self.lexical_items is not None
+
+     def is_dataset_masked(self) -> bool:
+         return self.masked_dataset is not None
+
+     def get_lexical_items(self) -> pd.DataFrame:
+         return self.lexical_items
+
+     def get_morphological_features(self) -> pd.DataFrame:
+         if self.lexical_items is None:
+             raise ValueError("Cannot get features: You must parse a treebank first.")
+
+         morph_df = self.lexical_items
+         morph_df.columns = [
+             col.replace("feats__", "") if col.startswith("feats__") else col
+             for col in morph_df.columns
+         ]
+
+         return morph_df
+
+     def generate_masked_dataset(
+         self, query: str, target_node: str, mask_token: str = "[MASK]"
+     ) -> pd.DataFrame:
+         if self.treebank_path is None:
+             raise ValueError("Cannot create masked dataset: no treebank filepath provided.")
+
+         results = self.parser._build_masked_dataset_grew(
+             self.treebank_path, query, target_node, mask_token
+         )
+         self.masked_dataset = results['masked']
+         self.exception_dataset = results['exception']
+         return self.masked_dataset
+
+     def get_masked_dataset(self) -> pd.DataFrame:
+         return self.masked_dataset
+
+     def generate_minimal_pairs(self, morph_features: dict, upos_features: dict | None) -> pd.DataFrame:
+         if self.masked_dataset is None:
+             raise ValueError("Cannot generate minimal pairs: treebank must be parsed and masked first.")
+
+         def convert_row_to_feature(row):
+             return self.parser.to_syntactic_feature(
+                 row['sentence_id'],
+                 row['match_id'] - 1,  # match ids are 1-based; the LI index is 0-based
+                 morph_features,
+                 upos_features or {},
+             )
+
+         alternative_row = self.masked_dataset.apply(convert_row_to_feature, axis=1)
+         self.masked_dataset['alternative'] = alternative_row
+         return self.masked_dataset
+
+     def are_minimal_pairs_generated(self) -> bool:
+         return self.is_treebank_loaded() and \
+             self.is_dataset_masked() and \
+             ('alternative' in self.masked_dataset.columns)
+
+     def evaluate_bert_mlm(self, model_repo: str, row_limit: int | None = None) -> pd.DataFrame:
+         if self.masked_dataset is None:
+             raise ValueError("Cannot evaluate: treebank must be parsed and masked first.")
+
+         test_model, test_tokeniser = self.evaluator.setup_parameters(model_repo)
+         results = []
+
+         counter = 0
+         for row in self.masked_dataset.itertuples():
+             masked_sentence = row.masked_text
+             label = row.match_token
+             alternative_form = row.alternative
+
+             row_results = {
+                 "sentence_id": row.sentence_id,
+                 "token_id": row.match_id,
+                 "masked_sentence": masked_sentence,
+                 "num_tokens": row.num_tokens,
+                 "label": label,
+                 "label_prob": None,
+                 "alternative": alternative_form,
+                 "alternative_prob": None,
+                 "top_pred_label": None,
+                 "top_pred_prob": None,
+             }
+
+             try:
+                 self.evaluator.run_masked_prediction(
+                     test_model, test_tokeniser, masked_sentence, label
+                 )
+             except Exception as e:
+                 raise RuntimeError("There was an issue with the model or tokeniser") from e
+
+             # -- LABEL PROB --
+             label_prob = self.evaluator.get_token_prob(label)
+             row_results["label_prob"] = label_prob
+
+             # -- ALTERNATIVE FORM --
+             if alternative_form:
+                 logging.info("----")
+                 logging.info(f"Label Form: {label}")
+                 logging.info(f"Alternative Form: {alternative_form}")
+                 logging.info("----")
+
+                 alt_form_prob = self.evaluator.get_token_prob(alternative_form)
+                 row_results["alternative_prob"] = alt_form_prob
+
+             # -- HIGHEST PROB --
+             top_pred_label, top_pred_prob = self.evaluator.get_top_pred()
+             row_results["top_pred_label"] = top_pred_label
+             row_results["top_pred_prob"] = top_pred_prob
+
+             results.append(row_results)
+
+             if row_limit:
+                 counter += 1
+                 if counter == row_limit:
+                     break
+
+         results_df = pd.DataFrame(results)
+         self.evaluation_results = results_df
+         return results_df
+
+     def visualise_syntactic_performance(
+         self,
+         filename: str,
+         results: pd.DataFrame,
+         target_x_label: str,
+         alt_x_label: str,
+         x_axis_label: str,
+         y_axis_label: str,
+         title: str,
+     ) -> None:
+         visualiser = Visualiser()
+         visualiser.visualise_slope(
+             filename,
+             results,
+             target_x_label,
+             alt_x_label,
+             x_axis_label,
+             y_axis_label,
+             title,
+         )
+
+
+ """
+ def store_results(
+     results_filename: str,
+     li_set_filename: str,
+     model_results: pd.DataFrame,
+     li_set: pd.DataFrame,
+ ):
+     try:
+         model_results.to_csv(base_dir / "output" / results_filename, index=False)
+         li_set.to_csv(base_dir / li_set_filename, index=True)
+
+         model_results["difference"] = (
+             model_results["label_prob"] - model_results["alternative_prob"]
+         )
+         model_results = model_results.sort_values("difference")
+         model_results.dropna().to_csv(
+             base_dir / "output" / f"filtered_{results_filename}", index=False
+         )
+     except Exception as e:
+         logging.error(f"Failed to output to CSV: {e}")
+         raise
+ """
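
Review note: an end-to-end sketch of the `Grewtse` pipeline as app.py wires it up; the treebank and model repo are this commit's defaults. The `get_morphological_features()` call mirrors the app's parse step, which normalises the `feats__` column prefixes that `to_syntactic_feature` expects to be stripped:

```python
from treetse.pipeline import Grewtse

pipeline = Grewtse()
pipeline.parse_treebank("spanish-test-sm.conllu")
pipeline.get_morphological_features()  # normalises feature column names
pipeline.generate_masked_dataset('V [upos="VERB"];', "V")
pipeline.generate_minimal_pairs({"mood": "Sub"}, {})
results = pipeline.evaluate_bert_mlm(
    "dccuchile/distilbert-base-spanish-uncased", row_limit=5
)
print(results[["masked_sentence", "label", "label_prob", "alternative_prob"]])
```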
treetse/preprocessing/__init__.py ADDED
File without changes
treetse/preprocessing/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (179 Bytes).
treetse/preprocessing/__pycache__/conllu_parser.cpython-312.pyc ADDED
Binary file (16.4 kB).
treetse/preprocessing/__pycache__/conllu_parser.cpython-313.pyc ADDED
Binary file (12.3 kB).
treetse/preprocessing/__pycache__/grew_dependencies.cpython-312.pyc ADDED
Binary file (1.05 kB).
treetse/preprocessing/__pycache__/reconstruction.cpython-312.pyc ADDED
Binary file (2.33 kB).
treetse/preprocessing/__pycache__/reconstruction.cpython-313.pyc ADDED
Binary file (2.42 kB).
treetse/preprocessing/conllu_parser.py ADDED
@@ -0,0 +1,402 @@
+ from treetse.preprocessing.grew_dependencies import match_dependencies
+ from treetse.preprocessing.reconstruction import Lexer
+ from conllu import parse_incr, Token
+ from pathlib import Path
+ from typing import Any
+ import pandas as pd
+ import numpy as np
+ import logging
+
+ def test_function():
+     return True
+
+ class ConlluParser:
+     def __init__(self) -> None:
+         self.li_feature_set: pd.DataFrame | None = None
+         self.masked_dataset: pd.DataFrame | None = None
+         self.exception_dataset: pd.DataFrame | None = None
+         self.lexer: Lexer = Lexer()
+
+     # todo: add error handling here
+     def parse_grew(
+         self, path: str, grew_query: str, grew_variable_to_mask: str, mask_token: str = "[MASK]"
+     ) -> tuple[pd.DataFrame, pd.DataFrame]:
+         self.li_feature_set = self._build_lexical_item_dataset(path)
+
+         masking_results = self._build_masked_dataset_grew(
+             path, grew_query, grew_variable_to_mask, mask_token
+         )
+         self.masked_dataset = masking_results["masked"]
+         self.exception_dataset = masking_results["exception"]
+
+         return self.masked_dataset, self.exception_dataset
+
+     # todo: add error handling here
+     def parse(
+         self, path: str, morphological_constraints: dict, universal_constraints: dict, mask_token: str = "[MASK]"
+     ) -> bool:
+         self.li_feature_set = self._build_lexical_item_dataset(path)
+
+         upos_constraint = universal_constraints["upos"] if "upos" in universal_constraints else None
+
+         masking_results = self._build_masked_dataset(
+             path, morphological_constraints, upos_constraint, mask_token
+         )
+         self.masked_dataset = masking_results["masked"]
+         self.exception_dataset = masking_results["exception"]
+
+         return True
+
+     def get_masked_dataset(self) -> pd.DataFrame:
+         return self.masked_dataset
+
+     def get_lexical_item_dataset(self) -> pd.DataFrame:
+         return self.li_feature_set
+
+     # this shouldn't be hard coded
+     def get_feature_names(self) -> list:
+         return self.li_feature_set.columns[4:].to_list()
+
+     # todo: add more safety
+     def get_features(self, sentence_id: str, token_id: int) -> dict:
+         return self.li_feature_set.loc[(sentence_id, token_id)][self.get_feature_names()].to_dict()
+
+     def get_lemma(self, sentence_id: str, token_id: int) -> str:
+         return self.li_feature_set.loc[(sentence_id, token_id)]["lemma"]
+
+     # todo: handle making sure that it is the exact same as the lemma
+     def to_syntactic_feature(self, sentence_id: str, token_id: int, alt_morph_constraints: dict, alt_universal_constraints: dict) -> str | None:
+         # distinguish morphological from universal features
+         # todo: find a better way to do this
+         # prefix = 'feats__'
+         prefix = ''
+         alt_morph_constraints = {prefix + key: value for key, value in alt_morph_constraints.items()}
+
+         token_features = self.get_features(sentence_id, token_id)
+
+         token_features.update(alt_morph_constraints)
+         token_features.update(alt_universal_constraints)
+         lexical_items = self.li_feature_set
+
+         # get only those items which have the same lemma
+         lemma = self.get_lemma(sentence_id, token_id)
+         lemma_mask = lexical_items['lemma'] == lemma
+         lexical_items = lexical_items[lemma_mask]
+         logging.info(f"Looking for form {lemma}")
+
+         for feat, value in token_features.items():
+             # ensure feature is a valid feature in feature set
+             if feat not in lexical_items.columns:
+                 raise KeyError(
+                     "Invalid feature provided to confound set: {}".format(feat)
+                 )
+
+             # slim the mask down using each feature
+             # interesting edge case: np.nan == np.nan returns false!
+             mask = (lexical_items[feat] == value) | (lexical_items[feat].isna() & pd.isna(value))
+             lexical_items = lexical_items[mask]
+
+         if len(lexical_items) > 0:
+             return lexical_items["form"].iloc[0]
+         else:
+             return None
+
+     def get_candidate_set(self, universal_constraints: dict, morph_constraints: dict) -> pd.DataFrame:
+         has_parsed_conllu = self.li_feature_set is not None
+         if not has_parsed_conllu:
+             raise ValueError("Please parse a ConLLU file first.")
+
+         morph_constraints = {f"feats__{k}": v for k, v in morph_constraints.items()}
+         are_morph_features_valid = all(
+             f in self.li_feature_set.columns for f in morph_constraints.keys()
+         )
+         are_universal_features_valid = all(
+             f in self.li_feature_set.columns for f in universal_constraints.keys()
+         )
+         if not are_morph_features_valid or not are_universal_features_valid:
+             raise KeyError(
+                 "Features provided for candidate set are not valid features in the dataset."
+             )
+
+         all_constraints = {**universal_constraints, **morph_constraints}
+         candidate_set = self._construct_candidate_set(
+             self.li_feature_set, all_constraints
+         )
+         return candidate_set
+
+     def _build_masked_dataset_grew(self, filepath: Path, grew_query: str, dependency_node: str,
+                                    mask_token, encoding: str = "utf-8") -> dict[str, pd.DataFrame]:
+         masked_dataset = []
+         exception_dataset = []
+
+         tokens_to_mask = match_dependencies(filepath, grew_query, dependency_node)
+
+         try:
+             with open(filepath, "r", encoding=encoding) as data_file:
+                 for sentence in parse_incr(data_file):
+                     logging.info(f"Processing sentence: {sentence.metadata['sent_id']}")
+
+                     sentence_id = sentence.metadata["sent_id"]
+                     sentence_text = sentence.metadata["text"]
+                     if sentence_id in tokens_to_mask:
+                         for i in range(len(sentence)):
+                             sentence[i]["index"] = i
+
+                         token_to_mask_id = tokens_to_mask[sentence_id]
+
+                         try:
+                             t_match = [tok for tok in sentence if tok.get("id") == token_to_mask_id][0]
+                             t_match_form = t_match["form"]
+                             t_match_index = t_match["index"]
+                             sentence_as_str_list = [t["form"] for t in sentence]
+                         except (IndexError, KeyError):
+                             logging.info("There was a mismatch for the GREW-based ID and the Conllu ID.")
+                             exception_dataset.append(
+                                 {
+                                     "sentence_id": sentence_id,
+                                     "match_id": None,
+                                     "all_tokens": None,
+                                     "match_token": None,
+                                     "original_text": sentence_text,
+                                 }
+                             )
+                             continue
+
+                         try:
+                             matched_token_start_index = self.lexer.recursive_match_token(
+                                 sentence_text,  # the original string
+                                 sentence_as_str_list.copy(),  # the string as a list of tokens
+                                 t_match_index,  # the index of the token to be replaced
+                                 [
+                                     "_",
+                                     " ",
+                                 ],  # todo: skip lines where we encounter unaccounted-for tokens
+                             )
+                         except ValueError:
+                             logging.info("Token not found. Saving as exception.")
+                             exception_dataset.append(
+                                 {
+                                     "sentence_id": sentence_id,
+                                     "match_id": token_to_mask_id,
+                                     "all_tokens": sentence_as_str_list,
+                                     "match_token": t_match_form,
+                                     "original_text": sentence_text,
+                                 }
+                             )
+                             continue
+
+                         # let's replace the matched token with a MASK token
+                         masked_sentence = self.lexer.perform_token_surgery(
+                             sentence_text,
+                             t_match_form,
+                             mask_token,
+                             matched_token_start_index,
+                         )
+
+                         # the sentence ID and match ID are together a primary key
+                         masked_dataset.append(
+                             {
+                                 "sentence_id": sentence_id,
+                                 "match_id": token_to_mask_id,
+                                 "all_tokens": sentence_as_str_list,
+                                 "num_tokens": len(sentence_as_str_list),
+                                 "match_token": t_match_form,
+                                 "original_text": sentence_text,
+                                 "masked_text": masked_sentence,
+                             }
+                         )
+         except FileNotFoundError:
+             logging.error(f"Error: The file '{filepath}' was not found.")
+
+         masked_dataset_df = pd.DataFrame(masked_dataset)
+         exception_dataset_df = pd.DataFrame(exception_dataset)
+
+         return {"masked": masked_dataset_df, "exception": exception_dataset_df}
+
+     def _build_masked_dataset(
+         self, filepath: str, morph_constraints: dict, upos_constraint: str | None, mask_token: str, encoding: str = "utf-8"
+     ) -> dict[str, pd.DataFrame]:
+         masked_dataset = []
+         exception_dataset = []
+
+         try:
+             with open(filepath, "r", encoding=encoding) as data_file:
+                 constraints_kwargs = {f"feats__{k.capitalize()}": v for k, v in morph_constraints.items()}
+
+                 for sentence in parse_incr(data_file):
+                     logging.info(f"Processing sentence: {sentence.metadata['sent_id']}")
+
+                     # MORPHOLOGICAL FILTER
+                     token_constraint_matches = sentence.filter(**constraints_kwargs)
+
+                     # UNIVERSAL POS FILTER
+                     # note: filter the morphological matches, not the whole
+                     # sentence, so both constraints apply together
+                     if upos_constraint:
+                         token_constraint_matches = token_constraint_matches.filter(upos=upos_constraint)
+
+                     if token_constraint_matches:
+                         for i in range(len(sentence)):
+                             sentence[i]["index"] = i
+
+                         # sentence_text = " ".join(token["form"] for token in sentence)
+                         sentence_text = sentence.metadata["text"]
+                         sentence_id = sentence.metadata["sent_id"]
+
+                         matches = [t["form"] for t in token_constraint_matches]
+                         match_indices = [t["index"] for t in token_constraint_matches]
+
+                         # iterate over each match in the sentence
+                         for t_match_index, t_match in zip(match_indices, matches):
+                             # we want to create one sentence entry per example,
+                             # so if we have two subjunctives in one sentence,
+                             # there will be two test sentences
+
+                             # at what point in the string does the matched token start?
+                             sentence_as_str_list = [t["form"] for t in sentence]
+
+                             try:
+                                 matched_token_start_index = self.lexer.recursive_match_token(
+                                     sentence_text,
+                                     sentence_as_str_list.copy(),
+                                     t_match_index,
+                                     [
+                                         "_",
+                                         " ",
+                                     ],  # todo: skip lines where we encounter unaccounted-for tokens
+                                 )
+                             except ValueError:
+                                 logging.info("Token not found. Saving as exception.")
+                                 exception_dataset.append(
+                                     {
+                                         "sentence_id": sentence_id,
+                                         "match_id": t_match_index,
+                                         "all_tokens": sentence_as_str_list,
+                                         "match_token": t_match,
+                                         "original_text": sentence_text,
+                                     }
+                                 )
+                                 continue
+
+                             # let's replace the matched token with a MASK token
+                             masked_sentence = self.lexer.perform_token_surgery(
+                                 sentence_text,
+                                 t_match,
+                                 mask_token,
+                                 matched_token_start_index,
+                             )
+
+                             # the sentence ID and match ID are together a primary key
+                             masked_dataset.append(
+                                 {
+                                     "sentence_id": sentence_id,
+                                     "match_id": t_match_index,
+                                     "all_tokens": sentence_as_str_list,
+                                     "num_tokens": len(sentence_as_str_list),
+                                     "match_token": t_match,
+                                     "original_text": sentence_text,
+                                     "masked_text": masked_sentence,
+                                 }
+                             )
+         except FileNotFoundError:
+             logging.error(f"Error: The file '{filepath}' was not found.")
+
+         masked_dataset_df = pd.DataFrame(masked_dataset)
+         exception_dataset_df = pd.DataFrame(exception_dataset)
+
+         return {"masked": masked_dataset_df, "exception": exception_dataset_df}
+
+     def _is_valid_token(self, token: Token) -> bool:
+         punctuation = [".", ",", "!", "?", "*"]
+
+         # skip multiword tokens, malformed entries and punctuation
+         is_punctuation = token.get("form") in punctuation
+         is_valid_type = isinstance(token, dict)
+         has_valid_id = isinstance(token.get("id"), int)
+         return is_valid_type and has_valid_id and not is_punctuation
+
+     def _build_token_row(self, token: Token, sentence_id: str) -> dict[str, Any]:
+         # get all token features such as Person, Mood, etc.
+         feats = token.get("feats") or {}
+
+         row = {
+             "sentence_id": sentence_id,
+             "token_id": token.get("id") - 1,  # IDs are reduced by one to start at 0
+             "form": token.get("form"),
+             "lemma": token.get("lemma"),
+             "upos": token.get("upos"),
+             "xpos": token.get("xpos"),
+         }
+
+         # add each morphological feature as a column
+         for feat_name, feat_value in feats.items():
+             row["feats__" + feat_name.lower()] = feat_value
+
+         return row
+
+     def _build_lexical_item_dataset(self, conllu_path: str) -> pd.DataFrame:
+         rows = []
+
+         with open(conllu_path, "r", encoding="utf-8") as f:
+             for i, tokenlist in enumerate(parse_incr(f)):
+                 # get the sentence ID in the dataset
+                 sent_id = tokenlist.metadata["sent_id"]
+                 logging.info(f"Building LI Set For Sentence: {sent_id}")
+
+                 # iterate over each token
+                 for token in tokenlist:
+                     # check if it's worth saving to our lexical item dataset
+                     is_valid_token = self._is_valid_token(token)
+                     if not is_valid_token:
+                         continue
+
+                     # from the token object create a dict and append
+                     row = self._build_token_row(token, sent_id)
+                     rows.append(row)
+
+         lexical_item_df = pd.DataFrame(rows)
+
+         # make sure our nan values are interpreted as such
+         lexical_item_df.replace("nan", np.nan, inplace=True)
+
+         # create the (Sentence ID, Token ID) primary key
+         lexical_item_df.set_index(["sentence_id", "token_id"], inplace=True)
+
+         self.li_feature_set = lexical_item_df
+
+         return lexical_item_df
+
+     def _construct_candidate_set(
+         self, li_feature_set: pd.DataFrame, target_features: dict
+     ) -> pd.DataFrame:
+         """
+         -- Candidate Set --
+         Constructs a list of words which have the same feature set as the
+         target features which are passed as an argument.
+         """
+         # optionally restrict search to a certain type of lexical item
+         subset = li_feature_set
+
+         # continuously filter the dataframe so as to be left only with those
+         # lexical items which match the target features, including NaN-valued ones
+         for feat, value in target_features.items():
+             # ensure feature is a valid feature in feature set
+             if feat not in subset.columns:
+                 raise KeyError(
+                     "Invalid feature provided to confound set: {}".format(feat)
+                 )
+
+             # slim the mask down using each feature
+             # interesting edge case: np.nan == np.nan returns false!
+             mask = (subset[feat] == value) | (subset[feat].isna() & pd.isna(value))
+             subset = subset[mask]
+
+         return subset
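
Review note: a small illustration of the candidate-set filtering above, using the Spanish sample treebank from this commit; the printed forms are indicative of what the sample data contains:

```python
from treetse.preprocessing.conllu_parser import ConlluParser

parser = ConlluParser()
parser._build_lexical_item_dataset("spanish-test-sm.conllu")

# all singular feminine nouns in the treebank
candidates = parser.get_candidate_set(
    {"upos": "NOUN"}, {"gender": "Fem", "number": "Sing"}
)
print(candidates["form"].tolist())  # e.g. ['enhorabuena', 'falta', 'bandera']
```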
treetse/preprocessing/grew_dependencies.py ADDED
@@ -0,0 +1,20 @@
+ from grewpy import Corpus, Request, set_config
+ from pathlib import Path
+
+
+ def match_dependencies(
+     corpus_path: Path, grew_query: str, dependency_node: str
+ ) -> dict:
+     set_config("sud")  # alternatives include "ud" and "basic"
+     # run the GREW request on the corpus
+     corpus = Corpus(str(corpus_path))
+     request = Request().pattern(grew_query)
+     occurrences = corpus.search(request)
+
+     # map each matched sentence to the id of the queried node;
+     # note: only the last match per sentence is kept
+     dep_matches = {}
+     for occ in occurrences:
+         sent_id = occ["sent_id"]
+         object_node_id = int(occ["matching"]["nodes"][dependency_node])
+         dep_matches[sent_id] = object_node_id
+     return dep_matches
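
Review note: a sketch of a call against the sample treebank; the returned mapping is `sent_id -> node id` following CoNLL-U token numbering (output shape inferred from the code above):

```python
from pathlib import Path
from treetse.preprocessing.grew_dependencies import match_dependencies

matches = match_dependencies(
    Path("spanish-test-sm.conllu"), 'V [upos="VERB"];', "V"
)
print(matches)  # e.g. {'3LB-CAST-d2-12-s5': 1, ...}
```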
treetse/preprocessing/reconstruction.py ADDED
@@ -0,0 +1,78 @@
+ class Lexer:
+     def __init__(self) -> None:
+         pass
+
+     def perform_token_surgery(
+         self,
+         sentence: str,
+         original_token: str,
+         replacement_token: str,
+         start_index: int,
+     ) -> str:
+         # splice the replacement token into the sentence at the given offset
+         t_len = len(original_token)
+
+         return (
+             sentence[:start_index] + replacement_token + sentence[start_index + t_len :]
+         )
+
+     def recursive_match_token(
+         self,
+         full_sentence: str,
+         token_list: list[str],
+         token_list_mask_index: int,
+         skippable_tokens: list[str],
+     ) -> int:
+         # ensure we can retrieve another token
+         n_remaining_tokens = len(token_list)
+         if n_remaining_tokens == 0:
+             raise ValueError(
+                 "Mask index not reached but token list has been iterated for sentence: {}".format(
+                     full_sentence
+                 )
+             )
+         t = token_list[0]
+
+         # returns the index of the first occurrence of the token t
+         match_index = full_sentence.find(t)
+         is_match_found = match_index != -1
+         has_reached_mask_token = token_list_mask_index == 0
+
+         # BASE CASE: this is the target token; its offset is the answer
+         if has_reached_mask_token and is_match_found:
+             return match_index
+         # RECURSIVE CASE: consume one token and accumulate the offset
+         elif is_match_found:
+             sliced_sentence = full_sentence[match_index + len(t) :]
+             token_list.pop(0)
+
+             return (
+                 match_index
+                 + len(t)
+                 + self.recursive_match_token(
+                     sliced_sentence,
+                     token_list,
+                     token_list_mask_index - 1,
+                     skippable_tokens,
+                 )
+             )
+         else:
+             # no match found, is t irrelevant?
+             if t in skippable_tokens:
+                 # need to watch out with the slicing here; tests are important
+                 # (assumes skippable tokens are single characters, e.g. "_" or " ")
+                 sliced_sentence = full_sentence[len(t) - 1 :]
+                 token_list.pop(0)
+                 return self.recursive_match_token(
+                     sliced_sentence,
+                     token_list,
+                     token_list_mask_index - 1,
+                     skippable_tokens,
+                 )
+             else:
+                 raise ValueError(
+                     "Token not found in string nor has it been specified as skippable: {}".format(
+                         t
+                     )
+                 )
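
Review note: a worked example of the two methods together. `recursive_match_token` resolves the character offset of the token at list index 5 (0-based), and `perform_token_surgery` splices the mask in at that offset:

```python
from treetse.preprocessing.reconstruction import Lexer

lexer = Lexer()
sentence = "La bandera, que no falte."
tokens = ["La", "bandera", ",", "que", "no", "falte", "."]

# offset of "falte" in the raw sentence string (here: 19)
start = lexer.recursive_match_token(sentence, tokens.copy(), 5, ["_", " "])
print(lexer.perform_token_surgery(sentence, "falte", "[MASK]", start))
# -> "La bandera, que no [MASK]."
```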
treetse/visualise/__pycache__/visualiser.cpython-312.pyc ADDED
Binary file (4.78 kB).
treetse/visualise/__pycache__/visualiser.cpython-313.pyc ADDED
Binary file (4.4 kB).
treetse/visualise/visualiser.py ADDED
@@ -0,0 +1,114 @@
+ import pandas as pd
+ from plotnine import labs, theme, theme_bw, guides, position_nudge, aes, geom_violin, geom_boxplot, geom_line, geom_jitter, scale_x_discrete, ggplot
+ from pathlib import Path
+ import math
+
+
+ class Visualiser:
+     def __init__(self) -> None:
+         self.data = None
+
+     def load_dataset(self, results: pd.DataFrame) -> None:
+         self.data = results
+
+     def visualise_slope(
+         self,
+         path: Path,
+         results: pd.DataFrame,
+         target_x_label: str,
+         alt_x_label: str,
+         x_axis_label: str,
+         y_axis_label: str,
+         title: str,
+     ) -> None:
+         lsize = 0.65
+         fill_alpha = 0.7
+
+         # x-axis: the two minimal-pair conditions (e.g. Acc, Gen)
+         # y-axis: model confidence
+         # keep only rows for which an alternative form was found
+         filtered_df = results[
+             results["alternative"].notna() & (results["alternative"].str.strip() != "")
+         ].copy()
+         print("Number of filtered results: ", len(filtered_df))
+
+         filtered_df["subject_id"] = filtered_df.index
+
+         # melt the dataframe into long format: one row per (sentence, condition)
+         df_long = pd.melt(
+             filtered_df,
+             id_vars=["subject_id", "num_tokens"],
+             value_vars=["label_prob", "alternative_prob"],
+             var_name="source",
+             value_name="log_prob",
+         )
+
+         # map source to fixed x-axis labels
+         df_long["x_label"] = df_long["source"].map(
+             {"label_prob": target_x_label, "alternative_prob": alt_x_label}
+         )
+
+         def surprisal(p: float) -> float:
+             return -math.log2(p)
+
+         def confidence(p: float) -> float:
+             return math.log2(p)
+
+         # note: despite the column name, the plotted quantity is
+         # confidence (log2 p); use `surprisal` instead for -log2 p
+         df_long["surprisal"] = df_long["log_prob"].apply(confidence)
+
+         p = (
+             ggplot(df_long, aes(x="x_label", y="surprisal", fill="x_label"))
+             + scale_x_discrete(limits=[target_x_label, alt_x_label])
+             + geom_jitter(
+                 aes(color="x_label", size="num_tokens"), width=0.01, alpha=0.7
+             )
+             # + geom_text(aes(label='label'), nudge_y=0.1)
+             + geom_line(aes(group="subject_id"), color="gray", alpha=0.7, size=0.2)
+             + geom_boxplot(
+                 df_long[df_long["x_label"] == target_x_label],
+                 aes(x="x_label", y="surprisal", group="x_label"),
+                 width=0.2,
+                 alpha=0.4,
+                 size=0.6,
+                 outlier_shape=None,
+                 show_legend=False,
+                 position=position_nudge(x=-0.2),
+             )
+             + geom_boxplot(
+                 df_long[df_long["x_label"] == alt_x_label],
+                 aes(x="x_label", y="surprisal", group="x_label"),
+                 width=0.2,
+                 alpha=0.4,
+                 size=0.6,
+                 outlier_shape=None,
+                 show_legend=False,
+                 position=position_nudge(x=0.2),
+             )
+             + geom_violin(
+                 df_long[df_long["x_label"] == target_x_label],
+                 aes(x="x_label", y="surprisal", group="x_label"),
+                 position=position_nudge(x=-0.4),
+                 style="left-right",
+                 alpha=fill_alpha,
+                 size=lsize,
+             )
+             + geom_violin(
+                 df_long[df_long["x_label"] == alt_x_label],
+                 aes(x="x_label", y="surprisal", group="x_label"),
+                 position=position_nudge(x=0.4),
+                 style="right-left",
+                 alpha=fill_alpha,
+                 size=lsize,
+             )
+             + guides(fill=False)
+             + theme_bw()
+             + theme(figure_size=(8, 4), legend_position="none")
+             + labs(x=x_axis_label, y=y_axis_label, title=title)
+         )
+         p.save(path, width=14, height=8, dpi=300)
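
Review note: a sketch of how app.py drives this method; the labels are illustrative and `results_df` stands for the dataframe returned by `evaluate_bert_mlm` above:

```python
from treetse.visualise.visualiser import Visualiser

vis = Visualiser()
vis.visualise_slope(
    "vis.png", results_df,
    "Subjunctive", "Indicative",            # the two x-axis conditions
    "Mood of Spanish Verbs", "Confidence",  # axis labels
    "Syntactic Performance of DistilBERT",  # plot title
)
```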