Albin Thörn Cleland committed
Commit f5c8cbe · 1 Parent(s): abd4157

Cleaning up

.ipynb_checkpoints/README-checkpoint.md DELETED
@@ -1,40 +0,0 @@
- # Test
-
- - Train: the full historical machine-parsed corpus `alanev52/Diachronic_Treebanks_DigPhil/stanza_results`, plus all five Swedish UD treebanks, as well as the Bokmål ones
- - Dev: 10% of the gold data (`alanev52/Diachronic_Treebanks_DigPhil/stanza_results/validated`)
- - Test: 90% of the gold data
-
- <!-- tmux attach -t stanza ****-->
-
- Workflow:
-
- ```
- python prepare-train-val-test.py
-
- source scripts/config_alvis.sh
-
- python -m stanza.utils.datasets.prepare_depparse_treebank UD_Swedish-diachronic --wordvec_pretrain_file /cephyr/users/cleland/Alvis/stanza_resources/sv/pretrain/diachronic.pt
-
- python -m stanza.utils.training.run_depparse UD_Swedish-diachronic --wordvec_pretrain_file /cephyr/users/cleland/Alvis/stanza_resources/sv/pretrain/diachronic.pt --batch_size 32 --dropout 0.33
- ```
-
- ## Pretrained vectors
-
- We use the incremental vectors up to 1880 from Hengchen & Tahmasebi (2021).
-
- I first converted the kubhist2 vectors from gensim's fastText `.ft` format to a plain text file with gensim's Python package, and then used Stanza's converter to turn that file into a `.pt`.
-
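- A minimal sketch of the first step, assuming the model loads with gensim's native `FastText.load` (the `.ft` filename here is hypothetical):
-
- ```
- from gensim.models.fasttext import FastText
-
- # load the saved gensim fastText model and write its vectors out in
- # plain word2vec text format, which Stanza's converter can read
- model = FastText.load("kubhist2-1880.ft")
- model.wv.save_word2vec_format("new_vectors.txt", binary=False)
- ```
-
- The second step then uses Stanza's `Pretrain` class: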
- ```
- from stanza.models.common.pretrain import Pretrain
-
- # load() builds foo.pt from the text vectors if it does not exist yet
- pt = Pretrain("foo.pt", "new_vectors.txt")
- pt.load()
- ```
-
- The result is available compressed as `diachronic.pt.xz`.
-
- ## References
-
- **Hengchen, Simon & Tahmasebi, Nina. (2021).**
- *A collection of Swedish diachronic word embedding models trained on historical newspaper data.*
- **Journal of Open Humanities Data**, 7(2), 1–7.
- https://doi.org/10.5334/johd.22
.ipynb_checkpoints/prepare-train-val-test-checkpoint.py DELETED
@@ -1,246 +0,0 @@
- #!/usr/bin/env python3
- import random
- from pathlib import Path
- from collections import defaultdict
-
- # ============================================================
- # BASE PATHS
- # ============================================================
- BASE = Path("/mimer/NOBACKUP/groups/dionysus/cleland/stanza-digphil").resolve()
-
- SVENSKA_PROJEKT = BASE / "ud-treebanks-sv"
- NORSKA_PROJEKT = BASE / "ud-treebanks-bm"
- DANSKA_PROJEKT = BASE / "ud-treebanks-dk"
-
- DIGPHIL_MACHINE = BASE / "alanev_raw_files/diachron"
- DIGPHIL_GOLD = BASE / "alanev_raw_files/diachron-validated"
-
- OUTPUT_TRAIN = BASE / "sv_diachronic-ud-train.conllu"
- OUTPUT_DEV = BASE / "sv_diachronic-ud-dev.conllu"
- OUTPUT_TEST = BASE / "sv_diachronic-ud-test.conllu"
-
- random.seed(1337)
-
- # ============================================================
- # BASIC CONLLU HELPERS
- # ============================================================
- def read_conllu(path: Path):
-     text = path.read_text(encoding="utf-8").strip()
-     return [] if not text else text.split("\n\n")
-
- def extract_sent_id(block: str) -> str | None:
-     for line in block.split("\n"):
-         if line.startswith("# sent_id"):
-             parts = line.split("=", 1)
-             if len(parts) == 2:
-                 return parts[1].strip()
-             return line.split("# sent_id", 1)[1].strip()
-     return None
-
- def write_conllu(path: Path, sentences):
-     with path.open("w", encoding="utf-8") as f:
-         for s in sentences:
-             f.write(s.strip() + "\n\n")
-
- def load_from_treebank_dir(directory: Path):
-     collected = []
-     for path in directory.rglob("*.conllu"):
-         print(f"Reading: {path}")
-         collected.extend(read_conllu(path))
-     return collected
-
- # ============================================================
- # CoNLL-U VALIDATOR (unchanged logic, adapted for in-memory)
- # ============================================================
- class CoNLLUValidator:
-     def __init__(self):
-         self.errors = []
-
-     def validate_sentence(self, sentence_lines, sent_id=None):
-         self.errors = []
-
-         if not sentence_lines:
-             self.errors.append("Empty sentence")
-             return False
-
-         tokens = []
-         roots = []
-         token_ids = set()
-
-         for line_num, line in enumerate(sentence_lines, 1):
-             try:
-                 fields = line.split('\t')
-                 if len(fields) != 10:
-                     self.errors.append(f"Line {line_num}: Expected 10 fields, got {len(fields)}")
-                     continue
-
-                 token_id, form, lemma, upos, xpos, feats, head, deprel, deps, misc = fields
-
-                 # skip multiword token ranges (e.g. 1-2) and empty nodes (e.g. 1.1)
-                 if '-' in token_id or '.' in token_id:
-                     continue
-
-                 try:
-                     token_id_int = int(token_id)
-                     head_int = int(head)
-                 except ValueError:
-                     self.errors.append(f"Line {line_num}: Invalid token ID or head")
-                     continue
-
-                 token_ids.add(token_id_int)
-
-                 if head_int == 0:
-                     roots.append(token_id_int)
-
-                 tokens.append({
-                     'id': token_id_int,
-                     'form': form,
-                     'lemma': lemma,
-                     'upos': upos,
-                     'head': head_int,
-                     'deprel': deprel
-                 })
-             except Exception as e:
-                 self.errors.append(f"Line {line_num}: Error: {e}")
-
-         if len(roots) == 0:
-             self.errors.append("No root found")
-         elif len(roots) > 1:
-             self.errors.append(f"Multiple roots found: {roots}")
-
-         for token in tokens:
-             if token['head'] != 0 and token['head'] not in token_ids:
-                 self.errors.append(f"Token {token['id']} has invalid head {token['head']}")
-
-         if not self._check_no_cycles(tokens):
-             self.errors.append("Dependency cycle detected")
-
-         for token in tokens:
-             if not token['form'] or token['form'] == '_':
-                 self.errors.append(f"Token {token['id']}: Missing form")
-             if not token['upos'] or token['upos'] == '_':
-                 self.errors.append(f"Token {token['id']}: Missing UPOS")
-             if not token['deprel'] or token['deprel'] == '_':
-                 self.errors.append(f"Token {token['id']}: Missing deprel")
-
-         return len(self.errors) == 0
-
-     def _check_no_cycles(self, tokens):
-         # walk each token's head chain towards the root; revisiting a node
-         # before reaching head 0 means the chain loops
-         heads = {t['id']: t['head'] for t in tokens}
-         for start in tokens:
-             visited = set()
-             current = start['id']
-             while current != 0 and current in heads:
-                 if current in visited:
-                     return False
-                 visited.add(current)
-                 current = heads[current]
-         return True
-
-     def get_errors(self):
-         return self.errors
-
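- # A quick usage sketch (the one-token sentence below is made up):
- #   v = CoNLLUValidator()
- #   v.validate_sentence(["1\tHej\thej\tINTJ\t_\t_\t0\troot\t_\t_"])  # -> True
-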
- # ============================================================
- # CLEANING PIPELINE (in-memory)
- # ============================================================
- def clean_sentences(sentence_blocks):
-     """
-     Take a list of CoNLL-U sentence blocks as strings.
-     Return a new, cleaned list.
-     """
-     validator = CoNLLUValidator()
-     cleaned = []
-
-     for block in sentence_blocks:
-         lines = [l for l in block.split("\n") if not l.startswith("#")]
-         comments = [l for l in block.split("\n") if l.startswith("#")]
-         sent_id = None
-         for c in comments:
-             if c.startswith("# sent_id"):
-                 sent_id = c.split("=", 1)[1].strip() if "=" in c else None
-
-         if validator.validate_sentence(lines, sent_id):
-             cleaned.append(block)
-         else:
-             print(f"[REMOVED] sent_id={sent_id} ERRORS={validator.get_errors()}")
-
-     return cleaned
-
-
- # ============================================================
- # BUILD TRAIN SENTENCES
- # ============================================================
- train_sentences = []
- train_sentences.extend(load_from_treebank_dir(SVENSKA_PROJEKT))
- train_sentences.extend(load_from_treebank_dir(NORSKA_PROJEKT))
- train_sentences.extend(load_from_treebank_dir(DANSKA_PROJEKT))
-
-
- # ============================================================
- # HANDLE DIGPHIL MACHINE minus gold
- # ============================================================
- def map_sent_ids_by_file(directory: Path):
-     mapping = {}
-     for path in directory.glob("*.conllu"):
-         blocks = read_conllu(path)
-         ids = {extract_sent_id(b) for b in blocks if extract_sent_id(b)}
-         mapping[path.name] = ids
-     return mapping
-
- gold_ids = map_sent_ids_by_file(DIGPHIL_GOLD)
-
- # skip machine-parsed sentences whose sent_id also occurs in the gold file
- # of the same name, so gold sentences never leak into train
- for machine_file in DIGPHIL_MACHINE.glob("*.conllu"):
-     blocks = read_conllu(machine_file)
-     filename = machine_file.name
-     gold_for_this = gold_ids.get(filename, set())
-
-     for block in blocks:
-         sid = extract_sent_id(block)
-         if sid and sid in gold_for_this:
-             continue
-         train_sentences.append(block)
-
-
- # ============================================================
- # GOLD → DEV/TEST SPLIT
- # ============================================================
- gold_sentences = []
- for gold_file in DIGPHIL_GOLD.glob("*.conllu"):
-     print(f"Reading GOLD: {gold_file}")
-     gold_sentences.extend(read_conllu(gold_file))
-
- random.shuffle(gold_sentences)
-
- n = len(gold_sentences)
- dev_size = max(1, int(n * 0.10))
-
- dev_sentences = gold_sentences[:dev_size]
- test_sentences = gold_sentences[dev_size:]
-
-
- # ============================================================
- # CLEAN ALL THREE OUTPUTS
- # ============================================================
- print("Cleaning TRAIN...")
- train_sentences = clean_sentences(train_sentences)
-
- print("Cleaning DEV...")
- dev_sentences = clean_sentences(dev_sentences)
-
- print("Cleaning TEST...")
- test_sentences = clean_sentences(test_sentences)
-
-
- # ============================================================
- # WRITE FINAL OUTPUTS
- # ============================================================
- print(f"Writing TRAIN → {OUTPUT_TRAIN} ({len(train_sentences)} valid sentences)")
- write_conllu(OUTPUT_TRAIN, train_sentences)
-
- print(f"Writing DEV → {OUTPUT_DEV} ({len(dev_sentences)} valid sentences)")
- write_conllu(OUTPUT_DEV, dev_sentences)
-
- print(f"Writing TEST → {OUTPUT_TEST} ({len(test_sentences)} valid sentences)")
- write_conllu(OUTPUT_TEST, test_sentences)
-
- print("Done.")
data/depparse/sv_diachronic.dev.in.conllu DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:3e187a488028eda24d6acbc32d15d75eb91d816ac0ab1e33479499ebe00c0948
- size 30651
data/depparse/sv_diachronic.test.in.conllu DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:562acbbc94d10e1dbbfb64a0678aee4c4fe9fa2d25d83ceb577fb1661ada19f3
- size 272480
data/depparse/sv_diachronic.train.in.conllu DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:10cc5ba42ab7435c03b73588eb514007d54367d696078684de08a308fb5cf989
- size 116586108
saved_models/depparse/sv_diachronic_charlm_parser.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:87b032707188b5d53ed43f02af00b3d22f321b55e426400b0e8a35e2cf021bb9
- size 150761755
+ oid sha256:839139656fffb5c71a8df13c7b9dc4599617ed23e79b4dd84802a7d284ebfa37
+ size 150761848
saved_models/depparse/sv_diachronic_charlm_parser_checkpoint.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ec3577e850a19917e89cd37d0ca98f929c7b5093c05901938fe424cc53966cab
- size 450717113
+ oid sha256:c6fe2ce63c2eb24d762269e19f200e59f69414020225a9221490ec7c5a191323
+ size 450717215
ud/UD_Swedish-diachronic/sv_diachronic-ud-dev.conllu DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:6a612aeecf56e8dabe12273071ee63ca0ed26e9ade00ba101dad2b4972b5e61c
- size 30068
ud/UD_Swedish-diachronic/sv_diachronic-ud-test.conllu DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c169b239a1f4e90357e439cc29f8c7bb764d550d88ace7de8a1bfeca28c2821d
- size 269341
ud/UD_Swedish-diachronic/sv_diachronic-ud-train.conllu DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:bed616fe972a13269f92d13ff4d9f73c56bddd045ba3d5a3a47c2ff2582c7a6f
- size 108812346