niobures commited on
Commit
4a08ba7
·
verified ·
1 Parent(s): 1964ee9
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +7 -35
  2. .gitignore +5 -0
  3. Code/README.html +0 -0
  4. Code/README.md +12 -0
  5. Code/contextRep.py +200 -0
  6. Code/stopatn.sh +24 -0
  7. Code/sumstats01.py +202 -0
  8. Code/translate03.py +325 -0
  9. Code/translate04.py +346 -0
  10. Data/README.md +10 -0
  11. Data/_compromised/acr_RabinalAchi'/acr.Rmd +112 -0
  12. Data/_compromised/acr_RabinalAchi'/acr.bib +29 -0
  13. Data/_compromised/acr_RabinalAchi'/acr.html +0 -0
  14. Data/_compromised/acr_RabinalAchi'/acr.rules +28 -0
  15. Data/_compromised/acr_RabinalAchi'/acr.verify.csv +20 -0
  16. Data/_compromised/ake_Akawaio/ake.Rmd +113 -0
  17. Data/_compromised/ake_Akawaio/ake.bib +40 -0
  18. Data/_compromised/ake_Akawaio/ake.html +0 -0
  19. Data/_compromised/ake_Akawaio/ake.rules +13 -0
  20. Data/_compromised/ake_Akawaio/ake.verify.csv +30 -0
  21. Data/_compromised/amp_Alamblak/amp.Rmd +125 -0
  22. Data/_compromised/amp_Alamblak/amp.bib +62 -0
  23. Data/_compromised/amp_Alamblak/amp.html +0 -0
  24. Data/_compromised/amp_Alamblak/amp.rules +19 -0
  25. Data/_compromised/amp_Alamblak/amp.verify.csv +30 -0
  26. Data/_compromised/aoj_Mufian/aoj.Rmd +109 -0
  27. Data/_compromised/aoj_Mufian/aoj.bib +27 -0
  28. Data/_compromised/aoj_Mufian/aoj.html +0 -0
  29. Data/_compromised/aoj_Mufian/aoj.rules +25 -0
  30. Data/_compromised/aoj_Mufian/aoj.verify.csv +30 -0
  31. Data/_compromised/ar_Arabic/ar.Rmd +155 -0
  32. Data/_compromised/ar_Arabic/ar.bib +113 -0
  33. Data/_compromised/ar_Arabic/ar.html +0 -0
  34. Data/_compromised/ar_Arabic/ar.rules +167 -0
  35. Data/_compromised/ar_Arabic/ar.verify.csv +39 -0
  36. Data/_compromised/arn_Mapudungun/arn.Rmd +139 -0
  37. Data/_compromised/arn_Mapudungun/arn.bib +38 -0
  38. Data/_compromised/arn_Mapudungun/arn.html +0 -0
  39. Data/_compromised/arn_Mapudungun/arn.rules +47 -0
  40. Data/_compromised/arn_Mapudungun/arn.verify.csv +30 -0
  41. Data/_compromised/awx_Awara/awx.Rmd +119 -0
  42. Data/_compromised/awx_Awara/awx.bib +17 -0
  43. Data/_compromised/awx_Awara/awx.html +0 -0
  44. Data/_compromised/awx_Awara/awx.rules +27 -0
  45. Data/_compromised/awx_Awara/awx.verify.csv +30 -0
  46. Data/_compromised/bcl_CentralBikol/bcl.Rmd +126 -0
  47. Data/_compromised/bcl_CentralBikol/bcl.bib +17 -0
  48. Data/_compromised/bcl_CentralBikol/bcl.html +0 -0
  49. Data/_compromised/bcl_CentralBikol/bcl.rules +35 -0
  50. Data/_compromised/bcl_CentralBikol/bcl.verify.csv +32 -0
.gitattributes CHANGED
@@ -1,35 +1,7 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ docs/* linguist-documentation=false
2
+ docs/css/* linguist-documentation=false
3
+ docs/js/* linguist-documentation=false
4
+ Data/** linguist-detectable=false
+ docs/images/brown.jpg filter=lfs diff=lfs merge=lfs -text
5
+ docs/images/wordcloud_image.jpg filter=lfs diff=lfs merge=lfs -text
6
+ docs/manual/xpf_manual.pdf filter=lfs diff=lfs merge=lfs -text
7
+ Manual/xpf_manual.pdf filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ .DS_Store
2
+ .sav
3
+ .Rhistory
4
+ Abandoned
5
+ Available
Code/README.html ADDED
The diff for this file is too large to render. See raw diff
 
Code/README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code
2
+
3
+ The scripts in this folder are explained in more detail in the manual, which describes how to use them and how to interpret their respective outputs. The specific sections of the manual that correspond to each of the files are linked below:
4
+
5
+ * `contextRep.py`:  [2.1 Segment Informativity Measures](https://cohenpr-xpf.github.io/XPF/manual/xpf_manual.pdf#Segment%20Informativity%20Measures)
6
+ * `stopatn.sh`:  [2.0.1 Evaluating Frequency Files](https://cohenpr-xpf.github.io/XPF/manual/xpf_manual.pdf#Evaluating%20Frequency%20Files)
7
+ * `sumstats01.py`:  [2.2 Summary Statistics](https://cohenpr-xpf.github.io/XPF/manual/xpf_manual.pdf#Summary%20Statistics)
8
+ * `translate03.py` and `translate04.py`&thinsp;<sup id="ref1">[1](#foot1)</sup>:&nbsp; [1.5 Translation Scheme](https://cohenpr-xpf.github.io/XPF/manual/xpf_manual.pdf#Translation%20Scheme)
9
+
10
+ <br>
11
+
12
+ <b id="foot1">1</b> The only difference between `translate03.py` and `translate04.py` is that `translate04.py` accounts for the [match rules](https://cohenpr-xpf.github.io/XPF/manual/xpf_manual.pdf#page=10) needed for the phonemic translation of Korean. [←](#ref1)
Code/contextRep.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ from __future__ import print_function
3
+
4
+ from math import log
5
+ from collections import deque
6
+
7
class contextRep(object):
    """
    Prefix tree over observed sequences, with occurrence counts.

    Each node stands for one context (the prefix that leads to it) and
    stores how often that context was observed, how often a sequence
    ended there, and one child node per observed continuation.
    """

    def __init__(self):
        self.count = 0.0        # times this context was observed
        self.contexts = dict()  # continuation:context dictionary
        self.precals = None     # probs can be precalculated
        self.terminal = 0.0     # number of times this context was final

    def __repr__(self):
        return repr([self.count, self.terminal, self.contexts])

    def __str__(self):
        return repr(self)

    def add(self, seq, count, func=lambda x: None):
        """
        Add a full sequence to the representation.

        seq is any sliceable sequence of hashable items (a string or a
        list of segments); count is added to every node on its path and
        to the terminal counter of the last node.  func is accepted and
        passed down the recursion but is never called.
        """
        if len(seq) > 0:
            key = seq[0]
            if key not in self.contexts:
                self.contexts[key] = contextRep()
            self.contexts[key].add(seq[1:], count, func)
        else:
            self.terminal += count

        self.count += count

    def prob(self, key, log2=False):
        """
        Get the probability of observing a particular continuation in
        the given context (0.0 for a continuation never observed).

        With log2=True the base-2 logarithm is returned instead; this
        raises ValueError when the probability is zero.
        """
        if self.precals is None:
            ret = self.contexts[key].count / self.count \
                if key in self.contexts else 0.0
        else:
            ## same answer as the live branch for unseen keys (this
            ## used to raise KeyError once probabilities were cached)
            ret = self.precals.get(key, 0.0)

        return ret if not log2 else log(ret, 2)

    def probs(self, log2=False):
        """
        Get the probabilities of all continuations in the given context,
        as a continuation:probability dictionary.
        """
        if self.precals is None:
            ret = {key: self.prob(key, log2=log2) for key in self.contexts}
        else:
            ret = self.precals if not log2 \
                else {p: log(self.precals[p], 2) for p in self.precals}
        return ret

    def precalc(self):
        """
        Create a static image of the probabilities, for this node and
        all nodes below it.  The image is not refreshed by later calls
        to add.
        """
        self.precals = self.probs()
        for key in self.contexts:
            self.contexts[key].precalc()

    def contextProb(self, seq, terminal=False):
        """
        Create for each item in a sequence the probability of observing
        it in the given context.  Once an item is unknown, it and all
        following items get 0.0.  With terminal=True one more value is
        appended: the probability that the sequence ends there.
        """
        context = self
        ret = deque()
        for key in seq:
            if context is not None and key in context.contexts:
                ret.append(context.prob(key, False))
                context = context.contexts[key]
            else:
                context = None
                ret.append(0.0)

        if terminal:
            ## an empty tree has count 0; report 0.0 instead of dividing
            if context is not None and context.count:
                ret.append(context.terminal / context.count)
            else:
                ret.append(0.0)
        return list(ret)

    def informativity_counts(self):
        """
        Compute, for every segment found at or below this node, the
        count-weighted mean of -log2 P(segment | context) over all the
        contexts it occurs in.

        Returns (values, counts): two dictionaries keyed by segment,
        the mean and the total count it was averaged over.
        """
        retvals = {key: (-log(self.contexts[key].count / self.count, 2))
                   for key in self.contexts}
        retcounts = {key: self.contexts[key].count for key in self.contexts}

        for key in self.contexts:
            (subvals, subcounts) = \
                self.contexts[key].informativity_counts()
            ## merge the subtree's means into ours, weighting by count
            for seg in subvals:
                (selfval, selfcount) = (retvals[seg], retcounts[seg]) \
                    if seg in retvals \
                    else (0.0, 0.0)
                retvals[seg] = (selfval*selfcount +
                                subvals[seg]*subcounts[seg]) / (subcounts[seg]+selfcount)
                retcounts[seg] = selfcount + subcounts[seg]

        return (retvals, retcounts)

    def informativity(self):
        """
        Same as informativity_counts, but returns only the values.
        """
        (informativity, counts) = self.informativity_counts()
        return informativity

    def iter(self, terminal=False, log2=False):
        """
        Yield every stored sequence as a list of
        {"seg", "prob", "count"} dictionaries, continuations in sorted
        order.  With terminal=True each list ends with an entry whose
        "seg" is None; with log2=True "prob" is -log2 of the
        probability (0 when the probability is 1 or more).
        """
        logfunc = (lambda x: -log(x, 2) if x < 1 else 0) if log2 else (lambda x: x)

        if self.terminal > 0:
            ## NOTE(review): "count" here is the count of the context,
            ## not self.terminal -- confirm that this is intended
            yield [{"seg": None, "prob": logfunc(self.terminal / self.count), "count": self.count}] if terminal else []

        for key in sorted(self.contexts):
            for cont in self.contexts[key].iter(terminal=terminal, log2=log2):
                yield [{"seg": key,
                        "prob": logfunc(self.contexts[key].count / self.count),
                        "count": self.contexts[key].count}
                       ] + cont

    def __iter__(self):
        for value in self.iter(log2=True, terminal=False):
            yield value

    ##
    ## Returns a pure dictionary representation of the object.
    ## "precals" is True when the probabilities of this node were
    ## precalculated (it used to be stored inverted).
    ##
    def asdict(self):
        ret = {"count": self.count,
               "contexts": {key: self.contexts[key].asdict() for key in self.contexts},
               "precals": self.precals is not None,
               "terminal": self.terminal}
        return ret

    ##
    ## Reconstruct an object from a dictionary (created by asdict), and
    ## return it.  The only real reason to use this method + asdict is
    ## to save contextRep objects in R / json easily.
    ##
    ## Dictionaries written by the earlier asdict carry the flag
    ## inverted; loading one only changes whether probabilities are
    ## cached, never their values.
    ##
    def populate(self, d):
        self.count = d["count"]
        self.terminal = d["terminal"]
        self.contexts = {key: contextRep().populate(d["contexts"][key]) for key in d["contexts"]}
        ## every child restored its own image above, so only this
        ## node's image is rebuilt here (precalc() returns None, so its
        ## result must not be assigned)
        self.precals = None
        if d["precals"]:
            self.precals = self.probs()
        return self

    ##
    ## Object equality (only to check asdict / populate); compares
    ## counts and structure, not the precalculated images
    ##
    def __eq__(self, other):
        if isinstance(other, contextRep):
            return all([self.terminal == other.terminal,
                        self.count == other.count,
                        all(self.contexts[key] == other.contexts[key] if key in other.contexts else False
                            for key in self.contexts),
                        all(key in self.contexts for key in other.contexts)])
        else:
            return False
174
+
175
+
176
if __name__ == "__main__":
    ##
    ## Smoke test: build a small tree, print its different views, then
    ## check that asdict / populate give back an equal object.
    ##
    tree = contextRep()
    samples = [("ab", 5),
               ("ac", 5),
               ("a", 5),
               ("c", 15),
               ("P AO1 R T N OY0".split(), 1)]
    for (sequence, frequency) in samples:
        tree.add(sequence, frequency)

    print(tree)
    print(tree.informativity())
    print(tree.probs())
    print(tree.contextProb("ab"))
    print(tree.contextProb("a"))
    print(tree.asdict())

    clone = contextRep()
    clone.populate(tree.asdict())
    print(clone)
    print(clone == tree)
    print(tree.informativity() == clone.informativity())

    for path in tree.iter(terminal=True):
        print(path)
Code/stopatn.sh ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

##
## Usage: stopatn.sh N [MINFREQ] < frequencies > top
##
## Reads "word frequency" lines from stdin and prints every line whose
## frequency is at least that of the N-th most frequent line (so ties
## with the N-th line are kept and more than N lines may be printed).
##
if [ -z "$1" ]; then
    echo "usage: $0 N [MINFREQ]" >&2
    exit 2
fi

##
## ignore low frequencies so we don't waste time dealing with
## them. Default: ignore nothing.
##
minfreq=${2:-0}

##
## the input is needed twice, so keep a compressed copy of it; the
## trap removes the copy on failure as well as on success
##
tf=$(mktemp) || exit 1
trap 'rm -f "$tf"' EXIT

## remove low freqs if any; the threshold is handed over with -v so
## that it is never parsed as awk program text
awk -v min="$minfreq" '$2 >= min' | bzip2 -9 > "$tf"

lastfreq=$(bzcat "$tf" |                     ## revisit the file
           sort -nr -k 2,2 -k 1,1 |          ## sort to find top $1 freq
           head -n "$1" |                    ## find the top $1 lines
           tail -n1 |                        ## keep only last line
           (read -r _ freq _; echo "$freq")) ## it's the second field

##
## nothing survived the filter (or stdin was empty): print nothing
## instead of handing awk an incomplete comparison
##
if [ -n "$lastfreq" ]; then
    bzcat "$tf" | awk -v min="$lastfreq" '$2 >= min' ## remove lower freqs
fi
Code/sumstats01.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import re
4
+ import argparse
5
+ import sys
6
+ import csv
7
+ import traceback
8
+ from collections import deque, defaultdict
9
+ from math import inf
10
+ import translate04 as translate
11
+ from contextRep import contextRep
12
+
13
+
14
def oneNyield(item, iterable):
    """
    Yield item, then every element of iterable (used to put a line
    that was already read back in front of the rest of a file).
    """
    yield item
    yield from iterable
18
+
19
def getRep(fobj, a2ipa, minfreq=1):
    """
    Read a word-frequency file and build a contextRep from the
    translations of its words.

    fobj is an open text file with two fields per line (word and
    integer frequency) separated by a tab, a comma or a space; a first
    line whose last field is not an integer is treated as a header.
    a2ipa must provide translate(word), returning a list of segments.
    Lines with a frequency below minfreq are skipped.

    Returns the contextRep, with two extra attributes: wordlist (the
    words that were added, with their translations) and stats (line
    counters plus the words whose translation contains "@").  A line
    that cannot be processed ends the program with exit status 1.
    """
    ret = contextRep()
    stats = {"nlines": 0,
             "skipped": 0,
             "missing": 0,
             "@words": dict()}

    ret.wordlist = deque()
    ret.stats = stats

    ##
    ## Try to understand file type
    ##
    sniffLine = fobj.readline().rstrip("\r\n")
    for candidate in ("\t", ",", " "):
        if len(sniffLine.split(candidate)) == 2:
            sep = candidate
            break
    else:
        ## a diagnostic, so it goes to stderr and not into the data
        print("Could not understand frequencies file, not proceeding", file=sys.stderr)
        return ret

    ##
    ## Try to figure whether there are headers
    ##
    try:
        int(sniffLine.split(sep)[-1])
        lines = oneNyield(sniffLine, fobj)  ## no header: first line is data
    except ValueError:
        lines = fobj                        ## header: drop the first line

    for line in lines:
        stats["nlines"] += 1
        line = line.rstrip("\r\n")
        try:
            ##
            ## parse line
            ##
            (word, freq) = line.split(sep)
            freq = int(freq)

            ##
            ## ignore low frequencies
            ##
            if freq < minfreq:
                stats["skipped"] += 1
                continue

            ##
            ## Translate
            ##
            translation = a2ipa.translate(word)
            if "@" in translation:
                stats["missing"] += 1
                stats["@words"][word] = {"freq": freq, "translation": translation}

            ##
            ## Add context to representation
            ##
            else:
                ## the key was "word:" (stray colon) before
                ret.wordlist.append({"word": word, "translation": translation})
                ret.add(translation, freq)

        except Exception as err:
            print("Error in word frequency parsing. Offending line is {}, the message is: {}".format(repr(line), err), file=sys.stderr)
            ## sys.exit: the bare exit() builtin only exists when the
            ## site module is loaded
            sys.exit(1)

    return ret
99
+
100
+
101
def main(argv):
    """
    Command line entry point: print the informativity of every segment
    of a frequency file, followed by summary statistics.

    argv is the list of command line arguments, without the program
    name.  Ends the program with exit status 1 when the verification
    file given with --check does not pass.
    """
    ## the text has to go in description=; the first positional
    ## parameter of ArgumentParser is the program name
    parser = argparse.ArgumentParser(
        description="Provide summary statistics for language and frequency files")

    ##
    ## Specifies the rules used for translation
    ##
    parser.add_argument("-l", "--langrules", dest="langrules",
                        type=argparse.FileType('r', encoding="utf8"),
                        required=True,
                        help="language code rules file")

    ##
    ## Specifies a verification file, in any csv format. Headers are
    ## not expected. The first column is supposed to be a word, and
    ## the second is its ideal translation
    ##
    parser.add_argument("-c", "--check", dest="check",
                        default=None, type=argparse.FileType('r', encoding="utf8"),
                        help="file to use for verification")

    ##
    ## Allows to read data from some file (which should not be compressed)
    ##
    parser.add_argument("-r", "--read", dest="read",
                        default=sys.stdin,
                        type=argparse.FileType('r', encoding="utf8"),
                        help="file used for translation (word and frequency)")

    ##
    ## Add min frequency
    ##
    parser.add_argument("-m", "--min", dest="min",
                        default=1,
                        type=int,
                        help="minimal frequency to consider")

    ##
    ## Print summary?
    ##
    parser.add_argument("-N", "--no-summary", dest="nosummary",
                        default=False, action="store_true",
                        help="suppress summary information")

    ##
    ## Print all probs? (was store_false with default False, so the
    ## flag could never be switched on; the value is not used below)
    ##
    parser.add_argument("-A", "--all-words", dest="allwords",
                        default=False, action="store_true",
                        help="Enumerate all words and probabilities")

    ##
    ## How many @ words?
    ##
    parser.add_argument("-@", "--max@", dest="max@",
                        default=10,
                        type=int,
                        help="number of @ words to include in summary")

    options = vars(parser.parse_args(argv))

    a2ipa = translate.alphabet2ipa(options["langrules"])

    if options["check"] is not None:
        allGood = a2ipa.check(options["check"])
        if not allGood:
            print("Verification failed, not processing additional data", file=sys.stderr)
            sys.exit(1)

    rep = getRep(options["read"], a2ipa, minfreq=options["min"])
    rep.precalc()

    (info, counts) = rep.informativity_counts()

    ## most frequent segment first, ties in segment order
    print("seg\tinformativity\tcount")
    for (count, seg) in sorted((-counts[seg], seg) for seg in counts):
        print("{seg}\t{info}\t{count}".format(seg=seg,
                                              info=info[seg],
                                              count=-count))

    ##
    ## Print summary information, if not suppressed
    ##
    if not options["nosummary"]:
        considered = rep.stats["nlines"] - rep.stats["skipped"]
        ## nothing considered (empty or unreadable input): report 0.0
        ## instead of dividing by zero
        percent = round(rep.stats["missing"] / considered * 100, 1) \
            if considered > 0 else 0.0

        print("## Summary statistics:")
        print("## processed (inc. skipped):", rep.stats["nlines"])
        print("## skipped:", rep.stats["skipped"])
        print("## %@ words:", percent)

        atwords = rep.stats["@words"]
        print("## Top missing:")
        for (nfreq, word, translation) in sorted((-atwords[word]["freq"], word, atwords[word]["translation"]) for word in atwords)[:options["max@"]]:
            print("## {word} → '{translation}' ({freq})".format(word=word, translation=" ".join(translation), freq=-nfreq))


if __name__ == "__main__":
    main(sys.argv[1:])
Code/translate03.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import re
4
+ import argparse
5
+ import sys
6
+ import csv
7
+ import traceback
8
+ from collections import deque, defaultdict
9
+ from math import inf
10
+
11
+
12
def sniff(filestream):
    """
    Guess the csv dialect of a seekable text stream.

    The guess is made from the first 1024 characters, ignoring empty
    lines and lines starting with "#"; when it fails the "excel"
    dialect is used.  Returns (lines, dialect), where lines lazily
    yields the lines of the stream, from its start, except those
    starting with "#".
    """
    head = filestream.read(1024)
    kept = []
    for piece in re.split("[\r\n]", head):
        if piece and not piece.startswith("#"):
            kept.append(piece)

    try:
        dialect = csv.Sniffer().sniff("\n".join(kept))
    except Exception:
        print("Could not determine delimiter type, proceedings as excel csv", file=sys.stderr)
        dialect = csv.get_dialect("excel")

    ## back to the start, the sample was only needed for the guess
    filestream.seek(0)

    def datalines():
        for line in filestream:
            if not line.startswith("#"):
                yield line

    return (datalines(), dialect)
25
+
26
+
27
class subrule(object):
    ##
    ## simple wrapper for sub rules
    ##
    def __init__(self, values, classes):
        """
        Build a rule from one row of a rules file.

        values maps "sfrom", "sto", "precede", "follow" and "weight"
        to strings; classes maps class names to the text that replaces
        "{name}" placeholders in those strings.  A name missing from
        classes raises KeyError, and a literal "{...}" that is not a
        placeholder (such as a regex quantifier) raises as well.
        """
        for key in ["sfrom", "sto", "precede", "follow", "weight"]:
            value = values[key]

            ##
            ## handle classes (and subclasses): expand again as long
            ## as an expansion brought in further placeholders
            ##
            while re.search("{.*}", value):
                value = value.format(**classes)
            setattr(self, key, value)

        self.weight = float(self.weight)
        self.sfrom = re.compile(self.sfrom)
        self.precede = re.compile(self.precede+"$")  ## ends the left context
        self.follow = re.compile("^"+self.follow)    ## starts the right context

    def subScore(self, sfrom, precede, follow):
        """
        Return the weight of the rule when it applies to sfrom with
        the text precede before it and follow after it, None otherwise.
        """
        if self.sfrom.match(sfrom) and self.precede.search(precede) and self.follow.search(follow):
            return self.weight
        else:
            return None

    def sub(self, x):
        """
        Return x with every match of the sfrom pattern replaced by sto.
        """
        return self.sfrom.sub(self.sto, x)

    def __repr__(self):
        return repr(self.__dict__)

    def __lt__(self, other):
        """
        Order rules by weight.
        """
        if not isinstance(other, subrule):
            ## lets Python try the reflected comparison, and raise
            ## TypeError itself if that fails too
            return NotImplemented
        return self.weight < other.weight
63
+
64
+
65
class alphabet2ipa(object):
    ##
    ## interpretation of rules files
    ##

    classes = None
    subs = None
    ##chartr = None

    def __init__(self, langrules, missing="@", loglevel=0):
        """
        Load the rules of a language.

        langrules is an open rules file, which is closed once read;
        missing is the symbol put in place of a character that no rule
        translates; loglevel sets how much is reported on stderr.
        """
        self.NO_TRANSLATE = missing
        self.loglevel = loglevel
        self.pre = str.maketrans("", "")  ## character mapping applied first
        self.classes = dict()             ## class name -> its expansion
        self.words = dict()               ## whole word -> its segments
        self.subs = set()                 ## per-character rules
        self.ipasubs = set()              ## rules applied to the output

        with langrules as csvsource:
            (rows, dialect) = sniff(csvsource)
            for rule in csv.DictReader(rows, dialect=dialect):
                if self.loglevel > 2:
                    print("Rule found:", rule, file=sys.stderr)

                ##
                ## A rule that cannot be processed is reported and
                ## skipped; it does not stop the loading of the others
                ##
                try:
                    kind = rule["type"]

                    if kind == "pre":
                        ## Pre equivalences
                        self.pre = str.maketrans(rule["sfrom"], rule["sto"])
                    elif kind == "class":
                        ## Classes, usable by the rules that follow
                        self.classes[rule["sfrom"]] = rule["sto"]
                    elif kind == "sub":
                        ## Sub rules
                        self.subs.add(subrule(rule, self.classes))
                    elif kind == "ipasub":
                        ## IPA sub rules
                        self.ipasubs.add(subrule(rule, self.classes))
                    elif kind == "word":
                        ## Whole word substitutions
                        self.words[rule["sfrom"]] = rule["sto"].split()
                    else:
                        ## No such rule
                        print("Unrecognized rule type ({type}), with sfrom={sfrom}, and sto={sto}".format(**rule), file=sys.stderr)
                        continue

                except Exception:
                    traceback.print_exception(*sys.exc_info())
                    print("Error processing rule, but resuming processing other rules. Rule details: {}".format(rule), file=sys.stderr)
                    continue

                if self.loglevel > 1:
                    print("Rule added:", rule, file=sys.stderr)

    def translate(self, source):
        """
        Translate a word; returns the list of its segments.  A
        character that no sub rule applies to is translated to the
        missing symbol.
        """
        ##
        ## fully translated words
        ##
        if source in self.words:
            return self.words[source]

        ##
        ## preprocess using pre, and turn to lowercase
        ##
        source = source.translate(self.pre).lower()

        segments = deque()
        for (position, char) in enumerate(re.findall(".", source)):
            ##
            ## the context is everything before and after the character
            ##
            before = source[:position]
            after = source[position+1:]

            ##
            ## try every rule, keep the ones that apply
            ##
            candidates = []
            for rule in self.subs:
                score = rule.subScore(char, before, after)
                replacement = rule.sub(char)
                if score is not None:
                    candidates.append((score, replacement))

            ##
            ## the heaviest rule wins
            ##
            if len(candidates) > 0:
                best = sorted(candidates)[-1][-1]
            else:
                best = self.NO_TRANSLATE

            ## a character translated to nothing is dropped
            if len(best) > 0:
                segments.append(best)

        ##
        ## ipa rules rewrite the space separated output, heaviest first
        ##
        result = " ".join(segments)
        for (weight, rule) in sorted((-rule.weight, rule)
                                     for rule in self.ipasubs):
            result = rule.sub(result)

        return result.split()

    def check(self, cfile):
        """
        Check that words translate as they should.  Returns True if
        they all do (or if there are no words).  Unless loglevel is
        negative, mismatches are printed.

        cfile is an open csv file of any dialect, without headers: the
        word in the first column, its expected translation in the
        second.  Lines starting with "#" are ignored.
        """
        (rows, dialect) = sniff(cfile)

        allGood = True  ## until a statement fails
        for values in csv.reader(rows, dialect=dialect):
            try:
                (word, shouldbe) = (values[0], values[1])
            except Exception:
                errInfo = sys.exc_info()
                allGood = False
                traceback.print_exception(*errInfo)
                print("Error processing verification statement, but resuming processing other statements. Statement details: {}".format(values), file=sys.stderr)
                continue

            ## translate returns a list, compare as spaced text
            translation = " ".join(self.translate(word))
            if self.loglevel > 2:
                print("Does '{}' translate to '{}'?".format(word, shouldbe), file=sys.stderr)

            if shouldbe != translation:
                allGood = False
                if self.loglevel >= 0:
                    print("Verification error, '{}' was translated to '{}', not '{}'"
                          .format(word, translation, shouldbe),
                          file=sys.stderr)

            if self.loglevel > 0:
                print("Word '{} ({})' translated to '{}'".format(word, shouldbe, translation),
                      file=sys.stderr)

        return allGood
251
+
252
def concatenate(*seqs):
    """
    Yield the items of every given iterable, one iterable after the
    other.
    """
    for seq in seqs:
        yield from seq
256
+
257
+
258
def main(argv):
    """
    Command line entry point: translate words to ipa.

    argv is the list of command line arguments, without the program
    name.  Returns an iterable of (word, translation) pairs, the
    translation being space separated segments; it is empty when the
    verification file given with --check does not pass.
    """
    ## the text has to go in description=; the first positional
    ## parameter of ArgumentParser is the program name
    parser = argparse.ArgumentParser(description="Translate words to ipa")

    ##
    ## Specifies the rules used for translation
    ##
    parser.add_argument("-l", "--langrules", dest="langrules",
                        default="es.rules", type=argparse.FileType('r', encoding="utf8"),
                        help="language code rules file")

    ##
    ## Specifies the log level
    ##
    parser.add_argument("-v", "--verbose", dest="loglevel",
                        default=0, type=int,
                        help="Error level specification")

    ##
    ## Specifies a verification file, in any csv format. Headers are
    ## not expected. The first column is supposed to be a word, and
    ## the second is its ideal translation
    ##
    parser.add_argument("-c", "--check", dest="check",
                        default=None, type=argparse.FileType('r', encoding="utf8"),
                        help="file to use for verification")

    ##
    ## Allows to read data from some file (which should not be compressed)
    ##
    parser.add_argument("-r", "--read", dest="read",
                        default=None, type=argparse.FileType('r', encoding="utf8"),
                        help="file used for translation (read up to first space)")

    ##
    ## Any following words would be translated
    ##
    parser.add_argument("words", nargs="*")

    options = vars(parser.parse_args(argv))

    a2ipa = alphabet2ipa(options["langrules"], loglevel=options["loglevel"])

    if options["check"] is not None:
        allGood = a2ipa.check(options["check"])
        if not allGood:
            print("Verification failed, not processing additional data", file=sys.stderr)
            return []

    ##
    ## The word is the first field of each line; commas count as
    ## separators too.  Lines without any field (blank lines) are
    ## skipped, they used to raise IndexError.
    ##
    if options["read"] is not None:
        fieldlists = (line.replace(",", " ").split() for line in options["read"])
        readwords = (fields[0] for fields in fieldlists if fields)
    else:
        readwords = []

    words = concatenate(options["words"], readwords)

    ret = ((word, " ".join(a2ipa.translate(word)))
           for word in words)

    return ret


if __name__ == "__main__":
    for output in main(sys.argv[1:]):
        print("\t".join(output))
Code/translate04.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import re
4
+ import argparse
5
+ import sys
6
+ import csv
7
+ import traceback
8
+ from collections import deque, defaultdict
9
+ from math import inf
10
+
11
+
12
def sniff(filestream):
    """Read a delimited text stream and guess its CSV dialect.

    Lines starting with ``#`` (comments) and blank / whitespace-only lines
    are dropped before the dialect is guessed.

    :param filestream: iterable of text lines (e.g. an open text file).
    :returns: ``(lines, dialect)`` where *lines* is the list of retained
        lines (unchanged, including their line endings) and *dialect* is a
        csv dialect usable with ``csv.reader`` / ``csv.DictReader``.
    """
    ## Lines read from a file keep their trailing newline, so an "empty" line
    ## is never zero-length; test the stripped text instead.
    lines = [line for line in filestream
             if line.strip() and not line.startswith("#")]

    if all("\t" in line for line in lines):
        ## every retained line has a tab: treat the data as tab-separated
        dialect = csv.get_dialect("excel-tab")
    else:
        sample = "\n".join(line.rstrip("\n") for line in lines)
        try:
            dialect = csv.Sniffer().sniff(sample)
        except Exception:
            ## deliberate best effort: fall back to plain comma-separated csv
            print("Could not determine delimiter type, proceedings as excel csv", file=sys.stderr)
            dialect = csv.get_dialect("excel")
    return (lines, dialect)
25
+
26
+
27
class subrule(object):
    """One context-sensitive substitution rule read from a rules file.

    A rule rewrites text matching ``sfrom`` to ``sto``; it applies only when
    the text before the target ends with ``precede`` and the text after it
    starts with ``follow``. ``weight`` ranks competing rules.
    """

    def __init__(self, values, classes):
        """Build the rule from one row of the rules file.

        :param values: mapping with string entries for the keys ``sfrom``,
            ``sto``, ``precede``, ``follow`` and ``weight``.
        :param classes: mapping of class name -> regex fragment, substituted
            for ``{name}`` placeholders found in any of the fields.
        :raises: whatever ``str.format``, ``float`` or ``re.compile`` raise
            for malformed fields (e.g. ``KeyError`` for an unknown class).
        """
        for key in ["sfrom", "sto", "precede", "follow", "weight"]:
            value = values[key]

            ##
            ## handle classes (and subclasses): keep expanding while any
            ## {placeholder} remains, so classes may refer to other classes
            ##
            while re.search(r"\{.*\}", value):
                value = value.format(**classes)
            self.__dict__[key] = value

        self.weight = float(self.weight)
        self.sfrom = re.compile(self.sfrom)
        self.precede = self._anchored(self.precede, "", "$")
        self.follow = self._anchored(self.follow, "^", "")

    @staticmethod
    def _anchored(pattern, prefix, suffix):
        """Compile *pattern* anchored by *prefix*/*suffix* as a single unit.

        The pattern is wrapped in a non-capturing group so that the anchor
        binds to the whole context expression; without the group a top-level
        alternation such as ``a|b`` would become ``a|b$`` and match ``a``
        anywhere. If the grouped form does not compile (e.g. the pattern
        begins with a global inline flag), the plain concatenation is used.
        """
        try:
            return re.compile(prefix + "(?:" + pattern + ")" + suffix)
        except re.error:
            return re.compile(prefix + pattern + suffix)

    def subScore(self, sfrom, precede, follow):
        """Return the rule's weight if it applies to this context, else None.

        :param sfrom: the text being translated (matched from its start).
        :param precede: all text before ``sfrom``.
        :param follow: all text after ``sfrom``.
        """
        if self.sfrom.match(sfrom) and self.precede.search(precede) and self.follow.search(follow):
            return self.weight
        return None

    def sub(self, x):
        """Return *x* with every match of ``sfrom`` replaced by ``sto``."""
        return self.sfrom.sub(self.sto, x)

    def __repr__(self):
        return repr(self.__dict__)

    def __lt__(self, other):
        """Order rules by weight only."""
        if not isinstance(other, subrule):
            ## let Python try the reflected operation / raise TypeError
            return NotImplemented
        return self.weight < other.weight
63
+
64
+
65
class alphabet2ipa(object):
    """Orthography-to-IPA translator driven by a language rules file.

    The rules file is a delimited table with (at least) the columns
    ``type``, ``sfrom`` and ``sto``; ``sub``/``ipasub`` rows also use
    ``weight``, ``precede`` and ``follow``. Recognised rule types:

    * ``pre``    - character-for-character mapping applied before translation
    * ``class``  - named regex fragment usable as ``{name}`` in later rules
    * ``match``  - direct single-character translation (no regex involved)
    * ``sub``    - weighted, context-sensitive per-character substitution
    * ``ipasub`` - substitution applied to the space-separated IPA output
    * ``word``   - whole-word translation (``sto`` split on whitespace)
    """

    classes = None
    subs = None
    ## maps each ipasub rule to the order in which it was read, so that rules
    ## of equal weight are applied in file order rather than set order
    _rule_seq = None

    def __init__(self, langrules, missing="@", loglevel=0):
        """Load all rules from *langrules*.

        :param langrules: open text stream of the rules file; it is used as
            a context manager and therefore closed once the rules are read.
        :param missing: symbol emitted for a character no ``sub`` rule covers.
        :param loglevel: verbosity; ``> 1`` logs each rule added, ``> 2``
            also logs each rule as it is read (all logging goes to stderr).

        A rule that cannot be processed is reported on stderr (with a
        traceback) and skipped; loading continues with the next rule.
        """
        self.classes = dict()
        self.subs = set()
        self.ipasubs = set()
        self.words = dict()
        self.matches = dict()
        self.pre = str.maketrans("", "")
        self.NO_TRANSLATE = missing
        self.loglevel = loglevel
        self._rule_seq = dict()

        with langrules as csvsource:
            (csvsource, dialect) = sniff(csvsource)
            rules = csv.DictReader(csvsource, dialect=dialect)
            for rule in rules:
                if self.loglevel > 2:
                    print("Rule found:", rule, file=sys.stderr)

                try:
                    rtype = rule["type"]

                    ## Pre equivalences
                    if rtype == "pre":
                        self.pre = str.maketrans(rule["sfrom"], rule["sto"])

                    ## Deal with classes
                    elif rtype == "class":
                        self.classes[rule["sfrom"]] = rule["sto"]

                    ## Deal with match rules (classes expanded in the target)
                    elif rtype == "match":
                        value = rule["sto"]
                        while re.search(r"\{.*\}", value):
                            value = value.format(**self.classes)
                        self.matches[rule["sfrom"]] = value

                    ## Deal with sub rules
                    elif rtype == "sub":
                        self.subs.add(subrule(rule, self.classes))

                    ## Deal with IPA sub rules (remember the file order)
                    elif rtype == "ipasub":
                        newrule = subrule(rule, self.classes)
                        self.ipasubs.add(newrule)
                        self._rule_seq[newrule] = len(self._rule_seq)

                    ## Deal with whole word substitutions
                    elif rtype == "word":
                        self.words[rule["sfrom"]] = rule["sto"].split()

                    ## No such rule
                    else:
                        print("Unrecognized rule type ({type}), with sfrom={sfrom}, and sto={sto}".format(**rule), file=sys.stderr)
                        continue

                except Exception:
                    ## deliberate best effort: report the bad rule and go on
                    traceback.print_exception(*sys.exc_info())
                    print("Error processing rule, but resuming processing other rules. Rule details: {}".format(rule), file=sys.stderr)
                    continue

                if self.loglevel > 1:
                    print("Rule added:", rule, file=sys.stderr)

    def translate(self, source):
        """Translate one word and return its IPA symbols as a list of strings.

        Steps: (1) a ``word`` rule for the exact input wins outright;
        (2) the input is mapped through the ``pre`` table and lowercased;
        (3) each character is translated by its ``match`` rule if any,
        otherwise by the applicable ``sub`` rule with the highest weight
        (``NO_TRANSLATE`` if none applies; empty translations are dropped);
        (4) the translations are joined with spaces and every ``ipasub``
        rule is applied, highest weight first, equal weights in file order;
        (5) the result is split on whitespace.
        """
        ## fully translated words
        if source in self.words:
            return self.words[source]

        ## preprocess using pre, and turn to lowercase
        source = source.translate(self.pre).lower()

        ## "." does not match a newline, so build the context from this same
        ## list to keep indices and context aligned
        sourceList = re.findall(".", source)
        targetList = []
        for (sx, sfrom) in enumerate(sourceList):

            if sfrom in self.matches:
                ## match rule: translate "as is" (skips costly regexes)
                translation = self.matches[sfrom]
            else:
                precede = "".join(sourceList[:sx])
                follow = "".join(sourceList[sx + 1:])

                ## collect (weight, translation) for the rules that apply
                candidates = []
                for rule in self.subs:
                    score = rule.subScore(sfrom, precede, follow)
                    if score is not None:
                        candidates.append((score, rule.sub(sfrom)))

                ## best = highest weight; ties go to the greatest string
                translation = max(candidates)[1] if candidates else self.NO_TRANSLATE

            if len(translation) == 0:
                continue

            targetList.append(translation)

        targetString = " ".join(targetList)

        seq = self._rule_seq or {}
        ordered = sorted(self.ipasubs,
                         key=lambda rule: (-rule.weight, seq.get(rule, inf)))
        for rule in ordered:
            targetString = rule.sub(targetString)

        return list(targetString.split())

    def check(self, cfile):
        """Verify translations against a file of ``word,expected`` rows.

        :param cfile: open text stream in any delimited format, without a
            header; column 1 is the word, column 2 its expected translation
            (symbols separated by single spaces). Comment lines are excluded
            by ``sniff``; rows with no content are ignored.
        :returns: True if every row translates as expected (or there are no
            rows), False otherwise. Unless ``loglevel`` is negative,
            mismatches are printed to stderr.
        """
        (csvsource, dialect) = sniff(cfile)
        data = csv.reader(csvsource, dialect=dialect)

        allGood = True  ## default is that it's all good
        for values in data:
            ## a blank row is not a verification statement
            if not any(field.strip() for field in values):
                continue

            try:
                word = values[0]
                shouldbe = values[1]
            except IndexError:
                allGood = False
                traceback.print_exception(*sys.exc_info())
                print("Error processing verification statement, but resuming processing other statements. Statement details: {}".format(values), file=sys.stderr)
                continue

            ## translate returns a list; compare in space-separated form
            translation = " ".join(self.translate(word))
            if self.loglevel > 2:
                print("Does '{}' translate to '{}'?".format(word, shouldbe), file=sys.stderr)

            if not shouldbe == translation:
                allGood = False
                if self.loglevel >= 0:
                    print("Verification error, '{}' was translated to '{}', not '{}'"
                          .format(word, translation, shouldbe),
                          file=sys.stderr)

            if self.loglevel > 0:
                print("Word '{} ({})' translated to '{}'".format(word, shouldbe, translation),
                      file=sys.stderr)

        return allGood
272
+
273
def concatenate(*seqs):
    """Lazily yield the items of each iterable in *seqs*, one iterable after another."""
    for chunk in seqs:
        yield from chunk
277
+
278
+
279
def main(argv):
    """Command-line driver: translate words to IPA.

    :param argv: argument list (without the program name).
    :returns: an iterable of ``(word, transcription)`` pairs, where the
        transcription is the space-separated IPA string; an empty list if a
        verification file was given and verification failed.
    """
    parser = argparse.ArgumentParser("Translate words to ipa")

    ##
    ## Specifies the rules used for translation
    ##
    parser.add_argument("-l", "--langrules", dest="langrules",
                        default="es.rules", type=argparse.FileType('r', encoding="utf8"),
                        help="language code rules file")

    ##
    ## Specifies the log level
    ##
    parser.add_argument("-v", "--verbose", dest="loglevel",
                        default=0, type=int,
                        help="Error level specification")

    ##
    ## Specifies a verification file, in any csv format. Headers are
    ## not expected. The first column is supposed to be a word, and
    ## the second is its ideal translation
    ##
    parser.add_argument("-c", "--check", dest="check",
                        default=None, type=argparse.FileType('r', encoding="utf8"),
                        help="file to use for verification")

    ##
    ## Allows reading data from some file (which should not be compressed)
    ##
    parser.add_argument("-r", "--read", dest="read",
                        default=None, type=argparse.FileType('r', encoding="utf8"),
                        help="file used for translation (read up to first space)")

    ##
    ## Any following words would be translated
    ##
    parser.add_argument("words", nargs="*")

    options = vars(parser.parse_args(argv))

    a2ipa = alphabet2ipa(options["langrules"], loglevel=options["loglevel"])

    if options.get("check") is not None:
        allGood = a2ipa.check(options["check"])
        if not allGood:
            print("Verification failed, not processing additional data", file=sys.stderr)
            return []

    if options.get("read") is not None:
        ## the word is the first comma/whitespace-separated field of a line;
        ## lines with no fields (e.g. blank lines) are skipped
        fieldrows = (line.replace(",", " ").split() for line in options["read"])
        readwords = (fields[0] for fields in fieldrows if fields)
    else:
        readwords = []

    ## command-line words first, then the words read from file (lazily)
    words = concatenate(options["words"], readwords)

    return ((word, " ".join(a2ipa.translate(word)))
            for word in words)
341
+
342
+
343
+
344
if __name__ == "__main__":
    # Script entry point: print each (word, transcription) pair produced by
    # main() as one tab-separated line on stdout.
    for pair in main(sys.argv[1:]):
        print(*pair, sep="\t")
Data/README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Data
2
+
3
+ Each language folder found in this directory (and in the `./_compromised` folder) contains the files listed below. I have linked the relevant sections within the manual for more detail.
4
+
5
+ * An `.Rmd` (and corresponding `.html`) file that outlines the language specific [description](https://cohenpr-xpf.github.io/XPF/manual/xpf_manual.pdf#Language%20Descriptions).
6
+ * A `.rules` file that contains the computational [grammar](https://cohenpr-xpf.github.io/XPF/manual/xpf_manual.pdf#Language%20Grammars) needed to translate language specific orthographic characters to their phonemes.
7
+ * A `.verify.csv` file that contains language specific sample words and their translations used to [verify](https://cohenpr-xpf.github.io/XPF/manual/xpf_manual.pdf#Grammar%20Verification) and confirm the validity of the `.rules` file.
8
+ * A `.bib` file that contains the sources referenced within the `.Rmd`.
9
+
10
+ The `langs_list.tsv` file is a consolidation of relevant language identifiers and directory paths to make for more efficient analyses.
Data/_compromised/acr_RabinalAchi'/acr.Rmd ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: "Rabinal Achi'"
3
+ author: "Emily Strand"
4
+ bibliography: acr.bib
5
+ output: html_document
6
+ ---
7
+
8
+ Last Updated: 2020-03-30
9
+
10
+ **SLIGHTLY COMPROMISED: suspect marking of vowel length**
11
+
12
+ # Background
13
+
14
+ **Language Family:** Mayan / Quichean-Mamean / Greater Quichean / Quichean / Quiche-Achi
15
+
16
+ * Rabinal Achi' is spoken in Guatemala. It is one of two dialects of Achi. The other dialect is known as Cubulco.
17
+
18
+ # Phonology
19
+
20
+ ## Consonants
21
+
22
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
23
+
24
+ library(dplyr)
25
+ library(knitr)
26
+ library(kableExtra)
27
+
28
+
29
+ consonants <- read.table(textConnection('
30
+ "Manner of Articulation" Labial Alveolar Postalveolar Palatal Velar Uvular Glottal
31
+ Stops "p ɓ" "t tʼ" "" "" "k kʼ" "q qʼ" "ʔ"
32
+ Affricates "" "ts tsʼ" "tʃ tʃʼ" "" "" "" ""
33
+ Fricatives "" "s" "ʃ" "" "" "" "h"
34
+ Nasals "m" "n" "" "" "" "" ""
35
+ Flaps "" "ɾ" "" "" "" "" ""
36
+ Approximants "w" "l" "" "j" "" "" ""
37
+ '), TRUE)
38
+
39
+ kable(consonants, col.names = c("Manner of Articulation", "Labial", "Alveolar", "Postalveolar", "Palatal", "Velar", "Uvular", "Glottal"), align = 'c') %>%
40
+ kable_styling("bordered") %>%
41
+ add_header_above(c("", "Place of Articulation" = 7)) %>%
42
+ column_spec(1, bold = TRUE) %>%
43
+ footnote(general = "Note: For phonemes that share a cell, those on the left are plain and those on the right are ejectives. The labial stop on the right is an implosive.", general_title = "")
44
+ ```
45
+
46
+ ## Vowels
47
+
48
+ * Vowel length is contrastive in Rabinal Achi' [@Pellicer2005, p. 15]. Long vowels are indicated by duplicate vowel graphemes.
49
+ - All the literature pertaining to the language suggests the marking of vowel length; however, the output of the Crúbadán corpus (only 33 accounts reflecting vowel length) makes this suspect.
50
+ - Unlike the languages in the Western branch of Proto-Mayan that have neutralized vowel length, languages in the Eastern branch (including Achi) have not [@England2017].
51
+ - I have chosen to categorize this language as compromised, given the suspect nature of the vowel length marking.
52
+
53
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
54
+
55
+ vowels <- read.table(textConnection('
56
+ Front Central Back
57
+ High "i" "" "u"
58
+ Mid "e" "" "o"
59
+ Low "" "a" ""
60
+ '), TRUE)
61
+
62
+ kable(vowels, align = 'c') %>%
63
+ kable_styling("bordered") %>%
64
+ column_spec(1, bold = TRUE)
65
+ ```
66
+
67
+ # Alphabet
68
+
69
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
70
+
71
+ alphabet <- read.table(textConnection('
72
+ Grapheme Phoneme
73
+ "a" "/a/"
74
+ "b; b\'" "/ɓ/"
75
+ "e" "/e/"
76
+ "i" "/i/"
77
+ "j" "/h/"
78
+ "k" "/k/"
79
+ "k\'" "/kʼ/"
80
+ "l" "/l/"
81
+ "m" "/m/"
82
+ "n" "/n/"
83
+ "o" "/o/"
84
+ "p" "/p/"
85
+ "q" "/q/"
86
+ "q\'" "/qʼ/"
87
+ "r" "/ɾ/"
88
+ "s" "/s/"
89
+ "t" "/t/"
90
+ "t\'" "/tʼ/"
91
+ "u" "/u/"
92
+ "w" "/w/"
93
+ "x" "/ʃ/"
94
+ "y" "/j/"
95
+ "\'" "/ʔ/"
96
+ **Digraph** ""
97
+ "ch" "/tʃ/"
98
+ "ch\'" "/tʃʼ/"
99
+ "tz" "/ts/"
100
+ "tz\'" "/tsʼ/"
101
+ '), TRUE)
102
+
103
+ kable(alphabet, align = 'c') %>%
104
+ kable_styling("bordered")
105
+ ```
106
+
107
+ # Misc. Rules
108
+
109
+ * Voiceless stops are aspirated word-finally and following consonants [@Solares2008, pp. 18-20].
110
+ * Phonetic glottal stops precede vowels in word-initial positions [@Solares2008, p. 21].
111
+
112
+ # References
Data/_compromised/acr_RabinalAchi'/acr.bib ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @Misc{Brawand1963,
2
+ author = {John Brawand and Alice Brawand},
3
+ title = {El alfabeto achí, dialecto de Rabinal},
4
+ howpublished = {SIL},
5
+ year = {1963},
6
+ }
7
+
8
+ @Book{Pellicer2005,
9
+ title = {Los significados de la música: La marimba maya achí de Guatemala},
10
+ publisher = {Centro de Investigaciones y Estudios Superiores en Antropología Social},
11
+ year = {2005},
12
+ author = {Sergio Navarrete Pellicer},
13
+ }
14
+
15
+ @Misc{Solares2008,
16
+ author = {Emilsa Solares},
17
+ title = {Gramática del idioma Achi},
18
+ month = oct,
19
+ year = {2008},
20
+ }
21
+
22
+ @InBook{England2017,
23
+ chapter = {Chapter 7: Phonology and phonetics},
24
+ title = {The Mayan Languages},
25
+ publisher = {Routledge},
26
+ year = {2017},
27
+ author = {Nora C. England and Brandon O. Baird},
28
+ editor = {Judith Aissen and Nora C. England and Roberto Zavala Maldonado},
29
+ }
Data/_compromised/acr_RabinalAchi'/acr.html ADDED
The diff for this file is too large to render. See raw diff
 
Data/_compromised/acr_RabinalAchi'/acr.rules ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Rabinal Achi' Rule Set
2
+ # Written by: Emily
3
+ # Last Updated: 2020-03-30
4
+ type,sfrom,sto,weight,precede,follow,comment
5
+ # Class Rules
6
+ class,passthrough,[aeiklmnopqstuw],,,,
7
+ class,apostrophe,['ꞌˈ‘’],,,,
8
+ class,eject,(k|q|t|tʃ|ts),,,
9
+ class,vowels,[aeiou],,,,
10
+ # Individual Letters
11
+ sub,b,ɓ,2,,,
12
+ sub,b,ɓ,3,,{apostrophe},
13
+ sub,{apostrophe},,3,b,,"clean-up",
14
+ sub,j,h,2,,,
15
+ sub,r,ɾ,2,,,
16
+ sub,x,ʃ,2,,,
17
+ sub,y,j,2,,,
18
+ sub,{apostrophe},ʔ,2,,,
19
+ sub,({passthrough}),\1,2,,,
20
+ # Digraphs
21
+ sub,c,tʃ,3,,h,
22
+ sub,h,,3,c,,"clean-up",
23
+ sub,t,ts,3,,z,
24
+ sub,z,,3,t,,"clean-up",
25
+ # Ejectives
26
+ ipasub,({eject}) ʔ,\1ʼ,2,,,
27
+ # Vowel Length
28
+ ipasub,({vowels}) \1,\1 ː,2,,,
Data/_compromised/acr_RabinalAchi'/acr.verify.csv ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ beexex,ɓ e ː ʃ e ʃ,
2
+ eyaj,e j a h,
3
+ iis,i ː s,
4
+ k'ij,kʼ i h,
5
+ me's,m e ʔ s,
6
+ no's,n o ʔ s,
7
+ ok',o kʼ,
8
+ ooj,o ː h,
9
+ pix,p i ʃ,
10
+ quej,q u e h,
11
+ q'uel,qʼ u e l,
12
+ rakana',ɾ a k a n a ʔ,
13
+ sutz',s u tsʼ,
14
+ t'ot',tʼ o tʼ,
15
+ tzi,ts i,
16
+ tz'i',tsʼ i ʔ,
17
+ wuch',w u tʃʼ,
18
+ xan,ʃ a n,
19
+ ya',j a ʔ,
20
+ che',tʃ e ʔ,
Data/_compromised/ake_Akawaio/ake.Rmd ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: "Akawaio"
3
+ author: "Emily Strand"
4
+ bibliography: ake.bib
5
+ output: html_document
6
+ ---
7
+
8
+ Last Updated: 2020-06-26
9
+
10
+ **COMPROMISED: graphemes normally representing voiced stops and /z/ are not present in the orthography - ambiguity when voiceless stops and /s/ transcribe to voiced counterparts (underlying vs. surface level)**
11
+
12
+ # Background
13
+
14
+ **Language Family:** Carib / Northern / East-West Guiana / Macushi-Kapon / Kapon
15
+
16
+ * Akawaio is spoken in Guyana.
17
+
18
+ # Phonology
19
+
20
+ * Although voiced consonants are said to be originally conditioned (voiceless to voiced) intervocalically or between vowels and nasals, they are considered phonemes of the language [@Gildea2012, p. 450; @Edwards1978, p. 78].
21
+
22
+ ## Consonants
23
+
24
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
25
+
26
+ library(dplyr)
27
+ library(knitr)
28
+ library(kableExtra)
29
+
30
+
31
+ consonants <- read.table(textConnection('
32
+ "Manner of Articulation" Labial Alveolar Palatal Velar
33
+ Stops "p b" "t d" "" "k ɡ"
34
+ Fricatives "" "s z" "" ""
35
+ Nasals "m" "n" "" ""
36
+ Flaps "" "ɾ" "" ""
37
+ Approximants "w" "" "j" ""
38
+ '), TRUE)
39
+
40
+ kable(consonants, col.names = c("Manner of Articulation", "Labial", "Alveolar", "Palatal", "Velar"), align = 'c') %>%
41
+ kable_styling("bordered") %>%
42
+ add_header_above(c("", "Place of Articulation" = 4)) %>%
43
+ column_spec(1, bold = TRUE) %>%
44
+ footnote(general = "Note: For phonemes that share a cell, those on the left are voiceless and those on the right are voiced.", general_title = "")
45
+ ```
46
+
47
+ ## Vowels
48
+
49
+ * Instead of using /ʌ/ to represent the mid-central unrounded vowel as in @Edwards1978, I have opted for /ɘ/ (p. 77). /ə/ is not a suitable option given that all underlying vowels in Akawaio occur in stressed syllables.
50
+ * Diphthongs are present in Akawaio; however, because they are often interpreted as being separated by off-glides, or part of separate syllables, I have chosen not to include them below (ibid).
51
+
52
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
53
+
54
+ vowels <- read.table(textConnection('
55
+ Front Central Back
56
+ High "i" "ɨ" "u"
57
+ Mid "e" "ɘ" "o"
58
+ Low "" "a" ""
59
+ '), TRUE)
60
+
61
+ kable(vowels, align = 'c') %>%
62
+ kable_styling("bordered") %>%
63
+ column_spec(1, bold = TRUE)
64
+
65
+ ```
66
+
67
+ # Alphabet
68
+
69
+ * Apostrophes indicate glottal stops [@Stegeman2014]; however, the glottal stop is not phonemic (see Lenition Rules below). It will be transcribed to /k/.
70
+ * The orthography doesn't represent the voiced stops [@Stegeman2014, p. 2], and it is unclear when the voiceless stops should transcribe to the voiced stops (i.e. there is too much ambiguity around whether the resulting (voiced consonant) transcription is underlying or a surface representation - see Lenition Rules below), thus the language is compromised. I have chosen to transcribe ⟨p⟩, ⟨t⟩, ⟨k⟩, and ⟨s⟩ to their voiceless variants by default given their originating status in the language, but this means that /b/, /d/, /ɡ/, and /z/ are not represented by the rules.
71
+
72
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
73
+
74
+ alphabet <- read.table(textConnection('
75
+ Grapheme Phoneme
76
+ "a" "/a/"
77
+ "e" "/e/"
78
+ "i" "/i/"
79
+ "ɨ" "/ɨ/"
80
+ "k" "/k/"
81
+ "m" "/m/"
82
+ "n" "/n/"
83
+ "o" "/o/"
84
+ "p" "/p/"
85
+ "r" "/ɾ/"
86
+ "s" "/s/"
87
+ "t" "/t/"
88
+ "u" "/u/"
89
+ "ʉ" "/ɘ/"
90
+ "w" "/w/"
91
+ "y" "/j/"
92
+ "\'" "/k/"
93
+ '), TRUE)
94
+
95
+ kable(alphabet, align = 'c') %>%
96
+ kable_styling("bordered")
97
+ ```
98
+
99
+ # Lenition Rules
100
+
101
+ * /k/ debuccalizes to [ʔ] syllable-finally [@Edwards1978, p. 79].
102
+ * Voiceless stops and /s/ become voiced intervocalically [@Edwards1978, p. 81].
103
+ * Voiced stops optionally spirantize [@Gildea2012, p. 450].
104
+ * Unstressed vowels are often deleted, especially word-initially [@Edwards1978, p. 82].
105
+
106
+
107
+ # Misc. Rules
108
+
109
+ * /n/ is realized as [ŋ] word-finally and syllable-finally, preceding velar consonants and /w/ [@Edwards1978, p. 79].
110
+ * /k/ becomes palatalized or labialized when preceding /i/ or /u/, respectively (ibid.).
111
+ * /s/ and /z/ are realized as [tʃ] and [dʒ], respectively, preceding /i/ (ibid.).
112
+
113
+ # References
Data/_compromised/ake_Akawaio/ake.bib ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @book{Stegeman2014,
2
+ author = {Ray Stegeman and Rita Hunter},
3
+ pages = {207},
4
+ publisher = {SIL International},
5
+ title = {Akawaio-English Dictionary},
6
+ year = {2014}
7
+ }
8
+
9
+ @article{Edwards1978,
10
+ address = {Bloomington},
11
+ author = {Walter F. Edwards},
12
+ journal = {Anthropological Linguistics},
13
+ number = {2},
14
+ pages = {77-84},
15
+ title = {Some synchronic and diachronic aspects of Akawaio phonology},
16
+ volume = {20},
17
+ year = {1978}
18
+ }
19
+
20
+ @article{Edwards1078_sketch,
21
+ author = {Edwards, Walter F.},
22
+ title = {A Preliminary Sketch of Arekuna (Carib) Phonology},
23
+ journal = {International Journal of American Linguistics},
24
+ volume = {44},
25
+ number = {3},
26
+ pages = {223-227},
27
+ year = {1978}
28
+ }
29
+
30
+ @InBook{Gildea2012,
31
+ chapter = {Linguistic studies in the Cariban family},
32
+ pages = {441-494},
33
+ title = {The Indigenous Languages of South America: A Comprehensive Guide (The World of Linguistics)},
34
+ publisher = {Mouton De Gruyter},
35
+ year = {2012},
36
+ author = {Spike Gildea},
37
+ editor = {Lyle Campbell and Verónica Grondona},
38
+ isbn = {978-3-11-025513-3},
39
+ url = {https://www.amazon.com/Indigenous-Languages-South-America-Comprehensive/dp/3110255138?SubscriptionId=AKIAIOBINVZYXZQZ2U3A&tag=chimbori05-20&linkCode=xm2&camp=2025&creative=165953&creativeASIN=3110255138},
40
+ }
Data/_compromised/ake_Akawaio/ake.html ADDED
The diff for this file is too large to render. See raw diff
 
Data/_compromised/ake_Akawaio/ake.rules ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Akawaio Rule Set
2
+ # Written by: Emily Strand
3
+ # Last updated: 2020-02-04
4
+ type,sfrom,sto,weight,precede,follow,comment
5
+ # Class Rules
6
+ class,passthrough,[aeiɨkmnopstuw],,,,
7
+ class,apostrophe,['ꞌˈ‘’],,,,
8
+ # Individual Letters
9
+ sub,r,ɾ,2,,,
10
+ sub,ʉ,ɘ,2,,,
11
+ sub,y,j,2,,,
12
+ sub,{apostrophe},k,2,,,
13
+ sub,({passthrough}),\1,2,,,
Data/_compromised/ake_Akawaio/ake.verify.csv ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ɨkena',ɨ k e n a k,
2
+ ɨkʉi,ɨ k ɘ i,
3
+ ɨ'se'ne',ɨ k s e k n e k,
4
+ musun,m u s u n,
5
+ nari',n a ɾ i k,
6
+ nau,n a u,
7
+ nawon,n a w o n,
8
+ nɨpontɨ,n ɨ p o n t ɨ,
9
+ nonka,n o n k a,
10
+ piyau,p i j a u,
11
+ pɨ're,p ɨ k ɾ e,
12
+ pe,p e,
13
+ uya,u j a,
14
+ rɨ,ɾ ɨ,
15
+ kon,k o n,
16
+ mɨrɨ,m ɨ ɾ ɨ,
17
+ pʉra,p ɘ ɾ a,
18
+ to,t o,
19
+ awonsi'kɨ,a w o n s i k k ɨ,
20
+ amʉ,a m ɘ,
21
+ iya,i j a,
22
+ yau,j a u,
23
+ ton,t o n,
24
+ ta'pʉ,t a k p ɘ,
25
+ esi,e s i,
26
+ iyesi,i j e s i,
27
+ nɨ,n ɨ,
28
+ a'tai,a k t a i,
29
+ pen,p e n,
30
+ mɨrɨpan,m ɨ ɾ ɨ p a n,
Data/_compromised/amp_Alamblak/amp.Rmd ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: "Alamblak"
3
+ author: "Abi Creighton"
4
+ bibliography: amp.bib
5
+ output: html_document
6
+ ---
7
+
8
+ Last Updated: 2020-08-11
9
+
10
+ **COMPROMISED: conflation between /ɘ/ and /o/**
11
+
12
+ # Background
13
+
14
+ **Language Family:** Sepik-Ramu / Sepik / Sepik Hill / Alamblak
15
+
16
+ * Alamblak is spoken in the Angoram District of the East Sepik Province in Papua New Guinea.
17
+
18
+ # Phonology
19
+
20
+ ## Consonants
21
+
22
+ * The phonemic status of the palato-alveolar (or postalveolar) consonants is somewhat suspect. Apart from the ones I include below (/dʒ/, /ɲ/, and /j/), this includes /tʃ/ and /ʃ/ [@bruce_1984, p. 21]. @bruce_1984 explains that they almost always result from phonological processes imposed on underlying alveolar consonants (p. 29). The exceptions to this, or the idiosyncratic surfacing of the postalveolars is what makes for the lack of consensus. I have ultimately chosen to follow the analysis done by @bruce_1975, which excludes /tʃ/ and /ʃ/ on the basis that they are predictable variants of /s/ [p. 101; @bruce_1984, p. 30]. Occurrences of /tʃ/ and /ʃ/ are interpreted as residual forms that have yet to fully collapse.
23
+
24
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
25
+
26
+ library(dplyr)
27
+ library(knitr)
28
+ library(kableExtra)
29
+
30
+
31
+ consonants <- read.table(textConnection('
32
+ "Manner of Articulation" Bilabial Alveolar Postalveolar Palatal Velar
33
+ Stops "p b" "t d" "" "" "k ɡ"
34
+ Affricates "" "" "dʒ" "" ""
35
+ Fricatives "ɸ" "s" "" "" "x"
36
+ Nasals "m" "n" "" "ɲ" ""
37
+ Flaps "" "ɾ" "" "" ""
38
+ Approximants "w" "" "" "j" ""
39
+ '), TRUE)
40
+
41
+ kable(consonants, col.names = c("Manner of Articulation", "Bilabial", "Alveolar", "Postalveolar", "Palatal", "Velar"), align = 'c') %>%
42
+ kable_styling("bordered") %>%
43
+ add_header_above(c("", "Place of Articulation" = 5)) %>%
44
+ column_spec(1, bold = TRUE) %>%
45
+ footnote(general = "Note: For phonemes that share a cell, those on the left are voiceless and those on the right are voiced", general_title = "")
46
+ ```
47
+
48
+ ## Vowels
49
+
50
+ * There is some uncertainty around the mid central vowel. @edmiston_2003 specify it as /ə/ (p. 3), while @bruce_1984 specifies it as /ë/ (p. 34). Based on the minimal pairs Bruce provides through monosyllabic words, it's evident that this mid vowel may bear stress, thus making it not fully reduced. Schwa, therefore, would be an inappropriate representation of this vowel. However, /ë/ is also not ideal in that it doesn't adequately reflect IPA, so based on Bruce's description of a mid unrounded vowel, I have chosen to use /ɘ/.
51
+ * @bruce_1984 also includes /ɨ/ in Alamblak's vowel inventory, but questions its phonemic status, suggesting that occurrences may be epenthetic (pp. 39, 61). Because @edmiston_2003 make no mention of it, I have chosen not to present it in the vowel inventory below.
52
+ * Diphthongs may occur phonetically [@bruce_1984, p. 55].
53
+
54
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
55
+
56
+ vowels <- read.table(textConnection('
57
+ Front Central Back
58
+ High "i" "" "u"
59
+ Mid "e" "ɘ" "o"
60
+ Low "" "a" ""
61
+ '), TRUE)
62
+
63
+ kable(vowels, align = 'c') %>%
64
+ kable_styling("bordered") %>%
65
+ column_spec(1, bold = TRUE)
66
+
67
+ ```
68
+
69
+ # Alphabet
70
+
71
+ * ⟨o⟩ is used to represent both /o/ and /ɘ/, which compromises the language [@edmiston_2003, p. 1]. ⟨o⟩ transcribes to /o/ by default in the rule set.
72
+
73
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
74
+
75
+ alphabet <- read.table(textConnection('
76
+ Grapheme Phoneme Comment
77
+ "a" "/a/" ""
78
+ "b" "/b/" ""
79
+ "d" "/d/" ""
80
+ "e" "/e/" ""
81
+ "f" "/ɸ/" ""
82
+ "g" "/ɡ/" ""
83
+ "h" "/h/" ""
84
+ "i" "/i/" ""
85
+ "j" "/dʒ/" ""
86
+ "k" "/k/" ""
87
+ "m" "/m/" ""
88
+ "n" "/n/" ""
89
+ "o" "/o/; /ɘ/" "/o/: default in the rules"
90
+ "p" "/p/" ""
91
+ "r" "/ɾ/" ""
92
+ "s" "/s/" ""
93
+ "t" "/t/" ""
94
+ "u" "/u/" ""
95
+ "w" "/w/" ""
96
+ "y" "/j/" ""
97
+ **Digraph** "" ""
98
+ "ny" "/ɲ/" ""
99
+ '), TRUE)
100
+
101
+ kable(alphabet, align = 'c') %>%
102
+ kable_styling("bordered")
103
+ ```
104
+
105
+ # Syllable Structure
106
+
107
+ * Alamblak has three basic syllable structures [@bruce_1984, p. 61]:
108
+
109
+ - C(C)(C)V(C)(C)
110
+ - V(C)(C)
111
+ - CVV(C)
112
+
113
+ # Lenition Rules
114
+
115
+ * /ɸ/ voices to [β] word-medially [@edmiston_2003, p. 4].
116
+ * /x/ voices to [ɣ] word-medially following a voiced phoneme (ibid.).
117
+
118
+ # Misc. Rules
119
+
120
+ * /n/ assimilates to [ŋ] preceding [ɡ] [@edmiston_2003, p. 4].
121
+ * Low vowel dissimilation is present in Alamblak; /a/ raises to [ə] when followed by another syllable containing /a/ [@blevins_2009, p. 479].
122
+ * An epenthetic [j] is inserted between vowel sequences which are not permitted in Alamblak [@bruce_1984, p. 54].
123
+ * An epenthetic [ɨ] may be inserted optionally in consonant clusters [@bruce_1984, pp. 56-57].
124
+
125
+ # References
Data/_compromised/amp_Alamblak/amp.bib ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ @techreport{edmiston_2003,
3
+ title = {Alamblak Organised Phonology Data},
4
+ url = {https://www.sil.org/resources/archives/42279},
5
+ author = {Edmiston, Melenda and Edmiston, Patrick},
6
+ year = {2003}
7
+ }
8
+
9
+ @phdthesis{bruce_1979,
10
+ author = {Bruce Jr., Leslie P.},
11
+ pages = {572},
12
+ school = {Australian National University},
13
+ title = {A Grammar of Alamblak (Papua New Guinea)},
14
+ year = {1979}
15
+ }
16
+
17
+ @book{bruce_1984,
18
+ address = {Canberra},
19
+ author = {Bruce Jr., Leslie P.},
20
+ number = {81},
21
+ pages = {iv+361},
22
+ publisher = {Research School of Pacific and Asian Studies, Australian National University},
23
+ series = {Pacific Linguistics: Series {C}},
24
+ title = {The Alamblak Language of Papua New Guinea (East Sepik)},
25
+ volume = {81},
26
+ year = {1984}
27
+ }
28
+
29
+ @article{blevins_2009,
30
+ title = {Low {Vowel} {Dissimilation} {Outside} of {Oceanic}: {The} {Case} of {Alamblak}},
31
+ volume = {48},
32
+ copyright = {University of Hawai'i Press},
33
+ url = {http://www.jstor.com/stable/40783539},
34
+ number = {2},
35
+ journal = {Oceanic Linguistics},
36
+ author = {Blevins, Juliette},
37
+ month = dec,
38
+ year = {2009},
39
+ pages = {477--483}
40
+ }
41
+
42
+ @article{dye_1968,
43
+ title = {The {Sepik} {Hill} {Languages}: {A} {Preliminary} {Report}},
44
+ volume = {39},
45
+ url = {http://www.jstor.com/stable/40329762},
46
+ number = {2},
47
+ journal = {Oceania},
48
+ author = {Dye, W and Townsend, P and Townsend, W},
49
+ month = dec,
50
+ year = {1968},
51
+ pages = {146--156}
52
+ }
53
+
54
+ @InBook{bruce_1975,
55
+ author = {Bruce Jr., Leslie P.},
56
+ booktitle = {Papers in New Guinea Linguistics No. 18},
57
+ publisher = {Pacific Linguistics, The Australian National University},
58
+ year = {1975},
59
+ editor = {Conrad, R. and Dye, W. and Thomson, N. P. and Bruce Jr., L. P.},
60
+ title = {Alamblak Alveopalatals - Dead Portmanteaus},
61
+ pages = {91-102},
62
+ }
Data/_compromised/amp_Alamblak/amp.html ADDED
The diff for this file is too large to render. See raw diff
 
Data/_compromised/amp_Alamblak/amp.rules ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Alamblak Rule Set
2
+ # Written by Abi Creighton
3
+ # Last updated: 2020-08-11
4
+ type,sfrom,sto,weight,precede,follow,comment
5
+ # Class Rules
6
+ class,passthrough,[abdehikmnopstuw],,,,
7
+ class,punct,['ꞌ‘’-],,,,
8
+ # Individual Letters
9
+ sub,f,ɸ,1,,,
10
+ sub,g,ɡ,1,,,
11
+ sub,j,dʒ,1,,,
12
+ sub,r,ɾ,1,,,
13
+ sub,y,j,1,,,
14
+ sub,({passthrough}),\1,0.1,,,
15
+ # Multigraphs
16
+ sub,n,ɲ,2,,y,
17
+ sub,y,,2,n,,"clean-up"
18
+ # Misc. Rules
19
+ sub,{punct},,1,,,
Data/_compromised/amp_Alamblak/amp.verify.csv ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ nhai,n h a i,"<a>"
2
+ bi,b i,"<b>"
3
+ duka,d u k a,"<d>"
4
+ be,b e,"<e>"
5
+ tfit,t ɸ i t,"<f>"
6
+ hingrneft,h i n ɡ ɾ n e ɸ t,"<g>"
7
+ toh,t o h,"<h>"
8
+ yima,j i m a,"<i>"
9
+ inji,i n dʒ i,"<j>"
10
+ kfo,k ɸ o,"<k>"
11
+ wom,w o m,"<m>"
12
+ na,n a,"<n>"
13
+ mrokfot,m ɾ o k ɸ o t,"<o>"
14
+ rpat,ɾ p a t,"<p>"
15
+ bro,b ɾ o,"<r>"
16
+ fasoh,ɸ a s o h,"<s>"
17
+ tu,t u,"<t>"
18
+ yuhat,j u h a t,"<u>"
19
+ wanyhato,w a ɲ h a t o,"<w>"
20
+ yak,j a k,"<y>"
21
+ hanyhato,h a ɲ h a t o,"<ny>"
22
+ afo,a ɸ o,
23
+ yuhum,j u h u m,
24
+ finji,ɸ i n dʒ i,
25
+ hik,h i k,
26
+ turhu,t u ɾ h u,
27
+ memom,m e m o m,
28
+ yiha,j i h a,
29
+ rim,ɾ i m,
30
+ hingrna,h i n ɡ ɾ n a,
Data/_compromised/aoj_Mufian/aoj.Rmd ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: "Mufian"
3
+ author: "Emily Strand"
4
+ bibliography: aoj.bib
5
+ output: html_document
6
+ ---
7
+
8
+ Last Updated: 2020-01-09
9
+
10
+ **COMPROMISED: conflation among /a/, /æ/, and /ɘ/, between /ɘ/ and /o/, and between /e/ and /ɘ/; ambiguity among long vowels; ambiguity related to whether phonemic (labialized) clusters are always realized as such; ambiguity between word-medial phonetic prenasalized stops and voiced (and voiceless) stops**
11
+
12
+ # Background
13
+
14
+ **Language Family:** Torricelli / Kombio-Arapesh / Arapesh
15
+
16
+ * Mufian is spoken in the East Sepik province of Papua New Guinea.
17
+
18
+ # Phonology
19
+
20
+ ## Consonants
21
+
22
+ * @Conrad1977 do not include the labialized consonants as part of the phonemic inventory, but note that they contrast with their non-labialized counterparts (p. 3). Given this and that other sources [e.g. @Conrad1992; @Conrad1978] include them, I have chosen to include them below.
23
+
24
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
25
+
26
+ library(dplyr)
27
+ library(knitr)
28
+ library(kableExtra)
29
+
30
+
31
+ consonants <- read.table(textConnection('
32
+ "Manner of Articulation" Labial Alveolar Velar Glottal
33
+ Stops "p b" "t d" "k kʷ ɡ ɡʷ" "ʔ ʔʷ"
34
+ Fricatives "f" "s" "" "h"
35
+ Nasals "m" "n" "" ""
36
+ Approximants "w" "l" "" ""
37
+ '), TRUE)
38
+
39
+ kable(consonants, col.names = c("Manner of Articulation", "Labial", "Alveolar", "Velar", "Glottal"), align = 'c') %>%
40
+ kable_styling("bordered") %>%
41
+ add_header_above(c("", "Place of Articulation" = 4)) %>%
42
+ column_spec(1, bold = TRUE) %>%
43
+ footnote(general = "Note: For phonemes that share a cell, those on the left are voiceless and those on the right are voiced. Phonemes that have the diacritic (ʷ) are labialized.", general_title = "")
44
+ ```
45
+
46
+ ## Vowels
47
+
48
+ * @Conrad1992 as well as @Conrad1977 include /ə/ as a phoneme in Mufian (p. 2; p. 3); however, I've opted for /ɘ/ given that @Conrad1978 describe it as a high to mid-central vowel (p. 90).
49
+ * Adjacent vowels are interpreted as sequences [@Conrad1977, pp. 6-7].
50
+ * The documentation for long vowels is ambiguous. @Conrad1977 state that they are rather infrequent, citing length for only /i/, /a/, and /æ/ (contrastive occurrences for only /a/ and /æ/) [@Conrad1977, pp. 9-10, 23]. @Conrad1992 also lists long vowels, but for /a/, /e/, and /æ/ (p. 2); however, they note that long vowels are transcribed just as short vowels are. Because they are rather infrequent and have a low functional load, I have opted not to include them.
51
+
52
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
53
+
54
+ vowels <- read.table(textConnection('
55
+ Front Central Back
56
+ High "i" "" "u"
57
+ Mid "e" "ɘ" "o"
58
+ Low "æ" "a" ""
59
+ '), TRUE)
60
+
61
+ kable(vowels, align = 'c') %>%
62
+ kable_styling("bordered") %>%
63
+ column_spec(1, bold = TRUE)
64
+ ```
65
+
66
+ # Alphabet
67
+
68
+ * The digraphs ⟨mb⟩ and ⟨nd⟩ represent word-medial phonetic prenasalization, so I will transcribe them to /b/ and /d/ respectively [@Conrad1977, p. 6]. The prenasalized /ɡ/ is transcribed as ⟨g⟩, so it requires no correction.
69
+
70
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
71
+
72
+ alphabet <- read.table(textConnection('
73
+ Grapheme Phoneme Comment
74
+ "a" "/a/; /æ/; /ɘ/" "/a/: default in the rules"
75
+ "b" "/b/" ""
76
+ "d" "/d/" ""
77
+ "e" "/e/; /ɘ/" "/e/: default in the rules"
78
+ "f" "/f/" ""
79
+ "g" "/ɡ/" ""
80
+ "h" "/h/" ""
81
+ "i" "/i/" ""
82
+ "k" "/k/" ""
83
+ "l" "/l/" ""
84
+ "m" "/m/" ""
85
+ "n" "/n/" ""
86
+ "o" "/o/; /ɘ/" "/o/: default in the rules"
87
+ "p" "/p/" ""
88
+ "s" "/s/" ""
89
+ "t" "/t/" ""
90
+ "u" "/u/" ""
91
+ "w" "/w/" ""
92
+ "\'" "/ʔ/" ""
93
+ **Digraph** "" ""
94
+ "mb" "/b/" ""
95
+ "nd" "/d/" ""
96
+ "gw" "/ɡʷ/" ""
97
+ "kw" "/kʷ/" ""
98
+ "\'w" "/ʔʷ/" ""
99
+ '), TRUE)
100
+
101
+ kable(alphabet, align = 'c') %>%
102
+ kable_styling("bordered")
103
+ ```
104
+
105
+ # Misc. Rules
106
+
107
+ * Voiceless consonants are aspirated word-finally [@Conrad1977, pp. 12-13].
108
+
109
+ # References
Data/_compromised/aoj_Mufian/aoj.bib ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @incollection{Conrad1978,
2
+ address = {Ukarumpa},
3
+ author = {Conrad, Robert J. and Lukas, Joshua and Alungum, John},
4
+ booktitle = {Miscellaneous papers on Dobu and Arapesh},
5
+ editor = {Richard Loving},
6
+ pages = {89-130},
7
+ publisher = {Summer Institute of Linguistics},
8
+ series = {Workpapers in Papua New Guinea Languages},
9
+ title = {Some Muhiang grammatical notes},
10
+ url = {http://www.sil.org/pacific/png/abstract.asp?id=15292},
11
+ volume = {25},
12
+ year = {1978}
13
+ }
14
+
15
+ @Misc{Conrad1977,
16
+ author = {Conrad, Robert J. and Lukas, Joshua and Alungum, John},
17
+ title = {Preliminary Phonology of Mufian (Southern Arapesh)},
18
+ year = {1977},
19
+ }
20
+
21
+ @Misc{Conrad1992,
22
+ author = {Conrad, Robert J.},
23
+ title = {Mufian Organised Phonology Data},
24
+ howpublished = {SIL},
25
+ month = mar,
26
+ year = {1992},
27
+ }
Data/_compromised/aoj_Mufian/aoj.html ADDED
The diff for this file is too large to render. See raw diff
 
Data/_compromised/aoj_Mufian/aoj.rules ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Mufian Rule Set
2
+ # Written by: Emily
3
+ # Last Updated: 2020-01-09
4
+ type,sfrom,sto,weight,precede,follow,comment
5
+ # Class Rules
6
+ class,apostrophe,['ꞌ‘’],,,,
7
+ class,passthrough,[abdefhiklmnopstuw],,,,
8
+ class,w-preceder,[gk'ꞌ‘’],,,,
9
+ # Individual Letters
10
+ #sub,a,æ,2,,,"conflation among /a/ and /ɘ/ - /a/ is used as default transcription in the passthrough class rule",
11
+ #sub,a,ɘ,2,,,
12
+ #sub,e,ɘ,2,,,"conflation with /e/ - /e/ is used as default transcription in the passthrough class rule",
13
+ #sub,o,ɘ,2,,,"conflation with /o/ - /o/ is used as default transcription in the passthrough class rule",
14
+ sub,g,ɡ,2,,,
15
+ sub,{apostrophe},ʔ,2,,,
16
+ sub,({passthrough}),\1,2,,,
17
+ # Digraphs
18
+ sub,m,b,3,,b,
19
+ sub,b,,3,m,,"clean-up",
20
+ sub,n,d,3,,d,
21
+ sub,d,,3,n,,"clean-up",
22
+ sub,g,ɡʷ,3,,w,
23
+ sub,k,kʷ,3,,w,
24
+ sub,{apostrophe},ʔʷ,3,,w,
25
+ sub,w,,3,{w-preceder},,"clean-up",
Data/_compromised/aoj_Mufian/aoj.verify.csv ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ papi,p a p i,
2
+ ambuta,a b u t a,
3
+ naep,n a e p,
4
+ owamb,o w a b,
5
+ waulu'mana,w a u l u ʔ m a n a,
6
+ tata,t a t a,
7
+ kwa'ah,kʷ a ʔ a h,
8
+ gani,ɡ a n i,
9
+ isag,i s a ɡ,
10
+ dindigina,d i d i ɡ i n a,
11
+ ondop,o d o p,
12
+ wambele'w,w a b e l e ʔʷ,
13
+ gwagwi,ɡʷ a ɡʷ i,
14
+ safe',s a f e ʔ,
15
+ lawah,l a w a h,
16
+ ukup,u k u p,
17
+ ma,m a,
18
+ basef,b a s e f,
19
+ ea',e a ʔ,
20
+ owa',o w a ʔ,
21
+ ina,i n a,
22
+ esis,e s i s,
23
+ waf,w a f,
24
+ na'i,n a ʔ i,
25
+ ipa',i p a ʔ,
26
+ anen,a n e n,
27
+ ae',a e ʔ,
28
+ epes,e p e s,
29
+ si'i,s i ʔ i,
30
+ dei',d e i ʔ,
Data/_compromised/ar_Arabic/ar.Rmd ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: "Arabic"
3
+ author: "Emily Strand"
4
+ bibliography: ar.bib
5
+ output: html_document
6
+ ---
7
+
8
+ Last Updated: 2019-12-05
9
+
10
+ **COMPROMISED: some ambiguity in the transcription of alif; some conflation between /w/ and /uː/ and between /j/ and /iː/**
11
+
12
+ # Background
13
+
14
+ **Language Family:** Afro-Asiatic / Semitic / Central / South / Arabic
15
+
16
+ * Arabic is considered an overarching classification of all the dialectal varieties [@Boudelaa2010, p. 482]. Given this, I have chosen to address the Modern Standard variety, which is predominantly written or used in formal communication. Thus, Arabic exhibits diglossia, where written text (and formal communication) differs from what is actually spoken in everyday life. @Boudelaa2010 explain that Modern Standard Arabic is considered the "high" variety, and all the regional dialects are considered the "low" varieties (p. 482). Because of this distinction, the phonemic inventory for Modern Standard Arabic differs somewhat from those of the regional dialects.
17
+ * Arabic is a widely spoken language with speakers primarily residing in either Asia, the Middle East, or North Africa.
18
+
19
+ # Phonology
20
+
21
+ ## Consonants
22
+
23
+ * Arabic includes what are called emphatic consonants, which are produced when the back, or the root, of the tongue retracts towards the pharynx [@Amayreh1998, p. 643].
24
+
25
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
26
+
27
+ library(dplyr)
28
+ library(knitr)
29
+ library(kableExtra)
30
+
31
+
32
+ consonants <- read.table(textConnection('
33
+ "Manner of Articulation" Labial Dental Alveolar Postalveolar Palatal Velar Uvular Pharyngeal Glottal
34
+ "Stops (plain)" "b" "" "t tˤ d dˤ" "" "" "k" "q" "" "ʔ"
35
+ Affricates "" "" "" "dʒ" "" "" "" "" ""
36
+ Fricatives "f" "θ ð ðˤ" "s sˤ z" "ʃ" "" "x ɣ" "" "ħ ʕ" "h"
37
+ Nasals "m" "" "n" "" "" "" "" "" ""
38
+ Trills "" "" "r" "" "" "" "" "" ""
39
+ Approximants "w" "" "l" "" "j" "" "" "" ""
40
+ '), TRUE)
41
+
42
+ kable(consonants, col.names = c("Manner of Articulation", "Labial", "Dental", "Alveolar", "Postalveolar", "Palatal", "Velar", "Uvular", "Pharyngeal", "Glottal"), align = 'c') %>%
43
+ kable_styling("bordered") %>%
44
+ add_header_above(c("", "Place of Articulation" = 9)) %>%
45
+ footnote("Note: For phonemes that share a cell, those on the left are voiceless and those on the right are voiced. Phonemes that have the diacritic (ˤ) are emphatic.", general_title = "") %>%
46
+ column_spec(1, bold = TRUE)
47
+ ```
48
+
49
+ ## Vowels
50
+
51
+ * Vowel length is contrastive in Arabic [@Amayreh1998, p. 643].
52
+ * /e/ and /o/ exist in spoken varieties of Arabic, but not in Modern Standard Arabic [@Ibrahim2002, p. 323].
53
+
54
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
55
+ consonants <- read.table(textConnection('
56
+ Front Central Back
57
+ High "i" "" "u"
58
+ Low "" "a" ""
59
+ '), TRUE)
60
+
61
+ kable(consonants, align = 'c') %>%
62
+ kable_styling("bordered") %>%
63
+ column_spec(1, bold = TRUE)
64
+
65
+
66
+ diphthongs <- read.table(textConnection('
67
+ Diphthongs
68
+ "/aj/, /aw/"
69
+ '), TRUE)
70
+
71
+ kable(diphthongs, align = 'c') %>%
72
+ kable_styling("bordered")
73
+ ```
74
+
75
+ # Alphabet
76
+
77
+ * Arabic is written from right to left [@Ibrahim2002, p. 323].
78
+ * The majority of the graphemes have different forms depending on where they appear in a word in relation to other graphemes (ibid.). Graphemes may have up to four different forms for the word initial, medial, and final position as well as for the isolated form. The isolated forms for all the graphemes are represented below.
79
+ - Although the diacritics are often seen as orthographically separate from the isolated forms (i.e. the diacritic followed by the base grapheme), some grapheme and diacritic combinations are seen as individual units. For example, the alif ⟨ا⟩ with an overhead hamza ⟨ء⟩, is often represented as ⟨أ⟩ rather than ⟨ ٔا⟩. Other "permanent" grapheme diacritic combinations include: ⟨آ⟩ ,⟨ئ⟩ ,⟨ؤ⟩, and ⟨إ⟩.
80
+ * Long vowels are indicated by specific graphemes; short vowels, however, are indicated by diacritics [@Awde2000, p. 18].
81
+
82
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
83
+
84
+ alphabet <- read.table(textConnection('
85
+ Grapheme Phoneme Comment
86
+ "ا" "/aː/; /ʔ/" "/ʔ/: word-initially (not always marked, which somewhat compromises the language)"
87
+ "ب" "/b/" ""
88
+ "ت" "/t/" ""
89
+ "ث" "/θ/" ""
90
+ "ج" "/dʒ/" ""
91
+ "ح" "/ħ/" ""
92
+ "خ" "/x/" ""
93
+ "د" "/d/" ""
94
+ "ذ" "/ð/" ""
95
+ "ر" "/r/" ""
96
+ "ز" "/z/" ""
97
+ "س" "/s/" ""
98
+ "ش" "/ʃ/" ""
99
+ "ص" "/sˤ/" ""
100
+ "ض" "/dˤ/" ""
101
+ "ط" "/tˤ/" ""
102
+ "ظ" "/ðˤ/" ""
103
+ "ع" "/ʕ/" ""
104
+ "غ" "/ɣ/" ""
105
+ "ف" "/f/" ""
106
+ "ق" "/q/" ""
107
+ "ك" "/k/" ""
108
+ "ل" "/l/" ""
109
+ "م" "/m/" ""
110
+ "ن" "/n/" ""
111
+ "ه" "/h/" ""
112
+ "و" "/w/; /uː/" "/w/: word-initially (used as default in the rules); /uː/: preceded by a short /u/ diacritic"
113
+ "ي" "/j/; /iː/" "/j/: word-initially (used as default in the rules); /iː/: preceded by a short /i/ diacritic"
114
+ "ء" "/ʔ/" "called a hamza, this grapheme also exists as a diacritic (explained below)"
115
+ "ة" "∅; /t/" "called a ta-marbuta, this grapheme appears word-finally, corresponding to /t/ if followed by a diacritic or ∅ otherwise [@Biadsy2009, p. 3]"
116
+ "ى" "/a/" "called an alif-maqsura, this grapheme occurs word-finally [@Habash2010, p. 11; @Biadsy2009, p. 3]"
117
+ **Diacritic** "" ""
118
+ "ُ" "/u/" "this diacritic is called a dammah [@Yurtbasi2016, p. 146]"
119
+ "َ" "/a/" "this diacritic is called a fatḥah (ibid.)"
120
+ "ِ" "/i/" "this diacritic is called a kasrah (ibid.)"
121
+ "ٰ" "/aː/" "this diacritic is called an alif khanjariyah (ibid.)"
122
+ "ٔ" "/ʔ/" "this diacritic is called a hamza, and only appears (as a diacritic) in combination with ⟨ا⟩ ,⟨ي⟩, and ⟨و⟩ [@Habash2010, pp. 5-6]"
123
+ "ٕ" "/ʔi/" ""
124
+ "ٓ" "/ʔ/" "this diacritic is called a madda (a variant of the hamza), appearing in combination with ⟨ا⟩ [@Habash2010, p. 6]"
125
+ "ّ" "" "called a shadda, this diacritic indicates gemination of consonants [@Habash2010, p. 11]"
126
+ "ْ" "" "called a sukun, this diacritic indicates that no vowel follows the consonant to which it\'s attached; it also typically marks syllable boundaries [@Habash2012, p. 712]"
127
+ "ٌ" "/an/" "indicates a word-final /an/ (nunnation) [@Habash2012, p. 713]"
128
+ "ٍ" "/in/" "indicates a word-final /in/ (nunnation) (ibid.)"
129
+ "ً" "/un/" "indicates a word-final /un/ (nunnation) (ibid.)"
130
+ '), TRUE)
131
+
132
+ kable(alphabet, col.names = c("Grapheme", "Phoneme", "Comment"), align = 'c') %>%
133
+ kable_styling("bordered")
134
+ ```
135
+
136
+ # Syllable Structure
137
+
138
+ * Syllables in Modern Standard Arabic tend to have CV or CVC structures; however, CVCC syllables exist word-finally [@Habash2012, p. 712].
139
+ * Word-initial vowels are written as an inflected alif, or a hamza wasl, produced with a glottal stop. This glottal stop vowel is considered an allophone of vowels, so word-initial vowels will be transcribed as just vowels (ibid.).
140
+ - A plain alif may indicate an optional allophonic glottal stop word-initially, but it may also indicate an obligatory glottal stop (noted in the chart above), which results in a compromised language [@Ibrahim2019, p. 293].
141
+
142
+ # Lenition Rules
143
+
144
+ * According to @Amayreh1998 (p. 643):
145
+ - Glottal stops delete word-medially.
146
+ - Emphatic /s/ becomes voiced word-finally.
147
+ - /j/ debuccalizes to a glottal stop word-initially.
148
+ - /dʒ/ may fully spirantize to [ʒ].
149
+ - /q/ may debuccalize to a glottal stop or become a voiced velar or uvular stop.
150
+
151
+ # Misc. Rules
152
+
153
+ * Emphatic consonants tend to affect vowels and consonants around them, resulting in lower back vowels and velarization of consonants [@Saiegh-Haddad2014, p. 5].
154
+
155
+ # References
Data/_compromised/ar_Arabic/ar.bib ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @article{Habash2012,
2
+ author = {Habash, Nizar and Diab, Mona and Rambow, Owen},
3
+ year = {2012},
4
+ month = {01},
5
+ pages = {},
6
+ title = {Conventional Orthography for Dialectal Arabic},
7
+ journal = {Proceedings of the Language Resources and Evaluation Conference (LREC), Istanbul}
8
+ }
9
+
10
+ @Article{Boudelaa2010,
11
+ author = {Boudelaa, Sami and Marslen-Wilson, William D.},
12
+ title = {Aralex: A lexical database for Modern Standard Arabic},
13
+ journal = {Behavior Research Methods},
14
+ year = {2010},
15
+ volume = {42},
16
+ number = {2},
17
+ pages = {481--487},
18
+ month = {may},
19
+ doi = {10.3758/brm.42.2.481},
20
+ publisher = {Springer Science and Business Media {LLC}},
21
+ }
22
+
23
+ @Article{Amayreh1998,
24
+ author = {Amayreh, Mousa M. and Dyson, Alice T.},
25
+ title = {The Acquisition of Arabic Consonants},
26
+ journal = {Journal of Speech, Language, and Hearing Research},
27
+ year = {1998},
28
+ }
29
+
30
+ @Article{Ibrahim2002,
31
+ author = {Ibrahim, Raphiq and Eviatar, Zohar and Aharon-Peretz, Judith},
32
+ title = {The characteristics of Arabic orthography slow its processing.},
33
+ journal = {Neuropsychology},
34
+ year = {2002},
35
+ volume = {16},
36
+ number = {3},
37
+ pages = {322--326},
38
+ doi = {10.1037/0894-4105.16.3.322},
39
+ publisher = {American Psychological Association ({APA})},
40
+ }
41
+
42
+ @Book{Awde2000,
43
+ title = {The Arabic Alphabet: How to Read and Write It},
44
+ publisher = {LYLE STUART},
45
+ year = {2000},
46
+ author = {Awde, N.},
47
+ isbn = {0818404302},
48
+ date = {2000-10-01},
49
+ ean = {9780818404306},
50
+ pagetotal = {95},
51
+ url = {https://www.ebook.de/de/product/3309537/n_awde_the_arabic_alphabet_how_to_read_and_write_it.html},
52
+ }
53
+
54
+ @inproceedings{Biadsy2009,
55
+ author = {Biadsy, Fadi and Habash, Nizar and Hirschberg, Julia},
56
+ title = {Improving the Arabic Pronunciation Dictionary for Phone and Word Recognition with Linguistically-based Pronunciation Rules},
57
+ booktitle = {Proceedings of Human Language Technologies: The 2009 Annual Conference of the North American Chapter of the Association for Computational Linguistics},
58
+ series = {NAACL '09},
59
+ year = {2009},
60
+ isbn = {978-1-932432-41-1},
61
+ location = {Boulder, Colorado},
62
+ pages = {397--405},
63
+ numpages = {9},
64
+ url = {http://dl.acm.org/citation.cfm?id=1620754.1620812},
65
+ acmid = {1620812},
66
+ publisher = {Association for Computational Linguistics},
67
+ address = {Stroudsburg, PA, USA},
68
+ }
69
+
70
+ @Book{Habash2010,
71
+ title = {Introduction to Arabic Natural Language Processing},
72
+ publisher = {Morgan \& Claypool},
73
+ year = {2010},
74
+ author = {Habash, Nizar},
75
+ }
76
+
77
+ @Book{Coulmas2008,
78
+ title = {Writing Systems},
79
+ publisher = {Cambridge University Press},
80
+ year = {2008},
81
+ author = {Coulmas, Florian},
82
+ isbn = {0521787378},
83
+ date = {2008-02-29},
84
+ ean = {9780521787376},
85
+ pagetotal = {292},
86
+ url = {https://www.ebook.de/de/product/3255945/florian_coulmas_writing_systems.html},
87
+ }
88
+
89
+ @Article{Yurtbasi2016,
90
+ author = {Yurtbaşı, Metin},
91
+ title = {Sura Yusuf in Full IPA (Segmental-Suprasegmental) Transcription with English Translation},
92
+ journal = {International Journal of Arts and Humanities and Social Sciences},
93
+ year = {2016},
94
+ }
95
+
96
+ @InCollection{Saiegh-Haddad2014,
97
+ author = {Saiegh-Haddad, Elinor and Henkin-Roitfarb, Roni},
98
+ title = {The Structure of Arabic Language and Orthography},
99
+ booktitle = {Literacy Studies},
100
+ publisher = {Springer Netherlands},
101
+ year = {2014},
102
+ pages = {3--28},
103
+ doi = {10.1007/978-94-017-8545-7_1},
104
+ }
105
+
106
+ @article{Ibrahim2019,
107
+ author = {Ibrahim, Abdulateef},
108
+ year = {2019},
109
+ month = {04},
110
+ pages = {},
111
+ title = {Glottal Stop in Arabic with Reference to English: Phonological and Orthographical Study},
112
+ volume = {( 2016 M- 1437 e)}
113
+ }
Data/_compromised/ar_Arabic/ar.html ADDED
The diff for this file is too large to render. See raw diff
 
Data/_compromised/ar_Arabic/ar.rules ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Arabic Rule Set
2
+ # Written by: Emily
3
+ # Last Updated: 2019-12-04
4
+ type,sfrom,sto,weight,precede,follow,comment
5
+ # Class Rules
6
+ class,consonant,(b|t|θ|dʒ|ħ|x|d|ð|r|z|s|ʃ|sˤ|dˤ|tˤ|ðˤ|ʕ|ɣ|f|q|k|l|m|n|h|w|j|ʔ),,,,
7
+ class,shortV,[ ُِ َ],,,,
8
+ class,shortV-output,[aiu],,,,
9
+ class,diacritic,[ ْٕٔ ًٍّٓ ٌ ِٰ َُ],,,,
10
+ class,hamza-combo,[اوي],,,,
11
+ class,hamza-combo-output,(j|w|a ː),,,,
12
+ class,diphthong-combo,[jw],,,,
13
+ ## Graphemes
14
+ class,alif,[ا],,,,
15
+ class,b,[ب],,,,
16
+ class,t,[ت],,,,
17
+ class,θ,[ث],,,,
18
+ class,dʒ,[ج],,,,
19
+ class,ħ,[ح],,,,
20
+ class,x,[خ],,,,
21
+ class,d,[د],,,,
22
+ class,ð,[ذ],,,,
23
+ class,r,[ر],,,,
24
+ class,z,[ز],,,,
25
+ class,s,[س],,,,
26
+ class,ʃ,[ش],,,,
27
+ class,emph-s,[ص],,,,
28
+ class,emph-d,[ض],,,,
29
+ class,emph-t,[ط],,,,
30
+ class,emph-ð,[ظ],,,,
31
+ class,ʕ,[ع],,,,
32
+ class,ɣ,[غ],,,,
33
+ class,f,[ف],,,,
34
+ class,q,[ق],,,,
35
+ class,k,[ك],,,,
36
+ class,l,[ل],,,,
37
+ class,m,[م],,,,
38
+ class,n,[ن],,,,
39
+ class,h,[ه],,,,
40
+ class,w-uu,[و],,,,
41
+ class,j-ii,[ي],,,,
42
+ class,hamza,[ء],,,,
43
+ class,ta-marbuta,[ة],,,,
44
+ class,alif-maq,[ى],,,,
45
+ ## Diacritics
46
+ class,u,[ُ],,,,
47
+ class,a,[َ],,,,
48
+ class,i,[ِ],,,,
49
+ class,sup-alif,[ٰ],,,,
50
+ class,dia-hamza-above,[ٔ],,,,,
51
+ class,dia-hamza-below,[ٕ],,,,
52
+ class,madda,[ٓ],,,,
53
+ class,shadda,[ّ],,,,
54
+ class,sukun,[ْ],,,,
55
+ class,a-nunnation,[ٌ],,,
56
+ class,i-nunnation,[ٍ],,,
57
+ class,u-nunnation,[ً],,,
58
+ ## Permanent Diacritic Grapheme Combos (some diacritics are not treated as separate from the grapheme)
59
+ class,alif-hamza-above,[أ],,,,
60
+ class,alif-hamza-below,[إ],,,,
61
+ class,w-hamza,[ؤ],,,,
62
+ class,j-hamza,[ئ],,,,
63
+ class,alif-madda,[آ],,,,
64
+ # Sub Rules
65
+ ## Graphemes
66
+ sub,{alif},a ː,2,,,
67
+ sub,{alif},ʔ,3,^,,"word-initial alif is /ʔ/, however, not every word-initial alif is realized as such (primarily the hamza is used) [@Coulmas2008, p. 123] - somewhat compromises the transcription",
68
+ sub,{b},b,2,,,
69
+ sub,{t},t,2,,,
70
+ sub,{θ},θ,2,,,
71
+ sub,{dʒ},dʒ,2,,,
72
+ sub,{ħ},ħ,2,,,
73
+ sub,{x},x,2,,,
74
+ sub,{d},d,2,,,
75
+ sub,{ð},ð,2,,,
76
+ sub,{r},r,2,,,
77
+ sub,{z},z,2,,,
78
+ sub,{s},s,2,,,
79
+ sub,{ʃ},ʃ,2,,,
80
+ sub,{emph-s},sˤ,2,,,
81
+ sub,{emph-d},dˤ,2,,,
82
+ sub,{emph-t},tˤ,2,,,
83
+ sub,{emph-ð},ðˤ,2,,,
84
+ sub,{ʕ},ʕ,2,,,
85
+ sub,{ɣ},ɣ,2,,,
86
+ sub,{f},f,2,,,
87
+ sub,{q},q,2,,,
88
+ sub,{k},k,2,,,
89
+ sub,{l},l,2,,,
90
+ sub,{m},m,2,,,
91
+ sub,{n},n,2,,,
92
+ sub,{h},h,2,,,
93
+ sub,{w-uu},w,2,,,
94
+ sub,{j-ii},j,2,,,
95
+ sub,{hamza},ʔ,2,,,
96
+ sub,{ta-marbuta},t,3,,{diacritic}$,"ta-marbuta occurs word-finally and if followed by a diacritic is recognized as /t/",
97
+ sub,{diacritic},,3,{ta-marbuta},$,"clean-up",
98
+ sub,{ta-marbuta},,2,,$,"ta-marbuta transcribes to nothing when not followed by a diacritic",
99
+ sub,{alif-maq},a,2,,$,"alif-maq occurs word-finally",
100
+ ## Diacritics
101
+ sub,{u},u,2,,,
102
+ sub,{a},a,2,,,
103
+ sub,{i},i,2,,,
104
+ sub,{sup-alif},a ː,2,,,
105
+ sub,{dia-hamza-above},1ʔ,6,{hamza-combo},,"this transcribes the hamza sequences as the consonant followed by a glottal stop, but we need an ipasub rule to make the glottal stop precede the consonant",
106
+ sub,{alif},,4,^,{dia-hamza-above},"alif-hamza (above) only corresponds to glottal stop word-initially, so the alif shouldn't be transcribed",
107
+ sub,{alif},ʔ i,4,,{dia-hamza-below},
108
+ sub,{alif},ʔ,5,^,{dia-hamza-below},"alif-hamza (below) only corresponds to glottal stop word-initially, so the alif shouldn't be transcribed",
109
+ sub,{dia-hamza-below},,2,{alif},,"clean-up",
110
+ sub,{alif},ʔ a ː,3,,{madda},
111
+ sub,{shadda},1ː,2,,,"needed for ipasub gemination rules below",
112
+ sub,{sukun},,2,,,
113
+ sub,{a-nunnation},a n,3,,$,
114
+ sub,{a-nunnation},a n,3,,{diacritic}$,"sometimes followed by a diacritic (shadda) which both apply to the consonant [@Habash2010, p. 11]",
115
+ sub,{i-nunnation},i n,3,,$,
116
+ sub,{i-nunnation},i n,3,,{diacritic}$,"sometimes followed by a diacritic (shadda) which both apply to the consonant (ibid.)",
117
+ sub,{u-nunnation},u n,3,,$,
118
+ sub,{u-nunnation},u n,3,,{diacritic}$,"sometimes followed by a diacritic (shadda) which both apply to the consonant (ibid.)",
119
+ ## Permanent Diacritic Grapheme Combos
120
+ sub,{alif-hamza-above},ʔ a ː,3,,,
121
+ sub,{alif-hamza-above},ʔ,4,^,,"alif-hamza (above) only corresponds to glottal stop word-initially - alif is a place holder",
122
+ sub,{alif-hamza-below},ʔ i,3,,,
123
+ sub,{alif-hamza-below},ʔ,4,^,,"alif-hamza (below) only corresponds to glottal stop word-initially - alif is a place holder",
124
+ sub,{w-hamza},ʔ w,2,,,
125
+ sub,{j-hamza},ʔ j,2,,,
126
+ sub,{alif-madda},ʔ a ː,2,,,
127
+ ## Diphthongs [@Javed, p. 6]
128
+ sub,{a},aw,3,,{w-uu},
129
+ sub,{w-uu},,3,{a},,"clean-up",
130
+ sub,{a},aj,3,,{j-ii},
131
+ sub,{j-ii},,3,{a},,"clean-up",
132
+ # Additional Long Vowel Transcriptions
133
+ sub,{alif},,8,^,{shortV},"word-initial vowels are represented by an inflected alif [@Habash2012, p. 712]",
134
+ sub,{a},a ː,5,,{alif},
135
+ sub,{alif},,4,{a},,"clean-up",
136
+ sub,{a},a ː,5,,{alif-maq},
137
+ sub,{alif-maq},,4,{a},,"clean-up",
138
+ sub,{u},u 2ː,5,,{w-uu},
139
+ sub,{w-uu},,4,{u},,"clean-up",
140
+ sub,{i},i 3ː,5,,{j-ii},
141
+ sub,{j-ii},,4,{i},,"clean-up",
142
+ ## Alif-maqsura
143
+ sub,{alif-maq},j ː,6,,{shortV}{shadda},"alif-maq changes into a yaa if followed by a diacritic (generally a shadda) [@Habash2010, p. 61]",
144
+ sub,{alif-maq},j ː,6,,{shadda},
145
+ sub,{shadda},,6,{alif-maq}{shortV},,
146
+ # ipasub Rules
147
+ ## Consonant Gemination
148
+ ipasub,({consonant}) ({shortV-output}) 1ː,\1 ː \2,3,,,"controls for shadda geminating vowels depending on if the double consonant is also followed by a short vowel",
149
+ ipasub,({consonant}) a 1ː ({diphthong-combo}),\1 ː a\3,4,,,"controls for shadda geminating vowels depending on if the double consonant is also followed by a diphthong",
150
+ ipasub,({consonant}) 1ː,\1 ː,2,,,"this removes the 1 if the consonant is not followed by a short vowel"
151
+ ipasub,({shortV-output}) 2ː ({shortV-output}) 1ː,\1 w ː \2,3,,,"controls for shadda taking priority over elongation of vowels (e.g. Dammah + waw) with following short vowel",
152
+ ipasub,({shortV-output}) 2ː 1ː,\1 w ː,3,,,"controls for shadda taking priority over elongation of vowels (e.g. dammah + waw)",
153
+ ipasub,({shortV-output}) 2ː u n 1ː,\1 w ː u n,4,,,"flips around the nunation and gemination",
154
+ ipasub,({shortV-output}) 2ː a n 1ː,\1 w ː a n,4,,,"flips around the nunation and gemination",
155
+ ipasub,({shortV-output}) 2ː i n 1ː,\1 w ː i n,4,,,"flips around the nunation and gemination",
156
+ ipasub,2ː,ː,2,,,"removes the 2 from the rest of the elongated /u/s",
157
+ ipasub,({shortV-output}) 3ː ({shortV-output}) 1ː,\1 j ː \2,3,,,"controls for shadda taking priority over elongation of vowels (e.g. kasrah + yaa) with following short vowel",
158
+ ipasub,({shortV-output}) 3ː 1ː,\1 j ː,3,,,"controls for shadda taking priority over elongation of vowels (e.g. kasrah + yaa)",
159
+ ipasub,({shortV-output}) 3ː i n 1ː,\1 j ː i n,4,,,"flips around the nunation and gemination",
160
+ ipasub,({shortV-output}) 3ː u n 1ː,\1 j ː u n,4,,,"flips around the nunation and gemination",
161
+ ipasub,({shortV-output}) 3ː a n 1ː,\1 j ː a n,4,,,"flips around the nunation and gemination",
162
+ ipasub,1ː,@,1,,,"rules out illegal combos (shadda appearing with nunnation word-medially over an alif)",
163
+ ipasub,3ː,ː,2,,,"removes the 3 from the rest of the elongated /i/s",
164
+ ## Hamza (glottal stop)
165
+ ipasub,({hamza-combo-output}) 1ʔ,ʔ \1,3,,,"this puts the glottal stop before the character carrying the hamza",
166
+ ipasub,a ː 1ʔ ({shortV-output}),ʔ \1,4,,,"word-medial glottal stops are sometimes represented as alif topped with a hamza and a short vowel diacritic, the alif holds no value here",
167
+ ipasub,ʔ a ː ({shortV-output}),ʔ \1,2,,,"word-medial glottal stops are sometimes represented as alif topped with a hamza and a short vowel diacritic, the alif holds no value here (this is the same rule as the one above but uses the permanent alf-hamza-above character)",
Data/_compromised/ar_Arabic/ar.verify.csv ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ أُكْسِجِين,ʔ u k s i dʒ i ː n,"long /i/ (kasrah + ي)",
2
+ أَقِط,ʔ a q i tˤ,"initial glottal stop with /a/ diacritic",
3
+ أَجْهَدَ,ʔ a dʒ h a d a,"sukun",
4
+ أَجْسَم,ʔ a dʒ s a m,
5
+ أَجْهَل,ʔ a dʒ h a l,
6
+ أُمْدُوحَة,ʔ u m d u ː ħ a,"ta-marbuta without a diacritic",
7
+ أَنْشَط,ʔ a n ʃ a tˤ,"emphatic /t/",
8
+ أَهْدَب,ʔ a h d a b,
9
+ شَاحِنَة,ʃ a ː ħ i n a,
10
+ شَارِع,ʃ a ː r i ʕ,
11
+ دَا,d a ː,"long /a/ (fatha + alif)"
12
+ غاضِب,ɣ a ː dˤ i b,"emphatic /d/",
13
+ غَزَالَة,ɣ a z a ː l a,
14
+ رَزَقَ,r a z a q a,
15
+ رِفَاق,r i f a ː q,
16
+ رَفَأَ,r a f a ʔ a,"word-medial glottal stop (alif-hamza)",
17
+ رُمْح,r u m ħ,
18
+ هَافَانَا,h a ː f a ː n a ː,
19
+ فَاسِد,f a ː s i d,
20
+ فِتْنَة,f i t n a,
21
+ فِرْعَوْن,f i r ʕ aw n,"diphthong /aw/",
22
+ فُرُوغ,f u r u ː ɣ,
23
+ فَظِيع,f a ðˤ i ː ʕ,
24
+ قَاضٍ,q a ː dˤ i n,"/in/ nunnation",
25
+ قَبُوح,q a b u ː ħ,
26
+ قَاطَعَ,q a ː tˤ a ʕ a,
27
+ قَبِيح,q a b i ː ħ,
28
+ قَزّ,q a z ː,"consonant gemination (shadda)",
29
+ وَازَى,w a ː z a ː,"initial /w/",
30
+ ثَلَاثَة,θ a l a ː θ a,
31
+ جَبْخَانَة,dʒ a b x a ː n a,
32
+ جُمَّيْزَة,dʒ u m ː aj z a,
33
+ الْبُخَارِىُّ,ʔ l b u x a ː r i j ː u,
34
+ نَبَاتِيّ,n a b a ː t i j ː,
35
+ عَلِيٍّ,ʕ a l i j ː i n,
36
+ آدَمِيّ,ʔ a ː d a m i j ː,
37
+ آيَسَ,ʔ a ː j a s a,
38
+ ذبابة,ð b a ː b,
39
+ صار,sˤ a ː r,
Data/_compromised/arn_Mapudungun/arn.Rmd ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: "Mapudungun"
3
+ author: "Emily Strand"
4
+ bibliography: arn.bib
5
+ output: html_document
6
+ ---
7
+
8
+ Last updated: 2020-06-26
9
+
10
+ **COMPROMISED: ambiguity due to non-standard alphabet; conflation of dental and alveolar consonants /t̪/ and /t/ (most likely with the others as well)**
11
+
12
+ # Background
13
+
14
+ **Language Family:** Araucanian
15
+
16
+ * Also referred to as Mapuche or Araucana, it is spoken throughout Chile.
17
+
18
+ # Phonology
19
+
20
+ ## Consonants
21
+
22
+ * In some dialects of Mapudungun, the dental and alveolar phonemes have merged [@sadowsky_mapudungun_2013, p. 89], leaving the dentals to appear as a result of allophonic variation.
23
+ * /ʃ/ is rather infrequent, often alternating with /s/, yet it is still considered in most sources to be a phoneme [@smeets_grammar_2008, p. 23].
24
+
25
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
26
+
27
+ library(dplyr)
28
+ library(knitr)
29
+ library(kableExtra)
30
+
31
+
32
+ consonants <- read.table(textConnection('
33
+ "Manner of Articulation" Labial Dental Alveolar Postalveolar Retroflex Palatal Velar
34
+ Stops "p" "t̪" "t" "" "" "" "k"
35
+ Affricates "" "" "" "tʃ" "ʈʂ" "" ""
36
+ Fricatives "f" "θ" "s" "ʃ" "ʐ" "" "ɣ"
37
+ Nasals "m" "n̪" "n" "" "" "ɲ" "ŋ"
38
+ Approximants "" "l̪" "l" "" "" "j ʎ" "w"
39
+ '), TRUE)
40
+
41
+ kable(consonants, col.names = c("Manner of Articulation", "Labial", "Dental", "Alveolar", "Postalveolar", "Retroflex", "Palatal", "Velar"), align = 'c') %>%
42
+ kable_styling("bordered") %>%
43
+ add_header_above(c("", "Place of Articulation" = 7)) %>%
44
+ column_spec(1, bold = TRUE) %>%
45
+ footnote(general = "Note: The palatal approximant on the right is lateral.", general_title = "")
46
+ ```
47
+
48
+ ## Vowels
49
+
50
+ * Diphthongs aren't prevalent in Mapudungun; however, /ae/ is generally realized as one [@smeets_grammar_2008, p. 52]. Because it can also occur as a sequence of vowels, it will not be transcribed in the rules.
51
+
52
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
53
+
54
+ vowels <- read.table(textConnection('
55
+ Front Central Back
56
+ High "i" "" "u"
57
+ Mid "e" "ɘ" "o"
58
+ Low "" "a" ""
59
+ '), TRUE)
60
+
61
+ kable(vowels, align = 'c') %>%
62
+ kable_styling("bordered") %>%
63
+ column_spec(1, bold = TRUE)
64
+
65
+ ```
66
+
67
+ # Alphabet
68
+
69
+ * Although there isn't a standardization of the alphabet, the three alphabets commonly referred to provide a rather comprehensive picture in terms of depicting each phoneme. These alphabets include the Alfabeto Mapuche Unificado, the Grafemario Raguileo, and the Azumchefi [@bronzino_loanword_nodate, p. 22]. Listed below is the Alfabeto Mapuche Unificado, but following it, I have provided the orthographic variations from the other two.
70
+ * Regarding the corresponding phoneme of ⟨g⟩, the Unified Alphabet uses /ɣ/ whereas the Raguileo and Azumchefi alphabets use /ŋ/. I went with the ⟨g⟩ to /ŋ/ correspondence due to the inclusion of ⟨q⟩ in the Crúbadán corpus, which based on the Raguileo and Azumchefi alphabets corresponds to /ɣ/.
71
+ * Based on the Alfabeto Mapuche Unificado, the dentals are not represented in the Crúbadán corpus. With the grapheme correspondences of the other two alphabets, they appear (except /t̪/, as the distinction between it and /t/ is not maintained in either). Based on this, conflation most likely occurs between all alveolar consonants and their dental counterparts, compromising the language.
72
+
73
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
74
+
75
+ alphabet <- read.table(textConnection('
76
+ Grapheme Phoneme Comment
77
+ "a" "/a/" ""
78
+ "d" "/θ/" ""
79
+ "e" "/e/" ""
80
+ "f" "/f/" ""
81
+ "g" "/ŋ/" ""
82
+ "i" "/i/" ""
83
+ "ï" "/ɘ/" ""
84
+ "k" "/k/" ""
85
+ "l" "/l/" ""
86
+ "ḻ" "/l̪/" ""
87
+ "m" "/m/" ""
88
+ "n" "/n/" ""
89
+ "ṉ" "/n̪/" ""
90
+ "ñ" "/ɲ/" ""
91
+ "o" "/o/" ""
92
+ "p" "/p/" ""
93
+ "r" "/ʐ/" ""
94
+ "s" "/s/" ""
95
+ "t" "/t/" ""
96
+ "ṯ" "/t̪/" "not reflected in the Crúbadán corpus"
97
+ "u" "/u/" ""
98
+ "ü" "/ɘ/" ""
99
+ "w" "/w/" ""
100
+ "y" "/j/" ""
101
+ **Digraph** "" ""
102
+ "ch" "/tʃ/" ""
103
+ "tr" "/ʈʂ/" ""
104
+ "sh" "/ʃ/" ""
105
+ "ng" "/ŋ/" ""
106
+ "ll" "/ʎ/" ""
107
+ "**Orthographic Variation**" "" ""
108
+ "c" "/tʃ/" "Raguileo"
109
+ "x" "/ʈʂ/" "Raguileo"
110
+ "tx" "/ʈʂ/" "Azumchefi"
111
+ "z" "/θ/" "Raguileo and Azumchefi"
112
+ "h" "/n̪/" "Raguileo"
113
+ "nh" "/n̪/" "Azumchefi"
114
+ "q" "/ɣ/" "Raguileo and Azumchefi"
115
+ "b" "/l̪/" "Raguileo"
116
+ "lh" "/l̪/" "Azumchefi"
117
+ "j" "/ʎ/" "Raguileo"
118
+ "v" "/ɘ/" "Raguileo"
119
+ '), TRUE)
120
+
121
+ kable(alphabet, align = 'c') %>%
122
+ kable_styling("bordered")
123
+ ```
124
+
125
+ # Lenition Rules
126
+
127
+ * Geminates occur in the language, but they are often realized as single consonants [@smeets_grammar_2008, p. 51].
128
+ * /ʐ/ can approximate to /ɻ/ [@sadowsky_mapudungun_2013, p. 90].
129
+
130
+ # Misc. Rules
131
+
132
+ * Aspiration of some of the stops can occur [@sadowsky_mapudungun_2013, p. 89].
133
+ * Unstressed vowels are often deleted in word-final positions following voiceless consonants [@sadowsky_mapudungun_2013, p. 94].
134
+ * /ɘ/ may be deleted following a stressed syllable (ibid.).
135
+ * Consonant clusters are only allowed intervocalically [@smeets_grammar_2008, pp. 37, 38].
136
+ * The velar phonemes /k/, /ŋ/, /ɣ/ tend to be fronted preceding front vowels [@sadowsky_mapudungun_2013, p. 89].
137
+
138
+
139
+ # References
Data/_compromised/arn_Mapudungun/arn.bib ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ @phdthesis{bronzino_loanword_nodate,
3
+ address = {Bryn Mawr, Pennsylvania},
4
+ title = {Loanword {Adaptation} in {Spanish} and {Mapudungun}: a {Phonological} and {Sociolinguistic} {Analysis}},
5
+ school = {Bryn Mawr College},
6
+ author = {Bronzino, Dana},
7
+ year = {2015},
8
+ month = dec
9
+ }
10
+
11
+ @book{smeets_grammar_2008,
12
+ address = {Berlin ; New York},
13
+ series = {Mouton grammar library},
14
+ title = {A grammar of {Mapuche}},
15
+ isbn = {978-3-11-019558-3},
16
+ number = {41},
17
+ publisher = {Mouton de Gruyter},
18
+ author = {Smeets, Ineke},
19
+ year = {2008},
20
+ keywords = {Phonology, Grammar, Mapuche language, Morphosyntax}
21
+ }
22
+
23
+ @article{sadowsky_mapudungun_2013,
24
+ title = {Mapudungun},
25
+ volume = {43},
26
+ issn = {0025-1003, 1475-3502},
27
+ url = {https://www.cambridge.org/core/product/identifier/S0025100312000369/type/journal_article},
28
+ doi = {10.1017/S0025100312000369},
29
+ language = {en},
30
+ number = {1},
31
+ urldate = {2019-10-02},
32
+ journal = {Journal of the International Phonetic Association},
33
+ author = {Sadowsky, Scott and Painequeo, Héctor and Salamanca, Gastón and Avelino, Heriberto},
34
+ month = apr,
35
+ year = {2013},
36
+ pages = {87--96},
37
+ file = {Full Text:files/216/Sadowsky et al. - 2013 - Mapudungun.pdf:application/pdf}
38
+ }
Data/_compromised/arn_Mapudungun/arn.html ADDED
The diff for this file is too large to render. See raw diff
 
Data/_compromised/arn_Mapudungun/arn.rules ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Mapudungun Rule Set
2
+ # Written by: Emily
3
+ # Last Updated: 2020-04-15
4
+ type,sfrom,sto,weight,precede,follow,comment
5
+ # Class Rules
6
+ class,passthrough,[aefiklmnopstuw],,,,
7
+ class,gem,(p|t̪|t|k|m|n̪|n|ɲ|ŋ|tʃ|ʈʂ|f|θ|s|ʃ|ʐ|ɣ|l̪|l|j|ʎ|w),,,,
8
+ # Individual Letters
9
+ sub,d,θ,2,,,
10
+ sub,g,ŋ,2,,,
11
+ sub,ï,ɘ,2,,,
12
+ sub,ḻ,l̪,2,,,
13
+ sub,ṉ,n̪,2,,,
14
+ sub,ñ,ɲ,2,,,
15
+ sub,r,ʐ,2,,,
16
+ sub,ṯ,t̪,2,,,
17
+ sub,ü,ɘ,2,,,
18
+ sub,y,j,2,,,
19
+ sub,({passthrough}),\1,2,,,
20
+ # Digraphs
21
+ sub,c,tʃ,3,,h,
22
+ sub,h,,3,c,,"clean-up",
23
+ sub,t,ʈʂ,3,,r,
24
+ sub,r,,3,t,,"clean-up",
25
+ sub,s,ʃ,3,,h,
26
+ sub,h,,3,s,,"clean-up",
27
+ sub,n,ŋ,3,,g,
28
+ sub,g,,3,n,,"clean-up",
29
+ sub,l,ʎ,3,,l,
30
+ sub,l,,3,l,,"clean-up",
31
+ # Orthographic Variation
32
+ sub,c,tʃ,2,,,
33
+ sub,x,ʈʂ,2,,,
34
+ sub,t,ʈʂ,3,,x,
35
+ sub,x,,3,t,,"clean-up",
36
+ sub,z,θ,2,,,
37
+ sub,h,n̪,2,,,
38
+ sub,n,n̪,3,,h,
39
+ sub,h,,3,n,,"clean-up",
40
+ sub,q,ɣ,2,,,
41
+ sub,b,l̪,2,,,
42
+ sub,l,l̪,3,,h,
43
+ sub,h,,3,l,,"clean-up",
44
+ sub,j,ʎ,2,,,
45
+ sub,v,ɘ,2,,,
46
+ # Geminates
47
+ ipasub,\b({gem}) \1\b,\1 ː,2,,,
Data/_compromised/arn_Mapudungun/arn.verify.csv ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ afvlkan,a f ɘ l k a n,
2
+ ajfeñ,a ʎ f e ɲ,
3
+ aliwentu,a l i w e n t u,
4
+ anci,a n tʃ i,
5
+ bafaxa,l̪ a f a ʈʂ a,
6
+ calin,tʃ a l i n,
7
+ cazi,tʃ a θ i,
8
+ cagvj,tʃ a ŋ ɘ ʎ,
9
+ dallun,θ a ʎ u n,
10
+ hamuh,n̪ a m u n̪,
11
+ ichuna,i tʃ u n a,
12
+ kutri,k u ʈʂ i,
13
+ kümelekaymi,k ɘ m e l e k a j m i,
14
+ mansun,m a n s u n,
15
+ nge,ŋ e,
16
+ reqle,ʐ e ɣ l e,
17
+ kom,k o m,
18
+ pu,p u,
19
+ kishu,k i ʃ u,
20
+ nhi,n̪ i,
21
+ malhenh,m a l̪ e n̪,
22
+ femmeken,f e m ː e k e n,
23
+ diccionario,θ i tʃ ː i o n a ʐ i o,
24
+ segredossereia,s e ŋ ʐ e θ o s ː e ʐ e i a,
25
+ küzawwe,k ɘ θ a w ː e,
26
+ kutt,k u t ː,
27
+ mew,m e w,
28
+ ta,t a,
29
+ ñi,ɲ i,
30
+ ka,k a,
Data/_compromised/awx_Awara/awx.Rmd ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: "Awara"
3
+ author: "Becky Mathew"
4
+ bibliography: awx.bib
5
+ output: html_document
6
+ ---
7
+
8
+ Last Updated: 2020-04-13
9
+
10
+ **COMPROMISED: conflation between /nd/, /mb/, /nɡ/ and /d/, /b/, /ɡ/, respectively**
11
+
12
+ # Background
13
+
14
+ **Language Family:** Trans-New Guinea / Main Section / Central and Western / Huon-Finisterre / Finisterre / Wantoat
15
+
16
+ * Awara is spoken in the Morobe province of Papua New Guinea.
17
+
18
+ # Phonology
19
+
20
+ ## Consonants
21
+
22
+ * @Quigley2002 and @Quigley2003 disagree slightly on the consonant inventory of Awara (p. 4; p. 14); I have chosen to predominantly follow the more recent source.
23
+
24
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
25
+ library(dplyr)
26
+ library(knitr)
27
+ library(kableExtra)
28
+
29
+ consonants <- read.table(textConnection('
30
+ "Manner of Articulation" Labial Alveolar Palatal Velar Glottal
31
+ Stops "p b" "t d" "" "k kʷ ɡ ɡʷ" ""
32
+ Fricatives "β" "s" "" "ɣ" "h"
33
+ Nasals "m" "n" "" "ŋ ŋʷ" ""
34
+ Approximants "" "l" "j" "" ""
35
+ '), TRUE)
36
+
37
+ kable(consonants, col.names = c("Manner of Articulation", "Labial", "Alveolar", "Palatal", "Velar", "Glottal"), align = 'c') %>%
38
+ kable_styling("bordered") %>%
39
+ add_header_above(c("", "Place of Articulation" = 5)) %>%
40
+ footnote(general = "Note: For phonemes that share a cell, those on the left are voiceless and those on the right are voiced. Velar consonants that have the diacritic (ʷ) are labialized.", general_title = "") %>%
41
+ column_spec(1, bold = TRUE)
42
+ ```
43
+
44
+ ## Vowels
45
+
46
+ * Both @Quigley2002 and @Quigley2003 indicate a mid central vowel (p. 4; p. 35, 37); however, they represent it as /ʌ/. I have chosen to use /ɘ/, as it's more reflective of the description.
47
+
48
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
49
+ consonants <- read.table(textConnection('
50
+ Front Central Back
51
+ High "i" "" "u"
52
+ Mid "e" "ɘ" "o"
53
+ Low "" "a" ""
54
+ '), TRUE)
55
+
56
+ kable(consonants, align = 'c') %>%
57
+ kable_styling("bordered") %>%
58
+ column_spec(1, bold = TRUE)
59
+ ```
60
+
61
+ # Alphabet
62
+
63
+ * Surface-level prenasalization of consonants /b/, /d/, and /ɡ/ is orthographically represented intervocalically [@Quigley2003, p. 155]. This compromises the language because there is uncertainty whether the sequence ⟨ambi⟩, for example, always represents /abi/ (with an [ambi] realization) or /ambi/ in some instances. I have chosen to transcribe the language as it appears; that is, every ⟨mb⟩ sequence, for example, transcribes to /mb/.
64
+
65
+ ```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
66
+
67
+ alphabet <- read.table(textConnection('
68
+ Grapheme Phoneme Comment
69
+ "a" "/a/" ""
70
+ "ä" "/ɘ/" ""
71
+ "b" "/b/" ""
72
+ "d" "/d/" ""
73
+ "e" "/e/" ""
74
+ "g" "/ɡ/" ""
75
+ "h" "/h/" ""
76
+ "i" "/i/" ""
77
+ "k" "/k/" ""
78
+ "l; r" "/l/" "intervocalically"
79
+ "m" "/m/" ""
80
+ "n" "/n/" ""
81
+ "o" "/o/" ""
82
+ "p" "/p/" ""
83
+ "s" "/s/" ""
84
+ "t" "/t/" ""
85
+ "u" "/u/" ""
86
+ "w" "/β/" ""
87
+ "x" "/ɣ/" ""
88
+ "y" "/j/" ""
89
+ **Multigraph** "" ""
90
+ "gw" "/ɡʷ/" ""
91
+ "kw" "/kʷ/" ""
92
+ "ng" "/ŋ/" ""
93
+ "ngw" "/ŋʷ/" ""
94
+ '), TRUE)
95
+
96
+ kable(alphabet, align = 'c') %>%
97
+ kable_styling("bordered")
98
+ ```
99
+
100
+ # Syllable Structure
101
+
102
+ * Syllables in Awara follow the order of (C)V(C) [@Quigley2003, p. 175].
103
+
104
+ # Lenition Rules
105
+
106
+ * Voiceless stops (including /kʷ/) lenite intervocalically at morpheme boundaries [@Quigley2003, p. 26].
107
+ * /t/ and /k/ may be realized as [l] and [ɣ] word-initially [@Quigley2003, pp. 20-21].
108
+ * /k/, /p/, and /t/ are realized as [ɣ], [w], and [l] intervocalically [@Quigley2002, p. 6].
109
+ * /pu/ may be realized as [βu] or [wu] word-initially [@Quigley2003, p. 22].
110
+
111
+ # Misc. Rules
112
+
113
+ * Voiced stops have prenasalized variants [@Quigley2003, pp. 16-17].
114
+ - Word-initially, they are realized as prenasalized consonants.
115
+ - Syllable-initially, following open syllables, the homorganic nasal is realized as the coda of the preceding syllable and the voiced stop is realized as the onset of the following syllable.
116
+ * Voiceless stops tend to be aspirated word-initially and syllable-initially intervocalically [@Quigley2003, p. 17].
117
+ * Glottal stop epenthesis may occur word-initially, preceding vowels [@Quigley2003, p. 18].
118
+
119
+ # References
Data/_compromised/awx_Awara/awx.bib ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ % Encoding: UTF-8
2
+
3
+ @MastersThesis{Quigley2003,
4
+ author = {Edward C. Quigley},
5
+ school = {University of North Dakota},
6
+ title = {Awara Phonology},
7
+ year = {2003},
8
+ }
9
+
10
+ @MastersThesis{Quigley2002,
11
+ author = {Susan R. Quigley},
12
+ school = {University of North Dakota},
13
+ title = {The Awara Verbal System},
14
+ year = {2002},
15
+ }
16
+
17
+ @Comment{jabref-meta: databaseType:bibtex;}
Data/_compromised/awx_Awara/awx.html ADDED
The diff for this file is too large to render. See raw diff
 
Data/_compromised/awx_Awara/awx.rules ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Awara Rule Set
2
+ # Written by: Becky Mathew
3
+ # Last updated: 2020-04-13
4
+ type,sfrom,sto,weight,precede,follow,comment
5
+ # Class Rules
6
+ class,passthrough,[abdehikmnopstu],,,,
7
+ class,w-preceder,[gk],,,,
8
+ class,vowels,[aeiouä],,,,
9
+ # Individual Letters
10
+ sub,ä,ɘ,3,,,
11
+ sub,g,ɡ,2,,,
12
+ sub,l,l,3,{vowels},{vowels},"/l/ only occurs intervocalically",
13
+ sub,r,l,3,{vowels},{vowels},"<r> may also be used to represent /l/",
14
+ sub,w,β,3,,,
15
+ sub,x,ɣ,3,,,
16
+ sub,y,j,3,,,
17
+ sub,({passthrough}),\1,2,,,
18
+ # Digraphs
19
+ sub,g,ɡʷ,4,,w,
20
+ sub,k,kʷ,4,,w,
21
+ sub,w,,4,{w-preceder},,"clean-up",
22
+ sub,n,ŋ,5,,g,
23
+ sub,g,,5,n,,"clean-up",
24
+ # Trigraphs
25
+ sub,n,ŋʷ,6,,gw,
26
+ sub,g,,6,n,w,"clean-up",
27
+ sub,w,,6,ng,,"clean-up",
Data/_compromised/awx_Awara/awx.verify.csv ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hikngä,h i k ŋ ɘ,
2
+ wätä,β ɘ t ɘ,
3
+ bakudupi,b a k u d u p i,
4
+ xät,ɣ ɘ t,
5
+ yähakaying,j ɘ h a k a j i ŋ,
6
+ nap,n a p,
7
+ gwen,ɡʷ e n,
8
+ kwayi,kʷ a j i,
9
+ kungwä,k u ŋʷ ɘ,
10
+ do,d o,
11
+ ge,ɡ e,
12
+ äminu,ɘ m i n u,
13
+ sipsip,s i p s i p,
14
+ inale,i n a l e,
15
+ Awara,a β a l a,
16
+ yang,j a ŋ,
17
+ using,u s i ŋ,
18
+ inikut,i n i k u t,
19
+ nanä,n a n ɘ,
20
+ äwä,ɘ β ɘ,
21
+ puyä,p u j ɘ,
22
+ undä,u n d ɘ,
23
+ tiwän,t i β ɘ n,
24
+ yänikut,j ɘ n i k u t,
25
+ meyä,m e j ɘ,
26
+ wamu,β a m u,
27
+ umanä,u m a n ɘ,
28
+ kewu,k e β u,
29
+ natäke,n a t ɘ k e,
30
+ päke,p ɘ k e,
Data/_compromised/bcl_CentralBikol/bcl.Rmd ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: "Central Bikol"
3
+ author: "Bill Mizgerd"
4
+ bibliography: bcl.bib
5
+ output: html_document
6
+ ---
7
+
8
+ Last Updated: 2019-06-17
9
+
10
+ **SLIGHTLY COMPROMISED: glottal stops not transcribed consistently**
11
+
12
+ # Background
13
+
14
+ **Language Family:** Austronesian / Malayo-Polynesian / Western Malayo-Polynesian / Meso Philippine / Central Philippine / Bikol / Coastal / Naga
15
+
16
+ * Central Bikol, or Bikol, is spoken throughout the Bikol provinces within the Philippines.
17
+
18
+ # Phonology
19
+
20
+ ## Consonants
21
+
22
+ * Loans from Spanish and English have introduced /f/, /v/, /z/, /ʃ/, /ʒ/, /ɲ/, /ʎ/, /tʃ/, and /dʒ/ to Bikol, although not all speakers use those sounds [@BclMattes2014, p. 8].
23
+
24
+ ```{r echo = FALSE, message = FALSE, warning = FALSE, results = 'asis'}
25
+
26
+ library(dplyr)
27
+ library(knitr)
28
+ library(kableExtra)
29
+
30
+
31
+ consonants <- read.table(textConnection('
32
+ "Manner of Articulation" Labial Alveolar Palatal Velar Glottal
33
+ Stops "p b" "t d" "" "k ɡ" "ʔ"
34
+ Nasals "m" "n" "" "ŋ" ""
35
+ Fricatives "" "s" "" "" "h"
36
+ Flaps "" "ɾ" "" "" ""
37
+ Approximants "ʋ" "l" "j" "" ""
38
+ '), header = TRUE)
39
+
40
+ kable(consonants, col.names = c("Manner of Articulation", "Labial", "Alveolar", "Palatal", "Velar", "Glottal"), align = 'c') %>%
41
+ kable_styling("bordered") %>%
42
+ column_spec(1, bold = TRUE) %>%
43
+ footnote(general = "Note: For phonemes that share a cell, those on the left are voiceless and those on the right are voiced.", general_title = "") %>%
44
+ add_header_above(c("", "Place of Articulation" = 5))
45
+
46
+ ```
47
+
48
+ ## Vowels
49
+
50
+ * Although the orthography includes ⟨e⟩ and ⟨o⟩, [e] and [o] only exist as allophones of /i/ and /u/ respectively [@BclMattes2014, p. 8].
51
+
52
+ ```{r echo = FALSE}
53
+
54
+ vowels <- read.table(textConnection('
55
+ Front Central Back
56
+ High "i" "" "u"
57
+ Mid "" "" ""
58
+ Low "" "a" ""
59
+ '), TRUE)
60
+
61
+ kable(vowels, align = 'c') %>%
62
+ kable_styling("bordered") %>%
63
+ column_spec(1, bold = TRUE)
64
+
65
+
66
+ diphthongs <- read.table(textConnection('
67
+ Diphthongs
68
+ "/iu/, /ui/, /ai/, /au/"
69
+ '), TRUE)
70
+
71
+ kable(diphthongs, align = 'c') %>%
72
+ kable_styling("bordered")
73
+ ```
74
+
75
+ # Alphabet
76
+
77
+ * Glottal stops are not always reflected in the spelling of a word [@BclMattes2014, p. 12]. Occurrences in at least the intervocalic positions are predictable (i.e. phonetic), which isn't problematic for the language (as we don't account for them); however, occurrences of glottal stops elsewhere aren't transcribed consistently, which compromises the language to some degree.
78
+
79
+ ```{r echo = FALSE}
80
+
81
+ alphabet <- read.table(textConnection('
82
+ Grapheme Phoneme
83
+ "a" "/a/"
84
+ "b" "/b/"
85
+ "d" "/d/"
86
+ "e" "/i/"
87
+ "g" "/ɡ/"
88
+ "h" "/h/"
89
+ "i" "/i/"
90
+ "k" "/k/"
91
+ "l" "/l/"
92
+ "m" "/m/"
93
+ "n" "/n/"
94
+ "o" "/u/"
95
+ "p" "/p/"
96
+ "r" "/ɾ/"
97
+ "s" "/s/"
98
+ "t" "/t/"
99
+ "u" "/u/"
100
+ "w" "/ʋ/"
101
+ "y" "/j/"
102
+ "\' ; -" "/ʔ/"
103
+ **Digraph** ""
104
+ "ng" "/ŋ/"
105
+ "aw" "/au/"
106
+ "ay" "/ai/"
107
+ "iw" "/iu/"
108
+ "oy" "/ui/"
109
+ "uy" "/ui/"'), header = TRUE)
110
+
111
+ knitr::kable(alphabet, align = 'c') %>%
112
+ kable_styling("bordered")
113
+ ```
114
+
115
+ # Syllable Structure
116
+
117
+ * Bikol syllable structure is CV(C), where V can be either a single vowel or a diphthong [@BclMattes2014, p. 10].
118
+
119
+ # Misc. Rules
120
+
121
+ * Epenthesis of [h] occurs between stem-final and suffix-initial vowels [@BclMattes2014, p. 9].
122
+ * /u/ is realized as [o] in the final syllable of a word [@BclMintzD1971, p. 17].
123
+ * Glottal stops are always inserted between orthographically adjacent vowels [@BclMattes2014, p. 12].
124
+ * Prefix-final /ŋ/ tends to assimilate, to varying degrees, to the first consonant of the stem [@BclMattes2014, p. 9].
125
+
126
+ # References
Data/_compromised/bcl_CentralBikol/bcl.bib ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ % Encoding: UTF-8
2
+
3
+ @Book{BclMattes2014,
4
+ author = {Mattes, Veronika},
5
+ title = {Types of Reduplication: A Case Study of Bikol},
6
+ publisher = {De Gruyter},
7
+ year = {2014},
8
+ }
9
+
10
+ @Book{BclMintzD1971,
11
+ author = {Mintz, Malcolm W.},
12
+ title = {Bikol Dictionary},
13
+ publisher = {University of Hawai'i Press},
14
+ year = {1971},
15
+ }
16
+
17
+ @Comment{jabref-meta: databaseType:bibtex;}
Data/_compromised/bcl_CentralBikol/bcl.html ADDED
The diff for this file is too large to render. See raw diff
 
Data/_compromised/bcl_CentralBikol/bcl.rules ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Central Bikol Rule Set
2
+ # Written by: Bill
3
+ # Last Updated: 2019-06-17
4
+ type,sfrom,sto,weight,precede,follow,comment
5
+ # Classes
6
+ class,punctuation,['‘’-],,,,
7
+ class,w-preceder,[ai],,,,
8
+ class,y-preceder,[aou],,,,
9
+ class,vowel,[aeiou],,,,
10
+ class,passthrough,[abdhilkmnpstu],,,,
11
+ # Individual Letters
12
+ sub,e,i,4,,,
13
+ sub,g,ɡ,4,,,
14
+ sub,o,u,4,,,
15
+ sub,r,ɾ,4,,,
16
+ sub,w,ʋ,4,,,
17
+ sub,y,j,4,,,
18
+ sub,{punctuation},ʔ,4,,,
19
+ sub,{punctuation},,5,{vowel},{vowel},
20
+ sub,({passthrough}),\1,0.1,,,
21
+ # Digraphs
22
+ sub,a,au,6,,w,"aw"
23
+ sub,i,iu,6,,w,"iw"
24
+ sub,w,,6,{w-preceder},,"w-final diphthongs clean-up",
25
+ sub,a,ai,6,,y,"ay"
26
+ sub,o,ui,6,,y,"oy"
27
+ sub,u,ui,6,,y,"uy"
28
+ sub,y,,6,{y-preceder},,"y-final diphthongs clean-up",
29
+ sub,n,ŋ,6,,g,
30
+ sub,g,,6,n,,"clean-up",
31
+ # Non-Diphthongs (sequences of three vowels realized independently)
32
+ ipasub,au ({vowel}),a u \1,8,,,
33
+ ipasub,iu ({vowel}),i u \1,8,,,
34
+ ipasub,ai ({vowel}),a i \1,8,,,
35
+ ipasub,ui ({vowel}),u i \1,8,,,
Data/_compromised/bcl_CentralBikol/bcl.verify.csv ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ apat,a p a t,"a"
2
+ atibangaw,a t i b a ŋ au,"aw"
3
+ gulay,ɡ u l ai,"ay"
4
+ bubon,b u b u n,"b"
5
+ daguldol,d a ɡ u l d u l,"d"
6
+ kengke,k i ŋ k i,"e"
7
+ gusok,ɡ u s u k,"g"
8
+ hagahag,h a ɡ a h a ɡ,"h"
9
+ kiri,k i ɾ i,"i"
10
+ ariw,a ɾ iu,"iw"
11
+ kuko,k u k u,"k"
12
+ lalaki,l a l a k i,"l"
13
+ mampak,m a m p a k,"m"
14
+ nana,n a n a,"n"
15
+ ngunyan,ŋ u n j a n,"ng"
16
+ ido,i d u,"o"
17
+ laboy,l a b ui,"oy"
18
+ papel,p a p i l,"p"
19
+ ribo,ɾ i b u,"r"
20
+ sebolyas,s i b u l j a s,"s"
21
+ tatay,t a t ai,"t"
22
+ utang,u t a ŋ,"u"
23
+ buybuy,b ui b ui,"uy"
24
+ wikwik,ʋ i k ʋ i k,"w"
25
+ yating,j a t i ŋ,"y"
26
+ ba-go,b a ʔ ɡ u,"punctuation as glottal stop"
27
+ iba-ibang,i b a i b a ŋ,
28
+ tuyong,t u i u ŋ,"non-diphthongs"
29
+ laog,l a u ɡ,
30
+ hiwas,h i u a s,
31
+ katawo,k a t a u u,
32
+ gayo,ɡ a i u,