niobures commited on Oct 8, 2025

Commit

4a08ba7

verified ·

1 Parent(s): 1964ee9

XPF

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +7 -35
.gitignore +5 -0
Code/README.html +0 -0
Code/README.md +12 -0
Code/contextRep.py +200 -0
Code/stopatn.sh +24 -0
Code/sumstats01.py +202 -0
Code/translate03.py +325 -0
Code/translate04.py +346 -0
Data/README.md +10 -0
Data/_compromised/acr_RabinalAchi'/acr.Rmd +112 -0
Data/_compromised/acr_RabinalAchi'/acr.bib +29 -0
Data/_compromised/acr_RabinalAchi'/acr.html +0 -0
Data/_compromised/acr_RabinalAchi'/acr.rules +28 -0
Data/_compromised/acr_RabinalAchi'/acr.verify.csv +20 -0
Data/_compromised/ake_Akawaio/ake.Rmd +113 -0
Data/_compromised/ake_Akawaio/ake.bib +40 -0
Data/_compromised/ake_Akawaio/ake.html +0 -0
Data/_compromised/ake_Akawaio/ake.rules +13 -0
Data/_compromised/ake_Akawaio/ake.verify.csv +30 -0
Data/_compromised/amp_Alamblak/amp.Rmd +125 -0
Data/_compromised/amp_Alamblak/amp.bib +62 -0
Data/_compromised/amp_Alamblak/amp.html +0 -0
Data/_compromised/amp_Alamblak/amp.rules +19 -0
Data/_compromised/amp_Alamblak/amp.verify.csv +30 -0
Data/_compromised/aoj_Mufian/aoj.Rmd +109 -0
Data/_compromised/aoj_Mufian/aoj.bib +27 -0
Data/_compromised/aoj_Mufian/aoj.html +0 -0
Data/_compromised/aoj_Mufian/aoj.rules +25 -0
Data/_compromised/aoj_Mufian/aoj.verify.csv +30 -0
Data/_compromised/ar_Arabic/ar.Rmd +155 -0
Data/_compromised/ar_Arabic/ar.bib +113 -0
Data/_compromised/ar_Arabic/ar.html +0 -0
Data/_compromised/ar_Arabic/ar.rules +167 -0
Data/_compromised/ar_Arabic/ar.verify.csv +39 -0
Data/_compromised/arn_Mapudungun/arn.Rmd +139 -0
Data/_compromised/arn_Mapudungun/arn.bib +38 -0
Data/_compromised/arn_Mapudungun/arn.html +0 -0
Data/_compromised/arn_Mapudungun/arn.rules +47 -0
Data/_compromised/arn_Mapudungun/arn.verify.csv +30 -0
Data/_compromised/awx_Awara/awx.Rmd +119 -0
Data/_compromised/awx_Awara/awx.bib +17 -0
Data/_compromised/awx_Awara/awx.html +0 -0
Data/_compromised/awx_Awara/awx.rules +27 -0
Data/_compromised/awx_Awara/awx.verify.csv +30 -0
Data/_compromised/bcl_CentralBikol/bcl.Rmd +126 -0
Data/_compromised/bcl_CentralBikol/bcl.bib +17 -0
Data/_compromised/bcl_CentralBikol/bcl.html +0 -0
Data/_compromised/bcl_CentralBikol/bcl.rules +35 -0
Data/_compromised/bcl_CentralBikol/bcl.verify.csv +32 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,7 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+docs/* linguist-documentation=false
+docs/css/* linguist-documentation=false
+docs/js/* linguist-documentation=false
+Data/** linguist-detectable=false
+docs/images/brown.jpg filter=lfs diff=lfs merge=lfs -text
+docs/images/wordcloud_image.jpg filter=lfs diff=lfs merge=lfs -text
+docs/manual/xpf_manual.pdf filter=lfs diff=lfs merge=lfs -text
+Manual/xpf_manual.pdf filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+.DS_Store
+.sav
+.Rhistory
+Abandoned
+Available

Code/README.html ADDED Viewed

The diff for this file is too large to render. See raw diff

Code/README.md ADDED Viewed

	@@ -0,0 +1,12 @@

+# Code
+The scripts within this folder are explained in more detail within the manual in terms of how to use them and how to interpret their respective outputs. I have linked the specific sections within the manual that correspond to each of the files below:
+* `contextRep.py`:&nbsp; [2.1 Segment Informativity Measures](https://cohenpr-xpf.github.io/XPF/manual/xpf_manual.pdf#Segment%20Informativity%20Measures)
+* `stopatn.sh`:&nbsp; [2.0.1 Evaluating Frequency Files](https://cohenpr-xpf.github.io/XPF/manual/xpf_manual.pdf#Evaluating%20Frequency%20Files)
+* `sumstats01.py`:&nbsp; [2.2 Summary Statistics](https://cohenpr-xpf.github.io/XPF/manual/xpf_manual.pdf#Summary%20Statistics)
+* `translate03.py` and `translate04.py`&thinsp;<sup id="ref1">[1](#foot1)</sup>:&nbsp; [1.5 Translation Scheme](https://cohenpr-xpf.github.io/XPF/manual/xpf_manual.pdf#Translation%20Scheme)
+<br>
+<b id="foot1">1</b> The only difference between `translate03.py` and `translate04.py` is that `translate04.py` accounts for the [match rules](https://cohenpr-xpf.github.io/XPF/manual/xpf_manual.pdf#page=10) needed for the phonemic translation of Korean. [←](#ref1)

Code/contextRep.py ADDED Viewed

	@@ -0,0 +1,200 @@

+#!/usr/bin/python3
+from __future__ import print_function
+from math import log
+from collections import deque
+class contextRep(object):
+    def __init__(self):
+        self.count = 0.0         # times this context was observed
+        self.contexts = dict()   # continuation:context dictionary
+        self.precals = None      # probs can be precalculated
+        self.terminal = 0.0      # number of times this context was final
+    def __repr__(self):
+        return repr([self.count, self.terminal, self.contexts])
+    def __str__(self):
+        return repr(self)
+    def add(self, seq, count, func=lambda x: None):
+        """
+        add a full sequence to the representation
+        """
+        if len(seq) > 0:
+            key = seq[0]
+            if not key in self.contexts:
+                self.contexts[key] = contextRep()
+            self.contexts[key].add(seq[1:], count, func)
+        else:
+            self.terminal += count
+        self.count += count
+    def prob(self, key, log2=False):
+        """
+        get the probability of observing a particular continuation in
+        the given context
+        """
+        if self.precals is None:
+            ret = self.contexts[key].count / self.count \
+                  if key in self.contexts else 0.0
+        else:
+            ret = self.precals[key]
+        return ret if not log2 else log(ret, 2)
+    def probs(self, log2=False):
+        """
+        Get the probabilities of getting all continuations in the given
+        context
+        """
+        if self.precals is None:
+            ret = {key:self.prob(key, log2=log2) for key in self.contexts}
+        else:
+            ret = self.precals if not log2 \
+                  else {p:log(self.precals[p], 2) for p in self.precals}
+                  ##else {p:log(p, 2) for p in self.precals}
+        return ret
+    def precalc(self):
+        """
+        Create a static image of the probabilities
+        """
+        self.precals = self.probs()
+        for key in self.contexts:
+            self.contexts[key].precalc()
+    def contextProb(self, seq, terminal=False):
+        """
+        Create for each item in a sequence the probability of observing
+        it in the given context
+        """
+        context = self
+        ret = deque()
+        for key in seq:
+            if context is not None and key in context.contexts:
+                ret.append(context.prob(key, False))
+                context = context.contexts[key]
+            else:
+                context = None
+                ret.append(0.0)
+        if terminal:
+            if context is not None:
+                ret.append(context.terminal / context.count)
+            else:
+                ret.append(0.0)
+        return list(ret)
+    def informativity_counts(self):
+        """
+        Create for each item in a sequence the probability of observing
+        it in the given context
+        """
+        retvals = {key:(-log(self.contexts[key].count / self.count, 2))
+                    for key in self.contexts}
+        retcounts = {key:self.contexts[key].count for key in self.contexts}
+        for key in self.contexts:
+            (subvals, subcounts) = \
+                self.contexts[key].informativity_counts()
+            for key in subvals:
+                (selfval, selfcount) = (retvals[key], retcounts[key]) \
+                                       if key in retvals \
+                                        else (0.0, 0.0)
+                retvals[key] = (selfval*selfcount +
+                                 subvals[key]*subcounts[key]) / (subcounts[key]+selfcount)
+                retcounts[key] = selfcount + subcounts[key]
+        return (retvals, retcounts)
+    def informativity(self):
+        (informativity, counts) = self.informativity_counts()
+        return informativity
+    def iter(self, terminal=False, log2=False):
+        logfunc = (lambda x: -log(x, 2) if x < 1 else 0) if log2 else (lambda x: x)
+        if self.terminal > 0:
+            yield [{"seg":None, "prob":logfunc(self.terminal / self.count), "count":self.count}] if terminal else []
+        for key in sorted(self.contexts):
+            for cont in self.contexts[key].iter(terminal=terminal, log2=log2):
+                yield [{"seg":key,
+                        "prob":logfunc(self.contexts[key].count / self.count),
+                        "count":self.contexts[key].count}
+                ] + cont
+    def __iter__(self):
+        for value in self.iter(log2=True, terminal=False):
+            yield value
+    ##
+    ## Returns a pure dictionary representation of the object
+    ##
+    def asdict(self):
+        ret = {"count":  self.count,
+               "contexts": {key: self.contexts[key].asdict() for key in self.contexts},
+               "precals": self.precals is None,
+               "terminal": self.terminal}
+        return ret
+    ##
+    ## reconstruct an object from a dictionary (created by asdict)
+    ## I failed to create a static method and couldn't bother more with it.
+    ## The only real reason to use this method + todict is to save contextRep objects in R / json easily
+    ##
+    def populate(self, d):
+        self.count = d["count"]
+        self.terminal = d["terminal"]
+        self.contexts = {key:contextRep().populate(d["contexts"][key]) for key in d["contexts"]}
+        self.precals = None if d["precals"] is False else self.precalc()
+        return self
+    ##
+    ## Object equality (only to check todict / populate)
+    ##
+    def __eq__(self, other):
+        if isinstance(other, contextRep):
+            return all([self.terminal == other.terminal,
+                        self.count == other.count,
+                        all(self.contexts[key] == other.contexts[key] if key in other.contexts else False
+                            for key in self.contexts),
+                        all(key in self.contexts for key in other.contexts)])
+        else:
+            return False
+if __name__ == "__main__":
+    c = contextRep()
+    c.add("ab", 5)
+    c.add("ac", 5)
+    c.add("a", 5)
+    c.add("c", 15)
+    c.add("P AO1 R T N OY0".split(), 1)
+    print(c)
+    print(c.informativity())
+    print(c.probs())
+    print(c.contextProb("ab"))
+    print(c.contextProb("a"))
+    print(c.asdict())
+    c2 = contextRep()
+    c2.populate(c.asdict())
+    print(c2)
+    print(c2 == c)
+    print(c.informativity() == c2.informativity())
+    for v in c.iter(terminal=True):
+        print(v)

Code/stopatn.sh ADDED Viewed

	@@ -0,0 +1,24 @@

+#!/bin/bash
+##
+## Usage: stopatn.sh N [MINFREQ] < frequencies
+##
+## Reads "word frequency" lines from stdin and prints every line whose
+## frequency is at least that of the N-th most frequent line, so ties
+## at the cut-off are all kept. Lines with a frequency below MINFREQ
+## are dropped first so we don't waste time dealing with
+## them. Default: ignore nothing.
+##
+if [ -z "$1" ]; then
+    echo "usage: $0 N [MINFREQ] < frequencies" >&2
+    exit 2
+fi
+minfreq=${2:-0}
+tf=$(mktemp) || exit 1
+trap 'rm -f "$tf"' EXIT                    ## remove the temp file even on failure
+awk -v min="$minfreq" '$2 >= min' |        ## remove low freqs if any
+    bzip2 -9 > "$tf"                       ## save stdin, it is read twice
+lastfreq=$(bzcat "$tf" |                   ## revisit the file
+               sort -nr -k 2,2 -k 1,1 |    ## sort to find top $1 freq
+               head -n "$1" |              ## find the top $1 lines
+               tail -n1 |                  ## keep only last line
+               (read -r w freq; echo "$freq"))  ## it's the second field
+bzcat "$tf" |                              ## empty input leaves lastfreq empty,
+    awk -v last="${lastfreq:-0}" '$2 >= last'   ## hence the 0 fallback

Code/sumstats01.py ADDED Viewed

	@@ -0,0 +1,202 @@

+#!/usr/bin/env python3
+import re
+import argparse
+import sys
+import csv
+import traceback
+from collections import deque, defaultdict
+from math import inf
+import translate04 as translate
+from contextRep import contextRep
+def oneNyield(item, iterable):
+    yield item
+    for item in iterable:
+        yield item
+def getRep(fobj, a2ipa, minfreq=1):
+    ret = contextRep()
+    stats = {"nlines": 0,
+             "skipped": 0,
+             "missing": 0,
+             "@words": dict()}
+    ret.wordlist = deque()
+    ret.stats = stats
+    finalnr = re.compile("[\r\n]*$")
+    ##
+    ## Try to understand file type
+    ##
+    sniffLine = fobj.readline()
+    sniffLine = re.sub(finalnr, "", sniffLine)
+    if sniffLine.find("\t") >= 0 and len(sniffLine.split("\t")) == 2:
+        sep = "\t"
+    elif sniffLine.find(",") >= 0 and len(sniffLine.split(",")) == 2:
+        sep = ","
+    elif sniffLine.find(" ") >= 0 and len(sniffLine.split(" ")) == 2:
+        sep = " "
+    else:
+        print("Could not understand frequencies file, not proceeding")
+        return ret
+    ##
+    ## Try to figure whether there are headers
+    ##
+    try:
+        int(sniffLine.split(sep)[-1])
+        lines = oneNyield(sniffLine, fobj)
+    except ValueError:
+        lines = fobj
+    for line in lines:
+        stats["nlines"] += 1
+        line = re.sub(finalnr, "", line)
+        try:
+            ##
+            ## parse line
+            ##
+            (word, freq) = line.split(sep)
+            freq = int(freq)
+            ##
+            ## ignore low frequencies
+            ##
+            if freq < minfreq:
+                stats["skipped"] += 1
+                continue
+            ##
+            ## Translate
+            ##
+            translation = a2ipa.translate(word)
+            if "@" in translation:
+                stats["missing"] += 1
+                stats["@words"][word] = {"freq":freq, "translation":translation}
+            ##
+            ## Add context to representation
+            ##
+            else:
+                ret.wordlist.append({"word:": word, "translation": translation})
+                ret.add(translation, freq)
+        except Exception as err:
+            print("Error in word frequency parsing. Offending line is {}, the message is: {}".format(repr(line), err), file=sys.stderr)
+            exit(1)
+        ##ret.precalc()
+        ##for (wx, wordprops) in enumerate(ret.wordlist):
+        ##    ret.wordlist[wx]["probs"] = ret.probs(wordprops["translation"])
+    return ret
+def main(argv):
+    parser = argparse.ArgumentParser("Provide summary statistics for language and frequency files")
+    ##
+    ## Specifies the rules used for trsnslation
+    ##
+    parser.add_argument("-l", "--langrules", dest="langrules",
+                        type=argparse.FileType('r', encoding="utf8"),
+                        required=True,
+                        help="language code rules file")
+    ##
+    ## Specifies a verifcation file, in any csv format. Headers are
+    ## not expected.  The first columns is supposed to be a word, and
+    ## the second is its ideal translation
+    ##
+    parser.add_argument("-c", "--check", dest="check",
+                        default=None, type=argparse.FileType('r', encoding="utf8"),
+                        help="file to use for verification")
+    ##
+    ## Allows to read date from some file (which should not be compressed)
+    ##
+    parser.add_argument("-r", "--read", dest="read",
+                        default=sys.stdin,
+                        type=argparse.FileType('r', encoding="utf8"),
+                        help="file used for translation (word and frequency)")
+    ##
+    ## Add min frequency
+    ##
+    parser.add_argument("-m", "--min", dest="min",
+                        default=1,
+                        type=int,
+                        help="minimal frequency to consider")
+    ##
+    ## Print summary?
+    ##
+    parser.add_argument("-N", "--no-summary", dest="nosummary",
+                        default=False, action="store_true",
+                        help="suppress summary information")
+    ##
+    ## Print all probs?
+    ##
+    parser.add_argument("-A", "--all-words", dest="allwords",
+                        default=False, action="store_false",
+                        help="Enumerate all words and probabilities")
+    ##
+    ## How many @ words?
+    ##
+    parser.add_argument("-@", "--max@", dest="max@",
+                        default=10,
+                        type=int,
+                        help="number of @ words to include in summary")
+    options = vars(parser.parse_args(argv))
+    a2ipa = translate.alphabet2ipa(options["langrules"])
+    if "check" in options and not options["check"] is None:
+        allGood = a2ipa.check(options["check"])
+        if not allGood:
+            print("Verification failed, not processing additional data", file=sys.stderr)
+            exit(1)
+    ##ret = ((word, " ".join(a2ipa.translate(word)))
+    ##       for  word in words)
+    rep = getRep(options["read"], a2ipa, minfreq=options["min"])
+    rep.precalc()
+    (info, counts) = rep.informativity_counts()
+    print("seg\tinformativity\t,count")
+    for (count, seg) in sorted((-counts[seg], seg) for seg in counts):
+        print("{seg}\t{info}\t{count}".format(seg=seg,
+                                              info=info[seg],
+                                              count=-count))
+    ##
+    ## Print summary information, if not suppressed
+    #3
+    if not options["nosummary"]:
+        print("## Summary statistics:")
+        print("##   processed (inc. skipped):", rep.stats["nlines"])
+        print("##   skipped:", rep.stats["skipped"])
+        print("##   %@ words:", round(rep.stats["missing"] /
+                                      (rep.stats["nlines"] - rep.stats["skipped"])*100, 1))
+        atwords = rep.stats["@words"]
+        print("## Top missing:")
+        for (nfreq, word, translation) in sorted((-atwords[word]["freq"], word, atwords[word]["translation"]) for word in atwords)[:options["max@"]]:
+            print("##   {word} → '{translation}' ({freq})".format(word=word, translation=" ".join(translation), freq=-nfreq))
+if __name__ == "__main__":
+    main(sys.argv[1:])

Code/translate03.py ADDED Viewed

	@@ -0,0 +1,325 @@

+#!/usr/bin/env python3
+import re
+import argparse
+import sys
+import csv
+import traceback
+from collections import deque, defaultdict
+from math import inf
+def sniff(filestream):
+    ##sample = csv.Sniffer().sniff(filestream.read(1024))
+    sample = filestream.read(1024)
+    sample = "\n".join(line for line in re.split("[\r\n]", sample) if not (line.startswith("#") or len(line) == 0))
+    try:
+        dialect = csv.Sniffer().sniff(sample)
+    except Exception as ex:
+        print("Could not determine delimiter type, proceedings as excel csv", file=sys.stderr)
+        dialect = csv.get_dialect("excel")
+    filestream.seek(0)
+    ret = (line for line in filestream if not line.startswith("#"))
+    return (ret, dialect)
+class subrule(object):
+    ##
+    ## simple wrapper for sub rules
+    ##
+    def __init__(self, values, classes):
+        for key in ["sfrom", "sto", "precede", "follow", "weight"]:
+            value = values[key]
+            ##
+            ## handle classes (and subclasses)
+            ##
+            while re.search("{.*}", value):
+                value = value.format(**classes)
+            self.__dict__[key] = value
+        self.weight = float(self.weight)
+        self.sfrom = re.compile(self.sfrom)
+        self.precede = re.compile(self.precede+"$")
+        self.follow = re.compile("^"+self.follow)
+    def subScore(self, sfrom, precede, follow):
+        if self.sfrom.match(sfrom) and self.precede.search(precede) and self.follow.search(follow):
+            return self.weight
+        else:
+            return None
+    def sub(self, x):
+        return self.sfrom.sub(self.sto, x)
+    def __repr__(self):
+        return repr(self.__dict__)
+    def __lt__(self, other):
+        if not isinstance(other, subrule):
+            raise Exception("Incompatible types for comparison")
+        return [self.weight] < [other.weight]
+class alphabet2ipa(object):
+    ##
+    ## interpretation of rules files
+    ##
+    classes = None
+    subs = None
+    ##chartr = None
+    def __init__(self, langrules, missing="@", loglevel=0):
+        self.classes = dict()
+        self.subs = set()
+        self.ipasubs = set()
+        self.words = dict()
+        self.pre = str.maketrans("", "")
+        self.NO_TRANSLATE = missing
+        self.loglevel = loglevel
+        with langrules as csvsource:
+            ##rules = csv.DictReader(csvsource)
+            (csvsource, dialect) = sniff(csvsource)
+            rules = csv.DictReader(csvsource, dialect=dialect)
+            for rule in rules:
+                if self.loglevel > 2:
+                    print("Rule found:", rule, file=sys.stderr)
+                try:
+                    ##
+                    ## Pre equivalences
+                    ##
+                    if rule["type"] == "pre":
+                        self.pre = str.maketrans(rule["sfrom"], rule["sto"])
+                    ##
+                    ## Deal with classes
+                    ##
+                    elif rule["type"] == "class":
+                        self.classes[rule["sfrom"]] = rule["sto"]
+                    ##
+                    ## Deal with sub rules
+                    ##
+                    elif rule["type"] == "sub":
+                        newrule = subrule(rule, self.classes)
+                        self.subs.add(newrule)
+                    ##
+                    ## Deal with IPA sub rules
+                    ##
+                    elif rule["type"] == "ipasub":
+                        newrule = subrule(rule, self.classes)
+                        self.ipasubs.add(newrule)
+                    ##
+                    ## Deal with whole word substitutions
+                    ##
+                    elif rule["type"] == "word":
+                        self.words[rule["sfrom"]] = rule["sto"].split()
+                    ##
+                    ## No such rule
+                    ##
+                    else:
+                        print("Unrecognized rule type ({type}), with sfrom={sfrom}, and sto={sto}".format(**rule), file=sys.stderr)
+                        continue
+                except Exception as ex:
+                    errInfo = sys.exc_info()
+                    traceback.print_exception(*errInfo)
+                    print("Error processing rule, but resuming processing other rules. Rule details: {}".format(rule, ex), file=sys.stderr)
+                    continue
+                if self.loglevel > 1:
+                    print("Rule added:", rule, file=sys.stderr)
+    def translate(self, source):
+        ##
+        ## fully translated words
+        ##
+        if source in self.words:
+            return self.words[source]
+        ##
+        ## preprocess using pre, and turn to lowercase
+        ##
+        source = source.translate(self.pre).lower()
+        ##
+        ## If there are character-based translations, apply them first
+        ## (for Cyrillic and other non-latin scripts)
+        ##
+        ##if not self.chartr is None:
+        ##    source = source.translate(self.chartr)
+        sourceList = re.findall(".", source)
+        targetList = deque()
+        for (sx, sfrom) in enumerate(sourceList):
+            ##
+            ## prepare context
+            ##
+            precede = "".join(source[:sx])
+            follow = "".join(source[sx+1:])
+            ##
+            ## perform all possible translations
+            ##
+            translations = [(rule.subScore(sfrom, precede, follow), rule.sub(sfrom)) for rule in self.subs]
+            ##
+            ## Exclude translations that didn't apply
+            ##
+            translations = [pair for pair in translations if not pair[0] is None]
+            ##
+            ## Choose best translation
+            ##
+            translation = sorted(translations)[-1][-1] if len(translations) > 0 else self.NO_TRANSLATE
+            if len(translation) == 0:
+                continue
+            targetList.append(translation)
+        targetString = " ".join(targetList)
+        for (weight, rule) in sorted((-rule.weight, rule)
+                                     for rule in self.ipasubs):
+            targetString = rule.sub(targetString)
+        return list(targetString.split())
+    def check(self, cfile):
+        ##
+        ## Check that words translate as they should. Returns True if
+        ## they all do (or if there are no words). Unless logevel is
+        ## negative, mismatches are printed.
+        ##
+        ##
+        ## Open the file, regardless of csv formats, excluding comment lines
+        ##
+        (csvsource, dialect) = sniff(cfile)
+        data = csv.reader(csvsource, dialect=dialect)
+        ##
+        ## Iterate over all lines
+        ##
+        allGood = True                                     ## default is that it's all good
+        for values in data:
+            try:
+               word = values[0]
+               shouldbe = values[1]
+            except Exception as ex:
+                errInfo = sys.exc_info()
+                allGood = False
+                traceback.print_exception(*errInfo)
+                print("Error processing verification statement, but resuming processing other statements. Statement details: {}".format(values, ex), file=sys.stderr)
+                continue
+            translation = " ".join(self.translate(word))   ## translate returns a list, change to spaces
+            if self.loglevel > 2:
+                print("Does '{}' translate to '{}'?".format(word, shouldbe), file=sys.stderr)
+            if not shouldbe == translation:                ## if wrong translation
+                allGood = False                            ##    not all translations are good
+                if self.loglevel >= 0:
+                    print("Verification error, '{}' was translated to '{}', not '{}'"
+                          .format(word, translation, shouldbe)
+                          , file=sys.stderr)
+            if self.loglevel > 0:
+                print("Word '{} ({})' translated to '{}'".format(word, shouldbe, translation)
+                      , file=sys.stderr)
+        return allGood
+def concatenate(*seqs):
+    for seq in seqs:
+        for item in seq:
+            yield item
+def main(argv):
+    parser = argparse.ArgumentParser("Translate words to ipa")
+    ##
+    ## Specifies the rules used for trsnslation
+    ##
+    parser.add_argument("-l", "--langrules", dest="langrules",
+                        default="es.rules", type=argparse.FileType('r', encoding="utf8"),
+                        help="language code rules file")
+    ##
+    ## Specifies the log level
+    ##
+    parser.add_argument("-v", "--verbose", dest="loglevel",
+                        default=0, type=int,
+                        help="Error level specification")
+    ##
+    ## Specifies a verifcation file, in any csv format. Headers are
+    ## not expected.  The first columns is supposed to be a word, and
+    ## the second is its ideal translation
+    ##
+    parser.add_argument("-c", "--check", dest="check",
+                        default=None, type=argparse.FileType('r', encoding="utf8"),
+                        help="file to use for verification")
+    ##
+    ## Allows to read date from some file (which should not be compressed)
+    ##
+    parser.add_argument("-r", "--read", dest="read",
+                        default=None, type=argparse.FileType('r', encoding="utf8"),
+                        help="file used for translation (read up to first space)")
+    ##
+    ## Any following words would be translated
+    ##
+    parser.add_argument("words", nargs="*")
+    options = vars(parser.parse_args(argv))
+    a2ipa = alphabet2ipa(options["langrules"], loglevel=options["loglevel"])
+    ##print(options)
+    if "check" in options and not options["check"] is None:
+        allGood = a2ipa.check(options["check"])
+        if not allGood:
+            print("Verification failed, not processing additional data", file=sys.stderr)
+            return []
+    if "read" in options and not options["read"] is None:
+        readwords = (fields.replace(",", " ").split()[0] for fields in options["read"])
+    else:
+        readwords = []
+    words = concatenate(options["words"], readwords)
+    ret = ((word, " ".join(a2ipa.translate(word)))
+           for  word in words)
+    return ret
+if __name__ == "__main__":
+    for output in main(sys.argv[1:]):
+        print("\t".join(output))

Code/translate04.py ADDED Viewed

	@@ -0,0 +1,346 @@

+#!/usr/bin/env python3
+import re
+import argparse
+import sys
+import csv
+import traceback
+from collections import deque, defaultdict
+from math import inf
+def sniff(filestream):
+    ##sample = csv.Sniffer().sniff(filestream.read(1024))
+    lines = list(line for line in filestream if not (line.startswith("#") or len(line) == 0))
+    if all(line.find("\t") >= 0 for line in lines):
+        dialect = csv.get_dialect("excel-tab")
+    else:
+        sample = "\n".join(lines)
+        try:
+            dialect = csv.Sniffer().sniff(sample)
+        except Exception as ex:
+            print("Could not determine delimiter type, proceedings as excel csv", file=sys.stderr)
+            dialect = csv.get_dialect("excel")
+    return (lines, dialect)
class subrule(object):
    """One weighted, context-sensitive substitution rule.

    The rule is built from a rules-file row.  ``{name}`` references in any
    of the fields are replaced by the corresponding entry of ``classes``
    (repeatedly, so classes may refer to other classes).  Afterwards
    ``sfrom`` is compiled as a regular expression, ``precede`` is compiled
    anchored to the end of the preceding context, ``follow`` is compiled
    anchored to the start of the following context, and ``weight`` is
    converted to a float.
    """

    ## Upper bound on expansion rounds per field.  Without it a class that
    ## refers to itself (directly or through other classes) loops forever.
    _MAX_EXPANSIONS = 100

    def __init__(self, values, classes):
        """Build the rule.

        Args:
            values: mapping with the keys "sfrom", "sto", "precede",
                "follow" and "weight" (string values).
            classes: mapping of class name to replacement text.

        Raises:
            ValueError: if class references are still unresolved after
                ``_MAX_EXPANSIONS`` rounds, or if weight is not a number.
            KeyError: if a field refers to an undefined class.
        """
        for key in ["sfrom", "sto", "precede", "follow", "weight"]:
            value = values[key]
            ##
            ## handle classes (and subclasses)
            ##
            rounds = 0
            while re.search("{.*}", value):
                if rounds >= self._MAX_EXPANSIONS:
                    raise ValueError(
                        "Class references in field '{}' do not resolve "
                        "(cyclic class definition?)".format(key))
                value = value.format(**classes)
                rounds += 1
            setattr(self, key, value)
        self.weight = float(self.weight)
        self.sfrom = re.compile(self.sfrom)
        self.precede = re.compile(self.precede + "$")
        self.follow = re.compile("^" + self.follow)

    def subScore(self, sfrom, precede, follow):
        """Return the rule's weight if it applies in this context, else None.

        The rule applies when ``sfrom`` matches at the start of the source
        text, ``precede`` matches the end of the preceding context and
        ``follow`` matches the start of the following context.
        """
        applies = (self.sfrom.match(sfrom)
                   and self.precede.search(precede)
                   and self.follow.search(follow))
        return self.weight if applies else None

    def sub(self, x):
        """Return ``x`` with every match of ``sfrom`` replaced by ``sto``."""
        return self.sfrom.sub(self.sto, x)

    def __repr__(self):
        return repr(self.__dict__)

    def __lt__(self, other):
        """Order rules by weight only."""
        if not isinstance(other, subrule):
            raise Exception("Incompatible types for comparison")
        return self.weight < other.weight
class alphabet2ipa(object):
    """Translate orthographic words to IPA according to a rules file.

    The rules file is a delimited table whose ``type`` column selects how
    a row is interpreted:

    * ``pre``    -- character-for-character replacement (``sfrom`` to
                    ``sto``) applied to a word before anything else
    * ``class``  -- names (``sfrom``) a text fragment (``sto``) that later
                    rows can reference as ``{name}``
    * ``match``  -- direct translation of one character, bypassing the
                    regular-expression rules
    * ``sub``    -- weighted substitution rule (see ``subrule``) applied
                    per character
    * ``ipasub`` -- substitution rule applied to the space-joined output
    * ``word``   -- translation of a whole word (``sto`` is split on
                    whitespace)
    """
    classes = None
    subs = None

    ## Upper bound on class-expansion rounds; guards against class
    ## definitions that refer to themselves and would never resolve.
    _MAX_EXPANSIONS = 100

    def __init__(self, langrules, missing="@", loglevel=0):
        """Load the rules.

        Args:
            langrules: open text stream with the rules; it is used as a
                context manager and therefore closed after reading.
            missing: output used for characters no rule applies to.
            loglevel: verbosity; messages go to stderr.

        A row that cannot be processed is reported on stderr and skipped;
        the remaining rows are still loaded.
        """
        self.classes = dict()
        self.subs = set()
        self.ipasubs = set()
        self.words = dict()
        self.matches = dict()
        self.pre = str.maketrans("", "")
        self.NO_TRANSLATE = missing
        self.loglevel = loglevel
        with langrules as csvsource:
            (lines, dialect) = sniff(csvsource)
            rules = csv.DictReader(lines, dialect=dialect)
            for rule in rules:
                if self.loglevel > 2:
                    print("Rule found:", rule, file=sys.stderr)
                try:
                    ##
                    ## Pre equivalences
                    ##
                    if rule["type"] == "pre":
                        self.pre = str.maketrans(rule["sfrom"], rule["sto"])
                    ##
                    ## Deal with classes
                    ##
                    elif rule["type"] == "class":
                        self.classes[rule["sfrom"]] = rule["sto"]
                    ##
                    ## Deal with match rules
                    ##
                    elif rule["type"] == "match":
                        self.matches[rule["sfrom"]] = self._expand(rule["sto"])
                    ##
                    ## Deal with sub rules
                    ##
                    elif rule["type"] == "sub":
                        self.subs.add(subrule(rule, self.classes))
                    ##
                    ## Deal with IPA sub rules
                    ##
                    elif rule["type"] == "ipasub":
                        self.ipasubs.add(subrule(rule, self.classes))
                    ##
                    ## Deal with whole word substitutions
                    ##
                    elif rule["type"] == "word":
                        self.words[rule["sfrom"]] = rule["sto"].split()
                    ##
                    ## No such rule
                    ##
                    else:
                        print("Unrecognized rule type ({type}), with sfrom={sfrom}, and sto={sto}".format(**rule), file=sys.stderr)
                        continue
                except Exception as ex:
                    ##
                    ## Best effort: report the bad row (traceback plus the
                    ## row and the error text) and keep loading the rest
                    ##
                    traceback.print_exception(*sys.exc_info())
                    print("Error processing rule, but resuming processing other rules. Rule details: {} ({})".format(rule, ex), file=sys.stderr)
                    continue
                if self.loglevel > 1:
                    print("Rule added:", rule, file=sys.stderr)

    def _expand(self, value):
        """Replace ``{name}`` class references in ``value`` until none remain.

        Raises ValueError when references are still present after
        ``_MAX_EXPANSIONS`` rounds.
        """
        rounds = 0
        while re.search("{.*}", value):
            if rounds >= self._MAX_EXPANSIONS:
                raise ValueError("Class references do not resolve (cyclic class definition?)")
            value = value.format(**self.classes)
            rounds += 1
        return value

    def translate(self, source):
        """Translate one word and return its IPA symbols as a list of strings.

        Whole-word rules are looked up first, using the word exactly as
        given.  Otherwise the word is preprocessed and lowercased, each
        character is translated on its own (a match rule if there is one,
        else the highest-weighted applicable sub rule, else the
        ``missing`` marker), and finally the ipasub rules are applied to
        the space-joined result in order of decreasing weight.
        """
        ##
        ## fully translated words
        ##
        if source in self.words:
            return self.words[source]
        ##
        ## preprocess using pre, and turn to lowercase
        ##
        source = source.translate(self.pre).lower()
        sourceList = re.findall(".", source)
        targetList = []
        for (sx, sfrom) in enumerate(sourceList):
            ##
            ## If there's a match rule: translate "as is" (and skip costly regular expressions)
            ##
            if sfrom in self.matches:
                translation = self.matches[sfrom]
            ##
            ## Otherwise, look for all matches
            ##
            else:
                ##
                ## prepare context
                ##
                precede = source[:sx]
                follow = source[sx+1:]
                ##
                ## collect (score, translation) for the rules that apply;
                ## the substitution is only computed for those rules
                ##
                candidates = []
                for rule in self.subs:
                    score = rule.subScore(sfrom, precede, follow)
                    if score is not None:
                        candidates.append((score, rule.sub(sfrom)))
                ##
                ## Choose best translation (highest score; ties are
                ## resolved by the larger translation string)
                ##
                translation = max(candidates)[-1] if candidates else self.NO_TRANSLATE
            ##
            ## An empty translation means the character is dropped
            ##
            if len(translation) == 0:
                continue
            targetList.append(translation)
        targetString = " ".join(targetList)
        for (weight, rule) in sorted((-rule.weight, rule)
                                     for rule in self.ipasubs):
            targetString = rule.sub(targetString)
        return list(targetString.split())

    def check(self, cfile):
        """Verify that sample words translate as expected.

        ``cfile`` is a delimited text stream without header; the first
        column is a word and the second its expected translation (symbols
        separated by single spaces).  Returns True if every row translates
        as expected (or if there are no rows).  Unless loglevel is
        negative, mismatches are printed to stderr.
        """
        ##
        ## Open the file, regardless of csv formats, excluding comment lines
        ##
        (csvsource, dialect) = sniff(cfile)
        data = csv.reader(csvsource, dialect=dialect)
        ##
        ## Iterate over all lines
        ##
        allGood = True                                     ## default is that it's all good
        for values in data:
            try:
                word = values[0]
                shouldbe = values[1]
            except IndexError as ex:
                ##
                ## Row with fewer than two columns: report it, count it
                ## as a failure, and go on with the next row
                ##
                allGood = False
                traceback.print_exception(*sys.exc_info())
                print("Error processing verification statement, but resuming processing other statements. Statement details: {} ({})".format(values, ex), file=sys.stderr)
                continue
            translation = " ".join(self.translate(word))   ## translate returns a list, change to spaces
            if self.loglevel > 2:
                print("Does '{}' translate to '{}'?".format(word, shouldbe), file=sys.stderr)
            if not shouldbe == translation:                ## if wrong translation
                allGood = False                            ##    not all translations are good
                if self.loglevel >= 0:
                    print("Verification error, '{}' was translated to '{}', not '{}'"
                          .format(word, translation, shouldbe)
                          , file=sys.stderr)
            if self.loglevel > 0:
                print("Word '{} ({})' translated to '{}'".format(word, shouldbe, translation)
                      , file=sys.stderr)
        return allGood
def concatenate(*seqs):
    """Lazily yield the items of each iterable in ``seqs``, one iterable after another."""
    for seq in seqs:
        yield from seq
def _first_tokens(stream):
    """Yield the first token of every line of ``stream`` that has one.

    Commas are treated like whitespace.  Lines without any token (blank
    lines, lines holding only commas) are skipped instead of raising
    IndexError.
    """
    for fields in stream:
        tokens = fields.replace(",", " ").split()
        if tokens:
            yield tokens[0]


def main(argv):
    """Command-line driver: translate words to IPA.

    Args:
        argv: list of command-line arguments, without the program name.

    Returns:
        An iterable of ``(word, translation)`` tuples, where translation is
        the space-joined IPA symbols.  The words given on the command line
        come first, followed by the first token of each line of the
        ``--read`` file.  If a ``--check`` file is given and verification
        fails, an empty list is returned and nothing is translated.
    """
    parser = argparse.ArgumentParser("Translate words to ipa")
    ##
    ## Specifies the rules used for translation
    ##
    parser.add_argument("-l", "--langrules", dest="langrules",
                        default="es.rules", type=argparse.FileType('r', encoding="utf8"),
                        help="language code rules file")
    ##
    ## Specifies the log level
    ##
    parser.add_argument("-v", "--verbose", dest="loglevel",
                        default=0, type=int,
                        help="Error level specification")
    ##
    ## Specifies a verification file, in any csv format. Headers are
    ## not expected.  The first column is supposed to be a word, and
    ## the second is its ideal translation
    ##
    parser.add_argument("-c", "--check", dest="check",
                        default=None, type=argparse.FileType('r', encoding="utf8"),
                        help="file to use for verification")
    ##
    ## Allows reading data from some file (which should not be compressed)
    ##
    parser.add_argument("-r", "--read", dest="read",
                        default=None, type=argparse.FileType('r', encoding="utf8"),
                        help="file used for translation (read up to first space)")
    ##
    ## Any following words would be translated
    ##
    parser.add_argument("words", nargs="*")
    options = vars(parser.parse_args(argv))
    a2ipa = alphabet2ipa(options["langrules"], loglevel=options["loglevel"])
    ##
    ## Verification, when requested, gates everything else
    ##
    if options["check"] is not None:
        if not a2ipa.check(options["check"]):
            print("Verification failed, not processing additional data", file=sys.stderr)
            return []
    ##
    ## Words from the --read file are produced lazily, line by line
    ##
    if options["read"] is not None:
        readwords = _first_tokens(options["read"])
    else:
        readwords = []
    words = concatenate(options["words"], readwords)
    return ((word, " ".join(a2ipa.translate(word)))
            for word in words)
if __name__ == "__main__":
    ## Script entry point: hand the command-line arguments (without the
    ## program name) to main() and print each (word, translation) tuple
    ## it produces as one tab-separated line on stdout.
    for pair in main(sys.argv[1:]):
        print("\t".join(pair))

Data/README.md ADDED Viewed

	@@ -0,0 +1,10 @@

+# Data
+Each of the language folders found in this folder (and in the `./_compromised` folder) contains the files listed below. I have linked the relevant sections within the manual for more detail.
+* An `.Rmd` (and corresponding `.html`) file that outlines the language specific [description](https://cohenpr-xpf.github.io/XPF/manual/xpf_manual.pdf#Language%20Descriptions).
+* A `.rules` file that contains the computational [grammar](https://cohenpr-xpf.github.io/XPF/manual/xpf_manual.pdf#Language%20Grammars) needed to translate language specific orthographic characters to their phonemes.
+* A `.verify.csv` file that contains language specific sample words and their translations used to [verify](https://cohenpr-xpf.github.io/XPF/manual/xpf_manual.pdf#Grammar%20Verification) and confirm the validity of the `.rules` file.
+* A `.bib` file that contains the sources referenced within the `.Rmd`.
+The `langs_list.tsv` file is a consolidation of relevant language identifiers and directory paths to make for more efficient analyses.

Data/_compromised/acr_RabinalAchi'/acr.Rmd ADDED Viewed

	@@ -0,0 +1,112 @@

+---
+title: "Rabinal Achi'"
+author: "Emily Strand"
+bibliography: acr.bib
+output: html_document
+---
+Last Updated: 2020-03-30
+**SLIGHTLY COMPROMISED: suspect marking of vowel length**
+# Background
+**Language Family:** Mayan / Quichean-Mamean / Greater Quichean / Quichean / Quiche-Achi
+* Rabinal Achi' is spoken in Guatemala. It is one of two dialects of Achi. The other dialect is known as Cubulco.
+# Phonology
+## Consonants
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+library(dplyr)
+library(knitr)
+library(kableExtra)
+consonants <- read.table(textConnection('
+"Manner of Articulation"    Labial   Alveolar  Postalveolar  Palatal  Velar  Uvular  Glottal
+Stops                       "p ɓ"     "t tʼ"       ""          ""    "k kʼ"  "q qʼ"    "ʔ"
+Affricates                   ""      "ts tsʼ"   "tʃ tʃʼ"       ""      ""     ""       ""
+Fricatives                   ""        "s"        "ʃ"          ""      ""     ""       "h"
+Nasals                       "m"       "n"         ""          ""      ""     ""       ""
+Flaps                        ""        "ɾ"         ""          ""      ""     ""       ""
+Approximants                 "w"       "l"         ""          "j"     ""     ""       ""
+'), TRUE)
+kable(consonants, col.names = c("Manner of Articulation", "Labial", "Alveolar", "Postalveolar", "Palatal", "Velar", "Uvular", "Glottal"), align = 'c') %>%
+  kable_styling("bordered") %>%
+  add_header_above(c("", "Place of Articulation" = 7)) %>%
+  column_spec(1, bold = TRUE) %>%
+  footnote(general = "Note: For phonemes that share a cell, those on the left are plain and those on the right are ejectives. The labial stop on the right is an implosive.", general_title = "")
+```
+## Vowels
+* Vowel length is contrastive in Rabinal Achi' [@Pellicer2005, p. 15]. Long vowels are indicated by duplicate vowel graphemes.
+    - All the literature pertaining to the language suggests the marking of vowel length; however, the output of the Crúbadán corpus (only 33 accounts reflecting vowel length) makes this suspect.
+    - Unlike the languages in the Western branch of Proto-Mayan that have neutralized vowel length, languages in the Eastern branch (including Achi) have not [@England2017].
+    - I have chosen to categorize this language as compromised, given the suspect nature of the vowel length marking.
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+vowels <- read.table(textConnection('
+         Front      Central       Back
+High      "i"         ""           "u"
+Mid       "e"         ""           "o"
+Low        ""         "a"          ""
+'), TRUE)
+kable(vowels, align = 'c') %>%
+  kable_styling("bordered") %>%
+  column_spec(1, bold = TRUE)
+```
+# Alphabet
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+alphabet <- read.table(textConnection('
+Grapheme       Phoneme
+"a"             "/a/"
+"b; b\'"        "/ɓ/"
+"e"             "/e/"
+"i"             "/i/"
+"j"             "/h/"
+"k"             "/k/"
+"k\'"           "/kʼ/"
+"l"             "/l/"
+"m"             "/m/"
+"n"             "/n/"
+"o"             "/o/"
+"p"             "/p/"
+"q"             "/q/"
+"q\'"           "/qʼ/"
+"r"             "/ɾ/"
+"s"             "/s/"
+"t"             "/t/"
+"t\'"           "/tʼ/"
+"u"             "/u/"
+"w"             "/w/"
+"x"             "/ʃ/"
+"y"             "/j/"
+"\'"            "/ʔ/"
+**Digraph**     ""
+"ch"            "/tʃ/"
+"ch\'"          "/tʃʼ/"
+"tz"            "/ts/"
+"tz\'"          "/tsʼ/"
+'), TRUE)
+kable(alphabet, align = 'c') %>%
+  kable_styling("bordered")
+```
+# Misc. Rules
+* Voiceless stops are aspirated word-finally and following consonants [@Solares2008, pp. 18-20].
+* Phonetic glottal stops precede vowels in word-initial positions [@Solares2008, p. 21].
+# References

Data/_compromised/acr_RabinalAchi'/acr.bib ADDED Viewed

	@@ -0,0 +1,29 @@

+@Misc{Brawand1963,
+  author       = {John Brawand and Alice Brawand},
+  title        = {El alfabeto achí, dialecto de Rabinal},
+  howpublished = {SIL},
+  year         = {1963},
+}
+@Book{Pellicer2005,
+  title     = {Los significados de la música: La marimba maya achí de Guatemala},
+  publisher = {Centro de Investigaciones y Estudios Superiores en Antropología Social},
+  year      = {2005},
+  author    = {Sergio Navarrete Pellicer},
+}
+@Misc{Solares2008,
+  author = {Emilsa Solares},
+  title  = {Gramática del idioma Achi},
+  month  = oct,
+  year   = {2008},
+}
+@InBook{England2017,
+  chapter   = {Chapter 7: Phonology and phonetics},
+  title     = {The Mayan Languages},
+  publisher = {Routledge},
+  year      = {2017},
+  author    = {Nora C. England and Brandon O. Baird},
+  editor    = {Judith Aissen and Nora C. England and Roberto Zavala Maldonado},
+}

Data/_compromised/acr_RabinalAchi'/acr.html ADDED Viewed

The diff for this file is too large to render. See raw diff

Data/_compromised/acr_RabinalAchi'/acr.rules ADDED Viewed

	@@ -0,0 +1,28 @@

+# Rabinal Achi' Rule Set
+# Written by: Emily
+# Last Updated: 2020-03-30
+type,sfrom,sto,weight,precede,follow,comment
+# Class Rules
+class,passthrough,[aeiklmnopqstuw],,,,
+class,apostrophe,['ꞌˈ‘’],,,,
+class,eject,(k|q|t|tʃ|ts),,,
+class,vowels,[aeiou],,,,
+# Individual Letters
+sub,b,ɓ,2,,,
+sub,b,ɓ,3,,{apostrophe},
+sub,{apostrophe},,3,b,,"clean-up",
+sub,j,h,2,,,
+sub,r,ɾ,2,,,
+sub,x,ʃ,2,,,
+sub,y,j,2,,,
+sub,{apostrophe},ʔ,2,,,
+sub,({passthrough}),\1,2,,,
+# Digraphs
+sub,c,tʃ,3,,h,
+sub,h,,3,c,,"clean-up",
+sub,t,ts,3,,z,
+sub,z,,3,t,,"clean-up",
+# Ejectives
+ipasub,({eject}) ʔ,\1ʼ,2,,,
+# Vowel Length
+ipasub,({vowels}) \1,\1 ː,2,,,

Data/_compromised/acr_RabinalAchi'/acr.verify.csv ADDED Viewed

	@@ -0,0 +1,20 @@

+beexex,ɓ e ː ʃ e ʃ,
+eyaj,e j a h,
+iis,i ː s,
+k'ij,kʼ i h,
+me's,m e ʔ s,
+no's,n o ʔ s,
+ok',o kʼ,
+ooj,o ː h,
+pix,p i ʃ,
+quej,q u e h,
+q'uel,qʼ u e l,
+rakana',ɾ a k a n a ʔ,
+sutz',s u tsʼ,
+t'ot',tʼ o tʼ,
+tzi,ts i,
+tz'i',tsʼ i ʔ,
+wuch',w u tʃʼ,
+xan,ʃ a n,
+ya',j a ʔ,
+che',tʃ e ʔ,

Data/_compromised/ake_Akawaio/ake.Rmd ADDED Viewed

	@@ -0,0 +1,113 @@

+---
+title: "Akawaio"
+author: "Emily Strand"
+bibliography: ake.bib
+output: html_document
+---
+Last Updated: 2020-06-26
+**COMPROMISED: graphemes normally representing voiced stops and /z/ are not present in the orthography - ambiguity when voiceless stops and /s/ transcribe to voiced counterparts (underlying vs. surface level)**
+# Background
+**Language Family:** Carib / Northern / East-West Guiana / Macushi-Kapon / Kapon
+* Akawaio is spoken in Guyana.
+# Phonology
+* Although voiced consonants are said to be originally conditioned (voiceless to voiced) intervocalically or between vowels and nasals, they are considered phonemes of the language [@Gildea2012, p. 450; @Edwards1978, p. 78].
+## Consonants
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+library(dplyr)
+library(knitr)
+library(kableExtra)
+consonants <- read.table(textConnection('
+"Manner of Articulation"   Labial   Alveolar  Palatal    Velar
+Stops                      "p b"     "t d"      ""       "k ɡ"
+Fricatives                  ""       "s z"      ""        ""
+Nasals                      "m"       "n"       ""        ""
+Flaps                       ""        "ɾ"       ""        ""
+Approximants                "w"       ""        "j"       ""
+'), TRUE)
+kable(consonants, col.names = c("Manner of Articulation", "Labial", "Alveolar", "Palatal", "Velar"), align = 'c') %>%
+  kable_styling("bordered") %>%
+  add_header_above(c("", "Place of Articulation" = 4)) %>%
+  column_spec(1, bold = TRUE) %>%
+  footnote(general = "Note: For phonemes that share a cell, those on the left are voiceless and those on the right are voiced.", general_title = "")
+```
+## Vowels
+* Instead of using /ʌ/ to represent the mid-central unrounded vowel as in @Edwards1978, I have opted for /ɘ/ (p. 77). /ə/ is not a suitable option given that all underlying vowels in Akawaio occur in stressed syllables.
+* Diphthongs are present in Akawaio; however, because they are often interpreted as being separated by off-glides, or part of separate syllables, I have chosen not to include them below (ibid).
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+vowels <- read.table(textConnection('
+           Front      Central       Back
+High        "i"         "ɨ"          "u"
+Mid         "e"         "ɘ"          "o"
+Low         ""          "a"          ""
+'), TRUE)
+kable(vowels, align = 'c') %>%
+  kable_styling("bordered") %>%
+  column_spec(1, bold = TRUE)
+```
+# Alphabet
+* Apostrophes indicate glottal stops [@Stegeman2014]; however, the glottal stop is not phonemic (see Lenition Rules below). It will be transcribed to /k/.
+* The orthography doesn't represent the voiced stops [@Stegeman2014, p. 2], and it is unclear when the voiceless stops should transcribe to the voiced stops (i.e. there is too much ambiguity around whether the resulting (voiced consonant) transcription is underlying or a surface representation - see Lenition Rules below), thus the language is compromised. I have chosen to transcribe ⟨p⟩, ⟨t⟩, ⟨k⟩, and ⟨s⟩ to their voiceless variants by default given their originating status in the language, but this means that /b/, /d/, /ɡ/, and /z/ are not represented by the rules.
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+alphabet <- read.table(textConnection('
+Grapheme       Phoneme
+"a"             "/a/"
+"e"             "/e/"
+"i"             "/i/"
+"ɨ"             "/ɨ/"
+"k"             "/k/"
+"m"             "/m/"
+"n"             "/n/"
+"o"             "/o/"
+"p"             "/p/"
+"r"             "/ɾ/"
+"s"             "/s/"
+"t"             "/t/"
+"u"             "/u/"
+"ʉ"             "/ɘ/"
+"w"             "/w/"
+"y"             "/j/"
+"\'"            "/k/"
+'), TRUE)
+kable(alphabet, align = 'c') %>%
+  kable_styling("bordered")
+```
+# Lenition Rules
+* /k/ debuccalizes to [ʔ] syllable-finally [@Edwards1978, p. 79].
+* Voiceless stops and /s/ become voiced intervocalically [@Edwards1978, p. 81].
+* Voiced stops optionally spirantize [@Gildea2012, p. 450].
+* Unstressed vowels are often deleted, especially word-initially [@Edwards1978, p. 82].
+# Misc. Rules
+* /n/ is realized as [ŋ] word-finally and syllable-finally, preceding velar consonants and /w/ [@Edwards1978, p. 79].
+* /k/ becomes palatalized or labialized when preceding /i/ or /u/, respectively (ibid.).
+* /s/ and /z/ are realized as [tʃ] and [dʒ], respectively, preceding /i/ (ibid.).
+# References

Data/_compromised/ake_Akawaio/ake.bib ADDED Viewed

	@@ -0,0 +1,40 @@

+@book{Stegeman2014,
+  author     = {Ray Stegeman and Rita Hunter},
+  pages      = {207},
+  publisher  = {SIL International},
+  title      = {Akawaio-English Dictionary},
+  year       = {2014}
+}
+@article{Edwards1978,
+  address    = {Bloomington},
+  author     = {Walter F. Edwards},
+  journal    = {Anthropological Linguistics},
+  number     = {2},
+  pages      = {77-84},
+  title      = {Some synchronic and diachronic aspects of Akawaio phonology},
+  volume     = {20},
+  year       = {1978}
+}
+@article{Edwards1078_sketch,
+  author = {Edwards, Walter F.},
+  title = {A Preliminary Sketch of Arekuna (Carib) Phonology},
+  journal = {International Journal of American Linguistics},
+  volume = {44},
+  number = {3},
+  pages = {223-227},
+  year = {1978}
+}
+@InBook{Gildea2012,
+  chapter   = {Linguistic studies in the Cariban family},
+  pages     = {441-494},
+  title     = {The Indigenous Languages of South America: A Comprehensive Guide (The World of Linguistics)},
+  publisher = {Mouton De Gruyter},
+  year      = {2012},
+  author    = {Spike Gildea},
+  editor    = {Lyle Campbell and Verónica Grondona},
+  isbn      = {978-3-11-025513-3},
+  url       = {https://www.amazon.com/Indigenous-Languages-South-America-Comprehensive/dp/3110255138?SubscriptionId=AKIAIOBINVZYXZQZ2U3A&tag=chimbori05-20&linkCode=xm2&camp=2025&creative=165953&creativeASIN=3110255138},
+}

Data/_compromised/ake_Akawaio/ake.html ADDED Viewed

The diff for this file is too large to render. See raw diff

Data/_compromised/ake_Akawaio/ake.rules ADDED Viewed

	@@ -0,0 +1,13 @@

+# Akawaio Rule Set
+# Written by: Emily Strand
+# Last updated: 2020-02-04
+type,sfrom,sto,weight,precede,follow,comment
+# Class Rules
+class,passthrough,[aeiɨkmnopstuw],,,,
+class,apostrophe,['ꞌˈ‘’],,,,
+# Individual Letters
+sub,r,ɾ,2,,,
+sub,ʉ,ɘ,2,,,
+sub,y,j,2,,,
+sub,{apostrophe},k,2,,,
+sub,({passthrough}),\1,2,,,

Data/_compromised/ake_Akawaio/ake.verify.csv ADDED Viewed

	@@ -0,0 +1,30 @@

+ɨkena',ɨ k e n a k,
+ɨkʉi,ɨ k ɘ i,
+ɨ'se'ne',ɨ k s e k n e k,
+musun,m u s u n,
+nari',n a ɾ i k,
+nau,n a u,
+nawon,n a w o n,
+nɨpontɨ,n ɨ p o n t ɨ,
+nonka,n o n k a,
+piyau,p i j a u,
+pɨ're,p ɨ k ɾ e,
+pe,p e,
+uya,u j a,
+rɨ,ɾ ɨ,
+kon,k o n,
+mɨrɨ,m ɨ ɾ ɨ,
+pʉra,p ɘ ɾ a,
+to,t o,
+awonsi'kɨ,a w o n s i k k ɨ,
+amʉ,a m ɘ,
+iya,i j a,
+yau,j a u,
+ton,t o n,
+ta'pʉ,t a k p ɘ,
+esi,e s i,
+iyesi,i j e s i,
+nɨ,n ɨ,
+a'tai,a k t a i,
+pen,p e n,
+mɨrɨpan,m ɨ ɾ ɨ p a n,

Data/_compromised/amp_Alamblak/amp.Rmd ADDED Viewed

	@@ -0,0 +1,125 @@

+---
+title: "Alamblak"
+author: "Abi Creighton"
+bibliography: amp.bib
+output: html_document
+---
+Last Updated: 2020-08-11
+**COMPROMISED: conflation between /ɘ/ and /o/**
+# Background
+**Language Family:** Sepik-Ramu / Sepik / Sepik Hill / Alamblak
+* Alamblak is spoken in the Angoram District of the East Sepik Province in Papua New Guinea.
+# Phonology
+## Consonants
+* The phonemic status of the palato-alveolar (or postalveolar) consonants is somewhat suspect. Apart from the ones I include below (/dʒ/, /ɲ/, and /j/), this includes /tʃ/ and /ʃ/ [@bruce_1984, p. 21]. @bruce_1984 explains that they almost always result from phonological processes imposed on underlying alveolar consonants (p. 29). The exceptions to this, i.e. the idiosyncratic surfacing of the postalveolars, are what account for the lack of consensus. I have ultimately chosen to follow the analysis done by @bruce_1975, which excludes /tʃ/ and /ʃ/ on the basis that they are predictable variants of /s/ [p. 101; @bruce_1984, p. 30]. Occurrences of /tʃ/ and /ʃ/ are interpreted as residual forms that have yet to fully collapse.
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+library(dplyr)
+library(knitr)
+library(kableExtra)
+consonants <- read.table(textConnection('
+"Manner of Articulation"   Bilabial   Alveolar  Postalveolar  Palatal   Velar
+Stops                      "p b"       "t d"         ""         ""      "k ɡ"
+Affricates                  ""          ""          "dʒ"        ""       ""
+Fricatives                  "ɸ"         "s"          ""         ""       "x"
+Nasals                      "m"         "n"          ""         "ɲ"      ""
+Flaps                       ""          "ɾ"          ""         ""       ""
+Approximants                "w"         ""           ""         "j"      ""
+'), TRUE)
+kable(consonants, col.names = c("Manner of Articulation", "Bilabial", "Alveolar", "Postalveolar", "Palatal", "Velar"), align = 'c') %>%
+  kable_styling("bordered") %>%
+  add_header_above(c("", "Place of Articulation" = 5)) %>%
+  column_spec(1, bold = TRUE) %>%
+  footnote(general = "Note: For phonemes that share a cell, those on the left are voiceless and those on the right are voiced", general_title = "")
+```
+## Vowels
+* There is some uncertainty around the mid central vowel. @edmiston_2003 specify it as /ə/ (p. 3), while @bruce_1984 specifies it as /ë/ (p. 34). Based on the minimal pairs Bruce provides through monosyllabic words, it's evident that this mid vowel may bear stress, thus making it not fully reduced. Schwa, therefore, would be an inappropriate representation of this vowel. However, /ë/ is also not ideal in that it doesn't adequately reflect IPA, so based on Bruce's description of a mid unrounded vowel, I have chosen to use /ɘ/.
+* @bruce_1984 also includes /ɨ/ in Alamblak's vowel inventory, but questions its phonemic status, suggesting that occurrences may be epenthetic (pp. 39, 61). Because @edmiston_2003 make no mention of it, I have chosen not to present it in the vowel inventory below.
+* Diphthongs may occur phonetically [@bruce_1984, p. 55].
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+vowels <- read.table(textConnection('
+             Front      Central       Back
+High          "i"         ""           "u"
+Mid           "e"         "ɘ"          "o"
+Low           ""          "a"          ""
+'), TRUE)
+kable(vowels, align = 'c') %>%
+  kable_styling("bordered") %>%
+  column_spec(1, bold = TRUE)
+```
+# Alphabet
+* ⟨o⟩ is used to represent both /o/ and /ɘ/, which compromises the language [@edmiston_2003, p. 1]. ⟨o⟩ transcribes to /o/ by default in the rule set.
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+alphabet <- read.table(textConnection('
+Grapheme        Phoneme           Comment
+"a"             "/a/"              ""
+"b"             "/b/"              ""
+"d"             "/d/"              ""
+"e"             "/e/"              ""
+"f"             "/ɸ/"              ""
+"g"             "/ɡ/"              ""
+"h"             "/h/"              ""
+"i"             "/i/"              ""
+"j"             "/dʒ/"             ""
+"k"             "/k/"              ""
+"m"             "/m/"              ""
+"n"             "/n/"              ""
+"o"             "/o/; /ɘ/"        "/o/: default in the rules"
+"p"             "/p/"              ""
+"r"             "/ɾ/"              ""
+"s"             "/s/"              ""
+"t"             "/t/"              ""
+"u"             "/u/"              ""
+"w"             "/w/"              ""
+"y"             "/j/"              ""
+**Digraph**      ""                ""
+"ny"            "/ɲ/"              ""
+'), TRUE)
+kable(alphabet, align = 'c') %>%
+  kable_styling("bordered")
+```
+# Syllable Structure
+* Alamblak has three basic syllable structures [@bruce_1984, p. 61]:
+    - C(C)(C)V(C)(C)
+    - V(C)(C)
+    - CVV(C)
+# Lenition Rules
+* /ɸ/ voices to [β] word-medially [@edmiston_2003, p. 4].
+* /x/ voices to [ɣ] word-medially following a voiced phoneme (ibid.).
+# Misc. Rules
+* /n/ assimilates to [ŋ] preceding [ɡ] [@edmiston_2003, p. 4].
+* Low vowel dissimilation is present in Alamblak; /a/ raises to [ə] when followed by another syllable containing /a/ [@blevins_2009, p. 479].
+* An epenthetic [j] is inserted between vowel sequences which are not permitted in Alamblak [@bruce_1984, p. 54].
+* An epenthetic [ɨ] may be inserted optionally in consonant clusters [@bruce_1984, pp. 56-57].
+# References

Data/_compromised/amp_Alamblak/amp.bib ADDED Viewed

	@@ -0,0 +1,62 @@

+@techreport{edmiston_2003,
+	title = {Alamblak Organised Phonology Data},
+	url = {https://www.sil.org/resources/archives/42279},
+	author = {Edmiston, Melenda and Edmiston, Patrick},
+	year = {2003}
+}
+@phdthesis{bruce_1979,
+  author     = {Bruce Jr., Leslie P.},
+  pages      = {572},
+  school     = {Australian National University},
+  title      = {A Grammar of Alamblak (Papua New Guinea)},
+  year       = {1979}
+}
+@book{bruce_1984,
+  address               = {Canberra},
+  author                = {Bruce Jr., Leslie P.},
+  number                = {81},
+  pages                 = {iv+361},
+  publisher             = {Research School of Pacific and Asian Studies, Australian National University},
+  series                = {Pacific Linguistics: Series {C}},
+  title                 = {The Alamblak Language of Papua New Guinea (East Sepik)},
+  volume                = {81},
+  year                  = {1984}
+}
+@article{blevins_2009,
+	title = {Low {Vowel} {Dissimilation} {Outside} of {Oceanic}: {The} {Case} of {Alamblak}},
+	volume = {48},
+	copyright = {University of Hawai'i Press},
+	url = {http://www.jstor.com/stable/40783539},
+	number = {2},
+	journal = {Oceanic Linguistics},
+	author = {Blevins, Juliette},
+	month = dec,
+	year = {2009},
+	pages = {477--483}
+}
+@article{dye_1968,
+	title = {The {Sepik} {Hill} {Languages}: {A} {Preliminary} {Report}},
+	volume = {39},
+	url = {http://www.jstor.com/stable/40329762},
+	number = {2},
+	journal = {Oceania},
+	author = {Dye, W and Townsend, P and Townsend, W},
+	month = dec,
+	year = {1968},
+	pages = {146--156}
+}
+@InBook{bruce_1975,
+  author    = {Bruce Jr., Leslie P.},
+  booktitle     = {Papers in New Guinea Linguistics No. 18},
+  publisher = {Pacific Linguistics, The Australian National University},
+  year      = {1975},
+  editor    = {Conrad, R. and Dye, W. and Thomson, N. P. and Bruce Jr., L. P.},
+  title   = {Alamblak Alveopalatals - Dead Portmanteaus},
+  pages     = {91-102},
+}

Data/_compromised/amp_Alamblak/amp.html ADDED Viewed

The diff for this file is too large to render. See raw diff

Data/_compromised/amp_Alamblak/amp.rules ADDED Viewed

	@@ -0,0 +1,19 @@

+# Alamblak Rule Set
+# Written by Abi Creighton
+# Last updated: 2020-08-11
+type,sfrom,sto,weight,precede,follow,comment
+# Class Rules
+class,passthrough,[abdehikmnopstuw],,,,
+class,punct,['ꞌ‘’-],,,,
+# Individual Letters
+sub,f,ɸ,1,,,
+sub,g,ɡ,1,,,
+sub,j,dʒ,1,,,
+sub,r,ɾ,1,,,
+sub,y,j,1,,,
+sub,({passthrough}),\1,0.1,,,
+# Multigraphs
+sub,n,ɲ,2,,y,
+sub,y,,2,n,,"clean-up"
+# Misc. Rules
+sub,{punct},,1,,,

Data/_compromised/amp_Alamblak/amp.verify.csv ADDED Viewed

	@@ -0,0 +1,30 @@

+nhai,n h a i,"<a>"
+bi,b i,"<b>"
+duka,d u k a,"<d>"
+be,b e,"<e>"
+tfit,t ɸ i t,"<f>"
+hingrneft,h i n ɡ ɾ n e ɸ t,"<g>"
+toh,t o h,"<h>"
+yima,j i m a,"<i>"
+inji,i n dʒ i,"<j>"
+kfo,k ɸ o,"<k>"
+wom,w o m,"<m>"
+na,n a,"<n>"
+mrokfot,m ɾ o k ɸ o t,"<o>"
+rpat,ɾ p a t,"<p>"
+bro,b ɾ o,"<r>"
+fasoh,ɸ a s o h,"<s>"
+tu,t u,"<t>"
+yuhat,j u h a t,"<u>"
+wanyhato,w a ɲ h a t o,"<w>"
+yak,j a k,"<y>"
+hanyhato,h a ɲ h a t o,"<ny>"
+afo,a ɸ o,
+yuhum,j u h u m,
+finji,ɸ i n dʒ i,
+hik,h i k,
+turhu,t u ɾ h u,
+memom,m e m o m,
+yiha,j i h a,
+rim,ɾ i m,
+hingrna,h i n ɡ ɾ n a,

Data/_compromised/aoj_Mufian/aoj.Rmd ADDED Viewed

	@@ -0,0 +1,109 @@

+---
+title: "Mufian"
+author: "Emily Strand"
+bibliography: aoj.bib
+output: html_document
+---
+Last Updated: 2020-01-09
+**COMPROMISED: conflation among /a/, /æ/, and /ɘ/, between /ɘ/ and /o/, and between /e/ and /ɘ/; ambiguity among long vowels; ambiguity related to whether phonemic (labialized) clusters are always realized as such; ambiguity between word-medial phonetic prenasalized stops and voiced (and voiceless) stops**
+# Background
+**Language Family:** Torricelli / Kombio-Arapesh / Arapesh
+* Mufian is spoken in the East Sepik province of Papua New Guinea.
+# Phonology
+## Consonants
+* @Conrad1977 do not include the labialized consonants as part of the phonemic inventory, but note that they contrast with their non-labialized counterparts (p. 3). Given this and that other sources [e.g. @Conrad1992; @Conrad1978] include them, I have chosen to include them below.
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+library(dplyr)
+library(knitr)
+library(kableExtra)
+consonants <- read.table(textConnection('
+"Manner of Articulation"   Labial    Alveolar   Velar     Glottal
+Stops                       "p b"     "t d"  "k kʷ ɡ ɡʷ"   "ʔ ʔʷ"
+Fricatives                   "f"       "s"       ""         "h"
+Nasals                       "m"       "n"       ""         ""
+Approximants                 "w"       "l"       ""         ""
+'), TRUE)
+kable(consonants, col.names = c("Manner of Articulation", "Labial", "Alveolar", "Velar", "Glottal"), align = 'c') %>%
+  kable_styling("bordered") %>%
+  add_header_above(c("", "Place of Articulation" = 4)) %>%
+  column_spec(1, bold = TRUE) %>%
+  footnote(general = "Note: For phonemes that share a cell, those on the left are voiceless and those on the right are voiced. Phonemes that have the diacritic (ʷ) are labialized.", general_title = "")
+```
+## Vowels
+* @Conrad1992 as well as @Conrad1977 include /ə/ as a phoneme in Mufian (p. 2; p. 3); however, I've opted for /ɘ/ given that @Conrad1978 describe it as a high to mid-central vowel (p. 90).
+* Adjacent vowels are interpreted as sequences [@Conrad1977, pp. 6-7].
+* The documentation for long vowels is ambiguous. @Conrad1977 state that they are rather infrequent, citing length for only /i/, /a/, and /æ/ (contrastive occurrences for only /a/ and /æ/) [@Conrad1977, pp. 9-10, 23]. @Conrad1992 also lists long vowels, but for /a/, /e/, and /æ/ (p. 2); however, they note that long vowels are transcribed just as short vowels are. Because they are rather infrequent and have a low functional load, I have opted not to include them.
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+vowels <- read.table(textConnection('
+         Front      Central       Back
+High      "i"         ""           "u"
+Mid       "e"         "ɘ"          "o"
+Low       "æ"         "a"          ""
+'), TRUE)
+kable(vowels, align = 'c') %>%
+  kable_styling("bordered") %>%
+  column_spec(1, bold = TRUE)
+```
+# Alphabet
+* The digraphs ⟨mb⟩ and ⟨nd⟩ represent word-medial phonetic prenasalization, so I transcribe them as /b/ and /d/, respectively [@Conrad1977, p. 6]. The prenasalized /ɡ/ is written as ⟨g⟩, so it requires no correction.
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+alphabet <- read.table(textConnection('
+Grapheme      Phoneme                      Comment
+"a"        "/a/; /æ/; /ɘ/"       "/a/: default in the rules"
+"b"             "/b/"                        ""
+"d"             "/d/"                        ""
+"e"           "/e/; /ɘ/"         "/e/: default in the rules"
+"f"             "/f/"                        ""
+"g"             "/ɡ/"                        ""
+"h"             "/h/"                        ""
+"i"             "/i/"                        ""
+"k"             "/k/"                        ""
+"l"             "/l/"                        ""
+"m"             "/m/"                        ""
+"n"             "/n/"                        ""
+"o"           "/o/; /ɘ/"           "/o/: default in the rules"
+"p"             "/p/"                        ""
+"s"             "/s/"                        ""
+"t"             "/t/"                        ""
+"u"             "/u/"                        ""
+"w"             "/w/"                        ""
+"\'"            "/ʔ/"                        ""
+**Digraph**      ""                          ""
+"mb"            "/b/"                        ""
+"nd"            "/d/"                        ""
+"gw"            "/ɡʷ/"                       ""
+"kw"            "/kʷ/"                       ""
+"\'w"           "/ʔʷ/"                       ""
+'), TRUE)
+kable(alphabet, align = 'c') %>%
+  kable_styling("bordered")
+```
+# Misc. Rules
+* Voiceless consonants are aspirated word-finally [@Conrad1977, pp. 12-13].
+# References

Data/_compromised/aoj_Mufian/aoj.bib ADDED Viewed

	@@ -0,0 +1,27 @@

+@incollection{Conrad1978,
+  address    = {Ukarumpa},
+  author     = {Conrad, Robert J. and Lukas, Joshua and Alungum, John},
+  booktitle  = {Miscellaneous papers on Dobu and Arapesh},
+  editor     = {Richard Loving},
+  pages      = {89-130},
+  publisher  = {Summer Institute of Linguistics},
+  series     = {Workpapers in Papua New Guinea Languages},
+  title      = {Some Muhiang grammatical notes},
+  url        = {http://www.sil.org/pacific/png/abstract.asp?id=15292},
+  volume     = {25},
+  year       = {1978}
+}
+@Misc{Conrad1977,
+  author = {Conrad, Robert J. and Lukas, Joshua and Alungum, John},
+  title  = {Preliminary Phonology of Mufian (Southern Arapesh)},
+  year   = {1977},
+}
+@Misc{Conrad1992,
+  author       = {Conrad, Robert J.},
+  title        = {Mufian Organised Phonology Data},
+  howpublished = {SIL},
+  month        = mar,
+  year         = {1992},
+}

Data/_compromised/aoj_Mufian/aoj.html ADDED Viewed

The diff for this file is too large to render. See raw diff

Data/_compromised/aoj_Mufian/aoj.rules ADDED Viewed

	@@ -0,0 +1,25 @@

+# Mufian Rule Set
+# Written by: Emily
+# Last Updated: 2020-01-09
+type,sfrom,sto,weight,precede,follow,comment
+# Class Rules
+class,apostrophe,['ꞌ‘’],,,,
+class,passthrough,[abdefhiklmnopstuw],,,,
+class,w-preceder,[gk'ꞌ‘’],,,,
+# Individual Letters
+#sub,a,æ,2,,,"conflation among /a/ and /ɘ/ - /a/ is used as default transcription in the passthrough class rule",
+#sub,a,ɘ,2,,,
+#sub,e,ɘ,2,,,"conflation with /e/ - /e/ is used as default transcription in the passthrough class rule",
+#sub,o,ɘ,2,,,"conflation with /o/ - /o/ is used as default transcription in the passthrough class rule",
+sub,g,ɡ,2,,,
+sub,{apostrophe},ʔ,2,,,
+sub,({passthrough}),\1,2,,,
+# Digraphs
+sub,m,b,3,,b,
+sub,b,,3,m,,"clean-up",
+sub,n,d,3,,d,
+sub,d,,3,n,,"clean-up",
+sub,g,ɡʷ,3,,w,
+sub,k,kʷ,3,,w,
+sub,{apostrophe},ʔʷ,3,,w,
+sub,w,,3,{w-preceder},,"clean-up",

Data/_compromised/aoj_Mufian/aoj.verify.csv ADDED Viewed

	@@ -0,0 +1,30 @@

+papi,p a p i,
+ambuta,a b u t a,
+naep,n a e p,
+owamb,o w a b,
+waulu'mana,w a u l u ʔ m a n a,
+tata,t a t a,
+kwa'ah,kʷ a ʔ a h,
+gani,ɡ a n i,
+isag,i s a ɡ,
+dindigina,d i d i ɡ i n a,
+ondop,o d o p,
+wambele'w,w a b e l e ʔʷ,
+gwagwi,ɡʷ a ɡʷ i,
+safe',s a f e ʔ,
+lawah,l a w a h,
+ukup,u k u p,
+ma,m a,
+basef,b a s e f,
+ea',e a ʔ,
+owa',o w a ʔ,
+ina,i n a,
+esis,e s i s,
+waf,w a f,
+na'i,n a ʔ i,
+ipa',i p a ʔ,
+anen,a n e n,
+ae',a e ʔ,
+epes,e p e s,
+si'i,s i ʔ i,
+dei',d e i ʔ,

Data/_compromised/ar_Arabic/ar.Rmd ADDED Viewed

	@@ -0,0 +1,155 @@

+---
+title: "Arabic"
+author: "Emily Strand"
+bibliography: ar.bib
+output: html_document
+---
+Last Updated: 2019-12-05
+**COMPROMISED: some ambiguity in the transcription of alif; some conflation between /w/ and /uː/ and between /j/ and /iː/**
+# Background
+**Language Family:** Afro-Asiatic / Semitic / Central / South / Arabic
+* Arabic is considered an overarching classification of all the dialectal varieties [@Boudelaa2010, p. 482]. Given this, I have chosen to address the Modern Standard variety, which is predominantly written or used in formal communication. Thus, Arabic exhibits diglossia, where written text (and formal communication) differs from what is actually spoken in everyday life. @Boudelaa2010 explain that Modern Standard Arabic is considered the "high" variety, and all the regional dialects are considered the "low" varieties (p. 482). Because of this distinction, the phonemic inventory for Modern Standard Arabic differs somewhat from those of the regional dialects.
+* Arabic is a widely spoken language with speakers primarily residing in either Asia, the Middle East, or North Africa.
+# Phonology
+## Consonants
+* Arabic includes what are called emphatic consonants, which are produced when the back, or the root, of the tongue retracts towards the pharynx [@Amayreh1998, p. 643].
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+library(dplyr)
+library(knitr)
+library(kableExtra)
+consonants <- read.table(textConnection('
+"Manner of Articulation"   Labial  Dental   Alveolar   Postalveolar   Palatal    Velar   Uvular    Pharyngeal    Glottal
+"Stops (plain)"            "b"       ""    "t tˤ d dˤ"    ""            ""        "k"      "q"         ""          "ʔ"
+Affricates                 ""        ""       ""          "dʒ"          ""        ""       ""          ""          ""
+Fricatives                 "f"    "θ ð ðˤ"  "s sˤ z"      "ʃ"           ""       "x ɣ"     ""         "ħ ʕ"        "h"
+Nasals                     "m"       ""       "n"         ""            ""        ""       ""          ""          ""
+Trills                     ""        ""       "r"         ""            ""        ""       ""          ""          ""
+Approximants               "w"       ""       "l"         ""            "j"       ""       ""          ""          ""
+'), TRUE)
+kable(consonants, col.names = c("Manner of Articulation", "Labial", "Dental", "Alveolar", "Postalveolar", "Palatal", "Velar", "Uvular", "Pharyngeal", "Glottal"), align = 'c') %>%
+  kable_styling("bordered") %>%
+  add_header_above(c("", "Place of Articulation" = 9)) %>%
+  footnote("Note: For phonemes that share a cell, those on the left are voiceless and those on the right are voiced. Phonemes that have the diacritic (ˤ) are emphatic.", general_title = "") %>%
+  column_spec(1, bold = TRUE)
+```
+## Vowels
+* Vowel length is contrastive in Arabic [@Amayreh1998, p. 643].
+* /e/ and /o/ exist in spoken varieties of Arabic, but not in Modern Standard Arabic [@Ibrahim2002, p. 323].
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+consonants <- read.table(textConnection('
+          Front   Central   Back
+High      "i"     ""        "u"
+Low       ""      "a"       ""
+'), TRUE)
+kable(consonants, align = 'c') %>%
+  kable_styling("bordered") %>%
+  column_spec(1, bold = TRUE)
+diphthongs <- read.table(textConnection('
+Diphthongs
+"/aj/, /aw/"
+'), TRUE)
+kable(diphthongs, align = 'c') %>%
+  kable_styling("bordered")
+```
+# Alphabet
+* Arabic is written from right to left [@Ibrahim2002, p. 323].
+* The majority of the graphemes have different forms depending on where they appear in a word in relation to other graphemes (ibid.). Graphemes may have up to four different forms for the word initial, medial, and final position as well as for the isolated form. The isolated forms for all the graphemes are represented below.
+    - Although the diacritics are often seen as orthographically separate from the isolated forms (i.e. the diacritic followed by the base grapheme), some grapheme and diacritic combinations are seen as individual units. For example, the alif ⟨ا⟩ with an overhead hamza ⟨ء⟩, is often represented as ⟨أ⟩ rather than ⟨ ٔا⟩. Other "permanent" grapheme diacritic combinations include: ⟨آ⟩  ,⟨ئ⟩  ,⟨ؤ⟩, and ⟨إ⟩.
+* Long vowels are indicated by specific graphemes; short vowels, however, are indicated by diacritics [@Awde2000, p. 18].
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+alphabet <- read.table(textConnection('
+Grapheme  Phoneme      Comment
+"ا"       "/aː/; /ʔ/"   "/ʔ/: word-initially (not always marked, which somewhat compromises the language)"
+"ب"       "/b/"         ""
+"ت"       "/t/"         ""
+"ث"       "/θ/"         ""
+"ج"       "/dʒ/"        ""
+"ح"       "/ħ/"         ""
+"خ"       "/x/"         ""
+"د"       "/d/"         ""
+"ذ"       "/ð/"         ""
+"ر"       "/r/"         ""
+"ز"       "/z/"         ""
+"س"       "/s/"         ""
+"ش"       "/ʃ/"         ""
+"ص"       "/sˤ/"        ""
+"ض"       "/dˤ/"        ""
+"ط"       "/tˤ/"        ""
+"ظ"       "/ðˤ/"        ""
+"ع"       "/ʕ/"         ""
+"غ"       "/ɣ/"         ""
+"ف"       "/f/"         ""
+"ق"       "/q/"         ""
+"ك"       "/k/"         ""
+"ل"       "/l/"         ""
+"م"       "/m/"         ""
+"ن"       "/n/"         ""
+"ه"       "/h/"         ""
+"و"       "/w/; /uː/"   "/w/: word-initially (used as default in the rules); /uː/: preceded by a short /u/ diacritic"
+"ي"       "/j/; /iː/"   "/j/: word-initially (used as default in the rules); /iː/: preceded by a short /i/ diacritic"
+"ء"       "/ʔ/"         "called a hamza, this grapheme also exists as a diacritic (explained below)"
+"ة"       "∅; /t/"      "called a ta-marbuta, this grapheme appears word-finally, corresponding to /t/ if followed by a diacritic or ∅ otherwise [@Biadsy2009, p. 3]"
+"ى"       "/a/"         "called an alif-maqsura, this grapheme occurs word-finally [@Habash2010, p. 11; @Biadsy2009, p. 3]"
+**Diacritic**    ""     ""
+"ُ"       "/u/"          "this diacritic is called a dammah [@Yurtbasi2016, p. 146]"
+"َ"       "/a/"          "this diacritic is called a fatḥah (ibid.)"
+"ِ"       "/i/"          "this diacritic is called a kasrah (ibid.)"
+"ٰ"       "/aː/"         "this diacritic is called an alif khanjariyah (ibid.)"
+"ٔ"       "/ʔ/"          "this diacritic is called a hamza, and only appears (as a diacritic) in combination with ⟨ا⟩ ,⟨ي⟩, and ⟨و⟩ [@Habash2010, pp. 5-6]"
+"ٕ"       "/ʔi/"         ""
+"ٓ"       "/ʔ/"          "this diacritic is called a madda (a variant of the hamza), appearing in combination with ⟨ا⟩ [@Habash2010, p. 6]"
+"ّ"        ""            "called a shadda, this diacritic indicates gemination of consonants [@Habash2010, p. 11]"
+"ْ"        ""            "called a sukun, this diacritic indicates that no vowel follows the consonant to which it\'s attached; it also typically marks syllable boundaries [@Habash2012, p. 712]"
+"ٌ"       "/un/"         "indicates a word-final /un/ (nunnation) [@Habash2012, p. 713]"
+"ٍ"       "/in/"         "indicates a word-final /in/ (nunnation) (ibid.)"
+"ً"       "/an/"         "indicates a word-final /an/ (nunnation) (ibid.)"
+'), TRUE)
+kable(alphabet, col.names = c("Grapheme", "Phoneme", "Comment"), align = 'c') %>%
+  kable_styling("bordered")
+```
+# Syllable Structure
+* Syllables in Modern Standard Arabic tend to have CV or CVC structures; however, CVCC syllables exist word-finally [@Habash2012, p. 712].
+* Word-initial vowels are written as an inflected alif, or a hamzat al-wasl, produced with a glottal stop. This glottal stop plus vowel is considered an allophone of the vowel, so word-initial vowels will be transcribed as just vowels (ibid.).
+    - A plain alif may indicate an optional allophonic glottal stop word-initially, but it may also indicate an obligatory glottal stop (noted in the chart above), which results in a compromised language [@Ibrahim2019, p. 293].
+# Lenition Rules
+* According to @Amayreh1998 (p. 643):
+    - Glottal stops delete word-medially.
+    - Emphatic /s/ becomes voiced word-finally.
+    - /j/ debuccalizes to a glottal stop word-initially.
+    - /dʒ/ may fully spirantize to [ʒ].
+    - /q/ may debuccalize to a glottal stop or become a voiced velar or uvular stop.
+# Misc. Rules
+* Emphatic consonants tend to affect vowels and consonants around them, resulting in lower back vowels and velarization of consonants [@Saiegh-Haddad2014, p. 5].
+# References

Data/_compromised/ar_Arabic/ar.bib ADDED Viewed

	@@ -0,0 +1,113 @@

+@article{Habash2012,
+author = {Habash, Nizar and Diab, Mona and Rambow, Owen},
+year = {2012},
+month = {01},
+pages = {},
+title = {Conventional Orthography for Dialectal Arabic},
+journal = {Proceedings of the Language Resources and Evaluation Conference (LREC), Istanbul}
+}
+@Article{Boudelaa2010,
+  author    = {Boudelaa, Sami and Marslen-Wilson, William D.},
+  title     = {Aralex: A lexical database for Modern Standard Arabic},
+  journal   = {Behavior Research Methods},
+  year      = {2010},
+  volume    = {42},
+  number    = {2},
+  pages     = {481--487},
+  month     = {may},
+  doi       = {10.3758/brm.42.2.481},
+  publisher = {Springer Science and Business Media {LLC}},
+}
+@Article{Amayreh1998,
+  author  = {Amayreh, Mousa M. and Dyson, Alice T.},
+  title   = {The Acquisition of Arabic Consonants},
+  journal = {Journal of Speech, Language, and Hearing Research},
+  year    = {1998},
+}
+@Article{Ibrahim2002,
+  author    = {Ibrahim, Raphiq and Eviatar, Zohar and Aharon-Peretz, Judith},
+  title     = {The characteristics of Arabic orthography slow its processing.},
+  journal   = {Neuropsychology},
+  year      = {2002},
+  volume    = {16},
+  number    = {3},
+  pages     = {322--326},
+  doi       = {10.1037/0894-4105.16.3.322},
+  publisher = {American Psychological Association ({APA})},
+}
+@Book{Awde2000,
+  title     = {The Arabic Alphabet: How to Read and Write It},
+  publisher = {LYLE STUART},
+  year      = {2000},
+  author    = {Awde, N.},
+  isbn      = {0818404302},
+  date      = {2000-10-01},
+  ean       = {9780818404306},
+  pagetotal = {95},
+  url       = {https://www.ebook.de/de/product/3309537/n_awde_the_arabic_alphabet_how_to_read_and_write_it.html},
+}
+@inproceedings{Biadsy2009,
+ author = {Biadsy, Fadi and Habash, Nizar and Hirschberg, Julia},
+ title = {Improving the Arabic Pronunciation Dictionary for Phone and Word Recognition with Linguistically-based Pronunciation Rules},
+ booktitle = {Proceedings of Human Language Technologies: The 2009 Annual Conference of the North American Chapter of the Association for Computational Linguistics},
+ series = {NAACL '09},
+ year = {2009},
+ isbn = {978-1-932432-41-1},
+ location = {Boulder, Colorado},
+ pages = {397--405},
+ numpages = {9},
+ url = {http://dl.acm.org/citation.cfm?id=1620754.1620812},
+ acmid = {1620812},
+ publisher = {Association for Computational Linguistics},
+ address = {Stroudsburg, PA, USA},
+}
+@Book{Habash2010,
+  title     = {Introduction to Arabic Natural Language Processing},
+  publisher = {Morgan \& Claypool},
+  year      = {2010},
+  author    = {Habash, Nizar},
+}
+@Book{Coulmas2008,
+  title     = {Writing Systems},
+  publisher = {Cambridge University Press},
+  year      = {2008},
+  author    = {Coulmas, Florian},
+  isbn      = {0521787378},
+  date      = {2008-02-29},
+  ean       = {9780521787376},
+  pagetotal = {292},
+  url       = {https://www.ebook.de/de/product/3255945/florian_coulmas_writing_systems.html},
+}
+@Article{Yurtbasi2016,
+  author  = {Yurtbaşı, Metin},
+  title   = {Sura Yusuf in Full IPA (Segmental-Suprasegmental) Transcription with English Translation},
+  journal = {International Journal of Arts and Humanities and Social Sciences},
+  year    = {2016},
+}
+@InCollection{Saiegh-Haddad2014,
+  author    = {Saiegh-Haddad, Elinor and Henkin-Roitfarb, Roni},
+  title     = {The Structure of Arabic Language and Orthography},
+  booktitle = {Literacy Studies},
+  publisher = {Springer Netherlands},
+  year      = {2014},
+  pages     = {3--28},
+  doi       = {10.1007/978-94-017-8545-7_1},
+}
+@article{Ibrahim2019,
+author = {Ibrahim, Abdulateef},
+year = {2019},
+month = {04},
+pages = {},
+title = {Glottal Stop in Arabic with Reference to English: Phonological and Orthographical Study},
+volume = {( 2016 M- 1437 e)}
+}

Data/_compromised/ar_Arabic/ar.html ADDED Viewed

The diff for this file is too large to render. See raw diff

Data/_compromised/ar_Arabic/ar.rules ADDED Viewed

	@@ -0,0 +1,167 @@

+# Arabic Rule Set
+# Written by: Emily
+# Last Updated: 2019-12-04
+type,sfrom,sto,weight,precede,follow,comment
+# Class Rules
+class,consonant,(b|t|θ|dʒ|ħ|x|d|ð|r|z|s|ʃ|sˤ|dˤ|tˤ|ðˤ|ʕ|ɣ|f|q|k|l|m|n|h|w|j|ʔ),,,,
+class,shortV,[  ُِ َ],,,,
+class,shortV-output,[aiu],,,,
+class,diacritic,[ ْٕٔ ًٍّٓ ٌ ِٰ َُ],,,,
+class,hamza-combo,[اوي],,,,
+class,hamza-combo-output,(j|w|a ː),,,,
+class,diphthong-combo,[jw],,,,
+## Graphemes
+class,alif,[ا],,,,
+class,b,[ب],,,,
+class,t,[ت],,,,
+class,θ,[ث],,,,
+class,dʒ,[ج],,,,
+class,ħ,[ح],,,,
+class,x,[خ],,,,
+class,d,[د],,,,
+class,ð,[ذ],,,,
+class,r,[ر],,,,
+class,z,[ز],,,,
+class,s,[س],,,,
+class,ʃ,[ش],,,,
+class,emph-s,[ص],,,,
+class,emph-d,[ض],,,,
+class,emph-t,[ط],,,,
+class,emph-ð,[ظ],,,,
+class,ʕ,[ع],,,,
+class,ɣ,[غ],,,,
+class,f,[ف],,,,
+class,q,[ق],,,,
+class,k,[ك],,,,
+class,l,[ل],,,,
+class,m,[م],,,,
+class,n,[ن],,,,
+class,h,[ه],,,,
+class,w-uu,[و],,,,
+class,j-ii,[ي],,,,
+class,hamza,[ء],,,,
+class,ta-marbuta,[ة],,,,
+class,alif-maq,[ى],,,,
+## Diacritics
+class,u,[ُ],,,,
+class,a,[َ],,,,
+class,i,[ِ],,,,
+class,sup-alif,[ٰ],,,,
+class,dia-hamza-above,[ٔ],,,,,
+class,dia-hamza-below,[ٕ],,,,
+class,madda,[ٓ],,,,
+class,shadda,[ّ],,,,
+class,sukun,[ْ],,,,
+class,a-nunnation,[ً],,,,
+class,i-nunnation,[ٍ],,,,
+class,u-nunnation,[ٌ],,,,
+## Permanent Diacritic Grapheme Combos (some diacritics are not treated as separate from the grapheme)
+class,alif-hamza-above,[أ],,,,
+class,alif-hamza-below,[إ],,,,
+class,w-hamza,[ؤ],,,,
+class,j-hamza,[ئ],,,,
+class,alif-madda,[آ],,,,
+# Sub Rules
+## Graphemes
+sub,{alif},a ː,2,,,
+sub,{alif},ʔ,3,^,,"word-initial alif is /ʔ/, however, not every word-initial alif is realized as such (primarily the hamza is used) [@Coulmas2008, p. 123] - somewhat compromises the transcription",
+sub,{b},b,2,,,
+sub,{t},t,2,,,
+sub,{θ},θ,2,,,
+sub,{dʒ},dʒ,2,,,
+sub,{ħ},ħ,2,,,
+sub,{x},x,2,,,
+sub,{d},d,2,,,
+sub,{ð},ð,2,,,
+sub,{r},r,2,,,
+sub,{z},z,2,,,
+sub,{s},s,2,,,
+sub,{ʃ},ʃ,2,,,
+sub,{emph-s},sˤ,2,,,
+sub,{emph-d},dˤ,2,,,
+sub,{emph-t},tˤ,2,,,
+sub,{emph-ð},ðˤ,2,,,
+sub,{ʕ},ʕ,2,,,
+sub,{ɣ},ɣ,2,,,
+sub,{f},f,2,,,
+sub,{q},q,2,,,
+sub,{k},k,2,,,
+sub,{l},l,2,,,
+sub,{m},m,2,,,
+sub,{n},n,2,,,
+sub,{h},h,2,,,
+sub,{w-uu},w,2,,,
+sub,{j-ii},j,2,,,
+sub,{hamza},ʔ,2,,,
+sub,{ta-marbuta},t,3,,{diacritic}$,"ta-marbuta occurs word-finally and if followed by a diacritic is recognized as /t/",
+sub,{diacritic},,3,{ta-marbuta},$,"clean-up",
+sub,{ta-marbuta},,2,,$,"ta-marbuta transcribes to nothing when not followed by a diacritic",
+sub,{alif-maq},a,2,,$,"alif-maq occurs word-finally",
+## Diacritics
+sub,{u},u,2,,,
+sub,{a},a,2,,,
+sub,{i},i,2,,,
+sub,{sup-alif},a ː,2,,,
+sub,{dia-hamza-above},1ʔ,6,{hamza-combo},,"this transcribes the hamza sequences as the consonant followed by a glottal stop, but we need an ipasub rule to make the glottal stop precede the consonant",
+sub,{alif},,4,^,{dia-hamza-above},"alif-hamza (above) only corresponds to glottal stop word-initially, so the alif shouldn't be transcribed",
+sub,{alif},ʔ i,4,,{dia-hamza-below},
+sub,{alif},ʔ,5,^,{dia-hamza-below},"alif-hamza (below) only corresponds to glottal stop word-initially, so the alif shouldn't be transcribed",
+sub,{dia-hamza-below},,2,{alif},,"clean-up",
+sub,{alif},ʔ a ː,3,,{madda},
+sub,{shadda},1ː,2,,,"needed for ipasub gemination rules below",
+sub,{sukun},,2,,,
+sub,{a-nunnation},a n,3,,$,
+sub,{a-nunnation},a n,3,,{diacritic}$,"sometimes followed by a diacritic (shadda) which both apply to the consonant [@Habash2010, p. 11]",
+sub,{i-nunnation},i n,3,,$,
+sub,{i-nunnation},i n,3,,{diacritic}$,"sometimes followed by a diacritic (shadda) which both apply to the consonant (ibid.)",
+sub,{u-nunnation},u n,3,,$,
+sub,{u-nunnation},u n,3,,{diacritic}$,"sometimes followed by a diacritic (shadda) which both apply to the consonant (ibid.)",
+## Permanent Diacritic Grapheme Combos
+sub,{alif-hamza-above},ʔ a ː,3,,,
+sub,{alif-hamza-above},ʔ,4,^,,"alif-hamza (above) only corresponds to glottal stop word-initially - alif is a place holder",
+sub,{alif-hamza-below},ʔ i,3,,,
+sub,{alif-hamza-below},ʔ,4,^,,"alif-hamza (below) only corresponds to glottal stop word-initially - alif is a place holder",
+sub,{w-hamza},ʔ w,2,,,
+sub,{j-hamza},ʔ j,2,,,
+sub,{alif-madda},ʔ a ː,2,,,
+## Diphthongs [@Javed, p. 6]
+sub,{a},aw,3,,{w-uu},
+sub,{w-uu},,3,{a},,"clean-up",
+sub,{a},aj,3,,{j-ii},
+sub,{j-ii},,3,{a},,"clean-up",
+# Additional Long Vowel Transcriptions
+sub,{alif},,8,^,{shortV},"word-initial vowels are represented by an inflected alif [@Habash2012, p. 712]",
+sub,{a},a ː,5,,{alif},
+sub,{alif},,4,{a},,"clean-up",
+sub,{a},a ː,5,,{alif-maq},
+sub,{alif-maq},,4,{a},,"clean-up",
+sub,{u},u 2ː,5,,{w-uu},
+sub,{w-uu},,4,{u},,"clean-up",
+sub,{i},i 3ː,5,,{j-ii},
+sub,{j-ii},,4,{i},,"clean-up",
+## Alif-maqsura
+sub,{alif-maq},j ː,6,,{shortV}{shadda},"alif-maq changes into a yaa if followed by a diacritic (generally a shadda) [@Habash2010, p. 61]",
+sub,{alif-maq},j ː,6,,{shadda},
+sub,{shadda},,6,{alif-maq}{shortV},,
+# ipasub Rules
+## Consonant Gemination
+ipasub,({consonant}) ({shortV-output}) 1ː,\1 ː \2,3,,,"controls for shadda geminating vowels depending on if the double consonant is also followed by a short vowel",
+ipasub,({consonant}) a 1ː ({diphthong-combo}),\1 ː a\3,4,,,"controls for shadda geminating vowels depending on if the double consonant is also followed by a diphthong",
+ipasub,({consonant}) 1ː,\1 ː,2,,,"this removes the 1 if the consonant is not followed by a short vowel"
+ipasub,({shortV-output}) 2ː ({shortV-output}) 1ː,\1 w ː \2,3,,,"controls for shadda taking priority over elongation of vowels (e.g. Dammah + waw) with following short vowel",
+ipasub,({shortV-output}) 2ː 1ː,\1 w ː,3,,,"controls for shadda taking priority over elongation of vowels (e.g. dammah + waw)",
+ipasub,({shortV-output}) 2ː u n 1ː,\1 w ː u n,4,,,"flips around the nunation and gemination",
+ipasub,({shortV-output}) 2ː a n 1ː,\1 w ː a n,4,,,"flips around the nunation and gemination",
+ipasub,({shortV-output}) 2ː i n 1ː,\1 w ː i n,4,,,"flips around the nunation and gemination",
+ipasub,2ː,ː,2,,,"removes the 2 from the rest of the elongated /u/s",
+ipasub,({shortV-output}) 3ː ({shortV-output}) 1ː,\1 j ː \2,3,,,"controls for shadda taking priority over elongation of vowels (e.g. kasrah + yaa) with following short vowel",
+ipasub,({shortV-output}) 3ː 1ː,\1 j ː,3,,,"controls for shadda taking priority over elongation of vowels (e.g. kasrah + yaa)",
+ipasub,({shortV-output}) 3ː i n 1ː,\1 j ː i n,4,,,"flips around the nunation and gemination",
+ipasub,({shortV-output}) 3ː u n 1ː,\1 j ː u n,4,,,"flips around the nunation and gemination",
+ipasub,({shortV-output}) 3ː a n 1ː,\1 j ː a n,4,,,"flips around the nunation and gemination",
+ipasub,1ː,@,1,,,"rules out illegal combos (shadda appearing with nunnation word-medially over an alif)",
+ipasub,3ː,ː,2,,,"removes the 3 from the rest of the elongated /i/s",
+## Hamza (glottal stop)
+ipasub,({hamza-combo-output}) 1ʔ,ʔ \1,3,,,"this puts the glottal stop before the character carrying the hamza",
+ipasub,a ː 1ʔ ({shortV-output}),ʔ \1,4,,,"word-medial glottal stops are sometimes represented as alif topped with a hamza and a short vowel diacritic, the alif holds no value here",
+ipasub,ʔ a ː ({shortV-output}),ʔ \1,2,,,"word-medial glottal stops are sometimes represented as alif topped with a hamza and a short vowel diacritic, the alif holds no value here (this is the same rule as the one above but uses the permanent alif-hamza-above character)",

Data/_compromised/ar_Arabic/ar.verify.csv ADDED Viewed

	@@ -0,0 +1,39 @@

+أُكْسِجِين,ʔ u k s i dʒ i ː n,"long /i/ (kasrah + ي)",
+أَقِط,ʔ a q i tˤ,"initial glottal stop with /a/ diacritic",
+أَجْهَدَ,ʔ a dʒ h a d a,"sukun",
+أَجْسَم,ʔ a dʒ s a m,
+أَجْهَل,ʔ a dʒ h a l,
+أُمْدُوحَة,ʔ u m d u ː ħ a,"ta-marbuta without a diacritic",
+أَنْشَط,ʔ a n ʃ a tˤ,"emphatic /t/",
+أَهْدَب,ʔ a h d a b,
+شَاحِنَة,ʃ a ː ħ i n a,
+شَارِع,ʃ a ː r i ʕ,
+دَا,d a ː,"long /a/ (fatha + alif)"
+غاضِب,ɣ a ː dˤ i b,"emphatic /d/",
+غَزَالَة,ɣ a z a ː l a,
+رَزَقَ,r a z a q a,
+رِفَاق,r i f a ː q,
+رَفَأَ,r a f a ʔ a,"word-medial glottal stop (alif-hamza)",
+رُمْح,r u m ħ,
+هَافَانَا,h a ː f a ː n a ː,
+فَاسِد,f a ː s i d,
+فِتْنَة,f i t n a,
+فِرْعَوْن,f i r ʕ aw n,"diphthong /aw/",
+فُرُوغ,f u r u ː ɣ,
+فَظِيع,f a ðˤ i ː ʕ,
+قَاضٍ,q a ː dˤ i n,"/in/ nunnation",
+قَبُوح,q a b u ː ħ,
+قَاطَعَ,q a ː tˤ a ʕ a,
+قَبِيح,q a b i ː ħ,
+قَزّ,q a z ː,"consonant gemination (shadda)",
+وَازَى,w a ː z a ː,"initial /w/",
+ثَلَاثَة,θ a l a ː θ a,
+جَبْخَانَة,dʒ a b x a ː n a,
+جُمَّيْزَة,dʒ u m ː aj z a,
+الْبُخَارِىُّ,ʔ l b u x a ː r i j ː u,
+نَبَاتِيّ,n a b a ː t i j ː,
+عَلِيٍّ,ʕ a l i j ː i n,
+آدَمِيّ,ʔ a ː d a m i j ː,
+آيَسَ,ʔ a ː j a s a,
+ذبابة,ð b a ː b,
+صار,sˤ a ː r,

Data/_compromised/arn_Mapudungun/arn.Rmd ADDED Viewed

	@@ -0,0 +1,139 @@

+---
+title: "Mapudungun"
+author: "Emily Strand"
+bibliography: arn.bib
+output: html_document
+---
+Last updated: 2020-06-26
+**COMPROMISED: ambiguity due to non-standard alphabet; conflation of dental and alveolar consonants /t̪/ and /t/ (most likely with the others as well)**
+# Background
+**Language Family:** Araucanian
+* Also referred to as Mapuche or Araucana, it is spoken throughout Chile.
+# Phonology
+## Consonants
+* In some dialects of Mapudungun, the dental and alveolar phonemes have merged [@sadowsky_mapudungun_2013, p. 89], leaving the dentals to appear as a result of allophonic variation.
+* /ʃ/ is rather infrequent, often alternating with /s/, yet it is still considered in most sources to be a phoneme [@smeets_grammar_2008, p. 23].
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+library(dplyr)
+library(knitr)
+library(kableExtra)
+consonants <- read.table(textConnection('
+"Manner of Articulation"     Labial     Dental    Alveolar   Postalveolar   Retroflex  Palatal   Velar
+Stops                         "p"        "t̪"         "t"         ""            ""        ""       "k"
+Affricates                    ""         ""          ""         "tʃ"          "ʈʂ"       ""       ""
+Fricatives                    "f"        "θ"         "s"         "ʃ"          "ʐ"        ""       "ɣ"
+Nasals                        "m"        "n̪"         "n"         ""            ""        "ɲ"      "ŋ"
+Approximants                  ""         "l̪"         "l"         ""            ""       "j ʎ"     "w"
+'), TRUE)
+kable(consonants, col.names = c("Manner of Articulation", "Labial", "Dental", "Alveolar", "Postalveolar", "Retroflex", "Palatal", "Velar"), align = 'c') %>%
+  kable_styling("bordered") %>%
+  add_header_above(c("", "Place of Articulation" = 7)) %>%
+  column_spec(1, bold = TRUE) %>%
+  footnote(general = "Note: The palatal approximant on the right is lateral.", general_title = "")
+```
+## Vowels
+* Diphthongs aren't prevalent in Mapudungun; however, /ae/ is generally realized as one [@smeets_grammar_2008, p. 52]. Because it can also occur as a sequence of vowels, it will not be transcribed in the rules.
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+vowels <- read.table(textConnection('
+      Front    Central    Back
+High   "i"       ""       "u"
+Mid    "e"       "ɘ"      "o"
+Low     ""       "a"      ""
+'), TRUE)
+kable(vowels, align = 'c') %>%
+  kable_styling("bordered") %>%
+  column_spec(1, bold = TRUE)
+```
+# Alphabet
+* Although there is no standardized alphabet, the three alphabets commonly referred to provide a rather comprehensive picture in terms of depicting each phoneme. These alphabets include the Alfabeto Mapuche Unificado, the Grafemario Raguileo, and the Azumchefi [@bronzino_loanword_nodate, p. 22]. Listed below is the Alfabeto Mapuche Unificado, but following it, I have provided the orthographic variations from the other two.
+* Regarding the corresponding phoneme of ⟨g⟩, the Unified Alphabet uses /ɣ/ whereas the Raguileo and Azumchefi alphabets use /ŋ/. I went with the ⟨g⟩ to /ŋ/ correspondence due to the inclusion of ⟨q⟩ in the Crúbadán corpus, which based on the Raguileo and Azumchefi alphabets corresponds to /ɣ/.
+* Based on the Alfabeto Mapuche Unificado, the dentals are not represented in the Crúbadán corpus. With the grapheme correspondences of the other two alphabets, they appear (except /t̪/, as the distinction between it and /t/ is not maintained in either). Based on this, conflation most likely occurs between all alveolar consonants and their dental counterparts, compromising the language.
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+alphabet <- read.table(textConnection('
+Grapheme      Phoneme      Comment
+"a"            "/a/"         ""
+"d"            "/θ/"         ""
+"e"            "/e/"         ""
+"f"            "/f/"         ""
+"g"            "/ŋ/"         ""
+"i"            "/i/"         ""
+"ï"            "/ɘ/"         ""
+"k"            "/k/"         ""
+"l"            "/l/"         ""
+"ḻ"            "/l̪/"         ""
+"m"            "/m/"         ""
+"n"            "/n/"         ""
+"ṉ"            "/n̪/"         ""
+"ñ"            "/ɲ/"         ""
+"o"            "/o/"         ""
+"p"            "/p/"         ""
+"r"            "/ʐ/"         ""
+"s"            "/s/"         ""
+"t"            "/t/"         ""
+"ṯ"            "/t̪/"         "not reflected in the Crúbadán corpus"
+"u"            "/u/"         ""
+"ü"            "/ɘ/"         ""
+"w"            "/w/"         ""
+"y"            "/j/"         ""
+**Digraph**     ""           ""
+"ch"           "/tʃ/"        ""
+"tr"           "/ʈʂ/"        ""
+"sh"           "/ʃ/"         ""
+"ng"           "/ŋ/"         ""
+"ll"           "/ʎ/"         ""
+"**Orthographic Variation**" "" ""
+"c"            "/tʃ/"    "Raguileo"
+"x"            "/ʈʂ/"    "Raguileo"
+"tx"           "/ʈʂ/"    "Azumchefi"
+"z"            "/θ/"     "Raguileo and Azumchefi"
+"h"            "/n̪/"     "Raguileo"
+"nh"           "/n̪/"     "Azumchefi"
+"q"            "/ɣ/"     "Raguileo and Azumchefi"
+"b"            "/l̪/"     "Raguileo"
+"lh"           "/l̪/"     "Azumchefi"
+"j"            "/ʎ/"     "Raguileo"
+"v"            "/ɘ/"     "Raguileo"
+'), TRUE)
+kable(alphabet, align = 'c') %>%
+  kable_styling("bordered")
+```
+# Lenition Rules
+* Geminates occur in the language, but they are often realized as single consonants [@smeets_grammar_2008, p. 51].
+* /ʐ/ can approximate to /ɻ/ [@sadowsky_mapudungun_2013, p. 90].
+# Misc. Rules
+* Aspiration of some of the stops can occur [@sadowsky_mapudungun_2013, p. 89].
+* Unstressed vowels are often deleted in word-final positions following voiceless consonants [@sadowsky_mapudungun_2013, p. 94].
+* /ɘ/ may be deleted following a stressed syllable (ibid.).
+* Consonant clusters are only allowed intervocalically [@smeets_grammar_2008, pp. 37, 38].
+* The velar phonemes /k/, /ŋ/, /ɣ/ tend to be fronted preceding front vowels [@sadowsky_mapudungun_2013, p. 89].
+# References

Data/_compromised/arn_Mapudungun/arn.bib ADDED Viewed

	@@ -0,0 +1,38 @@

+@phdthesis{bronzino_loanword_nodate,
+	address = {Bryn Mawr, Pennsylvania},
+	title = {Loanword {Adaptation} in {Spanish} and {Mapudungun}: a {Phonological} and {Sociolinguistic} {Analysis}},
+	school = {Bryn Mawr College},
+	author = {Bronzino, Dana},
+	year = {2015},
+	month = dec
+}
+@book{smeets_grammar_2008,
+	address = {Berlin ; New York},
+	series = {Mouton grammar library},
+	title = {A grammar of {Mapuche}},
+	isbn = {978-3-11-019558-3},
+	number = {41},
+	publisher = {Mouton de Gruyter},
+	author = {Smeets, Ineke},
+	year = {2008},
+	keywords = {Phonology, Grammar, Mapuche language, Morphosyntax}
+}
+@article{sadowsky_mapudungun_2013,
+	title = {Mapudungun},
+	volume = {43},
+	issn = {0025-1003, 1475-3502},
+	url = {https://www.cambridge.org/core/product/identifier/S0025100312000369/type/journal_article},
+	doi = {10.1017/S0025100312000369},
+	language = {en},
+	number = {1},
+	urldate = {2019-10-02},
+	journal = {Journal of the International Phonetic Association},
+	author = {Sadowsky, Scott and Painequeo, Héctor and Salamanca, Gastón and Avelino, Heriberto},
+	month = apr,
+	year = {2013},
+	pages = {87--96},
+	file = {Full Text:files/216/Sadowsky et al. - 2013 - Mapudungun.pdf:application/pdf}
+}

Data/_compromised/arn_Mapudungun/arn.html ADDED Viewed

The diff for this file is too large to render. See raw diff

Data/_compromised/arn_Mapudungun/arn.rules ADDED Viewed

	@@ -0,0 +1,47 @@

+# Mapudungun Rule Set
+# Written by: Emily
+# Last Updated: 2020-04-15
+type,sfrom,sto,weight,precede,follow,comment
+# Class Rules
+class,passthrough,[aefiklmnopstuw],,,,
+class,gem,(p|t̪|t|k|m|n̪|n|ɲ|ŋ|tʃ|ʈʂ|f|θ|s|ʃ|ʐ|ɣ|l̪|l|j|ʎ|w),,,,
+# Individual Letters
+sub,d,θ,2,,,
+sub,g,ŋ,2,,,
+sub,ï,ɘ,2,,,
+sub,ḻ,l̪,2,,,
+sub,ṉ,n̪,2,,,
+sub,ñ,ɲ,2,,,
+sub,r,ʐ,2,,,
+sub,ṯ,t̪,2,,,
+sub,ü,ɘ,2,,,
+sub,y,j,2,,,
+sub,({passthrough}),\1,2,,,
+# Digraphs
+sub,c,tʃ,3,,h,
+sub,h,,3,c,,"clean-up",
+sub,t,ʈʂ,3,,r,
+sub,r,,3,t,,"clean-up",
+sub,s,ʃ,3,,h,
+sub,h,,3,s,,"clean-up",
+sub,n,ŋ,3,,g,
+sub,g,,3,n,,"clean-up",
+sub,l,ʎ,3,,l,
+sub,l,,3,l,,"clean-up",
+# Orthographic Variation
+sub,c,tʃ,2,,,
+sub,x,ʈʂ,2,,,
+sub,t,ʈʂ,3,,x,
+sub,x,,3,t,,"clean-up",
+sub,z,θ,2,,,
+sub,h,n̪,2,,,
+sub,n,n̪,3,,h,
+sub,h,,3,n,,"clean-up",
+sub,q,ɣ,2,,,
+sub,b,l̪,2,,,
+sub,l,l̪,3,,h,
+sub,h,,3,l,,"clean-up",
+sub,j,ʎ,2,,,
+sub,v,ɘ,2,,,
+# Geminates
+ipasub,\b({gem}) \1\b,\1 ː,2,,,

Data/_compromised/arn_Mapudungun/arn.verify.csv ADDED Viewed

	@@ -0,0 +1,30 @@

+afvlkan,a f ɘ l k a n,
+ajfeñ,a ʎ f e ɲ,
+aliwentu,a l i w e n t u,
+anci,a n tʃ i,
+bafaxa,l̪ a f a ʈʂ a,
+calin,tʃ a l i n,
+cazi,tʃ a θ i,
+cagvj,tʃ a ŋ ɘ ʎ,
+dallun,θ a ʎ u n,
+hamuh,n̪ a m u n̪,
+ichuna,i tʃ u n a,
+kutri,k u ʈʂ i,
+kümelekaymi,k ɘ m e l e k a j m i,
+mansun,m a n s u n,
+nge,ŋ e,
+reqle,ʐ e ɣ l e,
+kom,k o m,
+pu,p u,
+kishu,k i ʃ u,
+nhi,n̪ i,
+malhenh,m a l̪ e n̪,
+femmeken,f e m ː e k e n,
+diccionario,θ i tʃ ː i o n a ʐ i o,
+segredossereia,s e ŋ ʐ e θ o s ː e ʐ e i a,
+küzawwe,k ɘ θ a w ː e,
+kutt,k u t ː,
+mew,m e w,
+ta,t a,
+ñi,ɲ i,
+ka,k a,

Data/_compromised/awx_Awara/awx.Rmd ADDED Viewed

	@@ -0,0 +1,119 @@

+---
+title: "Awara"
+author: "Becky Mathew"
+bibliography: awx.bib
+output: html_document
+---
+Last Updated: 2020-04-13
+**COMPROMISED: conflation between /nd/, /mb/, /nɡ/ and /d/, /b/, /ɡ/, respectively**
+# Background
+**Language Family:** Trans-New Guinea / Main Section / Central and Western / Huon-Finisterre / Finisterre / Wantoat
+* Awara is spoken in the Morobe province of Papua New Guinea.
+# Phonology
+## Consonants
+* @Quigley2002 and @Quigley2003 disagree slightly on the consonant inventory of Awara (p. 4; p. 14); I have chosen to predominantly follow the more recent source.
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+library(dplyr)
+library(knitr)
+library(kableExtra)
+consonants <- read.table(textConnection('
+"Manner of Articulation"     Labial     Alveolar  Palatal     Velar      Glottal
+Stops                        "p b"      "t d"     ""       "k kʷ ɡ ɡʷ"     ""
+Fricatives                   "β"        "s"       ""           "ɣ"        "h"
+Nasals                       "m"        "n"       ""         "ŋ ŋʷ"        ""
+Approximants                 ""         "l"       "j"          ""          ""
+'), TRUE)
+kable(consonants, col.names = c("Manner of Articulation", "Labial", "Alveolar", "Palatal", "Velar", "Glottal"), align = 'c') %>%
+  kable_styling("bordered") %>%
+  add_header_above(c("", "Place of Articulation" = 5)) %>%
+  footnote(general = "Note: For phonemes that share a cell, those on the left are voiceless and those on the right are voiced. Velar consonants that have the diacritic (ʷ) are labialized.", general_title = "") %>%
+  column_spec(1, bold = TRUE)
+```
+## Vowels
+* Both @Quigley2002 and @Quigley2003 indicate a mid central vowel (p. 4; pp. 35, 37); however, they represent it as /ʌ/. I have chosen to use /ɘ/, as it's more reflective of the description.
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+consonants <- read.table(textConnection('
+       Front   Central     Back
+High   "i"       ""        "u"
+Mid    "e"       "ɘ"       "o"
+Low    ""        "a"       ""
+'), TRUE)
+kable(consonants, align = 'c') %>%
+  kable_styling("bordered") %>%
+  column_spec(1, bold = TRUE)
+```
+# Alphabet
+* Surface-level prenasalization of the consonants /b/, /d/, and /ɡ/ is orthographically represented intervocalically [@Quigley2003, p. 155]. This compromises the language because it is uncertain whether the sequence ⟨ambi⟩, for example, always represents /abi/ (with an [ambi] realization) or /ambi/ in some instances. I have chosen to transcribe the language as it appears; that is, every ⟨mb⟩ sequence, for example, is transcribed as /mb/.
+```{r echo=FALSE, message=FALSE, warning=FALSE, results = 'asis'}
+alphabet <- read.table(textConnection('
+Grapheme      Phoneme               Comment
+"a"             "/a/"                 ""
+"ä"             "/ɘ/"                 ""
+"b"             "/b/"                 ""
+"d"             "/d/"                 ""
+"e"             "/e/"                 ""
+"g"             "/ɡ/"                 ""
+"h"             "/h/"                 ""
+"i"             "/i/"                 ""
+"k"             "/k/"                 ""
+"l; r"          "/l/"          "intervocalically"
+"m"             "/m/"                 ""
+"n"             "/n/"                 ""
+"o"             "/o/"                 ""
+"p"             "/p/"                 ""
+"s"             "/s/"                 ""
+"t"             "/t/"                 ""
+"u"             "/u/"                 ""
+"w"             "/β/"                 ""
+"x"             "/ɣ/"                 ""
+"y"             "/j/"                 ""
+**Multigraph**   ""                   ""
+"gw"            "/ɡʷ/"                ""
+"kw"            "/kʷ/"                ""
+"ng"            "/ŋ/"                 ""
+"ngw"           "/ŋʷ/"                ""
+'), TRUE)
+kable(alphabet, align = 'c') %>%
+  kable_styling("bordered")
+```
+# Syllable Structure
+* Syllables in Awara follow the order of (C)V(C) [@Quigley2003, p. 175].
+# Lenition Rules
+* Voiceless stops (including /kʷ/) lenite intervocalically at morpheme boundaries [@Quigley2003, p. 26].
+* /t/ and /k/ may be realized as [l] and [ɣ] word-initially [@Quigley2003, pp. 20-21].
+* /k/, /p/, and /t/ are realized as [ɣ], [w], and [l] intervocalically [@Quigley2002, p. 6].
+* /pu/ may be realized as [βu] or [wu] word-initially [@Quigley2003, p. 22].
+# Misc. Rules
+* Voiced stops have prenasalized variants [@Quigley2003, pp. 16-17].
+    - Word-initially, they are realized as prenasalized consonants.
+    - Syllable-initially, following open syllables, the homorganic nasal is realized as the coda of the preceding syllable and the voiced stop is realized as the onset of the following syllable.
+* Voiceless stops tend to be aspirated word-initially and syllable-initially intervocalically [@Quigley2003, p. 17].
+* Glottal stop epenthesis may occur word-initially, preceding vowels [@Quigley2003, p. 18].
+# References

Data/_compromised/awx_Awara/awx.bib ADDED Viewed

	@@ -0,0 +1,17 @@

+% Encoding: UTF-8
+@MastersThesis{Quigley2003,
+  author = {Edward C. Quigley},
+  school = {University of North Dakota},
+  title  = {Awara Phonology},
+  year   = {2003},
+}
+@MastersThesis{Quigley2002,
+  author = {Susan R. Quigley},
+  school = {University of North Dakota},
+  title  = {The Awara Verbal System},
+  year   = {2002},
+}
+@Comment{jabref-meta: databaseType:bibtex;}

Data/_compromised/awx_Awara/awx.html ADDED Viewed

The diff for this file is too large to render. See raw diff

Data/_compromised/awx_Awara/awx.rules ADDED Viewed

	@@ -0,0 +1,27 @@

+# Awara Rule Set
+# Written by: Becky Mathew
+# Last updated: 2020-04-13
+type,sfrom,sto,weight,precede,follow,comment
+# Class Rules
+class,passthrough,[abdehikmnopstu],,,,
+class,w-preceder,[gk],,,,
+class,vowels,[aeiouä],,,,
+# Individual Letters
+sub,ä,ɘ,3,,,
+sub,g,ɡ,2,,,
+sub,l,l,3,{vowels},{vowels},"/l/ only occurs intervocalically",
+sub,r,l,3,{vowels},{vowels},"<r> may also be used to represent /l/",
+sub,w,β,3,,,
+sub,x,ɣ,3,,,
+sub,y,j,3,,,
+sub,({passthrough}),\1,2,,,
+# Digraphs
+sub,g,ɡʷ,4,,w,
+sub,k,kʷ,4,,w,
+sub,w,,4,{w-preceder},,"clean-up",
+sub,n,ŋ,5,,g,
+sub,g,,5,n,,"clean-up",
+# Trigraphs
+sub,n,ŋʷ,6,,gw,
+sub,g,,6,n,w,"clean-up",
+sub,w,,6,ng,,"clean-up",

Data/_compromised/awx_Awara/awx.verify.csv ADDED Viewed

	@@ -0,0 +1,30 @@

+hikngä,h i k ŋ ɘ,
+wätä,β ɘ t ɘ,
+bakudupi,b a k u d u p i,
+xät,ɣ ɘ t,
+yähakaying,j ɘ h a k a j i ŋ,
+nap,n a p,
+gwen,ɡʷ e n,
+kwayi,kʷ a j i,
+kungwä,k u ŋʷ ɘ,
+do,d o,
+ge,ɡ e,
+äminu,ɘ m i n u,
+sipsip,s i p s i p,
+inale,i n a l e,
+Awara,a β a l a,
+yang,j a ŋ,
+using,u s i ŋ,
+inikut,i n i k u t,
+nanä,n a n ɘ,
+äwä,ɘ β ɘ,
+puyä,p u j ɘ,
+undä,u n d ɘ,
+tiwän,t i β ɘ n,
+yänikut,j ɘ n i k u t,
+meyä,m e j ɘ,
+wamu,β a m u,
+umanä,u m a n ɘ,
+kewu,k e β u,
+natäke,n a t ɘ k e,
+päke,p ɘ k e,

Data/_compromised/bcl_CentralBikol/bcl.Rmd ADDED Viewed

	@@ -0,0 +1,126 @@

+---
+title: "Central Bikol"
+author: "Bill Mizgerd"
+bibliography: bcl.bib
+output: html_document
+---
+Last Updated: 2019-06-17
+**SLIGHTLY COMPROMISED: glottal stops not transcribed consistently**
+# Background
+**Language Family:** Austronesian / Malayo-Polynesian / Western Malayo-Polynesian / Meso Philippine / Central Philippine / Bikol / Coastal / Naga
+* Central Bikol, or Bikol, is spoken throughout the Bikol provinces within the Philippines.
+# Phonology
+## Consonants
+* Loans from Spanish and English have introduced /f/, /v/, /z/, /ʃ/, /ʒ/, /ɲ/, /ʎ/, /tʃ/, and /dʒ/ to Bikol, although not all speakers use those sounds [@BclMattes2014, p. 8].
+```{r echo = FALSE, message = FALSE, warning = FALSE, results = 'asis'}
+library(dplyr)
+library(knitr)
+library(kableExtra)
+consonants <- read.table(textConnection('
+"Manner of Articulation"   Labial   Alveolar   Palatal    Velar    Glottal
+Stops                      "p  b"    "t  d"      ""       "k  ɡ"     "ʔ"
+Nasals                      "m"       "n"        ""        "ŋ"       ""
+Fricatives                  ""        "s"        ""        ""        "h"
+Flaps                       ""        "ɾ"        ""        ""        ""
+Approximants                "ʋ"       "l"        "j"       ""        ""
+'), header = TRUE)
+kable(consonants, col.names = c("Manner of Articulation", "Labial", "Alveolar", "Palatal", "Velar", "Glottal"), align = 'c') %>%
+  kable_styling("bordered") %>%
+  column_spec(1, bold = TRUE) %>%
+  footnote(general = "Note: For phonemes that share a cell, those on the left are voiceless and those on the right are voiced.", general_title = "") %>%
+  add_header_above(c("", "Place of Articulation" = 5))
+```
+## Vowels
+* Although the orthography includes ⟨e⟩ and ⟨o⟩, [e] and [o] only exist as allophones of /i/ and /u/ respectively [@BclMattes2014, p. 8].
+```{r echo = FALSE}
+vowels <- read.table(textConnection('
+         Front      Central       Back
+High      "i"         ""          "u"
+Mid       ""          ""          ""
+Low       ""         "a"          ""
+'), TRUE)
+kable(vowels, align = 'c') %>%
+  kable_styling("bordered") %>%
+  column_spec(1, bold = TRUE)
+diphthongs <- read.table(textConnection('
+        Diphthongs
+"/iu/,  /ui/,  /ai/,  /au/"
+'), TRUE)
+kable(diphthongs, align = 'c') %>%
+  kable_styling("bordered")
+```
+# Alphabet
+* Glottal stops are not always reflected in the spelling of a word [@BclMattes2014, p. 12]. Occurrences in at least the intervocalic positions are predictable (i.e. phonetic), which isn't problematic for the language (as we don't account for them); however, occurrences of glottal stops elsewhere aren't transcribed consistently, which compromises the language to some degree.
+```{r echo = FALSE}
+alphabet <- read.table(textConnection('
+Grapheme  Phoneme
+  "a"      "/a/"
+  "b"      "/b/"
+  "d"      "/d/"
+  "e"      "/i/"
+  "g"      "/ɡ/"
+  "h"      "/h/"
+  "i"      "/i/"
+  "k"      "/k/"
+  "l"      "/l/"
+  "m"      "/m/"
+  "n"      "/n/"
+  "o"      "/u/"
+  "p"      "/p/"
+  "r"      "/ɾ/"
+  "s"      "/s/"
+  "t"      "/t/"
+  "u"      "/u/"
+  "w"      "/ʋ/"
+  "y"      "/j/"
+  "\' ; -" "/ʔ/"
+  **Digraph** ""
+  "ng"     "/ŋ/"
+  "aw"     "/au/"
+  "ay"     "/ai/"
+  "iw"     "/iu/"
+  "oy"     "/ui/"
+  "uy"     "/ui/"'), header = TRUE)
+knitr::kable(alphabet, align = 'c') %>%
+  kable_styling("bordered")
+```
+# Syllable Structure
+* Bikol syllable structure is CV(C), where V can be either a single vowel or a diphthong [@BclMattes2014, p. 10].
+# Misc. Rules
+* Epenthesis of [h] occurs between stem-final and suffix-initial vowels [@BclMattes2014, p. 9].
+* /u/ is realized as [o] in the final syllable of a word [@BclMintzD1971, p. 17].
+* Glottal stops are always inserted between orthographically adjacent vowels [@BclMattes2014, p. 12].
+* Prefix-final /ŋ/ tends to assimilate, to varying degrees, to the first consonant of the stem [@BclMattes2014, p. 9].
+# References

Data/_compromised/bcl_CentralBikol/bcl.bib ADDED Viewed

	@@ -0,0 +1,17 @@

+% Encoding: UTF-8
+@Book{BclMattes2014,
+  author    = {Mattes, Veronika},
+  title     = {Types of Reduplication: A Case Study of Bikol},
+  publisher = {De Gruyter},
+  year      = {2014},
+}
+@Book{BclMintzD1971,
+  author    = {Mintz, Malcolm W.},
+  title     = {Bikol Dictionary},
+  publisher = {University of Hawai'i Press},
+  year      = {1971},
+}
+@Comment{jabref-meta: databaseType:bibtex;}

Data/_compromised/bcl_CentralBikol/bcl.html ADDED Viewed

The diff for this file is too large to render. See raw diff

Data/_compromised/bcl_CentralBikol/bcl.rules ADDED Viewed

	@@ -0,0 +1,35 @@

+# Central Bikol Rule Set
+# Written by: Bill
+# Last Updated: 2019-06-17
+type,sfrom,sto,weight,precede,follow,comment
+# Classes
+class,punctuation,['‘’-],,,,
+class,w-preceder,[ai],,,,
+class,y-preceder,[aou],,,,
+class,vowel,[aeiou],,,,
+class,passthrough,[abdhilkmnpstu],,,,
+# Individual Letters
+sub,e,i,4,,,
+sub,g,ɡ,4,,,
+sub,o,u,4,,,
+sub,r,ɾ,4,,,
+sub,w,ʋ,4,,,
+sub,y,j,4,,,
+sub,{punctuation},ʔ,4,,,
+sub,{punctuation},,5,{vowel},{vowel},
+sub,({passthrough}),\1,0.1,,,
+# Digraphs
+sub,a,au,6,,w,"aw"
+sub,i,iu,6,,w,"iw"
+sub,w,,6,{w-preceder},,"w-final diphthongs clean-up",
+sub,a,ai,6,,y,"ay"
+sub,o,ui,6,,y,"oy"
+sub,u,ui,6,,y,"uy"
+sub,y,,6,{y-preceder},,"y-final diphthongs clean-up",
+sub,n,ŋ,6,,g,
+sub,g,,6,n,,"clean-up",
+# Non-Diphthongs (sequences of three vowels realized independently)
+ipasub,au ({vowel}),a u \1,8,,,
+ipasub,iu ({vowel}),i u \1,8,,,
+ipasub,ai ({vowel}),a i \1,8,,,
+ipasub,ui ({vowel}),u i \1,8,,,

Data/_compromised/bcl_CentralBikol/bcl.verify.csv ADDED Viewed

	@@ -0,0 +1,32 @@

+apat,a p a t,"a"
+atibangaw,a t i b a ŋ au,"aw"
+gulay,ɡ u l ai,"ay"
+bubon,b u b u n,"b"
+daguldol,d a ɡ u l d u l,"d"
+kengke,k i ŋ k i,"e"
+gusok,ɡ u s u k,"g"
+hagahag,h a ɡ a h a ɡ,"h"
+kiri,k i ɾ i,"i"
+ariw,a ɾ iu,"iw"
+kuko,k u k u,"k"
+lalaki,l a l a k i,"l"
+mampak,m a m p a k,"m"
+nana,n a n a,"n"
+ngunyan,ŋ u n j a n,"ng"
+ido,i d u,"o"
+laboy,l a b ui,"oy"
+papel,p a p i l,"p"
+ribo,ɾ i b u,"r"
+sebolyas,s i b u l j a s,"s"
+tatay,t a t ai,"t"
+utang,u t a ŋ,"u"
+buybuy,b ui b ui,"uy"
+wikwik,ʋ i k ʋ i k,"w"
+yating,j a t i ŋ,"y"
+ba-go,b a ʔ ɡ u,"punctuation as glottal stop"
+iba-ibang,i b a i b a ŋ,
+tuyong,t u i u ŋ,"non-diphthongs"
+laog,l a u ɡ,
+hiwas,h i u a s,
+katawo,k a t a u u,
+gayo,ɡ a i u,