In [1]:
# Pinyin consonant spelling -> IPA transcription.
# Insertion order is preserved on purpose: the keys are later joined, in this
# order, into the `codas` regex alternation, and the conversion code anchors
# each key at the start of a syllable ("^" + key).
coda_to_ipa = dict(
    [
        ("b", "p"),
        ("p", "p‘"),
        ("m", "m"),
        ("f", "f"),
        ("d", "t"),
        ("t", "t‘"),
        ("n", "n"),
        ("z", "ts"),
        ("c", "ts‘"),
        ("s", "s"),
        ("r", "z"),
        ("j", "tɕ"),
        ("q", "tɕ‘"),
        ("gn", "ȵ"),
        ("x", "ɕ"),
        ("g", "k"),
        ("k", "k‘"),
        ("ng", "ŋ"),
        ("h", "x"),
    ]
)

In [2]:
# Regex fragment for a pinyin rime -> IPA replacement (may use \1 to keep a
# captured initial). The conversion code anchors each key at the end of the
# syllable and applies only the first key that matches, so ORDER MATTERS:
# longer / more specific rimes must stay ahead of the shorter ones they end
# with (e.g. "iang" before "ang", "[iy]an" before "an").
rime_to_ipa = dict(
    [
        ("(j|q|x)[uv]an", r"\1yan"),
        ("y[uv]an", "yan"),
        ("(j|q|x)[uv]n", r"\1yn"),
        ("y[uv]n", "yn"),
        ("iong", "yoŋ"),
        # rimes written with a leading u / w
        ("[uw]ang", "uaŋ"),
        ("[uw]ai", "uai"),
        ("[uw]ei", "uei"),
        ("[uw]an", "uan"),
        ("w?un", "uən"),
        ("[uw]a", "ua"),
        ("y?uo", "yo"),
        ("(g|k|h)ue", r"\1ue"),
        ("y?[uv]e", "ye"),
        # remaining rimes, longest spellings first
        ("iang", "iaŋ"),
        ("ang", "aŋ"),
        ("ong", "oŋ"),
        ("[iy]ai", "iɛi"),
        ("[iy]ao", "iau"),
        ("[iy]an", "iɛn"),
        ("you|iu", "iəu"),
        ("y?in", "in"),
        ("(z|c|s|r)i", r"\1ɿ"),
        ("ai", "ai"),
        ("ei", "ei"),
        ("ao", "au"),
        ("ou", "əu"),
        ("an", "an"),
        ("en", "ən"),
        ("[iy]a", "ia"),
        ("[iy]e", "ie"),
        ("(j|q|x)[uv]", r"\1y"),
        ("er", "ɚ"),
        ("yu", "y"),
        ("yi", "i"),
        # single-letter fallbacks
        ("v", "y"),
        ("i", "i"),
        ("a", "a"),
        ("o", "o"),
        ("e", "e"),
        ("u", "u"),
    ]
)

In [3]:
# Tone digit ("1".."4") -> tone letters.
# `second_tone_to_ipa` is applied to the second digit of a two-digit tone;
# `first_tone_to_ipa` is applied to the first (or only) digit.
second_tone_to_ipa = dict(zip("1234", ("꜒", "꜕꜖", "꜒꜔", "꜕꜖꜔")))

first_tone_to_ipa = dict(zip("1234", ("˥", "˨˩", "˥˧", "˨˩˧")))

In [4]:
# Alternation of every consonant spelling, in dict order (printed below).
codas = "|".join(coda_to_ipa.keys())

# Capture group matching an optional initial, shared by the erhua patterns.
# It is built separately from `codas` for two reasons:
#   * in f"({codas}?)" the "?" binds only to the last alternative ("h?");
#     wrapping the alternation in a non-capturing group makes the whole
#     initial optional, which is what the patterns intend;
#   * alternatives are tried left to right, so with "n" listed before "ng"
#     an "ng" initial was captured as "n" and its "g" was swallowed by the
#     following ".+". Sorting longest-first lets "ng" (and "gn") win.
# The outer parentheses keep the initial available as group 1.
_optional_initial = (
    "((?:" + "|".join(sorted(coda_to_ipa, key=len, reverse=True)) + ")?)"
)

# r-final syllable -> IPA. Patterns are tried in order with re.match, and the
# first one that matches is used.
erhua_to_ipa = {
    "fur": "fɚ",
    f"{_optional_initial}([yiwu]).*r": r"\1\2ɚ",
    f"{_optional_initial}.+r": r"\1ɚ",
}

# r-final syllable -> simplified pinyin spelling used for the output keys.
# Same matching rules as above.
simplify_erhua = {
    "fur": "fer",
    "y.+r": "yir",
    "w.+r": "wur",
    f"{_optional_initial}([iu]).*r": r"\1\2r",
    f"{_optional_initial}.+r": r"\1er",
}
print(codas)

b|p|m|f|d|t|n|z|c|s|r|j|q|gn|x|g|k|ng|h


In [5]:
import re

# Extract the pinyin strings from fangyan.json.
# A line is used when, after stripping whitespace, it starts with
# `"pinyin": "<value>",`.
prefix = re.compile(r'"pinyin": +"(.+)",')
pinyins = set()

# Decode explicitly as UTF-8: the values contain non-ASCII characters
# ("ü", "——", "→") and the platform default encoding is not always UTF-8.
with open("fangyan.json", "r", encoding="utf-8") as input_file:
    for line in input_file:
        match = prefix.match(line.strip())
        if not match:
            continue
        pinyin = match.group(1).replace("——", "").replace("ü", "v")
        if "→" in pinyin:
            # "before→after": both sides must have the same length so they
            # can be compared character by character.
            before, after = pinyin.split("→")
            assert len(before) == len(after)
            # Keep characters that agree once; where they differ, emit the
            # "before" character followed by the "after" character.
            pinyin = "".join(
                b if b == a else b + a for b, a in zip(before, after)
            )
        pinyins.add(pinyin)

print(f"Extracted {len(pinyins)} pinyin sequences")

Extracted 1540 pinyin sequences


In [6]:
# Final result: space-joined pinyin -> space-joined IPA.
ipas = {}

# One syllable: lowercase letters followed by one or two tone digits (1-4).
syllable_pattern = re.compile(r"([a-z]+)([1-4][1-4]?)")

# Every element is a list of (sound, tone) tuples. Syllables read from the
# file below each become a one-syllable entry.
all_pinyins = []
with open("all_cendu_pinyins.txt", "r") as cendu_pinyin_file:
    for raw_line in cendu_pinyin_file:
        hit = syllable_pattern.match(raw_line.strip())
        if hit is None:
            continue
        letters, tone_digits = hit.groups()
        # Every "l" in the sound is rewritten to "n".
        all_pinyins.append([(letters.replace("l", "n"), tone_digits)])

import random

def random_chunk_list(lst, min_size=2, max_size=5, rng=None):
    """Split `lst` into consecutive chunks of random length.

    Each chunk length is drawn with ``randint(min_size, max_size)``. The last
    chunk holds whatever is left and may therefore be shorter than
    ``min_size``.

    Args:
        lst: Sliceable sequence to split.
        min_size: Smallest chunk length to draw (inclusive). Defaults to 2.
        max_size: Largest chunk length to draw (inclusive). Defaults to 5.
        rng: Optional object with a ``randint(a, b)`` method, e.g. a seeded
            ``random.Random``; pass one to get reproducible chunking.
            Defaults to the module-level ``random`` generator.

    Returns:
        A list of slices of `lst` whose concatenation equals `lst`.

    Raises:
        ValueError: If ``min_size < 1`` (the loop could never advance) or
            ``max_size < min_size``.
    """
    if min_size < 1 or max_size < min_size:
        raise ValueError("need 1 <= min_size <= max_size")
    source = random if rng is None else rng
    chunked_list = []
    i = 0
    while i < len(lst):
        chunk_size = source.randint(min_size, max_size)
        chunked_list.append(lst[i:i + chunk_size])
        i += chunk_size
    return chunked_list

# Turn every extracted pinyin sequence into a list of (sound, tone) syllables
# and add it to `all_pinyins`; sequences longer than three syllables are
# split into random consecutive chunks first.
for pinyin in pinyins:
    pys = [
        (syllable.group(1), syllable.group(2))
        for syllable in syllable_pattern.finditer(pinyin)
    ]
    if not pys:
        # No recognisable syllable: skip it, otherwise an empty list would
        # later become the "" key in `ipas` (and a file named ".png").
        continue
    if len(pys) > 3:
        all_pinyins.extend(random_chunk_list(pys))
    else:
        all_pinyins.append(pys)


# Initials are tried longest-first so that two-letter spellings ("gn", "ng")
# are not shadowed by their one-letter prefixes: in plain dict order "n" comes
# before "ng", which made the "ng" entry unreachable. sorted() is stable, so
# keys of equal length keep their dict order.
_initials_longest_first = sorted(
    coda_to_ipa.items(), key=lambda item: len(item[0]), reverse=True
)

for pinyin in all_pinyins:
    ipa = []
    pys = []
    for sound, tone in pinyin:
        # --- pinyin key: normalise the erhua spelling, write "gn" as "n" ---
        simplified_sound = sound
        for pattern, replacement in simplify_erhua.items():
            if re.match(pattern, simplified_sound):
                simplified_sound = re.sub(pattern, replacement, simplified_sound)
                break
        pys.append(re.sub(r"\bgn", "n", simplified_sound) + tone)

        # --- rime: erhua patterns take precedence over the plain rimes ---
        found_erhua = False
        for pattern, replacement in erhua_to_ipa.items():
            if re.match(pattern, sound):
                sound = re.sub(pattern, replacement, sound)
                found_erhua = True
                break
        if not found_erhua:
            for pattern, replacement in rime_to_ipa.items():
                # Group before anchoring so "$" applies to every alternative
                # of keys such as "you|iu", not only to the last one.
                pattern = "(?:" + pattern + ")$"
                if re.search(pattern, sound):
                    sound = re.sub(pattern, replacement, sound)
                    break

        # --- initial: first (longest) spelling that matches at the start ---
        for pattern, replacement in _initials_longest_first:
            pattern = "^" + pattern
            if re.match(pattern, sound):
                sound = re.sub(pattern, replacement, sound)
                break

        # --- tone: `tone` comes from syllable_pattern, i.e. one or two
        # digits in 1-4. The first digit uses first_tone_to_ipa; a second
        # digit, if present, uses second_tone_to_ipa.
        tone_ipa = first_tone_to_ipa[tone[0]]
        if len(tone) == 2:
            tone_ipa += second_tone_to_ipa[tone[1]]
        ipa.append(sound + tone_ipa)
    ipas[" ".join(pys)] = " ".join(ipa)



print(ipas)
print(f"Generated {len(ipas)} ipas")

import json
# ensure_ascii=False writes the IPA characters verbatim, so open the file as
# UTF-8 explicitly; the platform default encoding may be unable to encode them.
with open("ipas.txt", "w", encoding="utf-8") as output_file:
    json.dump(ipas, output_file, ensure_ascii=False, indent=2)


{'a1': 'a˥', 'ai1': 'ai˥', 'ai2': 'ai˨˩', 'ai3': 'ai˥˧', 'ai4': 'ai˨˩˧', 'an1': 'an˥', 'an2': 'an˨˩', 'an4': 'an˨˩˧', 'ang1': 'aŋ˥', 'ang2': 'aŋ˨˩', 'ang4': 'aŋ˨˩˧', 'ao1': 'au˥', 'ao2': 'au˨˩', 'ao3': 'au˥˧', 'ao4': 'au˨˩˧', 'ba1': 'pa˥', 'ba2': 'pa˨˩', 'ba3': 'pa˥˧', 'ba4': 'pa˨˩˧', 'bai3': 'pai˥˧', 'bai4': 'pai˨˩˧', 'ban1': 'pan˥', 'ban3': 'pan˥˧', 'ban4': 'pan˨˩˧', 'bang1': 'paŋ˥', 'bang3': 'paŋ˥˧', 'bang4': 'paŋ˨˩˧', 'bao1': 'pau˥', 'bao2': 'pau˨˩', 'bao3': 'pau˥˧', 'bao4': 'pau˨˩˧', 'be2': 'pe˨˩', 'bei1': 'pei˥', 'bei4': 'pei˨˩˧', 'ben1': 'pən˥', 'ben2': 'pən˨˩', 'ben3': 'pən˥˧', 'ben4': 'pən˨˩˧', 'bi1': 'pi˥', 'bi3': 'pi˥˧', 'bi4': 'pi˨˩˧', 'bi2': 'pi˨˩', 'bian1': 'piɛn˥', 'bian3': 'piɛn˥˧', 'bian4': 'piɛn˨˩˧', 'biao1': 'piau˥', 'biao3': 'piau˥˧', 'biao4': 'piau˨˩˧', 'bie1': 'pie˥', 'bin1': 'pin˥', 'bin3': 'pin˥˧', 'bin4': 'pin˨˩˧', 'bo1': 'po˥', 'bo3': 'po˥˧', 'bo4': 'po˨˩˧', 'bo2': 'po˨˩', 'bong4': 'poŋ˨˩˧', 'bu1': 'pu˥', 'bu3': 'pu˥˧', 'bu4': 'pu˨˩˧', 'bu2': 'pu˨˩', 'ca3': 't

In [7]:
from PIL import Image, ImageDraw, ImageFont
import shutil
import os

img_folder = "ipa_images"

# Always start from an empty output folder: remove a previous one if it is
# there, then create it fresh.
if os.path.exists(img_folder):
    shutil.rmtree(img_folder)
os.makedirs(img_folder)

# Rendering parameters: the TTF font is loaded at font_size * scale_factor.
scale_factor = 2
font_size = 15
font = ImageFont.truetype("CharisSIL-Regular.ttf", font_size * scale_factor)

def draw_ipa(pinyin: str, ipa: str):
    """Render `ipa` as dark text on white and save it as a PNG.

    The text is drawn with the module-level `font` on a canvas
    `scale_factor` times larger than the measured text, downsampled with
    LANCZOS, cropped to the text, darkened, and written to
    ``{img_folder}/{pinyin}.png``.

    Args:
        pinyin: Used verbatim as the file name (without extension).
        ipa: The string to draw.
    """
    text_color = (0, 0, 0, 100)  # Black

    # Measure the text width. ImageFont.getsize() was removed in Pillow 10;
    # the width it used to report equals right - left of getbbox(). Keep a
    # fallback for Pillow versions that predate getbbox().
    if hasattr(font, "getbbox"):
        left, _, right, _ = font.getbbox(ipa)
        text_width = right - left
    else:
        text_width, _ = font.getsize(ipa)
    text_height = 14

    # Create a blank, white, oversized canvas to draw on
    scaled_width = text_width * scale_factor
    scaled_height = text_height * scale_factor
    scaled_image = Image.new("RGB", (scaled_width, scaled_height), (255, 255, 255))  # White

    # Draw the text; the negative y offset shifts the glyphs upwards
    draw = ImageDraw.Draw(scaled_image)
    draw.text((0, -6 * scale_factor), ipa, font=font, fill=text_color)

    # Shrink the canvas by scale_factor using LANCZOS resampling, then keep
    # only the left text_width // scale_factor columns. The font was already
    # loaded at the scaled size, so the text fills only that part after
    # shrinking.
    image = scaled_image.resize(
        (text_width, text_height), resample=Image.LANCZOS
    ).crop((0, 0, text_width // scale_factor, text_height))

    # Darken every pixel whose red channel is below 230 (the anti-aliased
    # grays and the text itself); near-white background pixels are left alone.
    darker_factor = 0.8
    pixels = image.load()
    width, height = image.size
    for x in range(width):
        for y in range(height):
            r, g, b = pixels[x, y]
            if r < 230:
                pixels[x, y] = (
                    int(r * darker_factor),
                    int(g * darker_factor),
                    int(b * darker_factor),
                )

    # Save the image
    image.save(f"{img_folder}/{pinyin}.png")

In [8]:
# Render one PNG per generated entry; the pinyin key becomes the file name.
for pinyin, ipa in ipas.items():
    draw_ipa(pinyin, ipa)
