wagner-austin
committed on
Commit
·
1cc387c
1
Parent(s):
36e257e
Copy core package into Space for editable install
Browse files- .gitignore +5 -0
- pyproject.toml +44 -0
- requirements.txt +1 -3
- turkic_translit/__init__.py +9 -0
- turkic_translit/cli.py +109 -0
- turkic_translit/core.py +38 -0
- turkic_translit/logging_config.py +58 -0
- turkic_translit/patches.py +77 -0
- turkic_translit/rules/ar_lat.rules +15 -0
- turkic_translit/rules/kk_ipa.rules +47 -0
- turkic_translit/rules/kk_lat2023.rules +49 -0
- turkic_translit/rules/ky_ipa.rules +43 -0
- turkic_translit/rules/ky_lat2023.rules +40 -0
- turkic_translit/sanity.py +25 -0
.gitignore
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.egg-info/
|
| 3 |
+
*.whl
|
| 4 |
+
.venv/
|
| 5 |
+
.pytest_cache/
|
pyproject.toml
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=68", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
[project]
|
| 7 |
+
name = "turkic_transliterate"
|
| 8 |
+
version = "0.1.0"
|
| 9 |
+
description = "Deterministic Latin and IPA transliteration for Kazakh, Kyrgyz, plus tokenizer/glue scripts."
|
| 10 |
+
authors = [ {name="Austin Wagner", email="awagner@uci.edu"} ]
|
| 11 |
+
requires-python = ">=3.9"
|
| 12 |
+
|
| 13 |
+
dependencies = [
|
| 14 |
+
# Core dependencies (alphabetized)
|
| 15 |
+
"epitran>=1.0,<1.27", # 1.26.0 is the latest on PyPI
|
| 16 |
+
# Universal fasttext-wheel for all platforms and Python 3.10-3.13
|
| 17 |
+
"fasttext-wheel==0.9.2",
|
| 18 |
+
"numpy<2",
|
| 19 |
+
"packaging>=23.0", # Used in tests/test_fasttext.py
|
| 20 |
+
"panphon>=0.20,<0.22", # 0.21.2 is the newest published wheel
|
| 21 |
+
# Source build for non-Windows platforms
|
| 22 |
+
"PyICU>=2.15 ; sys_platform != 'win32'",
|
| 23 |
+
# Windows wheels for PyICU
|
| 24 |
+
"PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp310-cp310-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.10'",
|
| 25 |
+
"PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp311-cp311-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.11'",
|
| 26 |
+
"PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp312-cp312-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.12'",
|
| 27 |
+
"PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp313-cp313-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.13'",
|
| 28 |
+
"pytest>=8.0", # Test runner
|
| 29 |
+
"rapidfuzz>=3.5",
|
| 30 |
+
"rich>=13.7", # Color-aware logging and console output
|
| 31 |
+
"sentencepiece>=0.2.0"
|
| 32 |
+
]
|
| 33 |
+
|
| 34 |
+
[project.optional-dependencies]
|
| 35 |
+
# winlid dependency kept for backward compatibility but empty since fasttext-wheel is now in main dependencies
|
| 36 |
+
winlid = []
|
| 37 |
+
|
| 38 |
+
# Development tools
|
| 39 |
+
dev = ["black", "ruff"]
|
| 40 |
+
# User interface dependencies
|
| 41 |
+
ui = ["gradio"]
|
| 42 |
+
|
| 43 |
+
[project.scripts]
|
| 44 |
+
turkic-translit = "turkic_translit.cli:main"
|
requirements.txt
CHANGED
|
@@ -1,4 +1,2 @@
|
|
| 1 |
-
|
| 2 |
gradio>=4.30
|
| 3 |
-
sentencepiece>=0.2
|
| 4 |
-
rapidfuzz>=3.5
|
|
|
|
| 1 |
+
-e .[ui]
|
| 2 |
gradio>=4.30
|
|
|
|
|
|
turkic_translit/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from importlib.metadata import version
|
| 2 |
+
# Set up logging first before any other operations
|
| 3 |
+
from .logging_config import setup as _log_setup; _log_setup()
|
| 4 |
+
# Import patches next to ensure they're applied before other imports
|
| 5 |
+
from . import patches
|
| 6 |
+
from .core import to_latin, to_ipa
|
| 7 |
+
|
| 8 |
+
__all__ = ["to_latin", "to_ipa"]
|
| 9 |
+
__version__ = version("turkic_transliterate")
|
turkic_translit/cli.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys, argparse, pathlib, time, os, logging
|
| 2 |
+
from .core import to_latin, to_ipa
|
| 3 |
+
from .logging_config import setup as _log_setup
|
| 4 |
+
|
| 5 |
+
# Initialize logger
|
| 6 |
+
log = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
def main() -> None:
    """Run the ``turkic-translit`` command-line interface.

    Reads text line by line from ``--in`` (``-`` means stdin), writes the
    Latin transliteration of every line to ``--out_latin`` (``-`` means
    stdout) and, when ``--ipa`` is given, the IPA transliteration to
    ``--out_ipa``.

    Exits through ``argparse`` when ``--ipa`` is passed without ``--out_ipa``
    and with status 1 when the input cannot be decoded.
    """
    # Local import so this function does not depend on edits to the file's
    # import block.
    from contextlib import ExitStack

    ap = argparse.ArgumentParser(description="Turkic transliteration")
    ap.add_argument("--lang", required=True, choices=["kk", "ky"])
    ap.add_argument("--ipa", action="store_true", help="produce IPA")
    ap.add_argument("--arabic", action="store_true", help="also transliterate Arabic script")
    ap.add_argument("--in", dest="inp", default="-")
    ap.add_argument("--out_latin", default="-")
    ap.add_argument("--out_ipa")
    ap.add_argument("--benchmark", action="store_true")
    ap.add_argument("--log-level", choices=["debug", "info", "warning", "error", "critical"],
                    default="info",
                    help="Set logging level (default: info)")
    args = ap.parse_args()

    # Validate before any file is opened: opening --out_latin in "w" mode
    # truncates it, which must not happen for a command line that is rejected.
    if args.ipa and not args.out_ipa:
        ap.error("--ipa requires --out_ipa")

    level_name = args.log_level.upper()
    os.environ["TURKIC_LOG_LEVEL"] = level_name
    _log_setup()
    # _log_setup() is cached and normally has already run when the package was
    # imported, so the environment variable alone would be ignored here.
    # Apply the requested level to the root logger explicitly.
    logging.getLogger().setLevel(getattr(logging, level_name))

    outputs = ["latin"]
    if args.ipa:
        outputs.append("ipa")
    # Use Rich markup for output modes (magenta)
    outputs_markup = ", ".join(f"[magenta]{o}[/]" for o in outputs)
    log.info(
        f"Starting transliteration: lang={args.lang}, input={args.inp}, outputs={outputs_markup}, "
        f"out_latin={args.out_latin}, out_ipa={args.out_ipa}, arabic={args.arabic}, benchmark={args.benchmark}"
    )

    # Use UTF-8-sig for Windows to include BOM for proper encoding support
    encoding = "utf-8-sig" if sys.platform == "win32" else "utf-8"

    n = 0
    start = time.perf_counter()
    try:
        # ExitStack closes every file we opened (and the progress bar) even
        # when transliteration raises; stdin/stdout are never registered, so
        # they stay open.
        with ExitStack() as stack:
            fin = sys.stdin if args.inp == "-" else stack.enter_context(
                open(args.inp, encoding=encoding))
            fo_l = sys.stdout if args.out_latin == "-" else stack.enter_context(
                open(args.out_latin, "w", encoding=encoding))
            fo_i = None
            if args.ipa:
                fo_i = stack.enter_context(open(args.out_ipa, "w", encoding=encoding))

            # A progress bar is only useful when stderr is a terminal and the
            # input is a real file whose lines can be counted up front.
            pbar = None
            if sys.stderr.isatty() and args.inp != "-":
                try:
                    from tqdm import tqdm
                except ImportError:
                    log.debug("tqdm not available, falling back to basic processing")
                else:
                    total_lines = sum(1 for _ in fin)
                    fin.seek(0)  # Reset file pointer after counting
                    pbar = tqdm(total=total_lines, unit="lines")
                    stack.callback(pbar.close)
                    log.debug("Using tqdm progress bar for %d lines", total_lines)

            for line in fin:
                text = line.rstrip("\n")
                fo_l.write(to_latin(text, args.lang, args.arabic) + "\n")
                if fo_i is not None:
                    fo_i.write(to_ipa(text, args.lang) + "\n")
                n += 1
                # Compare with None: a tqdm bar with total=0 is falsy.
                if pbar is not None:
                    pbar.update(1)
    except UnicodeDecodeError as e:
        # Decoding happens while lines are read, not when the file is opened,
        # so the handler has to wrap the whole read loop.
        sys.stderr.write(f"Encoding error: {e}\n")
        sys.stderr.write("If you're on Windows, make sure your input file is properly encoded in UTF-8.\n")
        sys.exit(1)

    log.info(f"Finished writing {n} lines to {args.out_latin if args.out_latin != '-' else 'stdout'}" + (f" and {args.out_ipa}" if args.ipa else ""))

    elapsed = time.perf_counter() - start
    # Always log processing statistics, but at different levels based on benchmark flag
    report = log.info if args.benchmark else log.debug
    report("Processed %d lines in %.2fs (%.0f lines/s)", n, elapsed, n / elapsed if elapsed > 0 else 0)
    log.info("Transliteration complete.")
|
| 106 |
+
|
| 107 |
+
# This is the entry point when the module is run directly
|
| 108 |
+
if __name__ == "__main__":
|
| 109 |
+
main()
|
turkic_translit/core.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Public API for Latin and IPA transliteration."""
|
| 2 |
+
try:
|
| 3 |
+
import icu # noqa: F401
|
| 4 |
+
except ImportError as e: # PyICU wheel is still missing
|
| 5 |
+
raise RuntimeError(
|
| 6 |
+
"PyICU missing. On Windows run:\n"
|
| 7 |
+
" python scripts/get_pyicu_wheel.py\n"
|
| 8 |
+
"or manually install a wheel from "
|
| 9 |
+
"https://github.com/cgohlke/pyicu-build/releases ."
|
| 10 |
+
) from e
|
| 11 |
+
|
| 12 |
+
from functools import lru_cache
|
| 13 |
+
import unicodedata as ud
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Directory holding the bundled ICU rule files, next to this module.
_RULE_DIR = Path(__file__).parent / "rules"


@lru_cache
def _icu_trans(name: str) -> icu.Transliterator:
    """Build (and cache) an ICU transliterator from the rule file *name*.

    *name* is a file name inside ``_RULE_DIR``; it is also used as the
    transliterator's ID. Results are memoised per file name, so each rule
    file is read and compiled at most once per process.
    """
    rules_text = (_RULE_DIR / name).read_text(encoding="utf8")
    # The third argument is the direction; 0 is ICU's UTRANS_FORWARD.
    forward = 0
    return icu.Transliterator.createFromRules(name, rules_text, forward)
|
| 23 |
+
|
| 24 |
+
def to_latin(text: str, lang: str, include_arabic: bool = False) -> str:
    """Transliterate *text* to Latin script using the ``<lang>_lat2023`` rules.

    Args:
        text: Input text.
        lang: ``"kk"`` or ``"ky"``.
        include_arabic: When true, run the ``ar_lat.rules`` pass over the
            text first, then the language-specific rules.

    Returns:
        The transliterated text, NFC-normalised.

    Raises:
        ValueError: If *lang* is neither ``"kk"`` nor ``"ky"``.
    """
    if lang != "kk" and lang != "ky":
        raise ValueError("lang must be 'kk' or 'ky'")

    main_pass = _icu_trans(f"{lang}_lat2023.rules")
    if include_arabic:
        text = _icu_trans("ar_lat.rules").transliterate(text)
    return ud.normalize("NFC", main_pass.transliterate(text))
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def to_ipa(text: str, lang: str) -> str:
    """Transliterate *text* to IPA using the ``<lang>_ipa`` rules.

    Args:
        text: Input text.
        lang: ``"kk"`` or ``"ky"``.

    Returns:
        The IPA transcription, NFC-normalised.

    Raises:
        ValueError: If *lang* is neither ``"kk"`` nor ``"ky"``.
    """
    # Validate like to_latin does, so an unknown (or path-like) language code
    # fails with a clear error instead of reaching the rule-file lookup.
    if lang not in ("kk", "ky"):
        raise ValueError("lang must be 'kk' or 'ky'")
    trans = _icu_trans(f"{lang}_ipa.rules")
    return ud.normalize("NFC", trans.transliterate(text))
|
turkic_translit/logging_config.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Centralized logging configuration module.
|
| 3 |
+
Uses Rich for colorized output if available with fallback to standard library.
|
| 4 |
+
"""
|
| 5 |
+
import logging
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
from functools import lru_cache
|
| 9 |
+
|
| 10 |
+
# Get log level from environment or default to INFO
|
| 11 |
+
LOG_LEVEL = os.environ.get("TURKIC_LOG_LEVEL", "INFO").upper()
|
| 12 |
+
|
| 13 |
+
@lru_cache(maxsize=1)
|
| 14 |
+
def setup():
|
| 15 |
+
"""
|
| 16 |
+
Set up logging with Rich if available, with fallback to stdlib logging.
|
| 17 |
+
Uses TURKIC_LOG_LEVEL environment variable or defaults to INFO.
|
| 18 |
+
|
| 19 |
+
Uses @lru_cache to ensure this is only run once.
|
| 20 |
+
"""
|
| 21 |
+
root_logger = logging.getLogger()
|
| 22 |
+
|
| 23 |
+
# Clear any existing handlers
|
| 24 |
+
for handler in root_logger.handlers[:]:
|
| 25 |
+
root_logger.removeHandler(handler)
|
| 26 |
+
|
| 27 |
+
# Set the log level based on environment variable
|
| 28 |
+
log_level = getattr(logging, LOG_LEVEL, logging.INFO)
|
| 29 |
+
root_logger.setLevel(log_level)
|
| 30 |
+
|
| 31 |
+
# Try to use Rich for pretty, colorized output
|
| 32 |
+
try:
|
| 33 |
+
from rich.logging import RichHandler
|
| 34 |
+
|
| 35 |
+
# Configure Rich handler with appropriate settings
|
| 36 |
+
handler = RichHandler(
|
| 37 |
+
rich_tracebacks=True,
|
| 38 |
+
markup=True,
|
| 39 |
+
show_time=False,
|
| 40 |
+
show_path=False,
|
| 41 |
+
)
|
| 42 |
+
formatter = logging.Formatter("%(message)s")
|
| 43 |
+
|
| 44 |
+
except ImportError:
|
| 45 |
+
# Fall back to standard logging if Rich is not available
|
| 46 |
+
handler = logging.StreamHandler(sys.stderr)
|
| 47 |
+
formatter = logging.Formatter(
|
| 48 |
+
"%(levelname)s: %(message)s"
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
# Configure and add the handler
|
| 52 |
+
handler.setFormatter(formatter)
|
| 53 |
+
root_logger.addHandler(handler)
|
| 54 |
+
|
| 55 |
+
logger = logging.getLogger("turkic_translit")
|
| 56 |
+
logger.debug(f"Logging initialized at level {LOG_LEVEL}")
|
| 57 |
+
|
| 58 |
+
return logger
|
turkic_translit/patches.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Patches for third-party libraries to fix encoding issues on Windows.
|
| 3 |
+
This module is imported automatically at startup.
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
import functools
|
| 8 |
+
import logging
|
| 9 |
+
from .logging_config import setup; setup()
|
| 10 |
+
|
| 11 |
+
log = logging.getLogger(__name__)
|
| 12 |
+
_PATCH_DONE = False
|
| 13 |
+
_PATCHED_FILES = set()
|
| 14 |
+
|
| 15 |
+
def _fix_broken_ssl_cert_env():
|
| 16 |
+
"""
|
| 17 |
+
If the user (often Conda on Windows) left SSL_CERT_FILE pointing at a
|
| 18 |
+
non-existent bundle, httpx ⇢ gradio will crash on import. When the file
|
| 19 |
+
is missing we delete the env-var so Python falls back to the system
|
| 20 |
+
certificates.
|
| 21 |
+
"""
|
| 22 |
+
import os, pathlib, logging
|
| 23 |
+
log = logging.getLogger(__name__)
|
| 24 |
+
bundle = os.environ.get("SSL_CERT_FILE")
|
| 25 |
+
if bundle and not pathlib.Path(bundle).exists():
|
| 26 |
+
log.warning(
|
| 27 |
+
"SSL_CERT_FILE=%s does not exist – removing the variable so "
|
| 28 |
+
"httpx can create a default context", bundle)
|
| 29 |
+
del os.environ["SSL_CERT_FILE"]
|
| 30 |
+
|
| 31 |
+
def apply_patches():
    """Apply all necessary patches for third-party libraries.

    Always re-checks ``SSL_CERT_FILE`` first. The remaining work runs at most
    once per process (guarded by ``_PATCH_DONE``) and only on Windows: the
    built-in ``open`` is replaced by a wrapper that adds ``encoding='utf-8'``
    when a ``.csv`` path is opened in mode ``'r'`` without an explicit
    encoding, so panphon's data files are not decoded with the locale code page.
    """
    global _PATCH_DONE
    # Ensure SSL_CERT_FILE is valid before any third-party import.
    _fix_broken_ssl_cert_env()

    # Skip if patches have already been applied
    if _PATCH_DONE:
        log.debug("Patches already applied, skipping")
        return
    _PATCH_DONE = True

    # The panphon encoding problem is specific to Windows.
    if sys.platform != 'win32':
        return

    try:
        import panphon.featuretable  # noqa: F401  (only patch when panphon is present)
    except ImportError:
        log.warning("Could not patch panphon (not installed)")
        return

    import builtins

    # Save the original open function
    original_open = builtins.open

    def patched_open_for_panphon(file, mode='r', *args, **kwargs):
        """Forward to the real ``open``, defaulting CSV text reads to UTF-8."""
        # open(file, mode, buffering, encoding, ...): with two or more extra
        # positional arguments the caller already supplied an encoding, and
        # adding the keyword as well would raise TypeError.
        encoding_given = 'encoding' in kwargs or len(args) >= 2
        wants_patch = (
            'panphon' in sys.modules
            and mode == 'r'
            and isinstance(file, str)
            and file.endswith('.csv')
        )
        if wants_patch and not encoding_given:
            kwargs['encoding'] = 'utf-8'
            # Only log the first time per unique file
            if file not in _PATCHED_FILES:
                log.debug(f"Applied UTF-8 encoding patch for {file}")
                _PATCHED_FILES.add(file)
        return original_open(file, mode, *args, **kwargs)

    # Set the environment variable for good measure.
    # NOTE(review): PYTHONUTF8 is read at interpreter start-up, so this
    # presumably only influences child processes - confirm that is intended.
    os.environ['PYTHONUTF8'] = '1'

    # Apply the patch
    builtins.open = patched_open_for_panphon
    log.info("Applied panphon UTF-8 patch for Windows")
|
| 74 |
+
|
| 75 |
+
# Apply patches when module is imported
|
| 76 |
+
apply_patches()
|
| 77 |
+
log.debug("Patches module initialized")
|
turkic_translit/rules/ar_lat.rules
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# consonants
|
| 2 |
+
ب > b ; پ > p ; ت > t ; ج > j ; چ > ch ;
|
| 3 |
+
ح > h ; خ > x ; د > d ; ر > r ; ز > z ; س > s ;
|
| 4 |
+
ش > sh ; ص > s ; ط > t ; غ > gh ;
|
| 5 |
+
ف > f ; ق > q ; ك > k ; گ > g ; ل > l ; م > m ; ن > n ;
|
| 6 |
+
ه > h ; ھ > h ; ژ > zh ; ڭ > ng ; ۋ > w ;
|
| 7 |
+
|
| 8 |
+
# vowels (hamza carrier ئ can be dropped or mapped to ')
|
| 9 |
+
ا > a ; ە > e ; ۆ > ö ; و > o ;
|
| 10 |
+
ۇ > u ; ۈ > ü ; ى > i ; ې > ë ;
|
| 11 |
+
|
| 12 |
+
# glottals
|
| 13 |
+
ء > ' ; ع > ' ; ئ > ;
|
| 14 |
+
|
| 15 |
+
:: NFC ;
|
turkic_translit/rules/kk_ipa.rules
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Kazakh → IPA transliteration rules (kk_ipa.rules)
|
| 2 |
+
# One line per Cyrillic letter. Right-hand side is plain IPA (no slashes). NFC-normalised.
|
| 3 |
+
|
| 4 |
+
А > ɑ ; а > ɑ ;
|
| 5 |
+
Ә > æ ; ә > æ ;
|
| 6 |
+
Б > b ; б > b ;
|
| 7 |
+
В > v ; в > v ;
|
| 8 |
+
Г > ɡ ; г > ɡ ;
|
| 9 |
+
Ғ > ʁ ; ғ > ʁ ;
|
| 10 |
+
Д > d ; д > d ;
|
| 11 |
+
Е > e ; е > e ;
|
| 12 |
+
Ё > jo ; ё > jo ;
|
| 13 |
+
Ж > ʒ ; ж > ʒ ;
|
| 14 |
+
З > z ; з > z ;
|
| 15 |
+
И > i ; и > i ;
|
| 16 |
+
Й > j ; й > j ;
|
| 17 |
+
К > k ; к > k ;
|
| 18 |
+
Қ > q ; қ > q ;
|
| 19 |
+
Л > l ; л > l ;
|
| 20 |
+
М > m ; м > m ;
|
| 21 |
+
Н > n ; н > n ;
|
| 22 |
+
Ң > ŋ ; ң > ŋ ;
|
| 23 |
+
О > o ; о > o ;
|
| 24 |
+
Ө > ø ; ө > ø ;
|
| 25 |
+
П > p ; п > p ;
|
| 26 |
+
Р > r ; р > r ;
|
| 27 |
+
С > s ; с > s ;
|
| 28 |
+
Т > t ; т > t ;
|
| 29 |
+
У > u ; у > u ;
|
| 30 |
+
Ұ > ʊ ; ұ > ʊ ;
|
| 31 |
+
Ү > y ; ү > y ;
|
| 32 |
+
Ф > f ; ф > f ;
|
| 33 |
+
Х > x ; х > x ;
|
| 34 |
+
Һ > h ; һ > h ;
|
| 35 |
+
Ц > ts ; ц > ts ;
|
| 36 |
+
Ч > t͡ʃ ; ч > t͡ʃ ;
|
| 37 |
+
Ш > ʃ ; ш > ʃ ;
|
| 38 |
+
Щ > ɕː ; щ > ɕː ;
|
| 39 |
+
Ъ > ʔ ; ъ > ʔ ;
|
| 40 |
+
Ы > ɯ ; ы > ɯ ;
|
| 41 |
+
І > ɪ ; і > ɪ ;
|
| 42 |
+
Ь > ; ь > ;
|
| 43 |
+
Э > e ; э > e ;
|
| 44 |
+
Ю > ju ; ю > ju ;
|
| 45 |
+
Я > ja ; я > ja ;
|
| 46 |
+
|
| 47 |
+
:: NFC ;
|
turkic_translit/rules/kk_lat2023.rules
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Official Kazakh Latin alphabet (April 2021)
|
| 2 |
+
# https://en.wikipedia.org/wiki/Kazakh_alphabets
|
| 3 |
+
|
| 4 |
+
А > A ; а > a ;
|
| 5 |
+
Ә > Ä ; ә > ä ;
|
| 6 |
+
Б > B ; б > b ;
|
| 7 |
+
В > V ; в > v ;
|
| 8 |
+
Г > G ; г > g ;
|
| 9 |
+
Ғ > Ğ ; ғ > ğ ;
|
| 10 |
+
Д > D ; д > d ;
|
| 11 |
+
Е > E ; е > e ;
|
| 12 |
+
Ж > J ; ж > j ;
|
| 13 |
+
З > Z ; з > z ;
|
| 14 |
+
И > İ ; и > i ; # dotted İ/i
|
| 15 |
+
Й > İ ; й > i ; # official merging per standard (ambiguity known)
|
| 16 |
+
І > I ; і > ı ; # corrected: dotless lowercase ı
|
| 17 |
+
К > K ; к > k ;
|
| 18 |
+
Қ > Q ; қ > q ;
|
| 19 |
+
Л > L ; л > l ;
|
| 20 |
+
М > M ; м > m ;
|
| 21 |
+
Н > N ; н > n ;
|
| 22 |
+
Ң > Ñ ; ң > ñ ;
|
| 23 |
+
О > O ; о > o ;
|
| 24 |
+
Ө > Ö ; ө > ö ;
|
| 25 |
+
П > P ; п > p ;
|
| 26 |
+
Р > R ; р > r ;
|
| 27 |
+
С > S ; с > s ;
|
| 28 |
+
Т > T ; т > t ;
|
| 29 |
+
У > U ; у > u ;
|
| 30 |
+
Ұ > Ū ; ұ > ū ;
|
| 31 |
+
Ү > Ü ; ү > ü ;
|
| 32 |
+
Ф > F ; ф > f ;
|
| 33 |
+
Х > H ; х > h ;
|
| 34 |
+
Һ > H ; һ > h ;
|
| 35 |
+
|
| 36 |
+
# Russian loan letters (clearly marked, NOT official Kazakh letters)
|
| 37 |
+
Ё > Yo ; ё > yo ;
|
| 38 |
+
Э > Ė ; э > ė ;
|
| 39 |
+
Ц > Ts ; ц > ts ;
|
| 40 |
+
Ч > Ch ; ч > ch ;
|
| 41 |
+
Ш > Ş ; ш > ş ;
|
| 42 |
+
Щ > Şç ; щ > şç ;
|
| 43 |
+
Ы > Y ; ы > y ;
|
| 44 |
+
Ю > Yu ; ю > yu ;
|
| 45 |
+
Я > Ya ; я > ya ;
|
| 46 |
+
Ъ > ; ъ > ; # dropped entirely
|
| 47 |
+
Ь > ; ь > ; # dropped entirely
|
| 48 |
+
|
| 49 |
+
:: NFC ;
|
turkic_translit/rules/ky_ipa.rules
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Kyrgyz → IPA transliteration rules (ky_ipa.rules)
|
| 2 |
+
# One line per Cyrillic letter; NFC‐normalised; IPA given without slashes.
|
| 3 |
+
|
| 4 |
+
А > a ; а > a ;
|
| 5 |
+
Б > b ; б > b ;
|
| 6 |
+
В > v ; в > v ;
|
| 7 |
+
Г > ɡ ; г > ɡ ;
|
| 8 |
+
Ғ > ʁ ; ғ > ʁ ;
|
| 9 |
+
Д > d ; д > d ;
|
| 10 |
+
Е > e ; е > e ;
|
| 11 |
+
Ё > jo ; ё > jo ;
|
| 12 |
+
Ж > d͡ʒ ; ж > d͡ʒ ;
|
| 13 |
+
З > z ; з > z ;
|
| 14 |
+
И > i ; и > i ;
|
| 15 |
+
Й > j ; й > j ;
|
| 16 |
+
К > k ; к > k ;
|
| 17 |
+
Қ > q ; қ > q ;
|
| 18 |
+
Л > l ; л > l ;
|
| 19 |
+
М > m ; м > m ;
|
| 20 |
+
Н > n ; н > n ;
|
| 21 |
+
Ң > ŋ ; ң > ŋ ;
|
| 22 |
+
О > o ; о > o ;
|
| 23 |
+
Ө > ø ; ө > ø ;
|
| 24 |
+
П > p ; п > p ;
|
| 25 |
+
Р > r ; р > r ;
|
| 26 |
+
С > s ; с > s ;
|
| 27 |
+
Т > t ; т > t ;
|
| 28 |
+
У > u ; у > u ;
|
| 29 |
+
Ү > y ; ү > y ;
|
| 30 |
+
Ф > f ; ф > f ;
|
| 31 |
+
Х > x ; х > x ;
|
| 32 |
+
Ц > ts ; ц > ts ;
|
| 33 |
+
Ч > t͡ʃ ; ч > t͡ʃ ;
|
| 34 |
+
Ш > ʃ ; ш > ʃ ;
|
| 35 |
+
Щ > ɕː ; щ > ɕː ;
|
| 36 |
+
Ы > ɯ ; ы > ɯ ;
|
| 37 |
+
Э > ɛ ; э > ɛ ;
|
| 38 |
+
Ю > ju ; ю > ju ;
|
| 39 |
+
Я > ja ; я > ja ;
|
| 40 |
+
Ъ > ʔ ; ъ > ʔ ;
|
| 41 |
+
Ь > ; ь > ;
|
| 42 |
+
|
| 43 |
+
:: NFC ;
|
turkic_translit/rules/ky_lat2023.rules
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Kyrgyz → Modern Practical Latin Transliteration (NFC)
|
| 2 |
+
# One line per pair; use pre‑composed Unicode code‑points; digraphs are atomic tokens.
|
| 3 |
+
А > A ; а > a ;
|
| 4 |
+
Б > B ; б > b ;
|
| 5 |
+
В > V ; в > v ;
|
| 6 |
+
Г > G ; г > g ;
|
| 7 |
+
Д > D ; д > d ;
|
| 8 |
+
Е > E ; е > e ;
|
| 9 |
+
Ё > Yo ; ё > yo ;
|
| 10 |
+
Ж > J ; ж > j ;
|
| 11 |
+
З > Z ; з > z ;
|
| 12 |
+
И > İ ; и > i ; # dotted I
|
| 13 |
+
Й > Ý ; й > ý ; # /j/ glide
|
| 14 |
+
К > K ; к > k ;
|
| 15 |
+
Л > L ; л > l ;
|
| 16 |
+
М > M ; м > m ;
|
| 17 |
+
Н > N ; н > n ;
|
| 18 |
+
Ң > Ñ ; ң > ñ ;
|
| 19 |
+
О > O ; о > o ;
|
| 20 |
+
Ө > Ö ; ө > ö ;
|
| 21 |
+
П > P ; п > p ;
|
| 22 |
+
Р > R ; р > r ;
|
| 23 |
+
С > S ; с > s ;
|
| 24 |
+
Т > T ; т > t ;
|
| 25 |
+
У > U ; у > u ;
|
| 26 |
+
Ү > Ü ; ү > ü ;
|
| 27 |
+
Ф > F ; ф > f ;
|
| 28 |
+
Х > H ; х > h ;
|
| 29 |
+
Ц > Ts ; ц > ts ; # digraph
|
| 30 |
+
Ч > Ç ; ч > ç ;
|
| 31 |
+
Ш > Ş ; ш > ş ;
|
| 32 |
+
Щ > Şç ; щ > şç ; # digraph
|
| 33 |
+
Ы > Y ; ы > y ;
|
| 34 |
+
Э > É ; э > é ;
|
| 35 |
+
Ю > Yu ; ю > yu ; # digraph
|
| 36 |
+
Я > Ya ; я > ya ; # digraph
|
| 37 |
+
Ъ > ʼ ; ъ > ʼ ; # modifier apostrophe U+02BC
|
| 38 |
+
Ь > ʼ ; ь > ʼ ;
|
| 39 |
+
|
| 40 |
+
:: NFC ;
|
turkic_translit/sanity.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Helper functions for Levenshtein and byte checks."""
|
| 2 |
+
from rapidfuzz.distance import Levenshtein
|
| 3 |
+
import io
|
| 4 |
+
import os, re, unicodedata as ud
|
| 5 |
+
|
| 6 |
+
def median_lev(file_lat:str, file_ipa:str, sample:int=5000) -> float:
    """Return the median normalised Levenshtein distance between two files.

    The files are read in lockstep as UTF-8; line *k* of ``file_lat`` is
    compared with line *k* of ``file_ipa`` after stripping surrounding
    whitespace. At most ``sample`` line pairs are compared, and reading stops
    at the end of the shorter file.
    """
    from statistics import median

    distances = []
    with io.open(file_lat, encoding="utf8") as lat_fh, io.open(file_ipa, encoding="utf8") as ipa_fh:
        for seen, pair in enumerate(zip(lat_fh, ipa_fh)):
            if seen == sample:
                break
            lat_line, ipa_line = pair
            distances.append(
                Levenshtein.normalized_distance(lat_line.strip(), ipa_line.strip())
            )
    return median(distances)
|
| 14 |
+
|
| 15 |
+
def bytes_per_char(filename:str)->float:
    """Return the average number of bytes per character in a UTF-8 file.

    The numerator is the on-disk size from ``os.path.getsize``; the
    denominator is the number of characters obtained by decoding the file as
    UTF-8 in text mode (newline characters included).

    Returns:
        ``size / characters``, or ``0.0`` for a file with no characters
        (avoids a ``ZeroDivisionError`` on empty files).
    """
    size_bytes = os.path.getsize(filename)
    with open(filename, encoding="utf8") as f:
        chars = sum(len(line) for line in f)
    if chars == 0:
        return 0.0
    return size_bytes / chars
|
| 21 |
+
|
| 22 |
+
def is_nfc(filename:str)->bool:
    """Return True when every line of the UTF-8 file *filename* is NFC-normalised.

    An empty file yields True. Stops at the first line that is not normalised.
    """
    # Uses the module-level ``unicodedata as ud`` import and the built-in
    # ``open`` instead of re-importing both modules inside the function.
    with open(filename, encoding="utf8") as f:
        return all(ud.is_normalized("NFC", line) for line in f)
|