wagner-austin committed on
Commit
1cc387c
·
1 Parent(s): 36e257e

Copy core package into Space for editable install

Browse files
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.egg-info/
3
+ *.whl
4
+ .venv/
5
+ .pytest_cache/
pyproject.toml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+
6
+ [project]
7
+ name = "turkic_transliterate"
8
+ version = "0.1.0"
9
+ description = "Deterministic Latin and IPA transliteration for Kazakh, Kyrgyz, plus tokenizer/glue scripts."
10
+ authors = [ {name="Austin Wagner", email="awagner@uci.edu"} ]
11
+ requires-python = ">=3.9"
12
+
13
+ dependencies = [
14
+ # Core dependencies (alphabetized)
15
+ "epitran>=1.0,<1.27", # 1.26.0 is the latest on PyPI
16
+ # Universal fasttext-wheel for all platforms and Python 3.10-3.13
17
+ "fasttext-wheel==0.9.2",
18
+ "numpy<2",
19
+ "packaging>=23.0", # Used in tests/test_fasttext.py
20
+ "panphon>=0.20,<0.22", # 0.21.2 is the newest published wheel
21
+ # Source build for non-Windows platforms
22
+ "PyICU>=2.15 ; sys_platform != 'win32'",
23
+ # Windows wheels for PyICU
24
+ "PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp310-cp310-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.10'",
25
+ "PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp311-cp311-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.11'",
26
+ "PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp312-cp312-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.12'",
27
+ "PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp313-cp313-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.13'",
28
+ "pytest>=8.0", # Test runner
29
+ "rapidfuzz>=3.5",
30
+ "rich>=13.7", # Color-aware logging and console output
31
+ "sentencepiece>=0.2.0"
32
+ ]
33
+
34
+ [project.optional-dependencies]
35
+ # winlid dependency kept for backward compatibility but empty since fasttext-wheel is now in main dependencies
36
+ winlid = []
37
+
38
+ # Development tools
39
+ dev = ["black", "ruff"]
40
+ # User interface dependencies
41
+ ui = ["gradio"]
42
+
43
+ [project.scripts]
44
+ turkic-translit = "turkic_translit.cli:main"
requirements.txt CHANGED
@@ -1,4 +1,2 @@
1
- turkic_transliterate==0.1.0 # will pull PyICU, panphon, etc.
2
  gradio>=4.30
3
- sentencepiece>=0.2
4
- rapidfuzz>=3.5
 
1
+ -e .[ui]
2
  gradio>=4.30
 
 
turkic_translit/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from importlib.metadata import version
2
+ # Set up logging first before any other operations
3
+ from .logging_config import setup as _log_setup; _log_setup()
4
+ # Import patches next to ensure they're applied before other imports
5
+ from . import patches
6
+ from .core import to_latin, to_ipa
7
+
8
+ __all__ = ["to_latin", "to_ipa"]
9
+ __version__ = version("turkic_transliterate")
turkic_translit/cli.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys, argparse, pathlib, time, os, logging
2
+ from .core import to_latin, to_ipa
3
+ from .logging_config import setup as _log_setup
4
+
5
+ # Initialize logger
6
+ log = logging.getLogger(__name__)
7
+
8
def main() -> None:
    """Command-line entry point: transliterate text line by line.

    Reads lines from ``--in`` (``-`` means stdin), writes the Latin
    transliteration to ``--out_latin`` (``-`` means stdout) and, when
    ``--ipa`` is given, the IPA transliteration to ``--out_ipa``.

    Exits with status 2 on a usage error (argparse) and with status 1 when
    the input cannot be decoded with the expected encoding.
    """
    from contextlib import ExitStack

    ap = argparse.ArgumentParser(description="Turkic transliteration")
    ap.add_argument("--lang", required=True, choices=["kk", "ky"])
    ap.add_argument("--ipa", action="store_true", help="produce IPA")
    ap.add_argument("--arabic", action="store_true", help="also transliterate Arabic script")
    ap.add_argument("--in", dest="inp", default="-")
    ap.add_argument("--out_latin", default="-")
    ap.add_argument("--out_ipa")
    ap.add_argument("--benchmark", action="store_true")
    ap.add_argument(
        "--log-level",
        choices=["debug", "info", "warning", "error", "critical"],
        default="info",
        help="Set logging level (default: info)",
    )
    args = ap.parse_args()

    # Validate before any output file is opened, so a usage error cannot
    # leave a freshly truncated output file behind.
    if args.ipa and not args.out_ipa:
        ap.error("--ipa requires --out_ipa")

    # Publish the requested level through the environment and (re)run setup.
    os.environ["TURKIC_LOG_LEVEL"] = args.log_level.upper()
    _log_setup()
    # The setup function is cached and is already invoked when the package is
    # imported, so the call above may be a no-op; apply the level directly too.
    logging.getLogger().setLevel(getattr(logging, args.log_level.upper()))

    outputs = ["latin"]
    if args.ipa:
        outputs.append("ipa")
    # Use Rich markup for output modes (magenta)
    outputs_markup = ", ".join(f"[magenta]{o}[/]" for o in outputs)
    log.info(
        f"Starting transliteration: lang={args.lang}, input={args.inp}, outputs={outputs_markup}, "
        f"out_latin={args.out_latin}, out_ipa={args.out_ipa}, arabic={args.arabic}, benchmark={args.benchmark}"
    )

    # Use UTF-8-sig on Windows so a BOM is consumed on input and written on output.
    encoding = "utf-8-sig" if sys.platform == "win32" else "utf-8"

    n = 0
    pbar = None

    # ExitStack closes every file we opened ourselves (never stdin/stdout),
    # including when an exception or sys.exit() interrupts processing.
    with ExitStack() as stack:
        fin = sys.stdin if args.inp == "-" else stack.enter_context(
            open(args.inp, encoding=encoding)
        )
        fo_l = sys.stdout if args.out_latin == "-" else stack.enter_context(
            open(args.out_latin, "w", encoding=encoding)
        )
        fo_i = (
            stack.enter_context(open(args.out_ipa, "w", encoding=encoding))
            if args.ipa
            else None
        )

        start = time.time()

        try:
            # A progress bar only makes sense when stderr is a terminal and the
            # input is a real file (it must be read twice to count its lines).
            if sys.stderr.isatty() and args.inp != "-":
                try:
                    from tqdm import tqdm

                    total_lines = sum(1 for _ in fin)
                    fin.seek(0)  # Reset file pointer
                    pbar = tqdm(total=total_lines, unit="lines")
                    log.debug("Using tqdm progress bar for %d lines", total_lines)
                except ImportError:
                    log.debug("tqdm not available, falling back to basic processing")

            for line in fin:
                text = line.rstrip("\n")
                fo_l.write(to_latin(text, args.lang, args.arabic) + "\n")
                if fo_i is not None:
                    fo_i.write(to_ipa(text, args.lang) + "\n")
                n += 1
                if pbar is not None:
                    pbar.update(1)
        except UnicodeDecodeError as e:
            # Decoding happens while reading, not in open(), so the handler
            # has to wrap the read loop.
            sys.stderr.write(f"Encoding error: {e}\n")
            sys.stderr.write("If you're on Windows, make sure your input file is properly encoded in UTF-8.\n")
            sys.exit(1)
        finally:
            if pbar is not None:
                pbar.close()

    log.info(f"Finished writing {n} lines to {args.out_latin if args.out_latin != '-' else 'stdout'}" + (f" and {args.out_ipa}" if args.ipa else ""))

    elapsed = time.time() - start
    # Always log processing statistics, but at different levels based on benchmark flag
    if args.benchmark:
        log.info("Processed %d lines in %.2fs (%.0f lines/s)", n, elapsed, n/elapsed if elapsed > 0 else 0)
    else:
        log.debug("Processed %d lines in %.2fs (%.0f lines/s)", n, elapsed, n/elapsed if elapsed > 0 else 0)
    log.info("Transliteration complete.")
106
+
107
+ # This is the entry point when the module is run directly
108
+ if __name__ == "__main__":
109
+ main()
turkic_translit/core.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Public API for Latin and IPA transliteration."""
2
+ try:
3
+ import icu # noqa: F401
4
+ except ImportError as e: # PyICU wheel is still missing
5
+ raise RuntimeError(
6
+ "PyICU missing. On Windows run:\n"
7
+ " python scripts/get_pyicu_wheel.py\n"
8
+ "or manually install a wheel from "
9
+ "https://github.com/cgohlke/pyicu-build/releases ."
10
+ ) from e
11
+
12
+ from functools import lru_cache
13
+ import unicodedata as ud
14
+ from pathlib import Path
15
+
16
+
17
+ _RULE_DIR = Path(__file__).with_suffix("").parent / "rules"
18
+
19
@lru_cache
def _icu_trans(name: str) -> icu.Transliterator:
    """Compile the rule file *name* from the rules directory into a transliterator.

    Results are memoised per file name, so each rule file is read and
    compiled at most once per process.
    """
    rules_text = (_RULE_DIR / name).read_text(encoding="utf8")
    direction = 0  # ICU's forward direction
    return icu.Transliterator.createFromRules(name, rules_text, direction)
23
+
24
def to_latin(text: str, lang: str, include_arabic: bool = False) -> str:
    """Transliterate *text* to Latin script and return it NFC-normalised.

    Args:
        text: Input text.
        lang: Language code, ``"kk"`` or ``"ky"``.
        include_arabic: When true, the ``ar_lat.rules`` transliterator is
            applied to the text before the language-specific rules.

    Raises:
        ValueError: If *lang* is neither ``"kk"`` nor ``"ky"``.
    """
    if lang != "kk" and lang != "ky":
        raise ValueError("lang must be 'kk' or 'ky'")

    latin_rules = _icu_trans(f"{lang}_lat2023.rules")
    if include_arabic:
        text = _icu_trans("ar_lat.rules").transliterate(text)
    return ud.normalize("NFC", latin_rules.transliterate(text))
34
+
35
+
36
def to_ipa(text: str, lang: str) -> str:
    """Transliterate *text* to IPA and return it NFC-normalised.

    Args:
        text: Input text.
        lang: Language code, ``"kk"`` or ``"ky"``.

    Raises:
        ValueError: If *lang* is neither ``"kk"`` nor ``"ky"``.
    """
    # Validate exactly like to_latin does: *lang* is interpolated into a file
    # name, so unchecked values must never reach the rule loader.
    if lang not in ("kk", "ky"):
        raise ValueError("lang must be 'kk' or 'ky'")
    trans = _icu_trans(f"{lang}_ipa.rules")
    return ud.normalize("NFC", trans.transliterate(text))
turkic_translit/logging_config.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Centralized logging configuration module.
3
+ Uses Rich for colorized output if available with fallback to standard library.
4
+ """
5
+ import logging
6
+ import os
7
+ import sys
8
+ from functools import lru_cache
9
+
10
+ # Get log level from environment or default to INFO
11
+ LOG_LEVEL = os.environ.get("TURKIC_LOG_LEVEL", "INFO").upper()
12
+
13
@lru_cache(maxsize=1)
def setup():
    """
    Set up logging with Rich if available, with fallback to stdlib logging.

    The level is taken from the TURKIC_LOG_LEVEL environment variable as it
    is when this function runs (default INFO). A value that does not name a
    numeric logging level falls back to INFO.

    Cached with @lru_cache, so the configuration is applied once; later
    calls return the same logger without reconfiguring anything.

    Returns:
        The ``turkic_translit`` logger.
    """
    root_logger = logging.getLogger()

    # Clear any existing handlers so only the handler installed below remains
    for handler in root_logger.handlers[:]:
        root_logger.removeHandler(handler)

    # Read the environment at call time rather than relying on a value
    # captured when the module was imported.
    level_name = os.environ.get("TURKIC_LOG_LEVEL", "INFO").upper()
    log_level = getattr(logging, level_name, None)
    # Some upper-case attributes of the logging module are not levels
    # (e.g. BASIC_FORMAT is a string); accept integers only.
    if not isinstance(log_level, int):
        level_name, log_level = "INFO", logging.INFO
    root_logger.setLevel(log_level)

    # Try to use Rich for pretty, colorized output
    try:
        from rich.logging import RichHandler

        handler = RichHandler(
            rich_tracebacks=True,
            markup=True,
            show_time=False,
            show_path=False,
        )
        formatter = logging.Formatter("%(message)s")

    except ImportError:
        # Fall back to standard logging if Rich is not available
        handler = logging.StreamHandler(sys.stderr)
        formatter = logging.Formatter(
            "%(levelname)s: %(message)s"
        )

    # Configure and add the handler
    handler.setFormatter(formatter)
    root_logger.addHandler(handler)

    logger = logging.getLogger("turkic_translit")
    logger.debug("Logging initialized at level %s", level_name)

    return logger
turkic_translit/patches.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Patches for third-party libraries to fix encoding issues on Windows.
3
+ This module is imported automatically at startup.
4
+ """
5
+ import os
6
+ import sys
7
+ import functools
8
+ import logging
9
+ from .logging_config import setup; setup()
10
+
11
+ log = logging.getLogger(__name__)
12
+ _PATCH_DONE = False
13
+ _PATCHED_FILES = set()
14
+
15
+ def _fix_broken_ssl_cert_env():
16
+ """
17
+ If the user (often Conda on Windows) left SSL_CERT_FILE pointing at a
18
+ non-existent bundle, httpx ⇢ gradio will crash on import. When the file
19
+ is missing we delete the env-var so Python falls back to the system
20
+ certificates.
21
+ """
22
+ import os, pathlib, logging
23
+ log = logging.getLogger(__name__)
24
+ bundle = os.environ.get("SSL_CERT_FILE")
25
+ if bundle and not pathlib.Path(bundle).exists():
26
+ log.warning(
27
+ "SSL_CERT_FILE=%s does not exist – removing the variable so "
28
+ "httpx can create a default context", bundle)
29
+ del os.environ["SSL_CERT_FILE"]
30
+
31
def apply_patches():
    """Apply all necessary patches for third-party libraries.

    The SSL_CERT_FILE sanity check runs on every call. The remaining work
    runs once per process: on Windows, if panphon is importable, the
    built-in ``open`` is wrapped so that ``.csv`` files opened for reading
    without an explicit encoding are read as UTF-8.
    """
    global _PATCH_DONE
    # Ensure SSL_CERT_FILE is valid before any third-party import
    _fix_broken_ssl_cert_env()

    # Skip if patches have already been applied
    if _PATCH_DONE:
        log.debug("Patches already applied, skipping")
        return
    _PATCH_DONE = True

    # The panphon encoding problem is only patched on Windows
    if sys.platform != 'win32':
        return

    try:
        import panphon.featuretable  # noqa: F401  (only checks availability)
    except ImportError:
        log.warning("Could not patch panphon (not installed)")
        return

    import builtins

    # Save the original open function
    original_open = builtins.open

    def patched_open_for_panphon(file, mode='r', *args, **kwargs):
        # Positional arguments after mode are (buffering, encoding, ...), so a
        # second extra positional means the caller already chose an encoding;
        # adding the keyword as well would raise TypeError.
        encoding_given = 'encoding' in kwargs or len(args) >= 2
        path = os.fspath(file) if isinstance(file, os.PathLike) else file
        if (
            not encoding_given
            and 'panphon' in sys.modules
            and mode in ('r', 'rt')
            and isinstance(path, str)
            and path.endswith('.csv')
        ):
            kwargs['encoding'] = 'utf-8'
            # Only log the first time per unique file
            if path not in _PATCHED_FILES:
                log.debug(f"Applied UTF-8 encoding patch for {file}")
                _PATCHED_FILES.add(path)
        return original_open(file, mode, *args, **kwargs)

    # Set the environment variable for good measure.
    # NOTE(review): Python reads PYTHONUTF8 at interpreter startup, so this
    # presumably only influences child processes - confirm that is intended.
    os.environ['PYTHONUTF8'] = '1'

    # Apply the patch
    builtins.open = patched_open_for_panphon
    log.info("Applied panphon UTF-8 patch for Windows")
74
+
75
+ # Apply patches when module is imported
76
+ apply_patches()
77
+ log.debug("Patches module initialized")
turkic_translit/rules/ar_lat.rules ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # consonants
2
+ ب > b ; پ > p ; ت > t ; ج > j ; چ > ch ;
3
+ ح > h ; خ > x ; د > d ; ر > r ; ز > z ; س > s ;
4
+ ش > sh ; ص > s ; ط > t ; غ > gh ;
5
+ ف > f ; ق > q ; ك > k ; گ > g ; ل > l ; م > m ; ن > n ;
6
+ ه > h ; ھ > h ; ژ > zh ; ڭ > ng ; ۋ > w ;
7
+
8
+ # vowels (hamza carrier ئ can be dropped or mapped to ')
9
+ ا > a ; ە > e ; ۆ > ö ; و > o ;
10
+ ۇ > u ; ۈ > ü ; ى > i ; ې > ë ;
11
+
12
+ # glottals
13
# A literal apostrophe is written as two single quotes: in ICU rule syntax a
# lone single quote opens a quoted literal that runs to the next single quote.
ء > '' ; ع > '' ; ئ > ;
14
+
15
+ :: NFC ;
turkic_translit/rules/kk_ipa.rules ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Kazakh → IPA transliteration rules (kk_ipa.rules)
2
+ # One line per Cyrillic letter. Right-hand side is plain IPA (no slashes). NFC-normalised.
3
+
4
+ А > ɑ ; а > ɑ ;
5
+ Ә > æ ; ә > æ ;
6
+ Б > b ; б > b ;
7
+ В > v ; в > v ;
8
+ Г > ɡ ; г > ɡ ;
9
+ Ғ > ʁ ; ғ > ʁ ;
10
+ Д > d ; д > d ;
11
+ Е > e ; е > e ;
12
+ Ё > jo ; ё > jo ;
13
+ Ж > ʒ ; ж > ʒ ;
14
+ З > z ; з > z ;
15
+ И > i ; и > i ;
16
+ Й > j ; й > j ;
17
+ К > k ; к > k ;
18
+ Қ > q ; қ > q ;
19
+ Л > l ; л > l ;
20
+ М > m ; м > m ;
21
+ Н > n ; н > n ;
22
+ Ң > ŋ ; ң > ŋ ;
23
+ О > o ; о > o ;
24
+ Ө > ø ; ө > ø ;
25
+ П > p ; п > p ;
26
+ Р > r ; р > r ;
27
+ С > s ; с > s ;
28
+ Т > t ; т > t ;
29
+ У > u ; у > u ;
30
+ Ұ > ʊ ; ұ > ʊ ;
31
+ Ү > y ; ү > y ;
32
+ Ф > f ; ф > f ;
33
+ Х > x ; х > x ;
34
+ Һ > h ; һ > h ;
35
+ Ц > ts ; ц > ts ;
36
+ Ч > t͡ʃ ; ч > t͡ʃ ;
37
+ Ш > ʃ ; ш > ʃ ;
38
+ Щ > ɕː ; щ > ɕː ;
39
+ Ъ > ʔ ; ъ > ʔ ;
40
+ Ы > ɯ ; ы > ɯ ;
41
+ І > ɪ ; і > ɪ ;
42
+ Ь > ; ь > ;
43
+ Э > e ; э > e ;
44
+ Ю > ju ; ю > ju ;
45
+ Я > ja ; я > ja ;
46
+
47
+ :: NFC ;
turkic_translit/rules/kk_lat2023.rules ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Official Kazakh Latin alphabet (April 2021)
2
+ # https://en.wikipedia.org/wiki/Kazakh_alphabets
3
+
4
+ А > A ; а > a ;
5
+ Ә > Ä ; ә > ä ;
6
+ Б > B ; б > b ;
7
+ В > V ; в > v ;
8
+ Г > G ; г > g ;
9
+ Ғ > Ğ ; ғ > ğ ;
10
+ Д > D ; д > d ;
11
+ Е > E ; е > e ;
12
+ Ж > J ; ж > j ;
13
+ З > Z ; з > z ;
14
+ И > İ ; и > i ; # dotted İ/i
15
+ Й > İ ; й > i ; # official merging per standard (ambiguity known)
16
+ І > I ; і > ı ; # corrected: dotless lowercase ı
17
+ К > K ; к > k ;
18
+ Қ > Q ; қ > q ;
19
+ Л > L ; л > l ;
20
+ М > M ; м > m ;
21
+ Н > N ; н > n ;
22
+ Ң > Ñ ; ң > ñ ;
23
+ О > O ; о > o ;
24
+ Ө > Ö ; ө > ö ;
25
+ П > P ; п > p ;
26
+ Р > R ; р > r ;
27
+ С > S ; с > s ;
28
+ Т > T ; т > t ;
29
+ У > U ; у > u ;
30
+ Ұ > Ū ; ұ > ū ;
31
+ Ү > Ü ; ү > ü ;
32
+ Ф > F ; ф > f ;
33
+ Х > H ; х > h ;
34
+ Һ > H ; һ > h ;
35
+
36
+ # Russian loan letters (NOT official Kazakh letters), grouped with Ш and Ы, which are native Kazakh letters
37
+ Ё > Yo ; ё > yo ;
38
+ Э > Ė ; э > ė ;
39
+ Ц > Ts ; ц > ts ;
40
+ Ч > Ch ; ч > ch ;
41
+ Ш > Ş ; ш > ş ;
42
+ Щ > Şç ; щ > şç ;
43
+ Ы > Y ; ы > y ;
44
+ Ю > Yu ; ю > yu ;
45
+ Я > Ya ; я > ya ;
46
+ Ъ > ; ъ > ; # dropped entirely
47
+ Ь > ; ь > ; # dropped entirely
48
+
49
+ :: NFC ;
turkic_translit/rules/ky_ipa.rules ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Kyrgyz → IPA transliteration rules (ky_ipa.rules)
2
+ # One line per Cyrillic letter; NFC‐normalised; IPA given without slashes.
3
+
4
+ А > a ; а > a ;
5
+ Б > b ; б > b ;
6
+ В > v ; в > v ;
7
+ Г > ɡ ; г > ɡ ;
8
+ Ғ > ʁ ; ғ > ʁ ;
9
+ Д > d ; д > d ;
10
+ Е > e ; е > e ;
11
+ Ё > jo ; ё > jo ;
12
+ Ж > d͡ʒ ; ж > d͡ʒ ;
13
+ З > z ; з > z ;
14
+ И > i ; и > i ;
15
+ Й > j ; й > j ;
16
+ К > k ; к > k ;
17
+ Қ > q ; қ > q ;
18
+ Л > l ; л > l ;
19
+ М > m ; м > m ;
20
+ Н > n ; н > n ;
21
+ Ң > ŋ ; ң > ŋ ;
22
+ О > o ; о > o ;
23
+ Ө > ø ; ө > ø ;
24
+ П > p ; п > p ;
25
+ Р > r ; р > r ;
26
+ С > s ; с > s ;
27
+ Т > t ; т > t ;
28
+ У > u ; у > u ;
29
+ Ү > y ; ү > y ;
30
+ Ф > f ; ф > f ;
31
+ Х > x ; х > x ;
32
+ Ц > ts ; ц > ts ;
33
+ Ч > t͡ʃ ; ч > t͡ʃ ;
34
+ Ш > ʃ ; ш > ʃ ;
35
+ Щ > ɕː ; щ > ɕː ;
36
+ Ы > ɯ ; ы > ɯ ;
37
+ Э > ɛ ; э > ɛ ;
38
+ Ю > ju ; ю > ju ;
39
+ Я > ja ; я > ja ;
40
+ Ъ > ʔ ; ъ > ʔ ;
41
+ Ь > ; ь > ;
42
+
43
+ :: NFC ;
turkic_translit/rules/ky_lat2023.rules ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Kyrgyz → Modern Practical Latin Transliteration (NFC)
2
+ # One line per pair; use pre‑composed Unicode code‑points; digraphs are atomic tokens.
3
+ А > A ; а > a ;
4
+ Б > B ; б > b ;
5
+ В > V ; в > v ;
6
+ Г > G ; г > g ;
7
+ Д > D ; д > d ;
8
+ Е > E ; е > e ;
9
+ Ё > Yo ; ё > yo ;
10
+ Ж > J ; ж > j ;
11
+ З > Z ; з > z ;
12
+ И > İ ; и > i ; # dotted I
13
+ Й > Ý ; й > ý ; # /j/ glide
14
+ К > K ; к > k ;
15
+ Л > L ; л > l ;
16
+ М > M ; м > m ;
17
+ Н > N ; н > n ;
18
+ Ң > Ñ ; ң > ñ ;
19
+ О > O ; о > o ;
20
+ Ө > Ö ; ө > ö ;
21
+ П > P ; п > p ;
22
+ Р > R ; р > r ;
23
+ С > S ; с > s ;
24
+ Т > T ; т > t ;
25
+ У > U ; у > u ;
26
+ Ү > Ü ; ү > ü ;
27
+ Ф > F ; ф > f ;
28
+ Х > H ; х > h ;
29
+ Ц > Ts ; ц > ts ; # digraph
30
+ Ч > Ç ; ч > ç ;
31
+ Ш > Ş ; ш > ş ;
32
+ Щ > Şç ; щ > şç ; # digraph
33
+ Ы > Y ; ы > y ;
34
+ Э > É ; э > é ;
35
+ Ю > Yu ; ю > yu ; # digraph
36
+ Я > Ya ; я > ya ; # digraph
37
+ Ъ > ʼ ; ъ > ʼ ; # modifier apostrophe U+02BC
38
+ Ь > ʼ ; ь > ʼ ;
39
+
40
+ :: NFC ;
turkic_translit/sanity.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Helper functions for Levenshtein and byte checks."""
2
+ from rapidfuzz.distance import Levenshtein
3
+ import io
4
+ import os, re, unicodedata as ud
5
+
6
def median_lev(file_lat:str, file_ipa:str, sample:int=5000) -> float:
    """Return the median normalised Levenshtein distance between paired lines.

    Lines of *file_lat* and *file_ipa* are compared pairwise (after
    stripping surrounding whitespace), stopping once *sample* pairs have
    been compared or the shorter file ends.
    """
    from statistics import median

    distances = []
    with io.open(file_lat, encoding="utf8") as lat_file, io.open(file_ipa, encoding="utf8") as ipa_file:
        for index, (lat_line, ipa_line) in enumerate(zip(lat_file, ipa_file)):
            if index == sample:
                break
            distance = Levenshtein.normalized_distance(lat_line.strip(), ipa_line.strip())
            distances.append(distance)
    return median(distances)
14
+
15
def bytes_per_char(filename:str)->float:
    """Return the average number of bytes per character of a UTF-8 file.

    The byte count is the file size on disk; the character count is the
    total length of the lines as read in text mode. An empty file yields
    ``0.0`` instead of raising ZeroDivisionError.
    """
    import os, io
    size_bytes = os.path.getsize(filename)
    with io.open(filename, encoding="utf8") as f:
        chars = sum(len(line) for line in f)
    if chars == 0:
        # Nothing to average over (empty file).
        return 0.0
    return size_bytes / chars
21
+
22
def is_nfc(filename:str)->bool:
    """Return True when every line of the UTF-8 file is NFC-normalised.

    Reading stops at the first line that is not in NFC form. A file with no
    lines is reported as normalised.
    """
    import unicodedata, io
    with io.open(filename, encoding="utf8") as handle:
        for text_line in handle:
            if not unicodedata.is_normalized("NFC", text_line):
                return False
    return True