Spaces:

AustinWagner
/

turkic-transliteration-demo

Sleeping

App Files Files Community

wagner-austin committed on May 13, 2025

Commit

1cc387c

1 Parent(s): 36e257e

Copy core package into Space for editable install

Browse files

Files changed (14) hide show

.gitignore +5 -0
pyproject.toml +44 -0
requirements.txt +1 -3
turkic_translit/__init__.py +9 -0
turkic_translit/cli.py +109 -0
turkic_translit/core.py +38 -0
turkic_translit/logging_config.py +58 -0
turkic_translit/patches.py +77 -0
turkic_translit/rules/ar_lat.rules +15 -0
turkic_translit/rules/kk_ipa.rules +47 -0
turkic_translit/rules/kk_lat2023.rules +49 -0
turkic_translit/rules/ky_ipa.rules +43 -0
turkic_translit/rules/ky_lat2023.rules +40 -0
turkic_translit/sanity.py +25 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+__pycache__/
+*.egg-info/
+*.whl
+.venv/
+.pytest_cache/

pyproject.toml ADDED Viewed

	@@ -0,0 +1,44 @@

+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name            = "turkic_transliterate"
+version         = "0.1.0"
+description     = "Deterministic Latin and IPA transliteration for Kazakh, Kyrgyz, plus tokenizer/glue scripts."
+authors         = [ {name="Austin Wagner", email="awagner@uci.edu"} ]
+requires-python = ">=3.9"
+dependencies = [
+    # Core dependencies (alphabetized)
+    "epitran>=1.0,<1.27",           # 1.26.0 is the latest on PyPI
+    # Universal fasttext-wheel for all platforms and Python 3.10-3.13
+    "fasttext-wheel==0.9.2",
+    "numpy<2",
+    "packaging>=23.0",              # Used in tests/test_fasttext.py
+    "panphon>=0.20,<0.22",          # 0.21.2 is the newest published wheel
+    # Source build for non-Windows platforms
+    "PyICU>=2.15 ; sys_platform != 'win32'",
+    # Windows wheels for PyICU
+    "PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp310-cp310-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.10'",
+    "PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp311-cp311-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.11'",
+    "PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp312-cp312-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.12'",
+    "PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp313-cp313-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.13'",
+    "pytest>=8.0",                  # Test runner
+    "rapidfuzz>=3.5",
+    "rich>=13.7",                   # Color-aware logging and console output
+    "sentencepiece>=0.2.0"
+]
+[project.optional-dependencies]
+# winlid dependency kept for backward compatibility but empty since fasttext-wheel is now in main dependencies
+winlid = []
+# Development tools
+dev = ["black", "ruff"]
+# User interface dependencies
+ui = ["gradio"]
+[project.scripts]
+turkic-translit = "turkic_translit.cli:main"

requirements.txt CHANGED Viewed

@@ -1,4 +1,2 @@
-turkic_transliterate==0.1.0      # will pull PyICU, panphon, etc.
 gradio>=4.30
-sentencepiece>=0.2
-rapidfuzz>=3.5


1	+ -e .[ui]
2	gradio>=4.30

turkic_translit/__init__.py ADDED Viewed

	@@ -0,0 +1,9 @@

+from importlib.metadata import version
+# Set up logging first before any other operations
+from .logging_config import setup as _log_setup; _log_setup()
+# Import patches next to ensure they're applied before other imports
+# (imported only for its import-time side effects; the name is otherwise unused here)
+from . import patches
+# Public API re-exported from the core module
+from .core import to_latin, to_ipa
+__all__ = ["to_latin", "to_ipa"]
+# Version string looked up from the installed "turkic_transliterate" distribution metadata
+__version__ = version("turkic_transliterate")

turkic_translit/cli.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import sys, argparse, pathlib, time, os, logging
+from .core import to_latin, to_ipa
+from .logging_config import setup as _log_setup
+# Initialize logger
+log = logging.getLogger(__name__)
+def main() -> None:
+    """
+    Command-line entry point: transliterate a text file (or stdin) line by line.
+
+    Latin output is always produced (to --out_latin, default stdout); IPA output
+    is additionally written to --out_ipa when --ipa is given.  Exits with status
+    1 if the input cannot be decoded, and via argparse's error exit if --ipa is
+    given without --out_ipa.
+    """
+    from contextlib import ExitStack
+
+    ap = argparse.ArgumentParser(description="Turkic transliteration")
+    ap.add_argument("--lang", required=True, choices=["kk", "ky"])
+    ap.add_argument("--ipa", action="store_true", help="produce IPA")
+    ap.add_argument("--arabic", action="store_true", help="also transliterate Arabic script")
+    ap.add_argument("--in", dest="inp", default="-")
+    ap.add_argument("--out_latin", default="-")
+    ap.add_argument("--out_ipa")
+    ap.add_argument("--benchmark", action="store_true")
+    ap.add_argument("--log-level", choices=["debug", "info", "warning", "error", "critical"],
+                    default="info",
+                    help="Set logging level (default: info)")
+    args = ap.parse_args()
+    # Validate the option combination before any output file is created/truncated
+    if args.ipa and not args.out_ipa:
+        ap.error("--ipa requires --out_ipa")
+    # Always set log level from args at the start
+    os.environ["TURKIC_LOG_LEVEL"] = args.log_level.upper()
+    _log_setup()
+    # Also apply the requested level directly to the root logger, so it takes
+    # effect even if logging was configured before the variable was set
+    logging.getLogger().setLevel(getattr(logging, args.log_level.upper()))
+    outputs = ["latin"]
+    if args.ipa:
+        outputs.append("ipa")
+    # Use Rich markup for output modes (magenta)
+    outputs_markup = ", ".join(f"[magenta]{o}[/]" for o in outputs)
+    log.info(
+        f"Starting transliteration: lang={args.lang}, input={args.inp}, outputs={outputs_markup}, "
+        f"out_latin={args.out_latin}, out_ipa={args.out_ipa}, arabic={args.arabic}, benchmark={args.benchmark}"
+    )
+    # Use UTF-8-sig for Windows to include BOM for proper encoding support
+    encoding = "utf-8-sig" if sys.platform == "win32" else "utf-8"
+    start = time.time()
+    n = 0
+    try:
+        # ExitStack closes every file we opened (and the progress bar) on any
+        # exit path; sys.stdin / sys.stdout are never registered, so never closed
+        with ExitStack() as stack:
+            fin = sys.stdin if args.inp == "-" else stack.enter_context(
+                open(args.inp, encoding=encoding))
+            fo_l = sys.stdout if args.out_latin == "-" else stack.enter_context(
+                open(args.out_latin, "w", encoding=encoding))
+            fo_i = None
+            if args.ipa:
+                fo_i = stack.enter_context(open(args.out_ipa, "w", encoding=encoding))
+            # Progress bar only when stderr is a TTY and the input is a real file
+            # (stdin cannot be counted and rewound)
+            pbar = None
+            if sys.stderr.isatty() and args.inp != "-":
+                try:
+                    from tqdm import tqdm
+                except ImportError:
+                    log.debug("tqdm not available, falling back to basic processing")
+                else:
+                    # Count the number of lines in the input file for the progress bar
+                    total_lines = sum(1 for _ in fin)
+                    fin.seek(0)  # Reset file pointer
+                    pbar = tqdm(total=total_lines, unit="lines")
+                    stack.callback(pbar.close)
+                    log.debug("Using tqdm progress bar for %d lines", total_lines)
+            # Process lines
+            for line in fin:
+                text = line.rstrip("\n")
+                fo_l.write(to_latin(text, args.lang, args.arabic) + "\n")
+                if fo_i is not None:
+                    fo_i.write(to_ipa(text, args.lang) + "\n")
+                n += 1
+                if pbar is not None:
+                    pbar.update(1)
+    except UnicodeDecodeError as e:
+        # Decoding happens while reading lines, so the handler must wrap the loop too
+        sys.stderr.write(f"Encoding error: {e}\n")
+        sys.stderr.write("If you're on Windows, make sure your input file is properly encoded in UTF-8.\n")
+        sys.exit(1)
+    latin_dest = args.out_latin if args.out_latin != "-" else "stdout"
+    log.info(f"Finished writing {n} lines to {latin_dest}" + (f" and {args.out_ipa}" if args.ipa else ""))
+    elapsed = time.time() - start
+    rate = n / elapsed if elapsed > 0 else 0
+    # Always log processing statistics, but at different levels based on benchmark flag
+    stats_level = logging.INFO if args.benchmark else logging.DEBUG
+    log.log(stats_level, "Processed %d lines in %.2fs (%.0f lines/s)", n, elapsed, rate)
+    log.info("Transliteration complete.")
+# This is the entry point when the module is run directly
+if __name__ == "__main__":
+    main()

turkic_translit/core.py ADDED Viewed

	@@ -0,0 +1,38 @@

+"""Public API for Latin and IPA transliteration."""
+try:
+    import icu        # noqa: F401
+except ImportError as e:    # PyICU wheel is still missing
+    raise RuntimeError(
+        "PyICU missing. On Windows run:\n"
+        "  python scripts/get_pyicu_wheel.py\n"
+        "or manually install a wheel from "
+        "https://github.com/cgohlke/pyicu-build/releases ."
+    ) from e
+from functools import lru_cache
+import unicodedata as ud
+from pathlib import Path
+_RULE_DIR = Path(__file__).with_suffix("").parent / "rules"
+@lru_cache
+def _icu_trans(name: str) -> icu.Transliterator:
+    """Build (and cache per file name) an ICU transliterator from a rule file in the rules directory."""
+    rule_path = _RULE_DIR.joinpath(name)
+    with rule_path.open(encoding="utf8") as fh:
+        rules_text = fh.read()
+    # Third argument is the transliteration direction; 0 is presumably FORWARD — TODO confirm against PyICU
+    return icu.Transliterator.createFromRules(name, rules_text, 0)
+def to_latin(text: str, lang: str, include_arabic: bool = False) -> str:
+    """
+    Transliterate *text* to Latin script using the ``<lang>_lat2023.rules`` rule set.
+
+    When *include_arabic* is true the text is first passed through the
+    ``ar_lat.rules`` transliterator.  The result is NFC-normalised.
+
+    Raises:
+        ValueError: if *lang* is neither ``"kk"`` nor ``"ky"``.
+    """
+    if lang != "kk" and lang != "ky":
+        raise ValueError("lang must be 'kk' or 'ky'")
+    latin_rules = _icu_trans(f"{lang}_lat2023.rules")
+    if include_arabic:
+        # Arabic-script pass runs before the main rule set
+        text = _icu_trans("ar_lat.rules").transliterate(text)
+    return ud.normalize("NFC", latin_rules.transliterate(text))
+def to_ipa(text: str, lang: str) -> str:
+    """
+    Transliterate *text* to IPA using the ``<lang>_ipa.rules`` rule set; result is NFC-normalised.
+
+    Raises:
+        ValueError: if *lang* is neither ``"kk"`` nor ``"ky"`` (same contract as ``to_latin``).
+    """
+    # Validate before building a file name from caller-supplied text
+    if lang not in ("kk", "ky"):
+        raise ValueError("lang must be 'kk' or 'ky'")
+    trans = _icu_trans(f"{lang}_ipa.rules")
+    return ud.normalize("NFC", trans.transliterate(text))

turkic_translit/logging_config.py ADDED Viewed

	@@ -0,0 +1,58 @@

+"""
+Centralized logging configuration module.
+Uses Rich for colorized output if available with fallback to standard library.
+"""
+import logging
+import os
+import sys
+from functools import lru_cache
+# Get log level from environment or default to INFO
+LOG_LEVEL = os.environ.get("TURKIC_LOG_LEVEL", "INFO").upper()
+def setup():
+    """
+    Set up logging with Rich if available, with fallback to stdlib logging.
+
+    The level comes from the TURKIC_LOG_LEVEL environment variable, read at
+    call time (falling back to the module-level LOG_LEVEL, then INFO), so a
+    value set after this module was imported is still honoured.  The root
+    handler is installed only on the first call; later calls just re-apply
+    the level and return.
+
+    Returns:
+        The "turkic_translit" logger.
+    """
+    level_name = os.environ.get("TURKIC_LOG_LEVEL", LOG_LEVEL).upper()
+    log_level = getattr(logging, level_name, logging.INFO)
+    if not isinstance(log_level, int):
+        # The name matched a logging attribute that is not a numeric level
+        log_level = logging.INFO
+    root_logger = logging.getLogger()
+    root_logger.setLevel(log_level)
+    logger = logging.getLogger("turkic_translit")
+    # Handler installation happens once; the flag lives on the function object
+    if getattr(setup, "_configured", False):
+        return logger
+    # Clear any existing handlers
+    for handler in root_logger.handlers[:]:
+        root_logger.removeHandler(handler)
+    # Try to use Rich for pretty, colorized output
+    try:
+        from rich.logging import RichHandler
+        # Configure Rich handler with appropriate settings
+        handler = RichHandler(
+            rich_tracebacks=True,
+            markup=True,
+            show_time=False,
+            show_path=False,
+        )
+        formatter = logging.Formatter("%(message)s")
+    except ImportError:
+        # Fall back to standard logging if Rich is not available
+        handler = logging.StreamHandler(sys.stderr)
+        formatter = logging.Formatter(
+            "%(levelname)s: %(message)s"
+        )
+    # Configure and add the handler
+    handler.setFormatter(formatter)
+    root_logger.addHandler(handler)
+    setup._configured = True
+    logger.debug("Logging initialized at level %s", level_name)
+    return logger

turkic_translit/patches.py ADDED Viewed

	@@ -0,0 +1,77 @@

+"""
+Patches for third-party libraries to fix encoding issues on Windows.
+This module is imported automatically at startup.
+"""
+import os
+import sys
+import functools
+import logging
+from .logging_config import setup; setup()
+log = logging.getLogger(__name__)
+_PATCH_DONE = False
+_PATCHED_FILES = set()
+def _fix_broken_ssl_cert_env():
+    """
+    Drop SSL_CERT_FILE from the environment when it names a missing file.
+
+    A stale SSL_CERT_FILE (often left behind by Conda on Windows) reportedly
+    makes httpx, and therefore gradio, crash on import.  Removing the variable
+    lets Python fall back to the system certificates.  Nothing happens when
+    the variable is unset, empty, or points at an existing path.
+    """
+    import os
+    import pathlib
+    import logging
+    logger = logging.getLogger(__name__)
+    cert_path = os.environ.get("SSL_CERT_FILE")
+    if not cert_path:
+        return
+    if pathlib.Path(cert_path).exists():
+        return
+    logger.warning(
+        "SSL_CERT_FILE=%s does not exist – removing the variable so "
+        "httpx can create a default context", cert_path)
+    os.environ.pop("SSL_CERT_FILE")
+def apply_patches():
+    """
+    Apply all necessary patches for third-party libraries.
+
+    The SSL_CERT_FILE check runs on every call.  The remaining work runs at
+    most once per process and only on Windows with panphon importable: the
+    built-in ``open`` is replaced by a wrapper that defaults ``encoding`` to
+    UTF-8 for ``.csv`` paths opened in mode ``'r'``.  Note that the wrapper
+    is installed globally, so it affects every such ``open`` call, not only
+    those made by panphon.
+    """
+    global _PATCH_DONE
+    _fix_broken_ssl_cert_env()  # ensure SSL_CERT_FILE is valid before any third-party import
+    # Skip if patches have already been applied
+    if _PATCH_DONE:
+        log.debug("Patches already applied, skipping")
+        return
+    _PATCH_DONE = True
+    # Fix panphon encoding issues on Windows
+    if sys.platform != 'win32':
+        return
+    try:
+        import panphon.featuretable  # noqa: F401  (imported only to detect whether panphon is installed)
+    except ImportError:
+        log.warning("Could not patch panphon (not installed)")
+        return
+    import builtins
+    # Save the original open function
+    original_open = builtins.open
+    # Monkey patch the built-in open function when used by panphon
+    def patched_open_for_panphon(file, mode='r', *args, **kwargs):
+        # open(file, mode, buffering, encoding, ...): the encoding may arrive
+        # positionally as args[1]; injecting the keyword then would raise
+        # "got multiple values for argument 'encoding'"
+        encoding_given = 'encoding' in kwargs or len(args) >= 2
+        # Add explicit UTF-8 encoding for CSV files opened by panphon
+        if (not encoding_given and 'panphon' in sys.modules and mode == 'r'
+                and isinstance(file, str) and file.endswith('.csv')):
+            kwargs['encoding'] = 'utf-8'
+            # Only log the first time per unique file
+            if file not in _PATCHED_FILES:
+                log.debug(f"Applied UTF-8 encoding patch for {file}")
+                _PATCHED_FILES.add(file)
+        return original_open(file, mode, *args, **kwargs)
+    # Set the environment variable for good measure (Python reads PYTHONUTF8 at
+    # interpreter startup, so this can only influence child processes)
+    os.environ['PYTHONUTF8'] = '1'
+    # Apply the patch
+    builtins.open = patched_open_for_panphon
+    log.info("Applied panphon UTF-8 patch for Windows")
+# Apply patches when module is imported
+apply_patches()
+log.debug("Patches module initialized")

turkic_translit/rules/ar_lat.rules ADDED Viewed

	@@ -0,0 +1,15 @@

+# consonants
+ب > b ;  پ > p ;  ت > t ;  ج > j ;  چ > ch ;
+ح > h ;  خ > x ;  د > d ;  ر > r ;  ز > z ;  س > s ;
+ش > sh ; ص > s ;  ط > t ;  غ > gh ;
+ف > f ;  ق > q ;  ك > k ;  گ > g ;  ل > l ;  م > m ;  ن > n ;
+ه > h ;  ھ > h ;  ژ > zh ;  ڭ > ng ;  ۋ > w ;
+# vowels  (hamza carrier ئ can be dropped or mapped to an apostrophe)
+ا > a ;   ە > e ;   ۆ > ö ;   و > o ;
+ۇ > u ;   ۈ > ü ;   ى > i ;   ې > ë ;
+# glottals
+# The apostrophe must be escaped: an unescaped single quote opens a quoted
+# literal in ICU rule syntax and would swallow the text up to the next quote.
+ء > \' ;   ع > \' ;   ئ > ;
+:: NFC ;

turkic_translit/rules/kk_ipa.rules ADDED Viewed

	@@ -0,0 +1,47 @@

+# Kazakh → IPA transliteration rules (kk_ipa.rules)
+# One line per Cyrillic letter.  Right-hand side is plain IPA (no slashes).  NFC-normalised.
+А > ɑ ;  а > ɑ ;
+Ә > æ ;  ә > æ ;
+Б > b ;  б > b ;
+В > v ;  в > v ;
+Г > ɡ ;  г > ɡ ;
+Ғ > ʁ ;  ғ > ʁ ;
+Д > d ;  д > d ;
+Е > e ;  е > e ;
+Ё > jo ; ё > jo ;
+Ж > ʒ ;  ж > ʒ ;
+З > z ;  з > z ;
+И > i ;  и > i ;
+Й > j ;  й > j ;
+К > k ;  к > k ;
+Қ > q ;  қ > q ;
+Л > l ;  л > l ;
+М > m ;  м > m ;
+Н > n ;  н > n ;
+Ң > ŋ ;  ң > ŋ ;
+О > o ;  о > o ;
+Ө > ø ;  ө > ø ;
+П > p ;  п > p ;
+Р > r ;  р > r ;
+С > s ;  с > s ;
+Т > t ;  т > t ;
+У > u ;  у > u ;
+Ұ > ʊ ;  ұ > ʊ ;
+Ү > y ;  ү > y ;
+Ф > f ;  ф > f ;
+Х > x ;  х > x ;
+Һ > h ;  һ > h ;
+Ц > ts ; ц > ts ;
+Ч > t͡ʃ ;  ч > t͡ʃ ;
+Ш > ʃ ;  ш > ʃ ;
+Щ > ɕː ; щ > ɕː ;
+Ъ > ʔ ;  ъ > ʔ ;
+Ы > ɯ ;  ы > ɯ ;
+І > ɪ ;  і > ɪ ;
+Ь > ;   ь > ;
+Э > e ;  э > e ;
+Ю > ju ; ю > ju ;
+Я > ja ; я > ja ;
+:: NFC ;

turkic_translit/rules/kk_lat2023.rules ADDED Viewed

	@@ -0,0 +1,49 @@

+# Official Kazakh Latin alphabet (April 2021)
+# https://en.wikipedia.org/wiki/Kazakh_alphabets
+А > A ;  а > a ;
+Ә > Ä ;  ә > ä ;
+Б > B ;  б > b ;
+В > V ;  в > v ;
+Г > G ;  г > g ;
+Ғ > Ğ ;  ғ > ğ ;
+Д > D ;  д > d ;
+Е > E ;  е > e ;
+Ж > J ;  ж > j ;
+З > Z ;  з > z ;
+И > İ ;  и > i ;   # dotted İ/i
+Й > İ ;  й > i ;   # official merging per standard (ambiguity known)
+І > I ;  і > ı ;   # corrected: dotless lowercase ı
+К > K ;  к > k ;
+Қ > Q ;  қ > q ;
+Л > L ;  л > l ;
+М > M ;  м > m ;
+Н > N ;  н > n ;
+Ң > Ñ ;  ң > ñ ;
+О > O ;  о > o ;
+Ө > Ö ;  ө > ö ;
+П > P ;  п > p ;
+Р > R ;  р > r ;
+С > S ;  с > s ;
+Т > T ;  т > t ;
+У > U ;  у > u ;
+Ұ > Ū ;  ұ > ū ;
+Ү > Ü ;  ү > ü ;
+Ф > F ;  ф > f ;
+Х > H ;  х > h ;
+Һ > H ;  һ > h ;
+# Native Kazakh Ш and Ы, followed by Russian loan letters (clearly marked, NOT official Kazakh letters)
+Ё > Yo ;  ё > yo ;
+Э > Ė  ;  э > ė ;
+Ц > Ts ;  ц > ts ;
+Ч > Ch ;  ч > ch ;
+Ш > Ş  ;  ш > ş ;
+Щ > Şç ; щ > şç ;
+Ы > Y  ;  ы > y ;
+Ю > Yu ; ю > yu ;
+Я > Ya ; я > ya ;
+Ъ > ;   ъ > ;     # dropped entirely
+Ь > ;   ь > ;     # dropped entirely
+:: NFC ;

turkic_translit/rules/ky_ipa.rules ADDED Viewed

	@@ -0,0 +1,43 @@

+# Kyrgyz → IPA transliteration rules (ky_ipa.rules)
+# One line per Cyrillic letter; NFC‐normalised; IPA given without slashes.
+А > a ;  а > a ;
+Б > b ;  б > b ;
+В > v ;  в > v ;
+Г > ɡ ;  г > ɡ ;
+Ғ > ʁ ;  ғ > ʁ ;
+Д > d ;  д > d ;
+Е > e ;  е > e ;
+Ё > jo ; ё > jo ;
+Ж > d͡ʒ ;  ж > d͡ʒ ;
+З > z ;  з > z ;
+И > i ;  и > i ;
+Й > j ;  й > j ;
+К > k ;  к > k ;
+Қ > q ;  қ > q ;
+Л > l ;  л > l ;
+М > m ;  м > m ;
+Н > n ;  н > n ;
+Ң > ŋ ;  ң > ŋ ;
+О > o ;  о > o ;
+Ө > ø ;  ө > ø ;
+П > p ;  п > p ;
+Р > r ;  р > r ;
+С > s ;  с > s ;
+Т > t ;  т > t ;
+У > u ;  у > u ;
+Ү > y ;  ү > y ;
+Ф > f ;  ф > f ;
+Х > x ;  х > x ;
+Ц > ts ; ц > ts ;
+Ч > t͡ʃ ;  ч > t͡ʃ ;
+Ш > ʃ ;  ш > ʃ ;
+Щ > ɕː ; щ > ɕː ;
+Ы > ɯ ;  ы > ɯ ;
+Э > ɛ ;  э > ɛ ;
+Ю > ju ; ю > ju ;
+Я > ja ; я > ja ;
+Ъ > ʔ ;  ъ > ʔ ;
+Ь > ;   ь > ;
+:: NFC ;

turkic_translit/rules/ky_lat2023.rules ADDED Viewed

	@@ -0,0 +1,40 @@

+# Kyrgyz → Modern Practical Latin Transliteration (NFC)
+# One line per pair; use pre‑composed Unicode code‑points; digraphs are atomic tokens.
+А > A ;  а > a ;
+Б > B ;  б > b ;
+В > V ;  в > v ;
+Г > G ;  г > g ;
+Д > D ;  д > d ;
+Е > E ;  е > e ;
+Ё > Yo ; ё > yo ;
+Ж > J ;  ж > j ;
+З > Z ;  з > z ;
+И > İ ;  и > i ;   # dotted I
+Й > Ý ;  й > ý ;   # /j/ glide
+К > K ;  к > k ;
+Л > L ;  л > l ;
+М > M ;  м > m ;
+Н > N ;  н > n ;
+Ң > Ñ ;  ң > ñ ;
+О > O ;  о > o ;
+Ө > Ö ;  ө > ö ;
+П > P ;  п > p ;
+Р > R ;  р > r ;
+С > S ;  с > s ;
+Т > T ;  т > t ;
+У > U ;  у > u ;
+Ү > Ü ;  ү > ü ;
+Ф > F ;  ф > f ;
+Х > H ;  х > h ;
+Ц > Ts ; ц > ts ;  # digraph
+Ч > Ç ;  ч > ç ;
+Ш > Ş ;  ш > ş ;
+Щ > Şç ; щ > şç ;  # digraph
+Ы > Y ;  ы > y ;
+Э > É ;  э > é ;
+Ю > Yu ; ю > yu ;   # digraph
+Я > Ya ; я > ya ;   # digraph
+Ъ > ʼ ;  ъ > ʼ ;    # modifier apostrophe U+02BC
+Ь > ʼ ;  ь > ʼ ;
+:: NFC ;

turkic_translit/sanity.py ADDED Viewed

	@@ -0,0 +1,25 @@

+"""Helper functions for Levenshtein and byte checks."""
+from rapidfuzz.distance import Levenshtein
+import io
+import os, re, unicodedata as ud
+def median_lev(file_lat:str, file_ipa:str, sample:int=5000) -> float:
+    """
+    Return the median normalised Levenshtein distance between paired lines of two files.
+
+    Lines are paired positionally (stopping at the shorter file), stripped of
+    surrounding whitespace, and at most *sample* pairs are compared.
+    ``statistics.median`` raises ``StatisticsError`` if no pair was compared.
+    """
+    from statistics import median
+    distances = []
+    with io.open(file_lat, encoding="utf8") as lat_fh, io.open(file_ipa, encoding="utf8") as ipa_fh:
+        for idx, (lat_line, ipa_line) in enumerate(zip(lat_fh, ipa_fh)):
+            if idx == sample:
+                break
+            lat_text = lat_line.strip()
+            ipa_text = ipa_line.strip()
+            distances.append(Levenshtein.normalized_distance(lat_text, ipa_text))
+    return median(distances)
+def bytes_per_char(filename:str)->float:
+    """
+    Return the file's size in bytes divided by its number of decoded UTF-8 characters.
+
+    Characters are counted line by line in text mode, so line terminators are
+    included as they appear after newline translation.  Returns 0.0 when the
+    file contains no characters, instead of raising ZeroDivisionError.
+    """
+    import os, io
+    size_bytes = os.path.getsize(filename)
+    with io.open(filename, encoding="utf8") as f:
+        chars = sum(len(line) for line in f)
+    if chars == 0:
+        return 0.0
+    return size_bytes / chars
+def is_nfc(filename:str)->bool:
+    """Return True when every line of the UTF-8 file *filename* is NFC-normalised (True for an empty file)."""
+    import unicodedata, io
+    with io.open(filename, encoding="utf8") as fh:
+        for line in fh:
+            # Stop at the first line that is not in NFC form
+            if not unicodedata.is_normalized("NFC", line):
+                return False
+    return True