File size: 3,047 Bytes
d0326ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from __future__ import annotations

import sys
from typing import List

from .pipeline import classify_text
from .scoring import DIALECTS


def _decode_stdin_bytes(data: bytes) -> str:
    """Decode raw bytes read from piped stdin into text.

    PowerShell (especially Windows PowerShell 5.x) may pipe text to native
    executables as UTF-16LE, which can appear in Python as NUL-padded bytes or
    mojibake if decoded with a legacy code page. The pipe declares no
    encoding, so it is guessed in this order:

    1. A UTF-16 byte-order mark, when present, selects UTF-16 and its byte
       order.
    2. If more than 10% of the bytes are NUL, the data is treated as BOM-less
       UTF-16. The byte order is chosen from where the NULs sit: mostly at
       even offsets means big-endian, otherwise little-endian.
    3. UTF-8 (with or without a BOM), then UTF-16 as a last strict attempt.
    4. UTF-8 with undecodable bytes replaced.

    Args:
        data: The bytes read from stdin; may be empty.

    Returns:
        The decoded text, or "" for empty input. Undecodable input does not
        raise; it falls through to the replacing UTF-8 decode.
    """
    if not data:
        return ""

    # Ordered list of strict decoders to try; the first that succeeds wins.
    attempts = []

    # A UTF-16 BOM is authoritative: the "utf-16" codec consumes it and uses
    # the byte order it announces.
    if data[:2] in (b"\xff\xfe", b"\xfe\xff"):
        attempts.append("utf-16")

    # Heuristic: lots of NUL bytes strongly suggests UTF-16.
    nul_count = data.count(b"\x00")
    if nul_count / len(data) > 0.10:
        # ASCII-range characters carry a NUL high byte, which lands on even
        # offsets in big-endian data and odd offsets in little-endian data.
        # The BOM-less "utf-16" codec would just assume the platform's native
        # order and turn the other order into mojibake without any error.
        nuls_at_even = data[0::2].count(b"\x00")
        nuls_at_odd = nul_count - nuls_at_even
        if nuls_at_even > nuls_at_odd:
            attempts += ["utf-16-be", "utf-16-le"]
        else:
            # Ties favour little-endian, the PowerShell case.
            attempts += ["utf-16-le", "utf-16-be"]

    # Otherwise, try UTF-8 first (common in PowerShell 7+), then UTF-16 just
    # in case. "utf-8-sig" also accepts BOM-less UTF-8, so a separate "utf-8"
    # attempt would be redundant.
    attempts += ["utf-8-sig", "utf-16", "utf-16-le", "utf-16-be"]

    for enc in attempts:
        try:
            return data.decode(enc)
        except UnicodeDecodeError:
            continue

    # Fallback: replace undecodable bytes.
    return data.decode("utf-8", errors="replace")


def read_multiline_stdin() -> str:
    """Read multi-line input from standard input.

    - If stdin is unavailable (``sys.stdin`` is ``None``), return "".
    - If text is piped in, read all of stdin as bytes and decode it with
      ``_decode_stdin_bytes``. If stdin has been replaced by a text-only
      stream that has no ``buffer`` attribute (e.g. ``io.StringIO``), read
      its already-decoded text instead.
    - If interactive, print a prompt and read lines until an empty or
      whitespace-only line, or EOF; the lines are joined with newlines.

    Returns:
        The text that was read; may be empty.
    """
    stream = sys.stdin
    if stream is None:
        # No stdin is attached at all, so there is nothing to read.
        return ""

    if not stream.isatty():
        # Prefer the raw bytes so the encoding can be guessed here rather
        # than trusting the text layer's default.
        raw = getattr(stream, "buffer", None)
        if raw is None:
            # Replacement streams need not provide ``buffer``; fall back to
            # the text they already hold.
            return stream.read()
        return _decode_stdin_bytes(raw.read())

    print("Enter Greek text (finish with an empty line, or Ctrl-Z then Enter on Windows):")
    lines: List[str] = []
    while True:
        try:
            line = input()
        except EOFError:
            break
        if line.strip() == "":
            break
        lines.append(line)
    return "\n".join(lines)


def run_cli() -> int:
    """Read text from stdin, classify it, and print the result to stdout.

    The text comes from ``read_multiline_stdin`` and is passed to
    ``classify_text``; the dialect, confidence, one score per entry of
    ``DIALECTS``, and the explanation are then printed.

    Returns:
        0 after printing a classification, or 2 when the input is empty or
        whitespace-only.
    """
    # Best-effort Windows console UTF-8 handling: only streams attached to a
    # terminal are reconfigured (piped input is decoded separately). Any
    # failure abandons the remaining reconfiguration and is ignored on purpose.
    try:
        for stream in (sys.stdin, sys.stdout):
            if stream.isatty():
                stream.reconfigure(encoding="utf-8", errors="replace")  # type: ignore[attr-defined]
    except Exception:
        pass

    text = read_multiline_stdin()
    if text.strip() == "":
        print("No input provided.")
        return 2

    # A console with the wrong encoding tends to turn Greek into '?', so a
    # large number of them on interactive input earns a hint.
    if text.count("?") >= 10:
        if sys.stdin.isatty():
            print(
                "Warning: many '?' characters detected; your terminal may not be using UTF-8. "
                "In PowerShell, try: chcp 65001"
            )

    result = classify_text(text)

    print(f"Dialect: {result['dialect']}")
    confidence_pct = result['confidence'] * 100
    print(f"Confidence: {confidence_pct:.1f}%")

    print("Scores (%):")
    for name in DIALECTS:
        # Dialects missing from the result's scores are shown as 0.0.
        score = float(result['scores'].get(name, 0.0))
        print(f"  {name}: {score:.1f}")

    print()
    print(result["explanation"])
    return 0