Spaces:
Sleeping
Sleeping
File size: 3,047 Bytes
d0326ea | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 | from __future__ import annotations
import sys
from typing import List
from .pipeline import classify_text
from .scoring import DIALECTS
def _decode_stdin_bytes(data: bytes) -> str:
"""Decode piped stdin bytes robustly on Windows/PowerShell.
PowerShell (especially Windows PowerShell 5.x) may pipe text to native
executables as UTF-16LE, which can appear in Python as NUL-padded bytes or
mojibake if decoded with a legacy code page.
"""
if not data:
return ""
# Heuristic: lots of NUL bytes strongly suggests UTF-16.
nul_ratio = data.count(b"\x00") / max(1, len(data))
if nul_ratio > 0.10:
for enc in ("utf-16", "utf-16-le", "utf-16-be"):
try:
return data.decode(enc)
except UnicodeDecodeError:
continue
# Otherwise, try UTF-8 first (common in PowerShell 7+), then UTF-16 just in case.
for enc in ("utf-8-sig", "utf-8", "utf-16", "utf-16-le", "utf-16-be"):
try:
return data.decode(enc)
except UnicodeDecodeError:
continue
# Fallback: replace undecodable bytes.
return data.decode("utf-8", errors="replace")
def read_multiline_stdin() -> str:
    """Collect the text to classify from standard input.

    - Piped/redirected stdin: the whole byte stream is read and decoded.
    - Interactive terminal: lines are gathered until a blank line or EOF and
      joined with newlines.
    """
    stdin = sys.stdin
    if not stdin.isatty():
        # Read raw bytes so the decoder can cope with PowerShell's encodings.
        return _decode_stdin_bytes(stdin.buffer.read())

    print("Enter Greek text (finish with an empty line, or Ctrl-Z then Enter on Windows):")
    collected: List[str] = []
    try:
        while True:
            entry = input()
            if not entry.strip():
                # A blank (or whitespace-only) line ends the input.
                break
            collected.append(entry)
    except EOFError:
        # EOF (Ctrl-Z / Ctrl-D) also ends the input; keep what was typed.
        pass
    return "\n".join(collected)
def run_cli() -> int:
    """Run the command-line classifier and print its report.

    Reads text from stdin, classifies it, and prints the dialect, the
    confidence, the per-dialect scores and the explanation.

    Returns:
        0 on success, 2 when no (non-whitespace) input was provided.
    """
    # Best-effort Windows console UTF-8 handling.
    # This does not affect piped-input decoding (handled separately).
    try:
        for stream in (sys.stdin, sys.stdout):
            if stream.isatty():
                stream.reconfigure(encoding="utf-8", errors="replace")  # type: ignore[attr-defined]
    except Exception:
        # Deliberately ignored: a console that cannot be reconfigured should
        # not stop the tool from running.
        pass

    text = read_multiline_stdin()
    if text.strip() == "":
        print("No input provided.")
        return 2

    # If the console encoding is wrong, Greek often turns into '?'.
    too_many_question_marks = text.count("?") >= 10
    if too_many_question_marks and sys.stdin.isatty():
        warning = (
            "Warning: many '?' characters detected; your terminal may not be using UTF-8. "
            "In PowerShell, try: chcp 65001"
        )
        print(warning)

    result = classify_text(text)

    print(f"Dialect: {result['dialect']}")
    confidence_pct = result["confidence"] * 100
    print(f"Confidence: {confidence_pct:.1f}%")

    print("Scores (%):")
    for name in DIALECTS:
        # Dialects missing from the scores mapping are reported as 0.0.
        value = float(result["scores"].get(name, 0.0))
        print(f" {name}: {value:.1f}")

    print("")
    print(result["explanation"])
    return 0
|