"""Command-line front end: read Greek text from stdin and print its classification."""

from __future__ import annotations

import sys
from typing import List

from .pipeline import classify_text
from .scoring import DIALECTS

# Share of NUL bytes above which piped input is tried as UTF-16 before UTF-8.
_NUL_RATIO_THRESHOLD = 0.10
# "utf-16" honours a BOM if present; the explicit-endian codecs are fallbacks.
_UTF16_ENCODINGS = ("utf-16", "utf-16-le", "utf-16-be")


def _decode_stdin_bytes(data: bytes) -> str:
    """Decode piped stdin bytes robustly on Windows/PowerShell.

    PowerShell (especially Windows PowerShell 5.x) may pipe text to native
    executables as UTF-16LE, which can appear in Python as NUL-padded bytes or
    mojibake if decoded with a legacy code page.

    Args:
        data: Raw bytes read from stdin.

    Returns:
        The decoded text. Empty input yields ``""``. If no candidate encoding
        decodes cleanly, the bytes are decoded as UTF-8 with undecodable
        sequences replaced, so this function never raises on bad input.
    """
    if not data:
        return ""

    # Heuristic: lots of NUL bytes strongly suggests UTF-16, so try it first.
    # Otherwise try UTF-8 first (common in PowerShell 7+), then UTF-16 just in
    # case. "utf-8-sig" also decodes BOM-less UTF-8, so a separate "utf-8"
    # attempt would be redundant.
    nul_ratio = data.count(b"\x00") / len(data)
    if nul_ratio > _NUL_RATIO_THRESHOLD:
        candidates = (*_UTF16_ENCODINGS, "utf-8-sig")
    else:
        candidates = ("utf-8-sig", *_UTF16_ENCODINGS)

    for enc in candidates:
        try:
            return data.decode(enc)
        except UnicodeDecodeError:
            continue

    # Fallback: replace undecodable bytes.
    return data.decode("utf-8", errors="replace")


def read_multiline_stdin() -> str:
    """Read multi-line input.

    - If text is piped in, read all of stdin and decode it with
      ``_decode_stdin_bytes``.
    - If interactive, prompt and read lines until an empty (whitespace-only)
      line or EOF; the lines are joined with ``"\\n"``.

    Returns:
        The text that was read. Returns ``""`` when no stdin is attached.
    """
    stream = sys.stdin
    if stream is None:
        # No stdin attached to the process: there is nothing to read.
        return ""

    if not stream.isatty():
        raw = getattr(stream, "buffer", None)
        if raw is None:
            # stdin was replaced by a text-only object (e.g. io.StringIO) that
            # has no underlying byte stream; take its text as-is.
            return stream.read()
        return _decode_stdin_bytes(raw.read())

    print(
        "Enter Greek text (finish with an empty line, "
        "or Ctrl-Z then Enter on Windows):"
    )
    lines: List[str] = []
    while True:
        try:
            line = input()
        except EOFError:
            break
        if not line.strip():
            break
        lines.append(line)
    return "\n".join(lines)


def _reconfigure_tty_streams() -> None:
    """Best-effort switch of interactive stdin/stdout to UTF-8 with replacement."""
    for stream in (sys.stdin, sys.stdout):
        # Each stream is handled on its own so that a failure on stdin does not
        # prevent stdout from being reconfigured.
        try:
            if stream.isatty():
                stream.reconfigure(encoding="utf-8", errors="replace")  # type: ignore[attr-defined]
        except Exception:
            # Deliberately best-effort: a missing, replaced, or closed stream
            # must not stop the CLI from running.
            continue


def run_cli() -> int:
    """Read text from stdin, classify it, and print the result.

    Prints the ``dialect``, ``confidence``, per-dialect ``scores`` (one line
    per entry of ``DIALECTS``, defaulting to 0.0 when a dialect is missing from
    the scores mapping) and ``explanation`` fields of the ``classify_text``
    result. The confidence value is multiplied by 100 for display.

    Returns:
        ``0`` after printing a classification, ``2`` when the input is empty
        or whitespace-only.
    """
    # Best-effort Windows console UTF-8 handling.
    # This does not affect piped-input decoding (handled separately).
    _reconfigure_tty_streams()

    text = read_multiline_stdin()
    if not text.strip():
        print("No input provided.")
        return 2

    # If the console encoding is wrong, Greek often turns into '?'.
    if text.count("?") >= 10 and sys.stdin.isatty():
        print(
            "Warning: many '?' characters detected; your terminal may not be using UTF-8. "
            "In PowerShell, try: chcp 65001"
        )

    result = classify_text(text)
    print(f"Dialect: {result['dialect']}")
    print(f"Confidence: {result['confidence'] * 100:.1f}%")
    print("Scores (%):")
    for d in DIALECTS:
        print(f" {d}: {float(result['scores'].get(d, 0.0)):.1f}")
    print("")
    print(result["explanation"])
    return 0