Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import sys | |
| from typing import List | |
| from .pipeline import classify_text | |
| from .scoring import DIALECTS | |
def _decode_stdin_bytes(data: bytes) -> str:
    """Decode piped stdin bytes robustly on Windows/PowerShell.

    PowerShell (especially Windows PowerShell 5.x) may pipe text to native
    executables as UTF-16LE, which can appear in Python as NUL-padded bytes or
    mojibake if decoded with a legacy code page.

    Strategy:
      * Empty input yields ``""``.
      * If more than 10% of the bytes are NUL, UTF-16 is tried before UTF-8;
        otherwise UTF-8 (with an optional BOM) is tried first and UTF-16 is
        the fallback.
      * The UTF-16 byte order is taken from the BOM when one is present.
        Without a BOM it is guessed from where the NUL bytes sit: NULs mostly
        in the first byte of each pair mean big-endian, otherwise
        little-endian is assumed.
      * If no candidate decodes cleanly, the bytes are decoded as UTF-8 with
        undecodable bytes replaced.

    Args:
        data: Raw bytes read from stdin.

    Returns:
        The decoded text.
    """
    if not data:
        return ""

    nul = b"\x00"

    # Work out which UTF-16 flavours to try, most likely first.
    if data[:2] in (b"\xff\xfe", b"\xfe\xff"):
        # A BOM is authoritative; the generic "utf-16" codec consumes it.
        utf16_order = ("utf-16", "utf-16-le", "utf-16-be")
    elif data[0::2].count(nul) > data[1::2].count(nul):
        # NULs in the leading byte of each pair (00 xx): big-endian.
        # Decoding this as little-endian would "succeed" but yield mojibake.
        utf16_order = ("utf-16-be", "utf-16-le")
    else:
        # NULs in the trailing byte (xx 00), or no evidence: little-endian,
        # which is what PowerShell emits.
        utf16_order = ("utf-16-le", "utf-16-be")

    # Heuristic: lots of NUL bytes strongly suggests UTF-16.
    nul_ratio = data.count(nul) / len(data)
    if nul_ratio > 0.10:
        candidates = utf16_order + ("utf-8-sig",)
    else:
        # "utf-8-sig" also decodes BOM-less UTF-8, so plain "utf-8" is not
        # needed as a separate attempt.
        candidates = ("utf-8-sig",) + utf16_order

    for enc in candidates:
        try:
            return data.decode(enc)
        except UnicodeDecodeError:
            continue

    # Fallback: replace undecodable bytes.
    return data.decode("utf-8", errors="replace")
def read_multiline_stdin() -> str:
    """Read multi-line input.

    - If text is piped in (stdin is not a tty), read all of stdin. The raw
      bytes are decoded with ``_decode_stdin_bytes`` when the stream exposes a
      binary ``buffer``; a text-only stream (for example an ``io.StringIO``
      substituted for stdin) is read directly as text.
    - If interactive, print a prompt and read lines until an empty
      (whitespace-only) line or EOF.

    Returns:
        The piped text, or the interactively entered lines joined with
        newlines (the terminating empty line is not included).
    """
    stream = sys.stdin
    if not stream.isatty():
        raw = getattr(stream, "buffer", None)
        if raw is None:
            # Replacement stdin objects may have no binary layer; their
            # content is already text, so there is nothing to decode.
            return stream.read()
        return _decode_stdin_bytes(raw.read())

    print("Enter Greek text (finish with an empty line, or Ctrl-Z then Enter on Windows):")
    lines: List[str] = []
    while True:
        try:
            line = input()
        except EOFError:
            # Ctrl-Z/Ctrl-D or exhausted input ends entry just like a blank line.
            break
        if not line.strip():
            break
        lines.append(line)
    return "\n".join(lines)
def run_cli() -> int:
    """Run the command-line classifier and print its report.

    Reads text with ``read_multiline_stdin()``, passes it to
    ``classify_text()``, and prints the dialect, the confidence as a
    percentage, one score line per entry in ``DIALECTS``, and the explanation.

    Returns:
        ``0`` after a report has been printed, or ``2`` when the input is
        empty or whitespace-only.
    """
    # Best-effort Windows console UTF-8 handling.
    # This does not affect piped-input decoding (handled separately).
    # Each stream is handled on its own so that a failure on stdin does not
    # prevent stdout from being switched to UTF-8.
    for stream in (sys.stdin, sys.stdout):
        try:
            if stream.isatty():
                stream.reconfigure(encoding="utf-8", errors="replace")  # type: ignore[attr-defined]
        except Exception:
            # Deliberately ignored: the stream may be a replacement object
            # without reconfigure(); the CLI still works with the defaults.
            continue

    text = read_multiline_stdin()
    if not text.strip():
        print("No input provided.")
        return 2

    # If the console encoding is wrong, Greek often turns into '?'.
    if text.count("?") >= 10 and sys.stdin.isatty():
        print(
            "Warning: many '?' characters detected; your terminal may not be using UTF-8. "
            "In PowerShell, try: chcp 65001"
        )

    result = classify_text(text)
    print(f"Dialect: {result['dialect']}")
    print(f"Confidence: {result['confidence'] * 100:.1f}%")
    print("Scores (%):")
    for d in DIALECTS:
        # Dialects missing from the score mapping are reported as 0.0.
        print(f" {d}: {float(result['scores'].get(d, 0.0)):.1f}")
    print("")
    print(result["explanation"])
    return 0