"""Small shared utilities."""

from __future__ import annotations

import hashlib
import json
from dataclasses import asdict, is_dataclass
from pathlib import Path
from typing import Any, Iterable

# Maps a lower-cased file suffix (without the dot) to its canonical file type.
# Several suffixes deliberately share one type (e.g. "csv" is reported as
# "xlsx"); suffixes not listed here are returned unchanged.
_SUFFIX_TO_FILE_TYPE = {
    "pdf": "pdf",
    "docx": "docx",
    "doc": "docx",
    "pptx": "pptx",
    "ppt": "pptx",
    "xlsx": "xlsx",
    "xls": "xlsx",
    "csv": "xlsx",
    "html": "html",
    "htm": "html",
    "png": "image",
    "jpg": "image",
    "jpeg": "image",
    "tiff": "image",
    "tif": "image",
    "bmp": "image",
    "webp": "image",
    "epub": "epub",
    "md": "markdown",
    "markdown": "markdown",
    "txt": "text",
    "text": "text",
}


def document_id_for_path(path: str | Path) -> str:
    """Return a 16-character hexadecimal identifier for the file at *path*.

    The identifier is the first 16 hex digits of the SHA-1 digest of the
    resolved path, the file size, and the modification time truncated to
    whole seconds, so it changes when any of those three change.

    Raises:
        OSError: if the file cannot be stat'ed (for example, it is missing).
    """
    path_obj = Path(path)
    stat = path_obj.stat()
    seed = f"{path_obj.resolve()}:{stat.st_size}:{int(stat.st_mtime)}"
    return hashlib.sha1(seed.encode("utf-8")).hexdigest()[:16]


def file_type_from_path(path: str | Path) -> str:
    """Return the canonical file type implied by the suffix of *path*.

    The suffix is compared case-insensitively. Known suffixes are folded
    into a canonical type via ``_SUFFIX_TO_FILE_TYPE``; an unrecognised
    suffix is returned as-is (lower-cased, without the dot), and a path
    with no suffix yields ``"unknown"``.
    """
    suffix = Path(path).suffix.lower().lstrip(".")
    return _SUFFIX_TO_FILE_TYPE.get(suffix, suffix or "unknown")


def to_plain_data(value: Any) -> Any:
    """Recursively convert *value* into plain dicts, lists, and scalars.

    * dataclass instances become dicts keyed by field name;
    * dict keys are converted with ``str()``;
    * lists and tuples become lists;
    * ``Path`` objects become strings;
    * anything else is returned unchanged.
    """
    # is_dataclass() is also true for dataclass *classes*, which asdict()
    # rejects with TypeError; only instances are converted, a class falls
    # through and is returned unchanged like any other unknown object.
    if is_dataclass(value) and not isinstance(value, type):
        return {key: to_plain_data(item) for key, item in asdict(value).items()}
    if isinstance(value, dict):
        return {str(key): to_plain_data(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [to_plain_data(item) for item in value]
    if isinstance(value, Path):
        return str(value)
    return value


def dumps_json(value: Any, *, indent: int = 2) -> str:
    """Serialize *value* to a JSON string after ``to_plain_data`` conversion.

    Keys are sorted and non-ASCII characters are written unescaped.
    """
    return json.dumps(
        to_plain_data(value),
        indent=indent,
        ensure_ascii=False,
        sort_keys=True,
    )


def write_json(path: str | Path, value: Any) -> None:
    """Write *value* to *path* as UTF-8 JSON followed by a newline."""
    Path(path).write_text(dumps_json(value) + "\n", encoding="utf-8")


def write_jsonl(path: str | Path, records: Iterable[Any]) -> None:
    """Write *records* to *path* as UTF-8 JSON Lines, one record per line.

    Each record is converted with ``to_plain_data`` and serialized with
    sorted keys. An empty iterable produces an empty file.
    """
    # Every record is serialized before the file is opened, so a record that
    # cannot be serialized raises before the target file is touched.
    lines = [
        json.dumps(to_plain_data(record), ensure_ascii=False, sort_keys=True)
        for record in records
    ]
    Path(path).write_text("\n".join(lines) + ("\n" if lines else ""), encoding="utf-8")


def normalize_whitespace(text: str) -> str:
    """Collapse every whitespace run in *text* to one space and strip the ends."""
    return " ".join(text.split())


def clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
    """Return *value* limited to the range ``[low, high]``.

    If *low* is greater than *high*, the result is *low*.
    """
    return max(low, min(high, value))