Spaces:
Running on Zero
Running on Zero
| """Small shared utilities.""" | |
| from __future__ import annotations | |
| import hashlib | |
| import json | |
| from dataclasses import asdict, is_dataclass | |
| from pathlib import Path | |
| from typing import Any, Iterable | |
def document_id_for_path(path: str | Path) -> str:
    """Return a short hexadecimal identifier for the file at *path*.

    The identifier is the first 16 hex characters of the SHA-1 digest of
    the resolved path, the file size in bytes and the modification time
    truncated to whole seconds, joined by colons.

    Raises:
        OSError: If the file cannot be stat'ed (for example, it is missing).
    """
    target = Path(path)
    info = target.stat()
    # The id changes whenever the location, size or mtime (in seconds) changes.
    parts = (str(target.resolve()), str(info.st_size), str(int(info.st_mtime)))
    digest = hashlib.sha1(":".join(parts).encode("utf-8"))
    return digest.hexdigest()[:16]
# Maps a lower-cased file extension (without the dot) to its file-type label.
_FILE_TYPE_BY_SUFFIX = {
    "pdf": "pdf",
    "docx": "docx",
    "doc": "docx",
    "pptx": "pptx",
    "ppt": "pptx",
    "xlsx": "xlsx",
    "xls": "xlsx",
    "csv": "xlsx",
    "html": "html",
    "htm": "html",
    "png": "image",
    "jpg": "image",
    "jpeg": "image",
    "tiff": "image",
    "tif": "image",
    "bmp": "image",
    "webp": "image",
    "epub": "epub",
    "md": "markdown",
    "markdown": "markdown",
    "txt": "text",
    "text": "text",
}


def file_type_from_path(path: str | Path) -> str:
    """Return a file-type label derived from the extension of *path*.

    The extension is compared case-insensitively. Known extensions are
    folded into a family label (e.g. ``doc`` -> ``"docx"``, ``jpeg`` ->
    ``"image"``); an unrecognised extension is returned as-is in lower
    case, and a path without an extension yields ``"unknown"``.
    """
    extension = Path(path).suffix.lower().lstrip(".")
    if not extension:
        return "unknown"
    return _FILE_TYPE_BY_SUFFIX.get(extension, extension)
def to_plain_data(value: Any) -> Any:
    """Recursively convert *value* into plain, JSON-friendly containers.

    Conversions applied:

    * dataclass instances become dicts (via ``dataclasses.asdict``), whose
      values are converted recursively;
    * dicts become dicts with ``str`` keys and converted values;
    * lists and tuples become lists of converted items;
    * ``Path`` objects become strings.

    Any other value, including a dataclass *class* (as opposed to an
    instance), is returned unchanged.
    """
    # is_dataclass() is also true for dataclass classes, but asdict() only
    # accepts instances and raises TypeError on a class, so exclude types.
    if is_dataclass(value) and not isinstance(value, type):
        return {key: to_plain_data(item) for key, item in asdict(value).items()}
    if isinstance(value, dict):
        return {str(key): to_plain_data(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [to_plain_data(item) for item in value]
    if isinstance(value, Path):
        return str(value)
    return value
def dumps_json(value: Any, *, indent: int = 2) -> str:
    """Serialize *value* to a JSON string.

    The value is first passed through ``to_plain_data``. Output keeps
    non-ASCII characters unescaped, sorts object keys, and is indented by
    *indent* spaces.
    """
    plain = to_plain_data(value)
    return json.dumps(
        plain,
        sort_keys=True,
        ensure_ascii=False,
        indent=indent,
    )
def write_json(path: str | Path, value: Any) -> None:
    """Write *value* to *path* as UTF-8 JSON followed by a trailing newline.

    Serialization is delegated to ``dumps_json`` with its default indent.
    """
    destination = Path(path)
    payload = f"{dumps_json(value)}\n"
    destination.write_text(payload, encoding="utf-8")
def write_jsonl(path: str | Path, records: Iterable[Any]) -> None:
    """Write *records* to *path* as UTF-8 JSON Lines.

    Each record is passed through ``to_plain_data`` and serialized on its
    own line with sorted keys and non-ASCII characters left unescaped.
    Every line ends with a newline; no records produces an empty file.
    """
    # Serialize everything before touching the file, so a record that fails
    # to serialize does not leave a partially written file behind.
    serialized = []
    for record in records:
        plain = to_plain_data(record)
        serialized.append(json.dumps(plain, sort_keys=True, ensure_ascii=False))
    payload = "".join(f"{line}\n" for line in serialized)
    Path(path).write_text(payload, encoding="utf-8")
def normalize_whitespace(text: str) -> str:
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is removed, so a blank or empty input
    yields an empty string.
    """
    # str.split() with no separator splits on whitespace runs and drops
    # empty leading/trailing pieces.
    words = text.split()
    return " ".join(words)
def clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
    """Restrict *value* to the range from *low* to *high*.

    The upper bound is applied first and the lower bound second, so when
    ``low > high`` the result is *low*.
    """
    # Cap at the upper bound; anything not strictly below it becomes `high`.
    capped = value if value < high else high
    # Raise to the lower bound; anything not strictly above it becomes `low`.
    return capped if capped > low else low