zeroshotGPU / zsgdp /utils.py
Arjunvir Singh
Initial commit: zeroshotGPU MVP with full eval surface
db06ffa
"""Small shared utilities."""
from __future__ import annotations
import hashlib
import json
from dataclasses import asdict, is_dataclass
from pathlib import Path
from typing import Any, Iterable
def document_id_for_path(path: str | Path) -> str:
    """Return a short hexadecimal identifier for the file at *path*.

    The identifier is the first 16 hex characters of the SHA-1 digest of
    ``"<resolved path>:<size in bytes>:<mtime truncated to whole seconds>"``.

    Args:
        path: Location of the file; it is passed to ``Path.stat()``.

    Returns:
        A 16-character lowercase hexadecimal string.

    Raises:
        OSError: Propagated from ``Path.stat()``, e.g. when the path does
            not exist.
    """
    target = Path(path)
    info = target.stat()
    # Join the three components with ":" to form the hash seed.
    components = (str(target.resolve()), str(info.st_size), str(int(info.st_mtime)))
    fingerprint = ":".join(components)
    digest = hashlib.sha1(fingerprint.encode("utf-8"))
    return digest.hexdigest()[:16]
# Lower-case file extension (without the dot) -> file-type label.
_FILE_TYPE_BY_SUFFIX = {
    "pdf": "pdf",
    "docx": "docx",
    "doc": "docx",
    "pptx": "pptx",
    "ppt": "pptx",
    "xlsx": "xlsx",
    "xls": "xlsx",
    "csv": "xlsx",
    "html": "html",
    "htm": "html",
    "png": "image",
    "jpg": "image",
    "jpeg": "image",
    "tiff": "image",
    "tif": "image",
    "bmp": "image",
    "webp": "image",
    "epub": "epub",
    "md": "markdown",
    "markdown": "markdown",
    "txt": "text",
    "text": "text",
}


def file_type_from_path(path: str | Path) -> str:
    """Map the extension of *path* to a file-type label.

    The final suffix is lower-cased and stripped of its leading dot before
    lookup. Known extensions are folded into a shared label (for example
    ``doc`` -> ``"docx"``, ``csv`` -> ``"xlsx"``, ``jpeg`` -> ``"image"``).

    Args:
        path: File name or path; only its last suffix is inspected.

    Returns:
        The mapped label, the bare extension itself when it is not in the
        table, or ``"unknown"`` when the path has no extension.
    """
    extension = Path(path).suffix.lower().lstrip(".")
    label = _FILE_TYPE_BY_SUFFIX.get(extension, extension)
    # An empty extension is falsy, so it falls through to "unknown".
    return label or "unknown"
def to_plain_data(value: Any) -> Any:
    """Recursively convert *value* into plain containers and scalars.

    Conversion rules, applied in order:

    * dataclass **instances** become dicts via ``dataclasses.asdict`` and
      the resulting items are converted recursively;
    * dicts become new dicts whose keys are passed through ``str()``;
    * lists and tuples become lists;
    * ``Path`` objects become strings;
    * anything else is returned unchanged.

    Args:
        value: Arbitrary object to convert.

    Returns:
        The converted structure, or *value* itself when no rule applies.
    """
    # is_dataclass() is also true for dataclass *classes*, on which asdict()
    # raises TypeError; only instances are expanded, classes fall through
    # to the unchanged-return path like any other leaf object.
    if is_dataclass(value) and not isinstance(value, type):
        return {key: to_plain_data(item) for key, item in asdict(value).items()}
    if isinstance(value, dict):
        return {str(key): to_plain_data(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [to_plain_data(item) for item in value]
    if isinstance(value, Path):
        return str(value)
    return value
def dumps_json(value: Any, *, indent: int = 2) -> str:
    """Serialize *value* to a JSON string with sorted keys.

    The value is first passed through ``to_plain_data``. Non-ASCII
    characters are emitted as-is rather than escaped.

    Args:
        value: Object to serialize.
        indent: Indentation width forwarded to ``json.dumps``.

    Returns:
        The JSON text, without a trailing newline.
    """
    plain = to_plain_data(value)
    return json.dumps(plain, sort_keys=True, ensure_ascii=False, indent=indent)
def write_json(path: str | Path, value: Any) -> None:
    """Write *value* to *path* as UTF-8 JSON followed by a newline.

    Args:
        path: Destination file; its contents are replaced.
        value: Object to serialize with ``dumps_json``.
    """
    destination = Path(path)
    # Serialize fully before the file is opened for writing.
    payload = f"{dumps_json(value)}\n"
    destination.write_text(payload, encoding="utf-8")
def write_jsonl(path: str | Path, records: Iterable[Any]) -> None:
    """Write *records* to *path* as UTF-8 JSON Lines.

    Each record is converted with ``to_plain_data`` and serialized on a
    single line with sorted keys and unescaped non-ASCII characters. Every
    line ends with a newline; an empty iterable produces an empty file.

    Args:
        path: Destination file; its contents are replaced.
        records: Objects to serialize, one per output line.
    """
    # All records are serialized before the file is written.
    serialized = []
    for record in records:
        plain = to_plain_data(record)
        serialized.append(json.dumps(plain, sort_keys=True, ensure_ascii=False))
    payload = "".join(f"{line}\n" for line in serialized)
    Path(path).write_text(payload, encoding="utf-8")
def normalize_whitespace(text: str) -> str:
    """Collapse every run of whitespace in *text* to a single space.

    Leading and trailing whitespace is removed as a consequence of
    splitting on whitespace and re-joining the pieces.

    Args:
        text: Input string.

    Returns:
        The whitespace-separated tokens of *text* joined by single spaces.
    """
    tokens = text.split()
    return " ".join(tokens)
def clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
    """Limit *value* to the range bounded by *low* and *high*.

    Args:
        value: Number to restrict.
        low: Lower bound, 0.0 by default.
        high: Upper bound, 1.0 by default.

    Returns:
        *value* when it lies strictly between the bounds, otherwise the
        bound it reaches or exceeds.
    """
    # Apply the upper bound first, then the lower bound.
    capped = value if value < high else high
    return capped if capped > low else low