Spaces:
Sleeping
Sleeping
File size: 1,446 Bytes
9d8a0cf | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 | from __future__ import annotations
import logging
from pathlib import Path
logger = logging.getLogger(__name__)
_EXT_MAP: dict[str, str] = {
".pdf": "pdf",
".docx": "docx",
".doc": "docx",
".xml": "xml",
".txt": "text",
".md": "text",
".markdown": "text",
".csv": "csv",
".xlsx": "xlsx",
".xls": "xlsx",
".html": "html",
".htm": "html",
}
_MIME_MAP: dict[str, str] = {
"application/pdf": "pdf",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
"application/msword": "docx",
"text/xml": "xml",
"application/xml": "xml",
"text/plain": "text",
"text/markdown": "text",
"text/csv": "csv",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
"application/vnd.ms-excel": "xlsx",
"text/html": "html",
}
def detect_format(path: str) -> str:
"""Return a normalized format string or 'unknown'."""
p = Path(path)
ext = p.suffix.lower()
if ext in _EXT_MAP:
return _EXT_MAP[ext]
# MIME fallback via python-magic (optional dep)
try:
import magic
mime = magic.from_file(str(p), mime=True)
if mime in _MIME_MAP:
return _MIME_MAP[mime]
except ImportError:
pass
except Exception:
logger.debug("detector: magic failed for %s", path)
logger.warning("detector: unknown format for %s", path)
return "unknown"
|