llm-ready-data / app /core /constants.py
Soumik Bose
ok
6c24b50
Raw
History Blame Contribute Delete
880 Bytes
from __future__ import annotations
# Every file extension listed as supported. Entries are lowercase and keep
# their leading dot. The grouping below is for readability only; the value
# is a single flat set.
# NOTE(review): ".json" and ".xml" appear here but in none of the category
# sets visible in this module -- confirm that is intentional.
SUPPORTED_EXTENSIONS = {
    # Documents and slide decks
    ".pdf",
    ".docx",
    ".doc",
    ".pptx",
    ".ppt",
    # Spreadsheets and structured data
    ".xlsx",
    ".xls",
    ".csv",
    ".json",
    ".xml",
    # Markup and plain text
    ".html",
    ".htm",
    ".txt",
    ".md",
    ".rst",
    # Images
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".bmp",
    ".webp",
    ".tiff",
    # Audio
    ".mp3",
    ".wav",
    ".ogg",
    ".flac",
    # Archives and e-books
    ".zip",
    ".epub",
}
# Image file extensions (lowercase, leading dot included).
IMAGE_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".bmp",
    ".webp",
    ".tiff",
}

# Leading fragments of image MIME type strings.
# NOTE(review): presumably compared against the start of a content type;
# str.startswith needs a tuple rather than a set -- verify at the call site.
IMAGE_MIME_PREFIXES = {
    "image/",
}
# Per-category extension sets. All entries are lowercase with a leading dot.
# The categories are not disjoint: ".xls" and ".xlsx" are members of both
# TABULAR_EXTENSIONS and OFFICE_EXTENSIONS.

# Row/column data files.
TABULAR_EXTENSIONS = {
    ".csv",
    ".xls",
    ".xlsx",
}

# Audio files.
AUDIO_EXTENSIONS = {
    ".mp3",
    ".wav",
    ".ogg",
    ".flac",
}

# Paged documents and e-books.
DOCUMENT_EXTENSIONS = {
    ".pdf",
    ".docx",
    ".doc",
    ".epub",
}

# Presentation and spreadsheet office formats.
OFFICE_EXTENSIONS = {
    ".pptx",
    ".ppt",
    ".xlsx",
    ".xls",
}

# HTML pages.
WEB_EXTENSIONS = {
    ".html",
    ".htm",
}

# Plain-text and lightweight-markup files.
TEXT_EXTENSIONS = {
    ".txt",
    ".md",
    ".rst",
}

# Archive containers.
ARCHIVE_EXTENSIONS = {
    ".zip",
}
# Size limits for tabular inputs.
# NOTE(review): presumably row caps applied when reading CSV / Excel files
# and a rows-times-columns budget for in-memory tables -- confirm against
# the code that consumes these values.
MAX_CSV_ROWS: int = 100_000
MAX_EXCEL_ROWS: int = 50_000
MAX_MEMORY_CELLS: int = 2_000_000

# OCR settings.
# NOTE(review): OCR_TEXT_SCORE looks like a confidence threshold and OCR_DPI
# like a render resolution in dots per inch -- verify with the OCR caller.
OCR_TEXT_SCORE: float = 0.5
OCR_DPI: int = 150