#!/usr/bin/env python
"""
Document Validator - Validation utilities for document uploads
"""
import mimetypes
import os
import re
from typing import ClassVar
class DocumentValidator:
"""Document validation utilities"""
# Maximum file size in bytes (100MB)
MAX_FILE_SIZE: ClassVar[int] = 100 * 1024 * 1024
# Maximum file size for PDF processing (50MB to prevent resource exhaustion)
MAX_PDF_SIZE: ClassVar[int] = 50 * 1024 * 1024
# Allowed file extensions
ALLOWED_EXTENSIONS: ClassVar[set[str]] = {
".pdf",
".txt",
".md",
".doc",
".docx",
".rtf",
".html",
".htm",
".xml",
".json",
".csv",
".xlsx",
".xls",
".pptx",
".ppt",
}
# MIME type mapping for additional validation
ALLOWED_MIME_TYPES: ClassVar[set[str]] = {
"application/pdf",
"text/plain",
"text/markdown",
"application/msword",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/rtf",
"text/html",
"application/xml",
"text/xml",
"application/json",
"text/csv",
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
}
@staticmethod
def validate_upload_safety(
filename: str, file_size: int | None, allowed_extensions: set[str] | None = None
) -> str:
"""
Validate file upload safety
Args:
filename: Name of the file
file_size: Size of the file in bytes, or None to skip size validation
allowed_extensions: Optional override for allowed extensions
Returns:
Sanitized filename safe for filesystem use
Raises:
ValueError: If validation fails
"""
# Check file size (skip if size is None)
if file_size is not None and file_size > DocumentValidator.MAX_FILE_SIZE:
raise ValueError(
f"File too large: {file_size} bytes. Maximum allowed: {DocumentValidator.MAX_FILE_SIZE} bytes"
)
# Additional size check for PDFs to prevent resource exhaustion
_, ext = os.path.splitext(filename.lower())
if ext == ".pdf" and file_size is not None and file_size > DocumentValidator.MAX_PDF_SIZE:
raise ValueError(
f"PDF file too large: {file_size} bytes. Maximum allowed for PDFs: {DocumentValidator.MAX_PDF_SIZE} bytes"
)
# Sanitize filename - remove path components and dangerous characters
# Extract just the filename, removing any path components
safe_name = os.path.basename(filename)
# Remove null bytes and other control characters
safe_name = re.sub(r"[\x00-\x1f\x7f]", "", safe_name)
# Replace problematic characters
safe_name = re.sub(r'[<>:"/\\|?*]', "_", safe_name)
if not safe_name or safe_name in (".", "..") or safe_name.strip("_") == "":
raise ValueError("Invalid filename")
# Check file extension
exts_to_check = allowed_extensions or DocumentValidator.ALLOWED_EXTENSIONS
if ext not in exts_to_check:
raise ValueError(
f"Unsupported file type: {ext}. Allowed types: {', '.join(exts_to_check)}"
)
# Additional MIME type validation for security
guessed_mime, _ = mimetypes.guess_type(filename.lower())
if guessed_mime and guessed_mime not in DocumentValidator.ALLOWED_MIME_TYPES:
raise ValueError(
f"MIME type validation failed: {guessed_mime}. File may be malicious or corrupted."
)
return safe_name
@staticmethod
def get_file_info(filename: str, file_size: int) -> dict:
"""
Get file information
Args:
filename: Name of the file
file_size: Size of the file in bytes
Returns:
Dictionary with file information
"""
_, ext = os.path.splitext(filename.lower())
return {
"filename": filename,
"extension": ext,
"size_bytes": file_size,
"size_mb": round(file_size / (1024 * 1024), 2),
"is_allowed": ext in DocumentValidator.ALLOWED_EXTENSIONS,
}
@staticmethod
def validate_file(path: str) -> dict:
"""
Validate that a file exists, is readable, and has valid content.
Args:
path: Path to the file to validate
Returns:
File info dictionary
Raises:
ValueError: If file is missing or validation fails
"""
if not os.path.exists(path):
raise ValueError(f"File not found: {path}")
if not os.path.isfile(path):
raise ValueError(f"Not a file: {path}")
if not os.access(path, os.R_OK):
raise ValueError(f"File not readable: {path}")
size = os.path.getsize(path)
filename = os.path.basename(path)
# Validate using validate_upload_safety
safe_name = DocumentValidator.validate_upload_safety(filename, size)
return DocumentValidator.get_file_info(safe_name, size)
|