# DEPENDENCIES
from dataclasses import dataclass
from dataclasses import field
from typing import Dict
from typing import List
from typing import Tuple

from config.enums import Script


@dataclass(frozen=True)
class DocumentExtractionParams:
    """
    Hyperparameters for Document Extraction

    Instances are frozen: attributes cannot be rebound after construction.
    """

    # Supported file extensions (lower-case, including the leading dot)
    SUPPORTED_EXTENSIONS: frozenset = frozenset(
        {
            ".txt", ".text", ".md", ".markdown", ".log", ".csv",
            ".pdf", ".docx", ".doc", ".rtf", ".html", ".htm",
        }
    )

    # Text file extensions (a subset of SUPPORTED_EXTENSIONS)
    TEXT_EXTENSIONS: frozenset = frozenset(
        {".txt", ".text", ".md", ".markdown", ".log", ".csv"}
    )

    # Maximum file size (50 MB default), expressed in bytes
    MAX_FILE_SIZE: int = 50 * 1024 * 1024


@dataclass(frozen=True)
class LanguageDetectionParams:
    """
    Hyperparameters for Language Detection

    NOTE: ``frozen=True`` only prevents rebinding attributes; the dict values
    built by the default factories below can still be mutated in place, so
    treat them as read-only.
    """

    # Text length constraints
    MINIMUM_TEXT_LENGTH: int = 20

    # Chunking parameters
    MAX_CHUNK_LENGTH: int = 500
    MIN_CHUNK_LENGTH: int = 50
    FIXED_CHUNK_SIZE: int = 1000

    # Model parameters
    MODEL_MAX_LENGTH: int = 512
    TOP_K_PREDICTIONS: int = 3

    # Confidence thresholds
    LOW_CONFIDENCE_THRESHOLD: float = 0.6
    MULTILINGUAL_THRESHOLD: float = 0.2
    SCRIPT_DOMINANCE_THRESHOLD: float = 0.7
    LANGUAGE_MATCH_THRESHOLD: float = 0.7

    # Quality assessment
    WORD_BOUNDARY_RATIO: float = 0.7
    MIXED_DOMAIN_CONFIDENCE_PENALTY: float = 0.8

    # Language name mappings (language code -> display name)
    LANGUAGE_NAMES: Dict[str, str] = field(
        default_factory=lambda: {
            "en": "English",
            "es": "Spanish",
            "fr": "French",
            "de": "German",
            "it": "Italian",
            "pt": "Portuguese",
            "ru": "Russian",
            "zh": "Chinese",
            "ja": "Japanese",
            "ko": "Korean",
            "ar": "Arabic",
            "hi": "Hindi",
        }
    )

    # Unicode script ranges: script name -> list of (first, last) code points.
    # For "latin", A-Z and a-z are listed as two separate ranges so that the
    # ASCII punctuation lying between them (U+005B-U+0060: [ \ ] ^ _ `) is not
    # counted as Latin script.
    SCRIPT_RANGES: Dict[str, List[Tuple[int, int]]] = field(
        default_factory=lambda: {
            "latin": [(0x0041, 0x005A), (0x0061, 0x007A), (0x00C0, 0x024F)],
            "cyrillic": [(0x0400, 0x04FF)],
            "arabic": [(0x0600, 0x06FF), (0x0750, 0x077F)],
            "chinese": [(0x4E00, 0x9FFF), (0x3400, 0x4DBF)],
            "japanese": [(0x3040, 0x309F), (0x30A0, 0x30FF)],
            "korean": [(0xAC00, 0xD7AF), (0x1100, 0x11FF)],
            "devanagari": [(0x0900, 0x097F)],
            "greek": [(0x0370, 0x03FF)],
            "hebrew": [(0x0590, 0x05FF)],
            "thai": [(0x0E00, 0x0E7F)],
        }
    )


@dataclass(frozen=True)
class TextProcessingParams:
    """
    Hyperparameters for Text Processing

    NOTE: ``frozen=True`` only prevents rebinding attributes; the list built
    by the default factory below can still be mutated in place, so treat it
    as read-only.
    """

    # Text length constraints
    MINIMUM_TEXT_LENGTH: int = 20
    MAXIMUM_TEXT_LENGTH: int = 1000000  # 1M characters

    # Text cleaning options
    PRESERVE_FORMATTING: bool = False
    REMOVE_URLS: bool = True
    REMOVE_EMAILS: bool = True
    NORMALIZE_UNICODE: bool = True
    FIX_ENCODING: bool = True

    # Validation thresholds
    MINIMUM_WORD_COUNT: int = 10

    # Common abbreviations for sentence splitting.
    # Each entry appears exactly once ("min.", "vol." and "cf." were
    # previously listed twice); relative order is otherwise preserved.
    COMMON_ABBREVIATIONS: List[str] = field(
        default_factory=lambda: [
            "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Rev.", "Gen.", "Sen.", "Rep.",
            "St.", "Ave.", "Blvd.", "Rd.", "Pkwy.",
            "Co.", "Ltd.", "Inc.", "Corp.",
            "vs.", "etc.", "e.g.", "i.e.", "c.", "ca.", "cf.", "al.", "et al.",
            "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.", "Aug.", "Sep.",
            "Oct.", "Nov.", "Dec.",
            "Mon.", "Tue.", "Wed.", "Thu.", "Fri.", "Sat.", "Sun.",
            "kg.", "g.", "mg.", "km.", "m.", "cm.", "mm.",
            "hr.", "min.", "sec.",
            "vol.", "no.", "p.", "pp.", "ch.", "fig.", "ed.", "trans.",
            "approx.", "est.", "max.", "avg.", "std.", "temp.", "pres.",
            "ibid.", "op.", "cit.", "loc.", "viz.", "sc.", "seq.",
        ]
    )


@dataclass(frozen=True)
class DomainClassificationParams:
    """
    Hyperparameters for Domain Classification

    NOTE: ``frozen=True`` only prevents rebinding attributes; the dict built
    by the default factory below can still be mutated in place, so treat it
    as read-only.
    """

    # Classification parameters
    TOP_K_DOMAINS: int = 2
    MIN_CONFIDENCE_THRESHOLD: float = 0.3

    # Confidence thresholds
    HIGH_CONFIDENCE_THRESHOLD: float = 0.7
    MEDIUM_CONFIDENCE_THRESHOLD: float = 0.6
    LOW_CONFIDENCE_THRESHOLD: float = 0.5
    SECONDARY_DOMAIN_MIN_SCORE: float = 0.1

    # Mixed domain detection
    MIXED_DOMAIN_PRIMARY_MAX: float = 0.7
    MIXED_DOMAIN_SECONDARY_MIN: float = 0.3
    MIXED_DOMAIN_RATIO_THRESHOLD: float = 0.6
    MIXED_DOMAIN_CONFIDENCE_PENALTY: float = 0.8

    # Text preprocessing
    MAX_WORDS_FOR_CLASSIFICATION: int = 400

    # Domain labels for zero-shot classification
    # (domain key -> candidate label phrases)
    DOMAIN_LABELS: Dict[str, List[str]] = field(
        default_factory=lambda: {
            "academic": [
                "academic paper", "research article", "scientific paper",
                "scholarly writing", "thesis", "dissertation",
                "academic research",
            ],
            "creative": [
                "creative writing", "fiction", "story", "narrative", "poetry",
                "literary work", "imaginative writing",
            ],
            "ai_ml": [
                "artificial intelligence", "machine learning",
                "neural networks", "data science", "AI research",
                "deep learning",
            ],
            "software_dev": [
                "software development", "programming", "coding",
                "software engineering", "web development",
                "application development",
            ],
            "technical_doc": [
                "technical documentation", "user manual", "API documentation",
                "technical guide", "system documentation",
            ],
            "engineering": [
                "engineering document", "technical design",
                "engineering analysis", "mechanical engineering",
                "electrical engineering",
            ],
            "science": [
                "scientific research", "physics", "chemistry", "biology",
                "scientific study", "experimental results",
            ],
            "business": [
                "business document", "corporate communication",
                "business report", "professional writing",
                "executive summary",
            ],
            "journalism": [
                "news article", "journalism", "press release", "news report",
                "media content", "reporting",
            ],
            "social_media": [
                "social media post", "casual writing", "online content",
                "informal text", "social media content",
            ],
            "blog_personal": [
                "personal blog", "personal writing", "lifestyle blog",
                "personal experience", "opinion piece", "diary entry",
            ],
            "legal": [
                "legal document", "contract", "legal writing", "law",
                "legal agreement", "legal analysis",
            ],
            "medical": [
                "medical document", "healthcare", "clinical", "medical report",
                "health information", "medical research",
            ],
            "marketing": [
                "marketing content", "advertising", "brand content",
                "promotional writing", "sales copy", "marketing material",
            ],
            "tutorial": [
                "tutorial", "how-to guide", "instructional content",
                "step-by-step guide", "educational guide", "learning material",
            ],
            "general": [
                "general content", "everyday writing", "common text",
                "standard writing", "normal text", "general information",
            ],
        }
    )


@dataclass(frozen=True)
class BaseMetricParams:
    """
    Hyperparameters for BaseMetric class
    """

    DEFAULT_AUTHENTIC_PROBABILITY: float = 0.5
    DEFAULT_SYNTHETIC_PROBABILITY: float = 0.5
    DEFAULT_HYBRID_PROBABILITY: float = 0.0
    DEFAULT_CONFIDENCE: float = 0.0


@dataclass(frozen=True)
class StructuralMetricParams:
    """
    Hyperparameters for Structural Metric
    """

    # Domain threshold application - PROBABILITY CONSTANTS
    STRONG_SYNTHETIC_BASE_PROB: float = 0.7
    STRONG_AUTHENTIC_BASE_PROB: float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT: float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START: float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START: float = 0.7
    UNCERTAIN_RANGE_WIDTH: float = 0.4
    NEUTRAL_PROBABILITY: float = 0.5  # For fallback
    MIN_PROBABILITY: float = 0.0
    MAX_PROBABILITY: float = 1.0

    # Feature extraction - sentence splitting
    # NOTE(review): the reviewed text is cut off in the middle of the
    # SENTENCE_SPLIT_PATTERN raw-string literal (it ends right after "(?").
    # That field, and any fields that follow it, are intentionally not
    # reconstructed here - restore them verbatim from the original file.