Spaces:
Sleeping
Sleeping
| # DEPENDENCIES | |
| from typing import Dict | |
| from typing import List | |
| from typing import Tuple | |
| from dataclasses import field | |
| from config.enums import Script | |
| from dataclasses import dataclass | |
class DocumentExtractionParams:
    """
    Hyperparameters for Document Extraction
    """
    # Plain-text formats handled directly as text
    TEXT_EXTENSIONS : frozenset = frozenset({'.txt', '.text', '.md', '.markdown', '.log', '.csv'})
    # Everything accepted for extraction: the text formats plus rich-document formats
    SUPPORTED_EXTENSIONS : frozenset = TEXT_EXTENSIONS | frozenset({'.pdf', '.docx', '.doc', '.rtf', '.html', '.htm'})
    # Maximum file size (50 MB default)
    MAX_FILE_SIZE : int = 50 * 2 ** 20
@dataclass
class LanguageDetectionParams:
    """
    Hyperparameters for Language Detection

    Must be decorated with @dataclass: the mapping attributes below use
    ``field(default_factory=...)``, which without the decorator would leave
    bare ``dataclasses.Field`` objects on the class instead of dictionaries.
    """
    # Text length constraints
    MINIMUM_TEXT_LENGTH : int = 20
    # Chunking parameters
    MAX_CHUNK_LENGTH : int = 500
    MIN_CHUNK_LENGTH : int = 50
    FIXED_CHUNK_SIZE : int = 1000
    # Model parameters
    MODEL_MAX_LENGTH : int = 512
    TOP_K_PREDICTIONS : int = 3
    # Confidence thresholds
    LOW_CONFIDENCE_THRESHOLD : float = 0.6
    MULTILINGUAL_THRESHOLD : float = 0.2
    SCRIPT_DOMINANCE_THRESHOLD : float = 0.7
    LANGUAGE_MATCH_THRESHOLD : float = 0.7
    # Quality assessment
    WORD_BOUNDARY_RATIO : float = 0.7
    MIXED_DOMAIN_CONFIDENCE_PENALTY : float = 0.8
    # Language name mappings (ISO 639-1 code -> display name)
    LANGUAGE_NAMES : Dict[str, str] = field(default_factory = lambda : {"en": "English",
                                                                       "es": "Spanish",
                                                                       "fr": "French",
                                                                       "de": "German",
                                                                       "it": "Italian",
                                                                       "pt": "Portuguese",
                                                                       "ru": "Russian",
                                                                       "zh": "Chinese",
                                                                       "ja": "Japanese",
                                                                       "ko": "Korean",
                                                                       "ar": "Arabic",
                                                                       "hi": "Hindi",
                                                                       })
    # Unicode script ranges: script name -> inclusive (start, end) code-point spans
    SCRIPT_RANGES : Dict[str, List[Tuple[int, int]]] = field(default_factory = lambda: {"latin" : [(0x0041, 0x007A), (0x00C0, 0x024F)],
                                                                                        "cyrillic" : [(0x0400, 0x04FF)],
                                                                                        "arabic" : [(0x0600, 0x06FF), (0x0750, 0x077F)],
                                                                                        "chinese" : [(0x4E00, 0x9FFF), (0x3400, 0x4DBF)],
                                                                                        "japanese" : [(0x3040, 0x309F), (0x30A0, 0x30FF)],
                                                                                        "korean" : [(0xAC00, 0xD7AF), (0x1100, 0x11FF)],
                                                                                        "devanagari" : [(0x0900, 0x097F)],
                                                                                        "greek" : [(0x0370, 0x03FF)],
                                                                                        "hebrew" : [(0x0590, 0x05FF)],
                                                                                        "thai" : [(0x0E00, 0x0E7F)],
                                                                                        })
@dataclass
class TextProcessingParams:
    """
    Hyperparameters for Text Processing

    Must be decorated with @dataclass: COMMON_ABBREVIATIONS uses
    ``field(default_factory=...)``, which without the decorator would leave
    a bare ``dataclasses.Field`` object on the class instead of a list.
    """
    # Text length constraints
    MINIMUM_TEXT_LENGTH : int = 20
    MAXIMUM_TEXT_LENGTH : int = 1000000  # 1M characters
    # Text cleaning options
    PRESERVE_FORMATTING : bool = False
    REMOVE_URLS : bool = True
    REMOVE_EMAILS : bool = True
    NORMALIZE_UNICODE : bool = True
    FIX_ENCODING : bool = True
    # Validation thresholds
    MINIMUM_WORD_COUNT : int = 10
    # Common abbreviations for sentence splitting
    # NOTE(review): a few entries appear twice ("vol.", "cf.", "min.") — harmless
    # for membership tests, but verify no consumer relies on uniqueness.
    COMMON_ABBREVIATIONS : list = field(default_factory = lambda: ["Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Rev.", "Gen.", "Sen.", "Rep.", "St.", "Ave.", "Blvd.", "Rd.", "Pkwy.", "Co.", "Ltd.", "Inc.", "Corp.",
                                                                   "vs.", "etc.", "e.g.", "i.e.", "c.", "ca.", "cf.", "al.", "et al.", "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.", "Aug.", "Sep.", "Oct.",
                                                                   "Nov.", "Dec.", "Mon.", "Tue.", "Wed.", "Thu.", "Fri.", "Sat.", "Sun.", "kg.", "g.", "mg.", "km.", "m.", "cm.", "mm.", "hr.", "min.", "sec.",
                                                                   "vol.", "no.", "p.", "pp.", "ch.", "fig.", "ed.", "trans.", "approx.", "est.", "max.", "min.", "avg.", "std.", "temp.", "pres.", "vol.", "ibid.",
                                                                   "op.", "cit.", "loc.", "cf.", "viz.", "sc.", "seq."
                                                                   ])
@dataclass
class DomainClassificationParams:
    """
    Hyperparameters for Domain Classification

    Must be decorated with @dataclass: DOMAIN_LABELS uses
    ``field(default_factory=...)``, which without the decorator would leave
    a bare ``dataclasses.Field`` object on the class instead of a dict.
    """
    # Classification parameters
    TOP_K_DOMAINS : int = 2
    MIN_CONFIDENCE_THRESHOLD : float = 0.3
    # Confidence thresholds
    HIGH_CONFIDENCE_THRESHOLD : float = 0.7
    MEDIUM_CONFIDENCE_THRESHOLD : float = 0.6
    LOW_CONFIDENCE_THRESHOLD : float = 0.5
    SECONDARY_DOMAIN_MIN_SCORE : float = 0.1
    # Mixed domain detection
    MIXED_DOMAIN_PRIMARY_MAX : float = 0.7
    MIXED_DOMAIN_SECONDARY_MIN : float = 0.3
    MIXED_DOMAIN_RATIO_THRESHOLD : float = 0.6
    MIXED_DOMAIN_CONFIDENCE_PENALTY : float = 0.8
    # Text preprocessing
    MAX_WORDS_FOR_CLASSIFICATION : int = 400
    # Domain labels for zero-shot classification (domain key -> candidate label phrases)
    DOMAIN_LABELS : Dict[str, List[str]] = field(default_factory = lambda : {"academic" : ["academic paper", "research article", "scientific paper", "scholarly writing", "thesis", "dissertation", "academic research"],
                                                                             "creative" : ["creative writing", "fiction", "story", "narrative", "poetry", "literary work", "imaginative writing"],
                                                                             "ai_ml" : ["artificial intelligence", "machine learning", "neural networks", "data science", "AI research", "deep learning"],
                                                                             "software_dev" : ["software development", "programming", "coding", "software engineering", "web development", "application development"],
                                                                             "technical_doc" : ["technical documentation", "user manual", "API documentation", "technical guide", "system documentation"],
                                                                             "engineering" : ["engineering document", "technical design", "engineering analysis", "mechanical engineering", "electrical engineering"],
                                                                             "science" : ["scientific research", "physics", "chemistry", "biology", "scientific study", "experimental results"],
                                                                             "business" : ["business document", "corporate communication", "business report", "professional writing", "executive summary"],
                                                                             "journalism" : ["news article", "journalism", "press release", "news report", "media content", "reporting"],
                                                                             "social_media" : ["social media post", "casual writing", "online content", "informal text", "social media content"],
                                                                             "blog_personal" : ["personal blog", "personal writing", "lifestyle blog", "personal experience", "opinion piece", "diary entry"],
                                                                             "legal" : ["legal document", "contract", "legal writing", "law", "legal agreement", "legal analysis"],
                                                                             "medical" : ["medical document", "healthcare", "clinical", "medical report", "health information", "medical research"],
                                                                             "marketing" : ["marketing content", "advertising", "brand content", "promotional writing", "sales copy", "marketing material"],
                                                                             "tutorial" : ["tutorial", "how-to guide", "instructional content", "step-by-step guide", "educational guide", "learning material"],
                                                                             "general" : ["general content", "everyday writing", "common text", "standard writing", "normal text", "general information"],
                                                                             })
class BaseMetricParams:
    """
    Hyperparameters for BaseMetric class
    """
    # Default probabilities: authentic/synthetic start as an even 50/50
    # split, hybrid at zero.
    DEFAULT_AUTHENTIC_PROBABILITY : float = 0.5
    DEFAULT_SYNTHETIC_PROBABILITY : float = 0.5
    DEFAULT_HYBRID_PROBABILITY : float = 0.0
    # Default confidence when nothing has been computed yet
    DEFAULT_CONFIDENCE : float = 0.0
class StructuralMetricParams:
    """
    Hyperparameters for Structural Metric
    """
    # Domain threshold application - PROBABILITY CONSTANTS
    STRONG_SYNTHETIC_BASE_PROB : float = 0.7
    STRONG_AUTHENTIC_BASE_PROB : float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT : float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START : float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START : float = 0.7
    UNCERTAIN_RANGE_WIDTH : float = 0.4
    NEUTRAL_PROBABILITY : float = 0.5  # For fallback
    MIN_PROBABILITY : float = 0.0
    MAX_PROBABILITY : float = 1.0
    # Feature extraction - sentence splitting
    # Splits on '.', '?' or '!' followed by whitespace, with lookbehinds to
    # avoid breaking on dotted abbreviations (e.g. "U.S." or "Dr.")
    SENTENCE_SPLIT_PATTERN : str = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
    WORD_TOKENIZE_PATTERN : str = r'\b\w+\b'
    PUNCTUATION_PATTERN : str = r'[^\w\s]'
    # Burstiness calculation
    BURSTINESS_NORMALIZATION_FACTOR : float = 2.0
    # Readability calculation — the standard Flesch Reading Ease coefficients
    FLESCH_CONSTANT_1 : float = 206.835
    FLESCH_CONSTANT_2 : float = 1.015
    FLESCH_CONSTANT_3 : float = 84.6
    NEUTRAL_READABILITY_SCORE : float = 50.0
    MIN_READABILITY_SCORE : float = 0.0
    MAX_READABILITY_SCORE : float = 100.0
    # Repetition detection
    REPETITION_WINDOW_SIZE : int = 10
    MIN_WORDS_FOR_REPETITION : int = 10
    # N-gram analysis
    BIGRAM_N : int = 2
    TRIGRAM_N : int = 3
    # Synthetic probability calculation thresholds
    BURSTINESS_LOW_THRESHOLD : float = 0.3
    BURSTINESS_MEDIUM_THRESHOLD : float = 0.5
    LENGTH_UNIFORMITY_HIGH_THRESHOLD : float = 0.7
    LENGTH_UNIFORMITY_MEDIUM_THRESH : float = 0.5
    BIGRAM_DIVERSITY_LOW_THRESHOLD : float = 0.7
    READABILITY_SYNTHETIC_MIN : float = 60.0
    READABILITY_SYNTHETIC_MAX : float = 75.0
    REPETITION_LOW_THRESHOLD : float = 0.1
    REPETITION_MEDIUM_THRESHOLD : float = 0.2
    # Synthetic probability weights
    # NOTE(review): VERY_WEAK (0.4) is larger than WEAK (0.3), which breaks
    # the naming order — confirm the values are intentional.
    STRONG_SYNTHETIC_WEIGHT : float = 0.7
    MODERATE_SYNTHETIC_WEIGHT : float = 0.5
    WEAK_SYNTHETIC_WEIGHT : float = 0.3
    VERY_WEAK_SYNTHETIC_WEIGHT : float = 0.4
    NEUTRAL_WEIGHT : float = 0.5
    # Confidence calculation
    CONFIDENCE_STD_NORMALIZER : float = 0.5
    MIN_CONFIDENCE : float = 0.1
    MAX_CONFIDENCE : float = 0.9
    NEUTRAL_CONFIDENCE : float = 0.5  # For fallback
    # Hybrid probability calculation
    BURSTINESS_HIGH_THRESHOLD : float = 0.6
    SENTENCE_LENGTH_VARIANCE_RATIO : float = 0.8
    TYPE_TOKEN_RATIO_EXTREME_LOW : float = 0.3
    TYPE_TOKEN_RATIO_EXTREME_HIGH : float = 0.9
    READABILITY_EXTREME_LOW : float = 20.0
    READABILITY_EXTREME_HIGH : float = 90.0
    MODERATE_HYBRID_WEIGHT : float = 0.4
    WEAK_HYBRID_WEIGHT : float = 0.3
    MAX_HYBRID_PROBABILITY : float = 0.3
    # Feature validation
    MIN_SENTENCE_LENGTH_FOR_STD : int = 2
    MIN_WORD_LENGTH_FOR_STD : int = 2
    MIN_VALUES_FOR_BURSTINESS : int = 2
    MIN_WORDS_FOR_NGRAM : int = 2  # For n-gram where n=2
    # Math and normalization
    ZERO_TOLERANCE : float = 1e-10
    ZERO_VALUE : float = 0.0
    ONE_VALUE : float = 1.0
class SemanticAnalysisParams:
    """
    Hyperparameters for Semantic Analysis Metric
    """
    # Text validation
    MIN_TEXT_LENGTH_FOR_ANALYSIS : int = 50
    MIN_SENTENCES_FOR_ANALYSIS : int = 3
    MIN_SENTENCE_LENGTH : int = 10
    MIN_VALID_SENTENCE_LENGTH : int = 5
    # Domain threshold application - PROBABILITY CONSTANTS
    STRONG_SYNTHETIC_BASE_PROB : float = 0.7
    STRONG_AUTHENTIC_BASE_PROB : float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT : float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START : float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START : float = 0.7
    UNCERTAIN_RANGE_WIDTH : float = 0.4
    NEUTRAL_PROBABILITY : float = 0.5
    MIN_PROBABILITY : float = 0.0
    MAX_PROBABILITY : float = 1.0
    # Sentence splitting
    # Splits on '.', '?' or '!' followed by whitespace, avoiding dotted abbreviations
    SENTENCE_SPLIT_PATTERN : str = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
    # Only alphabetic words of 4+ characters
    WORD_EXTRACTION_PATTERN : str = r'\b[a-zA-Z]{4,}\b'
    # Coherence calculation
    HIGH_COHERENCE_SIMILARITY : float = 0.8
    SIMILARITY_VARIANCE_FACTOR : float = 5.0
    # Repetition detection
    REPETITION_SIMILARITY_THRESHOLD : float = 0.8
    REPETITION_SCORE_SCALING : float = 3.0
    MIN_SENTENCES_FOR_REPETITION : int = 5
    # Topic drift calculation
    START_SECTION_SIZE : int = 3
    END_SECTION_SIZE : int = 3
    SECTION_SIZE_RATIO : int = 3  # denominator for section size calculation
    # Chunk analysis
    CHUNK_SIZE_WORDS : int = 200
    CHUNK_OVERLAP_RATIO : float = 0.5  # 50% overlap
    MIN_CHUNK_LENGTH : int = 50
    MIN_SENTENCES_PER_CHUNK : int = 2
    # Keyword analysis
    MIN_WORDS_FOR_KEYWORD_ANALYSIS : int = 10
    TOP_KEYWORDS_COUNT : int = 10
    MIN_KEYWORD_FREQUENCY : int = 2
    # Synthetic probability thresholds
    COHERENCE_HIGH_THRESHOLD : float = 0.7
    COHERENCE_MEDIUM_THRESHOLD : float = 0.5
    CONSISTENCY_HIGH_THRESHOLD : float = 0.8
    CONSISTENCY_MEDIUM_THRESHOLD : float = 0.6
    REPETITION_HIGH_THRESHOLD : float = 0.3
    REPETITION_MEDIUM_THRESHOLD : float = 0.1
    TOPIC_DRIFT_LOW_THRESHOLD : float = 0.2
    TOPIC_DRIFT_MEDIUM_THRESHOLD : float = 0.4
    COHERENCE_VARIANCE_LOW_THRESHOLD : float = 0.05
    # NOTE(review): COHERENCE_VARIANCE_MEDIUM_THRESHOLD is declared again in the
    # hybrid section below with the same value (0.1); the later one wins.
    COHERENCE_VARIANCE_MEDIUM_THRESHOLD : float = 0.1
    # Synthetic probability weights
    STRONG_SYNTHETIC_WEIGHT : float = 0.9
    MODERATE_SYNTHETIC_WEIGHT : float = 0.8
    MEDIUM_SYNTHETIC_WEIGHT : float = 0.6
    WEAK_SYNTHETIC_WEIGHT : float = 0.5
    VERY_WEAK_SYNTHETIC_WEIGHT : float = 0.4
    VERY_LOW_SYNTHETIC_WEIGHT : float = 0.3
    LOW_SYNTHETIC_WEIGHT : float = 0.2
    # Confidence calculation
    CONFIDENCE_STD_NORMALIZER : float = 0.5
    MIN_CONFIDENCE : float = 0.1
    MAX_CONFIDENCE : float = 0.9
    NEUTRAL_CONFIDENCE : float = 0.5
    LOW_FEATURE_CONFIDENCE : float = 0.3
    # Hybrid probability calculation
    COHERENCE_MIXED_MIN : float = 0.4
    COHERENCE_MIXED_MAX : float = 0.6
    COHERENCE_VARIANCE_HIGH_THRESHOLD : float = 0.15
    # Duplicate declaration (same value as above, 0.1) — see NOTE(review) above
    COHERENCE_VARIANCE_MEDIUM_THRESHOLD : float = 0.1
    REPETITION_MIXED_MIN : float = 0.15
    REPETITION_MIXED_MAX : float = 0.35
    MODERATE_HYBRID_WEIGHT : float = 0.4
    WEAK_HYBRID_WEIGHT : float = 0.3
    VERY_WEAK_HYBRID_WEIGHT : float = 0.2
    MAX_HYBRID_PROBABILITY : float = 0.3
    # Default feature values
    DEFAULT_COHERENCE : float = 0.5
    DEFAULT_CONSISTENCY : float = 0.5
    DEFAULT_REPETITION : float = 0.0
    DEFAULT_TOPIC_DRIFT : float = 0.5
    DEFAULT_CONTEXTUAL_CONSISTENCY : float = 0.5
    DEFAULT_CHUNK_COHERENCE : float = 0.5
    DEFAULT_COHERENCE_VARIANCE : float = 0.1
    # Error handling
    MIN_REQUIRED_FEATURES : int = 3
    ZERO_TOLERANCE : float = 1e-10
class LinguisticMetricParams:
    """
    Hyperparameters for Linguistic Metric
    """
    # Text validation
    MIN_TEXT_LENGTH_FOR_ANALYSIS : int = 50
    # Domain threshold application - PROBABILITY CONSTANTS
    STRONG_SYNTHETIC_BASE_PROB : float = 0.7
    STRONG_AUTHENTIC_BASE_PROB : float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT : float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START : float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START : float = 0.7
    UNCERTAIN_RANGE_WIDTH : float = 0.4
    NEUTRAL_PROBABILITY : float = 0.5
    MIN_PROBABILITY : float = 0.0
    MAX_PROBABILITY : float = 1.0
    # POS analysis
    MIN_TAGS_FOR_ENTROPY : int = 1
    # Syntactic complexity (average and maximum components weighted equally)
    COMPLEXITY_WEIGHT_AVG : float = 0.5
    COMPLEXITY_WEIGHT_MAX : float = 0.5
    # Sentence complexity
    WORDS_PER_COMPLEXITY_UNIT : float = 10.0
    CLAUSE_COMPLEXITY_FACTOR : float = 0.5
    # Grammatical patterns
    TRANSITION_WORDS_SET : tuple = ('however', 'therefore', 'moreover', 'furthermore', 'consequently', 'additionally', 'nevertheless', 'nonetheless', 'thus', 'hence')
    IDEAL_PASSIVE_RATIO : float = 0.3
    IDEAL_TRANSITION_RATIO : float = 0.2
    # Dependency labels (spaCy-style tag names — presumably; verify against the parser used)
    PASSIVE_DEPENDENCY : str = 'nsubjpass'
    CLAUSE_MARKERS : tuple = ('cc', 'mark')
    # Writing style analysis
    IDEAL_LENGTH_VARIATION : float = 0.5
    IDEAL_PUNCTUATION_RATIO : float = 0.1
    # SYNTHETIC pattern detection
    TRANSITION_OVERUSE_THRESHOLD : float = 0.05
    POS_SEQUENCE_FREQ_THRESHOLD : float = 0.1
    STRUCTURE_DIVERSITY_THRESHOLD : float = 0.5
    UNUSUAL_CONSTRUCTION_THRESHOLD : float = 0.02
    REPETITIVE_PHRASING_THRESHOLD : float = 0.3
    UNUSUAL_DEPENDENCIES : tuple = ('attr', 'oprd')
    # Chunk analysis
    CHUNK_SIZE_WORDS : int = 200
    CHUNK_OVERLAP_RATIO : float = 0.5
    MIN_CHUNK_LENGTH : int = 50
    MIN_SENTENCES_FOR_STRUCTURE : int = 3
    MIN_SENTENCES_FOR_ANALYSIS : int = 1
    # Synthetic probability thresholds
    POS_DIVERSITY_LOW_THRESHOLD : float = 0.3
    POS_DIVERSITY_MEDIUM_THRESHOLD : float = 0.5
    SYNTACTIC_COMPLEXITY_LOW_THRESHOLD : float = 2.0
    SYNTACTIC_COMPLEXITY_MEDIUM_THRESHOLD : float = 3.0
    GRAMMATICAL_CONSISTENCY_HIGH_THRESHOLD : float = 0.8
    GRAMMATICAL_CONSISTENCY_MEDIUM_THRESHOLD : float = 0.6
    TRANSITION_USAGE_HIGH_THRESHOLD : float = 0.3
    TRANSITION_USAGE_MEDIUM_THRESHOLD : float = 0.15
    SYNTHETIC_PATTERN_HIGH_THRESHOLD : float = 0.6
    SYNTHETIC_PATTERN_MEDIUM_THRESHOLD : float = 0.3
    COMPLEXITY_VARIANCE_LOW_THRESHOLD : float = 0.1
    # NOTE(review): COMPLEXITY_VARIANCE_MEDIUM_THRESHOLD is declared again in the
    # hybrid section below with the same value (0.3); the later one wins.
    COMPLEXITY_VARIANCE_MEDIUM_THRESHOLD : float = 0.3
    # Synthetic probability weights (descending by strength)
    STRONG_SYNTHETIC_WEIGHT : float = 0.9
    MODERATE_SYNTHETIC_WEIGHT : float = 0.8
    MEDIUM_SYNTHETIC_WEIGHT : float = 0.7
    WEAK_SYNTHETIC_WEIGHT : float = 0.6
    VERY_WEAK_SYNTHETIC_WEIGHT : float = 0.5
    LOW_SYNTHETIC_WEIGHT : float = 0.4
    VERY_LOW_SYNTHETIC_WEIGHT : float = 0.3
    MINIMAL_SYNTHETIC_WEIGHT : float = 0.2
    # Confidence calculation
    CONFIDENCE_STD_NORMALIZER : float = 0.5
    MIN_CONFIDENCE : float = 0.1
    MAX_CONFIDENCE : float = 0.9
    NEUTRAL_CONFIDENCE : float = 0.5
    LOW_FEATURE_CONFIDENCE : float = 0.3
    MIN_REQUIRED_FEATURES : int = 4
    # Hybrid probability calculation
    POS_DIVERSITY_MIXED_MIN : float = 0.35
    POS_DIVERSITY_MIXED_MAX : float = 0.55
    POS_ENTROPY_LOW_THRESHOLD : float = 0.35
    POS_ENTROPY_HIGH_THRESHOLD : float = 0.65
    COMPLEXITY_VARIANCE_HIGH_THRESHOLD : float = 0.5
    # Duplicate declaration (same value as above, 0.3) — see NOTE(review) above
    COMPLEXITY_VARIANCE_MEDIUM_THRESHOLD : float = 0.3
    SYNTHETIC_PATTERN_MIXED_MIN : float = 0.2
    SYNTHETIC_PATTERN_MIXED_MAX : float = 0.6
    MODERATE_HYBRID_WEIGHT : float = 0.4
    WEAK_HYBRID_WEIGHT : float = 0.3
    MINIMAL_HYBRID_WEIGHT : float = 0.2
    MAX_HYBRID_PROBABILITY : float = 0.3
    # Default feature values
    DEFAULT_POS_DIVERSITY : float = 0.5
    DEFAULT_POS_ENTROPY : float = 2.5
    DEFAULT_SYNTACTIC_COMPLEXITY : float = 2.5
    DEFAULT_SENTENCE_COMPLEXITY : float = 2.0
    DEFAULT_GRAMMATICAL_CONSISTENCY : float = 0.5
    DEFAULT_TRANSITION_USAGE : float = 0.1
    DEFAULT_PASSIVE_RATIO : float = 0.2
    DEFAULT_WRITING_STYLE_SCORE : float = 0.5
    DEFAULT_SYNTHETIC_PATTERN_SCORE : float = 0.3
    DEFAULT_CHUNK_COMPLEXITY : float = 2.5
    DEFAULT_COMPLEXITY_VARIANCE : float = 0.2
    # Math and normalization
    LOG_BASE : int = 2
    ZERO_TOLERANCE : float = 1e-10
class PerplexityMetricParams:
    """
    Hyperparameters for Perplexity Metric
    """
    # Text validation
    MIN_TEXT_LENGTH_FOR_ANALYSIS : int = 50
    # Domain threshold application - PROBABILITY CONSTANTS
    STRONG_SYNTHETIC_BASE_PROB : float = 0.7
    STRONG_AUTHENTIC_BASE_PROB : float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT : float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START : float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START : float = 0.7
    UNCERTAIN_RANGE_WIDTH : float = 0.4
    NEUTRAL_PROBABILITY : float = 0.5
    MIN_PROBABILITY : float = 0.0
    MAX_PROBABILITY : float = 1.0
    # Model parameters
    MAX_TOKEN_LENGTH : int = 1024
    MIN_TOKENS_FOR_PERPLEXITY : int = 5
    MIN_SENTENCE_LENGTH : int = 20
    MIN_CHUNK_LENGTH : int = 50
    # Chunk analysis
    CHUNK_SIZE_WORDS : int = 200
    CHUNK_OVERLAP_RATIO : float = 0.5
    # Perplexity normalization (sigmoid centered at 30, scaled by 10)
    PERPLEXITY_SIGMOID_CENTER : float = 30.0
    PERPLEXITY_SIGMOID_SCALE : float = 10.0
    # Cross-entropy normalization
    MAX_CROSS_ENTROPY : float = 5.0
    # Perplexity value thresholds (actual perplexity values)
    PERPLEXITY_VERY_LOW_THRESHOLD : float = 20.0
    PERPLEXITY_LOW_THRESHOLD : float = 40.0
    PERPLEXITY_HIGH_THRESHOLD : float = 80.0
    PERPLEXITY_VERY_HIGH_THRESHOLD : float = 150.0
    # Synthetic probability thresholds (normalized values 0-1)
    NORMALIZED_PERPLEXITY_HIGH_THRESHOLD : float = 0.7
    NORMALIZED_PERPLEXITY_MEDIUM_THRESHOLD : float = 0.5
    PERPLEXITY_VARIANCE_LOW_THRESHOLD : float = 50.0
    # NOTE(review): BUG SUSPECT — PERPLEXITY_VARIANCE_MEDIUM_THRESHOLD is
    # re-declared in the hybrid section below with a DIFFERENT value (100.0),
    # which silently overrides this 200.0. Confirm which value is intended
    # and rename one of the two.
    PERPLEXITY_VARIANCE_MEDIUM_THRESHOLD : float = 200.0
    STD_SENTENCE_PERPLEXITY_LOW_THRESHOLD : float = 20.0
    STD_SENTENCE_PERPLEXITY_MEDIUM_THRESHOLD : float = 50.0
    CROSS_ENTROPY_LOW_THRESHOLD : float = 0.3
    CROSS_ENTROPY_MEDIUM_THRESHOLD : float = 0.6
    CHUNK_VARIANCE_VERY_LOW_THRESHOLD : float = 25.0
    CHUNK_VARIANCE_LOW_THRESHOLD : float = 100.0
    # Synthetic probability weights
    STRONG_SYNTHETIC_WEIGHT : float = 0.8
    MEDIUM_SYNTHETIC_WEIGHT : float = 0.6
    WEAK_SYNTHETIC_WEIGHT : float = 0.4
    VERY_WEAK_SYNTHETIC_WEIGHT : float = 0.2
    VERY_LOW_SYNTHETIC_WEIGHT : float = 0.3
    MINIMAL_SYNTHETIC_WEIGHT : float = 0.2
    # Confidence calculation
    CONFIDENCE_STD_NORMALIZER : float = 0.5
    MIN_CONFIDENCE : float = 0.1
    MAX_CONFIDENCE : float = 0.9
    NEUTRAL_CONFIDENCE : float = 0.5
    LOW_FEATURE_CONFIDENCE : float = 0.3
    MIN_REQUIRED_FEATURES : int = 3
    # Hybrid probability calculation
    NORMALIZED_PERPLEXITY_MIXED_MIN : float = 0.4
    NORMALIZED_PERPLEXITY_MIXED_MAX : float = 0.6
    PERPLEXITY_VARIANCE_HIGH_THRESHOLD : float = 200.0
    # Duplicate declaration with a different value — this 100.0 is the one that
    # actually takes effect; see NOTE(review) above.
    PERPLEXITY_VARIANCE_MEDIUM_THRESHOLD : float = 100.0
    STD_SENTENCE_PERPLEXITY_MIXED_MIN : float = 20.0
    STD_SENTENCE_PERPLEXITY_MIXED_MAX : float = 60.0
    MODERATE_HYBRID_WEIGHT : float = 0.4
    WEAK_HYBRID_WEIGHT : float = 0.2
    MINIMAL_HYBRID_WEIGHT : float = 0.0
    MAX_HYBRID_PROBABILITY : float = 0.3
    # Default feature values
    DEFAULT_OVERALL_PERPLEXITY : float = 50.0
    DEFAULT_NORMALIZED_PERPLEXITY : float = 0.5
    DEFAULT_AVG_SENTENCE_PERPLEXITY : float = 50.0
    DEFAULT_STD_SENTENCE_PERPLEXITY : float = 25.0
    DEFAULT_MIN_SENTENCE_PERPLEXITY : float = 30.0
    DEFAULT_MAX_SENTENCE_PERPLEXITY : float = 70.0
    DEFAULT_PERPLEXITY_VARIANCE : float = 100.0
    DEFAULT_AVG_CHUNK_PERPLEXITY : float = 50.0
    DEFAULT_CROSS_ENTROPY_SCORE : float = 0.5
    # Math and normalization
    ZERO_TOLERANCE : float = 1e-10
    LARGE_PERPLEXITY_THRESHOLD : float = 1000.0
    # Regular expression for sentence splitting
    SENTENCE_SPLIT_PATTERN : str = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
class EntropyMetricParams:
    """
    Hyperparameters for Entropy Metric
    """
    # Text validation
    MIN_TEXT_LENGTH_FOR_ANALYSIS : int = 50
    MIN_SENTENCE_LENGTH : int = 10
    MIN_WORDS_FOR_ANALYSIS : int = 5
    MIN_TOKENS_FOR_ANALYSIS : int = 10
    MIN_TOKENS_FOR_SEQUENCE : int = 20
    # Domain threshold application - PROBABILITY CONSTANTS
    STRONG_SYNTHETIC_BASE_PROB : float = 0.7
    STRONG_AUTHENTIC_BASE_PROB : float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT : float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START : float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START : float = 0.7
    UNCERTAIN_RANGE_WIDTH : float = 0.4
    NEUTRAL_PROBABILITY : float = 0.5
    MIN_PROBABILITY : float = 0.0
    MAX_PROBABILITY : float = 1.0
    # Chunk analysis
    CHUNK_SIZE_WORDS : int = 100
    CHUNK_OVERLAP_RATIO : float = 0.5
    MIN_CHUNK_LENGTH : int = 20
    # Sequence analysis (upper bound used to normalize bigram entropy, in bits)
    MAX_BIGRAM_ENTROPY : float = 8.0
    # Entropy normalization (upper bound for character entropy, in bits)
    MAX_CHAR_ENTROPY : float = 4.0
    # Synthetic probability thresholds
    CHAR_ENTROPY_VERY_LOW_THRESHOLD : float = 3.5
    CHAR_ENTROPY_LOW_THRESHOLD : float = 3.8
    CHAR_ENTROPY_MEDIUM_THRESHOLD : float = 4.0
    ENTROPY_VARIANCE_VERY_LOW_THRESHOLD : float = 0.1
    ENTROPY_VARIANCE_LOW_THRESHOLD : float = 0.2
    ENTROPY_VARIANCE_MEDIUM_THRESHOLD : float = 0.3
    TOKEN_DIVERSITY_LOW_THRESHOLD : float = 0.6
    TOKEN_DIVERSITY_MEDIUM_THRESHOLD : float = 0.7
    TOKEN_DIVERSITY_HIGH_THRESHOLD : float = 0.8
    SEQUENCE_UNPREDICTABILITY_LOW_THRESHOLD : float = 0.3
    SEQUENCE_UNPREDICTABILITY_MEDIUM_THRESHOLD : float = 0.4
    SEQUENCE_UNPREDICTABILITY_HIGH_THRESHOLD : float = 0.5
    SYNTHETIC_PATTERN_SCORE_HIGH_THRESHOLD : float = 0.75
    SYNTHETIC_PATTERN_SCORE_MEDIUM_THRESHOLD : float = 0.5
    TOKEN_ENTROPY_LOW_THRESHOLD : float = 6.5
    # Synthetic probability weights (descending by strength)
    STRONG_SYNTHETIC_WEIGHT : float = 0.9
    VERY_STRONG_SYNTHETIC_WEIGHT : float = 0.8
    MEDIUM_SYNTHETIC_WEIGHT : float = 0.7
    MODERATE_SYNTHETIC_WEIGHT : float = 0.6
    WEAK_SYNTHETIC_WEIGHT : float = 0.5
    VERY_WEAK_SYNTHETIC_WEIGHT : float = 0.4
    LOW_SYNTHETIC_WEIGHT : float = 0.3
    MINIMAL_SYNTHETIC_WEIGHT : float = 0.2
    VERY_LOW_SYNTHETIC_WEIGHT : float = 0.1
    # Confidence calculation
    CONFIDENCE_STD_NORMALIZER : float = 0.5
    MIN_CONFIDENCE : float = 0.1
    MAX_CONFIDENCE : float = 0.9
    NEUTRAL_CONFIDENCE : float = 0.5
    LOW_FEATURE_CONFIDENCE : float = 0.3
    MIN_REQUIRED_FEATURES : int = 2
    # Hybrid probability calculation
    ENTROPY_VARIANCE_HIGH_THRESHOLD : float = 0.5
    ENTROPY_VARIANCE_MIXED_THRESHOLD : float = 0.3
    ENTROPY_DISCREPANCY_THRESHOLD : float = 1.0
    SYNTHETIC_PATTERN_MIXED_MIN : float = 0.4
    SYNTHETIC_PATTERN_MIXED_MAX : float = 0.6
    STRONG_HYBRID_WEIGHT : float = 0.6
    MODERATE_HYBRID_WEIGHT : float = 0.4
    WEAK_HYBRID_WEIGHT : float = 0.3
    MINIMAL_HYBRID_WEIGHT : float = 0.0
    MAX_HYBRID_PROBABILITY : float = 0.4
    # Default feature values
    DEFAULT_CHAR_ENTROPY : float = 3.8
    DEFAULT_WORD_ENTROPY : float = 6.0
    DEFAULT_TOKEN_ENTROPY : float = 8.0
    DEFAULT_TOKEN_DIVERSITY : float = 0.7
    DEFAULT_SEQUENCE_UNPREDICTABILITY : float = 0.5
    DEFAULT_ENTROPY_VARIANCE : float = 0.2
    DEFAULT_AVG_CHUNK_ENTROPY : float = 3.8
    DEFAULT_PREDICTABILITY_SCORE : float = 0.5
    # Math and normalization
    ZERO_TOLERANCE : float = 1e-10
class MultiPerturbationStabilityMetricParams:
    """
    Hyperparameters for Multi-Perturbation Stability Metric
    """
    # Text validation
    MIN_TEXT_LENGTH_FOR_ANALYSIS : int = 50
    MIN_TEXT_LENGTH_FOR_PERTURBATION : int = 10
    MIN_TOKENS_FOR_LIKELIHOOD : int = 3
    MIN_WORDS_FOR_PERTURBATION : int = 3
    MIN_WORDS_FOR_DELETION : int = 5
    # Domain threshold application - PROBABILITY CONSTANTS
    STRONG_SYNTHETIC_BASE_PROB : float = 0.7
    STRONG_AUTHENTIC_BASE_PROB : float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT : float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START : float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START : float = 0.7
    UNCERTAIN_RANGE_WIDTH : float = 0.4
    NEUTRAL_PROBABILITY : float = 0.5
    MIN_PROBABILITY : float = 0.0
    MAX_PROBABILITY : float = 1.0
    # Perturbation parameters
    NUM_PERTURBATIONS : int = 10
    MAX_PERTURBATION_ATTEMPTS : int = 10
    PERTURBATION_DELETION_RATIO : float = 0.1
    # NOTE(review): name misspells "ROBERTA"; kept as-is for compatibility
    # with existing consumers — rename in a coordinated change if desired.
    ROBBERTA_TOP_K_PREDICTIONS : int = 3
    # Text preprocessing
    MAX_TEXT_LENGTH_FOR_ANALYSIS : int = 2000
    MAX_TEXT_LENGTH_FOR_PERTURBATION : int = 1000
    MAX_TOKEN_LENGTH : int = 256
    MAX_ROBERTA_TOKEN_LENGTH : int = 128
    # Chunk analysis
    CHUNK_SIZE_WORDS : int = 150
    CHUNK_OVERLAP_RATIO : float = 0.5
    MIN_CHUNK_LENGTH : int = 50
    CHUNK_DELETION_RATIO : float = 0.1
    # Likelihood calculation
    MIN_VALID_PERTURBATIONS : int = 3
    DEFAULT_LIKELIHOOD : float = 2.0
    MIN_LIKELIHOOD : float = 0.5
    MAX_LIKELIHOOD : float = 10.0
    # Stability scoring
    STABILITY_HIGH_THRESHOLD : float = 0.7
    STABILITY_MEDIUM_THRESHOLD : float = 0.5
    STABILITY_LOW_THRESHOLD : float = 0.3
    RELATIVE_DROP_HIGH_THRESHOLD : float = 0.5
    RELATIVE_DROP_MEDIUM_THRESHOLD : float = 0.3
    RELATIVE_DROP_LOW_THRESHOLD : float = 0.15
    # Curvature scoring
    CURVATURE_HIGH_THRESHOLD : float = 0.7
    # NOTE(review): BUG SUSPECT — CURVATURE_MEDIUM_THRESHOLD is re-declared in
    # the "Synthetic probability thresholds" section below with a DIFFERENT
    # value (0.6), which silently overrides this 0.5. Confirm which value is
    # intended and rename one of the two.
    CURVATURE_MEDIUM_THRESHOLD : float = 0.5
    CURVATURE_LOW_THRESHOLD : float = 0.3
    CURVATURE_SCALING_FACTOR : float = 3.0
    # Likelihood ratio thresholds
    LIKELIHOOD_RATIO_HIGH_THRESHOLD : float = 0.8
    LIKELIHOOD_RATIO_MEDIUM_THRESHOLD : float = 0.6
    LIKELIHOOD_RATIO_LOW_THRESHOLD : float = 0.4
    MAX_LIKELIHOOD_RATIO : float = 3.0
    MIN_LIKELIHOOD_RATIO : float = 0.33
    # Stability variance thresholds
    STABILITY_VARIANCE_VERY_LOW : float = 0.05
    STABILITY_VARIANCE_LOW : float = 0.1
    STABILITY_VARIANCE_HIGH : float = 0.15
    # Synthetic probability weights (these four sum to 1.0)
    STABILITY_WEIGHT : float = 0.3
    CURVATURE_WEIGHT : float = 0.25
    RATIO_WEIGHT : float = 0.25
    VARIANCE_WEIGHT : float = 0.2
    # Synthetic probability thresholds
    STABILITY_STRONG_THRESHOLD : float = 0.9
    STABILITY_MEDIUM_STRONG_THRESHOLD : float = 0.7
    STABILITY_MODERATE_THRESHOLD : float = 0.5
    STABILITY_WEAK_THRESHOLD : float = 0.2
    CURVATURE_STRONG_THRESHOLD : float = 0.8
    # Duplicate declaration with a different value — this 0.6 is the one that
    # actually takes effect; see NOTE(review) above.
    CURVATURE_MEDIUM_THRESHOLD : float = 0.6
    CURVATURE_MODERATE_THRESHOLD : float = 0.4
    CURVATURE_WEAK_THRESHOLD : float = 0.2
    RATIO_STRONG_THRESHOLD : float = 0.9
    RATIO_MEDIUM_THRESHOLD : float = 0.7
    RATIO_MODERATE_THRESHOLD : float = 0.5
    RATIO_WEAK_THRESHOLD : float = 0.3
    VARIANCE_STRONG_THRESHOLD : float = 0.8
    VARIANCE_MODERATE_THRESHOLD : float = 0.5
    VARIANCE_WEAK_THRESHOLD : float = 0.2
    # Confidence calculation
    CONFIDENCE_BASE : float = 0.5
    CONFIDENCE_STD_FACTOR : float = 0.5
    MIN_CONFIDENCE : float = 0.1
    MAX_CONFIDENCE : float = 0.9
    NEUTRAL_CONFIDENCE : float = 0.5
    LOW_FEATURE_CONFIDENCE : float = 0.3
    MIN_REQUIRED_FEATURES : int = 3
    # Hybrid probability calculation
    STABILITY_MIXED_MIN : float = 0.35
    STABILITY_MIXED_MAX : float = 0.55
    STABILITY_VARIANCE_MIXED_HIGH : float = 0.15
    STABILITY_VARIANCE_MIXED_MEDIUM : float = 0.1
    LIKELIHOOD_RATIO_MIXED_MIN : float = 0.5
    LIKELIHOOD_RATIO_MIXED_MAX : float = 0.8
    MODERATE_HYBRID_WEIGHT : float = 0.4
    WEAK_HYBRID_WEIGHT : float = 0.3
    VERY_WEAK_HYBRID_WEIGHT : float = 0.2
    MINIMAL_HYBRID_WEIGHT : float = 0.0
    MAX_HYBRID_PROBABILITY : float = 0.3
    # Default feature values
    DEFAULT_ORIGINAL_LIKELIHOOD : float = 2.0
    DEFAULT_AVG_PERTURBED_LIKELIHOOD : float = 1.8
    DEFAULT_LIKELIHOOD_RATIO : float = 1.1
    DEFAULT_NORMALIZED_LIKELIHOOD_RATIO : float = 0.55
    DEFAULT_STABILITY_SCORE : float = 0.3
    DEFAULT_CURVATURE_SCORE : float = 0.3
    DEFAULT_PERTURBATION_VARIANCE : float = 0.05
    DEFAULT_AVG_CHUNK_STABILITY : float = 0.3
    DEFAULT_STABILITY_VARIANCE : float = 0.1
    # Math and normalization
    ZERO_TOLERANCE : float = 1e-10
    # Common words to avoid masking
    COMMON_WORDS_TO_AVOID : tuple = ('the', 'and', 'but', 'for', 'with', 'that', 'this', 'have', 'from', 'were')
class MetricsEnsembleParams:
    """
    Constants for Metrics Ensemble Classifier
    """
    # Minimum requirements
    MIN_METRICS_REQUIRED : int = 3
    # Default probabilities (even prior, no hybrid)
    DEFAULT_SYNTHETIC_PROB : float = 0.5
    DEFAULT_AUTHENTIC_PROB : float = 0.5
    DEFAULT_HYBRID_PROB : float = 0.0
    # Weighting (sigmoid centered at 0.5, scaled by 10)
    SIGMOID_CONFIDENCE_SCALE : float = 10.0
    SIGMOID_CENTER : float = 0.5
    # Confidence composition (these four weights sum to 1.0)
    CONFIDENCE_WEIGHT_BASE : float = 0.4
    CONFIDENCE_WEIGHT_AGREEMENT : float = 0.3
    CONFIDENCE_WEIGHT_CERTAINTY : float = 0.2
    CONFIDENCE_WEIGHT_QUALITY : float = 0.1
    # Uncertainty composition (these three weights sum to 1.0)
    UNCERTAINTY_WEIGHT_VARIANCE : float = 0.4
    UNCERTAINTY_WEIGHT_CONFIDENCE : float = 0.3
    UNCERTAINTY_WEIGHT_DECISION : float = 0.3
    # Consensus
    CONSENSUS_STD_SCALING : float = 2.0
    # Hybrid detection
    HYBRID_PROB_THRESHOLD : float = 0.25
    HYBRID_UNCERTAINTY_THRESHOLD : float = 0.6
    HYBRID_SYNTHETIC_RANGE_LOW : float = 0.3
    HYBRID_SYNTHETIC_RANGE_HIGH : float = 0.7
    # Threshold adaptation
    UNCERTAINTY_THRESHOLD_ADJUSTMENT : float = 0.1
    # Contribution labels
    CONTRIBUTION_HIGH : float = 0.15
    CONTRIBUTION_MEDIUM : float = 0.08
    HIGH_CONFIDENCE_THRESHOLD : float = 0.7
# Singleton instances for parameter classes.
# NOTE(review): these are module-level shared instances — mutating an attribute
# on one of them is visible to every importer. Import these rather than
# instantiating the classes again.
document_extraction_params = DocumentExtractionParams()
language_detection_params = LanguageDetectionParams()
domain_classification_params = DomainClassificationParams()
text_processing_params = TextProcessingParams()
base_metric_params = BaseMetricParams()
structural_metric_params = StructuralMetricParams()
semantic_analysis_params = SemanticAnalysisParams()
linguistic_metric_params = LinguisticMetricParams()
perplexity_metric_params = PerplexityMetricParams()
entropy_metric_params = EntropyMetricParams()
multi_perturbation_stability_metric_params = MultiPerturbationStabilityMetricParams()
metrics_ensemble_params = MetricsEnsembleParams()