"""Formality classifier module.

Classifies text on a 0-1 formality scale using linguistic features.
Used as one dimension of the style fingerprint.
"""

import re
from typing import Optional


class FormalityClassifier:
    """Scores text formality on a 0-1 scale using rule-based heuristics."""

    # Punctuation stripped from token edges before set membership tests, so
    # tokens like "don't," or "okay!" still match the marker sets below.
    _EDGE_PUNCT = ".,;:!?\"'()[]{}"

    # Informal markers that decrease formality score
    CONTRACTIONS = {
        "don't", "can't", "won't", "it's", "that's", "there's",
        "they're", "we're", "you're", "i'm", "i've", "i'll",
        "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't",
        "couldn't", "wouldn't", "shouldn't", "let's", "he's", "she's",
    }

    # NOTE: the multi-word entries "kind of" / "sort of" are kept here for
    # backward compatibility of this attribute, but they cannot match a
    # whitespace-split token; they are actually detected via
    # INFORMAL_PHRASES substring matching in score().
    INFORMAL_WORDS = {
        "gonna", "wanna", "gotta", "kinda", "sorta", "ya", "yeah",
        "yep", "nope", "ok", "okay", "cool", "awesome", "stuff",
        "things", "like", "basically", "actually", "literally",
        "totally", "really", "super", "pretty", "kind of", "sort of",
    }

    # Multi-word informal phrases, matched as substrings of the lowered text
    # because tokenization splits them apart.
    INFORMAL_PHRASES = ("kind of", "sort of")

    FORMAL_MARKERS = {
        "furthermore", "moreover", "consequently", "nevertheless",
        "nonetheless", "accordingly", "hence", "thus", "therefore",
        "whereas", "notwithstanding", "hitherto", "whereby",
        "therein", "thereof", "herein",
    }

    def __init__(self):
        pass

    def score(self, text: str) -> float:
        """Return formality score in [0, 1]. Higher = more formal.

        Scoring based on:
        - Contraction penalty (-0.05 each)
        - Informal word/phrase penalty (ratio-based, capped)
        - Formal marker bonus (+0.04 each)
        - Average sentence length bonus (longer = more formal)
        - First person penalty (ratio-based, capped)
        - Exclamation penalty (-0.05 each)

        Empty or whitespace-only input returns the neutral baseline 0.5.
        """
        if not text or not text.strip():
            return 0.5

        lowered = text.lower()
        # Strip leading/trailing punctuation so markers written next to
        # punctuation ("furthermore,", "don't.") are still recognized;
        # drop tokens that were pure punctuation.
        words = [w.strip(self._EDGE_PUNCT) for w in lowered.split()]
        words = [w for w in words if w]
        word_count = max(len(words), 1)

        # Base score: neutral midpoint, adjusted by each feature below.
        score = 0.5

        # Contraction penalty
        contraction_count = sum(1 for w in words if w in self.CONTRACTIONS)
        score -= min(contraction_count * 0.05, 0.25)

        # Informal word penalty (single tokens), plus multi-word phrases
        # ("kind of", "sort of") counted by substring search since they
        # never survive whitespace tokenization as one token.
        informal_count = sum(1 for w in words if w in self.INFORMAL_WORDS)
        informal_count += sum(lowered.count(p) for p in self.INFORMAL_PHRASES)
        score -= min((informal_count / word_count) * 0.5, 0.2)

        # Formal marker bonus
        formal_count = sum(1 for w in words if w in self.FORMAL_MARKERS)
        score += min(formal_count * 0.04, 0.2)

        # Sentence length bonus (longer sentences tend to be more formal)
        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
        if sentences:
            avg_sent_len = sum(len(s.split()) for s in sentences) / len(sentences)
            if avg_sent_len > 20:
                score += 0.1
            elif avg_sent_len > 15:
                score += 0.05
            elif avg_sent_len < 8:
                score -= 0.05

        # First person penalty (academic writing avoids "I")
        first_person = sum(
            1 for w in words if w in ("i", "me", "my", "mine", "myself")
        )
        score -= min((first_person / word_count) * 0.3, 0.1)

        # Exclamation penalty
        exclamation_count = text.count("!")
        score -= min(exclamation_count * 0.05, 0.15)

        # Question mark mild penalty (academic writing has fewer questions)
        question_count = text.count("?")
        score -= min(question_count * 0.02, 0.08)

        # Passive voice bonus (approximation: "is/was/were/been/being" as a
        # proxy for passive constructions and formal copular style)
        passive_indicators = sum(
            1 for w in words if w in ("is", "was", "were", "been", "being")
        )
        score += min((passive_indicators / word_count) * 0.15, 0.1)

        # Clamp to the documented [0, 1] range.
        return max(0.0, min(1.0, score))