| """ |
| Formality classifier module. |
| Classifies text on a 0-1 formality scale using linguistic features. |
| Used as one dimension of the style fingerprint. |
| """ |
|
|
| import re |
| from typing import Optional |
|
|
|
|
class FormalityClassifier:
    """Score text formality on a 0-1 scale using rule-based heuristics.

    The classifier is stateless: all lexicons are class-level constants and
    ``score`` depends only on its ``text`` argument. Lexicon entries are
    lowercase; multi-word entries (e.g. ``"kind of"``) are matched as phrases.
    """

    CONTRACTIONS = {
        "don't", "can't", "won't", "it's", "that's", "there's",
        "they're", "we're", "you're", "i'm", "i've", "i'll",
        "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't",
        "couldn't", "wouldn't", "shouldn't", "let's", "he's", "she's",
    }

    INFORMAL_WORDS = {
        "gonna", "wanna", "gotta", "kinda", "sorta", "ya", "yeah",
        "yep", "nope", "ok", "okay", "cool", "awesome", "stuff",
        "things", "like", "basically", "actually", "literally",
        "totally", "really", "super", "pretty", "kind of", "sort of",
    }

    FORMAL_MARKERS = {
        "furthermore", "moreover", "consequently", "nevertheless",
        "nonetheless", "accordingly", "hence", "thus", "therefore",
        "whereas", "notwithstanding", "hitherto", "whereby",
        "therein", "thereof", "herein",
    }

    # First-person singular pronouns; their density lowers the score.
    FIRST_PERSON_PRONOUNS = frozenset({"i", "me", "my", "mine", "myself"})

    # Forms of "to be" used as a crude proxy for passive constructions;
    # no actual passive-voice parsing is performed.
    PASSIVE_AUXILIARIES = frozenset({"is", "was", "were", "been", "being"})

    # A token is a run of letters/digits, optionally joined by single
    # apostrophes or hyphens ("don't", "well-known"). Surrounding punctuation
    # is excluded so that "don't," and "Therefore," match the lexicons.
    _TOKEN_RE = re.compile(r"[^\W_]+(?:['-][^\W_]+)*")

    # Sentence boundaries: any run of terminal punctuation.
    _SENTENCE_SPLIT_RE = re.compile(r"[.!?]+")

    def _count_informal(self, lowered, tokens):
        """Count informal single-word tokens plus informal multi-word phrases.

        Args:
            lowered: The lowercased, apostrophe-normalized text.
            tokens: Word tokens extracted from ``lowered``.

        Returns:
            Number of informal word and phrase occurrences.
        """
        # Multi-word entries can never equal a single token, so the two
        # counts below never overlap.
        hits = sum(1 for tok in tokens if tok in self.INFORMAL_WORDS)
        for entry in self.INFORMAL_WORDS:
            parts = entry.split()
            if len(parts) < 2:
                continue
            # Match on the text rather than the token list so a phrase does
            # not match across sentence punctuation ("kind. Of course").
            pattern = r"\b" + r"\s+".join(re.escape(p) for p in parts) + r"\b"
            hits += len(re.findall(pattern, lowered))
        return hits

    def score(self, text: str) -> float:
        """Return a formality score in [0, 1]; higher means more formal.

        Starts from a neutral 0.5 and applies these adjustments, each
        individually capped, before clamping the total to [0, 1]:

        - Contractions: -0.05 each (cap 0.25).
        - Informal words/phrases: -0.5 * (count / words) (cap 0.2).
        - Formal markers: +0.04 each (cap 0.2).
        - Average sentence length in words: > 20 gives +0.1, > 15 gives
          +0.05, < 8 gives -0.05.
        - First-person pronouns: -0.3 * (count / words) (cap 0.1).
        - Exclamation marks: -0.05 each (cap 0.15).
        - Question marks: -0.02 each (cap 0.08).
        - Passive auxiliaries: +0.15 * (count / words) (cap 0.1).

        Args:
            text: The text to classify.

        Returns:
            The formality score; 0.5 for empty or whitespace-only input.
        """
        if not text or not text.strip():
            return 0.5

        # Normalize the typographic apostrophe so "don’t" matches "don't".
        lowered = text.lower().replace("\u2019", "'")
        tokens = self._TOKEN_RE.findall(lowered)
        # Guard against division by zero for punctuation-only input.
        word_count = max(len(tokens), 1)

        score = 0.5

        contraction_count = sum(1 for tok in tokens if tok in self.CONTRACTIONS)
        score -= min(contraction_count * 0.05, 0.25)

        informal_count = self._count_informal(lowered, tokens)
        score -= min((informal_count / word_count) * 0.5, 0.2)

        formal_count = sum(1 for tok in tokens if tok in self.FORMAL_MARKERS)
        score += min(formal_count * 0.04, 0.2)

        sentences = [
            part.strip()
            for part in self._SENTENCE_SPLIT_RE.split(text)
            if part.strip()
        ]
        if sentences:
            avg_sent_len = sum(len(s.split()) for s in sentences) / len(sentences)
            if avg_sent_len > 20:
                score += 0.1
            elif avg_sent_len > 15:
                score += 0.05
            elif avg_sent_len < 8:
                score -= 0.05

        first_person = sum(1 for tok in tokens if tok in self.FIRST_PERSON_PRONOUNS)
        score -= min((first_person / word_count) * 0.3, 0.1)

        score -= min(text.count("!") * 0.05, 0.15)
        score -= min(text.count("?") * 0.02, 0.08)

        passive_indicators = sum(
            1 for tok in tokens if tok in self.PASSIVE_AUXILIARIES
        )
        score += min((passive_indicators / word_count) * 0.15, 0.1)

        return max(0.0, min(1.0, score))
|
|