# rewrite/src/style/formality_classifier.py
"""
Formality classifier module.
Classifies text on a 0-1 formality scale using linguistic features.
Used as one dimension of the style fingerprint.
"""
import re
from typing import Optional
class FormalityClassifier:
    """Scores text formality on a 0-1 scale using rule-based heuristics."""

    # Informal markers that decrease formality score
    CONTRACTIONS = {
        "don't", "can't", "won't", "it's", "that's", "there's",
        "they're", "we're", "you're", "i'm", "i've", "i'll",
        "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't",
        "couldn't", "wouldn't", "shouldn't", "let's", "he's", "she's",
    }

    INFORMAL_WORDS = {
        "gonna", "wanna", "gotta", "kinda", "sorta", "ya", "yeah",
        "yep", "nope", "ok", "okay", "cool", "awesome", "stuff",
        "things", "like", "basically", "actually", "literally",
        "totally", "really", "super", "pretty", "kind of", "sort of",
    }

    FORMAL_MARKERS = {
        "furthermore", "moreover", "consequently", "nevertheless",
        "nonetheless", "accordingly", "hence", "thus", "therefore",
        "whereas", "notwithstanding", "hitherto", "whereby",
        "therein", "thereof", "herein",
    }

    # Tokenizer: runs of lowercase letters optionally joined by internal
    # apostrophes, so "don't." yields "don't" and "cool," yields "cool".
    # (Plain str.split() left punctuation attached and defeated set lookups.)
    _WORD_RE = re.compile(r"[a-z]+(?:'[a-z]+)*")

    def score(self, text: str) -> float:
        """Return formality score in [0, 1]. Higher = more formal.

        Scoring based on:
        - Contraction penalty (-0.05 each, capped at -0.25)
        - Informal word/phrase penalty (rate-based, capped at -0.2)
        - Formal marker bonus (+0.04 each, capped at +0.2)
        - Average sentence length bonus (longer = more formal)
        - First person penalty (rate-based, capped at -0.1)
        - Exclamation penalty (-0.05 each, capped at -0.15)
        - Question-mark penalty (-0.02 each, capped at -0.08)
        - Passive-voice proxy bonus (be-verb rate, capped at +0.1)
        """
        # Neutral score for empty or whitespace-only input.
        if not text or not text.strip():
            return 0.5

        # Tokenize on letter runs so trailing punctuation does not block
        # matches against the marker sets above.
        words = self._WORD_RE.findall(text.lower())
        word_count = max(len(words), 1)

        score = 0.5  # neutral baseline

        # Contraction penalty
        contraction_count = sum(1 for w in words if w in self.CONTRACTIONS)
        score -= min(contraction_count * 0.05, 0.25)

        # Informal word penalty. Multi-word entries ("kind of", "sort of")
        # can never equal a single token, so they are matched as phrases
        # against the space-joined token stream; the padding spaces ensure
        # only whole-token matches count (e.g. not "mankind offer").
        single_markers = {m for m in self.INFORMAL_WORDS if " " not in m}
        phrase_markers = [m for m in self.INFORMAL_WORDS if " " in m]
        informal_count = sum(1 for w in words if w in single_markers)
        padded = f" {' '.join(words)} "
        informal_count += sum(padded.count(f" {m} ") for m in phrase_markers)
        score -= min((informal_count / word_count) * 0.5, 0.2)

        # Formal marker bonus
        formal_count = sum(1 for w in words if w in self.FORMAL_MARKERS)
        score += min(formal_count * 0.04, 0.2)

        # Sentence length bonus (longer sentences tend to be more formal)
        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
        if sentences:
            avg_sent_len = sum(len(s.split()) for s in sentences) / len(sentences)
            if avg_sent_len > 20:
                score += 0.1
            elif avg_sent_len > 15:
                score += 0.05
            elif avg_sent_len < 8:
                score -= 0.05

        # First person penalty (academic writing avoids "I")
        first_person = sum(1 for w in words if w in ("i", "me", "my", "mine", "myself"))
        score -= min((first_person / word_count) * 0.3, 0.1)

        # Exclamation penalty
        score -= min(text.count("!") * 0.05, 0.15)

        # Question mark mild penalty (academic writing has fewer questions)
        score -= min(text.count("?") * 0.02, 0.08)

        # Passive voice bonus (approximation: rate of be-verb forms)
        passive_indicators = sum(1 for w in words if w in ("is", "was", "were", "been", "being"))
        score += min((passive_indicators / word_count) * 0.15, 0.1)

        # Clamp to the documented [0, 1] range.
        return max(0.0, min(1.0, score))