File size: 3,612 Bytes

12fd5f2

"""
Formality classifier module.
Classifies text on a 0-1 formality scale using linguistic features.
Used as one dimension of the style fingerprint.
"""

import re
from typing import Optional


class FormalityClassifier:
    """Scores text formality on a 0-1 scale using rule-based heuristics.

    The classifier is stateless: all marker lists are class-level constants
    and ``score`` depends only on its ``text`` argument.
    """

    # Informal markers that decrease formality score
    CONTRACTIONS = {
        "don't", "can't", "won't", "it's", "that's", "there's",
        "they're", "we're", "you're", "i'm", "i've", "i'll",
        "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't",
        "couldn't", "wouldn't", "shouldn't", "let's", "he's", "she's",
    }

    # Entries may be single words or multi-word phrases ("kind of");
    # phrases are matched against consecutive tokens.
    INFORMAL_WORDS = {
        "gonna", "wanna", "gotta", "kinda", "sorta", "ya", "yeah",
        "yep", "nope", "ok", "okay", "cool", "awesome", "stuff",
        "things", "like", "basically", "actually", "literally",
        "totally", "really", "super", "pretty", "kind of", "sort of",
    }

    FORMAL_MARKERS = {
        "furthermore", "moreover", "consequently", "nevertheless",
        "nonetheless", "accordingly", "hence", "thus", "therefore",
        "whereas", "notwithstanding", "hitherto", "whereby",
        "therein", "thereof", "herein",
    }

    # First-person pronouns penalised by ``score``.
    _FIRST_PERSON = frozenset({"i", "me", "my", "mine", "myself"})

    # Auxiliary verbs used as a rough proxy for passive constructions.
    _PASSIVE_INDICATORS = frozenset({"is", "was", "were", "been", "being"})

    # A token is a run of letters/digits, optionally joined by internal
    # apostrophes or hyphens ("don't", "well-known"). Surrounding
    # punctuation is excluded so "therefore," still matches "therefore".
    _TOKEN_RE = re.compile(r"[^\W_]+(?:['\-][^\W_]+)*")

    # Sentence boundary: one or more terminal punctuation characters.
    _SENTENCE_SPLIT_RE = re.compile(r'[.!?]+')

    def __init__(self):
        """Create a classifier; no configuration is required."""
        pass

    @classmethod
    def _tokenize(cls, text: str) -> list:
        """Return lowercase word tokens of *text* with punctuation removed."""
        # Normalise typographic apostrophes so "don’t" matches "don't".
        normalized = text.lower().replace("\u2019", "'").replace("\u2018", "'")
        return cls._TOKEN_RE.findall(normalized)

    @staticmethod
    def _count_phrases(tokens: list, entries) -> int:
        """Count occurrences of the multi-word *entries* in *tokens*."""
        total = 0
        for entry in entries:
            phrase = tuple(entry.split())
            if len(phrase) < 2:
                continue  # single words are counted by plain membership
            # Sliding windows of len(phrase) consecutive tokens.
            windows = zip(*(tokens[offset:] for offset in range(len(phrase))))
            total += sum(1 for window in windows if window == phrase)
        return total

    def score(self, text: str) -> float:
        """Return formality score in [0, 1]. Higher = more formal.

        Empty or whitespace-only text returns the neutral score 0.5.

        Starting from 0.5, the score is adjusted by:
        - Contraction penalty: -0.05 each, capped at -0.25
        - Informal word/phrase penalty: share of words * 0.5, capped at -0.2
        - Formal marker bonus: +0.04 each, capped at +0.2
        - Average sentence length: +0.1 if > 20 words, +0.05 if > 15,
          -0.05 if < 8
        - First person penalty: share of words * 0.3, capped at -0.1
        - Exclamation penalty: -0.05 each, capped at -0.15
        - Question mark penalty: -0.02 each, capped at -0.08
        - Passive indicator bonus: share of words * 0.15, capped at +0.1

        Markers are matched case-insensitively against punctuation-free
        tokens, so "Therefore," and "don't." are recognised.
        """
        if not text or not text.strip():
            return 0.5

        tokens = self._tokenize(text)
        # Guard against division by zero for punctuation-only input.
        word_count = max(len(tokens), 1)

        # Base score
        score = 0.5

        # Contraction penalty
        contraction_count = sum(1 for t in tokens if t in self.CONTRACTIONS)
        score -= min(contraction_count * 0.05, 0.25)

        # Informal word penalty (single words plus multi-word phrases)
        informal_count = sum(1 for t in tokens if t in self.INFORMAL_WORDS)
        informal_count += self._count_phrases(tokens, self.INFORMAL_WORDS)
        score -= min((informal_count / word_count) * 0.5, 0.2)

        # Formal marker bonus
        formal_count = sum(1 for t in tokens if t in self.FORMAL_MARKERS)
        score += min(formal_count * 0.04, 0.2)

        # Sentence length bonus (longer sentences tend to be more formal)
        sentences = [
            s.strip() for s in self._SENTENCE_SPLIT_RE.split(text) if s.strip()
        ]
        if sentences:
            avg_sent_len = sum(len(s.split()) for s in sentences) / len(sentences)
            if avg_sent_len > 20:
                score += 0.1
            elif avg_sent_len > 15:
                score += 0.05
            elif avg_sent_len < 8:
                score -= 0.05

        # First person penalty (academic writing avoids "I")
        first_person = sum(1 for t in tokens if t in self._FIRST_PERSON)
        score -= min((first_person / word_count) * 0.3, 0.1)

        # Exclamation penalty
        score -= min(text.count("!") * 0.05, 0.15)

        # Question mark mild penalty (academic writing has fewer questions)
        score -= min(text.count("?") * 0.02, 0.08)

        # Passive voice bonus (approximation: counts auxiliary verbs only,
        # without checking for a following past participle)
        passive_indicators = sum(1 for t in tokens if t in self._PASSIVE_INDICATORS)
        score += min((passive_indicators / word_count) * 0.15, 0.1)

        # Clamp to the documented [0, 1] range.
        return max(0.0, min(1.0, score))