# rewrite/src/style/formality_classifier.py
"""
Formality classifier module.
Classifies text on a 0-1 formality scale using linguistic features.
Used as one dimension of the style fingerprint.
"""
import re
from typing import Optional
class FormalityClassifier:
    """Scores text formality on a 0-1 scale using rule-based heuristics."""

    # Informal markers that decrease formality score
    CONTRACTIONS = {
        "don't", "can't", "won't", "it's", "that's", "there's",
        "they're", "we're", "you're", "i'm", "i've", "i'll",
        "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't",
        "couldn't", "wouldn't", "shouldn't", "let's", "he's", "she's",
    }

    INFORMAL_WORDS = {
        "gonna", "wanna", "gotta", "kinda", "sorta", "ya", "yeah",
        "yep", "nope", "ok", "okay", "cool", "awesome", "stuff",
        "things", "like", "basically", "actually", "literally",
        "totally", "really", "super", "pretty", "kind of", "sort of",
    }

    FORMAL_MARKERS = {
        "furthermore", "moreover", "consequently", "nevertheless",
        "nonetheless", "accordingly", "hence", "thus", "therefore",
        "whereas", "notwithstanding", "hitherto", "whereby",
        "therein", "thereof", "herein",
    }

    # Tokenizer: runs of lowercase letters optionally joined by internal
    # apostrophes, so "don't." yields "don't" and "cool," yields "cool".
    # (Plain str.split() left punctuation attached and defeated set lookups.)
    _WORD_RE = re.compile(r"[a-z]+(?:'[a-z]+)*")

    def score(self, text: str) -> float:
        """Return formality score in [0, 1]. Higher = more formal.

        Scoring based on:
        - Contraction penalty (-0.05 each, capped at -0.25)
        - Informal word/phrase penalty (rate-based, capped at -0.2)
        - Formal marker bonus (+0.04 each, capped at +0.2)
        - Average sentence length bonus (longer = more formal)
        - First person penalty (rate-based, capped at -0.1)
        - Exclamation penalty (-0.05 each, capped at -0.15)
        - Question-mark penalty (-0.02 each, capped at -0.08)
        - Passive-voice proxy bonus (be-verb rate, capped at +0.1)
        """
        # Neutral score for empty or whitespace-only input.
        if not text or not text.strip():
            return 0.5

        # Tokenize on letter runs so trailing punctuation does not block
        # matches against the marker sets above.
        words = self._WORD_RE.findall(text.lower())
        word_count = max(len(words), 1)

        score = 0.5  # neutral baseline

        # Contraction penalty
        contraction_count = sum(1 for w in words if w in self.CONTRACTIONS)
        score -= min(contraction_count * 0.05, 0.25)

        # Informal word penalty. Multi-word entries ("kind of", "sort of")
        # can never equal a single token, so they are matched as phrases
        # against the space-joined token stream; the padding spaces ensure
        # only whole-token matches count (e.g. not "mankind offer").
        single_markers = {m for m in self.INFORMAL_WORDS if " " not in m}
        phrase_markers = [m for m in self.INFORMAL_WORDS if " " in m]
        informal_count = sum(1 for w in words if w in single_markers)
        padded = f" {' '.join(words)} "
        informal_count += sum(padded.count(f" {m} ") for m in phrase_markers)
        score -= min((informal_count / word_count) * 0.5, 0.2)

        # Formal marker bonus
        formal_count = sum(1 for w in words if w in self.FORMAL_MARKERS)
        score += min(formal_count * 0.04, 0.2)

        # Sentence length bonus (longer sentences tend to be more formal)
        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
        if sentences:
            avg_sent_len = sum(len(s.split()) for s in sentences) / len(sentences)
            if avg_sent_len > 20:
                score += 0.1
            elif avg_sent_len > 15:
                score += 0.05
            elif avg_sent_len < 8:
                score -= 0.05

        # First person penalty (academic writing avoids "I")
        first_person = sum(1 for w in words if w in ("i", "me", "my", "mine", "myself"))
        score -= min((first_person / word_count) * 0.3, 0.1)

        # Exclamation penalty
        score -= min(text.count("!") * 0.05, 0.15)

        # Question mark mild penalty (academic writing has fewer questions)
        score -= min(text.count("?") * 0.02, 0.08)

        # Passive voice bonus (approximation: rate of be-verb forms)
        passive_indicators = sum(1 for w in words if w in ("is", "was", "were", "been", "being"))
        score += min((passive_indicators / word_count) * 0.15, 0.1)

        # Clamp to the documented [0, 1] range.
        return max(0.0, min(1.0, score))