Spaces:
Running
Running
AJAY KASU committed on
Commit ·
ed67efe
1
Parent(s): 37bddee
feat: enhance humanizer with colloquialisms, creative grammar, and register mixing
Browse files- agents/humanizer.py +9 -8
- agents/style.py +90 -29
agents/humanizer.py
CHANGED
|
@@ -146,7 +146,8 @@ class Humanizer:
|
|
| 146 |
"l_sem": round(l_sem, 4),
|
| 147 |
"l_style": round(l_style, 4),
|
| 148 |
"l_det": round(l_det, 4),
|
| 149 |
-
"total": round(total_loss, 4)
|
|
|
|
| 150 |
}
|
| 151 |
|
| 152 |
def _build_messages(self, text, feedback=""):
|
|
@@ -154,13 +155,13 @@ class Humanizer:
|
|
| 154 |
"You are a rewriting expert. Your goal is to make AI-generated text sound "
|
| 155 |
"like a real person wrote it on a forum (like Reddit or Hacker News).\n\n"
|
| 156 |
"Guidelines:\n"
|
| 157 |
-
"1. Break long sentences into shorter, punchier ones.\n"
|
| 158 |
-
"2. Use natural phrasing (e.g., 'I feel like', 'Honestly', 'Pretty much').\n"
|
| 159 |
-
"3.
|
| 160 |
-
"4. Use contractions everywhere (can't, won't, it's).\n"
|
| 161 |
-
"5. Vary your rhythm. Mix a 3-word sentence with a longer thought.\n"
|
| 162 |
-
"6.
|
| 163 |
-
"7.
|
| 164 |
"8. STRICTLY NO XML TAGS or markdown fencing in output.\n"
|
| 165 |
"\n"
|
| 166 |
"Examples:\n"
|
|
|
|
| 146 |
"l_sem": round(l_sem, 4),
|
| 147 |
"l_style": round(l_style, 4),
|
| 148 |
"l_det": round(l_det, 4),
|
| 149 |
+
"total": round(total_loss, 4),
|
| 150 |
+
"features": features
|
| 151 |
}
|
| 152 |
|
| 153 |
def _build_messages(self, text, feedback=""):
|
|
|
|
| 155 |
"You are a rewriting expert. Your goal is to make AI-generated text sound "
|
| 156 |
"like a real person wrote it on a forum (like Reddit or Hacker News).\n\n"
|
| 157 |
"Guidelines:\n"
|
| 158 |
+
"1. Break long sentences into shorter, punchier ones. Use fragments sometimes.\n"
|
| 159 |
+
"2. Use natural phrasing (e.g., 'I feel like', 'Honestly', 'Pretty much', 'Kinda').\n"
|
| 160 |
+
"3. Start sentences with 'And', 'But', or 'So'. Deviate from strict grammar.\n"
|
| 161 |
+
"4. Use contractions everywhere (can't, won't, it's, we're).\n"
|
| 162 |
+
"5. Vary your rhythm. Mix a 3-word sentence with a longer thought. Be bursty.\n"
|
| 163 |
+
"6. Avoid robotic transitions like 'Furthermore' or 'In conclusion'. Use 'Plus' or 'Anyway'.\n"
|
| 164 |
+
"7. Insert mild opinions or rhetorical questions ('Right?', 'You know?').\n"
|
| 165 |
"8. STRICTLY NO XML TAGS or markdown fencing in output.\n"
|
| 166 |
"\n"
|
| 167 |
"Examples:\n"
|
agents/style.py
CHANGED
|
@@ -18,30 +18,26 @@ class StyleExtractor:
|
|
| 18 |
- ttr: Type-Token Ratio (vocabulary diversity)
|
| 19 |
- comma_rate: Commas per sentence
|
| 20 |
- dash_rate: Em-dashes per sentence
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
"""
|
| 22 |
if not text or not text.strip():
|
| 23 |
-
return
|
| 24 |
-
"avg_len": 0.0, "std_len": 0.0, "ttr": 0.0,
|
| 25 |
-
"comma_rate": 0.0, "dash_rate": 0.0
|
| 26 |
-
}
|
| 27 |
|
| 28 |
-
# 1. Sentence splitting
|
| 29 |
sentences = re.split(r'[.!?]+', text)
|
| 30 |
sentences = [s.strip() for s in sentences if s.strip()]
|
| 31 |
if not sentences:
|
| 32 |
-
return
|
| 33 |
-
"avg_len": 0.0, "std_len": 0.0, "ttr": 0.0,
|
| 34 |
-
"comma_rate": 0.0, "dash_rate": 0.0
|
| 35 |
-
}
|
| 36 |
|
| 37 |
# 2. Tokenization (space-based)
|
| 38 |
words = re.findall(r'\b\w+\b', text.lower())
|
| 39 |
num_words = len(words)
|
| 40 |
if num_words == 0:
|
| 41 |
-
return
|
| 42 |
-
"avg_len": 0.0, "std_len": 0.0, "ttr": 0.0,
|
| 43 |
-
"comma_rate": 0.0, "dash_rate": 0.0
|
| 44 |
-
}
|
| 45 |
|
| 46 |
# 3. Calculate sentence lengths
|
| 47 |
sent_lengths = [len(re.findall(r'\b\w+\b', s)) for s in sentences]
|
|
@@ -55,47 +51,112 @@ class StyleExtractor:
|
|
| 55 |
# 5. Punctuation stats
|
| 56 |
num_sentences = len(sentences)
|
| 57 |
comma_count = text.count(',')
|
| 58 |
-
dash_count = text.count('—') + text.count('--')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
return {
|
| 61 |
"avg_len": float(avg_len),
|
| 62 |
"std_len": float(std_len),
|
| 63 |
"ttr": float(ttr),
|
| 64 |
"comma_rate": comma_count / num_sentences,
|
| 65 |
-
"dash_rate": dash_count / num_sentences
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
}
|
| 67 |
|
| 68 |
def compute_loss(self, features, target_stats=None):
|
| 69 |
"""
|
| 70 |
Computes L_style based on distance from target human stats.
|
| 71 |
-
Default targets are based on casual/natural writing.
|
| 72 |
"""
|
| 73 |
-
# Default targets for "natural human writing"
|
| 74 |
-
# - Avg length: 15-20 words
|
| 75 |
-
# - Std dev: ~8-10 (high variability/burstiness)
|
| 76 |
-
# - TTR: ~0.6-0.7 (diverse vocab)
|
| 77 |
-
|
| 78 |
target = target_stats or {
|
| 79 |
"avg_len": 18.0,
|
| 80 |
-
"std_len": 10.0,
|
| 81 |
"ttr": 0.65,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
}
|
| 83 |
|
| 84 |
-
# Z-score-like difference square
|
| 85 |
-
# We normalize by expected variance to weigh them
|
| 86 |
loss = 0.0
|
| 87 |
|
| 88 |
-
#
|
| 89 |
loss += ((features["avg_len"] - target["avg_len"]) / 5.0) ** 2
|
| 90 |
|
| 91 |
-
#
|
| 92 |
-
# If std_len > target, that's good! So zero loss.
|
| 93 |
-
# If std_len < target, penalize.
|
| 94 |
if features["std_len"] < target["std_len"]:
|
| 95 |
loss += ((features["std_len"] - target["std_len"]) / 3.0) ** 2
|
| 96 |
|
| 97 |
-
# TTR: Penalize if too low
|
| 98 |
if features["ttr"] < target["ttr"]:
|
| 99 |
loss += ((features["ttr"] - target["ttr"]) / 0.1) ** 2
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
return loss
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
- ttr: Type-Token Ratio (vocabulary diversity)
|
| 19 |
- comma_rate: Commas per sentence
|
| 20 |
- dash_rate: Em-dashes per sentence
|
| 21 |
+
- conjunction_start_rate: Sentences starting with And/But/So
|
| 22 |
+
- colloquial_rate: Colloquial markers per 100 words
|
| 23 |
+
- contraction_rate: Contractions per 100 words
|
| 24 |
+
- pronoun_rate: 1st/2nd person pronouns per 100 words
|
| 25 |
+
- corp_speak_rate: Robotic words per 100 words
|
| 26 |
"""
|
| 27 |
if not text or not text.strip():
|
| 28 |
+
return self._empty_features()
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
+
# 1. Sentence splitting
|
| 31 |
sentences = re.split(r'[.!?]+', text)
|
| 32 |
sentences = [s.strip() for s in sentences if s.strip()]
|
| 33 |
if not sentences:
|
| 34 |
+
return self._empty_features()
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
# 2. Tokenization (space-based)
|
| 37 |
words = re.findall(r'\b\w+\b', text.lower())
|
| 38 |
num_words = len(words)
|
| 39 |
if num_words == 0:
|
| 40 |
+
return self._empty_features()
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
# 3. Calculate sentence lengths
|
| 43 |
sent_lengths = [len(re.findall(r'\b\w+\b', s)) for s in sentences]
|
|
|
|
| 51 |
# 5. Punctuation stats
|
| 52 |
num_sentences = len(sentences)
|
| 53 |
comma_count = text.count(',')
|
| 54 |
+
dash_count = text.count('—') + text.count('--')
|
| 55 |
+
|
| 56 |
+
# 6. Advanced Features
|
| 57 |
+
|
| 58 |
+
# Conjunction Starts (And, But, So)
|
| 59 |
+
conj_starts = sum(1 for s in sentences if re.match(r'^(And|But|So)\b', s, re.IGNORECASE))
|
| 60 |
+
|
| 61 |
+
# Colloquial Markers
|
| 62 |
+
colloquialisms = [
|
| 63 |
+
"kinda", "sorta", "gonna", "wanna", "yeah", "honestly", "actually",
|
| 64 |
+
"just saying", "you know", "i mean", "pretty much", "basically",
|
| 65 |
+
"literally", "totally", "seriously"
|
| 66 |
+
]
|
| 67 |
+
colloquial_count = sum(text.lower().count(c) for c in colloquialisms)
|
| 68 |
+
|
| 69 |
+
# Contractions
|
| 70 |
+
contractions = [
|
| 71 |
+
"n't", "'re", "'ll", "'ve", "'m", "'d", "'s"
|
| 72 |
+
]
|
| 73 |
+
contraction_count = sum(text.lower().count(c) for c in contractions)
|
| 74 |
+
|
| 75 |
+
# Personal Pronouns (I, me, my, we, us, our, you, your)
|
| 76 |
+
pronouns = [
|
| 77 |
+
r"\bi\b", r"\bme\b", r"\bmy\b", r"\bwe\b", r"\bus\b", r"\bour\b",
|
| 78 |
+
r"\byou\b", r"\byour\b"
|
| 79 |
+
]
|
| 80 |
+
pronoun_count = sum(len(re.findall(p, text.lower())) for p in pronouns)
|
| 81 |
+
|
| 82 |
+
# Corp-Speak (Penalize these)
|
| 83 |
+
corp_speak = [
|
| 84 |
+
"optimize", "leverage", "fundamentally", "at scale", "crucial",
|
| 85 |
+
"imperative", "facilitate", "utilize", "orchestrate", "synergy",
|
| 86 |
+
"paradigm", "robust", "transformative"
|
| 87 |
+
]
|
| 88 |
+
corp_speak_count = sum(text.lower().count(w) for w in corp_speak)
|
| 89 |
|
| 90 |
return {
|
| 91 |
"avg_len": float(avg_len),
|
| 92 |
"std_len": float(std_len),
|
| 93 |
"ttr": float(ttr),
|
| 94 |
"comma_rate": comma_count / num_sentences,
|
| 95 |
+
"dash_rate": dash_count / num_sentences,
|
| 96 |
+
"conjunction_start_rate": conj_starts / num_sentences,
|
| 97 |
+
"colloquial_rate": (colloquial_count / num_words) * 100,
|
| 98 |
+
"contraction_rate": (contraction_count / num_words) * 100,
|
| 99 |
+
"pronoun_rate": (pronoun_count / num_words) * 100,
|
| 100 |
+
"corp_speak_rate": (corp_speak_count / num_words) * 100
|
| 101 |
}
|
| 102 |
|
| 103 |
def compute_loss(self, features, target_stats=None):
|
| 104 |
"""
|
| 105 |
Computes L_style based on distance from target human stats.
|
|
|
|
| 106 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
target = target_stats or {
|
| 108 |
"avg_len": 18.0,
|
| 109 |
+
"std_len": 10.0, # WANT high variance
|
| 110 |
"ttr": 0.65,
|
| 111 |
+
"colloquial_rate": 2.0, # ~2 per 100 words
|
| 112 |
+
"contraction_rate": 3.0,
|
| 113 |
+
"pronoun_rate": 4.0,
|
| 114 |
+
"conjunction_start_rate": 0.1,
|
| 115 |
+
"corp_speak_rate": 0.0 # WANT zero
|
| 116 |
}
|
| 117 |
|
|
|
|
|
|
|
| 118 |
loss = 0.0
|
| 119 |
|
| 120 |
+
# Basic Stats
|
| 121 |
loss += ((features["avg_len"] - target["avg_len"]) / 5.0) ** 2
|
| 122 |
|
| 123 |
+
# Burstiness: Only penalize if TOO LOW
|
|
|
|
|
|
|
| 124 |
if features["std_len"] < target["std_len"]:
|
| 125 |
loss += ((features["std_len"] - target["std_len"]) / 3.0) ** 2
|
| 126 |
|
| 127 |
+
# TTR: Penalize if too low
|
| 128 |
if features["ttr"] < target["ttr"]:
|
| 129 |
loss += ((features["ttr"] - target["ttr"]) / 0.1) ** 2
|
| 130 |
|
| 131 |
+
# Advanced Stats
|
| 132 |
+
|
| 133 |
+
# Colloquial: Penalize if too LOW
|
| 134 |
+
if features["colloquial_rate"] < target["colloquial_rate"]:
|
| 135 |
+
loss += ((features["colloquial_rate"] - target["colloquial_rate"]) / 1.0) ** 2
|
| 136 |
+
|
| 137 |
+
# Contractions: Penalize if too LOW
|
| 138 |
+
if features["contraction_rate"] < target["contraction_rate"]:
|
| 139 |
+
loss += ((features["contraction_rate"] - target["contraction_rate"]) / 1.0) ** 2
|
| 140 |
+
|
| 141 |
+
# Pronouns: Penalize if too LOW (robotic text is impersonal)
|
| 142 |
+
if features["pronoun_rate"] < target["pronoun_rate"]:
|
| 143 |
+
loss += ((features["pronoun_rate"] - target["pronoun_rate"]) / 1.0) ** 2
|
| 144 |
+
|
| 145 |
+
# Conjunction Starts: Penalize if too LOW
|
| 146 |
+
if features["conjunction_start_rate"] < target["conjunction_start_rate"]:
|
| 147 |
+
loss += ((features["conjunction_start_rate"] - target["conjunction_start_rate"]) / 0.05) ** 2
|
| 148 |
+
|
| 149 |
+
# Corp Speak: Penalize if too HIGH (only one direction)
|
| 150 |
+
if features["corp_speak_rate"] > target["corp_speak_rate"]:
|
| 151 |
+
loss += ((features["corp_speak_rate"] - target["corp_speak_rate"]) / 0.5) ** 2
|
| 152 |
+
|
| 153 |
return loss
|
| 154 |
+
|
| 155 |
+
def _empty_features(self):
|
| 156 |
+
return {
|
| 157 |
+
"avg_len": 0.0, "std_len": 0.0, "ttr": 0.0,
|
| 158 |
+
"comma_rate": 0.0, "dash_rate": 0.0,
|
| 159 |
+
"conjunction_start_rate": 0.0, "colloquial_rate": 0.0,
|
| 160 |
+
"contraction_rate": 0.0, "pronoun_rate": 0.0,
|
| 161 |
+
"corp_speak_rate": 0.0
|
| 162 |
+
}
|