AJAY KASU committed on
Commit
ed67efe
·
1 Parent(s): 37bddee

feat: enhance humanizer with colloquialisms, creative grammar, and register mixing

Browse files
Files changed (2) hide show
  1. agents/humanizer.py +9 -8
  2. agents/style.py +90 -29
agents/humanizer.py CHANGED
@@ -146,7 +146,8 @@ class Humanizer:
146
  "l_sem": round(l_sem, 4),
147
  "l_style": round(l_style, 4),
148
  "l_det": round(l_det, 4),
149
- "total": round(total_loss, 4)
 
150
  }
151
 
152
  def _build_messages(self, text, feedback=""):
@@ -154,13 +155,13 @@ class Humanizer:
154
  "You are a rewriting expert. Your goal is to make AI-generated text sound "
155
  "like a real person wrote it on a forum (like Reddit or Hacker News).\n\n"
156
  "Guidelines:\n"
157
- "1. Break long sentences into shorter, punchier ones.\n"
158
- "2. Use natural phrasing (e.g., 'I feel like', 'Honestly', 'Pretty much').\n"
159
- "3. Don't be afraid to start sentences with 'And', 'But', or 'So'.\n"
160
- "4. Use contractions everywhere (can't, won't, it's).\n"
161
- "5. Vary your rhythm. Mix a 3-word sentence with a longer thought.\n"
162
- "6. Keep the core meaning but change the structure completely.\n"
163
- "7. Avoid robotic transitions like 'Furthermore' or 'In conclusion'. Use 'Plus' or 'Anyway'.\n"
164
  "8. STRICTLY NO XML TAGS or markdown fencing in output.\n"
165
  "\n"
166
  "Examples:\n"
 
146
  "l_sem": round(l_sem, 4),
147
  "l_style": round(l_style, 4),
148
  "l_det": round(l_det, 4),
149
+ "total": round(total_loss, 4),
150
+ "features": features
151
  }
152
 
153
  def _build_messages(self, text, feedback=""):
 
155
  "You are a rewriting expert. Your goal is to make AI-generated text sound "
156
  "like a real person wrote it on a forum (like Reddit or Hacker News).\n\n"
157
  "Guidelines:\n"
158
+ "1. Break long sentences into shorter, punchier ones. Use fragments sometimes.\n"
159
+ "2. Use natural phrasing (e.g., 'I feel like', 'Honestly', 'Pretty much', 'Kinda').\n"
160
+ "3. Start sentences with 'And', 'But', or 'So'. Deviate from strict grammar.\n"
161
+ "4. Use contractions everywhere (can't, won't, it's, we're).\n"
162
+ "5. Vary your rhythm. Mix a 3-word sentence with a longer thought. Be bursty.\n"
163
+ "6. Avoid robotic transitions like 'Furthermore' or 'In conclusion'. Use 'Plus' or 'Anyway'.\n"
164
+ "7. Insert mild opinions or rhetorical questions ('Right?', 'You know?').\n"
165
  "8. STRICTLY NO XML TAGS or markdown fencing in output.\n"
166
  "\n"
167
  "Examples:\n"
agents/style.py CHANGED
@@ -18,30 +18,26 @@ class StyleExtractor:
18
  - ttr: Type-Token Ratio (vocabulary diversity)
19
  - comma_rate: Commas per sentence
20
  - dash_rate: Em-dashes per sentence
 
 
 
 
 
21
  """
22
  if not text or not text.strip():
23
- return {
24
- "avg_len": 0.0, "std_len": 0.0, "ttr": 0.0,
25
- "comma_rate": 0.0, "dash_rate": 0.0
26
- }
27
 
28
- # 1. Sentence splitting (naive but fast)
29
  sentences = re.split(r'[.!?]+', text)
30
  sentences = [s.strip() for s in sentences if s.strip()]
31
  if not sentences:
32
- return {
33
- "avg_len": 0.0, "std_len": 0.0, "ttr": 0.0,
34
- "comma_rate": 0.0, "dash_rate": 0.0
35
- }
36
 
37
  # 2. Tokenization (space-based)
38
  words = re.findall(r'\b\w+\b', text.lower())
39
  num_words = len(words)
40
  if num_words == 0:
41
- return {
42
- "avg_len": 0.0, "std_len": 0.0, "ttr": 0.0,
43
- "comma_rate": 0.0, "dash_rate": 0.0
44
- }
45
 
46
  # 3. Calculate sentence lengths
47
  sent_lengths = [len(re.findall(r'\b\w+\b', s)) for s in sentences]
@@ -55,47 +51,112 @@ class StyleExtractor:
55
  # 5. Punctuation stats
56
  num_sentences = len(sentences)
57
  comma_count = text.count(',')
58
- dash_count = text.count('—') + text.count('--') # standard em-dash or double-dash
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  return {
61
  "avg_len": float(avg_len),
62
  "std_len": float(std_len),
63
  "ttr": float(ttr),
64
  "comma_rate": comma_count / num_sentences,
65
- "dash_rate": dash_count / num_sentences
 
 
 
 
 
66
  }
67
 
68
  def compute_loss(self, features, target_stats=None):
69
  """
70
  Computes L_style based on distance from target human stats.
71
- Default targets are based on casual/natural writing.
72
  """
73
- # Default targets for "natural human writing"
74
- # - Avg length: 15-20 words
75
- # - Std dev: ~8-10 (high variability/burstiness)
76
- # - TTR: ~0.6-0.7 (diverse vocab)
77
-
78
  target = target_stats or {
79
  "avg_len": 18.0,
80
- "std_len": 10.0,
81
  "ttr": 0.65,
 
 
 
 
 
82
  }
83
 
84
- # Z-score-like difference square
85
- # We normalize by expected variance to weigh them
86
  loss = 0.0
87
 
88
- # Penalize if avg length is too short (robotic) or too long (academic)
89
  loss += ((features["avg_len"] - target["avg_len"]) / 5.0) ** 2
90
 
91
- # Penalize ONLY if variance is too LOW (we want burstiness)
92
- # If std_len > target, that's good! So zero loss.
93
- # If std_len < target, penalize.
94
  if features["std_len"] < target["std_len"]:
95
  loss += ((features["std_len"] - target["std_len"]) / 3.0) ** 2
96
 
97
- # TTR: Penalize if too low (repetitive)
98
  if features["ttr"] < target["ttr"]:
99
  loss += ((features["ttr"] - target["ttr"]) / 0.1) ** 2
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  return loss
 
 
 
 
 
 
 
 
 
 
18
  - ttr: Type-Token Ratio (vocabulary diversity)
19
  - comma_rate: Commas per sentence
20
  - dash_rate: Em-dashes per sentence
21
+ - conjunction_start_rate: Sentences starting with And/But/So
22
+ - colloquial_rate: Colloquial markers per 100 words
23
+ - contraction_rate: Contractions per 100 words
24
+ - pronoun_rate: 1st/2nd person pronouns per 100 words
25
+ - corp_speak_rate: Robotic words per 100 words
26
  """
27
  if not text or not text.strip():
28
+ return self._empty_features()
 
 
 
29
 
30
+ # 1. Sentence splitting
31
  sentences = re.split(r'[.!?]+', text)
32
  sentences = [s.strip() for s in sentences if s.strip()]
33
  if not sentences:
34
+ return self._empty_features()
 
 
 
35
 
36
  # 2. Tokenization (space-based)
37
  words = re.findall(r'\b\w+\b', text.lower())
38
  num_words = len(words)
39
  if num_words == 0:
40
+ return self._empty_features()
 
 
 
41
 
42
  # 3. Calculate sentence lengths
43
  sent_lengths = [len(re.findall(r'\b\w+\b', s)) for s in sentences]
 
51
  # 5. Punctuation stats
52
  num_sentences = len(sentences)
53
  comma_count = text.count(',')
54
+ dash_count = text.count('—') + text.count('--')
55
+
56
+ # 6. Advanced Features
57
+
58
+ # Conjunction Starts (And, But, So)
59
+ conj_starts = sum(1 for s in sentences if re.match(r'^(And|But|So)\b', s, re.IGNORECASE))
60
+
61
+ # Colloquial Markers
62
+ colloquialisms = [
63
+ "kinda", "sorta", "gonna", "wanna", "yeah", "honestly", "actually",
64
+ "just saying", "you know", "i mean", "pretty much", "basically",
65
+ "literally", "totally", "seriously"
66
+ ]
67
+ colloquial_count = sum(text.lower().count(c) for c in colloquialisms)
68
+
69
+ # Contractions
70
+ contractions = [
71
+ "n't", "'re", "'ll", "'ve", "'m", "'d", "'s"
72
+ ]
73
+ contraction_count = sum(text.lower().count(c) for c in contractions)
74
+
75
+ # Personal Pronouns (I, me, my, we, us, our, you, your)
76
+ pronouns = [
77
+ r"\bi\b", r"\bme\b", r"\bmy\b", r"\bwe\b", r"\bus\b", r"\bour\b",
78
+ r"\byou\b", r"\byour\b"
79
+ ]
80
+ pronoun_count = sum(len(re.findall(p, text.lower())) for p in pronouns)
81
+
82
+ # Corp-Speak (Penalize these)
83
+ corp_speak = [
84
+ "optimize", "leverage", "fundamentally", "at scale", "crucial",
85
+ "imperative", "facilitate", "utilize", "orchestrate", "synergy",
86
+ "paradigm", "robust", "transformative"
87
+ ]
88
+ corp_speak_count = sum(text.lower().count(w) for w in corp_speak)
89
 
90
  return {
91
  "avg_len": float(avg_len),
92
  "std_len": float(std_len),
93
  "ttr": float(ttr),
94
  "comma_rate": comma_count / num_sentences,
95
+ "dash_rate": dash_count / num_sentences,
96
+ "conjunction_start_rate": conj_starts / num_sentences,
97
+ "colloquial_rate": (colloquial_count / num_words) * 100,
98
+ "contraction_rate": (contraction_count / num_words) * 100,
99
+ "pronoun_rate": (pronoun_count / num_words) * 100,
100
+ "corp_speak_rate": (corp_speak_count / num_words) * 100
101
  }
102
 
103
  def compute_loss(self, features, target_stats=None):
104
  """
105
  Computes L_style based on distance from target human stats.
 
106
  """
 
 
 
 
 
107
  target = target_stats or {
108
  "avg_len": 18.0,
109
+ "std_len": 10.0, # WANT high variance
110
  "ttr": 0.65,
111
+ "colloquial_rate": 2.0, # ~2 per 100 words
112
+ "contraction_rate": 3.0,
113
+ "pronoun_rate": 4.0,
114
+ "conjunction_start_rate": 0.1,
115
+ "corp_speak_rate": 0.0 # WANT zero
116
  }
117
 
 
 
118
  loss = 0.0
119
 
120
+ # Basic Stats
121
  loss += ((features["avg_len"] - target["avg_len"]) / 5.0) ** 2
122
 
123
+ # Burstiness: Only penalize if TOO LOW
 
 
124
  if features["std_len"] < target["std_len"]:
125
  loss += ((features["std_len"] - target["std_len"]) / 3.0) ** 2
126
 
127
+ # TTR: Penalize if too low
128
  if features["ttr"] < target["ttr"]:
129
  loss += ((features["ttr"] - target["ttr"]) / 0.1) ** 2
130
 
131
+ # Advanced Stats
132
+
133
+ # Colloquial: Penalize if too LOW
134
+ if features["colloquial_rate"] < target["colloquial_rate"]:
135
+ loss += ((features["colloquial_rate"] - target["colloquial_rate"]) / 1.0) ** 2
136
+
137
+ # Contractions: Penalize if too LOW
138
+ if features["contraction_rate"] < target["contraction_rate"]:
139
+ loss += ((features["contraction_rate"] - target["contraction_rate"]) / 1.0) ** 2
140
+
141
+ # Pronouns: Penalize if too LOW (robotic text is impersonal)
142
+ if features["pronoun_rate"] < target["pronoun_rate"]:
143
+ loss += ((features["pronoun_rate"] - target["pronoun_rate"]) / 1.0) ** 2
144
+
145
+ # Conjunction Starts: Penalize if too LOW
146
+ if features["conjunction_start_rate"] < target["conjunction_start_rate"]:
147
+ loss += ((features["conjunction_start_rate"] - target["conjunction_start_rate"]) / 0.05) ** 2
148
+
149
+ # Corp Speak: Penalize if too HIGH (only one direction)
150
+ if features["corp_speak_rate"] > target["corp_speak_rate"]:
151
+ loss += ((features["corp_speak_rate"] - target["corp_speak_rate"]) / 0.5) ** 2
152
+
153
  return loss
154
+
155
+ def _empty_features(self):
156
+ return {
157
+ "avg_len": 0.0, "std_len": 0.0, "ttr": 0.0,
158
+ "comma_rate": 0.0, "dash_rate": 0.0,
159
+ "conjunction_start_rate": 0.0, "colloquial_rate": 0.0,
160
+ "contraction_rate": 0.0, "pronoun_rate": 0.0,
161
+ "corp_speak_rate": 0.0
162
+ }