AJAY KASU committed on
Commit
ed67efe
·
1 Parent(s): 37bddee

feat: enhance humanizer with colloquialisms, creative grammar, and register mixing

Browse files
Files changed (2) hide show
  1. agents/humanizer.py +9 -8
  2. agents/style.py +90 -29
agents/humanizer.py CHANGED
@@ -146,7 +146,8 @@ class Humanizer:
146
  "l_sem": round(l_sem, 4),
147
  "l_style": round(l_style, 4),
148
  "l_det": round(l_det, 4),
149
- "total": round(total_loss, 4)
 
150
  }
151
 
152
  def _build_messages(self, text, feedback=""):
@@ -154,13 +155,13 @@ class Humanizer:
154
  "You are a rewriting expert. Your goal is to make AI-generated text sound "
155
  "like a real person wrote it on a forum (like Reddit or Hacker News).\n\n"
156
  "Guidelines:\n"
157
- "1. Break long sentences into shorter, punchier ones.\n"
158
- "2. Use natural phrasing (e.g., 'I feel like', 'Honestly', 'Pretty much').\n"
159
- "3. Don't be afraid to start sentences with 'And', 'But', or 'So'.\n"
160
- "4. Use contractions everywhere (can't, won't, it's).\n"
161
- "5. Vary your rhythm. Mix a 3-word sentence with a longer thought.\n"
162
- "6. Keep the core meaning but change the structure completely.\n"
163
- "7. Avoid robotic transitions like 'Furthermore' or 'In conclusion'. Use 'Plus' or 'Anyway'.\n"
164
  "8. STRICTLY NO XML TAGS or markdown fencing in output.\n"
165
  "\n"
166
  "Examples:\n"
 
146
  "l_sem": round(l_sem, 4),
147
  "l_style": round(l_style, 4),
148
  "l_det": round(l_det, 4),
149
+ "total": round(total_loss, 4),
150
+ "features": features
151
  }
152
 
153
  def _build_messages(self, text, feedback=""):
 
155
  "You are a rewriting expert. Your goal is to make AI-generated text sound "
156
  "like a real person wrote it on a forum (like Reddit or Hacker News).\n\n"
157
  "Guidelines:\n"
158
+ "1. Break long sentences into shorter, punchier ones. Use fragments sometimes.\n"
159
+ "2. Use natural phrasing (e.g., 'I feel like', 'Honestly', 'Pretty much', 'Kinda').\n"
160
+ "3. Start sentences with 'And', 'But', or 'So'. Deviate from strict grammar.\n"
161
+ "4. Use contractions everywhere (can't, won't, it's, we're).\n"
162
+ "5. Vary your rhythm. Mix a 3-word sentence with a longer thought. Be bursty.\n"
163
+ "6. Avoid robotic transitions like 'Furthermore' or 'In conclusion'. Use 'Plus' or 'Anyway'.\n"
164
+ "7. Insert mild opinions or rhetorical questions ('Right?', 'You know?').\n"
165
  "8. STRICTLY NO XML TAGS or markdown fencing in output.\n"
166
  "\n"
167
  "Examples:\n"
agents/style.py CHANGED
@@ -18,30 +18,26 @@ class StyleExtractor:
18
  - ttr: Type-Token Ratio (vocabulary diversity)
19
  - comma_rate: Commas per sentence
20
  - dash_rate: Em-dashes per sentence
 
 
 
 
 
21
  """
22
  if not text or not text.strip():
23
- return {
24
- "avg_len": 0.0, "std_len": 0.0, "ttr": 0.0,
25
- "comma_rate": 0.0, "dash_rate": 0.0
26
- }
27
 
28
- # 1. Sentence splitting (naive but fast)
29
  sentences = re.split(r'[.!?]+', text)
30
  sentences = [s.strip() for s in sentences if s.strip()]
31
  if not sentences:
32
- return {
33
- "avg_len": 0.0, "std_len": 0.0, "ttr": 0.0,
34
- "comma_rate": 0.0, "dash_rate": 0.0
35
- }
36
 
37
  # 2. Tokenization (space-based)
38
  words = re.findall(r'\b\w+\b', text.lower())
39
  num_words = len(words)
40
  if num_words == 0:
41
- return {
42
- "avg_len": 0.0, "std_len": 0.0, "ttr": 0.0,
43
- "comma_rate": 0.0, "dash_rate": 0.0
44
- }
45
 
46
  # 3. Calculate sentence lengths
47
  sent_lengths = [len(re.findall(r'\b\w+\b', s)) for s in sentences]
@@ -55,47 +51,112 @@ class StyleExtractor:
55
  # 5. Punctuation stats
56
  num_sentences = len(sentences)
57
  comma_count = text.count(',')
58
- dash_count = text.count('—') + text.count('--') # standard em-dash or double-dash
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  return {
61
  "avg_len": float(avg_len),
62
  "std_len": float(std_len),
63
  "ttr": float(ttr),
64
  "comma_rate": comma_count / num_sentences,
65
- "dash_rate": dash_count / num_sentences
 
 
 
 
 
66
  }
67
 
68
  def compute_loss(self, features, target_stats=None):
69
  """
70
  Computes L_style based on distance from target human stats.
71
- Default targets are based on casual/natural writing.
72
  """
73
- # Default targets for "natural human writing"
74
- # - Avg length: 15-20 words
75
- # - Std dev: ~8-10 (high variability/burstiness)
76
- # - TTR: ~0.6-0.7 (diverse vocab)
77
-
78
  target = target_stats or {
79
  "avg_len": 18.0,
80
- "std_len": 10.0,
81
  "ttr": 0.65,
 
 
 
 
 
82
  }
83
 
84
- # Z-score-like difference square
85
- # We normalize by expected variance to weigh them
86
  loss = 0.0
87
 
88
- # Penalize if avg length is too short (robotic) or too long (academic)
89
  loss += ((features["avg_len"] - target["avg_len"]) / 5.0) ** 2
90
 
91
- # Penalize ONLY if variance is too LOW (we want burstiness)
92
- # If std_len > target, that's good! So zero loss.
93
- # If std_len < target, penalize.
94
  if features["std_len"] < target["std_len"]:
95
  loss += ((features["std_len"] - target["std_len"]) / 3.0) ** 2
96
 
97
- # TTR: Penalize if too low (repetitive)
98
  if features["ttr"] < target["ttr"]:
99
  loss += ((features["ttr"] - target["ttr"]) / 0.1) ** 2
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  return loss
 
 
 
 
 
 
 
 
 
 
18
  - ttr: Type-Token Ratio (vocabulary diversity)
19
  - comma_rate: Commas per sentence
20
  - dash_rate: Em-dashes per sentence
21
+ - conjunction_start_rate: Sentences starting with And/But/So
22
+ - colloquial_rate: Colloquial markers per 100 words
23
+ - contraction_rate: Contractions per 100 words
24
+ - pronoun_rate: 1st/2nd person pronouns per 100 words
25
+ - corp_speak_rate: Robotic words per 100 words
26
  """
27
  if not text or not text.strip():
28
+ return self._empty_features()
 
 
 
29
 
30
+ # 1. Sentence splitting
31
  sentences = re.split(r'[.!?]+', text)
32
  sentences = [s.strip() for s in sentences if s.strip()]
33
  if not sentences:
34
+ return self._empty_features()
 
 
 
35
 
36
  # 2. Tokenization (space-based)
37
  words = re.findall(r'\b\w+\b', text.lower())
38
  num_words = len(words)
39
  if num_words == 0:
40
+ return self._empty_features()
 
 
 
41
 
42
  # 3. Calculate sentence lengths
43
  sent_lengths = [len(re.findall(r'\b\w+\b', s)) for s in sentences]
 
51
  # 5. Punctuation stats
52
  num_sentences = len(sentences)
53
  comma_count = text.count(',')
54
+ dash_count = text.count('—') + text.count('--')
55
+
56
+ # 6. Advanced Features
57
+
58
+ # Conjunction Starts (And, But, So)
59
+ conj_starts = sum(1 for s in sentences if re.match(r'^(And|But|So)\b', s, re.IGNORECASE))
60
+
61
+ # Colloquial Markers
62
+ colloquialisms = [
63
+ "kinda", "sorta", "gonna", "wanna", "yeah", "honestly", "actually",
64
+ "just saying", "you know", "i mean", "pretty much", "basically",
65
+ "literally", "totally", "seriously"
66
+ ]
67
+ colloquial_count = sum(text.lower().count(c) for c in colloquialisms)
68
+
69
+ # Contractions
70
+ contractions = [
71
+ "n't", "'re", "'ll", "'ve", "'m", "'d", "'s"
72
+ ]
73
+ contraction_count = sum(text.lower().count(c) for c in contractions)
74
+
75
+ # Personal Pronouns (I, me, my, we, us, our, you, your)
76
+ pronouns = [
77
+ r"\bi\b", r"\bme\b", r"\bmy\b", r"\bwe\b", r"\bus\b", r"\bour\b",
78
+ r"\byou\b", r"\byour\b"
79
+ ]
80
+ pronoun_count = sum(len(re.findall(p, text.lower())) for p in pronouns)
81
+
82
+ # Corp-Speak (Penalize these)
83
+ corp_speak = [
84
+ "optimize", "leverage", "fundamentally", "at scale", "crucial",
85
+ "imperative", "facilitate", "utilize", "orchestrate", "synergy",
86
+ "paradigm", "robust", "transformative"
87
+ ]
88
+ corp_speak_count = sum(text.lower().count(w) for w in corp_speak)
89
 
90
  return {
91
  "avg_len": float(avg_len),
92
  "std_len": float(std_len),
93
  "ttr": float(ttr),
94
  "comma_rate": comma_count / num_sentences,
95
+ "dash_rate": dash_count / num_sentences,
96
+ "conjunction_start_rate": conj_starts / num_sentences,
97
+ "colloquial_rate": (colloquial_count / num_words) * 100,
98
+ "contraction_rate": (contraction_count / num_words) * 100,
99
+ "pronoun_rate": (pronoun_count / num_words) * 100,
100
+ "corp_speak_rate": (corp_speak_count / num_words) * 100
101
  }
102
 
103
  def compute_loss(self, features, target_stats=None):
104
  """
105
  Computes L_style based on distance from target human stats.
 
106
  """
 
 
 
 
 
107
  target = target_stats or {
108
  "avg_len": 18.0,
109
+ "std_len": 10.0, # WANT high variance
110
  "ttr": 0.65,
111
+ "colloquial_rate": 2.0, # ~2 per 100 words
112
+ "contraction_rate": 3.0,
113
+ "pronoun_rate": 4.0,
114
+ "conjunction_start_rate": 0.1,
115
+ "corp_speak_rate": 0.0 # WANT zero
116
  }
117
 
 
 
118
  loss = 0.0
119
 
120
+ # Basic Stats
121
  loss += ((features["avg_len"] - target["avg_len"]) / 5.0) ** 2
122
 
123
+ # Burstiness: Only penalize if TOO LOW
 
 
124
  if features["std_len"] < target["std_len"]:
125
  loss += ((features["std_len"] - target["std_len"]) / 3.0) ** 2
126
 
127
+ # TTR: Penalize if too low
128
  if features["ttr"] < target["ttr"]:
129
  loss += ((features["ttr"] - target["ttr"]) / 0.1) ** 2
130
 
131
+ # Advanced Stats
132
+
133
+ # Colloquial: Penalize if too LOW
134
+ if features["colloquial_rate"] < target["colloquial_rate"]:
135
+ loss += ((features["colloquial_rate"] - target["colloquial_rate"]) / 1.0) ** 2
136
+
137
+ # Contractions: Penalize if too LOW
138
+ if features["contraction_rate"] < target["contraction_rate"]:
139
+ loss += ((features["contraction_rate"] - target["contraction_rate"]) / 1.0) ** 2
140
+
141
+ # Pronouns: Penalize if too LOW (robotic text is impersonal)
142
+ if features["pronoun_rate"] < target["pronoun_rate"]:
143
+ loss += ((features["pronoun_rate"] - target["pronoun_rate"]) / 1.0) ** 2
144
+
145
+ # Conjunction Starts: Penalize if too LOW
146
+ if features["conjunction_start_rate"] < target["conjunction_start_rate"]:
147
+ loss += ((features["conjunction_start_rate"] - target["conjunction_start_rate"]) / 0.05) ** 2
148
+
149
+ # Corp Speak: Penalize if too HIGH (only one direction)
150
+ if features["corp_speak_rate"] > target["corp_speak_rate"]:
151
+ loss += ((features["corp_speak_rate"] - target["corp_speak_rate"]) / 0.5) ** 2
152
+
153
  return loss
154
+
155
+ def _empty_features(self):
156
+ return {
157
+ "avg_len": 0.0, "std_len": 0.0, "ttr": 0.0,
158
+ "comma_rate": 0.0, "dash_rate": 0.0,
159
+ "conjunction_start_rate": 0.0, "colloquial_rate": 0.0,
160
+ "contraction_rate": 0.0, "pronoun_rate": 0.0,
161
+ "corp_speak_rate": 0.0
162
+ }