Spaces:

ProjectFrozone
/

Site

Running

File size: 6,148 Bytes

40a04d4

#!/usr/bin/env python3
"""
humanizer.py - remove bulleted lists, markdown bold indicators, titles, and
various other obviously-AI-written textual features, and replace them with more
human-like connective text.

This module provides `humanize(text)`, which preserves the basic content
of the original text, but with a more human-like straight-prose expression.
It takes the text to rewrite as its only argument and returns the result.
"""
import re
import random
import argparse
from typing import List


# Connective sentence openers used to turn a "Title: body" bullet into prose.
# Each template has a single `{topic}` placeholder; `choose_intro` picks one
# at random and fills it with the stripped, lower-cased bullet title via
# `str.format`. Every template ends with a space so the bullet body can be
# concatenated directly after it.
INTRO_PHRASES = [
    "On {topic}, ",
    "On the {topic} issue, ",
    "When it comes to {topic}, ",
    "As for {topic}, ",
    "Another thing is {topic}, ",  # yep, it's a comma splice! We're human.
    "People often claim that {topic}, but ",
    "People might say {topic}, but "
]

def strip_markdown(text: str) -> str:
    """
    Remove markdown bold (`**text**`) and italic (`*text*`) markers, keeping
    the text they wrap.

    An asterisk only counts as an opening emphasis marker when it is not
    followed by whitespace, and as a closing marker when it is not preceded
    by whitespace. This keeps a `* ` bullet marker at the start of a line
    (or a spaced `*` such as in "2 * 3") from being paired with a later
    asterisk on the same line and silently removed.

    Returns the text with the emphasis markers stripped.
    """
    # Bold first, so the single-asterisk pass never sees the halves of `**`.
    text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)
    # Italic: the inner side of each asterisk must not be whitespace.
    text = re.sub(r"\*(?!\s)(.*?)(?<!\s)\*", r"\1", text)
    return text

def is_bullet(line: str) -> bool:
    """
    Report whether `line` begins with a list marker.

    Recognised markers (optionally indented, and always followed by at
    least one whitespace character):
        * item
        - item
        • item
        1. item
        1) item
    """
    marker = re.match(r"^\s*(?:[*\-•]|(?:\d+[.)]))\s+", line)
    return marker is not None

def extract_bullet_text(line: str) -> str:
    """
    Return `line` without its leading list marker and surrounding whitespace.

    A line that does not start with a marker is returned stripped but
    otherwise unchanged.
    """
    marker_pattern = r"^\s*(?:[*\-•]|(?:\d+[.)]))\s+"
    without_marker = re.sub(marker_pattern, "", line)
    return without_marker.strip()

def choose_intro(topic: str) -> str:
    """
    Build a random connective opener for `topic`.

    One template is drawn from `INTRO_PHRASES` with `random.choice`, and its
    `{topic}` placeholder is filled with the stripped, lower-cased topic.
    """
    template = random.choice(INTRO_PHRASES)
    normalized_topic = topic.strip().lower()
    return template.format(topic=normalized_topic)

def collapse_list(items: List[str]) -> str:
    """
    Join `items` into an English-style inline list.

    Returns:
        ""                for no items (previously raised IndexError),
        "a"               for one item,
        "a and b"         for two items,
        "a, b, and c"     for three or more (with a serial comma).
    """
    if not items:
        return ""
    if len(items) <= 2:
        return " and ".join(items)
    *leading, last = items
    return ", ".join(leading) + f", and {last}"

def lowercase_initial(text: str) -> str:
    """
    Return `text` with its first alphabetic character lower-cased.

    Anything before that character (quotes, whitespace, digits,
    punctuation) and everything after it is left untouched. Text with no
    alphabetic character is returned unchanged.
    """
    for position, char in enumerate(text):
        if char.isalpha():
            return text[:position] + char.lower() + text[position + 1:]
    return text

def normalize_inline_bullets(text: str) -> str:
    """
    Turn inline bullet markers into real line-starting bullets.

    Example:
      "pay: * Sales taxes... * Property taxes..."
    becomes:
      "pay:\n* Sales taxes...\n* Property taxes..."

    A marker is `*`, `-`, `•`, or a 1-2 digit number followed by `.` or `)`,
    and it must have whitespace on both sides. Numbered markers are capped
    at two digits so that a sentence ending in a longer number (typically a
    year, e.g. "... in 2020. Then ...") is not mistaken for list item
    "2020." and split off onto its own line.

    NOTE: this is a heuristic. A spaced hyphen in prose ("this - that") or a
    sentence ending in a 1-2 digit number is still treated as a marker.
    """
    # Put a newline before any bullet marker that is preceded by whitespace,
    # but avoid changing bullets that are already at the start of a line.
    text = re.sub(r"(?m)(?<!^)\s+([*\-•])\s+", r"\n\1 ", text)

    # Also handle numbered bullets like " 1) foo" or " 1. foo". `\d{1,2}`
    # cannot match part of a longer number: the preceding `\s+` pins the
    # start of the digits and `[.)]` must follow immediately after them.
    text = re.sub(r"(?m)(?<!^)\s+(\d{1,2}[.)])\s+", r"\n\1 ", text)

    return text

def _finish_sentence(sentence: str, tail_items: List[str]) -> str:
    """Return `sentence` with any pending sub-bullet items appended inline."""
    if not tail_items:
        return sentence
    # Drop trailing periods and lower-case each item so the items read as one
    # comma-separated list rather than as separate sentences.
    clean_items = [lowercase_initial(ti.rstrip(".")) for ti in tail_items]
    return sentence + " " + collapse_list(clean_items)


def humanize_chunk(text: str) -> str:
    """
    Rewrite one paragraph of bulleted/markdown text as plain prose.

    Inline bullets are first split onto their own lines and markdown
    emphasis markers are stripped. Each non-blank line is then handled as
    one of:

    * a "heading" bullet (its text contains ":"): starts a new sentence made
      of a random intro built from the part before the first colon, followed
      by the lower-cased remainder;
    * a sub-bullet (no ":"): collected and later appended to the current
      heading sentence as an inline list; if there is no current heading
      sentence it is emitted on its own;
    * a normal line: emitted as-is, after flushing any pending heading
      sentence.

    Returns the resulting pieces joined by blank lines, with runs of spaces
    and tabs collapsed to a single space.
    """
    text = normalize_inline_bullets(text)
    text = strip_markdown(text)

    output: List[str] = []
    current_sentence = None        # heading sentence still accepting items
    tail_items: List[str] = []     # sub-bullets waiting to join that sentence

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        if not is_bullet(line):
            # Normal line flushes everything
            if current_sentence:
                output.append(_finish_sentence(current_sentence, tail_items))
                current_sentence = None
                tail_items = []
            output.append(line)
            continue

        item = extract_bullet_text(line)

        if ":" in item:
            # Heading bullet: flush previous sentence first
            if current_sentence:
                output.append(_finish_sentence(current_sentence, tail_items))
                tail_items = []
            title, rest = item.split(":", 1)
            body = lowercase_initial(rest.strip())
            current_sentence = choose_intro(title) + body
        elif current_sentence:
            # Sub-bullet: belongs to current heading
            tail_items.append(item)
        else:
            # orphan bullet (rare, but handle)
            output.append(item)

    # Final flush
    if current_sentence:
        output.append(_finish_sentence(current_sentence, tail_items))

    result = "\n\n".join(output)
    result = re.sub(r"[ \t]+", " ", result)
    return result

def humanize(text: str) -> str:
    """
    Rewrite `text` as plain prose, one paragraph at a time.

    The stripped input is split wherever a blank (or whitespace-only) line
    separates paragraphs. Each paragraph is passed through `humanize_chunk`
    independently, and the results are joined with blank lines.
    """
    chunks = re.split(r"\n\s*\n", text.strip())
    return "\n\n".join(humanize_chunk(chunk) for chunk in chunks)


def parse_args():
    """
    Parse the command-line arguments from `sys.argv`.

    The only option is the boolean `--verbose` flag; the returned namespace
    exposes it as `.verbose`.
    """
    description = (
        "Interactive 'humanizer': replaces obviously AI-written "
        "content with more human-like comment."
    )
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument(
        "--verbose",
        help="Print verbose output for debugging.",
        action="store_true",
    )
    return cli.parse_args()

if __name__ == "__main__":

    # Fixed seed so the randomly chosen intro phrases are the same every run.
    random.seed(123)

    # NOTE: `args.verbose` is parsed but not consulted anywhere below; the
    # call is kept so `--help` works and unknown options are rejected.
    args = parse_args()

    sample = """
* **Free Healthcare:** Undocumented immigrants generally do not receive free, comprehensive healthcare.
* **Other Benefits:** The vast majority of federally funded public benefits require legal status.
* **No Taxes:** This is a common misconception.
    * Sales taxes
    * Property taxes
    * Federal and state income taxes
"""
    print(f"\nSample humanized version:\n{humanize(sample)}")

    # Interactive loop: an empty line, the word "done", or end-of-input stops.
    prompt = "\nEnter text or filename (ending in .txt): "
    while True:
        try:
            s = input(prompt)
        except EOFError:
            # stdin was closed (Ctrl-D, or piped input ran out): exit quietly
            # instead of dying with a traceback.
            break
        if not s or s == "done":
            break
        prompt = "Enter text: "

        if s.endswith(".txt"):
            try:
                with open(s, encoding='utf-8') as f:
                    s = f.read()
            except (OSError, UnicodeDecodeError) as exc:
                # A mistyped or unreadable filename should not end the
                # session; report it and ask again.
                print(f"\nCould not read {s!r}: {exc}")
                continue

        humanized = humanize(s)
        print(f"\nHumanized version: {humanized}")