Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| humanizer.py - remove bulleted lists, markdown bold indicators, titles, and | |
| various other obviously-AI-written textual features, and replace them with more | |
| human-like connective text. | |
| This module provides `humanize(text)`, which preserves the basic content | |
| of the original text, but with a more human-like straight-prose expression. | |
| It takes the text to rewrite as its only argument and returns the rewritten text. | |
| """ | |
| import re | |
| import random | |
| import argparse | |
| from typing import List | |
# Sentence openers used by `choose_intro`. Each template has a single
# `{topic}` placeholder that is filled in with the bullet's heading text.
# The order matters: with a fixed random seed, the chosen phrase depends on
# each template's position in this list.
INTRO_PHRASES: List[str] = [
    "On {topic}, ",
    "On the {topic} issue, ",
    "When it comes to {topic}, ",
    "As for {topic}, ",
    # Deliberate comma splice, kept to sound less polished.
    "Another thing is {topic}, ",
    "People often claim that {topic}, but ",
    "People might say {topic}, but ",
]
def strip_markdown(text: str) -> str:
    """
    Remove markdown bold (`**x**`) and italic (`*x*`) markers, keeping `x`.

    Italic markers are only recognised when the emphasised text starts and
    ends with a non-whitespace character. Without that restriction a bullet
    marker such as the leading `*` in "* Bold and *italic*" would be paired
    with the next asterisk on the line and swallowed, and expressions such
    as "2 * 3 * 4" would lose their operators.
    """
    # Bold first, so that "***x***" is reduced to "*x*" and then to "x".
    text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)
    # Opening "*" must be followed by non-space, closing "*" preceded by
    # non-space; "* item" (bullet) therefore never opens an emphasis span.
    text = re.sub(r"\*(?=\S)(.+?)(?<=\S)\*", r"\1", text)
    return text
def is_bullet(line: str) -> bool:
    """
    Report whether `line` begins with a list marker followed by whitespace.

    Recognised markers (optionally preceded by whitespace):
        *  -  •      unordered bullets
        1.  1)       numbered items (any number of digits)
    """
    marker = re.match(r"^\s*(?:[*\-•]|(?:\d+[.)]))\s+", line)
    return marker is not None
def extract_bullet_text(line: str) -> str:
    """
    Return `line` without its leading list marker, stripped of surrounding
    whitespace. A line with no marker is returned stripped but otherwise
    unchanged.
    """
    marker = re.match(r"^\s*(?:[*\-•]|(?:\d+[.)]))\s+", line)
    remainder = line[marker.end():] if marker else line
    return remainder.strip()
def choose_intro(topic: str) -> str:
    """
    Pick a random template from INTRO_PHRASES and fill in `topic`.

    The topic is stripped of surrounding whitespace and lowercased before
    being substituted for the template's `{topic}` placeholder.
    """
    template = random.choice(INTRO_PHRASES)
    normalized_topic = topic.strip().lower()
    return template.format(topic=normalized_topic)
def collapse_list(items: List[str]) -> str:
    """
    Join `items` into an English-style inline list.

    Returns:
        ""                    for no items,
        "a"                   for one item,
        "a and b"             for two items,
        "a, b, and c"         for three or more (with an Oxford comma).
    """
    # Guard the empty case explicitly; indexing items[-1] below would
    # otherwise raise IndexError.
    if not items:
        return ""
    if len(items) == 1:
        return items[0]
    if len(items) == 2:
        return f"{items[0]} and {items[1]}"
    return ", ".join(items[:-1]) + f", and {items[-1]}"
def lowercase_initial(text: str) -> str:
    """
    Return `text` with its first alphabetic character lowercased.

    Anything before that character (quotes, whitespace, digits,
    punctuation) and everything after it is left untouched. Text with no
    alphabetic character is returned unchanged.
    """
    for pos, letter in enumerate(text):
        if letter.isalpha():
            return text[:pos] + letter.lower() + text[pos + 1:]
    return text
def normalize_inline_bullets(text: str) -> str:
    """
    Move bullet markers that appear mid-line onto lines of their own.

    For instance
        "pay: * Sales taxes... * Property taxes..."
    is rewritten as
        "pay:\n* Sales taxes...\n* Property taxes..."
    """
    # Each pattern matches whitespace + marker + whitespace, where the
    # whitespace run does not begin at the start of a line, and rewrites it
    # as "newline + marker + single space".
    marker_patterns = (
        # Unordered markers: *, -, •
        r"(?m)(?<!^)\s+([*\-•])\s+",
        # Numbered markers such as " 1) foo" or " 1. foo"
        r"(?m)(?<!^)\s+(\d+[.)])\s+",
    )
    for pattern in marker_patterns:
        text = re.sub(pattern, r"\n\1 ", text)
    return text
def _finish_sentence(sentence: str, tail_items: List[str]) -> str:
    """Return `sentence` with any collected sub-bullet items appended as an inline list."""
    if not tail_items:
        return sentence
    # Trailing periods are dropped and initials lowercased so the items read
    # as a continuation of the sentence rather than as separate sentences.
    clean_items = [lowercase_initial(ti.rstrip(".")) for ti in tail_items]
    return sentence + " " + collapse_list(clean_items)


def humanize_chunk(text: str) -> str:
    """
    Rewrite one paragraph-sized chunk of text as plain prose.

    Inline bullets are first split onto their own lines and markdown
    bold/italic markers removed. Each non-blank line is then handled as:

    * a bullet containing ":" ("heading bullet") - starts a new sentence
      made of a random intro phrase built from the text before the first
      colon, followed by the text after it;
    * a bullet without ":" - collected as a sub-item of the current heading
      sentence, or emitted as-is when there is no current sentence;
    * any other line - emitted unchanged, after finishing the current
      sentence.

    Finished pieces are joined with blank lines, and runs of spaces/tabs are
    collapsed to a single space.
    """
    text = normalize_inline_bullets(text)
    text = strip_markdown(text)

    output: List[str] = []
    current_sentence = None
    tail_items: List[str] = []

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        if not is_bullet(line):
            # A normal line ends whatever heading sentence was in progress.
            if current_sentence:
                output.append(_finish_sentence(current_sentence, tail_items))
                current_sentence = None
                tail_items = []
            output.append(line)
            continue

        item = extract_bullet_text(line)
        if ":" in item:
            # Heading bullet: finish the previous sentence before starting
            # a new one.
            if current_sentence:
                output.append(_finish_sentence(current_sentence, tail_items))
                tail_items = []
            title, rest = item.split(":", 1)
            body = lowercase_initial(rest.strip())
            current_sentence = choose_intro(title) + body
        elif current_sentence:
            # Sub-bullet: belongs to the current heading sentence.
            tail_items.append(item)
        else:
            # Orphan bullet with no heading to attach to.
            output.append(item)

    # Emit a sentence still in progress at the end of the chunk.
    if current_sentence:
        output.append(_finish_sentence(current_sentence, tail_items))

    result = "\n\n".join(output)
    return re.sub(r"[ \t]+", " ", result)
def humanize(text: str) -> str:
    """
    Rewrite `text` as plain prose, one paragraph at a time.

    The input is stripped, split into chunks wherever a blank (or
    whitespace-only) line occurs, each chunk is passed through
    `humanize_chunk`, and the results are joined with blank lines.
    """
    chunks = re.split(r"\n\s*\n", text.strip())
    return "\n\n".join(map(humanize_chunk, chunks))
def parse_args():
    """
    Parse the command line and return the resulting namespace.

    The only option is the boolean flag `--verbose`, which is False unless
    given.
    """
    description = (
        "Interactive 'humanizer': replaces obviously AI-written "
        "content with more human-like comment."
    )
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument(
        "--verbose",
        help="Print verbose output for debugging.",
        action="store_true",
    )
    return cli.parse_args()
if __name__ == "__main__":
    # Fixed seed so the randomly chosen intro phrases are the same each run.
    random.seed(123)
    # Parsed so that --help and unknown options are handled; the resulting
    # namespace is not consulted further in this script.
    args = parse_args()

    sample = """
* **Free Healthcare:** Undocumented immigrants generally do not receive free, comprehensive healthcare.
* **Other Benefits:** The vast majority of federally funded public benefits require legal status.
* **No Taxes:** This is a common misconception.
* Sales taxes
* Property taxes
* Federal and state income taxes
"""
    print(f"\nSample humanized version:\n{humanize(sample)}")

    # Interactive loop: an empty line or "done" ends the session. Input that
    # ends in ".txt" is treated as a filename whose contents are humanized.
    try:
        s = input("\nEnter text or filename (ending in .txt): ")
        while s and s != "done":
            if s.endswith(".txt"):
                try:
                    with open(s, encoding='utf-8') as f:
                        s = f.read()
                except (OSError, UnicodeDecodeError) as exc:
                    # A missing/unreadable/non-UTF-8 file should not end
                    # the session; report it and prompt again.
                    print(f"\nCould not read {s!r}: {exc}")
                    s = input("Enter text: ")
                    continue
            humanized = humanize(s)
            print(f"\nHumanized version: {humanized}")
            s = input("Enter text: ")
    except (EOFError, KeyboardInterrupt):
        # End of piped input or Ctrl-C/Ctrl-D: exit quietly instead of
        # printing a traceback.
        print()