# Source: Hugging Face Space "Chungulus/Humanizer_Pro" (status: Sleeping)
# File size: 24,139 bytes — revision fea8d44

"""
Unified Humanization Pipeline
Chains three humanization approaches in optimal order for maximum AI-detection bypass.

Pipeline Order:
  Stage 1: T5 Humanizer (a.py) — fine-tuned on 39k samples, best initial paraphrase
  Stage 2: Qwen LLM Rewrite (b.py) — deep semantic rewrite via instruction-tuned LLM
  Stage 3: Multi-Pass Cleanup (c.py) — AI pattern removal, restructuring, contractions, human touches
  Verify:  RoBERTa AI Detector (b.py) — sentence-level AI probability check
"""

import gradio as gr
import torch
import re
import random
import math
import numpy as np
import os
from collections import defaultdict, Counter
from typing import List, Dict, Tuple
from transformers import (
    pipeline as hf_pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    T5Tokenizer,
    T5ForConditionalGeneration,
    GenerationConfig,
)

# ── NLTK setup ───────────────────────────────────────────────────────
import ssl
import nltk

# Work around SSL certificate failures (seen on macOS) when NLTK downloads data.
# NOTE(review): this swaps the process-wide default HTTPS context factory for
# the *unverified* one, so certificate checks are disabled for every caller of
# the default context, not just NLTK — confirm this is acceptable in deployment.
try:
    ssl._create_default_https_context = ssl._create_unverified_context
except AttributeError:
    # Interpreter does not expose the private hook; keep the default context.
    pass

# presumably silences the Hugging Face tokenizers fork/parallelism warning — TODO confirm
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Keep NLTK data under ~/nltk_data and make it the first search location.
NLTK_DIR = os.path.join(os.path.expanduser("~"), "nltk_data")
os.makedirs(NLTK_DIR, exist_ok=True)
nltk.data.path.insert(0, NLTK_DIR)

# Best-effort fetch of every NLTK resource requested by this module.
# Download errors are swallowed here; a resource that is still missing is
# expected to surface later, when the corpus/tokenizer is first used.
for _res in ["punkt", "punkt_tab", "averaged_perceptron_tagger",
             "stopwords", "wordnet", "omw-1.4"]:
    try:
        nltk.download(_res, download_dir=NLTK_DIR, quiet=True)
    except Exception:
        pass

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import wordnet, stopwords

# "cuda" when a CUDA device is visible to torch, else "cpu"; read by the Qwen
# and detector loaders to pick dtype / device placement.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# English stop words as a set for O(1) membership tests in _enhance_vocabulary.
STOP_WORDS = set(stopwords.words("english"))

# =====================================================================
#  STAGE 1 — T5 Humanizer Model (from a.py)
#  Fine-tuned on 39,776 humanization samples.  Best initial paraphrase.
# =====================================================================

# Lazily populated singletons; both are set together by _load_t5().
_t5_model = None
_t5_tokenizer = None

def _load_t5():
    """Return the (model, tokenizer) pair for Stage 1, loading it on first use.

    The pair is cached in module globals, so only the first call pays the
    cost of ``from_pretrained``.
    """
    global _t5_model, _t5_tokenizer
    if _t5_model is not None:
        return _t5_model, _t5_tokenizer

    print("Loading Stage 1: T5 Humanizer model …")
    checkpoint = "harryroger798/humanizer-model-v3"
    _t5_tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    _t5_model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    print("  Stage 1 ready.")
    return _t5_model, _t5_tokenizer


def stage1_t5_humanize(text: str) -> str:
    """Paraphrase ``text`` with the fine-tuned T5 humanizer.

    Blank input is returned untouched.  The prompt is truncated to 512
    tokens and at most 512 tokens are generated.  If the decoded output
    looks degenerate (more than 10 words and a single word accounting for
    over 30% of them), the original ``text`` is returned instead.
    """
    if not text.strip():
        return text

    model, tokenizer = _load_t5()

    encoded = tokenizer(
        f"humanize: {text}",
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    generation_kwargs = dict(
        max_length=512,
        num_beams=4,
        early_stopping=True,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
        repetition_penalty=2.5,
        no_repeat_ngram_size=3,
        length_penalty=1.0,
    )
    generated = model.generate(**encoded, **generation_kwargs)
    candidate = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Repetition guard: short outputs are always accepted.
    tokens = candidate.split()
    if len(tokens) <= 10:
        return candidate

    # Count of the single most frequent word in the output.
    top_count = Counter(tokens).most_common(1)[0][1]
    looping = top_count > len(tokens) * 0.3
    return text if looping else candidate


# =====================================================================
#  STAGE 2 — Qwen LLM Rewrite (from b.py)
#  Instruction-tuned 1.5B model does a deep semantic rewrite.
# =====================================================================

# Lazily created text-generation pipeline for Stage 2.
_qwen_pipe = None

def _load_qwen():
    """Return the cached Qwen text-generation pipeline, building it on first use.

    On CUDA the model is loaded in float16 with ``device_map="auto"``;
    otherwise it is loaded in float32 with no device map.
    """
    global _qwen_pipe
    if _qwen_pipe is not None:
        return _qwen_pipe

    print("Loading Stage 2: Qwen 2.5-1.5B-Instruct …")
    model_id = "Qwen/Qwen2.5-1.5B-Instruct"
    on_gpu = DEVICE == "cuda"

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16 if on_gpu else torch.float32,
        device_map="auto" if on_gpu else None,
    )
    _qwen_pipe = hf_pipeline("text-generation", model=model, tokenizer=tokenizer)
    print("  Stage 2 ready.")
    return _qwen_pipe


# Style name → instruction sentence placed at the top of the user turn in
# stage2_qwen_rewrite.  Keys match the choices of the "Rewrite Style" dropdown;
# an unknown style falls back to the "Natural" entry at lookup time.
REWRITE_PROMPTS = {
    "Natural": "Rewrite this to sound completely natural, human-written — vary sentence length, use contractions, slight imperfections.",
    "Casual": "Rewrite this in a relaxed, friendly, conversational tone like a real person chatting.",
    "Academic": "Rewrite this in clear, formal academic style with precise and sophisticated language.",
    "Professional": "Rewrite this in a crisp, professional business tone — confident and authoritative.",
}


def stage2_qwen_rewrite(text: str, style: str = "Natural", intensity: float = 0.7) -> str:
    """Rewrite ``text`` with the Qwen instruction-tuned LLM.

    Args:
        text: Text to rewrite.  Blank input is returned unchanged.
        style: Key into ``REWRITE_PROMPTS``; unknown values fall back to
            ``"Natural"``.
        intensity: Scales the sampling temperature linearly
            (``0.4 + intensity * 0.5``).

    Returns:
        The model's rewrite, or the original ``text`` when generation fails
        or produces an empty string.
    """
    if not text.strip():
        return text
    pipe = _load_qwen()

    tone = REWRITE_PROMPTS.get(style, REWRITE_PROMPTS["Natural"])

    # ChatML-style prompt, ending with an open assistant turn for the model
    # to complete.
    prompt = (
        "<|im_start|>system\n"
        "You are an expert editor that removes AI stiffness and makes text feel authentically human.\n"
        "Keep original meaning 100%. Improve flow, rhythm, vocabulary variety. "
        "Output ONLY the rewritten text.<|im_end|>\n"
        f"<|im_start|>user\n{tone}\nText:\n{text}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    gen_config = GenerationConfig(
        max_new_tokens=600,
        temperature=0.4 + float(intensity) * 0.5,
        top_p=0.92,
        repetition_penalty=1.08,
        do_sample=True,
        pad_token_id=pipe.tokenizer.eos_token_id,
        eos_token_id=pipe.tokenizer.eos_token_id,
    )
    # Only max_new_tokens should bound the output length.
    gen_config.max_length = None

    try:
        # Ask for the continuation only.  The previous implementation split
        # the full text on the first "assistant", which broke whenever the
        # user's text (echoed inside the prompt) contained that word.
        output = pipe(
            prompt,
            generation_config=gen_config,
            num_return_sequences=1,
            return_full_text=False,
        )[0]["generated_text"]
        # Defensive: drop the prompt if it was echoed back anyway.
        if output.startswith(prompt):
            output = output[len(prompt):]
        rewritten = output.strip()
        return rewritten if rewritten else text
    except Exception as e:
        # Best-effort stage: on any failure keep the pipeline going with
        # the unmodified input.
        print(f"Stage 2 error: {e}")
        return text


# =====================================================================
#  STAGE 3 — Multi-Pass Cleanup (from c.py, optimized)
#  Removes AI-flagged patterns, restructures sentences, adds
#  contractions and human touches.  Conflicts with a.py resolved:
#  - No contraction EXPANSION (a.py did this, we skip it)
#  - Synonym direction is casual-ward only
# =====================================================================

# AI-flagged words/phrases → more natural replacements.
# Keys are regular expressions (applied case-insensitively by
# _replace_ai_patterns); values are the candidate replacements, one of which is
# picked at random per match.
AI_PATTERNS = {
    r"\bdelve into\b": ["explore", "examine", "look into", "dig into", "study"],
    # "(?:up)?on" covers both "embark on" and "embark upon".
    r"\bembark (?:up)?on\b": ["begin", "start", "kick off", "launch", "set out"],
    r"\ba testament to\b": ["proof of", "evidence of", "shows", "reflects"],
    r"\blandscape of\b": ["world of", "field of", "area of", "space of"],
    r"\bnavigating\b": ["handling", "managing", "dealing with", "tackling"],
    r"\bmeticulous\b": ["careful", "thorough", "detailed", "precise"],
    r"\bintricate\b": ["complex", "detailed", "elaborate", "complicated"],
    r"\bmyriad\b": ["many", "numerous", "various", "lots of"],
    r"\bplethora\b": ["abundance", "wealth", "range", "loads"],
    r"\bparadigm\b": ["model", "framework", "approach", "method"],
    r"\bsynergy\b": ["teamwork", "cooperation", "collaboration"],
    r"\bleverage\b": ["use", "employ", "tap into", "make use of"],
    r"\bfacilitate\b": ["help", "enable", "support", "make easier"],
    r"\boptimize\b": ["improve", "enhance", "refine", "boost"],
    r"\bstreamline\b": ["simplify", "improve", "smooth out"],
    r"\brobust\b": ["strong", "reliable", "solid", "effective"],
    r"\bseamless\b": ["smooth", "easy", "fluid", "effortless"],
    r"\binnovative\b": ["creative", "original", "new", "fresh"],
    r"\bcutting-edge\b": ["advanced", "modern", "latest", "leading"],
    r"\bstate-of-the-art\b": ["advanced", "modern", "top-notch"],
    r"\bfurthermore\b": ["also", "plus", "on top of that", "besides"],
    r"\bmoreover\b": ["also", "plus", "what's more", "besides"],
    r"\bnevertheless\b": ["still", "yet", "even so", "all the same"],
    r"\bconsequently\b": ["so", "as a result", "because of this"],
    r"\bin conclusion\b": ["finally", "to wrap up", "in the end", "lastly"],
    r"\bin order to\b": ["to", "so we can", "aiming to"],
    r"\bdue to the fact that\b": ["because", "since", "given that"],
    r"\bwith regard to\b": ["about", "regarding", "when it comes to"],
    r"\bin terms of\b": ["regarding", "as for", "about"],
    r"\bprior to\b": ["before", "ahead of", "earlier than"],
    r"\bsubsequent to\b": ["after", "following", "once"],
    r"\bcomprehensive\b": ["complete", "thorough", "detailed", "full"],
    r"\bfundamental\b": ["basic", "essential", "core", "key"],
    r"\bsubstantial\b": ["significant", "considerable", "big", "major"],
    r"\bimplement\b": ["put in place", "carry out", "apply", "use"],
    r"\butilize\b": ["use", "employ", "make use of", "tap into"],
    r"\bdemonstrate\b": ["show", "prove", "reveal", "display"],
    r"\bestablish\b": ["set up", "create", "build", "start"],
    r"\bmaintain\b": ["keep", "preserve", "continue", "sustain"],
    r"\bobtain\b": ["get", "gain", "secure", "pick up"],
}

# Contractions to ADD (making text sound human/casual).
# Keys are word-bounded regexes that _add_contractions applies
# case-insensitively; values are the contracted forms substituted for a match.
CONTRACTIONS = {
    r"\bit is\b": "it's", r"\bthat is\b": "that's", r"\bthere is\b": "there's",
    r"\bwho is\b": "who's", r"\bwhat is\b": "what's", r"\bwhere is\b": "where's",
    r"\bthey are\b": "they're", r"\bwe are\b": "we're", r"\byou are\b": "you're",
    r"\bI am\b": "I'm", r"\bhe is\b": "he's", r"\bshe is\b": "she's",
    r"\bcannot\b": "can't", r"\bdo not\b": "don't", r"\bdoes not\b": "doesn't",
    r"\bwill not\b": "won't", r"\bwould not\b": "wouldn't",
    r"\bshould not\b": "shouldn't", r"\bcould not\b": "couldn't",
    r"\bhave not\b": "haven't", r"\bhas not\b": "hasn't", r"\bhad not\b": "hadn't",
    r"\bis not\b": "isn't", r"\bare not\b": "aren't",
    r"\bwas not\b": "wasn't", r"\bwere not\b": "weren't",
    r"\blet us\b": "let's", r"\bI will\b": "I'll", r"\bI would\b": "I'd",
    r"\byou will\b": "you'll", r"\bwe will\b": "we'll", r"\bthey will\b": "they'll",
}

# Discourse markers that _add_human_touches may prepend to a sentence other
# than the first one.  Each entry already carries its trailing comma.
HUMAN_STARTERS = [
    "Actually,", "Honestly,", "Basically,", "Really,", "Generally,",
    "Usually,", "Often,", "Clearly,", "Naturally,", "Definitely,",
    "Interestingly,", "What's more,", "Plus,", "Also,", "Besides,",
    "In fact,", "Of course,", "Frankly,", "To be honest,", "The thing is,",
]

# Longer lead-in phrases that _add_human_touches inserts in front of a
# sentence with a much lower probability than HUMAN_STARTERS.
NATURAL_TRANSITIONS = [
    "And here's the thing:", "But here's what's interesting:",
    "So, what does this mean?", "Here's why this matters:",
    "Think about it this way:", "The reality is:", "The truth is:",
]

# Hand-picked synonym groups consulted by _enhance_vocabulary before WordNet.
# A repeated word that equals a key or appears in its list may be swapped for
# any other member of the same group (key included).
# NOTE(review): several groups point toward more formal words (e.g. "shows" →
# "demonstrates"), which looks at odds with the "casual-ward only" remark in
# the Stage 3 header and with AI_PATTERNS mapping "demonstrate" → "show" —
# confirm this is intended.
WORD_GROUPS = {
    "analyze": ["examine", "study", "investigate", "explore", "review"],
    "important": ["crucial", "vital", "essential", "key", "critical"],
    "shows": ["demonstrates", "reveals", "indicates", "displays"],
    "understand": ["grasp", "realize", "recognize", "appreciate"],
    "develop": ["create", "build", "form", "generate", "produce"],
    "improve": ["enhance", "refine", "advance", "boost", "better"],
    "consider": ["think about", "evaluate", "contemplate", "ponder"],
    "different": ["various", "diverse", "distinct", "alternative"],
    "effective": ["successful", "efficient", "productive", "useful"],
    "significant": ["important", "notable", "considerable", "major"],
}


def _replace_ai_patterns(text: str, prob: float = 0.85) -> str:
    """Replace known AI-flagged words/phrases with more natural alternatives.

    Every match of every ``AI_PATTERNS`` regex (case-insensitive) is replaced
    independently with probability ``prob`` by a randomly chosen alternative.
    If the matched text started with an uppercase letter, the replacement is
    capitalised too, so "Furthermore, …" becomes "Also, …" rather than
    "also, …".

    Args:
        text: Input text.
        prob: Per-match replacement probability in ``[0, 1]``.

    Returns:
        The text with the selected matches replaced.
    """
    for pattern, replacements in AI_PATTERNS.items():
        # Walk matches right-to-left so that splicing a replacement of a
        # different length never invalidates the offsets still to be used.
        for match in reversed(list(re.finditer(pattern, text, re.IGNORECASE))):
            if random.random() >= prob:
                continue
            replacement = random.choice(replacements)
            if match.group(0)[:1].isupper():
                replacement = replacement[:1].upper() + replacement[1:]
            text = text[: match.start()] + replacement + text[match.end():]
    return text


def _add_contractions(text: str, prob: float = 0.7) -> str:
    """Introduce contractions from ``CONTRACTIONS`` into ``text``.

    For each pattern that occurs in the text, all of its occurrences are
    contracted with probability ``prob`` (one roll per pattern).  Two
    safeguards apply per occurrence:

    * the capitalisation of the first letter is preserved
      ("It is" → "It's"), and
    * non-negative contractions are not formed directly before punctuation
      or at the end of the text ("what it is." stays as is, because
      "what it's." is not valid English).  Negations such as "don't" are
      fine in that position and are still contracted.

    Args:
        text: Input text.
        prob: Probability of applying each matching pattern.

    Returns:
        The text with the selected contractions applied.
    """
    clause_end = re.compile(r"\s*(?:[.,!?;:)]|$)")

    def _contract_with(contraction: str):
        negation = contraction.endswith("n't")

        def _substitute(match):
            original = match.group(0)
            if not negation and clause_end.match(match.string, match.end()):
                return original
            if original[:1].isupper():
                return contraction[:1].upper() + contraction[1:]
            return contraction

        return _substitute

    for pattern, contraction in CONTRACTIONS.items():
        if re.search(pattern, text, re.IGNORECASE) and random.random() < prob:
            text = re.sub(pattern, _contract_with(contraction), text, flags=re.IGNORECASE)
    return text


# Sentence openers that can safely lose their capital when their clause is
# moved away from the start of the sentence.  Anything else (names, "I",
# acronyms) keeps its original casing.
_LOWERCASE_SAFE_OPENERS = frozenset({
    "the", "a", "an", "this", "that", "these", "those", "it", "he", "she",
    "we", "they", "you", "there", "here", "my", "our", "your", "his", "her",
    "its", "their", "some", "many", "most", "all", "each", "every", "no",
    "if", "when", "while", "although", "since", "because", "as", "and",
    "but", "so", "in", "on", "at", "for", "with", "by", "to", "from",
    "after", "before",
})


def _restructure_sentence(sentence: str) -> str:
    """Swap the order of a main clause and its adverbial clause.

    Two deterministic rewrites are tried in order and the first that applies
    wins:

    1. ``"<main>, because|since|when|if|although|while <sub>."`` becomes
       ``"Because <sub>, <main>."``
    2. ``"Although|While|Since|Because|When|If <sub>, <main>."`` becomes
       ``"<Main>, although <sub>."``

    Capitalisation is adjusted for the new positions: the new first word is
    capitalised, a moved conjunction is lowercased, and a moved main clause
    is lowercased only when its first word is a common function word.

    Args:
        sentence: A single sentence ending in ``.``, ``!`` or ``?``.

    Returns:
        The restructured sentence, or ``sentence`` unchanged when no rewrite
        applies or the result would have fewer than three words.
    """
    def _capitalize(clause: str) -> str:
        return clause[:1].upper() + clause[1:]

    def _decapitalize(clause: str) -> str:
        opener = re.match(r"[A-Za-z]+", clause)
        if opener is None:
            return clause
        word = opener.group(0)
        # Multi-letter all-caps openers are treated as acronyms.
        if len(word) > 1 and word.isupper():
            return clause
        if word.lower() not in _LOWERCASE_SAFE_OPENERS:
            return clause
        return clause[:1].lower() + clause[1:]

    def _subordinate_first(match):
        main, conj, sub, end = match.groups()
        return f"{conj.capitalize()} {sub}, {_decapitalize(main)}{end}"

    def _subordinate_last(match):
        conj, sub, main, end = match.groups()
        return f"{_capitalize(main)}, {conj.lower()} {sub}{end}"

    strategies = [
        # Trailing adverbial clause → move it to the front.
        (r"^(.*?),\s*(because|since|when|if|although|while)\s+(.*?)([.!?])$",
         _subordinate_first),
        # Leading adverbial clause → move it to the end.
        (r"^(Although|While|Since|Because|When|If)\s+(.*?),\s*(.*?)([.!?])$",
         _subordinate_last),
    ]
    for pattern, rebuild in strategies:
        if re.search(pattern, sentence, re.IGNORECASE):
            result = re.sub(pattern, rebuild, sentence, flags=re.IGNORECASE)
            if len(result.split()) >= 3:
                return result.strip()
    return sentence


def _split_long_sentence(sentence: str) -> str:
    """Split a long compound sentence into two sentences.

    Sentences of 15 words or fewer are returned unchanged.  Otherwise the
    first of ``", and "``, ``", but "``, ``", so "``, ``", yet "`` (tried in
    that order) that occurs with more than three words on each side is used
    as the split point.  The conjunction is replaced by a sentence-initial
    connector with the *same* logical role: additive for "and", contrastive
    for "but"/"yet", causal for "so".

    The second half is kept exactly as written, so "I", names and acronyms
    keep their capitals.

    Args:
        sentence: A single sentence.

    Returns:
        ``"<first half>. <Connector> <second half>"`` or the original
        sentence when no suitable split point exists.
    """
    if len(sentence.split()) <= 15:
        return sentence

    contrast = ["But", "Still,", "Even so,", "That said,"]
    # Insertion order is the order in which conjunctions are tried.
    connectors_for = {
        ", and ": ["Also,", "Plus,", "What's more,", "On top of that,"],
        ", but ": contrast,
        ", so ": ["So,", "Because of that,", "As a result,"],
        ", yet ": contrast,
    }
    for conjunction, connectors in connectors_for.items():
        if conjunction not in sentence:
            continue
        head, tail = sentence.split(conjunction, 1)
        if len(head.split()) > 3 and len(tail.split()) > 3:
            first = head.strip().rstrip(".") + "."
            second = tail.strip()
            return f"{first} {random.choice(connectors)} {second}"
    return sentence


def _enhance_vocabulary(text: str, prob: float = 0.3) -> str:
    """Replace some occurrences of repeated words with synonyms.

    A word is a candidate when it is purely alphabetic, longer than three
    characters, not a stop word and still occurs more than once in the
    text.  Each candidate occurrence is replaced with probability ``prob``,
    using ``WORD_GROUPS`` first and the first two WordNet synsets as a
    fallback.

    Words are substituted *in place*, so spacing, punctuation, quotes and
    contractions of the input are left exactly as they were (rebuilding the
    text from tokenizer output would turn "don't" into "do n't").  A
    replacement inherits the capitalisation of the word it replaces.

    Args:
        text: Input text.
        prob: Per-occurrence replacement probability.

    Returns:
        The text with the selected words replaced.
    """
    # A token is a run of letters, optionally chained with apostrophes or
    # hyphens.  Contractions and hyphenated compounds are therefore matched
    # as one non-alphabetic token and skipped by the isalpha() test below.
    token_re = re.compile(r"[^\W\d_]+(?:['’\-][^\W\d_]+)*")

    def _countable(token: str) -> bool:
        return token.isalpha() and len(token) > 3

    usage = Counter(
        m.group(0).lower() for m in token_re.finditer(text) if _countable(m.group(0))
    )

    def _pick_synonym(word: str, lowered: str):
        """Return a replacement for ``word`` or ``None`` if there is none."""
        # Curated groups take precedence over WordNet.
        for base, synonyms in WORD_GROUPS.items():
            if lowered == base or lowered in synonyms:
                candidates = [s for s in [base, *synonyms] if s != lowered]
                if candidates:
                    return random.choice(candidates)
        # WordNet fallback: lemmas of the first two synsets, similar length.
        wordnet_candidates = []
        for synset in wordnet.synsets(lowered)[:2]:
            for lemma in synset.lemmas():
                name = lemma.name().replace("_", " ")
                if (name.lower() != lowered and len(name) > 2
                        and abs(len(name) - len(word)) <= 3):
                    wordnet_candidates.append(name)
        if wordnet_candidates:
            return random.choice(wordnet_candidates[:3])
        return None

    def _swap(match):
        word = match.group(0)
        lowered = word.lower()
        if not (_countable(word) and lowered not in STOP_WORDS
                and usage.get(lowered, 0) > 1 and random.random() < prob):
            return word
        synonym = _pick_synonym(word, lowered)
        if synonym is None:
            return word
        # One fewer repetition left; stops once the word is no longer repeated.
        usage[lowered] -= 1
        if word[0].isupper():
            synonym = synonym[:1].upper() + synonym[1:]
        return synonym

    return token_re.sub(_swap, text)


def _add_human_touches(text: str, prob: float = 0.25) -> str:
    """Prepend conversational starters or transitions to some sentences.

    The first sentence is never touched.  For every later sentence:

    * with probability ``prob`` (and only if it has more than six words) a
      random ``HUMAN_STARTERS`` entry is prepended;
    * otherwise, with probability ``prob * 0.2``, a random
      ``NATURAL_TRANSITIONS`` entry is prepended.

    At most one of the two is applied per sentence, so lead-ins are never
    stacked ("The truth is: honestly, …").  The sentence's first letter is
    lowercased after the lead-in, except for the pronoun "I" (including
    "I'm", "I'll", …) and all-caps acronyms.

    Args:
        text: Input text; it is split with ``sent_tokenize``.
        prob: Base probability for adding a starter.

    Returns:
        The sentences joined by single spaces.
    """
    def _soften_lead(sentence: str) -> str:
        words = sentence.split(None, 1)
        lead = words[0].strip("\"'“”‘’()[].,;:!?") if words else ""
        keeps_capital = (
            lead == "I"
            or lead.startswith(("I'", "I’"))
            or (len(lead) > 1 and lead.isupper())
        )
        if keeps_capital:
            return sentence
        return sentence[:1].lower() + sentence[1:]

    result = []
    for index, sentence in enumerate(sent_tokenize(text)):
        if index > 0:
            if random.random() < prob and len(sentence.split()) > 6:
                starter = random.choice(HUMAN_STARTERS)
                sentence = f"{starter} {_soften_lead(sentence)}"
            elif random.random() < prob * 0.2:
                transition = random.choice(NATURAL_TRANSITIONS)
                sentence = f"{transition} {_soften_lead(sentence)}"
        result.append(sentence)
    return " ".join(result)


def _final_cleanup(text: str) -> str:
    """Normalise whitespace and punctuation, then capitalise sentence starts.

    Applied in order: collapse whitespace runs to one space, drop whitespace
    before ``, . ! ? ; :``, force exactly one space between such punctuation
    and a following uppercase ASCII letter, and collapse runs of periods to
    a single period.  Finally every sentence found by ``sent_tokenize`` that
    starts with a lowercase letter gets its first letter uppercased.
    """
    normalizations = (
        (r"\s+", " "),
        (r"\s+([,.!?;:])", r"\1"),
        (r"([,.!?;:])\s*([A-Z])", r"\1 \2"),
        (r"\.+", "."),
    )
    for pattern, replacement in normalizations:
        text = re.sub(pattern, replacement, text)

    capitalised = [
        piece[0].upper() + piece[1:] if piece and piece[0].islower() else piece
        for piece in sent_tokenize(text)
    ]
    return " ".join(capitalised).strip()


def stage3_multipass_cleanup(text: str, intensity: int = 2) -> str:
    """Run the rule-based cleanup passes over ``text``.

    Order of passes: AI-pattern replacement → per-sentence restructuring and
    splitting → vocabulary variation → contractions → human touches → final
    punctuation/capitalisation cleanup.

    ``intensity`` (1, 2 or 3) scales every pass probability by 0.5, 0.75 or
    1.0 respectively; any other value is treated like 2.  Blank input is
    returned unchanged.
    """
    if not text.strip():
        return text

    scale = {1: 0.5, 2: 0.75, 3: 1.0}.get(intensity, 0.75)

    # Pass 1: swap out AI-flagged vocabulary.
    draft = _replace_ai_patterns(text, prob=0.85 * scale)

    # Pass 2: sentence-level restructuring, then splitting of long sentences.
    reshaped = []
    for sentence in sent_tokenize(draft):
        if len(sentence.split()) > 8 and random.random() < 0.5 * scale:
            sentence = _restructure_sentence(sentence)
        if len(sentence.split()) > 15 and random.random() < 0.4 * scale:
            sentence = _split_long_sentence(sentence)
        reshaped.append(sentence)
    draft = " ".join(reshaped)

    # Pass 3: vary repeated words.
    draft = _enhance_vocabulary(draft, prob=0.3 * scale)

    # Pass 4: contractions, then conversational lead-ins.
    draft = _add_contractions(draft, prob=0.7 * scale)
    draft = _add_human_touches(draft, prob=0.25 * scale)

    # Final spacing / punctuation / capitalisation fix-up.
    return _final_cleanup(draft)


# =====================================================================
#  VERIFICATION — RoBERTa AI Detector (from b.py)
# =====================================================================

# Lazily created text-classification pipeline used for verification.
_detector_pipe = None

def _load_detector():
    """Return the cached AI-detector pipeline, creating it on first use.

    On CUDA the pipeline is placed on device 0 in float16; otherwise it runs
    on CPU (``device=-1``) with the default dtype.
    """
    global _detector_pipe
    if _detector_pipe is not None:
        return _detector_pipe

    print("Loading Detector: chatgpt-detector-roberta …")
    on_gpu = DEVICE == "cuda"
    _detector_pipe = hf_pipeline(
        "text-classification",
        model="Hello-SimpleAI/chatgpt-detector-roberta",
        device=0 if on_gpu else -1,
        torch_dtype=torch.float16 if on_gpu else None,
    )
    print("  Detector ready.")
    return _detector_pipe


def verify_detection(text: str) -> str:
    """Run sentence-level AI detection and return an HTML report.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace and
    each piece is classified by the detector pipeline.  The classifier
    returns its top label and that label's score; the score is converted to
    an "AI probability" by taking it as-is when the label denotes
    machine-written text and as ``1 - score`` otherwise.

    Args:
        text: Text to analyse.

    Returns:
        ``"No text to analyze."`` for blank input; otherwise an HTML string
        with an overall average followed by one colour-coded row per
        sentence.  Sentence text is HTML-escaped before being embedded.
    """
    import html  # local import: not part of the file-level imports

    if not text.strip():
        return "No text to analyze."

    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text.strip()) if s.strip()]
    pipe = _load_detector()
    preds = pipe(sentences, truncation=True, max_length=512)

    # Substrings identifying the "machine-written" class.  "chatgpt"/"gpt"
    # are needed for detectors whose positive label is "ChatGPT"; without
    # them such predictions were scored as 1 - score, i.e. reported as human.
    ai_markers = ("fake", "ai", "generated", "chatgpt", "gpt", "machine")

    rows = []
    total_ai = 0.0
    for sent, pred in zip(sentences, preds):
        label = pred["label"].lower()
        score = pred["score"]
        if any(marker in label for marker in ai_markers):
            ai_prob = score * 100
        else:
            ai_prob = (1 - score) * 100
        total_ai += ai_prob

        if ai_prob > 85:
            tag, color = "Very likely AI", "#dc2626"
        elif ai_prob > 60:
            tag, color = "Likely AI", "#d97706"
        else:
            tag, color = "Likely Human", "#16a34a"

        # The sentence comes from user input / model output: escape it so it
        # cannot inject markup into the rendered report.
        rows.append(
            f"<div style='padding:8px;margin:4px 0;border-left:4px solid {color};'>"
            f"<strong>{tag} ({ai_prob:.1f}%)</strong><br>{html.escape(sent)}</div>"
        )

    avg = total_ai / len(sentences) if sentences else 0
    summary = f"<h3>Overall AI probability: {avg:.1f}%</h3>"
    return summary + "".join(rows)


# =====================================================================
#  FULL PIPELINE
# =====================================================================

def run_pipeline(
    text: str,
    style: str = "Natural",
    intensity: float = 0.7,
    use_stage1: bool = True,
    use_stage2: bool = True,
    use_stage3: bool = True,
    cleanup_intensity: int = 2,
    progress=gr.Progress(track_tqdm=False),
) -> Tuple[str, str, str, str]:
    """Chain the enabled stages over ``text`` and verify the result.

    Each enabled stage consumes the output of the previous enabled one.
    The detector always runs on whatever text comes out last.

    Returns:
        ``(stage1_out, stage2_out, final_out, detection_html)``.  The
        per-stage entries are ``""`` for stages that were skipped, and all
        four entries are ``""`` when ``text`` is blank.
    """
    if not text.strip():
        return "", "", "", ""

    stage1_text = ""
    stage2_text = ""
    working = text

    # Stage 1: T5 Humanizer
    if use_stage1:
        progress(0.1, desc="Stage 1: T5 Humanizer …")
        working = stage1_t5_humanize(working)
        stage1_text = working

    # Stage 2: Qwen LLM Rewrite
    if use_stage2:
        progress(0.4, desc="Stage 2: Qwen LLM Rewrite …")
        working = stage2_qwen_rewrite(working, style=style, intensity=intensity)
        stage2_text = working

    # Stage 3: Multi-Pass Cleanup
    if use_stage3:
        progress(0.7, desc="Stage 3: Multi-Pass Cleanup …")
        working = stage3_multipass_cleanup(working, intensity=cleanup_intensity)

    # Verification of the final text
    progress(0.9, desc="Verifying with AI detector …")
    report = verify_detection(working)

    return stage1_text, stage2_text, working, report


# =====================================================================
#  GRADIO UI
# =====================================================================

# Two-column layout: inputs and stage options on the left, per-stage outputs,
# final text and the detector report on the right.
with gr.Blocks(title="Humanization Pipeline") as demo:
    gr.Markdown(
        "# Humanization Pipeline\n"
        "**3-stage chain: T5 Humanizer → Qwen LLM Rewrite → Multi-Pass Cleanup → AI Detection Verify**"
    )

    with gr.Row():
        # ── Left column: input text and pipeline controls ──
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Input Text (AI-generated)",
                placeholder="Paste AI-generated text here …",
                lines=10,
            )

            # Choices mirror the keys of REWRITE_PROMPTS.
            style_dropdown = gr.Dropdown(
                choices=["Natural", "Casual", "Academic", "Professional"],
                value="Natural",
                label="Rewrite Style (Stage 2)",
            )

            # Passed as `intensity` to run_pipeline (scales Stage 2 temperature).
            intensity_slider = gr.Slider(
                minimum=0.1, maximum=1.0, value=0.7, step=0.05,
                label="LLM Rewrite Intensity (Stage 2)",
            )

            # (label, value) pairs; the integer value is what run_pipeline receives.
            cleanup_intensity = gr.Radio(
                choices=[("Light", 1), ("Standard", 2), ("Heavy", 3)],
                value=2,
                label="Cleanup Intensity (Stage 3)",
            )

            # Per-stage on/off switches.
            with gr.Row():
                use_s1 = gr.Checkbox(label="Stage 1: T5 Humanizer", value=True)
                use_s2 = gr.Checkbox(label="Stage 2: Qwen LLM", value=True)
                use_s3 = gr.Checkbox(label="Stage 3: Multi-Pass", value=True)

            run_btn = gr.Button("Run Pipeline", variant="primary", size="lg")

        # ── Right column: intermediate and final results ──
        with gr.Column(scale=1):
            with gr.Accordion("Stage 1 Output (T5 Humanizer)", open=False):
                s1_output = gr.Textbox(label="After Stage 1", lines=5)

            with gr.Accordion("Stage 2 Output (Qwen LLM)", open=False):
                s2_output = gr.Textbox(label="After Stage 2", lines=5)

            final_output = gr.Textbox(
                label="Final Humanized Text",
                lines=10,
            )

            detection_result = gr.HTML(label="AI Detection Verification")

    # Input order must match run_pipeline's positional parameters; the four
    # outputs match its returned tuple.
    run_btn.click(
        fn=run_pipeline,
        inputs=[input_text, style_dropdown, intensity_slider,
                use_s1, use_s2, use_s3, cleanup_intensity],
        outputs=[s1_output, s2_output, final_output, detection_result],
    )

    # Clickable sample inputs that fill the input textbox.
    gr.Examples(
        examples=[
            ["The rapid advancement of artificial intelligence technologies has significantly transformed numerous industries and daily life."],
            ["Machine learning algorithms demonstrate superior performance in pattern recognition tasks across diverse datasets."],
            ["In conclusion, leveraging cutting-edge methodologies facilitates the optimization of robust and seamless solutions."],
        ],
        inputs=input_text,
        label="Test examples (heavily AI-flagged text)",
    )

# Start the web UI only when executed as a script.
# NOTE(review): share=True requests a public share link — confirm that is
# wanted (and meaningful) in the environment this is deployed to.
if __name__ == "__main__":
    demo.launch(debug=False, share=True)