""" Unified Humanization Pipeline Chains three humanization approaches in optimal order for maximum AI-detection bypass. Pipeline Order: Stage 1: T5 Humanizer (a.py) — fine-tuned on 39k samples, best initial paraphrase Stage 2: Qwen LLM Rewrite (b.py) — deep semantic rewrite via instruction-tuned LLM Stage 3: Multi-Pass Cleanup (c.py) — AI pattern removal, restructuring, contractions, human touches Verify: RoBERTa AI Detector (b.py) — sentence-level AI probability check """ import gradio as gr import torch import re import random import math import numpy as np import os from collections import defaultdict, Counter from typing import List, Dict, Tuple from transformers import ( pipeline as hf_pipeline, AutoTokenizer, AutoModelForCausalLM, T5Tokenizer, T5ForConditionalGeneration, GenerationConfig, ) # ── NLTK setup ─────────────────────────────────────────────────────── import ssl import nltk # Fix SSL certificate issue on macOS try: ssl._create_default_https_context = ssl._create_unverified_context except AttributeError: pass os.environ["TOKENIZERS_PARALLELISM"] = "false" # Use home directory for NLTK data (already downloaded there) NLTK_DIR = os.path.join(os.path.expanduser("~"), "nltk_data") os.makedirs(NLTK_DIR, exist_ok=True) nltk.data.path.insert(0, NLTK_DIR) for _res in ["punkt", "punkt_tab", "averaged_perceptron_tagger", "stopwords", "wordnet", "omw-1.4"]: try: nltk.download(_res, download_dir=NLTK_DIR, quiet=True) except Exception: pass from nltk.tokenize import sent_tokenize, word_tokenize from nltk.corpus import wordnet, stopwords DEVICE = "cuda" if torch.cuda.is_available() else "cpu" STOP_WORDS = set(stopwords.words("english")) # ===================================================================== # STAGE 1 — T5 Humanizer Model (from a.py) # Fine-tuned on 39,776 humanization samples. Best initial paraphrase. # ===================================================================== _t5_model = None _t5_tokenizer = None def _load_t5(): global _t5_model, _t5_tokenizer if _t5_model is None: print("Loading Stage 1: T5 Humanizer model …") MODEL_PATH = "harryroger798/humanizer-model-v3" _t5_tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH) _t5_model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH) print(" Stage 1 ready.") return _t5_model, _t5_tokenizer def stage1_t5_humanize(text: str) -> str: """Initial paraphrase using the fine-tuned T5 humanizer.""" if not text.strip(): return text model, tokenizer = _load_t5() inputs = tokenizer( f"humanize: {text}", return_tensors="pt", max_length=512, truncation=True, ) outputs = model.generate( **inputs, max_length=512, num_beams=4, early_stopping=True, do_sample=True, temperature=0.8, top_p=0.9, repetition_penalty=2.5, no_repeat_ngram_size=3, length_penalty=1.0, ) result = tokenizer.decode(outputs[0], skip_special_tokens=True) # Repetition guard — if model loops, fall back to original text words = result.split() if len(words) > 10: counts = Counter(words) if max(counts.values()) > len(words) * 0.3: return text return result # ===================================================================== # STAGE 2 — Qwen LLM Rewrite (from b.py) # Instruction-tuned 1.5B model does a deep semantic rewrite. # ===================================================================== _qwen_pipe = None def _load_qwen(): global _qwen_pipe if _qwen_pipe is None: print("Loading Stage 2: Qwen 2.5-1.5B-Instruct …") model_id = "Qwen/Qwen2.5-1.5B-Instruct" tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32, device_map="auto" if DEVICE == "cuda" else None, ) _qwen_pipe = hf_pipeline("text-generation", model=model, tokenizer=tokenizer) print(" Stage 2 ready.") return _qwen_pipe REWRITE_PROMPTS = { "Natural": "Rewrite this to sound completely natural, human-written — vary sentence length, use contractions, slight imperfections.", "Casual": "Rewrite this in a relaxed, friendly, conversational tone like a real person chatting.", "Academic": "Rewrite this in clear, formal academic style with precise and sophisticated language.", "Professional": "Rewrite this in a crisp, professional business tone — confident and authoritative.", } def stage2_qwen_rewrite(text: str, style: str = "Natural", intensity: float = 0.7) -> str: """Deep semantic rewrite using Qwen instruction-tuned LLM.""" if not text.strip(): return text pipe = _load_qwen() tone = REWRITE_PROMPTS.get(style, REWRITE_PROMPTS["Natural"]) prompt = ( "<|im_start|>system\n" "You are an expert editor that removes AI stiffness and makes text feel authentically human.\n" "Keep original meaning 100%. Improve flow, rhythm, vocabulary variety. " "Output ONLY the rewritten text.<|im_end|>\n" f"<|im_start|>user\n{tone}\nText:\n{text}<|im_end|>\n" "<|im_start|>assistant\n" ) gen_config = GenerationConfig( max_new_tokens=600, temperature=0.4 + float(intensity) * 0.5, top_p=0.92, repetition_penalty=1.08, do_sample=True, pad_token_id=pipe.tokenizer.eos_token_id, eos_token_id=pipe.tokenizer.eos_token_id, ) gen_config.max_length = None try: output = pipe(prompt, generation_config=gen_config, num_return_sequences=1)[0][ "generated_text" ] if "assistant" in output: rewritten = output.split("assistant", 1)[-1].strip() else: rewritten = output[len(prompt) :].strip() return rewritten.strip() if rewritten.strip() else text except Exception as e: print(f"Stage 2 error: {e}") return text # ===================================================================== # STAGE 3 — Multi-Pass Cleanup (from c.py, optimized) # Removes AI-flagged patterns, restructures sentences, adds # contractions and human touches. Conflicts with a.py resolved: # - No contraction EXPANSION (a.py did this, we skip it) # - Synonym direction is casual-ward only # ===================================================================== # AI-flagged words/phrases → more natural replacements AI_PATTERNS = { r"\bdelve into\b": ["explore", "examine", "look into", "dig into", "study"], r"\bembark upon?\b": ["begin", "start", "kick off", "launch", "set out"], r"\ba testament to\b": ["proof of", "evidence of", "shows", "reflects"], r"\blandscape of\b": ["world of", "field of", "area of", "space of"], r"\bnavigating\b": ["handling", "managing", "dealing with", "tackling"], r"\bmeticulous\b": ["careful", "thorough", "detailed", "precise"], r"\bintricate\b": ["complex", "detailed", "elaborate", "complicated"], r"\bmyriad\b": ["many", "numerous", "various", "lots of"], r"\bplethora\b": ["abundance", "wealth", "range", "loads"], r"\bparadigm\b": ["model", "framework", "approach", "method"], r"\bsynergy\b": ["teamwork", "cooperation", "collaboration"], r"\bleverage\b": ["use", "employ", "tap into", "make use of"], r"\bfacilitate\b": ["help", "enable", "support", "make easier"], r"\boptimize\b": ["improve", "enhance", "refine", "boost"], r"\bstreamline\b": ["simplify", "improve", "smooth out"], r"\brobust\b": ["strong", "reliable", "solid", "effective"], r"\bseamless\b": ["smooth", "easy", "fluid", "effortless"], r"\binnovative\b": ["creative", "original", "new", "fresh"], r"\bcutting-edge\b": ["advanced", "modern", "latest", "leading"], r"\bstate-of-the-art\b": ["advanced", "modern", "top-notch"], r"\bfurthermore\b": ["also", "plus", "on top of that", "besides"], r"\bmoreover\b": ["also", "plus", "what's more", "besides"], r"\bnevertheless\b": ["still", "yet", "even so", "all the same"], r"\bconsequently\b": ["so", "as a result", "because of this"], r"\bin conclusion\b": ["finally", "to wrap up", "in the end", "lastly"], r"\bin order to\b": ["to", "so we can", "aiming to"], r"\bdue to the fact that\b": ["because", "since", "given that"], r"\bwith regard to\b": ["about", "regarding", "when it comes to"], r"\bin terms of\b": ["regarding", "as for", "about"], r"\bprior to\b": ["before", "ahead of", "earlier than"], r"\bsubsequent to\b": ["after", "following", "once"], r"\bcomprehensive\b": ["complete", "thorough", "detailed", "full"], r"\bfundamental\b": ["basic", "essential", "core", "key"], r"\bsubstantial\b": ["significant", "considerable", "big", "major"], r"\bimplement\b": ["put in place", "carry out", "apply", "use"], r"\butilize\b": ["use", "employ", "make use of", "tap into"], r"\bdemonstrate\b": ["show", "prove", "reveal", "display"], r"\bestablish\b": ["set up", "create", "build", "start"], r"\bmaintain\b": ["keep", "preserve", "continue", "sustain"], r"\bobtain\b": ["get", "gain", "secure", "pick up"], } # Contractions to ADD (making text sound human/casual) CONTRACTIONS = { r"\bit is\b": "it's", r"\bthat is\b": "that's", r"\bthere is\b": "there's", r"\bwho is\b": "who's", r"\bwhat is\b": "what's", r"\bwhere is\b": "where's", r"\bthey are\b": "they're", r"\bwe are\b": "we're", r"\byou are\b": "you're", r"\bI am\b": "I'm", r"\bhe is\b": "he's", r"\bshe is\b": "she's", r"\bcannot\b": "can't", r"\bdo not\b": "don't", r"\bdoes not\b": "doesn't", r"\bwill not\b": "won't", r"\bwould not\b": "wouldn't", r"\bshould not\b": "shouldn't", r"\bcould not\b": "couldn't", r"\bhave not\b": "haven't", r"\bhas not\b": "hasn't", r"\bhad not\b": "hadn't", r"\bis not\b": "isn't", r"\bare not\b": "aren't", r"\bwas not\b": "wasn't", r"\bwere not\b": "weren't", r"\blet us\b": "let's", r"\bI will\b": "I'll", r"\bI would\b": "I'd", r"\byou will\b": "you'll", r"\bwe will\b": "we'll", r"\bthey will\b": "they'll", } HUMAN_STARTERS = [ "Actually,", "Honestly,", "Basically,", "Really,", "Generally,", "Usually,", "Often,", "Clearly,", "Naturally,", "Definitely,", "Interestingly,", "What's more,", "Plus,", "Also,", "Besides,", "In fact,", "Of course,", "Frankly,", "To be honest,", "The thing is,", ] NATURAL_TRANSITIONS = [ "And here's the thing:", "But here's what's interesting:", "So, what does this mean?", "Here's why this matters:", "Think about it this way:", "The reality is:", "The truth is:", ] WORD_GROUPS = { "analyze": ["examine", "study", "investigate", "explore", "review"], "important": ["crucial", "vital", "essential", "key", "critical"], "shows": ["demonstrates", "reveals", "indicates", "displays"], "understand": ["grasp", "realize", "recognize", "appreciate"], "develop": ["create", "build", "form", "generate", "produce"], "improve": ["enhance", "refine", "advance", "boost", "better"], "consider": ["think about", "evaluate", "contemplate", "ponder"], "different": ["various", "diverse", "distinct", "alternative"], "effective": ["successful", "efficient", "productive", "useful"], "significant": ["important", "notable", "considerable", "major"], } def _replace_ai_patterns(text: str, prob: float = 0.85) -> str: """Replace known AI-flagged words with natural alternatives.""" for pattern, replacements in AI_PATTERNS.items(): for match in reversed(list(re.finditer(pattern, text, re.IGNORECASE))): if random.random() < prob: text = text[: match.start()] + random.choice(replacements) + text[match.end() :] return text def _add_contractions(text: str, prob: float = 0.7) -> str: """Add natural contractions.""" for pattern, contraction in CONTRACTIONS.items(): if re.search(pattern, text, re.IGNORECASE) and random.random() < prob: text = re.sub(pattern, contraction, text, flags=re.IGNORECASE) return text def _restructure_sentence(sentence: str) -> str: """Randomly restructure a sentence for variation.""" strategies = [ # Move adverb clause (r"^(.*?),\s*(because|since|when|if|although|while)\s+(.*?)([.!?])$", r"\2 \3, \1\4"), (r"^(Although|While|Since|Because|When|If)\s+(.*?),\s*(.*?)([.!?])$", r"\3, \1 \2\4"), ] for pat, rep in strategies: if re.search(pat, sentence, re.IGNORECASE): result = re.sub(pat, rep, sentence, flags=re.IGNORECASE) if len(result.split()) >= 3: return result.strip() return sentence def _split_long_sentence(sentence: str) -> str: """Split overly long compound sentences.""" conjunctions = [", and ", ", but ", ", so ", ", yet "] for conj in conjunctions: if conj in sentence and len(sentence.split()) > 15: parts = sentence.split(conj, 1) if len(parts) == 2 and len(parts[0].split()) > 3 and len(parts[1].split()) > 3: first = parts[0].strip().rstrip(".") + "." second = parts[1].strip() if second and second[0].islower(): second = second[0].upper() + second[1:] connector = random.choice(["Also,", "Plus,", "What's more,", "On top of that,"]) return f"{first} {connector} {second[0].lower() + second[1:]}" return sentence def _enhance_vocabulary(text: str, prob: float = 0.3) -> str: """Replace repeated words with contextual synonyms.""" words = word_tokenize(text) usage = Counter(w.lower() for w in words if w.isalpha() and len(w) > 3) enhanced = [] for word in words: wl = word.lower() if (word.isalpha() and len(word) > 3 and wl not in STOP_WORDS and usage.get(wl, 0) > 1 and random.random() < prob): # Check predefined groups for base, syns in WORD_GROUPS.items(): if wl == base or wl in syns: candidates = [s for s in ([base] + syns) if s != wl] if candidates: enhanced.append(random.choice(candidates)) usage[wl] -= 1 break else: # Try WordNet synsets = wordnet.synsets(wl) syn_candidates = [] for ss in synsets[:2]: for lemma in ss.lemmas(): s = lemma.name().replace("_", " ") if s != wl and len(s) > 2 and abs(len(s) - len(word)) <= 3: syn_candidates.append(s) if syn_candidates: enhanced.append(random.choice(syn_candidates[:3])) usage[wl] -= 1 else: enhanced.append(word) else: enhanced.append(word) return " ".join(enhanced) def _add_human_touches(text: str, prob: float = 0.25) -> str: """Add natural sentence starters, transitions, fillers.""" sentences = sent_tokenize(text) result = [] for i, sent in enumerate(sentences): current = sent # Natural starters on ~25% of non-first sentences if i > 0 and random.random() < prob and len(current.split()) > 6: starter = random.choice(HUMAN_STARTERS) current = f"{starter} {current[0].lower() + current[1:]}" # Natural transitions rarely if i > 0 and random.random() < prob * 0.2: transition = random.choice(NATURAL_TRANSITIONS) current = f"{transition} {current[0].lower() + current[1:]}" result.append(current) return " ".join(result) def _final_cleanup(text: str) -> str: """Fix spacing, punctuation, capitalization.""" text = re.sub(r"\s+", " ", text) text = re.sub(r"\s+([,.!?;:])", r"\1", text) text = re.sub(r"([,.!?;:])\s*([A-Z])", r"\1 \2", text) text = re.sub(r"\.+", ".", text) sentences = sent_tokenize(text) corrected = [] for s in sentences: if s and s[0].islower(): s = s[0].upper() + s[1:] corrected.append(s) return " ".join(corrected).strip() def stage3_multipass_cleanup(text: str, intensity: int = 2) -> str: """Multi-pass cleanup: pattern removal → restructure → vocabulary → contractions → human touches.""" if not text.strip(): return text prob_scale = {1: 0.5, 2: 0.75, 3: 1.0}.get(intensity, 0.75) current = text # Pass 1: Remove AI-flagged patterns current = _replace_ai_patterns(current, prob=0.85 * prob_scale) # Pass 2: Restructure sentences sentences = sent_tokenize(current) restructured = [] for sent in sentences: if len(sent.split()) > 8 and random.random() < 0.5 * prob_scale: sent = _restructure_sentence(sent) if len(sent.split()) > 15 and random.random() < 0.4 * prob_scale: sent = _split_long_sentence(sent) restructured.append(sent) current = " ".join(restructured) # Pass 3: Vocabulary enhancement (replace repeated words) current = _enhance_vocabulary(current, prob=0.3 * prob_scale) # Pass 4: Add contractions + human touches current = _add_contractions(current, prob=0.7 * prob_scale) current = _add_human_touches(current, prob=0.25 * prob_scale) # Final cleanup current = _final_cleanup(current) return current # ===================================================================== # VERIFICATION — RoBERTa AI Detector (from b.py) # ===================================================================== _detector_pipe = None def _load_detector(): global _detector_pipe if _detector_pipe is None: print("Loading Detector: chatgpt-detector-roberta …") _detector_pipe = hf_pipeline( "text-classification", model="Hello-SimpleAI/chatgpt-detector-roberta", device=0 if DEVICE == "cuda" else -1, torch_dtype=torch.float16 if DEVICE == "cuda" else None, ) print(" Detector ready.") return _detector_pipe def verify_detection(text: str) -> str: """Run sentence-level AI detection and return an HTML report.""" if not text.strip(): return "No text to analyze." sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text.strip()) if s.strip()] pipe = _load_detector() preds = pipe(sentences, truncation=True, max_length=512) rows = [] total_ai = 0.0 for sent, pred in zip(sentences, preds): label = pred["label"].lower() score = pred["score"] ai_prob = score * 100 if any(x in label for x in ["fake", "ai", "generated"]) else (1 - score) * 100 total_ai += ai_prob tag = "Very likely AI" if ai_prob > 85 else "Likely AI" if ai_prob > 60 else "Likely Human" color = "#dc2626" if ai_prob > 85 else "#d97706" if ai_prob > 60 else "#16a34a" rows.append( f"