Spaces:

Chungulus
/

Humanizer_Pro

Sleeping

App Files Files Community

Humanizer_Pro / pipeline.py

Chungulus

Upload folder using huggingface_hub

fea8d44 verified about 2 months ago

raw

history blame contribute delete

24.1 kB

	"""
	Unified Humanization Pipeline
	Chains three humanization approaches in optimal order for maximum AI-detection bypass.

	Pipeline Order:
	Stage 1: T5 Humanizer (a.py) — fine-tuned on 39k samples, best initial paraphrase
	Stage 2: Qwen LLM Rewrite (b.py) — deep semantic rewrite via instruction-tuned LLM
	Stage 3: Multi-Pass Cleanup (c.py) — AI pattern removal, restructuring, contractions, human touches
	Verify: RoBERTa AI Detector (b.py) — sentence-level AI probability check
	"""

	import gradio as gr
	import torch
	import re
	import random
	import math
	import numpy as np
	import os
	from collections import defaultdict, Counter
	from typing import List, Dict, Tuple
	from transformers import (
	pipeline as hf_pipeline,
	AutoTokenizer,
	AutoModelForCausalLM,
	T5Tokenizer,
	T5ForConditionalGeneration,
	GenerationConfig,
	)

	# ── NLTK setup ───────────────────────────────────────────────────────
	import ssl
	import nltk

	# Fix SSL certificate issue on macOS
	try:
	ssl._create_default_https_context = ssl._create_unverified_context
	except AttributeError:
	pass

	os.environ["TOKENIZERS_PARALLELISM"] = "false"

	# Use home directory for NLTK data (already downloaded there)
	NLTK_DIR = os.path.join(os.path.expanduser("~"), "nltk_data")
	os.makedirs(NLTK_DIR, exist_ok=True)
	nltk.data.path.insert(0, NLTK_DIR)

	for _res in ["punkt", "punkt_tab", "averaged_perceptron_tagger",
	"stopwords", "wordnet", "omw-1.4"]:
	try:
	nltk.download(_res, download_dir=NLTK_DIR, quiet=True)
	except Exception:
	pass

	from nltk.tokenize import sent_tokenize, word_tokenize
	from nltk.corpus import wordnet, stopwords

	DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
	STOP_WORDS = set(stopwords.words("english"))

	# =====================================================================
	# STAGE 1 — T5 Humanizer Model (from a.py)
	# Fine-tuned on 39,776 humanization samples. Best initial paraphrase.
	# =====================================================================

	_t5_model = None
	_t5_tokenizer = None

	def _load_t5():
	global _t5_model, _t5_tokenizer
	if _t5_model is None:
	print("Loading Stage 1: T5 Humanizer model …")
	MODEL_PATH = "harryroger798/humanizer-model-v3"
	_t5_tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH)
	_t5_model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH)
	print(" Stage 1 ready.")
	return _t5_model, _t5_tokenizer


	def stage1_t5_humanize(text: str) -> str:
	"""Initial paraphrase using the fine-tuned T5 humanizer."""
	if not text.strip():
	return text
	model, tokenizer = _load_t5()

	inputs = tokenizer(
	f"humanize: {text}",
	return_tensors="pt",
	max_length=512,
	truncation=True,
	)
	outputs = model.generate(
	**inputs,
	max_length=512,
	num_beams=4,
	early_stopping=True,
	do_sample=True,
	temperature=0.8,
	top_p=0.9,
	repetition_penalty=2.5,
	no_repeat_ngram_size=3,
	length_penalty=1.0,
	)
	result = tokenizer.decode(outputs[0], skip_special_tokens=True)

	# Repetition guard — if model loops, fall back to original text
	words = result.split()
	if len(words) > 10:
	counts = Counter(words)
	if max(counts.values()) > len(words) * 0.3:
	return text
	return result


	# =====================================================================
	# STAGE 2 — Qwen LLM Rewrite (from b.py)
	# Instruction-tuned 1.5B model does a deep semantic rewrite.
	# =====================================================================

	_qwen_pipe = None

	def _load_qwen():
	global _qwen_pipe
	if _qwen_pipe is None:
	print("Loading Stage 2: Qwen 2.5-1.5B-Instruct …")
	model_id = "Qwen/Qwen2.5-1.5B-Instruct"
	tokenizer = AutoTokenizer.from_pretrained(model_id)
	model = AutoModelForCausalLM.from_pretrained(
	model_id,
	torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
	device_map="auto" if DEVICE == "cuda" else None,
	)
	_qwen_pipe = hf_pipeline("text-generation", model=model, tokenizer=tokenizer)
	print(" Stage 2 ready.")
	return _qwen_pipe


	REWRITE_PROMPTS = {
	"Natural": "Rewrite this to sound completely natural, human-written — vary sentence length, use contractions, slight imperfections.",
	"Casual": "Rewrite this in a relaxed, friendly, conversational tone like a real person chatting.",
	"Academic": "Rewrite this in clear, formal academic style with precise and sophisticated language.",
	"Professional": "Rewrite this in a crisp, professional business tone — confident and authoritative.",
	}


	def stage2_qwen_rewrite(text: str, style: str = "Natural", intensity: float = 0.7) -> str:
	"""Deep semantic rewrite using Qwen instruction-tuned LLM."""
	if not text.strip():
	return text
	pipe = _load_qwen()

	tone = REWRITE_PROMPTS.get(style, REWRITE_PROMPTS["Natural"])

	prompt = (
	"<\|im_start\|>system\n"
	"You are an expert editor that removes AI stiffness and makes text feel authentically human.\n"
	"Keep original meaning 100%. Improve flow, rhythm, vocabulary variety. "
	"Output ONLY the rewritten text.<\|im_end\|>\n"
	f"<\|im_start\|>user\n{tone}\nText:\n{text}<\|im_end\|>\n"
	"<\|im_start\|>assistant\n"
	)

	gen_config = GenerationConfig(
	max_new_tokens=600,
	temperature=0.4 + float(intensity) * 0.5,
	top_p=0.92,
	repetition_penalty=1.08,
	do_sample=True,
	pad_token_id=pipe.tokenizer.eos_token_id,
	eos_token_id=pipe.tokenizer.eos_token_id,
	)
	gen_config.max_length = None

	try:
	output = pipe(prompt, generation_config=gen_config, num_return_sequences=1)[0][
	"generated_text"
	]
	if "assistant" in output:
	rewritten = output.split("assistant", 1)[-1].strip()
	else:
	rewritten = output[len(prompt) :].strip()
	return rewritten.strip() if rewritten.strip() else text
	except Exception as e:
	print(f"Stage 2 error: {e}")
	return text


	# =====================================================================
	# STAGE 3 — Multi-Pass Cleanup (from c.py, optimized)
	# Removes AI-flagged patterns, restructures sentences, adds
	# contractions and human touches. Conflicts with a.py resolved:
	# - No contraction EXPANSION (a.py did this, we skip it)
	# - Synonym direction is casual-ward only
	# =====================================================================

	# AI-flagged words/phrases → more natural replacements
	AI_PATTERNS = {
	r"\bdelve into\b": ["explore", "examine", "look into", "dig into", "study"],
	r"\bembark upon?\b": ["begin", "start", "kick off", "launch", "set out"],
	r"\ba testament to\b": ["proof of", "evidence of", "shows", "reflects"],
	r"\blandscape of\b": ["world of", "field of", "area of", "space of"],
	r"\bnavigating\b": ["handling", "managing", "dealing with", "tackling"],
	r"\bmeticulous\b": ["careful", "thorough", "detailed", "precise"],
	r"\bintricate\b": ["complex", "detailed", "elaborate", "complicated"],
	r"\bmyriad\b": ["many", "numerous", "various", "lots of"],
	r"\bplethora\b": ["abundance", "wealth", "range", "loads"],
	r"\bparadigm\b": ["model", "framework", "approach", "method"],
	r"\bsynergy\b": ["teamwork", "cooperation", "collaboration"],
	r"\bleverage\b": ["use", "employ", "tap into", "make use of"],
	r"\bfacilitate\b": ["help", "enable", "support", "make easier"],
	r"\boptimize\b": ["improve", "enhance", "refine", "boost"],
	r"\bstreamline\b": ["simplify", "improve", "smooth out"],
	r"\brobust\b": ["strong", "reliable", "solid", "effective"],
	r"\bseamless\b": ["smooth", "easy", "fluid", "effortless"],
	r"\binnovative\b": ["creative", "original", "new", "fresh"],
	r"\bcutting-edge\b": ["advanced", "modern", "latest", "leading"],
	r"\bstate-of-the-art\b": ["advanced", "modern", "top-notch"],
	r"\bfurthermore\b": ["also", "plus", "on top of that", "besides"],
	r"\bmoreover\b": ["also", "plus", "what's more", "besides"],
	r"\bnevertheless\b": ["still", "yet", "even so", "all the same"],
	r"\bconsequently\b": ["so", "as a result", "because of this"],
	r"\bin conclusion\b": ["finally", "to wrap up", "in the end", "lastly"],
	r"\bin order to\b": ["to", "so we can", "aiming to"],
	r"\bdue to the fact that\b": ["because", "since", "given that"],
	r"\bwith regard to\b": ["about", "regarding", "when it comes to"],
	r"\bin terms of\b": ["regarding", "as for", "about"],
	r"\bprior to\b": ["before", "ahead of", "earlier than"],
	r"\bsubsequent to\b": ["after", "following", "once"],
	r"\bcomprehensive\b": ["complete", "thorough", "detailed", "full"],
	r"\bfundamental\b": ["basic", "essential", "core", "key"],
	r"\bsubstantial\b": ["significant", "considerable", "big", "major"],
	r"\bimplement\b": ["put in place", "carry out", "apply", "use"],
	r"\butilize\b": ["use", "employ", "make use of", "tap into"],
	r"\bdemonstrate\b": ["show", "prove", "reveal", "display"],
	r"\bestablish\b": ["set up", "create", "build", "start"],
	r"\bmaintain\b": ["keep", "preserve", "continue", "sustain"],
	r"\bobtain\b": ["get", "gain", "secure", "pick up"],
	}

	# Contractions to ADD (making text sound human/casual)
	CONTRACTIONS = {
	r"\bit is\b": "it's", r"\bthat is\b": "that's", r"\bthere is\b": "there's",
	r"\bwho is\b": "who's", r"\bwhat is\b": "what's", r"\bwhere is\b": "where's",
	r"\bthey are\b": "they're", r"\bwe are\b": "we're", r"\byou are\b": "you're",
	r"\bI am\b": "I'm", r"\bhe is\b": "he's", r"\bshe is\b": "she's",
	r"\bcannot\b": "can't", r"\bdo not\b": "don't", r"\bdoes not\b": "doesn't",
	r"\bwill not\b": "won't", r"\bwould not\b": "wouldn't",
	r"\bshould not\b": "shouldn't", r"\bcould not\b": "couldn't",
	r"\bhave not\b": "haven't", r"\bhas not\b": "hasn't", r"\bhad not\b": "hadn't",
	r"\bis not\b": "isn't", r"\bare not\b": "aren't",
	r"\bwas not\b": "wasn't", r"\bwere not\b": "weren't",
	r"\blet us\b": "let's", r"\bI will\b": "I'll", r"\bI would\b": "I'd",
	r"\byou will\b": "you'll", r"\bwe will\b": "we'll", r"\bthey will\b": "they'll",
	}

	HUMAN_STARTERS = [
	"Actually,", "Honestly,", "Basically,", "Really,", "Generally,",
	"Usually,", "Often,", "Clearly,", "Naturally,", "Definitely,",
	"Interestingly,", "What's more,", "Plus,", "Also,", "Besides,",
	"In fact,", "Of course,", "Frankly,", "To be honest,", "The thing is,",
	]

	NATURAL_TRANSITIONS = [
	"And here's the thing:", "But here's what's interesting:",
	"So, what does this mean?", "Here's why this matters:",
	"Think about it this way:", "The reality is:", "The truth is:",
	]

	WORD_GROUPS = {
	"analyze": ["examine", "study", "investigate", "explore", "review"],
	"important": ["crucial", "vital", "essential", "key", "critical"],
	"shows": ["demonstrates", "reveals", "indicates", "displays"],
	"understand": ["grasp", "realize", "recognize", "appreciate"],
	"develop": ["create", "build", "form", "generate", "produce"],
	"improve": ["enhance", "refine", "advance", "boost", "better"],
	"consider": ["think about", "evaluate", "contemplate", "ponder"],
	"different": ["various", "diverse", "distinct", "alternative"],
	"effective": ["successful", "efficient", "productive", "useful"],
	"significant": ["important", "notable", "considerable", "major"],
	}


	def _replace_ai_patterns(text: str, prob: float = 0.85) -> str:
	"""Replace known AI-flagged words with natural alternatives."""
	for pattern, replacements in AI_PATTERNS.items():
	for match in reversed(list(re.finditer(pattern, text, re.IGNORECASE))):
	if random.random() < prob:
	text = text[: match.start()] + random.choice(replacements) + text[match.end() :]
	return text


	def _add_contractions(text: str, prob: float = 0.7) -> str:
	"""Add natural contractions."""
	for pattern, contraction in CONTRACTIONS.items():
	if re.search(pattern, text, re.IGNORECASE) and random.random() < prob:
	text = re.sub(pattern, contraction, text, flags=re.IGNORECASE)
	return text


	def _restructure_sentence(sentence: str) -> str:
	"""Randomly restructure a sentence for variation."""
	strategies = [
	# Move adverb clause
	(r"^(.?),\s(because\|since\|when\|if\|although\|while)\s+(.*?)([.!?])$",
	r"\2 \3, \1\4"),
	(r"^(Although\|While\|Since\|Because\|When\|If)\s+(.?),\s(.*?)([.!?])$",
	r"\3, \1 \2\4"),
	]
	for pat, rep in strategies:
	if re.search(pat, sentence, re.IGNORECASE):
	result = re.sub(pat, rep, sentence, flags=re.IGNORECASE)
	if len(result.split()) >= 3:
	return result.strip()
	return sentence


	def _split_long_sentence(sentence: str) -> str:
	"""Split overly long compound sentences."""
	conjunctions = [", and ", ", but ", ", so ", ", yet "]
	for conj in conjunctions:
	if conj in sentence and len(sentence.split()) > 15:
	parts = sentence.split(conj, 1)
	if len(parts) == 2 and len(parts[0].split()) > 3 and len(parts[1].split()) > 3:
	first = parts[0].strip().rstrip(".") + "."
	second = parts[1].strip()
	if second and second[0].islower():
	second = second[0].upper() + second[1:]
	connector = random.choice(["Also,", "Plus,", "What's more,", "On top of that,"])
	return f"{first} {connector} {second[0].lower() + second[1:]}"
	return sentence


	def _enhance_vocabulary(text: str, prob: float = 0.3) -> str:
	"""Replace repeated words with contextual synonyms."""
	words = word_tokenize(text)
	usage = Counter(w.lower() for w in words if w.isalpha() and len(w) > 3)
	enhanced = []
	for word in words:
	wl = word.lower()
	if (word.isalpha() and len(word) > 3 and wl not in STOP_WORDS
	and usage.get(wl, 0) > 1 and random.random() < prob):
	# Check predefined groups
	for base, syns in WORD_GROUPS.items():
	if wl == base or wl in syns:
	candidates = [s for s in ([base] + syns) if s != wl]
	if candidates:
	enhanced.append(random.choice(candidates))
	usage[wl] -= 1
	break
	else:
	# Try WordNet
	synsets = wordnet.synsets(wl)
	syn_candidates = []
	for ss in synsets[:2]:
	for lemma in ss.lemmas():
	s = lemma.name().replace("_", " ")
	if s != wl and len(s) > 2 and abs(len(s) - len(word)) <= 3:
	syn_candidates.append(s)
	if syn_candidates:
	enhanced.append(random.choice(syn_candidates[:3]))
	usage[wl] -= 1
	else:
	enhanced.append(word)
	else:
	enhanced.append(word)
	return " ".join(enhanced)


	def _add_human_touches(text: str, prob: float = 0.25) -> str:
	"""Add natural sentence starters, transitions, fillers."""
	sentences = sent_tokenize(text)
	result = []
	for i, sent in enumerate(sentences):
	current = sent
	# Natural starters on ~25% of non-first sentences
	if i > 0 and random.random() < prob and len(current.split()) > 6:
	starter = random.choice(HUMAN_STARTERS)
	current = f"{starter} {current[0].lower() + current[1:]}"
	# Natural transitions rarely
	if i > 0 and random.random() < prob * 0.2:
	transition = random.choice(NATURAL_TRANSITIONS)
	current = f"{transition} {current[0].lower() + current[1:]}"
	result.append(current)
	return " ".join(result)


	def _final_cleanup(text: str) -> str:
	"""Fix spacing, punctuation, capitalization."""
	text = re.sub(r"\s+", " ", text)
	text = re.sub(r"\s+([,.!?;:])", r"\1", text)
	text = re.sub(r"([,.!?;:])\s*([A-Z])", r"\1 \2", text)
	text = re.sub(r"\.+", ".", text)
	sentences = sent_tokenize(text)
	corrected = []
	for s in sentences:
	if s and s[0].islower():
	s = s[0].upper() + s[1:]
	corrected.append(s)
	return " ".join(corrected).strip()


	def stage3_multipass_cleanup(text: str, intensity: int = 2) -> str:
	"""Multi-pass cleanup: pattern removal → restructure → vocabulary → contractions → human touches."""
	if not text.strip():
	return text

	prob_scale = {1: 0.5, 2: 0.75, 3: 1.0}.get(intensity, 0.75)
	current = text

	# Pass 1: Remove AI-flagged patterns
	current = _replace_ai_patterns(current, prob=0.85 * prob_scale)

	# Pass 2: Restructure sentences
	sentences = sent_tokenize(current)
	restructured = []
	for sent in sentences:
	if len(sent.split()) > 8 and random.random() < 0.5 * prob_scale:
	sent = _restructure_sentence(sent)
	if len(sent.split()) > 15 and random.random() < 0.4 * prob_scale:
	sent = _split_long_sentence(sent)
	restructured.append(sent)
	current = " ".join(restructured)

	# Pass 3: Vocabulary enhancement (replace repeated words)
	current = _enhance_vocabulary(current, prob=0.3 * prob_scale)

	# Pass 4: Add contractions + human touches
	current = _add_contractions(current, prob=0.7 * prob_scale)
	current = _add_human_touches(current, prob=0.25 * prob_scale)

	# Final cleanup
	current = _final_cleanup(current)
	return current


	# =====================================================================
	# VERIFICATION — RoBERTa AI Detector (from b.py)
	# =====================================================================

	_detector_pipe = None

	def _load_detector():
	global _detector_pipe
	if _detector_pipe is None:
	print("Loading Detector: chatgpt-detector-roberta …")
	_detector_pipe = hf_pipeline(
	"text-classification",
	model="Hello-SimpleAI/chatgpt-detector-roberta",
	device=0 if DEVICE == "cuda" else -1,
	torch_dtype=torch.float16 if DEVICE == "cuda" else None,
	)
	print(" Detector ready.")
	return _detector_pipe


	def verify_detection(text: str) -> str:
	"""Run sentence-level AI detection and return an HTML report."""
	if not text.strip():
	return "No text to analyze."

	sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text.strip()) if s.strip()]
	pipe = _load_detector()
	preds = pipe(sentences, truncation=True, max_length=512)

	rows = []
	total_ai = 0.0
	for sent, pred in zip(sentences, preds):
	label = pred["label"].lower()
	score = pred["score"]
	ai_prob = score * 100 if any(x in label for x in ["fake", "ai", "generated"]) else (1 - score) * 100
	total_ai += ai_prob
	tag = "Very likely AI" if ai_prob > 85 else "Likely AI" if ai_prob > 60 else "Likely Human"
	color = "#dc2626" if ai_prob > 85 else "#d97706" if ai_prob > 60 else "#16a34a"
	rows.append(
	f"<div style='padding:8px;margin:4px 0;border-left:4px solid {color};'>"
	f"<strong>{tag} ({ai_prob:.1f}%)</strong><br>{sent}</div>"
	)

	avg = total_ai / len(sentences) if sentences else 0
	summary = f"<h3>Overall AI probability: {avg:.1f}%</h3>"
	return summary + "".join(rows)


	# =====================================================================
	# FULL PIPELINE
	# =====================================================================

	def run_pipeline(
	text: str,
	style: str = "Natural",
	intensity: float = 0.7,
	use_stage1: bool = True,
	use_stage2: bool = True,
	use_stage3: bool = True,
	cleanup_intensity: int = 2,
	progress=gr.Progress(track_tqdm=False),
	) -> Tuple[str, str, str, str]:
	"""
	Run the full humanization pipeline.
	Returns: (stage1_out, stage2_out, final_out, detection_html)
	"""
	if not text.strip():
	return "", "", "", ""

	current = text
	s1_out = s2_out = ""

	# Stage 1: T5 Humanizer
	if use_stage1:
	progress(0.1, desc="Stage 1: T5 Humanizer …")
	current = stage1_t5_humanize(current)
	s1_out = current

	# Stage 2: Qwen LLM Rewrite
	if use_stage2:
	progress(0.4, desc="Stage 2: Qwen LLM Rewrite …")
	current = stage2_qwen_rewrite(current, style=style, intensity=intensity)
	s2_out = current

	# Stage 3: Multi-Pass Cleanup
	if use_stage3:
	progress(0.7, desc="Stage 3: Multi-Pass Cleanup …")
	current = stage3_multipass_cleanup(current, intensity=cleanup_intensity)

	# Verification
	progress(0.9, desc="Verifying with AI detector …")
	detection_html = verify_detection(current)

	return s1_out, s2_out, current, detection_html


	# =====================================================================
	# GRADIO UI
	# =====================================================================

	with gr.Blocks(title="Humanization Pipeline") as demo:
	gr.Markdown(
	"# Humanization Pipeline\n"
	"3-stage chain: T5 Humanizer → Qwen LLM Rewrite → Multi-Pass Cleanup → AI Detection Verify"
	)

	with gr.Row():
	with gr.Column(scale=1):
	input_text = gr.Textbox(
	label="Input Text (AI-generated)",
	placeholder="Paste AI-generated text here …",
	lines=10,
	)

	style_dropdown = gr.Dropdown(
	choices=["Natural", "Casual", "Academic", "Professional"],
	value="Natural",
	label="Rewrite Style (Stage 2)",
	)

	intensity_slider = gr.Slider(
	minimum=0.1, maximum=1.0, value=0.7, step=0.05,
	label="LLM Rewrite Intensity (Stage 2)",
	)

	cleanup_intensity = gr.Radio(
	choices=[("Light", 1), ("Standard", 2), ("Heavy", 3)],
	value=2,
	label="Cleanup Intensity (Stage 3)",
	)

	with gr.Row():
	use_s1 = gr.Checkbox(label="Stage 1: T5 Humanizer", value=True)
	use_s2 = gr.Checkbox(label="Stage 2: Qwen LLM", value=True)
	use_s3 = gr.Checkbox(label="Stage 3: Multi-Pass", value=True)

	run_btn = gr.Button("Run Pipeline", variant="primary", size="lg")

	with gr.Column(scale=1):
	with gr.Accordion("Stage 1 Output (T5 Humanizer)", open=False):
	s1_output = gr.Textbox(label="After Stage 1", lines=5)

	with gr.Accordion("Stage 2 Output (Qwen LLM)", open=False):
	s2_output = gr.Textbox(label="After Stage 2", lines=5)

	final_output = gr.Textbox(
	label="Final Humanized Text",
	lines=10,
	)

	detection_result = gr.HTML(label="AI Detection Verification")

	run_btn.click(
	fn=run_pipeline,
	inputs=[input_text, style_dropdown, intensity_slider,
	use_s1, use_s2, use_s3, cleanup_intensity],
	outputs=[s1_output, s2_output, final_output, detection_result],
	)

	gr.Examples(
	examples=[
	["The rapid advancement of artificial intelligence technologies has significantly transformed numerous industries and daily life."],
	["Machine learning algorithms demonstrate superior performance in pattern recognition tasks across diverse datasets."],
	["In conclusion, leveraging cutting-edge methodologies facilitates the optimization of robust and seamless solutions."],
	],
	inputs=input_text,
	label="Test examples (heavily AI-flagged text)",
	)

	if __name__ == "__main__":
	demo.launch(debug=False, share=True)