File size: 24,139 Bytes
fea8d44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
"""
Unified Humanization Pipeline
Chains three humanization approaches in optimal order for maximum AI-detection bypass.

Pipeline Order:
  Stage 1: T5 Humanizer (a.py) — fine-tuned on 39k samples, best initial paraphrase
  Stage 2: Qwen LLM Rewrite (b.py) — deep semantic rewrite via instruction-tuned LLM
  Stage 3: Multi-Pass Cleanup (c.py) — AI pattern removal, restructuring, contractions, human touches
  Verify:  RoBERTa AI Detector (b.py) — sentence-level AI probability check
"""

import gradio as gr
import torch
import re
import random
import math
import numpy as np
import os
from collections import defaultdict, Counter
from typing import List, Dict, Tuple
from transformers import (
    pipeline as hf_pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    T5Tokenizer,
    T5ForConditionalGeneration,
    GenerationConfig,
)

# ── NLTK setup ───────────────────────────────────────────────────────
import ssl
import nltk

# Fix SSL certificate issue on macOS
# NOTE(review): this replaces the process-wide default HTTPS context factory
# with the unverified one, so it is not limited to the NLTK downloads below —
# confirm that disabling certificate verification globally is acceptable.
try:
    ssl._create_default_https_context = ssl._create_unverified_context
except AttributeError:
    # The ssl module lacks one of the private attributes; keep its defaults.
    pass

# Presumably silences/disables Hugging Face tokenizers parallelism — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Use home directory for NLTK data (already downloaded there)
NLTK_DIR = os.path.join(os.path.expanduser("~"), "nltk_data")
os.makedirs(NLTK_DIR, exist_ok=True)
# Put our directory first in NLTK's search path.
nltk.data.path.insert(0, NLTK_DIR)

# Best-effort fetch of every NLTK resource the pipeline uses.  Any exception
# is swallowed here on purpose, so a missing resource only surfaces later,
# when it is first used (e.g. the stopwords lookup below).
for _res in ["punkt", "punkt_tab", "averaged_perceptron_tagger",
             "stopwords", "wordnet", "omw-1.4"]:
    try:
        nltk.download(_res, download_dir=NLTK_DIR, quiet=True)
    except Exception:
        pass

# Imported after the download loop above has run.
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import wordnet, stopwords

# "cuda" when torch reports an available GPU, otherwise "cpu".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# English stop words as a set for O(1) membership tests.
STOP_WORDS = set(stopwords.words("english"))

# =====================================================================
#  STAGE 1 — T5 Humanizer Model (from a.py)
#  Fine-tuned on 39,776 humanization samples.  Best initial paraphrase.
# =====================================================================

# Lazily-initialised Stage 1 singletons; populated by _load_t5().
_t5_model = None
_t5_tokenizer = None


def _load_t5():
    """Return the Stage 1 ``(model, tokenizer)`` pair, loading it on first use.

    The pair is kept in the module globals ``_t5_model`` / ``_t5_tokenizer``;
    once ``_t5_model`` is set, later calls return the cached objects without
    calling ``from_pretrained`` again.
    """
    global _t5_model, _t5_tokenizer
    if _t5_model is None:
        print("Loading Stage 1: T5 Humanizer model …")
        MODEL_PATH = "harryroger798/humanizer-model-v3"
        _t5_tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH)
        _t5_model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH)
        print("  Stage 1 ready.")
    return _t5_model, _t5_tokenizer


def stage1_t5_humanize(text: str) -> str:
    """Produce an initial paraphrase of *text* with the fine-tuned T5 humanizer.

    The input is prefixed with ``"humanize: "`` and truncated to 512 tokens
    before generation (beam search with sampling, at most 512 output tokens).

    Args:
        text: Text to paraphrase.

    Returns:
        The decoded model output, or *text* unchanged when the input is blank,
        the model returns nothing usable, or the output looks degenerate
        (a single token dominating the result).
    """
    if not text.strip():
        return text
    model, tokenizer = _load_t5()

    inputs = tokenizer(
        f"humanize: {text}",
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    outputs = model.generate(
        **inputs,
        max_length=512,
        num_beams=4,
        early_stopping=True,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
        repetition_penalty=2.5,
        no_repeat_ngram_size=3,
        length_penalty=1.0,
    )
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # An empty generation would otherwise flow through the remaining stages
    # and blank the final output, so keep the caller's text instead.
    if not result.strip():
        return text

    # Repetition guard: if one token makes up more than 30% of an output
    # longer than 10 words, treat it as a generation loop and fall back.
    words = result.split()
    if len(words) > 10:
        counts = Counter(words)
        if max(counts.values()) > len(words) * 0.3:
            return text
    return result


# =====================================================================
#  STAGE 2 — Qwen LLM Rewrite (from b.py)
#  Instruction-tuned 1.5B model does a deep semantic rewrite.
# =====================================================================

# Lazily-initialised Stage 2 text-generation pipeline; populated by _load_qwen().
_qwen_pipe = None


def _load_qwen():
    """Return the Stage 2 text-generation pipeline, building it on first use.

    The model is loaded in float16 with ``device_map="auto"`` when ``DEVICE``
    is ``"cuda"``, and in float32 with no device map otherwise.  The pipeline
    is cached in the module global ``_qwen_pipe``.
    """
    global _qwen_pipe
    if _qwen_pipe is None:
        print("Loading Stage 2: Qwen 2.5-1.5B-Instruct …")
        model_id = "Qwen/Qwen2.5-1.5B-Instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
            device_map="auto" if DEVICE == "cuda" else None,
        )
        _qwen_pipe = hf_pipeline("text-generation", model=model, tokenizer=tokenizer)
        print("  Stage 2 ready.")
    return _qwen_pipe


# Style name -> instruction line inserted into the Stage 2 user prompt.
# The keys are the style names accepted by stage2_qwen_rewrite().
REWRITE_PROMPTS = {
    "Natural": (
        "Rewrite this to sound completely natural, human-written — vary sentence length, "
        "use contractions, slight imperfections."
    ),
    "Casual": "Rewrite this in a relaxed, friendly, conversational tone like a real person chatting.",
    "Academic": "Rewrite this in clear, formal academic style with precise and sophisticated language.",
    "Professional": (
        "Rewrite this in a crisp, professional business tone — confident and authoritative."
    ),
}


def stage2_qwen_rewrite(text: str, style: str = "Natural", intensity: float = 0.7) -> str:
    """Rewrite *text* with the instruction-tuned Qwen model.

    Args:
        text: Text to rewrite.
        style: Key into ``REWRITE_PROMPTS``; unknown values fall back to
            ``"Natural"``.
        intensity: Scales the sampling temperature linearly
            (``0.4 + intensity * 0.5``).

    Returns:
        The model's rewrite, or *text* unchanged when the input is blank, the
        model produces nothing, or generation raises (the error is printed).
    """
    if not text.strip():
        return text
    pipe = _load_qwen()

    tone = REWRITE_PROMPTS.get(style, REWRITE_PROMPTS["Natural"])

    # Hand-built ChatML-style prompt: system turn, user turn, open assistant turn.
    prompt = (
        "<|im_start|>system\n"
        "You are an expert editor that removes AI stiffness and makes text feel authentically human.\n"
        "Keep original meaning 100%. Improve flow, rhythm, vocabulary variety. "
        "Output ONLY the rewritten text.<|im_end|>\n"
        f"<|im_start|>user\n{tone}\nText:\n{text}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    gen_config = GenerationConfig(
        max_new_tokens=600,
        temperature=0.4 + float(intensity) * 0.5,
        top_p=0.92,
        repetition_penalty=1.08,
        do_sample=True,
        pad_token_id=pipe.tokenizer.eos_token_id,
        eos_token_id=pipe.tokenizer.eos_token_id,
    )
    # Only max_new_tokens should bound the generation length.
    gen_config.max_length = None

    try:
        # Ask the pipeline for the completion only.  Splitting the full
        # (prompt + completion) text on the first "assistant" cuts inside the
        # user's own text whenever that text contains the word "assistant".
        generations = pipe(
            prompt,
            generation_config=gen_config,
            num_return_sequences=1,
            return_full_text=False,
        )
        rewritten = generations[0]["generated_text"].strip()
        return rewritten if rewritten else text
    except Exception as e:
        # Deliberate best-effort: a failed rewrite must not abort the pipeline.
        print(f"Stage 2 error: {e}")
        return text


# =====================================================================
#  STAGE 3 — Multi-Pass Cleanup (from c.py, optimized)
#  Removes AI-flagged patterns, restructures sentences, adds
#  contractions and human touches.  Conflicts with a.py resolved:
#  - No contraction EXPANSION (a.py did this, we skip it)
#  - Synonym direction is casual-ward only
# =====================================================================

# AI-flagged words/phrases → more natural replacements.
# Keys are regular expressions (applied case-insensitively by
# _replace_ai_patterns); values are the candidate replacements, one of which
# is chosen at random per match.
AI_PATTERNS = {
    r"\bdelve into\b": ["explore", "examine", "look into", "dig into", "study"],
    # Matches both "embark on" and "embark upon" ("upon?" only made the
    # final "n" optional, so "embark on" was never matched).
    r"\bembark (?:up)?on\b": ["begin", "start", "kick off", "launch", "set out"],
    r"\ba testament to\b": ["proof of", "evidence of", "shows", "reflects"],
    r"\blandscape of\b": ["world of", "field of", "area of", "space of"],
    r"\bnavigating\b": ["handling", "managing", "dealing with", "tackling"],
    r"\bmeticulous\b": ["careful", "thorough", "detailed", "precise"],
    r"\bintricate\b": ["complex", "detailed", "elaborate", "complicated"],
    r"\bmyriad\b": ["many", "numerous", "various", "lots of"],
    r"\bplethora\b": ["abundance", "wealth", "range", "loads"],
    r"\bparadigm\b": ["model", "framework", "approach", "method"],
    r"\bsynergy\b": ["teamwork", "cooperation", "collaboration"],
    r"\bleverage\b": ["use", "employ", "tap into", "make use of"],
    r"\bfacilitate\b": ["help", "enable", "support", "make easier"],
    r"\boptimize\b": ["improve", "enhance", "refine", "boost"],
    r"\bstreamline\b": ["simplify", "improve", "smooth out"],
    r"\brobust\b": ["strong", "reliable", "solid", "effective"],
    r"\bseamless\b": ["smooth", "easy", "fluid", "effortless"],
    r"\binnovative\b": ["creative", "original", "new", "fresh"],
    r"\bcutting-edge\b": ["advanced", "modern", "latest", "leading"],
    r"\bstate-of-the-art\b": ["advanced", "modern", "top-notch"],
    r"\bfurthermore\b": ["also", "plus", "on top of that", "besides"],
    r"\bmoreover\b": ["also", "plus", "what's more", "besides"],
    r"\bnevertheless\b": ["still", "yet", "even so", "all the same"],
    r"\bconsequently\b": ["so", "as a result", "because of this"],
    r"\bin conclusion\b": ["finally", "to wrap up", "in the end", "lastly"],
    r"\bin order to\b": ["to", "so we can", "aiming to"],
    r"\bdue to the fact that\b": ["because", "since", "given that"],
    r"\bwith regard to\b": ["about", "regarding", "when it comes to"],
    r"\bin terms of\b": ["regarding", "as for", "about"],
    r"\bprior to\b": ["before", "ahead of", "earlier than"],
    r"\bsubsequent to\b": ["after", "following", "once"],
    r"\bcomprehensive\b": ["complete", "thorough", "detailed", "full"],
    r"\bfundamental\b": ["basic", "essential", "core", "key"],
    r"\bsubstantial\b": ["significant", "considerable", "big", "major"],
    r"\bimplement\b": ["put in place", "carry out", "apply", "use"],
    r"\butilize\b": ["use", "employ", "make use of", "tap into"],
    r"\bdemonstrate\b": ["show", "prove", "reveal", "display"],
    r"\bestablish\b": ["set up", "create", "build", "start"],
    r"\bmaintain\b": ["keep", "preserve", "continue", "sustain"],
    r"\bobtain\b": ["get", "gain", "secure", "pick up"],
}

# Contractions to ADD (making text sound human/casual).
# Maps a word-bounded regex for the expanded form to its contraction; the
# table is built from (expanded phrase, contraction) pairs, in this order.
CONTRACTIONS = {
    rf"\b{expanded}\b": contracted
    for expanded, contracted in (
        ("it is", "it's"),
        ("that is", "that's"),
        ("there is", "there's"),
        ("who is", "who's"),
        ("what is", "what's"),
        ("where is", "where's"),
        ("they are", "they're"),
        ("we are", "we're"),
        ("you are", "you're"),
        ("I am", "I'm"),
        ("he is", "he's"),
        ("she is", "she's"),
        ("cannot", "can't"),
        ("do not", "don't"),
        ("does not", "doesn't"),
        ("will not", "won't"),
        ("would not", "wouldn't"),
        ("should not", "shouldn't"),
        ("could not", "couldn't"),
        ("have not", "haven't"),
        ("has not", "hasn't"),
        ("had not", "hadn't"),
        ("is not", "isn't"),
        ("are not", "aren't"),
        ("was not", "wasn't"),
        ("were not", "weren't"),
        ("let us", "let's"),
        ("I will", "I'll"),
        ("I would", "I'd"),
        ("you will", "you'll"),
        ("we will", "we'll"),
        ("they will", "they'll"),
    )
}

# Sentence openers that _add_human_touches may prepend to a sentence.
HUMAN_STARTERS = [
    "Actually,",
    "Honestly,",
    "Basically,",
    "Really,",
    "Generally,",
    "Usually,",
    "Often,",
    "Clearly,",
    "Naturally,",
    "Definitely,",
    "Interestingly,",
    "What's more,",
    "Plus,",
    "Also,",
    "Besides,",
    "In fact,",
    "Of course,",
    "Frankly,",
    "To be honest,",
    "The thing is,",
]

# Lead-in phrases that _add_human_touches may prepend to a sentence.
NATURAL_TRANSITIONS = [
    "And here's the thing:",
    "But here's what's interesting:",
    "So, what does this mean?",
    "Here's why this matters:",
    "Think about it this way:",
    "The reality is:",
    "The truth is:",
]

# Hand-picked synonym groups consulted by _enhance_vocabulary before it falls
# back to WordNet.  Each keyword is the group's base word; its list holds the
# alternatives.
WORD_GROUPS = dict(
    analyze=["examine", "study", "investigate", "explore", "review"],
    important=["crucial", "vital", "essential", "key", "critical"],
    shows=["demonstrates", "reveals", "indicates", "displays"],
    understand=["grasp", "realize", "recognize", "appreciate"],
    develop=["create", "build", "form", "generate", "produce"],
    improve=["enhance", "refine", "advance", "boost", "better"],
    consider=["think about", "evaluate", "contemplate", "ponder"],
    different=["various", "diverse", "distinct", "alternative"],
    effective=["successful", "efficient", "productive", "useful"],
    significant=["important", "notable", "considerable", "major"],
)


def _replace_ai_patterns(text: str, prob: float = 0.85) -> str:
    """Replace known AI-flagged words and phrases with natural alternatives.

    Every match of every ``AI_PATTERNS`` regex (case-insensitive) is replaced,
    independently with probability *prob*, by a randomly chosen alternative.
    A match that starts with an uppercase letter yields a capitalised
    replacement, so "Furthermore, ..." does not become "also, ...".

    Args:
        text: Text to process.
        prob: Per-match replacement probability.

    Returns:
        The text with the selected matches replaced.
    """
    for pattern, replacements in AI_PATTERNS.items():

        def _swap(match, replacements=replacements):
            original = match.group(0)
            if random.random() >= prob:
                return original
            choice = random.choice(replacements)
            if original[:1].isupper():
                choice = choice[:1].upper() + choice[1:]
            return choice

        text = re.sub(pattern, _swap, text, flags=re.IGNORECASE)
    return text


def _add_contractions(text: str, prob: float = 0.7) -> str:
    """Contract expanded forms ("do not" -> "don't") to sound more casual.

    For each ``CONTRACTIONS`` pattern found in the text, one coin flip with
    probability *prob* decides whether all of its occurrences are contracted.
    An occurrence that starts with an uppercase letter keeps a capital first
    letter ("It is" -> "It's" rather than "it's").

    Args:
        text: Text to process.
        prob: Per-pattern probability of applying the contraction.

    Returns:
        The text with the selected contractions applied.
    """

    def _case_preserving(contraction):
        # Build a re.sub callback that mirrors the match's leading capital.
        def _swap(match):
            if match.group(0)[:1].isupper():
                return contraction[:1].upper() + contraction[1:]
            return contraction

        return _swap

    for pattern, contraction in CONTRACTIONS.items():
        if re.search(pattern, text, re.IGNORECASE) and random.random() < prob:
            text = re.sub(pattern, _case_preserving(contraction), text, flags=re.IGNORECASE)
    return text


def _restructure_sentence(sentence: str) -> str:
    """Swap the order of a main clause and its adverbial clause.

    Two shapes are recognised (case-insensitively), tried in this order:

    * ``"<main>, because|since|when|if|although|while <sub>."`` becomes
      ``"Because <sub>, <main>."``
    * ``"Although|While|Since|Because|When|If <sub>, <main>."`` becomes
      ``"<Main>, although <sub>."``

    Capitalisation is fixed for the new clause order: the clause that now
    opens the sentence is capitalised and the clause that moved inside is
    decapitalised (except for the pronoun "I" and acronym-like words).
    The choice is deterministic; any randomness is up to the caller.

    Args:
        sentence: A single sentence ending in ``.``, ``!`` or ``?``.

    Returns:
        The reordered sentence, or *sentence* unchanged when neither shape
        matches or the result would have fewer than three words.
    """

    def _lower_initial(clause: str) -> str:
        # Decapitalise a clause that is no longer sentence-initial, leaving
        # "I", "I'm", ... and acronym-like words (second letter uppercase) alone.
        clause = clause.strip()
        first_word = clause.split(None, 1)[0] if clause else ""
        keep = (
            not first_word
            or first_word == "I"
            or first_word[:2] in ("I'", "I\u2019")
            or (len(first_word) > 1 and first_word[1].isupper())
        )
        return clause if keep else clause[0].lower() + clause[1:]

    def _upper_initial(clause: str) -> str:
        clause = clause.strip()
        return clause[:1].upper() + clause[1:]

    # Shape 1: trailing adverbial clause -> move it to the front.
    trailing = re.match(
        r"^(.*?),\s*(because|since|when|if|although|while)\s+(.*?)([.!?])$",
        sentence,
        re.IGNORECASE,
    )
    if trailing:
        main, conj, sub, punct = trailing.groups()
        result = f"{_upper_initial(conj.lower())} {sub}, {_lower_initial(main)}{punct}"
        if len(result.split()) >= 3:
            return result.strip()

    # Shape 2: leading adverbial clause -> move it to the end.
    leading = re.match(
        r"^(Although|While|Since|Because|When|If)\s+(.*?),\s*(.*?)([.!?])$",
        sentence,
        re.IGNORECASE,
    )
    if leading:
        conj, sub, main, punct = leading.groups()
        result = f"{_upper_initial(main)}, {conj.lower()} {sub}{punct}"
        if len(result.split()) >= 3:
            return result.strip()

    return sentence


def _split_long_sentence(sentence: str) -> str:
    """Split a long compound sentence in two at a coordinating conjunction.

    Only sentences of more than 15 words are considered.  The sentence is
    split at the first occurrence of ", and ", ", but ", ", so " or ", yet "
    (tried in that order) for which both halves have more than three words.
    The second half is introduced by a randomly chosen connector that keeps
    the meaning of the removed conjunction (additive, contrastive or causal).

    Args:
        sentence: A single sentence.

    Returns:
        Two sentences joined by a space, or *sentence* unchanged when no
        suitable split point exists.
    """
    # Connectors grouped by the conjunction they replace, so that a contrast
    # ("but") or a consequence ("so") is not turned into a plain addition.
    connectors_by_conjunction = {
        ", and ": ["Also,", "Plus,", "What's more,", "On top of that,"],
        ", but ": ["But", "However,", "Still,"],
        ", so ": ["So", "As a result,", "Because of that,"],
        ", yet ": ["Yet", "Still,", "Even so,"],
    }

    if len(sentence.split()) <= 15:
        return sentence

    for conjunction, connectors in connectors_by_conjunction.items():
        head, found, tail = sentence.partition(conjunction)
        if not found:
            continue
        if len(head.split()) <= 3 or len(tail.split()) <= 3:
            continue

        first = head.strip().rstrip(".") + "."
        second = tail.strip()
        # The second half follows a connector, so it is not sentence-initial:
        # decapitalise it unless it starts with "I" or an acronym-like word.
        first_word = second.split(None, 1)[0]
        keep_case = (
            first_word == "I"
            or first_word[:2] in ("I'", "I\u2019")
            or (len(first_word) > 1 and first_word[1].isupper())
        )
        if not keep_case:
            second = second[0].lower() + second[1:]
        return f"{first} {random.choice(connectors)} {second}"

    return sentence


def _enhance_vocabulary(text: str, prob: float = 0.3) -> str:
    """Replace some occurrences of repeated words with synonyms.

    A word is a candidate when it is purely alphabetic, longer than three
    characters, occurs more than once in *text* and is not a stop word.
    Each candidate occurrence is replaced with probability *prob*, first
    from ``WORD_GROUPS`` and otherwise from the lemmas of its first two
    WordNet synsets.  Replacements are made in place, so spacing,
    punctuation, quotes and contractions in the rest of the text are left
    exactly as they were, and the replacement mirrors the capitalisation of
    the word it replaces.

    Args:
        text: Text to process.
        prob: Per-occurrence replacement probability.

    Returns:
        The text with the selected words replaced.
    """
    # A token is a run of letters, optionally joined by hyphens/apostrophes,
    # so "don't" and "cutting-edge" are single (non-alphabetic) tokens that
    # the isalpha() filter below skips.
    token_pattern = r"[^\W\d_]+(?:[-'\u2019][^\W\d_]+)*"

    usage = Counter(
        token.lower()
        for token in re.findall(token_pattern, text)
        if token.isalpha() and len(token) > 3
    )

    def _match_case(original: str, replacement: str) -> str:
        if len(original) > 1 and original.isupper():
            return replacement.upper()
        if original[0].isupper():
            return replacement[0].upper() + replacement[1:]
        return replacement

    def _pick_synonym(word: str, lowered: str):
        # Curated groups first: any other member of the word's group.
        for base, synonyms in WORD_GROUPS.items():
            if lowered == base or lowered in synonyms:
                candidates = [s for s in [base, *synonyms] if s != lowered]
                if candidates:
                    return random.choice(candidates)
        # WordNet fallback: lemmas of the first two synsets, similar length.
        candidates = []
        for synset in wordnet.synsets(lowered)[:2]:
            for lemma in synset.lemmas():
                name = lemma.name().replace("_", " ")
                if name.lower() != lowered and len(name) > 2 and abs(len(name) - len(word)) <= 3:
                    candidates.append(name)
        return random.choice(candidates[:3]) if candidates else None

    def _swap(match):
        word = match.group(0)
        lowered = word.lower()
        if not (word.isalpha() and len(word) > 3):
            return word
        if usage.get(lowered, 0) <= 1 or lowered in STOP_WORDS or random.random() >= prob:
            return word
        synonym = _pick_synonym(word, lowered)
        if synonym is None:
            return word
        # One fewer occurrence of the repeated word remains in the text.
        usage[lowered] -= 1
        return _match_case(word, synonym)

    return re.sub(token_pattern, _swap, text)


def _add_human_touches(text: str, prob: float = 0.25) -> str:
    """Prepend conversational openers to some sentences.

    The text is split with ``sent_tokenize``.  The first sentence is never
    changed.  Each later sentence gets at most one prefix: with probability
    *prob* (and only if it has more than six words) a random entry from
    ``HUMAN_STARTERS``; otherwise, with probability ``prob * 0.2``, a random
    entry from ``NATURAL_TRANSITIONS``.  The sentence's first letter is
    lowercased after a prefix, except for the pronoun "I" and acronym-like
    words.

    Args:
        text: Text to process.
        prob: Base probability of adding a starter.

    Returns:
        The sentences re-joined with single spaces.
    """

    def _lower_initial(sentence: str) -> str:
        # Keep "I", "I'm", ... and words whose second letter is uppercase.
        first_word = sentence.split(None, 1)[0] if sentence.strip() else ""
        keep = (
            not first_word
            or first_word == "I"
            or first_word[:2] in ("I'", "I\u2019")
            or (len(first_word) > 1 and first_word[1].isupper())
        )
        return sentence if keep else sentence[0].lower() + sentence[1:]

    decorated = []
    for index, sentence in enumerate(sent_tokenize(text)):
        if index > 0:
            if random.random() < prob and len(sentence.split()) > 6:
                sentence = f"{random.choice(HUMAN_STARTERS)} {_lower_initial(sentence)}"
            elif random.random() < prob * 0.2:
                # Only when no starter was added: stacking both prefixes reads
                # unnaturally ("The truth is: honestly, ...").
                sentence = f"{random.choice(NATURAL_TRANSITIONS)} {_lower_initial(sentence)}"
        decorated.append(sentence)
    return " ".join(decorated)


def _final_cleanup(text: str) -> str:
    """Normalise spacing and punctuation and capitalise sentence starts.

    Steps, in order:
      1. collapse every whitespace run (including newlines) to one space;
      2. remove whitespace before ``, . ! ? ; :``;
      3. reduce a doubled period to one and runs of four or more to an
         ellipsis, leaving a three-dot ellipsis intact;
      4. insert the missing space after punctuation that is glued to a
         following capital ("end.Next" -> "end. Next") without splitting
         dotted acronyms such as "U.S.Army";
      5. capitalise the first letter of every sentence found by
         ``sent_tokenize``.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace.
    """
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s+([,.!?;:])", r"\1", text)
    text = re.sub(r"\.{4,}", "...", text)
    text = re.sub(r"(?<!\.)\.\.(?!\.)", ".", text)
    # Only after a lowercase letter, digit or dot, so the periods inside an
    # uppercase acronym are not followed by an inserted space.
    text = re.sub(r"(?<=[a-z0-9.])([,.!?;:])(?=[A-Z])", r"\1 ", text)

    corrected = []
    for sentence in sent_tokenize(text):
        if sentence and sentence[0].islower():
            sentence = sentence[0].upper() + sentence[1:]
        corrected.append(sentence)
    return " ".join(corrected).strip()


def stage3_multipass_cleanup(text: str, intensity: int = 2) -> str:
    """Run the rule-based cleanup passes over *text*.

    Order of passes: AI-pattern replacement, sentence restructuring and
    splitting, vocabulary variation, contractions, human touches, final
    cleanup.  *intensity* (1, 2 or 3) scales every pass probability by
    0.5, 0.75 or 1.0; any other value is treated like 2.  Blank input is
    returned unchanged.
    """
    if not text.strip():
        return text

    scale_by_level = {1: 0.5, 2: 0.75, 3: 1.0}
    scale = scale_by_level.get(intensity, 0.75)

    # Pass 1: swap out AI-flagged wording.
    working = _replace_ai_patterns(text, prob=0.85 * scale)

    # Pass 2: reorder clauses in longer sentences, then split very long ones.
    reshaped = []
    for sentence in sent_tokenize(working):
        if len(sentence.split()) > 8 and random.random() < 0.5 * scale:
            sentence = _restructure_sentence(sentence)
        if len(sentence.split()) > 15 and random.random() < 0.4 * scale:
            sentence = _split_long_sentence(sentence)
        reshaped.append(sentence)
    working = " ".join(reshaped)

    # Pass 3: vary repeated vocabulary.
    working = _enhance_vocabulary(working, prob=0.3 * scale)

    # Pass 4: contractions first, then conversational openers.
    working = _add_contractions(working, prob=0.7 * scale)
    working = _add_human_touches(working, prob=0.25 * scale)

    # Last: spacing, punctuation and capitalisation.
    return _final_cleanup(working)


# =====================================================================
#  VERIFICATION — RoBERTa AI Detector (from b.py)
# =====================================================================

# Lazily-initialised AI-detector pipeline; populated by _load_detector().
_detector_pipe = None


def _load_detector():
    """Return the text-classification detector pipeline, building it on first use.

    The pipeline is created on device 0 in float16 when ``DEVICE`` is
    ``"cuda"``, otherwise on device -1 with the default dtype, and is cached
    in the module global ``_detector_pipe``.
    """
    global _detector_pipe
    if _detector_pipe is None:
        print("Loading Detector: chatgpt-detector-roberta …")
        _detector_pipe = hf_pipeline(
            "text-classification",
            model="Hello-SimpleAI/chatgpt-detector-roberta",
            device=0 if DEVICE == "cuda" else -1,
            torch_dtype=torch.float16 if DEVICE == "cuda" else None,
        )
        print("  Detector ready.")
    return _detector_pipe


def _ai_probability(label: str, score: float) -> float:
    """Convert one classifier prediction into a 0-100 "AI-written" percentage.

    The detector is treated as a binary classifier: a label that names the
    human class means the AI probability is ``1 - score``; any other label
    (for example "ChatGPT", "Fake", "AI", "generated") is the AI class.

    NOTE(review): the configured detector is expected to label its classes
    "Human" / "ChatGPT" — confirm against the model's ``id2label``.  A label
    test for "fake"/"ai"/"generated" alone never matches "chatgpt", which
    would score AI predictions as human.
    """
    name = label.lower()
    if any(marker in name for marker in ("human", "real", "label_0")):
        return (1 - score) * 100
    return score * 100


def verify_detection(text: str) -> str:
    """Run sentence-level AI detection and return an HTML report.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace, each
    sentence is classified (inputs truncated to 512 tokens), and the report
    lists every sentence with its AI probability, preceded by the average.

    Args:
        text: Text to analyse.

    Returns:
        An HTML fragment, or the plain string ``"No text to analyze."`` for
        blank input.
    """
    import html  # local import: html is not imported at module level

    if not text.strip():
        return "No text to analyze."

    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text.strip()) if s.strip()]
    pipe = _load_detector()
    preds = pipe(sentences, truncation=True, max_length=512)

    rows = []
    total_ai = 0.0
    for sent, pred in zip(sentences, preds):
        ai_prob = _ai_probability(pred["label"], pred["score"])
        total_ai += ai_prob
        if ai_prob > 85:
            tag, color = "Very likely AI", "#dc2626"
        elif ai_prob > 60:
            tag, color = "Likely AI", "#d97706"
        else:
            tag, color = "Likely Human", "#16a34a"
        # The sentence is user/model-supplied text: escape it so it is shown
        # literally instead of being interpreted as markup.
        rows.append(
            f"<div style='padding:8px;margin:4px 0;border-left:4px solid {color};'>"
            f"<strong>{tag} ({ai_prob:.1f}%)</strong><br>{html.escape(sent)}</div>"
        )

    avg = total_ai / len(sentences) if sentences else 0
    summary = f"<h3>Overall AI probability: {avg:.1f}%</h3>"
    return summary + "".join(rows)


# =====================================================================
#  FULL PIPELINE
# =====================================================================

def run_pipeline(
    text: str,
    style: str = "Natural",
    intensity: float = 0.7,
    use_stage1: bool = True,
    use_stage2: bool = True,
    use_stage3: bool = True,
    cleanup_intensity: int = 2,
    progress=gr.Progress(track_tqdm=False),
) -> Tuple[str, str, str, str]:
    """Run the enabled humanization stages in order, then the AI detector.

    Each enabled stage receives the output of the previous enabled stage.
    The detector always runs on the final text, whichever stages are enabled.

    Args:
        text: Input text.
        style: Rewrite style passed to Stage 2.
        intensity: Rewrite intensity passed to Stage 2.
        use_stage1: Run the T5 humanizer.
        use_stage2: Run the Qwen rewrite.
        use_stage3: Run the multi-pass cleanup.
        cleanup_intensity: Intensity passed to Stage 3.
        progress: Gradio progress callback, called before each step.

    Returns:
        ``(stage1_out, stage2_out, final_out, detection_html)``.  The stage
        outputs are empty strings for disabled stages; all four values are
        empty strings when *text* is blank.
    """
    if not text.strip():
        return "", "", "", ""

    current = text
    s1_out = s2_out = ""

    # Stage 1: T5 Humanizer
    if use_stage1:
        progress(0.1, desc="Stage 1: T5 Humanizer …")
        current = stage1_t5_humanize(current)
        s1_out = current

    # Stage 2: Qwen LLM Rewrite
    if use_stage2:
        progress(0.4, desc="Stage 2: Qwen LLM Rewrite …")
        current = stage2_qwen_rewrite(current, style=style, intensity=intensity)
        s2_out = current

    # Stage 3: Multi-Pass Cleanup
    if use_stage3:
        progress(0.7, desc="Stage 3: Multi-Pass Cleanup …")
        current = stage3_multipass_cleanup(current, intensity=cleanup_intensity)

    # Verification
    progress(0.9, desc="Verifying with AI detector …")
    detection_html = verify_detection(current)

    return s1_out, s2_out, current, detection_html


# =====================================================================
#  GRADIO UI
# =====================================================================

# Two-column layout: inputs and options on the left, per-stage and final
# outputs plus the detector report on the right.
with gr.Blocks(title="Humanization Pipeline") as demo:
    gr.Markdown(
        "# Humanization Pipeline\n"
        "**3-stage chain: T5 Humanizer → Qwen LLM Rewrite → Multi-Pass Cleanup → AI Detection Verify**"
    )

    with gr.Row():
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Input Text (AI-generated)",
                placeholder="Paste AI-generated text here …",
                lines=10,
            )

            # Choices mirror the keys of REWRITE_PROMPTS.
            style_dropdown = gr.Dropdown(
                choices=["Natural", "Casual", "Academic", "Professional"],
                value="Natural",
                label="Rewrite Style (Stage 2)",
            )

            intensity_slider = gr.Slider(
                minimum=0.1, maximum=1.0, value=0.7, step=0.05,
                label="LLM Rewrite Intensity (Stage 2)",
            )

            # (label, value) pairs; the values are the intensity levels
            # understood by stage3_multipass_cleanup.
            cleanup_intensity = gr.Radio(
                choices=[("Light", 1), ("Standard", 2), ("Heavy", 3)],
                value=2,
                label="Cleanup Intensity (Stage 3)",
            )

            with gr.Row():
                use_s1 = gr.Checkbox(label="Stage 1: T5 Humanizer", value=True)
                use_s2 = gr.Checkbox(label="Stage 2: Qwen LLM", value=True)
                use_s3 = gr.Checkbox(label="Stage 3: Multi-Pass", value=True)

            run_btn = gr.Button("Run Pipeline", variant="primary", size="lg")

        with gr.Column(scale=1):
            with gr.Accordion("Stage 1 Output (T5 Humanizer)", open=False):
                s1_output = gr.Textbox(label="After Stage 1", lines=5)

            with gr.Accordion("Stage 2 Output (Qwen LLM)", open=False):
                s2_output = gr.Textbox(label="After Stage 2", lines=5)

            final_output = gr.Textbox(
                label="Final Humanized Text",
                lines=10,
            )

            detection_result = gr.HTML(label="AI Detection Verification")

    # Input order must match run_pipeline's positional parameters; output
    # order must match its returned 4-tuple.
    run_btn.click(
        fn=run_pipeline,
        inputs=[input_text, style_dropdown, intensity_slider,
                use_s1, use_s2, use_s3, cleanup_intensity],
        outputs=[s1_output, s2_output, final_output, detection_result],
    )

    gr.Examples(
        examples=[
            ["The rapid advancement of artificial intelligence technologies has significantly transformed numerous industries and daily life."],
            ["Machine learning algorithms demonstrate superior performance in pattern recognition tasks across diverse datasets."],
            ["In conclusion, leveraging cutting-edge methodologies facilitates the optimization of robust and seamless solutions."],
        ],
        inputs=input_text,
        label="Test examples (heavily AI-flagged text)",
    )

# Start the Gradio app only when this file is run as a script.
# NOTE(review): share=True presumably requests a publicly reachable Gradio
# link in addition to the local server — confirm this is intended.
if __name__ == "__main__":
    demo.launch(debug=False, share=True)