File size: 32,812 Bytes
6416ff6
 
 
 
 
 
501dc1d
6416ff6
 
 
 
 
 
 
 
501dc1d
 
6416ff6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501dc1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6416ff6
 
 
501dc1d
6416ff6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501dc1d
6416ff6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
# app.py
"""

Merged Rephraser app

- GUI from original (first) file

- Models/logic from later big file (kept unchanged)

- Grammar highlight (red for issues; green underline for corrected words)

- File upload/download for .docx/.pdf/.txt with best-effort format preservation

- Tools independent (no automatic chaining)

- Prev/Next browsing for multi-version outputs

"""

import streamlit as st
import io, os, random, re, difflib, html, tempfile
from pathlib import Path

# IMPORTANT: st.set_page_config MUST be the first Streamlit command
st.set_page_config(page_title="Rephraser", layout="wide")

# --- Home button at the top ---
# st.rerun() restarts the script; widgets reset, session_state survives.
if st.button("🏠 Home"):
    st.rerun()

# Optional heavy libs (lazy imports used where needed)
# Each dependency degrades gracefully: the module name (or a boolean flag)
# doubles as the feature switch checked by the helpers further down.

# python-docx — .docx reading/writing
try:
    import docx
except Exception:
    docx = None

# PyMuPDF — PDF text extraction/generation
try:
    import fitz  # PyMuPDF
except Exception:
    fitz = None

# LanguageTool — primary grammar-checking backend
try:
    import language_tool_python
except Exception:
    language_tool_python = None

# TextBlob — fallback spelling/grammar correction
try:
    from textblob import TextBlob
except Exception:
    TextBlob = None

# NLTK / WordNet — synonym lookup for the paraphraser
try:
    import nltk
    from nltk.corpus import wordnet as wn
    nltk_available = True
except Exception:
    nltk_available = False

# spaCy — structural sentence transforms (noun chunks)
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
    SPACY_AVAILABLE = True
except Exception:
    nlp = None
    SPACY_AVAILABLE = False

# transformers check
try:
    import transformers
    TRANSFORMERS_AVAILABLE = True
except Exception:
    TRANSFORMERS_AVAILABLE = False

# SpellChecker
# NOTE(review): on import failure `spell` stays undefined — every caller
# must gate on SPELLCHECKER_AVAILABLE (the ones below currently do).
try:
    from spellchecker import SpellChecker
    SPELLCHECKER_AVAILABLE = True
    spell = SpellChecker()
except Exception:
    SPELLCHECKER_AVAILABLE = False

# pyperclip optional
try:
    import pyperclip
    PYPERCLIP = True
except Exception:
    PYPERCLIP = False
# -----------------------
# Session state init (preserve old behavior)
# -----------------------
# One-time defaults; values already present survive Streamlit reruns.
_SESSION_DEFAULTS = {
    "versions": [],
    "version_index": 0,
    "last_input": "",
    "current_text": "",
    "history": [],
    # bookkeeping for file uploads & grammar
    "_uploaded_bytes": None,
    "_uploaded_name": None,
    "_last_grammar_issues": None,
    "_last_output_file": None,
    "_last_output_name": None,
    "_last_tool": None,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default

# -----------------------
# Helpers: highlights & diffs
# -----------------------
def mark_grammar_issues(text, issues):
    """Return *text* as escaped HTML with each issue span underlined in red.

    Parameters
    ----------
    text : str
        The plain text being checked.
    issues : list[dict] | None
        Issue dicts with ``offset``, ``length`` and ``message`` keys
        (shape matches grammar_and_spelling_check output); ``replacements``
        is accepted but unused here.

    Returns
    -------
    str
        HTML-escaped text where each issue span is wrapped in a ``<span>``
        carrying a red bottom border and the message as a tooltip.
    """
    if not issues:
        return html.escape(text)
    spans = []
    for it in issues:
        off = it.get("offset", 0)
        length = it.get("length", 0)
        spans.append((off, off + length, it.get("message", "")))
    spans.sort()
    out = ""
    idx = 0
    for s, e, msg in spans:
        # Bug fix: overlapping issue spans used to rewind `idx`, duplicating
        # the overlapped text in the output. Clamp each span to start at the
        # current position and never end before it starts.
        s = max(s, idx)
        e = max(e, s)
        if s > idx:
            out += html.escape(text[idx:s])
        problem = html.escape(text[s:e])
        out += f'<span title="{html.escape(msg)}" style="border-bottom:2px solid #c0392b;">{problem}</span>'
        idx = e
    if idx < len(text):
        out += html.escape(text[idx:])
    return out

def underline_changes_in_output(orig, corrected):
    """Render *corrected* as HTML, green-underlining words that differ from *orig*.

    Word-level diff; words deleted from *orig* simply vanish from the output.
    Changed/inserted fragments are HTML-escaped; unchanged fragments are
    passed through as-is.
    """
    src_tokens = orig.split()
    dst_tokens = corrected.split()
    matcher = difflib.SequenceMatcher(a=src_tokens, b=dst_tokens)
    rendered = []
    for tag, _i1, _i2, j1, j2 in matcher.get_opcodes():
        if tag == "delete":
            continue
        fragment = " ".join(dst_tokens[j1:j2])
        if tag == "equal":
            rendered.append(fragment)
        else:  # "replace" or "insert"
            rendered.append(
                f'<span style="text-decoration: underline; text-decoration-color: #27ae60;">{html.escape(fragment)}</span>'
            )
    return " ".join(rendered) if rendered else html.escape(corrected)

## Green line
import html
import difflib

def text_to_html_with_highlights(orig, new):
    """Compare *orig* and *new* word by word; underline in green every word
    that was added or changed in *new*.

    Removed words (ndiff "- " entries) and ndiff's "? " hint lines produce
    no output.
    """
    pieces = []
    for entry in difflib.ndiff(orig.split(), new.split()):
        code, token = entry[:2], entry[2:]
        if code == "+ ":  # added or changed word
            pieces.append(
                f"<span style='color:black;text-decoration:underline;text-decoration-color:green'>{html.escape(token)}</span>"
            )
        elif code == "  ":  # unchanged word
            pieces.append(html.escape(token))
    return " ".join(pieces)

# -----------------------
# Paraphraser functions (kept from your big code)
# -----------------------
def paraphrase_variants_fast(text, n_variants=3):
    """Generate up to *n_variants* cheap rule-based paraphrases of *text*.

    Applies random structural tweaks (noun-chunk swap / comma-clause
    shuffle when spaCy is available, adjacent-word swaps otherwise) plus
    synonym substitution via ``_synonym_replace``.  Output is
    nondeterministic (uses the module-level ``random`` state).

    Returns a list of distinct non-empty variants; may be shorter than
    *n_variants* when duplicates are produced.
    """
    text = text.strip()
    if not text:
        return []
    # Naive sentence split on terminal punctuation followed by whitespace.
    sents = re.split(r'(?<=[.!?])\s+', text)
    variants = []
    for v in range(n_variants):
        outs = []
        for s in sents:
            sent = s.strip()
            if not sent:
                continue
            if SPACY_AVAILABLE:
                doc = nlp(sent)
                # small structural transforms
                # ~30% of sentences with >=2 noun chunks: swap the first
                # two chunks via a placeholder so the second replace does
                # not re-match the first substitution.
                if random.random() < 0.3 and len(list(doc.noun_chunks)) >= 2:
                    chunks = list(doc.noun_chunks)
                    text_chunks = [c.text for c in chunks]
                    s2 = sent
                    try:
                        s2 = s2.replace(text_chunks[0], "<<<A>>>").replace(text_chunks[1], text_chunks[0]).replace("<<<A>>>", text_chunks[1])
                    except Exception:
                        s2 = sent
                    outs.append(s2)
                    continue
                # ~40% of comma sentences: shuffle the comma-separated clauses.
                if ',' in sent and random.random() < 0.4:
                    parts = [p.strip() for p in sent.split(',')]
                    random.shuffle(parts)
                    outs.append(", ".join(parts))
                    continue
                # Otherwise synonym-substitute; later variants use a
                # slightly higher replacement probability.
                outs.append(_synonym_replace(sent, prob=0.15 + 0.05 * v))
            else:
                # No spaCy: ~20% of sentences get one adjacent-word swap.
                if random.random() < 0.2:
                    words = sent.split()
                    if len(words) > 3:
                        i = random.randint(0, len(words) - 3)
                        words[i], words[i+1] = words[i+1], words[i]
                    outs.append(" ".join(words))
                else:
                    outs.append(_synonym_replace(sent, prob=0.12 + 0.04 * v))
        final = " ".join(outs)
        # NOTE(review): this shuffles `sents` in place but then rebuilds
        # `final` from the unchanged `outs`, so the shuffle only changes the
        # sentence order seen by *subsequent* loop iterations — it looks
        # like the shuffled order was meant to be used here; confirm intent.
        if random.random() < 0.3 and len(sents) > 1:
            random.shuffle(sents)
            final = " ".join(outs)
        variants.append(final)
    # De-duplicate while preserving first-seen order.
    uniq = []
    for x in variants:
        if x not in uniq and x.strip():
            uniq.append(x)
    return uniq[:n_variants]

def _synonym_replace(sentence, prob=0.12, max_replacements=2):
    """Replace up to *max_replacements* words of *sentence* with WordNet
    synonyms; each word is considered independently with probability *prob*.

    When NLTK is unavailable, degrades to randomly swapping word positions
    (no synonym data to draw from).  Nondeterministic.
    """
    if not nltk_available:
        # Fallback: per-word random position swap with probability `prob`.
        words = sentence.split()
        for i in range(len(words)):
            if random.random() < prob:
                j = random.randrange(len(words))
                words[i], words[j] = words[j], words[i]
        return " ".join(words)
    # Tokenize keeping punctuation/whitespace runs so "".join round-trips.
    tokens = re.findall(r"\w+|\W+", sentence)
    words = [t for t in tokens]
    replaced = 0
    for i, tok in enumerate(words):
        # Skip punctuation/whitespace tokens.
        if not re.match(r'\w+', tok):
            continue
        lower = tok.lower()
        if random.random() > prob:
            continue
        syns = wn.synsets(lower)
        if not syns:
            continue
        # Take the first single-word lemma that differs from the original.
        cand = None
        for s in syns:
            for l in s.lemmas():
                name = l.name().replace('_', ' ')
                if name.lower() != lower and ' ' not in name:
                    cand = name
                    break
            if cand:
                break
        if cand:
            # Preserve the original word's leading capitalisation.
            if tok[0].isupper():
                cand = cand.capitalize()
            words[i] = cand
            replaced += 1
        if replaced >= max_replacements:
            break
    return "".join(words)

def simple_mix_versions(versions_list):
    """Remix several text versions into one.

    Pulls up to three sentences from each non-empty version (a random
    sample when the version has more), shuffles all picked sentences
    together, and joins them.  Nondeterministic.
    """
    if not versions_list:
        return ""
    picked = []
    for version in versions_list:
        cleaned = version.strip()
        if not cleaned:
            continue
        sentences = re.split(r'(?<=[.!?])\s+', cleaned)
        quota = max(1, min(3, len(sentences)))
        chosen = random.sample(sentences, quota) if len(sentences) > quota else sentences
        picked.extend(chosen)
    random.shuffle(picked)
    return " ".join(picked)

# -----------------------
# Plagiarism remover (kept)
# -----------------------
@st.cache_resource(show_spinner=False)
def load_small_model(model_name="t5-small"):
    """Load a seq2seq HF model once per server process (Streamlit-cached).

    Returns (tokenizer, model, text2text-generation pipeline), CPU-only
    (device=-1).  Raises ImportError when transformers is missing.
    """
    if not TRANSFORMERS_AVAILABLE:
        raise ImportError("transformers not installed")
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
    tok = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    pipe = pipeline("text2text-generation", model=model, tokenizer=tok, device=-1)
    return tok, model, pipe

def hf_paraphrase_with_pipe(pipe, text, max_len=256):
    """Run a text2text pipeline on *text*; fall back to the input on any error.

    Returns the first result's ``generated_text`` (or ``summary_text``),
    a string rendering of whatever the pipeline produced otherwise, and
    *text* unchanged when the pipeline raises.
    """
    try:
        result = pipe(text, max_length=max_len, do_sample=True, top_p=0.95, temperature=0.8, num_return_sequences=1)
        if isinstance(result, list) and result:
            first = result[0]
            return first.get("generated_text") or first.get("summary_text") or str(first)
        return str(result)
    except Exception:
        return text

def plagiarism_remover_pipeline(text, aggressive=1, light_only=False):
    """Produce up to five distinct rewrites of *text*.

    Always includes a fast rule-based paraphrase; when transformers is
    installed (and *light_only* is False) also tries t5-small and
    pegasus-xsum paraphrases; finally adds a sentence-level remix of
    everything collected so far.

    Parameters
    ----------
    text : str
        Input text to rewrite.
    aggressive : int
        Accepted for interface compatibility; not used in this pipeline.
    light_only : bool
        Skip the heavyweight transformer models when True.

    Returns
    -------
    list[str]
        De-duplicated, non-empty variants (at most 5).
    """
    versions = []
    # Bug fix: paraphrase_variants_fast is random, and the old code called
    # it twice (once for the truthiness check, once for the value) — so the
    # variant appended was not the one that was tested, and the work was
    # done twice.  Call it once and reuse the result.
    light = paraphrase_variants_fast(text, n_variants=1)
    versions.append(light[0] if light else text)
    if TRANSFORMERS_AVAILABLE and not light_only:
        try:
            _, _, t5_pipe = load_small_model("t5-small")
            versions.append(hf_paraphrase_with_pipe(t5_pipe, "paraphrase: " + text))
        except Exception:
            pass
        try:
            _, _, p_pipe = load_small_model("google/pegasus-xsum")
            versions.append(hf_paraphrase_with_pipe(p_pipe, text))
        except Exception:
            pass
    versions.append(simple_mix_versions(versions))
    # Keep first occurrence of each non-empty variant, capped at five.
    uniq = []
    for v in versions:
        if v and v.strip() and v not in uniq:
            uniq.append(v)
        if len(uniq) >= 5:
            break
    return uniq

# -----------------------
# Grammar & Spelling (kept)
# -----------------------
def grammar_and_spelling_check(text):
    """Correct *text* and report the issues found.

    Backend order: LanguageTool (corrected text + detailed issue dicts),
    then TextBlob (corrected text, no details), then a no-op fallback.
    Always returns a ``(corrected_text, issues)`` tuple.
    """
    if language_tool_python is not None:
        try:
            tool = language_tool_python.LanguageTool('en-US')
            matches = tool.check(text)
            fixed = language_tool_python.utils.correct(text, matches)
            issues = [
                {
                    "message": m.message,
                    "replacements": m.replacements,
                    "offset": m.offset,
                    "length": m.errorLength,
                    # 30 chars of surrounding context for display.
                    "context": text[max(0, m.offset - 30): m.offset + 30],
                }
                for m in matches
            ]
            return fixed, issues
        except Exception:
            pass  # fall through to the next backend
    if TextBlob is not None:
        try:
            return str(TextBlob(text).correct()), []
        except Exception:
            pass
    return text, []

def spelling_suggestions(word, top_n=5, sentence=None):
    """Return up to *top_n* suggestions for *word*.

    Prefers WordNet synonyms — POS-disambiguated from *sentence* context
    when NLTK tagging succeeds — and falls back to pyspellchecker
    candidates.  Returns [] for blank input, missing backends, or no hits.
    """
    if not word or not word.strip():
        return []

    # Map Penn Treebank POS tags onto the four WordNet POS classes.
    def get_wordnet_pos(treebank_tag):
        from nltk.corpus import wordnet
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        return None

    # Prefer WordNet synonyms with POS derived from the sentence context
    if nltk_available:
        wn_pos = None
        if sentence:
            try:
                tokens = nltk.word_tokenize(sentence)
                tagged = nltk.pos_tag(tokens)
                for tok, tag in tagged:
                    if tok.lower() == word.lower():
                        wn_pos = get_wordnet_pos(tag)
                        break
            except Exception:
                pass  # tagging is best-effort; fall back to untagged lookup

        syns = wn.synsets(word, pos=wn_pos) if wn_pos else wn.synsets(word)
        suggestions = set()
        for s in syns:
            for l in s.lemmas():
                name = l.name().replace('_', ' ')
                if name.lower() != word.lower():
                    suggestions.add(name)
        if suggestions:
            # sorted() gives a deterministic order for the set.
            return sorted(suggestions)[:top_n]

    # Fallback to spellchecker
    if SPELLCHECKER_AVAILABLE:
        # Bug fix: pyspellchecker >= 0.7 returns None (not an empty set)
        # when it has no candidates; list(None) raised TypeError.
        candidates = spell.candidates(word)
        return list(candidates)[:top_n] if candidates else []

    return []

# -----------------------
# File extract & write helpers (kept & added best-effort replace)
# -----------------------
def extract_text_from_docx_bytes(b):
    """Extract plain text from .docx bytes; paragraphs joined by blank lines.

    Raises RuntimeError when python-docx is not installed.
    """
    if docx is None:
        raise RuntimeError("python-docx not installed")
    document = docx.Document(io.BytesIO(b))
    return "\n\n".join(paragraph.text for paragraph in document.paragraphs)

def extract_text_from_pdf_bytes(b):
    """Extract all page text from PDF bytes via PyMuPDF, pages separated
    by blank lines.

    Raises RuntimeError when PyMuPDF is not installed.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF not installed")
    doc = fitz.open(stream=b, filetype="pdf")
    return "".join(page.get_text() + "\n\n" for page in doc)

def extract_text_from_txt_bytes(b):
    """Decode raw bytes to text: UTF-8 first, Latin-1 second.

    Latin-1 maps every byte value, so decoding always succeeds for real
    bytes input.  The old final fallback returned ``str(b)`` — the
    ``b'...'`` repr, never valid document text — and was unreachable for
    bytes anyway; a defensive ``str(b)`` is kept only for non-bytes input.
    """
    try:
        return b.decode("utf-8")
    except UnicodeDecodeError:
        # Any byte sequence is valid Latin-1; this cannot raise.
        return b.decode("latin-1")
    except Exception:
        # Defensive: non-bytes input (no .decode) — best-effort string.
        return str(b)

def make_docx_bytes_from_text(text):
    """Build a .docx file (as bytes) with one paragraph per blank-line block.

    Raises RuntimeError when python-docx is not installed.
    """
    if docx is None:
        raise RuntimeError("python-docx not installed")
    document = docx.Document()
    for block in text.split("\n\n"):
        document.add_paragraph(block)
    buffer = io.BytesIO()
    document.save(buffer)
    return buffer.getvalue()

def make_pdf_bytes_from_text(text):
    """Render plain text into a simple PDF: 72pt left/top margin, 14pt
    line step, new page after y passes 720.

    Raises RuntimeError when PyMuPDF is not installed; returns PDF bytes.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF not installed")
    doc = fitz.open()
    page = doc.new_page()
    y = 72
    for line in text.split("\n"):
        if y > 720:
            # Past the bottom margin — start a fresh page.
            page = doc.new_page()
            y = 72
        page.insert_text((72, y), line)
        y += 14
    data = doc.write()
    doc.close()
    return data

def _build_replacement_spans(orig_text, corrected_text):
    a = orig_text.split()
    b = corrected_text.split()
    sm = difflib.SequenceMatcher(a=a, b=b)
    spans = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == "equal":
            continue
        orig_span = " ".join(a[i1:i2]).strip()
        corr_span = " ".join(b[j1:j2]).strip()
        if orig_span:
            spans.append((orig_span, corr_span))
    spans.sort(key=lambda x: -len(x[0]))
    return spans

def apply_replacements_to_docx_bytes(original_bytes, orig_text, corrected_text):
    """Apply text corrections inside a .docx, preserving formatting (best-effort).

    Reloads the document from *original_bytes* and, for every diff span
    between *orig_text* and *corrected_text*, substitutes the text inside
    individual runs of body paragraphs and table cells.  A span is only
    replaced when it lies wholly within a single run.

    Returns the (possibly modified) document as bytes.  Raises
    RuntimeError when python-docx is not installed.

    Consistency fix: uses the module-level ``io.BytesIO`` throughout
    instead of mixing it with a function-local ``from io import BytesIO``.
    """
    if docx is None:
        raise RuntimeError("python-docx not installed")
    document = docx.Document(io.BytesIO(original_bytes))
    spans = _build_replacement_spans(orig_text, corrected_text)
    if not spans:
        # Nothing changed: round-trip the document untouched.
        out = io.BytesIO()
        document.save(out)
        return out.getvalue()

    def replace_in_paragraph_runs(par):
        # Spans arrive longest-original-first (see _build_replacement_spans),
        # so substrings of an already-replaced span are not re-matched.
        for orig_span, corr_span in spans:
            for run in par.runs:
                if orig_span in run.text:
                    run.text = run.text.replace(orig_span, corr_span)

    for p in document.paragraphs:
        replace_in_paragraph_runs(p)
    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                for p in cell.paragraphs:
                    replace_in_paragraph_runs(p)
    out = io.BytesIO()
    document.save(out)
    return out.getvalue()

def apply_replacements_to_pdf_bytes(original_bytes, orig_text, corrected_text):
    """Best-effort PDF replacement: redact the bboxes of changed words and
    write the corrected span in their place using PyMuPDF.

    Tokens of *orig_text* are mapped positionally onto the words extracted
    from the PDF (sorted into reading order), so results are only reliable
    when *orig_text* was itself extracted from this same PDF.

    Returns the modified PDF as bytes (the original bytes unchanged when
    the diff is empty). Raises RuntimeError when PyMuPDF is unavailable.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF not installed")
    orig_tokens = orig_text.split()
    corr_tokens = corrected_text.split()
    sm = difflib.SequenceMatcher(a=orig_tokens, b=corr_tokens)
    ops = [op for op in sm.get_opcodes() if op[0] != "equal"]
    if not ops:
        return original_bytes
    pdf = fitz.open(stream=original_bytes, filetype="pdf")
    # Flatten all words across pages in reading order: (page_no, word_tuple).
    global_words = []
    for pno in range(len(pdf)):
        words = pdf[pno].get_text("words")  # x0,y0,x1,y1, word, block_no, line_no, word_no
        # Sort by baseline (y1) then x0, rounded to damp float jitter.
        for w in sorted(words, key=lambda w: (round(w[3], 1), round(w[0], 1))):
            global_words.append((pno, w))
    # Positional token -> PDF word mapping (best effort; any surplus
    # tokens on either side simply stay unmapped).
    map_len = min(len(global_words), len(orig_tokens))
    token_to_global = {i: global_words[i] for i in range(map_len)}
    redactions_per_page = {}
    inserts_per_page = {}
    for tag, i1, i2, j1, j2 in ops:
        corr_span = " ".join(corr_tokens[j1:j2])
        # BUGFIX: redact EVERY mapped token of the changed span — the
        # previous version broke out of this loop after the first token,
        # leaving the remaining original words visible in the output.
        # Track the union rect (per page) so the corrected text is drawn
        # over the whole span, not just its first word.
        span_rect = None  # (page_no, fitz.Rect) of the span's union bbox
        for ti in range(i1, i2):
            if ti not in token_to_global:
                continue
            pno, wtuple = token_to_global[ti]
            bbox = fitz.Rect(wtuple[0], wtuple[1], wtuple[2], wtuple[3])
            redactions_per_page.setdefault(pno, []).append(bbox)
            if span_rect is None:
                span_rect = (pno, fitz.Rect(bbox))
            elif span_rect[0] == pno:
                # Grow the union rect; words on other pages are still
                # redacted but only the first page receives the insert.
                span_rect[1].include_rect(bbox)
        # Skip inserts for pure deletions (empty corrected span).
        if span_rect is not None and corr_span:
            pno, rect = span_rect
            inserts_per_page.setdefault(pno, []).append((rect, corr_span))
    for pno, rects in redactions_per_page.items():
        page = pdf[pno]
        for r in rects:
            page.add_redact_annot(r, fill=(1, 1, 1))  # white-out the old word
        page.apply_redactions()
        for bbox, corr_span in inserts_per_page.get(pno, []):
            # Scale the font to roughly fit the redacted line height.
            fontsize = max(6, round(bbox.height * 0.8))
            try:
                page.insert_textbox(bbox, corr_span, fontsize=fontsize, fontname="helv", align=0)
            except Exception:
                # insert_textbox can fail when the rect is too small; fall
                # back to free insertion at the rect's top-left corner.
                page.insert_text((bbox.x0, bbox.y0), corr_span, fontsize=fontsize, fontname="helv")
    out = pdf.write()
    pdf.close()
    return out

# -----------------------
# UI (first file's GUI style) with Prev/Next variants and independent tools
# -----------------------
st.title("Rephraser — Paraphrase · Plagiarism Remover · Grammar & Spelling")
st.markdown("Paste text or upload DOCX/PDF/TXT. Tools are independent and chainable (use output as input manually).")

# Two-column layout: input widgets and tool descriptions on the left,
# action buttons on the right.
col_left, col_right = st.columns([2,1])
with col_left:
    input_mode = st.radio("Input:", ("Paste text", "Upload file (.docx/.pdf/.txt)"))
    uploaded_bytes = None
    uploaded_name = None
    input_text = ""
    if input_mode == "Paste text":
        input_text = st.text_area("Paste your paragraph(s) here:", height=200, value=st.session_state.current_text or "")
        # clear upload memory
        st.session_state._uploaded_bytes = None
        st.session_state._uploaded_name = None
    else:
        uploaded = st.file_uploader("Upload .docx, .pdf or .txt", type=["docx","pdf","txt"])
        if uploaded is not None:
            # Persist the raw bytes/name in session_state so the grammar tool
            # can later rebuild a corrected file in the original format.
            uploaded_bytes = uploaded.read()
            uploaded_name = uploaded.name
            st.session_state._uploaded_bytes = uploaded_bytes
            st.session_state._uploaded_name = uploaded_name
            try:
                # Dispatch on file extension to the matching text extractor;
                # anything that is not .docx/.pdf is treated as plain text.
                if uploaded.name.lower().endswith(".docx"):
                    input_text = extract_text_from_docx_bytes(uploaded_bytes)
                elif uploaded.name.lower().endswith(".pdf"):
                    input_text = extract_text_from_pdf_bytes(uploaded_bytes)
                else:
                    input_text = extract_text_from_txt_bytes(uploaded_bytes)
                st.success(f"Loaded {uploaded.name} (approx {len(input_text.split())} words)")
            except Exception as e:
                st.error(f"Could not extract text from file: {e}")
    st.markdown("**Tools (choose one)**")
    st.markdown("- **Para-phraser (fast):** Focused on rephrase sentence, regardless of Plagiarism ")
    st.markdown("- **Plagiarism Remover (deep):** Focused on Plagiarism, Convert text to human like  ")
    st.markdown("- **Grammar & Spelling:** Spelling And Grammar Check")

with col_right:
    st.header("Actions")
    variants_to_generate = st.slider("Max variants (deep)", 1, 5, 3)
    use_light_only = st.checkbox("Force light-only (no HF models)", value=True)
    # --- Tool 1: fast paraphrase ---
    if st.button("1) Para-phraser (fast)"):
        st.session_state._last_tool = "paraphrase"
        # Prefer freshly pasted/uploaded text; fall back to the working text.
        source = input_text.strip() or st.session_state.current_text.strip()
        if not source:
            st.warning("Provide text or upload a file first.")
        else:
            st.session_state.history.append(st.session_state.current_text or source)  # snapshot for Undo
            variants = paraphrase_variants_fast(source, n_variants=variants_to_generate)
            if not variants:
                st.error("No paraphrase produced.")
            else:
                # Reset version browsing to the first variant and clear any
                # stale grammar/file outputs from a previous run.
                st.session_state.versions = variants
                st.session_state.version_index = 0
                st.session_state.current_text = variants[0]
                st.session_state.last_input = source
                st.session_state._last_grammar_issues = None
                st.session_state._last_output_file = None
                st.success("Para-phraser done. Use Prev/Next to browse.")

    # --- Tool 2: deep plagiarism-removal pipeline ---
    if st.button("2) Plagiarism Remover (deep)"):
        st.session_state._last_tool = "plagiarism"
        source = input_text.strip() or st.session_state.current_text.strip()
        if not source:
            st.warning("Provide text or upload a file first.")
        else:
            st.session_state.history.append(st.session_state.current_text or source)  # snapshot for Undo
            st.info("Running plagiarism remover pipeline...")
            try:
                variants = plagiarism_remover_pipeline(source, aggressive=1, light_only=use_light_only)
            except Exception as e:
                # Fall back to the fast paraphraser if the deep pipeline fails.
                st.error(f"Pipeline failed: {e}")
                variants = paraphrase_variants_fast(source, n_variants=variants_to_generate)
            if not variants:
                st.error("No variants produced.")
            else:
                st.session_state.versions = variants
                st.session_state.version_index = 0
                st.session_state.current_text = variants[0]
                st.session_state.last_input = source
                st.session_state._last_grammar_issues = None
                st.session_state._last_output_file = None
                st.success(f"Produced {len(variants)} variants.")

    # --- Tool 3: grammar & spelling check (also rebuilds uploaded files) ---
    if st.button("3) Grammar & Spelling (check)"):
        st.session_state._last_tool = "grammar"
        # NOTE: unlike tools 1/2 this prefers current_text over the pasted
        # input, so it chains naturally after a paraphrase run.
        source = st.session_state.current_text.strip() or input_text.strip()
        if not source:
            st.warning("Provide text or upload a file first.")
        else:
            st.session_state.history.append(st.session_state.current_text or source)  # snapshot for Undo
            try:
                corrected, issues = grammar_and_spelling_check(source)
                st.session_state.current_text = corrected
                st.session_state.versions = [corrected]
                st.session_state.version_index = 0
                st.session_state._last_grammar_issues = issues or []
                st.success(f"Grammar check applied ({len(issues)} issues).")

                # File-level output if uploaded
                uploaded_bytes = st.session_state.get("_uploaded_bytes")
                uploaded_name = st.session_state.get("_uploaded_name")
                if uploaded_bytes and uploaded_name:
                    suffix = Path(uploaded_name).suffix.lower()
                    try:
                        # Rebuild a corrected file in the original format when
                        # the matching library is available; otherwise fall
                        # back to a freshly generated DOCX.
                        if suffix == ".docx" and docx is not None:
                            out_bytes = apply_replacements_to_docx_bytes(uploaded_bytes, source, corrected)
                            st.session_state._last_output_file = out_bytes
                            st.session_state._last_output_name = f"corrected_{uploaded_name}"
                        elif suffix == ".pdf" and fitz is not None:
                            out_bytes = apply_replacements_to_pdf_bytes(uploaded_bytes, source, corrected)
                            st.session_state._last_output_file = out_bytes
                            st.session_state._last_output_name = f"corrected_{uploaded_name}"
                        elif suffix == ".txt":
                            st.session_state._last_output_file = corrected.encode("utf-8")
                            st.session_state._last_output_name = f"corrected_{uploaded_name}"
                        else:
                            st.session_state._last_output_file = make_docx_bytes_from_text(corrected)
                            st.session_state._last_output_name = "corrected_output.docx"
                    except Exception as e:
                        # Best-effort: fall back to plain-text output paths.
                        st.warning(f"Could not create corrected file preserving format: {e}")
                        st.session_state._last_output_file = None
                        st.session_state._last_output_name = None

                if issues:
                    st.subheader("Detected issues (sample):")
                    # Show at most the first 30 issues to keep the page short.
                    for i, it in enumerate(issues[:30]):
                        st.write(f"- {it.get('message')} → suggestions: {it.get('replacements')}")
            except Exception as e:
                st.error(f"Grammar check failed: {e}")

# Navigation
st.markdown("---")
st.subheader("Preview / Versions")
colv1, colv2, colv3 = st.columns([1,1,2])
with colv1:
    if st.button("◀ Previous Version"):
        if st.session_state.versions:
            # Clamp at the first version.
            st.session_state.version_index = max(0, st.session_state.version_index - 1)
            st.session_state.current_text = st.session_state.versions[st.session_state.version_index]
with colv2:
    if st.button("Next Version ▶"):
        if st.session_state.versions:
            # Clamp at the last version.
            st.session_state.version_index = min(len(st.session_state.versions)-1, st.session_state.version_index + 1)
            st.session_state.current_text = st.session_state.versions[st.session_state.version_index]
with colv3:
    st.write(f"Version {st.session_state.version_index+1} of {max(1, len(st.session_state.versions))}")


# Preview
st.markdown("---")
st.subheader("Original (top) — Processed Output (bottom)")
orig_display = st.session_state.last_input or ""
out_display = st.session_state.current_text or (input_text or "")

# Grammar runs get a two-pane diff view (issues highlighted / changes
# underlined); every other tool gets a single highlighted preview pane.
if st.session_state._last_tool == "grammar" and out_display.strip():
    orig_html = mark_grammar_issues(orig_display, st.session_state._last_grammar_issues or []) if orig_display else html.escape(orig_display)
    out_html = underline_changes_in_output(orig_display or "", out_display)
    st.markdown("<b>Original (issues highlighted)</b>", unsafe_allow_html=True)
    st.markdown(f"<div style='padding:8px;border:1px solid #e6e6e6;background:transparent;white-space:pre-wrap'>{orig_html}</div>", unsafe_allow_html=True)
    st.markdown("<b>Corrected (changes underlined in green)</b>", unsafe_allow_html=True)
    st.markdown(f"<div style='padding:8px;border:1px solid #e6e6e6;background:transparent;white-space:pre-wrap'>{out_html}</div>", unsafe_allow_html=True)
else:
    # generic preview (green underlines for changed parts — new function)
    preview_html = text_to_html_with_highlights(orig_display, out_display) if orig_display else html.escape(out_display)
    st.markdown(
        f"""

        <div style='padding:10px;border:1px solid #eee;background:transparent;white-space:pre-wrap'>

            {preview_html}

        </div>

        """,
        unsafe_allow_html=True
    )

# Editable area
st.subheader("Editable result (you can manually edit before saving)")
st.session_state.editable_area = st.text_area("Edit here:", value=st.session_state.current_text or out_display, height=300)

# If corrected file available (uploaded+grammar), download
if st.session_state._last_output_file is not None and st.session_state._last_output_name:
    st.markdown("**Download corrected file**")
    st.download_button("Download corrected file", data=st.session_state._last_output_file, file_name=st.session_state._last_output_name)

# Spelling suggestions & apply edits
st.markdown("---")
st.markdown("**Spelling suggestions / replace single word:**")
col_s1, col_s2 = st.columns([2,3])
with col_s1:
    word_for_sugg = st.text_input("Enter token to suggest replacements:", value="")
    if st.button("Get suggestions"):
        if not word_for_sugg.strip():
            st.warning("Type a token to get suggestions.")
        else:
            suggs = spelling_suggestions(word_for_sugg, sentence=st.session_state.editable_area)
            if suggs:
                # NOTE(review): this selectbox lives inside a button handler,
                # so it disappears on the rerun triggered by picking an option
                # — the replacement likely never applies. Consider persisting
                # the suggestions in session_state instead; verify in the UI.
                sel = st.selectbox("Choose replacement:", options=["(keep)"] + suggs)
                if sel and sel != "(keep)":
                    st.session_state.editable_area = st.session_state.editable_area.replace(word_for_sugg, sel)
                    st.success(f"Replaced '{word_for_sugg}' with '{sel}'")
            else:
                st.info("No suggestions found.")
with col_s2:
    if st.button("Apply editable area to current text"):
        st.session_state.current_text = st.session_state.editable_area
        st.success("Applied edits to current text.")

# Save / Download / Copy for plain text
st.markdown("---")
col_d1, col_d2, col_d3 = st.columns(3)
with col_d1:
    if st.button("Save as DOCX"):
        try:
            # NOTE(review): a download_button nested inside a button handler
            # only renders for a single run and vanishes on the next rerun.
            b = make_docx_bytes_from_text(st.session_state.editable_area or "")
            st.download_button("Download DOCX", data=b, file_name="rephrased.docx", mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document")
        except Exception as e:
            st.error(f"Could not create DOCX: {e}")
with col_d2:
    if st.button("Save as PDF"):
        try:
            b = make_pdf_bytes_from_text(st.session_state.editable_area or "")
            st.download_button("Download PDF", data=b, file_name="rephrased.pdf", mime="application/pdf")
        except Exception as e:
            st.error(f"Could not create PDF: {e}")
with col_d3:
    if st.button("Copy to clipboard"):
        if PYPERCLIP:
            pyperclip.copy(st.session_state.editable_area or "")
            st.success("Copied to clipboard")
        else:
            # Fallback when pyperclip is missing: write to a temp file so the
            # user can still retrieve the text.
            path = os.path.join(tempfile.gettempdir(), "rephrased_output.txt")
            with open(path, "w", encoding="utf-8") as f:
                f.write(st.session_state.editable_area or "")
            st.info(f"Saved to {path} (pyperclip not available)")

# Undo
if st.button("Undo"):
    if st.session_state.history:
        # Restore the last snapshot and collapse versions to just that text.
        st.session_state.current_text = st.session_state.history.pop()
        st.session_state.versions = [st.session_state.current_text]
        st.session_state.version_index = 0
        st.success("Undone last step")
    else:
        st.info("Nothing to undo")

st.markdown("---")
st.caption("Notes: Paraphraser & Plagiarism Remover code preserved. Grammar prefers LanguageTool (requires Java) else falls back to TextBlob. DOCX/PDF replacements are best-effort to preserve layout.")

# refresh button
# --- Refresh button at the bottom ---
if st.button("🔄 Refresh"):
    st.rerun()