AI-Solutions-KK committed on
Commit
6416ff6
·
unverified ·
1 Parent(s): f89b8b7

Testing app

Browse files

Old_paraphraser_app.py_single_long_code_streamlit_test

Files changed (2) hide show
  1. app.py +841 -0
  2. requirements.txt +14 -0
app.py ADDED
@@ -0,0 +1,841 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ """
3
+ Merged Rephraser app
4
+ - GUI from original (first) file
5
+ - Models/logic from later big file (kept unchanged)
6
+ - Grammar highlight (red for issues; green underline for corrected words)
7
+ - File upload/download for .docx/.pdf/.txt with best-effort format preservation
8
+ - Tools independent (no automatic chaining)
9
+ - Prev/Next browsing for multi-version outputs
10
+ """
11
+
12
+ import streamlit as st
13
+ import io, os, random, re, difflib, html, tempfile
14
+ from pathlib import Path
15
+
16
# --- "🏠 Home" button rendered at the very top of the page ---
# NOTE(review): st.rerun() restarts the script but does not clear
# st.session_state, so this is a soft refresh rather than a true reset.
if st.button("🏠 Home"):
    st.rerun()
21
+
22
+
23
# Optional heavy libs (lazy imports used where needed).
# Each dependency is probed individually; a module-level flag/None sentinel
# records availability so the rest of the app can degrade gracefully.
try:
    import docx  # python-docx, for .docx read/write
except Exception:
    docx = None

try:
    import fitz  # PyMuPDF, for PDF read/write
except Exception:
    fitz = None

try:
    import language_tool_python  # grammar checking (needs Java at runtime)
except Exception:
    language_tool_python = None

try:
    from textblob import TextBlob  # fallback grammar/spelling corrector
except Exception:
    TextBlob = None

# NLTK / WordNet — used for synonym substitution and POS tagging
try:
    import nltk
    from nltk.corpus import wordnet as wn
    nltk_available = True
except Exception:
    nltk_available = False

# spaCy — used for noun-chunk based paraphrase transforms
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
    SPACY_AVAILABLE = True
except Exception:
    nlp = None
    SPACY_AVAILABLE = False

# transformers check — heavyweight HF paraphrase models are optional
try:
    import transformers
    TRANSFORMERS_AVAILABLE = True
except Exception:
    TRANSFORMERS_AVAILABLE = False

# SpellChecker (pyspellchecker) — `spell` is only defined when available;
# all call sites are guarded by SPELLCHECKER_AVAILABLE.
try:
    from spellchecker import SpellChecker
    SPELLCHECKER_AVAILABLE = True
    spell = SpellChecker()
except Exception:
    SPELLCHECKER_AVAILABLE = False

# pyperclip optional — clipboard copy; falls back to a temp file when missing
try:
    import pyperclip
    PYPERCLIP = True
except Exception:
    PYPERCLIP = False
82
+
83
# -----------------------
# Session state init (preserve old behavior)
# -----------------------
# Every key the app relies on, declared in one table; each key is seeded
# only when absent so values survive Streamlit reruns.
_SESSION_DEFAULTS = {
    # core editing state
    "versions": [],
    "version_index": 0,
    "last_input": "",
    "current_text": "",
    "history": [],
    # bookkeeping for file uploads & grammar
    "_uploaded_bytes": None,
    "_uploaded_name": None,
    "_last_grammar_issues": None,
    "_last_output_file": None,
    "_last_output_name": None,
    "_last_tool": None,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
109
+
110
+ # -----------------------
111
+ # Helpers: highlights & diffs
112
+ # -----------------------
113
def mark_grammar_issues(text, issues):
    """Render *text* as HTML with each grammar issue underlined in red.

    Parameters:
        text: the original plain text.
        issues: list of dicts with "offset", "length" and "message" keys
            (as produced by grammar_and_spelling_check).

    Returns:
        HTML-escaped string; each issue span is wrapped in a <span> whose
        title attribute carries the message. Overlapping spans are clipped
        so no text is duplicated.
    """
    if not issues:
        return html.escape(text)
    spans = []
    for it in issues:
        off = it.get("offset", 0)
        length = it.get("length", 0)
        msg = it.get("message", "")
        spans.append((off, off + length, msg))
    spans.sort()
    parts = []
    idx = 0
    for s, e, msg in spans:
        # BUGFIX: clip spans that overlap an already-emitted region — the
        # original duplicated text and could move the cursor backwards.
        s = max(s, idx)
        if e <= s:
            continue
        if s > idx:
            parts.append(html.escape(text[idx:s]))
        problem = html.escape(text[s:e])
        parts.append(f'<span title="{html.escape(msg)}" style="border-bottom:2px solid #c0392b;">{problem}</span>')
        idx = e
    if idx < len(text):
        parts.append(html.escape(text[idx:]))
    return "".join(parts)
135
+
136
def underline_changes_in_output(orig, corrected):
    """Render *corrected* as HTML, green-underlining every token run that
    differs from *orig* (token-level diff; deleted tokens are dropped)."""
    GREEN = '<span style="text-decoration: underline; text-decoration-color: #27ae60;">{}</span>'
    src_tokens = orig.split()
    dst_tokens = corrected.split()
    matcher = difflib.SequenceMatcher(a=src_tokens, b=dst_tokens)
    rendered = []
    for op, _a0, _a1, b0, b1 in matcher.get_opcodes():
        segment = " ".join(dst_tokens[b0:b1])
        if op == "equal":
            rendered.append(segment)
        elif op in ("replace", "insert"):
            rendered.append(GREEN.format(html.escape(segment)))
        # "delete": the tokens exist only in orig — nothing to render.
    return " ".join(rendered) if rendered else html.escape(corrected)
153
+
154
+ ## Green line
155
+ import html
156
+ import difflib
157
+
158
def text_to_html_with_highlights(orig, new):
    """Compare *orig* and *new* word-by-word and return *new* as HTML with
    added/changed words underlined in green; removed words are dropped."""
    CHANGED = "<span style='color:black;text-decoration:underline;text-decoration-color:green'>{}</span>"
    pieces = []
    for entry in difflib.ndiff(orig.split(), new.split()):
        code, token = entry[:2], entry[2:]
        if code == "+ ":
            # Token added or changed in the new text.
            pieces.append(CHANGED.format(html.escape(token)))
        elif code == "  ":
            # Token unchanged.
            pieces.append(html.escape(token))
        # "- " (removed) and "? " (ndiff hint) entries are skipped.
    return " ".join(pieces)
178
+
179
+ # -----------------------
180
+ # Paraphraser functions (kept from your big code)
181
+ # -----------------------
182
def paraphrase_variants_fast(text, n_variants=3):
    """Produce up to *n_variants* lightweight paraphrases of *text*.

    Splits the text into sentences, then per variant applies cheap random
    transforms: with spaCy, occasional noun-chunk swaps / comma-clause
    shuffles / synonym substitution; without spaCy, occasional adjacent-word
    swaps or synonym substitution. Duplicate/empty variants are dropped.

    Returns:
        list of distinct variant strings (possibly shorter than n_variants;
        empty for blank input).
    """
    text = text.strip()
    if not text:
        return []
    sents = re.split(r'(?<=[.!?])\s+', text)
    variants = []
    for v in range(n_variants):
        outs = []
        for s in sents:
            sent = s.strip()
            if not sent:
                continue
            if SPACY_AVAILABLE:
                doc = nlp(sent)
                # Small structural transform: swap the first two noun chunks
                # via a placeholder.
                if random.random() < 0.3 and len(list(doc.noun_chunks)) >= 2:
                    chunks = list(doc.noun_chunks)
                    text_chunks = [c.text for c in chunks]
                    s2 = sent
                    try:
                        s2 = s2.replace(text_chunks[0], "<<<A>>>").replace(text_chunks[1], text_chunks[0]).replace("<<<A>>>", text_chunks[1])
                    except Exception:
                        s2 = sent
                    outs.append(s2)
                    continue
                # Occasionally shuffle comma-separated clauses.
                if ',' in sent and random.random() < 0.4:
                    parts = [p.strip() for p in sent.split(',')]
                    random.shuffle(parts)
                    outs.append(", ".join(parts))
                    continue
                outs.append(_synonym_replace(sent, prob=0.15 + 0.05 * v))
            else:
                if random.random() < 0.2:
                    # Swap one adjacent word pair.
                    words = sent.split()
                    if len(words) > 3:
                        i = random.randint(0, len(words) - 3)
                        words[i], words[i+1] = words[i+1], words[i]
                    outs.append(" ".join(words))
                else:
                    outs.append(_synonym_replace(sent, prob=0.12 + 0.04 * v))
        # Occasionally reorder this variant's sentences.
        # BUGFIX: the original shuffled `sents` (mutating the shared list for
        # all later variants) and then re-joined the *unshuffled* `outs`, so
        # the shuffle never affected the emitted variant.
        if random.random() < 0.3 and len(sents) > 1:
            shuffled = outs[:]
            random.shuffle(shuffled)
            final = " ".join(shuffled)
        else:
            final = " ".join(outs)
        variants.append(final)
    uniq = []
    for x in variants:
        if x not in uniq and x.strip():
            uniq.append(x)
    return uniq[:n_variants]
232
+
233
def _synonym_replace(sentence, prob=0.12, max_replacements=2):
    """Randomly substitute up to *max_replacements* words with WordNet
    synonyms (probability *prob* per word). Without NLTK, falls back to
    randomly swapping word positions instead."""
    if not nltk_available:
        # No WordNet: shuffle word positions with the same per-word chance.
        toks = sentence.split()
        for idx in range(len(toks)):
            if random.random() < prob:
                other = random.randrange(len(toks))
                toks[idx], toks[other] = toks[other], toks[idx]
        return " ".join(toks)

    def _first_single_word_synonym(word_lc):
        # First WordNet lemma that differs from the word and is one token.
        for synset in wn.synsets(word_lc):
            for lemma in synset.lemmas():
                candidate = lemma.name().replace('_', ' ')
                if candidate.lower() != word_lc and ' ' not in candidate:
                    return candidate
        return None

    # Tokenize while keeping separators so the sentence can be rebuilt
    # with "".join().
    pieces = re.findall(r"\w+|\W+", sentence)
    swapped = 0
    for pos, piece in enumerate(pieces):
        if not re.match(r'\w+', piece):
            continue  # punctuation / whitespace token
        lowered = piece.lower()
        if random.random() > prob:
            continue
        if not wn.synsets(lowered):
            continue
        replacement = _first_single_word_synonym(lowered)
        if replacement:
            if piece[0].isupper():
                replacement = replacement.capitalize()
            pieces[pos] = replacement
            swapped += 1
        if swapped >= max_replacements:
            break
    return "".join(pieces)
270
+
271
def simple_mix_versions(versions_list):
    """Blend several candidate texts into one by sampling up to three
    sentences from each and shuffling the combined pool."""
    if not versions_list:
        return ""
    pool = []
    for candidate in versions_list:
        stripped = candidate.strip()
        if not stripped:
            continue
        sentences = re.split(r'(?<=[.!?])\s+', stripped)
        quota = max(1, min(3, len(sentences)))
        if len(sentences) > quota:
            pool.extend(random.sample(sentences, quota))
        else:
            pool.extend(sentences)
    random.shuffle(pool)
    return " ".join(pool)
285
+
286
# -----------------------
# Plagiarism remover (kept)
# -----------------------
@st.cache_resource(show_spinner=False)
def load_small_model(model_name="t5-small"):
    """Load and cache a HF seq2seq model + tokenizer + CPU pipeline.

    Cached via st.cache_resource so each model is loaded once per server
    process. Raises ImportError when transformers is not installed.
    Returns (tokenizer, model, text2text pipeline).
    """
    if not TRANSFORMERS_AVAILABLE:
        raise ImportError("transformers not installed")
    # Imported lazily so the app starts even without transformers installed.
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
    tok = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    # device=-1 forces CPU inference.
    pipe = pipeline("text2text-generation", model=model, tokenizer=tok, device=-1)
    return tok, model, pipe
298
+
299
def hf_paraphrase_with_pipe(pipe, text, max_len=256):
    """Run *text* through a HF text2text pipeline; on any failure fall back
    to returning the input unchanged."""
    try:
        result = pipe(
            text,
            max_length=max_len,
            do_sample=True,
            top_p=0.95,
            temperature=0.8,
            num_return_sequences=1,
        )
        if not (isinstance(result, list) and result):
            return str(result)
        first = result[0]
        return first.get("generated_text") or first.get("summary_text") or str(first)
    except Exception:
        # Best-effort: a broken pipeline degrades to the original text.
        return text
307
+
308
def plagiarism_remover_pipeline(text, aggressive=1, light_only=False):
    """Generate up to five rewrites of *text* intended to reduce textual overlap.

    Always includes a fast lexical paraphrase; when transformers are
    available and *light_only* is False, also tries T5 and Pegasus rewrites;
    finally appends a sentence-level remix of everything produced so far.

    Parameters:
        text: input passage.
        aggressive: currently unused (kept for interface compatibility).
        light_only: skip the heavyweight HF models when True.

    Returns:
        de-duplicated list of non-empty variants (at most 5).
    """
    versions = []
    # BUGFIX: call the fast paraphraser once instead of twice — the original
    # invoked it a second time just to test emptiness, doing the work twice
    # and indexing into a *different* random result than the one it tested.
    light_variants = paraphrase_variants_fast(text, n_variants=1)
    versions.append(light_variants[0] if light_variants else text)
    if TRANSFORMERS_AVAILABLE and not light_only:
        try:
            _, _, t5_pipe = load_small_model("t5-small")
            versions.append(hf_paraphrase_with_pipe(t5_pipe, "paraphrase: " + text))
        except Exception:
            pass  # best-effort: missing weights / download failure is fine
        try:
            _, _, p_pipe = load_small_model("google/pegasus-xsum")
            versions.append(hf_paraphrase_with_pipe(p_pipe, text))
        except Exception:
            pass
    versions.append(simple_mix_versions(versions))
    uniq = []
    for v in versions:
        if v and v.strip() and v not in uniq:
            uniq.append(v)
        if len(uniq) >= 5:
            break
    return uniq
334
+
335
+ # -----------------------
336
+ # Grammar & Spelling (kept)
337
+ # -----------------------
338
def grammar_and_spelling_check(text):
    """Correct grammar/spelling in *text*.

    Prefers LanguageTool (detailed issue list), falls back to TextBlob
    (no issue details), and finally returns the input unchanged.

    Returns:
        (corrected_text, issues) where issues is a list of dicts with
        "message", "replacements", "offset", "length" and "context" keys.
    """
    if language_tool_python is not None:
        try:
            tool = language_tool_python.LanguageTool('en-US')
            matches = tool.check(text)
            corrected = language_tool_python.utils.correct(text, matches)
            issues = [
                {
                    "message": m.message,
                    "replacements": m.replacements,
                    "offset": m.offset,
                    "length": m.errorLength,
                    # ±30 chars of surrounding text for display purposes.
                    "context": text[max(0, m.offset - 30): m.offset + 30],
                }
                for m in matches
            ]
            return corrected, issues
        except Exception:
            pass  # e.g. Java missing — fall through to TextBlob
    if TextBlob is not None:
        try:
            return str(TextBlob(text).correct()), []
        except Exception:
            pass
    # No checker available: return the text untouched.
    return text, []
364
+
365
def spelling_suggestions(word, top_n=5):
    """Return up to *top_n* spelling candidates for *word*; empty when
    pyspellchecker is unavailable.

    NOTE(review): this definition is shadowed by a richer, context-aware
    version defined later in this file.
    """
    if not SPELLCHECKER_AVAILABLE:
        return []
    return list(spell.candidates(word))[:top_n]
370
+
371
+ # -----------------------
372
+ # File extract & write helpers (kept & added best-effort replace)
373
+ # -----------------------
374
def extract_text_from_docx_bytes(b):
    """Extract plain text from .docx bytes; paragraphs are joined with
    blank lines. Raises RuntimeError when python-docx is unavailable."""
    if docx is None:
        raise RuntimeError("python-docx not installed")
    document = docx.Document(io.BytesIO(b))
    return "\n\n".join(paragraph.text for paragraph in document.paragraphs)
381
+
382
def extract_text_from_pdf_bytes(b):
    """Extract plain text from PDF bytes via PyMuPDF; each page's text is
    followed by a blank line. Raises RuntimeError when PyMuPDF is missing."""
    if fitz is None:
        raise RuntimeError("PyMuPDF not installed")
    document = fitz.open(stream=b, filetype="pdf")
    chunks = [page.get_text() + "\n\n" for page in document]
    return "".join(chunks)
390
+
391
def extract_text_from_txt_bytes(b):
    """Decode raw text-file bytes to str.

    Tries UTF-8 first, then Latin-1 (which maps every byte, so it cannot
    fail on bytes input). The str() fallback only triggers when *b* is not
    a bytes-like object at all.
    """
    try:
        return b.decode("utf-8")
    except UnicodeDecodeError:
        # Not valid UTF-8 — Latin-1 always succeeds on bytes.
        return b.decode("latin-1")
    except AttributeError:
        # BUGFIX(narrowed): the original caught bare Exception twice; only
        # a non-bytes input can reach this point, so coerce it explicitly.
        return str(b)
399
+
400
def make_docx_bytes_from_text(text):
    """Build a minimal .docx from plain text; blank-line-separated chunks
    become paragraphs. Returns the document as bytes."""
    if docx is None:
        raise RuntimeError("python-docx not installed")
    document = docx.Document()
    for paragraph in text.split("\n\n"):
        document.add_paragraph(paragraph)
    buffer = io.BytesIO()
    document.save(buffer)
    return buffer.getvalue()
410
+
411
def make_pdf_bytes_from_text(text):
    """Render plain text into a simple PDF: one input line per text row,
    14pt line spacing, new page once y passes 720. Returns PDF bytes."""
    if fitz is None:
        raise RuntimeError("PyMuPDF not installed")
    document = fitz.open()
    page = document.new_page()
    y = 72  # top margin, in points
    for line in text.split("\n"):
        if y > 720:
            # Past the usable area — start a fresh page.
            page = document.new_page()
            y = 72
        page.insert_text((72, y), line)
        y += 14
    data = document.write()
    document.close()
    return data
427
+
428
+ def _build_replacement_spans(orig_text, corrected_text):
429
+ a = orig_text.split()
430
+ b = corrected_text.split()
431
+ sm = difflib.SequenceMatcher(a=a, b=b)
432
+ spans = []
433
+ for tag, i1, i2, j1, j2 in sm.get_opcodes():
434
+ if tag == "equal":
435
+ continue
436
+ orig_span = " ".join(a[i1:i2]).strip()
437
+ corr_span = " ".join(b[j1:j2]).strip()
438
+ if orig_span:
439
+ spans.append((orig_span, corr_span))
440
+ spans.sort(key=lambda x: -len(x[0]))
441
+ return spans
442
+
443
def apply_replacements_to_docx_bytes(original_bytes, orig_text, corrected_text):
    """Replace occurrences of orig spans with corrected spans inside docx runs and table cells (best-effort).

    Works on the original document bytes so run-level formatting is kept.
    Limitation: a span that is split across multiple runs will not match,
    so only replacements fully contained in one run are applied.
    Returns the (possibly unmodified) document as bytes.
    """
    if docx is None:
        raise RuntimeError("python-docx not installed")
    from io import BytesIO
    document = docx.Document(BytesIO(original_bytes))
    spans = _build_replacement_spans(orig_text, corrected_text)
    if not spans:
        # Nothing changed — round-trip the document untouched.
        out = BytesIO()
        document.save(out)
        out.seek(0)
        return out.read()
    def replace_in_paragraph_runs(par):
        # Longest original spans come first (see _build_replacement_spans),
        # so shorter spans cannot clobber a longer replacement.
        for orig_span, corr_span in spans:
            for run in par.runs:
                if orig_span in run.text:
                    run.text = run.text.replace(orig_span, corr_span)
    # Body paragraphs...
    for p in document.paragraphs:
        replace_in_paragraph_runs(p)
    # ...and every paragraph inside every table cell.
    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                for p in cell.paragraphs:
                    replace_in_paragraph_runs(p)
    out = io.BytesIO()
    document.save(out)
    out.seek(0)
    return out.read()
471
+
472
def apply_replacements_to_pdf_bytes(original_bytes, orig_text, corrected_text):
    """Best-effort PDF replacement: redact original token bbox and write corrected text in place using PyMuPDF.

    Approach: token-diff orig vs corrected, map orig token indices onto the
    PDF's word boxes by position, white-out the first affected word of each
    changed region, and draw the corrected span into that box.
    NOTE(review): the token->word mapping assumes orig_text tokenizes in the
    same order PyMuPDF reports words; extraction differences will misplace
    replacements — verify on real documents.
    Returns new PDF bytes (or the input bytes when nothing changed).
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF not installed")
    orig_tokens = orig_text.split()
    corr_tokens = corrected_text.split()
    sm = difflib.SequenceMatcher(a=orig_tokens, b=corr_tokens)
    ops = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == "equal":
            continue
        ops.append((tag, i1, i2, j1, j2))
    if not ops:
        # No differences — return the original bytes unchanged.
        return original_bytes
    pdf = fitz.open(stream=original_bytes, filetype="pdf")
    # Flatten all pages' words into one list in reading order.
    global_words = []
    for pno in range(len(pdf)):
        page = pdf[pno]
        words = page.get_text("words")  # x0,y0,x1,y1, word, block_no, line_no, word_no
        # Sort by baseline (y1) then x0 to approximate reading order.
        words_sorted = sorted(words, key=lambda w: (round(w[3],1), round(w[0],1)))
        for w in words_sorted:
            global_words.append((pno, w))
    # Positional mapping: i-th orig token -> i-th extracted word.
    N = len(global_words)
    M = len(orig_tokens)
    map_len = min(N, M)
    token_to_global = {}
    for i in range(map_len):
        token_to_global[i] = global_words[i]
    redactions_per_page = {}
    inserts_per_page = {}
    for op in ops:
        tag, i1, i2, j1, j2 = op
        corr_span = " ".join(corr_tokens[j1:j2])
        # Anchor the whole corrected span on the FIRST mappable original
        # token of the changed region (hence the break below).
        for ti in range(i1, i2):
            if ti in token_to_global:
                pno, wtuple = token_to_global[ti]
                x0, y0, x1, y1 = wtuple[0], wtuple[1], wtuple[2], wtuple[3]
                bbox = fitz.Rect(x0, y0, x1, y1)
                redactions_per_page.setdefault(pno, []).append(bbox)
                inserts_per_page.setdefault(pno, []).append((bbox, corr_span))
                break
    for pno, rects in redactions_per_page.items():
        page = pdf[pno]
        # White-fill redactions remove the old word images...
        for r in rects:
            page.add_redact_annot(r, fill=(1,1,1))
        page.apply_redactions()
        # ...then the corrected text is written into the cleared boxes.
        for bbox, corr_span in inserts_per_page.get(pno, []):
            # Font size scaled to the original word height, floor of 6pt.
            fontsize = max(6, round(bbox.height * 0.8))
            try:
                page.insert_textbox(bbox, corr_span, fontsize=fontsize, fontname="helv", align=0)
            except Exception:
                # insert_textbox fails when the span doesn't fit the bbox;
                # fall back to unclipped insertion at the box origin.
                page.insert_text((bbox.x0, bbox.y0), corr_span, fontsize=fontsize, fontname="helv")
    out = pdf.write()
    pdf.close()
    return out
527
+
528
# -----------------------
# UI (first file's GUI style) with Prev/Next variants and independent tools
# -----------------------
st.set_page_config(page_title="Rephraser", layout="wide")
st.title("Rephraser — Paraphrase · Plagiarism Remover · Grammar & Spelling")
st.markdown("Paste text or upload DOCX/PDF/TXT. Tools are independent and chainable (use output as input manually).")

col_left, col_right = st.columns([2,1])
with col_left:
    # --- Input source: pasted text or an uploaded document ---
    input_mode = st.radio("Input:", ("Paste text", "Upload file (.docx/.pdf/.txt)"))
    uploaded_bytes = None
    uploaded_name = None
    input_text = ""
    if input_mode == "Paste text":
        input_text = st.text_area("Paste your paragraph(s) here:", height=200, value=st.session_state.current_text or "")
        # Clear upload memory so a stale file can't shadow pasted text.
        st.session_state._uploaded_bytes = None
        st.session_state._uploaded_name = None
    else:
        uploaded = st.file_uploader("Upload .docx, .pdf or .txt", type=["docx","pdf","txt"])
        if uploaded is not None:
            # Remember the raw bytes so grammar check can later rewrite the
            # file in its original format.
            uploaded_bytes = uploaded.read()
            uploaded_name = uploaded.name
            st.session_state._uploaded_bytes = uploaded_bytes
            st.session_state._uploaded_name = uploaded_name
            try:
                # Dispatch on extension; anything unknown is treated as text.
                if uploaded.name.lower().endswith(".docx"):
                    input_text = extract_text_from_docx_bytes(uploaded_bytes)
                elif uploaded.name.lower().endswith(".pdf"):
                    input_text = extract_text_from_pdf_bytes(uploaded_bytes)
                else:
                    input_text = extract_text_from_txt_bytes(uploaded_bytes)
                st.success(f"Loaded {uploaded.name} (approx {len(input_text.split())} words)")
            except Exception as e:
                st.error(f"Could not extract text from file: {e}")
    st.markdown("**Tools (choose one)**")
    st.markdown("- **Para-phraser (fast):** Focused on rephrase sentence, regardless of Plagiarism ")
    st.markdown("- **Plagiarism Remover (deep):** Focused on Plagiarism, Convert text to human like ")
    st.markdown("- **Grammar & Spelling:** Spelling And Grammar Check")

with col_right:
    st.header("Actions")
    variants_to_generate = st.slider("Max variants (deep)", 1, 5, 3)
    use_light_only = st.checkbox("Force light-only (no HF models)", value=True)
    # --- Tool 1: fast lexical paraphrase ---
    if st.button("1) Para-phraser (fast)"):
        st.session_state._last_tool = "paraphrase"
        # Fresh input wins; otherwise re-process the current working text.
        source = input_text.strip() or st.session_state.current_text.strip()
        if not source:
            st.warning("Provide text or upload a file first.")
        else:
            # Push current state so Undo can restore it.
            st.session_state.history.append(st.session_state.current_text or source)
            variants = paraphrase_variants_fast(source, n_variants=variants_to_generate)
            if not variants:
                st.error("No paraphrase produced.")
            else:
                st.session_state.versions = variants
                st.session_state.version_index = 0
                st.session_state.current_text = variants[0]
                st.session_state.last_input = source
                # Reset grammar/file artifacts from any previous run.
                st.session_state._last_grammar_issues = None
                st.session_state._last_output_file = None
                st.success("Para-phraser done. Use Prev/Next to browse.")

    # --- Tool 2: deep plagiarism-removal rewrite (optionally HF-backed) ---
    if st.button("2) Plagiarism Remover (deep)"):
        st.session_state._last_tool = "plagiarism"
        source = input_text.strip() or st.session_state.current_text.strip()
        if not source:
            st.warning("Provide text or upload a file first.")
        else:
            st.session_state.history.append(st.session_state.current_text or source)
            st.info("Running plagiarism remover pipeline...")
            try:
                variants = plagiarism_remover_pipeline(source, aggressive=1, light_only=use_light_only)
            except Exception as e:
                # Fall back to the fast paraphraser rather than failing.
                st.error(f"Pipeline failed: {e}")
                variants = paraphrase_variants_fast(source, n_variants=variants_to_generate)
            if not variants:
                st.error("No variants produced.")
            else:
                st.session_state.versions = variants
                st.session_state.version_index = 0
                st.session_state.current_text = variants[0]
                st.session_state.last_input = source
                st.session_state._last_grammar_issues = None
                st.session_state._last_output_file = None
                st.success(f"Produced {len(variants)} variants.")

    # --- Tool 3: grammar & spelling (note: prefers current_text over fresh
    # input, the opposite priority of tools 1 and 2) ---
    if st.button("3) Grammar & Spelling (check)"):
        st.session_state._last_tool = "grammar"
        source = st.session_state.current_text.strip() or input_text.strip()
        if not source:
            st.warning("Provide text or upload a file first.")
        else:
            st.session_state.history.append(st.session_state.current_text or source)
            try:
                corrected, issues = grammar_and_spelling_check(source)
                st.session_state.current_text = corrected
                st.session_state.versions = [corrected]
                st.session_state.version_index = 0
                st.session_state._last_grammar_issues = issues or []
                st.success(f"Grammar check applied ({len(issues)} issues).")

                # File-level output if uploaded: rebuild a corrected file in
                # the same format as the upload (best-effort).
                uploaded_bytes = st.session_state.get("_uploaded_bytes")
                uploaded_name = st.session_state.get("_uploaded_name")
                if uploaded_bytes and uploaded_name:
                    suffix = Path(uploaded_name).suffix.lower()
                    try:
                        if suffix == ".docx" and docx is not None:
                            out_bytes = apply_replacements_to_docx_bytes(uploaded_bytes, source, corrected)
                            st.session_state._last_output_file = out_bytes
                            st.session_state._last_output_name = f"corrected_{uploaded_name}"
                        elif suffix == ".pdf" and fitz is not None:
                            out_bytes = apply_replacements_to_pdf_bytes(uploaded_bytes, source, corrected)
                            st.session_state._last_output_file = out_bytes
                            st.session_state._last_output_name = f"corrected_{uploaded_name}"
                        elif suffix == ".txt":
                            st.session_state._last_output_file = corrected.encode("utf-8")
                            st.session_state._last_output_name = f"corrected_{uploaded_name}"
                        else:
                            # Unknown extension: fall back to a fresh DOCX.
                            st.session_state._last_output_file = make_docx_bytes_from_text(corrected)
                            st.session_state._last_output_name = "corrected_output.docx"
                    except Exception as e:
                        st.warning(f"Could not create corrected file preserving format: {e}")
                        st.session_state._last_output_file = None
                        st.session_state._last_output_name = None

                if issues:
                    st.subheader("Detected issues (sample):")
                    # Cap the listing at 30 issues to keep the page usable.
                    for i, it in enumerate(issues[:30]):
                        st.write(f"- {it.get('message')} → suggestions: {it.get('replacements')}")
            except Exception as e:
                st.error(f"Grammar check failed: {e}")

# Navigation: browse the variant list produced by tools 1/2.
st.markdown("---")
st.subheader("Preview / Versions")
colv1, colv2, colv3 = st.columns([1,1,2])
with colv1:
    if st.button("◀ Previous Version"):
        if st.session_state.versions:
            st.session_state.version_index = max(0, st.session_state.version_index - 1)
            st.session_state.current_text = st.session_state.versions[st.session_state.version_index]
with colv2:
    if st.button("Next Version ▶"):
        if st.session_state.versions:
            st.session_state.version_index = min(len(st.session_state.versions)-1, st.session_state.version_index + 1)
            st.session_state.current_text = st.session_state.versions[st.session_state.version_index]
with colv3:
    st.write(f"Version {st.session_state.version_index+1} of {max(1, len(st.session_state.versions))}")


# Preview: grammar runs get a two-pane issue/corrected view; everything
# else gets a single diff-highlighted pane.
st.markdown("---")
st.subheader("Original (top) — Processed Output (bottom)")
orig_display = st.session_state.last_input or ""
out_display = st.session_state.current_text or (input_text or "")

if st.session_state._last_tool == "grammar" and out_display.strip():
    orig_html = mark_grammar_issues(orig_display, st.session_state._last_grammar_issues or []) if orig_display else html.escape(orig_display)
    out_html = underline_changes_in_output(orig_display or "", out_display)
    st.markdown("<b>Original (issues highlighted)</b>", unsafe_allow_html=True)
    st.markdown(f"<div style='padding:8px;border:1px solid #e6e6e6;background:transparent;white-space:pre-wrap'>{orig_html}</div>", unsafe_allow_html=True)
    st.markdown("<b>Corrected (changes underlined in green)</b>", unsafe_allow_html=True)
    st.markdown(f"<div style='padding:8px;border:1px solid #e6e6e6;background:transparent;white-space:pre-wrap'>{out_html}</div>", unsafe_allow_html=True)
else:
    # generic preview (green underlines for changed parts — new function)
    preview_html = text_to_html_with_highlights(orig_display, out_display) if orig_display else html.escape(out_display)
    st.markdown(
        f"""
        <div style='padding:10px;border:1px solid #eee;background:transparent;white-space:pre-wrap'>
        {preview_html}
        </div>
        """,
        unsafe_allow_html=True
    )

# Editable area — manual touch-ups before export.
st.subheader("Editable result (you can manually edit before saving)")
st.session_state.editable_area = st.text_area("Edit here:", value=st.session_state.current_text or out_display, height=300)

# If corrected file available (uploaded+grammar), offer it for download.
if st.session_state._last_output_file is not None and st.session_state._last_output_name:
    st.markdown("**Download corrected file**")
    st.download_button("Download corrected file", data=st.session_state._last_output_file, file_name=st.session_state._last_output_name)
713
+
714
+
715
+
716
+ # Spelling suggestions & apply edits
717
+
718
+ # --- unchanged imports and code above ---
719
+
720
def spelling_suggestions(word, top_n=5, sentence=None):
    """Return contextual synonyms if NLTK WordNet is available, else fallback to spellchecker.

    When *sentence* is given, the word is POS-tagged in context so synonyms
    match its part of speech. Returns up to *top_n* suggestions (may be empty).
    """
    if not word or not word.strip():
        return []

    def get_wordnet_pos(treebank_tag):
        # Map a Penn Treebank tag prefix to the matching WordNet POS.
        from nltk.corpus import wordnet
        prefix_to_pos = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
        return prefix_to_pos.get(treebank_tag[:1])

    if nltk_available:
        wn_pos = None
        if sentence:
            try:
                # Tag the whole sentence and locate our word to get its POS.
                tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
                for token, tag in tagged:
                    if token.lower() == word.lower():
                        wn_pos = get_wordnet_pos(tag)
                        break
            except Exception:
                pass  # missing taggers/tokenizers: proceed without POS
        syns = wn.synsets(word, pos=wn_pos) if wn_pos else wn.synsets(word)
        candidates = {
            lemma.name().replace('_', ' ')
            for synset in syns
            for lemma in synset.lemmas()
            if lemma.name().replace('_', ' ').lower() != word.lower()
        }
        if candidates:
            return sorted(candidates)[:top_n]

    # Fallback to plain spell-checking candidates.
    if SPELLCHECKER_AVAILABLE:
        return list(spell.candidates(word))[:top_n]

    return []
768
+
769
# --- rest of the UI: word suggestions, export, undo, refresh ---

# Spelling suggestions & apply edits
st.markdown("---")
st.markdown("**Spelling suggestions / replace single word:**")
col_s1, col_s2 = st.columns([2,3])
with col_s1:
    word_for_sugg = st.text_input("Enter token to suggest replacements:", value="")
    if st.button("Get suggestions"):
        if not word_for_sugg.strip():
            st.warning("Type a token to get suggestions.")
        else:
            # Pass the editable text as context for POS-aware suggestions.
            suggs = spelling_suggestions(word_for_sugg, sentence=st.session_state.editable_area)
            if suggs:
                sel = st.selectbox("Choose replacement:", options=["(keep)"] + suggs)
                if sel and sel != "(keep)":
                    # Plain substring replace — affects every occurrence.
                    st.session_state.editable_area = st.session_state.editable_area.replace(word_for_sugg, sel)
                    st.success(f"Replaced '{word_for_sugg}' with '{sel}'")
            else:
                st.info("No suggestions found.")
with col_s2:
    if st.button("Apply editable area to current text"):
        st.session_state.current_text = st.session_state.editable_area
        st.success("Applied edits to current text.")

# Save / Download / Copy for plain text
st.markdown("---")
col_d1, col_d2, col_d3 = st.columns(3)
with col_d1:
    if st.button("Save as DOCX"):
        try:
            b = make_docx_bytes_from_text(st.session_state.editable_area or "")
            st.download_button("Download DOCX", data=b, file_name="rephrased.docx", mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document")
        except Exception as e:
            st.error(f"Could not create DOCX: {e}")
with col_d2:
    if st.button("Save as PDF"):
        try:
            b = make_pdf_bytes_from_text(st.session_state.editable_area or "")
            st.download_button("Download PDF", data=b, file_name="rephrased.pdf", mime="application/pdf")
        except Exception as e:
            st.error(f"Could not create PDF: {e}")
with col_d3:
    if st.button("Copy to clipboard"):
        if PYPERCLIP:
            pyperclip.copy(st.session_state.editable_area or "")
            st.success("Copied to clipboard")
        else:
            # No clipboard support on this host: write a temp file instead.
            path = os.path.join(tempfile.gettempdir(), "rephrased_output.txt")
            with open(path, "w", encoding="utf-8") as f:
                f.write(st.session_state.editable_area or "")
            st.info(f"Saved to {path} (pyperclip not available)")

# Undo: pop the previous working text pushed by the tool buttons.
if st.button("Undo"):
    if st.session_state.history:
        st.session_state.current_text = st.session_state.history.pop()
        st.session_state.versions = [st.session_state.current_text]
        st.session_state.version_index = 0
        st.success("Undone last step")
    else:
        st.info("Nothing to undo")

st.markdown("---")
st.caption("Notes: Paraphraser & Plagiarism Remover code preserved. Grammar prefers LanguageTool (requires Java) else falls back to TextBlob. DOCX/PDF replacements are best-effort to preserve layout.")

# --- Refresh button at the bottom: reruns the script (state is kept) ---
if st.button("🔄 Refresh"):
    st.rerun()
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ python-docx
3
+ PyMuPDF
4
+ nltk
5
+ spacy
6
+ textblob
7
+ pyspellchecker
8
+ pyperclip
9
+
10
+ # Optional / recommended for best results (heavy)
11
+ transformers
12
+ torch
13
+ sentencepiece
14
+ language-tool-python # requires Java (install JDK/JRE)