Testing app
Browse filesOld_paraphraser_app.py_single_long_code_streamlit_test
- app.py +841 -0
- requirements.txt +14 -0
app.py
ADDED
|
@@ -0,0 +1,841 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py
|
| 2 |
+
"""
|
| 3 |
+
Merged Rephraser app
|
| 4 |
+
- GUI from original (first) file
|
| 5 |
+
- Models/logic from later big file (kept unchanged)
|
| 6 |
+
- Grammar highlight (red for issues; green underline for corrected words)
|
| 7 |
+
- File upload/download for .docx/.pdf/.txt with best-effort format preservation
|
| 8 |
+
- Tools independent (no automatic chaining)
|
| 9 |
+
- Prev/Next browsing for multi-version outputs
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import streamlit as st
|
| 13 |
+
import io, os, random, re, difflib, html, tempfile
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
# home "🏠 Home"
|
| 17 |
+
|
| 18 |
+
# --- Home button at the top ---
|
| 19 |
+
if st.button("🏠 Home"):
|
| 20 |
+
st.rerun()
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Optional heavy libs (imported eagerly here; each name falls back to None/False when the package is missing)
|
| 24 |
+
try:
|
| 25 |
+
import docx
|
| 26 |
+
except Exception:
|
| 27 |
+
docx = None
|
| 28 |
+
|
| 29 |
+
try:
|
| 30 |
+
import fitz # PyMuPDF
|
| 31 |
+
except Exception:
|
| 32 |
+
fitz = None
|
| 33 |
+
|
| 34 |
+
try:
|
| 35 |
+
import language_tool_python
|
| 36 |
+
except Exception:
|
| 37 |
+
language_tool_python = None
|
| 38 |
+
|
| 39 |
+
try:
|
| 40 |
+
from textblob import TextBlob
|
| 41 |
+
except Exception:
|
| 42 |
+
TextBlob = None
|
| 43 |
+
|
| 44 |
+
# NLTK / WordNet
|
| 45 |
+
try:
|
| 46 |
+
import nltk
|
| 47 |
+
from nltk.corpus import wordnet as wn
|
| 48 |
+
nltk_available = True
|
| 49 |
+
except Exception:
|
| 50 |
+
nltk_available = False
|
| 51 |
+
|
| 52 |
+
# spaCy
|
| 53 |
+
try:
|
| 54 |
+
import spacy
|
| 55 |
+
nlp = spacy.load("en_core_web_sm")
|
| 56 |
+
SPACY_AVAILABLE = True
|
| 57 |
+
except Exception:
|
| 58 |
+
nlp = None
|
| 59 |
+
SPACY_AVAILABLE = False
|
| 60 |
+
|
| 61 |
+
# transformers check
|
| 62 |
+
try:
|
| 63 |
+
import transformers
|
| 64 |
+
TRANSFORMERS_AVAILABLE = True
|
| 65 |
+
except Exception:
|
| 66 |
+
TRANSFORMERS_AVAILABLE = False
|
| 67 |
+
|
| 68 |
+
# SpellChecker
|
| 69 |
+
try:
|
| 70 |
+
from spellchecker import SpellChecker
|
| 71 |
+
SPELLCHECKER_AVAILABLE = True
|
| 72 |
+
spell = SpellChecker()
|
| 73 |
+
except Exception:
|
| 74 |
+
SPELLCHECKER_AVAILABLE = False
|
| 75 |
+
|
| 76 |
+
# pyperclip optional
|
| 77 |
+
try:
|
| 78 |
+
import pyperclip
|
| 79 |
+
PYPERCLIP = True
|
| 80 |
+
except Exception:
|
| 81 |
+
PYPERCLIP = False
|
| 82 |
+
|
| 83 |
+
# -----------------------
# Session state init (preserve old behavior)
# -----------------------
# Seed every key the app reads later, without overwriting a value that
# survived a previous Streamlit rerun. The tuple is rebuilt on each run, so
# the list defaults are fresh objects every time.
for _state_key, _state_default in (
    ("versions", []),
    ("version_index", 0),
    ("last_input", ""),
    ("current_text", ""),
    ("history", []),
    # bookkeeping for file uploads & grammar
    ("_uploaded_bytes", None),
    ("_uploaded_name", None),
    ("_last_grammar_issues", None),
    ("_last_output_file", None),
    ("_last_output_name", None),
    ("_last_tool", None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
|
| 109 |
+
|
| 110 |
+
# -----------------------
|
| 111 |
+
# Helpers: highlights & diffs
|
| 112 |
+
# -----------------------
|
| 113 |
+
def mark_grammar_issues(text, issues):
    """Return ``text`` as escaped HTML with every issue span underlined in red.

    Args:
        text: The raw text that was checked.
        issues: List of dicts carrying ``offset``, ``length`` and ``message``
            keys (missing keys default to 0 / ""). May be empty or None.

    Returns:
        An HTML string. Every character of ``text`` appears exactly once:
        a span that overlaps an earlier one is trimmed to its non-overlapping
        tail, and spans that end up empty are dropped.
    """
    if not issues:
        return html.escape(text)

    spans = sorted(
        (
            it.get("offset", 0),
            it.get("offset", 0) + it.get("length", 0),
            it.get("message", ""),
        )
        for it in issues
    )

    pieces = []
    cursor = 0
    for start, end, msg in spans:
        # Clamp to the unread part of the text so overlapping or
        # out-of-range spans can never duplicate or skip characters.
        start = max(start, cursor)
        end = min(end, len(text))
        if end <= start:
            continue
        pieces.append(html.escape(text[cursor:start]))
        pieces.append(
            f'<span title="{html.escape(msg)}" style="border-bottom:2px solid #c0392b;">'
            f'{html.escape(text[start:end])}</span>'
        )
        cursor = end
    pieces.append(html.escape(text[cursor:]))
    return "".join(pieces)
|
| 135 |
+
|
| 136 |
+
def underline_changes_in_output(orig, corrected):
    """Render ``corrected`` as HTML, underlining changed/inserted tokens in green.

    Both texts are split on whitespace and compared token by token with
    ``difflib.SequenceMatcher``. Tokens only present in ``orig`` (deletions)
    produce no output.

    Args:
        orig: Text before correction.
        corrected: Text after correction.

    Returns:
        An HTML string. Every fragment taken from ``corrected`` is
        HTML-escaped, including the unchanged ones, so markup characters in
        the text cannot leak into the page.
    """
    old_tokens = orig.split()
    new_tokens = corrected.split()
    matcher = difflib.SequenceMatcher(a=old_tokens, b=new_tokens)

    parts = []
    for tag, _i1, _i2, j1, j2 in matcher.get_opcodes():
        if tag == "delete":
            continue  # nothing from the corrected text to show
        fragment = html.escape(" ".join(new_tokens[j1:j2]))
        if tag == "equal":
            parts.append(fragment)
        elif tag in ("replace", "insert"):
            parts.append(
                f'<span style="text-decoration: underline; text-decoration-color: #27ae60;">{fragment}</span>'
            )
    return " ".join(parts) if parts else html.escape(corrected)
|
| 153 |
+
|
| 154 |
+
## Green line
|
| 155 |
+
import html
|
| 156 |
+
import difflib
|
| 157 |
+
|
| 158 |
+
def text_to_html_with_highlights(orig, new):
    """Build HTML for ``new`` with words not present in ``orig`` underlined in green.

    The texts are split on whitespace and compared with ``difflib.ndiff``.
    Added words are wrapped in an underlined span, unchanged words are emitted
    as escaped text, and every other ndiff entry (removed words, hint lines)
    is ignored.
    """
    rendered = []
    for entry in difflib.ndiff(orig.split(), new.split()):
        token = html.escape(entry[2:])
        if entry.startswith("+ "):
            rendered.append(
                f"<span style='color:black;text-decoration:underline;text-decoration-color:green'>{token}</span>"
            )
        elif entry.startswith(" "):
            rendered.append(token)
    return " ".join(rendered)
|
| 178 |
+
|
| 179 |
+
# -----------------------
|
| 180 |
+
# Paraphraser functions (kept from your big code)
|
| 181 |
+
# -----------------------
|
| 182 |
+
def paraphrase_variants_fast(text, n_variants=3):
    """Produce up to ``n_variants`` quick, rule-based rewrites of ``text``.

    The text is split into sentences after ``.``, ``!`` or ``?``. Each
    sentence is then randomly transformed:

    * with spaCy available: swap the first two noun chunks, shuffle the
      comma-separated parts, or fall back to ``_synonym_replace``;
    * without spaCy: swap two adjacent words or use ``_synonym_replace``.

    Occasionally the sentence order of a variant is shuffled as well.

    Args:
        text: Source text; surrounding whitespace is ignored.
        n_variants: Number of variants to attempt.

    Returns:
        A list of distinct, non-blank variants (possibly fewer than
        ``n_variants``); an empty list for blank input.

    NOTE(review): the source indentation was lost; the nesting of the
    non-spaCy branch below is the most plausible reading.
    """
    text = text.strip()
    if not text:
        return []

    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text)]
    sents = [s for s in sents if s]

    variants = []
    for v in range(n_variants):
        outs = []
        for sent in sents:
            if SPACY_AVAILABLE:
                doc = nlp(sent)
                chunks = list(doc.noun_chunks)
                # small structural transforms
                if random.random() < 0.3 and len(chunks) >= 2:
                    first, second = chunks[0].text, chunks[1].text
                    # Swap through a placeholder so the first replacement is
                    # not undone by the second.
                    outs.append(
                        sent.replace(first, "<<<A>>>")
                        .replace(second, first)
                        .replace("<<<A>>>", second)
                    )
                    continue
                if ',' in sent and random.random() < 0.4:
                    parts = [p.strip() for p in sent.split(',')]
                    random.shuffle(parts)
                    outs.append(", ".join(parts))
                    continue
                outs.append(_synonym_replace(sent, prob=0.15 + 0.05 * v))
            elif random.random() < 0.2:
                words = sent.split()
                if len(words) > 3:
                    i = random.randint(0, len(words) - 3)
                    words[i], words[i + 1] = words[i + 1], words[i]
                outs.append(" ".join(words))
            else:
                outs.append(_synonym_replace(sent, prob=0.12 + 0.04 * v))

        # Shuffle this variant's own sentences. The shared ``sents`` list must
        # stay untouched: shuffling it had no effect on the current variant
        # and silently reordered the input of every later one.
        if random.random() < 0.3 and len(outs) > 1:
            random.shuffle(outs)
        variants.append(" ".join(outs))

    uniq = []
    for candidate in variants:
        if candidate.strip() and candidate not in uniq:
            uniq.append(candidate)
    return uniq[:n_variants]
|
| 232 |
+
|
| 233 |
+
def _synonym_replace(sentence, prob=0.12, max_replacements=2):
    """Randomly replace a few words of ``sentence`` with WordNet synonyms.

    When NLTK is unavailable the function instead swaps random word pairs
    (each position with probability ``prob``) and re-joins on single spaces.

    Otherwise the sentence is tokenised into word / non-word runs so that the
    original spacing and punctuation survive. Each word is considered with
    probability ``prob``; the first single-word lemma that differs from the
    word is used, keeping a leading capital. At most ``max_replacements``
    words are replaced.
    """
    if not nltk_available:
        shuffled = sentence.split()
        for pos in range(len(shuffled)):
            if random.random() < prob:
                other = random.randrange(len(shuffled))
                shuffled[pos], shuffled[other] = shuffled[other], shuffled[pos]
        return " ".join(shuffled)

    pieces = re.findall(r"\w+|\W+", sentence)
    swaps_done = 0
    for pos, tok in enumerate(pieces):
        if not re.match(r'\w+', tok):
            continue  # whitespace / punctuation run
        lower = tok.lower()
        if random.random() > prob:
            continue
        synsets = wn.synsets(lower)
        if not synsets:
            continue
        lemma_names = (
            lemma.name().replace('_', ' ')
            for synset in synsets
            for lemma in synset.lemmas()
        )
        cand = next(
            (name for name in lemma_names if name.lower() != lower and ' ' not in name),
            None,
        )
        if not cand:
            continue
        pieces[pos] = cand.capitalize() if tok[0].isupper() else cand
        swaps_done += 1
        if swaps_done >= max_replacements:
            break
    return "".join(pieces)
|
| 270 |
+
|
| 271 |
+
def simple_mix_versions(versions_list):
    """Blend several text versions into one by sampling and shuffling sentences.

    From every non-blank version up to three sentences are taken (a random
    sample when the version has more than three). The collected sentences are
    shuffled and joined with single spaces. Returns "" when nothing usable was
    supplied.
    """
    if not versions_list:
        return ""

    pool = []
    for version in versions_list:
        stripped = version.strip()
        if not stripped:
            continue
        sentences = re.split(r'(?<=[.!?])\s+', stripped)
        quota = max(1, min(3, len(sentences)))
        if len(sentences) > quota:
            pool.extend(random.sample(sentences, quota))
        else:
            pool.extend(sentences)

    random.shuffle(pool)
    return " ".join(pool)
|
| 285 |
+
|
| 286 |
+
# -----------------------
|
| 287 |
+
# Plagiarism remover (kept)
|
| 288 |
+
# -----------------------
|
| 289 |
+
@st.cache_resource(show_spinner=False)
def load_small_model(model_name="t5-small"):
    """Load a seq2seq model and wrap it in a text2text-generation pipeline.

    Decorated with ``st.cache_resource``.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        ``(tokenizer, model, pipeline)``; the pipeline is created with
        ``device=-1``.

    Raises:
        ImportError: If ``transformers`` could not be imported at start-up.
    """
    if not TRANSFORMERS_AVAILABLE:
        raise ImportError("transformers not installed")

    import transformers as hf

    tokenizer = hf.AutoTokenizer.from_pretrained(model_name)
    seq2seq = hf.AutoModelForSeq2SeqLM.from_pretrained(model_name)
    generator = hf.pipeline(
        "text2text-generation",
        model=seq2seq,
        tokenizer=tokenizer,
        device=-1,
    )
    return tokenizer, seq2seq, generator
|
| 298 |
+
|
| 299 |
+
def hf_paraphrase_with_pipe(pipe, text, max_len=256):
    """Run ``text`` through a generation pipeline and return the generated string.

    Args:
        pipe: Callable pipeline; called with ``text`` plus sampling keyword
            arguments.
        text: Prompt to rewrite.
        max_len: Value passed as ``max_length``.

    Returns:
        The first result's ``generated_text`` (or ``summary_text``), or the
        stringified result when it has an unexpected shape. If anything
        raises, ``text`` is returned unchanged (best effort).
    """
    sampling = dict(
        max_length=max_len,
        do_sample=True,
        top_p=0.95,
        temperature=0.8,
        num_return_sequences=1,
    )
    try:
        result = pipe(text, **sampling)
        if not (isinstance(result, list) and result):
            return str(result)
        top = result[0]
        return top.get("generated_text") or top.get("summary_text") or str(top)
    except Exception:
        return text
|
| 307 |
+
|
| 308 |
+
def plagiarism_remover_pipeline(text, aggressive=1, light_only=False):
    """Generate several alternative rewrites of ``text``.

    The result always starts with a light rule-based paraphrase. Unless
    ``light_only`` is set (or transformers is unavailable), a "t5-small"
    and a "google/pegasus-xsum" rewrite are attempted as well; a failure of
    either model is ignored. Finally a sentence-level mix of everything
    collected so far is added.

    Args:
        text: Source text.
        aggressive: Accepted for interface compatibility; not used.
        light_only: Skip the transformer models when True.

    Returns:
        Up to five distinct, non-blank versions in generation order.
    """
    # Call the randomised paraphraser once: calling it a second time to fetch
    # the value could return a different (even empty) result than the one
    # that was just tested.
    light = paraphrase_variants_fast(text, n_variants=1)
    versions = [light[0] if light else text]

    if TRANSFORMERS_AVAILABLE and not light_only:
        model_prompts = (
            ("t5-small", "paraphrase: " + text),
            ("google/pegasus-xsum", text),
        )
        for model_name, prompt in model_prompts:
            try:
                _, _, pipe = load_small_model(model_name)
                versions.append(hf_paraphrase_with_pipe(pipe, prompt))
            except Exception:
                # Deliberately best effort: a model that cannot be loaded
                # just contributes no version.
                continue

    versions.append(simple_mix_versions(versions))

    uniq = []
    for v in versions:
        if v and v.strip() and v not in uniq:
            uniq.append(v)
        if len(uniq) >= 5:
            break
    return uniq
|
| 334 |
+
|
| 335 |
+
# -----------------------
|
| 336 |
+
# Grammar & Spelling (kept)
|
| 337 |
+
# -----------------------
|
| 338 |
+
def grammar_and_spelling_check(text):
    """Correct ``text`` and report the issues that were found.

    LanguageTool ('en-US') is tried first; if it is missing or fails,
    TextBlob's spelling correction is used; if that is unavailable too the
    text is returned unchanged.

    Args:
        text: Text to check.

    Returns:
        ``(corrected_text, issues)``. ``issues`` is a list of dicts with the
        keys ``message``, ``replacements``, ``offset``, ``length`` and
        ``context``; it is empty for the TextBlob and pass-through paths.
    """
    if language_tool_python is not None:
        tool = None
        try:
            tool = language_tool_python.LanguageTool('en-US')
            matches = tool.check(text)
            corrected = language_tool_python.utils.correct(text, matches)
            issues = []
            for m in matches:
                # NOTE(review): newer language_tool_python releases appear to
                # name this attribute ``error_length`` - confirm. Without the
                # fallback an AttributeError would be swallowed below and the
                # whole LanguageTool result silently discarded.
                length = getattr(m, "errorLength", None)
                if length is None:
                    length = getattr(m, "error_length", 0)
                issues.append({
                    "message": m.message,
                    "replacements": m.replacements,
                    "offset": m.offset,
                    "length": length,
                    "context": text[max(0, m.offset - 30): m.offset + 30],
                })
            return corrected, issues
        except Exception:
            # Best effort: fall through to the next backend.
            pass
        finally:
            # A new tool is created on every call, so release it again
            # instead of leaving it open.
            if tool is not None:
                try:
                    tool.close()
                except Exception:
                    pass

    if TextBlob is not None:
        try:
            return str(TextBlob(text).correct()), []
        except Exception:
            pass

    return text, []
|
| 364 |
+
|
| 365 |
+
def spelling_suggestions(word, top_n=5):
    """Return up to ``top_n`` spelling candidates for ``word``.

    Returns an empty list when the spell checker is unavailable or offers no
    candidates. Candidates are sorted alphabetically so repeated calls give
    the same answer.
    """
    if not SPELLCHECKER_AVAILABLE:
        return []
    # ``candidates`` may hand back None instead of an empty set when it finds
    # nothing (NOTE(review): confirm for the installed pyspellchecker);
    # ``list(None)`` would raise TypeError.
    suggestions = spell.candidates(word) or ()
    return sorted(suggestions)[:top_n]
|
| 370 |
+
|
| 371 |
+
# -----------------------
|
| 372 |
+
# File extract & write helpers (kept & added best-effort replace)
|
| 373 |
+
# -----------------------
|
| 374 |
+
def extract_text_from_docx_bytes(b):
    """Return the text of a .docx file given as bytes.

    The text of every body paragraph is joined with blank lines.

    Raises:
        RuntimeError: If python-docx is not installed.
    """
    if docx is None:
        raise RuntimeError("python-docx not installed")
    document = docx.Document(io.BytesIO(b))
    return "\n\n".join(paragraph.text for paragraph in document.paragraphs)
|
| 381 |
+
|
| 382 |
+
def extract_text_from_pdf_bytes(b):
    """Return the text of a PDF given as bytes, one blank line after each page.

    Raises:
        RuntimeError: If PyMuPDF is not installed.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF not installed")
    doc = fitz.open(stream=b, filetype="pdf")
    try:
        return "".join(page.get_text() + "\n\n" for page in doc)
    finally:
        # Close the document even when text extraction fails.
        doc.close()
|
| 390 |
+
|
| 391 |
+
def extract_text_from_txt_bytes(b):
    """Decode the bytes of a text file into a string.

    UTF-8 is tried first (a leading byte-order mark is stripped), then
    Latin-1. If ``b`` cannot be decoded at all, for example because it is not
    a bytes object, ``str(b)`` is returned.
    """
    # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM,
    # which would otherwise end up as "\ufeff" at the start of the text.
    for encoding in ("utf-8-sig", "latin-1"):
        try:
            return b.decode(encoding)
        except (UnicodeError, AttributeError):
            continue
    return str(b)
|
| 399 |
+
|
| 400 |
+
def make_docx_bytes_from_text(text):
    """Build a new .docx file from ``text`` and return it as bytes.

    Every chunk separated by a blank line becomes one paragraph.

    Raises:
        RuntimeError: If python-docx is not installed.
    """
    if docx is None:
        raise RuntimeError("python-docx not installed")
    document = docx.Document()
    for chunk in text.split("\n\n"):
        document.add_paragraph(chunk)
    buffer = io.BytesIO()
    document.save(buffer)
    return buffer.getvalue()
|
| 410 |
+
|
| 411 |
+
def make_pdf_bytes_from_text(text):
    """Build a simple PDF from ``text`` and return it as bytes.

    Each line of ``text`` is written at x=72, starting at y=72 and advancing
    14 units per line; a new page is started once y exceeds 720. Lines are
    not wrapped.

    Raises:
        RuntimeError: If PyMuPDF is not installed.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF not installed")
    doc = fitz.open()
    try:
        page = doc.new_page()
        y = 72
        for line in text.split("\n"):
            if y > 720:
                page = doc.new_page()
                y = 72
            page.insert_text((72, y), line)
            y += 14
        return doc.write()
    finally:
        # Close the document even when inserting text fails.
        doc.close()
|
| 427 |
+
|
| 428 |
+
def _build_replacement_spans(orig_text, corrected_text):
|
| 429 |
+
a = orig_text.split()
|
| 430 |
+
b = corrected_text.split()
|
| 431 |
+
sm = difflib.SequenceMatcher(a=a, b=b)
|
| 432 |
+
spans = []
|
| 433 |
+
for tag, i1, i2, j1, j2 in sm.get_opcodes():
|
| 434 |
+
if tag == "equal":
|
| 435 |
+
continue
|
| 436 |
+
orig_span = " ".join(a[i1:i2]).strip()
|
| 437 |
+
corr_span = " ".join(b[j1:j2]).strip()
|
| 438 |
+
if orig_span:
|
| 439 |
+
spans.append((orig_span, corr_span))
|
| 440 |
+
spans.sort(key=lambda x: -len(x[0]))
|
| 441 |
+
return spans
|
| 442 |
+
|
| 443 |
+
def apply_replacements_to_docx_bytes(original_bytes, orig_text, corrected_text):
    """Apply text corrections inside a .docx while keeping its formatting (best effort).

    The differing phrases between ``orig_text`` and ``corrected_text`` are
    looked up in every run of every body paragraph and table-cell paragraph
    and replaced in place, so run-level formatting is preserved. A phrase
    that is split across several runs is not found.

    Args:
        original_bytes: The uploaded .docx file.
        orig_text: Text as extracted from the document.
        corrected_text: Corrected version of ``orig_text``.

    Returns:
        The bytes of the updated document (re-saved unchanged when the texts
        do not differ).

    Raises:
        RuntimeError: If python-docx is not installed.
    """
    if docx is None:
        raise RuntimeError("python-docx not installed")

    document = docx.Document(io.BytesIO(original_bytes))
    spans = _build_replacement_spans(orig_text, corrected_text)

    if spans:
        # The phrases are whitespace-delimited tokens, so only match them when
        # they are bounded by whitespace or the run edges. A plain substring
        # replace would also rewrite matches inside longer words (correcting
        # "is" would mangle "this").
        patterns = [
            (re.compile(r'(?<!\S)' + re.escape(orig_span) + r'(?!\S)'), corr_span)
            for orig_span, corr_span in spans
        ]

        def replace_in_paragraph_runs(par):
            for pattern, corr_span in patterns:
                for run in par.runs:
                    if pattern.search(run.text):
                        # A function replacement keeps backslashes in the
                        # corrected text literal.
                        run.text = pattern.sub(lambda _m, repl=corr_span: repl, run.text)

        for p in document.paragraphs:
            replace_in_paragraph_runs(p)
        for table in document.tables:
            for row in table.rows:
                for cell in row.cells:
                    for p in cell.paragraphs:
                        replace_in_paragraph_runs(p)

    out = io.BytesIO()
    document.save(out)
    return out.getvalue()
|
| 471 |
+
|
| 472 |
+
def apply_replacements_to_pdf_bytes(original_bytes, orig_text, corrected_text):
    """Apply text corrections to a PDF in place using PyMuPDF (best effort).

    The whitespace tokens of ``orig_text`` and ``corrected_text`` are diffed.
    The i-th original token is assumed to be the i-th word of the PDF when
    each page's words are ordered by bottom edge and then left edge. For
    every changed region the box of its first mapped word is redacted
    (white fill) and the corrected phrase is written there. Pure insertions
    have no original word to anchor to and are skipped.

    NOTE(review): the source indentation was lost; redacting only the first
    word of a changed region is the most plausible reading of the original
    ``break`` - confirm.

    Args:
        original_bytes: The uploaded PDF.
        orig_text: Text as extracted from the PDF.
        corrected_text: Corrected version of ``orig_text``.

    Returns:
        The bytes of the edited PDF, or ``original_bytes`` when the texts do
        not differ.

    Raises:
        RuntimeError: If PyMuPDF is not installed.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF not installed")

    orig_tokens = orig_text.split()
    corr_tokens = corrected_text.split()
    sm = difflib.SequenceMatcher(a=orig_tokens, b=corr_tokens)
    ops = [op for op in sm.get_opcodes() if op[0] != "equal"]
    if not ops:
        return original_bytes

    pdf = fitz.open(stream=original_bytes, filetype="pdf")
    try:
        # (page number, word tuple) for every word of the document.
        global_words = []
        for pno in range(len(pdf)):
            # x0, y0, x1, y1, word, block_no, line_no, word_no
            words = pdf[pno].get_text("words")
            words_sorted = sorted(words, key=lambda w: (round(w[3], 1), round(w[0], 1)))
            global_words.extend((pno, w) for w in words_sorted)

        edits_per_page = {}
        for _tag, i1, i2, j1, j2 in ops:
            # Anchor on the first original token of the region, provided the
            # PDF has a word at that position.
            if i1 >= i2 or i1 >= len(global_words):
                continue
            pno, w = global_words[i1]
            bbox = fitz.Rect(w[0], w[1], w[2], w[3])
            corr_span = " ".join(corr_tokens[j1:j2])
            edits_per_page.setdefault(pno, []).append((bbox, corr_span))

        for pno, edits in edits_per_page.items():
            page = pdf[pno]
            for bbox, _ in edits:
                page.add_redact_annot(bbox, fill=(1, 1, 1))
            page.apply_redactions()
            for bbox, corr_span in edits:
                if not corr_span:
                    continue  # pure deletion: nothing to write back
                fontsize = max(6, round(bbox.height * 0.8))
                try:
                    leftover = page.insert_textbox(
                        bbox, corr_span, fontsize=fontsize, fontname="helv", align=0
                    )
                except Exception:
                    leftover = -1
                if leftover < 0:
                    # insert_textbox reports text that does not fit with a
                    # negative return value and writes nothing, which would
                    # leave a blank redaction. Write the text unboxed instead;
                    # insert_text expects the baseline, so anchor near the
                    # bottom of the box rather than its top.
                    baseline = bbox.y1 - bbox.height * 0.2
                    page.insert_text(
                        (bbox.x0, baseline), corr_span, fontsize=fontsize, fontname="helv"
                    )
        return pdf.write()
    finally:
        # Close the document even when an edit fails.
        pdf.close()
|
| 527 |
+
|
| 528 |
+
# -----------------------
|
| 529 |
+
# UI (first file's GUI style) with Prev/Next variants and independent tools
|
| 530 |
+
# -----------------------
|
| 531 |
+
st.set_page_config(page_title="Rephraser", layout="wide")
|
| 532 |
+
st.title("Rephraser — Paraphrase · Plagiarism Remover · Grammar & Spelling")
|
| 533 |
+
st.markdown("Paste text or upload DOCX/PDF/TXT. Tools are independent and chainable (use output as input manually).")
|
| 534 |
+
|
| 535 |
+
col_left, col_right = st.columns([2,1])
|
| 536 |
+
with col_left:
|
| 537 |
+
input_mode = st.radio("Input:", ("Paste text", "Upload file (.docx/.pdf/.txt)"))
|
| 538 |
+
uploaded_bytes = None
|
| 539 |
+
uploaded_name = None
|
| 540 |
+
input_text = ""
|
| 541 |
+
if input_mode == "Paste text":
|
| 542 |
+
input_text = st.text_area("Paste your paragraph(s) here:", height=200, value=st.session_state.current_text or "")
|
| 543 |
+
# clear upload memory
|
| 544 |
+
st.session_state._uploaded_bytes = None
|
| 545 |
+
st.session_state._uploaded_name = None
|
| 546 |
+
else:
|
| 547 |
+
uploaded = st.file_uploader("Upload .docx, .pdf or .txt", type=["docx","pdf","txt"])
|
| 548 |
+
if uploaded is not None:
|
| 549 |
+
uploaded_bytes = uploaded.read()
|
| 550 |
+
uploaded_name = uploaded.name
|
| 551 |
+
st.session_state._uploaded_bytes = uploaded_bytes
|
| 552 |
+
st.session_state._uploaded_name = uploaded_name
|
| 553 |
+
try:
|
| 554 |
+
if uploaded.name.lower().endswith(".docx"):
|
| 555 |
+
input_text = extract_text_from_docx_bytes(uploaded_bytes)
|
| 556 |
+
elif uploaded.name.lower().endswith(".pdf"):
|
| 557 |
+
input_text = extract_text_from_pdf_bytes(uploaded_bytes)
|
| 558 |
+
else:
|
| 559 |
+
input_text = extract_text_from_txt_bytes(uploaded_bytes)
|
| 560 |
+
st.success(f"Loaded {uploaded.name} (approx {len(input_text.split())} words)")
|
| 561 |
+
except Exception as e:
|
| 562 |
+
st.error(f"Could not extract text from file: {e}")
|
| 563 |
+
st.markdown("**Tools (choose one)**")
|
| 564 |
+
st.markdown("- **Para-phraser (fast):** Focused on rephrase sentence, regardless of Plagiarism ")
|
| 565 |
+
st.markdown("- **Plagiarism Remover (deep):** Focused on Plagiarism, Convert text to human like ")
|
| 566 |
+
st.markdown("- **Grammar & Spelling:** Spelling And Grammar Check")
|
| 567 |
+
|
| 568 |
+
with col_right:
|
| 569 |
+
st.header("Actions")
|
| 570 |
+
variants_to_generate = st.slider("Max variants (deep)", 1, 5, 3)
|
| 571 |
+
use_light_only = st.checkbox("Force light-only (no HF models)", value=True)
|
| 572 |
+
if st.button("1) Para-phraser (fast)"):
|
| 573 |
+
st.session_state._last_tool = "paraphrase"
|
| 574 |
+
source = input_text.strip() or st.session_state.current_text.strip()
|
| 575 |
+
if not source:
|
| 576 |
+
st.warning("Provide text or upload a file first.")
|
| 577 |
+
else:
|
| 578 |
+
st.session_state.history.append(st.session_state.current_text or source)
|
| 579 |
+
variants = paraphrase_variants_fast(source, n_variants=variants_to_generate)
|
| 580 |
+
if not variants:
|
| 581 |
+
st.error("No paraphrase produced.")
|
| 582 |
+
else:
|
| 583 |
+
st.session_state.versions = variants
|
| 584 |
+
st.session_state.version_index = 0
|
| 585 |
+
st.session_state.current_text = variants[0]
|
| 586 |
+
st.session_state.last_input = source
|
| 587 |
+
st.session_state._last_grammar_issues = None
|
| 588 |
+
st.session_state._last_output_file = None
|
| 589 |
+
st.success("Para-phraser done. Use Prev/Next to browse.")
|
| 590 |
+
|
| 591 |
+
if st.button("2) Plagiarism Remover (deep)"):
|
| 592 |
+
st.session_state._last_tool = "plagiarism"
|
| 593 |
+
source = input_text.strip() or st.session_state.current_text.strip()
|
| 594 |
+
if not source:
|
| 595 |
+
st.warning("Provide text or upload a file first.")
|
| 596 |
+
else:
|
| 597 |
+
st.session_state.history.append(st.session_state.current_text or source)
|
| 598 |
+
st.info("Running plagiarism remover pipeline...")
|
| 599 |
+
try:
|
| 600 |
+
variants = plagiarism_remover_pipeline(source, aggressive=1, light_only=use_light_only)
|
| 601 |
+
except Exception as e:
|
| 602 |
+
st.error(f"Pipeline failed: {e}")
|
| 603 |
+
variants = paraphrase_variants_fast(source, n_variants=variants_to_generate)
|
| 604 |
+
if not variants:
|
| 605 |
+
st.error("No variants produced.")
|
| 606 |
+
else:
|
| 607 |
+
st.session_state.versions = variants
|
| 608 |
+
st.session_state.version_index = 0
|
| 609 |
+
st.session_state.current_text = variants[0]
|
| 610 |
+
st.session_state.last_input = source
|
| 611 |
+
st.session_state._last_grammar_issues = None
|
| 612 |
+
st.session_state._last_output_file = None
|
| 613 |
+
st.success(f"Produced {len(variants)} variants.")
|
| 614 |
+
|
| 615 |
+
# Grammar & spelling pass: corrects the working text, records the detected
# issues for the preview, and, when a file was uploaded, builds a corrected
# copy of that file for the download button.
if st.button("3) Grammar & Spelling (check)"):
    st.session_state._last_tool = "grammar"
    # Prefer the current working text; fall back to the raw input box.
    # Either may be None/empty, so coerce before stripping.
    source = (st.session_state.current_text or "").strip() or (input_text or "").strip()
    if not source:
        st.warning("Provide text or upload a file first.")
    else:
        # Snapshot the pre-check text so the "Undo" button can restore it.
        st.session_state.history.append(st.session_state.current_text or source)
        try:
            corrected, issues = grammar_and_spelling_check(source)
            # Normalise a None/empty result to a list once, so len() and the
            # slicing below cannot raise after the text was already replaced.
            issues = issues or []
            st.session_state.current_text = corrected
            st.session_state.versions = [corrected]
            st.session_state.version_index = 0
            st.session_state._last_grammar_issues = issues
            st.success(f"Grammar check applied ({len(issues)} issues).")

            # File-level output if uploaded
            uploaded_bytes = st.session_state.get("_uploaded_bytes")
            uploaded_name = st.session_state.get("_uploaded_name")
            if uploaded_bytes and uploaded_name:
                suffix = Path(uploaded_name).suffix.lower()
                try:
                    out_name = f"corrected_{uploaded_name}"
                    if suffix == ".docx" and docx is not None:
                        out_bytes = apply_replacements_to_docx_bytes(uploaded_bytes, source, corrected)
                    elif suffix == ".pdf" and fitz is not None:
                        out_bytes = apply_replacements_to_pdf_bytes(uploaded_bytes, source, corrected)
                    elif suffix == ".txt":
                        out_bytes = corrected.encode("utf-8")
                    else:
                        # Unknown type, or the format library is missing:
                        # emit a fresh DOCX built from the corrected text.
                        out_bytes = make_docx_bytes_from_text(corrected)
                        out_name = "corrected_output.docx"
                    st.session_state._last_output_file = out_bytes
                    st.session_state._last_output_name = out_name
                except Exception as e:
                    # Best-effort: the corrected text is still available even
                    # when the format-preserving file could not be produced.
                    st.warning(f"Could not create corrected file preserving format: {e}")
                    st.session_state._last_output_file = None
                    st.session_state._last_output_name = None

            if issues:
                st.subheader("Detected issues (sample):")
                # Cap the list so a noisy document does not flood the page.
                for it in issues[:30]:
                    st.write(f"- {it.get('message')} → suggestions: {it.get('replacements')}")
        except Exception as e:
            st.error(f"Grammar check failed: {e}")
|
| 661 |
+
|
| 662 |
+
# Navigation: step backwards/forwards through the stored text versions and
# show which one is currently selected.
st.markdown("---")
st.subheader("Preview / Versions")
colv1, colv2, colv3 = st.columns([1,1,2])

with colv1:
    # Clamp at the first version; do nothing when no versions exist.
    if st.button("◀ Previous Version") and st.session_state.versions:
        _prev_idx = max(0, st.session_state.version_index - 1)
        st.session_state.version_index = _prev_idx
        st.session_state.current_text = st.session_state.versions[_prev_idx]

with colv2:
    # Clamp at the last version; do nothing when no versions exist.
    if st.button("Next Version ▶") and st.session_state.versions:
        _last_idx = len(st.session_state.versions) - 1
        _next_idx = min(_last_idx, st.session_state.version_index + 1)
        st.session_state.version_index = _next_idx
        st.session_state.current_text = st.session_state.versions[_next_idx]

with colv3:
    # The total is shown as at least 1 even when the version list is empty.
    _shown_total = max(1, len(st.session_state.versions))
    st.write(f"Version {st.session_state.version_index+1} of {_shown_total}")
|
| 678 |
+
|
| 679 |
+
|
| 680 |
+
# Preview: render the original input and the processed output as HTML.
# After a grammar run the original is shown with issues marked and the output
# with changes underlined; otherwise a single highlighted preview is shown.
st.markdown("---")
st.subheader("Original (top) — Processed Output (bottom)")
orig_display = st.session_state.last_input or ""
out_display = st.session_state.current_text or (input_text or "")

_grammar_preview = st.session_state._last_tool == "grammar" and bool(out_display.strip())

if not _grammar_preview:
    # generic preview (green underlines for changed parts)
    if orig_display:
        preview_html = text_to_html_with_highlights(orig_display, out_display)
    else:
        # No original to diff against: show the escaped output verbatim.
        preview_html = html.escape(out_display)
    st.markdown(
        f"""
<div style='padding:10px;border:1px solid #eee;background:transparent;white-space:pre-wrap'>
{preview_html}
</div>
""",
        unsafe_allow_html=True
    )
else:
    if orig_display:
        orig_html = mark_grammar_issues(orig_display, st.session_state._last_grammar_issues or [])
    else:
        orig_html = html.escape(orig_display)
    out_html = underline_changes_in_output(orig_display or "", out_display)

    # Two identical panels: a bold heading followed by a bordered box.
    _panels = (
        ("<b>Original (issues highlighted)</b>", orig_html),
        ("<b>Corrected (changes underlined in green)</b>", out_html),
    )
    for _heading, _panel_html in _panels:
        st.markdown(_heading, unsafe_allow_html=True)
        st.markdown(f"<div style='padding:8px;border:1px solid #e6e6e6;background:transparent;white-space:pre-wrap'>{_panel_html}</div>", unsafe_allow_html=True)
|
| 704 |
+
|
| 705 |
+
# Editable area: a text box seeded with the current text (or the preview text
# when there is none); its content is stored back in session state each run.
st.subheader("Editable result (you can manually edit before saving)")
_seed_text = st.session_state.current_text or out_display
_edited_text = st.text_area("Edit here:", value=_seed_text, height=300)
st.session_state.editable_area = _edited_text

# If corrected file available (uploaded+grammar), download
_corrected_blob = st.session_state._last_output_file
_corrected_name = st.session_state._last_output_name
if _corrected_blob is not None and _corrected_name:
    st.markdown("**Download corrected file**")
    st.download_button(
        "Download corrected file",
        data=_corrected_blob,
        file_name=_corrected_name,
    )
|
| 713 |
+
|
| 714 |
+
|
| 715 |
+
|
| 716 |
+
# Spelling suggestions & apply edits
|
| 717 |
+
|
| 718 |
+
# --- unchanged imports and code above ---
|
| 719 |
+
|
| 720 |
+
def spelling_suggestions(word, top_n=5, sentence=None):
    """Suggest up to ``top_n`` replacement candidates for ``word``.

    WordNet synonyms are preferred when NLTK is available. If ``sentence`` is
    given it is POS-tagged, and the tag of the first token equal to ``word``
    (case-insensitive) restricts the synset lookup to that part of speech.
    When WordNet yields nothing, or its data cannot be loaded, the spell
    checker's candidates are used instead.

    Args:
        word: Token to look up. ``None`` or a blank string yields ``[]``.
            Surrounding whitespace is ignored.
        top_n: Maximum number of suggestions to return.
        sentence: Optional surrounding text, used only to infer the part of
            speech of ``word``.

    Returns:
        A list of at most ``top_n`` suggestions, sorted alphabetically so that
        repeated calls give the same answer. The word itself is never
        included. Empty when nothing is found or no backend is available.
    """
    if not word or not word.strip():
        return []
    word = word.strip()
    target = word.lower()

    def get_wordnet_pos(treebank_tag):
        """Map a Penn Treebank tag to a WordNet POS constant, or None."""
        pos_by_prefix = {"J": wn.ADJ, "V": wn.VERB, "N": wn.NOUN, "R": wn.ADV}
        return pos_by_prefix.get(treebank_tag[:1])

    # Prefer WordNet synonyms with POS from context
    if nltk_available:
        wn_pos = None
        if sentence:
            try:
                for tok, tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
                    if tok.lower() == target:
                        wn_pos = get_wordnet_pos(tag)
                        break
            except Exception:
                # POS is only a refinement; any tagging failure (for example
                # missing tokenizer/tagger data) degrades to an untagged lookup.
                wn_pos = None

        try:
            syns = wn.synsets(word, pos=wn_pos) if wn_pos else wn.synsets(word)
        except LookupError:
            # WordNet corpus not installed: fall through to the spell checker
            # instead of propagating the error to the caller.
            syns = []

        suggestions = set()
        for syn in syns:
            for lemma in syn.lemmas():
                name = lemma.name().replace('_', ' ')
                if name.lower() != target:
                    suggestions.add(name)
        if suggestions:
            return sorted(suggestions)[:top_n]

    # Fallback to spellchecker
    if SPELLCHECKER_AVAILABLE:
        # candidates() may return None when it has nothing to offer.
        candidates = spell.candidates(word) or ()
        # Drop the word itself (as the WordNet branch does) and sort, because
        # set iteration order is not stable between runs.
        return sorted(c for c in candidates if c.lower() != target)[:top_n]

    return []
|
| 768 |
+
|
| 769 |
+
# --- rest of your unchanged functions and UI code ---
|
| 770 |
+
|
| 771 |
+
# Spelling suggestions & apply edits
#
# st.button() is True only during the rerun caused by its own click. A
# selectbox nested under it disappears as soon as the user picks an option,
# so the replacement would never run. The suggestions are therefore parked in
# session state and the picker is rendered on every run while they are pending.
st.markdown("---")
st.markdown("**Spelling suggestions / replace single word:**")
col_s1, col_s2 = st.columns([2,3])
with col_s1:
    word_for_sugg = st.text_input("Enter token to suggest replacements:", value="")
    _token = word_for_sugg.strip()
    if st.button("Get suggestions"):
        if not _token:
            st.session_state["_pending_suggestions"] = None
            st.warning("Type a token to get suggestions.")
        else:
            suggs = spelling_suggestions(_token, sentence=st.session_state.editable_area)
            if suggs:
                st.session_state["_pending_suggestions"] = {"token": _token, "options": list(suggs)}
            else:
                st.session_state["_pending_suggestions"] = None
                st.info("No suggestions found.")

    _pending = st.session_state.get("_pending_suggestions")
    # Only offer the picker while the token box still holds the word the
    # suggestions were computed for.
    if _pending and _pending["token"] == _token:
        sel = st.selectbox("Choose replacement:", options=["(keep)"] + _pending["options"])
        # An explicit confirm button avoids re-applying a remembered selection
        # when suggestions are requested again for the same token.
        if st.button("Replace") and sel and sel != "(keep)":
            _updated = st.session_state.editable_area.replace(_token, sel)
            st.session_state.editable_area = _updated
            # The edit box is re-seeded from current_text on every run, so the
            # change must be stored there to survive the next rerun. The box
            # itself shows the new text from the next rerun onwards.
            st.session_state.current_text = _updated
            st.session_state["_pending_suggestions"] = None
            st.success(f"Replaced '{_token}' with '{sel}'")
with col_s2:
    if st.button("Apply editable area to current text"):
        st.session_state.current_text = st.session_state.editable_area
        st.success("Applied edits to current text.")
|
| 793 |
+
|
| 794 |
+
# --- rest of your file remains exactly the same ---
|
| 795 |
+
|
| 796 |
+
|
| 797 |
+
# Save / Download / Copy for plain text
# Each action works on the content of the editable area (empty string if unset).
st.markdown("---")
col_d1, col_d2, col_d3 = st.columns(3)
with col_d1:
    if st.button("Save as DOCX"):
        try:
            b = make_docx_bytes_from_text(st.session_state.editable_area or "")
            st.download_button("Download DOCX", data=b, file_name="rephrased.docx", mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document")
        except Exception as e:
            st.error(f"Could not create DOCX: {e}")
with col_d2:
    if st.button("Save as PDF"):
        try:
            b = make_pdf_bytes_from_text(st.session_state.editable_area or "")
            st.download_button("Download PDF", data=b, file_name="rephrased.pdf", mime="application/pdf")
        except Exception as e:
            st.error(f"Could not create PDF: {e}")
with col_d3:
    if st.button("Copy to clipboard"):
        _clip_text = st.session_state.editable_area or ""
        _copy_error = None
        _copied = False
        if PYPERCLIP:
            try:
                pyperclip.copy(_clip_text)
                _copied = True
            except pyperclip.PyperclipException as e:
                # pyperclip is installed but the host has no clipboard
                # mechanism (typical on a headless server): use the file
                # fallback instead of crashing the run.
                _copy_error = e
        if _copied:
            st.success("Copied to clipboard")
        else:
            path = os.path.join(tempfile.gettempdir(), "rephrased_output.txt")
            with open(path, "w", encoding="utf-8") as f:
                f.write(_clip_text)
            if _copy_error is None:
                st.info(f"Saved to {path} (pyperclip not available)")
            else:
                st.info(f"Saved to {path} (clipboard unavailable: {_copy_error})")
|
| 824 |
+
|
| 825 |
+
# Undo: restore the most recent snapshot from the history stack and collapse
# the version list to that single text.
_undo_clicked = st.button("Undo")
if _undo_clicked and not st.session_state.history:
    st.info("Nothing to undo")
elif _undo_clicked:
    _restored = st.session_state.history.pop()
    st.session_state.current_text = _restored
    st.session_state.versions = [_restored]
    st.session_state.version_index = 0
    st.success("Undone last step")
|
| 834 |
+
|
| 835 |
+
# Footer: separator, usage notes, and a manual refresh control.
st.markdown("---")
_footer_notes = "Notes: Paraphraser & Plagiarism Remover code preserved. Grammar prefers LanguageTool (requires Java) else falls back to TextBlob. DOCX/PDF replacements are best-effort to preserve layout."
st.caption(_footer_notes)

# Refresh button at the bottom: forces an immediate rerun of the script.
_refresh_clicked = st.button("🔄 Refresh")
if _refresh_clicked:
    st.rerun()
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
python-docx
|
| 3 |
+
PyMuPDF
|
| 4 |
+
nltk
|
| 5 |
+
spacy
|
| 6 |
+
textblob
|
| 7 |
+
pyspellchecker
|
| 8 |
+
pyperclip
|
| 9 |
+
|
| 10 |
+
# Optional / recommended for best results (heavy)
|
| 11 |
+
transformers
|
| 12 |
+
torch
|
| 13 |
+
sentencepiece
|
| 14 |
+
language-tool-python # requires Java (install JDK/JRE)
|