VictorM-Coder committed on
Commit
c63aa57
·
verified ·
1 Parent(s): dc81ef5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -47
app.py CHANGED
@@ -1,68 +1,114 @@
1
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
2
- import torch, gradio as gr
3
- import re
4
 
5
- # Load Model (lighter + faster)
6
- model_name = "humarin/chatgpt_paraphraser_on_T5_base"
7
- tokenizer = AutoTokenizer.from_pretrained(model_name)
8
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 
 
 
 
 
 
 
 
 
 
9
 
10
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
11
- model = model.to(device)
12
- model.eval()
13
 
14
- # --- Helpers ---
15
- def split_sentences(paragraph):
16
- # Split into sentences based on punctuation + space
17
- sentences = re.split(r'(?<=[.!?])\s+', paragraph.strip())
 
18
  return [s for s in sentences if s]
19
 
20
- # --- Main Paraphrasing Function ---
21
- def paraphrase_t5(text, temperature=0.9, top_p=0.92):
22
- if not text.strip():
23
- return "⚠️ Please enter some text"
 
 
 
 
 
 
 
 
24
 
25
- paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
26
- paraphrased_paragraphs = []
 
27
 
28
- for p in paragraphs:
29
- sentences = split_sentences(p)
30
- paraphrased_sentences = []
 
 
 
 
 
 
 
 
31
 
32
- for s in sentences:
33
- prompt = f"Paraphrase this in a more natural, human style while keeping meaning:\n{s}"
 
 
 
 
34
 
35
- inputs = tokenizer([prompt], return_tensors="pt", truncation=True, padding=True).to(device)
 
36
 
37
- outputs = model.generate(
38
- **inputs,
39
- max_new_tokens=512, # allow long outputs
40
- do_sample=True,
41
- top_p=float(top_p),
42
- temperature=float(temperature),
43
- num_return_sequences=1,
44
- no_repeat_ngram_size=3
45
- )
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- paraphrased = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
48
- paraphrased_sentences.append(paraphrased)
49
 
50
- # Rejoin sentences into a paragraph
51
- paraphrased_paragraphs.append(" ".join(paraphrased_sentences))
52
 
53
- return "\n\n".join(paraphrased_paragraphs)
54
 
55
- # --- Gradio UI ---
 
 
56
  iface = gr.Interface(
57
- fn=paraphrase_t5,
58
  inputs=[
59
- gr.Textbox(lines=8, placeholder="Paste full text here..."),
60
- gr.Slider(0.5, 1.5, step=0.1, value=0.9, label="Temperature"),
61
- gr.Slider(0.6, 1.0, step=0.02, value=0.92, label="Top-p")
 
62
  ],
63
- outputs=gr.Textbox(label="Paraphrased & Humanized Text"),
64
- title="T5-Base Paraphraser (Humanizer)",
65
- description="Fast, high-quality paraphrasing on T5-base, tuned for human-like rewrites."
66
  )
67
 
68
  iface.launch()
 
1
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
2
+ import torch, gradio as gr, re
 
3
 
4
# ------------------------
# Load Models
# ------------------------
# Resolve the target device once; both stages derive from this single check
# (previously torch.cuda.is_available() was queried twice and `device` was
# defined only after the pipeline that needed the same decision).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Stage 1: Paraphraser (sentence-level rewriting with Parrot-T5).
paraphrase_model_name = "prithivida/parrot_paraphraser_on_T5"
paraphrase_tokenizer = AutoTokenizer.from_pretrained(paraphrase_model_name)
paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained(paraphrase_model_name)
paraphrase_model = paraphrase_model.to(device)
paraphrase_model.eval()  # inference only: disables dropout

# Stage 2: Expander (Flan-T5-Large) via the high-level pipeline API.
# pipeline() takes a device *index* (-1 = CPU, 0 = first GPU), so translate
# the torch.device resolved above instead of re-probing CUDA.
expander = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",
    device=0 if device.type == "cuda" else -1,
)
22
 
23
+ # ------------------------
24
+ # Helpers
25
+ # ------------------------
26
def split_sentences(text):
    """Break *text* into sentences, splitting after '.', '!' or '?' plus whitespace."""
    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
    # filter(None, ...) drops the empty strings re.split can produce.
    return list(filter(None, pieces))
29
 
30
def clean_sentence(sent):
    """Collapse whitespace runs and make sure the sentence ends with punctuation."""
    cleaned = re.sub(r'\s+', ' ', sent).strip()
    # Append a period only when no terminal punctuation is already present.
    return cleaned if cleaned.endswith(('.', '!', '?')) else cleaned + "."
35
+
36
# ------------------------
# Stage 1: Paraphrase
# ------------------------
def paraphrase_fn(text, num_return_sequences=1, temperature=1.2, top_p=0.92):
    """Paraphrase *text* sentence by sentence with the Parrot T5 model.

    Each sentence is rewritten independently; the first unique candidate per
    sentence is kept and the results are rejoined into a single string.

    Args:
        text: Input passage (one or more sentences).
        num_return_sequences: Candidates sampled per sentence. Only the first
            unique candidate is used; extras just widen the sampling pool.
        temperature: Sampling temperature forwarded to ``generate()``.
        top_p: Nucleus-sampling cutoff forwarded to ``generate()``.

    Returns:
        The paraphrased passage as one space-joined string ("" for no input).
    """
    sentences = split_sentences(text)
    all_outputs = []

    for sent in sentences:
        # Parrot expects the "paraphrase: " task prefix. The tokenizer appends
        # the EOS token itself, so the literal " </s>" previously embedded in
        # the raw text was redundant (and relies on deprecated in-text
        # special-token parsing).
        input_text = "paraphrase: " + sent
        inputs = paraphrase_tokenizer(
            [input_text], return_tensors="pt", truncation=True, padding=True
        ).to(device)

        # inference_mode avoids building autograd state during generation.
        with torch.inference_mode():
            outputs = paraphrase_model.generate(
                **inputs,
                max_new_tokens=128,
                num_return_sequences=int(num_return_sequences),
                do_sample=True,
                top_p=float(top_p),
                temperature=float(temperature),
                min_length=20,
                length_penalty=1.2,
            )
        decoded = paraphrase_tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Deduplicate candidates while preserving order, then keep the first.
        seen, unique = set(), []
        for cand in decoded:
            cand = clean_sentence(cand)
            if cand not in seen:
                unique.append(cand)
                seen.add(cand)

        if unique:
            all_outputs.append(unique[0])

    return " ".join(all_outputs).strip()
70
+
71
# ------------------------
# Stage 2: Expansion
# ------------------------
def expand_text(text, temperature=0.9, top_p=0.95):
    """Expand *text* into a longer, more natural passage with Flan-T5.

    Args:
        text: The paraphrased passage to enrich.
        temperature: Sampling temperature for the expander.
        top_p: Nucleus-sampling cutoff for the expander.

    Returns:
        The expanded text produced by the generation pipeline.
    """
    result = expander(
        f"Expand and make this text more detailed, natural, and human-like:\n{text}",
        max_new_tokens=250,
        # BUG FIX: without do_sample=True the pipeline decodes greedily and
        # silently ignores temperature/top_p, so the UI sliders had no effect.
        do_sample=True,
        temperature=float(temperature),
        top_p=float(top_p),
    )
    return result[0]['generated_text']
82
+
83
# ------------------------
# Final Pipeline
# ------------------------
def humanize_pipeline(text, variants=1, temperature=1.2, top_p=0.92):
    """Run the full two-stage rewrite: paraphrase, then expand and smooth."""
    # Guard clause: refuse blank input before touching either model.
    if not text.strip():
        return "⚠️ Please enter some text"

    # Stage 1: sentence-level paraphrase.
    paraphrased = paraphrase_fn(
        text,
        num_return_sequences=variants,
        temperature=temperature,
        top_p=top_p,
    )

    # Stage 2: expand and smooth the paraphrase into the final output.
    return expand_text(paraphrased, temperature=temperature, top_p=top_p)
97
 
98
# ------------------------
# Gradio Interface
# ------------------------
iface = gr.Interface(
    fn=humanize_pipeline,
    inputs=[
        gr.Textbox(lines=8, placeholder="Paste text here..."),
        gr.Slider(1, 3, step=1, value=1, label="Variants"),
        gr.Slider(0.5, 2.0, step=0.1, value=1.2, label="Temperature"),
        gr.Slider(0.6, 1.0, step=0.01, value=0.92, label="Top-p"),
    ],
    outputs=gr.Textbox(label="Final Humanized Text"),
    title="📝 Writenix Humanizer v2",
    description="Two-stage pipeline: Paraphrase + Expand. Produces longer, more natural, human-like rewrites that are harder to detect.",
)

# Launch only when executed as a script, so the module stays importable
# (e.g. for tests or reuse) without starting a web server as a side effect.
if __name__ == "__main__":
    iface.launch()