changcheng967 committed on
Commit
cea2c03
·
verified ·
1 Parent(s): 624b1df

Better Accuracy

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +252 -79
src/streamlit_app.py CHANGED
@@ -1,18 +1,32 @@
1
  import streamlit as st
2
  import time
3
  import logging
4
- import torch # Missing import added here
5
- from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
 
 
 
6
 
7
- st.set_page_config(page_title="AI Humanizer", layout="wide")
8
- st.title("AI Humanizer")
9
- st.subheader("Detect AI text and convert to human-like writing")
10
 
 
 
 
 
 
11
  DETECTION_THRESHOLD = 0.65
12
- MAX_LENGTH = 64 # Reduced for CPU efficiency
 
 
13
  MODELS = {
14
- "detection": "Hello-SimpleAI/chatgpt-detector-roberta",
15
- "humanization": "humarin/chatgpt_paraphraser_on_T5_base"
 
 
 
 
16
  }
17
 
18
  if "logs" not in st.session_state:
@@ -23,78 +37,187 @@ def add_log(message):
23
  timestamp = time.strftime("%H:%M:%S")
24
  log_entry = f"[{timestamp}] {message}"
25
  st.session_state.logs.append(log_entry)
26
- logging.info(log_entry)
27
 
28
  def load_models():
29
  if not st.session_state.models_loaded:
30
- add_log("Loading detection model...")
31
- detection_tokenizer = AutoTokenizer.from_pretrained(MODELS["detection"])
32
- detection_model = AutoModelForSequenceClassification.from_pretrained(MODELS["detection"])
 
33
 
 
 
 
 
 
 
 
 
34
  add_log("Loading humanization model...")
35
- humanizer = pipeline(
36
- "text2text-generation",
37
- model=MODELS["humanization"],
38
- max_length=MAX_LENGTH,
39
- device=-1 # Force CPU usage
40
- )
 
 
 
 
41
 
42
  add_log("All models loaded successfully")
43
  st.session_state.models_loaded = True
44
- return detection_tokenizer, detection_model, humanizer
45
- return st.session_state.detection_tokenizer, st.session_state.detection_model, st.session_state.humanizer
46
-
47
- if "detection_tokenizer" not in st.session_state:
48
- with st.spinner("Loading AI models. This may take 2-3 minutes..."):
49
- detection_tokenizer, detection_model, humanizer = load_models()
50
- st.session_state.detection_tokenizer = detection_tokenizer
51
- st.session_state.detection_model = detection_model
52
- st.session_state.humanizer = humanizer
53
- else:
54
- detection_tokenizer = st.session_state.detection_tokenizer
55
- detection_model = st.session_state.detection_model
56
- humanizer = st.session_state.humanizer
57
-
58
- def detect_ai_probability(text):
59
- add_log(f"Detecting AI probability")
60
- inputs = detection_tokenizer(
61
- text,
62
- return_tensors="pt",
63
- truncation=True,
64
- max_length=MAX_LENGTH,
65
- padding=True
66
  )
 
 
 
 
 
67
 
68
- with torch.no_grad():
69
- outputs = detection_model(**inputs)
 
 
 
 
70
 
71
- probs = torch.softmax(outputs.logits, dim=1)
72
- ai_prob = probs[0][1].item()
73
- add_log(f"AI probability: {ai_prob:.4f}")
74
- return ai_prob
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- def humanize_text(text):
77
- add_log("Humanizing text...")
78
- result = humanizer(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  f"paraphrase: {text}",
80
- num_beams=1, # Reduced to 1 for CPU speed
81
- num_return_sequences=1,
82
- temperature=1.1,
83
- repetition_penalty=1.5,
84
- max_new_tokens=MAX_LENGTH
85
  )
86
- humanized = result[0]['generated_text']
87
- add_log("Humanization complete")
88
- return humanized
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  def process_text(text):
91
- add_log("Starting text processing")
92
- ai_prob = detect_ai_probability(text)
 
 
 
 
 
 
 
93
 
94
- if ai_prob > DETECTION_THRESHOLD:
 
 
 
 
 
 
 
95
  add_log("AI probability exceeds threshold - humanizing")
96
- humanized = humanize_text(text)
97
- modified = True
98
  else:
99
  add_log("AI probability below threshold - no changes")
100
  humanized = text
@@ -103,43 +226,93 @@ def process_text(text):
103
  add_log("Processing complete")
104
  return ai_prob, humanized, modified
105
 
106
- input_text = st.text_area("Input Text", placeholder="Paste AI-generated content here...", height=150)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
- if st.button("Humanize Text"):
109
  if not input_text.strip():
110
  st.warning("Please enter some text")
111
  else:
 
 
 
112
  with st.expander("Processing Logs", expanded=True):
113
  log_placeholder = st.empty()
114
 
115
  ai_prob, humanized, modified = process_text(input_text)
116
 
117
- log_text = "\n".join(st.session_state.logs[-10:])
118
  log_placeholder.code(log_text, language="log")
119
 
120
  st.divider()
121
 
 
122
  col1, col2 = st.columns(2)
123
  with col1:
 
 
 
 
 
 
 
 
124
  st.subheader("Original Text")
125
  st.write(input_text)
126
- st.metric("AI Probability", f"{ai_prob*100:.1f}%")
127
 
128
  with col2:
129
- st.subheader("Humanized Result")
 
130
  st.write(humanized)
131
- st.metric("Status", "Humanized" if modified else "Original")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
- if modified:
134
- st.success("Text successfully humanized")
135
- else:
136
- st.info("No changes needed - text already human-like")
137
-
138
- if st.sidebar.button("Clear Logs"):
139
- st.session_state.logs = []
140
- st.rerun()
141
 
 
142
  st.sidebar.divider()
143
- st.sidebar.caption("Models:")
144
- st.sidebar.code(f"Detector: {MODELS['detection']}")
145
- st.sidebar.code(f"Humanizer: {MODELS['humanization']}")
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import time
3
  import logging
4
+ import torch
5
+ import re
6
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
7
+ from sentence_transformers import SentenceTransformer, util
8
+ import numpy as np
9
 
10
+ # Configure logging
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
 
14
+ st.set_page_config(page_title="AI Humanizer Pro", layout="wide")
15
+ st.title("AI Humanizer Pro")
16
+ st.subheader("Advanced AI detection and humanization")
17
+
18
+ # Enhanced configuration
19
  DETECTION_THRESHOLD = 0.65
20
+ MAX_LENGTH = 128
21
+ ENSEMBLE_WEIGHTS = [0.6, 0.4] # Weighting for model ensemble
22
+
23
  MODELS = {
24
+ "detection": [
25
+ "Hello-SimpleAI/chatgpt-detector-roberta", # Specialized in ChatGPT detection
26
+ "microsoft/deberta-v3-base" # Powerful general classifier
27
+ ],
28
+ "humanization": "humarin/chatgpt_paraphraser_on_T5_base",
29
+ "similarity": "all-MiniLM-L6-v2" # For semantic similarity check
30
  }
31
 
32
  if "logs" not in st.session_state:
 
37
  timestamp = time.strftime("%H:%M:%S")
38
  log_entry = f"[{timestamp}] {message}"
39
  st.session_state.logs.append(log_entry)
40
+ logger.info(log_entry)
41
 
42
def load_models():
    """Load (or fetch cached) detection, humanization, and similarity models.

    On the first call this downloads every model and returns fresh instances;
    the caller is expected to stash them in st.session_state. On later calls
    the cached instances are returned.

    Returns:
        tuple: (detection_tokenizers, detection_models, humanizer, similarity_model)
            where humanizer is a {"tokenizer": ..., "model": ...} dict.
    """
    # Reuse the cache only when the caller actually persisted the models.
    # Checking the models_loaded flag alone raised AttributeError when loading
    # succeeded but the session state was never populated (partial failure).
    if st.session_state.models_loaded and "detection_tokenizers" in st.session_state:
        return (
            st.session_state.detection_tokenizers,
            st.session_state.detection_models,
            st.session_state.humanizer,
            st.session_state.similarity_model
        )

    # Detection models (one tokenizer/model pair per configured detector).
    add_log("Loading detection models...")
    detection_tokenizers = []
    detection_models = []

    for model_name in MODELS["detection"]:
        add_log(f"Loading {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        detection_tokenizers.append(tokenizer)
        detection_models.append(model)

    # Humanization model — tokenizer and seq2seq model are kept together so
    # callers can drive generate() directly instead of a pipeline.
    add_log("Loading humanization model...")
    humanizer_tokenizer = AutoTokenizer.from_pretrained(MODELS["humanization"])
    humanizer_model = AutoModelForSeq2SeqLM.from_pretrained(MODELS["humanization"])
    humanizer = {
        "tokenizer": humanizer_tokenizer,
        "model": humanizer_model
    }

    # Sentence-embedding model used for meaning-preservation checks.
    add_log("Loading semantic similarity model...")
    similarity_model = SentenceTransformer(MODELS["similarity"])

    add_log("All models loaded successfully")
    st.session_state.models_loaded = True
    return detection_tokenizers, detection_models, humanizer, similarity_model
79
+
80
# One-time model bootstrap with a visible progress indicator; reruns skip
# straight to the cached instances in session state.
if not st.session_state.get("models_initialized", False):
    load_progress = st.progress(0)
    load_status = st.empty()

    load_status.text("Initializing models (this may take 2-3 minutes)...")
    load_progress.progress(10)

    detection_tokenizers, detection_models, humanizer, similarity_model = load_models()

    load_progress.progress(60)

    # Persist everything so subsequent reruns do not reload from disk/network.
    st.session_state.detection_tokenizers = detection_tokenizers
    st.session_state.detection_models = detection_models
    st.session_state.humanizer = humanizer
    st.session_state.similarity_model = similarity_model
    st.session_state.models_initialized = True

    load_progress.progress(100)
    time.sleep(0.5)
    load_progress.empty()
    load_status.empty()

# Bind the cached models to the module-level names the helpers below use.
detection_tokenizers = st.session_state.detection_tokenizers
detection_models = st.session_state.detection_models
humanizer = st.session_state.humanizer
similarity_model = st.session_state.similarity_model
109
 
110
def preprocess_text(text):
    """Clean and normalize text for better detection.

    Characters outside the allowed set (word characters, whitespace, and
    common punctuation including apostrophes and hyphens) are replaced with
    a space — not deleted — so that removing a separator such as an em dash
    or a bracket never fuses two words together ("test—ok" -> "test ok",
    not "testok"). Whitespace runs are then collapsed to single spaces.

    Args:
        text: raw input string.

    Returns:
        The cleaned, stripped string.
    """
    # Replace (don't delete) disallowed characters; deletion fused words.
    text = re.sub(r"[^\w\s.,;:!?'-]", " ", text)
    # Collapse whitespace after substitution so replacement spaces merge too.
    text = re.sub(r"\s+", " ", text)
    return text.strip()
115
+
116
def detect_ai_probability_ensemble(text):
    """Estimate the probability that `text` is AI-generated via an ensemble.

    Each configured detector scores the (preprocessed) text; the per-model
    probabilities are combined with ENSEMBLE_WEIGHTS. The weights are
    renormalized over the models actually scored, so the result stays a
    valid probability even if the weight list and model list fall out of
    sync (zip() would otherwise silently skew the score).

    Args:
        text: input string to score.

    Returns:
        float: ensemble AI probability in [0, 1].
    """
    text = preprocess_text(text)
    add_log("Running ensemble AI detection")
    probabilities = []

    for i, (tokenizer, model) in enumerate(zip(detection_tokenizers, detection_models)):
        add_log(f"Processing with model {i+1}")
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
            padding=True
        )

        with torch.no_grad():
            outputs = model(**inputs)

        probs = torch.softmax(outputs.logits, dim=1)
        # NOTE(review): assumes label index 1 means "AI-generated" for every
        # detector in the ensemble — confirm each model's label mapping.
        ai_prob = probs[0][1].item()
        probabilities.append(ai_prob)
        add_log(f"Model {i+1} AI probability: {ai_prob:.4f}")

    # Weighted ensemble, renormalized over the weights actually consumed.
    # Identical to the plain weighted sum when the weights sum to 1 and
    # cover every model (the configured case).
    weighted = sum(w * p for w, p in zip(ENSEMBLE_WEIGHTS, probabilities))
    weight_total = sum(w for w, _ in zip(ENSEMBLE_WEIGHTS, probabilities))
    ensemble_prob = weighted / weight_total if weight_total else 0.0
    add_log(f"Ensemble AI probability: {ensemble_prob:.4f}")
    return ensemble_prob
144
+
145
def calculate_semantic_similarity(original, humanized):
    """Return the cosine similarity between the embeddings of two texts."""
    original_vec, humanized_vec = similarity_model.encode([original, humanized])
    return util.cos_sim(original_vec, humanized_vec).item()
150
+
151
def enhance_humanization(text, original):
    """Paraphrase `text` and return the candidate that best preserves meaning.

    Generates several beam-search candidates, scores each against `original`
    with the sentence-similarity model, and returns the closest one. Falls
    back to the original text when no candidate preserves enough meaning.

    Args:
        text: text to paraphrase (fed to the T5 paraphraser).
        original: reference text used for the meaning-preservation check.

    Returns:
        tuple[str, bool]: (chosen text, True if it is a paraphrase,
        False if the original was kept).
    """
    add_log("Starting enhanced humanization")

    inputs = humanizer["tokenizer"](
        f"paraphrase: {text}",
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH,
        padding=True
    )

    with torch.no_grad():
        # Beam search here is deterministic. The previous version also passed
        # temperature=1.4, but temperature only applies when do_sample=True,
        # so it was a silent no-op (transformers warns about it) — removed.
        outputs = humanizer["model"].generate(
            **inputs,
            max_length=MAX_LENGTH,
            num_beams=5,
            num_return_sequences=3,
            repetition_penalty=2.5,
            early_stopping=True
        )

    candidates = [
        humanizer["tokenizer"].decode(output, skip_special_tokens=True)
        for output in outputs
    ]

    # Pick the candidate whose meaning is closest to the original.
    best_candidate = None
    best_similarity = 0.0

    for candidate in candidates:
        similarity = calculate_semantic_similarity(original, candidate)
        if similarity > best_similarity:
            best_similarity = similarity
            best_candidate = candidate

    add_log(f"Selected humanized text with similarity: {best_similarity:.4f}")

    # Quality gate: reject paraphrases that drift too far from the original
    # meaning. The None check also covers the edge case where every candidate
    # scored <= 0 and no best candidate was ever assigned.
    if best_candidate is None or best_similarity < 0.7:
        add_log("Low similarity detected, using original text")
        return original, False

    return best_candidate, True
 
200
  def process_text(text):
201
+ add_log("Starting advanced text processing")
202
+ original_text = text # Preserve original for comparison
203
+
204
+ # Text analysis
205
+ word_count = len(text.split())
206
+ add_log(f"Text analysis: {word_count} words")
207
+
208
+ # AI detection
209
+ ai_prob = detect_ai_probability_ensemble(text)
210
 
211
+ # Adjust threshold based on text characteristics
212
+ threshold = DETECTION_THRESHOLD
213
+ if word_count < 50:
214
+ threshold = max(0.4, DETECTION_THRESHOLD - 0.15)
215
+ add_log(f"Short text detected - lowering threshold to {threshold:.2f}")
216
+
217
+ # Humanization decision
218
+ if ai_prob > threshold:
219
  add_log("AI probability exceeds threshold - humanizing")
220
+ humanized, modified = enhance_humanization(text, original_text)
 
221
  else:
222
  add_log("AI probability below threshold - no changes")
223
  humanized = text
 
226
  add_log("Processing complete")
227
  return ai_prob, humanized, modified
228
 
229
+ # UI Components
230
+ with st.sidebar:
231
+ st.header("Configuration")
232
+ st.slider("Detection Threshold", 0.1, 0.9, DETECTION_THRESHOLD, 0.05, key="threshold")
233
+ st.caption("Models:")
234
+ for i, model in enumerate(MODELS["detection"]):
235
+ st.code(f"Detector {i+1}: {model}")
236
+ st.code(f"Humanizer: {MODELS['humanization']}")
237
+ st.code(f"Similarity: {MODELS['similarity']}")
238
+
239
+ if st.button("Clear Logs"):
240
+ st.session_state.logs = []
241
+ st.rerun()
242
+
243
+ st.subheader("Input")
244
+ input_text = st.text_area("Paste text to analyze and humanize",
245
+ placeholder="Enter AI-generated content here...",
246
+ height=200,
247
+ key="input_text")
248
 
249
+ if st.button("Analyze & Humanize", type="primary"):
250
  if not input_text.strip():
251
  st.warning("Please enter some text")
252
  else:
253
+ # Update threshold from UI
254
+ DETECTION_THRESHOLD = st.session_state.threshold
255
+
256
  with st.expander("Processing Logs", expanded=True):
257
  log_placeholder = st.empty()
258
 
259
  ai_prob, humanized, modified = process_text(input_text)
260
 
261
+ log_text = "\n".join(st.session_state.logs[-20:])
262
  log_placeholder.code(log_text, language="log")
263
 
264
  st.divider()
265
 
266
+ # Results display
267
  col1, col2 = st.columns(2)
268
  with col1:
269
+ st.subheader("Analysis Results")
270
+ st.metric("AI Probability", f"{ai_prob*100:.1f}%",
271
+ delta=f"{'High' if ai_prob > 0.7 else 'Medium' if ai_prob > 0.4 else 'Low'} confidence")
272
+
273
+ # Confidence indicator
274
+ confidence_level = min(int(ai_prob * 100), 100)
275
+ st.progress(confidence_level, text=f"Detection confidence: {confidence_level}%")
276
+
277
  st.subheader("Original Text")
278
  st.write(input_text)
 
279
 
280
  with col2:
281
+ status = "Humanized" if modified else "Original"
282
+ st.subheader(f"Output Text ({status})")
283
  st.write(humanized)
284
+
285
+ if modified:
286
+ # Calculate and display similarity
287
+ similarity = calculate_semantic_similarity(input_text, humanized)
288
+ st.metric("Meaning Preservation", f"{similarity*100:.1f}%")
289
+ st.success("Text successfully humanized")
290
+ else:
291
+ st.info("No changes made - text already appears human-like")
292
+
293
+ # Quality rating
294
+ if modified:
295
+ st.subheader("Quality Feedback")
296
+ quality = st.slider("How natural does the humanized text sound?",
297
+ 1, 5, 3, key="quality_rating")
298
+ if quality < 3:
299
+ st.warning("Thanks for feedback! We'll improve our algorithms.")
300
 
301
+ # Add spacing
302
+ st.divider()
303
+ st.caption("Advanced AI detection using model ensemble. Humanization preserves meaning while adding natural variation.")
 
 
 
 
 
304
 
305
# Sample texts for quick testing
st.sidebar.divider()
st.sidebar.subheader("Sample Texts")
sample_texts = {
    "Academic": "The utilization of renewable energy sources is imperative for environmental sustainability and represents a critical pathway toward decarbonizing our global energy infrastructure.",
    "Creative": "The city pulsed with predictable rhythms—lights changed on schedule, drones delivered packages, even rain fell by appointment. Yet Kael sensed a disruption, not visible but felt, like a whisper at the edge of consciousness.",
    "Technical": "Machine learning algorithms, particularly deep neural networks, require substantial computational resources during their training phases, necessitating specialized hardware accelerators such as GPUs or TPUs.",
    "Casual": "Just tried that new coffee shop downtown and wow, their cold brew is amazing! Best I've had in years, no joke."
}

def _use_sample(sample_text):
    """Button callback: prefill the input text area with a sample.

    Callbacks run before widgets are instantiated on the next script run,
    so writing to the text_area's session-state key is allowed here.
    """
    st.session_state.input_text = sample_text

for name, text in sample_texts.items():
    # Assigning st.session_state.input_text directly in the button branch
    # raised StreamlitAPIException, because the text_area with
    # key="input_text" had already been instantiated this run. An on_click
    # callback is the supported way to prefill a widget's value; it also
    # triggers a rerun, so the explicit st.rerun() is no longer needed.
    st.sidebar.button(name, key=f"sample_{name}", on_click=_use_sample, args=(text,))