changcheng967 committed on
Commit
cea2c03
·
verified ·
1 Parent(s): 624b1df

Better Accuracy

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +252 -79
src/streamlit_app.py CHANGED
@@ -1,18 +1,32 @@
1
  import streamlit as st
2
  import time
3
  import logging
4
- import torch # Missing import added here
5
- from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
 
 
 
6
 
7
- st.set_page_config(page_title="AI Humanizer", layout="wide")
8
- st.title("AI Humanizer")
9
- st.subheader("Detect AI text and convert to human-like writing")
10
 
 
 
 
 
 
11
  DETECTION_THRESHOLD = 0.65
12
- MAX_LENGTH = 64 # Reduced for CPU efficiency
 
 
13
  MODELS = {
14
- "detection": "Hello-SimpleAI/chatgpt-detector-roberta",
15
- "humanization": "humarin/chatgpt_paraphraser_on_T5_base"
 
 
 
 
16
  }
17
 
18
  if "logs" not in st.session_state:
@@ -23,78 +37,187 @@ def add_log(message):
23
  timestamp = time.strftime("%H:%M:%S")
24
  log_entry = f"[{timestamp}] {message}"
25
  st.session_state.logs.append(log_entry)
26
- logging.info(log_entry)
27
 
28
  def load_models():
29
  if not st.session_state.models_loaded:
30
- add_log("Loading detection model...")
31
- detection_tokenizer = AutoTokenizer.from_pretrained(MODELS["detection"])
32
- detection_model = AutoModelForSequenceClassification.from_pretrained(MODELS["detection"])
 
33
 
 
 
 
 
 
 
 
 
34
  add_log("Loading humanization model...")
35
- humanizer = pipeline(
36
- "text2text-generation",
37
- model=MODELS["humanization"],
38
- max_length=MAX_LENGTH,
39
- device=-1 # Force CPU usage
40
- )
 
 
 
 
41
 
42
  add_log("All models loaded successfully")
43
  st.session_state.models_loaded = True
44
- return detection_tokenizer, detection_model, humanizer
45
- return st.session_state.detection_tokenizer, st.session_state.detection_model, st.session_state.humanizer
46
-
47
- if "detection_tokenizer" not in st.session_state:
48
- with st.spinner("Loading AI models. This may take 2-3 minutes..."):
49
- detection_tokenizer, detection_model, humanizer = load_models()
50
- st.session_state.detection_tokenizer = detection_tokenizer
51
- st.session_state.detection_model = detection_model
52
- st.session_state.humanizer = humanizer
53
- else:
54
- detection_tokenizer = st.session_state.detection_tokenizer
55
- detection_model = st.session_state.detection_model
56
- humanizer = st.session_state.humanizer
57
-
58
- def detect_ai_probability(text):
59
- add_log(f"Detecting AI probability")
60
- inputs = detection_tokenizer(
61
- text,
62
- return_tensors="pt",
63
- truncation=True,
64
- max_length=MAX_LENGTH,
65
- padding=True
66
  )
 
 
 
 
 
67
 
68
- with torch.no_grad():
69
- outputs = detection_model(**inputs)
 
 
 
 
70
 
71
- probs = torch.softmax(outputs.logits, dim=1)
72
- ai_prob = probs[0][1].item()
73
- add_log(f"AI probability: {ai_prob:.4f}")
74
- return ai_prob
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- def humanize_text(text):
77
- add_log("Humanizing text...")
78
- result = humanizer(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  f"paraphrase: {text}",
80
- num_beams=1, # Reduced to 1 for CPU speed
81
- num_return_sequences=1,
82
- temperature=1.1,
83
- repetition_penalty=1.5,
84
- max_new_tokens=MAX_LENGTH
85
  )
86
- humanized = result[0]['generated_text']
87
- add_log("Humanization complete")
88
- return humanized
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  def process_text(text):
91
- add_log("Starting text processing")
92
- ai_prob = detect_ai_probability(text)
 
 
 
 
 
 
 
93
 
94
- if ai_prob > DETECTION_THRESHOLD:
 
 
 
 
 
 
 
95
  add_log("AI probability exceeds threshold - humanizing")
96
- humanized = humanize_text(text)
97
- modified = True
98
  else:
99
  add_log("AI probability below threshold - no changes")
100
  humanized = text
@@ -103,43 +226,93 @@ def process_text(text):
103
  add_log("Processing complete")
104
  return ai_prob, humanized, modified
105
 
106
- input_text = st.text_area("Input Text", placeholder="Paste AI-generated content here...", height=150)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
- if st.button("Humanize Text"):
109
  if not input_text.strip():
110
  st.warning("Please enter some text")
111
  else:
 
 
 
112
  with st.expander("Processing Logs", expanded=True):
113
  log_placeholder = st.empty()
114
 
115
  ai_prob, humanized, modified = process_text(input_text)
116
 
117
- log_text = "\n".join(st.session_state.logs[-10:])
118
  log_placeholder.code(log_text, language="log")
119
 
120
  st.divider()
121
 
 
122
  col1, col2 = st.columns(2)
123
  with col1:
 
 
 
 
 
 
 
 
124
  st.subheader("Original Text")
125
  st.write(input_text)
126
- st.metric("AI Probability", f"{ai_prob*100:.1f}%")
127
 
128
  with col2:
129
- st.subheader("Humanized Result")
 
130
  st.write(humanized)
131
- st.metric("Status", "Humanized" if modified else "Original")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
- if modified:
134
- st.success("Text successfully humanized")
135
- else:
136
- st.info("No changes needed - text already human-like")
137
-
138
- if st.sidebar.button("Clear Logs"):
139
- st.session_state.logs = []
140
- st.rerun()
141
 
 
142
  st.sidebar.divider()
143
- st.sidebar.caption("Models:")
144
- st.sidebar.code(f"Detector: {MODELS['detection']}")
145
- st.sidebar.code(f"Humanizer: {MODELS['humanization']}")
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import time
3
  import logging
4
+ import torch
5
+ import re
6
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
7
+ from sentence_transformers import SentenceTransformer, util
8
+ import numpy as np
9
 
10
+ # Configure logging
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
 
14
+ st.set_page_config(page_title="AI Humanizer Pro", layout="wide")
15
+ st.title("AI Humanizer Pro")
16
+ st.subheader("Advanced AI detection and humanization")
17
+
18
+ # Enhanced configuration
19
  DETECTION_THRESHOLD = 0.65
20
+ MAX_LENGTH = 128
21
+ ENSEMBLE_WEIGHTS = [0.6, 0.4] # Weighting for model ensemble
22
+
23
  MODELS = {
24
+ "detection": [
25
+ "Hello-SimpleAI/chatgpt-detector-roberta", # Specialized in ChatGPT detection
26
+ "microsoft/deberta-v3-base" # Powerful general classifier
27
+ ],
28
+ "humanization": "humarin/chatgpt_paraphraser_on_T5_base",
29
+ "similarity": "all-MiniLM-L6-v2" # For semantic similarity check
30
  }
31
 
32
  if "logs" not in st.session_state:
 
37
  timestamp = time.strftime("%H:%M:%S")
38
  log_entry = f"[{timestamp}] {message}"
39
  st.session_state.logs.append(log_entry)
40
+ logger.info(log_entry)
41
 
42
def load_models():
    """Load (or fetch cached) detection, humanization, and similarity models.

    On the first call this downloads every model and returns fresh instances;
    the caller is expected to stash them in st.session_state. On later calls
    the cached instances are returned.

    Returns:
        tuple: (detection_tokenizers, detection_models, humanizer, similarity_model)
            where humanizer is a {"tokenizer": ..., "model": ...} dict.
    """
    # Reuse the cache only when the caller actually persisted the models.
    # Checking the models_loaded flag alone raised AttributeError when loading
    # succeeded but the session state was never populated (partial failure).
    if st.session_state.models_loaded and "detection_tokenizers" in st.session_state:
        return (
            st.session_state.detection_tokenizers,
            st.session_state.detection_models,
            st.session_state.humanizer,
            st.session_state.similarity_model
        )

    # Detection models (one tokenizer/model pair per configured detector).
    add_log("Loading detection models...")
    detection_tokenizers = []
    detection_models = []

    for model_name in MODELS["detection"]:
        add_log(f"Loading {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        detection_tokenizers.append(tokenizer)
        detection_models.append(model)

    # Humanization model — tokenizer and seq2seq model are kept together so
    # callers can drive generate() directly instead of a pipeline.
    add_log("Loading humanization model...")
    humanizer_tokenizer = AutoTokenizer.from_pretrained(MODELS["humanization"])
    humanizer_model = AutoModelForSeq2SeqLM.from_pretrained(MODELS["humanization"])
    humanizer = {
        "tokenizer": humanizer_tokenizer,
        "model": humanizer_model
    }

    # Sentence-embedding model used for meaning-preservation checks.
    add_log("Loading semantic similarity model...")
    similarity_model = SentenceTransformer(MODELS["similarity"])

    add_log("All models loaded successfully")
    st.session_state.models_loaded = True
    return detection_tokenizers, detection_models, humanizer, similarity_model
79
+
80
# One-time model bootstrap with a visible progress indicator; reruns skip
# straight to the cached instances in session state.
if not st.session_state.get("models_initialized", False):
    load_progress = st.progress(0)
    load_status = st.empty()

    load_status.text("Initializing models (this may take 2-3 minutes)...")
    load_progress.progress(10)

    detection_tokenizers, detection_models, humanizer, similarity_model = load_models()

    load_progress.progress(60)

    # Persist everything so subsequent reruns do not reload from disk/network.
    st.session_state.detection_tokenizers = detection_tokenizers
    st.session_state.detection_models = detection_models
    st.session_state.humanizer = humanizer
    st.session_state.similarity_model = similarity_model
    st.session_state.models_initialized = True

    load_progress.progress(100)
    time.sleep(0.5)
    load_progress.empty()
    load_status.empty()

# Bind the cached models to the module-level names the helpers below use.
detection_tokenizers = st.session_state.detection_tokenizers
detection_models = st.session_state.detection_models
humanizer = st.session_state.humanizer
similarity_model = st.session_state.similarity_model
109
 
110
def preprocess_text(text):
    """Clean and normalize text for better detection.

    Characters outside the allowed set (word characters, whitespace, and
    common punctuation including apostrophes and hyphens) are replaced with
    a space — not deleted — so that removing a separator such as an em dash
    or a bracket never fuses two words together ("test—ok" -> "test ok",
    not "testok"). Whitespace runs are then collapsed to single spaces.

    Args:
        text: raw input string.

    Returns:
        The cleaned, stripped string.
    """
    # Replace (don't delete) disallowed characters; deletion fused words.
    text = re.sub(r"[^\w\s.,;:!?'-]", " ", text)
    # Collapse whitespace after substitution so replacement spaces merge too.
    text = re.sub(r"\s+", " ", text)
    return text.strip()
115
+
116
def detect_ai_probability_ensemble(text):
    """Estimate the probability that `text` is AI-generated via an ensemble.

    Each configured detector scores the (preprocessed) text; the per-model
    probabilities are combined with ENSEMBLE_WEIGHTS. The weights are
    renormalized over the models actually scored, so the result stays a
    valid probability even if the weight list and model list fall out of
    sync (zip() would otherwise silently skew the score).

    Args:
        text: input string to score.

    Returns:
        float: ensemble AI probability in [0, 1].
    """
    text = preprocess_text(text)
    add_log("Running ensemble AI detection")
    probabilities = []

    for i, (tokenizer, model) in enumerate(zip(detection_tokenizers, detection_models)):
        add_log(f"Processing with model {i+1}")
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
            padding=True
        )

        with torch.no_grad():
            outputs = model(**inputs)

        probs = torch.softmax(outputs.logits, dim=1)
        # NOTE(review): assumes label index 1 means "AI-generated" for every
        # detector in the ensemble — confirm each model's label mapping.
        ai_prob = probs[0][1].item()
        probabilities.append(ai_prob)
        add_log(f"Model {i+1} AI probability: {ai_prob:.4f}")

    # Weighted ensemble, renormalized over the weights actually consumed.
    # Identical to the plain weighted sum when the weights sum to 1 and
    # cover every model (the configured case).
    weighted = sum(w * p for w, p in zip(ENSEMBLE_WEIGHTS, probabilities))
    weight_total = sum(w for w, _ in zip(ENSEMBLE_WEIGHTS, probabilities))
    ensemble_prob = weighted / weight_total if weight_total else 0.0
    add_log(f"Ensemble AI probability: {ensemble_prob:.4f}")
    return ensemble_prob
144
+
145
def calculate_semantic_similarity(original, humanized):
    """Return the cosine similarity between the embeddings of two texts."""
    original_vec, humanized_vec = similarity_model.encode([original, humanized])
    return util.cos_sim(original_vec, humanized_vec).item()
150
+
151
def enhance_humanization(text, original):
    """Paraphrase `text` and return the candidate that best preserves meaning.

    Generates several beam-search candidates, scores each against `original`
    with the sentence-similarity model, and returns the closest one. Falls
    back to the original text when no candidate preserves enough meaning.

    Args:
        text: text to paraphrase (fed to the T5 paraphraser).
        original: reference text used for the meaning-preservation check.

    Returns:
        tuple[str, bool]: (chosen text, True if it is a paraphrase,
        False if the original was kept).
    """
    add_log("Starting enhanced humanization")

    inputs = humanizer["tokenizer"](
        f"paraphrase: {text}",
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH,
        padding=True
    )

    with torch.no_grad():
        # Beam search here is deterministic. The previous version also passed
        # temperature=1.4, but temperature only applies when do_sample=True,
        # so it was a silent no-op (transformers warns about it) — removed.
        outputs = humanizer["model"].generate(
            **inputs,
            max_length=MAX_LENGTH,
            num_beams=5,
            num_return_sequences=3,
            repetition_penalty=2.5,
            early_stopping=True
        )

    candidates = [
        humanizer["tokenizer"].decode(output, skip_special_tokens=True)
        for output in outputs
    ]

    # Pick the candidate whose meaning is closest to the original.
    best_candidate = None
    best_similarity = 0.0

    for candidate in candidates:
        similarity = calculate_semantic_similarity(original, candidate)
        if similarity > best_similarity:
            best_similarity = similarity
            best_candidate = candidate

    add_log(f"Selected humanized text with similarity: {best_similarity:.4f}")

    # Quality gate: reject paraphrases that drift too far from the original
    # meaning. The None check also covers the edge case where every candidate
    # scored <= 0 and no best candidate was ever assigned.
    if best_candidate is None or best_similarity < 0.7:
        add_log("Low similarity detected, using original text")
        return original, False

    return best_candidate, True
 
200
  def process_text(text):
201
+ add_log("Starting advanced text processing")
202
+ original_text = text # Preserve original for comparison
203
+
204
+ # Text analysis
205
+ word_count = len(text.split())
206
+ add_log(f"Text analysis: {word_count} words")
207
+
208
+ # AI detection
209
+ ai_prob = detect_ai_probability_ensemble(text)
210
 
211
+ # Adjust threshold based on text characteristics
212
+ threshold = DETECTION_THRESHOLD
213
+ if word_count < 50:
214
+ threshold = max(0.4, DETECTION_THRESHOLD - 0.15)
215
+ add_log(f"Short text detected - lowering threshold to {threshold:.2f}")
216
+
217
+ # Humanization decision
218
+ if ai_prob > threshold:
219
  add_log("AI probability exceeds threshold - humanizing")
220
+ humanized, modified = enhance_humanization(text, original_text)
 
221
  else:
222
  add_log("AI probability below threshold - no changes")
223
  humanized = text
 
226
  add_log("Processing complete")
227
  return ai_prob, humanized, modified
228
 
229
+ # UI Components
230
+ with st.sidebar:
231
+ st.header("Configuration")
232
+ st.slider("Detection Threshold", 0.1, 0.9, DETECTION_THRESHOLD, 0.05, key="threshold")
233
+ st.caption("Models:")
234
+ for i, model in enumerate(MODELS["detection"]):
235
+ st.code(f"Detector {i+1}: {model}")
236
+ st.code(f"Humanizer: {MODELS['humanization']}")
237
+ st.code(f"Similarity: {MODELS['similarity']}")
238
+
239
+ if st.button("Clear Logs"):
240
+ st.session_state.logs = []
241
+ st.rerun()
242
+
243
+ st.subheader("Input")
244
+ input_text = st.text_area("Paste text to analyze and humanize",
245
+ placeholder="Enter AI-generated content here...",
246
+ height=200,
247
+ key="input_text")
248
 
249
+ if st.button("Analyze & Humanize", type="primary"):
250
  if not input_text.strip():
251
  st.warning("Please enter some text")
252
  else:
253
+ # Update threshold from UI
254
+ DETECTION_THRESHOLD = st.session_state.threshold
255
+
256
  with st.expander("Processing Logs", expanded=True):
257
  log_placeholder = st.empty()
258
 
259
  ai_prob, humanized, modified = process_text(input_text)
260
 
261
+ log_text = "\n".join(st.session_state.logs[-20:])
262
  log_placeholder.code(log_text, language="log")
263
 
264
  st.divider()
265
 
266
+ # Results display
267
  col1, col2 = st.columns(2)
268
  with col1:
269
+ st.subheader("Analysis Results")
270
+ st.metric("AI Probability", f"{ai_prob*100:.1f}%",
271
+ delta=f"{'High' if ai_prob > 0.7 else 'Medium' if ai_prob > 0.4 else 'Low'} confidence")
272
+
273
+ # Confidence indicator
274
+ confidence_level = min(int(ai_prob * 100), 100)
275
+ st.progress(confidence_level, text=f"Detection confidence: {confidence_level}%")
276
+
277
  st.subheader("Original Text")
278
  st.write(input_text)
 
279
 
280
  with col2:
281
+ status = "Humanized" if modified else "Original"
282
+ st.subheader(f"Output Text ({status})")
283
  st.write(humanized)
284
+
285
+ if modified:
286
+ # Calculate and display similarity
287
+ similarity = calculate_semantic_similarity(input_text, humanized)
288
+ st.metric("Meaning Preservation", f"{similarity*100:.1f}%")
289
+ st.success("Text successfully humanized")
290
+ else:
291
+ st.info("No changes made - text already appears human-like")
292
+
293
+ # Quality rating
294
+ if modified:
295
+ st.subheader("Quality Feedback")
296
+ quality = st.slider("How natural does the humanized text sound?",
297
+ 1, 5, 3, key="quality_rating")
298
+ if quality < 3:
299
+ st.warning("Thanks for feedback! We'll improve our algorithms.")
300
 
301
+ # Add spacing
302
+ st.divider()
303
+ st.caption("Advanced AI detection using model ensemble. Humanization preserves meaning while adding natural variation.")
 
 
 
 
 
304
 
305
# Sample texts for quick testing
st.sidebar.divider()
st.sidebar.subheader("Sample Texts")
sample_texts = {
    "Academic": "The utilization of renewable energy sources is imperative for environmental sustainability and represents a critical pathway toward decarbonizing our global energy infrastructure.",
    "Creative": "The city pulsed with predictable rhythms—lights changed on schedule, drones delivered packages, even rain fell by appointment. Yet Kael sensed a disruption, not visible but felt, like a whisper at the edge of consciousness.",
    "Technical": "Machine learning algorithms, particularly deep neural networks, require substantial computational resources during their training phases, necessitating specialized hardware accelerators such as GPUs or TPUs.",
    "Casual": "Just tried that new coffee shop downtown and wow, their cold brew is amazing! Best I've had in years, no joke."
}

def _use_sample(sample_text):
    """Button callback: prefill the input text area with a sample.

    Callbacks run before widgets are instantiated on the next script run,
    so writing to the text_area's session-state key is allowed here.
    """
    st.session_state.input_text = sample_text

for name, text in sample_texts.items():
    # Assigning st.session_state.input_text directly in the button branch
    # raised StreamlitAPIException, because the text_area with
    # key="input_text" had already been instantiated this run. An on_click
    # callback is the supported way to prefill a widget's value; it also
    # triggers a rerun, so the explicit st.rerun() is no longer needed.
    st.sidebar.button(name, key=f"sample_{name}", on_click=_use_sample, args=(text,))