# NadaPlagiarism / app.py
# Hugging Face Spaces page header (author: drvsbrkcn, commit 58478cd, verified):
# "Modified app.py to fulfill the requirements needed to publish the app on Hugging Face Spaces."
#!/usr/bin/env python3
"""
🤖➡️👨 NadaPlagiarism - Hugging Face Spaces
==========================================
Enterprise-grade AI text humanization optimized for Hugging Face Spaces.
Features advanced humanization, quality analytics, and modern UI.
"""
import gradio as gr
import torch
import random
import re
import warnings
import math
import statistics
import time
import asyncio
from collections import Counter, defaultdict
from typing import List, Dict, Optional, Union, Tuple
from dataclasses import dataclass
from datetime import datetime
import logging
# Suppress warnings
warnings.filterwarnings("ignore")
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# ============================================================================
# CORE HUMANIZATION ENGINE
# ============================================================================
class NadaPlagiarism:
    """Enterprise-grade AI text humanization engine.

    Combines optional transformer paraphrasers (T5 / Pegasus) with
    rule-based vocabulary, sentence-structure, and tone transformations.
    If the ``transformers`` package or a model download is unavailable,
    processing degrades gracefully to the pure rule-based pipeline.
    """

    def __init__(self):
        # model name -> {"tokenizer": ..., "model": ..., "status": "active"|"fallback"}
        self.models = {}
        self.quality_analyzer = QualityAnalyzer()
        self._load_linguistic_databases()
        self._initialize_models()
        logger.info("NadaPlagiarism initialized successfully")

    def _initialize_models(self):
        """Initialize AI models with fallbacks.

        Each model is loaded independently so one failure does not block the
        other; the rule-based entry is always registered as a guaranteed
        fallback.
        """
        try:
            from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
            # Try to load T5 model
            try:
                self.models["t5_paraphrase"] = {
                    "tokenizer": AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws", use_fast=False),
                    "model": AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws"),
                    "status": "active"
                }
                logger.info("✅ T5 model loaded successfully")
            except Exception as e:
                logger.warning(f"⚠️ T5 model failed: {e}")
                self.models["t5_paraphrase"] = {"status": "fallback"}
            # Try to load Pegasus model
            try:
                self.models["pegasus"] = {
                    "tokenizer": AutoTokenizer.from_pretrained("tuner007/pegasus_paraphrase", use_fast=False),
                    "model": AutoModelForSeq2SeqLM.from_pretrained("tuner007/pegasus_paraphrase"),
                    "status": "active"
                }
                logger.info("✅ Pegasus model loaded successfully")
            except Exception as e:
                logger.warning(f"⚠️ Pegasus model failed: {e}")
                self.models["pegasus"] = {"status": "fallback"}
        except ImportError:
            logger.warning("Transformers not available, using rule-based fallback")
        # Always have rule-based fallback
        self.models["rule_based"] = {"status": "active"}

    def _load_linguistic_databases(self):
        """Load the static vocabulary / phrase databases used by the rule-based passes."""
        # Synonym pools keyed by the word they replace (matched whole-word, case-insensitive).
        self.vocabulary_db = {
            "academic_formal": {
                "demonstrates": ["shows", "reveals", "indicates", "illustrates", "displays"],
                "significant": ["notable", "considerable", "substantial", "important", "remarkable"],
                "utilize": ["use", "employ", "apply", "implement", "make use of"],
                "facilitate": ["help", "enable", "assist", "support", "aid", "promote"],
                "optimize": ["improve", "enhance", "refine", "perfect", "better", "streamline"],
                "comprehensive": ["complete", "thorough", "extensive", "detailed", "full", "inclusive"],
                "robust": ["strong", "reliable", "sturdy", "solid", "dependable"],
                "methodology": ["method", "approach", "technique", "procedure", "process", "strategy"],
                "implementation": ["application", "execution", "deployment", "introduction", "adoption"],
                "evaluation": ["assessment", "analysis", "examination", "review", "appraisal"]
            },
            "transitional": {
                "furthermore": ["moreover", "additionally", "in addition", "what is more", "besides"],
                "however": ["nevertheless", "nonetheless", "yet", "on the other hand", "but"],
                "therefore": ["thus", "hence", "consequently", "as a result", "for this reason"],
                "moreover": ["furthermore", "additionally", "in addition", "what is more"],
                "consequently": ["therefore", "thus", "hence", "as a result", "accordingly"]
            }
        }
        # Openers occasionally prepended to non-initial sentences.
        self.sentence_starters = [
            "Research indicates that", "Studies suggest", "Evidence shows",
            "Findings reveal", "Data demonstrates", "Analysis suggests",
            "Results indicate", "Observations show", "Investigations reveal"
        ]
        # Connectors occasionally prepended in the heavy pipeline.
        self.academic_connectors = [
            "Moreover", "Furthermore", "Additionally", "In contrast", "Similarly",
            "Consequently", "Nevertheless", "Notably", "Importantly", "Significantly"
        ]
        # Hedges inserted after a determiner that opens a sentence.
        self.hedging_phrases = [
            "appears to", "seems to", "tends to", "suggests that", "indicates that",
            "may well", "might be", "could be", "potentially", "presumably"
        ]

    def humanize_text(self, text: str, level: str = "medium") -> Dict:
        """Humanize *text* and return the result with quality metrics.

        Args:
            text: Input text; must be at least 10 characters after stripping.
            level: "light", "medium", or "heavy". Unknown values fall back
                to "medium".

        Returns:
            Dict with keys "humanized_text", "quality_metrics",
            "processing_time" (seconds), and "model_used".
        """
        if not text or len(text.strip()) < 10:
            return {
                "humanized_text": "Please enter substantial text to humanize (at least 10 characters).",
                "quality_metrics": {},
                "processing_time": 0,
                "model_used": "none"
            }
        start_time = time.time()
        try:
            # Select best available model
            model_name = self._select_best_model()
            # Dispatch on level; unknown levels use the medium pipeline.
            pipelines = {
                "light": self._light_humanization,
                "medium": self._medium_humanization,
                "heavy": self._heavy_humanization,
            }
            humanized_text = pipelines.get(level, self._medium_humanization)(text, model_name)
            # Calculate quality metrics
            quality_metrics = self.quality_analyzer.analyze_quality(text, humanized_text)
            return {
                "humanized_text": humanized_text,
                "quality_metrics": quality_metrics,
                "processing_time": time.time() - start_time,
                "model_used": model_name
            }
        except Exception as e:
            logger.error(f"Humanization failed: {e}")
            return {
                "humanized_text": f"Error processing text: {str(e)}",
                "quality_metrics": {},
                "processing_time": time.time() - start_time,
                "model_used": "error"
            }

    def _select_best_model(self) -> str:
        """Return the first registered model that is active and fully loaded."""
        for model_name, model_info in self.models.items():
            if model_info.get("status") == "active" and model_info.get("model"):
                return model_name
        return "rule_based"

    def _light_humanization(self, text: str, model_name: str) -> str:
        """Light humanization - basic vocabulary changes only (no AI pass)."""
        result = text
        result = self._apply_vocabulary_replacements(result, intensity=0.3)
        result = self._add_natural_variations(result, intensity=0.2)
        return result

    def _medium_humanization(self, text: str, model_name: str) -> str:
        """Medium humanization - AI paraphrase (if available) + vocabulary + flow + structure."""
        result = text
        # Use AI model if available
        if model_name != "rule_based" and self.models[model_name].get("model"):
            result = self._ai_paraphrase(result, model_name)
        result = self._apply_vocabulary_replacements(result, intensity=0.5)
        result = self._add_natural_variations(result, intensity=0.4)
        result = self._adjust_sentence_structure(result, intensity=0.3)
        return result

    def _heavy_humanization(self, text: str, model_name: str) -> str:
        """Heavy humanization - every technique applied at higher intensity."""
        result = text
        # Use AI model if available
        if model_name != "rule_based" and self.models[model_name].get("model"):
            result = self._ai_paraphrase(result, model_name)
        result = self._apply_vocabulary_replacements(result, intensity=0.7)
        result = self._add_natural_variations(result, intensity=0.6)
        result = self._adjust_sentence_structure(result, intensity=0.5)
        result = self._add_academic_connectors(result)
        result = self._add_hedging_language(result)
        return result

    def _ai_paraphrase(self, text: str, model_name: str) -> str:
        """Paraphrase *text* with the named model; return input unchanged on any failure."""
        try:
            model_info = self.models[model_name]
            # .get(): fallback entries only contain "status", not these keys.
            tokenizer = model_info.get("tokenizer")
            model = model_info.get("model")
            if not tokenizer or not model:
                return text
            # T5 expects a task prefix; Pegasus takes raw text.
            if model_name == "t5_paraphrase":
                input_text = f"paraphrase: {text}"
            else:
                input_text = text
            input_ids = tokenizer.encode(
                input_text,
                return_tensors="pt",
                max_length=512,
                truncation=True
            )
            with torch.no_grad():
                outputs = model.generate(
                    input_ids=input_ids,
                    max_length=min(len(text.split()) + 50, 512),
                    num_beams=5,
                    temperature=1.2,
                    top_p=0.9,
                    do_sample=True,
                    early_stopping=True,
                    repetition_penalty=1.1
                )
            result = tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Guard against degenerate (empty or too-short) generations.
            return result if result and len(result) > 10 else text
        except Exception as e:
            logger.warning(f"AI paraphrasing failed: {e}")
            return text

    def _apply_vocabulary_replacements(self, text: str, intensity: float = 0.5) -> str:
        """Replace database words with random synonyms; *intensity* is the per-word probability."""
        result = text
        for category, replacements in self.vocabulary_db.items():
            for original, alternatives in replacements.items():
                if random.random() < intensity:
                    replacement = random.choice(alternatives)
                    pattern = re.compile(r'\b' + re.escape(original) + r'\b', re.IGNORECASE)
                    # Only the first occurrence is replaced to keep the text varied.
                    result = pattern.sub(replacement, result, count=1)
        return result

    def _add_natural_variations(self, text: str, intensity: float = 0.4) -> str:
        """Add hedging phrases and sentence starters for a more human cadence."""
        sentences = self._split_sentences(text)
        varied_sentences = []
        for i, sentence in enumerate(sentences):
            sentence = sentence.strip()
            if not sentence:
                continue
            # Occasionally hedge a sentence that opens with a determiner.
            if random.random() < intensity * 0.3:
                hedge = random.choice(self.hedging_phrases)
                for opener in ("The ", "This ", "These "):
                    if sentence.startswith(opener):
                        # Insert only at the sentence start (the previous
                        # replace()-based version also hit later occurrences
                        # of the other determiners in the same sentence).
                        sentence = opener + hedge + " " + sentence[len(opener):]
                        break
            # Occasionally prepend a starter to non-initial sentences.
            if i > 0 and random.random() < intensity * 0.4:
                starter = random.choice(self.sentence_starters)
                sentence = f"{starter}, {sentence.lower()}"
            varied_sentences.append(sentence)
        return " ".join(varied_sentences)

    def _adjust_sentence_structure(self, text: str, intensity: float = 0.3) -> str:
        """Break overly long sentences (> 25 words) at a conjunction for variety."""
        sentences = self._split_sentences(text)
        modified = []
        for sentence in sentences:
            words = sentence.split()
            if len(words) > 25 and random.random() < intensity:
                break_point = self._find_break_point(words)
                if break_point:
                    part1 = " ".join(words[:break_point]) + "."
                    part2 = " ".join(words[break_point:])
                    if part2:
                        part2 = part2[0].upper() + part2[1:]
                    modified.extend([part1, part2])
                else:
                    modified.append(sentence)
            else:
                modified.append(sentence)
        return " ".join(modified)

    def _add_academic_connectors(self, text: str) -> str:
        """Prepend academic connectors to ~30% of non-initial sentences."""
        sentences = self._split_sentences(text)
        connected = []
        for i, sentence in enumerate(sentences):
            if i > 0 and random.random() < 0.3:
                connector = random.choice(self.academic_connectors)
                sentence = f"{connector}, {sentence.lower()}"
            connected.append(sentence)
        return " ".join(connected)

    def _add_hedging_language(self, text: str) -> str:
        """Add hedging language after "The"/"This" openers in ~20% of sentences."""
        sentences = self._split_sentences(text)
        hedged = []
        for sentence in sentences:
            if random.random() < 0.2:
                hedge = random.choice(self.hedging_phrases)
                for opener in ("The ", "This "):
                    if sentence.startswith(opener):
                        # Prefix-only insertion; see _add_natural_variations.
                        sentence = opener + hedge + " " + sentence[len(opener):]
                        break
            hedged.append(sentence)
        return " ".join(hedged)

    def _split_sentences(self, text: str) -> List[str]:
        """Split text on .!? terminators, keeping the punctuation and dropping fragments <= 5 chars."""
        sentences = []
        current = ""
        for char in text:
            current += char
            if char in '.!?':
                if len(current.strip()) > 5:
                    sentences.append(current.strip())
                current = ""
        if current.strip():
            sentences.append(current.strip())
        return [s for s in sentences if len(s.strip()) > 5]

    def _find_break_point(self, words: List[str]) -> Optional[int]:
        """Return the index of the first conjunction between words 8 and 17, or None."""
        break_words = ['and', 'but', 'which', 'that', 'because', 'since', 'while']
        for i in range(8, min(18, len(words))):
            if words[i].lower().rstrip('.,') in break_words:
                return i
        return None
# ============================================================================
# QUALITY ANALYZER
# ============================================================================
class QualityAnalyzer:
    """Advanced quality analysis for humanized text.

    Produces heuristic 0-100 scores for readability, naturalness, academic
    tone, AI-detection likelihood, and fluency, plus an overall mean.
    """

    def __init__(self):
        # Heuristic signals that commonly indicate AI-generated prose.
        self.ai_patterns = {
            "ai_phrases": [
                "demonstrates significant", "substantial improvements", "comprehensive analysis",
                "furthermore", "moreover", "additionally", "consequently", "therefore",
                "implementation of", "utilization of", "optimization of", "enhancement of"
            ],
            "overused_words": [
                "significant", "substantial", "comprehensive", "extensive", "robust",
                "novel", "innovative", "efficient", "effective", "optimal", "superior"
            ]
        }

    def analyze_quality(self, original_text: str, humanized_text: str) -> Dict[str, float]:
        """Score *humanized_text* on all metrics and return them as a dict.

        Note: *original_text* is currently unused but kept for interface
        stability (callers pass both texts).
        """
        metrics = {
            "readability": self._analyze_readability(humanized_text),
            "naturalness": self._analyze_naturalness(humanized_text),
            "academic_tone": self._analyze_academic_tone(humanized_text),
            "ai_detection": self._analyze_ai_detection(humanized_text),
            "fluency": self._analyze_fluency(humanized_text),
        }
        # Overall quality: unweighted mean; AI-detection is inverted because
        # a LOW detection score is desirable.
        metrics["overall_quality"] = statistics.mean([
            metrics["readability"],
            metrics["naturalness"],
            metrics["academic_tone"],
            100 - metrics["ai_detection"],
            metrics["fluency"]
        ])
        return metrics

    def _analyze_readability(self, text: str) -> float:
        """Approximate Flesch Reading Ease, rescaled to 0-100 (50 on failure)."""
        try:
            words = text.split()
            sentences = len([s for s in text.split('.') if s.strip()])
            if sentences == 0:
                return 50
            avg_words_per_sentence = len(words) / sentences
            avg_syllables_per_word = self._calculate_avg_syllables(words)
            # Flesch Reading Ease approximation
            flesch_score = 206.835 - (1.015 * avg_words_per_sentence) - (84.6 * avg_syllables_per_word)
            # Flesch can be negative or > 100; shift and clamp into [0, 100].
            readability_score = max(0, min(100, (flesch_score + 100) / 2))
            return readability_score
        except Exception:
            # Narrowed from a bare except: still defensive, but no longer
            # swallows KeyboardInterrupt / SystemExit.
            return 50

    def _calculate_avg_syllables(self, words: List[str]) -> float:
        """Average syllables per word (0 for an empty word list)."""
        total_syllables = sum(self._count_syllables(word) for word in words)
        return total_syllables / len(words) if words else 0

    def _count_syllables(self, word: str) -> int:
        """Count syllables via vowel groups, with a silent-'e' adjustment; minimum 1."""
        word = word.lower().strip('.,!?;:')
        vowels = 'aeiouy'
        syllable_count = 0
        prev_was_vowel = False
        for char in word:
            is_vowel = char in vowels
            if is_vowel and not prev_was_vowel:
                syllable_count += 1
            prev_was_vowel = is_vowel
        # Trailing silent 'e' usually does not add a syllable.
        if word.endswith('e') and syllable_count > 1:
            syllable_count -= 1
        return max(1, syllable_count)

    def _analyze_naturalness(self, text: str) -> float:
        """Score naturalness from pronouns, contrastive transitions, and hedges (10 pts each)."""
        natural_indicators = [
            len(re.findall(r'\b(I|we|you|they)\b', text)),          # personal pronouns
            len(re.findall(r'\b(however|but|yet|although)\b', text)),  # natural transitions
            len(re.findall(r'\b(seems|appears|tends|suggests)\b', text)),  # hedging language
        ]
        score = min(100, sum(natural_indicators) * 10)
        return max(0, score)

    def _analyze_academic_tone(self, text: str) -> float:
        """Score academic tone from scholarly vocabulary and connectors (15 pts each)."""
        academic_indicators = [
            len(re.findall(r'\b(research|study|analysis|findings|results)\b', text)),
            len(re.findall(r'\b(according to|based on|in light of)\b', text)),
            len(re.findall(r'\b(furthermore|moreover|additionally)\b', text)),
        ]
        score = min(100, sum(academic_indicators) * 15)
        return max(0, score)

    def _analyze_ai_detection(self, text: str) -> float:
        """Estimate AI-detection likelihood from known phrases/words (20 pts per hit, capped)."""
        text_lower = text.lower()
        ai_phrase_count = sum(1 for phrase in self.ai_patterns["ai_phrases"] if phrase in text_lower)
        overused_count = sum(1 for word in self.ai_patterns["overused_words"] if word in text_lower)
        words = len(text.split())
        if words == 0:
            return 0
        ai_score = min(100, (ai_phrase_count + overused_count) * 20)
        return max(0, ai_score)

    def _analyze_fluency(self, text: str) -> float:
        """Score fluency from sentence-length variance (more variation reads as more human)."""
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        if len(sentences) < 2:
            return 50
        lengths = [len(s.split()) for s in sentences]
        length_variance = statistics.variance(lengths) if len(lengths) > 1 else 0
        # Baseline of 50, boosted by variance, capped at 100.
        fluency_score = min(100, 50 + (length_variance * 2))
        return max(0, fluency_score)
# ============================================================================
# GRADIO INTERFACE
# ============================================================================
# Initialize humanizer (module-level singleton shared by all requests)
humanizer = NadaPlagiarism()


def process_text(text: str, level: str) -> Tuple[str, Dict]:
    """Humanize *text* at the requested level and format metrics for display.

    Returns the humanized text and a dict mapping titled metric names to
    percentage strings like "87.5%".
    """
    if not text.strip():
        return "Please enter some text to humanize.", {}
    result = humanizer.humanize_text(text, level)
    quality_display = {
        name.replace("_", " ").title(): f"{score:.1f}%"
        for name, score in result["quality_metrics"].items()
    }
    return result["humanized_text"], quality_display
def analyze_quality(text: str) -> Dict:
    """Run the quality analyzer on raw text and format scores as percentages.

    The text is analyzed against itself (no humanized counterpart); an
    empty/whitespace input yields an empty dict.
    """
    if not text.strip():
        return {}
    metrics = QualityAnalyzer().analyze_quality(text, text)
    return {
        name.replace("_", " ").title(): f"{score:.1f}%"
        for name, score in metrics.items()
    }
# Create Gradio interface
# NOTE(review): the nesting of this Blocks section relies on indentation that
# appears to have been stripped in this copy of the file — restore the original
# indentation before running.
with gr.Blocks(
theme=gr.themes.Soft(),
title="🤖➡️👨 NadaPlagiarism",
css="""
.gradio-container {
max-width: 1200px !important;
margin: auto !important;
}
.main-header {
text-align: center;
padding: 20px;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
border-radius: 10px;
margin-bottom: 20px;
}
.metric-card {
background: #f8f9fa;
border: 1px solid #dee2e6;
border-radius: 8px;
padding: 15px;
margin: 10px 0;
}
"""
) as demo:
# Header
gr.HTML("""
<div class="main-header">
<h1>🤖➡️👨 NadaPlagiarism</h1>
<p>Enterprise-grade AI text humanization with advanced analytics</p>
</div>
""")
# Three tabs: humanizer, standalone quality analysis, and a static about page.
with gr.Tabs():
# Main Humanization Tab
with gr.TabItem("🎭 Text Humanizer"):
gr.Markdown("### Transform AI text into natural, human-like writing")
with gr.Row():
with gr.Column(scale=2):
text_input = gr.Textbox(
lines=8,
placeholder="Enter your AI-generated text here...",
label="Input Text",
info="Paste the text you want to humanize"
)
level_radio = gr.Radio(
choices=[
("Light", "light"),
("Medium", "medium"),
("Heavy", "heavy")
],
value="medium",
label="Humanization Level",
info="Choose the intensity of humanization"
)
with gr.Row():
humanize_btn = gr.Button("🚀 Humanize Text", variant="primary", size="lg")
analyze_btn = gr.Button("📊 Analyze Quality", variant="secondary")
with gr.Column(scale=2):
text_output = gr.Textbox(
lines=8,
label="Humanized Text",
show_copy_button=True,
info="Your humanized text will appear here"
)
quality_metrics = gr.JSON(
label="Quality Metrics",
value={}
)
# Examples
gr.Examples(
examples=[
[
"The implementation of machine learning algorithms demonstrates significant improvements in computational efficiency and accuracy metrics across various benchmark datasets.",
"medium"
],
[
"Artificial intelligence technologies are increasingly being utilized across various industries to enhance operational capabilities and drive innovation.",
"heavy"
]
],
inputs=[text_input, level_radio],
outputs=[text_output]
)
# Quality Analysis Tab
with gr.TabItem("📊 Quality Analysis"):
gr.Markdown("### Analyze text quality and get improvement suggestions")
with gr.Row():
with gr.Column():
analysis_input = gr.Textbox(
lines=6,
placeholder="Enter text to analyze...",
label="Text to Analyze"
)
analyze_quality_btn = gr.Button("🔍 Analyze Quality", variant="primary")
with gr.Column():
analysis_output = gr.JSON(
label="Quality Analysis Results",
value={}
)
# Info Tab
with gr.TabItem("ℹ️ About"):
gr.Markdown("""
### 🎯 NadaPlagiarism
**Enterprise-grade AI text humanization with advanced features:**
#### ✨ Features:
- **Multiple AI Models**: T5, Pegasus with intelligent fallbacks
- **Quality Analytics**: 5+ comprehensive quality metrics
- **Advanced Humanization**: Vocabulary, structure, and flow optimization
- **Real-time Feedback**: Quality scores and improvement suggestions
- **Modern Interface**: Responsive design with analytics dashboard
#### 📊 Quality Metrics:
- **Readability**: Flesch Reading Ease approximation
- **Naturalness**: Personal pronouns, natural transitions
- **Academic Tone**: Scholarly language preservation
- **AI Detection**: Pattern recognition and phrase analysis
- **Fluency**: Sentence flow and coherence
- **Overall Quality**: Weighted composite score
#### 🎮 How to Use:
1. **Paste your AI-generated text** in the input box
2. **Choose humanization level** (Light/Medium/Heavy)
3. **Click "Humanize Text"** to transform your text
4. **View quality metrics** and improvement suggestions
5. **Copy the humanized text** for your use
#### ⚖️ Ethical Usage:
This tool is designed for improving writing quality and learning natural language patterns.
Please use responsibly and maintain academic integrity.
""")
# Event handlers: the humanize button fills both outputs; both analyze
# buttons call the same handler but target different JSON panels.
humanize_btn.click(
fn=process_text,
inputs=[text_input, level_radio],
outputs=[text_output, quality_metrics]
)
analyze_btn.click(
fn=analyze_quality,
inputs=[text_input],
outputs=[quality_metrics]
)
analyze_quality_btn.click(
fn=analyze_quality,
inputs=[analysis_input],
outputs=[analysis_output]
)
# Launch on the conventional Hugging Face Spaces host/port (0.0.0.0:7860).
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False,
debug=False
)