# NadaPlagiarism / app.py
# Hugging Face Spaces page header (author: drvsbrkcn, commit 58478cd, verified):
# "Modified app.py to fulfill the requirements needed to publish the app on Hugging Face Spaces."
#!/usr/bin/env python3
"""
🤖➡️👨 NadaPlagiarism - Hugging Face Spaces
==========================================
Enterprise-grade AI text humanization optimized for Hugging Face Spaces.
Features advanced humanization, quality analytics, and modern UI.
"""
import gradio as gr
import torch
import random
import re
import warnings
import math
import statistics
import time
import asyncio
from collections import Counter, defaultdict
from typing import List, Dict, Optional, Union, Tuple
from dataclasses import dataclass
from datetime import datetime
import logging
# Suppress warnings
warnings.filterwarnings("ignore")
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# ============================================================================
# CORE HUMANIZATION ENGINE
# ============================================================================
class NadaPlagiarism:
    """Enterprise-grade AI text humanization engine.

    Combines optional transformer paraphrasers (T5 / Pegasus) with
    rule-based vocabulary, sentence-structure, and tone transformations.
    If the ``transformers`` package or a model download is unavailable,
    processing degrades gracefully to the pure rule-based pipeline.
    """

    def __init__(self):
        # model name -> {"tokenizer": ..., "model": ..., "status": "active"|"fallback"}
        self.models = {}
        self.quality_analyzer = QualityAnalyzer()
        self._load_linguistic_databases()
        self._initialize_models()
        logger.info("NadaPlagiarism initialized successfully")

    def _initialize_models(self):
        """Initialize AI models with fallbacks.

        Each model is loaded independently so one failure does not block the
        other; the rule-based entry is always registered as a guaranteed
        fallback.
        """
        try:
            from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
            # Try to load T5 model
            try:
                self.models["t5_paraphrase"] = {
                    "tokenizer": AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws", use_fast=False),
                    "model": AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws"),
                    "status": "active"
                }
                logger.info("✅ T5 model loaded successfully")
            except Exception as e:
                logger.warning(f"⚠️ T5 model failed: {e}")
                self.models["t5_paraphrase"] = {"status": "fallback"}
            # Try to load Pegasus model
            try:
                self.models["pegasus"] = {
                    "tokenizer": AutoTokenizer.from_pretrained("tuner007/pegasus_paraphrase", use_fast=False),
                    "model": AutoModelForSeq2SeqLM.from_pretrained("tuner007/pegasus_paraphrase"),
                    "status": "active"
                }
                logger.info("✅ Pegasus model loaded successfully")
            except Exception as e:
                logger.warning(f"⚠️ Pegasus model failed: {e}")
                self.models["pegasus"] = {"status": "fallback"}
        except ImportError:
            logger.warning("Transformers not available, using rule-based fallback")
        # Always have rule-based fallback
        self.models["rule_based"] = {"status": "active"}

    def _load_linguistic_databases(self):
        """Load the static vocabulary / phrase databases used by the rule-based passes."""
        # Synonym pools keyed by the word they replace (matched whole-word, case-insensitive).
        self.vocabulary_db = {
            "academic_formal": {
                "demonstrates": ["shows", "reveals", "indicates", "illustrates", "displays"],
                "significant": ["notable", "considerable", "substantial", "important", "remarkable"],
                "utilize": ["use", "employ", "apply", "implement", "make use of"],
                "facilitate": ["help", "enable", "assist", "support", "aid", "promote"],
                "optimize": ["improve", "enhance", "refine", "perfect", "better", "streamline"],
                "comprehensive": ["complete", "thorough", "extensive", "detailed", "full", "inclusive"],
                "robust": ["strong", "reliable", "sturdy", "solid", "dependable"],
                "methodology": ["method", "approach", "technique", "procedure", "process", "strategy"],
                "implementation": ["application", "execution", "deployment", "introduction", "adoption"],
                "evaluation": ["assessment", "analysis", "examination", "review", "appraisal"]
            },
            "transitional": {
                "furthermore": ["moreover", "additionally", "in addition", "what is more", "besides"],
                "however": ["nevertheless", "nonetheless", "yet", "on the other hand", "but"],
                "therefore": ["thus", "hence", "consequently", "as a result", "for this reason"],
                "moreover": ["furthermore", "additionally", "in addition", "what is more"],
                "consequently": ["therefore", "thus", "hence", "as a result", "accordingly"]
            }
        }
        # Openers occasionally prepended to non-initial sentences.
        self.sentence_starters = [
            "Research indicates that", "Studies suggest", "Evidence shows",
            "Findings reveal", "Data demonstrates", "Analysis suggests",
            "Results indicate", "Observations show", "Investigations reveal"
        ]
        # Connectors occasionally prepended in the heavy pipeline.
        self.academic_connectors = [
            "Moreover", "Furthermore", "Additionally", "In contrast", "Similarly",
            "Consequently", "Nevertheless", "Notably", "Importantly", "Significantly"
        ]
        # Hedges inserted after a determiner that opens a sentence.
        self.hedging_phrases = [
            "appears to", "seems to", "tends to", "suggests that", "indicates that",
            "may well", "might be", "could be", "potentially", "presumably"
        ]

    def humanize_text(self, text: str, level: str = "medium") -> Dict:
        """Humanize *text* and return the result with quality metrics.

        Args:
            text: Input text; must be at least 10 characters after stripping.
            level: "light", "medium", or "heavy". Unknown values fall back
                to "medium".

        Returns:
            Dict with keys "humanized_text", "quality_metrics",
            "processing_time" (seconds), and "model_used".
        """
        if not text or len(text.strip()) < 10:
            return {
                "humanized_text": "Please enter substantial text to humanize (at least 10 characters).",
                "quality_metrics": {},
                "processing_time": 0,
                "model_used": "none"
            }
        start_time = time.time()
        try:
            # Select best available model
            model_name = self._select_best_model()
            # Dispatch on level; unknown levels use the medium pipeline.
            pipelines = {
                "light": self._light_humanization,
                "medium": self._medium_humanization,
                "heavy": self._heavy_humanization,
            }
            humanized_text = pipelines.get(level, self._medium_humanization)(text, model_name)
            # Calculate quality metrics
            quality_metrics = self.quality_analyzer.analyze_quality(text, humanized_text)
            return {
                "humanized_text": humanized_text,
                "quality_metrics": quality_metrics,
                "processing_time": time.time() - start_time,
                "model_used": model_name
            }
        except Exception as e:
            logger.error(f"Humanization failed: {e}")
            return {
                "humanized_text": f"Error processing text: {str(e)}",
                "quality_metrics": {},
                "processing_time": time.time() - start_time,
                "model_used": "error"
            }

    def _select_best_model(self) -> str:
        """Return the first registered model that is active and fully loaded."""
        for model_name, model_info in self.models.items():
            if model_info.get("status") == "active" and model_info.get("model"):
                return model_name
        return "rule_based"

    def _light_humanization(self, text: str, model_name: str) -> str:
        """Light humanization - basic vocabulary changes only (no AI pass)."""
        result = text
        result = self._apply_vocabulary_replacements(result, intensity=0.3)
        result = self._add_natural_variations(result, intensity=0.2)
        return result

    def _medium_humanization(self, text: str, model_name: str) -> str:
        """Medium humanization - AI paraphrase (if available) + vocabulary + flow + structure."""
        result = text
        # Use AI model if available
        if model_name != "rule_based" and self.models[model_name].get("model"):
            result = self._ai_paraphrase(result, model_name)
        result = self._apply_vocabulary_replacements(result, intensity=0.5)
        result = self._add_natural_variations(result, intensity=0.4)
        result = self._adjust_sentence_structure(result, intensity=0.3)
        return result

    def _heavy_humanization(self, text: str, model_name: str) -> str:
        """Heavy humanization - every technique applied at higher intensity."""
        result = text
        # Use AI model if available
        if model_name != "rule_based" and self.models[model_name].get("model"):
            result = self._ai_paraphrase(result, model_name)
        result = self._apply_vocabulary_replacements(result, intensity=0.7)
        result = self._add_natural_variations(result, intensity=0.6)
        result = self._adjust_sentence_structure(result, intensity=0.5)
        result = self._add_academic_connectors(result)
        result = self._add_hedging_language(result)
        return result

    def _ai_paraphrase(self, text: str, model_name: str) -> str:
        """Paraphrase *text* with the named model; return input unchanged on any failure."""
        try:
            model_info = self.models[model_name]
            # .get(): fallback entries only contain "status", not these keys.
            tokenizer = model_info.get("tokenizer")
            model = model_info.get("model")
            if not tokenizer or not model:
                return text
            # T5 expects a task prefix; Pegasus takes raw text.
            if model_name == "t5_paraphrase":
                input_text = f"paraphrase: {text}"
            else:
                input_text = text
            input_ids = tokenizer.encode(
                input_text,
                return_tensors="pt",
                max_length=512,
                truncation=True
            )
            with torch.no_grad():
                outputs = model.generate(
                    input_ids=input_ids,
                    max_length=min(len(text.split()) + 50, 512),
                    num_beams=5,
                    temperature=1.2,
                    top_p=0.9,
                    do_sample=True,
                    early_stopping=True,
                    repetition_penalty=1.1
                )
            result = tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Guard against degenerate (empty or too-short) generations.
            return result if result and len(result) > 10 else text
        except Exception as e:
            logger.warning(f"AI paraphrasing failed: {e}")
            return text

    def _apply_vocabulary_replacements(self, text: str, intensity: float = 0.5) -> str:
        """Replace database words with random synonyms; *intensity* is the per-word probability."""
        result = text
        for category, replacements in self.vocabulary_db.items():
            for original, alternatives in replacements.items():
                if random.random() < intensity:
                    replacement = random.choice(alternatives)
                    pattern = re.compile(r'\b' + re.escape(original) + r'\b', re.IGNORECASE)
                    # Only the first occurrence is replaced to keep the text varied.
                    result = pattern.sub(replacement, result, count=1)
        return result

    def _add_natural_variations(self, text: str, intensity: float = 0.4) -> str:
        """Add hedging phrases and sentence starters for a more human cadence."""
        sentences = self._split_sentences(text)
        varied_sentences = []
        for i, sentence in enumerate(sentences):
            sentence = sentence.strip()
            if not sentence:
                continue
            # Occasionally hedge a sentence that opens with a determiner.
            if random.random() < intensity * 0.3:
                hedge = random.choice(self.hedging_phrases)
                for opener in ("The ", "This ", "These "):
                    if sentence.startswith(opener):
                        # Insert only at the sentence start (the previous
                        # replace()-based version also hit later occurrences
                        # of the other determiners in the same sentence).
                        sentence = opener + hedge + " " + sentence[len(opener):]
                        break
            # Occasionally prepend a starter to non-initial sentences.
            if i > 0 and random.random() < intensity * 0.4:
                starter = random.choice(self.sentence_starters)
                sentence = f"{starter}, {sentence.lower()}"
            varied_sentences.append(sentence)
        return " ".join(varied_sentences)

    def _adjust_sentence_structure(self, text: str, intensity: float = 0.3) -> str:
        """Break overly long sentences (> 25 words) at a conjunction for variety."""
        sentences = self._split_sentences(text)
        modified = []
        for sentence in sentences:
            words = sentence.split()
            if len(words) > 25 and random.random() < intensity:
                break_point = self._find_break_point(words)
                if break_point:
                    part1 = " ".join(words[:break_point]) + "."
                    part2 = " ".join(words[break_point:])
                    if part2:
                        part2 = part2[0].upper() + part2[1:]
                    modified.extend([part1, part2])
                else:
                    modified.append(sentence)
            else:
                modified.append(sentence)
        return " ".join(modified)

    def _add_academic_connectors(self, text: str) -> str:
        """Prepend academic connectors to ~30% of non-initial sentences."""
        sentences = self._split_sentences(text)
        connected = []
        for i, sentence in enumerate(sentences):
            if i > 0 and random.random() < 0.3:
                connector = random.choice(self.academic_connectors)
                sentence = f"{connector}, {sentence.lower()}"
            connected.append(sentence)
        return " ".join(connected)

    def _add_hedging_language(self, text: str) -> str:
        """Add hedging language after "The"/"This" openers in ~20% of sentences."""
        sentences = self._split_sentences(text)
        hedged = []
        for sentence in sentences:
            if random.random() < 0.2:
                hedge = random.choice(self.hedging_phrases)
                for opener in ("The ", "This "):
                    if sentence.startswith(opener):
                        # Prefix-only insertion; see _add_natural_variations.
                        sentence = opener + hedge + " " + sentence[len(opener):]
                        break
            hedged.append(sentence)
        return " ".join(hedged)

    def _split_sentences(self, text: str) -> List[str]:
        """Split text on .!? terminators, keeping the punctuation and dropping fragments <= 5 chars."""
        sentences = []
        current = ""
        for char in text:
            current += char
            if char in '.!?':
                if len(current.strip()) > 5:
                    sentences.append(current.strip())
                current = ""
        if current.strip():
            sentences.append(current.strip())
        return [s for s in sentences if len(s.strip()) > 5]

    def _find_break_point(self, words: List[str]) -> Optional[int]:
        """Return the index of the first conjunction between words 8 and 17, or None."""
        break_words = ['and', 'but', 'which', 'that', 'because', 'since', 'while']
        for i in range(8, min(18, len(words))):
            if words[i].lower().rstrip('.,') in break_words:
                return i
        return None
# ============================================================================
# QUALITY ANALYZER
# ============================================================================
class QualityAnalyzer:
    """Advanced quality analysis for humanized text.

    Produces heuristic 0-100 scores for readability, naturalness, academic
    tone, AI-detection likelihood, and fluency, plus an overall mean.
    """

    def __init__(self):
        # Heuristic signals that commonly indicate AI-generated prose.
        self.ai_patterns = {
            "ai_phrases": [
                "demonstrates significant", "substantial improvements", "comprehensive analysis",
                "furthermore", "moreover", "additionally", "consequently", "therefore",
                "implementation of", "utilization of", "optimization of", "enhancement of"
            ],
            "overused_words": [
                "significant", "substantial", "comprehensive", "extensive", "robust",
                "novel", "innovative", "efficient", "effective", "optimal", "superior"
            ]
        }

    def analyze_quality(self, original_text: str, humanized_text: str) -> Dict[str, float]:
        """Score *humanized_text* on all metrics and return them as a dict.

        Note: *original_text* is currently unused but kept for interface
        stability (callers pass both texts).
        """
        metrics = {
            "readability": self._analyze_readability(humanized_text),
            "naturalness": self._analyze_naturalness(humanized_text),
            "academic_tone": self._analyze_academic_tone(humanized_text),
            "ai_detection": self._analyze_ai_detection(humanized_text),
            "fluency": self._analyze_fluency(humanized_text),
        }
        # Overall quality: unweighted mean; AI-detection is inverted because
        # a LOW detection score is desirable.
        metrics["overall_quality"] = statistics.mean([
            metrics["readability"],
            metrics["naturalness"],
            metrics["academic_tone"],
            100 - metrics["ai_detection"],
            metrics["fluency"]
        ])
        return metrics

    def _analyze_readability(self, text: str) -> float:
        """Approximate Flesch Reading Ease, rescaled to 0-100 (50 on failure)."""
        try:
            words = text.split()
            sentences = len([s for s in text.split('.') if s.strip()])
            if sentences == 0:
                return 50
            avg_words_per_sentence = len(words) / sentences
            avg_syllables_per_word = self._calculate_avg_syllables(words)
            # Flesch Reading Ease approximation
            flesch_score = 206.835 - (1.015 * avg_words_per_sentence) - (84.6 * avg_syllables_per_word)
            # Flesch can be negative or > 100; shift and clamp into [0, 100].
            readability_score = max(0, min(100, (flesch_score + 100) / 2))
            return readability_score
        except Exception:
            # Narrowed from a bare except: still defensive, but no longer
            # swallows KeyboardInterrupt / SystemExit.
            return 50

    def _calculate_avg_syllables(self, words: List[str]) -> float:
        """Average syllables per word (0 for an empty word list)."""
        total_syllables = sum(self._count_syllables(word) for word in words)
        return total_syllables / len(words) if words else 0

    def _count_syllables(self, word: str) -> int:
        """Count syllables via vowel groups, with a silent-'e' adjustment; minimum 1."""
        word = word.lower().strip('.,!?;:')
        vowels = 'aeiouy'
        syllable_count = 0
        prev_was_vowel = False
        for char in word:
            is_vowel = char in vowels
            if is_vowel and not prev_was_vowel:
                syllable_count += 1
            prev_was_vowel = is_vowel
        # Trailing silent 'e' usually does not add a syllable.
        if word.endswith('e') and syllable_count > 1:
            syllable_count -= 1
        return max(1, syllable_count)

    def _analyze_naturalness(self, text: str) -> float:
        """Score naturalness from pronouns, contrastive transitions, and hedges (10 pts each)."""
        natural_indicators = [
            len(re.findall(r'\b(I|we|you|they)\b', text)),          # personal pronouns
            len(re.findall(r'\b(however|but|yet|although)\b', text)),  # natural transitions
            len(re.findall(r'\b(seems|appears|tends|suggests)\b', text)),  # hedging language
        ]
        score = min(100, sum(natural_indicators) * 10)
        return max(0, score)

    def _analyze_academic_tone(self, text: str) -> float:
        """Score academic tone from scholarly vocabulary and connectors (15 pts each)."""
        academic_indicators = [
            len(re.findall(r'\b(research|study|analysis|findings|results)\b', text)),
            len(re.findall(r'\b(according to|based on|in light of)\b', text)),
            len(re.findall(r'\b(furthermore|moreover|additionally)\b', text)),
        ]
        score = min(100, sum(academic_indicators) * 15)
        return max(0, score)

    def _analyze_ai_detection(self, text: str) -> float:
        """Estimate AI-detection likelihood from known phrases/words (20 pts per hit, capped)."""
        text_lower = text.lower()
        ai_phrase_count = sum(1 for phrase in self.ai_patterns["ai_phrases"] if phrase in text_lower)
        overused_count = sum(1 for word in self.ai_patterns["overused_words"] if word in text_lower)
        words = len(text.split())
        if words == 0:
            return 0
        ai_score = min(100, (ai_phrase_count + overused_count) * 20)
        return max(0, ai_score)

    def _analyze_fluency(self, text: str) -> float:
        """Score fluency from sentence-length variance (more variation reads as more human)."""
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        if len(sentences) < 2:
            return 50
        lengths = [len(s.split()) for s in sentences]
        length_variance = statistics.variance(lengths) if len(lengths) > 1 else 0
        # Baseline of 50, boosted by variance, capped at 100.
        fluency_score = min(100, 50 + (length_variance * 2))
        return max(0, fluency_score)
# ============================================================================
# GRADIO INTERFACE
# ============================================================================
# Initialize humanizer (module-level singleton shared by all requests)
humanizer = NadaPlagiarism()


def process_text(text: str, level: str) -> Tuple[str, Dict]:
    """Humanize *text* at the requested level and format metrics for display.

    Returns the humanized text and a dict mapping titled metric names to
    percentage strings like "87.5%".
    """
    if not text.strip():
        return "Please enter some text to humanize.", {}
    result = humanizer.humanize_text(text, level)
    quality_display = {
        name.replace("_", " ").title(): f"{score:.1f}%"
        for name, score in result["quality_metrics"].items()
    }
    return result["humanized_text"], quality_display
def analyze_quality(text: str) -> Dict:
    """Run the quality analyzer on raw text and format scores as percentages.

    The text is analyzed against itself (no humanized counterpart); an
    empty/whitespace input yields an empty dict.
    """
    if not text.strip():
        return {}
    metrics = QualityAnalyzer().analyze_quality(text, text)
    return {
        name.replace("_", " ").title(): f"{score:.1f}%"
        for name, score in metrics.items()
    }
# Create Gradio interface
# NOTE(review): the nesting of this Blocks section relies on indentation that
# appears to have been stripped in this copy of the file — restore the original
# indentation before running.
with gr.Blocks(
theme=gr.themes.Soft(),
title="🤖➡️👨 NadaPlagiarism",
css="""
.gradio-container {
max-width: 1200px !important;
margin: auto !important;
}
.main-header {
text-align: center;
padding: 20px;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
border-radius: 10px;
margin-bottom: 20px;
}
.metric-card {
background: #f8f9fa;
border: 1px solid #dee2e6;
border-radius: 8px;
padding: 15px;
margin: 10px 0;
}
"""
) as demo:
# Header
gr.HTML("""
<div class="main-header">
<h1>🤖➡️👨 NadaPlagiarism</h1>
<p>Enterprise-grade AI text humanization with advanced analytics</p>
</div>
""")
# Three tabs: humanizer, standalone quality analysis, and a static about page.
with gr.Tabs():
# Main Humanization Tab
with gr.TabItem("🎭 Text Humanizer"):
gr.Markdown("### Transform AI text into natural, human-like writing")
with gr.Row():
with gr.Column(scale=2):
text_input = gr.Textbox(
lines=8,
placeholder="Enter your AI-generated text here...",
label="Input Text",
info="Paste the text you want to humanize"
)
level_radio = gr.Radio(
choices=[
("Light", "light"),
("Medium", "medium"),
("Heavy", "heavy")
],
value="medium",
label="Humanization Level",
info="Choose the intensity of humanization"
)
with gr.Row():
humanize_btn = gr.Button("🚀 Humanize Text", variant="primary", size="lg")
analyze_btn = gr.Button("📊 Analyze Quality", variant="secondary")
with gr.Column(scale=2):
text_output = gr.Textbox(
lines=8,
label="Humanized Text",
show_copy_button=True,
info="Your humanized text will appear here"
)
quality_metrics = gr.JSON(
label="Quality Metrics",
value={}
)
# Examples
gr.Examples(
examples=[
[
"The implementation of machine learning algorithms demonstrates significant improvements in computational efficiency and accuracy metrics across various benchmark datasets.",
"medium"
],
[
"Artificial intelligence technologies are increasingly being utilized across various industries to enhance operational capabilities and drive innovation.",
"heavy"
]
],
inputs=[text_input, level_radio],
outputs=[text_output]
)
# Quality Analysis Tab
with gr.TabItem("📊 Quality Analysis"):
gr.Markdown("### Analyze text quality and get improvement suggestions")
with gr.Row():
with gr.Column():
analysis_input = gr.Textbox(
lines=6,
placeholder="Enter text to analyze...",
label="Text to Analyze"
)
analyze_quality_btn = gr.Button("🔍 Analyze Quality", variant="primary")
with gr.Column():
analysis_output = gr.JSON(
label="Quality Analysis Results",
value={}
)
# Info Tab
with gr.TabItem("ℹ️ About"):
gr.Markdown("""
### 🎯 NadaPlagiarism
**Enterprise-grade AI text humanization with advanced features:**
#### ✨ Features:
- **Multiple AI Models**: T5, Pegasus with intelligent fallbacks
- **Quality Analytics**: 5+ comprehensive quality metrics
- **Advanced Humanization**: Vocabulary, structure, and flow optimization
- **Real-time Feedback**: Quality scores and improvement suggestions
- **Modern Interface**: Responsive design with analytics dashboard
#### 📊 Quality Metrics:
- **Readability**: Flesch Reading Ease approximation
- **Naturalness**: Personal pronouns, natural transitions
- **Academic Tone**: Scholarly language preservation
- **AI Detection**: Pattern recognition and phrase analysis
- **Fluency**: Sentence flow and coherence
- **Overall Quality**: Weighted composite score
#### 🎮 How to Use:
1. **Paste your AI-generated text** in the input box
2. **Choose humanization level** (Light/Medium/Heavy)
3. **Click "Humanize Text"** to transform your text
4. **View quality metrics** and improvement suggestions
5. **Copy the humanized text** for your use
#### ⚖️ Ethical Usage:
This tool is designed for improving writing quality and learning natural language patterns.
Please use responsibly and maintain academic integrity.
""")
# Event handlers: the humanize button fills both outputs; both analyze
# buttons call the same handler but target different JSON panels.
humanize_btn.click(
fn=process_text,
inputs=[text_input, level_radio],
outputs=[text_output, quality_metrics]
)
analyze_btn.click(
fn=analyze_quality,
inputs=[text_input],
outputs=[quality_metrics]
)
analyze_quality_btn.click(
fn=analyze_quality,
inputs=[analysis_input],
outputs=[analysis_output]
)
# Launch on the conventional Hugging Face Spaces host/port (0.0.0.0:7860).
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False,
debug=False
)