Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
| import math | |
| import numpy as np | |
| from collections import Counter | |
| import json | |
| import time | |
| from datetime import datetime | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| import plotly.utils | |
| import base64 | |
| from io import BytesIO | |
class AIContentDetector:
    """Heuristic detector for AI-generated text.

    Combines seven stylometric signals — perplexity, burstiness,
    vocabulary diversity, repetitive phrasing, sentence structure,
    readability, and topic coherence — into a single weighted
    probability in [0, 1] (higher = more likely AI-generated).
    """

    def __init__(self):
        # Stock transitional phrases that LLM prose tends to overuse.
        # All entries are lowercase; matching is done on lowered text.
        self.common_ai_phrases = [
            "it's important to note", "it's worth mentioning", "in conclusion",
            "furthermore", "moreover", "additionally", "it should be noted",
            "as we can see", "clearly", "obviously", "naturally",
            "in summary", "to summarize", "in essence", "fundamentally",
            "it is evident that", "it becomes clear that", "this demonstrates",
            "this illustrates", "this shows", "this indicates"
        ]
        # Regexes flagging mechanical repetition (matched case-insensitively).
        self.repetitive_patterns = [
            r'\b(\w+)\s+\1\b',        # immediate word repetition ("the the")
            r'(\w{3,})\s+\w*\1\w*',   # next word contains the previous word
            r'(\w+)\s+\w+\s+\1',      # A x A pattern repetition
        ]

    def calculate_perplexity(self, text):
        """Return a self-referential bigram perplexity of *text*.

        Bigram probabilities are estimated from the text itself, so
        highly repetitive/predictable wording yields low perplexity (an
        AI signal). Returns 0 for texts with fewer than two words.
        """
        words = text.lower().split()
        if len(words) < 2:
            return 0
        bigrams = [(words[i], words[i + 1]) for i in range(len(words) - 1)]
        bigram_counts = Counter(bigrams)
        unigram_counts = Counter(words)
        # Every bigram iterated here came from `bigrams`, so its count —
        # and its first word's unigram count — are always positive; the
        # original zero-count guard was dead code and is dropped.
        log_prob = sum(
            math.log(bigram_counts[bigram] / unigram_counts[bigram[0]])
            for bigram in bigrams
        )
        return math.exp(-log_prob / len(bigrams))

    def calculate_burstiness(self, text):
        """Return burstiness of sentence lengths, in [-1, 1].

        Computed as (sigma - mu) / (sigma + mu) of per-sentence word
        counts; uniform sentence lengths (an AI trait) push the value
        toward -1. Returns 0 for fewer than two sentences.
        """
        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
        if len(sentences) < 2:
            return 0
        lengths = [len(s.split()) for s in sentences]
        mean_length = np.mean(lengths)
        std_length = np.std(lengths)
        if mean_length == 0:
            return 0
        return (std_length - mean_length) / (std_length + mean_length)

    def analyze_vocabulary_diversity(self, text):
        """Return type/token ratio and a Simpson-style repetition index.

        Despite its name, 'lexical_diversity' = (sum(f^2) - N) / N^2
        over word frequencies grows with repetition, not diversity.
        """
        words = re.findall(r'\b\w+\b', text.lower())
        total_words = len(words)
        if total_words == 0:
            return {'type_token_ratio': 0, 'lexical_diversity': 0}
        type_token_ratio = len(set(words)) / total_words
        sum_squares = sum(freq ** 2 for freq in Counter(words).values())
        lexical_diversity = (sum_squares - total_words) / (total_words ** 2)
        return {
            'type_token_ratio': type_token_ratio,
            'lexical_diversity': lexical_diversity,
        }

    def detect_repetitive_patterns(self, text):
        """Count AI-typical phrases, regex repetitions, and repeated
        sentence-opening words."""
        lowered = text.lower()
        # Phrase list is already lowercase, so one lowering suffices.
        ai_phrase_count = sum(1 for phrase in self.common_ai_phrases
                              if phrase in lowered)
        repetitive_count = sum(
            len(re.findall(pattern, text, re.IGNORECASE))
            for pattern in self.repetitive_patterns
        )
        # First word of each non-empty sentence (a non-blank sentence
        # always splits into at least one token, so no empty fallback is
        # needed as in the original).
        first_words = []
        for sentence in re.split(r'[.!?]+', text):
            tokens = sentence.split()
            if tokens:
                first_words.append(tokens[0].lower())
        start_word_freq = Counter(first_words)
        repeated_starts = sum(1 for count in start_word_freq.values() if count > 1)
        return {
            'ai_phrases': ai_phrase_count,
            'repetitive_patterns': repetitive_count,
            'repeated_sentence_starts': repeated_starts,
        }

    def analyze_sentence_structure(self, text):
        """Return mean sentence length (words) and its std deviation."""
        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
        if not sentences:
            return {'avg_sentence_length': 0, 'sentence_variety': 0}
        lengths = [len(s.split()) for s in sentences]
        return {
            'avg_sentence_length': np.mean(lengths),
            'sentence_variety': np.std(lengths),
        }

    def calculate_readability_scores(self, text):
        """Return Flesch reading ease (clamped to [0, 100]) and
        Flesch-Kincaid grade level (clamped to >= 0)."""
        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
        words = re.findall(r'\b\w+\b', text)
        if not sentences or not words:
            return {'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0}
        # Syllables counted only once we know the text is non-trivial
        # (the original computed them before the emptiness check).
        syllables = self._count_syllables(text)
        avg_sentence_length = len(words) / len(sentences)
        avg_syllables_per_word = syllables / len(words)
        # Standard Flesch / Flesch-Kincaid coefficients.
        flesch_ease = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables_per_word)
        flesch_grade = (0.39 * avg_sentence_length) + (11.8 * avg_syllables_per_word) - 15.59
        return {
            'flesch_reading_ease': max(0, min(100, flesch_ease)),
            'flesch_kincaid_grade': max(0, flesch_grade),
        }

    def _count_syllables(self, text):
        """Approximate syllable count: number of maximal vowel runs
        (vowels include 'y') across the whole text."""
        text = text.lower()
        count = 0
        vowels = "aeiouy"
        on_vowel = False
        for char in text:
            is_vowel = char in vowels
            if is_vowel and not on_vowel:
                count += 1
            on_vowel = is_vowel
        return count

    def analyze_semantic_coherence(self, text):
        """Return the average Jaccard word overlap between adjacent
        sentences ('topic_consistency') and its std deviation
        ('semantic_coherence')."""
        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
        if len(sentences) < 2:
            return {'topic_consistency': 0, 'semantic_coherence': 0}
        overlaps = []
        for current, following in zip(sentences, sentences[1:]):
            words1 = set(re.findall(r'\b\w+\b', current.lower()))
            words2 = set(re.findall(r'\b\w+\b', following.lower()))
            if words1 and words2:
                # Union is non-empty whenever both sets are, so the
                # original `if union > 0` fallback can never trigger.
                overlaps.append(len(words1 & words2) / len(words1 | words2))
        return {
            'topic_consistency': np.mean(overlaps) if overlaps else 0,
            'semantic_coherence': np.std(overlaps) if overlaps else 0,
        }

    def calculate_ai_probability(self, text):
        """Combine all signals into a weighted AI probability in [0, 1].

        Returns a dict with 'ai_probability', the normalized
        'individual_scores', and the raw 'analysis_results'.
        """
        perplexity = self.calculate_perplexity(text)
        burstiness = self.calculate_burstiness(text)
        vocab_diversity = self.analyze_vocabulary_diversity(text)
        repetitive = self.detect_repetitive_patterns(text)
        structure = self.analyze_sentence_structure(text)
        readability = self.calculate_readability_scores(text)
        coherence = self.analyze_semantic_coherence(text)

        # Each sub-score is normalized/clamped into [0, 1], oriented so
        # that higher means "more AI-like".
        scores = {}
        scores['perplexity_score'] = max(0, min(1, 1 - (perplexity / 100)))
        scores['burstiness_score'] = max(0, min(1, 1 - (burstiness + 0.5)))
        scores['vocab_diversity_score'] = max(0, min(1, 1 - vocab_diversity['type_token_ratio']))
        total_patterns = (repetitive['ai_phrases'] * 2 +
                          repetitive['repetitive_patterns'] +
                          repetitive['repeated_sentence_starts'])
        scores['repetitive_patterns_score'] = min(1, total_patterns / 10)
        variety_normalized = min(1, structure['sentence_variety'] / 10)
        scores['sentence_structure_score'] = max(0, min(1, 1 - variety_normalized))
        # Extreme readability in either direction is treated as AI-like.
        flesch_ease = readability['flesch_reading_ease']
        scores['readability_score'] = 0.8 if flesch_ease > 80 or flesch_ease < 20 else 0.2
        scores['semantic_coherence_score'] = min(1, coherence['topic_consistency'] * 2)

        # Weights sum to 1.0, keeping the total inside [0, 1].
        weights = {
            'perplexity_score': 0.2,
            'burstiness_score': 0.15,
            'vocab_diversity_score': 0.15,
            'repetitive_patterns_score': 0.2,
            'sentence_structure_score': 0.1,
            'readability_score': 0.1,
            'semantic_coherence_score': 0.1,
        }
        total_score = sum(scores[key] * weights[key] for key in weights)
        return {
            'ai_probability': total_score,
            'individual_scores': scores,
            'analysis_results': {
                'perplexity': perplexity,
                'burstiness': burstiness,
                'vocabulary_diversity': vocab_diversity,
                'repetitive_patterns': repetitive,
                'sentence_structure': structure,
                'readability': readability,
                'semantic_coherence': coherence,
            },
        }

    def get_detection_result(self, text):
        """Run the full pipeline on *text* and attach a human-readable label.

        Returns {'error': ...} for blank input; otherwise a dict with
        the probability, confidence label/level, text statistics, and
        all sub-scores.
        """
        if not text.strip():
            return {'error': 'Empty text provided'}
        result = self.calculate_ai_probability(text)
        ai_prob = result['ai_probability']
        # Thresholds mirror the UI help text: <30% human, 30-60%
        # uncertain, >60% AI-generated.
        if ai_prob < 0.3:
            confidence = "LIKELY HUMAN"
            confidence_level = "high"
        elif ai_prob < 0.6:
            confidence = "UNCERTAIN"
            confidence_level = "medium"
        else:
            confidence = "LIKELY AI-GENERATED"
            confidence_level = "high"
        return {
            'ai_probability': ai_prob,
            'confidence': confidence,
            'confidence_level': confidence_level,
            'text_length': len(text),
            'word_count': len(text.split()),
            'detailed_analysis': result['analysis_results'],
            'individual_scores': result['individual_scores'],
        }
def scrape_website(url):
    """Fetch *url* and extract its heading/paragraph text.

    Returns a dict with 'url', 'text', 'text_length', and 'word_count'
    on success, or a dict containing only an 'error' message on any
    failure (network error, HTTP error, or no usable text).
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers=request_headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Strip non-content elements before extracting any text.
        for removable in soup(["script", "style"]):
            removable.decompose()

        # Headings first, then paragraphs; skip fragments of <= 20 chars.
        candidates = (soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
                      + soup.find_all('p'))
        fragments = []
        for element in candidates:
            content = element.get_text().strip()
            if content and len(content) > 20:
                fragments.append(content)

        combined = ' '.join(fragments)
        if not combined:
            return {'error': 'No meaningful text found on the page'}
        return {
            'url': url,
            'text': combined,
            'text_length': len(combined),
            'word_count': len(combined.split()),
        }
    except Exception as e:
        # Best-effort scraper for a UI: surface the message, never raise.
        return {'error': f'Error scraping URL: {str(e)}'}
def create_visualizations(detection_result):
    """Build the three plotly figures shown in the UI.

    Returns (radar_figure, metrics_figure, scores_figure) derived from
    a detector result dict.
    """
    individual = detection_result['individual_scores']
    score_names = list(individual.keys())
    score_values = list(individual.values())

    # Radar chart of per-signal scores on a fixed [0, 1] radial axis.
    radar_figure = go.Figure(
        go.Scatterpolar(
            r=score_values,
            theta=score_names,
            fill='toself',
            name='AI Probability Scores'
        )
    )
    radar_figure.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
        showlegend=False,
        title="Individual Detection Scores",
        height=400
    )

    # Bar chart of the raw (un-normalized) analysis metrics.
    details = detection_result['detailed_analysis']
    metric_names = ['perplexity', 'burstiness', 'avg_sentence_length', 'flesch_reading_ease']
    metric_values = [
        details['perplexity'],
        details['burstiness'],
        details['sentence_structure']['avg_sentence_length'],
        details['readability']['flesch_reading_ease'],
    ]
    metrics_figure = px.bar(
        x=metric_names,
        y=metric_values,
        title="Text Analysis Metrics",
        labels={'x': 'Metric', 'y': 'Value'},
        height=400
    )

    # Bar chart of the individual scores, colour-coded by value.
    score_frame = pd.DataFrame([
        {'Score': key.replace('_', ' ').title(), 'Value': value}
        for key, value in individual.items()
    ])
    scores_figure = px.bar(
        score_frame,
        x='Score',
        y='Value',
        title="Individual Detection Scores (Higher = More Likely AI)",
        color='Value',
        color_continuous_scale='RdYlBu_r',
        height=400
    )
    return radar_figure, metrics_figure, scores_figure
def analyze_website(url):
    """Scrape *url*, run the AI detector, and format the results.

    Returns a 4-tuple (markdown_summary, radar_fig, metrics_fig,
    scores_fig); the figure slots are None on error.

    Bug fix: the error paths previously returned FIVE values while the
    success path — and the Gradio handler wired to four outputs —
    expect four, so every error crashed the UI instead of showing the
    message. All returns now have arity 4.
    """
    if not url.strip():
        return "Please enter a URL to analyze.", None, None, None
    # Scrape website
    scraped_data = scrape_website(url)
    if 'error' in scraped_data:
        return scraped_data['error'], None, None, None
    # Analyze the scraped text
    detector = AIContentDetector()
    detection_result = detector.get_detection_result(scraped_data['text'])
    if 'error' in detection_result:
        return detection_result['error'], None, None, None
    # Create visualizations
    fig_radar, fig_bar, fig_scores = create_visualizations(detection_result)
    # Format results as markdown
    ai_prob = detection_result['ai_probability']
    confidence = detection_result['confidence']
    word_count = detection_result['word_count']
    analysis = detection_result['detailed_analysis']
    detailed_text = f"""
**Website:** {url}
**AI Probability:** {ai_prob:.1%}
**Confidence:** {confidence}
**Word Count:** {word_count}
**Detailed Analysis:**
- Perplexity: {analysis['perplexity']:.2f}
- Burstiness: {analysis['burstiness']:.3f}
- Vocabulary Diversity: {analysis['vocabulary_diversity']['type_token_ratio']:.3f}
- Avg Sentence Length: {analysis['sentence_structure']['avg_sentence_length']:.1f}
- Flesch Reading Ease: {analysis['readability']['flesch_reading_ease']:.1f}
- Topic Consistency: {analysis['semantic_coherence']['topic_consistency']:.3f}
**Sample Text:** {scraped_data['text'][:200]}...
"""
    return detailed_text, fig_radar, fig_bar, fig_scores
def analyze_news_site(site_name):
    """Resolve a pre-configured site name to its article URL and analyze it.

    Returns the same 4-tuple as analyze_website.

    Bug fix: the invalid-selection path previously returned FIVE values
    against the four outputs wired in the Gradio handler; it now
    returns four like the delegated analyze_website call.
    """
    # Keys must match the dropdown choices in the Gradio UI.
    news_sites = {
        "BBC News": "https://www.bbc.com/news/articles/c93dgr2dd53o",
        "Al Jazeera News": "https://www.aljazeera.com/news/2025/5/9/india-pakistan-tensions-a-brief-history-of-conflict",
        "Norwich News": "https://online.norwich.edu/online/about/resource-library/five-major-african-wars-and-conflicts-twentieth-century",
        "Britannica News": "https://www.britannica.com/event/Iran-Iraq-War",
        "Council on Foreign Relations News": "https://www.cfr.org/article/syrias-civil-war"
    }
    if site_name not in news_sites:
        return "Please select a valid news site.", None, None, None
    return analyze_website(news_sites[site_name])
# Create Gradio interface
# Top-level UI wiring: a single tab for pre-configured news-site
# analysis plus an informational accordion; launched when run as a script.
with gr.Blocks(title="🤖 AI Content Detector", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 AI Content Detector")
    gr.Markdown("Detect AI-generated content from websites or text input using advanced linguistic analysis.")
    with gr.Tabs():
        # News Sites Tab
        with gr.TabItem("📰 News Site Analysis"):
            gr.Markdown("### Analyze Pre-configured News Sites")
            # Choices must match the keys of news_sites in analyze_news_site.
            site_dropdown = gr.Dropdown(
                choices=["BBC News", "Al Jazeera News", "Norwich News", "Britannica News", "Council on Foreign Relations News"],
                label="Select a news site:",
                value="BBC News"
            )
            analyze_news_btn = gr.Button("Analyze News Site", variant="primary")
            # Output slots: one markdown summary plus three plotly figures —
            # the handler's return arity must match these four outputs.
            news_results = gr.Markdown(label="Analysis Results")
            news_radar = gr.Plot(label="Radar Chart")
            news_bar = gr.Plot(label="Analysis Metrics")
            news_scores = gr.Plot(label="Individual Scores")
            analyze_news_btn.click(
                analyze_news_site,
                inputs=[site_dropdown],
                outputs=[news_results, news_radar, news_bar, news_scores]
            )
    # Information section
    with gr.Accordion("ℹ️ How It Works", open=False):
        gr.Markdown("""
### Detection Methods
1. **Perplexity Analysis** - Measures text predictability
2. **Burstiness Calculation** - Analyzes sentence length variability
3. **Vocabulary Diversity** - Type-token ratio analysis
4. **Repetitive Patterns** - Detects common AI phrases
5. **Sentence Structure** - Length and complexity analysis
6. **Readability Scores** - Flesch metrics
7. **Semantic Coherence** - Topic consistency analysis
### Confidence Levels
- **< 30%**: LIKELY HUMAN
- **30-60%**: UNCERTAIN
- **> 60%**: LIKELY AI-GENERATED
""")

# Standard script entry guard: launch the Gradio server only when
# executed directly, not when imported.
if __name__ == "__main__":
    demo.launch()