# app.py — AI Content Detector (Gradio app)
# NOTE: Hugging Face file-viewer header removed (commit 1f1ebb3, "Update app.py").
import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
import math
import numpy as np
from collections import Counter
import json
import time
from datetime import datetime
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.utils
import base64
from io import BytesIO
class AIContentDetector:
    """Heuristic AI-content detector.

    Combines seven stylometric signals (bigram perplexity, burstiness,
    vocabulary diversity, repetitive phrasing, sentence structure,
    Flesch readability, and adjacent-sentence topic consistency) into a
    single weighted AI-probability score in [0, 1].
    """

    def __init__(self):
        # Stock transitional phrases that the detector treats as AI tells;
        # matched case-insensitively as substrings in detect_repetitive_patterns.
        self.common_ai_phrases = [
            "it's important to note", "it's worth mentioning", "in conclusion",
            "furthermore", "moreover", "additionally", "it should be noted",
            "as we can see", "clearly", "obviously", "naturally",
            "in summary", "to summarize", "in essence", "fundamentally",
            "it is evident that", "it becomes clear that", "this demonstrates",
            "this illustrates", "this shows", "this indicates"
        ]
        # Regexes for local repetition; \1 back-references the first group.
        self.repetitive_patterns = [
            r'\b(\w+)\s+\1\b',  # Word repetition
            r'(\w{3,})\s+\w*\1\w*',  # Substring repetition
            r'(\w+)\s+\w+\s+\1',  # Pattern repetition
        ]

    def calculate_perplexity(self, text):
        """Estimate bigram perplexity of *text* using its own counts.

        Probabilities are P(w2|w1) = count(w1,w2)/count(w1) from the text
        itself, so lower values mean more internally repetitive/predictable
        text. Returns 0 for inputs shorter than two words.
        """
        words = text.lower().split()
        if len(words) < 2:
            return 0
        bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
        bigram_counts = Counter(bigrams)
        unigram_counts = Counter(words)
        total_bigrams = len(bigrams)
        log_prob = 0
        for bigram in bigrams:
            # Both counts are >= 1 by construction (the bigram came from the
            # text), so this guard is defensive only.
            if bigram_counts[bigram] > 0 and unigram_counts[bigram[0]] > 0:
                prob = bigram_counts[bigram] / unigram_counts[bigram[0]]
                log_prob += math.log(prob)
        # exp of negative mean log-probability: standard perplexity form.
        perplexity = math.exp(-log_prob / total_bigrams)
        return perplexity

    def calculate_burstiness(self, text):
        """Return sentence-length burstiness in (-1, 1).

        (std - mean) / (std + mean) over word counts per sentence:
        negative for uniform sentence lengths, higher for varied ones.
        Returns 0 when fewer than two sentences are found.
        """
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        if len(sentences) < 2:
            return 0
        sentence_lengths = [len(s.split()) for s in sentences]
        mean_length = np.mean(sentence_lengths)
        std_length = np.std(sentence_lengths)
        if mean_length == 0:
            return 0
        burstiness = (std_length - mean_length) / (std_length + mean_length)
        return burstiness

    def analyze_vocabulary_diversity(self, text):
        """Return type-token ratio and a frequency-concentration measure.

        'lexical_diversity' is (sum of squared word frequencies - N) / N^2,
        which grows when a few words dominate the text.
        """
        words = re.findall(r'\b\w+\b', text.lower())
        total_words = len(words)
        if total_words == 0:
            return {'type_token_ratio': 0, 'lexical_diversity': 0}
        unique_words = len(set(words))
        type_token_ratio = unique_words / total_words
        word_freq = Counter(words)
        sum_squares = sum(freq ** 2 for freq in word_freq.values())
        lexical_diversity = (sum_squares - total_words) / (total_words ** 2)
        return {
            'type_token_ratio': type_token_ratio,
            'lexical_diversity': lexical_diversity
        }

    def detect_repetitive_patterns(self, text):
        """Count AI-phrase hits, regex repetition matches, and sentences
        that reuse the same first word."""
        # Substring match, case-insensitive, over the curated phrase list.
        ai_phrase_count = sum(1 for phrase in self.common_ai_phrases
                              if phrase.lower() in text.lower())
        repetitive_count = 0
        for pattern in self.repetitive_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            repetitive_count += len(matches)
        sentences = re.split(r'[.!?]+', text)
        # First word of each non-empty sentence, lowercased.
        sentence_starts = [s.strip().split()[0].lower() if s.strip().split() else ''
                           for s in sentences if s.strip()]
        start_word_freq = Counter(sentence_starts)
        # Number of distinct start words that appear more than once.
        repeated_starts = sum(1 for count in start_word_freq.values() if count > 1)
        return {
            'ai_phrases': ai_phrase_count,
            'repetitive_patterns': repetitive_count,
            'repeated_sentence_starts': repeated_starts
        }

    def analyze_sentence_structure(self, text):
        """Return mean and standard deviation of sentence lengths (words)."""
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        if not sentences:
            return {'avg_sentence_length': 0, 'sentence_variety': 0}
        sentence_lengths = [len(s.split()) for s in sentences]
        avg_length = np.mean(sentence_lengths)
        sentence_variety = np.std(sentence_lengths)
        return {
            'avg_sentence_length': avg_length,
            'sentence_variety': sentence_variety
        }

    def calculate_readability_scores(self, text):
        """Return Flesch Reading Ease (clamped to [0, 100]) and
        Flesch-Kincaid Grade (clamped at 0), using the standard formulas."""
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        words = re.findall(r'\b\w+\b', text)
        syllables = self._count_syllables(text)
        if not sentences or not words:
            return {'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0}
        avg_sentence_length = len(words) / len(sentences)
        avg_syllables_per_word = syllables / len(words)
        # Standard Flesch coefficients.
        flesch_ease = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables_per_word)
        flesch_grade = (0.39 * avg_sentence_length) + (11.8 * avg_syllables_per_word) - 15.59
        return {
            'flesch_reading_ease': max(0, min(100, flesch_ease)),
            'flesch_kincaid_grade': max(0, flesch_grade)
        }

    def _count_syllables(self, text):
        """Approximate syllable count: number of vowel groups in the text.

        Crude heuristic — counts maximal runs of aeiouy; any non-vowel
        (including spaces) ends a run. Good enough for Flesch estimates.
        """
        text = text.lower()
        count = 0
        vowels = "aeiouy"
        on_vowel = False
        for char in text:
            is_vowel = char in vowels
            if is_vowel and not on_vowel:
                count += 1
            on_vowel = is_vowel
        return count

    def analyze_semantic_coherence(self, text):
        """Measure topic overlap between adjacent sentences.

        'topic_consistency' is the mean Jaccard similarity of word sets of
        consecutive sentences; 'semantic_coherence' is the std of those
        similarities (despite the name, it measures their variability).
        """
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        if len(sentences) < 2:
            return {'topic_consistency': 0, 'semantic_coherence': 0}
        topic_consistency_scores = []
        for i in range(len(sentences) - 1):
            words1 = set(re.findall(r'\b\w+\b', sentences[i].lower()))
            words2 = set(re.findall(r'\b\w+\b', sentences[i+1].lower()))
            if words1 and words2:
                overlap = len(words1.intersection(words2))
                union = len(words1.union(words2))
                consistency = overlap / union if union > 0 else 0
                topic_consistency_scores.append(consistency)
        avg_topic_consistency = np.mean(topic_consistency_scores) if topic_consistency_scores else 0
        semantic_coherence = np.std(topic_consistency_scores) if topic_consistency_scores else 0
        return {
            'topic_consistency': avg_topic_consistency,
            'semantic_coherence': semantic_coherence
        }

    def calculate_ai_probability(self, text):
        """Run every signal, map each onto [0, 1] (higher = more AI-like),
        and return the weighted total plus all intermediate results."""
        perplexity = self.calculate_perplexity(text)
        burstiness = self.calculate_burstiness(text)
        vocab_diversity = self.analyze_vocabulary_diversity(text)
        repetitive_patterns = self.detect_repetitive_patterns(text)
        sentence_structure = self.analyze_sentence_structure(text)
        readability = self.calculate_readability_scores(text)
        semantic_coherence = self.analyze_semantic_coherence(text)
        scores = {}
        # Low perplexity (predictable text) -> high AI score; 100 is the
        # scaling constant that maps perplexity into [0, 1].
        scores['perplexity_score'] = max(0, min(1, 1 - (perplexity / 100)))
        # Low burstiness (uniform sentence lengths) -> high AI score.
        scores['burstiness_score'] = max(0, min(1, 1 - (burstiness + 0.5)))
        # Low vocabulary diversity -> high AI score.
        scores['vocab_diversity_score'] = max(0, min(1, 1 - vocab_diversity['type_token_ratio']))
        # AI phrases are double-weighted relative to the other pattern counts.
        total_patterns = (repetitive_patterns['ai_phrases'] * 2 +
                          repetitive_patterns['repetitive_patterns'] +
                          repetitive_patterns['repeated_sentence_starts'])
        scores['repetitive_patterns_score'] = min(1, total_patterns / 10)
        # Low sentence-length variety -> high AI score (variety capped at 10).
        sentence_variety_normalized = min(1, sentence_structure['sentence_variety'] / 10)
        scores['sentence_structure_score'] = max(0, min(1, 1 - sentence_variety_normalized))
        # Extreme readability (very easy or very hard) is treated as AI-like.
        flesch_ease = readability['flesch_reading_ease']
        if flesch_ease > 80 or flesch_ease < 20:
            scores['readability_score'] = 0.8
        else:
            scores['readability_score'] = 0.2
        # Very consistent adjacent-sentence topics -> high AI score.
        scores['semantic_coherence_score'] = min(1, semantic_coherence['topic_consistency'] * 2)
        # Hand-tuned weights; they sum to 1.0.
        weights = {
            'perplexity_score': 0.2,
            'burstiness_score': 0.15,
            'vocab_diversity_score': 0.15,
            'repetitive_patterns_score': 0.2,
            'sentence_structure_score': 0.1,
            'readability_score': 0.1,
            'semantic_coherence_score': 0.1
        }
        total_score = sum(scores[key] * weights[key] for key in weights)
        return {
            'ai_probability': total_score,
            'individual_scores': scores,
            'analysis_results': {
                'perplexity': perplexity,
                'burstiness': burstiness,
                'vocabulary_diversity': vocab_diversity,
                'repetitive_patterns': repetitive_patterns,
                'sentence_structure': sentence_structure,
                'readability': readability,
                'semantic_coherence': semantic_coherence
            }
        }

    def get_detection_result(self, text):
        """Classify *text* and return a summary dict.

        Thresholds: < 0.3 -> LIKELY HUMAN, 0.3-0.6 -> UNCERTAIN,
        >= 0.6 -> LIKELY AI-GENERATED. Returns {'error': ...} for
        empty/whitespace input.
        """
        if not text.strip():
            return {'error': 'Empty text provided'}
        result = self.calculate_ai_probability(text)
        ai_prob = result['ai_probability']
        if ai_prob < 0.3:
            confidence = "LIKELY HUMAN"
            confidence_level = "high"
        elif ai_prob < 0.6:
            confidence = "UNCERTAIN"
            confidence_level = "medium"
        else:
            confidence = "LIKELY AI-GENERATED"
            confidence_level = "high"
        return {
            'ai_probability': ai_prob,
            'confidence': confidence,
            'confidence_level': confidence_level,
            'text_length': len(text),
            'word_count': len(text.split()),
            'detailed_analysis': result['analysis_results'],
            'individual_scores': result['individual_scores']
        }
def scrape_website(url):
    """Fetch *url* and extract its readable heading/paragraph text.

    Returns {'url', 'text', 'text_length', 'word_count'} on success, or
    {'error': message} if the request fails or no usable text is found.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')
        # Strip non-content elements before extracting visible text.
        for tag in soup(["script", "style"]):
            tag.decompose()
        # Headings first, then paragraphs; skip fragments of 20 chars or less.
        elements = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) + soup.find_all('p')
        fragments = [piece for elem in elements
                     if (piece := elem.get_text().strip()) and len(piece) > 20]
        combined = ' '.join(fragments)
        if not combined:
            return {'error': 'No meaningful text found on the page'}
        return {
            'url': url,
            'text': combined,
            'text_length': len(combined),
            'word_count': len(combined.split())
        }
    except Exception as e:
        # Best-effort contract: callers check for an 'error' key, never catch.
        return {'error': f'Error scraping URL: {str(e)}'}
def create_visualizations(detection_result):
    """Build the three Plotly figures shown in the UI.

    Returns (radar chart of individual scores, bar chart of raw analysis
    metrics, colored bar chart of individual scores).
    """
    score_map = detection_result['individual_scores']
    axis_labels = list(score_map.keys())
    axis_values = list(score_map.values())
    # Radar chart: one filled polar trace over the per-signal scores.
    radar_fig = go.Figure(
        go.Scatterpolar(
            r=axis_values,
            theta=axis_labels,
            fill='toself',
            name='AI Probability Scores',
        )
    )
    radar_fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
        showlegend=False,
        title="Individual Detection Scores",
        height=400,
    )
    # Bar chart of selected raw text metrics.
    analysis = detection_result['detailed_analysis']
    metric_names = ['perplexity', 'burstiness', 'avg_sentence_length', 'flesch_reading_ease']
    metric_values = [
        analysis['perplexity'],
        analysis['burstiness'],
        analysis['sentence_structure']['avg_sentence_length'],
        analysis['readability']['flesch_reading_ease'],
    ]
    metrics_fig = px.bar(
        x=metric_names,
        y=metric_values,
        title="Text Analysis Metrics",
        labels={'x': 'Metric', 'y': 'Value'},
        height=400,
    )
    # Colored bar chart of the individual scores, prettified labels.
    frame = pd.DataFrame(
        {'Score': label.replace('_', ' ').title(), 'Value': value}
        for label, value in score_map.items()
    )
    scores_fig = px.bar(
        frame,
        x='Score',
        y='Value',
        title="Individual Detection Scores (Higher = More Likely AI)",
        color='Value',
        color_continuous_scale='RdYlBu_r',
        height=400,
    )
    return radar_fig, metrics_fig, scores_fig
def analyze_website(url):
    """Scrape *url* and run AI-content detection on its text.

    Returns a 4-tuple matching the Gradio outputs wiring:
    (markdown summary, radar figure, metrics figure, scores figure).
    On any error the figures are None and the first element is the message.
    """
    if not url.strip():
        # Bug fix: the error paths previously returned 5 values while the
        # success path returns 4 and the UI wires exactly 4 output components.
        return "Please enter a URL to analyze.", None, None, None
    # Scrape website
    scraped_data = scrape_website(url)
    if 'error' in scraped_data:
        return scraped_data['error'], None, None, None
    # Analyze the scraped text
    detector = AIContentDetector()
    detection_result = detector.get_detection_result(scraped_data['text'])
    if 'error' in detection_result:
        return detection_result['error'], None, None, None
    # Create visualizations
    fig_radar, fig_bar, fig_scores = create_visualizations(detection_result)
    # Format results
    ai_prob = detection_result['ai_probability']
    confidence = detection_result['confidence']
    word_count = detection_result['word_count']
    # Create detailed analysis text (rendered as Markdown in the UI).
    analysis = detection_result['detailed_analysis']
    detailed_text = f"""
**Website:** {url}
**AI Probability:** {ai_prob:.1%}
**Confidence:** {confidence}
**Word Count:** {word_count}
**Detailed Analysis:**
- Perplexity: {analysis['perplexity']:.2f}
- Burstiness: {analysis['burstiness']:.3f}
- Vocabulary Diversity: {analysis['vocabulary_diversity']['type_token_ratio']:.3f}
- Avg Sentence Length: {analysis['sentence_structure']['avg_sentence_length']:.1f}
- Flesch Reading Ease: {analysis['readability']['flesch_reading_ease']:.1f}
- Topic Consistency: {analysis['semantic_coherence']['topic_consistency']:.3f}
**Sample Text:** {scraped_data['text'][:200]}...
"""
    return detailed_text, fig_radar, fig_bar, fig_scores
def analyze_news_site(site_name):
    """Analyze one of the pre-configured news article URLs.

    Delegates to analyze_website and therefore returns the same 4-tuple:
    (markdown summary, radar figure, metrics figure, scores figure).
    """
    news_sites = {
        "BBC News": "https://www.bbc.com/news/articles/c93dgr2dd53o",
        "Al Jazeera News": "https://www.aljazeera.com/news/2025/5/9/india-pakistan-tensions-a-brief-history-of-conflict",
        "Norwich News": "https://online.norwich.edu/online/about/resource-library/five-major-african-wars-and-conflicts-twentieth-century",
        "Britannica News": "https://www.britannica.com/event/Iran-Iraq-War",
        "Council on Foreign Relations News": "https://www.cfr.org/article/syrias-civil-war"
    }
    if site_name not in news_sites:
        # Bug fix: previously returned 5 values, but the click handler is
        # wired to exactly 4 output components (and the success path via
        # analyze_website yields 4).
        return "Please select a valid news site.", None, None, None
    url = news_sites[site_name]
    return analyze_website(url)
# Create Gradio interface: a single tab of pre-configured news-site analysis
# plus a collapsible explanation of the detection method.
with gr.Blocks(title="🤖 AI Content Detector", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 AI Content Detector")
    gr.Markdown("Detect AI-generated content from websites or text input using advanced linguistic analysis.")
    with gr.Tabs():
        # News Sites Tab
        with gr.TabItem("📰 News Site Analysis"):
            gr.Markdown("### Analyze Pre-configured News Sites")
            # Choices must match the keys of news_sites in analyze_news_site.
            site_dropdown = gr.Dropdown(
                choices=["BBC News", "Al Jazeera News", "Norwich News", "Britannica News", "Council on Foreign Relations News"],
                label="Select a news site:",
                value="BBC News"
            )
            analyze_news_btn = gr.Button("Analyze News Site", variant="primary")
            # Output components, filled in order by analyze_news_site's
            # return tuple: (markdown summary, radar, metrics bar, scores bar).
            news_results = gr.Markdown(label="Analysis Results")
            news_radar = gr.Plot(label="Radar Chart")
            news_bar = gr.Plot(label="Analysis Metrics")
            news_scores = gr.Plot(label="Individual Scores")
            analyze_news_btn.click(
                analyze_news_site,
                inputs=[site_dropdown],
                outputs=[news_results, news_radar, news_bar, news_scores]
            )
    # Information section
    with gr.Accordion("ℹ️ How It Works", open=False):
        gr.Markdown("""
    ### Detection Methods
    1. **Perplexity Analysis** - Measures text predictability
    2. **Burstiness Calculation** - Analyzes sentence length variability
    3. **Vocabulary Diversity** - Type-token ratio analysis
    4. **Repetitive Patterns** - Detects common AI phrases
    5. **Sentence Structure** - Length and complexity analysis
    6. **Readability Scores** - Flesch metrics
    7. **Semantic Coherence** - Topic consistency analysis
    ### Confidence Levels
    - **< 30%**: LIKELY HUMAN
    - **30-60%**: UNCERTAIN
    - **> 60%**: LIKELY AI-GENERATED
    """)

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()