Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
| import math | |
| import numpy as np | |
| from collections import Counter | |
| import json | |
| import time | |
| from datetime import datetime | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| import plotly.utils | |
| import base64 | |
| from io import BytesIO | |
class AIContentDetector:
    """Heuristic detector for AI-generated text.

    Combines seven stylometric signals — perplexity, burstiness,
    vocabulary diversity, repetitive phrasing, sentence structure,
    readability, and topic coherence — into a single weighted
    probability in [0, 1] (higher = more likely AI-generated).
    """

    def __init__(self):
        # Stock transitional phrases that LLM prose tends to overuse.
        # All entries are lowercase; matching is done on lowered text.
        self.common_ai_phrases = [
            "it's important to note", "it's worth mentioning", "in conclusion",
            "furthermore", "moreover", "additionally", "it should be noted",
            "as we can see", "clearly", "obviously", "naturally",
            "in summary", "to summarize", "in essence", "fundamentally",
            "it is evident that", "it becomes clear that", "this demonstrates",
            "this illustrates", "this shows", "this indicates"
        ]
        # Regexes flagging mechanical repetition (matched case-insensitively).
        self.repetitive_patterns = [
            r'\b(\w+)\s+\1\b',        # immediate word repetition ("the the")
            r'(\w{3,})\s+\w*\1\w*',   # next word contains the previous word
            r'(\w+)\s+\w+\s+\1',      # A x A pattern repetition
        ]

    def calculate_perplexity(self, text):
        """Return a self-referential bigram perplexity of *text*.

        Bigram probabilities are estimated from the text itself, so
        highly repetitive/predictable wording yields low perplexity (an
        AI signal). Returns 0 for texts with fewer than two words.
        """
        words = text.lower().split()
        if len(words) < 2:
            return 0
        bigrams = [(words[i], words[i + 1]) for i in range(len(words) - 1)]
        bigram_counts = Counter(bigrams)
        unigram_counts = Counter(words)
        # Every bigram iterated here came from `bigrams`, so its count —
        # and its first word's unigram count — are always positive; the
        # original zero-count guard was dead code and is dropped.
        log_prob = sum(
            math.log(bigram_counts[bigram] / unigram_counts[bigram[0]])
            for bigram in bigrams
        )
        return math.exp(-log_prob / len(bigrams))

    def calculate_burstiness(self, text):
        """Return burstiness of sentence lengths, in [-1, 1].

        Computed as (sigma - mu) / (sigma + mu) of per-sentence word
        counts; uniform sentence lengths (an AI trait) push the value
        toward -1. Returns 0 for fewer than two sentences.
        """
        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
        if len(sentences) < 2:
            return 0
        lengths = [len(s.split()) for s in sentences]
        mean_length = np.mean(lengths)
        std_length = np.std(lengths)
        if mean_length == 0:
            return 0
        return (std_length - mean_length) / (std_length + mean_length)

    def analyze_vocabulary_diversity(self, text):
        """Return type/token ratio and a Simpson-style repetition index.

        Despite its name, 'lexical_diversity' = (sum(f^2) - N) / N^2
        over word frequencies grows with repetition, not diversity.
        """
        words = re.findall(r'\b\w+\b', text.lower())
        total_words = len(words)
        if total_words == 0:
            return {'type_token_ratio': 0, 'lexical_diversity': 0}
        type_token_ratio = len(set(words)) / total_words
        sum_squares = sum(freq ** 2 for freq in Counter(words).values())
        lexical_diversity = (sum_squares - total_words) / (total_words ** 2)
        return {
            'type_token_ratio': type_token_ratio,
            'lexical_diversity': lexical_diversity,
        }

    def detect_repetitive_patterns(self, text):
        """Count AI-typical phrases, regex repetitions, and repeated
        sentence-opening words."""
        lowered = text.lower()
        # Phrase list is already lowercase, so one lowering suffices.
        ai_phrase_count = sum(1 for phrase in self.common_ai_phrases
                              if phrase in lowered)
        repetitive_count = sum(
            len(re.findall(pattern, text, re.IGNORECASE))
            for pattern in self.repetitive_patterns
        )
        # First word of each non-empty sentence (a non-blank sentence
        # always splits into at least one token, so no empty fallback is
        # needed as in the original).
        first_words = []
        for sentence in re.split(r'[.!?]+', text):
            tokens = sentence.split()
            if tokens:
                first_words.append(tokens[0].lower())
        start_word_freq = Counter(first_words)
        repeated_starts = sum(1 for count in start_word_freq.values() if count > 1)
        return {
            'ai_phrases': ai_phrase_count,
            'repetitive_patterns': repetitive_count,
            'repeated_sentence_starts': repeated_starts,
        }

    def analyze_sentence_structure(self, text):
        """Return mean sentence length (words) and its std deviation."""
        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
        if not sentences:
            return {'avg_sentence_length': 0, 'sentence_variety': 0}
        lengths = [len(s.split()) for s in sentences]
        return {
            'avg_sentence_length': np.mean(lengths),
            'sentence_variety': np.std(lengths),
        }

    def calculate_readability_scores(self, text):
        """Return Flesch reading ease (clamped to [0, 100]) and
        Flesch-Kincaid grade level (clamped to >= 0)."""
        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
        words = re.findall(r'\b\w+\b', text)
        if not sentences or not words:
            return {'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0}
        # Syllables counted only once we know the text is non-trivial
        # (the original computed them before the emptiness check).
        syllables = self._count_syllables(text)
        avg_sentence_length = len(words) / len(sentences)
        avg_syllables_per_word = syllables / len(words)
        # Standard Flesch / Flesch-Kincaid coefficients.
        flesch_ease = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables_per_word)
        flesch_grade = (0.39 * avg_sentence_length) + (11.8 * avg_syllables_per_word) - 15.59
        return {
            'flesch_reading_ease': max(0, min(100, flesch_ease)),
            'flesch_kincaid_grade': max(0, flesch_grade),
        }

    def _count_syllables(self, text):
        """Approximate syllable count: number of maximal vowel runs
        (vowels include 'y') across the whole text."""
        text = text.lower()
        count = 0
        vowels = "aeiouy"
        on_vowel = False
        for char in text:
            is_vowel = char in vowels
            if is_vowel and not on_vowel:
                count += 1
            on_vowel = is_vowel
        return count

    def analyze_semantic_coherence(self, text):
        """Return the average Jaccard word overlap between adjacent
        sentences ('topic_consistency') and its std deviation
        ('semantic_coherence')."""
        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
        if len(sentences) < 2:
            return {'topic_consistency': 0, 'semantic_coherence': 0}
        overlaps = []
        for current, following in zip(sentences, sentences[1:]):
            words1 = set(re.findall(r'\b\w+\b', current.lower()))
            words2 = set(re.findall(r'\b\w+\b', following.lower()))
            if words1 and words2:
                # Union is non-empty whenever both sets are, so the
                # original `if union > 0` fallback can never trigger.
                overlaps.append(len(words1 & words2) / len(words1 | words2))
        return {
            'topic_consistency': np.mean(overlaps) if overlaps else 0,
            'semantic_coherence': np.std(overlaps) if overlaps else 0,
        }

    def calculate_ai_probability(self, text):
        """Combine all signals into a weighted AI probability in [0, 1].

        Returns a dict with 'ai_probability', the normalized
        'individual_scores', and the raw 'analysis_results'.
        """
        perplexity = self.calculate_perplexity(text)
        burstiness = self.calculate_burstiness(text)
        vocab_diversity = self.analyze_vocabulary_diversity(text)
        repetitive = self.detect_repetitive_patterns(text)
        structure = self.analyze_sentence_structure(text)
        readability = self.calculate_readability_scores(text)
        coherence = self.analyze_semantic_coherence(text)

        # Each sub-score is normalized/clamped into [0, 1], oriented so
        # that higher means "more AI-like".
        scores = {}
        scores['perplexity_score'] = max(0, min(1, 1 - (perplexity / 100)))
        scores['burstiness_score'] = max(0, min(1, 1 - (burstiness + 0.5)))
        scores['vocab_diversity_score'] = max(0, min(1, 1 - vocab_diversity['type_token_ratio']))
        total_patterns = (repetitive['ai_phrases'] * 2 +
                          repetitive['repetitive_patterns'] +
                          repetitive['repeated_sentence_starts'])
        scores['repetitive_patterns_score'] = min(1, total_patterns / 10)
        variety_normalized = min(1, structure['sentence_variety'] / 10)
        scores['sentence_structure_score'] = max(0, min(1, 1 - variety_normalized))
        # Extreme readability in either direction is treated as AI-like.
        flesch_ease = readability['flesch_reading_ease']
        scores['readability_score'] = 0.8 if flesch_ease > 80 or flesch_ease < 20 else 0.2
        scores['semantic_coherence_score'] = min(1, coherence['topic_consistency'] * 2)

        # Weights sum to 1.0, keeping the total inside [0, 1].
        weights = {
            'perplexity_score': 0.2,
            'burstiness_score': 0.15,
            'vocab_diversity_score': 0.15,
            'repetitive_patterns_score': 0.2,
            'sentence_structure_score': 0.1,
            'readability_score': 0.1,
            'semantic_coherence_score': 0.1,
        }
        total_score = sum(scores[key] * weights[key] for key in weights)
        return {
            'ai_probability': total_score,
            'individual_scores': scores,
            'analysis_results': {
                'perplexity': perplexity,
                'burstiness': burstiness,
                'vocabulary_diversity': vocab_diversity,
                'repetitive_patterns': repetitive,
                'sentence_structure': structure,
                'readability': readability,
                'semantic_coherence': coherence,
            },
        }

    def get_detection_result(self, text):
        """Run the full pipeline on *text* and attach a human-readable label.

        Returns {'error': ...} for blank input; otherwise a dict with
        the probability, confidence label/level, text statistics, and
        all sub-scores.
        """
        if not text.strip():
            return {'error': 'Empty text provided'}
        result = self.calculate_ai_probability(text)
        ai_prob = result['ai_probability']
        # Thresholds mirror the UI help text: <30% human, 30-60%
        # uncertain, >60% AI-generated.
        if ai_prob < 0.3:
            confidence = "LIKELY HUMAN"
            confidence_level = "high"
        elif ai_prob < 0.6:
            confidence = "UNCERTAIN"
            confidence_level = "medium"
        else:
            confidence = "LIKELY AI-GENERATED"
            confidence_level = "high"
        return {
            'ai_probability': ai_prob,
            'confidence': confidence,
            'confidence_level': confidence_level,
            'text_length': len(text),
            'word_count': len(text.split()),
            'detailed_analysis': result['analysis_results'],
            'individual_scores': result['individual_scores'],
        }
def scrape_website(url):
    """Fetch *url* and extract its heading/paragraph text.

    Returns a dict with 'url', 'text', 'text_length', and 'word_count'
    on success, or a dict containing only an 'error' message on any
    failure (network error, HTTP error, or no usable text).
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers=request_headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Strip non-content elements before extracting any text.
        for removable in soup(["script", "style"]):
            removable.decompose()

        # Headings first, then paragraphs; skip fragments of <= 20 chars.
        candidates = (soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
                      + soup.find_all('p'))
        fragments = []
        for element in candidates:
            content = element.get_text().strip()
            if content and len(content) > 20:
                fragments.append(content)

        combined = ' '.join(fragments)
        if not combined:
            return {'error': 'No meaningful text found on the page'}
        return {
            'url': url,
            'text': combined,
            'text_length': len(combined),
            'word_count': len(combined.split()),
        }
    except Exception as e:
        # Best-effort scraper for a UI: surface the message, never raise.
        return {'error': f'Error scraping URL: {str(e)}'}
def create_visualizations(detection_result):
    """Build the three plotly figures shown in the UI.

    Returns (radar_figure, metrics_figure, scores_figure) derived from
    a detector result dict.
    """
    individual = detection_result['individual_scores']
    score_names = list(individual.keys())
    score_values = list(individual.values())

    # Radar chart of per-signal scores on a fixed [0, 1] radial axis.
    radar_figure = go.Figure(
        go.Scatterpolar(
            r=score_values,
            theta=score_names,
            fill='toself',
            name='AI Probability Scores'
        )
    )
    radar_figure.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
        showlegend=False,
        title="Individual Detection Scores",
        height=400
    )

    # Bar chart of the raw (un-normalized) analysis metrics.
    details = detection_result['detailed_analysis']
    metric_names = ['perplexity', 'burstiness', 'avg_sentence_length', 'flesch_reading_ease']
    metric_values = [
        details['perplexity'],
        details['burstiness'],
        details['sentence_structure']['avg_sentence_length'],
        details['readability']['flesch_reading_ease'],
    ]
    metrics_figure = px.bar(
        x=metric_names,
        y=metric_values,
        title="Text Analysis Metrics",
        labels={'x': 'Metric', 'y': 'Value'},
        height=400
    )

    # Bar chart of the individual scores, colour-coded by value.
    score_frame = pd.DataFrame([
        {'Score': key.replace('_', ' ').title(), 'Value': value}
        for key, value in individual.items()
    ])
    scores_figure = px.bar(
        score_frame,
        x='Score',
        y='Value',
        title="Individual Detection Scores (Higher = More Likely AI)",
        color='Value',
        color_continuous_scale='RdYlBu_r',
        height=400
    )
    return radar_figure, metrics_figure, scores_figure
def analyze_website(url):
    """Scrape *url*, run the AI detector, and format the results.

    Returns a 4-tuple (markdown_summary, radar_fig, metrics_fig,
    scores_fig); the figure slots are None on error.

    Bug fix: the error paths previously returned FIVE values while the
    success path — and the Gradio handler wired to four outputs —
    expect four, so every error crashed the UI instead of showing the
    message. All returns now have arity 4.
    """
    if not url.strip():
        return "Please enter a URL to analyze.", None, None, None
    # Scrape website
    scraped_data = scrape_website(url)
    if 'error' in scraped_data:
        return scraped_data['error'], None, None, None
    # Analyze the scraped text
    detector = AIContentDetector()
    detection_result = detector.get_detection_result(scraped_data['text'])
    if 'error' in detection_result:
        return detection_result['error'], None, None, None
    # Create visualizations
    fig_radar, fig_bar, fig_scores = create_visualizations(detection_result)
    # Format results as markdown
    ai_prob = detection_result['ai_probability']
    confidence = detection_result['confidence']
    word_count = detection_result['word_count']
    analysis = detection_result['detailed_analysis']
    detailed_text = f"""
**Website:** {url}
**AI Probability:** {ai_prob:.1%}
**Confidence:** {confidence}
**Word Count:** {word_count}
**Detailed Analysis:**
- Perplexity: {analysis['perplexity']:.2f}
- Burstiness: {analysis['burstiness']:.3f}
- Vocabulary Diversity: {analysis['vocabulary_diversity']['type_token_ratio']:.3f}
- Avg Sentence Length: {analysis['sentence_structure']['avg_sentence_length']:.1f}
- Flesch Reading Ease: {analysis['readability']['flesch_reading_ease']:.1f}
- Topic Consistency: {analysis['semantic_coherence']['topic_consistency']:.3f}
**Sample Text:** {scraped_data['text'][:200]}...
"""
    return detailed_text, fig_radar, fig_bar, fig_scores
def analyze_news_site(site_name):
    """Resolve a pre-configured site name to its article URL and analyze it.

    Returns the same 4-tuple as analyze_website.

    Bug fix: the invalid-selection path previously returned FIVE values
    against the four outputs wired in the Gradio handler; it now
    returns four like the delegated analyze_website call.
    """
    # Keys must match the dropdown choices in the Gradio UI.
    news_sites = {
        "BBC News": "https://www.bbc.com/news/articles/c93dgr2dd53o",
        "Al Jazeera News": "https://www.aljazeera.com/news/2025/5/9/india-pakistan-tensions-a-brief-history-of-conflict",
        "Norwich News": "https://online.norwich.edu/online/about/resource-library/five-major-african-wars-and-conflicts-twentieth-century",
        "Britannica News": "https://www.britannica.com/event/Iran-Iraq-War",
        "Council on Foreign Relations News": "https://www.cfr.org/article/syrias-civil-war"
    }
    if site_name not in news_sites:
        return "Please select a valid news site.", None, None, None
    return analyze_website(news_sites[site_name])
# Create Gradio interface
# Top-level UI wiring: a single tab for pre-configured news-site
# analysis plus an informational accordion; launched when run as a script.
with gr.Blocks(title="🤖 AI Content Detector", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 AI Content Detector")
    gr.Markdown("Detect AI-generated content from websites or text input using advanced linguistic analysis.")
    with gr.Tabs():
        # News Sites Tab
        with gr.TabItem("📰 News Site Analysis"):
            gr.Markdown("### Analyze Pre-configured News Sites")
            # Choices must match the keys of news_sites in analyze_news_site.
            site_dropdown = gr.Dropdown(
                choices=["BBC News", "Al Jazeera News", "Norwich News", "Britannica News", "Council on Foreign Relations News"],
                label="Select a news site:",
                value="BBC News"
            )
            analyze_news_btn = gr.Button("Analyze News Site", variant="primary")
            # Output slots: one markdown summary plus three plotly figures —
            # the handler's return arity must match these four outputs.
            news_results = gr.Markdown(label="Analysis Results")
            news_radar = gr.Plot(label="Radar Chart")
            news_bar = gr.Plot(label="Analysis Metrics")
            news_scores = gr.Plot(label="Individual Scores")
            analyze_news_btn.click(
                analyze_news_site,
                inputs=[site_dropdown],
                outputs=[news_results, news_radar, news_bar, news_scores]
            )
    # Information section
    with gr.Accordion("ℹ️ How It Works", open=False):
        gr.Markdown("""
### Detection Methods
1. **Perplexity Analysis** - Measures text predictability
2. **Burstiness Calculation** - Analyzes sentence length variability
3. **Vocabulary Diversity** - Type-token ratio analysis
4. **Repetitive Patterns** - Detects common AI phrases
5. **Sentence Structure** - Length and complexity analysis
6. **Readability Scores** - Flesch metrics
7. **Semantic Coherence** - Topic consistency analysis
### Confidence Levels
- **< 30%**: LIKELY HUMAN
- **30-60%**: UNCERTAIN
- **> 60%**: LIKELY AI-GENERATED
""")

# Standard script entry guard: launch the Gradio server only when
# executed directly, not when imported.
if __name__ == "__main__":
    demo.launch()