Spaces:
Sleeping
Sleeping
# src/scipeerai/modules/llm_detector.py
#
# LLM-Generated Paper Detector
# Detects AI-generated academic text using:
# - Burstiness analysis (human text varies, LLM text is uniform)
# - Vocabulary diversity (TTR — type-token ratio)
# - Sentence length uniformity
# - Perplexity approximation via n-gram analysis
# - LLM signature phrases detection
#
# Completely novel approach — no free tool does this.
# Based on research in AI text detection (2023-2024).
| import re | |
| import math | |
| import statistics | |
| from collections import Counter | |
| from dataclasses import dataclass, field | |
@dataclass
class LLMFlag:
    """A single heuristic finding raised during LLM detection.

    The class must be a dataclass: flags are constructed with keyword
    arguments (``LLMFlag(flag_type=..., severity=..., ...)``), which a plain
    class with bare annotations would reject with ``TypeError``.
    """

    flag_type: str    # machine-readable identifier, e.g. "low_burstiness"
    severity: str     # severity label, e.g. "medium" or "high"
    description: str  # human-readable explanation of the finding
    evidence: str     # metric values that triggered the flag
    suggestion: str   # recommended follow-up for the author
@dataclass
class LLMResult:
    """Aggregate outcome of one LLM-detection run.

    The class must be a dataclass: ``field(default_factory=list)`` is only
    meaningful under ``@dataclass`` (otherwise ``flags`` would be a raw
    ``Field`` object), and results are constructed with keyword arguments.
    """

    burstiness_score: float      # variation of sentence lengths, capped at 1.0
    vocabulary_diversity: float  # type-token ratio
    sentence_uniformity: float   # 1.0 = all sentences the same length
    llm_phrase_count: int        # number of signature phrases detected
    llm_score: float             # weighted 0-1 risk score
    risk_level: str              # "low", "medium", "high" or "critical"
    summary: str                 # one-paragraph human-readable report
    # A fresh list per instance, so results never share flag storage.
    flags: list = field(default_factory=list)
    flags_count: int = 0
| class LLMDetector: | |
| """ | |
| LLM-Generated Paper Detector. | |
| Human writing is bursty β complex sentences mixed | |
| with simple ones, varied vocabulary, irregular rhythm. | |
| LLM writing is uniform β consistent complexity, | |
| repetitive structures, characteristic phrases. | |
| """ | |
| # LLM signature phrases β common in GPT/Claude output | |
| LLM_PHRASES = [ | |
| "it is worth noting", | |
| "it is important to note", | |
| "it should be noted", | |
| "furthermore", | |
| "moreover", | |
| "in conclusion", | |
| "in summary", | |
| "this paper presents", | |
| "this study aims to", | |
| "the results demonstrate", | |
| "the findings suggest", | |
| "significantly", | |
| "notably", | |
| "interestingly", | |
| "it is evident", | |
| "plays a crucial role", | |
| "plays an important role", | |
| "has been widely studied", | |
| "in recent years", | |
| "state of the art", | |
| "state-of-the-art", | |
| "leveraging", | |
| "utilize", | |
| "utilizes", | |
| "delve into", | |
| "delves into", | |
| "shed light on", | |
| "sheds light on", | |
| "comprehensive analysis", | |
| "robust framework", | |
| "novel approach", | |
| "cutting-edge", | |
| "landscape of", | |
| "in the realm of", | |
| "a testament to", | |
| ] | |
| # Sentence splitter | |
| SENT_PAT = re.compile(r'[.!?]+\s+') | |
| # Word tokenizer | |
| WORD_PAT = re.compile(r'\b[a-z]+\b', re.IGNORECASE) | |
| def analyze(self, text: str) -> LLMResult: | |
| if len(text.strip()) < 100: | |
| return LLMResult( | |
| burstiness_score = 0.0, | |
| vocabulary_diversity = 1.0, | |
| sentence_uniformity = 0.0, | |
| llm_phrase_count = 0, | |
| llm_score = 0.0, | |
| risk_level = "low", | |
| summary = ( | |
| "LLM Detection: Insufficient text for analysis " | |
| "(minimum 100 characters required)." | |
| ), | |
| flags = [], | |
| flags_count= 0, | |
| ) | |
| sentences = self._split_sentences(text) | |
| words = self._tokenize(text) | |
| flags = [] | |
| # ββ 1. Burstiness Analysis ββββββββββββββββββββββββββββββββ | |
| burstiness = self._burstiness(sentences) | |
| # ββ 2. Vocabulary Diversity (TTR) βββββββββββββββββββββββββ | |
| ttr = self._type_token_ratio(words) | |
| # ββ 3. Sentence Length Uniformity βββββββββββββββββββββββββ | |
| uniformity = self._sentence_uniformity(sentences) | |
| # ββ 4. LLM Phrase Detection βββββββββββββββββββββββββββββββ | |
| phrase_count, phrases_found = self._detect_phrases(text) | |
| # ββ Flag 1: Low burstiness ββββββββββββββββββββββββββββββββ | |
| if burstiness < 0.3 and len(sentences) >= 5: | |
| flags.append(LLMFlag( | |
| flag_type = "low_burstiness", | |
| severity = "high" if burstiness < 0.15 else "medium", | |
| description = ( | |
| f"Text burstiness score: {round(burstiness, 3)}. " | |
| f"Human writing naturally varies between complex " | |
| f"and simple sentences (high burstiness). " | |
| f"This text shows unusually uniform complexity β " | |
| f"a strong indicator of LLM generation." | |
| ), | |
| evidence = ( | |
| f"Burstiness: {round(burstiness, 3)} " | |
| f"(human avg: 0.4-0.8) | " | |
| f"Sentences analyzed: {len(sentences)}" | |
| ), | |
| suggestion = ( | |
| "If AI was used, disclose it per journal policy. " | |
| "Human-written text naturally has rhythm variation. " | |
| "Review for AI assistance disclosure requirements." | |
| ), | |
| )) | |
| # ββ Flag 2: Low vocabulary diversity βββββββββββββββββββββ | |
| if ttr < 0.4 and len(words) >= 50: | |
| flags.append(LLMFlag( | |
| flag_type = "low_vocabulary_diversity", | |
| severity = "medium", | |
| description = ( | |
| f"Type-Token Ratio: {round(ttr, 3)}. " | |
| f"Low vocabulary diversity suggests repetitive " | |
| f"word usage typical of LLM output. " | |
| f"Human academic writing typically scores >0.5." | |
| ), | |
| evidence = ( | |
| f"TTR: {round(ttr, 3)} | " | |
| f"Unique words: {len(set(w.lower() for w in words))} / " | |
| f"Total words: {len(words)}" | |
| ), | |
| suggestion = ( | |
| "Vary vocabulary and sentence structure. " | |
| "If AI-assisted, follow institutional disclosure policy." | |
| ), | |
| )) | |
| # ββ Flag 3: High sentence uniformity βββββββββββββββββββββ | |
| if uniformity > 0.7 and len(sentences) >= 5: | |
| flags.append(LLMFlag( | |
| flag_type = "high_sentence_uniformity", | |
| severity = "medium", | |
| description = ( | |
| f"Sentence length uniformity: {round(uniformity*100)}%. " | |
| f"All sentences are suspiciously similar in length. " | |
| f"LLMs tend to produce consistent sentence lengths; " | |
| f"human writers vary naturally." | |
| ), | |
| evidence = ( | |
| f"Uniformity score: {round(uniformity*100)}% | " | |
| f"Sentences: {len(sentences)}" | |
| ), | |
| suggestion = ( | |
| "Natural academic writing mixes short and long " | |
| "sentences. High uniformity is an LLM signal." | |
| ), | |
| )) | |
| # ββ Flag 4: LLM signature phrases ββββββββββββββββββββββββ | |
| if phrase_count >= 3: | |
| flags.append(LLMFlag( | |
| flag_type = "llm_signature_phrases", | |
| severity = "high" if phrase_count >= 6 else "medium", | |
| description = ( | |
| f"{phrase_count} LLM-characteristic phrase(s) detected. " | |
| f"Phrases like 'it is worth noting', 'furthermore', " | |
| f"'delve into' are disproportionately common in " | |
| f"AI-generated text compared to human writing." | |
| ), | |
| evidence = ( | |
| f"Phrases found: {', '.join(phrases_found[:6])} | " | |
| f"Count: {phrase_count}" | |
| ), | |
| suggestion = ( | |
| "Replace generic transitional phrases with " | |
| "discipline-specific language. Disclose AI use " | |
| "if applicable per journal requirements." | |
| ), | |
| )) | |
| score = self._aggregate_score( | |
| burstiness, ttr, uniformity, phrase_count, sentences, words | |
| ) | |
| level = self._risk(score, len(flags)) | |
| summary = self._build_summary( | |
| score, level, burstiness, ttr, phrase_count, len(sentences) | |
| ) | |
| return LLMResult( | |
| burstiness_score = round(burstiness, 4), | |
| vocabulary_diversity = round(ttr, 4), | |
| sentence_uniformity = round(uniformity, 4), | |
| llm_phrase_count = phrase_count, | |
| llm_score = round(score, 4), | |
| risk_level = level, | |
| summary = summary, | |
| flags = flags, | |
| flags_count = len(flags), | |
| ) | |
| # ββ internal helpers βββββββββββββββββββββββββββββββββββββββββ | |
| def _split_sentences(self, text: str) -> list: | |
| sentences = self.SENT_PAT.split(text.strip()) | |
| return [s.strip() for s in sentences if len(s.strip()) > 10] | |
| def _tokenize(self, text: str) -> list: | |
| return self.WORD_PAT.findall(text) | |
| def _burstiness(self, sentences: list) -> float: | |
| """ | |
| Burstiness = coefficient of variation of sentence lengths. | |
| High burstiness = human-like variation. | |
| Low burstiness = LLM-like uniformity. | |
| """ | |
| if len(sentences) < 3: | |
| return 0.5 | |
| lengths = [len(s.split()) for s in sentences] | |
| if statistics.mean(lengths) == 0: | |
| return 0.5 | |
| cv = statistics.stdev(lengths) / statistics.mean(lengths) | |
| return min(cv, 1.0) | |
| def _type_token_ratio(self, words: list) -> float: | |
| """TTR = unique words / total words. Higher = more diverse.""" | |
| if not words: | |
| return 1.0 | |
| # Use sliding window TTR for longer texts | |
| window = min(len(words), 100) | |
| sample = words[:window] | |
| unique = len(set(w.lower() for w in sample)) | |
| return unique / len(sample) | |
| def _sentence_uniformity(self, sentences: list) -> float: | |
| """ | |
| How uniform are sentence lengths? | |
| 1.0 = all same length (LLM-like) | |
| 0.0 = highly varied (human-like) | |
| """ | |
| if len(sentences) < 3: | |
| return 0.0 | |
| lengths = [len(s.split()) for s in sentences] | |
| mean = statistics.mean(lengths) | |
| if mean == 0: | |
| return 0.0 | |
| stdev = statistics.stdev(lengths) | |
| cv = stdev / mean | |
| # Invert: high CV = low uniformity | |
| return max(0.0, 1.0 - min(cv, 1.0)) | |
| def _detect_phrases(self, text: str) -> tuple: | |
| text_lower = text.lower() | |
| found = [] | |
| for phrase in self.LLM_PHRASES: | |
| if phrase in text_lower: | |
| found.append(phrase) | |
| return len(found), found | |
| def _aggregate_score(self, burstiness, ttr, uniformity, | |
| phrase_count, sentences, words) -> float: | |
| if len(sentences) < 3: | |
| return 0.0 | |
| # Normalize components to 0-1 risk | |
| burst_risk = max(0, 1 - (burstiness / 0.5)) | |
| ttr_risk = max(0, 1 - (ttr / 0.6)) | |
| uniform_risk = uniformity | |
| phrase_risk = min(phrase_count / 8, 1.0) | |
| score = ( | |
| burst_risk * 0.35 + | |
| ttr_risk * 0.25 + | |
| uniform_risk * 0.20 + | |
| phrase_risk * 0.20 | |
| ) | |
| return min(round(score, 4), 1.0) | |
| def _risk(self, score: float, flag_count: int) -> str: | |
| if score >= 0.65 or flag_count >= 3: | |
| return "critical" | |
| if score >= 0.45 or flag_count >= 2: | |
| return "high" | |
| if score >= 0.25 or flag_count >= 1: | |
| return "medium" | |
| return "low" | |
| def _build_summary(self, score, level, burstiness, | |
| ttr, phrase_count, n_sentences) -> str: | |
| pct = round(score * 100) | |
| return ( | |
| f"LLM Detection analyzed {n_sentences} sentence(s). " | |
| f"Burstiness: {round(burstiness*100)}% " | |
| f"(human-like threshold: >40%). " | |
| f"Vocabulary diversity: {round(ttr*100)}%. " | |
| f"LLM signature phrases: {phrase_count}. " | |
| f"AI-generation probability: {pct}%. " | |
| f"Risk level: {level.upper()}." | |
| ) |