# TruthLens / text_analyzer.py
# (Hugging Face Hub upload header — uploader: astein11004, "Upload 9 files",
#  commit 9bcc80f verified — kept as a comment so the module parses.)
import io
import re
import random
import time
import json
import os
from collections import Counter
from functools import lru_cache
from typing import Dict
from openai import OpenAI
from tenacity import retry, stop_after_attempt, wait_exponential
from dotenv import load_dotenv
load_dotenv()
# --- Lazy Dependency Loaders ---
@lru_cache(None)
def get_torch():
    """Lazily import torch on first call; the result is cached."""
    import torch as _torch
    return _torch
@lru_cache(None)
def get_numpy():
    """Lazily import numpy on first call; the result is cached."""
    import numpy
    return numpy
@lru_cache(None)
def get_transformers():
    """Lazily import transformers and return the four classes this module uses."""
    import transformers as _tf
    return (
        _tf.AutoTokenizer,
        _tf.AutoModelForSequenceClassification,
        _tf.GPT2LMHeadModel,
        _tf.GPT2Tokenizer,
    )
@lru_cache(None)
def get_nltk():
    """Lazily import nltk on first call; the result is cached."""
    import nltk as _nltk
    return _nltk
@lru_cache(None)
def get_pypdf2():
    """Lazily import PyPDF2 on first call; the result is cached."""
    import PyPDF2 as _pypdf2
    return _pypdf2
@lru_cache(None)
def get_docx():
    """Lazily import python-docx on first call; the result is cached."""
    import docx as _docx
    return _docx
# === YOUR KEYS (Loading from .env to avoid Git Secret Scanning) ===
# API keys default to "" when the .env entry is missing — the matching
# provider will then fail at request time and be skipped by humanize_text().
OPENROUTER_KEY = os.getenv("OPENROUTER_KEY", "")
# NOTE(review): OPENAI_KEY is loaded but not referenced anywhere in this
# module's PROVIDERS list — confirm whether it is used by a sibling file.
OPENAI_KEY = os.getenv("OPENAI_KEY", "")
GROQ_KEY = os.getenv("GROQ_KEY", "")
TOGETHER_KEY = os.getenv("TOGETHER_KEY", "")
# Interchangeable OpenAI-compatible chat providers, tried in random order by
# humanize_text(). Clients are constructed eagerly at import time; the OpenAI
# constructor performs no network call, so this is cheap.
PROVIDERS = [
{"name": "Groq", "client": OpenAI(base_url="https://api.groq.com/openai/v1", api_key=GROQ_KEY), "models": ["llama-3.2-3b-instruct"]},
{"name": "OpenRouter","client": OpenAI(base_url="https://openrouter.ai/api/v1", api_key=OPENROUTER_KEY), "models": ["meta-llama/llama-3.2-3b-instruct:free"]},
{"name": "Together", "client": OpenAI(base_url="https://api.together.xyz/v1", api_key=TOGETHER_KEY), "models": ["meta-llama/Llama-3.2-3B-Instruct-Turbo"]},
]
# === INSANELY STRONG 3-PASS HUMANIZER PROMPT ===
def get_humanizer_prompt(stage: int):
    """Return the rewrite-prompt template with a ``{text}`` placeholder.

    ``stage`` is accepted for call-site compatibility with the multi-pass
    rewriter but does not change the template — the same prompt is returned
    for every pass.
    """
    template = """
Rewrite this text so it sounds naturally written by a human.
Requirements:
- Preserve the original tone and meaning
- Avoid perfect academic structure
- Vary sentence lengths significantly
- Use natural rhythm and phrasing
- Avoid predictable AI transitions
- Some sentences can be shorter
- Avoid overly polished wording
Human writing traits:
- slight structural variation
- occasional uneven sentence flow
- natural phrasing rather than perfect grammar symmetry
Text:
{text}
Rewrite:
"""
    return template
def extract_text(content: bytes, filename: str) -> str:
    """Extract plain text from raw uploaded file bytes.

    PDF files go through PyPDF2 and Word documents through python-docx;
    any other extension — or any extraction failure — falls back to
    decoding the bytes as UTF-8 with replacement characters.
    """
    ext = filename.lower().rsplit('.', 1)[-1]
    try:
        if ext == 'pdf':
            PyPDF2 = get_pypdf2()
            reader = PyPDF2.PdfReader(io.BytesIO(content))
            # extract_text() may return None for image-only pages
            return "\n".join(page.extract_text() or "" for page in reader.pages)
        elif ext in ('docx', 'doc'):
            docx = get_docx()
            doc = docx.Document(io.BytesIO(content))
            return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
        else:
            return content.decode('utf-8', errors='replace')
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; extraction errors degrade to best-effort UTF-8.
        return content.decode('utf-8', errors='replace')
import math
# === NLTK Assets (Lazy downloaded on first use) ===
@lru_cache(None)
def download_nltk_assets():
    """Ensure NLTK's punkt tokenizer data is available; True on success.

    Runs at most once per process (lru_cache); downloads only what is
    missing. Returns False (after printing) if anything goes wrong.
    """
    nltk = get_nltk()
    try:
        print("Checking NLTK assets...")
        for locator, package in (
            ('tokenizers/punkt', 'punkt'),
            ('tokenizers/punkt_tab', 'punkt_tab'),
        ):
            try:
                nltk.data.find(locator)
            except LookupError:
                nltk.download(package)
        return True
    except Exception as e:
        print(f"NLTK Download Failed: {e}")
        return False
# === MODELS (Cached global loaders) ===
@lru_cache(None)
def get_transformer_model():
    """Load and cache the DeBERTa tokenizer/model pair used by transformer_score()."""
    AutoTokenizer, AutoModelForSequenceClassification, _, _ = get_transformers()
    print("Loading DeBERTa-v3-large...")
    model_name = "microsoft/deberta-v3-large"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # NOTE(review): this loads the *base* pretrained checkpoint with a freshly
    # initialized sequence-classification head — it is not fine-tuned for
    # AI-text detection. Confirm whether a fine-tuned checkpoint was intended.
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.eval()  # inference mode (disables dropout)
    return tokenizer, model
@lru_cache(None)
def get_perplexity_model():
    """Load and cache GPT-2 (tokenizer + LM head) for perplexity scoring."""
    # Consistency fix: reuse the module's lazy loader instead of a direct
    # `from transformers import ...`, matching get_transformer_model().
    _, _, GPT2LMHeadModel, GPT2Tokenizer = get_transformers()
    print("Loading GPT-2 for Perplexity...")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    model.eval()  # inference mode (disables dropout)
    return tokenizer, model
def normalize_perplexity(ppl):
    """Map a raw GPT-2 perplexity onto a 0-100 AI-likelihood scale.

    Lower perplexity (more predictable text) maps to a higher score.
    """
    # (exclusive upper bound, score) pairs, checked in ascending order
    for ceiling, score in ((30, 95), (45, 80), (60, 60), (80, 40), (120, 20)):
        if ppl < ceiling:
            return score
    return 10
def normalize_entropy(ent):
    """Map word-distribution entropy (bits) onto a 0-100 AI-likelihood scale.

    Lower entropy (more repetitive vocabulary) maps to a higher score.
    """
    # (exclusive upper bound, score) pairs, checked in ascending order
    for ceiling, score in ((4, 90), (5, 70), (6, 50), (7, 30)):
        if ent < ceiling:
            return score
    return 10
def chunk_text(text, size=200):
    """Yield successive chunks of *text*, each at most *size* words long."""
    remaining = text.split()
    while remaining:
        yield " ".join(remaining[:size])
        remaining = remaining[size:]
# === LAYER 1: Transformer Classification ===
def transformer_score(text):
    """Score *text* with the DeBERTa classifier on a 0-100 scale.

    Falls back to a neutral 50.0 if the model fails to load or run.
    """
    torch = get_torch()
    try:
        tokenizer, model = get_transformer_model()
        # Truncate to the model's 512-token context window
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        # NOTE(review): assumes class index 1 means "AI-generated"; the base
        # deberta-v3-large checkpoint has no trained head (see
        # get_transformer_model) — confirm the label mapping.
        return float(probs[0][1].item() * 100)
    except Exception as e:
        print(f"Transformer Error: {e}")
        return 50.0
# === LAYER 2: Perplexity Analysis ===
def perplexity_score(text):
    """Compute GPT-2 perplexity of *text* with a fixed-stride sliding window.

    Follows the Hugging Face fixed-length-model perplexity recipe: each
    window re-feeds up to ``max_length`` tokens of context but only the
    last ``trg_len`` tokens contribute to the loss. Returns 80.0 (a
    neutral mid-range value for normalize_perplexity) on any failure.
    """
    torch = get_torch()
    try:
        tokenizer, model = get_perplexity_model()
        encodings = tokenizer(text, return_tensors="pt")
        max_length = model.config.n_positions  # GPT-2 context size
        stride = 512
        nlls = []  # per-window negative log-likelihoods, weighted by token count
        for i in range(0, encodings.input_ids.size(1), stride):
            begin_loc = max(i + stride - max_length, 0)
            end_loc = min(i + stride, encodings.input_ids.size(1))
            trg_len = end_loc - i  # number of tokens actually scored this window
            input_ids = encodings.input_ids[:, begin_loc:end_loc]
            target_ids = input_ids.clone()
            # Mask the overlapping context with -100 so only the final
            # trg_len tokens contribute to the loss.
            target_ids[:, :-trg_len] = -100
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)
            # loss is a mean over scored tokens; re-weight by trg_len so the
            # final exp(sum/total) is a proper per-token average
            nlls.append(outputs.loss * trg_len)
        ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
        return float(ppl.item())
    except Exception as e:
        print(f"Perplexity Error: {e}")
        return 80.0
# === LAYER 3: Burstiness Detection ===
def burstiness_score(text):
    """Variance of sentence lengths (in words) across *text*.

    Human writing tends to vary sentence length more than AI output.
    Returns 0.0 for texts with at most one sentence, or on any failure.
    """
    np = get_numpy()
    nltk = get_nltk()
    try:
        download_nltk_assets()
        sentences = nltk.sent_tokenize(text)
        if len(sentences) <= 1:
            return 0.0
        lengths = [len(s.split()) for s in sentences]
        return float(np.var(lengths))
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; tokenizer failures degrade to 0.0.
        return 0.0
# === LAYER 4: Stylometric Analysis ===
def stylometric_features(text):
    """Compute simple stylometric ratios for *text*.

    Returns a dict with vocabulary richness (unique/total words),
    punctuation density (punctuation marks per word), and average
    sentence length (words per '.'). All zeros for empty input.
    """
    tokens = text.split()
    if not tokens:
        return {"vocab_richness": 0, "punctuation_ratio": 0, "avg_sentence_length": 0}
    total = len(tokens)
    marks = len(re.findall(r'[.,!?;:]', text))
    return {
        "vocab_richness": len(set(tokens)) / total,
        "punctuation_ratio": marks / total,
        # guard against zero periods to avoid division by zero
        "avg_sentence_length": total / max(1, text.count(".")),
    }
# === LAYER 5: Repetition Detection ===
def repetition_score(text):
    """Occurrence count of the most repeated word bigram (case-insensitive).

    Returns 1.0 for texts shorter than two words.
    """
    tokens = text.lower().split()
    if len(tokens) < 2:
        return 1.0
    counts = Counter(zip(tokens, tokens[1:]))
    if not counts:
        return 1.0
    return float(max(counts.values()))
# === LAYER 6: Entropy Score ===
def entropy_score(text):
    """Shannon entropy (bits) of the word-frequency distribution of *text*.

    Returns 0.0 for empty input.
    """
    import math
    tokens = text.split()
    if not tokens:
        return 0.0
    total = len(tokens)
    return float(-sum(
        (count / total) * math.log2(count / total)
        for count in Counter(tokens).values()
    ))
def detect_ai_text(text: str) -> Dict:
    """Run the multi-layer AI-text detector over *text*.

    Requires 200-500 words; out-of-range input returns an
    ``{"error": ..., "word_count": ...}`` dict instead of a report.
    The report combines transformer, perplexity, burstiness, entropy
    and repetition signals into a weighted 1-99 AI probability.
    """
    raw_text = text.strip()
    words = raw_text.split()
    if len(words) < 200:
        return {"error": "Minimum 200 words required for forensic analysis", "word_count": len(words)}
    if len(words) > 500:
        return {"error": "Maximum 500 words allowed for optimal performance", "word_count": len(words)}
    # BUG FIX: `np` was first used (np.mean) before `np = get_numpy()` ran,
    # raising NameError on every successful analysis. Load numpy up front.
    np = get_numpy()
    # Chunked transformer scoring (handles texts beyond the 512-token window)
    chunks = list(chunk_text(raw_text))
    t_scores = [transformer_score(c) for c in chunks]
    t_score = np.mean(t_scores)
    # Collect layer scores
    ppl = perplexity_score(raw_text)
    burst = burstiness_score(raw_text)
    rep = repetition_score(raw_text)
    ent = entropy_score(raw_text)
    stylo = stylometric_features(raw_text)
    # Normalization & ensemble (smoothed scaling)
    ppl_norm = normalize_perplexity(ppl)
    # NOTE(review): higher burstiness is usually a *human* trait, yet it is
    # added positively to the AI score here — confirm the intended polarity.
    burst_norm = min(100, burst * 3)
    rep_norm = min(100, rep * 4)
    ent_norm = normalize_entropy(ent)
    final_score = (
        0.30 * t_score +
        0.30 * ppl_norm +
        0.20 * burst_norm +
        0.12 * ent_norm +
        0.08 * rep_norm
    )
    # Clamp to 1-99 so the verdict never claims absolute certainty
    ai_prob = round(min(99, max(1, final_score)), 2)
    human_prob = round(100 - ai_prob, 2)
    if ai_prob > 75:
        verdict = "Very likely AI Generated"
    elif ai_prob > 50:
        verdict = "Likely AI Generated"
    elif ai_prob > 30:
        verdict = "Uncertain"
    else:
        verdict = "Likely Human Written"
    return {
        "ai_probability": ai_prob,
        "human_probability": human_prob,
        "verdict": verdict,
        "word_count": len(raw_text.split()),
        "transformerScore": round(float(t_score), 2),
        "perplexity": round(ppl, 2),
        "burstiness": round(burst, 2),
        "entropy": round(ent, 2),
        "details": {
            **stylo,
            "repetition_score": rep,
            "entropy": round(ent, 2)
        }
    }
def statistical_humanizer(text):
    """Apply random sentence-level perturbations (shorten / split / swap).

    Best-effort rule-based fallback for the LLM humanizer; any failure
    (e.g. NLTK data unavailable) returns the input unchanged.
    """
    try:
        nltk = get_nltk()
        download_nltk_assets()
        sentences = nltk.sent_tokenize(text)
        new_sentences = []
        for s in sentences:
            words = s.split()
            # randomly shorten long sentences (~35% chance)
            if len(words) > 12 and random.random() < 0.35:
                cut = random.randint(8, len(words))
                s = " ".join(words[:cut])
            # occasionally split a long sentence in two (~30% chance)
            if len(words) > 18 and random.random() < 0.3:
                split = random.randint(7, len(words) - 5)
                s = " ".join(words[:split]) + ". " + " ".join(words[split:])
            # slight adjacent-word swap (~20% chance)
            # NOTE(review): this branch rebuilds `s` from the original word
            # list, silently discarding any shorten/split applied above —
            # confirm whether that is intended.
            if len(words) > 8 and random.random() < 0.2:
                i = random.randint(0, len(words) - 2)
                words[i], words[i + 1] = words[i + 1], words[i]
                s = " ".join(words)
            new_sentences.append(s)
        return " ".join(new_sentences)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; failures leave the text untouched.
        return text
@retry(stop=stop_after_attempt(6), wait=wait_exponential(multiplier=2, min=4, max=60))
def humanize_text(text: str) -> str:
    """Rewrite *text* to read as human-written via three LLM rewrite passes.

    Each pass tries the PROVIDERS in random order and keeps the first
    successful completion; if every provider fails for a pass, a local
    rule-based fallback (add_human_noise + statistical_humanizer) is
    applied instead. Input is truncated to 3900 characters. The tenacity
    decorator retries the whole routine (up to 6 attempts, exponential
    backoff) if it raises.
    """
    if len(text) < 50:
        return text  # too short to meaningfully rewrite
    current = text[:3900]  # cap prompt size for provider context limits
    # Triple pass for maximum undetectability
    for stage in range(1, 4):
        # NOTE(review): get_humanizer_prompt ignores `stage` — every pass
        # uses the same template.
        prompt = get_humanizer_prompt(stage).format(text=current)
        shuffled_providers = PROVIDERS[:]
        random.shuffle(shuffled_providers)  # spread load across providers
        success = False
        for p in shuffled_providers:
            try:
                client = p["client"]
                model = random.choice(p["models"])
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=1.1,  # high temperature → less predictable phrasing
                    top_p=0.92,
                    max_tokens=2600,
                    timeout=60
                )
                current = response.choices[0].message.content.strip()
                success = True
                break
            except Exception as e:
                print(f"Stage {stage} - {p['name']} failed: {str(e)}")
                time.sleep(2)  # brief pause before trying the next provider
        if not success:
            # Rule-based fallback (always works)
            current = add_human_noise(current)
            current = statistical_humanizer(current)
    return current
def add_human_noise(text: str) -> str:
"""Zero-API fallback — injects real human imperfections"""
lines = text.split('\n')
result = []
fillers = ["tbh", "like", "honestly", "you know", "I mean", "kinda", "yaar", "bro", "idk"]
for line in lines:
if random.random() < 0.35:
filler = random.choice(fillers)
line = filler + ", " + line
if random.random() < 0.2:
words = line.split()
if len(words) > 3:
idx = random.randint(0, len(words)-2)
words.insert(idx+1, words[idx]) # repeat word
line = ' '.join(words)
result.append(line)
return '\n'.join(result)
def analyze_text(content: bytes, filename: str) -> dict:
    """Extract text from an uploaded file and run AI-text detection on it.

    Returns a camelCase report dict for the API layer; when the word
    count is out of the detector's 200-500 range, returns the detector's
    error message instead of a probability report.
    """
    text = extract_text(content, filename)
    detection = detect_ai_text(text)
    preview = text[:4000] + ("..." if len(text) > 4000 else "")
    metadata = {"filename": filename, "charCount": len(text)}
    # BUG FIX: detect_ai_text returns {"error", "word_count"} for
    # out-of-range input; indexing "ai_probability" then raised KeyError.
    # Surface the error to the caller instead of crashing.
    if "error" in detection:
        return {
            "originalText": preview,
            "error": detection["error"],
            "wordCount": detection.get("word_count", 0),
            "metadata": metadata,
        }
    return {
        "originalText": preview,
        "aiProbability": detection["ai_probability"],
        "humanProbability": detection.get("human_probability", 100 - detection["ai_probability"]),
        "verdict": detection["verdict"],
        "wordCount": detection["word_count"],
        "details": detection.get("details", {}),
        "metadata": metadata,
    }