Spaces:

deveshpunjabi
/

OpenAudit

Running

App Files Files Community

OpenAudit / model_classifier.py

deveshpunjabi

Rename ai_text_detector.py to model_classifier.py

a6b6b2d verified 7 months ago

raw

history blame contribute delete

5.46 kB

	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	import torch
	import re
	from tokenizers import normalizers
	from tokenizers.normalizers import Sequence, Replace, Strip, NFKC
	from tokenizers import Regex
	import os

	# Run on the GPU when one is visible to torch, otherwise on the CPU.
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	# Download URLs of the fine-tuned state dicts for the three ensemble members.
	model1_path = "https://huggingface.co/spaces/SzegedAI/AI_Detector/resolve/main/modernbert.bin"
	model2_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12"
	model3_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"

	# Every ensemble member shares the same base checkpoint and label count.
	_BASE_CHECKPOINT = "answerdotai/ModernBERT-base"
	_NUM_LABELS = 41


	def _new_base_model():
	    """Instantiate the base checkpoint with a `_NUM_LABELS`-way classification head."""
	    return AutoModelForSequenceClassification.from_pretrained(_BASE_CHECKPOINT, num_labels=_NUM_LABELS)


	def _load_ensemble_member(number, source, weights_url):
	    """Build one ensemble member and try to load its fine-tuned weights.

	    Args:
	        number: Ordinal of the model (1, 2 or 3); used only in log messages.
	        source: Human-readable origin of the weights; used only in log messages.
	        weights_url: URL of the state dict to download and load.

	    Returns:
	        A ``(model, loaded)`` tuple. ``model`` is on ``device`` and in eval
	        mode. ``loaded`` is False when any step failed and the model is the
	        base checkpoint without the fine-tuned weights (best-effort fallback).
	    """
	    try:
	        print(f"Loading model{number} from {source}...")
	        model = _new_base_model()
	        model.load_state_dict(torch.hub.load_state_dict_from_url(weights_url, map_location=device))
	        model.to(device).eval()
	        print(f"✅ Model {number} loaded successfully")
	        return model, True
	    except Exception as e:
	        # Deliberate best-effort: keep the app running, but report the failure.
	        print(f"⚠️ Could not load model {number}: {e}")
	        # Start again from a fresh instance so nothing from the failed attempt is reused.
	        model = _new_base_model()
	        model.to(device).eval()
	        return model, False


	print("Loading models...")

	tokenizer = AutoTokenizer.from_pretrained(_BASE_CHECKPOINT)

	model_1, _model_1_loaded = _load_ensemble_member(1, "SzegedAI/AI_Detector", model1_path)
	model_2, _model_2_loaded = _load_ensemble_member(2, "mihalykiss/modernbert_2", model2_path)
	model_3, _model_3_loaded = _load_ensemble_member(3, "mihalykiss/modernbert_2", model3_path)

	# Only claim success when every member actually received its fine-tuned weights.
	_missing_weights = [
	    str(number)
	    for number, loaded in ((1, _model_1_loaded), (2, _model_2_loaded), (3, _model_3_loaded))
	    if not loaded
	]
	if _missing_weights:
	    _missing_list = ", ".join(_missing_weights)
	    print(f"⚠️ Fine-tuned weights could not be loaded for model(s): {_missing_list}; using the base checkpoint for those")
	else:
	    print("✅ All models loaded successfully!")

	# Class index -> source label for the 41-way classifier output.
	# The position of each name in the list is its class index; index 24 is 'human'.
	label_mapping = dict(enumerate([
	    '13B', '30B', '65B', '7B', 'GLM130B', 'bloom_7b',
	    'bloomz', 'cohere', 'davinci', 'dolly', 'dolly-v2-12b',
	    'flan_t5_base', 'flan_t5_large', 'flan_t5_small',
	    'flan_t5_xl', 'flan_t5_xxl', 'gemma-7b-it', 'gemma2-9b-it',
	    'gpt-3.5-turbo', 'gpt-35', 'gpt4', 'gpt4o',
	    'gpt_j', 'gpt_neox', 'human', 'llama3-70b', 'llama3-8b',
	    'mixtral-8x7b', 'opt_1.3b', 'opt_125m', 'opt_13b',
	    'opt_2.7b', 'opt_30b', 'opt_350m', 'opt_6.7b',
	    'opt_iml_30b', 'opt_iml_max_1.3b', 't0_11b', 't0_3b',
	    'text-davinci-002', 'text-davinci-003',
	]))

	def clean_text(text: str) -> str:
	    """Tidy the whitespace of *text*.

	    Runs of two or more whitespace characters collapse to a single space,
	    then any whitespace directly in front of one of ``, . ; : ? !`` is
	    dropped so the punctuation sits against the preceding word.
	    """
	    collapsed = re.sub(r'\s{2,}', ' ', text)
	    return re.sub(r'\s+([,.;:?!])', r'\1', collapsed)

	# Extra normalization rules appended to the tokenizer's existing normalizer.
	# newline_to_space: a newline with one whitespace character on each side
	# is replaced by a single space.
	newline_to_space = Replace(Regex(r'\s\n\s'), " ")
	# join_hyphen_break: targets a word split by a hyphen at a line break
	# (word, hyphen, whitespace, newline, whitespace, word).
	# NOTE(review): the replacement is r"\1\2" - confirm that tokenizers' Replace
	# expands capture-group references instead of inserting that text literally.
	# NOTE(review): the class `[--]` as written contains only "-"; presumably one
	# member was meant to be a different dash character - TODO confirm.
	join_hyphen_break = Replace(Regex(r'(\w+)[--]\s\n\s(\w+)'), r"\1\2")

	# Order matters: the tokenizer's own normalizer runs first; join_hyphen_break
	# must come before newline_to_space because its pattern contains the same
	# whitespace-newline-whitespace sequence that newline_to_space rewrites.
	# Strip() runs last (library-default arguments).
	tokenizer.backend_tokenizer.normalizer = Sequence([
	tokenizer.backend_tokenizer.normalizer,
	join_hyphen_break,
	newline_to_space,
	Strip()
	])

	def classify_text(text):
	    """
	    Classify text as human-written or AI-generated using the ModernBERT ensemble.

	    The input is cleaned with ``clean_text``, tokenized, and scored by
	    ``model_1``, ``model_2`` and ``model_3``. The three softmax distributions
	    are averaged; the probability mass of every class except 'human' is the
	    AI probability, and the highest-scoring non-human class is reported as
	    the most likely source.

	    Author: deveshpunjabi
	    Date: 2025-01-15 07:07:03 UTC

	    Args:
	        text: Raw text to analyse. ``None``, empty and whitespace-only input
	            is rejected without running the models.

	    Returns:
	        A Markdown-formatted report string, or a prompt asking for input
	        when ``text`` is missing or blank.
	    """
	    # Guard first so None/blank input never reaches the regexes or the models.
	    if text is None or not text.strip():
	        return "⚠️ Please enter some text to analyze"

	    cleaned_text = clean_text(text)
	    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True).to(device)

	    with torch.no_grad():
	        logits_1 = model_1(**inputs).logits
	        logits_2 = model_2(**inputs).logits
	        logits_3 = model_3(**inputs).logits

	        softmax_1 = torch.softmax(logits_1, dim=1)
	        softmax_2 = torch.softmax(logits_2, dim=1)
	        softmax_3 = torch.softmax(logits_3, dim=1)

	    # Average the three class distributions; a single text means batch index 0.
	    averaged_probabilities = (softmax_1 + softmax_2 + softmax_3) / 3
	    probabilities = averaged_probabilities[0]

	    # Index of the 'human' entry in label_mapping.
	    human_index = 24

	    # Zero the human class so the remaining mass (and the argmax) covers AI sources only.
	    ai_probs = probabilities.clone()
	    ai_probs[human_index] = 0
	    ai_total_prob = ai_probs.sum().item() * 100
	    # Clamp: float rounding can push the AI total a hair above 100, which
	    # would otherwise be displayed as "-0.00%".
	    human_prob = max(0.0, 100 - ai_total_prob)

	    ai_argmax_index = torch.argmax(ai_probs).item()
	    ai_argmax_model = label_mapping[ai_argmax_index]

	    if human_prob > ai_total_prob:
	        result_message = f"""
	### 🟢 Human Written
	Confidence: {human_prob:.2f}%
	This text appears to be written by a human.
	---
	Analysis Details:
	- Human probability: {human_prob:.2f}%
	- AI probability: {ai_total_prob:.2f}%
	- Text length: {len(cleaned_text)} characters
	"""
	    else:
	        result_message = f"""
	### 🔴 AI Generated
	Confidence: {ai_total_prob:.2f}%
	Most likely source: {ai_argmax_model}
	This text appears to be generated by an AI model.
	---
	Analysis Details:
	- Human probability: {human_prob:.2f}%
	- AI probability: {ai_total_prob:.2f}%
	- Text length: {len(cleaned_text)} characters
	"""

	    return result_message