# AI Text Detector — Hugging Face Space (ModernBERT ensemble)
# (Removed "Spaces: Running" status-banner residue from the web-page copy.)
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re

# Run on GPU when available; models and tokenized inputs are moved here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One shared tokenizer for every ensemble member (all are ModernBERT-base).
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

# Ensemble checkpoints on the Hugging Face Hub, one per training seed.
# NOTE(review): each entry looks like "<user>/<repo>/<subfolder>"; the
# weights are assumed to live at <user>/<repo>/resolve/main/<subfolder>/
# pytorch_model.bin. The original code put "/resolve/main/" AFTER the
# subfolder, which is not a valid Hub download path — confirm repo layout.
model_names = [
    "mihalykiss/modernbert_2/Model_groups_3class_seed12",
    "mihalykiss/modernbert_2/Model_groups_3class_seed22",
    "mihalykiss/modernbert_2/Model_groups_3class_seed32",  # third ensemble variant
]

models = []
for name in model_names:
    # Fresh ModernBERT backbone with a 41-way head (40 generator families
    # + 'human'; see label_mapping below).
    m = AutoModelForSequenceClassification.from_pretrained(
        "answerdotai/ModernBERT-base", num_labels=41
    )
    # Repo id is the first two path segments; the rest is a subfolder.
    parts = name.split("/")
    repo_id = "/".join(parts[:2])
    subpath = "/".join(parts[2:])
    url = f"https://huggingface.co/{repo_id}/resolve/main/{subpath}/pytorch_model.bin"
    # file_name must be unique per checkpoint: torch.hub caches downloads
    # by file name, and every checkpoint here is called pytorch_model.bin —
    # without it, the 2nd and 3rd loads silently reuse the 1st cached file.
    state = torch.hub.load_state_dict_from_url(
        url, map_location=device, file_name=f"{parts[-1]}.bin"
    )
    m.load_state_dict(state)
    m.to(device).eval()
    models.append(m)
# Index -> source label for the 41-way classifier head.
# Index 24 ('human') is the only non-AI class; every other index names a
# specific text-generation model family.
_LABEL_NAMES = [
    '13B', '30B', '65B', '7B', 'GLM130B', 'bloom_7b',
    'bloomz', 'cohere', 'davinci', 'dolly', 'dolly-v2-12b',
    'flan_t5_base', 'flan_t5_large', 'flan_t5_small',
    'flan_t5_xl', 'flan_t5_xxl', 'gemma-7b-it', 'gemma2-9b-it',
    'gpt-3.5-turbo', 'gpt-35', 'gpt4', 'gpt4o',
    'gpt_j', 'gpt_neox', 'human', 'llama3-70b', 'llama3-8b',
    'mixtral-8x7b', 'opt_1.3b', 'opt_125m', 'opt_13b',
    'opt_2.7b', 'opt_30b', 'opt_350m', 'opt_6.7b',
    'opt_iml_30b', 'opt_iml_max_1.3b', 't0_11b', 't0_3b',
    'text-davinci-002', 'text-davinci-003',
]
label_mapping = dict(enumerate(_LABEL_NAMES))
def clean_text(text: str) -> str:
    """Normalize whitespace in *text*.

    Collapses any run of 2+ whitespace characters to a single space,
    removes whitespace sitting before common punctuation, and strips
    leading/trailing whitespace.
    """
    collapsed = re.sub(r"\s{2,}", " ", text)
    tightened = re.sub(r"\s+([,.;:?!])", r"\1", collapsed)
    return tightened.strip()
def classify_text(text):
    """Classify pasted text as AI- vs human-written.

    Splits the input into sentences, scores each sentence with the model
    ensemble (averaged softmax over 41 classes), wraps each sentence in a
    highlight span, and appends an overall percentage verdict.

    Returns an HTML string, or a plain prompt message for empty input.
    """
    import html  # stdlib; escape user text before embedding it in HTML

    cleaned_text = clean_text(text)
    if not cleaned_text:
        return "Please paste some text."

    # Naive sentence split on terminal punctuation followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', cleaned_text)

    highlighted = []
    total_ai, total_human = 0, 0
    for sent in sentences:
        if not sent.strip():
            continue
        inputs = tokenizer(sent, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            # Ensemble: average the per-model softmax distributions.
            probs_list = [torch.softmax(m(**inputs).logits, dim=1) for m in models]
        avg_probs = sum(probs_list) / len(probs_list)
        probs = avg_probs[0]

        # Index 24 is the 'human' class; all remaining probability mass
        # counts as "AI".
        ai_probs = probs.clone()
        ai_probs[24] = 0
        ai_score = ai_probs.sum().item() * 100
        human_score = 100 - ai_score
        total_ai += ai_score
        total_human += human_score

        # Escape user text so '<', '&', quotes etc. cannot break the output
        # markup or inject HTML into the page.
        safe_sent = html.escape(sent)
        if ai_score > 20:  # threshold for flagging a sentence as AI-like
            highlighted.append(f"<span class='highlight-ai'>{safe_sent}</span>")
        else:
            highlighted.append(f"<span class='highlight-human'>{safe_sent}</span>")

    # Defensive: avoid a zero division in the verdict if no sentence
    # survived the loop (should not happen once cleaned_text is non-empty).
    if not highlighted:
        return "Please paste some text."

    # Per-sentence ai+human scores each sum to 100, so this is the mean
    # sentence score expressed as a percentage.
    if total_human >= total_ai:
        verdict = f"<br><br><b>Overall: {total_human/(total_ai+total_human)*100:.2f}% Human</b>"
    else:
        verdict = f"<br><br><b>Overall: {total_ai/(total_ai+total_human)*100:.2f}% AI</b>"
    return " ".join(highlighted) + verdict
# --- Gradio UI -------------------------------------------------------------
# NOTE(review): the highlight-ai / highlight-human CSS classes emitted by
# classify_text are not styled anywhere in this file — presumably themed
# elsewhere; confirm.
iface = gr.Interface(
    fn=classify_text,
    inputs=gr.Textbox(lines=6, placeholder="Paste text here..."),
    outputs="html",
    title="AI Text Detector",
    description=(
        "Detects AI-generated text using ModernBERT ensemble and highlights "
        "AI-like vs Human-like sentences."
    ),
)
iface.launch()