VictorM-Coder committed on
Commit
ceeca7d
·
verified ·
1 Parent(s): 54bfac3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +162 -96
app.py CHANGED
@@ -1,102 +1,168 @@
1
- import gradio as gr
 
 
 
2
  import torch
 
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
  import re
 
 
 
 
 
 
 
 
 
5
 
6
- # Use GPU if available
7
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
8
-
9
- # One tokenizer shared across models
10
- tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
11
-
12
- # Ensemble model repos (replace with real Hugging Face repos if names differ)
13
- model_names = [
14
- "mihalykiss/modernbert_2_seed12",
15
- "mihalykiss/modernbert_2_seed22",
16
- "mihalykiss/modernbert_2_seed32"
17
- ]
18
-
19
- # Load models directly from Hugging Face
20
- models = []
21
- for repo in model_names:
22
- m = AutoModelForSequenceClassification.from_pretrained(repo).to(device).eval()
23
- models.append(m)
24
-
25
- # Label map
26
- label_mapping = {
27
- 0: '13B', 1: '30B', 2: '65B', 3: '7B', 4: 'GLM130B', 5: 'bloom_7b',
28
- 6: 'bloomz', 7: 'cohere', 8: 'davinci', 9: 'dolly', 10: 'dolly-v2-12b',
29
- 11: 'flan_t5_base', 12: 'flan_t5_large', 13: 'flan_t5_small',
30
- 14: 'flan_t5_xl', 15: 'flan_t5_xxl', 16: 'gemma-7b-it', 17: 'gemma2-9b-it',
31
- 18: 'gpt-3.5-turbo', 19: 'gpt-35', 20: 'gpt4', 21: 'gpt4o',
32
- 22: 'gpt_j', 23: 'gpt_neox', 24: 'human', 25: 'llama3-70b', 26: 'llama3-8b',
33
- 27: 'mixtral-8x7b', 28: 'opt_1.3b', 29: 'opt_125m', 30: 'opt_13b',
34
- 31: 'opt_2.7b', 32: 'opt_30b', 33: 'opt_350m', 34: 'opt_6.7b',
35
- 35: 'opt_iml_30b', 36: 'opt_iml_max_1.3b', 37: 't0_11b', 38: 't0_3b',
36
- 39: 'text-davinci-002', 40: 'text-davinci-003'
37
- }
38
-
39
- # Text cleanup
40
- def clean_text(text: str) -> str:
41
- text = re.sub(r"\s{2,}", " ", text)
42
- text = re.sub(r"\s+([,.;:?!])", r"\1", text)
43
- return text.strip()
44
-
45
- # Classification function
46
- def classify_text(text):
47
- cleaned_text = clean_text(text)
48
- if not cleaned_text:
49
- return "Please paste some text."
50
-
51
- sentences = re.split(r'(?<=[.!?])\s+', cleaned_text)
52
-
53
- highlighted = []
54
- total_ai, total_human = 0, 0
55
-
56
- for sent in sentences:
57
- if not sent.strip():
58
- continue
59
-
60
- inputs = tokenizer(sent, return_tensors="pt", truncation=True, padding=True).to(device)
61
- with torch.no_grad():
62
- probs_list = []
63
- for m in models:
64
- logits = m(**inputs).logits
65
- probs_list.append(torch.softmax(logits, dim=1))
66
- avg_probs = sum(probs_list) / len(probs_list)
67
- probs = avg_probs[0]
68
-
69
- # Human class = 24, AI = all others
70
- ai_probs = probs.clone()
71
- ai_probs[24] = 0
72
- ai_score = ai_probs.sum().item() * 100
73
- human_score = 100 - ai_score
74
-
75
- total_ai += ai_score
76
- total_human += human_score
77
-
78
- if ai_score > 20:
79
- highlighted.append(f"<span class='highlight-ai'>{sent}</span>")
80
- else:
81
- highlighted.append(f"<span class='highlight-human'>{sent}</span>")
82
-
83
- # Global verdict
84
- if total_human >= total_ai:
85
- verdict = f"<br><br><b>Overall: {(total_human/(total_ai+total_human))*100:.2f}% Human</b>"
86
- else:
87
- verdict = f"<br><br><b>Overall: {(total_ai/(total_ai+total_human))*100:.2f}% AI</b>"
88
-
89
- return " ".join(highlighted) + verdict
90
-
91
- # Gradio interface with styling
92
- iface = gr.Interface(
93
- fn=classify_text,
94
- inputs=gr.Textbox(lines=6, placeholder="Paste text here..."),
95
- outputs="html",
96
- title="AI Text Detector",
97
- description="Detects AI-generated text using a ModernBERT ensemble. Sentences are highlighted:<br>"
98
- "<span style='color:#FF5733;font-weight:bold;'>AI-like</span> vs "
99
- "<span style='color:#4CAF50;font-weight:bold;'>Human-like</span>."
100
  )
101
 
102
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import ast
4
+ import streamlit as st
5
  import torch
6
+ import torch.nn.functional as F
7
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
8
  import re
9
+ import math
10
+ import logging
11
+ import pandas as pd
12
+
13
+ st.set_page_config(
14
+ page_title="AI Article Detection by Writenix",
15
+ page_icon="🧠",
16
+ layout="wide"
17
+ )
18
 
19
+ st.logo(
20
+ image="https://dejan.ai/wp-content/uploads/2024/02/dejan-300x103.png",
21
+ link="https://dejan.ai/",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  )
23
 
24
+ # --- Load heuristic weights from environment secrets, with JSON→Python fallback ---
25
@st.cache_resource
def load_heuristic_weights():
    """Load the AI and original-text heuristic weight tables from the environment.

    Reads the ``AI_WEIGHTS_JSON`` and ``OG_WEIGHTS_JSON`` environment
    variables. Each value is parsed as JSON first; if that fails it is parsed
    as a Python literal via ``ast.literal_eval`` (literals only, no code
    execution), which tolerates e.g. single-quoted dict syntax.

    Returns:
        tuple: ``(ai_weights, og_weights)``, both dicts.

    Raises:
        KeyError: a required environment variable is not set.
        ValueError: a value is neither valid JSON nor a Python literal.
        TypeError: a value parses, but not to a dict.
    """
    def _load(env_key):
        raw = os.environ.get(env_key)
        if raw is None:
            raise KeyError(f"Missing required environment variable: {env_key}")
        try:
            weights = json.loads(raw)
        except json.JSONDecodeError:
            try:
                weights = ast.literal_eval(raw)
            except (ValueError, SyntaxError, TypeError) as err:
                # Do not echo `raw` in the message: it comes from a secret.
                raise ValueError(
                    f"{env_key} is neither valid JSON nor a Python literal"
                ) from err
        # Callers look tokens up with `.get`, so anything but a dict would
        # only fail later, at classification time, with an obscure error.
        if not isinstance(weights, dict):
            raise TypeError(
                f"{env_key} must decode to a dict, got {type(weights).__name__}"
            )
        return weights

    ai = _load("AI_WEIGHTS_JSON")
    og = _load("OG_WEIGHTS_JSON")
    return ai, og
36
+
37
+ AI_WEIGHTS, OG_WEIGHTS = load_heuristic_weights()
38
SIGMOID_K = 0.5


def tokenize(text):
    """Return the lowercased runs of two or more ASCII letters found in *text*."""
    return re.findall(r'\b[a-z]{2,}\b', text.lower())


def classify_text_likelihood(text: str, ai_weights=None, og_weights=None, k=None) -> float:
    """Score *text* with the token-weight heuristic and return a value in [0, 1].

    Every token found in either weight table adds its AI weight and its
    original-text weight to two running totals; the difference of the totals
    is squashed through a logistic function with steepness ``k``.

    Args:
        text: Text to score.
        ai_weights: Token -> weight table for AI-like tokens. Defaults to the
            module-level ``AI_WEIGHTS``.
        og_weights: Token -> weight table for original-text tokens. Defaults
            to the module-level ``OG_WEIGHTS``.
        k: Logistic steepness. Defaults to the module-level ``SIGMOID_K``.

    Returns:
        0.5 when the text has no tokens or none of them carries a weight,
        otherwise ``sigmoid(k * (ai_total - og_total))``.
    """
    if ai_weights is None:
        ai_weights = AI_WEIGHTS
    if og_weights is None:
        og_weights = OG_WEIGHTS
    if k is None:
        k = SIGMOID_K

    tokens = tokenize(text)
    if not tokens:
        return 0.5

    ai_score = og_score = matched = 0
    for t in tokens:
        aw = ai_weights.get(t, 0)
        ow = og_weights.get(t, 0)
        if aw or ow:
            matched += 1
            ai_score += aw
            og_score += ow
    if matched == 0:
        return 0.5

    x = k * (ai_score - og_score)
    # Numerically stable logistic: only ever exponentiate a non-positive
    # value, so a strongly negative score underflows to 0.0 instead of
    # raising OverflowError from math.exp.
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    e = math.exp(x)
    return e / (1 + e)
59
+
60
+ # --- Logging & Streamlit setup ---
61
+ logging.basicConfig(level=logging.INFO)
62
+ logger = logging.getLogger(__name__)
63
+
64
+ st.markdown("""
65
+ <link href="https://fonts.googleapis.com/css2?family=Roboto&display=swap" rel="stylesheet">
66
+ <style>
67
+ html, body, [class*="css"] {
68
+ font-family: 'Roboto', sans-serif;
69
+ }
70
+ </style>
71
+ """, unsafe_allow_html=True)
72
+
73
@st.cache_resource
def load_model_and_tokenizer(model_name):
    """Load the tokenizer and sequence-classification model for *model_name*.

    The model is placed on CUDA when available, otherwise on CPU. Weights are
    loaded as bfloat16 only on a CUDA device that reports bf16 support, and as
    float32 in every other case. The model is switched to eval mode.

    Returns:
        tuple: ``(tokenizer, model, device)``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_bf16 = device.type == "cuda" and torch.cuda.is_bf16_supported()
    dtype = torch.bfloat16 if use_bf16 else torch.float32

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        torch_dtype=dtype,
    )
    model.to(device).eval()
    return tokenizer, model, device
81
+
82
+ MODEL_NAME = "dejanseo/ai-cop"
83
+ try:
84
+ tokenizer, model, device = load_model_and_tokenizer(MODEL_NAME)
85
+ except Exception as e:
86
+ st.error(f"Error loading model: {e}")
87
+ logger.error(f"Failed to load model: {e}", exc_info=True)
88
+ st.stop()
89
+
90
def sent_tokenize(text):
    """Split *text* into sentences at whitespace that follows '.', '!' or '?'.

    Surrounding whitespace is stripped first and empty pieces are dropped, so
    blank input yields an empty list.
    """
    pieces = re.split(r'(?<=[\.!?])\s+', text.strip())
    return list(filter(None, pieces))
92
+
93
# --- Main page: text input, per-sentence classification, combined score ---
import html  # imported here so this section stands alone; escapes user text for HTML output

st.title("AI Article Detection")

text = st.text_area("Enter text to classify", height=200, placeholder="Paste your text here…")

if st.button("Classify", type="primary"):
    if not text.strip():
        st.warning("Please enter some text.")
    else:
        with st.spinner("Analyzing…"):
            sentences = sent_tokenize(text)
            if not sentences:
                st.warning("No sentences detected.")
                st.stop()

            # All sentences are classified in one padded batch.
            inputs = tokenizer(
                sentences,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=model.config.max_position_embeddings,
            ).to(device)

            with torch.no_grad():
                logits = model(**inputs).logits
                # The model may have been loaded in bfloat16; do the softmax
                # in float32 so the displayed probabilities are not quantised.
                probs = F.softmax(logits.float(), dim=-1).cpu()
                preds = torch.argmax(probs, dim=-1)

            # One table row and one coloured span per sentence.
            sentences_data = []
            highlighted_sentences = []
            for i, s in enumerate(sentences):
                p = preds[i].item()
                conf = probs[i, p].item()
                # Class index 0 is reported as "AI", any other index as "Human".
                label = "AI" if p == 0 else "Human"

                sentences_data.append({
                    "sentence": s,
                    "classification": label,
                    "confidence": conf,
                })

                # The span is rendered with unsafe_allow_html=True, so the
                # user-supplied sentence must be escaped or pasted markup
                # would be injected into the page.
                colour = "red" if label == "AI" else "green"
                highlighted_sentences.append(
                    f"<span style='color:{colour}; font-weight:bold'>{html.escape(s)}</span>"
                )

            # Display dataframe
            df = pd.DataFrame(sentences_data)
            st.dataframe(
                df,
                column_config={
                    "sentence": st.column_config.TextColumn("Sentence"),
                    "classification": st.column_config.TextColumn("Classification"),
                    "confidence": st.column_config.ProgressColumn(
                        "Confidence",
                        help="Model's confidence in the classification",
                        format="%.2f",
                        min_value=0,
                        max_value=1,
                    ),
                },
                hide_index=True,
            )

            # Highlighted text output
            st.markdown("### 🔍 Highlighted Text")
            st.markdown(" ".join(highlighted_sentences), unsafe_allow_html=True)

            # Document-level scores: mean AI probability over sentences plus
            # the token-weight heuristic computed on the full text.
            avg = torch.mean(probs, dim=0)
            model_ai = avg[0].item()
            heuristic_ai = classify_text_likelihood(text)
            # NOTE(review): the two scores are summed and clamped, and the
            # heuristic returns 0.5 when no weighted token matches, so the
            # result reaches 100% whenever model_ai >= 0.5. Confirm that an
            # additive combination (rather than an average) is intended.
            combined = min(model_ai + heuristic_ai, 1.0)

            st.subheader(f"⚖️ AI Likelihood: {combined*100:.1f}%")
            st.write(f"🤖 Model: {model_ai*100:.1f}%")
            st.write(f"🛠️ Heuristic: {heuristic_ai*100:.1f}%")