File size: 11,956 Bytes
171eb01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
from rest_framework.views import APIView
from rest_framework.response import Response
from rest_framework import status
from django.conf import settings
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import os
import math
# Checkpoint: roberta-base, loaded with a masked-language-modelling head.
# Scoring premise used throughout this module: text the model finds easy to
# predict (low perplexity) is treated as more likely to be AI-generated.
MODEL_NAME = "roberta-base"
print(f"Loading Model: {MODEL_NAME}...")

try:
    # Run on the GPU when torch can see one, otherwise on the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print(f"Using Device: {device}")

    # The MaskedLM head exposes per-token logits, which the scoring functions
    # below turn into a loss / pseudo-perplexity.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
    model.to(device)
    model.eval()
    print("Model Loaded Successfully!")

except Exception as load_error:
    # Keep the module importable when loading fails; the views check these
    # two handles and answer accordingly instead of crashing at import time.
    print(f"Error loading model: {load_error}")
    model = tokenizer = None

import time

def calculate_token_scores(text, highlight_threshold=1.0, batch_size=4):
    """
    Scores each token by how easily the masked LM predicts it and returns the
    character spans of the most predictable ones.

    Every token between the first and last (special) tokens is replaced by the
    mask token in turn, one masked position per batch row, and the
    cross-entropy of the original token at that position is taken as its loss.
    Lower loss = more predictable = treated as more likely AI-generated.

    Args:
        text: Text to analyse. The tokenizer truncates to 512 tokens, so text
            past that point is never highlighted.
        highlight_threshold: Tokens with a loss strictly below this value are
            highlighted. The default of 1.0 is deliberately strict to avoid
            highlighting common human words.
        batch_size: Number of masked copies sent through the model per forward
            pass. The default of 4 was chosen to prevent OOM on a 4GB GPU.

    Returns:
        List of (start_char, end_char, loss) tuples in token order, where the
        offsets are plain Python ints indexing into ``text``. Empty when the
        model is not loaded, ``text`` is empty, or the text encodes to fewer
        than 4 tokens.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if not tokenizer or not model:
        return []
    if not text:
        return []
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    encodings = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        return_offsets_mapping=True,
    )
    input_ids = encodings.input_ids.to(device)  # [1, seq_len]
    # Plain Python ints, so callers can slice, sum and serialize the offsets
    # without numpy scalars leaking out of this function.
    offsets = encodings.offset_mapping[0].tolist()  # [[0, 0], [0, 3], [3, 4], ...]
    seq_len = input_ids.shape[1]

    if seq_len < 4:
        return []

    # Every position except the first and last one (CLS/SEP).
    positions_to_mask = list(range(1, seq_len - 1))
    token_losses = [0.0] * seq_len
    loss_fct = torch.nn.CrossEntropyLoss(reduction='none')

    if device.type == "cuda":
        torch.cuda.empty_cache()  # Clear cached GPU memory before the loop

    for batch_start in range(0, len(positions_to_mask), batch_size):
        batch_positions = positions_to_mask[batch_start:batch_start + batch_size]
        rows = torch.arange(len(batch_positions), device=device)
        cols = torch.tensor(batch_positions, device=device)

        # One copy of the sequence per masked position; row j masks cols[j].
        batch_input_ids = input_ids.repeat(len(batch_positions), 1)
        targets = batch_input_ids[rows, cols]  # advanced indexing copies
        batch_input_ids[rows, cols] = tokenizer.mask_token_id

        with torch.no_grad():
            logits = model(batch_input_ids).logits  # [B, L, V]

        # Only the masked position of each row carries a label, so pick those
        # logits directly instead of scoring the whole [B, L, V] tensor.
        losses = loss_fct(logits[rows, cols], targets)  # [B]

        # A single host transfer per batch instead of one .item() per token.
        for pos, value in zip(batch_positions, losses.tolist()):
            token_losses[pos] = value

    highlights = []
    for pos in positions_to_mask:
        start, end = offsets[pos]
        # Zero-width offsets belong to special tokens; nothing to highlight.
        if start == end:
            continue
        if token_losses[pos] < highlight_threshold:
            highlights.append((start, end, token_losses[pos]))

    return highlights

def calculate_perplexity(text, stride=3, batch_size=8):
    """
    Calculates Pseudo-Perplexity (PPL) for Masked Language Models (like RoBERTa).
    Formula: PPL = exp( -1/N * sum( log(P(w_i | context)) ) )

    Tokens are masked one at a time (one masked position per batch row) and
    the cross-entropy of the original token at the masked position is
    averaged over all scored positions.

    Args:
        text: Text to score. The tokenizer truncates to 512 tokens.
        stride: Score every ``stride``-th token between the first and last
            (special) tokens. 1 scores every token (slowest, most accurate);
            the default of 3 is about 3x faster and a good approximation.
        batch_size: Number of masked copies per forward pass (default 8).

    Returns:
        The pseudo-perplexity as a float, or the default 100.0 when the text
        encodes to fewer than 4 tokens.

    Raises:
        RuntimeError: If the model or tokenizer failed to load.
        ValueError: If ``stride`` or ``batch_size`` is smaller than 1.
    """
    if tokenizer is None or model is None:
        raise RuntimeError("Model not loaded")
    if stride < 1 or batch_size < 1:
        raise ValueError("stride and batch_size must be positive integers")

    t0 = time.time()

    # 1. Tokenize
    encodings = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    input_ids = encodings.input_ids.to(device)  # [1, seq_len]
    seq_len = input_ids.shape[1]

    # If sequence is too short, return default
    if seq_len < 4:
        return 100.0

    # 2. Positions to score: skip the first and last token (CLS/SEP).
    # seq_len >= 4 guarantees this list is non-empty.
    positions_to_mask = list(range(1, seq_len - 1, stride))
    loss_fct = torch.nn.CrossEntropyLoss(reduction='none')

    nlls = []  # Negative log-likelihoods, one tensor per batch
    for batch_start in range(0, len(positions_to_mask), batch_size):
        batch_positions = positions_to_mask[batch_start:batch_start + batch_size]
        rows = torch.arange(len(batch_positions), device=device)
        cols = torch.tensor(batch_positions, device=device)

        # One copy of the sequence per masked position; row j masks cols[j].
        batch_input_ids = input_ids.repeat(len(batch_positions), 1)
        targets = batch_input_ids[rows, cols]  # advanced indexing copies
        batch_input_ids[rows, cols] = tokenizer.mask_token_id

        with torch.no_grad():
            logits = model(batch_input_ids).logits  # [batch, seq_len, vocab]

        # Loss only at the masked position of each row.
        nlls.append(loss_fct(logits[rows, cols], targets))  # [batch]

    # Perplexity = exp(mean NLL)
    mean_nll = torch.cat(nlls).mean()
    ppl = torch.exp(mean_nll)

    print(f"Inference Time: {time.time() - t0:.2f}s")
    return ppl.item()

class AnalyzeView(APIView):
    """
    POST endpoint that scores a text for AI-likeness.

    Request: ``text`` in the request data, or an uploaded ``file`` containing
    UTF-8 text (used only when ``text`` is empty).
    Response: ``score`` (AI probability in percent), ``label``, ``perplexity``
    and the ``device`` the model runs on.
    Errors: 503 when the model is not loaded, 400 for missing or invalid
    input, 500 when scoring fails.
    """

    @staticmethod
    def _ai_probability(ppl):
        """
        Maps a pseudo-perplexity to an AI probability in percent.

        Sigmoid calibrated at 2.5: PPL < 2.5 leans AI, PPL > 2.5 leans human.
        Formula: 100 / (1 + exp(3 * (ppl - 2.5))), clamped to [2.5, 99.5].
        """
        try:
            raw_score = 100 / (1 + math.exp(3 * (ppl - 2.5)))
        except OverflowError:
            # math.exp overflows for very high perplexity (roughly above 239);
            # the sigmoid's limit there is 0.
            raw_score = 0.0
        return max(2.5, min(99.5, raw_score))

    def post(self, request):
        if not model or not tokenizer:
            return Response({"error": "Model not loaded"}, status=status.HTTP_503_SERVICE_UNAVAILABLE)

        text = request.data.get('text', '')
        if not text and 'file' in request.FILES:
            # Score the uploaded file's contents rather than a stand-in string.
            try:
                text = request.FILES['file'].read().decode('utf-8')
            except UnicodeDecodeError:
                return Response(
                    {"error": "Uploaded file must be UTF-8 encoded text"},
                    status=status.HTTP_400_BAD_REQUEST,
                )

        if not text:
            return Response({"error": "No text provided"}, status=status.HTTP_400_BAD_REQUEST)
        if not isinstance(text, str):
            # Reject e.g. JSON numbers or objects before they reach the tokenizer.
            return Response({"error": "Text must be a string"}, status=status.HTTP_400_BAD_REQUEST)

        try:
            ppl = calculate_perplexity(text)
            ai_score = self._ai_probability(ppl)
            label = "AI Generated" if ai_score > 50 else "Human Written"

            return Response({
                "score": round(ai_score, 1),
                "label": label,
                "perplexity": round(ppl, 2),
                "device": str(device)
            })

        except Exception as e:
            return Response({"error": str(e)}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)

from django.http import FileResponse
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_JUSTIFY
import io

class ReportView(APIView):
    """
    POST endpoint that renders a PDF report for an analysed text.

    Request data: ``text`` (the analysed content) and ``score`` (the AI
    probability in percent, as supplied by the client).
    Response: a PDF attachment showing the score, the share of highlighted
    characters, and the text with highly predictable tokens highlighted in
    yellow. Errors: 400 for invalid input, 500 when generation fails.
    """

    def post(self, request):
        try:
            data = request.data
            text = data.get('text', '')
            if not isinstance(text, str):
                return Response({"error": "Text must be a string"}, status=status.HTTP_400_BAD_REQUEST)

            # The score comes from the client: coerce it to a number so the
            # comparison below cannot raise and no markup can be injected
            # into the paragraph through it.
            try:
                score = float(data.get('score', 0))
            except (TypeError, ValueError):
                return Response({"error": "Score must be a number"}, status=status.HTTP_400_BAD_REQUEST)

            # Granular Highlighting Logic
            # 1. Get highlight ranges, ordered by start index
            highlights = sorted(calculate_token_scores(text), key=lambda h: h[0])

            # 2. Construct the tagged string
            # IMPORTANT: Escape text to prevent XML errors in ReportLab
            import html

            print(f"Generating PDF for text length: {len(text)} with {len(highlights)} highlights.")

            parts = []
            cursor = 0
            highlighted_chars = 0
            for start, end, _loss in highlights:
                if start < cursor:
                    continue  # Skip overlaps

                # Plain text up to the highlight, then the highlight itself
                # wrapped in a yellow-background font tag.
                parts.append(html.escape(text[cursor:start]))
                parts.append(f'<font backColor="yellow">{html.escape(text[start:end])}</font>')

                # Count only spans that were actually rendered as highlights.
                highlighted_chars += end - start
                cursor = end

            # Remaining text after the last highlight
            parts.append(html.escape(text[cursor:]))

            # Newlines become explicit line breaks in the paragraph markup
            formatted_text = "".join(parts).replace('\n', '<br/>')

            # Create PDF
            buffer = io.BytesIO()
            doc = SimpleDocTemplate(buffer, pagesize=letter,
                                  rightMargin=72, leftMargin=72,
                                  topMargin=72, bottomMargin=18)

            styles = getSampleStyleSheet()
            styles.add(ParagraphStyle(name='Justify', alignment=TA_JUSTIFY))

            total_chars = len(text)
            highlight_ratio = (highlighted_chars / total_chars * 100) if total_chars > 0 else 0
            res_color = "red" if score > 50 else "green"

            story = [
                # Header
                Paragraph("DetectAI Analysis Report", styles["Heading1"]),
                Paragraph(f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}", styles["Normal"]),
                Spacer(1, 12),
                # Primary Metric: AI Probability (":g" keeps 87 as "87" and 87.3 as "87.3")
                Paragraph(f'AI Probability: <font color="{res_color}"><b>{score:g}%</b></font>', styles["Heading2"]),
                # Secondary Metric: Highlighted Content
                Paragraph(f'Highlighted Content: <b>{highlight_ratio:.1f}%</b>', styles["Normal"]),
                Spacer(1, 12),
                # Content Header
                Paragraph("Analyzed Content:", styles["Heading3"]),
                Spacer(1, 6),
                # The Content (Highlighted)
                Paragraph(formatted_text, styles["Justify"]),
            ]

            doc.build(story)

            buffer.seek(0)
            return FileResponse(buffer, as_attachment=True, filename='detectAI_report.pdf')

        except Exception as e:
            print(f"Report Generation Error: {e}")
            return Response({"error": str(e)}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)