File size: 4,428 Bytes
a53dc0a
 
 
 
 
 
 
 
6480e0c
de65627
a53dc0a
7c4688e
de65627
 
 
 
a53dc0a
7c4688e
a53dc0a
7c4688e
 
a53dc0a
 
7c4688e
a53dc0a
 
 
7c4688e
a53dc0a
 
 
de65627
 
a53dc0a
7c4688e
a53dc0a
de65627
7c4688e
a53dc0a
 
 
de65627
a53dc0a
 
 
de65627
a53dc0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de65627
a53dc0a
 
 
 
 
03180f1
 
 
 
 
a53dc0a
 
de65627
a53dc0a
 
 
 
de65627
a53dc0a
 
 
 
 
 
 
 
 
03180f1
 
 
 
a53dc0a
 
de65627
a53dc0a
de65627
03180f1
a53dc0a
 
03180f1
a53dc0a
 
 
 
 
 
 
 
 
 
03180f1
a53dc0a
 
 
 
029cea2
a53dc0a
 
03180f1
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import os
import torch
from pathlib import Path
from nltk.tokenize import sent_tokenize
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib import colors
from io import BytesIO
import nltk

# === Environment Setup ===
# Point the Hugging Face and NLTK caches at directories under /tmp and make
# NLTK search the same directory for its data files.
# NOTE(review): these variables are assigned after `transformers` and `nltk`
# have already been imported at the top of the file — confirm the libraries
# still pick them up at this point.
os.environ.update(
    {
        "HF_HOME": "/tmp/hf_home",
        "TRANSFORMERS_CACHE": "/tmp/hf_home",
        "NLTK_DATA": "/tmp/nltk_data",
    }
)
nltk.data.path.append("/tmp/nltk_data")

# === Model Source (Hugging Face or Local) ===
# USE_HF_MODEL=1 selects the Hub checkpoint (optionally authenticated with
# HF_TOKEN); any other value loads the checkpoint from the local directory.
USE_HF_MODEL = os.environ.get("USE_HF_MODEL") == "1"
hf_token = os.environ.get("HF_TOKEN")

if USE_HF_MODEL:
    MODEL_PATH = "AlyanAkram/stealth-roberta"
    print("πŸ” Loading model from Hugging Face Hub...")
    _load_kwargs = {"token": hf_token}
else:
    MODEL_PATH = "./detector/models/roberta-detector"
    print("πŸ“ Loading model from local files...")
    _load_kwargs = {"local_files_only": True}

# Tokenizer first, then the classifier, both from the same source.
tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, **_load_kwargs)
model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, **_load_kwargs)

# Switch to inference mode and move to the GPU when one is available.
model.eval()
model.to("cuda" if torch.cuda.is_available() else "cpu")
# Read the device back from the weights so inputs can be sent to the same place.
device = next(model.parameters()).device

# === Constants ===
# A sentence is flagged as AI when its class-1 probability is >= this value.
AI_THRESHOLD = 0.5

# === Main Analysis Function ===
def analyze_text(text: str):
    """Score every sentence of *text* with the loaded classifier.

    The text is split on newlines; each non-blank, stripped line is treated
    as a paragraph and broken into sentences with ``sent_tokenize``. Each
    sentence is run through the model on its own and the softmax probability
    at index 1 is compared against ``AI_THRESHOLD``.
    (Presumably index 1 is the "AI-written" label — confirm against the
    checkpoint's label mapping.)

    Returns a dict with:
        overall_ai_percent: percentage of flagged sentences, rounded to two
            decimals (``0`` when there are no sentences).
        total_sentences: number of sentences scored.
        ai_sentences: number of sentences at or above the threshold.
        results: one list per paragraph of ``(sentence, is_ai, ai_prob)``
            tuples, in document order.
    """
    per_paragraph = []

    for raw_line in text.split("\n"):
        paragraph = raw_line.strip()
        if not paragraph:
            continue

        scored = []
        for sentence in sent_tokenize(paragraph):
            encoded = tokenizer(
                sentence,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512,
            ).to(device)

            # Inference only: no gradients needed.
            with torch.no_grad():
                logits = model(**encoded).logits
                class_probs = torch.nn.functional.softmax(logits, dim=-1)[0]
                ai_prob = class_probs[1].item()

            scored.append((sentence, ai_prob >= AI_THRESHOLD, ai_prob))

        per_paragraph.append(scored)

    # Derive the tallies from the collected tuples instead of running counters.
    flags = [flagged for scored in per_paragraph for _, flagged, _ in scored]
    total_sentences = len(flags)
    ai_count = sum(flags)

    if total_sentences:
        overall = round((ai_count / total_sentences) * 100, 2)
    else:
        overall = 0

    return {
        "overall_ai_percent": overall,
        "total_sentences": total_sentences,
        "ai_sentences": ai_count,
        "results": per_paragraph
    }

# === PDF Report Generator (In-Memory) ===
def _wrap_sentence(c, sentence, font_name, font_size, max_width):
    """Greedily split *sentence* into lines that fit within *max_width* points.

    Widths are measured with ``c.stringWidth`` for the given font. A single
    word wider than *max_width* is placed on a line of its own rather than
    being preceded by an empty line.
    """
    lines = []
    current_line = ""
    for word in sentence.split():
        candidate = f"{current_line} {word}".strip()
        # Only break when there is something to flush; otherwise an overlong
        # first word would emit a blank line.
        if current_line and c.stringWidth(candidate, font_name, font_size) > max_width:
            lines.append(current_line)
            current_line = word
        else:
            current_line = candidate
    if current_line:
        lines.append(current_line)
    return lines


def generate_pdf_report(results: dict, filename: str) -> BytesIO:
    """Render an analysis result as an A4 PDF held in memory.

    Args:
        results: dict providing ``overall_ai_percent``, ``total_sentences``
            and ``results`` (a list of paragraphs, each a list of
            ``(sentence, is_ai, score)`` tuples).
        filename: name shown in the report title.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the finished PDF.

    Sentences are word-wrapped to the page width; lines belonging to
    sentences flagged ``is_ai`` get a cyan background. A page break is
    checked before every drawn line so long sentences cannot run past the
    bottom margin.
    """
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=A4)
    width, height = A4
    x, y = 40, height - 60
    line_height, font_size = 18, 12
    body_font = "Helvetica"
    max_text_width = width - 80  # 40pt margin on each side
    bottom_margin = 50

    # NOTE(review): the leading symbols in the two header strings may not be
    # representable in the built-in Helvetica fonts — confirm how they render.
    c.setFont("Helvetica-Bold", 14)
    c.drawString(x, y, f"πŸ“„ AI Detection Report: {filename}")
    y -= 25
    c.setFont(body_font, font_size)
    c.drawString(x, y, f"🧠 AI Detected: {results['overall_ai_percent']}% of {results['total_sentences']} sentences")
    y -= 30

    for para_result in results["results"]:
        for sentence, is_ai, _ in para_result:
            wrapped = _wrap_sentence(c, sentence.strip(), body_font, font_size, max_text_width)
            for line in wrapped:
                if y < bottom_margin:
                    c.showPage()
                    # Select the body font again explicitly instead of
                    # relying on whatever the new page starts with.
                    c.setFont(body_font, font_size)
                    y = height - 50

                if is_ai:
                    # Paint the highlight first so the text is drawn on top.
                    c.setFillColor(colors.cyan)
                    c.rect(
                        x - 2,
                        y - 4,
                        c.stringWidth(line, body_font, font_size) + 4,
                        line_height + 2,
                        fill=True,
                        stroke=False,
                    )
                    c.setFillColor(colors.black)
                c.drawString(x, y, line)
                y -= line_height

        # Blank line between paragraphs.
        y -= line_height

    c.save()
    buffer.seek(0)
    return buffer