# StealthWriter / detector / detector.py
# Uploaded by AlyanAkram — commit a53dc0a ("Upload 11 files", verified)
import os
import sys
import torch
import docx
import nltk
from nltk.tokenize import sent_tokenize
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib import colors
# Sentence-tokenizer data used by sent_tokenize. Only go to the network when a
# resource is actually missing, so repeated/offline runs do not re-download.
# "punkt_tab" is what recent NLTK releases load; "punkt" is kept for older ones.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource)

# Load model
# The tokenizer and classifier are both read from the same local directory.
model_dir = "./models/roberta-detector"
tokenizer = RobertaTokenizer.from_pretrained(model_dir)
model = RobertaForSequenceClassification.from_pretrained(model_dir)
# Inference only: switch to eval mode and move to the GPU when one is available.
model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
# Read the device back from the weights so inputs can be moved to the same place.
device = next(model.parameters()).device
# === THRESHOLD CONFIG ===
# A sentence is flagged as AI when its AI probability is >= this value.
AI_THRESHOLD = 0.50  # Adjust this as needed for better results

# === Input File ===
# The document to analyse is the first command-line argument. Fail with a
# usage message instead of a bare IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit(f"Usage: python {os.path.basename(sys.argv[0])} <document.docx>")
filepath = sys.argv[1]
# Base name without extension; reused for the report file name and title.
filename = os.path.splitext(os.path.basename(filepath))[0]
output_dir = "output_reports"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, f"{filename}_report.pdf")
# === DOCX Reader ===
def read_docx_paragraphs(path):
    """Return the text of each paragraph of the .docx document at *path*.

    Paragraphs are returned in the order the document exposes them and none
    are filtered out here; callers decide how to treat blank ones.
    """
    return [block.text for block in docx.Document(path).paragraphs]


paragraphs = read_docx_paragraphs(filepath)
# === Detection Loop ===
# `results` gets one entry per input paragraph: a list of
# (sentence, is_ai, ai_prob) tuples, or an empty list for a blank paragraph.
results = []
total_sentences = 0
ai_sentences = 0

for paragraph in paragraphs:
    if not paragraph.strip():
        results.append([])  # preserve spacing
        continue

    para_result = []
    for sentence in sent_tokenize(paragraph):
        # Each sentence is classified on its own; anything beyond 512 tokens
        # is truncated by the tokenizer.
        inputs = tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        ).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
        # NOTE(review): assumes label index 1 is the "AI" class — confirm
        # against the model's label mapping.
        ai_prob = probs[1].item()
        is_ai = ai_prob >= AI_THRESHOLD
        para_result.append((sentence, is_ai, ai_prob))

        total_sentences += 1
        if is_ai:
            ai_sentences += 1

        # Debugging (the literal previously contained mis-encoded characters)
        print(f"[DEBUG] AI probability: {ai_prob:.2f} — {'✔ Highlight' if is_ai else '✘ Skip'}")

    results.append(para_result)

# Share of flagged sentences as a percentage; 0 when there were no sentences.
ai_percent = round((ai_sentences / total_sentences) * 100, 2) if total_sentences else 0
# === PDF Writer ===
# Renders the title, the summary line and then every sentence, drawing a cyan
# box behind the sentences that were flagged as AI.
from reportlab.lib.utils import simpleSplit  # used only for line wrapping below

c = canvas.Canvas(output_path, pagesize=A4)
width, height = A4
x, y = 40, height - 60
line_height = 18
font_size = 12
body_font = "Helvetica"
bottom_margin = 50
# Usable text width: the left margin is mirrored on the right.
max_text_width = width - 2 * x

# Title
# No emoji here: the built-in Helvetica font has no glyphs for them, so they
# would not render as intended in the PDF.
c.setFont("Helvetica-Bold", 14)
c.drawString(x, y, f"AI Detection Report: {filename}")
y -= 25

c.setFont("Helvetica", 12)
c.drawString(x, y, f"AI Detected: {ai_percent}% of {total_sentences} sentences")
y -= 30

c.setFont(body_font, font_size)

# Body rendering
for para_result in results:
    if not para_result:
        # Blank paragraph in the source document: keep the vertical gap.
        y -= line_height
        continue

    for sentence, is_ai, _ai_prob in para_result:
        # Every sentence starts on a new line; long sentences are wrapped so
        # they no longer run past the right edge of the page.
        for line in simpleSplit(sentence, body_font, font_size, max_text_width):
            if y < bottom_margin:
                c.showPage()
                y = height - 50
                # The font is set again at the start of each new page.
                c.setFont(body_font, font_size)

            if is_ai:
                # Highlight box sized to this line, drawn before the text.
                text_width = c.stringWidth(line, body_font, font_size)
                c.setFillColor(colors.cyan)
                c.rect(x - 2, y - 4, text_width + 4, line_height + 2, fill=True, stroke=False)
                c.setFillColor(colors.black)

            c.drawString(x, y, line)
            y -= line_height

    y -= line_height  # spacing between paragraphs

c.save()
print(f"\n✅ Report saved: {output_path}")