Spaces:

AlyanAkram
/

StealthWriter

Sleeping

App Files Files Community

StealthWriter / detector /detector.py

AlyanAkram

Upload 11 files

a53dc0a verified 8 months ago

raw

history blame contribute delete

3.23 kB

	import os
	import sys

	import docx
	import nltk
	import torch
	from nltk.tokenize import sent_tokenize
	from reportlab.lib import colors
	from reportlab.lib.pagesizes import A4
	from reportlab.lib.utils import simpleSplit
	from reportlab.pdfgen import canvas
	from transformers import RobertaTokenizer, RobertaForSequenceClassification

	# Download the NLTK "punkt" resource; sent_tokenize is used further down.
	nltk.download("punkt")

	# === Model setup ===
	# Tokenizer and classifier are both loaded from the same local directory.
	model_dir = "./models/roberta-detector"
	tokenizer = RobertaTokenizer.from_pretrained(model_dir)
	model = RobertaForSequenceClassification.from_pretrained(model_dir)

	# Evaluation mode first, then move to the GPU when one is available.
	model.eval()
	if torch.cuda.is_available():
	    model.to("cuda")
	else:
	    model.to("cpu")

	# Read the device back from the model so inputs can be sent to the same place.
	device = next(model.parameters()).device

	# === THRESHOLD CONFIG ===
	# A sentence is flagged when its AI probability is >= this value.
	# Adjust this as needed for better results.
	AI_THRESHOLD = 0.50

	# === Input File ===
	# Exit with a usage message instead of a bare IndexError when the path
	# argument is missing.
	if len(sys.argv) < 2:
	    sys.exit(f"Usage: python {os.path.basename(sys.argv[0])} <path-to-docx>")

	filepath = sys.argv[1]
	# The report is named after the input file, minus directory and extension.
	filename = os.path.splitext(os.path.basename(filepath))[0]
	output_dir = "output_reports"
	os.makedirs(output_dir, exist_ok=True)
	output_path = os.path.join(output_dir, f"{filename}_report.pdf")

	# === DOCX Reader ===
	def read_docx_paragraphs(path):
	    """Return the text of each paragraph of the DOCX at *path*, in order.

	    Nothing is filtered out, so empty paragraphs come back as empty strings.
	    """
	    texts = []
	    for para in docx.Document(path).paragraphs:
	        texts.append(para.text)
	    return texts

	# Pull every paragraph (blank ones included) out of the input document.
	paragraphs = read_docx_paragraphs(filepath)

	# === Detection Loop ===
	# `results` holds one list per paragraph; each entry is a
	# (sentence, is_ai, ai_prob) tuple. A blank paragraph contributes an
	# empty list so the report can preserve spacing.
	results = []
	total_sentences = 0
	ai_sentences = 0

	for paragraph in paragraphs:
	    para_result = []
	    # Whitespace-only paragraphs are not tokenized at all.
	    sentences = sent_tokenize(paragraph) if paragraph.strip() else []

	    for sentence in sentences:
	        encoded = tokenizer(
	            sentence,
	            return_tensors="pt",
	            truncation=True,
	            padding=True,
	            max_length=512,
	        ).to(device)

	        with torch.no_grad():
	            logits = model(**encoded).logits
	            # Row 0 is the single input sentence; index 1 of its softmax
	            # is the value compared against AI_THRESHOLD.
	            ai_prob = torch.nn.functional.softmax(logits, dim=-1)[0][1].item()

	        is_ai = ai_prob >= AI_THRESHOLD
	        para_result.append((sentence, is_ai, ai_prob))

	        total_sentences += 1
	        ai_sentences += int(is_ai)

	        # Debugging
	        print(f"[DEBUG] AI probability: {ai_prob:.2f} — {'✔ Highlight' if is_ai else '✘ Skip'}")

	    results.append(para_result)

	# Share of flagged sentences as a percentage; 0 when nothing was scored.
	if total_sentences:
	    ai_percent = round((ai_sentences / total_sentences) * 100, 2)
	else:
	    ai_percent = 0

	# === PDF Writer ===
	# Renders the report: a two-line header, then every sentence in document
	# order, with flagged sentences drawn on a cyan background.
	c = canvas.Canvas(output_path, pagesize=A4)
	width, height = A4
	x, y = 40, height - 60
	line_height = 18
	font_size = 12
	# Usable text width: the left margin `x` is mirrored on the right.
	max_text_width = width - 2 * x

	# Title
	# No emoji in these strings: the built-in Helvetica faces have no glyphs
	# for them.
	c.setFont("Helvetica-Bold", 14)
	c.drawString(x, y, f"AI Detection Report: {filename}")
	y -= 25
	c.setFont("Helvetica", 12)
	c.drawString(x, y, f"AI Detected: {ai_percent}% of {total_sentences} sentences")
	y -= 30
	c.setFont("Helvetica", font_size)

	# Body rendering
	for para_result in results:
	    if not para_result:
	        # Blank paragraph in the source document: leave one empty line.
	        y -= line_height
	        continue

	    for sentence, is_ai, _ai_prob in para_result:
	        # Wrap the sentence so it stays inside the right margin instead of
	        # running off the page; fall back to the raw sentence if the
	        # splitter returns nothing.
	        wrapped = simpleSplit(sentence, "Helvetica", font_size, max_text_width) or [sentence]

	        for line in wrapped:
	            # Start a new page when the cursor reaches the bottom margin.
	            # The font is set again after showPage().
	            if y < 50:
	                c.showPage()
	                y = height - 50
	                c.setFont("Helvetica", font_size)

	            if is_ai:
	                # Highlight box sized to this line's rendered width.
	                text_width = c.stringWidth(line, "Helvetica", font_size)
	                c.setFillColor(colors.cyan)
	                c.rect(x - 2, y - 4, text_width + 4, line_height + 2, fill=True, stroke=False)
	                c.setFillColor(colors.black)

	            c.drawString(x, y, line)
	            y -= line_height

	    y -= line_height  # spacing between paragraphs

	c.save()
	print(f"\n✅ Report saved: {output_path}")