import os import sys import torch import docx import nltk from nltk.tokenize import sent_tokenize from transformers import RobertaTokenizer, RobertaForSequenceClassification from reportlab.lib.pagesizes import A4 from reportlab.pdfgen import canvas from reportlab.lib import colors nltk.download("punkt") # Load model model_dir = "./models/roberta-detector" tokenizer = RobertaTokenizer.from_pretrained(model_dir) model = RobertaForSequenceClassification.from_pretrained(model_dir) model.eval().to("cuda" if torch.cuda.is_available() else "cpu") device = next(model.parameters()).device # === THRESHOLD CONFIG === AI_THRESHOLD = 0.50 # Adjust this as needed for better results # === Input File === filepath = sys.argv[1] filename = os.path.splitext(os.path.basename(filepath))[0] output_dir = "output_reports" os.makedirs(output_dir, exist_ok=True) output_path = os.path.join(output_dir, f"{filename}_report.pdf") # === DOCX Reader === def read_docx_paragraphs(path): doc = docx.Document(path) return [para.text for para in doc.paragraphs] paragraphs = read_docx_paragraphs(filepath) # === Detection Loop === results = [] total_sentences = 0 ai_sentences = 0 for paragraph in paragraphs: if not paragraph.strip(): results.append([]) # preserve spacing continue sentences = sent_tokenize(paragraph) para_result = [] for sentence in sentences: inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device) with torch.no_grad(): outputs = model(**inputs) probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0] ai_prob = probs[1].item() is_ai = ai_prob >= AI_THRESHOLD para_result.append((sentence, is_ai, ai_prob)) total_sentences += 1 if is_ai: ai_sentences += 1 # Debugging print(f"[DEBUG] AI probability: {ai_prob:.2f} — {'āœ” Highlight' if is_ai else '✘ Skip'}") results.append(para_result) ai_percent = round((ai_sentences / total_sentences) * 100, 2) if total_sentences else 0 # === PDF Writer === c = canvas.Canvas(output_path, pagesize=A4) width, height = A4 x, y = 40, height - 60 line_height = 18 font_size = 12 # Title c.setFont("Helvetica-Bold", 14) c.drawString(x, y, f"šŸ“„ AI Detection Report: {filename}") y -= 25 c.setFont("Helvetica", 12) c.drawString(x, y, f"🧠 AI Detected: {ai_percent}% of {total_sentences} sentences") y -= 30 c.setFont("Helvetica", font_size) # Body rendering for para_result in results: if not para_result: y -= line_height continue for sentence, is_ai, ai_prob in para_result: if y < 50: c.showPage() y = height - 50 c.setFont("Helvetica", font_size) if is_ai: text_width = c.stringWidth(sentence, "Helvetica", font_size) c.setFillColor(colors.cyan) c.rect(x - 2, y - 4, text_width + 4, line_height + 2, fill=True, stroke=False) c.setFillColor(colors.black) c.drawString(x, y, sentence) y -= line_height y -= line_height # spacing between paragraphs c.save() print(f"\nāœ… Report saved: {output_path}")