Spaces:
Sleeping
Sleeping
import os
import sys

import docx
import nltk
import torch
from nltk.tokenize import sent_tokenize
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4
from reportlab.lib.utils import simpleSplit
from reportlab.pdfgen import canvas
from transformers import RobertaTokenizer, RobertaForSequenceClassification
# === NLTK data ===
# sent_tokenize() needs the Punkt sentence models. Older NLTK releases load the
# "punkt" resource, newer ones load "punkt_tab"; fetch both so the script works
# with either. Resources that are already present are not downloaded again.
nltk.download("punkt")
nltk.download("punkt_tab")

# === Load model ===
# Tokenizer and classifier weights are read from a local directory.
model_dir = "./models/roberta-detector"
tokenizer = RobertaTokenizer.from_pretrained(model_dir)
model = RobertaForSequenceClassification.from_pretrained(model_dir)
# Switch to evaluation mode and move the model to the GPU when one is available.
model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
# Remember where the weights ended up so that inputs can be sent to the same device.
device = next(model.parameters()).device
# === THRESHOLD CONFIG ===
# A sentence is flagged as AI-written when its AI probability is >= this value.
AI_THRESHOLD = 0.50  # Adjust this as needed for better results

# === Input File ===
# Fail early with a readable message instead of an IndexError (no argument)
# or a library traceback later on (path does not exist).
if len(sys.argv) < 2:
    sys.exit(f"Usage: python {os.path.basename(sys.argv[0])} <document.docx>")
filepath = sys.argv[1]
if not os.path.isfile(filepath):
    sys.exit(f"Input file not found: {filepath}")

# The report is named after the input file (without its extension) and is
# written to ./output_reports, which is created on demand.
filename = os.path.splitext(os.path.basename(filepath))[0]
output_dir = "output_reports"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, f"{filename}_report.pdf")
# === DOCX Reader ===
def read_docx_paragraphs(path):
    """Return the text of each paragraph of the .docx file at *path* as a list.

    Every entry of ``Document.paragraphs`` contributes one string; nothing is
    filtered out, so empty paragraphs appear as empty strings.
    """
    document = docx.Document(path)
    texts = []
    for entry in document.paragraphs:
        texts.append(entry.text)
    return texts


paragraphs = read_docx_paragraphs(filepath)
# === Detection Loop ===
def _ai_probability(sentence):
    """Return the classifier's softmax probability at class index 1 for *sentence*.

    The sentence is tokenized on its own (truncated to at most 512 tokens), run
    through the model without gradient tracking, and the logits are turned into
    probabilities with a softmax.

    NOTE(review): index 1 is assumed to be the "AI-written" label - confirm
    against the label mapping of the model stored in ``model_dir``.
    """
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    ).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
    return probs[1].item()


# One entry per paragraph: a list of (sentence, is_ai, ai_prob) tuples, or an
# empty list for a blank paragraph.
results = []
total_sentences = 0
ai_sentences = 0

for paragraph in paragraphs:
    if not paragraph.strip():
        results.append([])  # preserve spacing
        continue

    para_result = []
    for sentence in sent_tokenize(paragraph):
        ai_prob = _ai_probability(sentence)
        is_ai = ai_prob >= AI_THRESHOLD
        para_result.append((sentence, is_ai, ai_prob))

        total_sentences += 1
        if is_ai:
            ai_sentences += 1

        # Debugging (plain ASCII so it prints on any console encoding)
        print(f"[DEBUG] AI probability: {ai_prob:.2f} -> {'Highlight' if is_ai else 'Skip'}")

    results.append(para_result)

# Share of flagged sentences in percent; 0 when the document has no sentences.
ai_percent = round((ai_sentences / total_sentences) * 100, 2) if total_sentences else 0
# === PDF Writer ===
# Renders the report: a title, a one-line summary, then every sentence of the
# document, with sentences flagged as AI drawn on a cyan background.
c = canvas.Canvas(output_path, pagesize=A4)
width, height = A4
x, y = 40, height - 60  # left margin / current baseline (PDF origin is bottom-left)
line_height = 18
font_size = 12
max_text_width = width - 2 * x  # keep the right margin equal to the left one

# Title
# Plain text only: the built-in Helvetica font has no emoji glyphs.
c.setFont("Helvetica-Bold", 14)
c.drawString(x, y, f"AI Detection Report: {filename}")
y -= 25
c.setFont("Helvetica", 12)
c.drawString(x, y, f"AI Detected: {ai_percent}% of {total_sentences} sentences")
y -= 30
c.setFont("Helvetica", font_size)

# Body rendering
for para_result in results:
    if not para_result:
        # Blank paragraph in the source document: leave an empty line.
        y -= line_height
        continue

    for sentence, is_ai, _ai_prob in para_result:
        # drawString() never wraps, so split the sentence into lines that fit
        # between the margins and draw (and highlight) each line separately.
        for line in simpleSplit(sentence, "Helvetica", font_size, max_text_width):
            if y < 50:
                # Out of room: start a new page and restore the body font,
                # as the original code does after every showPage().
                c.showPage()
                y = height - 50
                c.setFont("Helvetica", font_size)

            if is_ai:
                # Paint the highlight first so the text is drawn on top of it.
                text_width = c.stringWidth(line, "Helvetica", font_size)
                c.setFillColor(colors.cyan)
                c.rect(x - 2, y - 4, text_width + 4, line_height + 2, fill=True, stroke=False)
                c.setFillColor(colors.black)

            c.drawString(x, y, line)
            y -= line_height

    y -= line_height  # spacing between paragraphs

c.save()
print(f"\nReport saved: {output_path}")