Spaces:
Sleeping
Sleeping
| import os | |
| import torch | |
| from pathlib import Path | |
| from nltk.tokenize import sent_tokenize | |
| from transformers import RobertaTokenizer, RobertaForSequenceClassification | |
| from reportlab.lib.pagesizes import A4 | |
| from reportlab.pdfgen import canvas | |
| from reportlab.lib import colors | |
| from io import BytesIO | |
| import nltk | |
# === Environment Setup ===
# Point the Hugging Face and NLTK caches at directories under /tmp.
# NOTE(review): `transformers` is already imported at the top of this file, so
# these environment variables may be set too late to influence the cache
# location it picked at import time. They are kept for anything else that
# reads them, and the cache directory is also passed explicitly to
# `from_pretrained` below so the Hub download does not depend on import order.
HF_CACHE_DIR = "/tmp/hf_home"
NLTK_DATA_DIR = "/tmp/nltk_data"
os.environ["HF_HOME"] = HF_CACHE_DIR
os.environ["TRANSFORMERS_CACHE"] = HF_CACHE_DIR
os.environ["NLTK_DATA"] = NLTK_DATA_DIR
nltk.data.path.append(NLTK_DATA_DIR)

# === Model Source (Hugging Face or Local) ===
# USE_HF_MODEL=1 selects the Hub checkpoint; anything else uses the local copy.
USE_HF_MODEL = os.getenv("USE_HF_MODEL") == "1"
hf_token = os.getenv("HF_TOKEN")  # None when unset
MODEL_PATH = "AlyanAkram/stealth-roberta" if USE_HF_MODEL else "./detector/models/roberta-detector"

# NOTE(review): the first character of both log messages below looks like a
# mis-encoded emoji; the strings are left exactly as found.
if USE_HF_MODEL:
    print("π Loading model from Hugging Face Hub...")
    tokenizer = RobertaTokenizer.from_pretrained(
        MODEL_PATH, token=hf_token, cache_dir=HF_CACHE_DIR
    )
    model = RobertaForSequenceClassification.from_pretrained(
        MODEL_PATH, token=hf_token, cache_dir=HF_CACHE_DIR
    )
else:
    print("π Loading model from local files...")
    tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
    model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, local_files_only=True)

# Inference only: switch to eval mode and move to the GPU when one is available.
model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
# Read the device back from the model so inputs are sent to the same place.
device = next(model.parameters()).device

# === Constants ===
# A sentence is flagged as AI-written when its class-1 probability is >= this.
AI_THRESHOLD = 0.5
# === Main Analysis Function ===
def analyze_text(text: str):
    """Score every sentence of *text* with the module-level classifier.

    The text is split on newlines; each non-blank line is treated as one
    paragraph and split into sentences with ``sent_tokenize``. Each sentence
    is classified on its own, and the softmax probability at index 1 is used
    as its "AI" score (assumes class 1 is the AI label of the loaded model).

    Returns a dict with:
        overall_ai_percent: share of flagged sentences, rounded to 2 places
            (``0`` when there are no sentences).
        total_sentences: number of sentences scored.
        ai_sentences: number of sentences with score >= ``AI_THRESHOLD``.
        results: one list per paragraph of ``(sentence, is_ai, ai_prob)``.
    """
    per_paragraph = []
    flagged = 0
    seen = 0

    for raw_line in text.split("\n"):
        paragraph = raw_line.strip()
        if not paragraph:
            # Blank lines only separate paragraphs.
            continue

        scored = []
        for sentence in sent_tokenize(paragraph):
            encoded = tokenizer(
                sentence,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512,
            ).to(device)
            with torch.no_grad():
                logits = model(**encoded).logits
                # Batch of one: row 0 holds this sentence's class probabilities.
                class_probs = torch.nn.functional.softmax(logits, dim=-1)[0]
                ai_prob = class_probs[1].item()

            is_ai = ai_prob >= AI_THRESHOLD
            scored.append((sentence, is_ai, ai_prob))
            seen += 1
            flagged += 1 if is_ai else 0

        per_paragraph.append(scored)

    if seen:
        overall = round((flagged / seen) * 100, 2)
    else:
        overall = 0

    return {
        "overall_ai_percent": overall,
        "total_sentences": seen,
        "ai_sentences": flagged,
        "results": per_paragraph,
    }
# === PDF Report Generator (In-Memory) ===
def _wrap_words(sentence, max_width, measure):
    """Greedily break *sentence* into lines that fit within *max_width*.

    *measure* maps a candidate line to its rendered width. A single word that
    is wider than *max_width* is kept on a line of its own rather than split,
    and no empty lines are ever produced.
    """
    lines = []
    current = ""
    for word in sentence.split():
        candidate = f"{current} {word}".strip()
        # Only break when there is something to flush; otherwise an oversized
        # first word would emit a blank line before itself.
        if current and measure(candidate) > max_width:
            lines.append(current)
            current = word
        else:
            current = candidate
    if current:
        lines.append(current)
    return lines


def generate_pdf_report(results: dict, filename: str) -> BytesIO:
    """Render analysis results as an A4 PDF and return it as an in-memory buffer.

    Args:
        results: dict shaped like the return value of ``analyze_text``; the
            keys ``overall_ai_percent``, ``total_sentences`` and ``results``
            are read. ``results["results"]`` is a list of paragraphs, each a
            list of ``(sentence, is_ai, score)`` tuples (the score is unused).
        filename: name shown in the report title.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the finished PDF.

    Sentences are word-wrapped to the page width; every line of a sentence
    flagged as AI gets a cyan background. One blank line separates paragraphs.
    """
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=A4)
    width, height = A4
    x, y = 40, height - 60
    line_height, font_size = 18, 12
    body_font = "Helvetica"
    bottom_margin = 50
    max_text_width = width - 80  # 40pt margin on each side

    def measure(text):
        return c.stringWidth(text, body_font, font_size)

    # The original heading strings began with mis-encoded emoji remnants; they
    # are dropped because the built-in Helvetica face has no emoji glyphs.
    c.setFont("Helvetica-Bold", 14)
    c.drawString(x, y, f"AI Detection Report: {filename}")
    y -= 25
    c.setFont(body_font, font_size)
    c.drawString(x, y, f"AI Detected: {results['overall_ai_percent']}% of {results['total_sentences']} sentences")
    y -= 30

    for para_result in results["results"]:
        for sentence, is_ai, _ in para_result:
            for line in _wrap_words(sentence, max_text_width, measure):
                # Check per line, not per sentence, so a long wrapped sentence
                # cannot run off the bottom of the page.
                if y < bottom_margin:
                    c.showPage()
                    # A new page starts from the canvas defaults; select the
                    # body font again explicitly.
                    c.setFont(body_font, font_size)
                    y = height - 50
                if is_ai:
                    # Highlight behind the text, then restore black for the text.
                    c.setFillColor(colors.cyan)
                    c.rect(x - 2, y - 4, measure(line) + 4, line_height + 2, fill=True, stroke=False)
                    c.setFillColor(colors.black)
                c.drawString(x, y, line)
                y -= line_height
        # Extra gap between paragraphs.
        y -= line_height

    c.save()
    buffer.seek(0)
    return buffer