|
|
import os |
|
|
import tempfile |
|
|
from pathlib import Path |
|
|
from typing import List |
|
|
import numpy as np |
|
|
|
|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import torch |
|
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification |
|
|
import docx |
|
|
import matplotlib.pyplot as plt |
|
|
|
|
|
try: |
|
|
import fitz |
|
|
except ImportError as e: |
|
|
raise ImportError("Missing dependency: PyMuPDF") from e |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Runtime configuration -------------------------------------------------
# Disable the HF tokenizers thread pool (avoids fork warnings / CPU contention).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Keep CPU usage modest; this app is inference-only, so gradients are
# globally disabled as well.
torch.set_num_threads(2)
torch.set_grad_enabled(False)

# Detector checkpoint (per its name: RoBERTa-base fine-tuned as an
# OpenAI-output detector).
MODEL_NAME = "openai-community/roberta-base-openai-detector"
# Probability at/above which a chunk is labeled "Likely AI".
AI_THRESHOLD = 0.5
# Token truncation length per chunk fed to the model.
MAX_LENGTH = 256
# Number of chunks scored per forward pass.
BATCH_SIZE = 8
# CPU-only target (e.g. free Hugging Face Spaces hardware).
DEVICE = "cpu"

# Lowercase extensions accepted by load_text_from_file.
SUPPORTED_EXTENSIONS = {".txt", ".pdf", ".docx"}

# Load tokenizer and model once at import time; eval mode for inference.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to(DEVICE)
model.eval()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_text_from_file(file_path: str, extensions=None) -> str:
    """Extract plain text from a .txt, .pdf, or .docx file.

    Args:
        file_path: Path to the document on disk.
        extensions: Optional override for the set of accepted (lowercase)
            extensions; defaults to the module-level SUPPORTED_EXTENSIONS.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the file extension is not supported.
    """
    path = Path(file_path)
    # Normalize the suffix once so uppercase extensions (e.g. ".TXT") work.
    # The original lowercased only in the validation check but compared the
    # raw suffix in the dispatch below, silently returning None for ".TXT".
    suffix = path.suffix.lower()
    allowed = SUPPORTED_EXTENSIONS if extensions is None else extensions

    if suffix not in allowed:
        raise ValueError(f"Unsupported file type: {path.suffix}")

    if suffix == ".txt":
        return path.read_text(encoding="utf-8", errors="ignore")

    if suffix == ".pdf":
        pages = []
        with fitz.open(path) as pdf:
            for page in pdf:
                pages.append(page.get_text())
        return "\n".join(pages)

    if suffix == ".docx":
        document = docx.Document(path)
        return "\n".join(p.text for p in document.paragraphs if p.text.strip())

    # Defensive: reachable only if `allowed` lists an extension that has no
    # extractor branch above.
    raise ValueError(f"No extractor for file type: {path.suffix}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def chunk_text(text: str, max_words: int = 200) -> List[str]:
    """Split whitespace-separated text into chunks of at most ``max_words``.

    Args:
        text: Raw input text.
        max_words: Maximum number of words per chunk.

    Returns:
        List of chunk strings; fragments shorter than 20 words are dropped
        (too short to score reliably), so very short inputs yield [].
    """
    words = text.split()
    pieces = (
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    )
    # Keep only fragments of at least 20 words.
    return [piece for piece in pieces if len(piece.split()) >= 20]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def calibrate_confidence(prob: float, threshold=None) -> str:
    """Map a probability to a confidence label by distance from the threshold.

    Args:
        prob: Model probability in [0, 1].
        threshold: Decision threshold to measure distance from; defaults to
            the module-level AI_THRESHOLD (generalized from the original
            hard-coded constant, backward compatible).

    Returns:
        "High" when the probability is at least 0.35 from the threshold,
        "Medium" when at least 0.15, otherwise "Low".
    """
    if threshold is None:
        threshold = AI_THRESHOLD
    distance = abs(prob - threshold)
    if distance >= 0.35:
        return "High"
    if distance >= 0.15:
        return "Medium"
    return "Low"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@torch.no_grad()
def detect_ai_probability(texts: List[str], progress=gr.Progress()):
    """Score each text with the detector model in fixed-size batches.

    Args:
        texts: Text chunks to score.
        progress: Gradio progress tracker, updated per batch.

    Returns:
        List of floats, one probability per input text (same order).
    """
    probabilities = []
    total = len(texts)

    for start in range(0, total, BATCH_SIZE):
        progress((start, total))
        batch = texts[start:start + BATCH_SIZE]

        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH,
        )
        # Keep inputs on the same device as the model. The original omitted
        # this and only worked because DEVICE is "cpu" (tokenizer tensors
        # default to CPU); it would break if DEVICE were ever changed.
        inputs = inputs.to(DEVICE)

        logits = model(**inputs).logits
        # NOTE(review): column 1 is assumed to be the "AI-generated" class
        # for this checkpoint — confirm against model.config.id2label.
        probs = torch.softmax(logits, dim=1)[:, 1]
        probabilities.extend(probs.tolist())

    progress((total, total))
    return probabilities
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def classify_chunks(chunks: List[str], progress=gr.Progress()) -> pd.DataFrame:
    """Build the per-chunk results table.

    Args:
        chunks: Text chunks to classify.
        progress: Gradio progress tracker, forwarded to the scorer.

    Returns:
        DataFrame with one row per chunk: the chunk text, its AI probability
        as a percentage, a prediction label, and a confidence label.
    """
    scores = detect_ai_probability(chunks, progress)

    percentages = [round(score * 100, 2) for score in scores]
    labels = [
        "🤖 Likely AI" if score >= AI_THRESHOLD else "🧍 Human"
        for score in scores
    ]
    confidences = [calibrate_confidence(score) for score in scores]

    return pd.DataFrame({
        "Text Chunk": chunks,
        "AI Probability (%)": percentages,
        "Prediction": labels,
        "Confidence": confidences,
    })
|
|
|
|
|
|
|
|
def document_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Append a whole-document verdict row to the per-chunk results.

    The document is labeled "Likely AI" (with "High" confidence) when at
    least 60% of the chunks carry a "High" confidence label; otherwise it is
    labeled "Human" with "Medium" confidence. The summary probability is the
    mean of the per-chunk percentages.

    Args:
        df: Per-chunk results with "Text Chunk", "AI Probability (%)",
            "Prediction", and "Confidence" columns.

    Returns:
        A new DataFrame: the input rows followed by one summary row.
    """
    high_conf = df[df["Confidence"] == "High"]
    majority_high = len(high_conf) >= len(df) * 0.6
    mean_prob = df["AI Probability (%)"].mean()

    verdict = {
        "Text Chunk": "📄 Document Summary",
        "AI Probability (%)": round(mean_prob, 2),
        "Prediction": "🤖 Likely AI" if majority_high else "🧍 Human",
        "Confidence": "High" if majority_high else "Medium",
    }
    return pd.concat([df, pd.DataFrame([verdict])], ignore_index=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_gauge(prob_percent: float, prediction: str) -> str:
    """Render a semicircular gauge for the document-level AI probability.

    Args:
        prob_percent: Probability in [0, 100] shown by the needle and label.
        prediction: Prediction text drawn under the percentage.

    Returns:
        Filesystem path of the saved PNG image.
    """
    fig, ax = plt.subplots(figsize=(6, 3))

    # 100 sample angles from pi (left edge, 0%) down to 0 (right edge, 100%).
    arc = np.linspace(np.pi, 0, 100)

    # Faint full-width arc drawn beneath the colored band.
    ax.plot(np.cos(arc), np.sin(arc), linewidth=20, alpha=0.15)

    # Colored band: green below 40%, orange below 70%, red above.
    for idx, pct in enumerate(np.linspace(0, 100, 99)):
        segment_color = "green" if pct < 40 else ("orange" if pct < 70 else "red")
        ax.plot(
            np.cos(arc[idx:idx + 2]),
            np.sin(arc[idx:idx + 2]),
            linewidth=20,
            color=segment_color,
        )

    # Needle from the center toward prob_percent on the arc.
    theta = np.pi * (1 - prob_percent / 100)
    ax.plot(
        [0, 0.8 * np.cos(theta)],
        [0, 0.8 * np.sin(theta)],
        linewidth=4,
    )

    # Percentage readout and prediction caption below the gauge center.
    ax.text(0, -0.1, f"{prob_percent:.0f}%", ha="center", va="center", fontsize=24, weight="bold")
    ax.text(0, -0.32, prediction, ha="center", va="center", fontsize=12)

    ax.set_aspect("equal")
    ax.axis("off")

    # delete=False: the file must outlive this function so Gradio can serve it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        fig.savefig(tmp.name, bbox_inches="tight", dpi=150)
        output_path = tmp.name

    plt.close(fig)
    return output_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_detector(text_input: str, uploaded_files, progress=gr.Progress()):
    """Top-level Gradio handler: collect text, chunk, classify, and plot.

    Args:
        text_input: Pasted text from the textbox (may be None or empty).
        uploaded_files: Uploaded file objects from gr.File, or None.
        progress: Gradio progress tracker (injected by the framework).

    Returns:
        Tuple of (results DataFrame, gauge image path), or an error
        DataFrame and None when there is nothing usable to analyze.
    """
    texts = []

    # Gradio can deliver None (not just "") for an untouched textbox; the
    # original crashed with AttributeError on .strip() in that case.
    if text_input and text_input.strip():
        texts.append(text_input.strip())

    if uploaded_files:
        for file in uploaded_files:
            texts.append(load_text_from_file(file.name))

    if not texts:
        return pd.DataFrame({"Error": ["No input provided"]}), None

    chunks = []
    for text in texts:
        chunks.extend(chunk_text(text))

    # chunk_text drops fragments under 20 words, so short inputs yield nothing.
    if not chunks:
        return pd.DataFrame({"Error": ["Text too short for analysis"]}), None

    df = classify_chunks(chunks, progress)
    final_df = document_summary(df)

    # The appended summary row drives the gauge visualization.
    summary_row = final_df[final_df["Text Chunk"] == "📄 Document Summary"].iloc[0]
    gauge_path = generate_gauge(
        summary_row["AI Probability (%)"],
        summary_row["Prediction"]
    )

    return final_df, gauge_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks(title="🧪 Offline AI Document Detector") as app:
    gr.Markdown("## 🧪 Offline AI Document Detector")
    gr.Markdown(
        "Detect whether content is AI-generated using an **offline, open-source model**. "
        "Supports **PDF, DOCX, TXT, and pasted text**. Optimized for **CPU-only Hugging Face Spaces**."
    )

    # Free-form text entry; analyzed alongside any uploaded files.
    text_input = gr.Textbox(
        lines=6,
        label="✍️ Paste Text (optional)"
    )

    # Multiple uploads allowed; the accepted types mirror SUPPORTED_EXTENSIONS.
    file_input = gr.File(
        label="📂 Upload Documents",
        file_types=[".pdf", ".docx", ".txt"],
        file_count="multiple"
    )

    analyze_btn = gr.Button("🔍 Analyze")
    output_table = gr.Dataframe(label="📊 Detection Results")
    gauge_plot = gr.Image(label="🧠 AI Probability Gauge")

    # run_detector returns (results DataFrame, gauge image path).
    analyze_btn.click(
        fn=run_detector,
        inputs=[text_input, file_input],
        outputs=[output_table, gauge_plot]
    )


if __name__ == "__main__":
    app.launch()
|
|
|