Spaces:
Sleeping
Sleeping
Upload 11 files
Browse files- .gitignore +43 -0
- auth.py +37 -0
- detector/create_dataset.py +45 -0
- detector/custom_model.py +149 -0
- detector/detector.py +112 -0
- detector/preprocess.py +47 -0
- detector/requirements.txt +14 -0
- detector/train_model.py +79 -0
- detector/utils.py +21 -0
- main.py +116 -0
- requirements.txt +101 -0
.gitignore
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python artifacts
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*.pyo
|
| 5 |
+
*.pyd
|
| 6 |
+
*.so
|
| 7 |
+
*.egg-info/
|
| 8 |
+
|
| 9 |
+
# Virtual environments
|
| 10 |
+
venv/
|
| 11 |
+
env/
|
| 12 |
+
.venv/
|
| 13 |
+
|
| 14 |
+
# IDE/editor files
|
| 15 |
+
.vscode/
|
| 16 |
+
.idea/
|
| 17 |
+
*.log
|
| 18 |
+
.DS_Store
|
| 19 |
+
Thumbs.db
|
| 20 |
+
|
| 21 |
+
# Model weights and training artifacts
|
| 22 |
+
detector/models/
|
| 23 |
+
detector/training_data/
|
| 24 |
+
detector/output_reports/
|
| 25 |
+
test_files/
|
| 26 |
+
reports/
|
| 27 |
+
*.safetensors
|
| 28 |
+
*.bin
|
| 29 |
+
*.pt
|
| 30 |
+
*.ckpt
|
| 31 |
+
|
| 32 |
+
# Data files
|
| 33 |
+
ai_training_dataset.json
|
| 34 |
+
|
| 35 |
+
# Optional build/deploy stuff
|
| 36 |
+
build/
|
| 37 |
+
tmp/
|
| 38 |
+
.cache/
|
| 39 |
+
|
| 40 |
+
# Environment and config
|
| 41 |
+
.env
|
| 42 |
+
*.env
|
| 43 |
+
.env.*
|
auth.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Supabase-backed authentication routes (signup / login) for the API."""

import os

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from supabase import create_client, Client

# SECURITY: credentials belong in the environment, not in source control.
# The literal fallbacks preserve behavior for existing deployments; set
# SUPABASE_URL / SUPABASE_KEY in the environment and delete the literals.
SUPABASE_URL = os.getenv("SUPABASE_URL", "https://ylyxgffttgvvjyrfovpl.supabase.co")
SUPABASE_KEY = os.getenv(
    "SUPABASE_KEY",
    "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InlseXhnZmZ0dGd2dmp5cmZvdnBsIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NTA0MTc4NDcsImV4cCI6MjA2NTk5Mzg0N30.a6-biroEh-TNTS8E_uAYYt7mgdY2A-xexjCzYp1MsuI",
)

supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

router = APIRouter()


class UserAuth(BaseModel):
    """Credentials payload shared by the /signup and /login endpoints."""

    email: str
    password: str


@router.post("/signup")
def signup(user: UserAuth):
    """Create a Supabase account; responds 400 with the provider error text."""
    try:
        result = supabase.auth.sign_up({
            "email": user.email,
            "password": user.password
        })
        return {"message": "Signup successful", "user": result.user}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))


@router.post("/login")
def login(user: UserAuth):
    """Authenticate and return an access token plus the user id.

    On any failure it responds 401 with a generic message — the provider
    error is deliberately not echoed back, so the endpoint does not leak
    whether the email address exists.
    """
    try:
        result = supabase.auth.sign_in_with_password({
            "email": user.email,
            "password": user.password
        })
        return {"access_token": result.session.access_token, "user_id": result.user.id}
    except Exception:
        raise HTTPException(status_code=401, detail="Invalid credentials")
detector/create_dataset.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Build ai_training_dataset.json from labelled source documents.

Walks training_data/<label>/ folders, extracts paragraphs from every
.pdf / .docx file, and emits one {"text", "label"} record per paragraph.
"""
import os
import json

from preprocess import extract_paragraphs

# Folder name (lower-cased) -> integer class label.
LABELS = {
    "ai": 1,
    "human": 0,
    "mixed": 2,  # You can change to 2 if doing 3-class classification
}

root_dir = "training_data"
dataset = []

for label_folder in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, label_folder)
    if not os.path.isdir(folder_path):
        continue

    label = LABELS.get(label_folder.lower())
    if label is None:
        # Folder name doesn't map to a known class — skip it.
        continue

    for filename in os.listdir(folder_path):
        # Only document formats the extractor understands.
        if not filename.endswith((".pdf", ".docx")):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"📄 Extracting: {file_path}")
        try:
            for para in extract_paragraphs(file_path):
                cleaned = para.strip()
                if cleaned:
                    dataset.append({"text": cleaned, "label": label})
        except Exception as e:
            print(f"❌ Failed: {file_path} — {str(e)}")

# Persist the collected samples for train_model.py.
with open("ai_training_dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=2, ensure_ascii=False)

print(f"\n✅ Saved {len(dataset)} samples.")
detector/custom_model.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import torch
import nltk
from pathlib import Path
from nltk.tokenize import sent_tokenize
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib import colors

# Sentence-splitter data used by analyze_text().
nltk.download("punkt")

# Model source is switched by env var: the Hugging Face Hub checkpoint in
# production (Render), a local checkpoint during development.
USE_HF_MODEL = os.getenv("USE_HF_MODEL") == "1"

if USE_HF_MODEL:
    from huggingface_hub import login

    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    MODEL_PATH = "AlyanAkram/stealth-roberta"
    tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, token=hf_token)
    model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, token=hf_token)
else:
    MODEL_PATH = "./detector/models/roberta-detector"
    tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
    model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, local_files_only=True)

# Inference only — run on GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval().to(device)


# === AI classification threshold ===
# Softmax probability above which a sentence counts as AI-generated.
AI_THRESHOLD = 0.5

# === Report directory ===
# PDF reports land in <project root>/reports, served statically by main.py.
REPORT_DIR = Path(__file__).resolve().parent.parent / "reports"
REPORT_DIR.mkdir(exist_ok=True)
def analyze_text(text: str):
    """Score every sentence of *text* as AI- or human-written.

    The text is split on newlines into paragraphs (blank lines dropped),
    each paragraph into sentences, and each sentence is classified by the
    RoBERTa model loaded at module level.

    Returns a dict with the overall AI percentage, sentence counts, and a
    per-paragraph list of (sentence, is_ai, ai_probability) tuples.
    """
    per_paragraph = []
    flagged = 0
    seen = 0

    # Only non-empty paragraphs survive the filter, so every entry in
    # per_paragraph corresponds to real text.
    for block in (p.strip() for p in text.split("\n") if p.strip()):
        scored = []
        for sent in sent_tokenize(block):
            encoded = tokenizer(
                sent, return_tensors="pt", truncation=True, padding=True, max_length=512
            ).to(device)

            with torch.no_grad():
                logits = model(**encoded).logits
                # Class index 1 is the "AI" probability after softmax.
                prob_ai = torch.nn.functional.softmax(logits, dim=-1)[0][1].item()

            flag = prob_ai >= AI_THRESHOLD
            scored.append((sent, flag, prob_ai))

            seen += 1
            if flag:
                flagged += 1

        per_paragraph.append(scored)

    percent = round((flagged / seen) * 100, 2) if seen else 0

    return {
        "overall_ai_percent": percent,
        "total_sentences": seen,
        "ai_sentences": flagged,
        "results": per_paragraph,
    }
def generate_pdf_report(results: dict, filename: str) -> str:
    """Render *results* (from analyze_text) to a PDF and return its file name.

    AI-flagged sentences are drawn over a cyan highlight; long sentences
    are greedily word-wrapped to the printable page width.

    BUG FIX: the report path previously ignored the *filename* argument
    (it was a fixed literal), so every upload overwrote the same PDF; the
    path is now "<filename>.pdf". Emoji were also removed from the drawn
    header strings, since ReportLab's built-in Helvetica font only covers
    Latin-1 and cannot encode them.
    """
    pdf_path = REPORT_DIR / f"{filename}.pdf"

    c = canvas.Canvas(str(pdf_path), pagesize=A4)
    width, height = A4
    x, y = 40, height - 60
    line_height = 18
    font_size = 12

    # Header: title plus the overall score summary.
    c.setFont("Helvetica-Bold", 14)
    c.drawString(x, y, f"AI Detection Report: {filename}")
    y -= 25
    c.setFont("Helvetica", 12)
    c.drawString(x, y, f"AI Detected: {results['overall_ai_percent']}% of {results['total_sentences']} sentences")
    y -= 30
    c.setFont("Helvetica", font_size)

    for para_result in results["results"]:
        if not para_result:
            y -= line_height  # preserve blank-paragraph spacing
            continue

        for sentence, is_ai, _ in para_result:
            if y < 50:  # start a new page near the bottom margin
                c.showPage()
                y = height - 50
                c.setFont("Helvetica", font_size)

            sentence = sentence.strip()
            if not sentence:
                continue

            # Greedy word-wrap against the printable width.
            max_width = width - 80
            words = sentence.split()
            current_line = ""
            for word in words:
                test_line = current_line + " " + word if current_line else word
                if c.stringWidth(test_line, "Helvetica", font_size) > max_width:
                    if is_ai:
                        _highlight_line(c, x, y, current_line, font_size, line_height)
                    c.drawString(x, y, current_line)
                    y -= line_height
                    current_line = word
                else:
                    current_line = test_line

            if current_line:
                if y < 50:
                    c.showPage()
                    y = height - 50
                    c.setFont("Helvetica", font_size)

                if is_ai:
                    _highlight_line(c, x, y, current_line, font_size, line_height)
                c.drawString(x, y, current_line)
                y -= line_height

        y -= line_height  # paragraph spacing

    c.save()
    return f"{filename}.pdf"


def _highlight_line(c, x, y, line, font_size, line_height):
    """Draw the cyan background rectangle behind one wrapped text line."""
    text_width = c.stringWidth(line, "Helvetica", font_size)
    c.setFillColor(colors.cyan)
    c.rect(x - 2, y - 4, text_width + 4, line_height + 2, fill=True, stroke=False)
    c.setFillColor(colors.black)
detector/detector.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Standalone CLI detector: score a .docx file sentence-by-sentence with the
local RoBERTa classifier and write a PDF report where AI-flagged sentences
are highlighted in cyan.

Usage: python detector.py <document.docx>
"""
import os
import sys
import torch
import docx
import nltk
from nltk.tokenize import sent_tokenize
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib import colors

nltk.download("punkt")

# Load model (inference only, GPU if available)
model_dir = "./models/roberta-detector"
tokenizer = RobertaTokenizer.from_pretrained(model_dir)
model = RobertaForSequenceClassification.from_pretrained(model_dir)
model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
device = next(model.parameters()).device

# === THRESHOLD CONFIG ===
AI_THRESHOLD = 0.50  # softmax probability above which a sentence is flagged

# === Input File ===
if len(sys.argv) < 2:
    # Fail with a usage hint instead of an uncaught IndexError.
    sys.exit("usage: python detector.py <document.docx>")
filepath = sys.argv[1]
filename = os.path.splitext(os.path.basename(filepath))[0]
output_dir = "output_reports"
os.makedirs(output_dir, exist_ok=True)
# BUG FIX: the report name previously ignored the input file name, so every
# run overwrote the same PDF; it is now derived from the input.
output_path = os.path.join(output_dir, f"{filename}_report.pdf")

# === DOCX Reader ===
def read_docx_paragraphs(path):
    """Return every paragraph's text, including empty ones (keeps spacing)."""
    doc = docx.Document(path)
    return [para.text for para in doc.paragraphs]

paragraphs = read_docx_paragraphs(filepath)

# === Detection Loop ===
results = []
total_sentences = 0
ai_sentences = 0

for paragraph in paragraphs:
    if not paragraph.strip():
        results.append([])  # preserve spacing
        continue

    sentences = sent_tokenize(paragraph)
    para_result = []

    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
            ai_prob = probs[1].item()  # class 1 == AI

        is_ai = ai_prob >= AI_THRESHOLD
        para_result.append((sentence, is_ai, ai_prob))

        total_sentences += 1
        if is_ai:
            ai_sentences += 1

        # Debugging
        print(f"[DEBUG] AI probability: {ai_prob:.2f} — {'✔ Highlight' if is_ai else '✘ Skip'}")

    results.append(para_result)

ai_percent = round((ai_sentences / total_sentences) * 100, 2) if total_sentences else 0

# === PDF Writer ===
c = canvas.Canvas(output_path, pagesize=A4)
width, height = A4
x, y = 40, height - 60
line_height = 18
font_size = 12

# Title — no emoji here: ReportLab's built-in Helvetica only covers Latin-1
# and cannot encode them.
c.setFont("Helvetica-Bold", 14)
c.drawString(x, y, f"AI Detection Report: {filename}")
y -= 25
c.setFont("Helvetica", 12)
c.drawString(x, y, f"AI Detected: {ai_percent}% of {total_sentences} sentences")
y -= 30
c.setFont("Helvetica", font_size)

# Body rendering
for para_result in results:
    if not para_result:
        y -= line_height
        continue

    for sentence, is_ai, ai_prob in para_result:
        if y < 50:  # start a new page near the bottom margin
            c.showPage()
            y = height - 50
            c.setFont("Helvetica", font_size)

        if is_ai:
            # Cyan rectangle behind AI-flagged sentences.
            text_width = c.stringWidth(sentence, "Helvetica", font_size)
            c.setFillColor(colors.cyan)
            c.rect(x - 2, y - 4, text_width + 4, line_height + 2, fill=True, stroke=False)
            c.setFillColor(colors.black)

        c.drawString(x, y, sentence)
        y -= line_height

    y -= line_height  # spacing between paragraphs

c.save()
print(f"\n✅ Report saved: {output_path}")
detector/preprocess.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import docx
|
| 3 |
+
import pdfplumber
|
| 4 |
+
import nltk
|
| 5 |
+
|
| 6 |
+
nltk.download("punkt")
|
| 7 |
+
from nltk.tokenize import sent_tokenize
|
| 8 |
+
|
def extract_text_from_docx(path):
    """Return the non-empty, stripped paragraphs of a .docx file.

    Any extraction failure is logged and yields an empty list, so callers
    can simply skip the document.
    """
    try:
        document = docx.Document(path)
        return [p.text.strip() for p in document.paragraphs if p.text.strip()]
    except Exception as e:
        print(f"❌ Failed to extract DOCX: {e}")
        return []
def extract_text_from_pdf(path):
    """Extract paragraph-like chunks of text from a PDF.

    Prefers blank-line ("\\n\\n") paragraph boundaries; when the PDF yields
    no such structure, falls back to chunks of five sentences. Returns []
    on extraction failure or when the document has no text.
    """
    try:
        with pdfplumber.open(path) as pdf:
            page_texts = (page.extract_text() for page in pdf.pages)
            all_text = "\n".join(t for t in page_texts if t)
    except Exception as e:
        print(f"❌ Failed to extract PDF: {e}")
        return []

    if not all_text.strip():
        return []

    # Paragraph split on blank lines, if the layout preserved any.
    chunks = [p.strip() for p in all_text.split("\n\n") if p.strip()]
    if chunks:
        return chunks

    # Fallback: group the sentence stream into chunks of five.
    sentences = sent_tokenize(all_text)
    return [" ".join(sentences[i:i + 5]) for i in range(0, len(sentences), 5)]
def extract_paragraphs(path):
    """Dispatch to the right extractor for *path* based on its extension.

    Raises ValueError for anything other than .docx or .pdf.
    """
    suffix = os.path.splitext(path)[-1].lower()
    if suffix == ".docx":
        return extract_text_from_docx(path)
    if suffix == ".pdf":
        return extract_text_from_pdf(path)
    raise ValueError(f"Unsupported file type: {suffix}")
detector/requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
transformers
|
| 2 |
+
torch
|
| 3 |
+
scikit-learn
|
| 4 |
+
pdfplumber
|
| 5 |
+
python-docx
|
| 6 |
+
nltk
|
| 7 |
+
datasets
|
| 8 |
+
fastapi
|
| 9 |
+
uvicorn
|
| 10 |
+
transformers
|
| 11 |
+
torch
|
| 12 |
+
nltk
|
| 13 |
+
PyPDF2
|
| 14 |
+
python-docx
|
detector/train_model.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Fine-tune roberta-base as a 3-class (human / AI / mixed) classifier on
ai_training_dataset.json and save the result to ./models/roberta-detector."""
import json

import torch
import evaluate
from datasets import Dataset
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
)

# Load the paragraph-level dataset produced by create_dataset.py.
with open("ai_training_dataset.json", "r", encoding="utf-8") as f:
    records = json.load(f)

# Labels must be plain ints (0, 1, or 2) for the classification head.
for record in records:
    record["label"] = int(record["label"])

# 80/20 train/eval split as HuggingFace datasets.
splits = Dataset.from_list(records).train_test_split(test_size=0.2)
train_dataset = splits["train"]
eval_dataset = splits["test"]

model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

def tokenize(example):
    """Pad/truncate each text to the model's 512-token window."""
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

train_dataset = train_dataset.map(tokenize, batched=True)
eval_dataset = eval_dataset.map(tokenize, batched=True)

# Drop everything except what the forward pass consumes.
model_columns = ["input_ids", "attention_mask", "label"]
train_dataset.set_format(type="torch", columns=model_columns)
eval_dataset.set_format(type="torch", columns=model_columns)

# Three output labels: 0=human, 1=ai, 2=mixed.
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Plain accuracy over the eval split."""
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=1)
    return accuracy.compute(predictions=preds, references=labels)

training_args = TrainingArguments(
    output_dir="./models/roberta-detector",
    evaluation_strategy="epoch",  # must match save_strategy for best-model reload
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",  # avoid implicit WandB logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Persist model + tokenizer for detector.py / custom_model.py.
model.save_pretrained("./models/roberta-detector")
tokenizer.save_pretrained("./models/roberta-detector")

print("✅ Model trained and saved.")
detector/utils.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# detector/utils.py
|
| 2 |
+
|
| 3 |
+
from PyPDF2 import PdfReader
|
| 4 |
+
import docx
|
| 5 |
+
|
async def extract_text_from_file(file):
    """Return the plain text of an uploaded .pdf / .docx / .txt file.

    *file* is expected to expose ``filename`` plus either a ``file``
    file-like object (PDF/DOCX paths) or an async ``read()`` (TXT path) —
    the shape FastAPI's UploadFile provides. Raises ValueError for any
    other extension.
    """
    lowered = file.filename.lower()

    if lowered.endswith(".txt"):
        raw = await file.read()
        return raw.decode("utf-8")

    if lowered.endswith(".pdf"):
        reader = PdfReader(file.file)
        # extract_text() may return None for image-only pages.
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    if lowered.endswith(".docx"):
        document = docx.Document(file.file)
        return "\n".join(para.text for para in document.paragraphs)

    raise ValueError("Unsupported file type.")
main.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""FastAPI service: upload a document, run AI detection over it, and serve
DOCX/PDF reports of the results from /reports."""

from io import BytesIO
import os

import docx
import uvicorn
from docx.enum.text import WD_COLOR_INDEX
from fastapi import FastAPI, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
from PyPDF2 import PdfReader

# === App setup ===
app = FastAPI()

# ✅ CORS
# BUG FIX: entries must match the browser's Origin header exactly, and an
# Origin header never carries a trailing slash — so
# "https://stealth-writer.vercel.app/" could never match. Slash removed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:5173", "https://stealth-writer.vercel.app"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# === Global lazy-load model vars ===
# The heavyweight detector import happens on startup so the app binds its
# port quickly; until then these remain None (see the guard in detect()).
analyze_text = None
generate_pdf_report = None

@app.on_event("startup")
async def load_model():
    """Import the detector lazily so module import stays fast."""
    global analyze_text, generate_pdf_report
    from detector.custom_model import analyze_text as at, generate_pdf_report as gpr
    analyze_text = at
    generate_pdf_report = gpr

# === Paths ===
REPORTS_DIR = os.path.join(os.path.dirname(__file__), "reports")
os.makedirs(REPORTS_DIR, exist_ok=True)

app.mount("/reports", StaticFiles(directory=REPORTS_DIR), name="reports")

# === File text extraction ===
def extract_text(file: UploadFile, ext: str) -> str:
    """Read an uploaded .txt / .pdf / .docx file and return its plain text.

    Raises ValueError for any other extension.
    """
    content = file.file.read()
    file_bytes = BytesIO(content)

    if ext == ".txt":
        return content.decode("utf-8", errors="ignore")
    elif ext == ".pdf":
        reader = PdfReader(file_bytes)
        # extract_text() may return None for image-only pages.
        return "".join([page.extract_text() or "" for page in reader.pages])
    elif ext == ".docx":
        doc = docx.Document(file_bytes)
        return "\n".join([para.text for para in doc.paragraphs])
    else:
        raise ValueError("Unsupported file type")

# === Main endpoint ===
@app.post("/api/detect")
async def detect(file: UploadFile = File(...)):
    """Analyze an uploaded document; return scores plus report URLs."""
    try:
        # BUG FIX: requests arriving before the startup import finishes used
        # to fail with "'NoneType' object is not callable"; fail clearly.
        if analyze_text is None or generate_pdf_report is None:
            raise RuntimeError("Model is still loading; retry in a moment")

        ext = os.path.splitext(file.filename)[1].lower()
        if ext not in [".txt", ".pdf", ".docx"]:
            raise ValueError("Unsupported file format")

        # Extract + Analyze
        text = extract_text(file, ext)
        result = analyze_text(text)

        # === Save DOCX report ===
        filename_base = os.path.splitext(file.filename)[0]
        docx_filename = f"{filename_base}_report.docx"
        docx_path = os.path.join(REPORTS_DIR, docx_filename)

        doc = docx.Document()
        doc.add_heading("AI Detection Summary", level=1)
        doc.add_paragraph(f"Overall AI %: {result['overall_ai_percent']}%")
        doc.add_paragraph(f"Total Sentences: {result['total_sentences']}")
        doc.add_paragraph(f"AI Sentences: {result['ai_sentences']}")
        doc.add_paragraph("Sentences detected as AI are highlighted in cyan.\n")
        doc.add_heading("Sentence Analysis", level=2)

        paragraph = doc.add_paragraph()
        for para in result["results"]:
            for sentence, is_ai, _ in para:
                if not isinstance(sentence, str) or not sentence.strip():
                    continue
                run = paragraph.add_run(sentence + " ")
                if is_ai:
                    # Cyan highlight marks AI-flagged sentences.
                    run.font.highlight_color = WD_COLOR_INDEX.TURQUOISE

        doc.save(docx_path)

        # === Save PDF report (uses ReportLab) ===
        pdf_filename = generate_pdf_report(result, filename_base)

        return {
            "success": True,
            "score": {
                **{k: v for k, v in result.items() if k != "results"},
                "results": [
                    [{"sentence": s, "is_ai": is_ai, "ai_score": round(ai_score * 100, 2)} for s, is_ai, ai_score in para]
                    for para in result["results"]
                ]
            },
            "docx_url": f"/reports/{docx_filename}",
            "pdf_url": f"/reports/{pdf_filename}"
        }

    except Exception as e:
        return JSONResponse(content={"success": False, "error": str(e)}, status_code=500)

# === Port binding for Render ===
if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))  # Render will inject PORT
    uvicorn.run("main:app", host="0.0.0.0", port=port)
requirements.txt
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accelerate==1.8.1
|
| 2 |
+
aiohappyeyeballs==2.6.1
|
| 3 |
+
aiohttp==3.12.13
|
| 4 |
+
aiosignal==1.3.2
|
| 5 |
+
annotated-types==0.7.0
|
| 6 |
+
anyio==4.9.0
|
| 7 |
+
async-timeout==5.0.1
|
| 8 |
+
attrs==25.3.0
|
| 9 |
+
certifi==2025.6.15
|
| 10 |
+
cffi==1.17.1
|
| 11 |
+
charset-normalizer==3.4.2
|
| 12 |
+
click==8.2.1
|
| 13 |
+
colorama==0.4.6
|
| 14 |
+
cryptography==45.0.4
|
| 15 |
+
datasets==3.6.0
|
| 16 |
+
defusedxml==0.7.1
|
| 17 |
+
deprecation==2.1.0
|
| 18 |
+
dill==0.3.8
|
| 19 |
+
evaluate==0.4.4
|
| 20 |
+
exceptiongroup==1.3.0
|
| 21 |
+
fastapi==0.115.13
|
| 22 |
+
filelock==3.18.0
|
| 23 |
+
fonttools==4.58.4
|
| 24 |
+
fpdf2==2.7.8
|
| 25 |
+
frozenlist==1.7.0
|
| 26 |
+
fsspec==2025.3.0
|
| 27 |
+
gotrue==2.12.0
|
| 28 |
+
greenlet==3.2.3
|
| 29 |
+
h11==0.16.0
|
| 30 |
+
h2==4.2.0
|
| 31 |
+
hpack==4.1.0
|
| 32 |
+
httpcore==1.0.9
|
| 33 |
+
httpx==0.28.1
|
| 34 |
+
huggingface-hub==0.33.0
|
| 35 |
+
hyperframe==6.1.0
|
| 36 |
+
idna==3.10
|
| 37 |
+
iniconfig==2.1.0
|
| 38 |
+
Jinja2==3.1.6
|
| 39 |
+
joblib==1.5.1
|
| 40 |
+
lxml==5.4.0
|
| 41 |
+
MarkupSafe==3.0.2
|
| 42 |
+
mpmath==1.3.0
|
| 43 |
+
multidict==6.5.0
|
| 44 |
+
multiprocess==0.70.16
|
| 45 |
+
networkx==3.4.2
|
| 46 |
+
nltk==3.8.1
|
| 47 |
+
numpy==2.2.6
|
| 48 |
+
packaging==25.0
|
| 49 |
+
pandas==2.3.0
|
| 50 |
+
pdfminer.six==20250506
|
| 51 |
+
pdfplumber==0.11.7
|
| 52 |
+
pillow==11.0.0
|
| 53 |
+
pluggy==1.6.0
|
| 54 |
+
postgrest==1.0.2
|
| 55 |
+
propcache==0.3.2
|
| 56 |
+
psutil==7.0.0
|
| 57 |
+
pyarrow==20.0.0
|
| 58 |
+
pycparser==2.22
|
| 59 |
+
pydantic==2.11.7
|
| 60 |
+
pydantic_core==2.33.2
|
| 61 |
+
pyee==13.0.0
|
| 62 |
+
Pygments==2.19.1
|
| 63 |
+
PyJWT==2.10.1
|
| 64 |
+
PyMuPDF==1.24.2
|
| 65 |
+
PyPDF2==3.0.1
|
| 66 |
+
pypdfium2==4.30.1
|
| 67 |
+
pytest==8.4.1
|
| 68 |
+
pytest-mock==3.14.1
|
| 69 |
+
python-dateutil==2.9.0.post0
|
| 70 |
+
python-docx==1.2.0
|
| 71 |
+
python-dotenv==1.1.1
|
| 72 |
+
python-multipart==0.0.20
|
| 73 |
+
pytz==2025.2
|
| 74 |
+
PyYAML==6.0.2
|
| 75 |
+
realtime==2.4.3
|
| 76 |
+
regex==2024.11.6
|
| 77 |
+
reportlab==4.4.2
|
| 78 |
+
requests==2.32.4
|
| 79 |
+
safetensors==0.5.3
|
| 80 |
+
scikit-learn==1.7.0
|
| 81 |
+
scipy==1.15.3
|
| 82 |
+
six==1.17.0
|
| 83 |
+
sniffio==1.3.1
|
| 84 |
+
starlette==0.46.2
|
| 85 |
+
storage3==0.11.3
|
| 86 |
+
StrEnum==0.4.15
|
| 87 |
+
supabase==2.15.3
|
| 88 |
+
sympy==1.13.1
|
| 89 |
+
threadpoolctl==3.6.0
|
| 90 |
+
tomli==2.2.1
|
| 91 |
+
torch==2.5.1
|
| 92 |
+
tqdm==4.67.1
|
| 93 |
+
transformers==4.41.1
|
| 94 |
+
typing-inspection==0.4.1
|
| 95 |
+
typing_extensions==4.14.0
|
| 96 |
+
tzdata==2025.2
|
| 97 |
+
urllib3==2.5.0
|
| 98 |
+
uvicorn==0.34.3
|
| 99 |
+
websockets==14.2
|
| 100 |
+
xxhash==3.5.0
|
| 101 |
+
yarl==1.20.1
|