AlyanAkram commited on
Commit
2a668e8
·
verified ·
1 Parent(s): 9aef97b

Delete app

Browse files
app/.gitignore DELETED
@@ -1,43 +0,0 @@
1
- # Python artifacts
2
- __pycache__/
3
- *.py[cod]
4
- *.pyo
5
- *.pyd
6
- *.so
7
- *.egg-info/
8
-
9
- # Virtual environments
10
- venv/
11
- env/
12
- .venv/
13
-
14
- # IDE/editor files
15
- .vscode/
16
- .idea/
17
- *.log
18
- .DS_Store
19
- Thumbs.db
20
-
21
- # Model weights and training artifacts
22
- detector/models/
23
- detector/training_data/
24
- detector/output_reports/
25
- test_files/
26
- reports/
27
- *.safetensors
28
- *.bin
29
- *.pt
30
- *.ckpt
31
-
32
- # Data files
33
- ai_training_dataset.json
34
-
35
- # Optional build/deploy stuff
36
- build/
37
- tmp/
38
- .cache/
39
-
40
- # Environment and config
41
- .env
42
- *.env
43
- .env.*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/auth.py DELETED
@@ -1,37 +0,0 @@
1
import os

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from supabase import create_client, Client

# SECURITY FIX: the previous revision hard-coded the Supabase project URL and
# anon JWT directly in source control. Credentials now come from the
# environment; the leaked key should be rotated in the Supabase dashboard.
SUPABASE_URL = os.getenv("SUPABASE_URL", "https://ylyxgffttgvvjyrfovpl.supabase.co")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
if not SUPABASE_KEY:
    raise RuntimeError("SUPABASE_KEY environment variable is not set")

# Module-level client shared by both endpoints.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

router = APIRouter()


class UserAuth(BaseModel):
    """Request body for both /signup and /login."""
    email: str
    password: str


@router.post("/signup")
def signup(user: UserAuth):
    """Register a new user with Supabase auth.

    Returns a success message plus the created user record; any Supabase
    error is surfaced as a 400 with the provider's message.
    """
    try:
        result = supabase.auth.sign_up({
            "email": user.email,
            "password": user.password
        })
        return {"message": "Signup successful", "user": result.user}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))


@router.post("/login")
def login(user: UserAuth):
    """Authenticate a user and return their access token and user id.

    Deliberately returns a generic 401 (no provider detail) so the response
    does not reveal whether the email exists.
    """
    try:
        result = supabase.auth.sign_in_with_password({
            "email": user.email,
            "password": user.password
        })
        return {"access_token": result.session.access_token, "user_id": result.user.id}
    except Exception as e:
        raise HTTPException(status_code=401, detail="Invalid credentials")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/detector/create_dataset.py DELETED
@@ -1,45 +0,0 @@
1
import os
import json
from preprocess import extract_paragraphs

# Folder name (lower-cased) -> integer class label.
LABELS = {
    "ai": 1,
    "human": 0,
    "mixed": 2  # third class: documents mixing human and AI text
}

root_dir = "training_data"
dataset = []

for label_folder in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, label_folder)
    if not os.path.isdir(folder_path):
        continue

    label = LABELS.get(label_folder.lower())
    if label is None:
        # Skip folders that do not map to a known class.
        continue

    for filename in os.listdir(folder_path):
        # BUG FIX: extension check is now case-insensitive, so files named
        # "REPORT.PDF" or "essay.DOCX" are no longer silently skipped.
        if not filename.lower().endswith((".pdf", ".docx")):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"📄 Extracting: {file_path}")
        try:
            # One training sample per non-empty paragraph.
            for para in extract_paragraphs(file_path):
                if para.strip():
                    dataset.append({
                        "text": para.strip(),
                        "label": label
                    })
        except Exception as e:
            print(f"❌ Failed: {file_path} — {str(e)}")

# Save dataset
with open("ai_training_dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=2, ensure_ascii=False)

print(f"\n✅ Saved {len(dataset)} samples.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/detector/custom_model.py DELETED
@@ -1,149 +0,0 @@
1
import os
import torch
import nltk
from pathlib import Path
from nltk.tokenize import sent_tokenize
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib import colors

# === NLTK tokenizer ===
# Download the "punkt" sentence-splitter data used by sent_tokenize below.
nltk.download("punkt")

# === Model loading: Hugging Face (Render) vs Local (Dev) ===
# USE_HF_MODEL=1 selects the hosted checkpoint; anything else loads the
# local checkout under ./detector/models (development).
USE_HF_MODEL = os.getenv("USE_HF_MODEL") == "1"

if USE_HF_MODEL:
    from huggingface_hub import login

    # HF_TOKEN is optional; without it the hub model must be public.
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    MODEL_PATH = "AlyanAkram/stealth-roberta"
    tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, token=hf_token)
    model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, token=hf_token)
else:
    MODEL_PATH = "./detector/models/roberta-detector"
    # local_files_only prevents an accidental network fetch in dev.
    tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
    model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, local_files_only=True)

# Inference-only: eval mode, on GPU when available.
model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
device = next(model.parameters()).device


# === AI classification threshold ===
# Sentences with P(AI) at or above this are flagged as AI-written.
AI_THRESHOLD = 0.5

# === Report directory ===
# PDF reports are written next to the app package, in ./reports.
REPORT_DIR = Path(__file__).resolve().parent.parent / "reports"
REPORT_DIR.mkdir(exist_ok=True)
42
-
43
def analyze_text(text: str):
    """Score every sentence of *text* with the RoBERTa detector.

    The text is split on newlines into paragraphs (blank lines dropped),
    each paragraph into sentences. Returns a dict with the overall AI
    percentage, sentence counts, and per-paragraph lists of
    (sentence, is_ai, ai_probability) tuples.
    """
    flagged = 0
    seen = 0
    per_paragraph = []

    for block in text.split("\n"):
        block = block.strip()
        if not block:
            continue

        scored = []
        for sent in sent_tokenize(block):
            encoded = tokenizer(sent, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

            with torch.no_grad():
                logits = model(**encoded).logits
                p_ai = torch.nn.functional.softmax(logits, dim=-1)[0][1].item()

            hit = p_ai >= AI_THRESHOLD
            scored.append((sent, hit, p_ai))

            seen += 1
            if hit:
                flagged += 1

        per_paragraph.append(scored)

    percent = round((flagged / seen) * 100, 2) if seen else 0

    return {
        "overall_ai_percent": percent,
        "total_sentences": seen,
        "ai_sentences": flagged,
        "results": per_paragraph
    }
77
-
78
def _draw_report_line(c, x, y, line, is_ai, font_size, line_height):
    """Draw one line of report text, with a cyan highlight behind AI-flagged text."""
    if is_ai:
        text_width = c.stringWidth(line, "Helvetica", font_size)
        c.setFillColor(colors.cyan)
        c.rect(x - 2, y - 4, text_width + 4, line_height + 2, fill=True, stroke=False)
        c.setFillColor(colors.black)
    c.drawString(x, y, line)


def generate_pdf_report(results: dict, filename: str) -> str:
    """Render analyze_text() output into REPORT_DIR/<filename>.pdf.

    AI-flagged sentences are highlighted in cyan; long sentences are
    word-wrapped to the page width. Returns the report file name
    (relative to REPORT_DIR).
    """
    # BUG FIX: the previous version ignored the `filename` argument and always
    # wrote the same literal "(unknown).pdf", so concurrent requests clobbered
    # each other's reports. The parameter is now actually used.
    pdf_path = REPORT_DIR / f"{filename}.pdf"

    c = canvas.Canvas(str(pdf_path), pagesize=A4)
    width, height = A4
    x, y = 40, height - 60
    line_height = 18
    font_size = 12

    # Header
    c.setFont("Helvetica-Bold", 14)
    c.drawString(x, y, f"📄 AI Detection Report: {filename}")
    y -= 25
    c.setFont("Helvetica", 12)
    c.drawString(x, y, f"🧠 AI Detected: {results['overall_ai_percent']}% of {results['total_sentences']} sentences")
    y -= 30
    c.setFont("Helvetica", font_size)

    for para_result in results["results"]:
        if not para_result:
            # Empty paragraph keeps its vertical space.
            y -= line_height
            continue

        for sentence, is_ai, _ in para_result:
            if y < 50:
                # Out of room: start a new page.
                c.showPage()
                y = height - 50
                c.setFont("Helvetica", font_size)

            sentence = sentence.strip()
            if not sentence:
                continue

            # Greedy word-wrap against the printable width.
            max_width = width - 80
            current_line = ""
            for word in sentence.split():
                test_line = current_line + " " + word if current_line else word
                if c.stringWidth(test_line, "Helvetica", font_size) > max_width:
                    _draw_report_line(c, x, y, current_line, is_ai, font_size, line_height)
                    y -= line_height
                    current_line = word
                else:
                    current_line = test_line

            if current_line:
                if y < 50:
                    c.showPage()
                    y = height - 50
                    c.setFont("Helvetica", font_size)
                _draw_report_line(c, x, y, current_line, is_ai, font_size, line_height)
                y -= line_height

        y -= line_height  # Paragraph spacing

    c.save()
    return f"{filename}.pdf"
149
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/detector/detector.py DELETED
@@ -1,112 +0,0 @@
1
import os
import sys
import torch
import docx
import nltk
from nltk.tokenize import sent_tokenize
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib import colors

# Sentence-splitter data for sent_tokenize.
nltk.download("punkt")

# Load model (inference only, GPU when available)
model_dir = "./models/roberta-detector"
tokenizer = RobertaTokenizer.from_pretrained(model_dir)
model = RobertaForSequenceClassification.from_pretrained(model_dir)
model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
device = next(model.parameters()).device

# === THRESHOLD CONFIG ===
AI_THRESHOLD = 0.50  # Adjust this as needed for better results

# === Input File ===
# Usage: python detector.py <path-to-docx>
filepath = sys.argv[1]
filename = os.path.splitext(os.path.basename(filepath))[0]
output_dir = "output_reports"
os.makedirs(output_dir, exist_ok=True)
# BUG FIX: `filename` was computed but never used — the report path was the
# hard-coded literal "(unknown)_report.pdf", so every run overwrote the same
# file. The report is now named after the input document.
output_path = os.path.join(output_dir, f"{filename}_report.pdf")

# === DOCX Reader ===
def read_docx_paragraphs(path):
    """Return the raw paragraph texts of a .docx file (empty strings kept)."""
    doc = docx.Document(path)
    return [para.text for para in doc.paragraphs]

paragraphs = read_docx_paragraphs(filepath)

# === Detection Loop ===
results = []
total_sentences = 0
ai_sentences = 0

for paragraph in paragraphs:
    if not paragraph.strip():
        results.append([])  # preserve spacing
        continue

    sentences = sent_tokenize(paragraph)
    para_result = []

    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
            ai_prob = probs[1].item()  # index 1 = AI class

        is_ai = ai_prob >= AI_THRESHOLD
        para_result.append((sentence, is_ai, ai_prob))

        total_sentences += 1
        if is_ai:
            ai_sentences += 1

        # Debugging
        print(f"[DEBUG] AI probability: {ai_prob:.2f} — {'✔ Highlight' if is_ai else '✘ Skip'}")

    results.append(para_result)

ai_percent = round((ai_sentences / total_sentences) * 100, 2) if total_sentences else 0

# === PDF Writer ===
c = canvas.Canvas(output_path, pagesize=A4)
width, height = A4
x, y = 40, height - 60
line_height = 18
font_size = 12

# Title — BUG FIX: now shows the actual document name instead of a literal.
c.setFont("Helvetica-Bold", 14)
c.drawString(x, y, f"📄 AI Detection Report: {filename}")
y -= 25
c.setFont("Helvetica", 12)
c.drawString(x, y, f"🧠 AI Detected: {ai_percent}% of {total_sentences} sentences")
y -= 30
c.setFont("Helvetica", font_size)

# Body rendering: one line per sentence, cyan highlight behind AI sentences.
for para_result in results:
    if not para_result:
        y -= line_height
        continue

    for sentence, is_ai, ai_prob in para_result:
        if y < 50:
            c.showPage()
            y = height - 50
            c.setFont("Helvetica", font_size)

        if is_ai:
            text_width = c.stringWidth(sentence, "Helvetica", font_size)
            c.setFillColor(colors.cyan)
            c.rect(x - 2, y - 4, text_width + 4, line_height + 2, fill=True, stroke=False)
            c.setFillColor(colors.black)

        c.drawString(x, y, sentence)
        y -= line_height

    y -= line_height  # spacing between paragraphs

c.save()
print(f"\n✅ Report saved: {output_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/detector/preprocess.py DELETED
@@ -1,47 +0,0 @@
1
- import os
2
- import docx
3
- import pdfplumber
4
- import nltk
5
-
6
- nltk.download("punkt")
7
- from nltk.tokenize import sent_tokenize
8
-
9
def extract_text_from_docx(path):
    """Return the non-empty, stripped paragraphs of a .docx file.

    Extraction failures are logged and yield an empty list.
    """
    try:
        document = docx.Document(path)
        return [p.text.strip() for p in document.paragraphs if p.text.strip()]
    except Exception as e:
        print(f"❌ Failed to extract DOCX: {e}")
        return []
17
-
18
def extract_text_from_pdf(path):
    """Extract paragraph-like text chunks from a PDF.

    Prefers blank-line-separated paragraphs; falls back to 5-sentence
    groups when none are found. Returns [] on failure or empty output.
    """
    try:
        with pdfplumber.open(path) as pdf:
            text = "\n".join(
                page.extract_text() for page in pdf.pages if page.extract_text()
            )
    except Exception as e:
        print(f"❌ Failed to extract PDF: {e}")
        return []

    if not text.strip():
        return []

    # Try splitting by paragraphs first.
    chunks = [part.strip() for part in text.split("\n\n") if part.strip()]
    if chunks:
        return chunks

    # Fallback: group sentences five at a time.
    sentences = sent_tokenize(text)
    return [" ".join(sentences[start:start + 5]) for start in range(0, len(sentences), 5)]
39
-
40
def extract_paragraphs(path):
    """Dispatch to the extractor matching the file extension (.docx or .pdf).

    Raises ValueError for any other extension.
    """
    ext = os.path.splitext(path)[-1].lower()
    if ext == ".pdf":
        return extract_text_from_pdf(path)
    if ext == ".docx":
        return extract_text_from_docx(path)
    raise ValueError(f"Unsupported file type: {ext}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/detector/requirements.txt DELETED
@@ -1,14 +0,0 @@
1
- transformers
- torch
- scikit-learn
- pdfplumber
- python-docx
- nltk
- datasets
- fastapi
- uvicorn
- PyPDF2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/detector/train_model.py DELETED
@@ -1,79 +0,0 @@
1
import json
import torch
from datasets import Dataset
import evaluate
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments

# --- Data loading -----------------------------------------------------------
# The dataset is a JSON list of {"text": ..., "label": ...} records.
with open("ai_training_dataset.json", "r", encoding="utf-8") as f:
    records = json.load(f)

# Coerce labels to int so the datasets library infers an integer column.
for record in records:
    record["label"] = int(record["label"])

# 80/20 train/eval split.
splits = Dataset.from_list(records).train_test_split(test_size=0.2)
train_dataset = splits["train"]
eval_dataset = splits["test"]

# --- Tokenization -----------------------------------------------------------
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

def tokenize(example):
    """Tokenize a batch of examples to fixed-length 512-token inputs."""
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

train_dataset = train_dataset.map(tokenize, batched=True)
eval_dataset = eval_dataset.map(tokenize, batched=True)

# Expose only the tensors the model consumes.
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# --- Model & metrics --------------------------------------------------------
# Three classes: 0 = human, 1 = AI, 2 = mixed.
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Accuracy over argmax predictions."""
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=1)
    return accuracy.compute(predictions=predictions, references=labels)

# --- Training ---------------------------------------------------------------
training_args = TrainingArguments(
    output_dir="./models/roberta-detector",
    evaluation_strategy="epoch",  # must match save_strategy for load_best_model_at_end
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",  # disable WandB and other trackers
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Persist model + tokenizer side by side so inference can load both.
model.save_pretrained("./models/roberta-detector")
tokenizer.save_pretrained("./models/roberta-detector")

print("✅ Model trained and saved.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/detector/utils.py DELETED
@@ -1,21 +0,0 @@
1
- # detector/utils.py
2
-
3
- from PyPDF2 import PdfReader
4
- import docx
5
-
6
async def extract_text_from_file(file):
    """Extract plain text from an uploaded .pdf, .docx, or .txt file.

    *file* is an upload object exposing `.filename`, a file-like `.file`,
    and an async `.read()`. Raises ValueError for any other extension.
    """
    name = file.filename.lower()

    if name.endswith(".pdf"):
        reader = PdfReader(file.file)
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    if name.endswith(".docx"):
        document = docx.Document(file.file)
        return "\n".join(p.text for p in document.paragraphs)

    if name.endswith(".txt"):
        raw = await file.read()
        return raw.decode("utf-8")

    raise ValueError("Unsupported file type.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/requirements.txt DELETED
@@ -1,101 +0,0 @@
1
- accelerate==1.8.1
2
- aiohappyeyeballs==2.6.1
3
- aiohttp==3.12.13
4
- aiosignal==1.3.2
5
- annotated-types==0.7.0
6
- anyio==4.9.0
7
- async-timeout==5.0.1
8
- attrs==25.3.0
9
- certifi==2025.6.15
10
- cffi==1.17.1
11
- charset-normalizer==3.4.2
12
- click==8.2.1
13
- colorama==0.4.6
14
- cryptography==45.0.4
15
- datasets==3.6.0
16
- defusedxml==0.7.1
17
- deprecation==2.1.0
18
- dill==0.3.8
19
- evaluate==0.4.4
20
- exceptiongroup==1.3.0
21
- fastapi==0.115.13
22
- filelock==3.18.0
23
- fonttools==4.58.4
24
- fpdf2==2.7.8
25
- frozenlist==1.7.0
26
- fsspec==2025.3.0
27
- gotrue==2.12.0
28
- greenlet==3.2.3
29
- h11==0.16.0
30
- h2==4.2.0
31
- hpack==4.1.0
32
- httpcore==1.0.9
33
- httpx==0.28.1
34
- huggingface-hub==0.33.0
35
- hyperframe==6.1.0
36
- idna==3.10
37
- iniconfig==2.1.0
38
- Jinja2==3.1.6
39
- joblib==1.5.1
40
- lxml==5.4.0
41
- MarkupSafe==3.0.2
42
- mpmath==1.3.0
43
- multidict==6.5.0
44
- multiprocess==0.70.16
45
- networkx==3.4.2
46
- nltk==3.8.1
47
- numpy==2.2.6
48
- packaging==25.0
49
- pandas==2.3.0
50
- pdfminer.six==20250506
51
- pdfplumber==0.11.7
52
- pillow==11.0.0
53
- pluggy==1.6.0
54
- postgrest==1.0.2
55
- propcache==0.3.2
56
- psutil==7.0.0
57
- pyarrow==20.0.0
58
- pycparser==2.22
59
- pydantic==2.11.7
60
- pydantic_core==2.33.2
61
- pyee==13.0.0
62
- Pygments==2.19.1
63
- PyJWT==2.10.1
64
- PyMuPDF==1.24.2
65
- PyPDF2==3.0.1
66
- pypdfium2==4.30.1
67
- pytest==8.4.1
68
- pytest-mock==3.14.1
69
- python-dateutil==2.9.0.post0
70
- python-docx==1.2.0
71
- python-dotenv==1.1.1
72
- python-multipart==0.0.20
73
- pytz==2025.2
74
- PyYAML==6.0.2
75
- realtime==2.4.3
76
- regex==2024.11.6
77
- reportlab==4.4.2
78
- requests==2.32.4
79
- safetensors==0.5.3
80
- scikit-learn==1.7.0
81
- scipy==1.15.3
82
- six==1.17.0
83
- sniffio==1.3.1
84
- starlette==0.46.2
85
- storage3==0.11.3
86
- StrEnum==0.4.15
87
- supabase==2.15.3
88
- sympy==1.13.1
89
- threadpoolctl==3.6.0
90
- tomli==2.2.1
91
- torch==2.5.1
92
- tqdm==4.67.1
93
- transformers==4.41.1
94
- typing-inspection==0.4.1
95
- typing_extensions==4.14.0
96
- tzdata==2025.2
97
- urllib3==2.5.0
98
- uvicorn==0.34.3
99
- websockets==14.2
100
- xxhash==3.5.0
101
- yarl==1.20.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/sample.docx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b36b8849df4a2d706e8aa2c2e9be106950f78cef26fd759ac7e2889dbe65e815
3
- size 108513