AlyanAkram commited on
Commit
a53dc0a
·
verified ·
1 Parent(s): 2a668e8

Upload 11 files

Browse files
.gitignore ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python artifacts
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.pyo
5
+ *.pyd
6
+ *.so
7
+ *.egg-info/
8
+
9
+ # Virtual environments
10
+ venv/
11
+ env/
12
+ .venv/
13
+
14
+ # IDE/editor files
15
+ .vscode/
16
+ .idea/
17
+ *.log
18
+ .DS_Store
19
+ Thumbs.db
20
+
21
+ # Model weights and training artifacts
22
+ detector/models/
23
+ detector/training_data/
24
+ detector/output_reports/
25
+ test_files/
26
+ reports/
27
+ *.safetensors
28
+ *.bin
29
+ *.pt
30
+ *.ckpt
31
+
32
+ # Data files
33
+ ai_training_dataset.json
34
+
35
+ # Optional build/deploy stuff
36
+ build/
37
+ tmp/
38
+ .cache/
39
+
40
+ # Environment and config
41
+ .env
42
+ *.env
43
+ .env.*
auth.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from supabase import create_client, Client
import os

# SECURITY(review): the Supabase URL and key were hard-coded in source control.
# They are now read from the environment; the literals remain only as a
# backward-compatible fallback. Rotate this key and delete the fallbacks once
# SUPABASE_URL / SUPABASE_KEY are set in the deployment environment.
SUPABASE_URL = os.getenv("SUPABASE_URL", "https://ylyxgffttgvvjyrfovpl.supabase.co")
SUPABASE_KEY = os.getenv(
    "SUPABASE_KEY",
    "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InlseXhnZmZ0dGd2dmp5cmZvdnBsIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NTA0MTc4NDcsImV4cCI6MjA2NTk5Mzg0N30.a6-biroEh-TNTS8E_uAYYt7mgdY2A-xexjCzYp1MsuI",
)

# Shared Supabase client for all auth routes.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

router = APIRouter()

class UserAuth(BaseModel):
    # Request body shared by /signup and /login.
    email: str
    password: str

@router.post("/signup")
def signup(user: UserAuth):
    """Create a new Supabase auth user.

    Returns the created user on success; surfaces the provider's error
    message as an HTTP 400 on failure.
    """
    try:
        result = supabase.auth.sign_up({
            "email": user.email,
            "password": user.password
        })
        return {"message": "Signup successful", "user": result.user}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))

@router.post("/login")
def login(user: UserAuth):
    """Authenticate with email/password and return an access token + user id.

    Every failure is reported as a generic 401 so the response does not leak
    whether the email exists or the password was wrong.
    """
    try:
        result = supabase.auth.sign_in_with_password({
            "email": user.email,
            "password": user.password
        })
        return {"access_token": result.session.access_token, "user_id": result.user.id}
    except Exception:
        raise HTTPException(status_code=401, detail="Invalid credentials")
detector/create_dataset.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import json
from preprocess import extract_paragraphs

# Folder name -> integer class label (3-class: human / ai / mixed).
LABELS = {
    "ai": 1,
    "human": 0,
    "mixed": 2,
}

root_dir = "training_data"
dataset = []

for label_folder in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, label_folder)
    if not os.path.isdir(folder_path):
        continue

    label = LABELS.get(label_folder.lower())
    if label is None:
        # Skip folders that are not one of the known class names.
        continue

    for filename in os.listdir(folder_path):
        # FIX: compare case-insensitively so .PDF / .DOCX files are not
        # silently skipped.
        if not filename.lower().endswith((".pdf", ".docx")):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"📄 Extracting: {file_path}")
        try:
            paragraphs = extract_paragraphs(file_path)
            for para in paragraphs:
                if para.strip():
                    dataset.append({
                        "text": para.strip(),
                        "label": label
                    })
        except Exception as e:
            # Best-effort: one unreadable document must not abort the run.
            print(f"❌ Failed: {file_path} — {str(e)}")

# Persist the flat list of {"text", "label"} samples for train_model.py.
with open("ai_training_dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=2, ensure_ascii=False)

print(f"\n✅ Saved {len(dataset)} samples.")
detector/custom_model.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import torch
import nltk
from pathlib import Path
from nltk.tokenize import sent_tokenize
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib import colors

# Sentence-splitter data used by sent_tokenize below.
nltk.download("punkt")

# === Model source selection ===
# USE_HF_MODEL=1 -> pull the published model from the Hugging Face Hub
# (deployment); otherwise load the locally trained checkpoint (development).
USE_HF_MODEL = os.getenv("USE_HF_MODEL") == "1"

if USE_HF_MODEL:
    from huggingface_hub import login

    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        # Authenticate only when a token is provided; public pulls work without.
        login(token=hf_token)

    MODEL_PATH = "AlyanAkram/stealth-roberta"
    tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, token=hf_token)
    model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, token=hf_token)
else:
    MODEL_PATH = "./detector/models/roberta-detector"
    tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
    model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, local_files_only=True)

# Inference-only: eval mode, on GPU when available.
model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
device = next(model.parameters()).device


# A sentence is flagged as AI when P(ai) is at least this value.
AI_THRESHOLD = 0.5

# PDF reports land in <repo root>/reports (created on import if missing).
REPORT_DIR = Path(__file__).resolve().parent.parent / "reports"
REPORT_DIR.mkdir(exist_ok=True)
42
+
43
def analyze_text(text: str):
    """Score every sentence of *text* with the RoBERTa AI/human classifier.

    The text is split into paragraphs on newlines, each paragraph is
    sentence-tokenized, and each sentence is classified independently.

    Returns a dict with the overall AI percentage, sentence counts, and
    per-paragraph lists of (sentence, is_ai, ai_prob) tuples.
    """
    paragraph_results = []
    flagged_count = 0
    sentence_count = 0

    # Non-empty, stripped paragraphs only — blank lines carry no sentences.
    for block in (p.strip() for p in text.split("\n")):
        if not block:
            continue

        scored = []
        for sent in sent_tokenize(block):
            encoded = tokenizer(
                sent, return_tensors="pt", truncation=True, padding=True, max_length=512
            ).to(device)

            with torch.no_grad():
                logits = model(**encoded).logits
            # Class index 1 is the "AI" class.
            ai_prob = torch.nn.functional.softmax(logits, dim=-1)[0][1].item()

            hit = ai_prob >= AI_THRESHOLD
            scored.append((sent, hit, ai_prob))

            sentence_count += 1
            if hit:
                flagged_count += 1

        paragraph_results.append(scored)

    overall = round((flagged_count / sentence_count) * 100, 2) if sentence_count else 0

    return {
        "overall_ai_percent": overall,
        "total_sentences": sentence_count,
        "ai_sentences": flagged_count,
        "results": paragraph_results
    }
77
+
78
def generate_pdf_report(results: dict, filename: str) -> str:
    """Render analyze_text() output as a PDF with AI sentences highlighted.

    Writes REPORT_DIR/<filename>.pdf and returns the bare file name so the
    caller can build a /reports URL.

    FIX: the original ignored the *filename* argument and always wrote a
    fixed placeholder file name, so every report overwrote the previous one
    and the returned URL never matched the uploaded document.
    """
    pdf_path = REPORT_DIR / f"{filename}.pdf"

    c = canvas.Canvas(str(pdf_path), pagesize=A4)
    width, height = A4
    x, y = 40, height - 60
    line_height = 18
    font_size = 12

    # Header
    c.setFont("Helvetica-Bold", 14)
    c.drawString(x, y, f"📄 AI Detection Report: {filename}")
    y -= 25
    c.setFont("Helvetica", 12)
    c.drawString(x, y, f"🧠 AI Detected: {results['overall_ai_percent']}% of {results['total_sentences']} sentences")
    y -= 30
    c.setFont("Helvetica", font_size)

    for para_result in results["results"]:
        if not para_result:
            # Empty paragraph marker: keep the vertical gap.
            y -= line_height
            continue

        for sentence, is_ai, _ in para_result:
            if y < 50:
                # Page break.
                c.showPage()
                y = height - 50
                c.setFont("Helvetica", font_size)

            sentence = sentence.strip()
            if not sentence:
                continue

            # Manual word-wrap: the raw canvas API has no flow layout.
            max_width = width - 80
            words = sentence.split()
            current_line = ""
            for word in words:
                test_line = current_line + " " + word if current_line else word
                if c.stringWidth(test_line, "Helvetica", font_size) > max_width:
                    if is_ai:
                        # Cyan bar behind AI-flagged text.
                        text_width = c.stringWidth(current_line, "Helvetica", font_size)
                        c.setFillColor(colors.cyan)
                        c.rect(x - 2, y - 4, text_width + 4, line_height + 2, fill=True, stroke=False)
                        c.setFillColor(colors.black)

                    c.drawString(x, y, current_line)
                    y -= line_height
                    current_line = word
                else:
                    current_line = test_line

            # Flush the last (possibly only) line of the sentence.
            if current_line:
                if y < 50:
                    c.showPage()
                    y = height - 50
                    c.setFont("Helvetica", font_size)

                if is_ai:
                    text_width = c.stringWidth(current_line, "Helvetica", font_size)
                    c.setFillColor(colors.cyan)
                    c.rect(x - 2, y - 4, text_width + 4, line_height + 2, fill=True, stroke=False)
                    c.setFillColor(colors.black)

                c.drawString(x, y, current_line)
                y -= line_height

        y -= line_height  # Paragraph spacing

    c.save()
    return f"{filename}.pdf"
149
+
detector/detector.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
import torch
import docx
import nltk
from nltk.tokenize import sent_tokenize
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib import colors

nltk.download("punkt")

# Load the locally trained classifier (produced by train_model.py).
model_dir = "./models/roberta-detector"
tokenizer = RobertaTokenizer.from_pretrained(model_dir)
model = RobertaForSequenceClassification.from_pretrained(model_dir)
model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
device = next(model.parameters()).device

# === THRESHOLD CONFIG ===
AI_THRESHOLD = 0.50  # Adjust this as needed for better results

# === Input File ===
# FIX: fail with a usage message instead of an IndexError when no path is given.
if len(sys.argv) < 2:
    sys.exit("Usage: python detector.py <document.docx>")

filepath = sys.argv[1]
filename = os.path.splitext(os.path.basename(filepath))[0]
output_dir = "output_reports"
os.makedirs(output_dir, exist_ok=True)
# FIX: the output path used a hard-coded placeholder instead of the input
# file's base name, so every run overwrote the same report.
output_path = os.path.join(output_dir, f"{filename}_report.pdf")

# === DOCX Reader ===
def read_docx_paragraphs(path):
    """Return every paragraph's raw text (empty strings preserved for spacing)."""
    doc = docx.Document(path)
    return [para.text for para in doc.paragraphs]

paragraphs = read_docx_paragraphs(filepath)

# === Detection Loop ===
results = []
total_sentences = 0
ai_sentences = 0

for paragraph in paragraphs:
    if not paragraph.strip():
        results.append([])  # preserve spacing
        continue

    sentences = sent_tokenize(paragraph)
    para_result = []

    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
            ai_prob = probs[1].item()  # class index 1 = "ai"

        is_ai = ai_prob >= AI_THRESHOLD
        para_result.append((sentence, is_ai, ai_prob))

        total_sentences += 1
        if is_ai:
            ai_sentences += 1

        # Debugging
        print(f"[DEBUG] AI probability: {ai_prob:.2f} — {'✔ Highlight' if is_ai else '✘ Skip'}")

    results.append(para_result)

ai_percent = round((ai_sentences / total_sentences) * 100, 2) if total_sentences else 0

# === PDF Writer ===
c = canvas.Canvas(output_path, pagesize=A4)
width, height = A4
x, y = 40, height - 60
line_height = 18
font_size = 12

# Title (FIX: show the real document name, not a placeholder).
c.setFont("Helvetica-Bold", 14)
c.drawString(x, y, f"📄 AI Detection Report: {filename}")
y -= 25
c.setFont("Helvetica", 12)
c.drawString(x, y, f"🧠 AI Detected: {ai_percent}% of {total_sentences} sentences")
y -= 30
c.setFont("Helvetica", font_size)

# Body rendering
for para_result in results:
    if not para_result:
        y -= line_height
        continue

    for sentence, is_ai, ai_prob in para_result:
        if y < 50:
            c.showPage()
            y = height - 50
            c.setFont("Helvetica", font_size)

        if is_ai:
            # Cyan bar behind AI-flagged sentences. NOTE(review): no word
            # wrapping here — very long sentences can overflow the page
            # width (custom_model.py has the wrapping variant).
            text_width = c.stringWidth(sentence, "Helvetica", font_size)
            c.setFillColor(colors.cyan)
            c.rect(x - 2, y - 4, text_width + 4, line_height + 2, fill=True, stroke=False)
            c.setFillColor(colors.black)

        c.drawString(x, y, sentence)
        y -= line_height

    y -= line_height  # spacing between paragraphs

c.save()
print(f"\n✅ Report saved: {output_path}")
detector/preprocess.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import docx
3
+ import pdfplumber
4
+ import nltk
5
+
6
+ nltk.download("punkt")
7
+ from nltk.tokenize import sent_tokenize
8
+
9
def extract_text_from_docx(path):
    """Return the non-empty, stripped paragraph texts of a .docx file.

    Any extraction failure is reported to stdout and yields [] so callers
    can treat an unreadable document as empty.
    """
    try:
        document = docx.Document(path)
        stripped = (para.text.strip() for para in document.paragraphs)
        return [text for text in stripped if text]
    except Exception as e:
        print(f"❌ Failed to extract DOCX: {e}")
        return []
17
+
18
def extract_text_from_pdf(path):
    """Extract paragraph-like chunks of text from a PDF.

    Pages are read with pdfplumber and joined with newlines. Blank-line
    separated paragraphs are returned when present; otherwise the text is
    chunked into windows of five sentences. Returns [] when nothing can be
    extracted (or on any failure, which is reported to stdout).
    """
    try:
        with pdfplumber.open(path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
            all_text = "\n".join(t for t in page_texts if t)
    except Exception as e:
        print(f"❌ Failed to extract PDF: {e}")
        return []

    if not all_text.strip():
        return []

    # Prefer natural paragraph breaks (blank lines).
    chunks = [part.strip() for part in all_text.split("\n\n") if part.strip()]
    if chunks:
        return chunks

    # Fallback: fixed-size windows of five sentences.
    sentences = sent_tokenize(all_text)
    return [" ".join(sentences[i:i + 5]) for i in range(0, len(sentences), 5)]
39
+
40
def extract_paragraphs(path):
    """Dispatch paragraph extraction by file extension (.docx or .pdf).

    Raises ValueError for any other extension.
    """
    ext = os.path.splitext(path)[-1].lower()
    if ext == ".docx":
        return extract_text_from_docx(path)
    if ext == ".pdf":
        return extract_text_from_pdf(path)
    raise ValueError(f"Unsupported file type: {ext}")
detector/requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers
+ torch
+ scikit-learn
+ pdfplumber
+ python-docx
+ nltk
+ datasets
+ fastapi
+ uvicorn
+ PyPDF2
detector/train_model.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import torch
from datasets import Dataset
import evaluate
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments

# Load the JSON dataset produced by create_dataset.py.
with open("ai_training_dataset.json", "r", encoding="utf-8") as f:
    records = json.load(f)

# Coerce every label to int (0 = human, 1 = ai, 2 = mixed).
for record in records:
    record["label"] = int(record["label"])

# 80/20 train/eval split as HuggingFace Datasets.
splits = Dataset.from_list(records).train_test_split(test_size=0.2)
train_split = splits["train"]
eval_split = splits["test"]

# Tokenizer for the base checkpoint.
base_model = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(base_model)

def tokenize(example):
    """Pad/truncate each text to a fixed 512-token window."""
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

train_split = train_split.map(tokenize, batched=True)
eval_split = eval_split.map(tokenize, batched=True)

# Expose only the tensors the model consumes.
for split in (train_split, eval_split):
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Three-way classification head on top of RoBERTa.
model = RobertaForSequenceClassification.from_pretrained(base_model, num_labels=3)

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Accuracy over argmax class predictions."""
    logits, labels = eval_pred
    predictions = torch.tensor(logits).argmax(dim=1)
    return accuracy.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./models/roberta-detector",
    evaluation_strategy="epoch",  # must match save_strategy for best-model loading
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",  # keep W&B and other trackers out of the run
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

trainer.train()

# Persist weights + tokenizer where detector.py / custom_model.py load them.
model.save_pretrained("./models/roberta-detector")
tokenizer.save_pretrained("./models/roberta-detector")

print("✅ Model trained and saved.")
detector/utils.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # detector/utils.py
2
+
3
+ from PyPDF2 import PdfReader
4
+ import docx
5
+
6
async def extract_text_from_file(file):
    """Extract plain text from an uploaded .pdf, .docx, or .txt file.

    Dispatches on the (lower-cased) file name extension; raises ValueError
    for any other extension.
    """
    name = file.filename.lower()

    if name.endswith(".pdf"):
        pages = PdfReader(file.file).pages
        return "\n".join([page.extract_text() or "" for page in pages])

    if name.endswith(".docx"):
        paragraphs = docx.Document(file.file).paragraphs
        return "\n".join([para.text for para in paragraphs])

    if name.endswith(".txt"):
        raw = await file.read()
        return raw.decode("utf-8")

    raise ValueError("Unsupported file type.")
main.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, File
2
+ from fastapi.responses import JSONResponse
3
+ from fastapi.staticfiles import StaticFiles
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+
6
+ from PyPDF2 import PdfReader
7
+ import docx
8
+ from docx.enum.text import WD_COLOR_INDEX
9
+ from io import BytesIO
10
+ import os
11
+ import uvicorn
12
+
13
# === App setup ===
app = FastAPI()

# ✅ CORS
# FIX: allowed origins must match the browser's Origin header exactly, and
# Origin headers never carry a trailing slash — the previous entry
# "https://stealth-writer.vercel.app/" could never match, so production
# requests were blocked by CORS.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:5173", "https://stealth-writer.vercel.app"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# === Global lazy-load model vars ===
# The heavyweight model import is deferred to startup so this module can be
# imported without loading RoBERTa.
analyze_text = None
generate_pdf_report = None

@app.on_event("startup")
async def load_model():
    """Bind the detector's analyze/report functions once the app starts."""
    global analyze_text, generate_pdf_report
    from detector.custom_model import analyze_text as at, generate_pdf_report as gpr
    analyze_text = at
    generate_pdf_report = gpr

# === Paths ===
REPORTS_DIR = os.path.join(os.path.dirname(__file__), "reports")
os.makedirs(REPORTS_DIR, exist_ok=True)

# Generated reports are served statically at /reports/<name>.
app.mount("/reports", StaticFiles(directory=REPORTS_DIR), name="reports")
41
+
42
# === File text extraction ===
def extract_text(file: UploadFile, ext: str) -> str:
    """Read an uploaded file and return its text content.

    Supports .txt (utf-8, errors ignored), .pdf (PyPDF2), and .docx
    (python-docx); raises ValueError for anything else.
    """
    raw = file.file.read()
    buffer = BytesIO(raw)

    if ext == ".txt":
        return raw.decode("utf-8", errors="ignore")
    if ext == ".pdf":
        pages = PdfReader(buffer).pages
        return "".join([page.extract_text() or "" for page in pages])
    if ext == ".docx":
        paragraphs = docx.Document(buffer).paragraphs
        return "\n".join([para.text for para in paragraphs])
    raise ValueError("Unsupported file type")
57
+
58
# === Main endpoint ===
@app.post("/api/detect")
async def detect(file: UploadFile = File(...)):
    """Analyze an uploaded document and return per-sentence AI scores.

    Also writes a cyan-highlighted DOCX report and a PDF report under
    /reports and returns their URLs. Any failure is returned as
    {"success": False, "error": ...} with HTTP 500.
    """
    try:
        extension = os.path.splitext(file.filename)[1].lower()
        if extension not in (".txt", ".pdf", ".docx"):
            raise ValueError("Unsupported file format")

        # Extract + Analyze
        result = analyze_text(extract_text(file, extension))

        # === Save DOCX report ===
        base_name = os.path.splitext(file.filename)[0]
        docx_filename = f"{base_name}_report.docx"
        docx_path = os.path.join(REPORTS_DIR, docx_filename)

        report = docx.Document()
        report.add_heading("AI Detection Summary", level=1)
        report.add_paragraph(f"Overall AI %: {result['overall_ai_percent']}%")
        report.add_paragraph(f"Total Sentences: {result['total_sentences']}")
        report.add_paragraph(f"AI Sentences: {result['ai_sentences']}")
        report.add_paragraph("Sentences detected as AI are highlighted in cyan.\n")
        report.add_heading("Sentence Analysis", level=2)

        # One flowing paragraph; AI-flagged runs get a turquoise highlight.
        body = report.add_paragraph()
        for para in result["results"]:
            for sentence, is_ai, _ in para:
                if not isinstance(sentence, str) or not sentence.strip():
                    continue
                run = body.add_run(sentence + " ")
                if is_ai:
                    run.font.highlight_color = WD_COLOR_INDEX.TURQUOISE

        report.save(docx_path)

        # === Save PDF report (uses ReportLab) ===
        pdf_filename = generate_pdf_report(result, base_name)

        # JSON-friendly copy of the result with scores as percentages.
        score = {k: v for k, v in result.items() if k != "results"}
        score["results"] = [
            [
                {"sentence": s, "is_ai": flagged, "ai_score": round(prob * 100, 2)}
                for s, flagged, prob in para
            ]
            for para in result["results"]
        ]

        return {
            "success": True,
            "score": score,
            "docx_url": f"/reports/{docx_filename}",
            "pdf_url": f"/reports/{pdf_filename}"
        }

    except Exception as e:
        return JSONResponse(content={"success": False, "error": str(e)}, status_code=500)
112
+
113
# === Port binding for Render ===
if __name__ == "__main__":
    # Render injects PORT; fall back to 7860 for local runs.
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run("main:app", host="0.0.0.0", port=port)
requirements.txt ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.8.1
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.12.13
4
+ aiosignal==1.3.2
5
+ annotated-types==0.7.0
6
+ anyio==4.9.0
7
+ async-timeout==5.0.1
8
+ attrs==25.3.0
9
+ certifi==2025.6.15
10
+ cffi==1.17.1
11
+ charset-normalizer==3.4.2
12
+ click==8.2.1
13
+ colorama==0.4.6
14
+ cryptography==45.0.4
15
+ datasets==3.6.0
16
+ defusedxml==0.7.1
17
+ deprecation==2.1.0
18
+ dill==0.3.8
19
+ evaluate==0.4.4
20
+ exceptiongroup==1.3.0
21
+ fastapi==0.115.13
22
+ filelock==3.18.0
23
+ fonttools==4.58.4
24
+ fpdf2==2.7.8
25
+ frozenlist==1.7.0
26
+ fsspec==2025.3.0
27
+ gotrue==2.12.0
28
+ greenlet==3.2.3
29
+ h11==0.16.0
30
+ h2==4.2.0
31
+ hpack==4.1.0
32
+ httpcore==1.0.9
33
+ httpx==0.28.1
34
+ huggingface-hub==0.33.0
35
+ hyperframe==6.1.0
36
+ idna==3.10
37
+ iniconfig==2.1.0
38
+ Jinja2==3.1.6
39
+ joblib==1.5.1
40
+ lxml==5.4.0
41
+ MarkupSafe==3.0.2
42
+ mpmath==1.3.0
43
+ multidict==6.5.0
44
+ multiprocess==0.70.16
45
+ networkx==3.4.2
46
+ nltk==3.8.1
47
+ numpy==2.2.6
48
+ packaging==25.0
49
+ pandas==2.3.0
50
+ pdfminer.six==20250506
51
+ pdfplumber==0.11.7
52
+ pillow==11.0.0
53
+ pluggy==1.6.0
54
+ postgrest==1.0.2
55
+ propcache==0.3.2
56
+ psutil==7.0.0
57
+ pyarrow==20.0.0
58
+ pycparser==2.22
59
+ pydantic==2.11.7
60
+ pydantic_core==2.33.2
61
+ pyee==13.0.0
62
+ Pygments==2.19.1
63
+ PyJWT==2.10.1
64
+ PyMuPDF==1.24.2
65
+ PyPDF2==3.0.1
66
+ pypdfium2==4.30.1
67
+ pytest==8.4.1
68
+ pytest-mock==3.14.1
69
+ python-dateutil==2.9.0.post0
70
+ python-docx==1.2.0
71
+ python-dotenv==1.1.1
72
+ python-multipart==0.0.20
73
+ pytz==2025.2
74
+ PyYAML==6.0.2
75
+ realtime==2.4.3
76
+ regex==2024.11.6
77
+ reportlab==4.4.2
78
+ requests==2.32.4
79
+ safetensors==0.5.3
80
+ scikit-learn==1.7.0
81
+ scipy==1.15.3
82
+ six==1.17.0
83
+ sniffio==1.3.1
84
+ starlette==0.46.2
85
+ storage3==0.11.3
86
+ StrEnum==0.4.15
87
+ supabase==2.15.3
88
+ sympy==1.13.1
89
+ threadpoolctl==3.6.0
90
+ tomli==2.2.1
91
+ torch==2.5.1
92
+ tqdm==4.67.1
93
+ transformers==4.41.1
94
+ typing-inspection==0.4.1
95
+ typing_extensions==4.14.0
96
+ tzdata==2025.2
97
+ urllib3==2.5.0
98
+ uvicorn==0.34.3
99
+ websockets==14.2
100
+ xxhash==3.5.0
101
+ yarl==1.20.1