Pujan-Dev committed on
Commit
31fda96
·
1 Parent(s): 47c1352

update: updated the config and text_classifier

Browse files
Procfile DELETED
@@ -1 +0,0 @@
1
- web: uvicorn app:app --host 0.0.0.0 --port ${PORT:-8000}
 
 
app.py CHANGED
@@ -1,22 +1,23 @@
 
 
 
1
  from fastapi import FastAPI, Request
 
 
2
  from slowapi import Limiter, _rate_limit_exceeded_handler
3
- from fastapi.responses import FileResponse
4
- from slowapi.middleware import SlowAPIMiddleware
5
  from slowapi.errors import RateLimitExceeded
 
6
  from slowapi.util import get_remote_address
7
- from fastapi.responses import JSONResponse
8
- from features.text_classifier.routes import router as text_classifier_router
 
 
9
  from features.nepali_text_classifier.routes import (
10
  router as nepali_text_classifier_router,
11
  )
12
- from features.image_classifier.routes import router as image_classifier_router
13
- from features.image_edit_detector.routes import router as image_edit_detector_router
14
- from fastapi.staticfiles import StaticFiles
15
-
16
- from config import ACCESS_RATE
17
-
18
- import requests
19
 
 
20
  limiter = Limiter(key_func=get_remote_address, default_limits=[ACCESS_RATE])
21
 
22
  app = FastAPI()
 
1
+ import warnings
2
+
3
+ import requests
4
  from fastapi import FastAPI, Request
5
+ from fastapi.responses import FileResponse, JSONResponse
6
+ from fastapi.staticfiles import StaticFiles
7
  from slowapi import Limiter, _rate_limit_exceeded_handler
 
 
8
  from slowapi.errors import RateLimitExceeded
9
+ from slowapi.middleware import SlowAPIMiddleware
10
  from slowapi.util import get_remote_address
11
+
12
+ from config import ACCESS_RATE
13
+ from features.image_classifier.routes import router as image_classifier_router
14
+ from features.image_edit_detector.routes import router as image_edit_detector_router
15
  from features.nepali_text_classifier.routes import (
16
  router as nepali_text_classifier_router,
17
  )
18
+ from features.text_classifier.routes import router as text_classifier_router
 
 
 
 
 
 
19
 
20
+ warnings.filterwarnings("ignore")
21
  limiter = Limiter(key_func=get_remote_address, default_limits=[ACCESS_RATE])
22
 
23
  app = FastAPI()
config.py CHANGED
@@ -1,2 +1,14 @@
 
 
 
 
 
 
1
  ACCESS_RATE = "20/minute"
2
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import dotenv
4
+
5
+ dotenv.load_dotenv()
6
+
7
  ACCESS_RATE = "20/minute"
8
 
9
+
10
+ class Config:
11
+ Nepali_model_folder = os.getenv("Nepali_model")
12
+ English_model_folder = os.getenv("English_model")
13
+ REPO_ID_LANG = os.getenv("English_model")
14
+ LANG_MODEL = os.getenv("LANG_MODEL")
features/text_classifier/controller.py CHANGED
@@ -1,16 +1,34 @@
1
- import os
2
  import asyncio
3
  import logging
 
4
  from io import BytesIO
5
 
6
- from fastapi import HTTPException, UploadFile, status, Depends
7
- from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
8
 
9
- from .inferencer import classify_text
10
  from .preprocess import parse_docx, parse_pdf, parse_txt
11
- import spacy
12
  security = HTTPBearer()
13
- nlp = spacy.load("en_core_web_sm")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # Verify Bearer token from Authorization header
16
  async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
@@ -18,32 +36,42 @@ async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(secur
18
  expected_token = os.getenv("MY_SECRET_TOKEN")
19
  if token != expected_token:
20
  raise HTTPException(
21
- status_code=status.HTTP_403_FORBIDDEN,
22
- detail="Invalid or expired token"
23
  )
24
  return token
25
 
 
26
  # Classify plain text input
27
  async def handle_text_analysis(text: str):
28
  text = text.strip()
29
  if not text or len(text.split()) < 10:
30
- raise HTTPException(status_code=400, detail="Text must contain at least 10 words")
31
- if len(text) > 10000:
32
- raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
 
 
 
 
33
 
34
  label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, text)
 
35
  return {
36
  "result": label,
37
  "perplexity": round(perplexity, 2),
38
- "ai_likelihood": ai_likelihood
 
39
  }
40
 
 
41
  # Extract text from uploaded files (.docx, .pdf, .txt)
42
  async def extract_file_contents(file: UploadFile) -> str:
43
  content = await file.read()
44
  file_stream = BytesIO(content)
45
 
46
- if file.content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
 
 
 
47
  return parse_docx(file_stream)
48
  elif file.content_type == "application/pdf":
49
  return parse_pdf(file_stream)
@@ -52,79 +80,83 @@ async def extract_file_contents(file: UploadFile) -> str:
52
  else:
53
  raise HTTPException(
54
  status_code=415,
55
- detail="Invalid file type. Only .docx, .pdf and .txt are allowed."
56
  )
57
 
 
58
  # Classify text from uploaded file
59
  async def handle_file_upload(file: UploadFile):
60
  try:
61
  file_contents = await extract_file_contents(file)
62
- if len(file_contents) > 10000:
63
- return {"status_code": 413, "detail": "Text must be less than 10,000 characters"}
 
 
 
 
64
 
65
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
66
  if not cleaned_text:
67
- raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")
 
 
 
68
  # print(f"Cleaned text: '{cleaned_text}'") # Debugging statement
69
- label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, cleaned_text)
 
 
70
  return {
71
  "content": file_contents,
72
  "result": label,
73
  "perplexity": round(perplexity, 2),
74
- "ai_likelihood": ai_likelihood
75
  }
76
  except Exception as e:
77
  logging.error(f"Error processing file: {e}")
78
  raise HTTPException(status_code=500, detail="Error processing the file")
79
 
80
 
81
-
82
  async def handle_sentence_level_analysis(text: str):
83
  text = text.strip()
84
- if not text.endswith("."):
85
- text += "."
86
-
87
- if len(text) > 10000:
88
- raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
89
-
90
- doc = nlp(text)
91
- sentences = [sent.text.strip() for sent in doc.sents]
92
-
93
- results = []
94
- for sentence in sentences:
95
- if not sentence:
96
- continue
97
- label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, sentence)
98
- results.append({
99
- "sentence": sentence,
100
- "label": label,
101
- "perplexity": round(perplexity, 2),
102
- "ai_likelihood": ai_likelihood
103
- })
104
 
105
- return {"analysis": results}
106
 
107
  # Analyze each sentence from uploaded file
108
  async def handle_file_sentence(file: UploadFile):
109
  try:
110
  file_contents = await extract_file_contents(file)
111
- if len(file_contents) > 10000:
112
  # raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
113
- return {"status_code": 413, "detail": "Text must be less than 10,000 characters"}
 
 
 
114
 
115
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
116
  if not cleaned_text:
117
- raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")
 
 
 
118
 
119
  result = await handle_sentence_level_analysis(cleaned_text)
120
- return {
121
- "content": file_contents,
122
- **result
123
- }
124
  except Exception as e:
125
  logging.error(f"Error processing file: {e}")
126
  raise HTTPException(status_code=500, detail="Error processing the file")
127
 
 
128
  def classify(text: str):
129
  return classify_text(text)
130
-
 
 
1
  import asyncio
2
  import logging
3
+ import os
4
  from io import BytesIO
5
 
6
+ from fastapi import Depends, HTTPException, UploadFile, status
7
+ from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
8
 
9
+ from .inferencer import analyze_text_with_sentences, classify_text
10
  from .preprocess import parse_docx, parse_pdf, parse_txt
11
+
12
  security = HTTPBearer()
13
+
14
+
15
+ def build_bias_summary(ai_likelihood: float) -> dict[str, object]:
16
+ """Convert an AI likelihood score into a human-readable bias summary."""
17
+ if ai_likelihood > 50:
18
+ overall_bias = "AI"
19
+ bias_statement = f"The text is biased toward AI-generated writing ({ai_likelihood}% AI likelihood)."
20
+ elif ai_likelihood < 50:
21
+ overall_bias = "Human"
22
+ bias_statement = f"The text is biased toward human writing ({100 - ai_likelihood}% human likelihood)."
23
+ else:
24
+ overall_bias = "Balanced"
25
+ bias_statement = "The text is balanced between AI and human writing."
26
+
27
+ return {
28
+ "overall_bias": overall_bias,
29
+ "bias_statement": bias_statement,
30
+ }
31
+
32
 
33
  # Verify Bearer token from Authorization header
34
  async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
 
36
  expected_token = os.getenv("MY_SECRET_TOKEN")
37
  if token != expected_token:
38
  raise HTTPException(
39
+ status_code=status.HTTP_403_FORBIDDEN, detail="Invalid or expired token"
 
40
  )
41
  return token
42
 
43
+
44
  # Classify plain text input
45
  async def handle_text_analysis(text: str):
46
  text = text.strip()
47
  if not text or len(text.split()) < 10:
48
+ raise HTTPException(
49
+ status_code=400, detail="Text must contain at least 10 words"
50
+ )
51
+ if len(text) > 50000:
52
+ raise HTTPException(
53
+ status_code=413, detail="Text must be less than 50,000 characters"
54
+ )
55
 
56
  label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, text)
57
+ bias_summary = build_bias_summary(ai_likelihood)
58
  return {
59
  "result": label,
60
  "perplexity": round(perplexity, 2),
61
+ "ai_likelihood": ai_likelihood,
62
+ **bias_summary,
63
  }
64
 
65
+
66
  # Extract text from uploaded files (.docx, .pdf, .txt)
67
  async def extract_file_contents(file: UploadFile) -> str:
68
  content = await file.read()
69
  file_stream = BytesIO(content)
70
 
71
+ if (
72
+ file.content_type
73
+ == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
74
+ ):
75
  return parse_docx(file_stream)
76
  elif file.content_type == "application/pdf":
77
  return parse_pdf(file_stream)
 
80
  else:
81
  raise HTTPException(
82
  status_code=415,
83
+ detail="Invalid file type. Only .docx, .pdf and .txt are allowed.",
84
  )
85
 
86
+
87
  # Classify text from uploaded file
88
  async def handle_file_upload(file: UploadFile):
89
  try:
90
  file_contents = await extract_file_contents(file)
91
+ logging.info(f"Extracted text length: {len(file_contents)} characters")
92
+ if len(file_contents) > 50000:
93
+ return {
94
+ "status_code": 413,
95
+ "detail": "Text must be less than 50,000 characters",
96
+ }
97
 
98
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
99
  if not cleaned_text:
100
+ raise HTTPException(
101
+ status_code=400,
102
+ detail="The uploaded file is empty or only contains whitespace.",
103
+ )
104
  # print(f"Cleaned text: '{cleaned_text}'") # Debugging statement
105
+ label, perplexity, ai_likelihood = await asyncio.to_thread(
106
+ classify_text, cleaned_text
107
+ )
108
  return {
109
  "content": file_contents,
110
  "result": label,
111
  "perplexity": round(perplexity, 2),
112
+ "ai_likelihood": ai_likelihood,
113
  }
114
  except Exception as e:
115
  logging.error(f"Error processing file: {e}")
116
  raise HTTPException(status_code=500, detail="Error processing the file")
117
 
118
 
 
119
  async def handle_sentence_level_analysis(text: str):
120
  text = text.strip()
121
+ if not text or len(text.split()) < 10:
122
+ raise HTTPException(
123
+ status_code=400, detail="Text must contain at least 10 words"
124
+ )
125
+ if len(text) > 50000:
126
+ raise HTTPException(
127
+ status_code=413, detail="Text must be less than 50,000 characters"
128
+ )
129
+
130
+ result = await asyncio.to_thread(analyze_text_with_sentences, text)
131
+ return result
 
 
 
 
 
 
 
 
 
132
 
 
133
 
134
  # Analyze each sentence from uploaded file
135
  async def handle_file_sentence(file: UploadFile):
136
  try:
137
  file_contents = await extract_file_contents(file)
138
+ if len(file_contents) > 50000:
139
  # raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
140
+ return {
141
+ "status_code": 413,
142
+ "detail": "Text must be less than 50,000 characters",
143
+ }
144
 
145
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
146
  if not cleaned_text:
147
+ raise HTTPException(
148
+ status_code=400,
149
+ detail="The uploaded file is empty or only contains whitespace.",
150
+ )
151
 
152
  result = await handle_sentence_level_analysis(cleaned_text)
153
+ return {"content": file_contents, **result}
154
+ except HTTPException:
155
+ raise
 
156
  except Exception as e:
157
  logging.error(f"Error processing file: {e}")
158
  raise HTTPException(status_code=500, detail="Error processing the file")
159
 
160
+
161
  def classify(text: str):
162
  return classify_text(text)
 
features/text_classifier/inferencer.py CHANGED
@@ -1,40 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
- from .model_loader import get_model_tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
5
 
6
- def perplexity_to_ai_likelihood(ppl: float) -> float:
7
- # You can tune these parameters
8
- min_ppl = 10 # very confident it's AI
9
- max_ppl = 100 # very confident it's human
10
 
11
- # Clamp to bounds
12
- ppl = max(min_ppl, min(ppl, max_ppl))
 
 
 
 
13
 
14
- # Invert and scale: lower perplexity -> higher AI-likelihood
15
- likelihood = 1 - ((ppl - min_ppl) / (max_ppl - min_ppl))
16
 
17
- return round(likelihood * 100, 2)
 
 
 
 
 
 
 
 
 
18
 
 
 
 
 
 
 
 
 
19
 
20
- def classify_text(text: str):
21
- model, tokenizer = get_model_tokenizer()
22
- inputs = tokenizer(text, return_tensors="pt",
23
- truncation=True, padding=True)
24
- input_ids = inputs["input_ids"].to(device)
25
- attention_mask = inputs["attention_mask"].to(device)
26
 
27
- with torch.no_grad():
28
- outputs = model(
29
- input_ids, attention_mask=attention_mask, labels=input_ids)
30
- loss = outputs.loss
31
- perplexity = torch.exp(loss).item()
32
 
33
- if perplexity < 55:
34
- result = "AI-generated"
35
- elif perplexity < 80:
36
- result = "Probably AI-generated"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  else:
38
- result = "Human-written"
39
- likelihood_result=perplexity_to_ai_likelihood(perplexity)
40
- return result, perplexity,likelihood_result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from functools import lru_cache
5
+ import logging
6
+ import random
7
+ from typing import Any
8
+
9
+ import nltk
10
+ import numpy as np
11
+ from scipy.sparse import csr_matrix, hstack
12
  import torch
13
+ from transformers import AutoModelForCausalLM, AutoTokenizer
14
+
15
+ from features.text_classifier.model_loader import load_model
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ for resource in ("tokenizers/punkt", "tokenizers/punkt_tab"):
21
+ try:
22
+ nltk.data.find(resource)
23
+ except LookupError:
24
+ nltk.download(resource.split("/")[-1], quiet=True)
25
+
26
+
27
+ try:
28
+ import textstat
29
+ except ImportError:
30
+ textstat = None
31
+
32
+
33
+ @dataclass
34
+ class SentenceBlendConfig:
35
+ sentence_blend_weight: float = 0.70
36
+ sentence_to_doc_bias: float = 0.35
37
+ max_sentence_blend_weight: float = 0.90
38
+ max_sentence_to_doc_bias: float = 0.80
39
+ random_deviation_pct: float = 2.0
40
+
41
+
42
+ class PerplexityCalculator:
43
+ """Lazy-loaded perplexity calculator for distilgpt2."""
44
+
45
+ def __init__(self, model_name: str = "distilgpt2"):
46
+ self.model_name = model_name
47
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
48
+ self._tokenizer = None
49
+ self._model = None
50
+
51
+ def _load(self) -> None:
52
+ if self._model is not None and self._tokenizer is not None:
53
+ return
54
+
55
+ logger.info("Loading perplexity model: %s", self.model_name)
56
+ self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
57
+ self._model = AutoModelForCausalLM.from_pretrained(self.model_name).to(self.device)
58
+ self._model.eval()
59
+ logger.info("Perplexity model loaded on %s", self.device)
60
+
61
+ def calculate(self, text: str, max_length: int = 512) -> float:
62
+ try:
63
+ self._load()
64
+ encodings = self._tokenizer(
65
+ text,
66
+ return_tensors="pt",
67
+ truncation=True,
68
+ max_length=max_length,
69
+ )
70
+ input_ids = encodings.input_ids.to(self.device)
71
+
72
+ with torch.no_grad():
73
+ outputs = self._model(input_ids, labels=input_ids)
74
+ loss = outputs.loss
75
+ perplexity = torch.exp(loss).item()
76
+
77
+ return min(float(perplexity), 10000.0)
78
+ except Exception as exc:
79
+ logger.warning("Perplexity fallback used due to error: %s", exc)
80
+ return 100.0
81
+
82
+
83
+ _perplexity_calc = PerplexityCalculator()
84
+
85
+
86
+ @lru_cache(maxsize=20000)
87
+ def _cached_perplexity(cleaned_text: str) -> float:
88
+ return _perplexity_calc.calculate(cleaned_text)
89
+
90
+
91
+ @lru_cache(maxsize=1)
92
+ def _get_model_artifacts() -> tuple[Any, Any, Any, Any, list[str], dict[str, Any]]:
93
+ return load_model()
94
+
95
 
96
+ def normalize_text(text: str) -> str:
97
+ return " ".join(str(text).split()).strip()
98
 
 
 
 
 
99
 
100
+ def split_into_sentences(text: str) -> list[str]:
101
+ cleaned = normalize_text(text)
102
+ if not cleaned:
103
+ return []
104
+ sentences = [s.strip() for s in nltk.sent_tokenize(cleaned) if s.strip()]
105
+ return sentences if sentences else [cleaned]
106
 
 
 
107
 
108
+ def extract_burstiness_features(text: str) -> dict[str, float]:
109
+ sentences = split_into_sentences(text)
110
+ if not sentences:
111
+ return {
112
+ "burst_mean": 0.0,
113
+ "burst_std": 0.0,
114
+ "burst_max": 0.0,
115
+ "burst_min": 0.0,
116
+ "burst_range": 0.0,
117
+ }
118
 
119
+ lengths = np.array([len(s.split()) for s in sentences], dtype=float)
120
+ return {
121
+ "burst_mean": float(np.mean(lengths)),
122
+ "burst_std": float(np.std(lengths)),
123
+ "burst_max": float(np.max(lengths)),
124
+ "burst_min": float(np.min(lengths)),
125
+ "burst_range": float(np.max(lengths) - np.min(lengths)),
126
+ }
127
 
 
 
 
 
 
 
128
 
129
+ def extract_stylometry_features(text: str) -> dict[str, float]:
130
+ words = text.split()
131
+ num_words = len(words)
132
+ num_chars = len(text)
133
+ num_sentences = max(len(split_into_sentences(text)), 1)
134
 
135
+ avg_word_len = float(np.mean([len(w) for w in words])) if words else 0.0
136
+ avg_sent_len = float(num_words / num_sentences)
137
+
138
+ unique_words = len(set(words))
139
+ lexical_diversity = float(unique_words / num_words) if num_words > 0 else 0.0
140
+
141
+ num_punct = sum(1 for c in text if c in ".,!?;:")
142
+ punct_ratio = float(num_punct / num_chars) if num_chars > 0 else 0.0
143
+
144
+ num_caps = sum(1 for c in text if c.isupper())
145
+ caps_ratio = float(num_caps / num_chars) if num_chars > 0 else 0.0
146
+
147
+ if textstat is not None:
148
+ try:
149
+ flesch_reading = float(textstat.flesch_reading_ease(text))
150
+ flesch_grade = float(textstat.flesch_kincaid_grade(text))
151
+ except Exception:
152
+ flesch_reading = 50.0
153
+ flesch_grade = 8.0
154
+ else:
155
+ flesch_reading = 50.0
156
+ flesch_grade = 8.0
157
+
158
+ return {
159
+ "num_words": float(num_words),
160
+ "num_chars": float(num_chars),
161
+ "num_sentences": float(num_sentences),
162
+ "avg_word_len": avg_word_len,
163
+ "avg_sent_len": avg_sent_len,
164
+ "lexical_diversity": lexical_diversity,
165
+ "punct_ratio": punct_ratio,
166
+ "caps_ratio": caps_ratio,
167
+ "flesch_reading": flesch_reading,
168
+ "flesch_grade": flesch_grade,
169
+ }
170
+
171
+
172
+ def extract_all_features(text: str, calc_perplexity: bool = True) -> dict[str, float]:
173
+ cleaned = normalize_text(text)
174
+ features: dict[str, float] = {}
175
+
176
+ if calc_perplexity:
177
+ features["perplexity"] = _cached_perplexity(cleaned)
178
+ else:
179
+ features["perplexity"] = 100.0
180
+
181
+ features.update(extract_burstiness_features(cleaned))
182
+ features.update(extract_stylometry_features(cleaned))
183
+ return features
184
+
185
+
186
+ def _predict_ai_probability(text: str) -> tuple[float, float]:
187
+ (
188
+ loaded_classifier,
189
+ loaded_scaler,
190
+ loaded_word_vectorizer,
191
+ loaded_char_vectorizer,
192
+ loaded_features,
193
+ loaded_metadata,
194
+ ) = _get_model_artifacts()
195
+
196
+ calc_perplexity = bool(loaded_metadata.get("num_engineered_features", 0) > 0)
197
+ features = extract_all_features(text, calc_perplexity=calc_perplexity)
198
+
199
+ feature_vector = np.array([features[name] for name in loaded_features], dtype=float).reshape(1, -1)
200
+ feature_scaled = loaded_scaler.transform(feature_vector)
201
+
202
+ word_vec = loaded_word_vectorizer.transform([text])
203
+ char_vec = loaded_char_vectorizer.transform([text])
204
+ num_vec = csr_matrix(feature_scaled)
205
+ hybrid_vec = hstack([word_vec, char_vec, num_vec], format="csr")
206
+
207
+ if hasattr(loaded_classifier, "predict_proba"):
208
+ proba = loaded_classifier.predict_proba(hybrid_vec)[0]
209
+ ai_prob = float(proba[1])
210
  else:
211
+ score = float(loaded_classifier.decision_function(hybrid_vec)[0])
212
+ ai_prob = float(1.0 / (1.0 + np.exp(-score)))
213
+
214
+ perplexity = float(features.get("perplexity", 100.0))
215
+ return ai_prob, perplexity
216
+
217
+
218
+ def classify_text(text: str) -> tuple[str, float, float]:
219
+ """Return (label, perplexity, ai_likelihood_percent)."""
220
+ cleaned = normalize_text(text)
221
+ if not cleaned:
222
+ raise ValueError("Input text is empty")
223
+
224
+ ai_prob, perplexity = _predict_ai_probability(cleaned)
225
+ ai_likelihood = round(ai_prob * 100.0, 2)
226
+ label = "AI" if ai_likelihood >= 50.0 else "Human"
227
+ return label, perplexity, ai_likelihood
228
+
229
+
230
+ def analyze_text_with_sentences(
231
+ text: str,
232
+ ) -> dict[str, Any]:
233
+ text = normalize_text(text)
234
+ overall_classification, overall_perplexity, overall_ai_likelihood = classify_text(text)
235
+ sentences = split_into_sentences(text)
236
+ if not sentences:
237
+ raise ValueError("Input text contains no valid sentences")
238
+ # do the per-sentence analysis
239
+ sentence_results = []
240
+ for sentence in sentences:
241
+ try:
242
+ label, perplexity, ai_likelihood = classify_text(sentence)
243
+ sentence_results.append(
244
+ {
245
+ "sentence": sentence,
246
+ "label": label,
247
+ "perplexity": perplexity,
248
+ "ai_likelihood": ai_likelihood,
249
+ }
250
+ )
251
+ except Exception as exc:
252
+ logger.warning("Error analyzing sentence: %s", exc)
253
+ sentence_results.append(
254
+ {
255
+ "sentence": sentence,
256
+ "label": "Error",
257
+ "perplexity": None,
258
+ "ai_likelihood": None,
259
+ }
260
+ )
261
+ return{
262
+ "sentences": sentence_results,
263
+ "summary": {
264
+ "overall": {
265
+ "label": overall_classification,
266
+ "perplexity": overall_perplexity,
267
+ "ai_likelihood": overall_ai_likelihood,
268
+ }
269
+ },
270
+
271
+ }
272
+
features/text_classifier/model_loader.py CHANGED
@@ -1,30 +1,36 @@
 
 
1
  import os
 
2
  import shutil
3
- import logging
4
- from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
5
- from huggingface_hub import snapshot_download
6
  import torch
7
- from dotenv import load_dotenv
8
- load_dotenv()
9
- REPO_ID = "can-org/AI-Content-Checker"
10
- MODEL_DIR = "./models"
11
- TOKENIZER_DIR = os.path.join(MODEL_DIR, "model")
12
- WEIGHTS_PATH = os.path.join(MODEL_DIR, "model_weights.pth")
13
 
14
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
  _model, _tokenizer = None, None
16
 
17
 
18
  def warmup():
19
- global _model, _tokenizer
20
- # Ensure punkt is available
 
 
 
 
21
  download_model_repo()
22
- _model, _tokenizer = load_model()
23
- logging.info("Its ready")
24
 
25
 
26
  def download_model_repo():
27
- if os.path.exists(MODEL_DIR) and os.path.isdir(MODEL_DIR):
 
 
28
  logging.info("Model already exists, skipping download.")
29
  return
30
  snapshot_path = snapshot_download(repo_id=REPO_ID)
@@ -33,18 +39,31 @@ def download_model_repo():
33
 
34
 
35
  def load_model():
36
- tokenizer = GPT2TokenizerFast.from_pretrained(TOKENIZER_DIR)
37
- config = GPT2Config.from_pretrained(TOKENIZER_DIR)
38
- model = GPT2LMHeadModel(config)
39
- model.load_state_dict(torch.load(WEIGHTS_PATH, map_location=device))
40
- model.to(device)
41
- model.eval()
42
- return model, tokenizer
43
-
44
-
45
- def get_model_tokenizer():
46
- global _model, _tokenizer
47
- if _model is None or _tokenizer is None:
48
- download_model_repo()
49
- _model, _tokenizer = load_model()
50
- return _model, _tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
  import os
4
+ import pickle
5
  import shutil
6
+ from pathlib import Path
7
+
 
8
  import torch
9
+ from huggingface_hub import snapshot_download
10
+
11
+ from config import Config
12
+
13
+ REPO_ID = Config.REPO_ID_LANG
14
+ MODEL_DIR = Path(Config.LANG_MODEL) if Config.LANG_MODEL else None
15
 
16
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
17
  _model, _tokenizer = None, None
18
 
19
 
20
  def warmup():
21
+ logging.info("Warming up model...")
22
+ if MODEL_DIR is None:
23
+ raise ValueError("LANG_MODEL is not configured")
24
+ if MODEL_DIR.exists() and MODEL_DIR.is_dir():
25
+ logging.info("Model already exists, skipping download.")
26
+ return
27
  download_model_repo()
 
 
28
 
29
 
30
  def download_model_repo():
31
+ if MODEL_DIR is None:
32
+ raise ValueError("LANG_MODEL is not configured")
33
+ if MODEL_DIR.exists() and MODEL_DIR.is_dir():
34
  logging.info("Model already exists, skipping download.")
35
  return
36
  snapshot_path = snapshot_download(repo_id=REPO_ID)
 
39
 
40
 
41
  def load_model():
42
+ if MODEL_DIR is None:
43
+ raise ValueError("LANG_MODEL is not configured")
44
+
45
+ with open(MODEL_DIR / "classifier.pkl", "rb") as f:
46
+ loaded_classifier = pickle.load(f)
47
+
48
+ with open(MODEL_DIR / "scaler.pkl", "rb") as f:
49
+ loaded_scaler = pickle.load(f)
50
+
51
+ with open(MODEL_DIR / "word_vectorizer.pkl", "rb") as f:
52
+ loaded_word_vectorizer = pickle.load(f)
53
+
54
+ with open(MODEL_DIR / "char_vectorizer.pkl", "rb") as f:
55
+ loaded_char_vectorizer = pickle.load(f)
56
+
57
+ with open(MODEL_DIR / "feature_names.json", "r") as f:
58
+ loaded_features = json.load(f)
59
+
60
+ with open(MODEL_DIR / "metadata.json", "r") as f:
61
+ loaded_metadata = json.load(f)
62
+ return (
63
+ loaded_classifier,
64
+ loaded_scaler,
65
+ loaded_word_vectorizer,
66
+ loaded_char_vectorizer,
67
+ loaded_features,
68
+ loaded_metadata,
69
+ )
features/text_classifier/routes.py CHANGED
@@ -37,9 +37,10 @@ async def analyze_sentences(request: Request, data: TextInput, token: str = Depe
37
  raise HTTPException(status_code=400, detail="Missing 'text' in request body")
38
  return await handle_sentence_level_analysis(data.text)
39
 
40
- @router.post("/analyse-sentance-file")
 
41
  @limiter.limit(ACCESS_RATE)
42
- async def analyze_sentance_file(request: Request, file: UploadFile = File(...), token: str = Depends(verify_token)):
43
  return await handle_file_sentence(file)
44
 
45
  @router.get("/health")
 
37
  raise HTTPException(status_code=400, detail="Missing 'text' in request body")
38
  return await handle_sentence_level_analysis(data.text)
39
 
40
+
41
+ @router.post("/analyse-sentence-file")
42
  @limiter.limit(ACCESS_RATE)
43
+ async def analyze_sentence_file(request: Request, file: UploadFile = File(...), token: str = Depends(verify_token)):
44
  return await handle_file_sentence(file)
45
 
46
  @router.get("/health")
requirements.txt CHANGED
@@ -19,6 +19,9 @@ pypdf
19
  frontend
20
  tools
21
  pandas
 
 
 
22
  requests
23
  beautifulsoup4
24
  langchain
 
19
  frontend
20
  tools
21
  pandas
22
+ numpy
23
+ scikit-learn
24
+ textstat
25
  requests
26
  beautifulsoup4
27
  langchain