Spaces:

Pujan-Dev
/

AI_API

Sleeping

App Files Files Community

Pujan-Dev committed on Apr 2

Commit

31fda96

1 Parent(s): 47c1352

update: updated the config and text_classifier

Browse files

Files changed (8) hide show

Procfile +0 -1
app.py +12 -11
config.py +12 -0
features/text_classifier/controller.py +81 -49
features/text_classifier/inferencer.py +261 -29
features/text_classifier/model_loader.py +48 -29
features/text_classifier/routes.py +3 -2
requirements.txt +3 -0

Procfile DELETED Viewed

	@@ -1 +0,0 @@
1	- web: uvicorn app:app --host 0.0.0.0 --port ${PORT:-8000}

app.py CHANGED Viewed

@@ -1,22 +1,23 @@
 from fastapi import FastAPI, Request
 from slowapi import Limiter, _rate_limit_exceeded_handler
-from fastapi.responses import FileResponse
-from slowapi.middleware import SlowAPIMiddleware
 from slowapi.errors import RateLimitExceeded
 from slowapi.util import get_remote_address
-from fastapi.responses import JSONResponse
-from features.text_classifier.routes import router as text_classifier_router
 from features.nepali_text_classifier.routes import (
     router as nepali_text_classifier_router,
 )
-from features.image_classifier.routes import router as image_classifier_router
-from features.image_edit_detector.routes import router as image_edit_detector_router
-from fastapi.staticfiles import StaticFiles
-from config import ACCESS_RATE
-import requests
 limiter = Limiter(key_func=get_remote_address, default_limits=[ACCESS_RATE])
 app = FastAPI()

+import warnings
+import requests
 from fastapi import FastAPI, Request
+from fastapi.responses import FileResponse, JSONResponse
+from fastapi.staticfiles import StaticFiles
 from slowapi import Limiter, _rate_limit_exceeded_handler
 from slowapi.errors import RateLimitExceeded
+from slowapi.middleware import SlowAPIMiddleware
 from slowapi.util import get_remote_address
+from config import ACCESS_RATE
+from features.image_classifier.routes import router as image_classifier_router
+from features.image_edit_detector.routes import router as image_edit_detector_router
 from features.nepali_text_classifier.routes import (
     router as nepali_text_classifier_router,
 )
+from features.text_classifier.routes import router as text_classifier_router
+warnings.filterwarnings("ignore")
 limiter = Limiter(key_func=get_remote_address, default_limits=[ACCESS_RATE])
 app = FastAPI()

config.py CHANGED Viewed

	@@ -1,2 +1,14 @@






1	ACCESS_RATE = "20/minute"
2

+import os
+import dotenv
+dotenv.load_dotenv()
 ACCESS_RATE = "20/minute"
class Config:
    """Environment-backed settings, resolved once when this class body runs.

    Each attribute holds the raw value of an environment variable, or
    ``None`` when that variable is not set.
    """

    # Raw value of the ``Nepali_model`` environment variable.
    Nepali_model_folder = os.environ.get("Nepali_model")
    # Raw value of the ``English_model`` environment variable.
    English_model_folder = os.environ.get("English_model")
    # NOTE(review): reads the same ``English_model`` variable as
    # ``English_model_folder`` -- confirm this is intended and not a
    # copy-paste slip.
    REPO_ID_LANG = os.environ.get("English_model")
    # Raw value of the ``LANG_MODEL`` environment variable.
    LANG_MODEL = os.environ.get("LANG_MODEL")

features/text_classifier/controller.py CHANGED Viewed

@@ -1,16 +1,34 @@
-import os
 import asyncio
 import logging
 from io import BytesIO
-from fastapi import HTTPException, UploadFile, status, Depends
-from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
-from .inferencer import classify_text
 from .preprocess import parse_docx, parse_pdf, parse_txt
-import spacy
 security = HTTPBearer()
-nlp = spacy.load("en_core_web_sm")
 # Verify Bearer token from Authorization header
 async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
@@ -18,32 +36,42 @@ async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(secur
     expected_token = os.getenv("MY_SECRET_TOKEN")
     if token != expected_token:
         raise HTTPException(
-            status_code=status.HTTP_403_FORBIDDEN,
-            detail="Invalid or expired token"
         )
     return token
 # Classify plain text input
 async def handle_text_analysis(text: str):
     text = text.strip()
     if not text or len(text.split()) < 10:
-        raise HTTPException(status_code=400, detail="Text must contain at least 10 words")
-    if len(text) > 10000:
-        raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
     label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, text)
     return {
         "result": label,
         "perplexity": round(perplexity, 2),
-        "ai_likelihood": ai_likelihood
     }
 # Extract text from uploaded files (.docx, .pdf, .txt)
 async def extract_file_contents(file: UploadFile) -> str:
     content = await file.read()
     file_stream = BytesIO(content)
-    if file.content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
         return parse_docx(file_stream)
     elif file.content_type == "application/pdf":
         return parse_pdf(file_stream)
@@ -52,79 +80,83 @@ async def extract_file_contents(file: UploadFile) -> str:
     else:
         raise HTTPException(
             status_code=415,
-            detail="Invalid file type. Only .docx, .pdf and .txt are allowed."
         )
 # Classify text from uploaded file
 async def handle_file_upload(file: UploadFile):
     try:
         file_contents = await extract_file_contents(file)
-        if len(file_contents) > 10000:
-            return {"status_code": 413, "detail": "Text must be less than 10,000 characters"}
         cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
         if not cleaned_text:
-            raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")
         # print(f"Cleaned text: '{cleaned_text}'")  # Debugging statement
-        label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, cleaned_text)
         return {
             "content": file_contents,
             "result": label,
             "perplexity": round(perplexity, 2),
-            "ai_likelihood": ai_likelihood
         }
     except Exception as e:
         logging.error(f"Error processing file: {e}")
         raise HTTPException(status_code=500, detail="Error processing the file")
 async def handle_sentence_level_analysis(text: str):
     text = text.strip()
-    if not text.endswith("."):
-        text += "."
-    if len(text) > 10000:
-        raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
-    doc = nlp(text)
-    sentences = [sent.text.strip() for sent in doc.sents]
-    results = []
-    for sentence in sentences:
-        if not sentence:
-            continue
-        label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, sentence)
-        results.append({
-            "sentence": sentence,
-            "label": label,
-            "perplexity": round(perplexity, 2),
-            "ai_likelihood": ai_likelihood
-        })
-    return {"analysis": results}
 # Analyze each sentence from uploaded file
 async def handle_file_sentence(file: UploadFile):
     try:
         file_contents = await extract_file_contents(file)
-        if len(file_contents) > 10000:
             # raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
-            return {"status_code": 413, "detail": "Text must be less than 10,000 characters"}
         cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
         if not cleaned_text:
-            raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")
         result = await handle_sentence_level_analysis(cleaned_text)
-        return {
-            "content": file_contents,
-            **result
-        }
     except Exception as e:
         logging.error(f"Error processing file: {e}")
         raise HTTPException(status_code=500, detail="Error processing the file")
 def classify(text: str):
     return classify_text(text)

 import asyncio
 import logging
+import os
 from io import BytesIO
+from fastapi import Depends, HTTPException, UploadFile, status
+from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
+from .inferencer import analyze_text_with_sentences, classify_text
 from .preprocess import parse_docx, parse_pdf, parse_txt
 security = HTTPBearer()
def build_bias_summary(ai_likelihood: float) -> dict[str, object]:
    """Convert an AI likelihood score into a human-readable bias summary.

    Args:
        ai_likelihood: AI likelihood expressed as a percentage (0-100).

    Returns:
        A dict with ``overall_bias`` ("AI", "Human" or "Balanced") and a
        ``bias_statement`` sentence describing that verdict.
    """
    if ai_likelihood > 50:
        overall_bias = "AI"
        bias_statement = f"The text is biased toward AI-generated writing ({ai_likelihood}% AI likelihood)."
    elif ai_likelihood < 50:
        overall_bias = "Human"
        # Round the complement so binary floating-point noise from the
        # subtraction never leaks into the user-facing sentence.
        human_likelihood = round(100 - ai_likelihood, 2)
        bias_statement = f"The text is biased toward human writing ({human_likelihood}% human likelihood)."
    else:
        overall_bias = "Balanced"
        bias_statement = "The text is balanced between AI and human writing."
    return {
        "overall_bias": overall_bias,
        "bias_statement": bias_statement,
    }
 # Verify Bearer token from Authorization header
 async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
     expected_token = os.getenv("MY_SECRET_TOKEN")
     if token != expected_token:
         raise HTTPException(
+            status_code=status.HTTP_403_FORBIDDEN, detail="Invalid or expired token"
         )
     return token
 # Classify plain text input
 async def handle_text_analysis(text: str):
     """Validate plain text, classify it, and attach a bias summary.

     Raises:
         HTTPException: 400 when the trimmed text has fewer than 10 words,
             413 when it is longer than 50,000 characters.
     """
     stripped = text.strip()
     # An empty string splits into zero words, so this also rejects "".
     if len(stripped.split()) < 10:
         raise HTTPException(
             status_code=400, detail="Text must contain at least 10 words"
         )
     if len(stripped) > 50000:
         raise HTTPException(
             status_code=413, detail="Text must be less than 50,000 characters"
         )
     # The classifier is synchronous; run it off the event loop.
     label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, stripped)
     response = {
         "result": label,
         "perplexity": round(perplexity, 2),
         "ai_likelihood": ai_likelihood,
     }
     response.update(build_bias_summary(ai_likelihood))
     return response
 # Extract text from uploaded files (.docx, .pdf, .txt)
 async def extract_file_contents(file: UploadFile) -> str:
     content = await file.read()
     file_stream = BytesIO(content)
+    if (
+        file.content_type
+        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    ):
         return parse_docx(file_stream)
     elif file.content_type == "application/pdf":
         return parse_pdf(file_stream)
     else:
         raise HTTPException(
             status_code=415,
+            detail="Invalid file type. Only .docx, .pdf and .txt are allowed.",
         )
 # Classify text from uploaded file
 async def handle_file_upload(file: UploadFile):
     """Extract text from an uploaded file and classify it.

     Returns:
         A dict with the extracted ``content``, the ``result`` label, the
         rounded ``perplexity`` and the ``ai_likelihood``. When the extracted
         text exceeds 50,000 characters, a ``{"status_code": 413, ...}``
         payload is returned instead of raising.

     Raises:
         HTTPException: 400 when the file holds no text, whatever
             ``extract_file_contents`` raises (e.g. 415), or 500 for any
             unexpected failure.
     """
     try:
         file_contents = await extract_file_contents(file)
         logging.info("Extracted text length: %d characters", len(file_contents))
         if len(file_contents) > 50000:
             # Kept as a returned payload (not a raised error) so existing
             # clients see the same response shape as before.
             return {
                 "status_code": 413,
                 "detail": "Text must be less than 50,000 characters",
             }
         cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
         if not cleaned_text:
             raise HTTPException(
                 status_code=400,
                 detail="The uploaded file is empty or only contains whitespace.",
             )
         # The classifier is synchronous; run it off the event loop.
         label, perplexity, ai_likelihood = await asyncio.to_thread(
             classify_text, cleaned_text
         )
         return {
             "content": file_contents,
             "result": label,
             "perplexity": round(perplexity, 2),
             "ai_likelihood": ai_likelihood,
         }
     except HTTPException:
         # Deliberate client errors (the 400 above, 415 from extraction)
         # must reach the caller unchanged instead of being masked as 500.
         raise
     except Exception as e:
         logging.error(f"Error processing file: {e}")
         raise HTTPException(status_code=500, detail="Error processing the file") from e
 async def handle_sentence_level_analysis(text: str):
     """Validate plain text and return its sentence-by-sentence analysis.

     Returns:
         Whatever ``analyze_text_with_sentences`` produces for the trimmed text.

     Raises:
         HTTPException: 400 when the trimmed text has fewer than 10 words,
             413 when it is longer than 50,000 characters.
     """
     stripped = text.strip()
     # An empty string splits into zero words, so this also rejects "".
     if len(stripped.split()) < 10:
         raise HTTPException(
             status_code=400, detail="Text must contain at least 10 words"
         )
     if len(stripped) > 50000:
         raise HTTPException(
             status_code=413, detail="Text must be less than 50,000 characters"
         )
     # The analysis is synchronous; run it off the event loop.
     return await asyncio.to_thread(analyze_text_with_sentences, stripped)
 # Analyze each sentence from uploaded file
 async def handle_file_sentence(file: UploadFile):
     """Extract text from an uploaded file and analyse it sentence by sentence.

     Returns:
         The sentence-level analysis with the extracted ``content`` added, or
         a ``{"status_code": 413, ...}`` payload when the extracted text is
         longer than 50,000 characters.

     Raises:
         HTTPException: passed through unchanged when raised deliberately;
             500 for any other failure.
     """
     try:
         file_contents = await extract_file_contents(file)
         if len(file_contents) > 50000:
             # Oversized input is reported in the response body, not raised.
             return {
                 "status_code": 413,
                 "detail": "Text must be less than 50,000 characters",
             }
         # Newlines and tabs each become a single space before trimming.
         flattened = file_contents.translate(str.maketrans("\n\t", "  "))
         cleaned_text = flattened.strip()
         if not cleaned_text:
             raise HTTPException(
                 status_code=400,
                 detail="The uploaded file is empty or only contains whitespace.",
             )
         analysis = await handle_sentence_level_analysis(cleaned_text)
         payload = {"content": file_contents}
         payload.update(analysis)
         return payload
     except HTTPException:
         # Validation errors keep their own status code.
         raise
     except Exception as e:
         logging.error(f"Error processing file: {e}")
         raise HTTPException(status_code=500, detail="Error processing the file")
 def classify(text: str):
     """Synchronous pass-through to ``classify_text``."""
     outcome = classify_text(text)
     return outcome

features/text_classifier/inferencer.py CHANGED Viewed

@@ -1,40 +1,272 @@
 import torch
-from .model_loader import get_model_tokenizer
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-def perplexity_to_ai_likelihood(ppl: float) -> float:
-    # You can tune these parameters
-    min_ppl = 10     # very confident it's AI
-    max_ppl = 100    # very confident it's human
-    # Clamp to bounds
-    ppl = max(min_ppl, min(ppl, max_ppl))
-    # Invert and scale: lower perplexity -> higher AI-likelihood
-    likelihood = 1 - ((ppl - min_ppl) / (max_ppl - min_ppl))
-    return round(likelihood * 100, 2)
-def classify_text(text: str):
-    model, tokenizer = get_model_tokenizer()
-    inputs = tokenizer(text, return_tensors="pt",
-                       truncation=True, padding=True)
-    input_ids = inputs["input_ids"].to(device)
-    attention_mask = inputs["attention_mask"].to(device)
-    with torch.no_grad():
-        outputs = model(
-            input_ids, attention_mask=attention_mask, labels=input_ids)
-        loss = outputs.loss
-        perplexity = torch.exp(loss).item()
-    if perplexity < 55:
-        result = "AI-generated"
-    elif perplexity < 80:
-        result = "Probably AI-generated"
     else:
-        result = "Human-written"
-    likelihood_result=perplexity_to_ai_likelihood(perplexity)
-    return result, perplexity,likelihood_result

+from __future__ import annotations
+from dataclasses import dataclass
+from functools import lru_cache
+import logging
+import random
+from typing import Any
+import nltk
+import numpy as np
+from scipy.sparse import csr_matrix, hstack
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from features.text_classifier.model_loader import load_model
+logger = logging.getLogger(__name__)
+for resource in ("tokenizers/punkt", "tokenizers/punkt_tab"):
+    try:
+        nltk.data.find(resource)
+    except LookupError:
+        nltk.download(resource.split("/")[-1], quiet=True)
+try:
+    import textstat
+except ImportError:
+    textstat = None
@dataclass
class SentenceBlendConfig:
    """Tunable weights for blending sentence-level and document-level scores.

    NOTE(review): no function visible in this module reads these fields;
    their exact meaning should be confirmed against wherever they are used.
    """

    # Blend weight and its upper bound.
    sentence_blend_weight: float = 0.7
    # Sentence-to-document bias and its upper bound.
    sentence_to_doc_bias: float = 0.35
    max_sentence_blend_weight: float = 0.9
    max_sentence_to_doc_bias: float = 0.8
    # Random deviation, expressed as a percentage.
    random_deviation_pct: float = 2.0
class PerplexityCalculator:
    """Lazy-loaded perplexity calculator for distilgpt2.

    The tokenizer and model are only loaded on the first ``calculate`` call,
    so constructing an instance is cheap.
    """

    def __init__(self, model_name: str = "distilgpt2"):
        self.model_name = model_name
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self._tokenizer = None
        self._model = None

    def _load(self) -> None:
        """Load tokenizer and model once; later calls return immediately."""
        if self._model is not None and self._tokenizer is not None:
            return
        logger.info("Loading perplexity model: %s", self.model_name)
        self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self._model = AutoModelForCausalLM.from_pretrained(self.model_name).to(self.device)
        self._model.eval()
        logger.info("Perplexity model loaded on %s", self.device)

    def calculate(self, text: str, max_length: int = 512) -> float:
        """Return the perplexity of ``text``, capped at 10000.0.

        Args:
            text: Text to score.
            max_length: Token limit passed to the tokenizer; longer input is
                truncated.

        Returns:
            ``exp(loss)`` clamped to at most 10000.0, or the fallback value
            100.0 when the loss is NaN or any step raises.
        """
        try:
            self._load()
            encodings = self._tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=max_length,
            )
            input_ids = encodings.input_ids.to(self.device)
            with torch.no_grad():
                outputs = self._model(input_ids, labels=input_ids)
                loss = outputs.loss
            # A NaN loss (presumably possible when the input is too short to
            # form a next-token target) would survive ``min`` below and leak
            # NaN into downstream features, so use the fallback instead.
            if torch.isnan(loss).item():
                return 100.0
            perplexity = torch.exp(loss).item()
            return min(float(perplexity), 10000.0)
        except Exception as exc:
            # Best-effort by design: scoring must not fail the whole request.
            logger.warning("Perplexity fallback used due to error: %s", exc)
            return 100.0
# Single shared calculator; its model is loaded on first use.
_perplexity_calc = PerplexityCalculator()


@lru_cache(maxsize=20000)
def _cached_perplexity(cleaned_text: str) -> float:
    """Memoised perplexity lookup keyed on the exact input string.

    NOTE(review): whatever ``calculate`` returns is cached, including its
    error fallback value.
    """
    perplexity = _perplexity_calc.calculate(cleaned_text)
    return perplexity
@lru_cache(maxsize=1)
def _get_model_artifacts() -> tuple[Any, Any, Any, Any, list[str], dict[str, Any]]:
    """Load the model artifacts via ``load_model`` once and reuse the result."""
    artifacts = load_model()
    return artifacts
def normalize_text(text: str) -> str:
    """Collapse every run of whitespace to one space and trim both ends."""
    # Splitting on whitespace already discards leading/trailing blanks.
    tokens = str(text).split()
    return " ".join(tokens)
def split_into_sentences(text: str) -> list[str]:
    """Split normalised text into non-empty, trimmed sentences.

    Returns an empty list for blank input. If the tokenizer yields nothing
    usable, the whole normalised text is returned as a single sentence.
    """
    normalized = normalize_text(text)
    if not normalized:
        return []
    pieces = []
    for raw in nltk.sent_tokenize(normalized):
        trimmed = raw.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces or [normalized]
def extract_burstiness_features(text: str) -> dict[str, float]:
    """Summarise how sentence lengths (in words) vary across ``text``.

    Returns mean, standard deviation, max, min and range of the per-sentence
    word counts; every value is 0.0 when no sentence is found.
    """
    sentences = split_into_sentences(text)
    if not sentences:
        keys = ("burst_mean", "burst_std", "burst_max", "burst_min", "burst_range")
        return dict.fromkeys(keys, 0.0)

    word_counts = np.array([len(sentence.split()) for sentence in sentences], dtype=float)
    longest = np.max(word_counts)
    shortest = np.min(word_counts)
    return {
        "burst_mean": float(np.mean(word_counts)),
        "burst_std": float(np.std(word_counts)),
        "burst_max": float(longest),
        "burst_min": float(shortest),
        "burst_range": float(longest - shortest),
    }
def extract_stylometry_features(text: str) -> dict[str, float]:
    """Compute surface-level style statistics for ``text``.

    Covers word/character/sentence counts, average word and sentence length,
    lexical diversity, punctuation and capital-letter ratios, and two
    readability scores (neutral defaults when ``textstat`` is unavailable or
    fails).
    """
    tokens = text.split()
    word_total = len(tokens)
    char_total = len(text)
    # Never below 1 so the sentence-length division is always defined.
    sentence_total = max(len(split_into_sentences(text)), 1)

    if tokens:
        avg_word_len = float(np.mean([len(token) for token in tokens]))
        lexical_diversity = float(len(set(tokens)) / word_total)
    else:
        avg_word_len = 0.0
        lexical_diversity = 0.0

    if char_total > 0:
        punct_total = sum(text.count(mark) for mark in ".,!?;:")
        caps_total = sum(map(str.isupper, text))
        punct_ratio = float(punct_total / char_total)
        caps_ratio = float(caps_total / char_total)
    else:
        punct_ratio = 0.0
        caps_ratio = 0.0

    # Start from the neutral defaults and overwrite only on full success.
    flesch_reading, flesch_grade = 50.0, 8.0
    if textstat is not None:
        try:
            flesch_reading = float(textstat.flesch_reading_ease(text))
            flesch_grade = float(textstat.flesch_kincaid_grade(text))
        except Exception:
            flesch_reading, flesch_grade = 50.0, 8.0

    return {
        "num_words": float(word_total),
        "num_chars": float(char_total),
        "num_sentences": float(sentence_total),
        "avg_word_len": avg_word_len,
        "avg_sent_len": float(word_total / sentence_total),
        "lexical_diversity": lexical_diversity,
        "punct_ratio": punct_ratio,
        "caps_ratio": caps_ratio,
        "flesch_reading": flesch_reading,
        "flesch_grade": flesch_grade,
    }
def extract_all_features(text: str, calc_perplexity: bool = True) -> dict[str, float]:
    """Build the full engineered-feature dict for ``text``.

    Combines perplexity (or the placeholder 100.0 when ``calc_perplexity`` is
    false) with the burstiness and stylometry features of the normalised text.
    """
    cleaned = normalize_text(text)
    perplexity = _cached_perplexity(cleaned) if calc_perplexity else 100.0
    return {
        "perplexity": perplexity,
        **extract_burstiness_features(cleaned),
        **extract_stylometry_features(cleaned),
    }
def _predict_ai_probability(text: str) -> tuple[float, float]:
    """Score ``text`` with the loaded classifier.

    Returns:
        ``(ai_probability, perplexity)``. The probability is column 1 of
        ``predict_proba`` when available, otherwise a logistic transform of
        ``decision_function``.
    """
    classifier, scaler, word_vectorizer, char_vectorizer, feature_names, metadata = (
        _get_model_artifacts()
    )

    # Perplexity is only computed when the metadata reports engineered features.
    wants_perplexity = bool(metadata.get("num_engineered_features", 0) > 0)
    features = extract_all_features(text, calc_perplexity=wants_perplexity)

    # Numeric features, ordered the way the scaler was fitted.
    numeric_row = np.array([features[name] for name in feature_names], dtype=float).reshape(1, -1)
    scaled_row = scaler.transform(numeric_row)

    word_block = word_vectorizer.transform([text])
    char_block = char_vectorizer.transform([text])
    hybrid = hstack([word_block, char_block, csr_matrix(scaled_row)], format="csr")

    if hasattr(classifier, "predict_proba"):
        # NOTE(review): assumes column 1 is the AI class -- confirm against training.
        ai_prob = float(classifier.predict_proba(hybrid)[0][1])
    else:
        margin = float(classifier.decision_function(hybrid)[0])
        ai_prob = float(1.0 / (1.0 + np.exp(-margin)))

    return ai_prob, float(features.get("perplexity", 100.0))
def classify_text(text: str) -> tuple[str, float, float]:
    """Return (label, perplexity, ai_likelihood_percent).

    The label is "AI" when the rounded likelihood is at least 50.0 and
    "Human" otherwise.

    Raises:
        ValueError: if ``text`` is empty after normalisation.
    """
    cleaned = normalize_text(text)
    if not cleaned:
        raise ValueError("Input text is empty")

    probability, perplexity = _predict_ai_probability(cleaned)
    ai_likelihood = round(probability * 100.0, 2)
    if ai_likelihood >= 50.0:
        label = "AI"
    else:
        label = "Human"
    return label, perplexity, ai_likelihood
def analyze_text_with_sentences(
    text: str,
) -> dict[str, Any]:
    """Classify the whole text and each of its sentences.

    Returns:
        ``{"sentences": [...], "summary": {"overall": {...}}}``. A sentence
        that fails to classify is kept with label "Error" and ``None`` scores.

    Raises:
        ValueError: if the text is empty or yields no sentences.
    """
    normalized = normalize_text(text)
    doc_label, doc_perplexity, doc_ai_likelihood = classify_text(normalized)

    sentences = split_into_sentences(normalized)
    if not sentences:
        raise ValueError("Input text contains no valid sentences")

    per_sentence = []
    for sentence in sentences:
        # Start from the failure shape; fill in real scores on success.
        entry = {
            "sentence": sentence,
            "label": "Error",
            "perplexity": None,
            "ai_likelihood": None,
        }
        try:
            label, perplexity, ai_likelihood = classify_text(sentence)
        except Exception as exc:
            # One bad sentence must not abort the whole analysis.
            logger.warning("Error analyzing sentence: %s", exc)
        else:
            entry.update(label=label, perplexity=perplexity, ai_likelihood=ai_likelihood)
        per_sentence.append(entry)

    overall = {
        "label": doc_label,
        "perplexity": doc_perplexity,
        "ai_likelihood": doc_ai_likelihood,
    }
    return {"sentences": per_sentence, "summary": {"overall": overall}}

features/text_classifier/model_loader.py CHANGED Viewed

@@ -1,30 +1,36 @@
 import os
 import shutil
-import logging
-from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
-from huggingface_hub import snapshot_download
 import torch
-from dotenv import load_dotenv
-load_dotenv()
-REPO_ID = "can-org/AI-Content-Checker"
-MODEL_DIR = "./models"
-TOKENIZER_DIR = os.path.join(MODEL_DIR, "model")
-WEIGHTS_PATH = os.path.join(MODEL_DIR, "model_weights.pth")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 _model, _tokenizer = None, None
 def warmup():
-    global _model, _tokenizer
-    # Ensure punkt is available
     download_model_repo()
-    _model, _tokenizer = load_model()
-    logging.info("Its ready")
 def download_model_repo():
-    if os.path.exists(MODEL_DIR) and os.path.isdir(MODEL_DIR):
         logging.info("Model already exists, skipping download.")
         return
     snapshot_path = snapshot_download(repo_id=REPO_ID)
@@ -33,18 +39,31 @@ def download_model_repo():
 def load_model():
-    tokenizer = GPT2TokenizerFast.from_pretrained(TOKENIZER_DIR)
-    config = GPT2Config.from_pretrained(TOKENIZER_DIR)
-    model = GPT2LMHeadModel(config)
-    model.load_state_dict(torch.load(WEIGHTS_PATH, map_location=device))
-    model.to(device)
-    model.eval()
-    return model, tokenizer
-def get_model_tokenizer():
-    global _model, _tokenizer
-    if _model is None or _tokenizer is None:
-        download_model_repo()
-        _model, _tokenizer = load_model()
-    return _model, _tokenizer

+import json
+import logging
 import os
+import pickle
 import shutil
+from pathlib import Path
 import torch
+from huggingface_hub import snapshot_download
+from config import Config
+REPO_ID = Config.REPO_ID_LANG
+MODEL_DIR = Path(Config.LANG_MODEL) if Config.LANG_MODEL else None
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 _model, _tokenizer = None, None
 def warmup():
     """Ensure the model directory exists, downloading it when missing.

     Raises:
         ValueError: if ``MODEL_DIR`` is not configured.
     """
     logging.info("Warming up model...")
     if MODEL_DIR is None:
         raise ValueError("LANG_MODEL is not configured")
     already_present = MODEL_DIR.exists() and MODEL_DIR.is_dir()
     if not already_present:
         download_model_repo()
         return
     logging.info("Model already exists, skipping download.")
 def download_model_repo():
+    if MODEL_DIR is None:
+        raise ValueError("LANG_MODEL is not configured")
+    if MODEL_DIR.exists() and MODEL_DIR.is_dir():
         logging.info("Model already exists, skipping download.")
         return
     snapshot_path = snapshot_download(repo_id=REPO_ID)
 def load_model():
     """Load the classifier artifacts stored under ``MODEL_DIR``.

     Returns:
         A 6-tuple ``(classifier, scaler, word_vectorizer, char_vectorizer,
         feature_names, metadata)``; the first four come from pickle files,
         the last two from JSON files.

     Raises:
         ValueError: if ``MODEL_DIR`` is not configured.
         OSError: if any artifact file cannot be opened.
     """
     if MODEL_DIR is None:
         raise ValueError("LANG_MODEL is not configured")

     def _read_pickle(filename):
         # SECURITY: unpickling can execute arbitrary code. These files must
         # only ever come from a trusted, access-controlled model source.
         with open(MODEL_DIR / filename, "rb") as f:
             return pickle.load(f)

     def _read_json(filename):
         # Explicit encoding so parsing does not depend on the host locale.
         with open(MODEL_DIR / filename, "r", encoding="utf-8") as f:
             return json.load(f)

     loaded_classifier = _read_pickle("classifier.pkl")
     loaded_scaler = _read_pickle("scaler.pkl")
     loaded_word_vectorizer = _read_pickle("word_vectorizer.pkl")
     loaded_char_vectorizer = _read_pickle("char_vectorizer.pkl")
     loaded_features = _read_json("feature_names.json")
     loaded_metadata = _read_json("metadata.json")
     return (
         loaded_classifier,
         loaded_scaler,
         loaded_word_vectorizer,
         loaded_char_vectorizer,
         loaded_features,
         loaded_metadata,
     )

features/text_classifier/routes.py CHANGED Viewed

@@ -37,9 +37,10 @@ async def analyze_sentences(request: Request, data: TextInput, token: str = Depe
         raise HTTPException(status_code=400, detail="Missing 'text' in request body")
     return await handle_sentence_level_analysis(data.text)
-@router.post("/analyse-sentance-file")
 @limiter.limit(ACCESS_RATE)
-async def analyze_sentance_file(request: Request, file: UploadFile = File(...), token: str = Depends(verify_token)):
     return await handle_file_sentence(file)
 @router.get("/health")

         raise HTTPException(status_code=400, detail="Missing 'text' in request body")
     return await handle_sentence_level_analysis(data.text)
+@router.post("/analyse-sentence-file")
 @limiter.limit(ACCESS_RATE)
+async def analyze_sentence_file(request: Request, file: UploadFile = File(...), token: str = Depends(verify_token)):
     return await handle_file_sentence(file)
 @router.get("/health")

requirements.txt CHANGED Viewed

@@ -19,6 +19,9 @@ pypdf
 frontend
 tools
 pandas
 requests
 beautifulsoup4
 langchain

 frontend
 tools
 pandas
+numpy
+scikit-learn
+textstat
 requests
 beautifulsoup4
 langchain