Changed: Nepali text classifier with new models, multi-model support, and improved endpoints
app.py
CHANGED

@@ -20,7 +20,15 @@ from features.text_classifier.routes import router as text_classifier_router
 warnings.filterwarnings("ignore")
 limiter = Limiter(key_func=get_remote_address, default_limits=[ACCESS_RATE])
 
+openapi_tags = [
+    {"name": "English Text Classifier", "description": "Endpoints for English AI-vs-human text analysis."},
+    {"name": "Nepali Text Classifier", "description": "Endpoints for Nepali AI-vs-human text analysis."},
+    {"name": "AI Image Classifier", "description": "Endpoints for AI-vs-human image classification."},
+    {"name": "Image Edit Detection", "description": "Endpoints for edited/forged image detection."},
+    {"name": "System", "description": "Health and root endpoints."},
+]
+
+app = FastAPI(openapi_tags=openapi_tags)
 # added the robots.txt
 # Set up SlowAPI
 app.state.limiter = limiter

@@ -38,13 +46,13 @@ app.add_exception_handler(
 app.add_middleware(SlowAPIMiddleware)
 
 # Include your routes
-app.include_router(text_classifier_router, prefix="/text")
-app.include_router(nepali_text_classifier_router, prefix="/NP")
-app.include_router(image_classifier_router, prefix="/AI-image")
-app.include_router(image_edit_detector_router, prefix="/detect")
+app.include_router(text_classifier_router, prefix="/text", tags=["English Text Classifier"])
+app.include_router(nepali_text_classifier_router, prefix="/NP", tags=["Nepali Text Classifier"])
+app.include_router(image_classifier_router, prefix="/AI-image", tags=["AI Image Classifier"])
+app.include_router(image_edit_detector_router, prefix="/detect", tags=["Image Edit Detection"])
 
 
-@app.get("/")
+@app.get("/", tags=["System"])
 @limiter.limit(ACCESS_RATE)
 async def root(request: Request):
     return {
features/nepali_text_classifier/controller.py
CHANGED

@@ -1,4 +1,5 @@
 import asyncio
+import logging
 from io import BytesIO
 from fastapi import HTTPException, UploadFile, status, Depends
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials

@@ -9,6 +10,13 @@ import re
 
 security = HTTPBearer()
 
+
+def parse_selected_models(models: str | None) -> list[str] | None:
+    if not models:
+        return None
+    parsed = [m.strip() for m in models.split(",") if m.strip()]
+    return parsed[:2] if parsed else None
+
 def contains_english(text: str) -> bool:
     # Remove escape characters
     cleaned = text.replace("\n", "").replace("\t", "")

@@ -25,7 +33,7 @@ async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(secur
     )
     return token
 
-async def nepali_text_analysis(text: str):
+async def nepali_text_analysis(text: str, models: str | None = None):
     end_symbol_for_NP_text(text)
     words = text.split()
     if len(words) < 10:

@@ -33,7 +41,8 @@ async def nepali_text_analysis(text: str):
     if len(text) > 10000:
         raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
 
+    selected_models = parse_selected_models(models)
+    result = await asyncio.to_thread(classify_text, text, selected_models, 2)
 
     return result
 

@@ -51,7 +60,7 @@ async def extract_file_contents(file:UploadFile)-> str:
     else:
         raise HTTPException(status_code=415,detail="Invalid file type. Only .docx,.pdf and .txt are allowed")
 
-async def handle_file_upload(file: UploadFile):
+async def handle_file_upload(file: UploadFile, models: str | None = None):
     try:
         file_contents = await extract_file_contents(file)
         end_symbol_for_NP_text(file_contents)

@@ -62,7 +71,8 @@ async def handle_file_upload(file: UploadFile):
         if not cleaned_text:
             raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")
 
+        selected_models = parse_selected_models(models)
+        result = await asyncio.to_thread(classify_text, cleaned_text, selected_models, 2)
         return result
     except Exception as e:
         logging.error(f"Error processing file: {e}")

@@ -70,7 +80,7 @@ async def handle_file_upload(file: UploadFile):
 
 
 
-async def handle_sentence_level_analysis(text: str):
+async def handle_sentence_level_analysis(text: str, models: str | None = None):
     text = text.strip()
     if len(text) > 10000:
         raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")

@@ -79,11 +89,12 @@ async def handle_sentence_level_analysis(text: str):
 
     # Split text into sentences
     sentences = [s.strip() + "।" for s in text.split("।") if s.strip()]
+    selected_models = parse_selected_models(models)
 
     results = []
     for sentence in sentences:
         end_symbol_for_NP_text(sentence)
-        result = await asyncio.to_thread(classify_text, sentence)
+        result = await asyncio.to_thread(classify_text, sentence, selected_models, 2)
         results.append({
             "text": sentence,
             "result": result["label"],

@@ -93,7 +104,7 @@ async def handle_sentence_level_analysis(text: str):
     return {"analysis": results}
 
 
-async def handle_file_sentence(file:UploadFile):
+async def handle_file_sentence(file:UploadFile, models: str | None = None):
     try:
         file_contents = await extract_file_contents(file)
         if len(file_contents) > 10000:

@@ -106,12 +117,13 @@ async def handle_file_sentence(file:UploadFile):
 
         # Split text into sentences
         sentences = [s.strip() + "।" for s in cleaned_text.split("।") if s.strip()]
+        selected_models = parse_selected_models(models)
 
         results = []
         for sentence in sentences:
             end_symbol_for_NP_text(sentence)
 
-            result = await asyncio.to_thread(classify_text, sentence)
+            result = await asyncio.to_thread(classify_text, sentence, selected_models, 2)
             results.append({
                 "text": sentence,
                 "result": result["label"],

@@ -125,6 +137,7 @@ async def handle_file_sentence(file:UploadFile):
         raise HTTPException(status_code=500, detail="Error processing the file")
 
 
-def classify(text: str):
+def classify(text: str, models: str | None = None):
+    selected_models = parse_selected_models(models)
+    return classify_text(text, selected_models, 2)
 
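
Note: `parse_selected_models` is the single point where the optional `models` request value is normalised. It accepts a comma-separated string and always caps the selection at two names; a quick sketch of the expected behaviour (the model names below are only examples and must match keys exposed by the `/NP/models` endpoint):

# Hypothetical inputs for illustration:
parse_selected_models("Gradient Boosting, Logistic Regression, Linear SVC")
# -> ["Gradient Boosting", "Logistic Regression"]   (extra names are dropped)
parse_selected_models("")     # -> None, so classify_text falls back to the default top-2 ranking
parse_selected_models(None)   # -> None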
features/nepali_text_classifier/inferencer.py
CHANGED

@@ -1,23 +1,89 @@
-import torch
-from .model_loader import get_model_tokenizer
-import torch.nn.functional as F
-
-
-def classify_text(text: str):
-    model, tokenizer = get_model_tokenizer()
-    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-
-    outputs = model(**inputs)
-    logits = outputs if isinstance(outputs, torch.Tensor) else outputs.logits
-    probs = F.softmax(logits, dim=1)
-    pred = torch.argmax(probs, dim=1).item()
-    prob_percent = probs[0][pred].item() * 100
-
-    return {"label": "Human" if pred == 0 else "AI", "confidence": round(prob_percent, 2)}
+import re
+
+from scipy.sparse import csr_matrix, hstack
+
+from .model_loader import get_default_top_models, load_artifacts
+
+TOP_K_MODELS = 2
+
+
+def normalize_nepali_text(text: str) -> str:
+    text = str(text)
+    text = re.sub(r"https?://\S+|www\.\S+", " ", text)
+    text = re.sub(r"[^\u0900-\u097F\s।!?,]", " ", text)
+    return re.sub(r"\s+", " ", text).strip()
+
+
+def _select_models(models, model_names=None, top_k=2):
+    _ = model_names
+    ranked = [name for name in get_default_top_models(top_k=top_k) if name in models]
+    if ranked:
+        return ranked[:top_k]
+    return list(models.keys())[:top_k]
+
+
+def classify_text(text: str, model_names=None, top_k: int = 2):
+    artifacts = load_artifacts()
+    models = artifacts["models"]
+    if not models:
+        return {"error": "No models available for inference"}
+
+    cleaned_text = normalize_nepali_text(text)
+    word_features = artifacts["word_vectorizer"].transform([cleaned_text])
+    char_features = artifacts["char_vectorizer"].transform([cleaned_text])
+    rich_features = artifacts["rich_transformer"].transform([cleaned_text])
+    features = hstack([word_features, char_features, csr_matrix(rich_features)])
+
+    selected_names = _select_models(models, model_names=model_names, top_k=TOP_K_MODELS)
+    dense_models = {"Linear SVC"}
+
+    per_model = []
+    ai_votes = 0
+    human_votes = 0
+    confidence_sum = 0.0
+
+    for name in selected_names:
+        model = models[name]
+        model_input = features.toarray() if name in dense_models else features
+        pred = int(model.predict(model_input)[0])
+        confidence = None
+        if hasattr(model, "predict_proba"):
+            probs = model.predict_proba(model_input)
+            confidence = float(probs[0][pred])
+        elif hasattr(model, "decision_function"):
+            score = float(model.decision_function(model_input)[0])
+            confidence = abs(score) / (1.0 + abs(score))
+        else:
+            confidence = 0.5
+
+        if pred == 1:
+            ai_votes += 1
+            label = "AI"
+        else:
+            human_votes += 1
+            label = "Human"
+
+        confidence_sum += confidence
+        per_model.append(
+            {
+                "model": name,
+                "label": label,
+                "confidence": round(confidence * 100, 2),
+            }
+        )
+
+    final_label = "AI" if ai_votes > human_votes else "Human"
+    if ai_votes == human_votes:
+        final_label = per_model[0]["label"]
+
+    avg_conf = confidence_sum / max(len(per_model), 1)
+    return {
+        "label": final_label,
+        "confidence": round(avg_conf * 100, 2),
+        "selected_models": selected_names,
+        "model_predictions": per_model,
+        "votes": {"AI": ai_votes, "Human": human_votes},
+        "available_models": list(models.keys()),
+        "unavailable_models": artifacts["unavailable_models"],
+    }
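
Note: the new `classify_text` replaces the single transformer forward pass with a small ensemble over classical models. Each selected model votes AI or Human, ties fall back to the first selected model's label, and the reported confidence is the mean of the per-model confidences; models without `predict_proba` fall back to the decision margin s squashed to |s| / (1 + |s|). Assuming the pickled artifacts are present on disk, a call looks roughly like this (the input sentence and all numbers are illustrative):

from features.nepali_text_classifier.inferencer import classify_text

result = classify_text("यो एउटा परीक्षण अनुच्छेद हो। यसमा धेरै वाक्यहरू छन्।")
# Illustrative response shape; actual values depend on the loaded models:
# {
#     "label": "Human",
#     "confidence": 87.42,                    # mean of per-model confidences, as a percentage
#     "selected_models": ["Gradient Boosting", "Logistic Regression"],
#     "model_predictions": [{"model": "...", "label": "...", "confidence": ...}, ...],
#     "votes": {"AI": 0, "Human": 2},
#     "available_models": ["Gradient Boosting", "Logistic Regression", ...],
#     "unavailable_models": {"K-Nearest Neighbors": "Skipped due to large artifact size"},
# }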
features/nepali_text_classifier/model_loader.py
CHANGED

@@ -1,54 +1,165 @@
-import os
-import shutil
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
 import logging
+import os
+import pickle
+import re
+from functools import lru_cache
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+from config import Config
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+MODEL_FILES = {
+    "Logistic Regression": "Logistic_Regression.pkl",
+    "Random Forest": "Random_Forest.pkl",
+    "Gradient Boosting": "Gradient_Boosting.pkl",
+    "Linear SVC": "Linear_SVC.pkl",
+    "Ridge Classifier": "Ridge_Classifier.pkl",
+    "Multinomial NB": "Multinomial_NB.pkl",
+    "Bernoulli NB": "Bernoulli_NB.pkl",
+    "K-Nearest Neighbors": "KNearest_Neighbors.pkl",
+}
+
+# KNN artifact in this repo is very large; keep API responsive by skipping it.
+SKIP_MODELS = {"K-Nearest Neighbors"}
+
+# Ranked by validation accuracy from final_model/final_results.csv
+DEFAULT_MODEL_RANKING = [
+    "Gradient Boosting",
+    "Logistic Regression",
+    "Linear SVC",
+    "Ridge Classifier",
+    "Bernoulli NB",
+    "Random Forest",
+    "Multinomial NB",
+]
+
+
+class NepaliRichFeatures:
+    """Burstiness + stylometry feature extractor used during model training."""
+
+    @staticmethod
+    def extract_burstiness(text: str) -> dict:
+        sentences = [s.strip() for s in re.split(r"[।!?]", str(text)) if s.strip()]
+        if not sentences:
+            return {
+                "burst_mean": 0.0,
+                "burst_std": 0.0,
+                "burst_max": 0.0,
+                "burst_min": 0.0,
+                "burst_range": 0.0,
+            }
+        lengths = [len(s.split()) for s in sentences]
+        return {
+            "burst_mean": float(np.mean(lengths)),
+            "burst_std": float(np.std(lengths)),
+            "burst_max": float(np.max(lengths)),
+            "burst_min": float(np.min(lengths)),
+            "burst_range": float(np.max(lengths) - np.min(lengths)),
+        }
+
+    @staticmethod
+    def extract_stylometry(text: str) -> dict:
+        words = str(text).split()
+        num_words = max(len(words), 1)
+        num_chars = max(len(str(text)), 1)
+        num_sentences = max(len([s for s in re.split(r"[।!?]", str(text)) if s.strip()]), 1)
+        avg_word_len = float(np.mean([len(w) for w in words])) if words else 0.0
+        avg_sent_len = num_words / num_sentences
+        lexical_diversity = len(set(words)) / num_words
+        punct_count = str(text).count("।") + str(text).count("?") + str(text).count("!") + str(text).count(",")
+        punct_ratio = punct_count / num_chars
+        bigrams = [" ".join(words[i : i + 2]) for i in range(len(words) - 1)]
+        rep_bigram_ratio = (1.0 - len(set(bigrams)) / max(len(bigrams), 1)) if bigrams else 0.0
+        diacritic_count = sum(1 for c in str(text) if "\u093e" <= c <= "\u094d")
+        diacritic_ratio = diacritic_count / num_chars
+        return {
+            "num_words": num_words,
+            "num_chars": num_chars,
+            "num_sentences": num_sentences,
+            "avg_word_len": avg_word_len,
+            "avg_sent_len": avg_sent_len,
+            "lexical_diversity": lexical_diversity,
+            "punct_ratio": punct_ratio,
+            "rep_bigram_ratio": rep_bigram_ratio,
+            "diacritic_ratio": diacritic_ratio,
+        }
+
+    def transform(self, texts):
+        if isinstance(texts, str):
+            texts = [texts]
+        rows = []
+        for text in texts:
+            row = {**self.extract_burstiness(text), **self.extract_stylometry(text)}
+            rows.append(row)
+        return pd.DataFrame(rows).values.astype(np.float32)
+
+
+def _repo_root() -> Path:
+    return Path(__file__).resolve().parents[2]
+
+
+def resolve_model_dir() -> Path:
+    candidates = []
+    if Config.Nepali_model_folder:
+        candidates.append(Path(Config.Nepali_model_folder))
+    repo = _repo_root()
+    candidates.append(repo / "features" / "Model" / "Nepali_model")
+    candidates.append(repo / "notebook" / "ai_vs_human_nepali" / "final_model" / "saved_models")
+
+    for path in candidates:
+        if path.exists() and path.is_dir() and (path / "word_vectorizer.pkl").exists():
+            return path
+    raise FileNotFoundError("Nepali model directory not found. Set Nepali_model env or add expected artifacts.")
+
+
+@lru_cache(maxsize=1)
+def load_artifacts():
+    model_dir = resolve_model_dir()
+    LOGGER.info("Loading Nepali artifacts from %s", model_dir)
+
+    models = {}
+    unavailable = {}
+    for model_name, file_name in MODEL_FILES.items():
+        if model_name in SKIP_MODELS:
+            unavailable[model_name] = "Skipped due to large artifact size"
+            continue
+        file_path = model_dir / file_name
+        if not file_path.exists():
+            unavailable[model_name] = "Missing model file"
+            continue
+        with open(file_path, "rb") as fp:
+            models[model_name] = pickle.load(fp)
+
+    with open(model_dir / "word_vectorizer.pkl", "rb") as fp:
+        word_vectorizer = pickle.load(fp)
+    with open(model_dir / "char_vectorizer.pkl", "rb") as fp:
+        char_vectorizer = pickle.load(fp)
+
+    rich_transformer = NepaliRichFeatures()
+    return {
+        "model_dir": str(model_dir),
+        "models": models,
+        "unavailable_models": unavailable,
+        "word_vectorizer": word_vectorizer,
+        "char_vectorizer": char_vectorizer,
+        "rich_transformer": rich_transformer,
+    }
+
+
+def get_available_models():
+    artifacts = load_artifacts()
+    return list(artifacts["models"].keys())
 
+
+def get_default_top_models(top_k: int = 2):
+    available = set(get_available_models())
+    ranked = [name for name in DEFAULT_MODEL_RANKING if name in available]
+    if not ranked:
+        return list(available)[:top_k]
+    return ranked[: max(1, top_k)]
features/nepali_text_classifier/routes.py
CHANGED

@@ -15,27 +15,42 @@ security = HTTPBearer()
 # Input schema
 class TextInput(BaseModel):
     text: str
+    models: list[str] | None = None
 
 @router.post("/analyse")
 @limiter.limit(ACCESS_RATE)
 async def analyse(request: Request, data: TextInput, token: str = Depends(security)):
+    selected = ",".join(data.models[:2]) if data.models else None
+    result = await nepali_text_analysis(data.text, selected)
     return result
 
 @router.post("/upload")
 @limiter.limit(ACCESS_RATE)
-async def upload_file(request:Request,file:UploadFile=File(...),token:str=Depends(security)):
-    return await handle_file_upload(file)
+async def upload_file(request:Request,file:UploadFile=File(...), models: str | None = None, token:str=Depends(security)):
+    return await handle_file_upload(file, models)
 
 @router.post("/analyse-sentences")
 @limiter.limit(ACCESS_RATE)
 async def upload_file(request:Request,data:TextInput,token:str=Depends(security)):
+    selected = ",".join(data.models[:2]) if data.models else None
+    return await handle_sentence_level_analysis(data.text, selected)
 
 @router.post("/file-sentences-analyse")
 @limiter.limit(ACCESS_RATE)
-async def analyze_sentance_file(request: Request, file: UploadFile = File(...), token: str = Depends(security)):
-    return await handle_file_sentence(file)
+async def analyze_sentance_file(request: Request, file: UploadFile = File(...), models: str | None = None, token: str = Depends(security)):
+    return await handle_file_sentence(file, models)
+
+
+@router.get("/models")
+@limiter.limit(ACCESS_RATE)
+def get_models(request: Request):
+    from .model_loader import get_available_models, get_default_top_models
+
+    available = get_available_models()
+    return {
+        "available_models": available,
+        "default_top_2": get_default_top_models(2),
+    }
 
 
 @router.get("/health")