Spaces:

can-org
/

Testing-AI-Contain

Sleeping

App Files Community

Pujan-Dev committed on Apr 27

Commit

7bda3a9

1 Parent(s): 0d1c39e

push: used only the logistic

Browse files

Files changed (2) hide show

features/nepali_text_classifier/inferencer.py +4 -4
features/nepali_text_classifier/model_loader.py +32 -10

features/nepali_text_classifier/inferencer.py CHANGED Viewed

@@ -5,7 +5,7 @@ from scipy.sparse import csr_matrix, hstack
 from .model_loader import get_default_top_models, load_artifacts
-TOP_K_MODELS = 2
 def normalize_nepali_text(text: str) -> str:
@@ -23,7 +23,7 @@ def _select_models(models, model_names=None, top_k=2):
     return list(models.keys())[:top_k]
-def classify_text(text: str, model_names=None, top_k: int = 2):
     artifacts = load_artifacts()
     models = artifacts["models"]
     if not models:
@@ -81,8 +81,8 @@ def classify_text(text: str, model_names=None, top_k: int = 2):
     return {
         "label": final_label,
         "confidence": round(avg_conf * 100, 2),
-        # "selected_models": selected_names,
-        # "model_predictions": per_model,
         # "votes": {"AI": ai_votes, "Human": human_votes},
         # "available_models": list(models.keys()),
         # "unavailable_models": artifacts["unavailable_models"],

 from .model_loader import get_default_top_models, load_artifacts
+TOP_K_MODELS = 1
 def normalize_nepali_text(text: str) -> str:
     return list(models.keys())[:top_k]
+def classify_text(text: str, model_names="Logistic Regression", top_k: int = 1):
     artifacts = load_artifacts()
     models = artifacts["models"]
     if not models:
     return {
         "label": final_label,
         "confidence": round(avg_conf * 100, 2),
+        "selected_models": selected_names,
+        "model_predictions": per_model,
         # "votes": {"AI": ai_votes, "Human": human_votes},
         # "available_models": list(models.keys()),
         # "unavailable_models": artifacts["unavailable_models"],

features/nepali_text_classifier/model_loader.py CHANGED Viewed

@@ -12,14 +12,13 @@ from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
 from config import Config
 LOGGER = logging.getLogger(__name__)
 MODEL_FILES = {
     "Logistic Regression": "Logistic_Regression.pkl",
     "Random Forest": "Random_Forest.pkl",
-    "Gradient Boosting": "Gradient_Boosting.pkl",
     "Linear SVC": "Linear_SVC.pkl",
     "Ridge Classifier": "Ridge_Classifier.pkl",
     "Multinomial NB": "Multinomial_NB.pkl",
@@ -48,7 +47,9 @@ DEFAULT_MODEL_RANKING = [
 def _patch_legacy_logistic_model(model):
     """Backfill attributes expected by newer sklearn versions."""
-    if isinstance(model, (LogisticRegression, LogisticRegressionCV)) and not hasattr(model, "multi_class"):
         model.multi_class = "auto"
     return model
@@ -81,14 +82,23 @@ class NepaliRichFeatures:
         words = str(text).split()
         num_words = max(len(words), 1)
         num_chars = max(len(str(text)), 1)
-        num_sentences = max(len([s for s in re.split(r"[।!?]", str(text)) if s.strip()]), 1)
         avg_word_len = float(np.mean([len(w) for w in words])) if words else 0.0
         avg_sent_len = num_words / num_sentences
         lexical_diversity = len(set(words)) / num_words
-        punct_count = str(text).count("।") + str(text).count("?") + str(text).count("!") + str(text).count(",")
         punct_ratio = punct_count / num_chars
         bigrams = [" ".join(words[i : i + 2]) for i in range(len(words) - 1)]
-        rep_bigram_ratio = (1.0 - len(set(bigrams)) / max(len(bigrams), 1)) if bigrams else 0.0
         diacritic_count = sum(1 for c in str(text) if "\u093e" <= c <= "\u094d")
         diacritic_ratio = diacritic_count / num_chars
         return {
@@ -135,7 +145,9 @@ def _candidate_model_dirs() -> list[Path]:
     default_dir = repo / "features" / "Model" / "Nepali_model"
     candidates.extend([default_dir, default_dir / NEPALI_SUBDIR])
-    candidates.append(repo / "notebook" / "ai_vs_human_nepali" / "final_model" / "saved_models")
     return candidates
@@ -144,10 +156,18 @@ def _download_nepali_artifacts() -> None:
         raise ValueError("English_model repo id is not configured")
     repo = _repo_root()
-    target_dir = Path(Config.Nepali_model_folder) if Config.Nepali_model_folder else repo / "features" / "Model" / "Nepali_model"
     snapshot_path = Path(snapshot_download(repo_id=REPO_ID, token=HF_TOKEN))
-    source_dir = snapshot_path / NEPALI_SUBDIR if (snapshot_path / NEPALI_SUBDIR).is_dir() else snapshot_path
     target_dir.mkdir(parents=True, exist_ok=True)
     shutil.copytree(source_dir, target_dir, dirs_exist_ok=True)
@@ -165,7 +185,9 @@ def resolve_model_dir() -> Path:
         if _has_required_artifacts(path):
             return path
-    raise FileNotFoundError("Nepali model directory not found. Set Nepali_model env or add expected artifacts.")
 @lru_cache(maxsize=1)

 from config import Config
 LOGGER = logging.getLogger(__name__)
 MODEL_FILES = {
     "Logistic Regression": "Logistic_Regression.pkl",
     "Random Forest": "Random_Forest.pkl",
+    # "Gradient Boosting": "Gradient_Boosting.pkl",
     "Linear SVC": "Linear_SVC.pkl",
     "Ridge Classifier": "Ridge_Classifier.pkl",
     "Multinomial NB": "Multinomial_NB.pkl",
 def _patch_legacy_logistic_model(model):
     """Backfill attributes expected by newer sklearn versions."""
+    if isinstance(model, (LogisticRegression, LogisticRegressionCV)) and not hasattr(
+        model, "multi_class"
+    ):
         model.multi_class = "auto"
     return model
         words = str(text).split()
         num_words = max(len(words), 1)
         num_chars = max(len(str(text)), 1)
+        num_sentences = max(
+            len([s for s in re.split(r"[।!?]", str(text)) if s.strip()]), 1
+        )
         avg_word_len = float(np.mean([len(w) for w in words])) if words else 0.0
         avg_sent_len = num_words / num_sentences
         lexical_diversity = len(set(words)) / num_words
+        punct_count = (
+            str(text).count("।")
+            + str(text).count("?")
+            + str(text).count("!")
+            + str(text).count(",")
+        )
         punct_ratio = punct_count / num_chars
         bigrams = [" ".join(words[i : i + 2]) for i in range(len(words) - 1)]
+        rep_bigram_ratio = (
+            (1.0 - len(set(bigrams)) / max(len(bigrams), 1)) if bigrams else 0.0
+        )
         diacritic_count = sum(1 for c in str(text) if "\u093e" <= c <= "\u094d")
         diacritic_ratio = diacritic_count / num_chars
         return {
     default_dir = repo / "features" / "Model" / "Nepali_model"
     candidates.extend([default_dir, default_dir / NEPALI_SUBDIR])
+    candidates.append(
+        repo / "notebook" / "ai_vs_human_nepali" / "final_model" / "saved_models"
+    )
     return candidates
         raise ValueError("English_model repo id is not configured")
     repo = _repo_root()
+    target_dir = (
+        Path(Config.Nepali_model_folder)
+        if Config.Nepali_model_folder
+        else repo / "features" / "Model" / "Nepali_model"
+    )
     snapshot_path = Path(snapshot_download(repo_id=REPO_ID, token=HF_TOKEN))
+    source_dir = (
+        snapshot_path / NEPALI_SUBDIR
+        if (snapshot_path / NEPALI_SUBDIR).is_dir()
+        else snapshot_path
+    )
     target_dir.mkdir(parents=True, exist_ok=True)
     shutil.copytree(source_dir, target_dir, dirs_exist_ok=True)
         if _has_required_artifacts(path):
             return path
+    raise FileNotFoundError(
+        "Nepali model directory not found. Set Nepali_model env or add expected artifacts."
+    )
 @lru_cache(maxsize=1)