Spaces:
Sleeping
Sleeping
push: use only the Logistic Regression model
Browse files
features/nepali_text_classifier/inferencer.py
CHANGED
|
@@ -5,7 +5,7 @@ from scipy.sparse import csr_matrix, hstack
|
|
| 5 |
from .model_loader import get_default_top_models, load_artifacts
|
| 6 |
|
| 7 |
|
| 8 |
-
TOP_K_MODELS =
|
| 9 |
|
| 10 |
|
| 11 |
def normalize_nepali_text(text: str) -> str:
|
|
@@ -23,7 +23,7 @@ def _select_models(models, model_names=None, top_k=2):
|
|
| 23 |
return list(models.keys())[:top_k]
|
| 24 |
|
| 25 |
|
| 26 |
-
def classify_text(text: str, model_names=
|
| 27 |
artifacts = load_artifacts()
|
| 28 |
models = artifacts["models"]
|
| 29 |
if not models:
|
|
@@ -81,8 +81,8 @@ def classify_text(text: str, model_names=None, top_k: int = 2):
|
|
| 81 |
return {
|
| 82 |
"label": final_label,
|
| 83 |
"confidence": round(avg_conf * 100, 2),
|
| 84 |
-
|
| 85 |
-
|
| 86 |
# "votes": {"AI": ai_votes, "Human": human_votes},
|
| 87 |
# "available_models": list(models.keys()),
|
| 88 |
# "unavailable_models": artifacts["unavailable_models"],
|
|
|
|
| 5 |
from .model_loader import get_default_top_models, load_artifacts
|
| 6 |
|
| 7 |
|
| 8 |
+
TOP_K_MODELS = 1
|
| 9 |
|
| 10 |
|
| 11 |
def normalize_nepali_text(text: str) -> str:
|
|
|
|
| 23 |
return list(models.keys())[:top_k]
|
| 24 |
|
| 25 |
|
| 26 |
+
def classify_text(text: str, model_names="Logistic Regression", top_k: int = 1):
|
| 27 |
artifacts = load_artifacts()
|
| 28 |
models = artifacts["models"]
|
| 29 |
if not models:
|
|
|
|
| 81 |
return {
|
| 82 |
"label": final_label,
|
| 83 |
"confidence": round(avg_conf * 100, 2),
|
| 84 |
+
"selected_models": selected_names,
|
| 85 |
+
"model_predictions": per_model,
|
| 86 |
# "votes": {"AI": ai_votes, "Human": human_votes},
|
| 87 |
# "available_models": list(models.keys()),
|
| 88 |
# "unavailable_models": artifacts["unavailable_models"],
|
features/nepali_text_classifier/model_loader.py
CHANGED
|
@@ -12,14 +12,13 @@ from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
|
|
| 12 |
|
| 13 |
from config import Config
|
| 14 |
|
| 15 |
-
|
| 16 |
LOGGER = logging.getLogger(__name__)
|
| 17 |
|
| 18 |
|
| 19 |
MODEL_FILES = {
|
| 20 |
"Logistic Regression": "Logistic_Regression.pkl",
|
| 21 |
"Random Forest": "Random_Forest.pkl",
|
| 22 |
-
"Gradient Boosting": "Gradient_Boosting.pkl",
|
| 23 |
"Linear SVC": "Linear_SVC.pkl",
|
| 24 |
"Ridge Classifier": "Ridge_Classifier.pkl",
|
| 25 |
"Multinomial NB": "Multinomial_NB.pkl",
|
|
@@ -48,7 +47,9 @@ DEFAULT_MODEL_RANKING = [
|
|
| 48 |
|
| 49 |
def _patch_legacy_logistic_model(model):
|
| 50 |
"""Backfill attributes expected by newer sklearn versions."""
|
| 51 |
-
if isinstance(model, (LogisticRegression, LogisticRegressionCV)) and not hasattr(
|
|
|
|
|
|
|
| 52 |
model.multi_class = "auto"
|
| 53 |
return model
|
| 54 |
|
|
@@ -81,14 +82,23 @@ class NepaliRichFeatures:
|
|
| 81 |
words = str(text).split()
|
| 82 |
num_words = max(len(words), 1)
|
| 83 |
num_chars = max(len(str(text)), 1)
|
| 84 |
-
num_sentences = max(
|
|
|
|
|
|
|
| 85 |
avg_word_len = float(np.mean([len(w) for w in words])) if words else 0.0
|
| 86 |
avg_sent_len = num_words / num_sentences
|
| 87 |
lexical_diversity = len(set(words)) / num_words
|
| 88 |
-
punct_count =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
punct_ratio = punct_count / num_chars
|
| 90 |
bigrams = [" ".join(words[i : i + 2]) for i in range(len(words) - 1)]
|
| 91 |
-
rep_bigram_ratio = (
|
|
|
|
|
|
|
| 92 |
diacritic_count = sum(1 for c in str(text) if "\u093e" <= c <= "\u094d")
|
| 93 |
diacritic_ratio = diacritic_count / num_chars
|
| 94 |
return {
|
|
@@ -135,7 +145,9 @@ def _candidate_model_dirs() -> list[Path]:
|
|
| 135 |
|
| 136 |
default_dir = repo / "features" / "Model" / "Nepali_model"
|
| 137 |
candidates.extend([default_dir, default_dir / NEPALI_SUBDIR])
|
| 138 |
-
candidates.append(
|
|
|
|
|
|
|
| 139 |
return candidates
|
| 140 |
|
| 141 |
|
|
@@ -144,10 +156,18 @@ def _download_nepali_artifacts() -> None:
|
|
| 144 |
raise ValueError("English_model repo id is not configured")
|
| 145 |
|
| 146 |
repo = _repo_root()
|
| 147 |
-
target_dir =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
snapshot_path = Path(snapshot_download(repo_id=REPO_ID, token=HF_TOKEN))
|
| 150 |
-
source_dir =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
target_dir.mkdir(parents=True, exist_ok=True)
|
| 153 |
shutil.copytree(source_dir, target_dir, dirs_exist_ok=True)
|
|
@@ -165,7 +185,9 @@ def resolve_model_dir() -> Path:
|
|
| 165 |
if _has_required_artifacts(path):
|
| 166 |
return path
|
| 167 |
|
| 168 |
-
raise FileNotFoundError(
|
|
|
|
|
|
|
| 169 |
|
| 170 |
|
| 171 |
@lru_cache(maxsize=1)
|
|
|
|
| 12 |
|
| 13 |
from config import Config
|
| 14 |
|
|
|
|
| 15 |
LOGGER = logging.getLogger(__name__)
|
| 16 |
|
| 17 |
|
| 18 |
MODEL_FILES = {
|
| 19 |
"Logistic Regression": "Logistic_Regression.pkl",
|
| 20 |
"Random Forest": "Random_Forest.pkl",
|
| 21 |
+
# "Gradient Boosting": "Gradient_Boosting.pkl",
|
| 22 |
"Linear SVC": "Linear_SVC.pkl",
|
| 23 |
"Ridge Classifier": "Ridge_Classifier.pkl",
|
| 24 |
"Multinomial NB": "Multinomial_NB.pkl",
|
|
|
|
| 47 |
|
| 48 |
def _patch_legacy_logistic_model(model):
|
| 49 |
"""Backfill attributes expected by newer sklearn versions."""
|
| 50 |
+
if isinstance(model, (LogisticRegression, LogisticRegressionCV)) and not hasattr(
|
| 51 |
+
model, "multi_class"
|
| 52 |
+
):
|
| 53 |
model.multi_class = "auto"
|
| 54 |
return model
|
| 55 |
|
|
|
|
| 82 |
words = str(text).split()
|
| 83 |
num_words = max(len(words), 1)
|
| 84 |
num_chars = max(len(str(text)), 1)
|
| 85 |
+
num_sentences = max(
|
| 86 |
+
len([s for s in re.split(r"[।!?]", str(text)) if s.strip()]), 1
|
| 87 |
+
)
|
| 88 |
avg_word_len = float(np.mean([len(w) for w in words])) if words else 0.0
|
| 89 |
avg_sent_len = num_words / num_sentences
|
| 90 |
lexical_diversity = len(set(words)) / num_words
|
| 91 |
+
punct_count = (
|
| 92 |
+
str(text).count("।")
|
| 93 |
+
+ str(text).count("?")
|
| 94 |
+
+ str(text).count("!")
|
| 95 |
+
+ str(text).count(",")
|
| 96 |
+
)
|
| 97 |
punct_ratio = punct_count / num_chars
|
| 98 |
bigrams = [" ".join(words[i : i + 2]) for i in range(len(words) - 1)]
|
| 99 |
+
rep_bigram_ratio = (
|
| 100 |
+
(1.0 - len(set(bigrams)) / max(len(bigrams), 1)) if bigrams else 0.0
|
| 101 |
+
)
|
| 102 |
diacritic_count = sum(1 for c in str(text) if "\u093e" <= c <= "\u094d")
|
| 103 |
diacritic_ratio = diacritic_count / num_chars
|
| 104 |
return {
|
|
|
|
| 145 |
|
| 146 |
default_dir = repo / "features" / "Model" / "Nepali_model"
|
| 147 |
candidates.extend([default_dir, default_dir / NEPALI_SUBDIR])
|
| 148 |
+
candidates.append(
|
| 149 |
+
repo / "notebook" / "ai_vs_human_nepali" / "final_model" / "saved_models"
|
| 150 |
+
)
|
| 151 |
return candidates
|
| 152 |
|
| 153 |
|
|
|
|
| 156 |
raise ValueError("English_model repo id is not configured")
|
| 157 |
|
| 158 |
repo = _repo_root()
|
| 159 |
+
target_dir = (
|
| 160 |
+
Path(Config.Nepali_model_folder)
|
| 161 |
+
if Config.Nepali_model_folder
|
| 162 |
+
else repo / "features" / "Model" / "Nepali_model"
|
| 163 |
+
)
|
| 164 |
|
| 165 |
snapshot_path = Path(snapshot_download(repo_id=REPO_ID, token=HF_TOKEN))
|
| 166 |
+
source_dir = (
|
| 167 |
+
snapshot_path / NEPALI_SUBDIR
|
| 168 |
+
if (snapshot_path / NEPALI_SUBDIR).is_dir()
|
| 169 |
+
else snapshot_path
|
| 170 |
+
)
|
| 171 |
|
| 172 |
target_dir.mkdir(parents=True, exist_ok=True)
|
| 173 |
shutil.copytree(source_dir, target_dir, dirs_exist_ok=True)
|
|
|
|
| 185 |
if _has_required_artifacts(path):
|
| 186 |
return path
|
| 187 |
|
| 188 |
+
raise FileNotFoundError(
|
| 189 |
+
"Nepali model directory not found. Set Nepali_model env or add expected artifacts."
|
| 190 |
+
)
|
| 191 |
|
| 192 |
|
| 193 |
@lru_cache(maxsize=1)
|