Pujan-Dev committed on
Commit
7bda3a9
·
1 Parent(s): 0d1c39e

push: use only the Logistic Regression model

Browse files
features/nepali_text_classifier/inferencer.py CHANGED
@@ -5,7 +5,7 @@ from scipy.sparse import csr_matrix, hstack
5
  from .model_loader import get_default_top_models, load_artifacts
6
 
7
 
8
- TOP_K_MODELS = 2
9
 
10
 
11
  def normalize_nepali_text(text: str) -> str:
@@ -23,7 +23,7 @@ def _select_models(models, model_names=None, top_k=2):
23
  return list(models.keys())[:top_k]
24
 
25
 
26
- def classify_text(text: str, model_names=None, top_k: int = 2):
27
  artifacts = load_artifacts()
28
  models = artifacts["models"]
29
  if not models:
@@ -81,8 +81,8 @@ def classify_text(text: str, model_names=None, top_k: int = 2):
81
  return {
82
  "label": final_label,
83
  "confidence": round(avg_conf * 100, 2),
84
- # "selected_models": selected_names,
85
- # "model_predictions": per_model,
86
  # "votes": {"AI": ai_votes, "Human": human_votes},
87
  # "available_models": list(models.keys()),
88
  # "unavailable_models": artifacts["unavailable_models"],
 
5
  from .model_loader import get_default_top_models, load_artifacts
6
 
7
 
8
+ TOP_K_MODELS = 1
9
 
10
 
11
  def normalize_nepali_text(text: str) -> str:
 
23
  return list(models.keys())[:top_k]
24
 
25
 
26
+ def classify_text(text: str, model_names="Logistic Regression", top_k: int = 1):
27
  artifacts = load_artifacts()
28
  models = artifacts["models"]
29
  if not models:
 
81
  return {
82
  "label": final_label,
83
  "confidence": round(avg_conf * 100, 2),
84
+ "selected_models": selected_names,
85
+ "model_predictions": per_model,
86
  # "votes": {"AI": ai_votes, "Human": human_votes},
87
  # "available_models": list(models.keys()),
88
  # "unavailable_models": artifacts["unavailable_models"],
features/nepali_text_classifier/model_loader.py CHANGED
@@ -12,14 +12,13 @@ from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
12
 
13
  from config import Config
14
 
15
-
16
  LOGGER = logging.getLogger(__name__)
17
 
18
 
19
  MODEL_FILES = {
20
  "Logistic Regression": "Logistic_Regression.pkl",
21
  "Random Forest": "Random_Forest.pkl",
22
- "Gradient Boosting": "Gradient_Boosting.pkl",
23
  "Linear SVC": "Linear_SVC.pkl",
24
  "Ridge Classifier": "Ridge_Classifier.pkl",
25
  "Multinomial NB": "Multinomial_NB.pkl",
@@ -48,7 +47,9 @@ DEFAULT_MODEL_RANKING = [
48
 
49
  def _patch_legacy_logistic_model(model):
50
  """Backfill attributes expected by newer sklearn versions."""
51
- if isinstance(model, (LogisticRegression, LogisticRegressionCV)) and not hasattr(model, "multi_class"):
 
 
52
  model.multi_class = "auto"
53
  return model
54
 
@@ -81,14 +82,23 @@ class NepaliRichFeatures:
81
  words = str(text).split()
82
  num_words = max(len(words), 1)
83
  num_chars = max(len(str(text)), 1)
84
- num_sentences = max(len([s for s in re.split(r"[।!?]", str(text)) if s.strip()]), 1)
 
 
85
  avg_word_len = float(np.mean([len(w) for w in words])) if words else 0.0
86
  avg_sent_len = num_words / num_sentences
87
  lexical_diversity = len(set(words)) / num_words
88
- punct_count = str(text).count("।") + str(text).count("?") + str(text).count("!") + str(text).count(",")
 
 
 
 
 
89
  punct_ratio = punct_count / num_chars
90
  bigrams = [" ".join(words[i : i + 2]) for i in range(len(words) - 1)]
91
- rep_bigram_ratio = (1.0 - len(set(bigrams)) / max(len(bigrams), 1)) if bigrams else 0.0
 
 
92
  diacritic_count = sum(1 for c in str(text) if "\u093e" <= c <= "\u094d")
93
  diacritic_ratio = diacritic_count / num_chars
94
  return {
@@ -135,7 +145,9 @@ def _candidate_model_dirs() -> list[Path]:
135
 
136
  default_dir = repo / "features" / "Model" / "Nepali_model"
137
  candidates.extend([default_dir, default_dir / NEPALI_SUBDIR])
138
- candidates.append(repo / "notebook" / "ai_vs_human_nepali" / "final_model" / "saved_models")
 
 
139
  return candidates
140
 
141
 
@@ -144,10 +156,18 @@ def _download_nepali_artifacts() -> None:
144
  raise ValueError("English_model repo id is not configured")
145
 
146
  repo = _repo_root()
147
- target_dir = Path(Config.Nepali_model_folder) if Config.Nepali_model_folder else repo / "features" / "Model" / "Nepali_model"
 
 
 
 
148
 
149
  snapshot_path = Path(snapshot_download(repo_id=REPO_ID, token=HF_TOKEN))
150
- source_dir = snapshot_path / NEPALI_SUBDIR if (snapshot_path / NEPALI_SUBDIR).is_dir() else snapshot_path
 
 
 
 
151
 
152
  target_dir.mkdir(parents=True, exist_ok=True)
153
  shutil.copytree(source_dir, target_dir, dirs_exist_ok=True)
@@ -165,7 +185,9 @@ def resolve_model_dir() -> Path:
165
  if _has_required_artifacts(path):
166
  return path
167
 
168
- raise FileNotFoundError("Nepali model directory not found. Set Nepali_model env or add expected artifacts.")
 
 
169
 
170
 
171
  @lru_cache(maxsize=1)
 
12
 
13
  from config import Config
14
 
 
15
  LOGGER = logging.getLogger(__name__)
16
 
17
 
18
  MODEL_FILES = {
19
  "Logistic Regression": "Logistic_Regression.pkl",
20
  "Random Forest": "Random_Forest.pkl",
21
+ # "Gradient Boosting": "Gradient_Boosting.pkl",
22
  "Linear SVC": "Linear_SVC.pkl",
23
  "Ridge Classifier": "Ridge_Classifier.pkl",
24
  "Multinomial NB": "Multinomial_NB.pkl",
 
47
 
48
  def _patch_legacy_logistic_model(model):
49
  """Backfill attributes expected by newer sklearn versions."""
50
+ if isinstance(model, (LogisticRegression, LogisticRegressionCV)) and not hasattr(
51
+ model, "multi_class"
52
+ ):
53
  model.multi_class = "auto"
54
  return model
55
 
 
82
  words = str(text).split()
83
  num_words = max(len(words), 1)
84
  num_chars = max(len(str(text)), 1)
85
+ num_sentences = max(
86
+ len([s for s in re.split(r"[।!?]", str(text)) if s.strip()]), 1
87
+ )
88
  avg_word_len = float(np.mean([len(w) for w in words])) if words else 0.0
89
  avg_sent_len = num_words / num_sentences
90
  lexical_diversity = len(set(words)) / num_words
91
+ punct_count = (
92
+ str(text).count("।")
93
+ + str(text).count("?")
94
+ + str(text).count("!")
95
+ + str(text).count(",")
96
+ )
97
  punct_ratio = punct_count / num_chars
98
  bigrams = [" ".join(words[i : i + 2]) for i in range(len(words) - 1)]
99
+ rep_bigram_ratio = (
100
+ (1.0 - len(set(bigrams)) / max(len(bigrams), 1)) if bigrams else 0.0
101
+ )
102
  diacritic_count = sum(1 for c in str(text) if "\u093e" <= c <= "\u094d")
103
  diacritic_ratio = diacritic_count / num_chars
104
  return {
 
145
 
146
  default_dir = repo / "features" / "Model" / "Nepali_model"
147
  candidates.extend([default_dir, default_dir / NEPALI_SUBDIR])
148
+ candidates.append(
149
+ repo / "notebook" / "ai_vs_human_nepali" / "final_model" / "saved_models"
150
+ )
151
  return candidates
152
 
153
 
 
156
  raise ValueError("English_model repo id is not configured")
157
 
158
  repo = _repo_root()
159
+ target_dir = (
160
+ Path(Config.Nepali_model_folder)
161
+ if Config.Nepali_model_folder
162
+ else repo / "features" / "Model" / "Nepali_model"
163
+ )
164
 
165
  snapshot_path = Path(snapshot_download(repo_id=REPO_ID, token=HF_TOKEN))
166
+ source_dir = (
167
+ snapshot_path / NEPALI_SUBDIR
168
+ if (snapshot_path / NEPALI_SUBDIR).is_dir()
169
+ else snapshot_path
170
+ )
171
 
172
  target_dir.mkdir(parents=True, exist_ok=True)
173
  shutil.copytree(source_dir, target_dir, dirs_exist_ok=True)
 
185
  if _has_required_artifacts(path):
186
  return path
187
 
188
+ raise FileNotFoundError(
189
+ "Nepali model directory not found. Set Nepali_model env or add expected artifacts."
190
+ )
191
 
192
 
193
  @lru_cache(maxsize=1)