ashtii commited on
Commit
cddf9bb
·
verified ·
1 Parent(s): 6681f0f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +241 -82
app.py CHANGED
@@ -1,110 +1,269 @@
1
- # app.py — attempt to reconstruct training features by stacking available vectorizers
2
  import gradio as gr
3
- import joblib, os, requests
 
4
  import numpy as np
 
 
5
  from scipy.sparse import hstack, csr_matrix
6
 
7
- # download model files directly from HF repo
8
- BASE = "https://huggingface.co/ashtii/cosmetic-category-model/resolve/main/"
9
- FILES = ["model.joblib",
10
- "char_vect.joblib","word_vect.joblib","vect_f.joblib",
11
- "char_vect_cat.joblib","word_vect_cat.joblib"]
12
 
13
- os.makedirs("modelrepo", exist_ok=True)
14
- for f in FILES:
15
- url = BASE + f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  try:
17
  r = requests.get(url, timeout=20)
18
- if r.status_code == 200:
19
- with open(os.path.join("modelrepo", f), "wb") as fh:
20
  fh.write(r.content)
21
- print("Downloaded", f)
22
- except Exception as e:
23
- print("skip", f, e)
 
24
 
25
- # load model
26
- model = joblib.load("modelrepo/model.joblib")
27
- EXPECTED_DIM = getattr(model, "n_features_in_", None)
28
- print("Model expects features:", EXPECTED_DIM)
 
 
 
 
 
 
 
 
29
 
30
- # helper to load optional vectorizers
31
- def opt_load(path):
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  try:
33
- return joblib.load(path)
34
  except Exception:
35
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
- # load vectorizers that exist
38
- vec_names = ["char_vect.joblib","word_vect.joblib","vect_f.joblib","char_vect_cat.joblib","word_vect_cat.joblib"]
39
- vecs = []
40
- for name in vec_names:
41
- p = os.path.join("modelrepo", name)
42
- v = opt_load(p)
43
- if v is not None:
44
- vecs.append((name, v))
45
- print("Loaded vectorizer:", name, type(v))
46
-
47
- # Function to build combined features
48
- def build_features(text):
49
- # Accept `text` as string or list
50
- if isinstance(text, str):
51
- X_in = [text]
52
- elif isinstance(text, (list,tuple)):
53
- X_in = list(text)
54
- else:
55
- X_in = [str(text)]
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  mats = []
58
- for (name, v) in vecs:
59
  try:
60
- mat = v.transform(X_in)
61
- mats.append(mat if hasattr(mat, "shape") else csr_matrix(mat))
62
  except Exception as e:
63
- print("transform failed for", name, e)
64
-
 
 
 
65
  if not mats:
66
- # No vectorizers loaded — fallback: try model.predict on raw text (may fail)
67
  return None
68
-
69
- # hstack the sparse matrices in the same order we loaded them
70
  try:
71
- X_comb = hstack(mats).tocsr()
72
- except Exception as e:
73
- # if any mat is dense, convert to sparse and hstack
74
  mats2 = [csr_matrix(m) if not hasattr(m, "tocsr") else m.tocsr() for m in mats]
75
- X_comb = hstack(mats2).tocsr()
76
-
77
- # If model expects a fixed size, pad or trim to match
78
- if EXPECTED_DIM is not None:
79
- cur = X_comb.shape[1]
80
- if cur < EXPECTED_DIM:
81
- # pad with zeros on the right
82
- pad_width = EXPECTED_DIM - cur
83
- pad = csr_matrix((X_comb.shape[0], pad_width), dtype=X_comb.dtype)
84
- X_comb = hstack([X_comb, pad]).tocsr()
85
- elif cur > EXPECTED_DIM:
86
- # trim extra columns (best-effort)
87
- X_comb = X_comb[:, :EXPECTED_DIM]
88
-
89
- return X_comb
90
-
91
- # prediction function
92
- def predict(text):
93
  try:
94
- X = build_features(text)
 
 
 
95
  if X is None:
96
- return {"error": "No vectorizers available; cannot build features."}
97
-
98
- # If model accepts predict_proba return probabilities else labels
99
- if hasattr(model, "predict_proba"):
100
- out = model.predict_proba(X).tolist()
 
 
 
 
 
101
  else:
102
- out = model.predict(X).tolist()
103
- return {"prediction": out}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  except Exception as e:
105
  return {"error": str(e)}
106
 
107
- # Gradio interface
108
- iface = gr.Interface(fn=predict, inputs=gr.Textbox(lines=2, placeholder="Aqua, glycerin, ..."), outputs="json",
109
- title="Cosmetic Category Classifier")
 
 
 
 
 
 
 
 
 
 
110
  iface.launch()
 
1
+ # app.py
2
  import gradio as gr
3
+ import joblib
4
+ import os, requests, json
5
  import numpy as np
6
+ import pandas as pd
7
+ from difflib import get_close_matches, SequenceMatcher
8
  from scipy.sparse import hstack, csr_matrix
9
 
10
# ---- CONFIG ----
HF_REPO = "ashtii/cosmetic-category-model" # your HF repo with model + vectorizers + optional labels/ingredients
BASE_URL = f"https://huggingface.co/{HF_REPO}/resolve/main/"

# filenames we expect in the repo
MODEL_FNAME = "model.joblib"
LABELS_FNAME = "labels.json" # optional: list of class names in order
# Candidate names for the ingredient-info CSV; the first one that downloads
# successfully is used (covers the historical spellings of the dataset file).
ING_CSV_CANDIDATES = [
    "ingredients.csv",
    "final_ingridients_dataset.csv",
    "final_ingridients_dataset - Sheet1.csv",
    "final ingridients dataset - Sheet1.csv"
]
# Vectorizer files whose transforms are hstack-ed (in this order) to rebuild
# the feature matrix the model was trained on.
VECT_FILES = ["char_vect.joblib","word_vect.joblib","vect_f.joblib","char_vect_cat.joblib","word_vect_cat.joblib"]

# Local working directory where downloaded assets are cached.
WORKDIR = "modelrepo"
os.makedirs(WORKDIR, exist_ok=True)
27
+
28
# ---- helper: download file from HF repo if exists ----
def try_download(fname):
    """Best-effort download of *fname* from the HF repo into WORKDIR.

    Returns the local save path on success; returns None on any failure
    (network error, non-200 status, empty body). Never raises.
    """
    dest = os.path.join(WORKDIR, fname)
    try:
        resp = requests.get(BASE_URL + fname, timeout=20)
        if resp.status_code == 200 and resp.content:
            with open(dest, "wb") as fh:
                fh.write(resp.content)
            return dest
    except Exception:
        pass  # deliberately silent: downloads are optional
    return None
41
 
42
# download model + vectorizers + labels + ingredients if available
print("Downloading model & assets (best-effort)...")
try_download(MODEL_FNAME)
for vf in VECT_FILES:
    try_download(vf)
try_download(LABELS_FNAME)
# First ingredient-CSV candidate that actually downloads wins; None if none do.
# map() is lazy, so later candidates are not fetched once one succeeds.
ing_path = next((p for p in map(try_download, ING_CSV_CANDIDATES) if p), None)
54
 
55
# ---- load model ----
_model_path = os.path.join(WORKDIR, MODEL_FNAME)
if not os.path.exists(_model_path):
    # A missing model is fatal: everything below depends on it.
    raise RuntimeError(f"Model file not found in repo. Please add {MODEL_FNAME} to {HF_REPO}.")
model = joblib.load(_model_path)
print("Loaded model:", type(model))
60
+
61
# get class labels from model if possible, else from labels.json
CLASS_LABELS = None
try:
    # sklearn-style classifiers expose class names on classes_
    if hasattr(model, "classes_"):
        CLASS_LABELS = list(map(str, model.classes_.tolist()))
except Exception:
    CLASS_LABELS = None

if CLASS_LABELS is None and os.path.exists(os.path.join(WORKDIR, LABELS_FNAME)):
    try:
        # use a context manager so the file handle is closed
        # (the previous json.load(open(...)) leaked it)
        with open(os.path.join(WORKDIR, LABELS_FNAME), "r") as fh:
            CLASS_LABELS = json.load(fh)
    except Exception:
        CLASS_LABELS = None
74
+
75
# ---- load available vectorizers (order matters) ----
vectorizers = []
for name in VECT_FILES:
    path = os.path.join(WORKDIR, name)
    if not os.path.exists(path):
        continue  # optional asset; skip quietly when absent
    try:
        loaded = joblib.load(path)
    except Exception as e:
        print("Failed load vectorizer", name, e)
    else:
        vectorizers.append((name, loaded))
        print("Loaded vectorizer:", name, type(loaded))
86
+
87
# ---- load ingredients CSV (if available) ----
ING_DF = None
if ing_path and os.path.exists(ing_path):
    try:
        ING_DF = pd.read_csv(ing_path)
        # strip surrounding whitespace from column names (case is preserved;
        # later lookups expect the literal "Ingredient" header)
        ING_DF.columns = [c.strip() for c in ING_DF.columns]
        print("Loaded ingredients CSV:", ing_path, "columns:", ING_DF.columns.tolist())
    except Exception as e:
        print("Failed to load ingredients CSV:", e)
else:
    print("No ingredients CSV found in repo. Upload a CSV named ingredients.csv with columns like Ingredient, Function, Benefits, Harmfulness.")
99
+
100
# ---- helpers for ingredient matching & normalization ----
import re

# Compiled once at module load instead of importing/compiling on every call.
_PARENS_RE = re.compile(r"\([^)]*\)")          # parenthesised synonyms, e.g. "aqua (water)"
_DISALLOWED_RE = re.compile(r"[^a-z0-9\s%/.,-]")  # anything outside the kept charset

def normalize_ingredient(s):
    """Normalize one ingredient name for fuzzy matching.

    Lower-cases and trims *s*, drops parenthesised content, replaces
    disallowed punctuation with spaces, and collapses whitespace.
    Non-string input yields "".
    """
    if not isinstance(s, str):
        return ""
    s = s.lower().strip()
    s = _PARENS_RE.sub("", s)
    s = _DISALLOWED_RE.sub(" ", s)
    return " ".join(s.split())
111
 
112
def fuzzy_best_match(name, choices, cutoff=0.6):
    """Return (best_choice, score) scored by SequenceMatcher ratio.

    Returns (None, 0.0) when *choices* is empty. The best candidate is
    returned even when its score is below *cutoff*; the parameter is kept
    for interface compatibility (callers inspect the score themselves).
    The original's cutoff branch was dead code — both branches returned
    the same tuple — so it has been removed.
    """
    best = None
    best_score = 0.0
    for cand in choices:
        score = SequenceMatcher(None, name, cand).ratio()
        # strict '>' keeps the first of equally-scored candidates
        if score > best_score:
            best = cand
            best_score = score
    return best, best_score
 
 
 
 
 
126
 
127
# get choices from ING_DF
# Lower-cased ingredient names used as the fuzzy-match vocabulary; prefer the
# "Ingredient" column, else fall back to the first column of the CSV.
ING_CHOICES = []
if ING_DF is not None:
    if "Ingredient" in ING_DF.columns:
        source_col = "Ingredient"
    elif len(ING_DF.columns) > 0:
        source_col = ING_DF.columns[0]
    else:
        source_col = None
    if source_col is not None:
        ING_CHOICES = [str(x).strip().lower() for x in ING_DF[source_col].astype(str).tolist()]
137
+
138
# ---- helper to build feature vector consistent with model ----
def build_feature_matrix(texts):
    """Build a sparse feature matrix for *texts* from the loaded vectorizers.

    texts: list[str]
    Stacks each vectorizer's transform horizontally (in VECT_FILES order),
    then pads with zero columns or trims to model.n_features_in_ when the
    model exposes that attribute. Returns None if no vectorizer produced
    any features.
    """
    mats = []
    for name, v in vectorizers:
        try:
            mats.append(v.transform(texts))
        except Exception:
            # if transform fails, retry on normalized strings (best-effort;
            # a second failure means this vectorizer contributes nothing)
            try:
                mats.append(v.transform([normalize_ingredient(t) for t in texts]))
            except Exception:
                pass
    if not mats:
        return None
    try:
        X = hstack(mats).tocsr()
    except Exception:
        # some transform returned a dense array — coerce everything to CSR
        mats2 = [csr_matrix(m) if not hasattr(m, "tocsr") else m.tocsr() for m in mats]
        X = hstack(mats2).tocsr()
    # pad or trim to model.n_features_in_ if available
    n_expected = getattr(model, "n_features_in_", None)
    if n_expected is not None:
        cur = X.shape[1]
        if cur < n_expected:
            pad = csr_matrix((X.shape[0], n_expected - cur), dtype=X.dtype)
            X = hstack([X, pad]).tocsr()
        elif cur > n_expected:
            X = X[:, :n_expected]
    return X
171
+
172
# ---- main predict + ingredient analysis function ----
def analyze_and_predict(raw_text: str):
    """Predict the product category for *raw_text* and analyse each ingredient.

    Returns {"category": ..., "ingredients": [...]} on success, or
    {"error": "..."} — this function never raises; any failure is reported
    in the returned payload.
    """
    try:
        # 1) category prediction
        texts = [raw_text]
        X = build_feature_matrix(texts)
        category_result = None
        if X is None:
            # try direct predict (works only if the model embeds its own
            # text pipeline; otherwise report the missing vectorizers)
            try:
                if hasattr(model, "predict_proba"):
                    probs = model.predict_proba(texts)[0].tolist()
                else:
                    pred = model.predict(texts).tolist()
                    probs = [float(pred[0])]
            except Exception as e:
                category_result = {"error": "Model cannot run (missing vectorizers). " + str(e)}
                probs = None
        else:
            if hasattr(model, "predict_proba"):
                probs = model.predict_proba(X)[0].tolist()
            else:
                # no probabilities available — expose raw predictions as floats
                pred = model.predict(X).tolist()
                probs = [float(x) for x in pred]

        if probs is not None:
            # map the argmax to a label name when labels are known
            # (argmax computed once; the original duplicated it per branch)
            label_idx = int(np.argmax(probs))
            if CLASS_LABELS:
                label_name = CLASS_LABELS[label_idx] if label_idx < len(CLASS_LABELS) else str(label_idx)
            else:
                label_name = str(label_idx)
            category_result = {
                "label": label_name,
                "label_index": int(label_idx),
                "probabilities": probs,
                "classes": CLASS_LABELS or [str(i) for i in range(len(probs))]
            }

        # 2) ingredient analysis: split input by commas and newlines
        raw_items = [i.strip() for i in raw_text.replace("\n", ",").split(",") if i.strip()]
        analyses = []
        for item in raw_items:
            norm = normalize_ingredient(item)
            # cutoff=0.0: always take the nearest name; score is reported back
            best_match, score = fuzzy_best_match(norm, ING_CHOICES, cutoff=0.0)
            row = None
            if best_match and ING_DF is not None:
                # prefer an exact (case-insensitive) match in the Ingredient
                # column, else substring-scan every row. NOTE(review): an
                # unused full-frame mask computation was removed here.
                if "Ingredient" in ING_DF.columns:
                    matches = ING_DF[ING_DF["Ingredient"].astype(str).str.strip().str.lower() == best_match]
                    if len(matches) == 0:
                        # fallback to fuzzy first hit
                        matches = ING_DF[ING_DF.apply(lambda row: best_match in str(row.values).lower(), axis=1)]
                else:
                    matches = ING_DF[ING_DF.apply(lambda row: best_match in str(row.values).lower(), axis=1)]
                if len(matches) > 0:
                    row = matches.iloc[0]
            # build analysis dict
            analysis = {
                "input": item,
                "normalized": norm,
                "matched": best_match,
                "match_score": float(score)
            }
            if row is not None:
                # copy every known CSV field, mapping NaN to None
                for col in ING_DF.columns:
                    try:
                        analysis[col] = row[col] if pd.notna(row[col]) else None
                    except Exception:
                        analysis[col] = None
            analyses.append(analysis)

        # final JSON
        return {"category": category_result, "ingredients": analyses}

    except Exception as e:
        return {"error": str(e)}
255
 
256
# ---- Gradio interface ----
def api_predict(text):
    """Gradio entry point: forward the raw textbox string, return a JSON-able dict."""
    return analyze_and_predict(text)

title = "Category + Ingredient Analysis"
desc = "Paste product ingredient string (comma separated). Returns predicted category and per-ingredient analysis."

iface = gr.Interface(
    fn=api_predict,
    inputs=gr.Textbox(lines=3, placeholder="Aqua, Glycerin, Aloe vera, ..."),
    outputs="json",
    title=title,
    description=desc,
)

iface.launch()