aqibtahir committed on
Commit
ce8bebc
·
verified ·
1 Parent(s): 4cea15f

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +33 -182
app.py CHANGED
@@ -1,89 +1,59 @@
1
  """
2
- FastAPI Serverless API for Cookie Classification
3
- Deploy this to Hugging Face Spaces for FREE serverless inference!
4
  """
5
-
6
  from fastapi import FastAPI, HTTPException
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from pydantic import BaseModel
9
- from typing import List, Optional
10
  from huggingface_hub import hf_hub_download
11
  import joblib
12
  import numpy as np
13
  import re
14
  import pandas as pd
15
  from scipy.sparse import hstack, csr_matrix
16
- import os
17
 
18
- # Initialize FastAPI
19
- app = FastAPI(
20
- title="Cookie Classifier API",
21
- description="Classify web cookies into privacy categories: Strictly Necessary, Functionality, Analytics, Advertising/Tracking",
22
- version="1.0.0"
23
- )
24
 
25
- # Enable CORS for frontend access
26
  app.add_middleware(
27
  CORSMiddleware,
28
- allow_origins=["*"], # In production, specify your frontend domain
29
  allow_credentials=True,
30
  allow_methods=["*"],
31
  allow_headers=["*"],
32
  )
33
 
34
- # Class mapping
35
- CLASS_NAMES = {
36
- 0: "Strictly Necessary",
37
- 1: "Functionality",
38
- 2: "Analytics",
39
- 3: "Advertising/Tracking"
40
- }
41
-
42
- # Tracker tokens
43
- TRACKER_TOKENS = {
44
- "ga", "gid", "utm", "ad", "ads", "pixel", "trk", "track", "fbp", "fbc",
45
- "gclid", "sess", "session", "id", "uuid", "cid", "cmp", "campaign",
46
- "click", "impress"
47
- }
48
-
49
- # Global model storage
50
  model = None
51
  tfidf_word = None
52
  tfidf_char = None
53
 
 
 
 
54
  def extract_name_features(s: str):
55
- """Extract engineered features from cookie name"""
56
  if not isinstance(s, str):
57
  s = ""
58
-
59
  lower = s.lower()
60
  L = len(s)
61
  digits = sum(ch.isdigit() for ch in s)
62
  alphas = sum(ch.isalpha() for ch in s)
63
- underscores = lower.count("_")
64
- dashes = lower.count("-")
65
- dots = lower.count(".")
66
- prefix3 = lower[:3] if L >= 3 else lower
67
- suffix3 = lower[-3:] if L >= 3 else lower
68
  tokens = re.split(r"[^a-z0-9]+", lower)
69
  tokens = [t for t in tokens if t]
70
- uniq_tokens = len(set(tokens))
71
- token_len_mean = np.mean([len(t) for t in tokens]) if tokens else 0.0
72
  has_tracker = int(any(t in TRACKER_TOKENS for t in tokens))
73
- camel = int(bool(re.search(r"[a-z][A-Z]", s)))
74
- snake = int("_" in s)
75
- has_hex = int(bool(re.search(r"\b[0-9a-f]{8,}\b", lower)))
76
 
77
  return {
78
- "len": L, "digits": digits, "alphas": alphas, "underscores": underscores,
79
- "dashes": dashes, "dots": dots, "prefix3": prefix3, "suffix3": suffix3,
80
- "uniq_tokens": uniq_tokens, "token_len_mean": float(token_len_mean),
81
- "has_tracker_token": has_tracker, "camelCase": camel, "snake_case": snake,
82
- "has_hex": has_hex
 
 
 
 
83
  }
84
 
85
  def build_name_features(series):
86
- """Build name features DataFrame"""
87
  X = pd.DataFrame([extract_name_features(x) for x in series.fillna("")])
88
  for col in ["prefix3", "suffix3"]:
89
  top = X[col].value_counts().head(30).index
@@ -92,179 +62,60 @@ def build_name_features(series):
92
  return X
93
 
94
  def preprocess_cookie(cookie_name: str):
95
- """Complete preprocessing for a single cookie name"""
96
  series = pd.Series([cookie_name])
97
-
98
- # TF-IDF features
99
  Xw = tfidf_word.transform(series.fillna("").astype(str))
100
  Xc = tfidf_char.transform(series.fillna("").astype(str))
101
  Xtf = hstack([Xw, Xc])
102
-
103
- # Name features
104
  Xname = build_name_features(series)
105
  Xname = Xname.select_dtypes(include=[np.number]).astype("float64")
106
-
107
- # Combine
108
  X_combined = hstack([Xtf, csr_matrix(Xname.values)])
109
  return X_combined
110
 
111
  @app.on_event("startup")
112
- async def load_model():
113
- """Load model and vectorizers on startup"""
114
  global model, tfidf_word, tfidf_char
115
-
116
  try:
117
- print("🔄 Loading model from Hugging Face...")
118
-
119
- # Download model
120
- model_path = hf_hub_download(
121
- repo_id="aqibtahir/cookie-classifier-lr-tfidf",
122
- filename="LR_TFIDF+NAME.joblib"
123
- )
124
  model = joblib.load(model_path)
125
- print("✓ Model loaded")
126
 
127
- # Load vectorizers
128
- print("🔄 Loading vectorizers...")
129
- tfidf_word_path = hf_hub_download(
130
- repo_id="aqibtahir/cookie-classifier-lr-tfidf",
131
- filename="tfidf_word.joblib"
132
- )
133
- tfidf_char_path = hf_hub_download(
134
- repo_id="aqibtahir/cookie-classifier-lr-tfidf",
135
- filename="tfidf_char.joblib"
136
- )
137
- tfidf_word = joblib.load(tfidf_word_path)
138
- tfidf_char = joblib.load(tfidf_char_path)
139
- print("✓ Vectorizers loaded")
140
- print("🎉 API ready to serve predictions!")
141
 
 
142
  except Exception as e:
143
- print(f"Error during startup: {e}")
144
- import traceback
145
- traceback.print_exc()
146
  raise
147
 
148
- # Request/Response models
149
  class CookieRequest(BaseModel):
150
  cookie_name: str
151
 
152
- class BatchCookieRequest(BaseModel):
153
- cookie_names: List[str]
154
-
155
  class PredictionResponse(BaseModel):
156
  cookie_name: str
157
  category: str
158
  class_id: int
159
- confidence: Optional[float] = None
160
 
161
  @app.get("/")
162
- async def root():
163
- """Health check and API info"""
164
- return {
165
- "status": "online",
166
- "model": "Cookie Classifier - Linear Regression",
167
- "categories": list(CLASS_NAMES.values()),
168
- "endpoints": {
169
- "predict": "/predict",
170
- "batch": "/predict/batch",
171
- "docs": "/docs"
172
- }
173
- }
174
 
175
  @app.post("/predict", response_model=PredictionResponse)
176
- async def predict(request: CookieRequest):
177
- """
178
- Predict cookie category for a single cookie name
179
-
180
- Example:
181
- ```
182
- POST /predict
183
- {"cookie_name": "_ga"}
184
- ```
185
- """
186
- if not model:
187
  raise HTTPException(status_code=503, detail="Model not loaded")
188
 
189
- if not tfidf_word or not tfidf_char:
190
- raise HTTPException(
191
- status_code=503,
192
- detail="Vectorizers not available. Please upload tfidf_word.joblib and tfidf_char.joblib to the model repository"
193
- )
194
-
195
  try:
196
- # Preprocess and predict
197
  features = preprocess_cookie(request.cookie_name)
198
  prediction = model.predict(features)[0]
199
  class_id = int(prediction)
200
 
201
- # Get confidence if available
202
- confidence = None
203
- try:
204
- decision = model.decision_function(features)[0]
205
- # Normalize decision scores to pseudo-probabilities
206
- scores = np.exp(decision) / np.exp(decision).sum()
207
- confidence = float(scores[class_id])
208
- except:
209
- pass
210
-
211
  return PredictionResponse(
212
  cookie_name=request.cookie_name,
213
  category=CLASS_NAMES[class_id],
214
- class_id=class_id,
215
- confidence=confidence
216
- )
217
-
218
- except Exception as e:
219
- raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")
220
-
221
- @app.post("/predict/batch")
222
- async def predict_batch(request: BatchCookieRequest):
223
- """
224
- Predict categories for multiple cookie names
225
-
226
- Example:
227
- ```
228
- POST /predict/batch
229
- {"cookie_names": ["_ga", "sessionid", "utm_campaign"]}
230
- ```
231
- """
232
- if not model:
233
- raise HTTPException(status_code=503, detail="Model not loaded")
234
-
235
- if not tfidf_word or not tfidf_char:
236
- raise HTTPException(
237
- status_code=503,
238
- detail="Vectorizers not available"
239
  )
240
-
241
- try:
242
- results = []
243
- for cookie_name in request.cookie_names:
244
- features = preprocess_cookie(cookie_name)
245
- prediction = model.predict(features)[0]
246
- class_id = int(prediction)
247
-
248
- confidence = None
249
- try:
250
- decision = model.decision_function(features)[0]
251
- scores = np.exp(decision) / np.exp(decision).sum()
252
- confidence = float(scores[class_id])
253
- except:
254
- pass
255
-
256
- results.append({
257
- "cookie_name": cookie_name,
258
- "category": CLASS_NAMES[class_id],
259
- "class_id": class_id,
260
- "confidence": confidence
261
- })
262
-
263
- return {"predictions": results}
264
-
265
  except Exception as e:
266
- raise HTTPException(status_code=500, detail=f"Batch prediction error: {str(e)}")
267
-
268
- if __name__ == "__main__":
269
- import uvicorn
270
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  """
2
+ Minimal FastAPI for Cookie Classification
 
3
  """
 
4
  from fastapi import FastAPI, HTTPException
5
  from fastapi.middleware.cors import CORSMiddleware
6
  from pydantic import BaseModel
 
7
  from huggingface_hub import hf_hub_download
8
  import joblib
9
  import numpy as np
10
  import re
11
  import pandas as pd
12
  from scipy.sparse import hstack, csr_matrix
 
13
 
14
# FastAPI application instance; interactive docs are served at /docs.
app = FastAPI(title="Cookie Classifier API")

# CORS
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# maximally permissive — lock allow_origins down to the real frontend
# domain(s) before production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
24
 
25
# Globals
# Model artifacts, populated by the startup hook; all three stay None until
# load_model() has run, and the /predict endpoint checks for that.
model = None        # deserialized classifier (joblib)
tfidf_word = None   # fitted word-level TF-IDF vectorizer
tfidf_char = None   # fitted char-level TF-IDF vectorizer
29
 
30
# Numeric class index -> human-readable privacy category.
CLASS_NAMES = {
    0: "Strictly Necessary",
    1: "Functionality",
    2: "Analytics",
    3: "Advertising/Tracking",
}

# Lowercase tokens whose presence in a cookie name suggests tracking/analytics.
TRACKER_TOKENS = {
    "ga", "gid", "utm", "ad", "ads", "pixel", "trk", "track", "fbp", "fbc",
    "gclid", "sess", "session", "id", "uuid", "cid", "cmp", "campaign",
    "click", "impress",
}

def extract_name_features(s: str):
    """Compute hand-engineered lexical features for a single cookie name.

    Non-string input is treated as the empty string. Returns a dict of
    numeric features plus the raw 3-character lowercase prefix/suffix
    (the string columns are encoded/dropped downstream).
    """
    if not isinstance(s, str):
        s = ""
    lowered = s.lower()
    n = len(s)

    digit_count = sum(1 for ch in s if ch.isdigit())
    alpha_count = sum(1 for ch in s if ch.isalpha())

    # Alphanumeric runs of the lowercased name; separators are discarded.
    parts = [tok for tok in re.split(r"[^a-z0-9]+", lowered) if tok]
    mean_token_len = float(np.mean([len(tok) for tok in parts])) if parts else 0.0

    return {
        "len": n,
        "digits": digit_count,
        "alphas": alpha_count,
        "underscores": lowered.count("_"),
        "dashes": lowered.count("-"),
        "dots": lowered.count("."),
        "prefix3": lowered[:3] if n >= 3 else lowered,
        "suffix3": lowered[-3:] if n >= 3 else lowered,
        "uniq_tokens": len(set(parts)),
        "token_len_mean": mean_token_len,
        "has_tracker_token": int(any(tok in TRACKER_TOKENS for tok in parts)),
        # Lowercase letter immediately followed by uppercase => camelCase.
        "camelCase": int(re.search(r"[a-z][A-Z]", s) is not None),
        "snake_case": int("_" in s),
        # A standalone run of 8+ hex digits usually means an opaque ID.
        "has_hex": int(re.search(r"\b[0-9a-f]{8,}\b", lowered) is not None),
    }
55
 
56
  def build_name_features(series):
 
57
  X = pd.DataFrame([extract_name_features(x) for x in series.fillna("")])
58
  for col in ["prefix3", "suffix3"]:
59
  top = X[col].value_counts().head(30).index
 
62
  return X
63
 
64
def preprocess_cookie(cookie_name: str):
    """Build the combined sparse feature row for one cookie name.

    Concatenates word-level TF-IDF, char-level TF-IDF, and the numeric
    engineered name features into a single sparse matrix in the layout
    the model was trained on. Requires the startup hook to have populated
    tfidf_word / tfidf_char.
    """
    series = pd.Series([cookie_name])
    cleaned = series.fillna("").astype(str)

    # Sparse text features from both fitted vectorizers.
    word_feats = tfidf_word.transform(cleaned)
    char_feats = tfidf_char.transform(cleaned)
    text_feats = hstack([word_feats, char_feats])

    # Engineered name features; only numeric columns survive (string
    # prefix/suffix columns are dropped here).
    name_df = build_name_features(series)
    name_df = name_df.select_dtypes(include=[np.number]).astype("float64")

    return hstack([text_feats, csr_matrix(name_df.values)])
73
 
74
@app.on_event("startup")
def load_model():
    """Download and deserialize the classifier and both TF-IDF vectorizers
    from the Hugging Face Hub into the module-level globals.

    Re-raises on any failure so the app refuses to start half-loaded
    (the /predict endpoint also guards against unset globals).
    """
    global model, tfidf_word, tfidf_char
    # Single source of truth for the model repository.
    repo_id = "aqibtahir/cookie-classifier-lr-tfidf"
    try:
        print("Loading model...")
        model_path = hf_hub_download(repo_id=repo_id, filename="LR_TFIDF+NAME.joblib")
        model = joblib.load(model_path)

        print("Loading vectorizers...")
        word_path = hf_hub_download(repo_id=repo_id, filename="tfidf_word.joblib")
        char_path = hf_hub_download(repo_id=repo_id, filename="tfidf_char.joblib")
        tfidf_word = joblib.load(word_path)
        tfidf_char = joblib.load(char_path)

        print("✓ Ready!")
    except Exception as e:
        # Surface the full stack trace in the Space logs, not just the
        # exception message, so download/deserialization failures are
        # diagnosable.
        import traceback
        traceback.print_exc()
        print(f"Error: {e}")
        raise
92
 
 
93
class CookieRequest(BaseModel):
    """Request body for /predict: a single raw cookie name."""
    cookie_name: str
95
 
 
 
 
96
class PredictionResponse(BaseModel):
    """Response body for /predict."""
    cookie_name: str  # echoed input
    category: str     # human-readable label from CLASS_NAMES
    class_id: int     # numeric class index (0-3)
 
100
 
101
@app.get("/")
def root():
    """Health check: confirm the service is up and list the label set."""
    labels = [CLASS_NAMES[class_id] for class_id in sorted(CLASS_NAMES)]
    return {"status": "online", "categories": labels}
 
 
 
 
 
 
 
 
 
 
104
 
105
@app.post("/predict", response_model=PredictionResponse)
def predict(request: CookieRequest):
    """Classify a single cookie name into one of the CLASS_NAMES categories.

    Returns 503 while the model artifacts are still loading and 500 if
    featurization or inference fails.
    """
    # Explicit None checks: the unloaded sentinel is None, and fitted
    # estimators/vectorizers should not be tested for truthiness (some
    # sklearn objects define __len__, making `not x` unreliable).
    if model is None or tfidf_word is None or tfidf_char is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        features = preprocess_cookie(request.cookie_name)
        class_id = int(model.predict(features)[0])
        return PredictionResponse(
            cookie_name=request.cookie_name,
            category=CLASS_NAMES[class_id],
            class_id=class_id,
        )
    except Exception as e:
        # Boundary handler: convert any featurization/inference error into
        # a 500, preserving the cause for debugging.
        raise HTTPException(status_code=500, detail=str(e)) from e