syeedalireza commited on
Commit
c47feb4
·
verified ·
1 Parent(s): 85907e5

Upload folder using huggingface_hub

Browse files
Files changed (9) hide show
  1. .env.example +5 -0
  2. README.md +67 -0
  3. config.py +21 -0
  4. data/.gitkeep +0 -0
  5. data/serp_labels.csv +21 -0
  6. inference.py +54 -0
  7. models/.gitkeep +0 -0
  8. requirements.txt +10 -0
  9. train.py +64 -0
.env.example ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # SERP-Feature-Classifier
2
+ DATA_PATH=data/serp_labels.csv
3
+ MODEL_DIR=models
4
+ HF_MODEL=bert-base-uncased
5
+ RANDOM_STATE=42
README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SERP-Feature-Classifier: SERP Feature Type Prediction
2
+
3
+ **Type:** Academic | **Domain:** SEO, Search
4
+ **Hugging Face:** [syeedalireza/serp-feature-classifier](https://huggingface.co/syeedalireza/serp-feature-classifier)
5
+
6
+ Multi-label classification of SERP feature types (featured snippet, PAA, local pack, etc.) from query and context.
7
+
8
+ ## Author
9
+
10
+ **Alireza Aminzadeh**
11
+ - Hugging Face: [syeedalireza](https://huggingface.co/syeedalireza)
12
+ - LinkedIn: [alirezaaminzadeh](https://www.linkedin.com/in/alirezaaminzadeh)
13
+ - Email: alireza.aminzadeh@hotmail.com
14
+
15
+ ## Problem
16
+
17
+ Understanding which SERP features appear for a query helps content and technical SEO strategy (e.g. snippet optimization, local SEO).
18
+
19
+ ## Approach
20
+
21
+ - **Input:** Query text, optional context (device, locale).
22
+ - **Output:** Multi-label (featured_snippet, paa, local_pack, knowledge_panel, etc.).
23
+ - **Models:** Transformer-based text classifier (e.g. BERT mini) or sentence-transformers + linear head; optional XGBoost on query features.
24
+
25
+ ## Tech Stack
26
+
27
+ | Category | Tools |
28
+ |----------|------|
29
+ | NLP / DL | Hugging Face Transformers, sentence-transformers |
30
+ | ML | scikit-learn, PyTorch |
31
+ | Data | pandas, NumPy |
32
+
33
+ ## Setup
34
+
35
+ ```bash
36
+ pip install -r requirements.txt
37
+ ```
38
+
39
+ ## Usage
40
+
41
+ ```bash
42
+ python train.py
43
+ python inference.py --query "best coffee shops near me"
44
+ ```
45
+
46
+ ## Project structure
47
+
48
+ ```
49
+ 02_serp-feature-classifier/
50
+ ├── config.py
51
+ ├── train.py # Sentence-transformers + MultiOutputClassifier
52
+ ├── inference.py # Single query or batch CSV
53
+ ├── requirements.txt
54
+ ├── .env.example
55
+ ├── data/
56
+ │ └── serp_labels.csv # Sample: query + binary labels per SERP feature
57
+ └── models/
58
+ ```
59
+
60
+ ## Data
61
+
62
+ - **Sample data (included):** `data/serp_labels.csv` — columns: `query`, `featured_snippet`, `paa`, `local_pack`, `knowledge_panel`, `images` (0/1).
63
+ - Set `DATA_PATH` in `.env` if using another file.
64
+
65
+ ## License
66
+
67
+ MIT.
config.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Configuration for SERP-Feature-Classifier.
"""
import os
from pathlib import Path

# Project root: the directory containing this file.
BASE_DIR = Path(__file__).resolve().parent


def _env(name, default):
    # Read an environment override, falling back to the built-in default.
    return os.environ.get(name, default)


# Paths and training settings, each overridable via environment variable.
DATA_PATH = _env("DATA_PATH", str(BASE_DIR / "data" / "serp_labels.csv"))
MODEL_DIR = Path(_env("MODEL_DIR", str(BASE_DIR / "models")))
HF_MODEL = _env("HF_MODEL", "bert-base-uncased")
RANDOM_STATE = int(_env("RANDOM_STATE", "42"))

# Binary label columns expected in the training data, in canonical order.
SERP_LABELS = [
    "featured_snippet",
    "paa",
    "local_pack",
    "knowledge_panel",
    "images",
]

# Name of the text column holding the search query.
QUERY_COLUMN = "query"

# Ensure the model output directory exists as soon as config is imported.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
data/.gitkeep ADDED
File without changes
data/serp_labels.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ query,featured_snippet,paa,local_pack,knowledge_panel,images
2
+ best coffee shops near me,0,1,1,0,1
3
+ what is machine learning,1,1,0,1,0
4
+ buy running shoes online,0,1,0,0,1
5
+ how to fix 404 error,1,1,0,0,0
6
+ weather in new york today,0,0,0,1,0
7
+ python tutorial for beginners,1,1,0,0,0
8
+ restaurants open now,0,1,1,0,1
9
+ seo best practices 2024,1,1,0,0,0
10
+ apple store locations,0,0,1,1,0
11
+ how does photosynthesis work,1,1,0,1,0
12
+ cheap hotels near airport,0,1,1,0,1
13
+ what is deep learning,1,1,0,1,0
14
+ pizza delivery near me,0,1,1,0,1
15
+ how to learn python,1,1,0,0,0
16
+ best laptops 2024,0,1,0,0,1
17
+ who invented the telephone,1,1,0,1,0
18
+ plumbers near me,0,1,1,0,0
19
+ docker vs kubernetes,1,1,0,0,0
20
+ nearest gas station,0,0,1,0,0
21
+ how to tie a tie,1,1,0,0,1
inference.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Predict SERP feature labels for queries.

Usage:
    python inference.py --query "best coffee shops near me"
    python inference.py --input queries.csv --output predictions.csv
"""
import argparse
import joblib
from pathlib import Path

import numpy as np
from sentence_transformers import SentenceTransformer

from config import MODEL_DIR, SERP_LABELS


def main():
    """Load the trained classifier and predict labels for one query or a CSV batch.

    Raises:
        FileNotFoundError: if no trained model is found in MODEL_DIR.
        ValueError: if the batch CSV lacks a 'query' column.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--query", type=str, help="Single query to predict")
    parser.add_argument("--input", type=str, help="CSV with 'query' column for batch")
    parser.add_argument("--output", type=str, default="predictions.csv")
    args = parser.parse_args()

    model_path = MODEL_DIR / "serp_classifier.joblib"
    label_path = MODEL_DIR / "label_columns.joblib"
    if not model_path.exists():
        raise FileNotFoundError(f"Train first: {model_path} not found")

    clf = joblib.load(model_path)
    # Prefer the label order saved at train time; fall back to the configured list.
    labels = joblib.load(label_path) if label_path.exists() else SERP_LABELS
    # NOTE(review): encoder name is hard-coded and must match train.py —
    # config's HF_MODEL is not used here; confirm intentional.
    encoder = SentenceTransformer("all-MiniLM-L6-v2")

    if args.query:
        X = encoder.encode([args.query])
        # MultiOutputClassifier.predict already stacks per-estimator outputs
        # into shape (n_samples, n_labels) — no need to iterate estimators_.
        pred = clf.predict(X)
        # Coerce to plain Python ints for a clean printed dict.
        out = {label: int(v) for label, v in zip(labels, pred[0])}
        print(out)
        return

    if args.input and Path(args.input).exists():
        import pandas as pd  # lazy: only needed for batch mode

        df = pd.read_csv(args.input)
        if "query" not in df.columns:
            raise ValueError("CSV must have 'query' column")
        X = encoder.encode(df["query"].astype(str).tolist())
        pred = clf.predict(X)
        # One pred_<label> column per SERP feature.
        for i, col in enumerate(labels):
            df[f"pred_{col}"] = pred[:, i]
        df.to_csv(args.output, index=False)
        print(f"Saved to {args.output}")
        return

    print("Use --query 'text' or --input file.csv")


if __name__ == "__main__":
    main()
models/.gitkeep ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # SERP-Feature-Classifier
2
+ # Python 3.9+
3
+
4
+ torch>=1.12.0
5
+ transformers>=4.20.0
6
+ sentence-transformers>=2.2.0
7
+ scikit-learn>=1.0.0
8
+ joblib>=1.1.0
9
+ pandas>=1.3.0
10
+ numpy>=1.21.0
train.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Train SERP feature multi-label classifier.
Uses sentence-transformers embeddings + sklearn multi-output classifier.
"""
import json
from pathlib import Path

import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from sentence_transformers import SentenceTransformer

from config import (
    DATA_PATH,
    MODEL_DIR,
    RANDOM_STATE,
    SERP_LABELS,
    QUERY_COLUMN,
)


def load_data(path: str) -> tuple[pd.DataFrame, list[str]]:
    """Load the labeled CSV and return (dataframe, label columns present).

    Raises:
        ValueError: if the query column is missing.
    """
    df = pd.read_csv(path)
    if QUERY_COLUMN not in df.columns:
        raise ValueError(f"Missing column: {QUERY_COLUMN}")
    # Keep only the configured labels that actually exist in this file.
    labels = [c for c in SERP_LABELS if c in df.columns]
    return df, labels


def main():
    """Train, evaluate, and persist the multi-label SERP feature classifier."""
    if not Path(DATA_PATH).exists():
        print(f"Data not found at {DATA_PATH}. Create data/serp_labels.csv with columns: {QUERY_COLUMN}, {SERP_LABELS}")
        return

    df, labels = load_data(DATA_PATH)
    if not labels:
        # Without at least one label column, fitting would fail downstream.
        print(f"No label columns found. Expected any of: {SERP_LABELS}")
        return

    # NOTE(review): encoder name is hard-coded and must match inference.py.
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    X = encoder.encode(df[QUERY_COLUMN].astype(str).tolist())
    y = df[labels].values

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE
    )

    clf = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE))
    clf.fit(X_train, y_train)
    # MultiOutputClassifier.predict already stacks per-estimator outputs
    # into shape (n_samples, n_labels) — no need to iterate estimators_.
    pred = clf.predict(X_val)

    metrics = {
        "macro_f1": float(f1_score(y_val, pred, average="macro", zero_division=0)),
        "micro_f1": float(f1_score(y_val, pred, average="micro", zero_division=0)),
        # Exact-match ratio: every label for a query must be predicted correctly.
        "subset_accuracy": float(accuracy_score(y_val, pred)),
    }

    joblib.dump(clf, MODEL_DIR / "serp_classifier.joblib")
    joblib.dump(labels, MODEL_DIR / "label_columns.joblib")
    with open(MODEL_DIR / "metrics.json", "w") as f:
        json.dump(metrics, f, indent=2)
    print("Metrics:", metrics)


if __name__ == "__main__":
    main()