Upload folder using huggingface_hub
Browse files- .env.example +5 -0
- README.md +67 -0
- config.py +21 -0
- data/.gitkeep +0 -0
- data/serp_labels.csv +21 -0
- inference.py +54 -0
- models/.gitkeep +0 -0
- requirements.txt +10 -0
- train.py +64 -0
.env.example
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SERP-Feature-Classifier
|
| 2 |
+
DATA_PATH=data/serp_labels.csv
|
| 3 |
+
MODEL_DIR=models
|
| 4 |
+
HF_MODEL=bert-base-uncased
|
| 5 |
+
RANDOM_STATE=42
|
README.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SERP-Feature-Classifier: SERP Feature Type Prediction
|
| 2 |
+
|
| 3 |
+
**Type:** Academic | **Domain:** SEO, Search
|
| 4 |
+
**Hugging Face:** [syeedalireza/serp-feature-classifier](https://huggingface.co/syeedalireza/serp-feature-classifier)
|
| 5 |
+
|
| 6 |
+
Multi-label classification of SERP feature types (featured snippet, PAA, local pack, etc.) from query and context.
|
| 7 |
+
|
| 8 |
+
## Author
|
| 9 |
+
|
| 10 |
+
**Alireza Aminzadeh**
|
| 11 |
+
- Hugging Face: [syeedalireza](https://huggingface.co/syeedalireza)
|
| 12 |
+
- LinkedIn: [alirezaaminzadeh](https://www.linkedin.com/in/alirezaaminzadeh)
|
| 13 |
+
- Email: alireza.aminzadeh@hotmail.com
|
| 14 |
+
|
| 15 |
+
## Problem
|
| 16 |
+
|
| 17 |
+
Understanding which SERP features appear for a query helps content and technical SEO strategy (e.g. snippet optimization, local SEO).
|
| 18 |
+
|
| 19 |
+
## Approach
|
| 20 |
+
|
| 21 |
+
- **Input:** Query text, optional context (device, locale).
|
| 22 |
+
- **Output:** Multi-label (featured_snippet, paa, local_pack, knowledge_panel, etc.).
|
| 23 |
+
- **Models:** sentence-transformers embeddings + scikit-learn `MultiOutputClassifier` (implemented here); alternatives: a fine-tuned transformer classifier (e.g. BERT mini) or XGBoost on query features.
|
| 24 |
+
|
| 25 |
+
## Tech Stack
|
| 26 |
+
|
| 27 |
+
| Category | Tools |
|
| 28 |
+
|----------|------|
|
| 29 |
+
| NLP / DL | Hugging Face Transformers, sentence-transformers |
|
| 30 |
+
| ML | scikit-learn, PyTorch |
|
| 31 |
+
| Data | pandas, NumPy |
|
| 32 |
+
|
| 33 |
+
## Setup
|
| 34 |
+
|
| 35 |
+
```bash
|
| 36 |
+
pip install -r requirements.txt
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
## Usage
|
| 40 |
+
|
| 41 |
+
```bash
|
| 42 |
+
python train.py
|
| 43 |
+
python inference.py --query "best coffee shops near me"
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
## Project structure
|
| 47 |
+
|
| 48 |
+
```
|
| 49 |
+
02_serp-feature-classifier/
|
| 50 |
+
├── config.py
|
| 51 |
+
├── train.py # Sentence-transformers + MultiOutputClassifier
|
| 52 |
+
├── inference.py # Single query or batch CSV
|
| 53 |
+
├── requirements.txt
|
| 54 |
+
├── .env.example
|
| 55 |
+
├── data/
|
| 56 |
+
│ └── serp_labels.csv # Sample: query + binary labels per SERP feature
|
| 57 |
+
└── models/
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
## Data
|
| 61 |
+
|
| 62 |
+
- **Sample data (included):** `data/serp_labels.csv` — columns: `query`, `featured_snippet`, `paa`, `local_pack`, `knowledge_panel`, `images` (0/1).
|
| 63 |
+
- Set `DATA_PATH` in `.env` if using another file.
|
| 64 |
+
|
| 65 |
+
## License
|
| 66 |
+
|
| 67 |
+
MIT.
|
config.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration for SERP-Feature-Classifier.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
BASE_DIR = Path(__file__).resolve().parent
|
| 8 |
+
DATA_PATH = os.getenv("DATA_PATH", str(BASE_DIR / "data" / "serp_labels.csv"))
|
| 9 |
+
MODEL_DIR = Path(os.getenv("MODEL_DIR", str(BASE_DIR / "models")))
|
| 10 |
+
HF_MODEL = os.getenv("HF_MODEL", "bert-base-uncased")
|
| 11 |
+
RANDOM_STATE = int(os.getenv("RANDOM_STATE", "42"))
|
| 12 |
+
|
| 13 |
+
SERP_LABELS = [
|
| 14 |
+
"featured_snippet",
|
| 15 |
+
"paa",
|
| 16 |
+
"local_pack",
|
| 17 |
+
"knowledge_panel",
|
| 18 |
+
"images",
|
| 19 |
+
]
|
| 20 |
+
QUERY_COLUMN = "query"
|
| 21 |
+
MODEL_DIR.mkdir(parents=True, exist_ok=True)
|
data/.gitkeep
ADDED
|
File without changes
|
data/serp_labels.csv
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
query,featured_snippet,paa,local_pack,knowledge_panel,images
|
| 2 |
+
best coffee shops near me,0,1,1,0,1
|
| 3 |
+
what is machine learning,1,1,0,1,0
|
| 4 |
+
buy running shoes online,0,1,0,0,1
|
| 5 |
+
how to fix 404 error,1,1,0,0,0
|
| 6 |
+
weather in new york today,0,0,0,1,0
|
| 7 |
+
python tutorial for beginners,1,1,0,0,0
|
| 8 |
+
restaurants open now,0,1,1,0,1
|
| 9 |
+
seo best practices 2024,1,1,0,0,0
|
| 10 |
+
apple store locations,0,0,1,1,0
|
| 11 |
+
how does photosynthesis work,1,1,0,1,0
|
| 12 |
+
cheap hotels near airport,0,1,1,0,1
|
| 13 |
+
what is deep learning,1,1,0,1,0
|
| 14 |
+
pizza delivery near me,0,1,1,0,1
|
| 15 |
+
how to learn python,1,1,0,0,0
|
| 16 |
+
best laptops 2024,0,1,0,0,1
|
| 17 |
+
who invented the telephone,1,1,0,1,0
|
| 18 |
+
plumbers near me,0,1,1,0,0
|
| 19 |
+
docker vs kubernetes,1,1,0,0,0
|
| 20 |
+
nearest gas station,0,0,1,0,0
|
| 21 |
+
how to tie a tie,1,1,0,0,1
|
inference.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Predict SERP feature labels for queries.
|
| 3 |
+
"""
|
| 4 |
+
import argparse
|
| 5 |
+
import joblib
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
from sentence_transformers import SentenceTransformer
|
| 10 |
+
|
| 11 |
+
from config import MODEL_DIR, SERP_LABELS
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def main():
|
| 15 |
+
parser = argparse.ArgumentParser()
|
| 16 |
+
parser.add_argument("--query", type=str, help="Single query to predict")
|
| 17 |
+
parser.add_argument("--input", type=str, help="CSV with 'query' column for batch")
|
| 18 |
+
parser.add_argument("--output", type=str, default="predictions.csv")
|
| 19 |
+
args = parser.parse_args()
|
| 20 |
+
|
| 21 |
+
model_path = MODEL_DIR / "serp_classifier.joblib"
|
| 22 |
+
label_path = MODEL_DIR / "label_columns.joblib"
|
| 23 |
+
if not model_path.exists():
|
| 24 |
+
raise FileNotFoundError(f"Train first: {model_path} not found")
|
| 25 |
+
|
| 26 |
+
clf = joblib.load(model_path)
|
| 27 |
+
labels = joblib.load(label_path) if label_path.exists() else SERP_LABELS
|
| 28 |
+
encoder = SentenceTransformer("all-MiniLM-L6-v2")
|
| 29 |
+
|
| 30 |
+
if args.query:
|
| 31 |
+
X = encoder.encode([args.query])
|
| 32 |
+
pred = np.array([est.predict(X) for est in clf.estimators_]).T
|
| 33 |
+
out = dict(zip(labels, pred[0].tolist()))
|
| 34 |
+
print(out)
|
| 35 |
+
return
|
| 36 |
+
|
| 37 |
+
if args.input and Path(args.input).exists():
|
| 38 |
+
import pandas as pd
|
| 39 |
+
df = pd.read_csv(args.input)
|
| 40 |
+
if "query" not in df.columns:
|
| 41 |
+
raise ValueError("CSV must have 'query' column")
|
| 42 |
+
X = encoder.encode(df["query"].astype(str).tolist())
|
| 43 |
+
pred = np.array([est.predict(X) for est in clf.estimators_]).T
|
| 44 |
+
for i, col in enumerate(labels):
|
| 45 |
+
df[f"pred_{col}"] = pred[:, i]
|
| 46 |
+
df.to_csv(args.output, index=False)
|
| 47 |
+
print(f"Saved to {args.output}")
|
| 48 |
+
return
|
| 49 |
+
|
| 50 |
+
print("Use --query 'text' or --input file.csv")
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
if __name__ == "__main__":
|
| 54 |
+
main()
|
models/.gitkeep
ADDED
|
File without changes
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SERP-Feature-Classifier
|
| 2 |
+
# Python 3.9+
|
| 3 |
+
|
| 4 |
+
torch>=1.12.0
|
| 5 |
+
transformers>=4.20.0
|
| 6 |
+
sentence-transformers>=2.2.0
|
| 7 |
+
scikit-learn>=1.0.0
|
| 8 |
+
joblib>=1.1.0
|
| 9 |
+
pandas>=1.3.0
|
| 10 |
+
numpy>=1.21.0
|
train.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Train SERP feature multi-label classifier.
|
| 3 |
+
Uses sentence-transformers embeddings + sklearn multi-output classifier.
|
| 4 |
+
"""
|
| 5 |
+
import json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import numpy as np
|
| 10 |
+
from sklearn.model_selection import train_test_split
|
| 11 |
+
from sklearn.multioutput import MultiOutputClassifier
|
| 12 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 13 |
+
from sklearn.metrics import f1_score, accuracy_score
|
| 14 |
+
from sentence_transformers import SentenceTransformer
|
| 15 |
+
|
| 16 |
+
from config import (
|
| 17 |
+
DATA_PATH,
|
| 18 |
+
MODEL_DIR,
|
| 19 |
+
RANDOM_STATE,
|
| 20 |
+
SERP_LABELS,
|
| 21 |
+
QUERY_COLUMN,
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def load_data(path: str) -> tuple[pd.DataFrame, list[str]]:
|
| 26 |
+
df = pd.read_csv(path)
|
| 27 |
+
if QUERY_COLUMN not in df.columns:
|
| 28 |
+
raise ValueError(f"Missing column: {QUERY_COLUMN}")
|
| 29 |
+
labels = [c for c in SERP_LABELS if c in df.columns]
|
| 30 |
+
return df, labels
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def main():
|
| 34 |
+
if not Path(DATA_PATH).exists():
|
| 35 |
+
print(f"Data not found at {DATA_PATH}. Create data/serp_labels.csv with columns: {QUERY_COLUMN}, {SERP_LABELS}")
|
| 36 |
+
return
|
| 37 |
+
|
| 38 |
+
df, labels = load_data(DATA_PATH)
|
| 39 |
+
encoder = SentenceTransformer("all-MiniLM-L6-v2")
|
| 40 |
+
X = encoder.encode(df[QUERY_COLUMN].astype(str).tolist())
|
| 41 |
+
y = df[labels].values
|
| 42 |
+
|
| 43 |
+
X_train, X_val, y_train, y_val = train_test_split(
|
| 44 |
+
X, y, test_size=0.2, random_state=RANDOM_STATE
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
clf = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE))
|
| 48 |
+
clf.fit(X_train, y_train)
|
| 49 |
+
pred = np.array([est.predict(X_val) for est in clf.estimators_]).T
|
| 50 |
+
|
| 51 |
+
metrics = {
|
| 52 |
+
"macro_f1": float(f1_score(y_val, pred, average="macro", zero_division=0)),
|
| 53 |
+
"micro_f1": float(f1_score(y_val, pred, average="micro", zero_division=0)),
|
| 54 |
+
}
|
| 55 |
+
import joblib
|
| 56 |
+
joblib.dump(clf, MODEL_DIR / "serp_classifier.joblib")
|
| 57 |
+
joblib.dump(labels, MODEL_DIR / "label_columns.joblib")
|
| 58 |
+
with open(MODEL_DIR / "metrics.json", "w") as f:
|
| 59 |
+
json.dump(metrics, f, indent=2)
|
| 60 |
+
print("Metrics:", metrics)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
if __name__ == "__main__":
|
| 64 |
+
main()
|