Ali Kefia committed on
Commit ·
4c31c97
1
Parent(s): 231da5b
ok
Browse files- .gitattributes +1 -0
- .mise.toml +1 -1
- data/eval.parquet +3 -0
- data/train.parquet +3 -0
- debug.py +13 -0
- imgs/confusion_matrix.png +0 -0
- imgs/roc_curve.png +0 -0
- model/model.pickle +2 -2
- out/confusion_matrix.png +0 -0
- out/preds.csv +0 -45
- out/roc_curve.png +0 -0
- prepare.py +45 -0
- train.py +12 -58
- usage.py +15 -23
- utils/__init__.py +0 -0
- utils/data.py +17 -0
- embed.py → utils/embed.py +0 -0
- utils/paths.py +9 -0
.gitattributes
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
*.csv filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 2 |
*.pickle filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
*.csv filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 3 |
*.pickle filter=lfs diff=lfs merge=lfs -text
|
.mise.toml
CHANGED
|
@@ -6,7 +6,7 @@ EMBEDDING_MODEL_REV = "d8c86521100d3556476a063fc2342036d45c106f"
|
|
| 6 |
|
| 7 |
DATA_DIR = "{{config_root}}/data"
|
| 8 |
MODEL_DIR = "{{config_root}}/model"
|
| 9 |
-
|
| 10 |
|
| 11 |
[tasks.deps]
|
| 12 |
run = [
|
|
|
|
| 6 |
|
| 7 |
DATA_DIR = "{{config_root}}/data"
|
| 8 |
MODEL_DIR = "{{config_root}}/model"
|
| 9 |
+
IMGS_DIR = "{{config_root}}/imgs"
|
| 10 |
|
| 11 |
[tasks.deps]
|
| 12 |
run = [
|
data/eval.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1ce81584baeb7eb8ca4322bc0f50af105ae3795229718cda1dfa1f600e945f3a
|
| 3 |
+
size 195251
|
data/train.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf199fc047485c2c453c4d9b80714261ed58152ef34c59903a64f9725d0e4956
|
| 3 |
+
size 6608000
|
debug.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import polars as pl
|
| 2 |
+
|
| 3 |
+
from utils.paths import DATA
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def main() -> None:
    """Print the prepared train and eval Parquet datasets for a quick look."""
    for split in ("train", "eval"):
        parquet_path = DATA / f"{split}.parquet"
        print(pl.read_parquet(parquet_path))
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
if __name__ == "__main__":
|
| 13 |
+
main()
|
imgs/confusion_matrix.png
ADDED
|
imgs/roc_curve.png
ADDED
|
model/model.pickle
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:73bf71607b6b99d8576a79ec96cdf97e008134e7d348477f93b8cdcf057db19e
|
| 3 |
+
size 3411728
|
out/confusion_matrix.png
DELETED
|
Binary file (16.4 kB)
|
|
|
out/preds.csv
DELETED
|
@@ -1,45 +0,0 @@
|
|
| 1 |
-
url,is_news_article,prediction,is_prediction_correct
|
| 2 |
-
https://quantumcomputingreport.com/quandela-launches-belenos-photonic-quantum-computer-with-doubling-of-qubit-count-and-4000x-power-increase/,true,true,true
|
| 3 |
-
https://www.nqcc.ac.uk/,false,false,true
|
| 4 |
-
https://quantumcomputingreport.com/qsensato-raises-e500k-560k-usd-to-advance-integrated-atomic-quantum-sensors-for-precision-sensing/,true,true,true
|
| 5 |
-
https://quantumcomputingreport.com/zurich-instruments-and-rohde-schwarz-join-australias-national-quantum-computing-testbed-facility/,true,true,true
|
| 6 |
-
https://quantumcomputingreport.com/hbku-launches-qatars-first-quantum-computing-laboratory-backed-by-10m-mod-grant/,true,true,true
|
| 7 |
-
https://quantumcomputingreport.com/quantinuum-releases-%ce%bbambeq-gen-ii-for-scalable-interpretable-quantum-nlp/,true,false,false
|
| 8 |
-
https://quantumcomputingreport.com/quobly-secures-e21m-23-7m-usd-to-industrialize-100-qubit-silicon-quantum-processor/,true,true,true
|
| 9 |
-
https://quantumcomputingreport.com/semiqon-and-nanoacademic-partner-to-advance-silicon-spin-qubit-research-and-education/,true,true,true
|
| 10 |
-
https://quantumcomputingreport.com/united-nations-itu-launches-quantum-for-good-to-align-innovation-with-global-impact/,true,false,false
|
| 11 |
-
https://quantumcomputingreport.com/microsoft-adds-post-quantum-cryptography-to-windows-insider-builds-and-linux/,true,true,true
|
| 12 |
-
https://www.nqcc.ac.uk/technology-and-research/our-research/,false,false,true
|
| 13 |
-
https://quantumcomputingreport.com/podcast-with-scott-davis-ceo-and-co-founder-of-vescent/,false,false,true
|
| 14 |
-
https://quantumzeitgeist.com/building-atoms-the-rise-of-nanotechnology-and-molecular-engineering/,false,true,false
|
| 15 |
-
https://quantumzeitgeist.com/networked-services-technologies-applications-and-challenges-for-advanced-communication/,false,false,true
|
| 16 |
-
https://quantumzeitgeist.com/amazon-braket-sdk-and-multi-platform-quantum-development/,false,true,false
|
| 17 |
-
https://quantumzeitgeist.com/pennylane-and-quantum-machine-learning/,false,false,true
|
| 18 |
-
https://quantumzeitgeist.com/quantum-physics-meets-spiritual-philosophy-exploring-the-intersection-of-string-theory-and-consciousness/,false,false,true
|
| 19 |
-
https://quantumzeitgeist.com/quantum-computing-transforms-financial-derivatives-pricing-for-complex-options-and-risk-analysis/,false,true,false
|
| 20 |
-
https://quantumzeitgeist.com/quantifying-quantum-correlations-in-symmetric-gaussian-states-with-universal-invariants/,true,false,false
|
| 21 |
-
https://www.horseandhound.co.uk/news/horse-life-threatening-stomach-tumour-saved-pioneering-surgery-894298,true,true,true
|
| 22 |
-
https://www.maddyness.com/2025/06/02/vivatech-startups-deals-annonces-ce-que-la-mission-french-tech-prevoit-pour-levenement/,false,false,true
|
| 23 |
-
https://www.cbsnews.com/sanfrancisco/news/padel-a-fast-growing-sport-has-become-a-new-obsession-for-silicon-valley/,false,true,false
|
| 24 |
-
https://www.cloudcomputing-news.net/news/microsoft-launches-its-first-cloud-region-in-malaysia/,true,true,true
|
| 25 |
-
https://padelmagazine.fr/best-padel-racket-awards-2025-les-meilleures-raquettes-de-lannee-devoilees/,false,false,true
|
| 26 |
-
https://www.horseandhound.co.uk/news/polly-dickson-obituary-894506,true,true,true
|
| 27 |
-
https://www.homeselect.paris/en/blog/devenir-proprietaire,false,false,true
|
| 28 |
-
https://www.maddyness.com/2020/10/23/salomon-aiach-interview-facebook-startups/,false,false,true
|
| 29 |
-
https://www.solarpowerportal.co.uk/grid-operators-must-work-together-in-aftermath-of-spain-and-portugal-blackout/,false,true,false
|
| 30 |
-
https://www.cloudcomputing-news.net/news/podcast/nginx-f5-api-proxy-podcast-apac-sprint-two-point-one-podcast-s02-e30/,false,false,true
|
| 31 |
-
https://www.farminguk.com/news/vegan-activists-attempt-to-shut-down-royal-highland-parade_66662.html,true,true,true
|
| 32 |
-
https://dairynews.today/news/world_milk_day_2025_health_innovation_and_sustainability_drive_india_s_milk_movement_9339211.html,false,true,false
|
| 33 |
-
"https://lerail.com/news/95810-signature-du-second-appel-%C3%A0-projets-gares-de-demain-entre-la-r%C3%A9gion-%C3%AEle-de-france,-%C3%AEle-de-france-mobilit%C3%A9s-et-sncf-gares-connexions",true,false,false
|
| 34 |
-
https://lerail.com/news/95984-drive-to-zero-2025,false,false,true
|
| 35 |
-
https://www.horseandhound.co.uk/news/farewell-to-twinshock-warrior-894106,true,true,true
|
| 36 |
-
https://www.farminguk.com/news/new-ai-driven-test-targets-silent-killer-in-uk-cattle_66604.html,true,true,true
|
| 37 |
-
https://www.maddyness.com/2019/05/02/growthhacking-chahab-nastar-scaleups/,false,false,true
|
| 38 |
-
https://www.businesstravelnews.com/Lodging/Hyatt-Creates-New-Unscripted-Collection-Brand,true,false,false
|
| 39 |
-
https://meuble-info.fr/falmec-gessi-le-duo-gagnant-du-point-deau/,true,false,false
|
| 40 |
-
https://www.cloudcomputing-news.net/news/podcast/supply-chain-automation-warehousing-distribution-rpa-best-dematic-podcast-s03-e10/,false,false,true
|
| 41 |
-
https://www.maddyness.com/2025/05/06/mon-petit-placement-tombe-dans-le-giron-de-malakoff-humanis/,true,false,false
|
| 42 |
-
https://lerail.com/technical-articles/79770-southco-s%C3%A9curisation-du-v%C3%A9hicule-%C3%A9lectrique-infrastructure-de-recharge-et-de-stockage-sur-batterie-de-r%C3%A9seau,false,false,true
|
| 43 |
-
https://www.watches-news.com/alpine-eagle-41-xp-cs-platinum/,true,true,true
|
| 44 |
-
https://www.imarcgroup.com/football-market,false,true,false
|
| 45 |
-
https://www.constructionnews.co.uk/contractors/balfour-beatty/balfour-beatty-court-battle-over-serious-trucks-cartel-ends-17-01-2025/,true,true,true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
out/roc_curve.png
DELETED
|
Binary file (29.3 kB)
|
|
|
prepare.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
+
import polars as pl
|
| 4 |
+
|
| 5 |
+
from utils.embed import embed as embed
|
| 6 |
+
from utils.paths import DATA
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
logging.basicConfig(level=logging.INFO)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def load_dataset(file_name: str):
    """Read a raw CSV dataset and reduce it to the columns used downstream.

    The CSV is scanned lazily. A ``text`` column is built by joining
    ``meta_title``, ``meta_description`` and ``content`` with a blank line
    between them, ``date`` is parsed from string to a date, and three
    columns are given shorter names before the final selection.

    Args:
        file_name: Location of the CSV file to read.

    Returns:
        The collected polars frame with the columns ``text``, ``is_news``,
        ``url``, ``date``, ``paragraphs`` and ``links``.
    """
    text_parts = [
        pl.col(column) for column in ("meta_title", "meta_description", "content")
    ]
    text = pl.concat_str(text_parts, separator="\n\n").alias("text")
    parsed_date = pl.col("date").str.to_date().alias("date")

    # Short names used by the rest of the pipeline.
    short_names = {
        "is_news_article": "is_news",
        "link_count": "links",
        "paragraph_count": "paragraphs",
    }

    lazy_frame = (
        pl.scan_csv(file_name).with_columns(text, parsed_date).rename(short_names)
    )
    kept = lazy_frame.select("text", "is_news", "url", "date", "paragraphs", "links")
    return kept.collect()
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def main() -> None:
    """Embed the train and eval CSV datasets and store each one as Parquet.

    For each split, ``<split>.csv`` is loaded from ``DATA``, its ``text``
    column is passed to ``embed``, and the frame with an added ``embeds``
    column is written to ``<split>.parquet`` in the same directory.
    """
    for name in ["train", "eval"]:
        df = load_dataset(DATA / (name + ".csv"))
        embeds = embed(df.get_column("text").to_list())
        # write_parquet returns None, so its result is not bound to a name
        # (the original rebound ``df`` to None here).
        df.with_columns(pl.Series(embeds).alias("embeds")).write_parquet(
            DATA / (name + ".parquet")
        )
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
if __name__ == "__main__":
|
| 45 |
+
main()
|
train.py
CHANGED
|
@@ -1,63 +1,17 @@
|
|
| 1 |
import logging
|
| 2 |
-
import os
|
| 3 |
import pickle
|
| 4 |
-
from pathlib import Path
|
| 5 |
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
-
import numpy as np
|
| 8 |
import polars as pl
|
| 9 |
import seaborn as sns
|
| 10 |
from numpy.typing import NDArray
|
| 11 |
-
from polars import DataFrame
|
| 12 |
from sklearn.metrics import auc, confusion_matrix, roc_curve
|
| 13 |
from sklearn.svm import SVC
|
| 14 |
|
| 15 |
-
from
|
| 16 |
-
|
| 17 |
-
logger = logging.getLogger(__name__)
|
| 18 |
|
| 19 |
logging.basicConfig(level=logging.INFO)
|
| 20 |
|
| 21 |
-
DATA = Path(os.environ["DATA_DIR"])
|
| 22 |
-
DATA.mkdir(parents=True, exist_ok=True)
|
| 23 |
-
MODEL = Path(os.environ["MODEL_DIR"])
|
| 24 |
-
MODEL.mkdir(parents=True, exist_ok=True)
|
| 25 |
-
OUT = Path(os.environ["OUT_DIR"])
|
| 26 |
-
OUT.mkdir(parents=True, exist_ok=True)
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
def embed(df: DataFrame):
|
| 30 |
-
logger.info(f"embed start {df.height}")
|
| 31 |
-
features = ["content", "meta_title", "meta_description"]
|
| 32 |
-
embeddings = []
|
| 33 |
-
for col in features:
|
| 34 |
-
train_texts = df.select(col).to_series().to_list()
|
| 35 |
-
embeddings.append(_embed(train_texts))
|
| 36 |
-
res = np.hstack(embeddings)
|
| 37 |
-
logger.info(f"embed done {res.shape}")
|
| 38 |
-
return res
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
def train(df: DataFrame, target: str):
|
| 42 |
-
logger.info(f"train start {df.height}")
|
| 43 |
-
X = embed(df)
|
| 44 |
-
y = df.select(target).to_numpy().ravel()
|
| 45 |
-
clf = SVC(kernel="linear", probability=True)
|
| 46 |
-
clf.fit(X, y)
|
| 47 |
-
logger.info("train done")
|
| 48 |
-
return clf
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
def save_prediction(eval_df: DataFrame, y_eval: NDArray, y_pred: NDArray) -> None:
|
| 52 |
-
pl.DataFrame(
|
| 53 |
-
{
|
| 54 |
-
"url": eval_df.select("url").to_series().to_list(),
|
| 55 |
-
"is_news_article": y_eval,
|
| 56 |
-
"prediction": y_pred,
|
| 57 |
-
"is_prediction_correct": y_eval == y_pred,
|
| 58 |
-
}
|
| 59 |
-
).write_csv(OUT / "preds.csv")
|
| 60 |
-
|
| 61 |
|
| 62 |
def save_roc_curve(clf, X: NDArray, y: NDArray):
|
| 63 |
probs = clf.predict_proba(X)[:, 1] # Probability for the positive class
|
|
@@ -76,7 +30,7 @@ def save_roc_curve(clf, X: NDArray, y: NDArray):
|
|
| 76 |
plt.title("Receiver Operating Characteristic (ROC)")
|
| 77 |
plt.legend(loc="lower right")
|
| 78 |
plt.tight_layout()
|
| 79 |
-
plt.savefig(
|
| 80 |
plt.close()
|
| 81 |
|
| 82 |
|
|
@@ -94,26 +48,26 @@ def save_confusion_matrix(y: NDArray, pred: NDArray):
|
|
| 94 |
plt.ylabel("Actual")
|
| 95 |
plt.title("Confusion Matrix")
|
| 96 |
plt.tight_layout()
|
| 97 |
-
plt.savefig(
|
| 98 |
plt.close()
|
| 99 |
|
| 100 |
|
| 101 |
def main() -> None:
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
clf
|
|
|
|
|
|
|
|
|
|
| 105 |
with open(MODEL / "model.pickle", "wb") as f:
|
| 106 |
pickle.dump(clf, f)
|
| 107 |
|
| 108 |
-
eval_df = pl.
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
eval_y = eval_df.select(target).to_numpy().ravel()
|
| 112 |
eval_pred = clf.predict(eval_X)
|
| 113 |
-
save_prediction(eval_df, eval_y, eval_pred)
|
| 114 |
save_confusion_matrix(eval_y, eval_pred)
|
| 115 |
save_roc_curve(clf, eval_X, eval_y)
|
| 116 |
-
logger.info("eval done")
|
| 117 |
|
| 118 |
|
| 119 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import logging
|
|
|
|
| 2 |
import pickle
|
|
|
|
| 3 |
|
| 4 |
import matplotlib.pyplot as plt
|
|
|
|
| 5 |
import polars as pl
|
| 6 |
import seaborn as sns
|
| 7 |
from numpy.typing import NDArray
|
|
|
|
| 8 |
from sklearn.metrics import auc, confusion_matrix, roc_curve
|
| 9 |
from sklearn.svm import SVC
|
| 10 |
|
| 11 |
+
from utils.paths import DATA, IMGS, MODEL
|
|
|
|
|
|
|
| 12 |
|
| 13 |
logging.basicConfig(level=logging.INFO)
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
def save_roc_curve(clf, X: NDArray, y: NDArray):
|
| 17 |
probs = clf.predict_proba(X)[:, 1] # Probability for the positive class
|
|
|
|
| 30 |
plt.title("Receiver Operating Characteristic (ROC)")
|
| 31 |
plt.legend(loc="lower right")
|
| 32 |
plt.tight_layout()
|
| 33 |
+
plt.savefig(IMGS / "roc_curve.png")
|
| 34 |
plt.close()
|
| 35 |
|
| 36 |
|
|
|
|
| 48 |
plt.ylabel("Actual")
|
| 49 |
plt.title("Confusion Matrix")
|
| 50 |
plt.tight_layout()
|
| 51 |
+
plt.savefig(IMGS / "confusion_matrix.png")
|
| 52 |
plt.close()
|
| 53 |
|
| 54 |
|
| 55 |
def main() -> None:
    """Fit a linear SVC on the train split, pickle it, and plot eval metrics.

    Features come from the ``embeds`` column and labels from ``is_news`` in
    the prepared Parquet files. The fitted model is written to
    ``MODEL / "model.pickle"``; a confusion matrix and a ROC curve are then
    produced from predictions on the eval split.
    """
    train_frame = pl.read_parquet(DATA / "train.parquet")
    train_X = train_frame.get_column("embeds").to_numpy()
    train_y = train_frame.get_column("is_news").to_numpy()

    # probability=True is required because save_roc_curve calls predict_proba.
    clf = SVC(kernel="linear", probability=True)
    clf.fit(train_X, train_y)

    model_path = MODEL / "model.pickle"
    with model_path.open("wb") as model_file:
        pickle.dump(clf, model_file)

    eval_frame = pl.read_parquet(DATA / "eval.parquet")
    eval_X = eval_frame.get_column("embeds").to_numpy()
    eval_y = eval_frame.get_column("is_news").to_numpy()

    predictions = clf.predict(eval_X)
    save_confusion_matrix(eval_y, predictions)
    save_roc_curve(clf, eval_X, eval_y)
|
|
|
|
| 71 |
|
| 72 |
|
| 73 |
if __name__ == "__main__":
|
usage.py
CHANGED
|
@@ -1,45 +1,37 @@
|
|
| 1 |
-
import os
|
| 2 |
import pickle
|
| 3 |
from functools import cache
|
| 4 |
-
from pathlib import Path
|
| 5 |
|
| 6 |
-
import numpy as np
|
| 7 |
import polars as pl
|
| 8 |
from huggingface_hub import hf_hub_download
|
| 9 |
|
| 10 |
-
from embed import embed
|
| 11 |
-
|
| 12 |
-
DATA = Path(os.environ["DATA_DIR"])
|
| 13 |
-
|
| 14 |
-
features = ["content", "meta_title", "meta_description"]
|
| 15 |
|
| 16 |
|
| 17 |
@cache
|
| 18 |
def get_model():
|
| 19 |
-
file_name = hf_hub_download(
|
|
|
|
|
|
|
| 20 |
with open(file_name, "rb") as f:
|
| 21 |
return pickle.load(f)
|
| 22 |
|
| 23 |
|
| 24 |
-
def
|
| 25 |
-
df = pl.
|
| 26 |
-
return {
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
for f in features:
|
| 32 |
-
embeddings.append(embed([rec[f]]))
|
| 33 |
-
return np.hstack(embeddings)
|
| 34 |
|
| 35 |
|
| 36 |
def main():
|
| 37 |
model = get_model()
|
| 38 |
-
record =
|
| 39 |
-
embeds =
|
| 40 |
(pred,) = model.predict(embeds)
|
| 41 |
-
print(record[
|
| 42 |
-
print(f"is news (real): {record['is_news_article']}")
|
| 43 |
print(f"is news (pred): {pred}")
|
| 44 |
|
| 45 |
|
|
|
|
|
|
|
| 1 |
import pickle
|
| 2 |
from functools import cache
|
|
|
|
| 3 |
|
|
|
|
| 4 |
import polars as pl
|
| 5 |
from huggingface_hub import hf_hub_download
|
| 6 |
|
| 7 |
+
from utils.embed import embed
|
| 8 |
+
from utils.paths import DATA
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
@cache
def get_model():
    """Download the pickled classifier from the Hugging Face Hub and load it.

    The result is memoised with ``functools.cache``, so repeated calls in
    the same process return the same loaded object.

    Returns:
        The object unpickled from ``model/model.pickle`` in the
        ``opale-ai/news-classifier`` repository at revision ``main``.
    """
    local_path = hf_hub_download(
        repo_id="opale-ai/news-classifier",
        filename="model/model.pickle",
        revision="main",
    )
    # NOTE(review): unpickling runs code embedded in the file; this is only
    # safe as long as the repository above is trusted.
    with open(local_path, "rb") as model_file:
        model = pickle.load(model_file)
    return model
|
| 18 |
|
| 19 |
|
| 20 |
+
def get_record():
    """Pick one random row from the eval dataset.

    Returns:
        A dict holding only the ``text`` and ``is_news`` values of the
        sampled row, keyed by column name.
    """
    sampled = pl.read_parquet(DATA / "eval.parquet").sample()
    wanted = ("text", "is_news")

    record = {}
    # Pair each column name with the matching value of the single sampled row.
    for name, value in zip(sampled.columns, sampled.row(0)):
        if name in wanted:
            record[name] = value
    return record
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
def main():
    """Classify one random eval record and print the true and predicted labels."""
    classifier = get_model()
    sample = get_record()

    # embed takes a list of texts, so the single text is wrapped in a list.
    vectors = embed([sample["text"]])
    predictions = classifier.predict(vectors)
    # Exactly one input was given; unpacking fails loudly otherwise.
    (predicted,) = predictions

    actual = sample["is_news"]
    print(f"is news (real): {actual}")
    print(f"is news (pred): {predicted}")
|
| 36 |
|
| 37 |
|
utils/__init__.py
ADDED
|
File without changes
|
utils/data.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Column names of the raw dataset, in order.
COLUMNS = [
    "url", "website", "og_type",
    "meta_description", "meta_title", "content",
    "date", "days_old",
    "link_count", "paragraph_count", "average_links", "text_to_html_ratio",
    "css_title",
    "is_news_article", "reason",
]
|
embed.py → utils/embed.py
RENAMED
|
File without changes
|
utils/paths.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
def _env_dir(variable: str) -> Path:
    """Return the directory named by environment variable *variable*.

    The directory (and any missing parents) is created if it does not exist.
    A missing environment variable raises ``KeyError``.
    """
    directory = Path(os.environ[variable])
    directory.mkdir(parents=True, exist_ok=True)
    return directory


# Project directories, resolved from the environment and created on import.
DATA = _env_dir("DATA_DIR")
MODEL = _env_dir("MODEL_DIR")
IMGS = _env_dir("IMGS_DIR")
|