import os, json
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
| |
|
| | |
# Build an in-memory index of every article, keyed by its "ID" field, so the
# split TSVs (which reference articles by id) can be joined against it.
json_dir = "../Article-Bias-Prediction/data/jsons"
id_to_article = {}
for filename in os.listdir(json_dir):
    # os.listdir returns everything in the directory; skip stray non-JSON
    # files (.DS_Store, READMEs, ...) that would crash json.load.
    if not filename.endswith(".json"):
        continue
    with open(os.path.join(json_dir, filename), 'r', encoding='utf-8') as f:
        data = json.load(f)
    id_to_article[data["ID"]] = data
| |
|
| | |
def load_split(split_path):
    """Load one split TSV and return it as a `datasets.Dataset`.

    The TSV has two unnamed, tab-separated columns: article id and bias
    label. Each id is resolved against the module-level ``id_to_article``
    index; rows whose article is missing, lacks a "content" key, or has
    empty content are silently dropped.

    Parameters
    ----------
    split_path : str
        Path to the split ``.tsv`` file.

    Returns
    -------
    datasets.Dataset
        A dataset with "text" and "label" columns.
    """
    df = pd.read_csv(split_path, sep="\t", header=None, names=["id", "label"])
    articles = []
    # Iterate the two columns with zip instead of DataFrame.iterrows, which
    # constructs a Series object per row and is far slower.
    for article_id, label in zip(df["id"], df["label"]):
        article = id_to_article.get(article_id)
        # .get("content") instead of ["content"]: an article dict without the
        # key should be dropped like an empty one, not raise KeyError.
        if article and article.get("content"):
            articles.append({
                "text": article["content"],
                "label": label
            })
    return Dataset.from_pandas(pd.DataFrame(articles))
| |
|
# Load the three random splits. Note the file on disk is "valid.tsv" while
# the DatasetDict key below follows the Trainer convention, "validation".
train = load_split("../Article-Bias-Prediction/data/splits/random/train.tsv")
valid = load_split("../Article-Bias-Prediction/data/splits/random/valid.tsv")
test = load_split("../Article-Bias-Prediction/data/splits/random/test.tsv")

# Bundle the splits into a single DatasetDict for downstream training code.
dataset = DatasetDict({"train": train, "test": test, "validation": valid})
| |
|