File size: 4,350 Bytes
ceedef8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95


import os
import json
import sys
from datasets import load_dataset, load_from_disk, concatenate_datasets, Dataset
from transformers import PreTrainedTokenizerFast
import transformers
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers import AutoModelWithLMHead, AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoModel

from transformers import GPT2Model
from transformers import GPT2TokenizerFast
import transformers
import torch
import numpy as np

# Root directory of the Yle corpus (download it from Kielipankki and extract it here).
root = r'G:\Data\yle\data'

# Parallel columns of the dataset: index k of every list describes the same article,
# so all five lists must always have the same length.
texts = []
subjects = []
first_subjects = []
first_ids = []
subject_ids = []

for path, subdirs, files in os.walk(root):
    # The data is split into multiple files.
    for name in files:
        file_path = os.path.join(path, name)
        print(file_path)
        with open(file_path, encoding="utf8") as f:
            data = json.load(f)

        # Each file contains JSON with multiple articles under the "data" key.
        for article in data["data"]:
            try:
                # Keep only heading and text items, as we do not want metadata in a
                # text dataset. A newline is appended for every content item,
                # whatever its type.
                parts = []
                for c in article["content"]:
                    if c["type"] in ("heading", "text"):
                        parts.append(c["text"])
                    parts.append("\n")
                txt = "".join(parts)

                # Reset per article so an article without subjects never inherits
                # the values of the previous article.
                first = ""
                first_id = None
                s = []  # Subject titles (Finnish)
                s_ids = []  # Ids of the subjects
                # An article contains n subjects. The first one is also stored in
                # its own columns so it can be read without scanning the list.
                if "subjects" in article:
                    article_subjects = article["subjects"]
                    first = article_subjects[0]["title"]["fi"]
                    first_id = article_subjects[0]["id"]
                    for subject in article_subjects:
                        s.append(subject["title"]["fi"])
                        s_ids.append(subject["id"])
            except (KeyError, IndexError, TypeError):
                # Some articles contain formatting errors (missing keys, empty
                # subject lists, non-string text). Skip them; they are a negligible
                # portion of all the articles.
                continue

            # Append only after every field was extracted successfully, so a
            # malformed article can never leave the columns with unequal lengths.
            first_subjects.append(first)
            first_ids.append(first_id)
            texts.append(txt)
            subjects.append(s)
            subject_ids.append(s_ids)


dataset = Dataset.from_dict({"text":texts, "subjects":subjects, "first_subject":first_subjects, "first_ids":first_ids, "subject_ids":subject_ids})

# Path or identifier passed to AutoTokenizer.from_pretrained.
tokenizer_loc = "/tokenizer_loc"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_loc)
# Register the tokenizer's own EOS token as the padding token.
tokenizer.add_special_tokens(dict(pad_token=tokenizer.eos_token))

def find_major_subject(example):
    """Pick the main subject of an article and build its one-hot label vector.

    The article's subjects are scanned in order; the first one that appears in
    the allow-list below becomes the main subject.

    Args:
        example: Mapping with a "subjects" entry holding an iterable of subject
            titles. It is also updated in place with a "main_subject" entry.

    Returns:
        A dict with:
            "main_subject": the matched subject title, or None when no subject
                of the article is in the allow-list.
            "labels": a float numpy array with one slot per allow-listed
                subject; the slot of the main subject is 1, all others are 0
                (all zeros when there is no match).
    """
    # Re-imported locally: per the original author, the module-level import was
    # not reliably visible inside this function on Windows when run via Dataset.map.
    import numpy as np

    good_subjects = ["urheilu","Kotimaan uutiset","Ulkomaat","jääkiekko","talous","politiikka","poliisi","Liikenne ja kuljetus","kulttuuri","puolueet","onnettomuudet","musiikki","Koulutus ja kasvatus","Venäjä","tieliikenne","luonto","autot","terveys","Helsinki","Pohjoismaat","kunnat","Eurooppa","rikokset","vaalit","Yhdysvallat","lainvalvonta"]

    main_subject = None
    label = np.zeros(len(good_subjects))  # One-hot vector over good_subjects
    for subject in example["subjects"]:
        if subject in good_subjects:
            main_subject = subject
            label[good_subjects.index(subject)] = 1
            break  # Only the first matching subject counts

    # Kept so code that reads the mutated example keeps working.
    example["main_subject"] = main_subject
    # Return the column explicitly instead of relying on the in-place mutation
    # being picked up by the caller.
    return {"main_subject": main_subject, "labels": label}

# Label every article, then keep only those for which a main subject was found.
filtered = dataset.map(find_major_subject, num_proc=12).filter(
    lambda example: example["main_subject"] is not None
)

def tokenize_function(examples):
    """Tokenize the "text" entry of `examples` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 800 tokens.
    """
    batch_texts = examples["text"]
    encoded = tokenizer(
        batch_texts,
        max_length=800,
        truncation=True,
        padding="max_length",
    )
    return encoded
# Tokenize the filtered articles in batches, then write the result to disk.
tokenized_and_filtered_dataset = filtered.map(function=tokenize_function, batched=True)

tokenized_and_filtered_dataset.save_to_disk("/output/dir")