File size: 865 Bytes
45fe8b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import os


def load_documents(dataset_path):
    """Load every document from a one-directory-per-category dataset.

    Each immediate subdirectory of *dataset_path* is treated as a category.
    Every entry inside a category directory is read as one document whose
    label is the name of that directory. Entries directly under
    *dataset_path* that are not directories are ignored.

    Files are decoded as latin-1, which maps every possible byte to a
    character, so decoding itself can never fail.

    Args:
        dataset_path: Path of the dataset root directory.

    Returns:
        A ``(documents, labels)`` tuple of two equal-length lists:
        ``documents[i]`` is the full text of one file and ``labels[i]`` is
        the name of the category directory it was read from. Entries are
        ordered by category name, then by file name.

    Raises:
        OSError: If *dataset_path* or one of its category directories
            cannot be listed (for example, the root does not exist).
    """
    documents = []
    labels = []

    # os.listdir() returns names in arbitrary order; sort so the result is
    # reproducible across runs and platforms.
    for category in sorted(os.listdir(dataset_path)):
        category_path = os.path.join(dataset_path, category)
        if not os.path.isdir(category_path):
            continue

        for file_name in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, file_name)
            try:
                with open(file_path, "r", encoding="latin-1") as f:
                    text = f.read()
            except OSError:
                # Best effort: skip entries that cannot be read as a file
                # (nested directories, permission problems, ...) without
                # hiding unrelated errors or KeyboardInterrupt.
                continue

            documents.append(text)
            labels.append(category)

    return documents, labels


if __name__ == "__main__":
    # Manual smoke test: load the dataset from its path relative to the
    # current working directory and print a short summary.
    dataset_path = "data/20_newsgroups"

    docs, labels = load_documents(dataset_path)

    print("Total documents:", len(docs))
    # labels is empty when no document could be loaded; indexing it
    # unconditionally would raise IndexError.
    if labels:
        print("Example category:", labels[0])
    else:
        print("No documents found in", dataset_path)