File size: 549 Bytes
45fe8b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import os
import pickle
import sys

data_path = "data/20_newsgroups"


def load_documents(root_dir, encoding="latin1"):
    """Return the text of every file found under *root_dir*.

    The tree is walked recursively with ``os.walk``. Directories and file
    names are visited in sorted order so that the position of each document
    in the returned list is the same on every run and every filesystem.

    Args:
        root_dir: Directory to walk. A path that does not exist simply
            yields no files, so an empty list is returned.
        encoding: Text encoding used to decode each file. The default,
            ``"latin1"``, maps every byte to a character, so decoding
            itself cannot fail with it.

    Returns:
        A list of strings, one per file that could be read.
    """
    loaded = []
    for root, dirs, files in os.walk(root_dir):
        # Sorting ``dirs`` in place controls the order in which os.walk
        # descends (top-down walk), making the traversal deterministic.
        dirs.sort()
        for name in sorted(files):
            file_path = os.path.join(root, name)
            try:
                with open(file_path, "r", encoding=encoding) as f:
                    loaded.append(f.read())
            except (OSError, UnicodeDecodeError) as err:
                # Best effort: an unreadable file is skipped, but reported,
                # and unrelated errors (e.g. Ctrl-C) are no longer swallowed.
                print("Skipping", file_path, "-", err, file=sys.stderr)
    return loaded


documents = load_documents(data_path)

print("Total documents loaded:", len(documents))
if not documents:
    # Still write the (empty) pickle as before, but make the likely cause
    # visible instead of reporting success silently.
    print("Warning: no readable files found under", data_path, file=sys.stderr)

os.makedirs("models", exist_ok=True)

with open("models/documents.pkl", "wb") as f:
    pickle.dump(documents, f)

print("documents.pkl saved successfully!")