IvanHU committed on
Commit
1b0223b
·
verified ·
1 Parent(s): 0fdb914

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +161 -0
README.md CHANGED
@@ -13,6 +13,167 @@ tags:
13
 
14
 We use `math-classifier` to retrieve math-related documents from `fineweb-edu`, `dclm`, and other corpora, in order to upsample math-related content
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
 
18
  ## Related resources
 
13
 
14
 We use `math-classifier` to retrieve math-related documents from `fineweb-edu`, `dclm`, and other corpora, in order to upsample math-related content
15
 
16
+ ```python
17
+ import json
18
+ import os
19
+ import time
20
+ from concurrent.futures import ProcessPoolExecutor, wait, ALL_COMPLETED
21
+ from time import sleep
22
+
23
+ import fasttext
24
+ import numpy as np
25
+ import pandas as pd
26
+ import pyarrow.parquet as pq
27
+ from tqdm import tqdm
28
+
29
+
30
def print_error(value):
    # Print a caught exception to stdout; used as the error sink by
    # data_process so a failure in one worker does not kill the pool.
    print("error: ", value)
32
+
33
+
34
def data_process(index, file, saved_dir):
    """Score one parquet shard with the fastText math classifier and bucket rows by score.

    Args:
        index: worker slot (0..num_process-1); used as the tqdm bar position
            and for a staggered startup sleep.
        file: path to the input ``.parquet`` file; rows must have a ``text``
            column (assumed string — TODO confirm against the dataset schema).
        saved_dir: per-shard output directory. Positive rows are written as
            JSONL into score-bucket subdirectories ``06_07``/``07_08``/
            ``08_09``/``09_10``; raw predictions go under ``labeled/``.

    Returns:
        None. Exceptions in the read/score phase are printed (via
        ``print_error``) and swallowed so one bad shard does not abort the
        whole pool.
    """
    try:
        filename = file.split('/')[-1].replace('.parquet', '.jsonl')
        # The 09_10 bucket is written last, so its presence means this shard
        # already finished on a previous run — skip before loading the model.
        path90 = os.path.join(saved_dir, "09_10", filename)
        if os.path.exists(path90):
            print("exist", path90, flush=True)
            return

        model_path = "math_score.bin"
        model = fasttext.load_model(model_path)

        # Stagger worker startup so the workers don't all hit the
        # filesystem at the same instant.
        sleep(index * 3)
        os.makedirs(saved_dir, exist_ok=True)

        label_list = []  # raw model.predict() output for every row
        s67_list = []    # positive rows with score in [0.6, 0.7)
        s78_list = []    # positive rows with score in [0.7, 0.8)
        s89_list = []    # positive rows with score in [0.8, 0.9)
        s90_list = []    # positive rows with score in [0.9, 1.0]

        st = time.time()
        print("reading parquet", file, flush=True)
        df = pd.read_parquet(file)
        ed = time.time()
        print("read parquet time: ", ed - st, flush=True)
        for _, row_original in tqdm(
                df.iterrows(),
                total=len(df),
                position=index,
                desc=filename,
        ):
            row = row_original.to_dict()
            # fastText predicts on a single line of text; flatten newlines.
            text = row['text'].replace('\n', ' ')

            pred = model.predict(text)
            label, score = pred[0][0], pred[1][0]
            label_list.append(pred)
            if label == '__label__positive':
                # Buckets are disjoint, so a single if/elif chain replaces
                # the original's separate `if` + `if/elif` pair.
                if 0.6 <= score < 0.7:
                    s67_list.append(row)
                elif 0.7 <= score < 0.8:
                    s78_list.append(row)
                elif 0.8 <= score < 0.9:
                    s89_list.append(row)
                elif 0.9 <= score <= 1.0:
                    s90_list.append(row)
    except Exception as e:
        print_error(e)
        return None

    os.makedirs(os.path.join(saved_dir, "labeled"), exist_ok=True)

    print("writing to file", flush=True)

    with open(
            os.path.join(saved_dir, "labeled",
                         filename.replace('.jsonl', '.txt')), 'w') as f:
        f.write("\n".join(str(pred) for pred in label_list))

    # BUG FIX: "06_07" was missing from this list, so the 06_07 write below
    # raised FileNotFoundError on a fresh saved_dir.
    for dir_name in ["06_07", "07_08", "08_09", "09_10"]:
        os.makedirs(os.path.join(saved_dir, dir_name), exist_ok=True)

    buckets = [
        ("06_07", s67_list),
        ("07_08", s78_list),
        ("08_09", s89_list),
        ("09_10", s90_list),
    ]
    for dir_name, rows in buckets:
        with open(os.path.join(saved_dir, dir_name, filename), 'w') as f:
            f.write("\n".join(json.dumps(line_now) for line_now in rows))

    return None
113
+
114
+
115
if __name__ == '__main__':
    # Driver: walk the fineweb-edu Common Crawl dumps, collect every parquet
    # shard, and fan the shards out to data_process over a process pool.
    num_process = 5
    start_time = time.time()
    file_paths = []
    base = "fineweb-edu/data/"

    count = 0  # running total of parquet shards discovered
    for file_name in [
            'CC-MAIN-2017-04', 'CC-MAIN-2017-09', 'CC-MAIN-2017-13',
            'CC-MAIN-2017-17', 'CC-MAIN-2017-22', 'CC-MAIN-2017-26',
            'CC-MAIN-2017-30', 'CC-MAIN-2017-34', 'CC-MAIN-2017-39',
            'CC-MAIN-2017-43', 'CC-MAIN-2017-47', 'CC-MAIN-2017-51',

            "CC-MAIN-2018-05", "CC-MAIN-2018-09", "CC-MAIN-2018-13",
            "CC-MAIN-2018-17", "CC-MAIN-2018-22", "CC-MAIN-2018-26",
            "CC-MAIN-2018-30", "CC-MAIN-2018-34", "CC-MAIN-2018-39",
            "CC-MAIN-2018-43", "CC-MAIN-2018-47", "CC-MAIN-2018-51",

            "CC-MAIN-2019-04", "CC-MAIN-2019-09", "CC-MAIN-2019-13",
            "CC-MAIN-2019-18", "CC-MAIN-2019-22", "CC-MAIN-2019-26",
            "CC-MAIN-2019-30", "CC-MAIN-2019-35", "CC-MAIN-2019-39",
            "CC-MAIN-2019-43", "CC-MAIN-2019-47", "CC-MAIN-2019-51",
    ]:
        print("Walking:", file_name)
        original_file_path = base + file_name
        math_dir = original_file_path + "-math"
        print(math_dir)

        for root, dirs, files in os.walk(original_file_path):
            for file in files:
                if file.endswith(".parquet"):  # only process parquet files
                    file_path = os.path.abspath(os.path.join(root, file))
                    count += 1
                    # One output dir per shard, named after the file stem
                    # ([:-8] strips the ".parquet" suffix).
                    saved_dir = math_dir + "/" + file_path.split("/")[-1][:-8]
                    print(saved_dir)
                    file_paths.append((file_path, saved_dir))
        print(count)
        # BUG FIX: the original printed len(lines) here, but `lines` was
        # never defined and raised NameError; report the running total of
        # collected shards instead.
        print(len(file_paths))

    print("total file paths", len(file_paths))
    num_process = min(num_process, len(file_paths))
    print("num_process", num_process)

    futures = []
    with ProcessPoolExecutor(num_process) as executor:
        for index, (file_path, saved_dir) in enumerate(file_paths):
            # index % num_process keeps the tqdm bar positions within the
            # pool size even though there are more shards than workers.
            futures.append(
                executor.submit(data_process, index % num_process, file_path,
                                saved_dir))
        done, not_done = wait(futures, return_when=ALL_COMPLETED)

    end_time = time.time()

    # Compute and report the total elapsed wall-clock time.
    elapsed_time = end_time - start_time
    print(f"Time taken: {elapsed_time} seconds")
    print("=" * 100)
175
+
176
+ ```
177
 
178
 
179
  ## Related resources