kaushik-harsh-99 committed on
Commit
95f644c
·
1 Parent(s): 11eb6e9

initial-upload

Browse files
FastText/FastText-Test.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import fasttext
3
+ import pandas as pd
4
+
5
+ from sklearn.metrics import (
6
+ accuracy_score,
7
+ classification_report,
8
+ confusion_matrix,
9
+ )
10
+
11
# ------------------------------------------------------------
# Paths: evaluation splits and the serialized classifier
# ------------------------------------------------------------

TEST_FILE = "dataset/test.jsonl"
VALIDATION_FILE = "dataset/validation.jsonl"

MODEL_FILE = "fasttext_language_classifier.bin"

# ------------------------------------------------------------
# Restore the FastText model from disk before evaluating
# ------------------------------------------------------------

print("Loading model...")
model = fasttext.load_model(MODEL_FILE)
print("Model loaded.")
29
+
30
+ # ============================================================
31
+ # EVALUATION
32
+ # ============================================================
33
+
34
def evaluate_jsonl(
    model,
    jsonl_file,
    split_name,
):
    """Evaluate a FastText classifier on one JSONL split and save metric CSVs.

    Every non-blank line of ``jsonl_file`` is parsed as a JSON object with a
    ``"label"`` and a ``"content"`` field.  The content is preprocessed the
    same way the FastText training file is generated (``str()`` coercion,
    masking of literal ``__label__`` tokens as ``__lbl__``, whitespace
    collapsing) so that evaluation inputs match the training format.

    Args:
        model: Object exposing ``predict(text, k=1)`` that returns a
            ``(labels, probabilities)`` pair, as a loaded FastText model does.
        jsonl_file: Path of the JSONL split to score.
        split_name: Name used in log messages and as the prefix of the
            ``*_classification_report.csv`` / ``*_confusion_matrix.csv``
            files written to the current working directory.

    Returns:
        The accuracy over the split, as computed by ``accuracy_score``.
    """
    label_prefix = "__label__"

    print(f"\nEvaluating {split_name}")

    y_true = []
    y_pred = []

    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines are not valid JSON; skip them instead of crashing.
            if not line.strip():
                continue

            row = json.loads(line)

            # Normalise the reference label exactly like the training file
            # does, so it compares equal to the predicted label string.
            y_true.append(str(row["label"]).strip())

            # Match FastText training format: coerce to str, neutralise
            # literal label markers, collapse all whitespace to single spaces.
            text = str(row["content"]).replace(label_prefix, "__lbl__")
            text = " ".join(text.split())

            labels, _ = model.predict(text, k=1)

            # Strip only the leading marker; a blanket replace() would also
            # eat the marker if it appeared inside the label itself.
            pred_label = labels[0]
            if pred_label.startswith(label_prefix):
                pred_label = pred_label[len(label_prefix):]
            y_pred.append(pred_label)

            processed = len(y_pred)
            if processed % 5000 == 0:
                print(f"Processed {processed:,}")

    # ========================================================
    # ACCURACY
    # ========================================================

    acc = accuracy_score(y_true, y_pred)

    print(f"\n{split_name} Accuracy: {acc:.6f}")

    # ========================================================
    # CLASSIFICATION REPORT
    # ========================================================

    report = classification_report(
        y_true,
        y_pred,
        output_dict=True,
        digits=4,
    )

    report_csv = f"{split_name}_classification_report.csv"
    pd.DataFrame(report).transpose().to_csv(report_csv)

    print(f"Saved {report_csv}")

    # ========================================================
    # CONFUSION MATRIX
    # ========================================================

    # Include predicted-only classes as well; restricting the axes to the
    # classes present in y_true would silently drop those predictions.
    labels_sorted = sorted(set(y_true) | set(y_pred))

    cm = confusion_matrix(
        y_true,
        y_pred,
        labels=labels_sorted,
    )

    cm_csv = f"{split_name}_confusion_matrix.csv"
    pd.DataFrame(
        cm,
        index=labels_sorted,
        columns=labels_sorted,
    ).to_csv(cm_csv)

    print(f"Saved {cm_csv}")

    return acc
151
+
152
# ------------------------------------------------------------
# Score the validation split, then the test split
# ------------------------------------------------------------

validation_accuracy = evaluate_jsonl(model, VALIDATION_FILE, "validation")

test_accuracy = evaluate_jsonl(model, TEST_FILE, "test")

# ------------------------------------------------------------
# Persist a one-row summary and echo it to the console
# ------------------------------------------------------------

summary = pd.DataFrame(
    {
        "validation_accuracy": [validation_accuracy],
        "test_accuracy": [test_accuracy],
    }
)
summary.to_csv("fasttext_summary.csv", index=False)

print("\nSaved fasttext_summary.csv")

print("\n==============================")
print(f"Validation Accuracy: {validation_accuracy:.6f}")
print(f"Test Accuracy: {test_accuracy:.6f}")
print("==============================")

print("\nDone.")
FastText/FastText.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import time
4
+
5
+ import fasttext
6
+ import pandas as pd
7
+
8
+ from sklearn.metrics import (
9
+ accuracy_score,
10
+ classification_report,
11
+ confusion_matrix,
12
+ )
13
+
14
# ============================================================
# CONFIG
# ============================================================

# Training input in FastText supervised format ("__label__X text").
TRAIN_FILE = "fasttext_train.txt"

VALIDATION_JSONL = "dataset/validation.jsonl"
TEST_JSONL = "dataset/test.jsonl"

MODEL_FILE = "fasttext_language_classifier.bin"

# Hyper-parameters forwarded to fasttext.train_supervised below.
EPOCHS = 25
LR = 0.7

DIM = 50

WORD_NGRAMS = 3

MINN = 2
MAXN = 5

MIN_COUNT = 100

BUCKET = 50000

# os.cpu_count() returns None when the core count cannot be determined;
# fall back to a single thread rather than passing None to FastText.
THREADS = os.cpu_count() or 1

# ============================================================
# TRAIN
# ============================================================

print("Training FastText...")
print()

start = time.time()

model = fasttext.train_supervised(
    input=TRAIN_FILE,
    epoch=EPOCHS,
    lr=LR,
    dim=DIM,
    wordNgrams=WORD_NGRAMS,
    minn=MINN,
    maxn=MAXN,
    minCount=MIN_COUNT,
    bucket=BUCKET,
    loss="softmax",
    thread=THREADS,
    verbose=2,
)

elapsed = time.time() - start

print()
print(f"Training completed in {elapsed:.1f}s")

# ============================================================
# LABEL DEBUG
# ============================================================

# List the labels the trained model knows about, to spot malformed
# labels in the training file early.
print()
print("Labels found by FastText:")
print(f"Count: {len(model.labels)}")

for label in model.labels:
    print(label)

# ============================================================
# SAVE MODEL
# ============================================================

model.save_model(MODEL_FILE)

# Size of the serialized model on disk, in MiB (bytes / 1024 / 1024).
size_mb = os.path.getsize(MODEL_FILE) / 1024 / 1024

print()
print(f"Saved model: {MODEL_FILE}")
print(f"Model size: {size_mb:.2f} MB")
92
+
93
+ # ============================================================
94
+ # EVALUATION
95
+ # ============================================================
96
+
97
def evaluate_jsonl(
    model,
    jsonl_file,
    split_name,
):
    """Score a FastText model on a JSONL split and write its metric CSVs.

    Each non-blank line of ``jsonl_file`` must be a JSON object carrying
    ``"label"`` and ``"content"``.  Labels and text are normalised the same
    way the FastText training file is produced: the label is stringified
    and stripped, literal ``__label__`` tokens in the text are masked as
    ``__lbl__``, and runs of whitespace are collapsed to single spaces.

    Args:
        model: Object whose ``predict(text, k=1)`` returns a
            ``(labels, probabilities)`` pair, such as a FastText model.
        jsonl_file: Path of the JSONL file to evaluate.
        split_name: Used in console output and as the filename prefix for
            ``<split_name>_classification_report.csv`` and
            ``<split_name>_confusion_matrix.csv`` (written to the current
            working directory).

    Returns:
        Accuracy over the split as returned by ``accuracy_score``.
    """
    marker = "__label__"

    print()
    print(f"Evaluating {split_name}")

    y_true = []
    y_pred = []

    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            # A blank line would make json.loads raise; ignore it.
            if not line.strip():
                continue

            row = json.loads(line)

            # Same label normalisation as the training file, so that the
            # reference and predicted labels are comparable strings.
            y_true.append(str(row["label"]).strip())

            # Mask label markers inside the text (as done for training data)
            # and collapse whitespace so the input is a single line.
            text = str(row["content"]).replace(marker, "__lbl__")
            text = " ".join(text.split())

            labels, _ = model.predict(text, k=1)

            # Remove only the leading marker from the top prediction.
            predicted = labels[0]
            if predicted.startswith(marker):
                predicted = predicted[len(marker):]
            y_pred.append(predicted)

            processed = len(y_pred)
            if processed % 5000 == 0:
                print(f"Processed {processed:,}")

    # ========================================================
    # ACCURACY
    # ========================================================

    accuracy = accuracy_score(y_true, y_pred)

    print()
    print(f"{split_name} Accuracy: {accuracy:.6f}")

    # ========================================================
    # CLASSIFICATION REPORT
    # ========================================================

    report = classification_report(
        y_true,
        y_pred,
        output_dict=True,
        digits=4,
    )

    report_file = f"{split_name}_classification_report.csv"
    pd.DataFrame(report).transpose().to_csv(report_file)

    print(f"Saved {report_file}")

    # ========================================================
    # CONFUSION MATRIX
    # ========================================================

    # Use every class seen in either vector; axes built from y_true alone
    # would drop predictions of classes that are absent from this split.
    labels_sorted = sorted(set(y_true) | set(y_pred))

    cm = confusion_matrix(
        y_true,
        y_pred,
        labels=labels_sorted,
    )

    cm_file = f"{split_name}_confusion_matrix.csv"
    pd.DataFrame(
        cm,
        index=labels_sorted,
        columns=labels_sorted,
    ).to_csv(cm_file)

    print(f"Saved {cm_file}")

    return accuracy
214
+
215
# ------------------------------------------------------------
# Evaluate the freshly trained model on both held-out splits
# ------------------------------------------------------------

validation_accuracy = evaluate_jsonl(model, VALIDATION_JSONL, "validation")

test_accuracy = evaluate_jsonl(model, TEST_JSONL, "test")

# ------------------------------------------------------------
# One-row summary: accuracies, hyper-parameters and model size
# ------------------------------------------------------------

summary = pd.DataFrame(
    {
        "validation_accuracy": [validation_accuracy],
        "test_accuracy": [test_accuracy],
        "epochs": [EPOCHS],
        "lr": [LR],
        "dim": [DIM],
        "word_ngrams": [WORD_NGRAMS],
        "min_count": [MIN_COUNT],
        "bucket": [BUCKET],
        "model_size_mb": [size_mb],
    }
)
summary.to_csv("fasttext_summary.csv", index=False)

print()
print("=" * 60)
print(f"Validation Accuracy : {validation_accuracy:.6f}")
print(f"Test Accuracy : {test_accuracy:.6f}")
print(f"Model Size (MB) : {size_mb:.2f}")
print("=" * 60)

print()
print("Done.")
FastText/convert-to-fast-text-format.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
# Source JSONL split -> FastText supervised-format output file.
FILES = {
    "dataset/train.jsonl": "fasttext_train.txt",
    "dataset/validation.jsonl": "fasttext_validation.txt",
    "dataset/test.jsonl": "fasttext_test.txt",
}

for input_file, output_file in FILES.items():
    print(f"Converting {input_file} -> {output_file}")

    # Stays 0 when the input file has no lines.
    count = 0

    with open(input_file, "r", encoding="utf-8") as fin:
        with open(output_file, "w", encoding="utf-8") as fout:
            for count, line in enumerate(fin, start=1):
                row = json.loads(line)

                label = str(row["label"]).strip()

                # Mask literal label markers in the content so they are not
                # written out as extra labels, then collapse all whitespace
                # so each sample occupies exactly one line.
                masked = str(row["content"]).replace("__label__", "__lbl__")
                text = " ".join(masked.split())

                fout.write(f"__label__{label} {text}\n")

    print(f"Saved {count:,} samples")

print("\nDone.")
FastText/fasttext_language_classifier.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8734bd145050cf8c458943d1fec8a311410bf2f7f21b89c677c42c1ec3d4d39
3
+ size 38263405
FastText/fasttext_summary.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ validation_accuracy,test_accuracy,epochs,lr,dim,word_ngrams,min_count,bucket,model_size_mb
2
+ 0.9555,0.953125,25,0.7,50,3,100,50000,36.49082660675049
FastText/test_classification_report.csv ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,precision,recall,f1-score,support
2
+ Assembly,0.9874874874874875,0.9865,0.9869934967483742,2000.0
3
+ C,0.9132374814080317,0.921,0.9171023151605676,2000.0
4
+ C#,0.9763937719738824,0.972,0.974191931846655,2000.0
5
+ C++,0.9087261785356068,0.906,0.9073610415623435,2000.0
6
+ CSS,0.9709072978303748,0.9845,0.977656405163853,2000.0
7
+ Dart,0.9794589178356713,0.9775,0.9784784784784785,2000.0
8
+ Go,0.9725411882176734,0.974,0.9732700474644017,2000.0
9
+ HTML,0.896236012207528,0.881,0.8885526979324256,2000.0
10
+ Java,0.9676777722526106,0.973,0.9703315881326352,2000.0
11
+ JavaScript,0.851581508515815,0.875,0.8631319358816276,2000.0
12
+ Kotlin,0.9863979848866499,0.979,0.9826850690087829,2000.0
13
+ Lua,0.9859084046300957,0.9795,0.9826937547027841,2000.0
14
+ Markdown,0.9464196294441662,0.945,0.9457092819614711,2000.0
15
+ Python,0.9853609288238263,0.976,0.9806581260989701,2000.0
16
+ Rust,0.9894736842105263,0.987,0.9882352941176471,2000.0
17
+ Typescript,0.9348697394789579,0.933,0.933933933933934,2000.0
18
+ accuracy,0.953125,0.953125,0.953125,0.953125
19
+ macro avg,0.9532923742336815,0.953125,0.9531865873871844,32000.0
20
+ weighted avg,0.9532923742336815,0.953125,0.9531865873871844,32000.0
FastText/test_confusion_matrix.csv ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
2
+ Assembly,1973,13,0,6,1,0,1,0,0,1,0,3,1,0,0,1
3
+ C,9,1842,5,123,0,4,0,3,3,1,1,1,4,1,2,1
4
+ C#,0,4,1944,6,1,2,6,1,8,6,0,3,7,2,3,7
5
+ C++,2,132,6,1812,1,2,5,7,8,5,2,5,5,1,4,3
6
+ CSS,0,0,1,0,1969,1,1,17,1,7,1,0,1,1,0,0
7
+ Dart,0,1,1,0,1,1955,4,5,4,17,0,2,3,3,1,3
8
+ Go,1,0,2,6,2,1,1948,6,6,10,2,1,9,2,1,3
9
+ HTML,3,1,2,4,36,3,8,1762,3,125,4,3,32,3,0,11
10
+ Java,0,9,7,6,0,9,4,5,1946,2,1,2,6,0,0,3
11
+ JavaScript,1,3,3,6,12,7,9,98,7,1750,7,3,10,1,1,82
12
+ Kotlin,1,0,1,0,0,1,5,4,11,8,1958,2,3,1,0,5
13
+ Lua,1,7,7,5,1,0,1,5,2,5,2,1959,3,1,1,0
14
+ Markdown,3,3,5,8,3,2,4,31,6,17,3,0,1890,12,3,10
15
+ Python,0,1,2,1,0,3,3,8,0,5,3,3,16,1952,2,1
16
+ Rust,3,0,2,5,1,2,1,2,2,1,0,0,6,1,1974,0
17
+ Typescript,1,1,3,6,0,4,3,12,4,95,1,0,1,0,3,1866
FastText/validation_classification_report.csv ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,precision,recall,f1-score,support
2
+ Assembly,0.9929789368104313,0.99,0.9914872308462694,2000.0
3
+ C,0.9302558956347216,0.927,0.9286250939143501,2000.0
4
+ C#,0.977710233029382,0.965,0.9713135379969804,2000.0
5
+ C++,0.912873225648556,0.9325,0.9225822409102152,2000.0
6
+ CSS,0.961895456765999,0.9845,0.9730664689893749,2000.0
7
+ Dart,0.9804511278195489,0.978,0.979224030037547,2000.0
8
+ Go,0.9788199697428139,0.9705,0.9746422294752699,2000.0
9
+ HTML,0.8991383679675621,0.887,0.8930279385854518,2000.0
10
+ Java,0.9728370221327968,0.967,0.9699097291875627,2000.0
11
+ JavaScript,0.8583252190847127,0.8815,0.8697582634435126,2000.0
12
+ Kotlin,0.9859508278976418,0.9825,0.9842223891810669,2000.0
13
+ Lua,0.986404833836858,0.9795,0.982940291018565,2000.0
14
+ Markdown,0.947289156626506,0.9435,0.9453907815631263,2000.0
15
+ Python,0.977977977977978,0.977,0.9774887443721861,2000.0
16
+ Rust,0.9875,0.9875,0.9875,2000.0
17
+ Typescript,0.9411172622043281,0.935,0.9380486581389516,2000.0
18
+ accuracy,0.9555,0.9555,0.9555,0.9555
19
+ macro avg,0.9557203445737397,0.9555,0.9555767267287768,32000.0
20
+ weighted avg,0.9557203445737398,0.9555,0.9555767267287769,32000.0
FastText/validation_confusion_matrix.csv ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
2
+ Assembly,1980,8,1,3,0,0,1,2,0,1,0,0,1,1,1,1
3
+ C,4,1854,5,118,1,0,1,4,2,1,1,2,4,0,2,1
4
+ C#,2,7,1930,13,2,3,3,3,12,8,2,3,3,0,4,5
5
+ C++,2,97,3,1865,0,2,3,3,7,4,0,1,6,1,3,3
6
+ CSS,0,1,0,0,1969,2,0,18,0,7,1,0,2,0,0,0
7
+ Dart,0,0,1,3,2,1956,2,4,0,18,2,0,6,1,1,4
8
+ Go,1,3,3,3,3,3,1941,4,4,15,2,4,7,2,3,2
9
+ HTML,0,2,4,3,53,5,4,1774,1,104,3,3,30,9,2,3
10
+ Java,2,2,10,12,1,3,1,6,1934,9,5,0,5,2,0,8
11
+ JavaScript,2,1,4,2,12,12,11,83,7,1763,6,3,13,4,1,76
12
+ Kotlin,0,1,2,0,0,5,0,4,6,10,1965,1,4,1,0,1
13
+ Lua,0,7,4,2,1,1,2,1,3,6,2,1959,1,8,1,2
14
+ Markdown,0,4,4,7,2,1,7,39,6,11,1,5,1887,13,5,8
15
+ Python,0,1,1,1,0,1,5,9,1,4,2,5,14,1954,2,0
16
+ Rust,1,4,1,7,0,0,1,2,1,2,0,0,2,1,1975,3
17
+ Typescript,0,1,1,4,1,1,1,17,4,91,1,0,7,1,0,1870
SGD-Classifier/Logistic-Regresssion.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import time
4
+ import joblib
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from sklearn.feature_extraction.text import HashingVectorizer
9
+ from sklearn.linear_model import SGDClassifier
10
+ from sklearn.metrics import (
11
+ accuracy_score,
12
+ classification_report,
13
+ confusion_matrix,
14
+ )
15
+
16
# ------------------------------------------------------------
# Settings: dataset paths, streaming/training sizes, features
# ------------------------------------------------------------

TRAIN_FILE = "dataset/train.jsonl"
VALIDATION_FILE = "dataset/validation.jsonl"
TEST_FILE = "dataset/test.jsonl"

# Number of JSONL rows per partial_fit batch.
BATCH_SIZE = 20000

# Number of full passes over the training file.
EPOCHS = 10

# Hashed character n-gram feature space.
N_FEATURES = 2**17
NGRAM_RANGE = (2, 6)

MODEL_DIR = "models"
METRICS_DIR = "metrics"

# ------------------------------------------------------------
# Make sure the output directories exist before anything is saved
# ------------------------------------------------------------

for _output_dir in (MODEL_DIR, METRICS_DIR):
    os.makedirs(_output_dir, exist_ok=True)
40
+
41
+ # ============================================================
42
+ # HELPERS
43
+ # ============================================================
44
+
45
def jsonl_batch_reader(path, batch_size):
    """Stream a JSONL file as ``(texts, labels)`` batches.

    Each non-blank line is parsed as a JSON object; its ``"content"`` value
    goes into ``texts`` and its ``"label"`` value into ``labels``.  A batch
    is yielded as soon as it holds at least ``batch_size`` rows, and any
    remaining rows are yielded as a final, shorter batch.

    Args:
        path: Path of the UTF-8 encoded JSONL file.
        batch_size: Number of rows that triggers a yield.

    Yields:
        Tuples ``(texts, labels)`` of two equally long lists.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a row lacks ``"content"`` or ``"label"``.
    """
    texts = []
    labels = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines are not valid JSON; skip them rather than letting
            # json.loads abort the whole pass.
            if not line.strip():
                continue

            row = json.loads(line)

            texts.append(row["content"])
            labels.append(row["label"])

            if len(texts) >= batch_size:
                yield texts, labels

                # Start fresh lists so a batch already handed to the caller
                # is never mutated afterwards.
                texts = []
                labels = []

    if texts:
        yield texts, labels
68
+
69
+
70
def load_split(path):
    """Load an entire JSONL split into memory.

    Each non-blank line is parsed as a JSON object whose ``"content"`` and
    ``"label"`` values are collected in file order.

    Args:
        path: Path of the UTF-8 encoded JSONL file.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a row lacks ``"content"`` or ``"label"``.
    """
    texts = []
    labels = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Skip blank lines instead of crashing in json.loads.
            if not line.strip():
                continue

            row = json.loads(line)

            texts.append(row["content"])
            labels.append(row["label"])

    return texts, labels
85
+
86
+
87
def evaluate_split(
    model,
    vectorizer,
    split_name,
    texts,
    labels,
    epoch,
):
    """Score ``model`` on one split and write its per-epoch metric CSVs.

    The texts are vectorized, predicted and compared against ``labels``.
    A classification report and a confusion matrix (axes: the sorted
    distinct values of ``labels``) are written into ``METRICS_DIR`` with
    filenames built from ``split_name`` and the zero-padded ``epoch``.

    Args:
        model: Fitted estimator exposing ``predict``.
        vectorizer: Transformer exposing ``transform`` for raw texts.
        split_name: Split identifier used in logs and filenames.
        texts: Raw documents of the split.
        labels: Reference labels aligned with ``texts``.
        epoch: Epoch number embedded in the output filenames.

    Returns:
        The accuracy of the predictions on this split.
    """
    print(f"\nEvaluating {split_name}")

    predictions = model.predict(vectorizer.transform(texts))

    acc = accuracy_score(labels, predictions)
    print(f"{split_name} accuracy: {acc:.6f}")

    # Per-class precision / recall / F1 table.
    per_class = classification_report(
        labels,
        predictions,
        output_dict=True,
        digits=4,
    )
    report_path = os.path.join(
        METRICS_DIR,
        f"{split_name}_epoch_{epoch:03d}_report.csv",
    )
    pd.DataFrame(per_class).transpose().to_csv(report_path)

    # Confusion matrix over the classes present in the reference labels.
    class_names = sorted(set(labels))
    matrix = confusion_matrix(
        labels,
        predictions,
        labels=class_names,
    )
    cm_path = os.path.join(
        METRICS_DIR,
        f"{split_name}_epoch_{epoch:03d}_confusion_matrix.csv",
    )
    pd.DataFrame(
        matrix,
        index=class_names,
        columns=class_names,
    ).to_csv(cm_path)

    return acc
144
+
145
+
146
# ------------------------------------------------------------
# Environment info
# ------------------------------------------------------------

print(f"CPU Cores: {os.cpu_count()}")

# ------------------------------------------------------------
# Held-out splits are loaded once and reused after every epoch
# ------------------------------------------------------------

print("Loading validation set...")
val_texts, val_labels = load_split(VALIDATION_FILE)

print("Loading test set...")
test_texts, test_labels = load_split(TEST_FILE)

# ------------------------------------------------------------
# Hashed character n-gram features
# ------------------------------------------------------------

vectorizer = HashingVectorizer(
    analyzer="char",
    ngram_range=NGRAM_RANGE,
    n_features=N_FEATURES,
    alternate_sign=False,
    lowercase=False,
)

# ------------------------------------------------------------
# One pass over the training file to collect the label set;
# it is handed to the first partial_fit call below.
# ------------------------------------------------------------

print("Discovering classes...")

all_classes = np.array(
    sorted(
        {
            label
            for _, batch_labels in jsonl_batch_reader(TRAIN_FILE, BATCH_SIZE)
            for label in batch_labels
        }
    )
)

print("\nClasses:")
print(all_classes)

# ------------------------------------------------------------
# Linear classifier with log loss, trained incrementally
# ------------------------------------------------------------

model = SGDClassifier(
    loss="log_loss",
    alpha=1e-6,
    max_iter=1,
    warm_start=True,
    verbose=1,
    random_state=42,
)

# ------------------------------------------------------------
# Training loop: stream batches, checkpoint, evaluate, log
# ------------------------------------------------------------

epoch_results = []

first_fit = True

overall_start = time.time()

for epoch in range(EPOCHS):

    print("\n" + "=" * 80)
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("=" * 80)

    epoch_start = time.time()

    batch_count = 0

    batches = jsonl_batch_reader(TRAIN_FILE, BATCH_SIZE)
    for batch_count, (texts, labels) in enumerate(batches, start=1):

        print(f"Epoch {epoch+1} | Batch {batch_count}")

        features = vectorizer.transform(texts)

        # The full class list is passed only on the very first call.
        fit_kwargs = {"classes": all_classes} if first_fit else {}
        model.partial_fit(features, labels, **fit_kwargs)
        first_fit = False

    epoch_time = time.time() - epoch_start

    print(f"\nEpoch finished in {epoch_time:.1f}s")

    # Checkpoint the model together with its vectorizer.
    model_path = os.path.join(
        MODEL_DIR,
        f"epoch_{epoch+1:03d}.pkl",
    )
    joblib.dump(
        {
            "model": model,
            "vectorizer": vectorizer,
        },
        model_path,
    )

    print(f"Saved {model_path}")

    # Validation first, then test, both tagged with the 1-based epoch.
    val_acc = evaluate_split(
        model, vectorizer, "validation", val_texts, val_labels, epoch + 1
    )
    test_acc = evaluate_split(
        model, vectorizer, "test", test_texts, test_labels, epoch + 1
    )

    epoch_results.append(
        {
            "epoch": epoch + 1,
            "validation_accuracy": val_acc,
            "test_accuracy": test_acc,
            "epoch_time_seconds": epoch_time,
        }
    )

    # Rewrite the running summary after every epoch so partial results
    # survive an interrupted run.
    epoch_summary_path = os.path.join(METRICS_DIR, "epoch_summary.csv")
    pd.DataFrame(epoch_results).to_csv(epoch_summary_path, index=False)

# ------------------------------------------------------------
# Final report
# ------------------------------------------------------------

total_time = time.time() - overall_start

print("\nTraining Complete")

print(f"Total training time: {total_time:.1f}s")

summary_df = pd.DataFrame(epoch_results)

# Row with the highest validation accuracy.
best_val_epoch = summary_df["validation_accuracy"].idxmax()
best_row = summary_df.iloc[best_val_epoch]

print("\nBest Epoch")
print(best_row)

final_summary_path = os.path.join(METRICS_DIR, "final_summary.csv")
summary_df.to_csv(final_summary_path, index=False)

print("\nDone.")
SGD-Classifier/metrics/epoch_summary.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ epoch,validation_accuracy,test_accuracy,epoch_time_seconds
2
+ 1,0.89421875,0.8940625,3927.5892601013184
3
+ 2,0.89746875,0.897,3837.121087551117
SGD-Classifier/metrics/test_epoch_001_confusion_matrix.csv ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
2
+ Assembly,1967,26,0,1,0,0,0,0,1,1,0,3,1,0,0,0
3
+ C,3,1798,3,176,0,1,3,0,3,3,0,3,3,3,0,1
4
+ C#,3,12,1925,11,0,1,7,0,13,13,0,8,2,0,2,3
5
+ C++,10,193,4,1756,1,0,11,0,6,7,2,5,1,2,2,0
6
+ CSS,3,0,0,3,1970,2,1,0,1,18,0,0,2,0,0,0
7
+ Dart,0,3,5,3,1,1923,4,0,11,38,0,1,2,3,1,5
8
+ Go,0,1,0,1,1,0,1957,0,3,17,0,4,12,4,0,0
9
+ HTML,11,5,2,7,161,12,9,7,25,648,1,12,1056,34,1,9
10
+ Java,1,5,6,14,1,5,2,0,1937,13,2,6,4,3,0,1
11
+ JavaScript,6,4,7,4,14,18,6,0,16,1837,0,4,14,6,1,63
12
+ Kotlin,0,1,1,2,0,2,6,0,12,9,1949,5,7,3,0,3
13
+ Lua,2,9,7,2,1,0,4,0,2,6,1,1957,5,4,0,0
14
+ Markdown,4,7,1,10,1,2,10,0,7,17,1,7,1893,26,5,9
15
+ Python,2,2,1,0,2,7,5,0,0,10,1,2,14,1953,0,1
16
+ Rust,4,2,3,4,1,0,3,0,3,5,0,4,10,1,1960,0
17
+ Typescript,2,7,5,6,0,9,1,0,9,130,1,2,5,0,2,1821
SGD-Classifier/metrics/test_epoch_001_report.csv ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,precision,recall,f1-score,support
2
+ Assembly,0.9747274529236868,0.9835,0.9790940766550522,2000.0
3
+ C,0.8665060240963856,0.899,0.8824539877300613,2000.0
4
+ C#,0.9771573604060914,0.9625,0.9697732997481109,2000.0
5
+ C++,0.878,0.878,0.878,2000.0
6
+ CSS,0.914577530176416,0.985,0.9484833895040924,2000.0
7
+ Dart,0.9702320887991928,0.9615,0.9658463083877449,2000.0
8
+ Go,0.964514539181863,0.9785,0.9714569372052618,2000.0
9
+ HTML,1.0,0.0035,0.006975585450921774,2000.0
10
+ Java,0.9453391898487067,0.9685,0.9567794517164732,2000.0
11
+ JavaScript,0.6626984126984127,0.9185,0.769907795473596,2000.0
12
+ Kotlin,0.9954034729315628,0.9745,0.9848408287013644,2000.0
13
+ Lua,0.967375185368265,0.9785,0.972905791697738,2000.0
14
+ Markdown,0.6245463543385021,0.9465,0.7525342874180083,2000.0
15
+ Python,0.9564152791380999,0.9765,0.9663532904502722,2000.0
16
+ Rust,0.9929078014184397,0.98,0.9864116758933065,2000.0
17
+ Typescript,0.9504175365344467,0.9105,0.9300306435137896,2000.0
18
+ accuracy,0.8940625,0.8940625,0.8940625,0.8940625
19
+ macro avg,0.9150511392412544,0.8940625,0.8701154593466122,32000.0
20
+ weighted avg,0.9150511392412544,0.8940625,0.8701154593466122,32000.0
SGD-Classifier/metrics/test_epoch_002_confusion_matrix.csv ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
2
+ Assembly,1965,27,0,1,0,0,0,0,1,1,0,3,2,0,0,0
3
+ C,3,1807,3,171,0,2,3,0,2,2,0,0,3,3,0,1
4
+ C#,3,15,1925,11,1,1,5,0,13,11,0,7,3,0,2,3
5
+ C++,8,206,4,1744,2,0,10,0,6,7,2,6,1,2,2,0
6
+ CSS,3,0,0,2,1974,2,1,0,0,15,1,0,2,0,0,0
7
+ Dart,0,3,4,3,1,1927,3,0,8,37,0,2,3,2,1,6
8
+ Go,0,1,0,2,2,0,1954,0,4,18,0,3,12,3,1,0
9
+ HTML,10,5,2,6,154,15,7,96,20,600,2,10,1027,35,1,10
10
+ Java,1,8,6,13,1,5,2,0,1937,12,2,5,4,3,0,1
11
+ JavaScript,6,4,8,5,15,23,4,1,15,1822,0,4,16,5,1,71
12
+ Kotlin,0,1,1,3,0,2,6,0,12,9,1948,4,8,3,0,3
13
+ Lua,3,6,7,2,0,0,3,0,2,5,1,1961,7,3,0,0
14
+ Markdown,3,8,1,10,1,1,10,0,8,17,1,3,1898,25,5,9
15
+ Python,2,2,1,1,2,7,4,0,0,10,1,2,15,1952,0,1
16
+ Rust,4,2,3,4,1,0,3,0,3,5,0,3,10,1,1961,0
17
+ Typescript,1,7,5,6,0,10,1,0,9,118,1,2,5,0,2,1833
SGD-Classifier/metrics/test_epoch_002_report.csv ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,precision,recall,f1-score,support
2
+ Assembly,0.9766401590457257,0.9825,0.9795613160518445,2000.0
3
+ C,0.8596574690770694,0.9035,0.8810336421257923,2000.0
4
+ C#,0.9771573604060914,0.9625,0.9697732997481109,2000.0
5
+ C++,0.8790322580645161,0.872,0.8755020080321285,2000.0
6
+ CSS,0.9164345403899722,0.987,0.9504092441020703,2000.0
7
+ Dart,0.9659147869674185,0.9635,0.9647058823529412,2000.0
8
+ Go,0.9692460317460317,0.977,0.9731075697211156,2000.0
9
+ HTML,0.9896907216494846,0.048,0.09155937052932761,2000.0
10
+ Java,0.9495098039215686,0.9685,0.9589108910891089,2000.0
11
+ JavaScript,0.6775753068055039,0.911,0.7771379825122627,2000.0
12
+ Kotlin,0.9943848902501277,0.974,0.9840868906289467,2000.0
13
+ Lua,0.9732009925558313,0.9805,0.9768368617683686,2000.0
14
+ Markdown,0.6293103448275862,0.949,0.7567783094098883,2000.0
15
+ Python,0.958271968581247,0.976,0.9670547436215011,2000.0
16
+ Rust,0.9924089068825911,0.9805,0.9864185110663984,2000.0
17
+ Typescript,0.9458204334365325,0.9165,0.930929405789741,2000.0
18
+ accuracy,0.897,0.897,0.897,0.897
19
+ macro avg,0.9158909984129562,0.897,0.8764878705343466,32000.0
20
+ weighted avg,0.9158909984129562,0.897,0.8764878705343467,32000.0
SGD-Classifier/metrics/validation_epoch_001_confusion_matrix.csv ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
2
+ Assembly,1959,29,0,1,0,0,2,0,1,1,1,2,1,1,1,1
3
+ C,7,1830,7,139,1,2,2,0,1,1,1,2,1,3,2,1
4
+ C#,5,15,1899,13,0,1,9,0,23,11,1,14,2,1,2,4
5
+ C++,10,167,2,1795,0,0,3,0,9,4,0,2,3,4,1,0
6
+ CSS,3,3,1,0,1954,4,1,0,0,27,0,0,6,1,0,0
7
+ Dart,0,0,2,4,0,1944,0,0,4,35,0,1,6,1,1,2
8
+ Go,5,2,2,3,3,2,1945,0,2,20,0,3,6,5,0,2
9
+ HTML,7,10,6,9,178,8,10,2,36,646,3,11,1039,26,2,7
10
+ Java,1,5,7,15,1,3,7,0,1924,17,2,9,3,5,0,1
11
+ JavaScript,3,2,2,6,18,16,7,0,14,1844,4,10,10,4,2,58
12
+ Kotlin,2,5,2,6,1,3,3,0,13,15,1933,3,10,3,0,1
13
+ Lua,1,9,2,5,0,0,2,0,2,6,0,1959,6,5,2,1
14
+ Markdown,2,8,7,13,4,3,9,0,9,21,3,7,1879,25,5,5
15
+ Python,1,2,0,1,1,1,4,0,2,11,1,7,10,1958,0,1
16
+ Rust,3,7,3,6,0,0,5,0,3,2,0,0,1,1,1969,0
17
+ Typescript,1,1,4,5,1,6,3,0,13,129,1,0,11,3,1,1821
SGD-Classifier/metrics/validation_epoch_001_report.csv ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,precision,recall,f1-score,support
2
+ Assembly,0.9746268656716418,0.9795,0.9770573566084788,2000.0
3
+ C,0.8735083532219571,0.915,0.8937728937728938,2000.0
4
+ C#,0.9758478931140802,0.9495,0.9624936644703497,2000.0
5
+ C++,0.8881741712023751,0.8975,0.8928127331509574,2000.0
6
+ CSS,0.9037927844588344,0.977,0.9389716482460355,2000.0
7
+ Dart,0.9754139488208731,0.972,0.9737039819684448,2000.0
8
+ Go,0.9666998011928429,0.9725,0.9695912263210369,2000.0
9
+ HTML,1.0,0.001,0.001998001998001998,2000.0
10
+ Java,0.9357976653696498,0.962,0.9487179487179487,2000.0
11
+ JavaScript,0.6609318996415771,0.922,0.7699373695198329,2000.0
12
+ Kotlin,0.9912820512820513,0.9665,0.9787341772151898,2000.0
13
+ Lua,0.9650246305418719,0.9795,0.9722084367245658,2000.0
14
+ Markdown,0.6275885103540414,0.9395,0.7525030036043252,2000.0
15
+ Python,0.956989247311828,0.979,0.967869500741473,2000.0
16
+ Rust,0.9904426559356136,0.9845,0.9874623871614845,2000.0
17
+ Typescript,0.9559055118110236,0.9105,0.9326504481434059,2000.0
18
+ accuracy,0.89421875,0.89421875,0.89421875,0.89421875
19
+ macro avg,0.9151266243706413,0.8942187500000001,0.8700302986477766,32000.0
20
+ weighted avg,0.9151266243706414,0.89421875,0.8700302986477766,32000.0
SGD-Classifier/metrics/validation_epoch_002_confusion_matrix.csv ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
2
+ Assembly,1956,31,0,2,0,0,2,0,1,1,1,1,2,1,1,1
3
+ C,5,1841,7,128,1,2,2,0,1,1,1,2,2,4,2,1
4
+ C#,4,15,1907,11,0,2,7,0,22,10,1,13,2,1,2,3
5
+ C++,11,174,4,1788,0,0,3,0,7,4,0,1,3,4,1,0
6
+ CSS,1,4,1,0,1956,4,1,0,0,25,0,0,7,1,0,0
7
+ Dart,0,0,1,1,0,1951,0,0,4,31,0,1,6,1,1,3
8
+ Go,5,2,3,3,2,3,1943,0,1,21,0,3,7,5,0,2
9
+ HTML,6,10,5,9,171,9,6,83,31,607,5,13,1009,27,2,7
10
+ Java,3,6,6,16,1,3,7,0,1927,12,2,9,3,4,0,1
11
+ JavaScript,3,1,3,6,19,17,7,0,13,1833,3,10,12,4,2,67
12
+ Kotlin,1,6,2,6,1,3,3,0,12,14,1934,3,11,3,0,1
13
+ Lua,1,8,3,6,0,0,2,0,1,6,0,1960,6,5,1,1
14
+ Markdown,2,9,7,12,4,3,9,0,10,19,3,2,1883,25,5,7
15
+ Python,1,2,0,1,1,1,3,0,2,12,2,7,11,1956,0,1
16
+ Rust,3,7,4,5,0,3,5,0,2,2,0,0,1,1,1967,0
17
+ Typescript,1,2,4,4,1,5,2,0,13,117,2,0,11,3,1,1834
SGD-Classifier/metrics/validation_epoch_002_report.csv ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,precision,recall,f1-score,support
2
+ Assembly,0.9765351972041937,0.978,0.9772670497127155,2000.0
3
+ C,0.8692162417374882,0.9205,0.8941233608547838,2000.0
4
+ C#,0.9744506898313745,0.9535,0.9638615112458934,2000.0
5
+ C++,0.8948948948948949,0.894,0.894447223611806,2000.0
6
+ CSS,0.9068150208623088,0.978,0.9410632667789272,2000.0
7
+ Dart,0.9725822532402791,0.9755,0.9740389415876186,2000.0
8
+ Go,0.9705294705294706,0.9715,0.9710144927536232,2000.0
9
+ HTML,1.0,0.0415,0.07969275084013443,2000.0
10
+ Java,0.9413776257938447,0.9635,0.9523103533481592,2000.0
11
+ JavaScript,0.6751381215469613,0.9165,0.7775185577942736,2000.0
12
+ Kotlin,0.9897645854657113,0.967,0.9782498735457764,2000.0
13
+ Lua,0.9679012345679012,0.98,0.9739130434782609,2000.0
14
+ Markdown,0.6327284946236559,0.9415,0.7568327974276527,2000.0
15
+ Python,0.956479217603912,0.978,0.9671199011124846,2000.0
16
+ Rust,0.9909319899244332,0.9835,0.9872020075282308,2000.0
17
+ Typescript,0.9507516848107828,0.917,0.9335708831763807,2000.0
18
+ accuracy,0.89746875,0.89746875,0.89746875,0.89746875
19
+ macro avg,0.9168810451648257,0.89746875,0.8763891259247951,32000.0
20
+ weighted avg,0.9168810451648257,0.89746875,0.876389125924795,32000.0
SGD-Classifier/models/epoch_001.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3cb9dfc421e9f5eed281199cd3e6ac7d41b4dc5d13efce09f60949d830b2eee
3
+ size 16779530
SGD-Classifier/models/epoch_002.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3941f702f4d9d1bee58087062178846a70c34311a10329e6eca075a9a4603633
3
+ size 16779530