Text Classification
fastText
English
scikit-learn
code-classification
programming-language-detection
source-code
machine-learning
modernbert
classification
nlp
code-analysis
software-engineering
Eval Results (legacy)
Instructions to use kaushik-harsh-99/Code-Lang-Classifier with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- fastText
How to use kaushik-harsh-99/Code-Lang-Classifier with fastText:
from huggingface_hub import hf_hub_download
import fasttext

model = fasttext.load_model(hf_hub_download("kaushik-harsh-99/Code-Lang-Classifier", "model.bin"))
- Notebooks
- Google Colab
- Kaggle
Commit ·
95f644c
1
Parent(s): 11eb6e9
initial-upload
Browse files
- FastText/FastText-Test.py +195 -0
- FastText/FastText.py +268 -0
- FastText/convert-to-fast-text-format.py +40 -0
- FastText/fasttext_language_classifier.bin +3 -0
- FastText/fasttext_summary.csv +2 -0
- FastText/test_classification_report.csv +20 -0
- FastText/test_confusion_matrix.csv +17 -0
- FastText/validation_classification_report.csv +20 -0
- FastText/validation_confusion_matrix.csv +17 -0
- SGD-Classifier/Logistic-Regresssion.py +369 -0
- SGD-Classifier/metrics/epoch_summary.csv +3 -0
- SGD-Classifier/metrics/test_epoch_001_confusion_matrix.csv +17 -0
- SGD-Classifier/metrics/test_epoch_001_report.csv +20 -0
- SGD-Classifier/metrics/test_epoch_002_confusion_matrix.csv +17 -0
- SGD-Classifier/metrics/test_epoch_002_report.csv +20 -0
- SGD-Classifier/metrics/validation_epoch_001_confusion_matrix.csv +17 -0
- SGD-Classifier/metrics/validation_epoch_001_report.csv +20 -0
- SGD-Classifier/metrics/validation_epoch_002_confusion_matrix.csv +17 -0
- SGD-Classifier/metrics/validation_epoch_002_report.csv +20 -0
- SGD-Classifier/models/epoch_001.pkl +3 -0
- SGD-Classifier/models/epoch_002.pkl +3 -0
FastText/FastText-Test.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import fasttext
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
from sklearn.metrics import (
|
| 6 |
+
accuracy_score,
|
| 7 |
+
classification_report,
|
| 8 |
+
confusion_matrix,
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
# ============================================================
|
| 12 |
+
# CONFIG
|
| 13 |
+
# ============================================================
|
| 14 |
+
|
| 15 |
+
# Trained fastText classifier, resolved relative to the working directory.
MODEL_FILE = "fasttext_language_classifier.bin"

# Held-out splits; each line is parsed as one JSON object by evaluate_jsonl.
VALIDATION_FILE = "dataset/validation.jsonl"
TEST_FILE = "dataset/test.jsonl"

# ============================================================
# LOAD MODEL
# ============================================================

print("Loading model...")
model = fasttext.load_model(MODEL_FILE)
print("Model loaded.")
|
| 29 |
+
|
| 30 |
+
# ============================================================
|
| 31 |
+
# EVALUATION
|
| 32 |
+
# ============================================================
|
| 33 |
+
|
| 34 |
+
def evaluate_jsonl(
    model,
    jsonl_file,
    split_name,
):
    """Score a fastText classifier on one JSONL split and save its metrics.

    Every non-blank line of ``jsonl_file`` is parsed as a JSON object with a
    ``"label"`` and a ``"content"`` field. The content is preprocessed the
    same way the conversion script prepares the training text, the top-1
    prediction is collected, and accuracy, a classification report and a
    confusion matrix are computed.

    Two CSV files are written to the working directory:
    ``{split_name}_classification_report.csv`` and
    ``{split_name}_confusion_matrix.csv``.

    Args:
        model: Object whose ``predict(text, k=1)`` returns ``(labels, probs)``
            where the labels carry the ``__label__`` prefix.
        jsonl_file: Path of the UTF-8 encoded JSONL split.
        split_name: Name used in console output and as CSV file-name prefix.

    Returns:
        The accuracy over all evaluated samples.

    Raises:
        ValueError: If ``jsonl_file`` contains no samples.
    """
    label_prefix = "__label__"

    print(f"\nEvaluating {split_name}")

    y_true = []
    y_pred = []

    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            # A stray or trailing blank line is not a sample; json.loads
            # would raise on it.
            if not line.strip():
                continue

            row = json.loads(line)

            # Normalise the label exactly like the conversion script, so the
            # comparison with the predicted label is string against string.
            true_label = str(row["label"]).strip()

            # Match FastText training format: the conversion script rewrites
            # "__label__" inside the content and collapses all whitespace.
            text = str(row["content"]).replace(label_prefix, "__lbl__")
            text = " ".join(text.split())

            labels, _probs = model.predict(text, k=1)

            # Strip only the leading marker, not occurrences inside the name.
            pred_label = labels[0]
            if pred_label.startswith(label_prefix):
                pred_label = pred_label[len(label_prefix):]

            y_true.append(true_label)
            y_pred.append(pred_label)

            if len(y_true) % 5000 == 0:
                print(f"Processed {len(y_true):,}")

    if not y_true:
        raise ValueError(f"No samples found in {jsonl_file}")

    # ========================================================
    # ACCURACY
    # ========================================================

    acc = accuracy_score(y_true, y_pred)

    print(f"\n{split_name} Accuracy: {acc:.6f}")

    # ========================================================
    # CLASSIFICATION REPORT
    # ========================================================

    report = classification_report(
        y_true,
        y_pred,
        output_dict=True,
        digits=4,
    )

    report_csv = f"{split_name}_classification_report.csv"
    pd.DataFrame(report).transpose().to_csv(report_csv)

    print(f"Saved {report_csv}")

    # ========================================================
    # CONFUSION MATRIX
    # ========================================================

    # Include classes that were only predicted; restricting the matrix to
    # the true labels would silently drop those misclassifications.
    labels_sorted = sorted(set(y_true) | set(y_pred))

    cm = confusion_matrix(
        y_true,
        y_pred,
        labels=labels_sorted,
    )

    cm_df = pd.DataFrame(
        cm,
        index=labels_sorted,
        columns=labels_sorted,
    )

    cm_csv = f"{split_name}_confusion_matrix.csv"
    cm_df.to_csv(cm_csv)

    print(f"Saved {cm_csv}")

    return acc
|
| 151 |
+
|
| 152 |
+
# ============================================================
|
| 153 |
+
# VALIDATION
|
| 154 |
+
# ============================================================
|
| 155 |
+
|
| 156 |
+
validation_accuracy = evaluate_jsonl(model, VALIDATION_FILE, "validation")

# ============================================================
# TEST
# ============================================================

test_accuracy = evaluate_jsonl(model, TEST_FILE, "test")

# ============================================================
# SUMMARY
# ============================================================

# NOTE(review): the training script writes a file with this same name but
# with additional hyper-parameter columns; running this script in the same
# directory replaces it with the two-column version. Confirm this is intended.
summary = pd.DataFrame(
    {
        "validation_accuracy": [validation_accuracy],
        "test_accuracy": [test_accuracy],
    }
)
summary.to_csv("fasttext_summary.csv", index=False)

print("\nSaved fasttext_summary.csv")

# Final console recap of both splits.
print("\n==============================")
print(f"Validation Accuracy: {validation_accuracy:.6f}")
print(f"Test Accuracy: {test_accuracy:.6f}")
print("==============================")

print("\nDone.")
|
FastText/FastText.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import time
|
| 4 |
+
|
| 5 |
+
import fasttext
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
from sklearn.metrics import (
|
| 9 |
+
accuracy_score,
|
| 10 |
+
classification_report,
|
| 11 |
+
confusion_matrix,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
# ============================================================
|
| 15 |
+
# CONFIG
|
| 16 |
+
# ============================================================
|
| 17 |
+
|
| 18 |
+
# Training text in fastText format ("__label__<name> <text>" per line).
TRAIN_FILE = "fasttext_train.txt"

# Raw JSONL splits used for evaluation after training.
VALIDATION_JSONL = "dataset/validation.jsonl"
TEST_JSONL = "dataset/test.jsonl"

# Where the trained model is saved.
MODEL_FILE = "fasttext_language_classifier.bin"

# Hyper-parameters forwarded to fasttext.train_supervised below.
EPOCHS = 25
LR = 0.7

DIM = 50

WORD_NGRAMS = 3

MINN = 2
MAXN = 5

MIN_COUNT = 100

BUCKET = 50000

# os.cpu_count() returns None when the core count cannot be determined;
# fall back to a single thread instead of passing None as `thread`.
THREADS = os.cpu_count() or 1

# ============================================================
# TRAIN
# ============================================================

print("Training FastText...")
print()

start = time.time()

model = fasttext.train_supervised(
    input=TRAIN_FILE,
    epoch=EPOCHS,
    lr=LR,
    dim=DIM,
    wordNgrams=WORD_NGRAMS,
    minn=MINN,
    maxn=MAXN,
    minCount=MIN_COUNT,
    bucket=BUCKET,
    loss="softmax",
    thread=THREADS,
    verbose=2,
)

elapsed = time.time() - start

print()
print(f"Training completed in {elapsed:.1f}s")

# ============================================================
# LABEL DEBUG
# ============================================================

# Dump the label set the model learned so a malformed training file
# (missing or unexpected classes) is visible right away.
print()
print("Labels found by FastText:")
print(f"Count: {len(model.labels)}")

for label in model.labels:
    print(label)

# ============================================================
# SAVE MODEL
# ============================================================

model.save_model(MODEL_FILE)

# Size on disk in MiB (bytes / 1024 / 1024); also stored in the summary CSV.
size_mb = os.path.getsize(MODEL_FILE) / 1024 / 1024

print()
print(f"Saved model: {MODEL_FILE}")
print(f"Model size: {size_mb:.2f} MB")
|
| 92 |
+
|
| 93 |
+
# ============================================================
|
| 94 |
+
# EVALUATION
|
| 95 |
+
# ============================================================
|
| 96 |
+
|
| 97 |
+
def evaluate_jsonl(
    model,
    jsonl_file,
    split_name,
):
    """Score a fastText classifier on one JSONL split and save its metrics.

    Every non-blank line of ``jsonl_file`` is parsed as a JSON object with a
    ``"label"`` and a ``"content"`` field. The content is preprocessed the
    same way the conversion script prepares the training text, the top-1
    prediction is collected, and accuracy, a classification report and a
    confusion matrix are computed.

    Two CSV files are written to the working directory:
    ``{split_name}_classification_report.csv`` and
    ``{split_name}_confusion_matrix.csv``.

    Args:
        model: Object whose ``predict(text, k=1)`` returns ``(labels, probs)``
            where the labels carry the ``__label__`` prefix.
        jsonl_file: Path of the UTF-8 encoded JSONL split.
        split_name: Name used in console output and as CSV file-name prefix.

    Returns:
        The accuracy over all evaluated samples.

    Raises:
        ValueError: If ``jsonl_file`` contains no samples.
    """
    label_prefix = "__label__"

    print()
    print(f"Evaluating {split_name}")

    y_true = []
    y_pred = []

    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            # A stray or trailing blank line is not a sample; json.loads
            # would raise on it.
            if not line.strip():
                continue

            row = json.loads(line)

            # Normalise the label exactly like the conversion script, so the
            # comparison with the predicted label is string against string.
            true_label = str(row["label"]).strip()

            # Mirror the conversion script: rewrite "__label__" inside the
            # content, then collapse all whitespace to single spaces.
            text = str(row["content"]).replace(label_prefix, "__lbl__")
            text = " ".join(text.split())

            labels, _probs = model.predict(text, k=1)

            # Strip only the leading marker, not occurrences inside the name.
            pred_label = labels[0]
            if pred_label.startswith(label_prefix):
                pred_label = pred_label[len(label_prefix):]

            y_true.append(true_label)
            y_pred.append(pred_label)

            if len(y_true) % 5000 == 0:
                print(f"Processed {len(y_true):,}")

    if not y_true:
        raise ValueError(f"No samples found in {jsonl_file}")

    # ========================================================
    # ACCURACY
    # ========================================================

    accuracy = accuracy_score(y_true, y_pred)

    print()
    print(f"{split_name} Accuracy: {accuracy:.6f}")

    # ========================================================
    # CLASSIFICATION REPORT
    # ========================================================

    report = classification_report(
        y_true,
        y_pred,
        output_dict=True,
        digits=4,
    )

    report_file = f"{split_name}_classification_report.csv"
    pd.DataFrame(report).transpose().to_csv(report_file)

    print(f"Saved {report_file}")

    # ========================================================
    # CONFUSION MATRIX
    # ========================================================

    # Include classes that were only predicted; restricting the matrix to
    # the true labels would silently drop those misclassifications.
    labels_sorted = sorted(set(y_true) | set(y_pred))

    cm = confusion_matrix(
        y_true,
        y_pred,
        labels=labels_sorted,
    )

    cm_df = pd.DataFrame(
        cm,
        index=labels_sorted,
        columns=labels_sorted,
    )

    cm_file = f"{split_name}_confusion_matrix.csv"
    cm_df.to_csv(cm_file)

    print(f"Saved {cm_file}")

    return accuracy
|
| 214 |
+
|
| 215 |
+
# ============================================================
|
| 216 |
+
# VALIDATION
|
| 217 |
+
# ============================================================
|
| 218 |
+
|
| 219 |
+
validation_accuracy = evaluate_jsonl(model, VALIDATION_JSONL, "validation")

# ============================================================
# TEST
# ============================================================

test_accuracy = evaluate_jsonl(model, TEST_JSONL, "test")

# ============================================================
# SUMMARY
# ============================================================

# One-row table: both accuracies next to the hyper-parameters and the size
# of the saved model.
summary = pd.DataFrame(
    {
        "validation_accuracy": [validation_accuracy],
        "test_accuracy": [test_accuracy],
        "epochs": [EPOCHS],
        "lr": [LR],
        "dim": [DIM],
        "word_ngrams": [WORD_NGRAMS],
        "min_count": [MIN_COUNT],
        "bucket": [BUCKET],
        "model_size_mb": [size_mb],
    }
)
summary.to_csv("fasttext_summary.csv", index=False)

# Final console recap.
print()
print("=" * 60)
print(f"Validation Accuracy : {validation_accuracy:.6f}")
print(f"Test Accuracy : {test_accuracy:.6f}")
print(f"Model Size (MB) : {size_mb:.2f}")
print("=" * 60)

print()
print("Done.")
|
FastText/convert-to-fast-text-format.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
# Source JSONL split -> fastText text file written to the working directory.
FILES = {
    "dataset/train.jsonl": "fasttext_train.txt",
    "dataset/validation.jsonl": "fasttext_validation.txt",
    "dataset/test.jsonl": "fasttext_test.txt",
}


def _to_fasttext_line(row):
    """Format one parsed JSONL record as a single fastText line.

    Args:
        row: Mapping with a ``"label"`` and a ``"content"`` entry.

    Returns:
        ``"__label__<label> <text>\\n"`` with all whitespace in the text
        collapsed to single spaces.

    Raises:
        ValueError: If the label is empty or contains whitespace. Labels and
            text are separated by whitespace in the output line, so such a
            label would be written out truncated or missing.
    """
    label = str(row["label"]).strip()

    if len(label.split()) != 1:
        raise ValueError(f"Label must be one non-empty token: {label!r}")

    # Presumably guards against source text that itself contains the label
    # marker being read as an extra label by fastText.
    text = str(row["content"]).replace("__label__", "__lbl__")

    # One sample per line: newlines, tabs and space runs become one space.
    text = " ".join(text.split())

    return f"__label__{label} {text}\n"


for input_file, output_file in FILES.items():

    print(f"Converting {input_file} -> {output_file}")

    count = 0

    with open(input_file, "r", encoding="utf-8") as fin, \
            open(output_file, "w", encoding="utf-8") as fout:

        for line in fin:

            # A stray or trailing blank line is not a record; json.loads
            # would raise on it.
            if not line.strip():
                continue

            fout.write(_to_fasttext_line(json.loads(line)))

            count += 1

    print(f"Saved {count:,} samples")

print("\nDone.")
|
FastText/fasttext_language_classifier.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a8734bd145050cf8c458943d1fec8a311410bf2f7f21b89c677c42c1ec3d4d39
|
| 3 |
+
size 38263405
|
FastText/fasttext_summary.csv
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
validation_accuracy,test_accuracy,epochs,lr,dim,word_ngrams,min_count,bucket,model_size_mb
|
| 2 |
+
0.9555,0.953125,25,0.7,50,3,100,50000,36.49082660675049
|
FastText/test_classification_report.csv
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
,precision,recall,f1-score,support
|
| 2 |
+
Assembly,0.9874874874874875,0.9865,0.9869934967483742,2000.0
|
| 3 |
+
C,0.9132374814080317,0.921,0.9171023151605676,2000.0
|
| 4 |
+
C#,0.9763937719738824,0.972,0.974191931846655,2000.0
|
| 5 |
+
C++,0.9087261785356068,0.906,0.9073610415623435,2000.0
|
| 6 |
+
CSS,0.9709072978303748,0.9845,0.977656405163853,2000.0
|
| 7 |
+
Dart,0.9794589178356713,0.9775,0.9784784784784785,2000.0
|
| 8 |
+
Go,0.9725411882176734,0.974,0.9732700474644017,2000.0
|
| 9 |
+
HTML,0.896236012207528,0.881,0.8885526979324256,2000.0
|
| 10 |
+
Java,0.9676777722526106,0.973,0.9703315881326352,2000.0
|
| 11 |
+
JavaScript,0.851581508515815,0.875,0.8631319358816276,2000.0
|
| 12 |
+
Kotlin,0.9863979848866499,0.979,0.9826850690087829,2000.0
|
| 13 |
+
Lua,0.9859084046300957,0.9795,0.9826937547027841,2000.0
|
| 14 |
+
Markdown,0.9464196294441662,0.945,0.9457092819614711,2000.0
|
| 15 |
+
Python,0.9853609288238263,0.976,0.9806581260989701,2000.0
|
| 16 |
+
Rust,0.9894736842105263,0.987,0.9882352941176471,2000.0
|
| 17 |
+
Typescript,0.9348697394789579,0.933,0.933933933933934,2000.0
|
| 18 |
+
accuracy,0.953125,0.953125,0.953125,0.953125
|
| 19 |
+
macro avg,0.9532923742336815,0.953125,0.9531865873871844,32000.0
|
| 20 |
+
weighted avg,0.9532923742336815,0.953125,0.9531865873871844,32000.0
|
FastText/test_confusion_matrix.csv
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
|
| 2 |
+
Assembly,1973,13,0,6,1,0,1,0,0,1,0,3,1,0,0,1
|
| 3 |
+
C,9,1842,5,123,0,4,0,3,3,1,1,1,4,1,2,1
|
| 4 |
+
C#,0,4,1944,6,1,2,6,1,8,6,0,3,7,2,3,7
|
| 5 |
+
C++,2,132,6,1812,1,2,5,7,8,5,2,5,5,1,4,3
|
| 6 |
+
CSS,0,0,1,0,1969,1,1,17,1,7,1,0,1,1,0,0
|
| 7 |
+
Dart,0,1,1,0,1,1955,4,5,4,17,0,2,3,3,1,3
|
| 8 |
+
Go,1,0,2,6,2,1,1948,6,6,10,2,1,9,2,1,3
|
| 9 |
+
HTML,3,1,2,4,36,3,8,1762,3,125,4,3,32,3,0,11
|
| 10 |
+
Java,0,9,7,6,0,9,4,5,1946,2,1,2,6,0,0,3
|
| 11 |
+
JavaScript,1,3,3,6,12,7,9,98,7,1750,7,3,10,1,1,82
|
| 12 |
+
Kotlin,1,0,1,0,0,1,5,4,11,8,1958,2,3,1,0,5
|
| 13 |
+
Lua,1,7,7,5,1,0,1,5,2,5,2,1959,3,1,1,0
|
| 14 |
+
Markdown,3,3,5,8,3,2,4,31,6,17,3,0,1890,12,3,10
|
| 15 |
+
Python,0,1,2,1,0,3,3,8,0,5,3,3,16,1952,2,1
|
| 16 |
+
Rust,3,0,2,5,1,2,1,2,2,1,0,0,6,1,1974,0
|
| 17 |
+
Typescript,1,1,3,6,0,4,3,12,4,95,1,0,1,0,3,1866
|
FastText/validation_classification_report.csv
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
,precision,recall,f1-score,support
|
| 2 |
+
Assembly,0.9929789368104313,0.99,0.9914872308462694,2000.0
|
| 3 |
+
C,0.9302558956347216,0.927,0.9286250939143501,2000.0
|
| 4 |
+
C#,0.977710233029382,0.965,0.9713135379969804,2000.0
|
| 5 |
+
C++,0.912873225648556,0.9325,0.9225822409102152,2000.0
|
| 6 |
+
CSS,0.961895456765999,0.9845,0.9730664689893749,2000.0
|
| 7 |
+
Dart,0.9804511278195489,0.978,0.979224030037547,2000.0
|
| 8 |
+
Go,0.9788199697428139,0.9705,0.9746422294752699,2000.0
|
| 9 |
+
HTML,0.8991383679675621,0.887,0.8930279385854518,2000.0
|
| 10 |
+
Java,0.9728370221327968,0.967,0.9699097291875627,2000.0
|
| 11 |
+
JavaScript,0.8583252190847127,0.8815,0.8697582634435126,2000.0
|
| 12 |
+
Kotlin,0.9859508278976418,0.9825,0.9842223891810669,2000.0
|
| 13 |
+
Lua,0.986404833836858,0.9795,0.982940291018565,2000.0
|
| 14 |
+
Markdown,0.947289156626506,0.9435,0.9453907815631263,2000.0
|
| 15 |
+
Python,0.977977977977978,0.977,0.9774887443721861,2000.0
|
| 16 |
+
Rust,0.9875,0.9875,0.9875,2000.0
|
| 17 |
+
Typescript,0.9411172622043281,0.935,0.9380486581389516,2000.0
|
| 18 |
+
accuracy,0.9555,0.9555,0.9555,0.9555
|
| 19 |
+
macro avg,0.9557203445737397,0.9555,0.9555767267287768,32000.0
|
| 20 |
+
weighted avg,0.9557203445737398,0.9555,0.9555767267287769,32000.0
|
FastText/validation_confusion_matrix.csv
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
|
| 2 |
+
Assembly,1980,8,1,3,0,0,1,2,0,1,0,0,1,1,1,1
|
| 3 |
+
C,4,1854,5,118,1,0,1,4,2,1,1,2,4,0,2,1
|
| 4 |
+
C#,2,7,1930,13,2,3,3,3,12,8,2,3,3,0,4,5
|
| 5 |
+
C++,2,97,3,1865,0,2,3,3,7,4,0,1,6,1,3,3
|
| 6 |
+
CSS,0,1,0,0,1969,2,0,18,0,7,1,0,2,0,0,0
|
| 7 |
+
Dart,0,0,1,3,2,1956,2,4,0,18,2,0,6,1,1,4
|
| 8 |
+
Go,1,3,3,3,3,3,1941,4,4,15,2,4,7,2,3,2
|
| 9 |
+
HTML,0,2,4,3,53,5,4,1774,1,104,3,3,30,9,2,3
|
| 10 |
+
Java,2,2,10,12,1,3,1,6,1934,9,5,0,5,2,0,8
|
| 11 |
+
JavaScript,2,1,4,2,12,12,11,83,7,1763,6,3,13,4,1,76
|
| 12 |
+
Kotlin,0,1,2,0,0,5,0,4,6,10,1965,1,4,1,0,1
|
| 13 |
+
Lua,0,7,4,2,1,1,2,1,3,6,2,1959,1,8,1,2
|
| 14 |
+
Markdown,0,4,4,7,2,1,7,39,6,11,1,5,1887,13,5,8
|
| 15 |
+
Python,0,1,1,1,0,1,5,9,1,4,2,5,14,1954,2,0
|
| 16 |
+
Rust,1,4,1,7,0,0,1,2,1,2,0,0,2,1,1975,3
|
| 17 |
+
Typescript,0,1,1,4,1,1,1,17,4,91,1,0,7,1,0,1870
|
SGD-Classifier/Logistic-Regresssion.py
ADDED
|
@@ -0,0 +1,369 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import time
|
| 4 |
+
import joblib
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
from sklearn.feature_extraction.text import HashingVectorizer
|
| 9 |
+
from sklearn.linear_model import SGDClassifier
|
| 10 |
+
from sklearn.metrics import (
|
| 11 |
+
accuracy_score,
|
| 12 |
+
classification_report,
|
| 13 |
+
confusion_matrix,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
# ============================================================
|
| 17 |
+
# CONFIG
|
| 18 |
+
# ============================================================
|
| 19 |
+
|
| 20 |
+
# JSONL splits; each line is parsed as one JSON object by the helpers below.
TRAIN_FILE = "dataset/train.jsonl"
VALIDATION_FILE = "dataset/validation.jsonl"
TEST_FILE = "dataset/test.jsonl"

# Rows per batch.
BATCH_SIZE = 20000

# Number of epochs.
EPOCHS = 10

# Feature count (131072) and n-gram range.
N_FEATURES = 1 << 17
NGRAM_RANGE = (2, 6)

# Output locations; metric CSVs are written into METRICS_DIR.
MODEL_DIR = "models"
METRICS_DIR = "metrics"

# ============================================================
# CREATE OUTPUT DIRS
# ============================================================

for _output_dir in (MODEL_DIR, METRICS_DIR):
    os.makedirs(_output_dir, exist_ok=True)
|
| 40 |
+
|
| 41 |
+
# ============================================================
|
| 42 |
+
# HELPERS
|
| 43 |
+
# ============================================================
|
| 44 |
+
|
| 45 |
+
def jsonl_batch_reader(path, batch_size):
    """Stream a JSONL file as ``(texts, labels)`` batches.

    Each non-blank line is parsed as a JSON object; its ``"content"`` value
    goes into ``texts`` and its ``"label"`` value into ``labels``. A batch is
    yielded as soon as it holds ``batch_size`` rows, and any remaining rows
    are yielded as a final, shorter batch. Blank lines (for example a
    trailing newline at the end of the file) are skipped.

    Args:
        path: Path of the UTF-8 encoded JSONL file.
        batch_size: Number of rows per yielded batch.

    Yields:
        Tuples ``(texts, labels)`` of two equally long lists.
    """
    texts = []
    labels = []

    with open(path, "r", encoding="utf-8") as f:

        for line in f:

            # json.loads raises on empty input, so skip blank lines.
            if not line.strip():
                continue

            row = json.loads(line)

            texts.append(row["content"])
            labels.append(row["label"])

            if len(texts) >= batch_size:

                yield texts, labels

                # Start fresh lists so the batch already handed to the
                # caller is not mutated afterwards.
                texts = []
                labels = []

    # Flush the partial batch left over at end of file.
    if texts:
        yield texts, labels
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def load_split(path):
    """Load a whole JSONL split into memory.

    Each non-blank line is parsed as a JSON object; its ``"content"`` value
    is appended to the texts and its ``"label"`` value to the labels. Blank
    lines (for example a trailing newline at the end of the file) are
    skipped.

    Args:
        path: Path of the UTF-8 encoded JSONL file.

    Returns:
        A tuple ``(texts, labels)`` of two equally long lists, in file order.
    """
    texts = []
    labels = []

    with open(path, "r", encoding="utf-8") as f:

        for line in f:

            # json.loads raises on empty input, so skip blank lines.
            if not line.strip():
                continue

            row = json.loads(line)

            texts.append(row["content"])
            labels.append(row["label"])

    return texts, labels
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def evaluate_split(
    model,
    vectorizer,
    split_name,
    texts,
    labels,
    epoch,
):
    """Score *model* on one split and write its metrics to ``METRICS_DIR``.

    Two CSV files are written, both named after the split and the
    zero-padded epoch number:

    * ``{split_name}_epoch_{epoch:03d}_report.csv`` - the transposed
      ``classification_report`` dictionary.
    * ``{split_name}_epoch_{epoch:03d}_confusion_matrix.csv`` - the confusion
      matrix with class names as both index and columns.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Object exposing ``transform`` that turns *texts* into the
            feature matrix the model expects.
        split_name: Name used in log lines and output file names.
        texts: Raw documents of the split.
        labels: True labels, aligned with *texts*.
        epoch: Epoch number used in the output file names.

    Returns:
        The accuracy of the model on the split.
    """
    print(f"\nEvaluating {split_name}")

    X = vectorizer.transform(texts)
    preds = model.predict(X)

    acc = accuracy_score(labels, preds)
    print(f"{split_name} accuracy: {acc:.6f}")

    report = classification_report(
        labels,
        preds,
        output_dict=True,
        digits=4,
    )
    report_df = pd.DataFrame(report).transpose()
    report_path = os.path.join(
        METRICS_DIR,
        f"{split_name}_epoch_{epoch:03d}_report.csv",
    )
    report_df.to_csv(report_path)

    # Index the matrix by every class that occurs in either the true or the
    # predicted labels. Restricting it to the true labels would make
    # confusion_matrix drop samples predicted as a class that is absent from
    # this split, so the matrix would no longer account for every sample.
    labels_sorted = sorted(set(labels).union(preds))

    cm = confusion_matrix(
        labels,
        preds,
        labels=labels_sorted,
    )
    cm_df = pd.DataFrame(
        cm,
        index=labels_sorted,
        columns=labels_sorted,
    )
    cm_path = os.path.join(
        METRICS_DIR,
        f"{split_name}_epoch_{epoch:03d}_confusion_matrix.csv",
    )
    cm_df.to_csv(cm_path)

    return acc
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
# ============================================================
|
| 147 |
+
# INFO
|
| 148 |
+
# ============================================================
|
| 149 |
+
|
| 150 |
+
print(f"CPU Cores: {os.cpu_count()}")

# ============================================================
# LOAD VALIDATION + TEST ONCE
# ============================================================

# Both evaluation splits are read fully into memory a single time and reused
# for the evaluation that follows every training epoch.
print("Loading validation set...")
val_texts, val_labels = load_split(VALIDATION_FILE)

print("Loading test set...")
test_texts, test_labels = load_split(TEST_FILE)
|
| 167 |
+
|
| 168 |
+
# ============================================================
|
| 169 |
+
# VECTORIZER
|
| 170 |
+
# ============================================================
|
| 171 |
+
|
| 172 |
+
# Stateless character n-gram hasher: it needs no fit step, so the same
# instance can transform streamed training batches and the evaluation splits.
vectorizer = HashingVectorizer(
    analyzer="char",
    ngram_range=NGRAM_RANGE,
    n_features=N_FEATURES,
    alternate_sign=False,
    # Keep the original casing of the input text.
    lowercase=False,
)
|
| 179 |
+
|
| 180 |
+
# ============================================================
|
| 181 |
+
# DISCOVER CLASSES
|
| 182 |
+
# ============================================================
|
| 183 |
+
|
| 184 |
+
print("Discovering classes...")

# partial_fit must be told the complete label set on its first call, so make
# one streaming pass over the training file to collect every label.
all_classes = set()

for _, labels in jsonl_batch_reader(
    TRAIN_FILE,
    BATCH_SIZE,
):
    all_classes.update(labels)

# Sorted array so the class order is deterministic across runs.
all_classes = np.array(sorted(all_classes))

print("\nClasses:")
print(all_classes)
|
| 200 |
+
|
| 201 |
+
# ============================================================
|
| 202 |
+
# MODEL
|
| 203 |
+
# ============================================================
|
| 204 |
+
|
| 205 |
+
# Linear classifier trained with SGD on the logistic loss.
# NOTE(review): the model is only ever trained through partial_fit below;
# per the scikit-learn docs max_iter and warm_start affect fit() only, so
# they appear to have no effect here - confirm before relying on them.
model = SGDClassifier(
    loss="log_loss",
    alpha=1e-6,
    max_iter=1,
    warm_start=True,
    verbose=1,
    random_state=42,
)
|
| 213 |
+
|
| 214 |
+
# ============================================================
|
| 215 |
+
# TRAIN
|
| 216 |
+
# ============================================================
|
| 217 |
+
|
| 218 |
+
# One dict per finished epoch: epoch number, accuracies and wall-clock time.
epoch_results = []

# The first partial_fit call must receive the full class list.
first_fit = True

overall_start = time.time()

for epoch in range(EPOCHS):

    print("\n" + "=" * 80)
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("=" * 80)

    epoch_start = time.time()

    batch_count = 0

    # Stream the training file batch by batch; batches arrive in file order
    # on every epoch (no shuffling is done here).
    for texts, labels in jsonl_batch_reader(
        TRAIN_FILE,
        BATCH_SIZE,
    ):

        batch_count += 1

        print(f"Epoch {epoch+1} | Batch {batch_count}")

        X = vectorizer.transform(texts)

        if first_fit:
            model.partial_fit(
                X,
                labels,
                classes=all_classes,
            )
            first_fit = False
        else:
            model.partial_fit(
                X,
                labels,
            )

    # Training time only; checkpointing and evaluation are not included.
    epoch_time = time.time() - epoch_start

    print(f"\nEpoch finished in {epoch_time:.1f}s")

    # ========================================================
    # SAVE MODEL
    # ========================================================

    model_path = os.path.join(
        MODEL_DIR,
        f"epoch_{epoch+1:03d}.pkl",
    )

    # Persist model and vectorizer together so a checkpoint is usable
    # on its own.
    joblib.dump(
        {
            "model": model,
            "vectorizer": vectorizer,
        },
        model_path,
    )

    print(f"Saved {model_path}")

    # ========================================================
    # VALIDATION
    # ========================================================

    val_acc = evaluate_split(
        model,
        vectorizer,
        "validation",
        val_texts,
        val_labels,
        epoch + 1,
    )

    # ========================================================
    # TEST
    # ========================================================

    test_acc = evaluate_split(
        model,
        vectorizer,
        "test",
        test_texts,
        test_labels,
        epoch + 1,
    )

    epoch_results.append(
        {
            "epoch": epoch + 1,
            "validation_accuracy": val_acc,
            "test_accuracy": test_acc,
            "epoch_time_seconds": epoch_time,
        }
    )

    # Rewrite the running summary after every epoch so partial results
    # survive an interrupted run.
    pd.DataFrame(epoch_results).to_csv(
        os.path.join(
            METRICS_DIR,
            "epoch_summary.csv",
        ),
        index=False,
    )
|
| 334 |
+
|
| 335 |
+
# ============================================================
|
| 336 |
+
# FINAL
|
| 337 |
+
# ============================================================
|
| 338 |
+
|
| 339 |
+
total_time = time.time() - overall_start

print("\nTraining Complete")

print(f"Total training time: {total_time:.1f}s")

summary_df = pd.DataFrame(epoch_results)

# With no completed epochs the frame has no columns, so there is no best
# epoch to report.
if not summary_df.empty:
    # idxmax returns an index *label*, so look the row up with .loc rather
    # than the positional .iloc.
    best_val_epoch = summary_df["validation_accuracy"].idxmax()
    best_row = summary_df.loc[best_val_epoch]

    print("\nBest Epoch")
    print(best_row)

summary_df.to_csv(
    os.path.join(
        METRICS_DIR,
        "final_summary.csv",
    ),
    index=False,
)

print("\nDone.")
|
SGD-Classifier/metrics/epoch_summary.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
epoch,validation_accuracy,test_accuracy,epoch_time_seconds
|
| 2 |
+
1,0.89421875,0.8940625,3927.5892601013184
|
| 3 |
+
2,0.89746875,0.897,3837.121087551117
|
SGD-Classifier/metrics/test_epoch_001_confusion_matrix.csv
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
|
| 2 |
+
Assembly,1967,26,0,1,0,0,0,0,1,1,0,3,1,0,0,0
|
| 3 |
+
C,3,1798,3,176,0,1,3,0,3,3,0,3,3,3,0,1
|
| 4 |
+
C#,3,12,1925,11,0,1,7,0,13,13,0,8,2,0,2,3
|
| 5 |
+
C++,10,193,4,1756,1,0,11,0,6,7,2,5,1,2,2,0
|
| 6 |
+
CSS,3,0,0,3,1970,2,1,0,1,18,0,0,2,0,0,0
|
| 7 |
+
Dart,0,3,5,3,1,1923,4,0,11,38,0,1,2,3,1,5
|
| 8 |
+
Go,0,1,0,1,1,0,1957,0,3,17,0,4,12,4,0,0
|
| 9 |
+
HTML,11,5,2,7,161,12,9,7,25,648,1,12,1056,34,1,9
|
| 10 |
+
Java,1,5,6,14,1,5,2,0,1937,13,2,6,4,3,0,1
|
| 11 |
+
JavaScript,6,4,7,4,14,18,6,0,16,1837,0,4,14,6,1,63
|
| 12 |
+
Kotlin,0,1,1,2,0,2,6,0,12,9,1949,5,7,3,0,3
|
| 13 |
+
Lua,2,9,7,2,1,0,4,0,2,6,1,1957,5,4,0,0
|
| 14 |
+
Markdown,4,7,1,10,1,2,10,0,7,17,1,7,1893,26,5,9
|
| 15 |
+
Python,2,2,1,0,2,7,5,0,0,10,1,2,14,1953,0,1
|
| 16 |
+
Rust,4,2,3,4,1,0,3,0,3,5,0,4,10,1,1960,0
|
| 17 |
+
Typescript,2,7,5,6,0,9,1,0,9,130,1,2,5,0,2,1821
|
SGD-Classifier/metrics/test_epoch_001_report.csv
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
,precision,recall,f1-score,support
|
| 2 |
+
Assembly,0.9747274529236868,0.9835,0.9790940766550522,2000.0
|
| 3 |
+
C,0.8665060240963856,0.899,0.8824539877300613,2000.0
|
| 4 |
+
C#,0.9771573604060914,0.9625,0.9697732997481109,2000.0
|
| 5 |
+
C++,0.878,0.878,0.878,2000.0
|
| 6 |
+
CSS,0.914577530176416,0.985,0.9484833895040924,2000.0
|
| 7 |
+
Dart,0.9702320887991928,0.9615,0.9658463083877449,2000.0
|
| 8 |
+
Go,0.964514539181863,0.9785,0.9714569372052618,2000.0
|
| 9 |
+
HTML,1.0,0.0035,0.006975585450921774,2000.0
|
| 10 |
+
Java,0.9453391898487067,0.9685,0.9567794517164732,2000.0
|
| 11 |
+
JavaScript,0.6626984126984127,0.9185,0.769907795473596,2000.0
|
| 12 |
+
Kotlin,0.9954034729315628,0.9745,0.9848408287013644,2000.0
|
| 13 |
+
Lua,0.967375185368265,0.9785,0.972905791697738,2000.0
|
| 14 |
+
Markdown,0.6245463543385021,0.9465,0.7525342874180083,2000.0
|
| 15 |
+
Python,0.9564152791380999,0.9765,0.9663532904502722,2000.0
|
| 16 |
+
Rust,0.9929078014184397,0.98,0.9864116758933065,2000.0
|
| 17 |
+
Typescript,0.9504175365344467,0.9105,0.9300306435137896,2000.0
|
| 18 |
+
accuracy,0.8940625,0.8940625,0.8940625,0.8940625
|
| 19 |
+
macro avg,0.9150511392412544,0.8940625,0.8701154593466122,32000.0
|
| 20 |
+
weighted avg,0.9150511392412544,0.8940625,0.8701154593466122,32000.0
|
SGD-Classifier/metrics/test_epoch_002_confusion_matrix.csv
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
|
| 2 |
+
Assembly,1965,27,0,1,0,0,0,0,1,1,0,3,2,0,0,0
|
| 3 |
+
C,3,1807,3,171,0,2,3,0,2,2,0,0,3,3,0,1
|
| 4 |
+
C#,3,15,1925,11,1,1,5,0,13,11,0,7,3,0,2,3
|
| 5 |
+
C++,8,206,4,1744,2,0,10,0,6,7,2,6,1,2,2,0
|
| 6 |
+
CSS,3,0,0,2,1974,2,1,0,0,15,1,0,2,0,0,0
|
| 7 |
+
Dart,0,3,4,3,1,1927,3,0,8,37,0,2,3,2,1,6
|
| 8 |
+
Go,0,1,0,2,2,0,1954,0,4,18,0,3,12,3,1,0
|
| 9 |
+
HTML,10,5,2,6,154,15,7,96,20,600,2,10,1027,35,1,10
|
| 10 |
+
Java,1,8,6,13,1,5,2,0,1937,12,2,5,4,3,0,1
|
| 11 |
+
JavaScript,6,4,8,5,15,23,4,1,15,1822,0,4,16,5,1,71
|
| 12 |
+
Kotlin,0,1,1,3,0,2,6,0,12,9,1948,4,8,3,0,3
|
| 13 |
+
Lua,3,6,7,2,0,0,3,0,2,5,1,1961,7,3,0,0
|
| 14 |
+
Markdown,3,8,1,10,1,1,10,0,8,17,1,3,1898,25,5,9
|
| 15 |
+
Python,2,2,1,1,2,7,4,0,0,10,1,2,15,1952,0,1
|
| 16 |
+
Rust,4,2,3,4,1,0,3,0,3,5,0,3,10,1,1961,0
|
| 17 |
+
Typescript,1,7,5,6,0,10,1,0,9,118,1,2,5,0,2,1833
|
SGD-Classifier/metrics/test_epoch_002_report.csv
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
,precision,recall,f1-score,support
|
| 2 |
+
Assembly,0.9766401590457257,0.9825,0.9795613160518445,2000.0
|
| 3 |
+
C,0.8596574690770694,0.9035,0.8810336421257923,2000.0
|
| 4 |
+
C#,0.9771573604060914,0.9625,0.9697732997481109,2000.0
|
| 5 |
+
C++,0.8790322580645161,0.872,0.8755020080321285,2000.0
|
| 6 |
+
CSS,0.9164345403899722,0.987,0.9504092441020703,2000.0
|
| 7 |
+
Dart,0.9659147869674185,0.9635,0.9647058823529412,2000.0
|
| 8 |
+
Go,0.9692460317460317,0.977,0.9731075697211156,2000.0
|
| 9 |
+
HTML,0.9896907216494846,0.048,0.09155937052932761,2000.0
|
| 10 |
+
Java,0.9495098039215686,0.9685,0.9589108910891089,2000.0
|
| 11 |
+
JavaScript,0.6775753068055039,0.911,0.7771379825122627,2000.0
|
| 12 |
+
Kotlin,0.9943848902501277,0.974,0.9840868906289467,2000.0
|
| 13 |
+
Lua,0.9732009925558313,0.9805,0.9768368617683686,2000.0
|
| 14 |
+
Markdown,0.6293103448275862,0.949,0.7567783094098883,2000.0
|
| 15 |
+
Python,0.958271968581247,0.976,0.9670547436215011,2000.0
|
| 16 |
+
Rust,0.9924089068825911,0.9805,0.9864185110663984,2000.0
|
| 17 |
+
Typescript,0.9458204334365325,0.9165,0.930929405789741,2000.0
|
| 18 |
+
accuracy,0.897,0.897,0.897,0.897
|
| 19 |
+
macro avg,0.9158909984129562,0.897,0.8764878705343466,32000.0
|
| 20 |
+
weighted avg,0.9158909984129562,0.897,0.8764878705343467,32000.0
|
SGD-Classifier/metrics/validation_epoch_001_confusion_matrix.csv
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
|
| 2 |
+
Assembly,1959,29,0,1,0,0,2,0,1,1,1,2,1,1,1,1
|
| 3 |
+
C,7,1830,7,139,1,2,2,0,1,1,1,2,1,3,2,1
|
| 4 |
+
C#,5,15,1899,13,0,1,9,0,23,11,1,14,2,1,2,4
|
| 5 |
+
C++,10,167,2,1795,0,0,3,0,9,4,0,2,3,4,1,0
|
| 6 |
+
CSS,3,3,1,0,1954,4,1,0,0,27,0,0,6,1,0,0
|
| 7 |
+
Dart,0,0,2,4,0,1944,0,0,4,35,0,1,6,1,1,2
|
| 8 |
+
Go,5,2,2,3,3,2,1945,0,2,20,0,3,6,5,0,2
|
| 9 |
+
HTML,7,10,6,9,178,8,10,2,36,646,3,11,1039,26,2,7
|
| 10 |
+
Java,1,5,7,15,1,3,7,0,1924,17,2,9,3,5,0,1
|
| 11 |
+
JavaScript,3,2,2,6,18,16,7,0,14,1844,4,10,10,4,2,58
|
| 12 |
+
Kotlin,2,5,2,6,1,3,3,0,13,15,1933,3,10,3,0,1
|
| 13 |
+
Lua,1,9,2,5,0,0,2,0,2,6,0,1959,6,5,2,1
|
| 14 |
+
Markdown,2,8,7,13,4,3,9,0,9,21,3,7,1879,25,5,5
|
| 15 |
+
Python,1,2,0,1,1,1,4,0,2,11,1,7,10,1958,0,1
|
| 16 |
+
Rust,3,7,3,6,0,0,5,0,3,2,0,0,1,1,1969,0
|
| 17 |
+
Typescript,1,1,4,5,1,6,3,0,13,129,1,0,11,3,1,1821
|
SGD-Classifier/metrics/validation_epoch_001_report.csv
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
,precision,recall,f1-score,support
|
| 2 |
+
Assembly,0.9746268656716418,0.9795,0.9770573566084788,2000.0
|
| 3 |
+
C,0.8735083532219571,0.915,0.8937728937728938,2000.0
|
| 4 |
+
C#,0.9758478931140802,0.9495,0.9624936644703497,2000.0
|
| 5 |
+
C++,0.8881741712023751,0.8975,0.8928127331509574,2000.0
|
| 6 |
+
CSS,0.9037927844588344,0.977,0.9389716482460355,2000.0
|
| 7 |
+
Dart,0.9754139488208731,0.972,0.9737039819684448,2000.0
|
| 8 |
+
Go,0.9666998011928429,0.9725,0.9695912263210369,2000.0
|
| 9 |
+
HTML,1.0,0.001,0.001998001998001998,2000.0
|
| 10 |
+
Java,0.9357976653696498,0.962,0.9487179487179487,2000.0
|
| 11 |
+
JavaScript,0.6609318996415771,0.922,0.7699373695198329,2000.0
|
| 12 |
+
Kotlin,0.9912820512820513,0.9665,0.9787341772151898,2000.0
|
| 13 |
+
Lua,0.9650246305418719,0.9795,0.9722084367245658,2000.0
|
| 14 |
+
Markdown,0.6275885103540414,0.9395,0.7525030036043252,2000.0
|
| 15 |
+
Python,0.956989247311828,0.979,0.967869500741473,2000.0
|
| 16 |
+
Rust,0.9904426559356136,0.9845,0.9874623871614845,2000.0
|
| 17 |
+
Typescript,0.9559055118110236,0.9105,0.9326504481434059,2000.0
|
| 18 |
+
accuracy,0.89421875,0.89421875,0.89421875,0.89421875
|
| 19 |
+
macro avg,0.9151266243706413,0.8942187500000001,0.8700302986477766,32000.0
|
| 20 |
+
weighted avg,0.9151266243706414,0.89421875,0.8700302986477766,32000.0
|
SGD-Classifier/metrics/validation_epoch_002_confusion_matrix.csv
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
|
| 2 |
+
Assembly,1956,31,0,2,0,0,2,0,1,1,1,1,2,1,1,1
|
| 3 |
+
C,5,1841,7,128,1,2,2,0,1,1,1,2,2,4,2,1
|
| 4 |
+
C#,4,15,1907,11,0,2,7,0,22,10,1,13,2,1,2,3
|
| 5 |
+
C++,11,174,4,1788,0,0,3,0,7,4,0,1,3,4,1,0
|
| 6 |
+
CSS,1,4,1,0,1956,4,1,0,0,25,0,0,7,1,0,0
|
| 7 |
+
Dart,0,0,1,1,0,1951,0,0,4,31,0,1,6,1,1,3
|
| 8 |
+
Go,5,2,3,3,2,3,1943,0,1,21,0,3,7,5,0,2
|
| 9 |
+
HTML,6,10,5,9,171,9,6,83,31,607,5,13,1009,27,2,7
|
| 10 |
+
Java,3,6,6,16,1,3,7,0,1927,12,2,9,3,4,0,1
|
| 11 |
+
JavaScript,3,1,3,6,19,17,7,0,13,1833,3,10,12,4,2,67
|
| 12 |
+
Kotlin,1,6,2,6,1,3,3,0,12,14,1934,3,11,3,0,1
|
| 13 |
+
Lua,1,8,3,6,0,0,2,0,1,6,0,1960,6,5,1,1
|
| 14 |
+
Markdown,2,9,7,12,4,3,9,0,10,19,3,2,1883,25,5,7
|
| 15 |
+
Python,1,2,0,1,1,1,3,0,2,12,2,7,11,1956,0,1
|
| 16 |
+
Rust,3,7,4,5,0,3,5,0,2,2,0,0,1,1,1967,0
|
| 17 |
+
Typescript,1,2,4,4,1,5,2,0,13,117,2,0,11,3,1,1834
|
SGD-Classifier/metrics/validation_epoch_002_report.csv
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
,precision,recall,f1-score,support
|
| 2 |
+
Assembly,0.9765351972041937,0.978,0.9772670497127155,2000.0
|
| 3 |
+
C,0.8692162417374882,0.9205,0.8941233608547838,2000.0
|
| 4 |
+
C#,0.9744506898313745,0.9535,0.9638615112458934,2000.0
|
| 5 |
+
C++,0.8948948948948949,0.894,0.894447223611806,2000.0
|
| 6 |
+
CSS,0.9068150208623088,0.978,0.9410632667789272,2000.0
|
| 7 |
+
Dart,0.9725822532402791,0.9755,0.9740389415876186,2000.0
|
| 8 |
+
Go,0.9705294705294706,0.9715,0.9710144927536232,2000.0
|
| 9 |
+
HTML,1.0,0.0415,0.07969275084013443,2000.0
|
| 10 |
+
Java,0.9413776257938447,0.9635,0.9523103533481592,2000.0
|
| 11 |
+
JavaScript,0.6751381215469613,0.9165,0.7775185577942736,2000.0
|
| 12 |
+
Kotlin,0.9897645854657113,0.967,0.9782498735457764,2000.0
|
| 13 |
+
Lua,0.9679012345679012,0.98,0.9739130434782609,2000.0
|
| 14 |
+
Markdown,0.6327284946236559,0.9415,0.7568327974276527,2000.0
|
| 15 |
+
Python,0.956479217603912,0.978,0.9671199011124846,2000.0
|
| 16 |
+
Rust,0.9909319899244332,0.9835,0.9872020075282308,2000.0
|
| 17 |
+
Typescript,0.9507516848107828,0.917,0.9335708831763807,2000.0
|
| 18 |
+
accuracy,0.89746875,0.89746875,0.89746875,0.89746875
|
| 19 |
+
macro avg,0.9168810451648257,0.89746875,0.8763891259247951,32000.0
|
| 20 |
+
weighted avg,0.9168810451648257,0.89746875,0.876389125924795,32000.0
|
SGD-Classifier/models/epoch_001.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d3cb9dfc421e9f5eed281199cd3e6ac7d41b4dc5d13efce09f60949d830b2eee
|
| 3 |
+
size 16779530
|
SGD-Classifier/models/epoch_002.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3941f702f4d9d1bee58087062178846a70c34311a10329e6eca075a9a4603633
|
| 3 |
+
size 16779530
|