kaushik-harsh-99 committed 2 days ago

Commit

95f644c

1 Parent(s): 11eb6e9

initial-upload

Browse files

Files changed (21) hide show

FastText/FastText-Test.py +195 -0
FastText/FastText.py +268 -0
FastText/convert-to-fast-text-format.py +40 -0
FastText/fasttext_language_classifier.bin +3 -0
FastText/fasttext_summary.csv +2 -0
FastText/test_classification_report.csv +20 -0
FastText/test_confusion_matrix.csv +17 -0
FastText/validation_classification_report.csv +20 -0
FastText/validation_confusion_matrix.csv +17 -0
SGD-Classifier/Logistic-Regresssion.py +369 -0
SGD-Classifier/metrics/epoch_summary.csv +3 -0
SGD-Classifier/metrics/test_epoch_001_confusion_matrix.csv +17 -0
SGD-Classifier/metrics/test_epoch_001_report.csv +20 -0
SGD-Classifier/metrics/test_epoch_002_confusion_matrix.csv +17 -0
SGD-Classifier/metrics/test_epoch_002_report.csv +20 -0
SGD-Classifier/metrics/validation_epoch_001_confusion_matrix.csv +17 -0
SGD-Classifier/metrics/validation_epoch_001_report.csv +20 -0
SGD-Classifier/metrics/validation_epoch_002_confusion_matrix.csv +17 -0
SGD-Classifier/metrics/validation_epoch_002_report.csv +20 -0
SGD-Classifier/models/epoch_001.pkl +3 -0
SGD-Classifier/models/epoch_002.pkl +3 -0

FastText/FastText-Test.py ADDED Viewed

	@@ -0,0 +1,195 @@

+import json
+import fasttext
+import pandas as pd
+from sklearn.metrics import (
+    accuracy_score,
+    classification_report,
+    confusion_matrix,
+)
# ------------------------------------------------------------
# Configuration
# Relative paths: they are resolved against the current working
# directory when the script runs.
# ------------------------------------------------------------
TEST_FILE = "dataset/test.jsonl"
VALIDATION_FILE = "dataset/validation.jsonl"
MODEL_FILE = "fasttext_language_classifier.bin"

# ------------------------------------------------------------
# Load the saved FastText model once, before any evaluation
# ------------------------------------------------------------
print("Loading model...")
model = fasttext.load_model(MODEL_FILE)
print("Model loaded.")
# ============================================================
# EVALUATION
# ============================================================
def evaluate_jsonl(
    model,
    jsonl_file,
    split_name,
):
    """Evaluate a FastText model on one JSONL split and save its metrics.

    Every non-blank line of ``jsonl_file`` must be a JSON object with a
    ``"label"`` and a ``"content"`` field. The content is normalised the
    same way the training-data converter does it (string coercion,
    ``__label__`` neutralised to ``__lbl__``, whitespace collapsed) so the
    model sees evaluation text in the format it was trained on.

    Args:
        model: Object exposing ``predict(text, k=1)`` that returns a
            ``(labels, probabilities)`` pair, as FastText models do.
        jsonl_file: Path of the JSONL file to evaluate.
        split_name: Name used in console messages and as the prefix of
            the two CSV files written to the current directory.

    Returns:
        The accuracy over all evaluated rows, as computed by
        ``accuracy_score``.

    Side effects:
        Writes ``{split_name}_classification_report.csv`` and
        ``{split_name}_confusion_matrix.csv``.
    """
    label_prefix = "__label__"

    print(f"\nEvaluating {split_name}")
    y_true = []
    y_pred = []
    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # crashing inside json.loads.
            if not line.strip():
                continue
            row = json.loads(line)

            # Same label/text normalisation as the training converter.
            true_label = str(row["label"]).strip()
            text = str(row["content"]).replace(label_prefix, "__lbl__")
            text = " ".join(text.split())

            labels, _ = model.predict(text, k=1)
            # Strip only the leading marker; a label that happens to
            # contain "__label__" elsewhere must stay intact.
            pred_label = labels[0]
            if pred_label.startswith(label_prefix):
                pred_label = pred_label[len(label_prefix):]

            y_true.append(true_label)
            y_pred.append(pred_label)
            if len(y_pred) % 5000 == 0:
                print(f"Processed {len(y_pred):,}")

    # ========================================================
    # ACCURACY
    # ========================================================
    acc = accuracy_score(y_true, y_pred)
    print(f"\n{split_name} Accuracy: {acc:.6f}")

    # ========================================================
    # CLASSIFICATION REPORT
    # ========================================================
    report = classification_report(
        y_true,
        y_pred,
        output_dict=True,
        digits=4,
    )
    report_csv = f"{split_name}_classification_report.csv"
    pd.DataFrame(report).transpose().to_csv(report_csv)
    print(f"Saved {report_csv}")

    # ========================================================
    # CONFUSION MATRIX
    # ========================================================
    # Include labels that were only ever predicted; otherwise those
    # predictions would silently vanish from the matrix.
    labels_sorted = sorted(set(y_true) | set(y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=labels_sorted)
    cm_csv = f"{split_name}_confusion_matrix.csv"
    pd.DataFrame(
        cm,
        index=labels_sorted,
        columns=labels_sorted,
    ).to_csv(cm_csv)
    print(f"Saved {cm_csv}")
    return acc
# ------------------------------------------------------------
# Score both splits: validation first, then test
# ------------------------------------------------------------
validation_accuracy = evaluate_jsonl(model, VALIDATION_FILE, "validation")
test_accuracy = evaluate_jsonl(model, TEST_FILE, "test")

# ------------------------------------------------------------
# One-row summary CSV holding both accuracies
# ------------------------------------------------------------
summary = pd.DataFrame(
    {
        "validation_accuracy": [validation_accuracy],
        "test_accuracy": [test_accuracy],
    }
)
summary.to_csv("fasttext_summary.csv", index=False)
print("\nSaved fasttext_summary.csv")

# Final console recap.
print("\n==============================")
print(f"Validation Accuracy: {validation_accuracy:.6f}")
print(f"Test Accuracy:       {test_accuracy:.6f}")
print("==============================")
print("\nDone.")

FastText/FastText.py ADDED Viewed

	@@ -0,0 +1,268 @@

+import json
+import os
+import time
+import fasttext
+import pandas as pd
+from sklearn.metrics import (
+    accuracy_score,
+    classification_report,
+    confusion_matrix,
+)
# ============================================================
# CONFIG
# ============================================================
TRAIN_FILE = "fasttext_train.txt"
VALIDATION_JSONL = "dataset/validation.jsonl"
TEST_JSONL = "dataset/test.jsonl"
MODEL_FILE = "fasttext_language_classifier.bin"
EPOCHS = 25
LR = 0.7
DIM = 50
WORD_NGRAMS = 3
MINN = 2
MAXN = 5
MIN_COUNT = 100
BUCKET = 50000
# os.cpu_count() returns None when the core count cannot be determined;
# fall back to one thread rather than handing None to the trainer.
THREADS = os.cpu_count() or 1
# ============================================================
# TRAIN
# ============================================================
print("Training FastText...")
print()
start = time.time()
model = fasttext.train_supervised(
    input=TRAIN_FILE,
    epoch=EPOCHS,
    lr=LR,
    dim=DIM,
    wordNgrams=WORD_NGRAMS,
    minn=MINN,
    maxn=MAXN,
    minCount=MIN_COUNT,
    bucket=BUCKET,
    loss="softmax",
    thread=THREADS,
    verbose=2,
)
elapsed = time.time() - start
print()
print(f"Training completed in {elapsed:.1f}s")
# ============================================================
# LABEL DEBUG
# ============================================================
# Print the label set the trainer picked up from TRAIN_FILE so a
# malformed training file (missing or unexpected labels) is easy to spot.
print()
print("Labels found by FastText:")
print(f"Count: {len(model.labels)}")
for label in model.labels:
    print(label)
# ============================================================
# SAVE MODEL
# ============================================================
model.save_model(MODEL_FILE)
# Size on disk in MiB; also recorded in the summary CSV further down.
size_mb = os.path.getsize(MODEL_FILE) / 1024 / 1024
print()
print(f"Saved model: {MODEL_FILE}")
print(f"Model size: {size_mb:.2f} MB")
# ============================================================
# EVALUATION
# ============================================================
def evaluate_jsonl(
    model,
    jsonl_file,
    split_name,
):
    """Evaluate a FastText model on one JSONL split and save its metrics.

    Every non-blank line of ``jsonl_file`` must be a JSON object with a
    ``"label"`` and a ``"content"`` field. The content is normalised the
    same way the training-data converter does it (string coercion,
    ``__label__`` neutralised to ``__lbl__``, whitespace collapsed) so the
    model sees evaluation text in the format it was trained on.

    Args:
        model: Object exposing ``predict(text, k=1)`` that returns a
            ``(labels, probabilities)`` pair, as FastText models do.
        jsonl_file: Path of the JSONL file to evaluate.
        split_name: Name used in console messages and as the prefix of
            the two CSV files written to the current directory.

    Returns:
        The accuracy over all evaluated rows, as computed by
        ``accuracy_score``.

    Side effects:
        Writes ``{split_name}_classification_report.csv`` and
        ``{split_name}_confusion_matrix.csv``.
    """
    label_prefix = "__label__"

    print()
    print(f"Evaluating {split_name}")
    y_true = []
    y_pred = []
    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # crashing inside json.loads.
            if not line.strip():
                continue
            row = json.loads(line)

            # Same label/text normalisation as the training converter.
            true_label = str(row["label"]).strip()
            text = str(row["content"]).replace(label_prefix, "__lbl__")
            text = " ".join(text.split())

            labels, _ = model.predict(text, k=1)
            # Strip only the leading marker; a label that happens to
            # contain "__label__" elsewhere must stay intact.
            pred_label = labels[0]
            if pred_label.startswith(label_prefix):
                pred_label = pred_label[len(label_prefix):]

            y_true.append(true_label)
            y_pred.append(pred_label)
            if len(y_pred) % 5000 == 0:
                print(f"Processed {len(y_pred):,}")

    # ========================================================
    # ACCURACY
    # ========================================================
    accuracy = accuracy_score(y_true, y_pred)
    print()
    print(f"{split_name} Accuracy: {accuracy:.6f}")

    # ========================================================
    # CLASSIFICATION REPORT
    # ========================================================
    report = classification_report(
        y_true,
        y_pred,
        output_dict=True,
        digits=4,
    )
    report_file = f"{split_name}_classification_report.csv"
    pd.DataFrame(report).transpose().to_csv(report_file)
    print(f"Saved {report_file}")

    # ========================================================
    # CONFUSION MATRIX
    # ========================================================
    # Include labels that were only ever predicted; otherwise those
    # predictions would silently vanish from the matrix.
    labels_sorted = sorted(set(y_true) | set(y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=labels_sorted)
    cm_file = f"{split_name}_confusion_matrix.csv"
    pd.DataFrame(
        cm,
        index=labels_sorted,
        columns=labels_sorted,
    ).to_csv(cm_file)
    print(f"Saved {cm_file}")
    return accuracy
# ------------------------------------------------------------
# Score both splits: validation first, then test
# ------------------------------------------------------------
validation_accuracy = evaluate_jsonl(model, VALIDATION_JSONL, "validation")
test_accuracy = evaluate_jsonl(model, TEST_JSONL, "test")

# ------------------------------------------------------------
# One-row summary CSV: accuracies next to the hyper-parameters
# and the on-disk model size of this run
# ------------------------------------------------------------
summary_row = {
    "validation_accuracy": validation_accuracy,
    "test_accuracy": test_accuracy,
    "epochs": EPOCHS,
    "lr": LR,
    "dim": DIM,
    "word_ngrams": WORD_NGRAMS,
    "min_count": MIN_COUNT,
    "bucket": BUCKET,
    "model_size_mb": size_mb,
}
summary = pd.DataFrame([summary_row])
summary.to_csv("fasttext_summary.csv", index=False)

# Final console recap.
rule = "=" * 60
print()
print(rule)
print(f"Validation Accuracy : {validation_accuracy:.6f}")
print(f"Test Accuracy       : {test_accuracy:.6f}")
print(f"Model Size (MB)     : {size_mb:.2f}")
print(rule)
print()
print("Done.")

FastText/convert-to-fast-text-format.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import json
# Maps each source JSONL split to the FastText-format text file written
# for it (one "__label__<label> <text>" sample per line).
FILES = {
    "dataset/train.jsonl": "fasttext_train.txt",
    "dataset/validation.jsonl": "fasttext_validation.txt",
    "dataset/test.jsonl": "fasttext_test.txt",
}
for input_file, output_file in FILES.items():
    print(f"Converting {input_file} -> {output_file}")
    count = 0
    with open(input_file, "r", encoding="utf-8") as fin, \
         open(output_file, "w", encoding="utf-8") as fout:
        for line in fin:
            # Skip blank lines (e.g. a trailing empty line) instead of
            # crashing inside json.loads.
            if not line.strip():
                continue
            row = json.loads(line)
            label = str(row["label"]).strip()
            text = str(row["content"])
            # Neutralise any literal label marker inside the sample so it
            # cannot be read back as a label token.
            text = text.replace("__label__", "__lbl__")
            # Collapse all whitespace (newlines included) so each sample
            # occupies exactly one output line.
            text = " ".join(text.split())
            fout.write(
                f"__label__{label} {text}\n"
            )
            count += 1
    # count is the number of samples actually written, blanks excluded.
    print(f"Saved {count:,} samples")
print("\nDone.")

FastText/fasttext_language_classifier.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8734bd145050cf8c458943d1fec8a311410bf2f7f21b89c677c42c1ec3d4d39
+size 38263405

FastText/fasttext_summary.csv ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ validation_accuracy,test_accuracy,epochs,lr,dim,word_ngrams,min_count,bucket,model_size_mb
2	+ 0.9555,0.953125,25,0.7,50,3,100,50000,36.49082660675049

FastText/test_classification_report.csv ADDED Viewed

	@@ -0,0 +1,20 @@

+,precision,recall,f1-score,support
+Assembly,0.9874874874874875,0.9865,0.9869934967483742,2000.0
+C,0.9132374814080317,0.921,0.9171023151605676,2000.0
+C#,0.9763937719738824,0.972,0.974191931846655,2000.0
+C++,0.9087261785356068,0.906,0.9073610415623435,2000.0
+CSS,0.9709072978303748,0.9845,0.977656405163853,2000.0
+Dart,0.9794589178356713,0.9775,0.9784784784784785,2000.0
+Go,0.9725411882176734,0.974,0.9732700474644017,2000.0
+HTML,0.896236012207528,0.881,0.8885526979324256,2000.0
+Java,0.9676777722526106,0.973,0.9703315881326352,2000.0
+JavaScript,0.851581508515815,0.875,0.8631319358816276,2000.0
+Kotlin,0.9863979848866499,0.979,0.9826850690087829,2000.0
+Lua,0.9859084046300957,0.9795,0.9826937547027841,2000.0
+Markdown,0.9464196294441662,0.945,0.9457092819614711,2000.0
+Python,0.9853609288238263,0.976,0.9806581260989701,2000.0
+Rust,0.9894736842105263,0.987,0.9882352941176471,2000.0
+Typescript,0.9348697394789579,0.933,0.933933933933934,2000.0
+accuracy,0.953125,0.953125,0.953125,0.953125
+macro avg,0.9532923742336815,0.953125,0.9531865873871844,32000.0
+weighted avg,0.9532923742336815,0.953125,0.9531865873871844,32000.0

FastText/test_confusion_matrix.csv ADDED Viewed

	@@ -0,0 +1,17 @@

+,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
+Assembly,1973,13,0,6,1,0,1,0,0,1,0,3,1,0,0,1
+C,9,1842,5,123,0,4,0,3,3,1,1,1,4,1,2,1
+C#,0,4,1944,6,1,2,6,1,8,6,0,3,7,2,3,7
+C++,2,132,6,1812,1,2,5,7,8,5,2,5,5,1,4,3
+CSS,0,0,1,0,1969,1,1,17,1,7,1,0,1,1,0,0
+Dart,0,1,1,0,1,1955,4,5,4,17,0,2,3,3,1,3
+Go,1,0,2,6,2,1,1948,6,6,10,2,1,9,2,1,3
+HTML,3,1,2,4,36,3,8,1762,3,125,4,3,32,3,0,11
+Java,0,9,7,6,0,9,4,5,1946,2,1,2,6,0,0,3
+JavaScript,1,3,3,6,12,7,9,98,7,1750,7,3,10,1,1,82
+Kotlin,1,0,1,0,0,1,5,4,11,8,1958,2,3,1,0,5
+Lua,1,7,7,5,1,0,1,5,2,5,2,1959,3,1,1,0
+Markdown,3,3,5,8,3,2,4,31,6,17,3,0,1890,12,3,10
+Python,0,1,2,1,0,3,3,8,0,5,3,3,16,1952,2,1
+Rust,3,0,2,5,1,2,1,2,2,1,0,0,6,1,1974,0
+Typescript,1,1,3,6,0,4,3,12,4,95,1,0,1,0,3,1866

FastText/validation_classification_report.csv ADDED Viewed

	@@ -0,0 +1,20 @@

+,precision,recall,f1-score,support
+Assembly,0.9929789368104313,0.99,0.9914872308462694,2000.0
+C,0.9302558956347216,0.927,0.9286250939143501,2000.0
+C#,0.977710233029382,0.965,0.9713135379969804,2000.0
+C++,0.912873225648556,0.9325,0.9225822409102152,2000.0
+CSS,0.961895456765999,0.9845,0.9730664689893749,2000.0
+Dart,0.9804511278195489,0.978,0.979224030037547,2000.0
+Go,0.9788199697428139,0.9705,0.9746422294752699,2000.0
+HTML,0.8991383679675621,0.887,0.8930279385854518,2000.0
+Java,0.9728370221327968,0.967,0.9699097291875627,2000.0
+JavaScript,0.8583252190847127,0.8815,0.8697582634435126,2000.0
+Kotlin,0.9859508278976418,0.9825,0.9842223891810669,2000.0
+Lua,0.986404833836858,0.9795,0.982940291018565,2000.0
+Markdown,0.947289156626506,0.9435,0.9453907815631263,2000.0
+Python,0.977977977977978,0.977,0.9774887443721861,2000.0
+Rust,0.9875,0.9875,0.9875,2000.0
+Typescript,0.9411172622043281,0.935,0.9380486581389516,2000.0
+accuracy,0.9555,0.9555,0.9555,0.9555
+macro avg,0.9557203445737397,0.9555,0.9555767267287768,32000.0
+weighted avg,0.9557203445737398,0.9555,0.9555767267287769,32000.0

FastText/validation_confusion_matrix.csv ADDED Viewed

	@@ -0,0 +1,17 @@

+,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
+Assembly,1980,8,1,3,0,0,1,2,0,1,0,0,1,1,1,1
+C,4,1854,5,118,1,0,1,4,2,1,1,2,4,0,2,1
+C#,2,7,1930,13,2,3,3,3,12,8,2,3,3,0,4,5
+C++,2,97,3,1865,0,2,3,3,7,4,0,1,6,1,3,3
+CSS,0,1,0,0,1969,2,0,18,0,7,1,0,2,0,0,0
+Dart,0,0,1,3,2,1956,2,4,0,18,2,0,6,1,1,4
+Go,1,3,3,3,3,3,1941,4,4,15,2,4,7,2,3,2
+HTML,0,2,4,3,53,5,4,1774,1,104,3,3,30,9,2,3
+Java,2,2,10,12,1,3,1,6,1934,9,5,0,5,2,0,8
+JavaScript,2,1,4,2,12,12,11,83,7,1763,6,3,13,4,1,76
+Kotlin,0,1,2,0,0,5,0,4,6,10,1965,1,4,1,0,1
+Lua,0,7,4,2,1,1,2,1,3,6,2,1959,1,8,1,2
+Markdown,0,4,4,7,2,1,7,39,6,11,1,5,1887,13,5,8
+Python,0,1,1,1,0,1,5,9,1,4,2,5,14,1954,2,0
+Rust,1,4,1,7,0,0,1,2,1,2,0,0,2,1,1975,3
+Typescript,0,1,1,4,1,1,1,17,4,91,1,0,7,1,0,1870

SGD-Classifier/Logistic-Regresssion.py ADDED Viewed

	@@ -0,0 +1,369 @@

+import json
+import os
+import time
+import joblib
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import HashingVectorizer
+from sklearn.linear_model import SGDClassifier
+from sklearn.metrics import (
+    accuracy_score,
+    classification_report,
+    confusion_matrix,
+)
# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
# Input splits (JSONL).
TRAIN_FILE = "dataset/train.jsonl"
VALIDATION_FILE = "dataset/validation.jsonl"
TEST_FILE = "dataset/test.jsonl"

# Training schedule.
BATCH_SIZE = 20000
EPOCHS = 10

# Feature hashing: character n-grams hashed into 2**17 columns.
N_FEATURES = 2**17
NGRAM_RANGE = (2, 6)

# Output locations.
MODEL_DIR = "models"
METRICS_DIR = "metrics"

# ------------------------------------------------------------
# Make sure both output directories exist before anything is written
# ------------------------------------------------------------
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(METRICS_DIR, exist_ok=True)
# ============================================================
# HELPERS
# ============================================================
def jsonl_batch_reader(path, batch_size):
    """Stream a JSONL file as ``(texts, labels)`` batches.

    Each non-blank line must be a JSON object with ``"content"`` and
    ``"label"`` fields. Blank lines are skipped so a stray empty line does
    not abort the stream with a JSON decoding error.

    Args:
        path: Path of the JSONL file (read as UTF-8).
        batch_size: Number of rows per yielded batch.

    Yields:
        Tuples ``(texts, labels)`` of two equally long lists. Every batch
        holds ``batch_size`` rows except possibly the last, which carries
        the remainder. Nothing is yielded for a file without rows.
    """
    texts = []
    labels = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            row = json.loads(line)
            texts.append(row["content"])
            labels.append(row["label"])
            if len(texts) >= batch_size:
                yield texts, labels
                # Start fresh lists; the consumer may still hold the
                # batch that was just yielded.
                texts = []
                labels = []
    if texts:
        yield texts, labels
def load_split(path):
    """Load an entire JSONL split into memory.

    Each non-blank line must be a JSON object with ``"content"`` and
    ``"label"`` fields. Blank lines are skipped so a stray empty line does
    not abort loading with a JSON decoding error.

    Args:
        path: Path of the JSONL file (read as UTF-8).

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists, in file
        order.
    """
    texts = []
    labels = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            row = json.loads(line)
            texts.append(row["content"])
            labels.append(row["label"])
    return texts, labels
def evaluate_split(
    model,
    vectorizer,
    split_name,
    texts,
    labels,
    epoch,
):
    """Score one in-memory split and write its per-epoch metric files.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Object exposing ``transform`` that turns ``texts``
            into the feature matrix ``model`` expects.
        split_name: Name used in console messages and output file names.
        texts: Samples to classify.
        labels: True labels, aligned with ``texts``.
        epoch: Integer epoch number, zero-padded to three digits in the
            output file names.

    Returns:
        The accuracy computed by ``accuracy_score``.

    Side effects:
        Writes ``{split_name}_epoch_{epoch:03d}_report.csv`` and
        ``{split_name}_epoch_{epoch:03d}_confusion_matrix.csv`` into
        ``METRICS_DIR``.
    """
    print(f"\nEvaluating {split_name}")
    X = vectorizer.transform(texts)
    preds = model.predict(X)
    acc = accuracy_score(labels, preds)
    print(f"{split_name} accuracy: {acc:.6f}")

    report = classification_report(
        labels,
        preds,
        output_dict=True,
        digits=4,
    )
    report_path = os.path.join(
        METRICS_DIR,
        f"{split_name}_epoch_{epoch:03d}_report.csv",
    )
    pd.DataFrame(report).transpose().to_csv(report_path)

    # Build the matrix over true AND predicted labels: a class the model
    # predicts but that never occurs in this split would otherwise be
    # dropped from the matrix without notice.
    labels_sorted = sorted(
        set(labels).union(np.asarray(preds).tolist())
    )
    cm = confusion_matrix(
        labels,
        preds,
        labels=labels_sorted,
    )
    cm_path = os.path.join(
        METRICS_DIR,
        f"{split_name}_epoch_{epoch:03d}_confusion_matrix.csv",
    )
    pd.DataFrame(
        cm,
        index=labels_sorted,
        columns=labels_sorted,
    ).to_csv(cm_path)
    return acc
# ============================================================
# INFO
# ============================================================
print(f"CPU Cores: {os.cpu_count()}")
# ============================================================
# LOAD VALIDATION + TEST ONCE
# ============================================================
# Both splits stay in memory so they are parsed once instead of once
# per epoch.
print("Loading validation set...")
val_texts, val_labels = load_split(VALIDATION_FILE)
print("Loading test set...")
test_texts, test_labels = load_split(TEST_FILE)
# ============================================================
# VECTORIZER
# ============================================================
# HashingVectorizer is stateless, so transform() is used directly on
# every batch without a prior fit.
vectorizer = HashingVectorizer(
    analyzer="char",
    ngram_range=NGRAM_RANGE,
    n_features=N_FEATURES,
    alternate_sign=False,
    lowercase=False,
)
# ============================================================
# DISCOVER CLASSES
# ============================================================
# The first partial_fit() call needs the complete label set, so make one
# pass over the training file that only collects labels.
print("Discovering classes...")
all_classes = set()
for _, labels in jsonl_batch_reader(TRAIN_FILE, BATCH_SIZE):
    all_classes.update(labels)
all_classes = np.array(sorted(all_classes))
print("\nClasses:")
print(all_classes)
# ============================================================
# MODEL
# ============================================================
model = SGDClassifier(
    loss="log_loss",
    alpha=1e-6,
    max_iter=1,
    warm_start=True,
    verbose=1,
    random_state=42,
)
# ============================================================
# TRAIN
# ============================================================
epoch_results = []
first_fit = True
overall_start = time.time()
for epoch in range(EPOCHS):
    print("\n" + "=" * 80)
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("=" * 80)
    epoch_start = time.time()
    batch_count = 0
    # One epoch = one sequential pass over the training file, fed to the
    # model batch by batch.
    for texts, labels in jsonl_batch_reader(TRAIN_FILE, BATCH_SIZE):
        batch_count += 1
        print(f"Epoch {epoch+1} | Batch {batch_count}")
        X = vectorizer.transform(texts)
        if first_fit:
            # Only the very first call has to declare the label set.
            model.partial_fit(X, labels, classes=all_classes)
            first_fit = False
        else:
            model.partial_fit(X, labels)
    epoch_time = time.time() - epoch_start
    print(f"\nEpoch finished in {epoch_time:.1f}s")
    # ========================================================
    # SAVE MODEL
    # ========================================================
    # One checkpoint per epoch, bundling the vectorizer with the model.
    model_path = os.path.join(
        MODEL_DIR,
        f"epoch_{epoch+1:03d}.pkl",
    )
    joblib.dump(
        {
            "model": model,
            "vectorizer": vectorizer,
        },
        model_path,
    )
    print(f"Saved {model_path}")
    # ========================================================
    # VALIDATION
    # ========================================================
    val_acc = evaluate_split(
        model,
        vectorizer,
        "validation",
        val_texts,
        val_labels,
        epoch + 1,
    )
    # ========================================================
    # TEST
    # ========================================================
    test_acc = evaluate_split(
        model,
        vectorizer,
        "test",
        test_texts,
        test_labels,
        epoch + 1,
    )
    epoch_results.append(
        {
            "epoch": epoch + 1,
            "validation_accuracy": val_acc,
            "test_accuracy": test_acc,
            "epoch_time_seconds": epoch_time,
        }
    )
    # Rewrite the running summary after every epoch so the results so
    # far are on disk even if a later epoch never completes.
    pd.DataFrame(epoch_results).to_csv(
        os.path.join(METRICS_DIR, "epoch_summary.csv"),
        index=False,
    )
# ============================================================
# FINAL
# ============================================================
total_time = time.time() - overall_start
print("\nTraining Complete")
print(f"Total training time: {total_time:.1f}s")
summary_df = pd.DataFrame(epoch_results)
# idxmax() returns an index *label*, so look the row up with .loc
# (label-based) rather than .iloc (position-based).
best_val_epoch = summary_df["validation_accuracy"].idxmax()
best_row = summary_df.loc[best_val_epoch]
print("\nBest Epoch")
print(best_row)
summary_df.to_csv(
    os.path.join(METRICS_DIR, "final_summary.csv"),
    index=False,
)
print("\nDone.")

SGD-Classifier/metrics/epoch_summary.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+epoch,validation_accuracy,test_accuracy,epoch_time_seconds
+1,0.89421875,0.8940625,3927.5892601013184
+2,0.89746875,0.897,3837.121087551117

SGD-Classifier/metrics/test_epoch_001_confusion_matrix.csv ADDED Viewed

	@@ -0,0 +1,17 @@

+,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
+Assembly,1967,26,0,1,0,0,0,0,1,1,0,3,1,0,0,0
+C,3,1798,3,176,0,1,3,0,3,3,0,3,3,3,0,1
+C#,3,12,1925,11,0,1,7,0,13,13,0,8,2,0,2,3
+C++,10,193,4,1756,1,0,11,0,6,7,2,5,1,2,2,0
+CSS,3,0,0,3,1970,2,1,0,1,18,0,0,2,0,0,0
+Dart,0,3,5,3,1,1923,4,0,11,38,0,1,2,3,1,5
+Go,0,1,0,1,1,0,1957,0,3,17,0,4,12,4,0,0
+HTML,11,5,2,7,161,12,9,7,25,648,1,12,1056,34,1,9
+Java,1,5,6,14,1,5,2,0,1937,13,2,6,4,3,0,1
+JavaScript,6,4,7,4,14,18,6,0,16,1837,0,4,14,6,1,63
+Kotlin,0,1,1,2,0,2,6,0,12,9,1949,5,7,3,0,3
+Lua,2,9,7,2,1,0,4,0,2,6,1,1957,5,4,0,0
+Markdown,4,7,1,10,1,2,10,0,7,17,1,7,1893,26,5,9
+Python,2,2,1,0,2,7,5,0,0,10,1,2,14,1953,0,1
+Rust,4,2,3,4,1,0,3,0,3,5,0,4,10,1,1960,0
+Typescript,2,7,5,6,0,9,1,0,9,130,1,2,5,0,2,1821

SGD-Classifier/metrics/test_epoch_001_report.csv ADDED Viewed

	@@ -0,0 +1,20 @@

+,precision,recall,f1-score,support
+Assembly,0.9747274529236868,0.9835,0.9790940766550522,2000.0
+C,0.8665060240963856,0.899,0.8824539877300613,2000.0
+C#,0.9771573604060914,0.9625,0.9697732997481109,2000.0
+C++,0.878,0.878,0.878,2000.0
+CSS,0.914577530176416,0.985,0.9484833895040924,2000.0
+Dart,0.9702320887991928,0.9615,0.9658463083877449,2000.0
+Go,0.964514539181863,0.9785,0.9714569372052618,2000.0
+HTML,1.0,0.0035,0.006975585450921774,2000.0
+Java,0.9453391898487067,0.9685,0.9567794517164732,2000.0
+JavaScript,0.6626984126984127,0.9185,0.769907795473596,2000.0
+Kotlin,0.9954034729315628,0.9745,0.9848408287013644,2000.0
+Lua,0.967375185368265,0.9785,0.972905791697738,2000.0
+Markdown,0.6245463543385021,0.9465,0.7525342874180083,2000.0
+Python,0.9564152791380999,0.9765,0.9663532904502722,2000.0
+Rust,0.9929078014184397,0.98,0.9864116758933065,2000.0
+Typescript,0.9504175365344467,0.9105,0.9300306435137896,2000.0
+accuracy,0.8940625,0.8940625,0.8940625,0.8940625
+macro avg,0.9150511392412544,0.8940625,0.8701154593466122,32000.0
+weighted avg,0.9150511392412544,0.8940625,0.8701154593466122,32000.0

SGD-Classifier/metrics/test_epoch_002_confusion_matrix.csv ADDED Viewed

	@@ -0,0 +1,17 @@

+,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
+Assembly,1965,27,0,1,0,0,0,0,1,1,0,3,2,0,0,0
+C,3,1807,3,171,0,2,3,0,2,2,0,0,3,3,0,1
+C#,3,15,1925,11,1,1,5,0,13,11,0,7,3,0,2,3
+C++,8,206,4,1744,2,0,10,0,6,7,2,6,1,2,2,0
+CSS,3,0,0,2,1974,2,1,0,0,15,1,0,2,0,0,0
+Dart,0,3,4,3,1,1927,3,0,8,37,0,2,3,2,1,6
+Go,0,1,0,2,2,0,1954,0,4,18,0,3,12,3,1,0
+HTML,10,5,2,6,154,15,7,96,20,600,2,10,1027,35,1,10
+Java,1,8,6,13,1,5,2,0,1937,12,2,5,4,3,0,1
+JavaScript,6,4,8,5,15,23,4,1,15,1822,0,4,16,5,1,71
+Kotlin,0,1,1,3,0,2,6,0,12,9,1948,4,8,3,0,3
+Lua,3,6,7,2,0,0,3,0,2,5,1,1961,7,3,0,0
+Markdown,3,8,1,10,1,1,10,0,8,17,1,3,1898,25,5,9
+Python,2,2,1,1,2,7,4,0,0,10,1,2,15,1952,0,1
+Rust,4,2,3,4,1,0,3,0,3,5,0,3,10,1,1961,0
+Typescript,1,7,5,6,0,10,1,0,9,118,1,2,5,0,2,1833

SGD-Classifier/metrics/test_epoch_002_report.csv ADDED Viewed

	@@ -0,0 +1,20 @@

+,precision,recall,f1-score,support
+Assembly,0.9766401590457257,0.9825,0.9795613160518445,2000.0
+C,0.8596574690770694,0.9035,0.8810336421257923,2000.0
+C#,0.9771573604060914,0.9625,0.9697732997481109,2000.0
+C++,0.8790322580645161,0.872,0.8755020080321285,2000.0
+CSS,0.9164345403899722,0.987,0.9504092441020703,2000.0
+Dart,0.9659147869674185,0.9635,0.9647058823529412,2000.0
+Go,0.9692460317460317,0.977,0.9731075697211156,2000.0
+HTML,0.9896907216494846,0.048,0.09155937052932761,2000.0
+Java,0.9495098039215686,0.9685,0.9589108910891089,2000.0
+JavaScript,0.6775753068055039,0.911,0.7771379825122627,2000.0
+Kotlin,0.9943848902501277,0.974,0.9840868906289467,2000.0
+Lua,0.9732009925558313,0.9805,0.9768368617683686,2000.0
+Markdown,0.6293103448275862,0.949,0.7567783094098883,2000.0
+Python,0.958271968581247,0.976,0.9670547436215011,2000.0
+Rust,0.9924089068825911,0.9805,0.9864185110663984,2000.0
+Typescript,0.9458204334365325,0.9165,0.930929405789741,2000.0
+accuracy,0.897,0.897,0.897,0.897
+macro avg,0.9158909984129562,0.897,0.8764878705343466,32000.0
+weighted avg,0.9158909984129562,0.897,0.8764878705343467,32000.0

SGD-Classifier/metrics/validation_epoch_001_confusion_matrix.csv ADDED Viewed

	@@ -0,0 +1,17 @@

+,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
+Assembly,1959,29,0,1,0,0,2,0,1,1,1,2,1,1,1,1
+C,7,1830,7,139,1,2,2,0,1,1,1,2,1,3,2,1
+C#,5,15,1899,13,0,1,9,0,23,11,1,14,2,1,2,4
+C++,10,167,2,1795,0,0,3,0,9,4,0,2,3,4,1,0
+CSS,3,3,1,0,1954,4,1,0,0,27,0,0,6,1,0,0
+Dart,0,0,2,4,0,1944,0,0,4,35,0,1,6,1,1,2
+Go,5,2,2,3,3,2,1945,0,2,20,0,3,6,5,0,2
+HTML,7,10,6,9,178,8,10,2,36,646,3,11,1039,26,2,7
+Java,1,5,7,15,1,3,7,0,1924,17,2,9,3,5,0,1
+JavaScript,3,2,2,6,18,16,7,0,14,1844,4,10,10,4,2,58
+Kotlin,2,5,2,6,1,3,3,0,13,15,1933,3,10,3,0,1
+Lua,1,9,2,5,0,0,2,0,2,6,0,1959,6,5,2,1
+Markdown,2,8,7,13,4,3,9,0,9,21,3,7,1879,25,5,5
+Python,1,2,0,1,1,1,4,0,2,11,1,7,10,1958,0,1
+Rust,3,7,3,6,0,0,5,0,3,2,0,0,1,1,1969,0
+Typescript,1,1,4,5,1,6,3,0,13,129,1,0,11,3,1,1821

SGD-Classifier/metrics/validation_epoch_001_report.csv ADDED Viewed

	@@ -0,0 +1,20 @@

+,precision,recall,f1-score,support
+Assembly,0.9746268656716418,0.9795,0.9770573566084788,2000.0
+C,0.8735083532219571,0.915,0.8937728937728938,2000.0
+C#,0.9758478931140802,0.9495,0.9624936644703497,2000.0
+C++,0.8881741712023751,0.8975,0.8928127331509574,2000.0
+CSS,0.9037927844588344,0.977,0.9389716482460355,2000.0
+Dart,0.9754139488208731,0.972,0.9737039819684448,2000.0
+Go,0.9666998011928429,0.9725,0.9695912263210369,2000.0
+HTML,1.0,0.001,0.001998001998001998,2000.0
+Java,0.9357976653696498,0.962,0.9487179487179487,2000.0
+JavaScript,0.6609318996415771,0.922,0.7699373695198329,2000.0
+Kotlin,0.9912820512820513,0.9665,0.9787341772151898,2000.0
+Lua,0.9650246305418719,0.9795,0.9722084367245658,2000.0
+Markdown,0.6275885103540414,0.9395,0.7525030036043252,2000.0
+Python,0.956989247311828,0.979,0.967869500741473,2000.0
+Rust,0.9904426559356136,0.9845,0.9874623871614845,2000.0
+Typescript,0.9559055118110236,0.9105,0.9326504481434059,2000.0
+accuracy,0.89421875,0.89421875,0.89421875,0.89421875
+macro avg,0.9151266243706413,0.8942187500000001,0.8700302986477766,32000.0
+weighted avg,0.9151266243706414,0.89421875,0.8700302986477766,32000.0

SGD-Classifier/metrics/validation_epoch_002_confusion_matrix.csv ADDED Viewed

	@@ -0,0 +1,17 @@

+,Assembly,C,C#,C++,CSS,Dart,Go,HTML,Java,JavaScript,Kotlin,Lua,Markdown,Python,Rust,Typescript
+Assembly,1956,31,0,2,0,0,2,0,1,1,1,1,2,1,1,1
+C,5,1841,7,128,1,2,2,0,1,1,1,2,2,4,2,1
+C#,4,15,1907,11,0,2,7,0,22,10,1,13,2,1,2,3
+C++,11,174,4,1788,0,0,3,0,7,4,0,1,3,4,1,0
+CSS,1,4,1,0,1956,4,1,0,0,25,0,0,7,1,0,0
+Dart,0,0,1,1,0,1951,0,0,4,31,0,1,6,1,1,3
+Go,5,2,3,3,2,3,1943,0,1,21,0,3,7,5,0,2
+HTML,6,10,5,9,171,9,6,83,31,607,5,13,1009,27,2,7
+Java,3,6,6,16,1,3,7,0,1927,12,2,9,3,4,0,1
+JavaScript,3,1,3,6,19,17,7,0,13,1833,3,10,12,4,2,67
+Kotlin,1,6,2,6,1,3,3,0,12,14,1934,3,11,3,0,1
+Lua,1,8,3,6,0,0,2,0,1,6,0,1960,6,5,1,1
+Markdown,2,9,7,12,4,3,9,0,10,19,3,2,1883,25,5,7
+Python,1,2,0,1,1,1,3,0,2,12,2,7,11,1956,0,1
+Rust,3,7,4,5,0,3,5,0,2,2,0,0,1,1,1967,0
+Typescript,1,2,4,4,1,5,2,0,13,117,2,0,11,3,1,1834

SGD-Classifier/metrics/validation_epoch_002_report.csv ADDED Viewed

	@@ -0,0 +1,20 @@

+,precision,recall,f1-score,support
+Assembly,0.9765351972041937,0.978,0.9772670497127155,2000.0
+C,0.8692162417374882,0.9205,0.8941233608547838,2000.0
+C#,0.9744506898313745,0.9535,0.9638615112458934,2000.0
+C++,0.8948948948948949,0.894,0.894447223611806,2000.0
+CSS,0.9068150208623088,0.978,0.9410632667789272,2000.0
+Dart,0.9725822532402791,0.9755,0.9740389415876186,2000.0
+Go,0.9705294705294706,0.9715,0.9710144927536232,2000.0
+HTML,1.0,0.0415,0.07969275084013443,2000.0
+Java,0.9413776257938447,0.9635,0.9523103533481592,2000.0
+JavaScript,0.6751381215469613,0.9165,0.7775185577942736,2000.0
+Kotlin,0.9897645854657113,0.967,0.9782498735457764,2000.0
+Lua,0.9679012345679012,0.98,0.9739130434782609,2000.0
+Markdown,0.6327284946236559,0.9415,0.7568327974276527,2000.0
+Python,0.956479217603912,0.978,0.9671199011124846,2000.0
+Rust,0.9909319899244332,0.9835,0.9872020075282308,2000.0
+Typescript,0.9507516848107828,0.917,0.9335708831763807,2000.0
+accuracy,0.89746875,0.89746875,0.89746875,0.89746875
+macro avg,0.9168810451648257,0.89746875,0.8763891259247951,32000.0
+weighted avg,0.9168810451648257,0.89746875,0.876389125924795,32000.0

SGD-Classifier/models/epoch_001.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d3cb9dfc421e9f5eed281199cd3e6ac7d41b4dc5d13efce09f60949d830b2eee
+size 16779530

SGD-Classifier/models/epoch_002.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3941f702f4d9d1bee58087062178846a70c34311a10329e6eca075a9a4603633
+size 16779530