Text Classification
fastText
English
scikit-learn
code-classification
programming-language-detection
source-code
machine-learning
modernbert
classification
nlp
code-analysis
software-engineering
Eval Results (legacy)
Instructions to use kaushik-harsh-99/Code-Lang-Classifier with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- fastText
How to use kaushik-harsh-99/Code-Lang-Classifier with fastText:
```python
from huggingface_hub import hf_hub_download
import fasttext

model = fasttext.load_model(hf_hub_download("kaushik-harsh-99/Code-Lang-Classifier", "model.bin"))
```
- Notebooks
- Google Colab
- Kaggle
| import json | |
| import fasttext | |
| import pandas as pd | |
| from sklearn.metrics import ( | |
| accuracy_score, | |
| classification_report, | |
| confusion_matrix, | |
| ) | |
# ============================================================
# CONFIG
# ============================================================
# fastText model file and the validation/test JSONL splits. All three are
# relative paths, so they resolve against the current working directory.
MODEL_FILE = "fasttext_language_classifier.bin"
VALIDATION_FILE = "dataset/validation.jsonl"
TEST_FILE = "dataset/test.jsonl"

# ============================================================
# LOAD MODEL
# ============================================================
# The model is loaded once at module level and shared by every evaluation
# call made later in the script.
print("Loading model...")
model = fasttext.load_model(MODEL_FILE)
print("Model loaded.")
| # ============================================================ | |
| # EVALUATION | |
| # ============================================================ | |
def evaluate_jsonl(
    model,
    jsonl_file,
    split_name,
):
    """Evaluate a classifier on one JSONL split and write CSV reports.

    Every non-blank line of ``jsonl_file`` is parsed as a JSON object. Its
    ``"label"`` value is the expected class and its ``"content"`` value is
    the text to classify. The text is whitespace-normalised, the top-1
    prediction is taken from ``model``, and the results are scored with
    scikit-learn.

    Two files are written, relative to the current working directory:

    * ``{split_name}_classification_report.csv`` - per-class precision,
      recall, F1 and support.
    * ``{split_name}_confusion_matrix.csv`` - rows are true labels, columns
      are predicted labels.

    Args:
        model: Object exposing ``predict(text, k=1)`` that returns a
            ``(labels, probs)`` pair, where each label carries the
            ``__label__`` prefix (e.g. the result of ``fasttext.load_model``).
        jsonl_file: Path of the UTF-8 encoded JSONL file to evaluate.
        split_name: Name used in progress messages and as the prefix of the
            generated CSV files.

    Returns:
        The accuracy computed by ``accuracy_score`` over all records.

    Raises:
        ValueError: If ``jsonl_file`` contains no records.
    """
    label_prefix = "__label__"

    print(f"\nEvaluating {split_name}")

    y_true = []
    y_pred = []

    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (for example a trailing newline at the end of the
            # file) are not records; json.loads would raise on them.
            if not line.strip():
                continue

            row = json.loads(line)

            # Match FastText training format: collapse every run of
            # whitespace (including newlines) into a single space.
            text = " ".join(row["content"].split())

            labels, _ = model.predict(text, k=1)

            # Strip the marker only where it is a prefix, so a class name
            # that happens to contain the marker text is left intact.
            pred_label = labels[0]
            if pred_label.startswith(label_prefix):
                pred_label = pred_label[len(label_prefix):]

            y_true.append(row["label"])
            y_pred.append(pred_label)

            if len(y_true) % 5000 == 0:
                print(f"Processed {len(y_true):,}")

    # Fail early with a clear message instead of an obscure error from the
    # metric functions when there is nothing to score.
    if not y_true:
        raise ValueError(f"{jsonl_file} contains no records to evaluate")

    # ========================================================
    # ACCURACY
    # ========================================================
    acc = accuracy_score(y_true, y_pred)
    print(f"\n{split_name} Accuracy: {acc:.6f}")

    # ========================================================
    # CLASSIFICATION REPORT
    # ========================================================
    # `digits` only affects the text report; it is ignored when
    # output_dict=True, so it is not passed here.
    report = classification_report(
        y_true,
        y_pred,
        output_dict=True,
    )
    report_df = pd.DataFrame(report).transpose()
    report_csv = f"{split_name}_classification_report.csv"
    report_df.to_csv(report_csv)
    print(f"Saved {report_csv}")

    # ========================================================
    # CONFUSION MATRIX
    # ========================================================
    # Use every label seen in either list. Restricting the matrix to the
    # true labels would silently drop predictions of a class that never
    # occurs in this split, so row totals would no longer match the support.
    labels_sorted = sorted(set(y_true) | set(y_pred))
    cm = confusion_matrix(
        y_true,
        y_pred,
        labels=labels_sorted,
    )
    cm_df = pd.DataFrame(
        cm,
        index=labels_sorted,
        columns=labels_sorted,
    )
    cm_csv = f"{split_name}_confusion_matrix.csv"
    cm_df.to_csv(cm_csv)
    print(f"Saved {cm_csv}")

    return acc
# ============================================================
# VALIDATION / TEST
# ============================================================
# Score the validation split first, then the test split. Each call also
# writes that split's classification report and confusion matrix CSVs.
validation_accuracy = evaluate_jsonl(model, VALIDATION_FILE, "validation")
test_accuracy = evaluate_jsonl(model, TEST_FILE, "test")

# ============================================================
# SUMMARY
# ============================================================
# One-row table holding both accuracies, saved without the index column.
summary = pd.DataFrame(
    {
        "validation_accuracy": [validation_accuracy],
        "test_accuracy": [test_accuracy],
    }
)
summary.to_csv("fasttext_summary.csv", index=False)
print("\nSaved fasttext_summary.csv")

# Final console recap of the two headline numbers.
print("\n==============================")
print(f"Validation Accuracy: {validation_accuracy:.6f}")
print(f"Test Accuracy: {test_accuracy:.6f}")
print("==============================")
print("\nDone.")