import json
import os
import time

import fasttext
import pandas as pd

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)

# ============================================================
# CONFIG
# ============================================================

# Training input handed to fasttext.train_supervised(input=...).
TRAIN_FILE = "fasttext_train.txt"

# JSONL splits scored by evaluate_jsonl() once training has finished.
VALIDATION_JSONL = "dataset/validation.jsonl"
TEST_JSONL = "dataset/test.jsonl"

# Path the trained model is saved to.
MODEL_FILE = "fasttext_language_classifier.bin"

# Hyperparameters forwarded to fasttext.train_supervised; the keyword each
# one is passed as is shown in parentheses.
EPOCHS = 25  # (epoch)
LR = 0.7  # (lr)

DIM = 50  # (dim)

WORD_NGRAMS = 3  # (wordNgrams)

MINN = 2  # (minn)
MAXN = 5  # (maxn)

MIN_COUNT = 100  # (minCount)

BUCKET = 50000  # (bucket)

# (thread) os.cpu_count() returns None when the CPU count cannot be
# determined; fall back to a single thread instead of passing None on.
THREADS = os.cpu_count() or 1

# ============================================================
# TRAIN
# ============================================================

print("Training FastText...")
print()

# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()

# Train the supervised classifier with the hyperparameters from the CONFIG
# section above.
model = fasttext.train_supervised(
    input=TRAIN_FILE,
    epoch=EPOCHS,
    lr=LR,
    dim=DIM,
    wordNgrams=WORD_NGRAMS,
    minn=MINN,
    maxn=MAXN,
    minCount=MIN_COUNT,
    bucket=BUCKET,
    loss="softmax",
    thread=THREADS,
    verbose=2,
)

elapsed = time.perf_counter() - start

print()
print(f"Training completed in {elapsed:.1f}s")

# ============================================================
# LABEL DEBUG
# ============================================================

# List the labels the trained model reports, preceded by their count, so a
# malformed training file shows up immediately in the console output.
known_labels = model.labels

print(
    "",
    "Labels found by FastText:",
    f"Count: {len(known_labels)}",
    sep="\n",
)

for known_label in known_labels:
    print(known_label)

# ============================================================
# SAVE MODEL
# ============================================================

# Write the trained model to disk before measuring its footprint.
model.save_model(MODEL_FILE)

# File size converted from bytes to mebibytes; reused by the final summary.
size_mb = os.path.getsize(MODEL_FILE) / (1024 * 1024)

print(
    "",
    f"Saved model: {MODEL_FILE}",
    f"Model size: {size_mb:.2f} MB",
    sep="\n",
)

# ============================================================
# EVALUATION
# ============================================================

def evaluate_jsonl(
    model,
    jsonl_file,
    split_name,
):
    """Score ``model`` on one JSONL split and save its metric reports.

    Every non-blank line of ``jsonl_file`` is parsed as a JSON object. Its
    ``"label"`` value is the expected class and its ``"content"`` value is the
    text to classify. The top-1 prediction of ``model`` is compared against
    the expected class for each row.

    Side effects: prints progress and the accuracy, and writes
    ``{split_name}_classification_report.csv`` and
    ``{split_name}_confusion_matrix.csv`` to the current working directory.

    Args:
        model: Object with a ``predict(text, k=1)`` method returning a
            ``(labels, probabilities)`` pair, where ``labels[0]`` is the best
            label string, optionally prefixed with ``__label__``.
        jsonl_file: Path of the UTF-8 encoded JSONL file to evaluate.
        split_name: Name used in log messages and as the prefix of the
            output file names.

    Returns:
        The accuracy over all evaluated rows, as computed by
        ``accuracy_score``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a row has no ``"label"`` or ``"content"`` key.
    """
    print()
    print(f"Evaluating {split_name}")

    label_prefix = "__label__"

    y_true = []
    y_pred = []

    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line.strip():
                continue

            row = json.loads(line)

            # Predicted labels are always strings; coerce the expected label
            # so that non-string JSON values (e.g. integers) can still match.
            true_label = str(row["label"])

            # Collapse every whitespace run (including newlines) to a single
            # space so the model receives one single-line string.
            text = " ".join(str(row["content"]).split())

            labels, _probs = model.predict(text, k=1)

            # Strip the marker only when it is a prefix; a plain replace()
            # would also remove it from the middle of a label name.
            pred_label = labels[0]
            if pred_label.startswith(label_prefix):
                pred_label = pred_label[len(label_prefix):]

            y_true.append(true_label)
            y_pred.append(pred_label)

            processed = len(y_true)
            if processed % 5000 == 0:
                print(f"Processed {processed:,}")

    # ========================================================
    # ACCURACY
    # ========================================================

    accuracy = accuracy_score(y_true, y_pred)

    print()
    print(
        f"{split_name} Accuracy: "
        f"{accuracy:.6f}"
    )

    # ========================================================
    # CLASSIFICATION REPORT
    # ========================================================

    # output_dict=True yields a nested dict, which becomes one CSV row per
    # class (plus the aggregate rows) after the transpose.
    report = classification_report(
        y_true,
        y_pred,
        output_dict=True,
    )

    report_df = pd.DataFrame(report).transpose()

    report_file = f"{split_name}_classification_report.csv"
    report_df.to_csv(report_file)

    print(f"Saved {report_file}")

    # ========================================================
    # CONFUSION MATRIX
    # ========================================================

    # Include predicted-only classes as well: with an explicit `labels` list
    # built from y_true alone, rows predicted as a class that never occurs in
    # this split would be left out of the matrix entirely.
    labels_sorted = sorted(set(y_true) | set(y_pred))

    cm = confusion_matrix(
        y_true,
        y_pred,
        labels=labels_sorted,
    )

    cm_df = pd.DataFrame(
        cm,
        index=labels_sorted,
        columns=labels_sorted,
    )

    cm_file = f"{split_name}_confusion_matrix.csv"
    cm_df.to_csv(cm_file)

    print(f"Saved {cm_file}")

    return accuracy

# ============================================================
# VALIDATION
# ============================================================

# Score the validation split; its reports are written with the
# "validation" file-name prefix.
validation_accuracy = evaluate_jsonl(
    model=model,
    jsonl_file=VALIDATION_JSONL,
    split_name="validation",
)

# ============================================================
# TEST
# ============================================================

# Score the test split; its reports are written with the "test"
# file-name prefix.
test_accuracy = evaluate_jsonl(
    model=model,
    jsonl_file=TEST_JSONL,
    split_name="test",
)

# ============================================================
# SUMMARY
# ============================================================

# One-row table holding the headline metrics together with the training
# hyperparameters, so a run can be identified from the CSV alone.
summary = pd.DataFrame(
    [
        {
            "validation_accuracy": validation_accuracy,
            "test_accuracy": test_accuracy,
            "epochs": EPOCHS,
            "lr": LR,
            "dim": DIM,
            "word_ngrams": WORD_NGRAMS,
            "min_count": MIN_COUNT,
            "bucket": BUCKET,
            "model_size_mb": size_mb,
            # The character n-gram range is also part of the training
            # configuration. It is appended after the existing columns so
            # their positions stay unchanged.
            "minn": MINN,
            "maxn": MAXN,
        }
    ]
)

summary.to_csv(
    "fasttext_summary.csv",
    index=False,
)

print()
print("=" * 60)
print(f"Validation Accuracy : {validation_accuracy:.6f}")
print(f"Test Accuracy       : {test_accuracy:.6f}")
print(f"Model Size (MB)     : {size_mb:.2f}")
print("=" * 60)

print()
print("Done.")