Spaces:

seai2526-uniba-TheClouds
/

Code-Comment-Classification-Api

Running

App Files Community

Sky-Blue-da-ba-dee committed on Dec 11, 2025

Commit

ac9ddbb

1 Parent(s): 629e980

added files

Browse files

Files changed (24) hide show

Dockerfile +27 -0
api/__pycache__/main.cpython-311.pyc +0 -0
api/__pycache__/schemas.cpython-311.pyc +0 -0
api/__pycache__/sync_models.cpython-311.pyc +0 -0
api/main.py +217 -0
api/schemas.py +75 -0
api/sync_models.py +192 -0
codecommentclassification/__init__.py +5 -0
codecommentclassification/__pycache__/__init__.cpython-311.pyc +0 -0
codecommentclassification/__pycache__/predictor.cpython-311.pyc +0 -0
codecommentclassification/modeling/__pycache__/evaluate_models.cpython-311.pyc +0 -0
codecommentclassification/modeling/__pycache__/train.cpython-311.pyc +0 -0
codecommentclassification/modeling/__pycache__/utils.cpython-311.pyc +0 -0
codecommentclassification/modeling/evaluate_models.py +223 -0
codecommentclassification/modeling/train.py +203 -0
codecommentclassification/modeling/transformer/__init__.py +10 -0
codecommentclassification/modeling/transformer/__pycache__/__init__.cpython-311.pyc +0 -0
codecommentclassification/modeling/transformer/__pycache__/preprocessing.cpython-311.pyc +0 -0
codecommentclassification/modeling/transformer/__pycache__/trainer.cpython-311.pyc +0 -0
codecommentclassification/modeling/transformer/preprocessing.py +208 -0
codecommentclassification/modeling/transformer/trainer.py +531 -0
codecommentclassification/modeling/utils.py +70 -0
codecommentclassification/predictor.py +149 -0
requirements.txt +234 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,27 @@

+FROM python:3.11
+# Create a dedicated user with UID 1000 and run every following step as that user.
+RUN useradd -m -u 1000 user
+USER user
+# Make the user's ~/.local/bin visible on PATH (presumably where pip places
+# console scripts such as uvicorn for a non-root install -- confirm).
+ENV PATH="/home/user/.local/bin:$PATH"
+WORKDIR /app
+# Copy only the requirements file first, before any source code is copied.
+COPY --chown=user requirements.txt requirements.txt
+# Drop every requirements line that mentions "codecommentclassification" and
+# install the rest; the package source itself is copied below and resolved
+# through PYTHONPATH.
+RUN grep -v "codecommentclassification" requirements.txt > requirements-docker.txt \
+    && pip install --no-cache-dir --upgrade -r requirements-docker.txt
+# Application code and model cards.
+COPY --chown=user api /app/api
+COPY --chown=user codecommentclassification /app/codecommentclassification
+COPY --chown=user models/model_cards /app/models/model_cards
+# Directory named by MODELS_DIR below; no model files are copied into the image here.
+RUN mkdir -p /app/models/api
+ENV PYTHONPATH=/app
+ENV MODELS_DIR=/app/models/api
+# The uvicorn command below listens on this port.
+EXPOSE 7860
+CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "7860"]

api/__pycache__/main.cpython-311.pyc ADDED Viewed

Binary file (9.67 kB). View file

api/__pycache__/schemas.cpython-311.pyc ADDED Viewed

Binary file (2.73 kB). View file

api/__pycache__/sync_models.cpython-311.pyc ADDED Viewed

Binary file (7.42 kB). View file

api/main.py ADDED Viewed

	@@ -0,0 +1,217 @@

+"""Main API for Code Comment Classification using FastAPI."""
+from contextlib import asynccontextmanager
+from datetime import datetime
+from functools import lru_cache, wraps
+from http import HTTPStatus
+import inspect
+import logging
+import os
+from pathlib import Path
+from api.schemas import PredictRequest
+from api.sync_models import sync_best_models_to_disk
+from fastapi import FastAPI, Request, Response
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from codecommentclassification import ModelPredictor
+# Directory the API reads models from; overridable through the MODELS_DIR
+# environment variable, defaulting to the relative path "models/api".
+MODELS_DIR = Path(os.getenv("MODELS_DIR", "models/api"))
+# Configure logging at import time: INFO level, timestamped records.
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
+# Module-level logger used by every function in this file.
+logger = logging.getLogger(__name__)
+@lru_cache(maxsize=3)
+def get_predictor(lang: str, model_type: str) -> ModelPredictor:
+    """Lazily loads the heavy model only when requested."""
+    logger.info(f"Loading model for {lang} - {model_type}...")
+    return ModelPredictor(lang=lang, model_type=model_type, model_root=str(MODELS_DIR))
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Lifespan context manager to sync models at startup."""
+    try:
+        logger.info(f"Syncing champion models from MLflow to {MODELS_DIR}...")
+        sync_best_models_to_disk(
+            models_root=MODELS_DIR.parent,
+            api_subdir=MODELS_DIR.name,
+        )
+    except Exception as e:
+        logger.error(f"Failed to sync models at startup: {e}")
+    if not MODELS_DIR.exists():
+        logger.warning(f"Models directory not found at: {MODELS_DIR.resolve()}")
+    else:
+        logger.info(f"Using models from: {MODELS_DIR.resolve()}")
+    yield
+    get_predictor.cache_clear()
+# FastAPI application; ``lifespan`` runs the model sync before serving starts.
+app = FastAPI(
+    title="Code Comment Classification API",
+    description="API for classifying code comments using SetFit models.",
+    version="0.1",
+    lifespan=lifespan,
+)
+# Allowed CORS origins: a comma-separated list in FRONTEND_ORIGINS, with
+# surrounding whitespace and empty entries removed.
+frontend_origins = os.getenv("FRONTEND_ORIGINS")
+if frontend_origins:
+    origins = [o.strip() for o in frontend_origins.split(",") if o.strip()]
+else:
+    # Development defaults.
+    origins = [
+        "http://localhost:5173",
+        "http://127.0.0.1:5173",
+        "http://localhost",
+    ]
+# Any method and header is accepted, with credentials, from the origins above.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+def _build_response(results: dict, request: Request):
+    if isinstance(results, (Response, JSONResponse)):
+        return results
+    response = {
+        "message": results["message"],
+        "method": request.method,
+        "status-code": results["status-code"],
+        "timestamp": datetime.now().isoformat(),
+        "url": request.url._url,
+    }
+    if "data" in results:
+        response["data"] = results["data"]
+    return response
+def construct_response(f):
+    """Construct a JSON response for an endpoint's results (sync and async)."""
+    if inspect.iscoroutinefunction(f):
+        @wraps(f)
+        async def wrap(request: Request, *args, **kwargs):
+            results = await f(request, *args, **kwargs)
+            return _build_response(results, request)
+    else:
+        @wraps(f)
+        def wrap(request: Request, *args, **kwargs):
+            results = f(request, *args, **kwargs)
+            return _build_response(results, request)
+    return wrap
+@app.get("/", tags=["General"])
+@construct_response
+def _index(request: Request):
+    """Root endpoint."""
+    return {
+        "message": HTTPStatus.OK.phrase,
+        "status-code": HTTPStatus.OK,
+        "data": {
+            "message": "Welcome to the Code Comment Classification API! Please use /docs for API documentation."
+        },
+    }
+@app.get("/privacy", tags=["General"])
+@construct_response
+async def get_privacy_notice(request: Request):
+    """Return the Privacy Notice for the API."""
+    return {
+        "message": "Privacy Notice",
+        "status-code": HTTPStatus.OK,
+        "data": {
+            "policy": "This API processes text data for classification purposes only. No data is permanently stored.",
+            "compliance_link": "https://behavizapi.peopleware.ai/api/docs#section/Getting-Started/Privacy-Notice",
+        },
+    }
+@app.get("/status")
+def get_status():
+    """Endpoint to check if the API is running."""
+    return {"status": "API is running"}
+@app.get("/models", tags=["Prediction"])
+@construct_response
+def _get_models_list(request: Request):
+    """Return the list of available languages based on directories found in models/ ."""
+    # Since we aren't pre-loading, we scan the directory to see what IS available
+    if MODELS_DIR.exists():
+        available_languages = [
+            {"language": d.name, "model_types": mt.name}
+            for d in MODELS_DIR.iterdir()
+            if d.is_dir()
+            for mt in d.iterdir()
+            if mt.is_dir()
+        ]
+    else:
+        available_languages = []
+    return {
+        "message": HTTPStatus.OK.phrase,
+        "status-code": HTTPStatus.OK,
+        "data": available_languages,
+    }
+@app.post("/predict", tags=["Prediction"])
+@construct_response
+def predict(
+    request: Request,
+    payload: PredictRequest,
+):
+    """Inference endpoint."""
+    if payload.model_type is None:
+        return {
+            "message": "Model type must be specified.",
+            "status-code": HTTPStatus.BAD_REQUEST,
+        }
+    try:
+        predictor = get_predictor(payload.language.value, payload.model_type.value)
+        result = predictor.predict(payload.text)
+        predictions_list = result.tolist() if hasattr(result, "tolist") else result
+        return {
+            "message": HTTPStatus.OK.phrase,
+            "status-code": HTTPStatus.OK,
+            "data": {
+                "language": payload.language,
+                "model_type": payload.model_type,
+                "predictions": predictions_list,
+            },
+        }
+    except FileNotFoundError:
+        return {
+            "message": f"Model for language '{payload.language}' not found.",
+            "status-code": HTTPStatus.NOT_FOUND,
+        }
+    except ValueError as e:
+        return {
+            "message": str(e),
+            "status-code": HTTPStatus.BAD_REQUEST,
+        }
+    except Exception as e:
+        return {
+            "message": f"Internal Error: {str(e)}",
+            "status-code": HTTPStatus.INTERNAL_SERVER_ERROR,
+        }

api/schemas.py ADDED Viewed

	@@ -0,0 +1,75 @@

+"""API Schemas for Predict Request and Response."""
+from enum import Enum
+from pydantic import BaseModel, ConfigDict, ValidationError
+# String-valued enum: members compare equal to their lowercase string values,
+# which are the values accepted in the "language" request field.
+class ProgrammingLanguage(str, Enum):
+    """Programming languages supported for prediction."""
+    JAVA = "java"
+    PYTHON = "python"
+    PHARO = "pharo"
+# String-valued enum: members compare equal to their lowercase string values,
+# which are the values accepted in the "model_type" request field.
+class ModelType(str, Enum):
+    """Model types for prediction."""
+    SETFIT = "setfit"
+    RANDOM_FOREST = "random_forest"
+    TRANSFORMER = "transformer"
+class PredictRequest(BaseModel):
+    """Schema for Predict Request."""
+    text: str
+    language: ProgrammingLanguage
+    model_type: ModelType
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "text": "This method calculates the average score.",
+                "language": "python",
+                "model_type": "transformer",
+            }
+        }
+    )
+# NOTE(review): nothing in the visible API code references this schema -- confirm
+# whether it is still needed.
+class PredictResponse(BaseModel):
+    """Schema for Predict Response."""
+    # label: predicted label name -- presumably one category; confirm with callers.
+    label: str
+    # score: numeric score attached to the label.
+    score: float
+""" Demonstration of object instantiation, printing,
+and validation error handling with dummy use cases"""
+if __name__ == "__main__":
+    print("\n--- 1. Object Instantiation & Printing ---")
+    valid_data = {
+        "text": "This method calculates the average score.",
+        "language": "java",
+        "model_type": "setfit",
+    }
+    # Instantiate the object
+    request = PredictRequest(**valid_data)
+    # Print object as dictionary (.model_dump() is Pydantic V2 syntax)
+    print(f"Valid Request Object: {request.model_dump()}")
+    print("\n--- 2. Handling Invalid Data ---")
+    try:
+        print("Attempting to create request with language='c++'...")
+        # This should fail because 'c++' is not in ProgrammingLanguage Enum
+        invalid_request = PredictRequest(
+            text="std::cout << 'Hello';", language="c++", model_type="setfit"
+        )
+    except ValidationError as e:
+        print("SUCCESS: Validation Error Caught!")
+        print(e.json())

api/sync_models.py ADDED Viewed

	@@ -0,0 +1,192 @@

+"""Synchronise champion MLflow models from the remote registry to the local filesystem."""
+import logging
+import os
+from pathlib import Path
+import shutil
+import mlflow
+from mlflow.tracking import MlflowClient
+# Module-level logger.
+logger = logging.getLogger(__name__)
+# Languages for which a champion model is looked up and synced.
+LANGUAGES = ("python", "java", "pharo")
+def _get_mlflow_client() -> MlflowClient:
+    """Return an MLflow client configured from environment variables.
+    If ``MLFLOW_TRACKING_URI`` is defined, it is passed to
+    :func:`mlflow.set_tracking_uri`. Authentication (for example on DagsHub)
+    is handled by MLflow itself via the standard environment variables
+    ``MLFLOW_TRACKING_USERNAME`` and ``MLFLOW_TRACKING_PASSWORD``.
+    """
+    tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
+    if tracking_uri:
+        mlflow.set_tracking_uri(tracking_uri)
+    return MlflowClient()
+def _find_champion_version_for_language(
+    client: MlflowClient,
+    lang: str,
+):
+    """Return the champion model version for the given language, if any.
+    The function searches all registered models and looks for models whose name
+    starts with ``"<lang>-"`` (for example ``"python-transformer"``). For each
+    matching model it tries to resolve the alias ``"<lang>-champion"`` using
+    :meth:`MlflowClient.get_model_version_by_alias`.
+    Args:
+        client: Initialised MLflow client.
+        lang: Language identifier, such as ``"python"``, ``"java"`` or
+            ``"pharo"``.
+    Returns:
+        The matching :class:`mlflow.entities.model_registry.ModelVersion` if a
+        champion is found, otherwise ``None``.
+    """
+    alias_name = f"{lang}-champion"
+    prefix = f"{lang}-"
+    # Get all registered models and filter by language prefix.
+    for rm in client.search_registered_models():
+        model_name = rm.name
+        if not model_name.startswith(prefix):
+            continue
+        try:
+            mv = client.get_model_version_by_alias(
+                name=model_name,
+                alias=alias_name,
+            )
+            logger.info(
+                "Found champion model for %s: %s (version %s)",
+                lang,
+                model_name,
+                mv.version,
+            )
+            return mv
+        except Exception:  # noqa: BLE001
+            logger.info("Alias not defined for model %s, trying next one.", model_name)
+            continue
+    logger.warning("No champion model found for %s.", lang)
+    return None
+def sync_best_models_to_disk(
+    models_root: str | Path = "models",
+    api_subdir: str = "api",
+) -> None:
+    """Download champion models from MLflow and write them to disk.
+    For each language in :data:`LANGUAGES`, this function looks up the model
+    version with alias ``"<lang>-champion"`` and downloads its artifacts. After
+    download, the directory structure is normalised so that the final layout is:
+    .. code-block:: text
+        models/
+          <api_subdir>/
+            python/
+              <model_type>/
+                ...
+            java/
+              <model_type>/
+                ...
+            pharo/
+              <model_type>/
+                ...
+    For transformer models logged via ``mlflow.transformers``, the inner
+    ``model/`` directory is flattened so that the Hugging Face files
+    (``config.json``, ``model.safetensors``, ``tokenizer.json``, and so on)
+    live directly under ``<model_type>/``.
+    Args:
+        models_root: Base directory under which models are written. Can be a
+            string or :class:`pathlib.Path`. Defaults to ``"models"``.
+        api_subdir: Optional subdirectory appended under ``models_root`` (for
+            example ``"api"``). If empty, models are stored directly under
+            ``models_root``.
+    Raises:
+        OSError: If creating directories, moving files, or removing directories
+            fails at the OS level.
+    """
+    client = _get_mlflow_client()
+    root = Path(models_root)
+    if api_subdir:
+        root = root / api_subdir
+    root.mkdir(parents=True, exist_ok=True)
+    logger.info("Syncing best models to: %s", root.resolve())
+    for lang in LANGUAGES:
+        mv = _find_champion_version_for_language(client, lang)
+        if mv is None:
+            continue
+        model_name = mv.name
+        try:
+            lang_from_name, model_type = model_name.split("-", 1)
+        except ValueError:
+            logger.error("Unexpected model name format: %s", model_name)
+            continue
+        if lang_from_name != lang:
+            logger.warning(
+                "Language mismatch for model %s: expected %s, got %s",
+                model_name,
+                lang,
+                lang_from_name,
+            )
+        dest_dir = root / lang / model_type
+        if dest_dir.exists():
+            shutil.rmtree(dest_dir)
+        dest_dir.mkdir(parents=True, exist_ok=True)
+        logger.info(
+            "Downloading model '%s' version %s to %s...",
+            model_name,
+            mv.version,
+            dest_dir.resolve(),
+        )
+        try:
+            # Download the artifact (for example ".../java_transformer_model").
+            downloaded_path = Path(
+                mlflow.artifacts.download_artifacts(
+                    artifact_uri=mv.source,
+                    dst_path=str(dest_dir),
+                ),
+            )
+            # For transformer models logged with mlflow.transformers, artifacts
+            # are stored under an inner "model/" directory.
+            model_subdir = downloaded_path / "model"
+            if model_subdir.is_dir():
+                # Move the contents of "model" directly into dest_dir.
+                for item in model_subdir.iterdir():
+                    shutil.move(str(item), dest_dir / item.name)
+                # Remove the wrapper directory (with MLmodel, conda.yaml, etc.).
+                if downloaded_path != dest_dir:
+                    shutil.rmtree(downloaded_path)
+        except Exception as e:
+            logger.error(
+                "Failed to download/reshape model '%s' version %s: %s",
+                model_name,
+                mv.version,
+                e,
+            )
+# Script entry point: run one sync with the default directories
+# ("models" / "api") and INFO-level logging.
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    sync_best_models_to_disk()

codecommentclassification/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+"""CodeCommentClassification package initialization."""
+from .predictor import ModelPredictor
+__all__ = ["ModelPredictor"]

codecommentclassification/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (260 Bytes). View file

codecommentclassification/__pycache__/predictor.cpython-311.pyc ADDED Viewed

Binary file (7.51 kB). View file

codecommentclassification/modeling/__pycache__/evaluate_models.cpython-311.pyc ADDED Viewed

Binary file (12 kB). View file

codecommentclassification/modeling/__pycache__/train.cpython-311.pyc ADDED Viewed

Binary file (9.88 kB). View file

codecommentclassification/modeling/__pycache__/utils.cpython-311.pyc ADDED Viewed

Binary file (4.22 kB). View file

codecommentclassification/modeling/evaluate_models.py ADDED Viewed

	@@ -0,0 +1,223 @@

+"""Module for evaluating models on test set."""
+import argparse
+import json
+import os
+import time
+import dagshub
+import joblib
+import mlflow
+import numpy as np
+import pandas as pd
+from setfit import SetFitModel
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from .utils import load_dataset_splits, parse_labels_column
+# Category names per language. evaluate_and_benchmark looks them up by
+# prediction-column index, so the order here must match the order of the
+# label columns -- TODO confirm against the dataset.
+LABELS = {
+    "java": ["summary", "Ownership", "Expand", "usage", "Pointer", "deprecation", "rational"],
+    "python": ["Usage", "Parameters", "DevelopmentNotes", "Expand", "Summary"],
+    "pharo": [
+        "Keyimplementationpoints",
+        "Example",
+        "Responsibilities",
+        "Intent",
+        "Keymessages",
+        "Collaborators",
+    ],
+}
+# Use the GPU when CUDA is available, otherwise the CPU.
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# NOTE: executed at import time, so importing this module calls dagshub.init.
+dagshub.init(repo_owner="se4ai2526-uniba", repo_name="TheClouds", mlflow=True)
+def evaluate_and_benchmark(lang, model_type, model_path, data_path, metrics_output_path):
+    """Load a trained model, run detailed benchmarking for performance and metrics,
+    and log the results to a new MLflow run.
+    """
+    mlflow.set_experiment("Model Benchmarking")
+    print(f"Starting Evaluation & Benchmarking for language: {lang} and model: {model_type}")
+    with mlflow.start_run(run_name=f"evaluation_local_{lang}_{model_type}"):
+        mlflow.log_param("language", lang)
+        mlflow.log_param("model_type", model_type)
+        mlflow.log_param("model_path", model_path)
+        mlflow.log_param("data_path", data_path)
+        avg_runtime_sec = 0.0
+        avg_gflops = 0.0
+        # -----------------------
+        # SETFIT
+        # -----------------------
+        if model_type == "setfit":
+            ds = load_dataset_splits(base_dir=data_path, langs=[lang])
+            eval_df = parse_labels_column(ds[f"{lang}_test"])
+            x_eval = eval_df["combo"].astype(str).tolist()
+            y_true = np.array(eval_df["labels"].tolist(), dtype=int)
+            model = SetFitModel.from_pretrained(model_path)
+            with torch.profiler.profile(with_flops=True) as p:
+                begin = time.time()
+                for _ in range(10):
+                    y_pred = model(x_eval)
+                total_runtime = time.time() - begin
+            avg_runtime_sec = total_runtime / 10
+            avg_gflops = (sum(k.flops for k in p.key_averages()) / 1e9) / 10
+            y_pred = np.array(y_pred)
+        # -----------------------
+        # RANDOM FOREST
+        # -----------------------
+        elif model_type == "random_forest":
+            ds = load_dataset_splits(base_dir=data_path, langs=[lang])
+            eval_df = parse_labels_column(ds[f"{lang}_test"])
+            x_eval = eval_df["combo"].astype(str).tolist()
+            y_true = np.array(eval_df["labels"].tolist(), dtype=int)
+            model = joblib.load(f"{model_path}.joblib")
+            begin = time.time()
+            for _ in range(10):
+                y_pred = model.predict(x_eval)
+            total_runtime = time.time() - begin
+            avg_runtime_sec = total_runtime / 10
+            avg_gflops = 0.0  # not applicable
+            y_pred = np.array(y_pred)
+        # -----------------------
+        # TRANSFORMER
+        # -----------------------
+        elif model_type == "transformer":
+            test_csv_path = os.path.join(data_path, f"{lang}_test.csv")
+            if not os.path.exists(test_csv_path):
+                raise FileNotFoundError(f"Test CSV for transformer not found: {test_csv_path}")
+            df_test = pd.read_csv(test_csv_path)
+            df_test = parse_labels_column(df_test)
+            # Ensure 'combo' exists
+            if "combo" not in df_test.columns:
+                df_test["combo"] = (
+                    df_test["comment_sentence"].astype(str) + " | " + df_test["class"].astype(str)
+                )
+            texts = df_test["combo"].astype(str).tolist()
+            y_true = np.array(df_test["labels"].tolist(), dtype=int)
+            tokenizer = AutoTokenizer.from_pretrained(model_path)
+            model = AutoModelForSequenceClassification.from_pretrained(model_path).to(DEVICE)
+            model.eval()
+            enc = tokenizer(
+                texts,
+                padding=True,
+                truncation=True,
+                max_length=128,  # keep consistent with training config
+                return_tensors="pt",
+            )
+            enc = {k: v.to(DEVICE) for k, v in enc.items()}
+            with torch.no_grad():
+                with torch.profiler.profile(with_flops=True) as p:
+                    begin = time.time()
+                    for _ in range(10):
+                        outputs = model(**enc)
+                    total_runtime = time.time() - begin
+                logits = outputs.logits
+                probs = torch.sigmoid(logits)
+                y_pred = (probs > 0.5).long().cpu().numpy()
+            avg_runtime_sec = total_runtime / 10
+            avg_gflops = (sum(k.flops for k in p.key_averages()) / 1e9) / 10
+        else:
+            raise ValueError(f"Unsupported model_type: {model_type}")
+        print(f"Avg runtime in seconds: {avg_runtime_sec:.4f}")
+        mlflow.log_metric("avg_runtime_sec", avg_runtime_sec)
+        mlflow.log_metric("avg_gflops", avg_gflops)
+        # -----------------------
+        # Manual per-label metrics (common)
+        # -----------------------
+        scores = []
+        y_true_transposed = y_true.T
+        y_pred_transposed = y_pred.T
+        for i in range(len(y_pred_transposed)):
+            tp = np.logical_and(y_true_transposed[i] == 1, y_pred_transposed[i] == 1).sum()
+            fp = np.logical_and(y_true_transposed[i] == 0, y_pred_transposed[i] == 1).sum()
+            fn = np.logical_and(y_true_transposed[i] == 1, y_pred_transposed[i] == 0).sum()
+            precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+            recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+            f1 = (2 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn) > 0 else 0.0
+            scores.append(
+                {
+                    "lan": lang,
+                    "cat": LABELS[lang][i],
+                    "precision": precision,
+                    "recall": recall,
+                    "f1": f1,
+                }
+            )
+        lan_scores_df = pd.DataFrame(scores)
+        avg_f1 = lan_scores_df["f1"].mean()
+        avg_precision = lan_scores_df["precision"].mean()
+        avg_recall = lan_scores_df["recall"].mean()
+        mlflow.log_metric("avg_f1_score", avg_f1)
+        mlflow.log_metric("avg_precision", avg_precision)
+        mlflow.log_metric("avg_recall", avg_recall)
+        dvc_metrics = {
+            "avg_f1_score": avg_f1,
+            "avg_precision": avg_precision,
+            "avg_recall": avg_recall,
+            "avg_runtime_sec": avg_runtime_sec,
+            "avg_gflops": avg_gflops,
+        }
+        os.makedirs(os.path.dirname(metrics_output_path), exist_ok=True)
+        with open(metrics_output_path, "w") as f:
+            json.dump(dvc_metrics, f, indent=4)
+# Command-line entry point. The model location and the metrics file are
+# derived from --lang and --model_type; only the data directory is configurable.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--lang", type=str, required=True)
+    parser.add_argument("--model_type", type=str, required=True)
+    parser.add_argument(
+        "--data_path",
+        type=str,
+        default="data/raw",
+        help=(
+            "Path to evaluation data. "
+            "For setfit/random_forest: base dir with raw CSVs (e.g. data/raw). "
+            "For transformer: directory with {lang}_test.csv (e.g. data/processed/transformer)."
+        ),
+    )
+    args = parser.parse_args()
+    evaluate_and_benchmark(
+        lang=args.lang,
+        model_type=args.model_type,
+        model_path=f"models/{args.lang}/{args.model_type}",
+        data_path=args.data_path,
+        metrics_output_path=f"reports/metrics/{args.lang}/{args.model_type}_metrics.json",
+    )

codecommentclassification/modeling/train.py ADDED Viewed

	@@ -0,0 +1,203 @@

+"""Module for training different types of models for code comment classification."""
+import argparse
+import logging
+import os
+import dagshub
+from datasets import Dataset
+import mlflow
+import yaml
+from .utils import load_dataset_splits, parse_labels_column
+# Configure logging at import time: INFO level, timestamped records.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+# NOTE: executed at import time, so importing this module calls dagshub.init.
+dagshub.init(repo_owner="se4ai2526-uniba", repo_name="TheClouds", mlflow=True)
+def train_model(lang, model_type, data_path, model_output_path, params):
+    """Trains and saves a model for a specific language and model type."""
+    print(f"--- Starting training for language: {lang} with model: {model_type} ---")
+    ds = load_dataset_splits(data_path)
+    train_df = ds[f"{lang}_train"]
+    eval_df = ds[f"{lang}_test"]
+    train_df = parse_labels_column(train_df)
+    eval_df = parse_labels_column(eval_df)
+    # converto i DataFrame in HuggingFace Dataset
+    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
+    eval_dataset = Dataset.from_pandas(eval_df, preserve_index=False)
+    if model_type == "setfit":
+        from setfit import SetFitModel, Trainer, TrainingArguments
+        mlflow.set_experiment("SetFit Training")
+        with mlflow.start_run(run_name=f"train-{lang}-{model_type}"):
+            mlflow.log_param("language", lang)
+            mlflow.log_param("model_type", model_type)
+            model = SetFitModel.from_pretrained(
+                "sentence-transformers/paraphrase-MiniLM-L6-v2",
+                multi_target_strategy="multi-output",
+            )
+            args = TrainingArguments(**params)
+            trainer = Trainer(
+                model=model,
+                args=args,
+                train_dataset=train_dataset,
+                eval_dataset=eval_dataset,
+                column_mapping={"combo": "text", "labels": "label"},
+            )
+            mlflow.log_param("num_epochs", args.num_epochs)
+            mlflow.log_param("num_iterations", args.num_iterations)
+            trainer.train()
+            eval_metrics = trainer.evaluate()
+            for metric_name, metric_value in eval_metrics.items():
+                mlflow.log_metric(metric_name, metric_value)
+            trainer.model.save_pretrained(model_output_path)
+            mlflow.transformers.log_model(
+                transformers_model=model_output_path,
+                artifact_path=f"{lang}_setfit_model",
+                task="text-classification",
+            )
+            mlflow.end_run()
+    elif model_type == "random_forest":
+        import joblib
+        import numpy as np
+        from sklearn.ensemble import RandomForestClassifier
+        from sklearn.feature_extraction.text import TfidfVectorizer
+        from sklearn.multioutput import MultiOutputClassifier
+        from sklearn.pipeline import Pipeline
+        mlflow.set_experiment("Random Forest Training")
+        with mlflow.start_run(run_name=f"train-{lang}-{model_type}"):
+            mlflow.log_param("language", lang)
+            mlflow.log_param("model_type", model_type)
+            mlflow.log_params(params)
+            tfidf_params = {
+                "ngram_range": tuple(params.pop("ngram_range", (1, 1))),
+                "max_features": params.pop("max_features", None),
+                "min_df": params.pop("min_df", 1),
+                "max_df": params.pop("max_df", 1.0),
+            }
+            rf_params = params
+            pipeline = Pipeline(
+                [
+                    ("tfidf", TfidfVectorizer(**tfidf_params)),
+                    (
+                        "clf",
+                        MultiOutputClassifier(
+                            RandomForestClassifier(
+                                random_state=42, class_weight="balanced", **rf_params
+                            )
+                        ),
+                    ),
+                ]
+            )
+            X_train = train_dataset["combo"]
+            y_train = np.array(train_dataset["labels"])
+            pipeline.fit(X_train, y_train)
+            X_test = eval_dataset["combo"]
+            y_test = np.array(eval_dataset["labels"])
+            score = pipeline.score(X_test, y_test)
+            mlflow.log_metric("accuracy", score)
+            os.makedirs(os.path.dirname(model_output_path), exist_ok=True)
+            joblib.dump(pipeline, f"{model_output_path}.joblib")
+            mlflow.sklearn.log_model(
+                sk_model=pipeline, artifact_path=f"{lang}_random_forest_model"
+            )
+            mlflow.end_run()
+    elif model_type == "transformer":
+        from .transformer import (
+            TransformerConfig,
+            TransformerTrainer,
+        )
+        mlflow.set_experiment("Transformer Training")
+        with mlflow.start_run(run_name=f"train-{lang}-{model_type}"):
+            mlflow.log_param("language", lang)
+            mlflow.log_param("model_type", model_type)
+            mlflow.log_params(params)
+            cfg = TransformerConfig(
+                lang=lang,
+                raw_data_dir="data/raw",
+                processed_data_dir="data/processed/transformer",
+                model_output_path=model_output_path,
+                pretrained_model_name=params.get(
+                    "pretrained_model_name", "microsoft/codebert-base"
+                ),
+                max_length=params.get("max_length", 128),
+                batch_size=params.get("batch_size", 16),
+                lr=params.get("lr", 2e-5),
+                num_epochs=params.get("num_epochs", 5),
+                warmup_ratio=params.get("warmup_ratio", 0.1),
+                pos_weight_cap=params.get("pos_weight_cap", 30.0),
+                threshold=params.get("threshold", 0.5),
+                preprocessing=params.get("preprocessing", False),
+                preprocessing_factor=params.get("preprocessing_factor", 1.0),
+            )
+            logger.info(
+                "Starting transformer training for language '%s' with config: %s",
+                lang,
+                cfg,
+            )
+            trainer = TransformerTrainer(cfg)
+            metrics = trainer.run()
+            logger.info("Final transformer metrics for %s: %s", lang, metrics)
+            for name, value in metrics.items():
+                mlflow.log_metric(f"final_{name}", value)
+            mlflow.end_run()
+    else:
+        raise ValueError(f"Unsupported model_type: {model_type}")
+    print(f"Model for {lang}-{model_type} saved to {model_output_path}")
+if __name__ == "__main__":
+    # CLI entry point: train one model type for one language, with
+    # hyperparameters taken from the matching section of params.yaml.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--lang", type=str, required=True)
+    parser.add_argument("--model_type", type=str, required=True)
+    args = parser.parse_args()
+    # Decode explicitly as UTF-8 so parsing does not depend on the platform locale.
+    with open("params.yaml", "r", encoding="utf-8") as f:
+        # An empty YAML file loads as None; treat it as "no sections".
+        all_params = yaml.safe_load(f) or {}
+    if args.model_type not in all_params:
+        raise KeyError(
+            f"No '{args.model_type}' section in params.yaml; "
+            f"available sections: {list(all_params)}"
+        )
+    # A section that is present but empty loads as None; fall back to no overrides.
+    # The shallow copy keeps train_model from mutating the loaded YAML mapping.
+    model_params = dict(all_params[args.model_type] or {})
+    train_model(
+        lang=args.lang,
+        model_type=args.model_type,
+        data_path="data/raw",
+        model_output_path=f"models/{args.lang}/{args.model_type}",
+        params=model_params,
+    )

codecommentclassification/modeling/transformer/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+"""Transformer model trainer module."""
+import logging
+from .trainer import TransformerConfig, TransformerTrainer
+# Package-level logger; the submodules create their own loggers via
+# logging.getLogger(__name__), which are children of this one.
+logger = logging.getLogger(__name__)
+# A NullHandler keeps the package from emitting "no handler" output on its
+# own; the application decides where log records go.
+logger.addHandler(logging.NullHandler())
+# Names re-exported as the package's public API.
+__all__ = ["TransformerConfig", "TransformerTrainer"]

codecommentclassification/modeling/transformer/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (562 Bytes). View file

codecommentclassification/modeling/transformer/__pycache__/preprocessing.cpython-311.pyc ADDED Viewed

Binary file (10.8 kB). View file

codecommentclassification/modeling/transformer/__pycache__/trainer.cpython-311.pyc ADDED Viewed

Binary file (26.7 kB). View file

codecommentclassification/modeling/transformer/preprocessing.py ADDED Viewed

	@@ -0,0 +1,208 @@

+"""Preprocessing helpers for transformer training.
+This module provides utilities to parse multi-label strings, ensure the
+`combo` column exists, perform label-aware supersampling of a training
+DataFrame, and a light-weight `load_or_prepare_data` entrypoint that loads
+raw CSVs, optionally applies preprocessing, and writes processed CSVs.
+"""
+import logging
+import os
+from typing import Tuple
+import numpy as np
+import pandas as pd
+logger = logging.getLogger(__name__)
+def parse_label_str(s: str) -> np.ndarray:
+    """Convert a string like '[0 0 1 0 0 0 0]' into a float32 numpy array.
+    Brackets and commas are treated as separators, so '[0 0 1]', ' [0, 0, 1] '
+    and multi-line renderings of the same vector all parse identically.
+    Parameters
+    ----------
+    s : str
+        Textual multi-hot vector; non-string inputs are passed through `str()`.
+    Returns
+    -------
+    np.ndarray
+        1-D float32 array with one entry per token (empty for '[]' or '').
+    Raises
+    ------
+    ValueError
+        If a token cannot be converted to a float.
+    """
+    text = str(s)
+    # Normalise every structural character to whitespace, then split once.
+    # This avoids the partial/empty results that strip("[]") + np.fromstring
+    # produced when the string had surrounding whitespace or commas.
+    for sep in "[],":
+        text = text.replace(sep, " ")
+    return np.array(text.split(), dtype=np.float32)
+def ensure_combo_column(df: pd.DataFrame) -> pd.DataFrame:
+    """Return a frame that is guaranteed to carry a 'combo' text column.
+    When 'combo' is already present the input frame itself is returned.
+    Otherwise a copy is returned whose 'combo' column is the string
+    '<comment_sentence> | <class>' built row by row; the input is not modified.
+    """
+    if "combo" in df.columns:
+        logger.info("Column 'combo' already present, reusing it.")
+        return df
+    logger.info("Column 'combo' not found, creating it from 'comment_sentence' and 'class'.")
+    # Work on a copy so the caller's frame keeps its original columns.
+    with_combo = df.copy()
+    sentence_text = with_combo["comment_sentence"].astype(str)
+    class_text = with_combo["class"].astype(str)
+    with_combo["combo"] = sentence_text + " | " + class_text
+    return with_combo
+def supersample_dataframe(
+    df: pd.DataFrame,
+    factor: float,
+    random_state: int = 42,
+) -> pd.DataFrame:
+    """Offline label-aware supersampling of the training DataFrame.
+    - Keeps all original rows.
+    - For each label j, duplicates rows that contain that label until:
+          target_j = min(max_freq, freq_j * factor)
+      where freq_j is the original count for label j and max_freq is the
+      maximum frequency across labels.
+    - Shuffles the resulting indices.
+    The number of rows added for a label is computed from the ORIGINAL
+    frequencies only. Because whole rows are duplicated, a row added for one
+    label also raises the count of every other label it carries, so the
+    final frequencies (logged at the end) can exceed the targets.
+    Assumes:
+      - df['labels'] is a string representation of a multi-hot vector.
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Training frame with a 'labels' column.
+    factor : float
+        Multiplier applied to each label frequency; values <= 1.0 disable
+        supersampling.
+    random_state : int
+        Seed for the NumPy generator used to pick and shuffle rows.
+    Returns
+    -------
+    pd.DataFrame
+        A new frame (never the input object) with a fresh 0..n-1 index.
+    """
+    if factor <= 1.0:
+        logger.info(
+            "Supersampling factor <= 1.0 (%.2f), returning original DataFrame.",
+            factor,
+        )
+        return df.copy()
+    # np.stack raises on an empty sequence, so bail out before parsing labels.
+    if len(df) == 0:
+        logger.warning("Empty DataFrame, skipping supersampling.")
+        return df.copy()
+    rng = np.random.default_rng(random_state)
+    labels_array = np.stack(df["labels"].map(parse_label_str).values)
+    if labels_array.ndim == 1:
+        labels_array = labels_array[:, None]
+    num_samples, num_labels = labels_array.shape
+    freq = labels_array.sum(axis=0).astype(int)
+    max_freq = int(freq.max())
+    logger.info("Original label frequencies: %s", freq.tolist())
+    logger.info("Max label frequency: %d", max_freq)
+    if max_freq == 0:
+        logger.warning("All label frequencies are zero, skipping supersampling.")
+        return df.copy()
+    # Never push a label beyond the most frequent one.
+    target = np.minimum(max_freq, (freq * factor).astype(int))
+    logger.info(
+        "Target label frequencies after supersampling (capped by max_freq): %s",
+        target.tolist(),
+    )
+    indices_by_label = {j: np.where(labels_array[:, j] == 1)[0] for j in range(num_labels)}
+    # Start from every original row; duplicates are appended per label below.
+    new_indices = list(range(num_samples))
+    for j in range(num_labels):
+        current = int(freq[j])
+        desired = int(target[j])
+        if desired <= current:
+            continue
+        candidate_indices = indices_by_label[j]
+        if candidate_indices.size == 0:
+            continue
+        needed = desired - current
+        # Sampling with replacement: the same row may be duplicated several times.
+        extra = rng.choice(candidate_indices, size=needed, replace=True)
+        new_indices.extend(extra.tolist())
+        logger.info(
+            "Label %d: current=%d, target=%d, added=%d samples.",
+            j,
+            current,
+            desired,
+            needed,
+        )
+    rng.shuffle(new_indices)
+    df_sup = df.iloc[new_indices].reset_index(drop=True)
+    # Re-parse only to report the frequencies actually reached.
+    labels_array_after = np.stack(df_sup["labels"].map(parse_label_str).values)
+    freq_after = labels_array_after.sum(axis=0).astype(int)
+    logger.info("Final label frequencies after supersampling: %s", freq_after.tolist())
+    logger.info("Training rows before: %d, after: %d", num_samples, len(df_sup))
+    return df_sup
+def load_or_prepare_data(
+    lang: str,
+    raw_data_dir: str,
+    processed_data_dir: str,
+    preprocessing_enabled: bool,
+    preprocessing_factor: float,
+    random_state: int = 42,
+) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
+    """Load the raw train/test CSVs for a language and prepare them for training.
+    Both splits get a 'combo' column; the train split is optionally
+    supersampled; the resulting frames are written to `processed_data_dir`
+    and then given a 'labels_array' column parsed from 'labels'.
+    - Test split is NEVER supersampled or augmented.
+    - Train split is supersampled only if preprocessing_enabled=True and
+      preprocessing_factor>1.0.
+    - 'labels_array' is added after the CSVs are saved, so it is not part of
+      the files on disk.
+    Parameters
+    ----------
+    lang : str
+        Language key (e.g., 'java', 'python', 'pharo').
+    raw_data_dir : str
+        Directory containing {lang}_train.csv and {lang}_test.csv.
+    processed_data_dir : str
+        Directory where processed CSVs will be saved (created if missing).
+    preprocessing_enabled : bool
+        Whether to apply supersampling on the training split.
+    preprocessing_factor : float
+        Supersampling factor (ignored if preprocessing_enabled=False).
+    random_state : int
+        RNG seed.
+    Returns
+    -------
+    train_df : pd.DataFrame
+    eval_df : pd.DataFrame
+    preprocessing_used : str
+        One of: 'none', 'supersampling'.
+    Raises
+    ------
+    FileNotFoundError
+        If either raw CSV does not exist.
+    """
+    logger.info("Loading raw CSVs for language '%s' from '%s'.", lang, raw_data_dir)
+    train_csv = os.path.join(raw_data_dir, f"{lang}_train.csv")
+    test_csv = os.path.join(raw_data_dir, f"{lang}_test.csv")
+    # Fail early, train first, before anything is read.
+    if not os.path.exists(train_csv):
+        raise FileNotFoundError(f"Raw train CSV not found: {train_csv}")
+    if not os.path.exists(test_csv):
+        raise FileNotFoundError(f"Raw test CSV not found: {test_csv}")
+    raw_train = pd.read_csv(train_csv)
+    raw_eval = pd.read_csv(test_csv)
+    train_df = ensure_combo_column(raw_train)
+    eval_df = ensure_combo_column(raw_eval)
+    use_supersampling = preprocessing_enabled and preprocessing_factor > 1.0
+    if not use_supersampling:
+        logger.info(
+            "Preprocessing disabled or factor <= 1.0 (%.2f). Using original training data.",
+            preprocessing_factor,
+        )
+        preprocessing_used = "none"
+    else:
+        logger.info(
+            "Preprocessing enabled: applying supersampling with factor=%.2f.",
+            preprocessing_factor,
+        )
+        train_df = supersample_dataframe(
+            train_df,
+            factor=preprocessing_factor,
+            random_state=random_state,
+        )
+        preprocessing_used = "supersampling"
+    # Persist the processed splits (for inspection / reproducibility).
+    os.makedirs(processed_data_dir, exist_ok=True)
+    train_df.to_csv(os.path.join(processed_data_dir, f"{lang}_train.csv"), index=False)
+    eval_df.to_csv(os.path.join(processed_data_dir, f"{lang}_test.csv"), index=False)
+    logger.info("Saved processed train/test CSVs to '%s'.", processed_data_dir)
+    # Attach parsed label vectors to whichever split does not carry them yet.
+    for split_name, frame in (("train", train_df), ("test", eval_df)):
+        if "labels_array" in frame.columns:
+            continue
+        logger.info("Parsing label strings into arrays for split '%s'.", split_name)
+        frame["labels_array"] = frame["labels"].apply(parse_label_str)
+    return train_df, eval_df, preprocessing_used

codecommentclassification/modeling/transformer/trainer.py ADDED Viewed

	@@ -0,0 +1,531 @@

+"""Training utilities for transformer-based multi-label classification.
+This module contains a small training harness around HuggingFace
+`AutoModelForSequenceClassification` specialized for the project's
+multi-label code-comment classification task. It provides:
+- `TransformerConfig` dataclass for configurable training runs.
+- `CommentDataset` to wrap tokenization of pandas DataFrames.
+- `TransformerTrainer` which runs the training loop, evaluation and
+    model export (with MLflow logging hooks).
+The helpers are intended for experimental, small-scale training and
+instrumentation rather than production-grade distributed training.
+"""
+from dataclasses import asdict, dataclass
+import logging
+import os
+from typing import Dict, List, Tuple
+import mlflow
+import numpy as np
+import pandas as pd
+from sklearn.metrics import (
+    accuracy_score,
+    classification_report,
+    f1_score,
+    precision_score,
+    recall_score,
+)
+import torch
+from torch.utils.data import DataLoader, Dataset
+from tqdm.auto import tqdm
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    get_linear_schedule_with_warmup,
+)
+from .preprocessing import load_or_prepare_data
+# Module-level logger for the training harness.
+logger = logging.getLogger(__name__)
+# Resolved once at import time: CUDA when torch reports it available, else CPU.
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# NOTE: this prints as a side effect of importing the module; the same
+# information is also logged in TransformerTrainer.__init__.
+print(f"Using device: {DEVICE}")
+# Label names per language, order must match the label vector in the CSV.
+# The tuple length is what TransformerTrainer uses as `num_labels`, and the
+# names are passed as `target_names` to the final classification report.
+LABELS: Dict[str, Tuple[str, ...]] = {
+    "java": (
+        "summary",
+        "Ownership",
+        "Expand",
+        "usage",
+        "Pointer",
+        "deprecation",
+        "rational",
+    ),
+    "python": (
+        "Usage",
+        "Parameters",
+        "DevelopmentNotes",
+        "Expand",
+        "Summary",
+    ),
+    "pharo": (
+        "Keyimplementationpoints",
+        "Example",
+        "Responsibilities",
+        "Intent",
+        "Keymessages",
+        "Collaborators",
+    ),
+}
+@dataclass
+class TransformerConfig:
+    """Configuration for transformer training runs.
+    Attributes are intentionally simple dataclass fields and map directly to
+    CLI/YAML configuration keys used by the training harness. Numeric fields
+    are coerced in `__post_init__`, so values that arrive as strings (e.g.
+    '2e-5' read from YAML) are accepted; a value that cannot be converted
+    raises ValueError/TypeError from the corresponding `int()`/`float()` call.
+    """
+    # Language key; TransformerTrainer requires it to be a key of LABELS.
+    lang: str
+    # Directory holding the raw {lang}_train.csv / {lang}_test.csv files.
+    raw_data_dir: str
+    # Directory where the processed CSVs are written.
+    processed_data_dir: str
+    # Directory the trained model and tokenizer are saved to.
+    model_output_path: str
+    pretrained_model_name: str = "microsoft/codebert-base"
+    # Tokenizer padding/truncation length.
+    max_length: int = 128
+    batch_size: int = 16
+    lr: float = 2e-5
+    num_epochs: int = 5
+    # Fraction of total optimizer steps used for linear warmup.
+    warmup_ratio: float = 0.1
+    # Upper bound applied to the per-label positive class weight.
+    pos_weight_cap: float = 30.0
+    # Sigmoid probability above which a label is predicted.
+    threshold: float = 0.5
+    # Whether to supersample the training split.
+    preprocessing: bool = False
+    preprocessing_factor: float = 1.0
+    def __post_init__(self) -> None:
+        """Force correct types even if YAML provides strings."""
+        self.max_length = int(self.max_length)
+        self.batch_size = int(self.batch_size)
+        self.lr = float(self.lr)
+        self.num_epochs = int(self.num_epochs)
+        self.warmup_ratio = float(self.warmup_ratio)
+        self.pos_weight_cap = float(self.pos_weight_cap)
+        self.threshold = float(self.threshold)
+        self.preprocessing_factor = float(self.preprocessing_factor)
+        if isinstance(self.preprocessing, str):
+            # Accept the usual textual spellings of "true"; surrounding
+            # whitespace and case are ignored. Anything else means False.
+            self.preprocessing = self.preprocessing.strip().lower() in (
+                "true",
+                "1",
+                "yes",
+                "y",
+                "on",
+            )
+        else:
+            # Normalise ints/None to a real bool so the field type is reliable.
+            self.preprocessing = bool(self.preprocessing)
+class CommentDataset(Dataset):
+    """Torch dataset over a DataFrame holding 'combo' text and 'labels_array' vectors."""
+    def __init__(self, df: pd.DataFrame, tokenizer: AutoTokenizer, max_length: int):
+        """Store the frame and tokenizer; rows are tokenized lazily on access.
+        Parameters
+        ----------
+        df : pandas.DataFrame
+            Input frame containing at least `combo` and `labels_array` columns.
+            Its index is reset so positional access is 0..len-1.
+        tokenizer : transformers.AutoTokenizer
+            Tokenizer used to encode text into model inputs.
+        max_length : int
+            Maximum tokenization length (used for padding/truncation).
+        """
+        self.max_length = max_length
+        self.tokenizer = tokenizer
+        self.df = df.reset_index(drop=True)
+    def __len__(self) -> int:
+        """Return how many rows the wrapped frame has."""
+        return len(self.df)
+    def __getitem__(self, idx: int):
+        """Tokenize row `idx` and return its inputs plus a float32 `labels` tensor.
+        Every tensor produced by the tokenizer has its leading dimension
+        squeezed away (presumably the batch axis added by return_tensors="pt"),
+        and the row's `labels_array` is exposed under the `labels` key.
+        """
+        record = self.df.iloc[idx]
+        text = str(record["combo"])
+        target = np.asarray(record["labels_array"], dtype=np.float32)
+        encoded = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        sample = {}
+        for key, tensor in encoded.items():
+            sample[key] = tensor.squeeze(0)
+        sample["labels"] = torch.from_numpy(target)
+        return sample
+class TransformerTrainer:
+    """End-to-end transformer trainer for the code comment multi-label task.
+    Construction loads the data, tokenizer and pretrained model and builds the
+    loss, optimizer and LR schedule; `run()` trains for `cfg.num_epochs`,
+    keeps the weights with the best eval macro-F1, saves them and logs to MLflow.
+    """
+    def __init__(self, cfg: TransformerConfig) -> None:
+        """Initialize training state, data loaders, model and optimizer.
+        Parameters
+        ----------
+        cfg : TransformerConfig
+            Training configuration containing data paths and hyperparameters.
+        Raises
+        ------
+        ValueError
+            If `cfg.lang` has no entry in `LABELS`.
+        """
+        self.cfg = cfg
+        if cfg.lang not in LABELS:
+            raise ValueError(f"No LABELS defined for language '{cfg.lang}'.")
+        self.label_names = LABELS[cfg.lang]
+        self.num_labels = len(self.label_names)
+        logger.info("Initializing TransformerTrainer for language '%s'.", cfg.lang)
+        logger.info("Raw data directory: %s", cfg.raw_data_dir)
+        logger.info("Processed data directory: %s", cfg.processed_data_dir)
+        logger.info("Model output path: %s", cfg.model_output_path)
+        # --- data loading / preprocessing ---
+        self.train_df, self.eval_df, self.preprocessing_used = load_or_prepare_data(
+            lang=cfg.lang,
+            raw_data_dir=cfg.raw_data_dir,
+            processed_data_dir=cfg.processed_data_dir,
+            preprocessing_enabled=cfg.preprocessing,
+            preprocessing_factor=cfg.preprocessing_factor,
+            random_state=42,
+        )
+        logger.info("Preprocessing used for this run: %s", self.preprocessing_used)
+        logger.info("Using device: %s", DEVICE)
+        logger.info(
+            "Train size: %d rows, Eval size: %d rows",
+            len(self.train_df),
+            len(self.eval_df),
+        )
+        # --- log config and dataset info to MLflow ---
+        # Best effort on purpose: a tracking failure must not abort training.
+        try:
+            cfg_dict = asdict(self.cfg)
+            mlflow.log_params({f"cfg_{k}": v for k, v in cfg_dict.items()})
+            mlflow.log_param("num_labels", self.num_labels)
+            mlflow.log_param("label_names", ",".join(self.label_names))
+            mlflow.log_param("train_samples", len(self.train_df))
+            mlflow.log_param("eval_samples", len(self.eval_df))
+            mlflow.log_param("preprocessing_used", self.preprocessing_used)
+        except Exception as e:
+            logger.warning("Could not log transformer config to MLflow: %s", e)
+        # tokenizer
+        logger.info("Loading tokenizer '%s'.", cfg.pretrained_model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_model_name)
+        # label statistics and pos_weight
+        y_train = np.stack(self.train_df["labels_array"].to_numpy())
+        self.pos_weight = self._compute_pos_weight(y_train)
+        # dataloaders
+        train_dataset = CommentDataset(self.train_df, self.tokenizer, cfg.max_length)
+        eval_dataset = CommentDataset(self.eval_df, self.tokenizer, cfg.max_length)
+        self.train_loader = DataLoader(
+            train_dataset,
+            batch_size=cfg.batch_size,
+            shuffle=True,
+        )
+        self.eval_loader = DataLoader(
+            eval_dataset,
+            batch_size=cfg.batch_size,
+            shuffle=False,
+        )
+        logger.info(
+            "Hyperparameters – lr=%s (type=%s), batch_size=%s, num_epochs=%s",
+            self.cfg.lr,
+            type(self.cfg.lr),
+            self.cfg.batch_size,
+            self.cfg.num_epochs,
+        )
+        # model
+        logger.info("Loading base model '%s'.", cfg.pretrained_model_name)
+        self.model = AutoModelForSequenceClassification.from_pretrained(
+            cfg.pretrained_model_name,
+            num_labels=self.num_labels,
+            problem_type="multi_label_classification",
+        ).to(DEVICE)
+        # The loss is computed here (not by the HF model) so the per-label
+        # positive weights can be applied.
+        self.loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=self.pos_weight.to(DEVICE))
+        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.cfg.lr)
+        # The scheduler is stepped once per batch, hence epochs * batches.
+        num_training_steps = cfg.num_epochs * len(self.train_loader)
+        num_warmup_steps = int(cfg.warmup_ratio * num_training_steps)
+        logger.info(
+            "Total training steps: %d, warmup steps: %d.",
+            num_training_steps,
+            num_warmup_steps,
+        )
+        self.scheduler = get_linear_schedule_with_warmup(
+            self.optimizer,
+            num_warmup_steps=num_warmup_steps,
+            num_training_steps=num_training_steps,
+        )
+        # Snapshot of the best weights seen so far (None until an epoch
+        # achieves macro-F1 > 0).
+        self.best_state_dict = None
+        self.best_val_macro_f1 = 0.0
+    def _compute_pos_weight(self, y: np.ndarray) -> torch.Tensor:
+        """Return per-label positive weights: negatives/positives, clipped to [1, cap].
+        Labels with no positive example use a denominator of 1 instead of 0.
+        """
+        if y.ndim == 1:
+            y = y[:, None]
+        freq = y.sum(axis=0).astype(np.float64)
+        num_samples = y.shape[0]
+        pos_weight = (num_samples - freq) / np.clip(freq, 1.0, None)
+        pos_weight = np.clip(pos_weight, 1.0, self.cfg.pos_weight_cap)
+        logger.info("Positive class weights (clipped): %s", pos_weight.tolist())
+        return torch.tensor(pos_weight, dtype=torch.float32)
+    def _step_batch(self, batch, train: bool):
+        """Run the model on one batch and, when `train` is True, take an optimizer step.
+        Returns the tuple (loss, logits, labels) with tensors on `DEVICE`.
+        """
+        batch = {k: v.to(DEVICE) for k, v in batch.items()}
+        # Labels are removed so the model does not compute its own loss.
+        labels = batch.pop("labels")
+        outputs = self.model(**batch)
+        logits = outputs.logits
+        loss = self.loss_fn(logits, labels)
+        if train:
+            loss.backward()
+            # Clip the global gradient norm to 1.0 before the update.
+            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
+            self.optimizer.step()
+            self.scheduler.step()
+            self.optimizer.zero_grad()
+        return loss, logits, labels
+    def train_one_epoch(self, epoch: int) -> float:
+        """Run a single training epoch over `self.train_loader`.
+        Parameters
+        ----------
+        epoch : int
+            Epoch number, used for logging and as the MLflow step.
+        Returns
+        -------
+        float
+            The average training loss over the epoch (sample-weighted).
+        """
+        self.model.train()
+        total_loss = 0.0
+        n_samples = 0
+        num_batches = len(self.train_loader)
+        logger.info("Starting epoch %d training. Number of batches: %d", epoch, num_batches)
+        progress_bar = tqdm(
+            self.train_loader,
+            desc=f"Epoch {epoch} [train]",
+            total=num_batches,
+            leave=False,
+        )
+        for step, batch in enumerate(progress_bar, start=1):
+            loss, _, _ = self._step_batch(batch, train=True)
+            batch_size = batch["input_ids"].size(0)
+            # Weight by batch size so a short final batch does not skew the mean.
+            total_loss += loss.item() * batch_size
+            n_samples += batch_size
+            avg_loss_so_far = total_loss / max(n_samples, 1)
+            progress_bar.set_postfix({"loss": f"{avg_loss_so_far:.4f}"})
+        avg_loss = total_loss / max(n_samples, 1)
+        logger.info("Epoch %d training completed. Average loss: %.4f.", epoch, avg_loss)
+        mlflow.log_metric("train_loss", avg_loss, step=epoch)
+        return avg_loss
+    def evaluate(
+        self,
+        epoch: int,
+        split_name: str = "eval",
+    ) -> Tuple[float, float, float, np.ndarray, np.ndarray]:
+        """Evaluate the model on `self.eval_loader` and compute metrics.
+        Predictions are obtained by thresholding sigmoid probabilities at
+        `cfg.threshold`. All metrics are logged to MLflow under keys
+        prefixed with `split_name`, using `epoch` as the step.
+        Parameters
+        ----------
+        epoch : int
+            Current epoch number (used for logging).
+        split_name : str
+            Name of the evaluation split used for MLflow metric keys.
+        Returns
+        -------
+        tuple
+            (avg_loss, micro_f1, macro_f1, y_true, y_pred)
+        """
+        self.model.eval()
+        total_loss = 0.0
+        n_samples = 0
+        all_preds: List[np.ndarray] = []
+        all_labels: List[np.ndarray] = []
+        logger.info("Starting evaluation for epoch %d on split '%s'.", epoch, split_name)
+        num_batches = len(self.eval_loader)
+        progress_bar = tqdm(
+            self.eval_loader,
+            desc=f"Epoch {epoch} [{split_name}]",
+            total=num_batches,
+            leave=False,
+        )
+        with torch.no_grad():
+            for batch in progress_bar:
+                loss, logits, labels = self._step_batch(batch, train=False)
+                batch_size = logits.size(0)
+                total_loss += loss.item() * batch_size
+                n_samples += batch_size
+                probs = torch.sigmoid(logits)
+                preds = (probs > self.cfg.threshold).long()
+                all_preds.append(preds.cpu().numpy())
+                all_labels.append(labels.cpu().numpy())
+                avg_loss_so_far = total_loss / max(n_samples, 1)
+                progress_bar.set_postfix({"loss": f"{avg_loss_so_far:.4f}"})
+        avg_loss = total_loss / max(n_samples, 1)
+        y_pred = np.concatenate(all_preds, axis=0)
+        y_true = np.concatenate(all_labels, axis=0)
+        # F1
+        micro_f1 = f1_score(y_true, y_pred, average="micro", zero_division=0)
+        macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
+        # Precision
+        micro_precision = precision_score(y_true, y_pred, average="micro", zero_division=0)
+        macro_precision = precision_score(y_true, y_pred, average="macro", zero_division=0)
+        # Recall
+        micro_recall = recall_score(y_true, y_pred, average="micro", zero_division=0)
+        macro_recall = recall_score(y_true, y_pred, average="macro", zero_division=0)
+        # Accuracy (multi-label)
+        # subset_accuracy = exact match of all labels for each sample
+        subset_accuracy = accuracy_score(y_true, y_pred)
+        # micro_accuracy = accuracy over flattened label indicators
+        micro_accuracy = accuracy_score(y_true.flatten(), y_pred.flatten())
+        logger.info(
+            "Eval results [%s] - loss: %.4f | "
+            "micro-F1: %.4f, macro-F1: %.4f | "
+            "micro-P: %.4f, macro-P: %.4f | "
+            "micro-R: %.4f, macro-R: %.4f | "
+            "subset-acc: %.4f, micro-acc: %.4f",
+            split_name,
+            avg_loss,
+            micro_f1,
+            macro_f1,
+            micro_precision,
+            macro_precision,
+            micro_recall,
+            macro_recall,
+            subset_accuracy,
+            micro_accuracy,
+        )
+        # MLflow logging (per epoch)
+        mlflow.log_metric(f"{split_name}_loss", avg_loss, step=epoch)
+        mlflow.log_metric(f"{split_name}_micro_f1", micro_f1, step=epoch)
+        mlflow.log_metric(f"{split_name}_macro_f1", macro_f1, step=epoch)
+        mlflow.log_metric(f"{split_name}_micro_precision", micro_precision, step=epoch)
+        mlflow.log_metric(f"{split_name}_macro_precision", macro_precision, step=epoch)
+        mlflow.log_metric(f"{split_name}_micro_recall", micro_recall, step=epoch)
+        mlflow.log_metric(f"{split_name}_macro_recall", macro_recall, step=epoch)
+        mlflow.log_metric(f"{split_name}_subset_accuracy", subset_accuracy, step=epoch)
+        mlflow.log_metric(f"{split_name}_micro_accuracy", micro_accuracy, step=epoch)
+        return avg_loss, micro_f1, macro_f1, y_true, y_pred
+    def run(self) -> Dict[str, float]:
+        """Execute the full training loop and save the best model.
+        After every epoch the model is evaluated; the weights with the highest
+        eval macro-F1 are snapshotted, restored at the end, re-evaluated,
+        saved to `cfg.model_output_path` together with the tokenizer, and
+        logged to MLflow.
+        Returns
+        -------
+        dict
+            Summary metrics from the final evaluation (micro/macro F1).
+        """
+        logger.info("Starting training loop for %d epochs.", self.cfg.num_epochs)
+        for epoch in range(1, self.cfg.num_epochs + 1):
+            train_loss = self.train_one_epoch(epoch)
+            val_loss, val_micro_f1, val_macro_f1, _, _ = self.evaluate(epoch, split_name="eval")
+            logger.info(
+                "[%s] epoch=%d train_loss=%.4f val_loss=%.4f val_micro_f1=%.4f val_macro_f1=%.4f",
+                self.cfg.lang,
+                epoch,
+                train_loss,
+                val_loss,
+                val_micro_f1,
+                val_macro_f1,
+            )
+            if val_macro_f1 > self.best_val_macro_f1:
+                logger.info(
+                    "New best macro-F1: %.4f (previous: %.4f). Saving current model state.",
+                    val_macro_f1,
+                    self.best_val_macro_f1,
+                )
+                self.best_val_macro_f1 = val_macro_f1
+                # clone() is essential: Tensor.cpu() returns the SAME tensor
+                # when the model already lives on the CPU, so without a copy
+                # the snapshot would alias the live weights and be overwritten
+                # by the following epochs.
+                self.best_state_dict = {
+                    k: v.detach().cpu().clone() for k, v in self.model.state_dict().items()
+                }
+        if self.best_state_dict is not None:
+            logger.info("Loading best model weights (macro-F1 = %.4f).", self.best_val_macro_f1)
+            self.model.load_state_dict(self.best_state_dict)
+        # final evaluation
+        # NOTE: this logs the "eval_*" metrics a second time at step=num_epochs.
+        _, micro_f1, macro_f1, y_true, y_pred = self.evaluate(
+            epoch=self.cfg.num_epochs,
+            split_name="eval",
+        )
+        logger.info(
+            "[%s] FINAL micro-F1 = %.4f, macro-F1 = %.4f.",
+            self.cfg.lang,
+            micro_f1,
+            macro_f1,
+        )
+        logger.info(
+            "Per-label classification report:\n%s",
+            classification_report(y_true, y_pred, target_names=self.label_names, zero_division=0),
+        )
+        # save model and tokenizer
+        os.makedirs(self.cfg.model_output_path, exist_ok=True)
+        logger.info("Saving model and tokenizer to '%s'.", self.cfg.model_output_path)
+        self.model.save_pretrained(self.cfg.model_output_path)
+        self.tokenizer.save_pretrained(self.cfg.model_output_path)
+        # log model directory as MLflow artifact
+        # NOTE(review): the raw directory and the mlflow.transformers model
+        # below are both logged under the same artifact_path; confirm that
+        # sharing the location is intended.
+        logger.info("Logging final model artifacts to MLflow.")
+        mlflow.log_artifacts(
+            self.cfg.model_output_path,
+            artifact_path=f"{self.cfg.lang}_transformer_model",
+        )
+        logger.info("Logging HF transformers model to MLflow via mlflow.transformers.log_model.")
+        model_info = mlflow.transformers.log_model(
+            transformers_model=self.cfg.model_output_path,
+            artifact_path=f"{self.cfg.lang}_transformer_model",
+            task="text-classification",
+        )
+        logger.info(
+            "Logged transformers model to MLflow with URI: %s",
+            model_info.model_uri,
+        )
+        return {
+            "micro_f1": float(micro_f1),
+            "macro_f1": float(macro_f1),
+        }

codecommentclassification/modeling/utils.py ADDED Viewed

	@@ -0,0 +1,70 @@

+"""Utility functions for model training and evaluation."""
+import os
+from typing import List
+LANGS: List[str] = ["java", "python", "pharo"]
+def load_dataset_splits(base_dir=None, langs=None):
+    """Load dataset splits from CSV files under data/raw.
+    Expects files like data/raw/java_train.csv, data/raw/java_test.csv, etc.
+    Args:
+        base_dir: Directory holding the CSVs; defaults to "data/raw".
+        langs: Iterable of language keys to load; defaults to `LANGS`.
+    Returns:
+        A dict mapping split names (e.g. "java_test") to pandas DataFrames,
+        with a "train" and a "test" entry for every requested language.
+    Raises:
+        FileNotFoundError: if the base directory or an expected file does not exist.
+        ImportError: if pandas is not installed.
+    """
+    if base_dir is None:
+        base_dir = os.path.join("data", "raw")
+    if langs is None:
+        langs = LANGS
+    if not os.path.isdir(base_dir):
+        raise FileNotFoundError(
+            f"CSV datasets not found under {base_dir}; cannot load dataset splits."
+        )
+    # pandas is imported lazily so this module can be imported without it.
+    # Only a genuine import failure is translated; other errors propagate.
+    try:
+        import pandas as pd
+    except ImportError as e:
+        raise ImportError("pandas is required to load dataset splits") from e
+    datasets = {}
+    for lang in langs:
+        for split in ("train", "test"):
+            fname = f"{lang}_{split}.csv"
+            path = os.path.join(base_dir, fname)
+            # Fail on the first missing file rather than returning a partial dict.
+            if not os.path.isfile(path):
+                raise FileNotFoundError(f"Expected dataset file missing: {path}")
+            datasets[f"{lang}_{split}"] = pd.read_csv(path)
+    return datasets
+def parse_labels_column(df):
+    """Parse the 'labels' column of a DataFrame into lists of integers.
+    The column is replaced IN PLACE on `df`, and the same object is returned.
+    Accepted cell values:
+      - strings such as "[0 1 0]", "[0, 1, 0]" or "0 1 0" (surrounding
+        brackets optional, items separated by whitespace and/or commas);
+      - numpy arrays, lists and tuples of values convertible with `int()`.
+    Raises:
+        ValueError: if a cell has an unsupported type or a token is not an integer.
+    """
+    def _parse_one(x):
+        if isinstance(x, str):
+            s = x.strip()
+            if s.startswith("[") and s.endswith("]"):
+                s = s[1:-1]
+            # Commas count as separators so "[1, 0]" parses like "[1 0]".
+            return [int(tok) for tok in s.replace(",", " ").split()]
+        # numpy is optional here; keep the try limited to the import itself.
+        try:
+            import numpy as np
+        except ImportError:
+            np = None
+        if np is not None and isinstance(x, np.ndarray):
+            return [int(v) for v in x.tolist()]
+        if isinstance(x, (list, tuple)):
+            return [int(v) for v in x]
+        raise ValueError(f"Formato labels non gestito: {type(x)} -> {x!r}")
+    df["labels"] = df["labels"].apply(_parse_one)
+    return df

codecommentclassification/predictor.py ADDED Viewed

	@@ -0,0 +1,149 @@

+"""Prediction helpers for different model types.
+This module provides `ModelPredictor`, a lightweight wrapper that unifies
+inference for SetFit, scikit-learn RandomForest pipelines, and HuggingFace
+transformer sequence classification models. It standardizes inputs/outputs
+to a NumPy array of shape (n_samples, n_labels).
+"""
+import os
+from typing import List, Union
+import joblib
+import numpy as np
+from setfit import SetFitModel
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+TextInput = Union[str, List[str]]
+class ModelPredictor:
+    """Unified predictor for SetFit, Random Forest and Transformer models.
+    Expected directory layout:
+        models/
+          ├── java/
+          │     ├── setfit/                # SetFit saved model directory
+          │     ├── random_forest.joblib   # sklearn pipeline
+          │     └── transformer/           # HF model + tokenizer (config.json, etc.)
+          ├── python/
+          │     ├── setfit/
+          │     ├── random_forest.joblib
+          │     └── transformer/
+          └── pharo/
+                ├── setfit/
+                ├── random_forest.joblib
+                └── transformer/
+    """
+    def __init__(
+        self,
+        lang: str,
+        model_type: str,
+        model_root: str = "models",
+        threshold: float = 0.5,
+        max_length: int = 128,
+    ) -> None:
+        """Parameters
+        ----------
+        lang : str
+            One of {"java", "python", "pharo"}.
+        model_type : str
+            One of {"setfit", "random_forest", "transformer"}.
+        model_root : str
+            Root directory where models are stored.
+        threshold : float
+            Decision threshold for multi-label Transformer predictions.
+            Ignored for SetFit and Random Forest (they already output labels).
+        max_length : int
+            Max sequence length for Transformer tokenization.
+        """
+        self.lang = lang
+        self.model_type = model_type
+        self.model_root = model_root
+        self.threshold = float(threshold)
+        self.max_length = int(max_length)
+        # device only matters for Transformer
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        if model_type == "setfit":
+            model_path = os.path.join(self.model_root, self.lang, "setfit")
+            if not os.path.isdir(model_path):
+                raise FileNotFoundError(f"SetFit model not found at: {model_path}")
+            self.model = SetFitModel.from_pretrained(model_path)
+        elif model_type == "random_forest":
+            model_path = os.path.join(self.model_root, self.lang, "random_forest.joblib")
+            if not os.path.isfile(model_path):
+                raise FileNotFoundError(f"Random Forest model not found at: {model_path}")
+            self.model = joblib.load(model_path)
+        elif model_type == "transformer":
+            model_path = os.path.join(self.model_root, self.lang, "transformer")
+            if not os.path.isdir(model_path):
+                raise FileNotFoundError(f"Transformer model not found at: {model_path}")
+            # load tokenizer and model from the same directory used during training
+            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+            self.model = AutoModelForSequenceClassification.from_pretrained(model_path).to(
+                self.device
+            )
+            self.model.eval()
+        else:
+            raise ValueError(f"Unsupported model_type: {model_type}")
+    def predict(self, texts: TextInput) -> np.ndarray:
+        """Run prediction on one or many text samples.
+        Parameters
+        ----------
+        texts : str | list[str]
+            A single text or a list of texts.
+        Returns
+        -------
+        np.ndarray
+            Array of shape (n_samples, n_labels) with integer (typically binary) values.
+        """
+        if isinstance(texts, str):
+            texts = [texts]
+        if self.model_type == "setfit":
+            raw_outputs = self.model(texts)
+            outputs = np.array(list(raw_outputs), dtype=int)
+        elif self.model_type == "random_forest":
+            raw_outputs = self.model.predict(texts)
+            outputs = np.array(list(raw_outputs), dtype=int)
+        elif self.model_type == "transformer":
+            enc = self.tokenizer(
+                texts,
+                padding=True,
+                truncation=True,
+                max_length=self.max_length,
+                return_tensors="pt",
+            )
+            enc = {k: v.to(self.device) for k, v in enc.items()}
+            with torch.no_grad():
+                logits = self.model(**enc).logits
+                probs = torch.sigmoid(logits)
+                preds = (probs > self.threshold).long().cpu().numpy()
+            outputs = preds.astype(int)
+        else:
+            raise ValueError(f"Unsupported model_type: {self.model_type}")
+        # Ensure 2D shape (n_samples, n_labels)
+        if outputs.ndim == 1:
+            outputs = outputs.reshape(1, -1)
+        return outputs

requirements.txt ADDED Viewed

	@@ -0,0 +1,234 @@

+accelerate==1.11.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.13.1
+aiosignal==1.4.0
+alembic==1.17.0
+annotated-doc==0.0.3
+annotated-types==0.7.0
+anyio==4.11.0
+appdirs==1.4.4
+argon2-cffi==25.1.0
+argon2-cffi-bindings==25.1.0
+arrow==1.4.0
+asttokens==3.0.0
+async-lru==2.0.5
+attrs==25.4.0
+babel==2.17.0
+backoff==2.2.1
+beautifulsoup4==4.14.2
+bleach==6.2.0
+blinker==1.9.0
+boto3==1.40.60
+botocore==1.40.60
+cachetools==5.5.2
+certifi==2025.10.5
+cffi==2.0.0
+charset-normalizer==3.4.4
+click==8.3.0
+cloudpickle==3.1.1
+cmake==4.1.2
+comm==0.2.3
+contourpy==1.3.3
+cryptography==46.0.3
+cycler==0.12.1
+dacite==1.6.0
+dagshub==0.6.3
+dagshub-annotation-converter==0.1.15
+databricks-sdk==0.70.0
+dataclasses-json==0.6.7
+datasets==3.6.0
+debugpy==1.8.17
+decorator==5.2.1
+deepchecks[nlp]==0.19.1
+defusedxml==0.7.1
+dill==0.3.8
+docker==7.1.0
+dvc==3.63.0
+dvc-data==3.16.12
+dvc-http==2.32.0
+dvc-objects==5.1.2
+dvc-render==1.0.2
+dvc-s3==3.2.2
+dvc-studio-client==0.22.0
+dvc-task==0.40.2
+evaluate==0.4.6
+executing==2.2.1
+fastapi[standard]==0.120.1
+fastjsonschema==2.21.2
+filelock==3.20.0
+Flask==3.1.2
+flask-cors==6.0.1
+fonttools==4.60.1
+fqdn==1.5.1
+frozenlist==1.8.0
+fsspec==2025.3.0
+ghp-import==2.1.0
+gitdb==4.0.12
+GitPython==3.1.45
+google-auth==2.41.1
+gql==4.0.0
+graphene==3.4.3
+graphql-core==3.2.6
+graphql-relay==3.2.0
+great-expectations==1.9.0
+greenlet==3.2.4
+gunicorn==23.0.0
+h11==0.16.0
+hf-xet==1.2.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.36.0
+idna==3.11
+importlib_metadata==8.7.0
+iniconfig==2.3.0
+ipykernel==7.1.0
+ipython==9.6.0
+ipython_pygments_lexers==1.1.1
+isoduration==20.11.0
+itsdangerous==2.2.0
+jedi==0.19.2
+Jinja2==3.1.6
+jmespath==1.0.1
+joblib==1.5.2
+json5==0.12.1
+jsonpointer==3.0.0
+jsonschema==4.25.1
+jsonschema-specifications==2025.9.1
+jupyter-events==0.12.0
+jupyter-lsp==2.3.0
+jupyter_client==8.6.3
+jupyter_core==5.9.1
+jupyter_server==2.17.0
+jupyter_server_terminals==0.5.3
+jupyterlab==4.4.10
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.28.0
+kiwisolver==1.4.9
+lark==1.3.0
+lit==18.1.8
+lxml==6.0.2
+Mako==1.3.10
+Markdown==3.9
+markdown-it-py==4.0.0
+MarkupSafe==3.0.3
+marshmallow==3.26.1
+matplotlib==3.10.7
+matplotlib-inline==0.2.1
+mdurl==0.1.2
+mergedeep==1.3.4
+mistune==3.1.4
+mkdocs==1.6.1
+mkdocs-get-deps==0.2.0
+mlflow==2.22.2
+mlflow-skinny==2.22.2
+mlflow-tracing==3.5.1
+mpmath==1.3.0
+multidict==6.7.0
+multiprocess==0.70.16
+mypy_extensions==1.1.0
+nbclient==0.10.2
+nbconvert==7.16.6
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.5
+nltk==3.9.2
+notebook==7.4.7
+notebook_shim==0.2.4
+numpy==2.3.4
+opentelemetry-api==1.38.0
+opentelemetry-proto==1.38.0
+opentelemetry-sdk==1.38.0
+opentelemetry-semantic-conventions==0.59b0
+overrides==7.7.0
+packaging==24.2
+pandas==2.3.3
+pandocfilters==1.5.1
+parso==0.8.5
+pathspec==0.12.1
+pathvalidate==3.3.1
+pexpect==4.9.0
+pillow==12.0.0
+platformdirs==4.5.0
+pluggy==1.6.0
+pre-commit==4.4.0
+prometheus_client==0.23.1
+prompt_toolkit==3.0.52
+propcache==0.4.1
+protobuf==6.33.0
+psutil==7.1.2
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pyarrow==19.0.1
+pyasn1==0.6.1
+pyasn1_modules==0.4.2
+pycparser==2.23
+pydantic==2.12.3
+pydantic_core==2.41.4
+Pygments==2.19.2
+pyparsing==3.2.5
+pytest==8.4.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.2.1
+python-json-logger==4.0.0
+pytz==2025.2
+PyYAML==6.0.3
+pyyaml_env_tag==1.1
+pyzmq==27.1.0
+referencing==0.37.0
+regex==2025.10.23
+requests==2.32.5
+requests-toolbelt==1.0.0
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rfc3987-syntax==1.1.0
+rich==14.2.0
+rpds-py==0.28.0
+rsa==4.9.1
+ruff==0.14.2
+s3transfer==0.14.0
+safetensors==0.6.2
+scikit-learn==1.7.2
+scipy==1.16.2
+semver==3.0.4
+Send2Trash==1.8.3
+sentence-transformers==5.1.2
+setfit==1.1.2
+six==1.17.0
+smmap==5.0.2
+sniffio==1.3.1
+soupsieve==2.8
+SQLAlchemy==2.0.44
+sqlparse==0.5.3
+stack-data==0.6.3
+starlette==0.48.0
+sympy==1.14.0
+tenacity==9.1.2
+terminado==0.18.1
+threadpoolctl==3.6.0
+tinycss2==1.4.0
+tokenizers==0.22.1
+torch==2.7.1
+torchaudio==2.7.1
+torchvision==0.22.1
+tornado==6.5.2
+tqdm==4.67.1
+traitlets==5.14.3
+transformers==4.57.1
+treelib==1.8.0
+triton==3.3.1
+typing-inspect==0.9.0
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+tzdata==2025.2
+uri-template==1.3.0
+urllib3==2.5.0
+uvicorn==0.38.0
+watchdog==6.0.0
+wcwidth==0.2.14
+webcolors==24.11.1
+webencodings==0.5.1
+websocket-client==1.9.0
+Werkzeug==3.1.3
+xxhash==3.6.0
+yarl==1.22.0
+zipp==3.23.0