Davy592 committed on
Commit
df25ba9
·
1 Parent(s): 992c3c4

Updated files to match new versions

Browse files
nygaardcodecommentclassification/api/API.py CHANGED
@@ -15,11 +15,12 @@ from contextlib import asynccontextmanager
15
  from datetime import datetime
16
  from functools import wraps
17
  from http import HTTPStatus
 
18
  from typing import Any, Callable, Dict
19
 
20
- from fastapi import FastAPI, Request
21
- from fastapi.middleware.cors import CORSMiddleware
22
  from fastapi.responses import RedirectResponse
 
23
 
24
  from nygaardcodecommentclassification import config
25
  from nygaardcodecommentclassification.api.controllers import PredictionController
@@ -35,9 +36,11 @@ controller = PredictionController()
35
  # This prevents blocking the async event loop during model predictions
36
  _executor = ThreadPoolExecutor(max_workers=4)
37
 
 
 
38
 
39
  @asynccontextmanager
40
- async def lifespan(app: FastAPI):
41
  """Async context manager for application lifecycle events.
42
 
43
  This handles:
@@ -48,7 +51,7 @@ async def lifespan(app: FastAPI):
48
  app: The FastAPI application instance
49
 
50
  Yields:
51
- Control back to the application after startup is complete
52
  """
53
  # Startup: load models into memory
54
  controller.startup()
@@ -199,7 +202,9 @@ async def _get_languages(request: Request) -> Dict[str, Any]:
199
 
200
  @app.post("/predict", tags=["Prediction"])
201
  @construct_response
202
- async def _predict(request: Request, payload: PredictionRequest) -> Dict[str, Any]:
 
 
203
  """Classify code comments using multi-label classification.
204
 
205
  This endpoint performs ML inference to classify code comments into
@@ -207,6 +212,7 @@ async def _predict(request: Request, payload: PredictionRequest) -> Dict[str, An
207
 
208
  Args:
209
  request: The FastAPI request object
 
210
  payload: PredictionRequest containing:
211
  - texts: List of code comments
212
  - class_names: List of class names corresponding to each comment
@@ -246,6 +252,13 @@ async def _predict(request: Request, payload: PredictionRequest) -> Dict[str, An
246
  payload.model_type,
247
  )
248
 
 
 
 
 
 
 
 
249
  return {
250
  "status-code": HTTPStatus.OK,
251
  "message": "Prediction successful",
 
15
  from datetime import datetime
16
  from functools import wraps
17
  from http import HTTPStatus
18
+ import json
19
  from typing import Any, Callable, Dict
20
 
21
+ from fastapi import FastAPI, Request, Response
 
22
  from fastapi.responses import RedirectResponse
23
+ from fastapi.middleware.cors import CORSMiddleware
24
 
25
  from nygaardcodecommentclassification import config
26
  from nygaardcodecommentclassification.api.controllers import PredictionController
 
36
  # This prevents blocking the async event loop during model predictions
37
  _executor = ThreadPoolExecutor(max_workers=4)
38
 
39
+ # ---------------------------------------------------------------------------
40
+
41
 
42
  @asynccontextmanager
43
+ async def lifespan(app: FastAPI) -> Any:
44
  """Async context manager for application lifecycle events.
45
 
46
  This handles:
 
51
  app: The FastAPI application instance
52
 
53
  Yields:
54
+ None: Control back to the application after startup is complete
55
  """
56
  # Startup: load models into memory
57
  controller.startup()
 
202
 
203
  @app.post("/predict", tags=["Prediction"])
204
  @construct_response
205
+ async def _predict(
206
+ request: Request, response: Response, payload: PredictionRequest
207
+ ) -> Dict[str, Any]:
208
  """Classify code comments using multi-label classification.
209
 
210
  This endpoint performs ML inference to classify code comments into
 
212
 
213
  Args:
214
  request: The FastAPI request object
215
+ response: The FastAPI response object
216
  payload: PredictionRequest containing:
217
  - texts: List of code comments
218
  - class_names: List of class names corresponding to each comment
 
252
  payload.model_type,
253
  )
254
 
255
+ response.headers["X-model"] = payload.model_type
256
+ response.headers["X-language"] = payload.language
257
+
258
+ # Collect all predicted labels
259
+ all_labels = [label for result in results for label in result["labels"]]
260
+ response.headers["X-predicted-labels"] = json.dumps(all_labels)
261
+
262
  return {
263
  "status-code": HTTPStatus.OK,
264
  "message": "Prediction successful",
nygaardcodecommentclassification/api/controllers.py CHANGED
@@ -147,13 +147,11 @@ class PredictionController:
147
  f"Model '{model_type}' unavailable for {language}. Available: {available_types}"
148
  )
149
 
150
- # --- Combine texts with class names ---
151
- # Format: "comment | class_name" for the model
152
  combined_texts = [f"{text} | {class_name}" for text, class_name in zip(texts, class_names)]
153
 
154
  # --- Model Inference ---
155
  try:
156
- y_pred = self.predictor.predict(combined_texts, language, model_type)
157
  except Exception as e:
158
  logger.error("Prediction failed for %s/%s: %s", language, model_type, e)
159
  raise RuntimeError(f"Internal model error: {e}") from e
@@ -167,7 +165,7 @@ class PredictionController:
167
 
168
  # Convert numeric predictions to human-readable labels
169
  results: List[Dict[str, Any]] = []
170
- for i, (text_input, class_name) in enumerate(zip(texts, class_names)):
171
  row_pred = y_pred[i] # Binary array (1 = label present, 0 = absent)
172
 
173
  # Find indices where prediction is 1 (positive class)
@@ -176,8 +174,6 @@ class PredictionController:
176
  # Map indices to label strings
177
  predicted_labels = [labels_map[idx] for idx in predicted_indices]
178
 
179
- results.append(
180
- {"text": text_input, "class_name": class_name, "labels": predicted_labels}
181
- )
182
 
183
  return results
 
147
  f"Model '{model_type}' unavailable for {language}. Available: {available_types}"
148
  )
149
 
 
 
150
  combined_texts = [f"{text} | {class_name}" for text, class_name in zip(texts, class_names)]
151
 
152
  # --- Model Inference ---
153
  try:
154
+ y_pred, embeddings = self.predictor.predict(combined_texts, language, model_type)
155
  except Exception as e:
156
  logger.error("Prediction failed for %s/%s: %s", language, model_type, e)
157
  raise RuntimeError(f"Internal model error: {e}") from e
 
165
 
166
  # Convert numeric predictions to human-readable labels
167
  results: List[Dict[str, Any]] = []
168
+ for i, text_input in enumerate(texts):
169
  row_pred = y_pred[i] # Binary array (1 = label present, 0 = absent)
170
 
171
  # Find indices where prediction is 1 (positive class)
 
174
  # Map indices to label strings
175
  predicted_labels = [labels_map[idx] for idx in predicted_indices]
176
 
177
+ results.append({"text": text_input, "labels": predicted_labels})
 
 
178
 
179
  return results
nygaardcodecommentclassification/api/models.py CHANGED
@@ -1,7 +1,9 @@
1
  """Model Layer - ML model management and inference.
 
2
  This module handles the low-level ML operations including:
3
  - Model loading and storage via ModelRegistry
4
  - Inference execution via ModelPredictor
 
5
  Architecture:
6
  - ModelRegistry: Central storage for loaded models with lazy loading
7
  - ModelPredictor: Executes inference using registered models
@@ -11,7 +13,7 @@ import logging
11
  import os
12
  from pathlib import Path
13
  import sys
14
- from typing import Any, Dict, List, Optional
15
 
16
  import dagshub
17
  import mlflow
@@ -20,16 +22,6 @@ import torch
20
 
21
  from nygaardcodecommentclassification import config
22
 
23
- # Patch torch.load to use CPU mapping by default if CUDA is not available
24
- # This prevents "Attempting to deserialize object on a CUDA device" errors
25
- _original_torch_load = torch.load
26
- def _patched_torch_load(f, map_location=None, *args, **kwargs):
27
- """Wrapper around torch.load that uses CPU mapping if CUDA unavailable."""
28
- if map_location is None and not torch.cuda.is_available():
29
- map_location = torch.device('cpu')
30
- return _original_torch_load(f, map_location=map_location, *args, **kwargs)
31
- torch.load = _patched_torch_load
32
-
33
  # Configure module logger with explicit handler to ensure visibility
34
  logger = logging.getLogger("nygaard.models")
35
  logger.setLevel(logging.DEBUG)
@@ -44,16 +36,20 @@ if not logger.handlers:
44
 
45
  class ModelRegistry:
46
  """Central registry for ML models loaded in memory.
 
47
  This class manages the lifecycle of ML models, providing:
48
  - Automatic discovery and loading of models from the filesystem
49
  - Organized storage by language and model type
50
  - Memory management with explicit cleanup
 
51
  Attributes:
52
  _registry: Internal dictionary storing loaded models
 
53
  Example:
54
  ```python
55
  registry = ModelRegistry()
56
  registry.load_all_models(Path("./models"))
 
57
  # Access a loaded model
58
  model_entry = registry.get_model("python", "catboost")
59
  if model_entry:
@@ -68,11 +64,14 @@ class ModelRegistry:
68
 
69
  def load_all_models(self) -> None:
70
  """Load all ML models from MLflow tracking server.
 
71
  This method connects to the MLflow tracking server (DagsHub) and loads
72
  CatBoost classifiers and sentence transformer embedders for all
73
  configured languages.
 
74
  Environment Variables:
75
  DAGSHUB_USER_TOKEN: Authentication token for DagsHub/MLflow
 
76
  Note:
77
  - Continues loading other models if one fails
78
  - Logs all loading activity for debugging
@@ -102,66 +101,104 @@ class ModelRegistry:
102
 
103
  def _load_catboost_models(self, lang: str) -> None:
104
  """Load CatBoost models for a specific language from MLflow.
 
105
  Downloads and loads the CatBoost classifier and sentence transformer
106
  embedder directly from MLflow tracking server.
 
107
  Args:
108
  lang: The programming language code (e.g., "python", "java")
109
  """
110
  # Find the CatBoost run
111
  catboost_runs = mlflow.search_runs(
112
  experiment_names=["evaluating"], filter_string="tags.model = 'catboost'"
113
- )
114
  if catboost_runs.empty:
115
  logger.error("No CatBoost run found in 'evaluating' experiment")
116
  return
117
  catboost_run = catboost_runs.iloc[0]
118
  catboost_run_id = catboost_run.run_id
119
  catboost_run_name = catboost_run.get("tags.mlflow.runName", "unknown")
 
 
120
  logger.info(
121
- "Found CatBoost run: '%s' (started: %s)",
122
  catboost_run_name,
123
- catboost_run.get("start_time", "unknown"),
 
124
  )
125
 
126
- # Find the embedder run
127
- embedder_runs = mlflow.search_runs(
128
- experiment_names=["evaluating"],
129
- filter_string="run_name = 'sentence_transformer_paraphrase-MiniLM-L6-v2'",
130
- )
131
- if embedder_runs.empty:
132
- logger.error(
133
- "No embedder run found for 'sentence_transformer_paraphrase-MiniLM-L6-v2'"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  )
135
- return
136
- embedder_run = embedder_runs.iloc[0]
137
- embedder_run_id = embedder_run.run_id
138
- embedder_run_name = embedder_run.get("tags.mlflow.runName", "unknown")
139
- logger.info(
140
- "Found Embedder run: '%s' (started: %s)",
141
- embedder_run_name,
142
- embedder_run.get("start_time", "unknown"),
143
- )
144
 
145
  try:
146
  # Load the CatBoost model from MLflow
147
  model_uri = f"runs:/{catboost_run_id}/model_{lang}"
148
  logger.info(
149
- "[%s] Loading CatBoost classifier from run '%s'...",
150
  lang.upper(),
151
  catboost_run_name,
 
152
  )
153
  model = mlflow.sklearn.load_model(model_uri)
154
 
155
  # Load the sentence transformer embedder from MLflow
156
  embedder_uri = f"runs:/{embedder_run_id}/model_{lang}"
157
  logger.info(
158
- "[%s] Loading sentence transformer from run '%s'...",
159
  lang.upper(),
160
  embedder_run_name,
 
161
  )
162
  embedder = mlflow.sklearn.load_model(embedder_uri)
163
- if hasattr(embedder, "to"):
164
- embedder.to("cpu")
165
 
166
  # Register the model with its metadata
167
  self._registry[lang]["catboost"] = {
@@ -180,9 +217,11 @@ class ModelRegistry:
180
 
181
  def get_model(self, language: str, model_type: str) -> Optional[Dict[str, Any]]:
182
  """Retrieve a loaded model entry by language and type.
 
183
  Args:
184
  language: The programming language code
185
  model_type: The type of model
 
186
  Returns:
187
  Dict containing the model and metadata, or None if not found.
188
  The dict contains:
@@ -194,6 +233,7 @@ class ModelRegistry:
194
 
195
  def clear(self) -> None:
196
  """Clear all models from the registry and free memory.
 
197
  This method should be called during application shutdown to
198
  release GPU memory and other resources.
199
  """
@@ -207,12 +247,15 @@ class ModelRegistry:
207
 
208
  class ModelPredictor:
209
  """Handles low-level prediction logic.
 
210
  Attributes:
211
  registry: Reference to the ModelRegistry for model access
 
212
  Example:
213
  ```python
214
  registry = ModelRegistry()
215
  registry.load_all_models(Path("./models"))
 
216
  predictor = ModelPredictor(registry)
217
  predictions = predictor.predict(
218
  texts=["# Calculate sum of list"],
@@ -225,25 +268,33 @@ class ModelPredictor:
225
 
226
  def __init__(self, model_registry: ModelRegistry) -> None:
227
  """Initialize the predictor with a model registry.
 
228
  Args:
229
  model_registry: The ModelRegistry instance containing loaded models
230
  """
231
  self.registry = model_registry
232
 
233
- def predict(self, texts: List[str], language: str, model_type: str) -> np.ndarray:
 
 
234
  """Execute prediction on a list of texts.
 
235
  This method handles the full inference pipeline:
236
  1. Retrieve the appropriate model from the registry
237
  2. Extract features (e.g., generate embeddings)
238
  3. Run model inference
239
  4. Return raw predictions
 
240
  Args:
241
  texts: List of code comment strings to classify
242
  language: Programming language context for model selection
243
  model_type: Type of model to use
 
244
  Returns:
245
- numpy array of predictions with shape (n_samples, n_labels).
246
- For multi-label classification, each row is a binary array.
 
 
247
  Raises:
248
  ValueError: If the requested model is not available or
249
  if an unsupported feature/model type is specified
@@ -267,7 +318,7 @@ class ModelPredictor:
267
  embeddings = embedder.encode(texts, show_progress_bar=False)
268
 
269
  # Run CatBoost prediction on embeddings
270
- return model.predict(embeddings)
271
 
272
  raise ValueError("Unsupported feature type for CatBoost")
273
 
 
1
  """Model Layer - ML model management and inference.
2
+
3
  This module handles the low-level ML operations including:
4
  - Model loading and storage via ModelRegistry
5
  - Inference execution via ModelPredictor
6
+
7
  Architecture:
8
  - ModelRegistry: Central storage for loaded models with lazy loading
9
  - ModelPredictor: Executes inference using registered models
 
13
  import os
14
  from pathlib import Path
15
  import sys
16
+ from typing import Any, Dict, List, Optional, Tuple
17
 
18
  import dagshub
19
  import mlflow
 
22
 
23
  from nygaardcodecommentclassification import config
24
 
 
 
 
 
 
 
 
 
 
 
25
  # Configure module logger with explicit handler to ensure visibility
26
  logger = logging.getLogger("nygaard.models")
27
  logger.setLevel(logging.DEBUG)
 
36
 
37
  class ModelRegistry:
38
  """Central registry for ML models loaded in memory.
39
+
40
  This class manages the lifecycle of ML models, providing:
41
  - Automatic discovery and loading of models from the filesystem
42
  - Organized storage by language and model type
43
  - Memory management with explicit cleanup
44
+
45
  Attributes:
46
  _registry: Internal dictionary storing loaded models
47
+
48
  Example:
49
  ```python
50
  registry = ModelRegistry()
51
  registry.load_all_models(Path("./models"))
52
+
53
  # Access a loaded model
54
  model_entry = registry.get_model("python", "catboost")
55
  if model_entry:
 
64
 
65
  def load_all_models(self) -> None:
66
  """Load all ML models from MLflow tracking server.
67
+
68
  This method connects to the MLflow tracking server (DagsHub) and loads
69
  CatBoost classifiers and sentence transformer embedders for all
70
  configured languages.
71
+
72
  Environment Variables:
73
  DAGSHUB_USER_TOKEN: Authentication token for DagsHub/MLflow
74
+
75
  Note:
76
  - Continues loading other models if one fails
77
  - Logs all loading activity for debugging
 
101
 
102
  def _load_catboost_models(self, lang: str) -> None:
103
  """Load CatBoost models for a specific language from MLflow.
104
+
105
  Downloads and loads the CatBoost classifier and sentence transformer
106
  embedder directly from MLflow tracking server.
107
+
108
  Args:
109
  lang: The programming language code (e.g., "python", "java")
110
  """
111
  # Find the CatBoost run
112
  catboost_runs = mlflow.search_runs(
113
  experiment_names=["evaluating"], filter_string="tags.model = 'catboost'"
114
+ ).sort_values(by="metrics.final_score", ascending=False)
115
  if catboost_runs.empty:
116
  logger.error("No CatBoost run found in 'evaluating' experiment")
117
  return
118
  catboost_run = catboost_runs.iloc[0]
119
  catboost_run_id = catboost_run.run_id
120
  catboost_run_name = catboost_run.get("tags.mlflow.runName", "unknown")
121
+ catboost_git_commit = catboost_run.get("tags.mlflow.source.git.commit")
122
+
123
  logger.info(
124
+ "Found CatBoost run: '%s' (ID: %s, commit: %s)",
125
  catboost_run_name,
126
+ catboost_run_id,
127
+ catboost_git_commit,
128
  )
129
 
130
+ # Find the embedder run with same git commit and source file
131
+ embedder_run = None
132
+ embedder_run_id = None
133
+ embedder_run_name = None
134
+
135
+ if catboost_git_commit:
136
+ # Search for sentence transformer with same git commit
137
+ logger.info(
138
+ "[%s] Searching for embedder with git commit: %s",
139
+ lang.upper(),
140
+ catboost_git_commit,
141
+ )
142
+ embedder_runs = mlflow.search_runs(
143
+ experiment_names=["evaluating"],
144
+ filter_string=f"tags.`mlflow.source.git.commit` = '{catboost_git_commit}' and run_name LIKE 'sentence_transformer%'",
145
+ )
146
+
147
+ if not embedder_runs.empty:
148
+ embedder_run = embedder_runs.iloc[0]
149
+ embedder_run_id = embedder_run.run_id
150
+ embedder_run_name = embedder_run.get("tags.mlflow.runName", "unknown")
151
+ logger.info(
152
+ "[%s] Found embedder with matching git commit: '%s' (ID: %s)",
153
+ lang.upper(),
154
+ embedder_run_name,
155
+ embedder_run_id,
156
+ )
157
+
158
+ # Fallback: search by default name if git commit search failed
159
+ if not embedder_run_id:
160
+ logger.info(
161
+ "[%s] Falling back to default embedder search",
162
+ lang.upper(),
163
+ )
164
+ embedder_runs = mlflow.search_runs(
165
+ experiment_names=["evaluating"],
166
+ filter_string="run_name = 'sentence_transformer_paraphrase-MiniLM-L6-v2'",
167
+ )
168
+ if embedder_runs.empty:
169
+ logger.error(
170
+ "No embedder run found for 'sentence_transformer_paraphrase-MiniLM-L6-v2'"
171
+ )
172
+ return
173
+ embedder_run = embedder_runs.iloc[0]
174
+ embedder_run_id = embedder_run.run_id
175
+ embedder_run_name = embedder_run.get("tags.mlflow.runName", "unknown")
176
+ logger.info(
177
+ "Found Embedder run: '%s' (ID: %s)",
178
+ embedder_run_name,
179
+ embedder_run_id,
180
  )
 
 
 
 
 
 
 
 
 
181
 
182
  try:
183
  # Load the CatBoost model from MLflow
184
  model_uri = f"runs:/{catboost_run_id}/model_{lang}"
185
  logger.info(
186
+ "[%s] Loading CatBoost classifier from run '%s' (ID: %s)...",
187
  lang.upper(),
188
  catboost_run_name,
189
+ catboost_run_id,
190
  )
191
  model = mlflow.sklearn.load_model(model_uri)
192
 
193
  # Load the sentence transformer embedder from MLflow
194
  embedder_uri = f"runs:/{embedder_run_id}/model_{lang}"
195
  logger.info(
196
+ "[%s] Loading sentence transformer from run '%s' (ID: %s)...",
197
  lang.upper(),
198
  embedder_run_name,
199
+ embedder_run_id,
200
  )
201
  embedder = mlflow.sklearn.load_model(embedder_uri)
 
 
202
 
203
  # Register the model with its metadata
204
  self._registry[lang]["catboost"] = {
 
217
 
218
  def get_model(self, language: str, model_type: str) -> Optional[Dict[str, Any]]:
219
  """Retrieve a loaded model entry by language and type.
220
+
221
  Args:
222
  language: The programming language code
223
  model_type: The type of model
224
+
225
  Returns:
226
  Dict containing the model and metadata, or None if not found.
227
  The dict contains:
 
233
 
234
  def clear(self) -> None:
235
  """Clear all models from the registry and free memory.
236
+
237
  This method should be called during application shutdown to
238
  release GPU memory and other resources.
239
  """
 
247
 
248
  class ModelPredictor:
249
  """Handles low-level prediction logic.
250
+
251
  Attributes:
252
  registry: Reference to the ModelRegistry for model access
253
+
254
  Example:
255
  ```python
256
  registry = ModelRegistry()
257
  registry.load_all_models(Path("./models"))
258
+
259
  predictor = ModelPredictor(registry)
260
  predictions = predictor.predict(
261
  texts=["# Calculate sum of list"],
 
268
 
269
  def __init__(self, model_registry: ModelRegistry) -> None:
270
  """Initialize the predictor with a model registry.
271
+
272
  Args:
273
  model_registry: The ModelRegistry instance containing loaded models
274
  """
275
  self.registry = model_registry
276
 
277
+ def predict(
278
+ self, texts: List[str], language: str, model_type: str
279
+ ) -> Tuple[np.ndarray, Optional[np.ndarray]]:
280
  """Execute prediction on a list of texts.
281
+
282
  This method handles the full inference pipeline:
283
  1. Retrieve the appropriate model from the registry
284
  2. Extract features (e.g., generate embeddings)
285
  3. Run model inference
286
  4. Return raw predictions
287
+
288
  Args:
289
  texts: List of code comment strings to classify
290
  language: Programming language context for model selection
291
  model_type: Type of model to use
292
+
293
  Returns:
294
+ Tuple containing:
295
+ - numpy array of predictions with shape (n_samples, n_labels).
296
+ - numpy array of embeddings (if available, else None).
297
+
298
  Raises:
299
  ValueError: If the requested model is not available or
300
  if an unsupported feature/model type is specified
 
318
  embeddings = embedder.encode(texts, show_progress_bar=False)
319
 
320
  # Run CatBoost prediction on embeddings
321
+ return model.predict(embeddings), embeddings
322
 
323
  raise ValueError("Unsupported feature type for CatBoost")
324