"""Feedback ingestion utilities for drift analysis.""" from pathlib import Path from typing import List, Tuple from loguru import logger import numpy as np import pandas as pd from turing import config def load_feedback_for_language( feedback_path: Path, language: str, ) -> Tuple[List[str], np.ndarray]: """ Load user feedback for a given language and return texts with one-hot labels. Rows with unknown labels are skipped. Returns empty lists if no valid rows. """ if not feedback_path.exists(): raise FileNotFoundError(f"Feedback file not found: {feedback_path}") df = pd.read_csv(feedback_path) if ( "Language" not in df.columns or "Input_Text" not in df.columns or "User_Correction" not in df.columns ): raise ValueError( "Feedback file must contain Language, Input_Text, and User_Correction columns" ) df_lang = df[df["Language"].str.lower() == language.lower()] if df_lang.empty: logger.warning(f"No feedback rows found for language {language}") return [], np.array([]) label_space = config.LABELS_MAP.get(language) if not label_space: raise ValueError(f"Label map not found for language: {language}") label_to_idx = {label.lower(): idx for idx, label in enumerate(label_space)} texts: List[str] = [] labels: List[np.ndarray] = [] for _, row in df_lang.iterrows(): correction = str(row["User_Correction"]).strip().lower() idx = label_to_idx.get(correction) if idx is None: logger.warning(f"Skipping feedback row with unknown label: {row['User_Correction']}") continue one_hot = np.zeros(len(label_space), dtype=int) one_hot[idx] = 1 texts.append(str(row["Input_Text"])) labels.append(one_hot) if not texts: logger.warning(f"No valid feedback rows for language {language}") return [], np.array([]) return texts, np.vstack(labels)