""" Screen a loaded article table against a fine-tuned model. Run via exec_script() from the notebook. Reads the following keys from globals(): selected_model — a loaded ModelWrapper recall_target — a Dropdown whose .value is an integer (95, 90, ...) articles_table — a PredictionTable _screen_progress — an IntProgress widget (or None) _screen_progress_label — an HTML widget for batch labels (or None) Writes to the namespace: data — copy of articles_table.df with y_prob and y_pred columns added review_data — rows where y_prob >= threshold, without y_prob/y_pred columns """ import numpy as np selected_model = globals().get("selected_model") recall_target_value = globals().get('recall_target').value articles_table = globals().get("articles_table") _progress = globals().get("_screen_progress") _progress_label = globals().get("_screen_progress_label") nlp = selected_model.model data = articles_table.df.dropna(subset=[nlp.input_col]) x_test = data[nlp.input_col].astype(str).to_list() x_test = [ nlp.tokenizer.decode( nlp.tokenizer.encode(text, max_length=nlp.max_length, truncation=True), skip_special_tokens=True ) for text in x_test ] threshold = selected_model.thresholds[recall_target_value] def _on_batch(done, total): if _progress is not None: _progress.max = total _progress.value = done if _progress_label is not None: _progress_label.value = ( f"" f"{done} / {total}" ) y_prob = nlp.predict(x_test, on_batch=_on_batch) data = data.copy() data['y_prob'] = y_prob data['y_pred'] = (np.array(y_prob) >= threshold).astype(int) review_data = data.loc[np.array(y_prob) >= threshold].copy() review_data = review_data.drop(columns=['y_prob', 'y_pred'], errors='ignore')