KangjunNoh commited on
Commit
906e061
·
verified ·
1 Parent(s): d238913

Upload 47 files

Browse files
Files changed (47) hide show
  1. MACI-main/LICENSE +21 -0
  2. MACI-main/README.md +46 -0
  3. MACI-main/conditional-conformal/conditionalconformal/__init__.py +1 -0
  4. MACI-main/conditional-conformal/conditionalconformal/condconf.py +877 -0
  5. MACI-main/conditional-conformal/conditionalconformal/experiment_utils.py +182 -0
  6. MACI-main/conditional-conformal/conditionalconformal/synthetic_data.py +55 -0
  7. MACI-main/conditional-conformal/src/atomizer.py +347 -0
  8. MACI-main/conditional-conformal/src/aws_utils.py +15 -0
  9. MACI-main/conditional-conformal/src/client.py +89 -0
  10. MACI-main/conditional-conformal/src/config.py +7 -0
  11. MACI-main/conditional-conformal/src/conformal.py +68 -0
  12. MACI-main/conditional-conformal/src/data_utils/sample_names.py +86 -0
  13. MACI-main/conditional-conformal/src/dataset.py +279 -0
  14. MACI-main/conditional-conformal/src/featurizer.py +352 -0
  15. MACI-main/conditional-conformal/src/gpt.py +58 -0
  16. MACI-main/conditional-conformal/src/llm_utils.py +111 -0
  17. MACI-main/conditional-conformal/src/postprocess_factscore.py +34 -0
  18. MACI-main/conditional-conformal/src/prob_model.py +101 -0
  19. MACI-main/conditional-conformal/src/query.py +112 -0
  20. MACI-main/conditional-conformal/src/ray_data.py +192 -0
  21. MACI-main/conditional-conformal/src/retrieval.py +268 -0
  22. MACI-main/conditional-conformal/src/retrieve_data.py +86 -0
  23. MACI-main/conditional-conformal/src/run.py +119 -0
  24. MACI-main/conditional-conformal/src/scorer.py +202 -0
  25. MACI-main/conformal/__pycache__/adaptive_conformal.cpython-39.pyc +0 -0
  26. MACI-main/conformal/__pycache__/basic_conformal.cpython-39.pyc +0 -0
  27. MACI-main/conformal/__pycache__/conditional_conformal.cpython-39.pyc +0 -0
  28. MACI-main/conformal/adaptive_conformal.py +403 -0
  29. MACI-main/conformal/basic_conformal.py +189 -0
  30. MACI-main/conformal/conditional_conformal.py +489 -0
  31. MACI-main/data/med_scores/medlfqa_frequencies.npz +3 -0
  32. MACI-main/data/med_scores/medlfqa_logprobs.npz +3 -0
  33. MACI-main/data/med_scores/medlfqa_scores_deepseek_deepseek-chat-v3-0324.npz +3 -0
  34. MACI-main/data/med_scores/medlfqa_scores_meta-llama_llama-3.3-70b-instruct.npz +3 -0
  35. MACI-main/data/med_scores/medlfqa_scores_qwen_qwen-2.5-72b-instruct.npz +3 -0
  36. MACI-main/data/med_scores/medlfqa_selfevals.npz +3 -0
  37. MACI-main/data/wiki_scores/wikibio_final.csv +0 -0
  38. MACI-main/data/wiki_scores/wikibio_final_dataset.pkl +3 -0
  39. MACI-main/data/wiki_scores/wikibio_final_frequencies.npz +3 -0
  40. MACI-main/data/wiki_scores/wikibio_final_logprobs.npz +3 -0
  41. MACI-main/data/wiki_scores/wikibio_final_self_evals.npz +3 -0
  42. MACI-main/data/wiki_scores/wikibio_scores_deepseek-chat-v3-0324.npz +3 -0
  43. MACI-main/data/wiki_scores/wikibio_scores_meta-llama_llama-3.3-70b-instruct.npz +3 -0
  44. MACI-main/data/wiki_scores/wikibio_scores_qwen_qwen-2.5-72b-instruct.npz +3 -0
  45. MACI-main/experiments/conditional_groupers.py +542 -0
  46. MACI-main/experiments/run_experiment.py +1127 -0
  47. MACI-main/requirements.txt +12 -0
MACI-main/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Anonymous2026conf
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
MACI-main/README.md ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MACI
2
+ This repository contains an anonymized version of our Multi-LLM Adaptive Conformal Inference experiments. The entry point is `experiments/run_experiment.py`.
3
+
4
+ ## Abstract
5
+
6
+ Ensuring factuality is essential for the safe use of Large Language Models (LLMs) in high-stakes domains such as medicine and law. Conformal inference provides distribution-free guarantees, but existing approaches are either overly conservative, discarding many true claims, or rely on adaptive error rates and simple linear models that fail to capture complex group structures. To address these challenges, we reformulate conformal inference in a multiplicative filtering setting, modeling factuality as a product of claim-level scores. Our method, Multi-LLM Adaptive Conformal Inference (MACI), leverages ensembles to produce more accurate factuality scores, which in our experiments led to higher retention, while validity is preserved through group-conditional calibration. Experiments show that MACI consistently achieves user-specified coverage with substantially higher retention and lower time cost than baselines.
7
+
8
+ ## Running
9
+
10
+ Step 1) Create a fresh Conda environment (Python 3.9)
11
+
12
+ ```bash
13
+ conda create -y -n maci python=3.9
14
+ ```
15
+
16
+ Step 2) Install dependencies from requirements.txt
17
+
18
+ ```bash
19
+ conda run -n maci \
20
+ python -m pip install -r requirements.txt --no-input
21
+ ```
22
+
23
+ Step 3) Prepare data layout (repo-relative defaults)
24
+
25
+ - Place data under `data/` in the repository root (or pass `--data-dir`).
26
+ - For MedLFQA: put files under `data/med_scores/`.
27
+ - For WikiBio: put files under `data/wiki_scores/`.
28
+
29
+ Step 4) Run a quick experiment (MedLFQA example)
30
+
31
+ ```bash
32
+ conda run -n maci \
33
+ python experiments/run_experiment.py \
34
+ --dataset-type medlfqa \
35
+ --conditional-groups false_claim_risk
36
+ ```
37
+
38
+ Step 5) Where outputs go
39
+
40
+ - Logs: `logs/` (repo-root-relative by default)
41
+ - Results JSON: `analysis/experiment_results/`
42
+
43
+
44
+
45
+ ## CCI Attribution
46
+ Our implementation of the Conditional Conformal Inference (CCI) baseline is adopted directly from the [conformal-safety](https://github.com/jjcherian/conformal-safety.git) repository. To ensure full reproducibility, we have included a local copy of the necessary modules in the `conditional-conformal/` directory. We explicitly state that the code within this directory is not the work of the MACI project. For all details, please refer to the original repository: [conformal-safety](https://github.com/jjcherian/conformal-safety.git)
MACI-main/conditional-conformal/conditionalconformal/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .condconf import CondConf
MACI-main/conditional-conformal/conditionalconformal/condconf.py ADDED
@@ -0,0 +1,877 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cvxpy as cp
2
+ import numpy as np
3
+
4
+ from functools import partial, lru_cache
5
+ from scipy.optimize import linprog
6
+ from sklearn.metrics.pairwise import pairwise_kernels
7
+ from typing import Callable
8
+
9
+ FUNCTION_DEFAULTS = {"kernel": None, "gamma" : 1, "lambda": 1}
10
+
11
+ class CondConf:
12
+ def __init__(
13
+ self,
14
+ score_fn : Callable,
15
+ Phi_fn : Callable,
16
+ quantile_fn : Callable = None,
17
+ infinite_params : dict = {},
18
+ seed : int = 0
19
+ ):
20
+ """
21
+ Constructs the CondConf object that caches relevant information for
22
+ generating conditionally valid prediction sets.
23
+
24
+ We define the score function and set of conditional guarantees
25
+ that we care about in this function.
26
+
27
+ Parameters
28
+ ---------
29
+ score_fn : Callable[np.ndarray, np.ndarray] -> np.ndarray
30
+ Fixed (vectorized) conformity score function that takes in
31
+ X and Y as inputs and returns S as output
32
+
33
+ Phi_fn : Callable[np.ndarray] -> np.ndarray
34
+ Function that defines finite basis set that we provide
35
+ exact conditional guarantees over
36
+
37
+ infinite_params : dict = {}
38
+ Dictionary containing parameters for the RKHS component of the fit
39
+ Valid keys are ('kernel', 'gamma', 'lambda')
40
+ 'kernel' should be a valid kernel name for sklearn.metrics.pairwise_kernels
41
+ 'gamma' is a hyperparameter for certain kernels
42
+ 'lambda' is the regularization penalty applied to the RKHS component
43
+ """
44
+ self.score_fn = score_fn
45
+ self.Phi_fn = Phi_fn
46
+ self.quantile_fn = quantile_fn
47
+ self.infinite_params = infinite_params
48
+ self.rng = np.random.default_rng(seed=seed)
49
+
50
+ def setup_problem(
51
+ self,
52
+ x_calib : np.ndarray,
53
+ y_calib : np.ndarray
54
+ ):
55
+ """
56
+ setup_problem sets up the final fitting problem for a
57
+ particular calibration set
58
+
59
+ The resulting cvxpy Problem object is stored inside the CondConf parent.
60
+
61
+ Arguments
62
+ ---------
63
+ x_calib : np.ndarray
64
+ Covariate data for the calibration set
65
+
66
+ y_calib : np.ndarray
67
+ Labels for the calibration set
68
+ """
69
+ self.x_calib = x_calib
70
+ self.y_calib = y_calib
71
+ phi_calib = self.Phi_fn(x_calib)
72
+
73
+ _, s, Vt = np.linalg.svd(phi_calib, full_matrices=False)
74
+
75
+ # Set a tolerance to decide which singular values are nonzero
76
+ tol = 1e-10
77
+ r = np.sum(s > tol)
78
+
79
+ if r < len(s):
80
+ self.Phi_fn_orig = self.Phi_fn
81
+ T = Vt.T[:, :r]
82
+ self.Phi_fn = lambda x: (self.Phi_fn_orig(x) @ T)
83
+ phi_calib = self.Phi_fn(x_calib)
84
+
85
+ self.phi_calib = phi_calib
86
+ self.scores_calib = self.score_fn(x_calib, y_calib)
87
+
88
+ if self.quantile_fn is not None:
89
+ self.quantile_calib = self.quantile_fn(x_calib).reshape(-1,1)
90
+
91
+ self.cvx_problem = setup_cvx_problem(
92
+ self.x_calib,
93
+ self.scores_calib,
94
+ self.phi_calib,
95
+ self.infinite_params
96
+ )
97
+
98
+
99
+ @lru_cache()
100
+ def _get_calibration_solution(
101
+ self,
102
+ quantile : float
103
+ ):
104
+ S = self.scores_calib.reshape(-1,1)
105
+ Phi = self.phi_calib.astype(float)
106
+ zeros = np.zeros((Phi.shape[1],))
107
+
108
+ if quantile is None:
109
+ bounds = np.concatenate((self.quantile_calib - 1, self.quantile_calib), axis=1)
110
+ else:
111
+ bounds = np.asarray([quantile - 1, quantile])
112
+ bounds = np.tile(bounds.reshape(1,-1), (len(S), 1))
113
+
114
+ res = linprog(-1 * S, A_eq=Phi.T, b_eq=zeros, bounds=bounds, method='highs')
115
+ primal_vars = -1 * res.eqlin.marginals.reshape(-1,1)
116
+ dual_vars = res.x.reshape(-1,1)
117
+
118
+ residuals = S - (Phi @ primal_vars)
119
+ interpolated_pts = np.isclose(residuals, 0)
120
+
121
+ # if I didn't converge to a solution that interpolates at least Phi.shape[1] pts,
122
+ # I need to manually find one via a modified simplex iteration
123
+ if interpolated_pts.sum() < Phi.shape[1]:
124
+ num_to_add = Phi.shape[1] - interpolated_pts.sum()
125
+ for _ in range(num_to_add):
126
+ candidate_pts = interpolated_pts.copy().flatten()
127
+
128
+ # find candidate idx for interpolation, e.g., new covariate that is
129
+ # linearly independent of the previously interpolated points
130
+ Q, _ = np.linalg.qr(Phi[candidate_pts].T)
131
+ projections = Phi @ Q @ Q.T
132
+ norms = np.linalg.norm(Phi - projections, axis=1)
133
+ candidate_idx = np.where(norms > 1e-5)[0][0]
134
+ candidate_pts[candidate_idx] = True
135
+
136
+ # find direction to solution that would interpolate the new point
137
+ gamma, _, _, _ = np.linalg.lstsq(Phi[candidate_pts], S[candidate_pts], rcond=None)
138
+ direction = gamma.reshape(-1,1) - primal_vars
139
+ step_sizes = residuals / (Phi @ direction)
140
+
141
+ # check the non-basic indices for which a step in this direction could have led to interpolation
142
+ # e.g., those for which the step size is positive and the point is not already interpolated
143
+ positive_indices = np.where((step_sizes > 0) & ~interpolated_pts)[0]
144
+
145
+ # take smallest possible step that would lead to interpolation
146
+ primal_vars += np.min(step_sizes[positive_indices]) * direction
147
+
148
+ residuals = S - (Phi @ primal_vars)
149
+ interpolated_pts = np.isclose(residuals, 0)
150
+
151
+ return dual_vars, primal_vars
152
+
153
+ def _compute_exact_cutoff(
154
+ self,
155
+ quantiles,
156
+ primals,
157
+ duals,
158
+ phi_test,
159
+ dual_threshold
160
+ ):
161
+ def get_current_basis(primals, duals, Phi, S, quantiles):
162
+ interp_bools = np.logical_and(~np.isclose(duals, quantiles - 1), ~np.isclose(duals, quantiles))
163
+ if np.sum(interp_bools) == Phi.shape[1]:
164
+ return interp_bools
165
+ preds = (Phi @ primals).flatten()
166
+ active_indices = np.where(interp_bools)[0]
167
+ interp_indices = np.where(np.isclose(np.abs(S - preds), 0))[0]
168
+ diff_indices = np.setdiff1d(interp_indices, active_indices)
169
+ num_missing = Phi.shape[1] - np.sum(interp_bools)
170
+ if num_missing < len(diff_indices):
171
+ from itertools import combinations
172
+ for cand_indices in combinations(diff_indices, num_missing):
173
+ cand_phi = Phi[np.concatenate((active_indices, cand_indices))]
174
+ if np.isfinite(np.linalg.cond(cand_phi)):
175
+ interp_bools[np.asarray(cand_indices)] = True
176
+ break
177
+ else:
178
+ interp_bools[diff_indices] = True
179
+ if np.sum(interp_bools) != Phi.shape[1]:
180
+ raise ValueError("Initial basis could not be found - retry with exact=False.")
181
+ return interp_bools
182
+
183
+ if np.allclose(phi_test, 0):
184
+ return np.inf if quantiles[-1] >= 0.5 else -np.inf
185
+
186
+ basis = get_current_basis(primals, duals, self.phi_calib, self.scores_calib, quantiles[:-1])
187
+ S_test = phi_test @ primals
188
+
189
+ duals = np.concatenate((duals.flatten(), [0]))
190
+ basis = np.concatenate((basis.flatten(), [False]))
191
+ phi = np.concatenate((self.phi_calib, phi_test.reshape(1,-1)), axis=0)
192
+ S = np.concatenate((self.scores_calib.reshape(-1,1), S_test.reshape(-1,1)), axis=0)
193
+
194
+ candidate_idx = phi.shape[0] - 1
195
+ num_iters = 0
196
+ while True:
197
+ # get direction vector for dual variable step
198
+ direction = -1 * np.linalg.solve(phi[basis].T, phi[candidate_idx].reshape(-1,1)).flatten()
199
+
200
+ # only consider non-zero entries of the direction vector
201
+ active_indices = ~np.isclose(direction, 0)
202
+ active_direction = direction[active_indices]
203
+ active_basis = basis.copy()
204
+ active_basis[np.where(basis)[0][~active_indices]] = False
205
+
206
+ positive_step = True if duals[candidate_idx] <= 0 else False
207
+ if candidate_idx == phi.shape[0] - 1:
208
+ positive_step = True if dual_threshold >= 0 else False
209
+
210
+ if positive_step:
211
+ gap_to_bounds = np.maximum(
212
+ (quantiles[active_basis].flatten() - duals[active_basis]) / active_direction,
213
+ ((quantiles[active_basis].flatten() - 1) - duals[active_basis]) / active_direction
214
+ )
215
+ step_size = np.min(gap_to_bounds)
216
+ departing_idx = np.where(active_basis)[0][np.argmin(gap_to_bounds)]
217
+ else:
218
+ gap_to_bounds = np.minimum(
219
+ (quantiles[active_basis].flatten() - duals[active_basis]) / active_direction,
220
+ ((quantiles[active_basis].flatten() - 1) - duals[active_basis]) / active_direction
221
+ )
222
+ step_size = np.max(gap_to_bounds)
223
+ departing_idx = np.where(active_basis)[0][np.argmax(gap_to_bounds)]
224
+ step_size_clip = np.clip(
225
+ step_size,
226
+ a_max=quantiles[candidate_idx] - duals[candidate_idx],
227
+ a_min=(quantiles[candidate_idx] - 1) - duals[candidate_idx]
228
+ )
229
+
230
+ duals[basis] += step_size_clip * direction
231
+ duals[candidate_idx] += step_size_clip
232
+ # print("Current value of final dual", duals[-1], "target threshold", dual_threshold)
233
+
234
+ if dual_threshold > 0 and duals[-1] > dual_threshold:
235
+ break
236
+
237
+ if dual_threshold < 0 and duals[-1] < dual_threshold:
238
+ break
239
+
240
+ if step_size_clip == step_size:
241
+ basis[departing_idx] = False
242
+ basis[candidate_idx] = True
243
+
244
+ if np.isclose(duals[-1], dual_threshold):
245
+ break
246
+
247
+ # TODO: make this a SMW update and reuse in the direction vector calc...
248
+ reduced_A = np.linalg.solve(phi[basis].T, phi[~basis].T)
249
+ reduced_costs = (S[~basis].T - S[basis].T @ reduced_A).flatten()
250
+ bottom = reduced_A[-1]
251
+ bottom[np.isclose(bottom, 0)] = np.inf
252
+ req_change = reduced_costs / bottom
253
+ if dual_threshold >= 0:
254
+ ignore_entries = (np.isclose(bottom, 0) | np.asarray(req_change <= 1e-5))
255
+ else:
256
+ ignore_entries = (np.isclose(bottom, 0) | np.asarray(req_change >= -1e-5))
257
+ if np.sum(~ignore_entries) == 0:
258
+ S[-1] = np.inf if quantiles[-1] >= 0.5 else -np.inf
259
+ break
260
+ if dual_threshold >= 0:
261
+ candidate_idx = np.where(~basis)[0][np.where(~ignore_entries, req_change, np.inf).argmin()]
262
+ S[-1] += np.min(req_change[~ignore_entries])
263
+ else:
264
+ candidate_idx = np.where(~basis)[0][np.where(~ignore_entries, req_change, -np.inf).argmax()]
265
+ S[-1] += np.max(req_change[~ignore_entries])
266
+ num_iters += 1
267
+ if num_iters > 10000:
268
+ S[-1] = np.inf if dual_threshold > 0 else -1 * np.inf
269
+ return S[-1]
270
+
271
+ def predict(
272
+ self,
273
+ quantile : float,
274
+ x_test : np.ndarray,
275
+ score_inv_fn : Callable,
276
+ S_min : float = None,
277
+ S_max : float = None,
278
+ randomize : bool = False,
279
+ exact : bool = True,
280
+ threshold : float = None
281
+ ):
282
+ """
283
+ Returns the (conditionally valid) prediction set for a given
284
+ test point
285
+
286
+ Arguments
287
+ ---------
288
+ quantile : float
289
+ Nominal quantile level
290
+ x_test : np.ndarray
291
+ Single test point
292
+ score_inv_fn : Callable[float, np.ndarray] -> .
293
+ Function that takes in a score threshold S^* and test point x and
294
+ outputs all values of y such that S(x, y) <= S^*
295
+ S_min : float = None
296
+ Lower bound (if available) on the conformity scores
297
+ S_max : float = None
298
+ Upper bound (if available) on the conformity scores
299
+ randomize : bool = False
300
+ Randomize prediction set for exact coverage
301
+ exact : bool = True
302
+ Avoid binary search and compute threshold exactly
303
+
304
+ Returns
305
+ -------
306
+ prediction_set
307
+ """
308
+ if quantile is None:
309
+ quantile_test = self.quantile_fn(x_test).reshape(-1,1)
310
+ quantiles = np.concatenate((self.quantile_calib, quantile_test), axis=0)
311
+ else:
312
+ quantile_test = quantile
313
+ quantiles = np.ones((len(self.scores_calib) + 1,1)) * quantile
314
+ if threshold is None:
315
+ if randomize:
316
+ threshold = self.rng.uniform(low=quantile_test - 1, high=quantile_test)
317
+ else:
318
+ if quantile_test < 0.5:
319
+ threshold = quantile_test - 1
320
+ else:
321
+ threshold = quantile_test
322
+
323
+ if exact:
324
+ if self.infinite_params.get('kernel', FUNCTION_DEFAULTS['kernel']):
325
+ raise ValueError("Exact computation doesn't support RKHS quantile regression for now.")
326
+ if np.allclose(quantiles[0], quantiles):
327
+ naive_duals, naive_primals = self._get_calibration_solution(
328
+ quantiles.flatten()[0]
329
+ )
330
+ else:
331
+ naive_duals, naive_primals = self._get_calibration_solution(
332
+ None
333
+ )
334
+ score_cutoff = self._compute_exact_cutoff(
335
+ quantiles,
336
+ naive_primals,
337
+ naive_duals,
338
+ self.Phi_fn(x_test),
339
+ threshold
340
+ )
341
+ else:
342
+ _solve = partial(_solve_dual, gcc=self, x_test=x_test, quantiles=quantiles, threshold=threshold)
343
+
344
+ if S_min is None:
345
+ S_min = np.min(self.scores_calib)
346
+ if S_max is None:
347
+ S_max = np.max(self.scores_calib)
348
+ lower, upper = binary_search(_solve, S_min, S_max * 2)
349
+
350
+ if quantile < 0.5:
351
+ score_cutoff = self._get_threshold(lower, x_test, quantiles)
352
+ else:
353
+ score_cutoff = self._get_threshold(upper, x_test, quantiles)
354
+ return score_inv_fn(score_cutoff, x_test.reshape(-1,1))
355
+
356
+ def estimate_coverage(
357
+ self,
358
+ quantile : float,
359
+ weights : np.ndarray,
360
+ x : np.ndarray = None
361
+ ):
362
+ """
363
+ estimate_coverage estimates the true percentile of the issued estimate of the
364
+ conditional quantile under the covariate shift induced by 'weights'
365
+
366
+ If we are ostensibly estimating the 0.95-quantile using an RKHS fit, we may
367
+ determine using our theory that the true percentile of this estimate is only 0.93
368
+
369
+ Arguments
370
+ ---------
371
+ quantile : float
372
+ Nominal quantile level
373
+ weights : np.ndarray
374
+ RKHS weights for tilt under which the coverage is estimated
375
+ x : np.ndarray = None
376
+ Points for which the RKHS weights are defined. If None, we assume
377
+ that weights corresponds to x_calib
378
+
379
+ Returns
380
+ -------
381
+ estimated_alpha : float
382
+ Our estimate for the realized quantile level
383
+ """
384
+ weights = weights.reshape(-1,1)
385
+ prob = setup_cvx_problem_calib(
386
+ quantile,
387
+ self.x_calib,
388
+ self.scores_calib,
389
+ self.phi_calib,
390
+ self.infinite_params
391
+ )
392
+ if "MOSEK" in cp.installed_solvers():
393
+ prob.solve(solver="MOSEK")
394
+ else:
395
+ prob.solve()
396
+
397
+ fitted_weights = prob.var_dict['weights'].value
398
+ if x is not None:
399
+ K = pairwise_kernels(
400
+ X=x,
401
+ Y=self.x_calib,
402
+ metric=self.infinite_params.get("kernel", FUNCTION_DEFAULTS["kernel"]),
403
+ gamma=self.infinite_params.get("gamma", FUNCTION_DEFAULTS["gamma"])
404
+ )
405
+ else:
406
+ K = pairwise_kernels(
407
+ X=self.x_calib,
408
+ metric=self.infinite_params.get("kernel", FUNCTION_DEFAULTS["kernel"]),
409
+ gamma=self.infinite_params.get("gamma", FUNCTION_DEFAULTS["gamma"])
410
+ )
411
+ inner_prod = weights.T @ K @ fitted_weights
412
+ expectation = np.mean(weights.T @ K)
413
+ #penalty = self.infinite_params['lambda'] * (inner_prod / expectation)
414
+ penalty = (1/(len(self.x_calib) + 1))*(inner_prod / expectation)
415
+ return quantile - penalty
416
+
417
+ def predict_naive(
418
+ self,
419
+ quantile : float,
420
+ x : np.ndarray,
421
+ score_inv_fn : Callable
422
+ ):
423
+ """
424
+ If we do not wish to include the imputed data point, we can sanity check that
425
+ the regression is appropriately adaptive to the conditional variability in the data
426
+ by running a quantile regression on the calibration set without any imputation.
427
+ When n_calib is large and the fit is stable, we expect these two sets to nearly coincide.
428
+
429
+ Arguments
430
+ ---------
431
+ quantile : float
432
+ Nominal quantile level
433
+ x : np.ndarray
434
+ Set of points for which we are issuing prediction sets
435
+ score_inv_fn : Callable[np.ndarray, np.ndarray] -> np.ndarray
436
+ Vectorized function that takes in a score threshold S^* and test point x and
437
+ outputs all values of y such that S(x, y) <= S^*
438
+
439
+ Returns
440
+ -------
441
+ prediction_sets
442
+
443
+ """
444
+ if len(x.shape) < 2:
445
+ raise ValueError("x needs to have shape (m, n), not {x_test.shape}.")
446
+
447
+ if self.infinite_params.get('kernel', FUNCTION_DEFAULTS['kernel']):
448
+ prob = setup_cvx_problem_calib(
449
+ quantile,
450
+ self.x_calib,
451
+ self.scores_calib,
452
+ self.phi_calib,
453
+ self.infinite_params
454
+ )
455
+ if "MOSEK" in cp.installed_solvers():
456
+ prob.solve(solver="MOSEK", verbose=False)
457
+ else:
458
+ prob.solve()
459
+
460
+ weights = prob.var_dict['weights'].value
461
+ beta = prob.constraints[-1].dual_value
462
+ K = pairwise_kernels(
463
+ X=x,
464
+ Y=self.x_calib,
465
+ metric=self.infinite_params.get("kernel", FUNCTION_DEFAULTS["kernel"]),
466
+ gamma=self.infinite_params.get("gamma", FUNCTION_DEFAULTS["gamma"])
467
+ )
468
+ threshold = K @ weights + self.Phi_fn(x) @ beta
469
+ else:
470
+ S = np.concatenate([self.scores_calib, [S]], dtype=float)
471
+ Phi = self.phi_calib.astype(float)
472
+ zeros = np.zeros((Phi.shape[1],))
473
+
474
+ if quantile is None:
475
+ bounds = np.concatenate((self.quantile_calib - 1, self.quantile_calib), axis=1)
476
+ else:
477
+ bounds = [(quantile - 1, quantile)] * (len(self.scores_calib) + 1)
478
+ res = linprog(-1 * S, A_eq=Phi.T, b_eq=zeros, bounds=bounds, method='highs')
479
+ beta = -1 * res.eqlin.marginals
480
+ threshold = self.Phi_fn(x) @ beta
481
+
482
+ return score_inv_fn(threshold, x)
483
+
484
+ def verify_coverage(
485
+ self,
486
+ x : np.ndarray,
487
+ y : np.ndarray,
488
+ quantile : float,
489
+ randomize : bool = False,
490
+ resolve : bool = False,
491
+ return_dual : bool = False,
492
+ eps : float = 0.001
493
+ ):
494
+ """
495
+ In some experiments, we may simply be interested in verifying the coverage of our method.
496
+ In this case, we do not need to binary search for the threshold S^*, but only need to verify that
497
+ S <= f_S(x) for the true value of S. This function implements this check for test points
498
+ denoted by x and y
499
+
500
+ Arguments
501
+ ---------
502
+ x : np.ndarray
503
+ A vector of test covariates
504
+ y : np.ndarray
505
+ A vector of test labels
506
+ quantile : float
507
+ Nominal quantile level
508
+ resolve : bool
509
+ Resolve LP/QP with posited value to determine coverage
510
+
511
+ Returns
512
+ -------
513
+ coverage_booleans : np.ndarray
514
+ """
515
+ covers = []
516
+ duals = []
517
+
518
+ if quantile is None:
519
+ quantiles = np.concatenate((self.quantile_calib, [[0.]]), axis=0).flatten()
520
+ else:
521
+ quantiles = quantile * np.ones((len(self.scores_calib) + 1, 1))
522
+
523
+ if self.infinite_params.get('kernel', FUNCTION_DEFAULTS['kernel']):
524
+ for x_val, y_val in zip(x, y):
525
+ S_true = self.score_fn(x_val.reshape(1,-1), y_val)
526
+ eta = self._get_dual_solution(S_true[0], x_val.reshape(1,-1), quantiles) # no need to recompute quantiles
527
+ if randomize:
528
+ threshold = self.rng.uniform(low=quantile - 1, high=quantile)
529
+ elif quantile > 0.5:
530
+ threshold = quantile - eps
531
+ else:
532
+ threshold = quantile - 1 + eps
533
+ if quantile > 0.5:
534
+ covers.append(eta[-1] < threshold)
535
+ else:
536
+ covers.append(eta[-1] > threshold)
537
+ duals.append(eta[-1])
538
+
539
+ else:
540
+ for x_val, y_val in zip(x, y):
541
+ if randomize:
542
+ threshold = self.rng.uniform(low=quantiles[-1] - 1, high=quantiles[-1])
543
+ elif quantiles[-1] > 0.5:
544
+ threshold = quantiles[-1]
545
+ else:
546
+ threshold = quantiles[-1] - 1
547
+
548
+ S_true = self.score_fn(x_val.reshape(1,-1), y_val)
549
+ if resolve:
550
+ eta = self._get_dual_solution(S_true[0], x_val.reshape(1,-1), quantile)
551
+ if quantile > 0.5:
552
+ covers.append(eta[-1] < threshold)
553
+ else:
554
+ covers.append(eta[-1] > threshold)
555
+ duals.append(eta[-1])
556
+ else:
557
+ naive_duals, naive_primals = self._get_calibration_solution(
558
+ quantile
559
+ )
560
+ score_cutoff = self._compute_exact_cutoff(
561
+ quantiles,
562
+ naive_primals,
563
+ naive_duals,
564
+ self.Phi_fn(x_val),
565
+ threshold
566
+ )
567
+ if quantile > 0.5:
568
+ covers.append(S_true < score_cutoff)
569
+ else:
570
+ covers.append(S_true > score_cutoff)
571
+ duals.append(np.nan)
572
+ if return_dual:
573
+ return np.asarray(covers), np.asarray(duals)
574
+ return np.asarray(covers)
575
+
576
+ def _get_dual_solution(
577
+ self,
578
+ S : float,
579
+ x : np.ndarray,
580
+ quantiles : np.ndarray
581
+ ):
582
+ if self.infinite_params.get("kernel", FUNCTION_DEFAULTS['kernel']):
583
+ prob = finish_dual_setup(
584
+ self.cvx_problem,
585
+ S,
586
+ x,
587
+ quantiles[-1][0],
588
+ self.Phi_fn(x),
589
+ self.x_calib,
590
+ self.infinite_params
591
+ )
592
+ if "MOSEK" in cp.installed_solvers():
593
+ prob.solve(solver="MOSEK")
594
+ else:
595
+ prob.solve()
596
+ # TODO: THIS IS WRONG
597
+ #raise ValueError("need to get variable out of problem and return its value")
598
+ return prob.var_dict['weights'].value
599
+ else:
600
+ S = np.concatenate([self.scores_calib, [S]])
601
+ Phi = np.concatenate([self.phi_calib, self.Phi_fn(x)], axis=0)
602
+ zeros = np.zeros((Phi.shape[1],))
603
+ bounds = np.concatenate((quantiles - 1, quantiles), axis=1)
604
+ res = linprog(-1 * S, A_eq=Phi.T, b_eq=zeros, bounds=bounds,
605
+ method='highs-ds', options={'presolve': False})
606
+ eta = res.x
607
+ return eta
608
+
609
+
610
+ def _get_primal_solution(
611
+ self,
612
+ S : float,
613
+ x : np.ndarray,
614
+ quantiles : np.ndarray
615
+ ):
616
+ if self.infinite_params.get("kernel", FUNCTION_DEFAULTS['kernel']):
617
+ prob = finish_dual_setup(
618
+ self.cvx_problem,
619
+ S,
620
+ x,
621
+ quantiles[-1][0],
622
+ self.Phi_fn(x),
623
+ self.x_calib,
624
+ self.infinite_params
625
+ )
626
+ if "MOSEK" in cp.installed_solvers():
627
+ prob.solve(solver="MOSEK")
628
+ else:
629
+ prob.solve()
630
+
631
+ weights = prob.var_dict['weights'].value
632
+ beta = prob.constraints[-1].dual_value
633
+ else:
634
+ S = np.concatenate([self.scores_calib, [S]])
635
+ Phi = np.concatenate([self.phi_calib, self.Phi_fn(x)], axis=0)
636
+ zeros = np.zeros((Phi.shape[1],))
637
+ bounds = np.concatenate((quantiles - 1, quantiles), axis=1)
638
+ res = linprog(-1 * S, A_eq=Phi.T, b_eq=zeros, bounds=bounds,
639
+ method='highs-ds', options={'presolve': False})
640
+ beta = -1 * res.eqlin.marginals
641
+ weights = None
642
+ return beta, weights
643
+
644
    def _get_threshold(
        self,
        S : float,
        x : np.ndarray,
        quantiles : np.ndarray
    ):
        """Compute the conformal score threshold at test point `x`.

        The threshold is the fitted quantile function evaluated at x:
        Phi(x) @ beta, plus the RKHS component (K @ weights) when a kernel
        is configured.
        """
        beta, weights = self._get_primal_solution(S, x, quantiles)

        threshold = self.Phi_fn(x) @ beta
        if self.infinite_params.get('kernel', FUNCTION_DEFAULTS['kernel']):
            # Kernel evaluated over calibration points plus the test point;
            # only the last row (the test point) contributes to the threshold.
            K = pairwise_kernels(
                X=np.concatenate([self.x_calib, x.reshape(1,-1)], axis=0),
                Y=np.concatenate([self.x_calib, x.reshape(1,-1)], axis=0),
                metric=self.infinite_params.get("kernel", FUNCTION_DEFAULTS["kernel"]),
                gamma=self.infinite_params.get("gamma", FUNCTION_DEFAULTS["gamma"])
            )
            threshold = (K @ weights)[-1] + threshold
        return threshold
662
+
663
+
664
def binary_search(func, min, max, tol=1e-3):
    """Locate the sign change of a monotone function by bisection.

    Maintains the invariant func(lo) <= 0 < func(hi) (given a bracketing
    initial interval) and shrinks the bracket until its width is <= tol.

    Parameters
    ----------
    func : callable
        Scalar function; positive values shrink the upper end of the bracket.
    min, max : float
        Initial bracket endpoints. (Parameter names kept for backward
        compatibility even though they shadow builtins; the body rebinds
        them immediately so the builtins stay usable.)
    tol : float
        Bracket width at which to stop.

    Returns
    -------
    (lo, hi) : tuple of float
        Final bracket containing the crossing point.
    """
    lo, hi = float(min), float(max)
    # guard: tol must be representable at this magnitude, else the loop never ends
    assert (hi + tol) > hi
    while (hi - lo) > tol:
        mid = (lo + hi) / 2
        if func(mid) > 0:
            hi = mid
        else:
            lo = mid
    return lo, hi
674
+
675
+
676
def _solve_dual(S, gcc, x_test, quantiles, threshold=None):
    """Solve the dual problem for candidate score S and return the slack
    weights[-1] - threshold; its sign is used (by the caller) to decide
    whether S is accepted.

    Parameters
    ----------
    S : float
        Candidate conformity score for the test point.
    gcc : CondConf-like object
        Holds calibration data, Phi_fn, and solver configuration.
    x_test : np.ndarray
        Test covariates.
    quantiles : np.ndarray
        Per-point quantile bounds; quantiles[-1] is the test-point level.
    threshold : float, optional
        Override for the acceptance threshold; defaults to quantiles[-1] - 1
        for lower-tail levels and quantiles[-1] otherwise.
    """
    if gcc.infinite_params.get('kernel', None):
        # RKHS branch: reuse the cached cvxpy problem and just update params.
        prob = finish_dual_setup(
            gcc.cvx_problem,
            S,
            x_test,
            quantiles[-1][0],
            gcc.Phi_fn(x_test),
            gcc.x_calib,
            gcc.infinite_params
        )
        if "MOSEK" in cp.installed_solvers():
            prob.solve(solver="MOSEK")
        else:
            prob.solve(solver="OSQP")
        weights = prob.var_dict['weights'].value
    else:
        # LP branch: quantile-regression dual over calibration scores plus S.
        S = np.concatenate([gcc.scores_calib, [S]], dtype=float)
        Phi = np.concatenate([gcc.phi_calib, gcc.Phi_fn(x_test)], axis=0, dtype=float)
        zeros = np.zeros((Phi.shape[1],))

        # box constraints: quantile - 1 <= eta_i <= quantile
        bounds = np.concatenate((quantiles - 1, quantiles), axis=1)
        res = linprog(-1 * S, A_eq=Phi.T, b_eq=zeros, bounds=bounds,
                      method='highs', options={'presolve': False})
        weights = res.x

    if threshold is None:
        # Default threshold is the active box bound at the test point's level.
        if quantiles[-1] < 0.5:
            threshold = quantiles[-1] - 1
        else:
            threshold = quantiles[-1]
    # if quantile < 0.5:
    #     return weights[-1] + (1 - quantile)
    return weights[-1] - threshold
710
+
711
+
712
def setup_cvx_problem(
    x_calib,
    scores_calib,
    phi_calib,
    infinite_params = {}
):
    """Build the parameterized cvxpy dual problem used for conditional
    conformal prediction, with placeholders for the (unknown) test point.

    Parameters
    ----------
    x_calib : array-like
        Calibration covariates (used only in the kernel branch).
    scores_calib : np.ndarray
        Calibration conformity scores.
    phi_calib : np.ndarray or None
        Finite basis evaluated on calibration points; defaults to an
        intercept-only column of ones when None.
    infinite_params : dict
        Optional RKHS configuration ("kernel", "gamma", ...); an empty dict
        selects the purely finite-dimensional problem.
        NOTE(review): mutable default argument — safe here because it is
        never mutated, but worth confirming.

    Returns
    -------
    cp.Problem
        Problem whose parameters ("S_test", "Phi_test", "quantile", and the
        kernel parameters when applicable) are filled in later by
        finish_dual_setup.
    """
    n_calib = len(scores_calib)
    if phi_calib is None:
        phi_calib = np.ones((n_calib,1))

    # dual variable: one weight per calibration point plus the test point
    eta = cp.Variable(name="weights", shape=n_calib + 1)

    quantile = cp.Parameter(name="quantile")

    # scores vector = calibration scores (constant) stacked over S_test (parameter)
    scores_const = cp.Constant(scores_calib.reshape(-1,1))
    scores_param = cp.Parameter(name="S_test", shape=(1,1))
    scores = cp.vstack([scores_const, scores_param])

    # basis matrix = calibration rows (constant) stacked over the test row (parameter)
    Phi_calibration = cp.Constant(phi_calib)
    Phi_test = cp.Parameter(name="Phi_test", shape=(1, phi_calib.shape[1]))
    Phi = cp.vstack([Phi_calibration, Phi_test])

    kernel = infinite_params.get("kernel", FUNCTION_DEFAULTS["kernel"])
    gamma = infinite_params.get("gamma", FUNCTION_DEFAULTS["gamma"])

    if kernel is None: # no RKHS fitting
        # LP dual of quantile regression: box constraints plus orthogonality to Phi
        constraints = [
            (quantile - 1) <= eta,
            quantile >= eta,
            eta.T @ Phi == 0
        ]
        prob = cp.Problem(
            cp.Minimize(-1 * cp.sum(cp.multiply(eta, cp.vec(scores)))),
            constraints
        )
    else: # RKHS fitting
        radius = cp.Parameter(name="radius", nonneg=True)

        # Cholesky factor of the calibration kernel matrix (with jitter)
        _, L_11 = _get_kernel_matrix(x_calib, kernel, gamma)

        # pad with a zero column; the test row L_21_22 is filled in later
        L_11_const = cp.Constant(
            np.hstack([L_11, np.zeros((L_11.shape[0], 1))])
        )
        L_21_22_param = cp.Parameter(name="L_21_22", shape=(1, n_calib + 1))
        L = cp.vstack([L_11_const, L_21_22_param])

        C = radius / (n_calib + 1)

        # this is really C * (quantile - 1) and C * quantile
        constraints = [
            (quantile - 1) <= eta,
            quantile >= eta,
            eta.T @ Phi == 0]
        # QP dual: ridge-type quadratic in L.T @ eta plus the linear score term
        prob = cp.Problem(
            cp.Minimize(0.5 * C * cp.sum_squares(L.T @ eta) - cp.sum(cp.multiply(eta, cp.vec(scores)))),
            constraints
        )
    return prob
770
+
771
+
772
def _get_kernel_matrix(x_calib, kernel, gamma):
    """Gram matrix of the calibration points and its Cholesky factor.

    A small ridge (1e-5 on the diagonal) is added so the factorization is
    numerically stable even when the kernel matrix is near-singular.
    """
    n = len(x_calib)
    gram = pairwise_kernels(X=x_calib, metric=kernel, gamma=gamma)
    gram = gram + 1e-5 * np.eye(n)
    chol = np.linalg.cholesky(gram)
    return gram, chol
781
+
782
+
783
def finish_dual_setup(
    prob : cp.Problem,
    S : np.ndarray,
    X : np.ndarray,
    quantile : float,
    Phi : np.ndarray,
    x_calib : np.ndarray,
    infinite_params = {}
):
    """Fill in the test-point-dependent parameters of a problem built by
    setup_cvx_problem and return it ready to solve.

    Parameters
    ----------
    prob : cp.Problem
        Parameterized problem from setup_cvx_problem.
    S : float-like
        Candidate conformity score for the test point.
    X : np.ndarray
        Test covariates.
    quantile : float
        Quantile level for the test point.
    Phi : np.ndarray
        Finite basis evaluated at the test point.
    x_calib : np.ndarray
        Calibration covariates (for kernel evaluations).
    infinite_params : dict
        RKHS configuration ("kernel", "gamma", "lambda").
    """
    prob.param_dict['S_test'].value = np.asarray([[S]])
    prob.param_dict['Phi_test'].value = Phi.reshape(1,-1)
    prob.param_dict['quantile'].value = quantile

    kernel = infinite_params.get('kernel', FUNCTION_DEFAULTS['kernel'])
    gamma = infinite_params.get('gamma', FUNCTION_DEFAULTS['gamma'])
    # RKHS ball radius is the reciprocal of the regularization strength
    radius = 1 / infinite_params.get('lambda', FUNCTION_DEFAULTS['lambda'])

    if kernel is not None:
        # cross-kernel between (calibration + test) points and the test point
        K_12 = pairwise_kernels(
            X=np.concatenate([x_calib, X.reshape(1,-1)], axis=0),
            Y=X.reshape(1,-1),
            metric=kernel,
            gamma=gamma
        )

        if 'K_12' in prob.param_dict:
            prob.param_dict['K_12'].value = K_12[:-1]
            prob.param_dict['K_21'].value = K_12.T

        # extend the Cholesky factorization by one row for the test point:
        # L_21 solves L_11 L_21^T = K_12, L_22 is the Schur-complement root
        _, L_11 = _get_kernel_matrix(x_calib, kernel, gamma)
        K_22 = pairwise_kernels(
            X=X.reshape(1,-1),
            metric=kernel,
            gamma=gamma
        )
        L_21 = np.linalg.solve(L_11, K_12[:-1]).T
        L_22 = K_22 - L_21 @ L_21.T
        # clip tiny negative values from round-off before taking the sqrt
        L_22[L_22 < 0] = 0
        L_22 = np.sqrt(L_22)
        prob.param_dict['L_21_22'].value = np.hstack([L_21, L_22])

        prob.param_dict['radius'].value = radius

        # update quantile definition for silly cvxpy reasons
        prob.param_dict['quantile'].value = quantile
        #prob.param_dict['quantile'].value *= radius / (len(x_calib) + 1)

    return prob
831
+
832
def setup_cvx_problem_calib(
    quantile,
    x_calib,
    scores_calib,
    phi_calib,
    infinite_params = {}
):
    """Build the calibration-only cvxpy dual problem (no test-point
    placeholders), for solving the fit on calibration data alone.

    Mirrors setup_cvx_problem but with a fixed numeric `quantile`, an
    eta of size n_calib (no test slot), and all data as constants.
    """
    n_calib = len(scores_calib)
    if phi_calib is None:
        phi_calib = np.ones((n_calib,1))

    # dual variable: one weight per calibration point only
    eta = cp.Variable(name="weights", shape=n_calib)

    scores = cp.Constant(scores_calib.reshape(-1,1))

    Phi = cp.Constant(phi_calib)

    kernel = infinite_params.get("kernel", FUNCTION_DEFAULTS["kernel"])
    gamma = infinite_params.get("gamma", FUNCTION_DEFAULTS["gamma"])

    if kernel is None: # no RKHS fitting
        # LP dual of quantile regression: box constraints plus orthogonality
        constraints = [
            (quantile - 1) <= eta,
            quantile >= eta,
            eta.T @ Phi == 0
        ]
        prob = cp.Problem(
            cp.Minimize(-1 * cp.sum(cp.multiply(eta, cp.vec(scores)))),
            constraints
        )
    else: # RKHS fitting
        # RKHS ball radius is the reciprocal of the regularization strength
        radius = 1 / infinite_params.get('lambda', FUNCTION_DEFAULTS['lambda'])

        _, L = _get_kernel_matrix(x_calib, kernel, gamma)

        C = radius / (n_calib + 1)

        constraints = [
            (quantile - 1) <= eta,
            quantile >= eta,
            eta.T @ Phi == 0]
        # QP dual with the ridge-type quadratic term in L.T @ eta
        prob = cp.Problem(
            cp.Minimize(0.5 * C * cp.sum_squares(L.T @ eta) - cp.sum(cp.multiply(eta, cp.vec(scores)))),
            constraints
        )
    return prob
MACI-main/conditional-conformal/conditionalconformal/experiment_utils.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+ from sklearn.linear_model import LinearRegression
5
+ from quantile_forest import RandomForestQuantileRegressor
6
+
7
+ from conditionalconformal import CondConf
8
+
9
+ ## get base model for constructing scores
10
def fit_model(data_train, base_model):
    """Fit the base regression model used to construct conformity scores.

    Parameters
    ----------
    data_train : tuple
        (x_train, y_train) arrays.
    base_model : str
        One of "ols" (linear regression), "qrf" (quantile random forest),
        or "qr" (linear quantile regression via CondConf).

    Returns
    -------
    Fitted regressor exposing a `predict` method (for "qr" and "qrf" the
    predict call takes a quantile as a second argument).

    Raises
    ------
    ValueError
        If `base_model` is not one of the supported names (previously this
        fell through to an UnboundLocalError on `reg`).
    """
    x_train, y_train = data_train
    if base_model == "ols":
        reg = LinearRegression().fit(x_train, y_train)
    elif base_model == "qrf":
        reg = RandomForestQuantileRegressor()
        reg.fit(x_train, y_train)
    elif base_model == "qr":
        reg = CondConf(score_fn = lambda x, y: y, Phi_fn = lambda x: x)
        reg.setup_problem(x_train, y_train)
        # overwrite prediction function so it looks like a regression object
        reg.predict = lambda x, q: (x @ reg._get_calibration_solution(q)[1]).flatten() # expects x to be of form (n_points, n_feats)
    else:
        raise ValueError(f"unknown base_model: {base_model!r}")
    return reg
23
+
24
+ # helper function for splitting dataset
25
def split_dataset(dataset, n_test, n_calib, rng):
    """Randomly partition (X, Y) into train / calibration / test splits.

    The first n_test shuffled indices form the test set, the next n_calib
    form the calibration set, and the remainder form the training set.

    Returns
    -------
    ((X_train, Y_train), (X_calib, Y_calib), (X_test, Y_test))
    """
    X, Y = dataset
    perm = np.arange(len(X))
    rng.shuffle(perm)
    idx_test, idx_calib, idx_train = np.array_split(
        perm,
        np.cumsum([n_test, n_calib])
    )
    return (
        (X[idx_train], Y[idx_train]),
        (X[idx_calib], Y[idx_calib]),
        (X[idx_test], Y[idx_test]),
    )
43
+
44
+ # get coverages for each method type...
45
# get coverages for each method type...
def get_coverage(dataset_calib, dataset_test, score_fn, method, quantile):
    """Return a boolean coverage indicator per test point for one conformal method.

    Parameters
    ----------
    dataset_calib, dataset_test : tuple
        (X, Y) pairs for calibration and test.
    score_fn : callable
        Conformity score function score_fn(X, Y).
    method : str
        "split" for split conformal; any method containing "rand" uses the
        randomized conditional calibration; anything else uses the
        non-randomized conditional calibration.
    quantile : float
        Target quantile level; levels below 0.5 flip the coverage inequality.
    """
    if method == "split":
        scores_calib = score_fn(*dataset_calib)
        scores_test = score_fn(*dataset_test)

        # split-conformal cutoff with the (1 + 1/n) finite-sample correction
        score_cutoff = np.quantile(
            scores_calib,
            [quantile * (1 + 1/len(scores_calib))]
        )
        if quantile >= 0.5:
            covs = scores_test <= score_cutoff
        else:
            covs = scores_test >= score_cutoff
    elif "rand" in method:
        # conditional calibration with randomized acceptance
        condcalib = CondConf(score_fn, lambda x: x)
        condcalib.setup_problem(*dataset_calib)
        X_test, Y_test = dataset_test
        covs = condcalib.verify_coverage(X_test, Y_test, quantile, resolve=True, randomize=True)
    else:
        # conditional calibration, deterministic variant
        condcalib = CondConf(score_fn, lambda x: x)
        condcalib.setup_problem(*dataset_calib)
        X_test, Y_test = dataset_test
        covs = condcalib.verify_coverage(X_test, Y_test, quantile, resolve=True, randomize=False)
    return covs
69
+
70
+ # get coverages for each method type...
71
# get coverages for each method type...
def get_cutoff(dataset_calib, dataset_test, score_fn, method, quantile):
    """Compute per-test-point score cutoffs for one conformal method and the
    resulting coverage indicators.

    Returns
    -------
    (cutoffs, coverages) : tuple of np.ndarray
        cutoffs has one entry per test point (constant for "split");
        coverages compares test scores against the flattened cutoffs, with
        the inequality direction flipped for lower-tail quantiles.
    """
    print(method, quantile)  # NOTE(review): debug print left in by the authors
    scores_test = score_fn(*dataset_test)
    if method == "split":
        scores_calib = score_fn(*dataset_calib)
        # split-conformal cutoff with the (1 + 1/n) finite-sample correction
        score_cutoff = np.quantile(
            scores_calib,
            [quantile * (1 + 1/len(scores_calib))]
        )
        cutoffs = np.ones((len(scores_test,))) * score_cutoff
    elif "rand" in method:
        # conditional calibration with randomized prediction, per test point
        condcalib = CondConf(score_fn, lambda x: x)
        condcalib.setup_problem(*dataset_calib)
        cutoffs = []
        for x in dataset_test[0]:
            cutoff = condcalib.predict(quantile, x, lambda c, x: c, randomize=True)
            cutoffs.append(cutoff)
        cutoffs = np.asarray(cutoffs)
    else:
        # conditional calibration, deterministic variant
        condcalib = CondConf(score_fn, lambda x: x)
        condcalib.setup_problem(*dataset_calib)
        cutoffs = []
        for x in dataset_test[0]:
            cutoff = condcalib.predict(quantile, x, lambda c, x: c, randomize=False)
            cutoffs.append(cutoff)
        cutoffs = np.asarray(cutoffs)
    if quantile > 0.5:
        coverages = scores_test <= cutoffs.flatten()
    else:
        coverages = scores_test >= cutoffs.flatten()
    return cutoffs, coverages
102
+
103
+
104
def run_coverage_experiment(dataset, n_test, n_calib, alpha, methods = [], seed = 0):
    """Run a two-sided coverage experiment across conformal method variants.

    Parameters
    ----------
    dataset : tuple
        Full (X, Y) data; split into train/calibration/test internally.
    n_test, n_calib : int
        Split sizes; the remainder is used for training base models.
    alpha : float
        Miscoverage level; upper/lower cutoffs target 1 - alpha/2 and alpha/2.
    methods : list of str
        Entries of the form "BASE-CONFORMAL" (see comment below).
    seed : int
        Seed for the split RNG.
        NOTE(review): mutable default for `methods` — safe as it is only read.

    Returns
    -------
    (X_test, coverages) where coverages is a list of per-point boolean arrays,
    one per method, covering both tails jointly.
    """
    rng = np.random.default_rng(seed=seed)

    dataset_train, dataset_calib, dataset_test = split_dataset(
        dataset,
        n_test,
        n_calib,
        rng
    )

    ### Compute conformity scores
    # fit each distinct base model once, shared across its conformal variants
    base_methods = set([m.split('-')[0] for m in methods])
    base_model = {base : fit_model(dataset_train, base) for base in base_methods}

    coverages = []
    # example methods: (BASE_METHOD)-(CONFORMAL_METHOD)
    # BASE_METHOD valid choices: "ols", "qr", "qrf"
    # CONFORMAL_METHOD valid choices: "split", "cc", "ccrand", "lcp", "rlcp" (todo on last two)
    for method in methods:
        base_method, conformal_method = method.split('-')
        reg = base_model[base_method]
        if "q" in base_method: # if a quantile regression score needs to specify quantile
            score_fn_upper = lambda x, y: y - reg.predict(x, 1 - alpha/2)
            score_fn_lower = lambda x, y: y - reg.predict(x, alpha/2)
        else:
            # mean-regression residual score is the same for both tails
            score_fn_upper = lambda x, y: y - reg.predict(x)
            score_fn_lower = lambda x, y: y - reg.predict(x)
        covers_upper = get_coverage(dataset_calib, dataset_test, score_fn_upper, conformal_method, 1 - alpha/2)
        covers_lower = get_coverage(dataset_calib, dataset_test, score_fn_lower, conformal_method, alpha/2)
        # a point is covered only when both one-sided bounds hold
        covers = np.logical_and(covers_upper, covers_lower)
        coverages.append(covers)

    return dataset_test[0], coverages
137
+
138
+
139
def run_experiment(dataset, n_test, n_calib, alpha, methods = [], seed = 0):
    """Run the interval-length + coverage experiment across method variants.

    Like run_coverage_experiment, but also records per-point interval
    lengths (upper cutoff - lower cutoff, plus the base quantile-prediction
    gap for quantile-based models).

    Returns
    -------
    (X_test, (all_lengths, all_coverages)) with one entry per method.
    """
    rng = np.random.default_rng(seed=seed)

    dataset_train, dataset_calib, dataset_test = split_dataset(
        dataset,
        n_test,
        n_calib,
        rng
    )

    ### Compute conformity scores
    # all three base models are fit up front here (unlike run_coverage_experiment)
    base_model = {base : fit_model(dataset_train, base) for base in ["ols", "qrf", "qr"]}

    all_lengths = []
    all_coverages = []
    # example methods: (BASE_METHOD)-(CONFORMAL_METHOD)
    # BASE_METHOD valid choices: "ols", "qr", "qrf"
    # CONFORMAL_METHOD valid choices: "split", "cc", "ccrand", "lcp", "ccqp"
    for method in methods:
        base_method, conformal_method = method.split('-')
        reg = base_model[base_method]
        if "qrf" in base_method: # if a quantile regression score needs to specify quantile
            # tiny uniform jitter breaks ties in the forest's discrete predictions
            score_fn_upper = lambda x, y: y - reg.predict(x, 1 - alpha/2) + rng.uniform(0, 1e-5, size=len(x))
            score_fn_lower = lambda x, y: y - reg.predict(x, alpha/2) + rng.uniform(0, 1e-5, size=len(x))
        elif "q" in base_method:
            score_fn_upper = lambda x, y: y - reg.predict(x, 1 - alpha/2)
            score_fn_lower = lambda x, y: y - reg.predict(x, alpha/2)
        else:
            # mean-regression residual score is the same for both tails
            score_fn_upper = lambda x, y: y - reg.predict(x)
            score_fn_lower = lambda x, y: y - reg.predict(x)
        cutoffs_upper, cov_upper = get_cutoff(dataset_calib, dataset_test, score_fn_upper, conformal_method, 1 - alpha/2)
        cutoffs_lower, cov_lower = get_cutoff(dataset_calib, dataset_test, score_fn_lower, conformal_method, alpha/2)
        if "q" in base_method:
            # quantile models: residual cutoffs are relative to the predicted
            # quantiles, so add the base prediction gap back into the length
            pred_upper = reg.predict(dataset_test[0], 1 - alpha/2)
            pred_lower = reg.predict(dataset_test[0], alpha/2)
            pred_gap = pred_upper - pred_lower
        else:
            pred_gap = 0
        lengths = cutoffs_upper - cutoffs_lower + pred_gap
        coverage = np.logical_and(cov_upper, cov_lower)
        all_lengths.append(lengths)
        all_coverages.append(coverage)

    return dataset_test[0], (all_lengths, all_coverages)
MACI-main/conditional-conformal/conditionalconformal/synthetic_data.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
def generate_cqr_data(seed,n_train=2000,n_calib=1000,n_test=500):
    """Generate the 1-D synthetic regression dataset from the CQR paper setup:
    Poisson-modulated sine mean, heteroscedastic noise, and rare large outliers.

    Returns
    -------
    (x_train, y_train, x_calib, y_calib, x_test, y_test), where the first
    n_train points are training and the next n_calib are calibration.
    """
    np.random.seed(seed)

    # draw train + calibration together, then split at the end
    n_train = n_train + n_calib

    def f(x):
        ''' Construct data (1D example)
        '''
        ax = 0*x
        for i in range(len(x)):
            # Poisson signal + x-scaled Gaussian noise
            ax[i] = np.random.poisson(np.sin(x[i])**2+0.1) + 0.03*x[i]*np.random.randn(1)
            # 1% chance of a large outlier
            ax[i] += 25*(np.random.uniform(0,1,1)<0.01)*np.random.randn(1)
        return ax.astype(np.float32)

    # training features
    x_train = np.random.uniform(0, 5.0, size=n_train).astype(np.float32)

    # test features
    x_test = np.random.uniform(0, 5.0, size=n_test).astype(np.float32)

    # generate labels
    y_train = f(x_train)
    y_test = f(x_test)

    # reshape the features
    x_train = np.reshape(x_train,(n_train,1))
    x_test = np.reshape(x_test,(n_test,1))

    # carve the calibration set off the end of the training draw
    train_set_size = len(y_train) - n_calib
    x_train_final = x_train[ : train_set_size]
    x_calib = x_train[train_set_size : ]
    y_train_final = y_train[ : train_set_size]
    y_calib = y_train[train_set_size : ]

    return x_train_final, y_train_final, x_calib, y_calib, x_test, y_test
38
+
39
+
40
def indicator_matrix(scalar_values, disc):
    """One-hot interval-membership matrix.

    Entry (i, j) is 1 exactly when disc[j] <= scalar_values[i] < disc[j+1]
    (each upper edge is exclusive, including the last). Vectorized with
    NumPy broadcasting instead of the original O(n * m) Python double loop;
    the semantics are identical, including for unsorted `disc`.

    Parameters
    ----------
    scalar_values : array-like
        Values to bin.
    disc : array-like
        Interval edges; consecutive pairs define the intervals.

    Returns
    -------
    np.ndarray of shape (len(scalar_values), len(disc) - 1)
    """
    values = np.asarray(scalar_values)
    edges = np.asarray(disc)
    lower = edges[:-1]
    upper = edges[1:]
    # broadcast values (n, 1) against interval edges (1, m)
    matrix = (
        (values[:, None] >= lower[None, :]) & (values[:, None] < upper[None, :])
    ).astype(float)
    return matrix
MACI-main/conditional-conformal/src/atomizer.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import numpy as np
3
+ import re
4
+ import string
5
+ import spacy
6
+ import nltk
7
+ from rank_bm25 import BM25Okapi
8
+ import os
9
+
10
+ from concurrent.futures import ThreadPoolExecutor
11
+ from nltk.tokenize import sent_tokenize
12
+
13
+ nltk.download("punkt")
14
+
15
+
16
class Atomizer(object):
    """Splits an LLM generation into sentences and decomposes each sentence
    into atomic facts by few-shot prompting `client`, with BM25-retrieved
    demonstrations (FactScore-style pipeline)."""

    def __init__(self, client, demo_dir):
        """client: LLM Client wrapper; demo_dir: directory holding demos.json."""
        self.nlp = spacy.load("en_core_web_sm")
        # hard-coded to the biography setting; the "complex" demo file is unused
        self.is_bio = True
        self.demo_path = os.path.join(demo_dir, "demos.json" if self.is_bio else "demos_complex.json")

        self.client = client

        # get the demos
        with open(self.demo_path, 'r') as f:
            self.demos = json.load(f)

        # BM25 index over the demo sentences, used to pick in-context examples
        tokenized_corpus = [doc.split(" ") for doc in self.demos.keys()]
        self.bm25 = BM25Okapi(tokenized_corpus)

    def save_cache(self):
        """Persist the client's prompt/response cache to disk."""
        self.client.save_cache()

    def run(self, generation, cost_estimate=None):
        """Convert the generation into a set of atomic facts. Return a total words cost if cost_estimate != None."""
        assert isinstance(generation, str), "generation must be a string"
        # split into non-empty paragraphs
        paragraphs = [para.strip() for para in generation.split("\n") if len(para.strip()) > 0]
        return self.get_atomic_facts_from_paragraph(paragraphs, cost_estimate=cost_estimate)

    def get_atomic_facts_from_paragraph(self, paragraphs, cost_estimate=None):
        """Sentence-split each paragraph, atomize the sentences, and return
        (list of (sentence, facts) pairs, paragraph-break indices)."""
        sentences = []
        para_breaks = []
        for para_idx, paragraph in enumerate(paragraphs):
            if para_idx > 0 :
                para_breaks.append(len(sentences))

            initials = detect_initials(paragraph)

            curr_sentences = sent_tokenize(paragraph)
            curr_sentences_2 = sent_tokenize(paragraph)

            curr_sentences = fix_sentence_splitter(curr_sentences, initials)
            curr_sentences_2 = fix_sentence_splitter(curr_sentences_2, initials)

            # checking this, just to ensure the crediability of the sentence splitter fixing algorithm
            assert curr_sentences == curr_sentences_2, (paragraph, curr_sentences, curr_sentences_2)

            sentences += curr_sentences

        # filter out boilerplate first/last sentences in the non-bio setting
        atoms_or_estimate = self.get_init_atomic_facts_from_sentence([sent for i, sent in enumerate(sentences) if not (not self.is_bio and ( \
            (i==0 and (sent.startswith("Sure") or sent.startswith("Here are"))) or \
            (i==len(sentences)-1 and (sent.startswith("Please") or sent.startswith("I hope") or sent.startswith("Here are")))))], cost_estimate=cost_estimate)

        if cost_estimate:
            return atoms_or_estimate
        else:
            atoms = atoms_or_estimate
            atomic_facts_pairs = []
            for i, sent in enumerate(sentences):
                # boilerplate sentences get an empty fact list
                if not self.is_bio and ( \
                    (i==0 and (sent.startswith("Sure") or sent.startswith("Here are"))) or \
                    (i==len(sentences)-1 and (sent.startswith("Please") or sent.startswith("I hope") or sent.startswith("Here are")))):
                    atomic_facts_pairs.append((sent, []))
                elif self.is_bio and sent.startswith("This sentence does not contain any facts"):
                    atomic_facts_pairs.append((sent, []))
                elif sent.startswith("Sure") or sent.startswith("Please") or (i==0 and sent.startswith("Here are")):
                    atomic_facts_pairs.append((sent, []))
                else:
                    atomic_facts_pairs.append((sent, atoms[sent]))

            # postprocess_atomic_facts will fix minor issues from InstructGPT
            # it is supposed to handle sentence splitter issue too, but since here
            # we fixed sentence splitter issue already,
            # the new para_breaks should be identical to the original para_breaks
            if self.is_bio:
                atomic_facts_pairs, para_breaks = postprocess_atomic_facts(atomic_facts_pairs, list(para_breaks), self.nlp)

            return atomic_facts_pairs, para_breaks


    def get_init_atomic_facts_from_sentence(self, sentences, cost_estimate=None):
        """Get the initial atomic facts from the sentences. Return a total words cost if cost_estimate != None."""

        is_bio = self.is_bio
        demos = self.demos

        # k: BM25-retrieved demos per sentence; n: fixed leading demos
        k = 1 if is_bio else 0
        n = 7 if is_bio else 8

        prompts = []
        prompt_to_sent = {}
        atoms = {}
        for sentence in sentences:
            if sentence in atoms:
                continue
            top_matchings = best_demos(sentence, self.bm25, list(demos.keys()), k)
            prompt = ""

            # fixed in-context examples: the first n demo sentences
            for i in range(n):
                prompt = prompt + "Please breakdown the following sentence into independent facts: {}\n".format(list(demos.keys())[i])
                for fact in demos[list(demos.keys())[i]]:
                    prompt = prompt + "- {}\n".format(fact)
                prompt = prompt + "\n"

            # retrieved in-context examples: BM25 nearest demo sentences
            for match in top_matchings:
                prompt = prompt + "Please breakdown the following sentence into independent facts: {}\n".format(match)
                for fact in demos[match]:
                    prompt = prompt + "- {}\n".format(fact)
                prompt = prompt + "\n"
            prompt = prompt + "Please breakdown the following sentence into independent facts: {}\n".format(sentence)
            prompts.append(prompt)
            prompt_to_sent[prompt] = sentence

        if cost_estimate:
            # dry run: just count prompt words (optionally skipping cached prompts)
            total_words_estimate = 0
            for prompt in prompts:
                if cost_estimate == "consider_cache" and (prompt.strip() + "_0") in self.client.cache_dict:
                    continue
                total_words_estimate += len(prompt.split())
            return total_words_estimate
        else:
            outputs = []

            # fan out all prompts to the LLM concurrently
            with ThreadPoolExecutor(max_workers=len(prompts)) as executor:
                outputs = list(
                    executor.map(
                        lambda x : self.client.query(x),
                        prompts
                    )
                )
            for prompt, output in zip(prompts, outputs):
                atoms[prompt_to_sent[prompt]] = text_to_sentences(output[0]['message'])
            # for prompt in prompts:
            #     output = self.client.query(prompt)
            #     outputs.append(output)
            #     atoms[prompt_to_sent[prompt]] = text_to_sentences(output[0]['message'])

            self.client.cache_outputs(
                prompts=prompts,
                sample_indices=np.zeros((len(prompts),), dtype=int),
                outputs=outputs
            )

            # demo sentences already have gold fact decompositions
            for key, value in demos.items():
                if key not in atoms:
                    atoms[key] = value

            return atoms
159
+
160
+
161
def best_demos(query, bm25, demos_sents, k):
    """Return the k demo sentences ranked most similar to `query` by BM25."""
    query_tokens = query.split(" ")
    return bm25.get_top_n(query_tokens, demos_sents, k)
165
+
166
+
167
+ # transform InstructGPT output into sentences
168
# transform InstructGPT output into sentences
def text_to_sentences(text):
    """Parse an InstructGPT bullet list ("- fact\\n- fact") into fact strings.

    Fixes two issues in the original: empty bullet fragments (e.g. a stray
    trailing "- ") raised IndexError on `sent.strip()[-1]`, and the
    trailing-'\\n' check was dead code since strip() already removes it.
    A terminating period is appended to the last fact when missing.
    """
    fragments = text.split("- ")[1:]
    # strip each fragment; drop empties so a dangling bullet can't crash us
    sentences = [frag.strip() for frag in fragments if frag.strip()]
    if sentences and not sentences[-1].endswith('.'):
        sentences[-1] = sentences[-1] + '.'
    return sentences
177
+
178
+
179
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    # same pipeline as before: lowercase -> strip punctuation ->
    # drop articles -> collapse whitespace
    text = s.lower()
    text = ''.join(ch for ch in text if ch not in string.punctuation)
    text = re.sub(re.compile(r'\b(a|an|the)\b', re.UNICODE), ' ', text)
    return ' '.join(text.split())
192
+
193
# Calendar month names, lower-cased once for case-insensitive matching.
MONTHS = [
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
]
195
+
196
def is_num(text):
    """True when `text` can be parsed as an integer."""
    try:
        int(text)
    except Exception:
        return False
    return True
202
+
203
def is_date(text):
    """True when every token of the normalized text is a number or a month name."""
    tokens = normalize_answer(text).split(" ")
    return all(is_num(token) or token in MONTHS for token in tokens)
209
+
210
def extract_numeric_values(text):
    """Return the set of standalone integer literals in `text`, as strings."""
    # \b\d+\b matches whole runs of digits bounded by non-word characters
    return set(re.findall(r'\b\d+\b', text))
214
+
215
+
216
def detect_entities(text, nlp):
    """Collect date-like and numeric entity strings from `text` using the
    given spacy pipeline plus a regex pass for bare integers.

    Returns a set of entity strings; hyphenated spans are split on '-'.
    """
    doc = nlp(text)
    entities = set()

    def _add_to_entities(text):
        # split hyphenated spans (e.g. date ranges) into their parts
        if "-" in text:
            for _text in text.split("-"):
                entities.add(_text.strip())
        else:
            entities.add(text)


    for ent in doc.ents:
        # spacy often has errors with other types of entities
        if ent.label_ in ["DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"]:

            if is_date(ent.text):
                _add_to_entities(ent.text)
            else:
                # keep only the date-like tokens of a mixed span
                for token in ent.text.split():
                    if is_date(token):
                        _add_to_entities(token)

    # add bare integers the NER pass missed, unless already covered by
    # (a substring of) an existing entity
    for new_ent in extract_numeric_values(text):
        if not np.any([new_ent in ent for ent in entities]):
            entities.add(new_ent)

    return entities
244
+
245
def postprocess_atomic_facts(_atomic_facts, para_breaks, nlp):
    """Clean up model-produced atomic facts: merge dangling one-word
    sentences into their predecessor, drop vague verb-only facts that are
    subsumed by other facts, and repair truncated numeric/date entities.

    Parameters
    ----------
    _atomic_facts : list of (sentence, facts)
    para_breaks : list of int
        Indices where new paragraphs start; recomputed for merged sentences.
    nlp : spacy pipeline
        Used for entity detection.

    Returns
    -------
    (new_atomic_facts, new_para_breaks)
    """
    # facts ending in these verbs are considered too vague to stand alone
    verbs = ["born.", " appointed.", " characterized.", " described.", " known.", " member.", " advocate.", "served.", "elected."]
    permitted_verbs = ["founding member."]

    atomic_facts = []
    new_atomic_facts = []
    new_para_breaks = []

    # pass 1: merge one-word "sentences" (splitter artifacts) into the
    # previous entry, keeping paragraph-break indices consistent
    for i, (sent, facts) in enumerate(_atomic_facts):
        sent = sent.strip()
        if len(sent.split())==1 and i not in para_breaks and i > 0:
            assert i not in para_breaks
            atomic_facts[-1][0] += " " + sent
            atomic_facts[-1][1] += facts
        else:
            if i in para_breaks:
                new_para_breaks.append(len(atomic_facts))
            atomic_facts.append([sent, facts])

    # pass 2: per sentence, drop subsumed vague facts and repair entities
    for i, (sent, facts) in enumerate(atomic_facts):
        entities = detect_entities(sent, nlp)
        covered_entities = set()
        # print (entities)
        new_facts = []
        # NOTE(review): this inner loop reuses `i`, shadowing the outer index
        for i, fact in enumerate(facts):
            if any([fact.endswith(verb) for verb in verbs]) and not any([fact.endswith(verb) for verb in permitted_verbs]):
                # skip a vague fact that appears verbatim inside another fact
                if any([fact[:-1] in other_fact for j, other_fact in enumerate(facts) if j != i]):
                    continue
            sent_entities = detect_entities(fact, nlp)
            covered_entities |= set([e for e in sent_entities if e in entities])
            new_entities = sent_entities - entities
            if len(new_entities) > 0:
                # the fact mentions entities not in the sentence: try to map
                # each one back to a sentence entity it is a prefix of
                # (e.g. a truncated year); otherwise drop the fact
                do_pass = False
                for new_ent in new_entities:
                    pre_ent = None
                    for ent in entities:
                        if ent.startswith(new_ent):
                            pre_ent = ent
                            break
                    if pre_ent is None:
                        do_pass = True
                        break
                    fact = fact.replace(new_ent, pre_ent)
                    covered_entities.add(pre_ent)
                if do_pass:
                    continue
            if fact in new_facts:
                continue
            new_facts.append(fact)
        try:
            # sanity check: every sentence entity should appear in some fact
            assert entities==covered_entities
        except Exception:
            new_facts = facts # there is a bug in spacy entity linker, so just go with the previous facts

        new_atomic_facts.append((sent, new_facts))

    return new_atomic_facts, new_para_breaks
303
+
304
def is_integer(s):
    """True when `s` parses as an int (same contract as is_num above)."""
    try:
        int(s)
    except Exception:
        return False
    return True
310
+
311
def detect_initials(text):
    """Find pairs of initials such as "J. R." or "J.R." in `text`."""
    # two capital letters, each followed by a period, optionally space-separated
    pattern = r"[A-Z]\. ?[A-Z]\."
    return list(re.findall(pattern, text))
315
+
316
def fix_sentence_splitter(curr_sentences, initials):
    """Repair sentence splits broken by initials and dangling fragments.

    Two passes: (1) re-merge adjacent sentences that were split in the
    middle of an initial pair like "J. R."; (2) glue one-word fragments and
    lowercase-starting continuations onto the previous sentence.

    Bug fix: the original wrote `combined_with_previous = False` (a typo for
    `combine_with_previous`) in the one-word-fragment branch, so the flag
    stayed True and the NEXT full sentence was also wrongly merged.
    """
    # pass 1: merge sentence pairs split inside an initial pair
    for initial in initials:
        if not np.any([initial in sent for sent in curr_sentences]):
            alpha1, alpha2 = [t.strip() for t in initial.split(".") if len(t.strip()) > 0]
            for i, (sent1, sent2) in enumerate(zip(curr_sentences, curr_sentences[1:])):
                if sent1.endswith(alpha1 + ".") and sent2.startswith(alpha2 + "."):
                    # merge sentence i and i+1
                    curr_sentences = curr_sentences[:i] + [curr_sentences[i] + " " + curr_sentences[i+1]] + curr_sentences[i+2:]
                    break
    # pass 2: glue fragments onto their predecessor
    sentences = []
    combine_with_previous = None
    for sent_idx, sent in enumerate(curr_sentences):
        if len(sent.split()) <= 1 and sent_idx == 0:
            # a leading one-word fragment: keep it, merge the next sentence in
            assert not combine_with_previous
            combine_with_previous = True
            sentences.append(sent)
        elif len(sent.split()) <= 1:
            # mid-text one-word fragment: attach to the previous sentence
            assert sent_idx > 0
            sentences[-1] += " " + sent
            combine_with_previous = False  # fixed: was the typo `combined_with_previous`
        elif sent[0].isalpha() and not sent[0].isupper() and sent_idx > 0:
            # lowercase start means this is a continuation, not a new sentence
            assert sent_idx > 0, curr_sentences
            sentences[-1] += " " + sent
            combine_with_previous = False
        elif combine_with_previous:
            assert sent_idx > 0
            sentences[-1] += " " + sent
            combine_with_previous = False
        else:
            assert not combine_with_previous
            sentences.append(sent)
    return sentences
MACI-main/conditional-conformal/src/aws_utils.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import boto3
2
+ import io
3
+
4
def s3_open(bucket_name, key):
    """Download s3://bucket_name/key and return its bytes as a file-like object.

    Returning io.BytesIO lets callers treat the S3 object exactly like a
    local binary file handle.
    """
    # Create a session using your AWS credentials and get an S3 client from it
    s3_client = boto3.Session().client('s3')

    # Download the file object and read its full contents
    response = s3_client.get_object(Bucket=bucket_name, Key=key)
    payload = response['Body'].read()

    # Return a BytesIO object to mimic a file object
    return io.BytesIO(payload)
MACI-main/conditional-conformal/src/client.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import time
4
+
5
+ from typing import Any, List
6
+
7
class Client:
    """
    Wrapper class for language models that we query. It keeps a cache of prompts and
    responses so that we don't have to requery things in experiments.

    NOTE(review): this is an abstract base — subclasses must implement
    `_query` (called by `query`) and `load_model`; neither is defined here.
    """

    def __init__(self, cache_file, model : str = 'gpt-3.5-turbo'):
        # cache_file may be a local path or an "s3://..." URI (see load_cache)
        self.cache_file = cache_file
        self.cache_dict = self.load_cache()
        self.model = model
        # tracks whether save_cache needs to write anything
        self.modified_cache = False

    def load_model(self):
        # load the model and put it as self.model
        raise NotImplementedError()

    def query(
        self,
        prompt : str,
        sample_idx : int = 0,
        **kwargs
    ):
        """Return the cached response for (prompt, sample_idx), or call the
        model via `_query`. Does NOT write to the cache; callers use
        cache_outputs for that."""
        prompt = prompt.strip() # it's important not to end with a whitespace
        cache_key = f"{prompt}_{sample_idx}"

        if cache_key in self.cache_dict:
            return self.cache_dict[cache_key]

        if self.model is None:
            self.load_model()
        # print("I didn't find a cached copy!")
        output = self._query(prompt, **kwargs)

        return output

    def cache_outputs(
        self,
        prompts : List[str],
        sample_indices : List[int],
        outputs : List[Any]
    ):
        """Store a batch of model outputs in the in-memory cache and mark it dirty."""
        for prompt, sample_idx, output in zip(prompts, sample_indices, outputs):
            prompt = prompt.strip()
            cache_key = f"{prompt}_{sample_idx}"
            self.cache_dict[cache_key] = output
        self.modified_cache = True

    def save_cache(self):
        """Merge in any on-disk updates from concurrent processes, then
        pickle the cache to cache_file. No-op when nothing changed."""
        if self.modified_cache == False:
            return

        # load the latest cache first, since if there were other processes running in parallel, cache might have been updated
        for k, v in self.load_cache().items():
            self.cache_dict[k] = v

        with open(self.cache_file, "wb") as f:
            pickle.dump(self.cache_dict, f)

    def load_cache(self, allow_retry=True):
        """Load the pickled cache from a local path or S3; returns {} when
        the file does not exist.

        NOTE(review): unpickling a shared cache file is unsafe if the file
        can be written by untrusted parties.
        """
        if os.path.exists(self.cache_file):
            while True:
                try:
                    with open(self.cache_file, "rb") as f:
                        cache = pickle.load(f)
                    break
                except Exception: # if there are concurent processes, things can fail
                    if not allow_retry:
                        assert False
                    print ("Pickle Error: Retry in 5sec...")
                    time.sleep(5)
        elif 's3' in self.cache_file:
            # cache lives in S3: "s3://bucket/path/to/file"
            from aws_utils import s3_open
            s3_path = self.cache_file.removeprefix('s3://')
            bucket_name = s3_path.split('/')[0]
            path_to_file = '/'.join(s3_path.split('/')[1:])
            with s3_open(bucket_name, path_to_file) as fp:
                cache = pickle.load(fp)
        else:
            cache = {}
        return cache
87
+
88
+
89
+
MACI-main/conditional-conformal/src/config.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import munch
2
+ import toml
3
+
4
def get_config(filepath: str = 'configs/default.toml'):
    """Parse a TOML config file into an attribute-accessible Munch object."""
    raw_config = toml.load(filepath)
    return munch.munchify(raw_config)
MACI-main/conditional-conformal/src/conformal.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ from typing import Callable, List
4
+
5
def compute_conformity_scores(
    dataset : List,
    scores_list : List,
):
    """Per-unit conformity score: max claim score among unsupported claims.

    Each dataset unit carries 'atomic_facts' with a boolean-like
    'is_supported' field (assumed to form a boolean numpy mask — TODO
    confirm against the annotation pipeline). A unit whose claims are all
    supported scores 0.
    """
    conf_scores = []
    for unit, unit_scores in zip(scores_list, dataset) and ():
        pass  # unreachable placeholder removed below
    conf_scores = []
    for unit, unit_scores in zip(dataset, scores_list):
        supported = np.asarray(
            [claim['is_supported'] for claim in unit['atomic_facts']]
        )
        conf_scores.append(np.max(unit_scores[~supported], initial=0))
    return conf_scores
15
+
16
def calibrate_thresholds(
    feats_test : List,
    feats_valid : List,
    scores_valid : List,
    alpha_fn : Callable
) -> List[float]:
    """Return one (constant) split-conformal threshold per test point.

    Computes the finite-sample-corrected quantile
    ceil((1 - alpha) * (n + 1)) / n of the validation scores and replicates
    it across the test set. Only ``alpha_valid[0]`` is consulted, i.e.
    ``alpha_fn`` is assumed to return a constant level over the validation
    features — TODO confirm for non-marginal calibrations.

    Args:
        feats_test: test features (only their count is used).
        feats_valid: validation features, passed through to ``alpha_fn``.
        scores_valid: validation conformity scores.
        alpha_fn: maps features to per-point miscoverage levels.

    Returns:
        A list of ``len(feats_test)`` identical thresholds.
    """
    alpha_valid = alpha_fn(feats_valid)
    n_valid = len(feats_valid)
    quantile = np.ceil((1 - alpha_valid[0]) * (n_valid + 1)) / n_valid
    # BUGFIX: for alpha < 1/(n+1) the corrected level exceeds 1 and
    # np.quantile raises ValueError; clip to the maximum score instead.
    quantile = min(quantile, 1.0)
    threshold = np.quantile(scores_valid, q=quantile)
    return [threshold] * len(feats_test)
28
+
29
def conformal_filter(
    dataset : List,
    scores_list : List,
    thresholds : List
) -> List:
    """Attach 'filtered_claims' to each unit: claims scoring >= threshold.

    Mutates the units in place and returns the same list for convenience.
    """
    for unit, unit_scores, cutoff in zip(dataset, scores_list, thresholds):
        kept = []
        for claim, score in zip(unit['atomic_facts'], unit_scores):
            if score >= cutoff:
                kept.append(claim)
        unit['filtered_claims'] = kept
    return dataset
40
+
41
+
42
def assess_factscore_coverage(
    dataset : List,
    nominal_alpha : float
) -> None:
    """Print nominal vs. realized marginal miscoverage over filtered claims.

    A unit counts as nonfactual when any of its surviving claims carries an
    'is_supported' annotation of 'F'. (Per-group breakdowns — metadata keyed —
    were removed as dead commented-out code; revive from history if needed.)
    """
    nonfactual_flags = []
    for unit in dataset:
        labels = [claim['is_supported'] for claim in unit['filtered_claims']]
        nonfactual_flags.append('F' in labels)

    print(f"Nominal coverage: {nominal_alpha}")
    print(f"Realized marginal coverage: {np.mean(nonfactual_flags)}")
MACI-main/conditional-conformal/src/data_utils/sample_names.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import numpy as np
3
+ import requests
4
+
5
+ from typing import Dict
6
+ from tqdm import tqdm
7
+
8
+ from concurrent.futures import ThreadPoolExecutor
9
+
10
+ ENTITY_PATH = '/data/jcherian/wikipedia_entity_map.npz'
11
+ WIKIDATA_URL = "https://www.wikidata.org/w/api.php"
12
+ logger = logging.getLogger(__name__)
13
+ logging.basicConfig(filename='human.log', level=logging.INFO)
14
+
15
+
16
def get_id(response : Dict) -> str:
    """Extract the single Wikidata entity id from a wbgetentities response.

    Returns None when the response carries no 'entities' key; asserts that
    exactly one entity was returned.
    """
    entities = response.get("entities", None)
    if entities is None:
        return None
    codes = list(entities.keys())
    assert len(codes) == 1
    return codes[0]
22
+
23
+
24
def is_human(response : Dict, id: str) -> bool:
    """True iff the entity's instance-of (P31) claims include human (Q5)."""
    instance_claims = response['entities'][id]['claims'].get('P31', [])
    return any(
        claim['mainsnak']['datavalue']['value']['id'] == 'Q5'
        for claim in instance_claims
    )
30
+
31
def validate_entity(k):
    """Check via the Wikidata API whether entity ``k`` is a human (Q5).

    Args:
        k: entity identifier or URL; only the final '/'-separated segment
            (the page title) is used.

    Returns:
        (name, is_human) tuple; any lookup or parsing failure yields
        (name, False).
    """
    name = k.split('/')[-1]
    adapter = requests.adapters.HTTPAdapter(max_retries=10)
    with requests.session() as s:
        s.mount("https://", adapter)
        response = s.get(url=WIKIDATA_URL, params={"action" : "wbgetentities",
                                                   "sites" : "enwiki",
                                                   "titles" : name,
                                                   "normalize": "1",
                                                   "languages": "en",
                                                   "format": "json",
                                                   "props": "claims"})

    try:
        response = response.json()
    except ValueError:
        # BUGFIX: previously only printed and fell through with `response`
        # still a Response object, which then crashed inside get_id().
        # Treat an unparseable payload as "not human".
        print(response.text)
        return name, False

    wiki_id = get_id(response)

    if wiki_id is None:
        return name, False

    try:
        human = is_human(response, wiki_id)
    except (KeyError, TypeError):
        # Malformed claim structure — conservatively report non-human.
        return name, False
    logger.info(f"{name}, {human}")
    return name, human
60
+
61
+
62
+ if __name__ == "__main__":
63
+ wiki_entities = np.load(ENTITY_PATH)
64
+ entity_names = list(wiki_entities.keys())
65
+ try:
66
+ with ThreadPoolExecutor(max_workers=5) as executor:
67
+ res = list(
68
+ tqdm(
69
+ executor.map(
70
+ lambda k : validate_entity(k),
71
+ entity_names
72
+ ),
73
+ total=len(entity_names)
74
+ )
75
+ )
76
+ except:
77
+ import pickle
78
+ with open('human.pkl', 'wb') as fp:
79
+ pickle.dump(res, fp)
80
+
81
+
82
+ import pickle
83
+ with open('human.pkl', 'wb') as fp:
84
+ pickle.dump(res, fp)
85
+
86
+ import IPython; IPython.embed()
MACI-main/conditional-conformal/src/dataset.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ from concurrent.futures import ThreadPoolExecutor
3
+ from tqdm import tqdm
4
+ from typing import List, Tuple
5
+
6
+ import json
7
+ import pandas as pd
8
+ import numpy as np
9
+ import os
10
+
11
+ from atomizer import Atomizer, text_to_sentences
12
+ from gpt import GPTClient
13
+ from scorer import Scorer
14
+
15
def get_prompts(
    dataset : str,
    data_path : str = None
) -> List:
    """Return (topics, prompts) for a named dataset.

    For the factscore variants, topics are entity names read from a text
    file and each prompt requests a one-paragraph biography. For the
    MedLFQA variants, each question serves as both topic and prompt.

    Args:
        dataset: dataset identifier (case-insensitive).
        data_path: CSV path (factscore_final) or directory of .jsonl files
            (medlfqav2); unused otherwise.

    Raises:
        ValueError: for an unrecognized dataset name.
    """
    name = dataset.lower()

    # The three factscore variants differed only in which name file they
    # read; consolidated from three copy-pasted branches.
    factscore_name_files = {
        "factscore": 'data/factscore_names.txt',
        "factscore_v2": 'data/factscore_v2_names.txt',
        "factscore_v3": 'data/factscore_v3_names.txt',
    }
    if name in factscore_name_files:
        with open(factscore_name_files[name], 'r') as fp:
            names = [line.strip() for line in fp.readlines()]
        prompts = [
            f"Please write one biographical paragraph about {n}."
            for n in names
        ]
        return names, prompts

    if name == "factscore_final":
        df = pd.read_csv(data_path, index_col=0)
        # NOTE: a set — topic order is not deterministic across runs.
        names = set([n.strip() for n in df['Name']])
        prompts = [
            f"Please write one biographical paragraph about {n}."
            for n in names
        ]
        return names, prompts

    if name == "medlfqa":
        datasets = {}
        suffix = "_test_MedLFQA.jsonl"
        dataset_dir = "/Users/cherian/Projects/OLAPH/MedLFQA"
        for path in os.listdir(dataset_dir):
            if "MedLFQA" not in path:
                continue
            dataset_name = path[:-len(suffix)]
            with open(os.path.join(dataset_dir, path), 'r') as fp:
                datasets[dataset_name] = [json.loads(line) for line in fp.readlines()]

        prompts = []
        for _, dset in datasets.items():
            prompts += [pt['Question'] for pt in dset]
        prompts = list(set(prompts))  # dedupe; order not deterministic
        return prompts, prompts

    if name == "medlfqav2":
        datasets = {}
        suffix = ".jsonl"
        for filename in os.listdir(data_path):
            dataset_name = filename[:-len(suffix)]
            with open(os.path.join(data_path, filename), 'r') as fp:
                datasets[dataset_name] = [json.loads(line) for line in fp.readlines()]

        prompts = []
        for _, dset in datasets.items():
            prompts += [pt['Question'] for pt in dset]
        return prompts, prompts

    raise ValueError("Unsupported data set.")
93
+
94
def find_unique_element(lst, condition, approx_index):
    """Find the index of an element satisfying ``condition``.

    Checks ``approx_index`` first, then scans outward from it (left and
    right simultaneously) so a match near the guess is found quickly.
    Returns None when no element qualifies.
    """
    if condition(lst[approx_index]):
        return approx_index

    lo, hi = approx_index - 1, approx_index + 1
    length = len(lst)
    while lo >= 0 or hi < length:
        if lo >= 0 and condition(lst[lo]):
            return lo
        if hi < length and condition(lst[hi]):
            return hi
        lo -= 1
        hi += 1

    return None
114
+
115
def load_dataset(
    config : dict
) -> List:
    """Build the annotated dataset: generate responses, atomize them into
    facts, then score each fact with the FActScore-style annotator.

    NOTE(review): contains two `IPython.embed()` calls and hard-coded
    special-casing; this path appears to be research/debug code, not a
    clean pipeline.
    """

    print("Loading responder.")
    responder = GPTClient(config.model.responder.cache_path)

    topics, prompts = get_prompts(config.dataset.name, config.dataset.path)

    # Generate (or fetch cached) responses for every prompt in parallel.
    with ThreadPoolExecutor(max_workers=25) as executor:
        responses = list(
            tqdm(
                executor.map(
                    lambda x : responder.query(x),
                    prompts
                ),
                total=len(prompts)
            )
        )

    # TODO: Uncomment me if I want to run fresh dataset...

    responder.cache_outputs(
        prompts,
        np.zeros((len(responses),), dtype=int),
        responses
    )

    responder.save_cache()

    # Each cached entry is a list of choices; keep only the first one.
    responses = [r[0] for r in responses]


    outputs = [{'prompt': p, 'response': o['message']}
               for p, o in zip(prompts, responses)] # first output is the response we will filter

    import IPython; IPython.embed()
    print("Loading atomizer.")
    atomizer_client = GPTClient(config.model.parser.cache_path, model=config.model.parser.name)

    atomizer = Atomizer(atomizer_client, demo_dir='data/demos')

    CACHE_EXISTS = True

    if CACHE_EXISTS: # TODO: dumb hard-coded variable to side step the slow retrieval
        # Reconstruct atomic facts from the atomizer cache by matching each
        # cached atomization back to the response message it came from.
        ordered_messages = [r['message'] for r in responses]

        responder_cache = responder.cache_dict
        messages = []
        for val in responder_cache.values():
            messages.append(val[0]['message'])

        atomizer_cache = atomizer_client.cache_dict
        idx_guess = 0
        atomic_facts = [[] for _ in range(len(messages))]
        atomic_facts_ph = [[] for _ in range(len(messages))]

        sentences = defaultdict(int)
        for k in tqdm(atomizer_cache.keys()):
            atomized_msg = atomizer_cache[k][0]['message']
            atomized_facts = text_to_sentences(atomized_msg)
            # The sentence being atomized is embedded in the cache key.
            sentence = k.split('\n')[-1].split('facts:')[-1].strip()[:-2]
            cur_idx = -1
            sentences[sentence] += 1
            # if the sentence has appeared more than once we need to find the appropriate match...
            for i in range(sentences[sentence]):
                cur_idx = find_unique_element(messages[cur_idx + 1:], lambda x: sentence in x, approx_index=idx_guess)
            if cur_idx is None: # TODO: TERRIBLE SPECIAL CASING that I looked at by hand...
                raise ValueError()
                # NOTE(review): everything below the raise is unreachable —
                # presumably the hand-tuned fallback, disabled on purpose.
                if idx_guess in (4148, 4149, 4150):
                    cur_idx = 4149
                elif cur_idx == 993:
                    cur_idx = 993
                else:
                    continue
            idx_guess = cur_idx
            atomic_facts[cur_idx].extend(atomized_facts)

        # Re-order the recovered facts to match the prompt order.
        for af, msg in zip(atomic_facts, messages):
            if len(af) == 0:
                continue
            new_idx = ordered_messages.index(msg)
            atomic_facts_ph[new_idx] = af
        atomic_facts = atomic_facts_ph

    else:
        # Fresh atomization path: run the atomizer over every response.
        with ThreadPoolExecutor(max_workers=10) as executor:
            atoms = list(
                tqdm(
                    executor.map(
                        lambda x : atomizer.run(*x),
                        [(o['response'],) for o in outputs]
                    ),
                    total=len(outputs)
                )
            )

        atomizer.save_cache()
        atomic_facts = [[fact for _, facts in atom[0] for fact in facts] for atom in atoms]


    dataset = []

    for p, r, af in zip(prompts, responses, atomic_facts):
        atoms = [{'atom': fact} for fact in af]
        data_pt = {'prompt': p, 'response': r, 'atomic_facts': atoms}
        dataset.append(data_pt)

    # time to annotate responses using factscore code
    print("Loading annotator.")
    scorer_client = GPTClient(config.model.annotator.cache_path, model=config.model.annotator.name)
    scorer = Scorer(scorer_client, config, model_name="retrieval")

    scorer_inputs = [(topic, output['response'], fact) for topic, output, fact in zip(topics, outputs, atomic_facts)]
    with ThreadPoolExecutor(max_workers=4) as executor:
        scores = list(
            tqdm(
                executor.map(
                    lambda x : scorer.get_score(*x, knowledge_source='medlfqa'),
                    scorer_inputs
                ),
                total=len(scorer_inputs)
            )
        )
    # scorer.save_cache()

    # NOTE(review): the dataset built above is discarded and rebuilt here
    # from the scorer's per-fact decisions.
    dataset = []

    for p, r, s in zip(prompts, responses, scores):
        data_pt = {
            'prompt': p,
            'response': r,
            'atomic_facts': s['decisions'][0]
        }
        dataset.append(data_pt)

    import IPython; IPython.embed()

    return dataset
254
+
255
def split_dataset(
    dataset : List,
    train_perc : float = 0.33,
    valid_perc : float = 0.33,
    rng : np.random.Generator = None
) -> Tuple[List, List, List]:
    """
    Splits dataset into three parts. Split into training and validation is specified here.
    """
    n = len(dataset)
    n_train = int(n * train_perc)
    n_valid = int(n * valid_perc)

    # When an rng is supplied, shuffle the dataset in place before slicing.
    if rng is not None:
        rng.shuffle(dataset)

    return (
        dataset[:n_train],
        dataset[n_train:n_train + n_valid],
        dataset[n_train + n_valid:],
    )
MACI-main/conditional-conformal/src/featurizer.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import json
3
+
4
+ from functools import partial
5
+ from tqdm.contrib.concurrent import process_map
6
+ from concurrent.futures import ThreadPoolExecutor
7
+
8
+ from tqdm import tqdm
9
+
10
+ import client
11
+
12
+ from typing import Dict, List, Tuple
13
+
14
+
15
# Prompt asking the model to self-assess P(correct) for each claim; the model
# must reply with bare JSONL lines of {"id": ..., "gpt-score": ...}.
# (The typo "esimated" is preserved: this string is a cache key / API payload.)
SELF_ASSESS_PROMPT = 'You will get a list of claims and the original prompt that motivated these claims. For each claim, assess the probability of correctness. Directly return a jsonl, where each line is {"id":[CLAIM_ID], "gpt-score":[SCORE]}. Directly return the jsonl with NO explanation or ANY other formatting. For the [SCORE], return the esimated probability of correctness to three significant figures.\n'

# Variant asking for a quoted "T"/"F" verdict instead of a probability.
SELF_BOOL_PROMPT = 'You will get a list of claims and the original prompt that motivated these claims. For each claim, assess the correctness. Directly return a jsonl, where each line is {"id":[CLAIM_ID], "gpt-bool":[BOOL]}. Directly return the jsonl with NO explanation or ANY other formatting. For the [BOOL], return "T" or "F" in quotes so that it is valid json.\n'

# Pool size used by the commented-out batch featurizer implementation below.
MAX_WORKERS = 20
20
+
21
def get_features(
    dataset: List[Dict],
    config : Dict
) -> np.ndarray:
    """Build a (n_claims, n_features) matrix of per-claim features.

    Feature columns are selected by ``config.model.prob.features``; supported
    names are 'frequency' (agreement of resampled responses with each claim)
    and 'selfeval' (the model's self-assessed probability of correctness).
    Results are cached per-dataset under ``.cache/``.

    Raises:
        ValueError: if no supported feature name is configured.
    """
    from gpt import GPTClient
    feature_names = config.model.prob.features
    all_features = []
    if 'frequency' in feature_names:
        # BUGFIX: renamed from `client`, which shadowed the imported
        # `client` module at the top of this file.
        freq_client = GPTClient(f'.cache/{config.dataset.name}_frequency.pkl')

        with ThreadPoolExecutor(max_workers=5) as executor:
            frequencies = list(
                tqdm(
                    executor.map(
                        lambda x: get_frequency(freq_client, [af['atom'] for af in x['atomic_facts']], x['prompt'], config.model.prob.frequency.model),
                        dataset
                    ),
                    total=len(dataset)
                )
            )
        freq_client.save_cache()
        all_features.append(np.concatenate(frequencies).reshape(-1, 1))

    if 'selfeval' in feature_names:
        eval_client = GPTClient(f'.cache/{config.dataset.name}_self_evals.pkl')

        with ThreadPoolExecutor(max_workers=25) as executor:
            self_evals = list(
                tqdm(
                    executor.map(
                        lambda x: get_self_eval(x['prompt'], [af['atom'] for af in x['atomic_facts']], eval_client),
                        dataset
                    ),
                    total=len(dataset)
                )
            )
        eval_client.save_cache()
        all_features.append(np.concatenate(self_evals).reshape(-1, 1))

    if not all_features:
        # np.concatenate on an empty list raises an opaque error; fail loudly.
        raise ValueError(f"No supported features among {feature_names}.")

    return np.concatenate(all_features, axis=1)
67
+
68
+ # def get_features(
69
+ # dataset : List[Dict],
70
+ # config : Dict
71
+ # ) -> np.ndarray:
72
+ # feature_names = config.features
73
+ # num_claims = np.sum([len(dat['claims']) for dat in dataset])
74
+ # all_features = []
75
+ # for feat in feature_names:
76
+ # if feat == "embedding":
77
+ # embeds = np.zeros((num_claims, int(config.embedding.n_dimensions)))
78
+ # print("Fetching embeddings.")
79
+ # embedding_func = partial(get_embedding, model=config.embedding.model, n_dim=config.embedding.n_dimensions)
80
+ # res = process_map(embedding_func, [dat['claims'] for dat in dataset], max_workers=MAX_WORKERS)
81
+ # i = 0
82
+ # for dat in tqdm(dataset):
83
+ # len_dat = len(dat['claims'])
84
+ # embeds[i:(i + len_dat)] = get_embedding(dat['claims'], config.embedding.model, config.embedding.n_dimensions)
85
+ # i += len_dat
86
+ # all_features.append(embeds)
87
+
88
+ # elif feat == "selfeval":
89
+ # print("Fetching selfevals.")
90
+ # evals = np.zeros((num_claims, 1))
91
+ # selfeval_func = partial(get_self_eval, model=config.selfeval.model.name)
92
+ # res = process_map(selfeval_func, dataset, max_workers=MAX_WORKERS)
93
+ # i = 0
94
+ # for dat in tqdm(dataset):
95
+ # len_dat = len(dat['claims'])
96
+ # evals[i:(i + len_dat)] = get_self_eval(dat['claims'], dat['prompt'], config.selfeval.model.name)
97
+ # i += len_dat
98
+ # all_features.append(evals)
99
+ # elif feat == "frequency":
100
+ # print("Fetching frequency.")
101
+ # freqs = np.zeros(((num_claims), 1))
102
+ # i = 0
103
+ # for dat in tqdm(dataset):
104
+ # len_dat = len(dat['claims'])
105
+ # freqs[i:(i + len_dat)] = get_frequency(dat['claims'], dat['prompt'], config.frequency.model.n_samples, config.frequency.model.name)
106
+ # i += len_dat
107
+ # all_features.append(freqs)
108
+ # else:
109
+ # raise ValueError(f"{feat} not supported.")
110
+ # return np.concatenate(all_features, axis=1)
111
+
112
+
113
def get_embedding(
    subclaims : List[str],
    client : "client.Client",  # needs to be an embedding client, not a GPT chat client
    n_dim : int = 8
) -> np.ndarray:
    """Embed each subclaim and return the first ``n_dim`` dimensions.

    Currently unsupported: no embedding client exists yet, so this always
    raises ValueError. (The dead sketch that followed the raise was removed;
    it queried ``client`` per claim and truncated each vector to ``n_dim``.)

    Raises:
        ValueError: always, until an embedding client is implemented.
    """
    raise ValueError("not supported yet")
125
+
126
+
127
def _eval_self(
    prompt : str,
    subclaims : List,
    client : client.Client,
    err_msg : str = None
) -> Tuple[Tuple[str, List], np.ndarray]:
    """Ask the model to score P(correct) for each subclaim via a JSONL reply.

    Builds a numbered claim list, sends SELF_ASSESS_PROMPT, and parses one
    {"id": ..., "gpt-score": ...} line per claim. On a malformed reply it
    retries once per failure mode (wrong line count / unparseable JSON) by
    re-querying with an error note appended; after that it gives up and
    returns ((None, None), None).

    Returns:
        ((prompt_sent, raw_model_output), per-claim score array) on success.
    """
    claim_string = "\n".join(
        [str(i) + ": " + fact for i, fact in enumerate(subclaims)]
    )
    self_eval_prompt = SELF_ASSESS_PROMPT
    self_eval_prompt += f"The original prompt is: {prompt}.\n"
    self_eval_prompt += f"The claims are: {claim_string}.\n"

    if err_msg is not None:
        self_eval_prompt += "\n" + err_msg

    self_evals = client.query(self_eval_prompt)
    parsed_evals = self_evals[0]['message']
    # Strip markdown code fences the model sometimes wraps around JSONL.
    parsed_evals = parsed_evals.replace("```jsonl\n", "")
    parsed_evals = parsed_evals.replace("```", "")
    final_evals = np.zeros((len(parsed_evals.splitlines()),))
    try:
        assert len(final_evals) == len(subclaims)
    except AssertionError:
        # 'exactly' marks the line-count retry message: give up the 2nd time.
        if err_msg is not None and 'exactly' in err_msg:
            print(f"I'm giving up on {claim_string} and {parsed_evals}, since I already retried this.")
            return (None, None), None
        err_msg = f"IMPORTANT: This is a retry. Make sure you return exactly {len(subclaims)} lines of JSON."
        print(err_msg)
        return _eval_self(prompt, subclaims, client, err_msg=err_msg)
    try:
        for line in parsed_evals.splitlines():
            eval = json.loads(line)
            idx = int(eval["id"])
            final_evals[idx] += float(eval["gpt-score"])
    except Exception as ex:
        # 'requested' marks the bad-JSON retry message: give up the 2nd time.
        if err_msg is not None and 'requested' in err_msg:
            print(f"I'm giving up on {claim_string} and {parsed_evals}, since I already retried this.")
            return (None, None), None
        err_msg = f"IMPORTANT: This is a retry. Make sure you return the lines in the requested JSON format with NO additional formatting."
        print(err_msg)
        return _eval_self(prompt, subclaims, client, err_msg=err_msg)
    return (self_eval_prompt, self_evals), final_evals
170
+
171
+
172
def get_self_eval(
    prompt : str,
    subclaims : List[str],
    client : client.Client
) -> np.ndarray:
    """Fetch self-assessed correctness probabilities for each subclaim.

    Caches the (prompt, raw reply) pair on the client. When ``_eval_self``
    gives up after its retries, returns an array of -1 sentinels instead.
    """
    (sent_prompt, raw_reply), evals = _eval_self(prompt, subclaims, client)

    if sent_prompt is None:
        return -1 * np.ones((len(subclaims),))  # -1 prob is error

    client.cache_outputs(
        [sent_prompt],
        np.zeros((1,), dtype=int),
        [raw_reply]
    )

    return evals
195
+
196
def _bool_self(
    prompt : str,
    subclaims : List,
    client : client.Client,
    err_msg : str = None
) -> Tuple[Tuple[str, List], np.ndarray]:
    """Boolean twin of ``_eval_self``: ask for a 'T'/'F' verdict per subclaim.

    Same JSONL protocol and one-retry-per-failure-mode behavior; gives up
    with ((None, None), None). On success the second element is a list of
    'T'/'F' strings (defaulting to 'T' for lines the model omitted, despite
    the np.ndarray annotation).
    """
    claim_string = "\n".join(
        [str(i) + ": " + fact for i, fact in enumerate(subclaims)]
    )
    self_eval_prompt = SELF_BOOL_PROMPT
    self_eval_prompt += f"The original prompt is: {prompt}.\n"
    self_eval_prompt += f"The claims are: {claim_string}.\n"

    if err_msg is not None:
        self_eval_prompt += "\n" + err_msg

    self_evals = client.query(self_eval_prompt)
    parsed_evals = self_evals[0]['message']
    # Strip markdown code fences the model sometimes wraps around JSONL.
    parsed_evals = parsed_evals.replace("```jsonl\n", "")
    parsed_evals = parsed_evals.replace("```", "")
    final_evals = ['T' for i in range(len(parsed_evals.splitlines()))]
    try:
        assert len(final_evals) == len(subclaims)
    except AssertionError:
        # 'exactly' marks the line-count retry message: give up the 2nd time.
        if err_msg is not None and 'exactly' in err_msg:
            print(f"I'm giving up on {claim_string} and {parsed_evals}, since I already retried this.")
            return (None, None), None
        err_msg = f"IMPORTANT: This is a retry. Make sure you return exactly {len(subclaims)} lines of JSON."
        print(err_msg)
        return _bool_self(prompt, subclaims, client, err_msg=err_msg)
    try:
        for line in parsed_evals.splitlines():
            eval = json.loads(line)
            idx = int(eval["id"])
            final_evals[idx] = eval["gpt-bool"]
    except Exception as ex:
        # 'requested' marks the bad-JSON retry message: give up the 2nd time.
        if err_msg is not None and 'requested' in err_msg:
            print(f"I'm giving up on {claim_string} and {parsed_evals}, since I already retried this.")
            return (None, None), None
        err_msg = f"IMPORTANT: This is a retry. Make sure you return the lines in the requested JSON format with NO additional formatting."
        print(err_msg)
        return _bool_self(prompt, subclaims, client, err_msg=err_msg)
    return (self_eval_prompt, self_evals), final_evals
239
+
240
+
241
def get_bool_eval(
    prompt : str,
    subclaims : List[str],
    client : client.Client
) -> np.ndarray:
    """Fetch T/F self-assessments for each subclaim, caching the raw reply.

    On unrecoverable parse failure returns an array of -1 sentinels rather
    than the usual list of 'T'/'F' strings.
    """
    (sent_prompt, raw_reply), verdicts = _bool_self(prompt, subclaims, client)

    if sent_prompt is None:
        return -1 * np.ones((len(subclaims),))  # -1 prob is error
    client.cache_outputs(
        [sent_prompt],
        np.zeros((1,), dtype=int),
        [raw_reply]
    )

    return verdicts
263
+
264
+
265
def _eval_support(
    output : str,
    subclaims : List,
    client : client.Client,
    err_msg : str = None
) -> Tuple[Tuple[str, List], np.ndarray]:
    """Score how a text relates to each subclaim: 1 supports, -1 contradicts,
    0 unrelated.

    Same JSONL query/parse/retry protocol as ``_eval_self``: one retry per
    failure mode (wrong line count / bad JSON), then give up and return
    ((None, None), None).

    Returns:
        ((prompt_sent, raw_model_output), per-claim score array) on success.
    """
    claim_string = "\n".join(
        [str(i) + ": " + fact for i, fact in enumerate(subclaims)]
    )
    counting_prompt = (
        'You will get a list of claims and piece of text. For each claim, score whether the text supports, contradicts, or is unrelated to the claim. Directly return a jsonl, where each line is {"id":[CLAIM_ID], "score":[SCORE]}. Directly return the jsonl with NO explanation or ANY other formatting. For the [SCORE], return 1 for supports, -1 for contradicts, and 0 for unrelated. The claims are:\n'
        + claim_string
        + "\n\nThe text is:\n"
        + output
    )
    if err_msg is not None:
        counting_prompt += "\n" + err_msg

    support_scores = client.query(counting_prompt)
    parsed_scores = support_scores[0]['message']
    # Strip markdown code fences the model sometimes wraps around JSONL.
    parsed_scores = parsed_scores.replace("```jsonl\n", "")
    parsed_scores = parsed_scores.replace("```", "")
    final_scores = np.zeros((len(parsed_scores.splitlines()),))
    try:
        assert len(final_scores) == len(subclaims)
    except AssertionError:
        # 'exactly' marks the line-count retry message: give up the 2nd time.
        if err_msg is not None and 'exactly' in err_msg:
            print(f"I'm giving up on {claim_string} and {parsed_scores}, since I already retried this.")
            return (None, None), None
        err_msg = f"IMPORTANT: This is a retry. Make sure you return exactly {len(subclaims)} lines of JSON."
        print(err_msg)
        return _eval_support(output, subclaims, client, err_msg=err_msg)
    try:
        for line in parsed_scores.splitlines():
            score = json.loads(line)
            idx = int(score["id"])
            final_scores[idx] += float(score["score"])
    except Exception as ex:
        # 'requested' marks the bad-JSON retry message: give up the 2nd time.
        if err_msg is not None and 'requested' in err_msg:
            print(f"I'm giving up on {claim_string} and {parsed_scores}, since I already retried this.")
            return (None, None), None
        err_msg = f"IMPORTANT: This is a retry. Make sure you return the lines in the requested JSON format with NO additional formatting."
        print(err_msg)
        return _eval_support(output, subclaims, client, err_msg=err_msg)
    return (counting_prompt, support_scores), final_scores
310
+
311
+
312
def get_frequency(
    client : client.Client,
    subclaims : List,
    prompt : str,
    config : dict
) -> np.ndarray:
    """
    Returns a vector of (frequency) scores corresponding to each entry of the subclaims list.

    Resamples ``config.n_samples`` alternate responses to ``prompt`` at
    ``config.temperature`` and, for each subclaim, averages the
    support/contradict/unrelated scores (from ``_eval_support``) across the
    samples. The resamples are cached under sample index 1 (index 0 holds
    the primary response elsewhere in the pipeline — TODO confirm).
    """
    # Generate n_samples alternate outputs with temperature 1.0.
    alternate_outputs = client.query(
        prompt, 1, n_samples=config.n_samples, temperature=config.temperature
    )
    client.cache_outputs(
        [prompt],
        [int(1)],
        [alternate_outputs]
    )

    alternate_outputs = [o['message'] for o in alternate_outputs]

    # Score every alternate output against the subclaims in parallel.
    with ThreadPoolExecutor(max_workers=config.n_samples) as executor:
        all_scores = list(
            executor.map(
                lambda x : _eval_support(x, subclaims, client),
                alternate_outputs
            )
        )

    # to_cache = [s[0] for s in all_scores if s[0][0] is not None]

    # client.cache_outputs(
    #     [c[0] for c in to_cache],
    #     np.zeros((len(to_cache),), dtype=int),
    #     [c[1] for c in to_cache]
    # )

    # TODO: error handling if this is all empty?
    # Average per-claim scores over the samples that parsed successfully.
    parsed_scores = np.mean([s[1] for s in all_scores if s[1] is not None], axis=0)

    return parsed_scores
MACI-main/conditional-conformal/src/gpt.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+
3
+ from typing import List
4
+
5
+ from client import Client
6
+
7
+ from tenacity import (
8
+ retry,
9
+ stop_after_attempt,
10
+ wait_random_exponential,
11
+ ) # for exponential backoff
12
+
13
class GPTClient(Client):
    """Caching OpenAI chat-completion client.

    Thin wrapper around ``openai.Client().chat.completions.create`` with
    exponential-backoff retries; prompt/response caching comes from the
    ``Client`` base class.
    """
    def __init__(
        self,
        cache_file : str,
        model : str = 'gpt-3.5-turbo'
    ):
        super(GPTClient, self).__init__(cache_file, model)
        self.client = openai.Client()
        # Per-process usage counters (not persisted with the cache).
        self.tokens_used = 0
        self.requests_made = 0

    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
    def _query(
        self,
        prompt : List[str],
        role : List[str] = None,
        max_tokens : int = 1000,
        temperature: float = 0,
        response_format : str = None,
        n_samples: int = 1
    ):
        """Issue one chat-completion request and return its choices.

        Args:
            prompt: message content (passed directly as a single message's
                content — presumably a str despite the List[str] annotation;
                TODO confirm).
            role: chat role for the message; defaults to "user".
            max_tokens, temperature, response_format: forwarded to the API.
            n_samples: number of completions to request (API param ``n``).

        Returns:
            One dict per choice: {'logprobs': token logprobs,
            'message': completion text}.

        Retries up to 6 times with randomized exponential backoff (1-60s).
        """
        if role is None:
            messages = [{"role": "user", "content": prompt}]
        else:
            messages = [{"role": role, "content": prompt}]

        completion = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            response_format=response_format,
            max_tokens=max_tokens,
            temperature=temperature,
            n=n_samples,
            logprobs=True
        )
        self.tokens_used += completion.usage.total_tokens
        self.requests_made += 1
        # print(self.tokens_used, self.requests_made)
        outputs = []
        for choice in completion.choices:
            output_dict = {
                'logprobs': choice.logprobs.content,
                'message': choice.message.content
            }
            outputs.append(output_dict)
        return outputs
MACI-main/conditional-conformal/src/llm_utils.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import numpy as np
3
+
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ from tqdm import tqdm
6
+ from typing import Dict, List
7
+
8
+ from query import (
9
+ generate_subclaim_prompt, generate_annotation_prompt,
10
+ generate_merge_prompt, query_llm
11
+ )
12
+
13
+ import client
14
+
15
+ MERGE_PROMPT = "You will get an instruction and a set of facts that are true. Construct an answer using ONLY the facts provided, and use ALL of the facts provided. If no facts are given, reply and say that you don't know enough to respond.\n"
16
+
17
+
18
def parse_responses(
    outputs : List[Dict],
    parser_config : str,
    annotate : bool = False,
    annotator_config : str = None
):
    """Decompose each response into subclaims, optionally annotating them.

    Mutates each entry in place (adds a 'claims' key) and returns the list.
    """
    for entry in tqdm(outputs):
        claims = get_subclaims(entry["prompt"], entry["response"], parser_config)
        if annotate:
            claims = add_annotations(entry["prompt"], claims, annotator_config)
        entry["claims"] = claims
    return outputs
31
+
32
def get_subclaims(
    prompt : str,
    response : str,
    parser_config : str
) -> List[Dict]:
    """Split a response into one subclaim dict per line of the parser's reply."""
    subclaim_prompt = generate_subclaim_prompt(prompt, response)
    parsed = query_llm([subclaim_prompt], parser_config)[0]  # first output only
    return [{'message': line} for line in parsed['message'].splitlines()]
41
+
42
def add_annotations(
    prompt : str,
    subclaims : List[Dict],
    annotator_config : str
) -> List[Dict]:
    """Attach an 'annotation' truth value to each subclaim via the annotator LLM.

    Re-queries up to 5 times when the annotator returns the wrong number of
    lines; after that, all subclaims are marked 'F'. Unparseable annotation
    lines are also marked 'F'.
    """
    annotation_prompt = generate_annotation_prompt(prompt, subclaims)
    annotations = query_llm([annotation_prompt], annotator_config)[0]
    annotations = annotations['message'].splitlines()
    num_retries = 0
    while len(annotations) != len(subclaims):
        print(f"Annotation length does not match subclaims for {prompt}. Retrying query.")
        annotations = query_llm([annotation_prompt], annotator_config)[0]
        annotations = annotations['message'].splitlines()
        num_retries += 1
        if num_retries > 5:
            print("Giving up and assigning False to all subclaims.")
            annotations = ['F' for _ in subclaims]
    for a, subclaim in zip(annotations, subclaims):
        try:
            subclaim['annotation'] = json.loads(a)['value']
        except (ValueError, KeyError, TypeError):
            # BUGFIX: previously dropped into an interactive IPython shell,
            # which hangs any non-interactive run — and the give-up path
            # above produces bare 'F' lines that always fail json.loads,
            # guaranteeing that hang. Mark the claim false instead.
            print(f"Could not parse annotation line {a!r}; assigning 'F'.")
            subclaim['annotation'] = 'F'
    return subclaims
65
+
66
+
67
+ def _concat_claims(
68
+ subclaims : List[str]
69
+ ) -> str:
70
+ return "\n".join(
71
+ f"{i}: {subclaim}" for i, subclaim in enumerate(subclaims)
72
+ )
73
+
74
def _get_merged_output(
    prompt : str,
    subclaims : List[str],
    client : client.Client
) -> str:
    """Ask the LLM to recompose the given facts into a single answer.

    Returns ((final_prompt, raw_output), merged message text) so callers can
    cache the raw exchange.
    """
    merge_request = MERGE_PROMPT + f"The original instruction was: {prompt}\n"
    merge_request += f"The facts are: {_concat_claims(subclaims)}"

    raw = client.query(merge_request)

    return (merge_request, raw), raw[0]['message']
86
+
87
+
88
def merge_claims(
    dataset : List,
    client : client.Client
) -> List:
    """Recompose each unit's filtered claims into a final answer via the LLM.

    Queries in parallel, caches every (prompt, raw output) pair on the
    client, and returns the merged message strings in dataset order.
    """
    with ThreadPoolExecutor(max_workers=25) as executor:
        results = list(
            tqdm(
                executor.map(
                    lambda unit : _get_merged_output(unit['prompt'], unit['filtered_claims'], client),
                    dataset
                ),
                total=len(dataset)
            )
        )

    cache_pairs = [res[0] for res in results]

    client.cache_outputs(
        [pair[0] for pair in cache_pairs],
        np.zeros((len(cache_pairs),), dtype=int),
        [pair[1] for pair in cache_pairs]
    )

    return [res[1] for res in results]
MACI-main/conditional-conformal/src/postprocess_factscore.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

# Convert FActScore unlabeled-prediction files into this project's dataset
# format: [{'prompt', 'claims': [{'message', 'annotation'}], 'response',
# 'topic', 'metadata'}, ...].  Paths are machine-specific (author's laptop).
output = []
prompt_to_idx = {}
idx = 0
with open("/Users/cherian/Downloads/factscore-unlabeled-predictions/ChatGPT.jsonl") as fp:
    for line in fp:
        res = json.loads(line)
        new_res = {}
        new_res['prompt'] = res['prompt']
        new_res['claims'] = []
        # Prefer ChatGPT labels; fall back to the LLAMA+NP annotator.
        annotator = 'ChatGPT_Labels' if 'ChatGPT_Labels' in res else 'LLAMA+NP_Labels'
        for fact, annotation in zip(res['facts'], res[annotator]):
            # FActScore marks supported facts 'S'; map to T(rue) / F(alse).
            a = 'T' if annotation == 'S' else 'F'
            new_res['claims'].append(
                {'message': fact, 'annotation': a}
            )
        output.append(new_res)
        prompt_to_idx[res['prompt']] = idx
        idx += 1

# Join the raw responses/topics back in by matching on the prompt text.
with open("/Users/cherian/Projects/FActScore/factscore/data/unlabeled/ChatGPT.jsonl", 'r') as fp:
    for line in fp:
        res = json.loads(line)
        idx = prompt_to_idx.get(res['input'], None)
        if idx is None:
            continue
        else:
            output[idx]['response'] = res['output']
            output[idx]['topic'] = res['topic']
            output[idx]['metadata'] = res['cat']

with open("data/factscore_processed.json", 'w') as fp:
    fp.write(json.dumps(output) + "\n")
MACI-main/conditional-conformal/src/prob_model.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ from sklearn.linear_model import LogisticRegressionCV
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import torch.optim as optim
9
+
10
+ from typing import List
11
+
12
+ from conformal import compute_conformity_scores
13
+
14
def fit_model(
    features : np.ndarray,
    labels : np.ndarray,
    config : dict,
    dataset_train : List = None,
    eval_dict : dict = None
):
    """Fit the claim-correctness probability model named by config.model.prob.name.

    features: (n_claims, n_features) design matrix.
    labels: (n_claims,) 0/1 correctness labels.
    dataset_train / eval_dict: only used by the "torch" model, which trains
    against a differentiable conformal loss and periodically reports claim
    retention on the validation/test splits.

    Raises ValueError for unimplemented ("XGBoost") or unknown model names.
    """
    name = config.model.prob.name
    if name == "logistic":
        model = LogisticRegressionCV()
        model.fit(X=features, y=labels)
        return model
    elif name == "XGBoost":
        raise ValueError("not implemented yet")
    elif name == "torch":
        # no data splitting for now when constructing conformal loss
        model = LogisticRegression(features.shape[1])

        # NOTE(review): lr=1 is unusually large for Adam — confirm intentional.
        optimizer = optim.Adam(model.parameters(), lr=1)
        x = torch.tensor(features, requires_grad=True, dtype=torch.float32)

        for i in range(500):
            optimizer.zero_grad()
            probs = model.forward(x)

            loss, avg_train = get_conformal_loss(probs, labels, dataset_train, config.conformal.alpha)
            if i % 100 == 0:
                # Periodically report what fraction of claims survives the
                # conformal filter at the current model state.
                probs_valid = model.forward(torch.tensor(eval_dict['X_valid'], dtype=torch.float32)).detach().numpy()
                probs_split = np.array_split(probs_valid, eval_dict['splits_valid'])
                threshold = np.quantile(compute_conformity_scores(eval_dict['dataset_valid'], probs_split), 1 - config.conformal.alpha)
                probs_test = model.forward(torch.tensor(eval_dict['X_test'], dtype=torch.float32)).detach().numpy()
                probs_split = np.array_split(probs_test, eval_dict['splits_test'])
                avg = 0
                for prob in probs_split:
                    avg_retain = np.mean(prob > threshold.item())
                    avg += avg_retain
                print(f"Average % of train claims retained: {avg_train}")
                print(f"Average % of test claims retained: {avg / len(probs_split)}")
                print(f"Loss at iteration {i}: {loss.item()}")

            loss.backward()
            optimizer.step()
        return model

    else:
        # Fixed: previously the ValueError was *returned* instead of raised,
        # so callers silently received an exception object as the "model".
        raise ValueError(f"{name} not available.")
60
+
61
+
62
def get_conformal_loss(probs, labels, dataset_train, alpha):
    """Differentiable surrogate for the conformal-filtering objective.

    probs: (n_claims, 1) tensor of claim-correctness probabilities.
    labels: (n_claims,) array of 0/1 labels (0 = false claim).
    dataset_train: list of datapoints; dat['atomic_facts'] gives each
        prompt's claims, used to split the flat claim axis per prompt.
    alpha: target error level for the conformity-score quantile.

    Returns (loss, avg_retained) where loss is a smooth count of removed
    claims on the held-out prompts and avg_retained the mean retention rate.
    """
    # Boundaries between prompts: claims are flattened across the dataset.
    claim_splits = torch.tensor(
        np.cumsum([len(dat['atomic_facts']) for dat in dataset_train])[:-1]
    )

    claim_probs = torch.tensor_split(probs, claim_splits)
    # Fixed: the original used `1 - labels` (an *integer* array) to index the
    # probability tensor, which selects positions 0/1 repeatedly instead of
    # masking out the false claims. Use a boolean mask of label == 0.
    false_masks = np.array_split(np.asarray(labels) == 0, claim_splits.numpy())

    scores = []
    for c_prob, c_mask in zip(claim_probs, false_masks):
        if c_mask.any():
            # Conformity score: highest probability assigned to a false claim.
            scores.append(c_prob[torch.from_numpy(c_mask)].max())
        else:
            # No false claims for this prompt: score 0 so it never raises the
            # calibrated threshold (probabilities are sigmoid outputs in (0,1)).
            scores.append(c_prob.new_zeros(()))

    # use a random set of scores to calibrate, the rest to compute the loss
    random_indices = np.random.permutation(len(scores))
    threshold_index_set = set(random_indices[:25].tolist())
    loss_index_set = set(random_indices[25:].tolist())
    if not loss_index_set:
        raise ValueError("get_conformal_loss needs more than 25 prompts.")

    threshold_scores = [scores[i] for i in range(len(scores)) if i in threshold_index_set]

    threshold = torch.quantile(torch.stack(threshold_scores), 1 - alpha)
    loss = 0
    avg = 0
    for idx, c_prob in enumerate(claim_probs):
        if idx in loss_index_set:
            # Smooth indicator of "claim falls below the threshold" (removed).
            loss += torch.sigmoid((threshold - c_prob)).mean()
            avg_retain = (c_prob > threshold).float().mean()
            avg += avg_retain
    if np.isnan(loss.item()):
        raise ValueError(claim_probs[0])
    return loss, avg / len(loss_index_set)
91
+
92
class LogisticRegression(nn.Module):
    """Single-layer logistic regression: sigmoid(W x + b) -> (n, 1) probabilities."""

    def __init__(self, n_features):
        """n_features: dimensionality of the input feature vectors."""
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(n_features, 1)

    def forward(self, x):
        # torch.sigmoid replaces torch.nn.functional.sigmoid, which is
        # deprecated and emits a warning on modern PyTorch.
        return torch.sigmoid(self.linear(x))
100
+
101
+
MACI-main/conditional-conformal/src/query.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+ import openai
3
+
4
# Instruction for decomposing a full LLM response into independent subclaims.
SUBCLAIM_PROMPT = 'Please breakdown the following response to a prompt into a set of small, independent claims. Return each subclaim (with no other characters) on a new line. \n'

# Instruction for recombining the retained facts into a single fluent answer.
MERGE_PROMPT = "You will get an instruction and a set of facts that are true. Construct an answer using ONLY the facts provided, and use ALL of the facts provided. If no facts are given, reply and say that you don't know enough to respond.\n"

# Instruction asking the model to label each claim T (true) / S (subjective) / F (false),
# one JSON object per line.
ANNOTATION_PROMPT = 'You will get an instruction and a set of claims made in response to that instruction. Determine whether each claim is true, subjective, or false. Each returned determination should be {"claim_id": ID, "value": TRUTH_VALUE} and be on its own line with NO other characters. The truth value should be in quotes and it should be T for Factual, S for Subjective, and F for False.\n'

# Instruction scoring each claim against a reference text: 1 supports / -1 contradicts / 0 unrelated.
FREQUENCY_PROMPT = 'You will get a list of claims and piece of text. For each claim, score whether the text supports, contradicts, or is unrelated to the claim. Directly return a jsonl, where each line is {"id":[CLAIM_ID], "score":[SCORE]}. Directly return the jsonl with no explanation or other formatting. For the [SCORE], return 1 for supports, -1 for contradicts, and 0 for unrelated.\n'
11
+
12
+ def _concat_claims(
13
+ subclaims : List[str]
14
+ ) -> str:
15
+ return "\n".join(
16
+ f"{i}: {subclaim['message']}" for i, subclaim in enumerate(subclaims)
17
+ )
18
+
19
def generate_subclaim_prompt(
    prompt : str,
    response : str
) -> str:
    """Build the instruction asking the LLM to split `response` into subclaims."""
    return (
        SUBCLAIM_PROMPT
        + f"The original instruction was: {prompt}\n"
        + f"The response to be broken down into subclaims is: {response}"
    )
27
+
28
def generate_merge_prompt(
    prompt : str,
    subclaims : List[str]
) -> str:
    """Build the instruction asking the LLM to merge retained facts into an answer."""
    header = f"The original instruction was: {prompt}\n"
    facts = f"The facts are: {_concat_claims(subclaims)}"
    return MERGE_PROMPT + header + facts
37
+
38
def generate_annotation_prompt(
    prompt : str,
    subclaims : List[str]
) -> str:
    """Build the instruction asking the LLM to label each claim T/S/F."""
    return "".join([
        ANNOTATION_PROMPT,
        f"The original instruction was: {prompt}\n",
        f"The claims are: \n{_concat_claims(subclaims)}",
    ])
46
+
47
def generate_frequency_prompt(
    subclaims : List[str],
    output : str,
) -> str:
    """Build the instruction asking the LLM to score claims against `output`."""
    claims_part = f"The claims are: {_concat_claims(subclaims)}\n"
    text_part = f"The text is: {output}"
    return FREQUENCY_PROMPT + claims_part + text_part
54
+
55
def query_gpt(
    client : openai.Client,
    prompts : List[str],
    model : str = "gpt-3.5-turbo",
    roles : List[str] = None,
    max_tokens : int = 1000,
    temperature: float = 0,
    response_format : str = None,
    n_samples: int = 1
):
    """Send ONE chat-completion request built from `prompts`.

    NOTE(review): all prompts are packed into a single conversation (one
    message per prompt), not one request per prompt — confirm callers
    expect this when passing more than one prompt.

    Returns the raw openai ChatCompletion object (logprobs requested).
    """
    # Default every message to the "user" role unless explicit roles are given.
    if roles is None:
        messages = [{"role": "user", "content": prompt} for prompt in prompts]
    else:
        messages = [{"role": role, "content": prompt} for role, prompt in zip(roles, prompts)]

    # NOTE(review): response_format=None is forwarded verbatim; confirm the
    # installed openai client accepts None (vs. omitting the field).
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        response_format=response_format,
        max_tokens=max_tokens,
        temperature=temperature,
        n=n_samples,
        logprobs=True
    )
    return completion
80
+
81
def query_embedding(
    client : openai.Client,
    prompts : List[str],
    model : str = "text-embedding-3-small",
    **kwargs
):
    """Embed `prompts` with the OpenAI embeddings endpoint.

    NOTE(review): only the FIRST embedding (`data[0]`) is returned even when
    multiple prompts are passed — confirm callers always send one prompt.
    """
    embed = client.embeddings.create(input = prompts, model = model, **kwargs).data[0].embedding
    return embed
89
+
90
def query_llm(
    prompts : List[str],
    model : str,
    **kwargs
) -> Dict:
    """Dispatch `prompts` to a chat or embedding model based on the model name.

    Returns a list of {'logprobs', 'message'} dicts for chat ('gpt') models,
    or a single embedding vector for 'embedding' models.
    Raises ValueError for unsupported model names.
    """
    if 'gpt' in model:
        client = openai.Client() # OPENAI_API_KEY should be set as an environment variable
        completion = query_gpt(client, prompts, model, **kwargs)
        outputs = []
        for choice in completion.choices:
            # Keep both the text and per-token logprobs for downstream scoring.
            output_dict = {
                'logprobs': choice.logprobs.content,
                'message': choice.message.content
            }
            outputs.append(output_dict)
        return outputs
    elif 'embedding' in model:
        client = openai.Client()
        output = query_embedding(client, prompts, model, **kwargs)
        return output

    else:
        raise ValueError(f"Model {model} is not supported in query.")
MACI-main/conditional-conformal/src/ray_data.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import numpy as np
3
+
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ from tqdm import tqdm
6
+
7
+ from config import get_config
8
+
9
+ from featurizer import get_frequency, get_self_eval
10
+ from gpt import GPTClient
11
+
12
+ from atomizer import text_to_sentences
13
+ from dataset import get_prompts
14
+ from scorer import Scorer
15
+
16
+ import ray
17
+
18
def parse_args():
    """Parse the command-line arguments (just the config file path)."""
    parser = argparse.ArgumentParser(
        prog="conformal-safety",
        description="Auto-filter claims from LLM to meet accuracy and safety guarantees.",
    )
    parser.add_argument('-config_path', '-c', default='configs/default.toml', help="Config for construction.")
    return parser.parse_args()
26
+
27
def find_unique_element(lst, condition, approx_index):
    """Return the index of an element of `lst` satisfying `condition`,
    searching outward from the hint `approx_index`; None if no match.

    Robustness fix: an empty list or an out-of-range hint no longer raises
    IndexError — the hint is clamped into the valid index range.
    """
    if not lst:
        return None

    # Clamp the hint so a bad guess cannot index out of bounds.
    approx_index = max(0, min(approx_index, len(lst) - 1))

    # Check the approximate index first
    if condition(lst[approx_index]):
        return approx_index

    # Expand outwards from the approximate index (left side probed first
    # at each distance, matching the original search order).
    left = approx_index - 1
    right = approx_index + 1
    while left >= 0 or right < len(lst):
        if left >= 0 and condition(lst[left]):
            return left
        if right < len(lst) and condition(lst[right]):
            return right
        left -= 1
        right += 1

    # No element satisfies the condition.
    return None
47
+
48
+
49
@ray.remote
def parallel_scorer(*args, **kwargs):
    # NOTE(review): deliberately short-circuited stub — the early `return None`
    # makes the call below unreachable, and `run_experiment` is not defined in
    # this module. Confirm whether this should be re-enabled or deleted.
    return None
    return run_experiment(*args, **kwargs)
53
+
54
+
55
if __name__ == "__main__":
    # WIP driver script: generates responses, atomizes them into facts,
    # matches cached atomizer outputs back to responses, then scores facts.
    # NOTE(review): contains interactive IPython breakpoints and references
    # names that are never defined here (args.seed, args.n_trials, args.type,
    # X, Y, n_test, n_calib, alpha, parallel_coverage_experiment,
    # parallel_experiment) — the ray section cannot run as written.
    args = parse_args()
    config = get_config(args.config_path)

    import IPython; IPython.embed()
    responder = GPTClient(config.model.responder.cache_path)

    topics, prompts = get_prompts(config.dataset.name)

    # Query the responder for every prompt in parallel (cache-backed client).
    with ThreadPoolExecutor(max_workers=25) as executor:
        responses = list(
            tqdm(
                executor.map(
                    lambda x : responder.query(x),
                    prompts
                ),
                total=len(prompts)
            )
        )

    responses = [r[0] for r in responses]


    outputs = [{'prompt': p, 'response': o['message']}
               for p, o in zip(prompts, responses)] # first output is the response we will filter

    print("Loading atomizer.")
    atomizer_client = GPTClient(config.model.parser.cache_path, model=config.model.parser.name)

    responder_cache = responder.cache_dict
    messages = []
    for val in responder_cache.values():
        messages.append(val[0]['message'])

    # Re-associate each cached atomizer output with the response it came
    # from by substring-matching the sentence embedded in the cache key,
    # starting the search near the previously matched index.
    atomizer_cache = atomizer_client.cache_dict
    idx_guess = 0
    atomic_facts = [[] for _ in range(len(messages))]
    for k in tqdm(atomizer_cache.keys()):
        atomized_msg = atomizer_cache[k][0]['message']
        atomized_facts = text_to_sentences(atomized_msg)
        sentence = k.split('\n')[-1].split('facts:')[-1].strip()[:-2]
        cur_idx = find_unique_element(messages, lambda x: sentence in x, approx_index=idx_guess)
        if cur_idx is None: # TODO: TERRIBLE SPECIAL CASING that I looked at by hand...
            if idx_guess == 4151:
                cur_idx = 4152
            else:
                cur_idx = idx_guess
        idx_guess = cur_idx
        atomic_facts[cur_idx].extend(atomized_facts)

    # time to annotate responses using factscore code
    print("Loading annotator.")
    scorer_client = GPTClient(config.model.annotator.cache_path, model=config.model.annotator.name)
    scorer = Scorer(scorer_client, config, model_name="retrieval")

    scorer_inputs = [(topic, output['response'], fact) for topic, output, fact in zip(topics, outputs, atomic_facts)]

    import IPython; IPython.embed()


    # connect to cluster
    ray.init(address="auto")

    results = []

    # NOTE(review): this loop was apparently pasted from another experiment
    # script; none of the names it uses exist in this module.
    for seed in range(args.seed, args.seed + args.n_trials):
        if args.type == 'coverage':
            result = parallel_coverage_experiment.remote(
                (X, Y), n_test, n_calib, alpha, methods=args.methods, seed=seed
            )
        else:
            result = parallel_experiment.remote(
                (X, Y), n_test, n_calib, alpha, methods=args.methods, seed=seed
            )
        results.append(result)

    trial_results = ray.get(results)

    # Score all (topic, response, facts) triples; single worker because the
    # scorer shares sqlite / cache state.
    with ThreadPoolExecutor(max_workers=1) as executor:
        scores = list(
            tqdm(
                executor.map(
                    lambda x : scorer.get_score(*x),
                    scorer_inputs
                ),
                total=len(scorer_inputs)
            )
        )
    scorer.save_cache()

    dataset = []

    for p, r, s in zip(prompts, responses, scores):
        data_pt = {
            'prompt': p,
            'response': r,
            'atomic_facts': s['decisions'][0]
        }
        dataset.append(data_pt)

    import IPython
    IPython.embed()

    # client = GPTClient(f'.cache/{config.dataset.name}_frequency.pkl')

    # with ThreadPoolExecutor(max_workers=5) as executor:
    #     frequencies = list(
    #         tqdm(
    #             executor.map(
    #                 lambda x: get_frequency(client, [af['atom'] for af in x['atomic_facts']], x['prompt'], config.model.prob.frequency.model),
    #                 dataset
    #             ),
    #             total=len(dataset)
    #         )
    #     )
    # client.save_cache()

    # eval_client = GPTClient(f'.cache/{config.dataset.name}_self_evals.pkl')

    # with ThreadPoolExecutor(max_workers=25) as executor:
    #     self_evals = list(
    #         tqdm(
    #             executor.map(
    #                 lambda x: get_self_eval(x['prompt'], [af['atom'] for af in x['atomic_facts']], eval_client),
    #                 dataset
    #             ),
    #             total=len(dataset)
    #         )
    #     )
    # eval_client.save_cache()

    # features = np.concatenate(
    #     [
    #         np.concatenate(frequencies).reshape(-1,1),
    #         np.concatenate(self_evals).reshape(-1,1)
    #     ],
    #     axis=1
    # )
MACI-main/conditional-conformal/src/retrieval.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import time
3
+ import os
4
+
5
+ import sqlite3
6
+ import numpy as np
7
+ import pickle as pkl
8
+
9
+ from rank_bm25 import BM25Okapi
10
+
11
# Delimiter used to join/split passages stored in a single DB text column.
SPECIAL_SEPARATOR = "####SPECIAL####SEPARATOR####"
# Maximum passage length in (RoBERTa) tokens when chunking documents.
MAX_LENGTH = 256
13
+
14
class DocDB(object):
    """Sqlite backed document storage.

    Implements get_doc_text(doc_id).
    """

    def __init__(self, db_path=None, data_path=None, cache_path=None):
        """Open (or build, from `data_path`) the sqlite DB and load the title cache."""
        self.db_path = db_path
        self.cache_file = cache_path
        self.connection = sqlite3.connect(self.db_path, check_same_thread=False)

        self.cache_dict = self.load_cache()

        cursor = self.connection.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")

        if len(cursor.fetchall())==0:
            assert data_path is not None, f"{self.db_path} is empty. Specify `data_path` in order to create a DB."
            print (f"{self.db_path} is empty. start building DB from {data_path}...")
            self.build_db(self.db_path, data_path)

    def load_cache(self, allow_retry=True):
        """Load the pickled title->passages cache from disk (or S3); {} if absent."""
        if os.path.exists(self.cache_file):
            while True:
                try:
                    with open(self.cache_file, "rb") as f:
                        cache = pkl.load(f)
                    break
                except Exception: # if there are concurent processes, things can fail
                    if not allow_retry:
                        assert False
                    print ("Pickle Error: Retry in 5sec...")
                    time.sleep(5)
        elif 's3' in self.cache_file:
            from aws_utils import s3_open
            s3_path = self.cache_file.removeprefix('s3://')
            bucket_name = s3_path.split('/')[0]
            path_to_file = '/'.join(s3_path.split('/')[1:])
            with s3_open(bucket_name, path_to_file) as fp:
                cache = pkl.load(fp)
        else:
            cache = {}
        return cache

    def save_cache(self):
        """Merge the on-disk cache into ours, then persist the union."""
        # load the latest cache first, since if there were other processes
        # running in parallel, cache might have been updated
        for k, v in self.load_cache().items():
            self.cache_dict[k] = v

        with open(self.cache_file, "wb") as f:
            pkl.dump(self.cache_dict, f)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def path(self):
        """Return the path to the file that backs this database."""
        # Fixed: previously returned `self.path` — the bound method itself
        # (the method name shadows no attribute); the backing file lives in
        # `self.db_path`.
        return self.db_path

    def close(self):
        """Close the connection to the database."""
        self.connection.close()

    def build_db(self, db_path, data_path):
        """Build the documents table from a jsonl of {'title', 'text'} records,
        chunking each document into MAX_LENGTH-token passages joined by
        SPECIAL_SEPARATOR."""
        from transformers import RobertaTokenizer
        tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

        titles = set()
        output_lines = []
        tot = 0
        start_time = time.time()
        c = self.connection.cursor()
        c.execute("CREATE TABLE documents (title PRIMARY KEY, text);")

        with open(data_path, "r") as f:
            for line in f:
                dp = json.loads(line)
                title = dp["title"]
                text = dp["text"]
                if title in titles:
                    continue
                titles.add(title)
                if type(text)==str:
                    text = [text]
                passages = [[]]
                for sent_idx, sent in enumerate(text):
                    assert len(sent.strip())>0
                    tokens = tokenizer(sent)["input_ids"]
                    max_length = MAX_LENGTH - len(passages[-1])
                    if len(tokens) <= max_length:
                        passages[-1].extend(tokens)
                    else:
                        passages[-1].extend(tokens[:max_length])
                        offset = max_length
                        while offset < len(tokens):
                            passages.append(tokens[offset:offset+MAX_LENGTH])
                            offset += MAX_LENGTH

                # Drop passages that contain only special tokens (0 = <s>, 2 = </s>).
                psgs = [tokenizer.decode(tokens) for tokens in passages if np.sum([t not in [0, 2] for t in tokens])>0]
                text = SPECIAL_SEPARATOR.join(psgs)
                output_lines.append((title, text))
                tot += 1

                if len(output_lines) == 1000000:
                    c.executemany("INSERT INTO documents VALUES (?,?)", output_lines)
                    output_lines = []
                    print ("Finish saving %dM documents (%dmin)" % (tot / 1000000, (time.time()-start_time)/60))

        if len(output_lines) > 0:
            c.executemany("INSERT INTO documents VALUES (?,?)", output_lines)
            print ("Finish saving %dM documents (%dmin)" % (tot / 1000000, (time.time()-start_time)/60))

        self.connection.commit()
        self.connection.close()

    def get_text_from_title(self, title):
        """Fetch the raw text of the doc for 'doc_id'."""
        # Apply hand-curated title corrections before lookup.
        # NOTE(review): this file is re-read on every call — consider caching.
        with open('data/wiki_corrections.txt') as fp:
            all_names = fp.readlines()
        all_names = [n.strip() for n in all_names]
        name_converter = {names.split('=')[0]:names.split('=')[1] for names in all_names}
        if title in name_converter:
            title = name_converter[title]

        if title in self.cache_dict:
            results = self.cache_dict[title]
        else:
            print("I SHOULD NOT BE HERE.")
            cursor = self.connection.cursor()
            cursor.execute("SELECT text FROM documents WHERE title = ?", (title,))
            results = cursor.fetchall()
            results = [r for r in results]
            cursor.close()
            try:
                assert results is not None and len(results)==1, f"`topic` in your data ({title}) is likely to be not a valid title in the DB."
            except Exception: # if there are concurent processes, things can fail
                print (f"Retrieval error for {title}: Retry in 5sec...")
                # time.sleep(5)
                cursor = self.connection.cursor()
                cursor.execute("SELECT text FROM documents WHERE title = ?", (title,))
                results = cursor.fetchall()
                results = [r for r in results]
                # NOTE(review): placeholder text substituted on lookup failure.
                results = [['blah blah blah']]
                cursor.close()
            results = [{"title": title, "text": para} for para in results[0][0].split(SPECIAL_SEPARATOR)]
            assert len(results)>0, f"`topic` in your data ({title}) is likely to be not a valid title in the DB."
            self.cache_dict[title] = results
        return results
165
+
166
class Retrieval(object):
    """Retrieve the top-k passages for a (topic, query) pair.

    Backed by a DocDB; results are memoized in a JSON cache and the per-topic
    BM25 indices / GTR passage embeddings in a pickle cache.
    """

    def __init__(self, db, cache_path, embed_cache_path,
                 retrieval_type="gtr-t5-large", batch_size=None):
        self.db = db
        self.cache_path = cache_path
        self.embed_cache_path = embed_cache_path
        self.retrieval_type = retrieval_type
        self.batch_size = batch_size
        assert retrieval_type=="bm25" or retrieval_type.startswith("gtr-")

        # Sentence encoder is loaded lazily (only needed for gtr-* retrieval).
        self.encoder = None
        self.load_cache()
        # Counters of new entries since load; save_cache() only writes if > 0.
        self.add_n = 0
        self.add_n_embed = 0

    def load_encoder(self):
        """Lazily load the sentence-transformers encoder onto the GPU."""
        from sentence_transformers import SentenceTransformer
        encoder = SentenceTransformer("sentence-transformers/" + self.retrieval_type)
        encoder = encoder.cuda()
        encoder = encoder.eval()
        self.encoder = encoder
        assert self.batch_size is not None

    def load_cache(self):
        """Load the JSON result cache and the pickled embedding cache ({} if absent)."""
        if os.path.exists(self.cache_path):
            with open(self.cache_path, "r") as f:
                self.cache = json.load(f)
        else:
            self.cache = {}
        if os.path.exists(self.embed_cache_path):
            with open(self.embed_cache_path, "rb") as f:
                self.embed_cache = pkl.load(f)
        else:
            self.embed_cache = {}

    def save_cache(self):
        """Persist both caches, merging in anything written by parallel processes."""
        if self.add_n > 0:
            if os.path.exists(self.cache_path):
                with open(self.cache_path, "r") as f:
                    new_cache = json.load(f)
                self.cache.update(new_cache)

            with open(self.cache_path, "w") as f:
                json.dump(self.cache, f)

        if self.add_n_embed > 0:
            if os.path.exists(self.embed_cache_path):
                with open(self.embed_cache_path, "rb") as f:
                    new_cache = pkl.load(f)
                self.embed_cache.update(new_cache)

            with open(self.embed_cache_path, "wb") as f:
                pkl.dump(self.embed_cache, f)

    def get_bm25_passages(self, topic, query, passages, k):
        """Rank `passages` by BM25 score against `query`; return the top k."""
        if topic in self.embed_cache:
            bm25 = self.embed_cache[topic]
        else:
            bm25 = BM25Okapi([psg["text"].replace("<s>", "").replace("</s>", "").split() for psg in passages])
            self.embed_cache[topic] = bm25
            self.add_n_embed += 1
        scores = bm25.get_scores(query.split())
        indices = np.argsort(-scores)[:k]
        return [passages[i] for i in indices]

    def get_gtr_passages(self, topic, retrieval_query, passages, k):
        """Rank `passages` by GTR dense inner-product similarity; return the top k."""
        if self.encoder is None:
            self.load_encoder()
        if topic in self.embed_cache:
            passage_vectors = self.embed_cache[topic]
        else:
            inputs = [psg["title"] + " " + psg["text"].replace("<s>", "").replace("</s>", "") for psg in passages]
            passage_vectors = self.encoder.encode(inputs, batch_size=self.batch_size, device=self.encoder.device)
            self.embed_cache[topic] = passage_vectors
            self.add_n_embed += 1
        query_vectors = self.encoder.encode([retrieval_query],
                                            batch_size=self.batch_size,
                                            device=self.encoder.device)[0]
        scores = np.inner(query_vectors, passage_vectors)
        indices = np.argsort(-scores)[:k]
        return [passages[i] for i in indices]

    def get_passages(self, topic, question, k):
        """Return the top-k passages for `question` about `topic` (memoized)."""
        retrieval_query = topic + " " + question.strip()
        cache_key = topic + "#" + retrieval_query

        if cache_key not in self.cache:
            passages = self.db.get_text_from_title(topic)
            if self.retrieval_type=="bm25":
                self.cache[cache_key] = self.get_bm25_passages(topic, retrieval_query, passages, k)
            else:
                self.cache[cache_key] = self.get_gtr_passages(topic, retrieval_query, passages, k)
            # Either exactly k results, or everything when fewer than k exist.
            assert len(self.cache[cache_key]) in [k, len(passages)]
            self.add_n += 1


        return self.cache[cache_key]
264
+
265
+
266
+
267
+
268
+
MACI-main/conditional-conformal/src/retrieve_data.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import numpy as np
3
+
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ from tqdm import tqdm
6
+
7
+ from dataset import load_dataset
8
+ from config import get_config
9
+
10
+ from featurizer import get_frequency, get_self_eval, get_bool_eval
11
+ from gpt import GPTClient
12
+
13
def parse_args():
    """Return the parsed CLI arguments; only a config path is accepted."""
    arg_parser = argparse.ArgumentParser(
        prog="conformal-safety",
        description="Auto-filter claims from LLM to meet accuracy and safety guarantees.",
    )
    arg_parser.add_argument('-config_path', '-c', default='configs/default.toml', help="Config for construction.")
    parsed = arg_parser.parse_args()
    return parsed
21
+
22
if __name__ == "__main__":
    # Interactive feature-extraction scratchpad: loads the dataset, then drops
    # into IPython. The feature extraction passes (frequency, self-eval,
    # bool-eval) are kept below as commented-out reference code.
    args = parse_args()
    config = get_config(args.config_path)

    dataset = load_dataset(config)

    # client = GPTClient(f'.cache/{config.dataset.name}_frequency.pkl')


    # with ThreadPoolExecutor(max_workers=8) as executor:
    #     frequencies = list(
    #         tqdm(
    #             executor.map(
    #                 lambda x: get_frequency(client, [af['atom'] for af in x['atomic_facts']], x['prompt'], config.model.prob.frequency.model),
    #                 dataset
    #             ),
    #             total=len(dataset)
    #         )
    #     )
    # client.save_cache()

    # eval_client = GPTClient(f'.cache/{config.dataset.name}_self_evals.pkl')

    # with ThreadPoolExecutor(max_workers=25) as executor:
    #     self_evals = list(
    #         tqdm(
    #             executor.map(
    #                 lambda x: get_self_eval(x['prompt'], [af['atom'] for af in x['atomic_facts']], eval_client),
    #                 dataset
    #             ),
    #             total=len(dataset)
    #         )
    #     )
    # eval_client.save_cache()

    # bool_client = GPTClient(f'.cache/{config.dataset.name}_bool_evals.pkl')

    # with ThreadPoolExecutor(max_workers=25) as executor:
    #     self_bools = list(
    #         tqdm(
    #             executor.map(
    #                 lambda x: get_bool_eval(x['prompt'], [af['atom'] for af in x['atomic_facts']], bool_client),
    #                 dataset
    #             ),
    #             total=len(dataset)
    #         )
    #     )
    # bool_client.save_cache()

    # features = np.concatenate(
    #     [
    #         np.concatenate(frequencies).reshape(-1,1),
    #         np.concatenate(self_evals).reshape(-1,1)
    #     ],
    #     axis=1
    # )

    import IPython; IPython.embed()
80
+
81
+
82
+
83
+
84
+
85
+
86
+
MACI-main/conditional-conformal/src/run.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import numpy as np
3
+
4
+ from config import get_config
5
+ from conformal import compute_conformity_scores, calibrate_thresholds, conformal_filter, assess_factscore_coverage
6
+ from dataset import load_dataset, split_dataset
7
+ from featurizer import get_features
8
+ from llm_utils import merge_claims
9
+ from prob_model import fit_model
10
+ from gpt import GPTClient
11
+
12
+
13
def parse_args():
    """Build the CLI parser and return the parsed arguments (config path only)."""
    cli = argparse.ArgumentParser(
        prog="conformal-safety",
        description="Auto-filter claims from LLM to meet accuracy and safety guarantees.",
    )
    cli.add_argument('-config_path', '-c', default='configs/default.toml', help="Config for construction.")
    return cli.parse_args()
21
+
22
+
23
if __name__ == "__main__":
    # End-to-end pipeline: load annotated data, fit the claim-correctness
    # model, conformally calibrate a filtering threshold on validation data,
    # filter the test responses, then merge retained claims back into prose.
    args = parse_args()

    config = get_config(args.config_path)

    rng = np.random.default_rng(seed=config.dataset.seed)

    # annotate dataset
    dataset = load_dataset(config)

    # split dataset into train / validation / test
    dataset_train, dataset_valid, dataset_test = split_dataset(
        dataset,
        train_perc=config.dataset.train_percent,
        valid_perc=config.dataset.valid_percent,
        rng=rng if config.dataset.randomize else None
    )

    X_train = get_features(dataset_train, config)

    # Flatten per-claim correctness labels across prompts into 0/1 int8.
    y_train = np.concatenate([[c['is_supported'] for c in dat['atomic_facts']] for dat in dataset_train])
    y_train[y_train == True] = 1
    y_train[y_train == False] = 0
    y_train = y_train.astype(np.int8)

    X_valid = get_features(dataset_valid, config)
    y_valid = np.concatenate([[c['is_supported'] for c in dat['atomic_facts']] for dat in dataset_valid])
    y_valid[y_valid == True] = 1
    y_valid[y_valid == False] = 0
    y_valid = y_valid.astype(np.int8)
    # Claim-array split points mapping the flat claim axis back to prompts.
    splits_valid = np.cumsum([len(dat['atomic_facts']) for dat in dataset_valid])[:-1]

    X_test = get_features(dataset_test, config)
    y_test = np.concatenate([[c['is_supported'] for c in dat['atomic_facts']] for dat in dataset_test])
    y_test[y_test == True] = 1
    y_test[y_test == False] = 0
    y_test = y_test.astype(np.int8)
    splits_test = np.cumsum([len(dat['atomic_facts']) for dat in dataset_test])[:-1]

    model = fit_model(X_train, y_train, config, dataset_train,
                      eval_dict={'X_valid': X_valid, 'X_test': X_test, 'dataset_valid': dataset_valid, 'splits_valid': splits_valid, 'splits_test': splits_test})

    # NOTE(review): predict_proba assumes the sklearn ("logistic") model; the
    # "torch" model from fit_model has no predict_proba — confirm config.
    scores_valid = model.predict_proba(X_valid)[:,1]
    scores_valid = np.array_split(scores_valid, splits_valid)

    scores_test = model.predict_proba(X_test)[:,1]
    scores_test = np.array_split(scores_test, splits_test)
    # identify features for scoring
    score_features_v = [np.zeros((len(u['atomic_facts']), 1)) for u in dataset_valid]
    score_features_te = [np.zeros((len(u['atomic_facts']), 1)) for u in dataset_test]

    conf_scores_valid = compute_conformity_scores(dataset_valid, scores_valid)

    # fit error probability function using training set (or just define it?)
    # we want to be more sure about correctness on more sensitive prompts
    alpha_fn = lambda x: [config.conformal.alpha] * len(x) # TODO: dumb one for now.

    # identify features for conditional calibration
    conf_features_v = np.zeros((len(dataset_valid),1))
    conf_features_te = np.zeros((len(dataset_test),1))

    # calibrate a threshold on the validation set
    thresholds = calibrate_thresholds(
        conf_features_te,
        conf_features_v,
        conf_scores_valid,
        alpha_fn
    )

    dataset_test = conformal_filter(
        dataset_test,
        scores_test,
        thresholds
    )

    if config.dataset.name.lower() == "factscore":
        assess_factscore_coverage(dataset_test, config.conformal.alpha)

    print("Merging filtered responses.")

    merge_client = GPTClient(cache_file = config.model.merger.cache_path)
    merged_responses = merge_claims(
        dataset_test,
        merge_client
    )
    merge_client.save_cache()

    # Show one random before/after pair for a quick sanity check.
    rand_idx = rng.integers(0, len(dataset_test))
    print(dataset_test[rand_idx]['response']['message'] + "\n")
    print(merged_responses[rand_idx])

    import IPython; IPython.embed()
115
+
116
+
117
+
118
+
119
+
MACI-main/conditional-conformal/src/scorer.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import string
2
+ import numpy as np
3
+ import os
4
+ import json
5
+
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ # import logging
8
+
9
+ # from tqdm import tqdm
10
+ # from factscore.abstain_detection import is_response_abstained
11
+ from retrieval import DocDB, Retrieval
12
+
13
class Scorer(object):
    """FActScore-style factuality scorer.

    Judges each atomic fact of a generation as supported/unsupported by
    prompting an LLM ``client`` with context retrieved from a registered
    knowledge source, then aggregates per-response support rates
    (optionally length-penalized via ``gamma``).
    """

    def __init__(self,
                 client,
                 config,
                 model_name="retrieval+ChatGPT",
                 batch_size=256):
        # NOTE(review): `client` is assumed to expose .query(prompt) ->
        # [{'message': str}, ...] and .save_cache() (see _get_score /
        # save_cache); `config` supplies annotator data/cache paths — confirm
        # against the project's client/config classes.
        assert model_name in ["retrieval+llama", "retrieval+llama+npm", "retrieval+ChatGPT", "npm", "retrieval+ChatGPT+npm", "retrieval"]
        self.model_name = model_name
        self.client = client
        self.config = config

        self.data_dir = config.model.annotator.data_path
        self.cache_dir = config.model.annotator.retrieval_cache_path

        self.db = {}          # knowledge-source name -> DocDB
        self.retrieval = {}   # knowledge-source name -> Retrieval (or a plain dict for "medlfqa")
        self.npm = {}         # knowledge-source name -> NPM scorer (only for "*+npm" variants)
        self.batch_size = batch_size  # batch size for retrieval

        self.af_generator = None

    def save_cache(self):
        """Persist every cache held by this scorer (client, npm, retrieval, doc DBs)."""
        self.client.save_cache()
        if "npm" in self.model_name:
            for v in self.npm.values():
                v.save_cache()
        for v in self.retrieval.values():
            # The "medlfqa" knowledge source stores a plain dict here (see
            # register_knowledge_source), which has no cache to persist --
            # skip anything without a save_cache method.
            if hasattr(v, "save_cache"):
                v.save_cache()
        # BUG FIX: the original looped `for k, v in self.db:`, which iterates
        # the dict *keys* (strings) and fails to unpack as soon as any
        # knowledge source is registered. Iterate the values instead.
        for v in self.db.values():
            v.save_cache()

    def register_knowledge_source(self, name="enwiki-20230401", db_path=None, data_path=None):
        """Register a retrieval backend for `name`.

        For "medlfqa" this loads the MedLFQA jsonl files and keeps an
        in-memory question -> reference-answer dict; otherwise it opens a
        DocDB plus a BM25 Retrieval layer with on-disk caches.
        """
        assert name not in self.retrieval, f"{name} already registered"

        if db_path is None:
            db_path = os.path.join(self.data_dir, f"{name}.db")

        if data_path is None:
            data_path = os.path.join(self.data_dir, f"{name}.jsonl")

        if name == "medlfqa":
            datasets = {}
            suffix = "_test_MedLFQA.jsonl"

            for path in os.listdir(self.data_dir):
                if "MedLFQA" not in path:
                    continue
                dataset_name = path[:-len(suffix)]
                with open(os.path.join(self.data_dir, path), 'r') as fp:
                    datasets[dataset_name] = [json.loads(line) for line in fp.readlines()]
            retrieval = {}
            for _, dataset in datasets.items():
                for pt in dataset:
                    retrieval[pt['Question']] = {
                        'context': pt['Free_form_answer'],
                        'must_have': pt['Must_have'],
                        'nice_to_have': pt['Nice_to_have']
                    }
            self.retrieval[name] = retrieval

        else:
            db_cache_path = os.path.join(self.cache_dir, f"db-{name}.pkl")
            cache_path = os.path.join(self.cache_dir, f"retrieval-{name}.json")
            embed_cache_path = os.path.join(self.cache_dir, f"retrieval-{name}.pkl")

            self.db[name] = DocDB(db_path=db_path, data_path=data_path, cache_path=db_cache_path)
            self.retrieval[name] = Retrieval(self.db[name], cache_path, embed_cache_path, retrieval_type="bm25", batch_size=self.batch_size)

    def get_score(self,
                  topics,
                  generations,
                  atomic_facts,
                  gamma=10,
                  knowledge_source=None):
        """Score generations against a knowledge source.

        Args:
            topics / generations / atomic_facts: parallel lists (or single
                strings) of prompt topics, full generations, and the list of
                atomic-fact strings per generation (None = abstained).
            gamma: length-penalty knob; responses with fewer than `gamma`
                facts are down-weighted by exp(1 - gamma/len).
            knowledge_source: registered source name (default enwiki dump).

        Returns:
            dict with mean "score", "respond_ratio", per-fact "decisions",
            "num_facts_per_response", and "init_score" (pre-penalty) when
            gamma is truthy.
        """
        if knowledge_source is None:
            # use the default knowledge source
            knowledge_source = "enwiki-20230401"

        if knowledge_source not in self.retrieval:
            self.register_knowledge_source(knowledge_source)

        if type(topics)==type(generations)==str:
            topics = [topics]
            generations = [generations]
            atomic_facts = [atomic_facts]
        else:
            assert type(topics)==type(generations)==list, "`topics` and `generations` should be lists."
            assert len(topics)==len(generations), "`topics` and `generations` should have the same length"
            assert len(topics)==len(atomic_facts), "`topics` and `atomic_facts` should have the same length"

        respond_ratio = np.mean([facts is not None for facts in atomic_facts])

        scores = []
        init_scores = []
        decisions = []
        for topic, generation, facts in zip(topics, generations, atomic_facts):
            if facts is None:
                # abstained response: recorded as None, excluded from the mean
                decisions.append(None)
            else:
                decision = []
                for fact in facts:
                    # earlier decisions are fed back as few-shot context
                    decision.append(
                        self._get_score(topic, generation, fact, knowledge_source, decision)
                    )
                score = np.mean([d["is_supported"] for d in decision])

                if gamma:
                    init_scores.append(score)
                    penalty = 1.0 if len(facts)>gamma else np.exp(1-gamma/max(len(facts), 1))
                    score = penalty * score

                decisions.append(decision)
                scores.append(score)

        # NOTE(review): if every response abstained, `scores` is empty and
        # np.mean returns nan with a RuntimeWarning — callers should check
        # respond_ratio.
        out = {"score": np.mean(scores),
               "respond_ratio": respond_ratio,
               "decisions": decisions,
               "num_facts_per_response": np.mean([len(d) for d in decisions if d is not None])}

        if gamma:
            out["init_score"] = np.mean(init_scores)

        return out

    def _get_score(self, topic, generation, atom, knowledge_source, prev_decisions = []):
        """Judge a single atomic fact; returns {"atom", "is_supported"}.

        Builds a True/False prompt from retrieved context (or the MedLFQA
        reference answer) plus previous decisions, then parses the client's
        free-text answer.
        """
        definition = f"Answer the question about {topic} based on the given context and your previous answers.\n\n"
        atom = atom.strip()
        if knowledge_source == "medlfqa":
            context = self.retrieval[knowledge_source][topic]['context']
        else:
            passages = self.retrieval[knowledge_source].get_passages(topic, atom, k=5)
            context = ""
            # reversed so the highest-ranked passage ends up closest to the question
            for psg in reversed(passages):
                context += "Title: {}\nText: {}\n\n".format(psg["title"], psg["text"].replace("<s>", "").replace("</s>", ""))
        definition += context.strip()
        if not definition[-1] in string.punctuation:
            definition += "."
        prompt = f"{definition.strip()}\n\n"
        for prev_decision in prev_decisions:
            prev_score = "True" if prev_decision["is_supported"] else "False"
            prompt += f"Previous input: {prev_decision['atom']}\nTrue or False? Output: {prev_score}\n"

        prompt += f"Input: {atom.strip()} True or False?\nOutput:"
        output = self.client.query(prompt)

        # Parse the free-text verdict: prefer an explicit True/False token;
        # when both appear, the later one wins; otherwise fall back to a
        # negation-keyword heuristic.
        generated_answer = output[0]['message'].lower()
        if "true" in generated_answer or "false" in generated_answer:
            if "true" in generated_answer and "false" not in generated_answer:
                is_supported = True
            elif "false" in generated_answer and "true" not in generated_answer:
                is_supported = False
            else:
                is_supported = generated_answer.index("true") > generated_answer.index("false")
        else:
            is_supported = all([keyword not in generated_answer.lower().translate(str.maketrans("", "", string.punctuation)).split() for keyword in ["not", "cannot", "unknown", "information"]])

        if is_supported and "npm" in self.model_name:
            # optional second-stage check with the nonparametric model
            npprob = self.npm[knowledge_source].get_probabilty(topic, atom)
            is_supported = npprob > 0.3

        decision = {"atom": atom, "is_supported": is_supported}

        return decision
MACI-main/conformal/__pycache__/adaptive_conformal.cpython-39.pyc ADDED
Binary file (19.1 kB). View file
 
MACI-main/conformal/__pycache__/basic_conformal.cpython-39.pyc ADDED
Binary file (5.87 kB). View file
 
MACI-main/conformal/__pycache__/conditional_conformal.cpython-39.pyc ADDED
Binary file (16.7 kB). View file
 
MACI-main/conformal/adaptive_conformal.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import logging
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.metrics import roc_auc_score
5
+ from collections import defaultdict
6
+ from scipy.optimize import minimize
7
+ from typing import Callable, List, Dict, Any, Optional, Tuple
8
+ import cvxpy as cp
9
+
10
class MACIAdaptiveConformal:
    """Split-conformal filter over per-claim scores.

    Calibrates a cumulative-cost budget ``tau_hat`` on labeled data, then at
    prediction time removes the lowest-scored claims whose accumulated cost
    -log(1 - s) fits within the budget (with randomized tie-breaking at the
    boundary).
    """

    def __init__(
        self,
        score_function: Callable,
        random_state: Optional[int] = None,
        eps: float = 1e-6,
        **kwargs,
    ) -> None:
        # score_function: maps a list of data entries to per-sample claim
        # scores (list of arrays, or raw per-claim iterables).
        # eps: keeps scores strictly below 1 so -log(1 - s) stays finite.
        self.score_function = score_function
        self.random_state = random_state
        self.eps = float(eps)
        self.tau_hat: Optional[float] = None  # calibrated budget; set by fit_on_calib
        self._rng = np.random.default_rng(self.random_state)

    def _process_raw_scores(self, raw_scores: List, data: List[Dict]) -> List[np.ndarray]:
        """Normalize raw score output into one float array per sample.

        Arrays are passed through; otherwise each score iterable is truncated
        to the sample's number of atomic facts and NaNs are replaced with 0.
        """
        if raw_scores and isinstance(raw_scores[0], np.ndarray):
            return [np.asarray(s, dtype=float) for s in raw_scores]
        per_sample_scores: List[np.ndarray] = []
        # entries may wrap the sample under a 'sample' key or be the sample itself
        samples = [d.get('sample', d) for d in data]
        for i, s_i in enumerate(raw_scores):
            n_claims = len(samples[i].get("atomic_facts", []))
            s_arr = np.asarray(list(s_i), dtype=float)[:n_claims]
            per_sample_scores.append(np.nan_to_num(s_arr, nan=0.0))
        return per_sample_scores

    def _compute_nonconformity_score(self, sample: dict, scores_i: np.ndarray) -> float:
        """Cost budget needed to remove every unsupported claim of `sample`.

        Claims are sorted by ascending score (stable mergesort); the score is
        the summed cost -log(1 - s) up to the highest-scored unsupported
        claim, i.e. the budget that would filter out all false claims.
        """
        atomic_facts = sample.get("atomic_facts", [])
        if not atomic_facts or scores_i.size == 0: return 0.0
        labels = np.asarray([af.get("is_supported", False) for af in atomic_facts], dtype=bool)
        s_raw = np.asarray(scores_i, dtype=float)
        s_raw = np.nan_to_num(s_raw, nan=0.0, posinf=1.0, neginf=0.0)
        # clip below 1 - eps so the log cost is finite
        s = np.clip(s_raw, 0.0, 1.0 - self.eps)
        idx = np.argsort(s, kind='mergesort')
        s_sorted_asc, labels_asc = s[idx], labels[idx]
        false_positions = np.where(~labels_asc)[0]
        if not false_positions.size: return 0.0
        # index of the last (highest-scored) unsupported claim in sorted order
        k_star = int(false_positions.max())
        costs = -np.log(1.0 - s_sorted_asc)
        return float(np.sum(costs[:k_star + 1]))

    def fit_on_calib(self, calib_data: List[dict], alpha: float = 0.1) -> "MACIAdaptiveConformal":
        """Calibrate tau_hat as the conformal (1-alpha) quantile of the
        per-sample nonconformity scores. Returns self."""
        raw_scores = self.score_function(calib_data)
        per_sample_scores = self._process_raw_scores(raw_scores, calib_data)
        calib_samples = [entry.get('sample', entry) for entry in calib_data]
        s_values = [self._compute_nonconformity_score(s, sc) for s, sc in zip(calib_samples, per_sample_scores)]

        logging.info(f"  - Calibration set size: {len(calib_data)} samples")
        if not s_values:
            raise ValueError("Cannot compute scores from calibration data.")

        logging.info(f"  - Nonconformity stats: min={min(s_values):.4f}, max={max(s_values):.4f}, mean={np.mean(s_values):.4f}")

        # conformal order statistic: ceil((1-alpha)(n+1))-th smallest, capped at max
        n = len(s_values)
        quantile_index = int(np.ceil((1.0 - alpha) * (n + 1))) - 1
        quantile_index = min(quantile_index, n - 1)

        sorted_s_values = np.sort(s_values)
        self.tau_hat = sorted_s_values[quantile_index]

        logging.info(f"  - Assigned tau_hat: {self.tau_hat:.4f}")
        return self

    def predict(self, data: List[dict]) -> Tuple[List[dict], List[float]]:
        """Filter each sample's claims within the calibrated budget.

        Removes the K lowest-scored claims whose cumulative cost fits in
        tau_hat; surviving claims go to sample['filtered_claims'].

        Returns:
            (filtered samples, per-sample retention rates).
        """
        if self.tau_hat is None: raise ValueError("Model is not calibrated.")
        raw_scores = self.score_function(data)
        per_sample_scores = self._process_raw_scores(raw_scores, data)
        samples = [d.get('sample', d) for d in data]

        filtered_data, retention_rates = [], []
        for sample, s_raw in zip(samples, per_sample_scores):
            atomic_facts = sample.get("atomic_facts", [])
            new_sample = dict(sample)  # shallow copy; original sample untouched
            if not atomic_facts or s_raw.size == 0:
                new_sample["filtered_claims"] = []
                # no claims at all counts as full retention; claims without scores as zero
                retention_rates.append(1.0 if not atomic_facts else 0.0)
            else:
                s_tmp = np.asarray(s_raw, dtype=float)
                s_tmp = np.nan_to_num(s_tmp, nan=0.0, posinf=1.0, neginf=0.0)
                s = np.clip(s_tmp, 0.0, 1.0 - self.eps)
                # sort claims by ascending score; low scores are removed first
                indexed_items = sorted(list(zip(s, atomic_facts)), key=lambda x: x[0])
                s_sorted_asc = np.array([item[0] for item in indexed_items])
                costs = -np.log(1.0 - s_sorted_asc)
                cumulative_costs = np.concatenate(([0.0], np.cumsum(costs)))
                possible_K_indices = np.where(cumulative_costs <= self.tau_hat)[0]
                K = int(possible_K_indices.max()) if possible_K_indices.size > 0 else 0
                # Boundary randomization: with probability proportional to leftover budget,
                # include one more boundary item (i.e., increase K by 1) if feasible.
                # This randomization reduces discretization bias at the threshold.
                if K < len(costs):
                    leftover = float(self.tau_hat - cumulative_costs[K])
                    next_cost = float(costs[K])  # cost of the (K)-th item in sorted order
                    if np.isfinite(next_cost) and next_cost > 0.0 and leftover > 0.0:
                        p = float(np.clip(leftover / next_cost, 0.0, 1.0))
                        if self._rng.uniform(0.0, 1.0) < p:
                            K = K + 1
                # keep everything from position K onward (the higher-scored claims)
                new_sample["filtered_claims"] = [item[1] for item in indexed_items[K:]]
                retention_rates.append(len(new_sample["filtered_claims"]) / len(atomic_facts))
            filtered_data.append(new_sample)
        return filtered_data, retention_rates
109
+
110
class SubgroupOptimizedMACI:
    """Subgroup-wise ensemble weighting + conformal calibration.

    Bins samples by a grouper-provided scalar, learns per-bin ensemble
    weights over several scoring models, and calibrates one
    MACIAdaptiveConformal threshold per bin. Prediction routes each sample
    to its bin's model (fallback weights when a bin has no trained model).
    """

    def __init__(self, model_names: List[str], grouper: Any, n_bins: int = 3, **kwargs):
        # kwargs are forwarded to MACIAdaptiveConformal (random_state, eps, ...)
        self.model_names, self.grouper, self.n_bins, self.kwargs = model_names, grouper, n_bins, kwargs
        self.weights, self.conformal_models = {}, {}
        self.fallback_weights, self.bin_edges = None, None
        self.bin_labels = ['low', 'medium', 'high'] if n_bins == 3 else [f'group_{i}' for i in range(n_bins)]
        # Timing accumulators
        self._timing: Dict[str, float] = {
            'weight_optimization_s': 0.0,
            'calibration_s': 0.0
        }

    def _get_subgroup_label(self, value: float) -> str:
        """Map a scalar grouper value to a bin label (non-finite -> first bin)."""
        if self.bin_edges is None or not np.isfinite(value):
            return self.bin_labels[0]
        bin_index = np.digitize(value, self.bin_edges)
        return self.bin_labels[min(bin_index, len(self.bin_labels) - 1)]

    def _group_data_by_bins(self, data: List[Dict], bin_edges: np.ndarray) -> Dict[str, List[Dict]]:
        """Partition entries by bin label.

        NOTE(review): the `bin_edges` argument is unused — labeling goes
        through self.bin_edges via _get_subgroup_label; kept for signature
        compatibility with in-file callers.
        """
        grouped_data = defaultdict(list)
        values = self.grouper.compute_values([d['sample'] for d in data])
        for item, value in zip(data, values):
            label = self._get_subgroup_label(value)
            grouped_data[label].append(item)
        return grouped_data

    def _learn_robust_weights_by_retention(self, training_data: List[Dict], target_tpr: float = 0.95) -> np.ndarray:
        """
        Stable convex program for learning ensemble weights on the probability simplex.

        Uses an epigraph reformulation with explicit nonnegative slack variables and
        Tikhonov regularization to improve numerical stability across solvers.
        """
        # Flatten per-claim labels and the per-model score vectors.
        all_scores, all_labels = [], []
        for entry in training_data:
            sample, scores_dict = entry.get('sample', {}), entry.get('scores', {})
            labels = [af.get("is_supported", False) for af in sample.get("atomic_facts", [])]
            scores_per_model = [scores_dict.get(m, []) for m in self.model_names]
            min_len = min(len(labels), *[len(s) for s in scores_per_model])
            if min_len == 0:
                continue
            for i in range(min_len):
                all_labels.append(labels[i])
                all_scores.append([s[i] for s in scores_per_model])

        if len(all_labels) < 2 or len(np.unique(all_labels)) < 2:
            logging.warning("Skipping weight optimization: insufficient or single-class labels.")
            return np.ones(len(self.model_names)) / len(self.model_names)

        scores_matrix = np.nan_to_num(np.array(all_scores, dtype=float))
        labels_array = np.array(all_labels, dtype=int)
        n_models = scores_matrix.shape[1]

        pos = scores_matrix[labels_array == 1]
        neg = scores_matrix[labels_array == 0]
        if pos.shape[0] == 0 or neg.shape[0] == 0:
            logging.warning("Skipping weight optimization: missing positive or negative samples.")
            return np.ones(len(self.model_names)) / len(self.model_names)

        # Up-weight confidently-scored negatives; rescale so the two classes
        # contribute comparable total weight.
        neg_proxy = np.mean(neg, axis=1)
        neg_w = np.clip(neg_proxy, 0.0, 1.0) ** 2
        neg_w = neg_w / (np.mean(neg_w) + 1e-12)

        pos_w = np.ones(pos.shape[0], dtype=float)
        sum_pos = np.sum(pos_w)
        sum_neg = np.sum(neg_w)
        if sum_pos > 0 and sum_neg > 0:
            scale = sum_pos / sum_neg
            neg_w = neg_w * scale

        # beta grows with the TPR target: missing positives costs more.
        alpha = 1.0
        beta = 5.0 * (target_tpr / max(1.0 - target_tpr, 1e-6))

        def solve_with(ridge: float, eps_w: float, solver_name: str) -> Optional[np.ndarray]:
            # One attempt of the convex program with given regularization and solver.
            try:
                w = cp.Variable(n_models)
                t = cp.Variable()
                slack_neg = cp.Variable(neg.shape[0], nonneg=True)
                slack_pos = cp.Variable(pos.shape[0], nonneg=True)

                constraints = [
                    neg @ w - t <= slack_neg,
                    t - pos @ w <= slack_pos,
                    w >= eps_w,
                    cp.sum(w) == 1,
                    t >= 0,
                    t <= 1
                ]
                objective = (
                    alpha * cp.sum(cp.multiply(neg_w, slack_neg)) +
                    beta * cp.sum(cp.multiply(pos_w, slack_pos)) +
                    ridge * cp.sum_squares(w)
                )
                prob = cp.Problem(cp.Minimize(objective), constraints)

                if solver_name == 'osqp':
                    prob.solve(solver=cp.OSQP, verbose=False, eps_abs=1e-6, eps_rel=1e-6, max_iter=20000, polishing=True, linsys_solver='qdldl')
                elif solver_name == 'ecos':
                    prob.solve(solver=cp.ECOS, verbose=False, max_iters=200000, abstol=1e-7, reltol=1e-7, feastol=1e-7)
                elif solver_name == 'scs':
                    prob.solve(solver=cp.SCS, verbose=False, max_iters=300000, eps=2e-5, acceleration_lookback=20)
                else:
                    return None

                if w.value is None:
                    return None

                # project back to the simplex (solvers can return tiny negatives)
                w_val = np.array(w.value, dtype=float).reshape(-1)
                if not np.all(np.isfinite(w_val)):
                    return None
                w_val = np.clip(w_val, 0.0, None)
                s = np.sum(w_val)
                if s <= 1e-12:
                    return None
                w_val = w_val / s
                logging.info("  - Weight optimization completed")
                return w_val
            except Exception as e:
                logging.debug(f"{solver_name.upper()} attempt failed (ridge={ridge}, eps_w={eps_w}): {e}")
                return None

        solver_order = []
        solver_pref = (self.kwargs or {}).get('solver', 'auto')
        if solver_pref in ('osqp', 'ecos', 'scs'):
            solver_order = [solver_pref] + [s for s in ('osqp', 'ecos', 'scs') if s != solver_pref]
        else:
            solver_order = ['osqp', 'ecos', 'scs']

        # Escalate regularization until some solver returns a finite simplex point.
        for ridge in (5e-3, 5e-2, 1e-1, 5e-1):
            for eps_w in (0.0, 1e-6, 1e-4):
                for slv in solver_order:
                    sol = solve_with(ridge=ridge, eps_w=eps_w, solver_name=slv)
                    if sol is not None:
                        return sol

        logging.warning("CVXPY solvers failed repeatedly; falling back to AUC-based SLSQP optimizer as last resort.")
        return self._learn_robust_weights(training_data)

    def _learn_robust_weights(self, training_data: List[Dict]) -> np.ndarray:
        """Fallback weight learner: maximize ensemble ROC-AUC via multi-start SLSQP."""
        all_scores, all_labels = [], []
        for entry in training_data:
            sample, scores_dict = entry.get('sample', {}), entry.get('scores', {})
            labels = [af.get("is_supported", False) for af in sample.get("atomic_facts", [])]
            if not all(m in scores_dict for m in self.model_names): continue
            scores_per_model = [scores_dict.get(m, []) for m in self.model_names]
            min_len = min(len(labels), *[len(s) for s in scores_per_model])
            if min_len == 0: continue
            for i in range(min_len):
                all_labels.append(labels[i])
                all_scores.append([s[i] for s in scores_per_model])

        if len(all_labels) < 2 or len(np.unique(all_labels)) < 2:
            return np.ones(len(self.model_names)) / len(self.model_names)

        scores_matrix = np.nan_to_num(np.array(all_scores, dtype=float))
        labels_array = np.array(all_labels, dtype=int)
        n_models = scores_matrix.shape[1]

        def objective_fn(weights: np.ndarray) -> float:
            # negative AUC of the weighted ensemble (minimized)
            w = weights / np.sum(weights) if np.sum(weights) > 0 else weights
            ensemble_scores = scores_matrix @ w
            try: return -roc_auc_score(labels_array, ensemble_scores)
            except ValueError: return 0.0

        best_score, best_weights = -1.0, np.ones(n_models) / n_models
        for _ in range(10):
            # NOTE(review): uses the global np.random state, not self.kwargs'
            # random_state — results are not reproducible across runs; confirm
            # whether seeding is desired before changing.
            w0 = np.random.dirichlet(np.ones(n_models))
            res = minimize(objective_fn, w0, method='SLSQP', bounds=[(0, 1)] * n_models, constraints=({'type': 'eq', 'fun': lambda w: np.sum(w) - 1.0}))
            if res.success and -res.fun > best_score:
                best_score, best_weights = -res.fun, res.x / np.sum(res.x)
        return best_weights

    def get_budgets(self):
        """Return {subgroup label: calibrated tau_hat}."""
        return {subgroup: model.tau_hat for subgroup, model in self.conformal_models.items()}

    def get_weights(self):
        """Return the learned weights, fallback weights, and bin definitions."""
        return {
            'subgroup_weights': self.weights,
            'fallback_weights': self.fallback_weights,
            'bin_edges': None if self.bin_edges is None else np.asarray(self.bin_edges).tolist(),
            'bin_labels': list(self.bin_labels) if self.bin_labels is not None else None,
        }

    def _compute_ensemble_scores(self, data: List[Dict], subgroup_label: str) -> List[np.ndarray]:
        """Weighted per-claim ensemble scores for `data` using the subgroup's weights."""
        subgroup_weights = self.weights.get(subgroup_label, self.fallback_weights)
        if subgroup_weights is None:
            raise RuntimeError(f"Weights not learned for subgroup '{subgroup_label}'.")

        final_scores = []
        for entry in data:
            scores_dict = entry.get('scores', {})
            scores_per_model = [scores_dict.get(m, []) for m in self.model_names]
            min_len = min(len(entry['sample']['atomic_facts']), *[len(s) for s in scores_per_model])
            if min_len == 0:
                final_scores.append(np.array([]))
            else:
                scores_matrix = np.array([np.nan_to_num(s[:min_len]) for s in scores_per_model]).T
                final_scores.append(scores_matrix @ subgroup_weights)
        return final_scores

    def fit(self, data: List[dict], alpha: float = 0.1, ensemble_train_ratio: float = 0.5, target_tpr: float = 0.95):
        """Learn subgroup-specific ensemble weights and conformal thresholds."""
        import time  # local import: only used for coarse timing accumulation

        random_state = self.kwargs.get("random_state")
        grouper_name = self.grouper.__class__.__name__
        logging.info(f"SubgroupOptimizedMACI training started (grouper: '{grouper_name}')")

        ensemble_train_data, calib_data = train_test_split(
            data,
            test_size=1.0 - ensemble_train_ratio,
            random_state=random_state
        )
        logging.info(f"  - Data split: ensemble training {len(ensemble_train_data)} / conformal calibration {len(calib_data)}")

        # Bin edges are learned from the training split only to avoid leakage.
        logging.info(f"  - Learning bin edges by '{grouper_name}' values...")
        train_values = self.grouper.compute_values([d['sample'] for d in ensemble_train_data])
        finite_train_values = train_values[np.isfinite(train_values)]
        quantiles = np.linspace(0, 1, self.n_bins + 1)[1:-1]
        self.bin_edges = np.quantile(finite_train_values, quantiles) if len(finite_train_values) > 0 else np.array([])
        logging.info(f"  - Learned bin edges: {self.bin_edges}")

        grouped_ensemble_data = self._group_data_by_bins(ensemble_train_data, self.bin_edges)
        grouped_calib_data = self._group_data_by_bins(calib_data, self.bin_edges)

        for label in self.bin_labels:
            logging.info(f"--- Processing group '{label}' ---")
            sub_ensemble_data = grouped_ensemble_data.get(label, [])
            sub_calib_data = grouped_calib_data.get(label, [])

            if not sub_ensemble_data or not sub_calib_data:
                logging.warning(f"Skipping group '{label}' due to insufficient data.")
                continue

            logging.info(f"  - Learning ensemble weights (n={len(sub_ensemble_data)})...")
            # IDIOM FIX: replaced inline __import__('time') calls with a
            # regular function-scope import of `time` above.
            _t0 = time.perf_counter()
            self.weights[label] = self._learn_robust_weights_by_retention(sub_ensemble_data, target_tpr=target_tpr)
            self._timing['weight_optimization_s'] += time.perf_counter() - _t0

            logging.info(f"  - Calibrating threshold (n={len(sub_calib_data)})...")
            # bind `label` as a default so the closure is safe to reuse later
            score_func = lambda data, l=label: self._compute_ensemble_scores(data, l)

            conformal_model = MACIAdaptiveConformal(score_function=score_func, **self.kwargs)
            _t1 = time.perf_counter()
            conformal_model.fit_on_calib(sub_calib_data, alpha)
            self._timing['calibration_s'] += time.perf_counter() - _t1
            self.conformal_models[label] = conformal_model

        logging.info("--- Training fallback model on all data ---")
        self.fallback_weights = self._learn_robust_weights_by_retention(ensemble_train_data, target_tpr=target_tpr)

        logging.info("✅ Training complete.")
        return self

    def get_timing(self) -> Dict[str, float]:
        """Return a copy of the accumulated timing statistics (seconds)."""
        return dict(self._timing)

    def predict(self, data: List[dict]) -> Tuple[List[dict], List[float]]:
        """Filter claims for each sample via its subgroup's calibrated model.

        Returns (filtered entries in the original order, retention rates).
        """
        if not self.conformal_models: raise ValueError("모델이 학습되지 않았습니다.")

        grouped_data_with_indices = defaultdict(list)
        values = self.grouper.compute_values([d['sample'] for d in data])
        for i, (item, value) in enumerate(zip(data, values)):
            label = self._get_subgroup_label(value)
            grouped_data_with_indices[label].append((i, item))

        # placeholders preserve the caller's ordering across subgroup batches
        results_placeholder = [None] * len(data)
        rates_placeholder = [None] * len(data)

        for label, indexed_subgroup_data in grouped_data_with_indices.items():
            if not indexed_subgroup_data: continue
            original_indices = [item[0] for item in indexed_subgroup_data]
            subgroup_data = [item[1] for item in indexed_subgroup_data]
            model = self.conformal_models.get(label)

            if model:
                logging.info(f"  - Predicting for group '{label}' (n={len(subgroup_data)})...")
                predicted_samples, rates = model.predict(subgroup_data)

                for i, original_item, predicted_sample, rate in zip(original_indices, subgroup_data, predicted_samples, rates):
                    new_result_item = original_item.copy()
                    new_result_item['sample'] = predicted_sample
                    results_placeholder[i] = new_result_item
                    rates_placeholder[i] = rate
            else:
                # No calibrated model for this bin: predict with fallback
                # weights and a zero budget (keeps all claims).
                logging.warning(f"No trained model for group '{label}'. Using fallback weights for prediction.")
                fallback_score_func = lambda data_list, l=label: self._compute_ensemble_scores(data_list, l)
                fallback_model = MACIAdaptiveConformal(score_function=fallback_score_func, **self.kwargs)
                fallback_model.tau_hat = 0.0
                predicted_samples, rates = fallback_model.predict(subgroup_data)
                for i, original_item, predicted_sample, rate in zip(original_indices, subgroup_data, predicted_samples, rates):
                    new_result_item = original_item.copy()
                    new_result_item['sample'] = predicted_sample
                    results_placeholder[i] = new_result_item
                    rates_placeholder[i] = rate

        return results_placeholder, rates_placeholder
MACI-main/conformal/basic_conformal.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Basic Conformal Implementation for Factuality Assessment
3
+
4
+ This module implements a basic conformal prediction method for assessing
5
+ the factuality of generated text by filtering claims based on conformity scores.
6
+ """
7
+
8
+ import numpy as np
9
+ from typing import List, Tuple, Optional, Callable
10
+
11
+
12
+ class BasicConformal:
13
+ def __init__(
14
+ self,
15
+ score_function: Callable,
16
+ random_state: Optional[int] = None
17
+ ):
18
+ self.score_function = score_function
19
+ self.random_state = random_state
20
+ self.calibration_scores = None
21
+ self.threshold = None
22
+ self._rng = np.random.default_rng(random_state)
23
+ self._tie_gamma_keep: float = 1.0
24
+
25
+ def fit_on_calib(self, calib_data: List, alpha: float = 0.1) -> 'BasicConformal':
26
+ if not 0 < alpha < 1:
27
+ raise ValueError("alpha must be between 0 and 1")
28
+
29
+ raw_scores = self.score_function(calib_data)
30
+ per_sample_scores: List[List[float]] = []
31
+ if len(raw_scores) == len(calib_data) and hasattr(raw_scores[0], "__iter__") and not isinstance(raw_scores[0], (str, bytes)):
32
+ for i, sample in enumerate(calib_data):
33
+ if 'atomic_facts' in sample:
34
+ s_i = np.asarray(list(raw_scores[i]), dtype=float)
35
+ else:
36
+ s_i = np.asarray([float(raw_scores[i])], dtype=float)
37
+ s_i = np.where(np.isnan(s_i), -np.inf, s_i)
38
+ per_sample_scores.append(s_i.tolist())
39
+ else:
40
+ if len(raw_scores) != len(calib_data):
41
+ raise ValueError("score_function must return one score per sample or a per-claim score list per sample")
42
+ for i, sample in enumerate(calib_data):
43
+ if 'atomic_facts' in sample and len(sample['atomic_facts']) > 0:
44
+ s_i = np.asarray([float(raw_scores[i])] * len(sample['atomic_facts']), dtype=float)
45
+ else:
46
+ s_i = np.asarray([float(raw_scores[i])], dtype=float)
47
+ s_i = np.where(np.isnan(s_i), -np.inf, s_i)
48
+ per_sample_scores.append(s_i.tolist())
49
+
50
+ S_values: List[float] = []
51
+ for sample, scores_i in zip(calib_data, per_sample_scores):
52
+ if 'atomic_facts' in sample and len(sample['atomic_facts']) > 0:
53
+ false_scores = [s for s, fact in zip(scores_i, sample['atomic_facts']) if not fact.get('is_supported', False)]
54
+ if len(false_scores) == 0:
55
+ S_values.append(float('-inf'))
56
+ else:
57
+ vals = np.asarray(false_scores, dtype=float)
58
+ S_values.append(float(np.nanmax(vals)) if vals.size > 0 else float('-inf'))
59
+ else:
60
+ vals = np.asarray(scores_i, dtype=float)
61
+ if vals.size == 0:
62
+ S_values.append(float('-inf'))
63
+ else:
64
+ S_values.append(float(np.nanmax(vals)))
65
+
66
+ self.calibration_scores = np.array(S_values, dtype=float)
67
+ n = len(self.calibration_scores)
68
+ if n == 0:
69
+ raise ValueError("No calibration samples available to compute threshold")
70
+ quantile = 1 - alpha
71
+ try:
72
+ self.threshold = np.quantile(self.calibration_scores, quantile, method='higher')
73
+ except TypeError:
74
+ self.threshold = np.quantile(self.calibration_scores, quantile)
75
+
76
+ sorted_scores = np.sort(self.calibration_scores)
77
+ k = int(np.ceil((1.0 - alpha) * (n + 1))) - 1
78
+ k = min(max(k, 0), n - 1)
79
+ t_star = float(sorted_scores[k])
80
+ n_lt = int(np.sum(self.calibration_scores < t_star))
81
+ n_eq = int(np.sum(np.isclose(self.calibration_scores, t_star)))
82
+ if n_eq <= 0:
83
+ gamma_standard = 0.0
84
+ else:
85
+ gamma_standard = ((1.0 - alpha) * (n + 1) - n_lt) / n_eq
86
+ gamma_standard = float(np.clip(gamma_standard, 0.0, 1.0))
87
+ self._tie_gamma_keep = 1.0 - gamma_standard
88
+ return self
89
+
90
+ def predict(self, data: List) -> Tuple[List, List]:
91
+ if self.threshold is None:
92
+ raise ValueError("Model must be fitted before prediction")
93
+ raw_scores = self.score_function(data)
94
+ per_sample_scores: List[List[float]] = []
95
+ if len(raw_scores) == len(data) and hasattr(raw_scores[0], "__iter__") and not isinstance(raw_scores[0], (str, bytes)):
96
+ for i, sample in enumerate(data):
97
+ if 'atomic_facts' in sample:
98
+ s_i = np.asarray(list(raw_scores[i]), dtype=float)
99
+ else:
100
+ s_i = np.asarray([float(raw_scores[i])], dtype=float)
101
+ s_i = np.where(np.isnan(s_i), -np.inf, s_i)
102
+ per_sample_scores.append(s_i.tolist())
103
+ else:
104
+ if len(raw_scores) != len(data):
105
+ raise ValueError("score_function must return one score per sample or per-claim score lists per sample")
106
+ for i, sample in enumerate(data):
107
+ if 'atomic_facts' in sample and len(sample['atomic_facts']) > 0:
108
+ s_i = np.asarray([float(raw_scores[i])] * len(sample['atomic_facts']), dtype=float)
109
+ else:
110
+ s_i = np.asarray([float(raw_scores[i])], dtype=float)
111
+ s_i = np.where(np.isnan(s_i), -np.inf, s_i)
112
+ per_sample_scores.append(s_i.tolist())
113
+
114
+ filtered_data: List = []
115
+ retention_rates: List[float] = []
116
+ for sample, scores_i in zip(data, per_sample_scores):
117
+ if 'atomic_facts' in sample and len(sample['atomic_facts']) > 0:
118
+ filtered_claims = []
119
+ for claim, s in zip(sample['atomic_facts'], scores_i):
120
+ if s > self.threshold:
121
+ filtered_claims.append(claim)
122
+ elif np.isclose(s, self.threshold):
123
+ if self._rng.uniform() < self._tie_gamma_keep:
124
+ filtered_claims.append(claim)
125
+ sample = dict(sample)
126
+ sample['filtered_claims'] = filtered_claims
127
+ retention_rate = len(filtered_claims) / len(sample['atomic_facts'])
128
+ elif 'atomic_facts' in sample and len(sample['atomic_facts']) == 0:
129
+ sample = dict(sample)
130
+ sample['filtered_claims'] = []
131
+ retention_rate = 0.0
132
+ else:
133
+ sample = dict(sample)
134
+ if len(scores_i) == 0:
135
+ sample['is_retained'] = False
136
+ retention_rate = 0.0
137
+ else:
138
+ s = float(scores_i[0])
139
+ sample['is_retained'] = (s > self.threshold) or (np.isclose(s, self.threshold) and self._rng.uniform() < self._tie_gamma_keep)
140
+ retention_rate = 1.0 if sample['is_retained'] else 0.0
141
+ filtered_data.append(sample)
142
+ retention_rates.append(retention_rate)
143
+ return filtered_data, retention_rates
144
+
145
+ def get_coverage(self, data: List) -> float:
146
+ if self.threshold is None:
147
+ raise ValueError("Model must be fitted before computing coverage")
148
+ raw_scores = self.score_function(data)
149
+ per_sample_scores: List[List[float]] = []
150
+ if len(raw_scores) == len(data) and hasattr(raw_scores[0], "__iter__") and not isinstance(raw_scores[0], (str, bytes)):
151
+ for i, sample in enumerate(data):
152
+ if 'atomic_facts' in sample:
153
+ s_i = np.asarray(list(raw_scores[i]), dtype=float)
154
+ else:
155
+ s_i = np.asarray([float(raw_scores[i])], dtype=float)
156
+ s_i = np.where(np.isnan(s_i), -np.inf, s_i)
157
+ per_sample_scores.append(s_i.tolist())
158
+ else:
159
+ if len(raw_scores) != len(data):
160
+ raise ValueError("score_function must return one score per sample or per-claim score lists per sample")
161
+ for i, sample in enumerate(data):
162
+ if 'atomic_facts' in sample and len(sample['atomic_facts']) > 0:
163
+ s_i = np.asarray([float(raw_scores[i])] * len(sample['atomic_facts']), dtype=float)
164
+ else:
165
+ s_i = np.asarray([float(raw_scores[i])], dtype=float)
166
+ s_i = np.where(np.isnan(s_i), -np.inf, s_i)
167
+ per_sample_scores.append(s_i.tolist())
168
+ indicators = []
169
+ for sample, scores_i in zip(data, per_sample_scores):
170
+ if 'atomic_facts' in sample and len(sample['atomic_facts']) > 0:
171
+ false_scores = [s for s, fact in zip(scores_i, sample['atomic_facts']) if not fact.get('is_supported', False)]
172
+ if len(false_scores) == 0:
173
+ indicators.append(1.0)
174
+ else:
175
+ vals = np.asarray(false_scores, dtype=float)
176
+ max_false = float(np.nanmax(vals)) if vals.size > 0 else float('-inf')
177
+ indicators.append(1.0 if max_false <= self.threshold else 0.0)
178
+ else:
179
+ vals = np.asarray(scores_i, dtype=float)
180
+ if vals.size == 0:
181
+ indicators.append(1.0)
182
+ else:
183
+ indicators.append(1.0 if float(np.nanmax(vals)) <= self.threshold else 0.0)
184
+ return float(np.mean(indicators))
185
+
186
+ def get_threshold(self) -> float:
187
+ return self.threshold
188
+
189
+
MACI-main/conformal/conditional_conformal.py ADDED
@@ -0,0 +1,489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from typing import List, Tuple, Optional, Callable
3
+ import torch
4
+ from scipy.optimize import linprog
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.metrics import roc_auc_score, roc_curve
7
+ from functools import lru_cache
8
+ import sys
9
+ import os
10
+
11
+ # Add conditional-conformal path to Python path (local vendor copy) using repo-relative path
12
+ repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
13
+ vendor_path = os.path.join(repo_root, 'conditional-conformal', 'src')
14
+ if vendor_path not in sys.path:
15
+ sys.path.append(vendor_path)
16
+ from conditionalconformal import CondConf
17
+
18
+ # ==============================================================================
19
+ # === Step 1: Classes and Helper Functions for Boosting ===
20
+ # ==============================================================================
21
+
22
+ def as_tensor(x, dtype, requires_grad=False):
23
+ return torch.tensor(x, dtype=dtype, requires_grad=requires_grad)
24
+
25
+ def get_current_basis(primals, duals, Phi, S, quantile):
26
+ """Helper function to find a stable basis from LP solution"""
27
+ interp_bools = np.logical_and(~np.isclose(duals, quantile - 1), ~np.isclose(duals, quantile))
28
+ if np.sum(interp_bools) == Phi.shape[1]:
29
+ return interp_bools
30
+ preds = (Phi @ primals).flatten()
31
+ active_indices = np.where(interp_bools)[0]
32
+ interp_indices = np.argsort(np.abs(S - preds))[:Phi.shape[1]]
33
+ diff_indices = np.setdiff1d(interp_indices, active_indices)
34
+ num_missing = Phi.shape[1] - np.sum(interp_bools)
35
+
36
+ if num_missing < len(diff_indices):
37
+ from itertools import combinations
38
+ for cand_indices in combinations(diff_indices, num_missing):
39
+ cand_phi = Phi[np.concatenate((active_indices, cand_indices))]
40
+ if np.isfinite(np.linalg.cond(cand_phi)):
41
+ interp_bools[np.asarray(cand_indices)] = True
42
+ break
43
+ else:
44
+ interp_bools[diff_indices] = True
45
+ return interp_bools
46
+
47
+ def _choose_full_rank_rows(Phi: np.ndarray) -> np.ndarray:
48
+ """Greedy row selection for full-rank basis"""
49
+ d = Phi.shape[1]
50
+ chosen = []
51
+ cur = np.empty((0, d))
52
+ for i in range(Phi.shape[0]):
53
+ cand = np.vstack([cur, Phi[i:i+1]])
54
+ if np.linalg.matrix_rank(cand) > np.linalg.matrix_rank(cur):
55
+ chosen.append(i)
56
+ cur = cand
57
+ if len(chosen) == d:
58
+ break
59
+ if len(chosen) < d:
60
+ chosen = list(range(Phi.shape[0]-d, Phi.shape[0]))
61
+ return np.asarray(chosen, dtype=int)
62
+
63
+ def solve_qr_for_boosting(Phi: np.ndarray, s: torch.Tensor, q: float, dtype: torch.dtype) -> torch.Tensor:
64
+ """Differentiable tau calculation function for boosting - robust fallback included"""
65
+ S_np = s.detach().cpu().numpy().reshape(-1)
66
+ assert Phi.shape[0] == S_np.shape[0], "Phi rows must match len(s)"
67
+ assert 0.0 < q < 1.0, "q must be in (0,1)"
68
+
69
+ b_eq = np.zeros(Phi.shape[1])
70
+ bounds = [(q - 1.0, q)] * len(S_np)
71
+
72
+ res = None
73
+ try:
74
+ res = linprog(-S_np, A_eq=Phi.T, b_eq=b_eq, bounds=bounds, method='highs')
75
+ except Exception:
76
+ res = None
77
+
78
+ tau_initial = None
79
+ duals = None
80
+ if res is not None and getattr(res, "success", False):
81
+ marg = None
82
+ if hasattr(res, "eqlin") and res.eqlin is not None and hasattr(res.eqlin, "marginals") and res.eqlin.marginals is not None:
83
+ marg = res.eqlin.marginals
84
+ elif hasattr(res, "dual_eq") and res.dual_eq is not None:
85
+ marg = res.dual_eq
86
+
87
+ if marg is not None:
88
+ tau_initial = -np.asarray(marg, dtype=float)
89
+ if hasattr(res, "x") and res.x is not None:
90
+ duals = np.asarray(res.x, dtype=float)
91
+
92
+ try:
93
+ if tau_initial is not None and duals is not None:
94
+ basis_mask = get_current_basis(tau_initial, duals, Phi, S_np, q)
95
+ basis_idx = np.where(basis_mask)[0]
96
+ if basis_idx.size != Phi.shape[1]:
97
+ basis_idx = _choose_full_rank_rows(Phi)
98
+ else:
99
+ basis_idx = _choose_full_rank_rows(Phi)
100
+
101
+ Phi_basis = Phi[basis_idx]
102
+ s_basis = s[basis_idx]
103
+
104
+ tau_sol = torch.linalg.lstsq(as_tensor(Phi_basis, dtype), s_basis).solution
105
+ tau = tau_sol
106
+ except Exception:
107
+ tau = torch.zeros((Phi.shape[1],), dtype=dtype)
108
+
109
+ return tau.reshape(-1, 1)
110
+
111
+ def torch_score_func_sample_level(features: List[np.ndarray], annotations: List[np.ndarray], beta: torch.Tensor) -> torch.Tensor:
112
+ """sample-level score (max_false_score) calculation"""
113
+ scores = as_tensor(np.zeros((len(features),)), dtype=beta.dtype)
114
+ for i, (f, a) in enumerate(zip(features, annotations)):
115
+ cs = -as_tensor(f, dtype=beta.dtype) @ beta
116
+ at = as_tensor(a, dtype=torch.bool)
117
+ scores[i] = torch.sort(cs[~at], descending=True)[0][0] if torch.sum(~at) > 0 else torch.tensor(1e9, dtype=beta.dtype)
118
+ return scores
119
+
120
+ def cond_score_loss(beta: torch.Tensor, dataset: Tuple, z_processed: np.ndarray, random_seed: int, q: float) -> torch.Tensor:
121
+ """Claim-level loss function for boosting"""
122
+ indices = np.arange(len(dataset[0]))
123
+ ind_train, ind_calib = train_test_split(indices, test_size=0.5, random_state=random_seed)
124
+
125
+ x_train, y_train = [dataset[0][i] for i in ind_train], [dataset[1][i] for i in ind_train]
126
+ x_calib, y_calib = [dataset[0][i] for i in ind_calib], [dataset[1][i] for i in ind_calib]
127
+ z_train, z_calib = z_processed[ind_train], z_processed[ind_calib]
128
+
129
+ scores_train_sample = torch_score_func_sample_level(x_train, y_train, beta)
130
+ tau = solve_qr_for_boosting(z_train, scores_train_sample, q, beta.dtype)
131
+
132
+ cutoffs = (as_tensor(z_calib, dtype=beta.dtype) @ tau).flatten()
133
+
134
+ total_loss = torch.tensor(0.0, dtype=beta.dtype, requires_grad=True)
135
+ count = 0
136
+ for i, (f_c, a_c) in enumerate(zip(x_calib, y_calib)):
137
+ claim_scores = -(as_tensor(f_c, dtype=beta.dtype) @ beta)
138
+ perc = torch.sigmoid(cutoffs[i] - claim_scores)
139
+ total_loss = total_loss + torch.mean(perc)
140
+ count += 1
141
+
142
+ total_loss = total_loss / count if count > 0 else total_loss
143
+ return -total_loss
144
+
145
+ class ConditionalConformalBoosting:
146
+ def __init__(self, random_state: int = 0):
147
+ self.rng = np.random.default_rng(random_state)
148
+ self.beta: Optional[np.ndarray] = None
149
+ self.z_projector: Optional[np.ndarray] = None
150
+
151
+ def _extract_features_for_boosting(self, data: List[dict]) -> Tuple[List[np.ndarray], np.ndarray, List[np.ndarray]]:
152
+ basic_features = [d['features_4d'] for d in data]
153
+ annotations = [d['annotations'] for d in data]
154
+ conditional_features = []
155
+ for d in data:
156
+ sample = d.get('sample', {})
157
+ scores_dict = d.get('scores', {})
158
+ base_features = d.get('prompt_features', [])
159
+ logprob_scores = scores_dict.get('logprob', np.array([]))
160
+ logprob_mean = np.mean(logprob_scores) if logprob_scores.size > 0 else 0.0
161
+ logprob_std = np.std(logprob_scores) if logprob_scores.size > 1 else 0.0
162
+ claim_count = len(sample.get('atomic_facts', []))
163
+ combined_features = np.concatenate([base_features, [logprob_mean, logprob_std, claim_count]])
164
+ conditional_features.append(combined_features)
165
+ z = np.array(conditional_features, dtype=float)
166
+ if not np.isfinite(z).all():
167
+ z = np.nan_to_num(z, nan=np.nanmean(z, axis=0))
168
+
169
+ return basic_features, z, annotations
170
+
171
+ def _preprocess_z(self, z: np.ndarray) -> np.ndarray:
172
+ intercept = np.ones((z.shape[0], 1))
173
+ z_aug = np.hstack([z, intercept])
174
+ try:
175
+ _, s, Vt = np.linalg.svd(z_aug, full_matrices=False)
176
+ rank = np.sum(s > 1e-10)
177
+ self.z_projector = Vt.T[:, :rank]
178
+ except np.linalg.LinAlgError:
179
+ self.z_projector = np.eye(z_aug.shape[1])
180
+ return z_aug @ self.z_projector
181
+
182
+ def fit(self, data: List[dict], alpha: float = 0.1, boosting_epochs: int = 1000, boosting_lr: float = 0.005) -> np.ndarray:
183
+ basic_features, z, annotations = self._extract_features_for_boosting(data)
184
+ dataset_boost = (basic_features, annotations)
185
+ z_processed = self._preprocess_z(z)
186
+
187
+
188
+ feature_dim = basic_features[0].shape[1]
189
+ beta_tensor = torch.tensor([0.25] * feature_dim, dtype=torch.float, requires_grad=True)
190
+ optimizer = torch.optim.Adam([beta_tensor], lr=boosting_lr)
191
+
192
+ for epoch in range(boosting_epochs):
193
+ optimizer.zero_grad()
194
+ seed_epoch = self.rng.integers(1e7)
195
+ loss = cond_score_loss(beta_tensor, dataset_boost, z_processed, seed_epoch, q=1 - alpha)
196
+ if torch.isnan(loss) or torch.isinf(loss): break
197
+ loss.backward()
198
+ if beta_tensor.grad is not None and torch.isfinite(beta_tensor.grad).all():
199
+ optimizer.step()
200
+
201
+ self.beta = beta_tensor.detach().cpu().numpy()
202
+ #
203
+ return self.beta
204
+
205
+ # ==============================================================================
206
+ # === Step 2: Classes and Helper Functions for Calibration and Prediction ===
207
+ # ==============================================================================
208
+
209
+
210
+ class ConditionalConformalInference:
211
+ def __init__(self, random_state: int = 0):
212
+ self.rng = np.random.default_rng(random_state)
213
+ self.alpha: Optional[float] = None
214
+ self.beta: Optional[np.ndarray] = None
215
+ self.model: Optional[CondConf] = None
216
+ # Adaptive alpha components
217
+ self.adaptive_enabled: bool = False
218
+ self.retention_target: Optional[float] = None
219
+ self.quantile_theta: Optional[np.ndarray] = None # parameters for linear quantile_fn
220
+ self._z_proj_for_quantile: Optional[np.ndarray] = None # projector used for z in quantile fit
221
+
222
+ def _make_z_only(self, data: List[dict]) -> np.ndarray:
223
+ """z generation - same structure as boosting: [prompt_features..., logprob_mean, logprob_std, claim_count]"""
224
+ max_base_len = 0
225
+ for d in data:
226
+ base = d.get('prompt_features', np.array([]))
227
+ try:
228
+ base_len = int(np.asarray(base).size)
229
+ except Exception:
230
+ base_len = 0
231
+ if base_len > max_base_len:
232
+ max_base_len = base_len
233
+
234
+ cond_feats: List[np.ndarray] = []
235
+ for d in data:
236
+ sample = d.get('sample', {})
237
+ scores_dict = d.get('scores', {})
238
+
239
+ base = np.asarray(d.get('prompt_features', np.array([])), dtype=float).ravel()
240
+ if base.size < max_base_len:
241
+ pad = np.zeros(max_base_len - base.size, dtype=float)
242
+ base = np.concatenate([base, pad])
243
+ elif base.size > max_base_len and max_base_len > 0:
244
+ base = base[:max_base_len]
245
+
246
+ logprob_scores = np.asarray(scores_dict.get('logprob', np.array([])), dtype=float).ravel()
247
+ logprob_mean = float(np.mean(logprob_scores)) if logprob_scores.size > 0 else 0.0
248
+ logprob_std = float(np.std(logprob_scores)) if logprob_scores.size > 1 else 0.0
249
+
250
+ claim_count = float(len(sample.get('atomic_facts', [])))
251
+
252
+ combined = np.concatenate([base, np.array([logprob_mean, logprob_std, claim_count], dtype=float)])
253
+ cond_feats.append(combined)
254
+
255
+ result = np.asarray(cond_feats, dtype=float)
256
+ return result
257
+
258
+ def _make_yz_for_calib(self, data: List[dict], beta: np.ndarray, eps: float = 0.0):
259
+ z = self._make_z_only(data)
260
+ y_list = []
261
+ for d in data:
262
+ feats = d['features_4d']
263
+ ann = np.asarray(d['annotations'], dtype=bool)
264
+ s = -(feats @ beta)
265
+ false_s = s[~ann]
266
+ if false_s.size > 0:
267
+ y_list.append(np.min(false_s) - eps)
268
+ else:
269
+ y_list.append((np.max(s) if s.size > 0 else 0.0))
270
+ y = np.asarray(y_list, dtype=float)
271
+ mask = np.isfinite(y)
272
+ return y[mask], z[mask], mask
273
+
274
+ def fit(self, calib_data: List[dict], alpha: float, beta: np.ndarray,
275
+ adaptive_alpha: bool = False, retention_target: float = 0.7):
276
+ """Set up and calibrate CondConf model"""
277
+
278
+ self.alpha = alpha
279
+ self.beta = beta
280
+ self.adaptive_enabled = bool(adaptive_alpha)
281
+ self.retention_target = float(retention_target) if adaptive_alpha else None
282
+ if not self.adaptive_enabled:
283
+ self.quantile_theta = None
284
+
285
+
286
+ y_calib, z_calib, mask = self._make_yz_for_calib(calib_data, beta)
287
+ self._last_calib_mask = mask
288
+
289
+ self.model = CondConf(score_fn=lambda x, y: y, Phi_fn=lambda x: x, seed=self.rng.integers(1e6))
290
+ self.model.setup_problem(x_calib=z_calib, y_calib=y_calib)
291
+
292
+
293
+ if self.adaptive_enabled:
294
+ try:
295
+ self._fit_adaptive_quantile_fn(calib_data, z_calib, mask)
296
+
297
+ except Exception as e:
298
+
299
+ self.adaptive_enabled = False
300
+ return self
301
+
302
+ def predict(self, test_data: List[dict]) -> List[dict]:
303
+ if not self.model or self.beta is None:
304
+ raise RuntimeError("Model is not fitted. Call fit() first.")
305
+ z_test = self._make_z_only(test_data)
306
+ out = []
307
+
308
+ for i, d in enumerate(test_data):
309
+ sample = dict(d.get('sample', {}))
310
+ claims = sample.get('atomic_facts', [])
311
+ if not claims:
312
+ sample['filtered_claims'] = []
313
+ out.append(sample)
314
+ continue
315
+
316
+ feats = d['features_4d']
317
+ scores = -(feats @ self.beta)
318
+ z_i = z_test[i:i+1]
319
+
320
+ get_threshold_fn = lambda threshold, x: threshold
321
+
322
+ try:
323
+ if self.adaptive_enabled and self.quantile_theta is not None:
324
+ q_i = float(self._quantile_fn(z_i))
325
+ else:
326
+ q_i = float(self.alpha)
327
+
328
+ thr = self.model.predict(
329
+ quantile=q_i,
330
+ x_test=z_i,
331
+ score_inv_fn=get_threshold_fn,
332
+ randomize=True,
333
+ exact=True
334
+ )
335
+ thr = float(np.squeeze(thr))
336
+ s_min = float(np.min(scores)) if scores.size > 0 else -np.inf
337
+ s_max = float(np.max(scores)) if scores.size > 0 else np.inf
338
+ if not np.isfinite(thr):
339
+ thr = s_max
340
+ else:
341
+ thr = float(np.clip(thr, s_min, s_max))
342
+ sample['filtered_claims'] = [c for j, c in enumerate(claims) if scores[j] <= thr]
343
+ except Exception:
344
+ sample['filtered_claims'] = []
345
+
346
+ out.append(sample)
347
+
348
+ return out
349
+
350
+ # ------------------------------------------------------------------
351
+ # Adaptive alpha utilities
352
+ # ------------------------------------------------------------------
353
+ def _get_claim_scores_list(self, data: List[dict], beta: np.ndarray) -> List[np.ndarray]:
354
+ scores_list = []
355
+ for d in data:
356
+ feats = d['features_4d']
357
+ s = -(feats @ beta)
358
+ scores_list.append(s)
359
+ return scores_list
360
+
361
+ def _compute_retention_given_threshold(self, claim_scores: np.ndarray, threshold: float) -> float:
362
+ if claim_scores.size == 0:
363
+ return 0.0
364
+ return float(np.mean(claim_scores <= threshold))
365
+
366
+ def _fit_adaptive_quantile_fn(self, calib_data: List[dict], z_calib: np.ndarray, mask: np.ndarray):
367
+ assert self.model is not None and self.beta is not None and self.retention_target is not None
368
+
369
+ calib_data_masked = [calib_data[i] for i, m in enumerate(mask) if m]
370
+ claim_scores_list = self._get_claim_scores_list(calib_data_masked, self.beta)
371
+ quantile_grid = np.linspace(0.01, 0.99, 31)
372
+ q_star = np.zeros(len(z_calib), dtype=float)
373
+ for i in range(len(z_calib)):
374
+ z_i = z_calib[i:i+1]
375
+ best_q = None
376
+ best_r = -1.0
377
+ best_q_near = None
378
+ for q in quantile_grid:
379
+ try:
380
+ cutoff = self.model.predict(
381
+ quantile=float(q),
382
+ x_test=z_i,
383
+ score_inv_fn=lambda c, x: c,
384
+ randomize=True,
385
+ exact=True
386
+ )
387
+ T = float(np.asarray(cutoff).reshape(-1)[0])
388
+ except Exception:
389
+ continue
390
+ if not np.isfinite(T):
391
+ continue
392
+ r = self._compute_retention_given_threshold(claim_scores_list[i], T)
393
+ if r >= self.retention_target:
394
+ best_q = float(q)
395
+ break
396
+ if r > best_r:
397
+ best_r = r
398
+ best_q_near = float(q)
399
+ q_star[i] = float(best_q if best_q is not None else (best_q_near if best_q_near is not None else quantile_grid[-1]))
400
+
401
+ def phi_alpha(x: np.ndarray) -> np.ndarray:
402
+ x = np.asarray(x)
403
+ ones = np.ones((x.shape[0], 1))
404
+ return np.concatenate([ones, x, x**2], axis=1)
405
+
406
+ Phi = phi_alpha(z_calib)
407
+ ridge = 1e-6
408
+ theta = np.linalg.pinv(Phi.T @ Phi + ridge * np.eye(Phi.shape[1])) @ (Phi.T @ q_star)
409
+ self.quantile_theta = theta
410
+ self._z_proj_for_quantile = None
411
+
412
+ def _quantile_fn(self, z_row: np.ndarray) -> float:
413
+ """Given single-row z (1 x d), return clipped quantile using phi_alpha (1, z, z^2)."""
414
+ assert self.quantile_theta is not None
415
+ z = np.asarray(z_row)
416
+ phi = np.concatenate([np.ones((z.shape[0], 1)), z, z**2], axis=1)
417
+ q = float(phi @ self.quantile_theta)
418
+ return float(np.clip(q, 0.01, 0.99))
419
+
420
+
421
+ def evaluate_auroc(self, test_data: List[dict]) -> dict:
422
+ if not self.model or self.beta is None:
423
+ raise RuntimeError("Model is not fitted. Call fit() first.")
424
+ all_scores = []
425
+ all_labels = []
426
+
427
+ for sample_data in test_data:
428
+ features = sample_data['features_4d']
429
+ annotations = np.array(sample_data['annotations'])
430
+
431
+ nonconformity_scores = -features @ self.beta
432
+
433
+ all_scores.extend(nonconformity_scores)
434
+ all_labels.extend((~annotations.astype(bool)).astype(int))
435
+
436
+ all_scores = np.array(all_scores)
437
+ all_labels = np.array(all_labels)
438
+
439
+ try:
440
+ auroc = roc_auc_score(all_labels, all_scores)
441
+ fpr, tpr, thresholds = roc_curve(all_labels, all_scores)
442
+
443
+ results = {
444
+ 'auroc': auroc,
445
+ 'fpr': fpr,
446
+ 'tpr': tpr,
447
+ 'thresholds': thresholds,
448
+ 'n_samples': len(all_scores),
449
+ 'n_false_claims': np.sum(all_labels),
450
+ 'n_true_claims': len(all_labels) - np.sum(all_labels)
451
+ }
452
+
453
+
454
+
455
+ return results
456
+
457
+ except ValueError as e:
458
+
459
+ return {
460
+ 'auroc': np.nan,
461
+ 'error': str(e),
462
+ 'n_samples': len(all_scores),
463
+ 'n_false_claims': np.sum(all_labels),
464
+ 'n_true_claims': len(all_labels) - np.sum(all_labels)
465
+ }
466
+
467
+ def get_claim_scores(self, test_data: List[dict]) -> List[dict]:
468
+ """Return claim-level scores for each sample"""
469
+ if not self.model or self.beta is None:
470
+ raise RuntimeError("Model is not fitted. Call fit() first.")
471
+
472
+ results = []
473
+ for sample_data in test_data:
474
+ features = sample_data['features_4d']
475
+ annotations = np.array(sample_data['annotations'])
476
+ claims = sample_data.get('sample', {}).get('atomic_facts', [])
477
+
478
+ nonconformity_scores = -features @ self.beta
479
+
480
+ sample_result = {
481
+ 'sample_id': sample_data.get('sample_id', 'unknown'),
482
+ 'claims': claims,
483
+ 'nonconformity_scores': nonconformity_scores.tolist(),
484
+ 'annotations': annotations.tolist(),
485
+ 'is_false': (~annotations.astype(bool)).tolist()
486
+ }
487
+ results.append(sample_result)
488
+
489
+ return results
MACI-main/data/med_scores/medlfqa_frequencies.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75c946c6772d18c650b4484bd743032f861a1af4819ded8014cbd5a3b7102857
3
+ size 2225374
MACI-main/data/med_scores/medlfqa_logprobs.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:032e80b7aa0c1c73343aec30aca91c128fa9a3fa076333a07a601ba3495b1bd7
3
+ size 2199362
MACI-main/data/med_scores/medlfqa_scores_deepseek_deepseek-chat-v3-0324.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dee348e1265e4f01029224c070ab7b75d6fdbf51b8f4705828c378430c97e38
3
+ size 426183
MACI-main/data/med_scores/medlfqa_scores_meta-llama_llama-3.3-70b-instruct.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12eebea7f09ff94aaba4944f6d3ccf3e3ad10e33cd154a6cc274218f2709f1bd
3
+ size 426183
MACI-main/data/med_scores/medlfqa_scores_qwen_qwen-2.5-72b-instruct.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67fe74653e69c421202cc4da4308f262657349a5bdc10bf65c43f083d92499e8
3
+ size 426183
MACI-main/data/med_scores/medlfqa_selfevals.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58bc690d686c367019bb016b68723747f5a311b3d56f35249c8ee36a61b77878
3
+ size 2226438
MACI-main/data/wiki_scores/wikibio_final.csv ADDED
The diff for this file is too large to render. See raw diff
 
MACI-main/data/wiki_scores/wikibio_final_dataset.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:482d015cec319b80bf92c4adb8e6d65c20cc40808801561291c7d5bcf76ed551
3
+ size 20356478
MACI-main/data/wiki_scores/wikibio_final_frequencies.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f657a70d4e4307b91bdd80ee4b03daf5d3362e723ebae54ceefd5c7cc2330a37
3
+ size 3933826
MACI-main/data/wiki_scores/wikibio_final_logprobs.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:644865e9037b9139e277f5d4dc2da594314a4414eca3a9bad4dcad5f1c511319
3
+ size 4820424
MACI-main/data/wiki_scores/wikibio_final_self_evals.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77f61a212dabb3fd85724e78c793887ccbe90e0285c4055411888c64dd5d44d4
3
+ size 4848638
MACI-main/data/wiki_scores/wikibio_scores_deepseek-chat-v3-0324.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9726f79ffdd9e6c1f89b64c66960cdc3cce5c3b868e750cc99ee28e4a666c50
3
+ size 621202
MACI-main/data/wiki_scores/wikibio_scores_meta-llama_llama-3.3-70b-instruct.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c36a3e500d2c08be38fcf07260a0496f498849e8ea57c9cf074f5f10aca855a
3
+ size 621202
MACI-main/data/wiki_scores/wikibio_scores_qwen_qwen-2.5-72b-instruct.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8047a2bcf802c082f6995fae735b634738f351cda58c37409a52376f667ac4a
3
+ size 621168
MACI-main/experiments/conditional_groupers.py ADDED
@@ -0,0 +1,542 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Flexible conditional grouping utilities for subgroup analysis.
4
+ """
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ import re
9
+ import warnings
10
+ import json
11
+ import os
12
+ from typing import List, Dict, Any, Tuple
13
+ from abc import ABC, abstractmethod
14
+
15
+ warnings.filterwarnings('default')
16
+ np.seterr(all='warn')
17
+
18
+
19
+ class ConditionalGrouper(ABC):
20
+
21
+ def __init__(self, name: str, description: str):
22
+ self.name = name
23
+ self.description = description
24
+
25
+ @abstractmethod
26
+ def compute_values(self, dataset: List[Dict[str, Any]], **kwargs) -> np.ndarray:
27
+ pass
28
+
29
+ def create_bins(self, values: np.ndarray, method: str = 'quartiles',
30
+ custom_bins: List[float] = None) -> List[Tuple[float, float]]:
31
+ finite_values = values[np.isfinite(values)]
32
+
33
+ if len(finite_values) == 0:
34
+ return [(float(np.min(values)), float(np.max(values)))]
35
+
36
+ if method == 'quartiles':
37
+ quantiles = [0.0, 0.25, 0.5, 0.75, 1.0]
38
+ elif method == 'quintiles':
39
+ quantiles = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
40
+ elif method == 'deciles':
41
+ quantiles = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
42
+ elif method == 'tertiles':
43
+ quantiles = [0.0, 0.33, 0.67, 1.0]
44
+ elif method == 'median_split':
45
+ quantiles = [0.0, 0.5, 1.0]
46
+ elif method == 'custom' and custom_bins:
47
+ qs = np.array(custom_bins)
48
+ else:
49
+ quantiles = [0.0, 0.25, 0.5, 0.75, 1.0]
50
+
51
+ if method != 'custom':
52
+ qs = np.quantile(finite_values, quantiles)
53
+
54
+ bins = [(float(qs[i]), float(qs[i+1])) for i in range(len(qs)-1)]
55
+ return bins
56
+
57
+ def get_group_info(self, dataset: List[Dict[str, Any]], **kwargs) -> Dict[str, Any]:
58
+ values = self.compute_values(dataset, **kwargs)
59
+ finite_values = values[np.isfinite(values)]
60
+
61
+ return {
62
+ 'name': self.name,
63
+ 'description': self.description,
64
+ 'total_samples': len(values),
65
+ 'valid_samples': len(finite_values),
66
+ 'min_value': float(np.min(finite_values)) if len(finite_values) > 0 else np.nan,
67
+ 'max_value': float(np.max(finite_values)) if len(finite_values) > 0 else np.nan,
68
+ 'mean_value': float(np.mean(finite_values)) if len(finite_values) > 0 else np.nan,
69
+ 'std_value': float(np.std(finite_values)) if len(finite_values) > 0 else np.nan,
70
+ }
71
+
72
+
73
+ # View metadata configuration (globally overridable)
74
+ def _default_view_csv_path() -> str:
75
+ repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
76
+ return os.path.join(repo_root, 'data', 'wiki_scores', 'wikibio_final.csv')
77
+
78
+ GLOBAL_VIEW_METADATA_CSV = _default_view_csv_path()
79
+
80
+ def set_view_metadata_csv(csv_path: str):
81
+ global GLOBAL_VIEW_METADATA_CSV
82
+ if isinstance(csv_path, str) and len(csv_path) > 0:
83
+ GLOBAL_VIEW_METADATA_CSV = csv_path
84
+
85
+
86
class ViewCountGrouper(ConditionalGrouper):
    """Group samples by the Wikipedia view count of the biography subject.

    The subject name is parsed out of each sample's prompt and looked up in
    the CSV at GLOBAL_VIEW_METADATA_CSV (override via set_view_metadata_csv).
    Missing or non-finite counts fall back to the smallest finite count seen
    in the CSV (0.0 when the CSV yields no finite counts at all).
    """

    def __init__(self):
        super().__init__(
            name="view_count",
            description="Wikipedia view count (from wikibio_final.csv)"
        )
        # Lazy-load state: the CSV is read on first compute_values() call and
        # re-read whenever the global CSV path changes afterwards.
        self._loaded = False
        self._csv_path = None
        self._name_to_views = {}
        self._global_min_count = 0.0

    @staticmethod
    def _parse_name_from_prompt(prompt: str) -> str:
        """Extract the subject name from a generation prompt.

        Falls back to everything after a literal 'about ', then to the whole
        stripped prompt, when the regex pattern does not match.
        """
        if not isinstance(prompt, str):
            try:
                prompt = str(prompt)
            except Exception:
                return ""
        txt = prompt.strip()
        # Typical pattern: "Please write one biographical paragraph about {NAME}."
        import re
        m = re.search(r"about\s+(.+?)(?:[\.]|\n|$)", txt, flags=re.IGNORECASE)
        if m:
            return m.group(1).strip()
        # Fallback: try after 'about '
        if 'about ' in txt:
            return txt.split('about ', 1)[-1].strip().rstrip('.').strip()
        return txt

    def _ensure_loaded(self):
        """Populate the name→views mapping from the configured CSV.

        Best-effort: any failure (missing file, bad CSV, bad columns) leaves
        an empty mapping and a 0.0 global minimum, and still marks the
        grouper as loaded so we don't retry on every call.
        """
        # Lazy-load and refresh if global path changed
        if (not self._loaded) or (self._csv_path != GLOBAL_VIEW_METADATA_CSV):
            try:
                df = pd.read_csv(GLOBAL_VIEW_METADATA_CSV)
                # Expected columns: 'Name' plus at least one of 'Views' / 'max_counts'.
                name_col = 'Name' if 'Name' in df.columns else None
                views_col = 'Views' if 'Views' in df.columns else None
                maxc_col = 'max_counts' if 'max_counts' in df.columns else None
                mapping = {}
                values_for_min = []
                if name_col and (views_col or maxc_col):
                    for _, row in df.iterrows():
                        name = str(row[name_col]).strip()
                        v = np.nan
                        # Per-row preference: Views if finite, else max_counts
                        if views_col is not None:
                            try:
                                vv = float(row[views_col])
                                if np.isfinite(vv):
                                    v = vv
                            except Exception:
                                pass
                        if (not np.isfinite(v)) and maxc_col is not None:
                            try:
                                mv = float(row[maxc_col])
                                if np.isfinite(mv):
                                    v = mv
                            except Exception:
                                pass
                        # NaN entries are kept in the mapping so a later direct
                        # name hit still goes through the NaN fallback path.
                        mapping[name] = v
                        if np.isfinite(v):
                            values_for_min.append(v)
                self._name_to_views = mapping
                self._csv_path = GLOBAL_VIEW_METADATA_CSV
                # Global minimum over available finite counts; default to 0.0 if none
                self._global_min_count = float(np.min(values_for_min)) if len(values_for_min) > 0 else 0.0
                self._loaded = True
            except Exception:
                # If loading fails, mark as loaded with empty mapping
                self._name_to_views = {}
                self._csv_path = GLOBAL_VIEW_METADATA_CSV
                self._global_min_count = 0.0
                self._loaded = True

    def compute_values(self, dataset: List[Dict[str, Any]], **kwargs) -> np.ndarray:
        """Return one view count per sample as a float array.

        Lookup order: exact parsed name, then whitespace-collapsed name,
        then the global minimum count as a last-resort fallback.
        """
        self._ensure_loaded()
        values = []
        for sample in dataset:
            prompt = sample.get('prompt', '')
            name = self._parse_name_from_prompt(prompt)
            # Direct match first
            val = self._name_to_views.get(name)
            if val is None:
                # Try naive normalization: collapse spaces
                key2 = " ".join(name.split())
                val = self._name_to_views.get(key2, np.nan)
            # Fallback: global min count if missing or NaN
            if val is None or (isinstance(val, float) and not np.isfinite(val)):
                val = self._global_min_count
            values.append(float(val))
        return np.array(values, dtype=float)
177
+
178
+
179
class FalseClaimRiskGrouper(ConditionalGrouper):
    """Heuristic, text-only risk index for false claims (higher = riskier).

    Combines surface features of the response (length, sentence count,
    list-like structure, digit density, absolute-language density,
    enumeration keywords) minus a citation-density discount into a single
    score clipped to [0, 1]. No model calls — everything is computed from
    the raw prompt/response text.
    """

    def __init__(self):
        super().__init__(
            name="false_claim_risk",
            description="Text-based false-claim risk index (higher → more risk)"
        )
        # Absolute/overconfident wording, counted per word in _absolute_density.
        # NOTE(review): multi-word entries ('no doubt') can never match the
        # single-word tokens produced by the \w+ tokenizer below — confirm intent.
        self.abs_terms = [
            'always', 'never', 'guarantee', 'guaranteed', 'cure', 'proven',
            'will', 'must', 'definitely', 'certainly', 'undoubtedly', 'no doubt'
        ]
        # Topics that typically elicit enumerations of many factual claims.
        self.enum_keywords = [
            'symptom', 'symptoms', 'signs', 'causes', 'cause', 'types', 'treatments',
            'treatment', 'risk factors', 'complications', 'side effects', 'prevention'
        ]
        # Evidence/citation cues; matches lower the final risk score.
        self.citation_patterns = [
            r'according\s+to', r'based\s+on', r'research\s+(?:shows?|indicates?|suggests?)',
            r'studies?\s+(?:show|indicate|suggest|reveal|demonstrate)', r'\(\d{4}\)', r'\[[\d,\s-]+\]'
        ]
        self.compiled_cite = [re.compile(p, re.IGNORECASE) for p in self.citation_patterns]

    @staticmethod
    def _num_sentences(text: str) -> int:
        """Rough sentence count via terminal punctuation/newlines (min 1 for non-empty text)."""
        if not text:
            return 0
        return max(1, text.count('.') + text.count('!') + text.count('?') + text.count('\n'))

    @staticmethod
    def _listiness(text: str) -> int:
        """Count list-like markers: separators, bullets, and numbered items."""
        if not text:
            return 0
        markers = [',', ';', '\n', '-', '*', '•']
        count = sum(text.count(m) for m in markers)
        # Enumerations like "1.", "2)", "(3)"
        count += len(re.findall(r'(?:(?<=\s)|^)(?:\d{1,2}[\.)\]])', text))
        return count

    def _citation_density(self, text: str) -> float:
        """Citation-cue matches per whitespace-delimited word (0.0 for empty text)."""
        if not text:
            return 0.0
        words = text.split()
        if not words:
            return 0.0
        matches = 0
        # Patterns are IGNORECASE-compiled, so lowercasing here is redundant
        # but harmless.
        low = text.lower()
        for pat in self.compiled_cite:
            matches += len(pat.findall(low))
        return matches / max(1, len(words))

    def _absolute_density(self, text: str) -> float:
        """Fraction of \\w+ tokens that are absolute/overconfident terms."""
        if not text:
            return 0.0
        words = re.findall(r"\b\w+\b", text.lower())
        if not words:
            return 0.0
        abs_cnt = sum(1 for w in words if w in self.abs_terms)
        return abs_cnt / max(1, len(words))

    def _enum_keyword_score(self, prompt: str, response: str) -> float:
        """Number of enumeration keywords appearing anywhere in prompt+response."""
        txt = f"{prompt} {response}".lower()
        return float(sum(1 for k in self.enum_keywords if k in txt))

    def compute_values(self, dataset: List[Dict[str, Any]], **kwargs) -> np.ndarray:
        """Return one composite risk score in [0, 1] per sample."""
        vals = []
        for sample in dataset:
            prompt = sample.get('prompt', '') or ''
            response = sample.get('response', '') or ''
            resp = str(response)

            # Features, each normalized to roughly [0, 1]
            num_words = len(resp.split())
            len_norm = min(1.0, num_words / 400.0)
            sent_norm = min(1.0, self._num_sentences(resp) / 12.0)
            list_norm = min(1.0, self._listiness(resp) / 40.0)
            num_density = (sum(ch.isdigit() for ch in resp) / max(1, len(resp)))
            abs_density = self._absolute_density(resp)
            cite_density = self._citation_density(resp)
            enum_score = min(1.0, self._enum_keyword_score(str(prompt), resp) / 4.0)

            # Composite risk (clipped to [0,1]); citation cues subtract risk.
            risk = (
                0.30 * len_norm +
                0.15 * sent_norm +
                0.20 * list_norm +
                0.10 * num_density +
                0.15 * abs_density +
                0.10 * enum_score -
                0.10 * cite_density
            )
            vals.append(float(np.clip(risk, 0.0, 1.0)))
        return np.array(vals, dtype=float)
270
+
271
+
272
class MedicalContentGrouper(ConditionalGrouper):
    """Classify medical prompts into 3 intent groups.

    Labels (see _classify): 0 = information-seeking,
    1 = interpretation-seeking, 2 = action-seeking. create_bins returns
    fixed discrete intervals around these integer labels.
    """

    def __init__(self):
        super().__init__(
            name="medical_content",
            description="Medical content (Information/Interpretation/Action)"
        )

    @staticmethod
    def _normalize(text: str) -> str:
        """Lowercase and collapse whitespace; best-effort str() for non-strings."""
        if not isinstance(text, str):
            try:
                text = str(text)
            except Exception:
                return ""
        return " ".join(text.strip().lower().split())

    def _classify(self, prompt: str) -> int:
        """Map a prompt to 0 (information), 1 (interpretation) or 2 (action).

        Rules fire in priority order: action phrases, then info/drug entity
        cues with query words, then interpretation/symptom phrases, then a
        generic what/why fallback; default is interpretation (1).
        """
        p = self._normalize(prompt)

        # Heuristic keyword sets
        # NOTE(review): entries containing '.*' (e.g. "does .* do") are used
        # as regexes by contains_any; "means?" has no '.*' so its '?' is a
        # literal substring and will rarely match — confirm intent.
        info_kw = [
            "what is", "what are", "definition", "define", "symptom", "signs", "cause", "why",
            "prognosis", "life expectancy", "effect", "does .* do", "means?", "treatment", "therapy",
            "disease", "syndrome", "disorder", "cancer", "diabetes", "ards", "tay-sachs", "paget",
            "thalassemia", "psp", "rosacea", "empyema"
        ]
        drug_kw = [
            "drug", "medication", "medicine", "dose", "dosage", "tablet", "pill", "mg", "patch",
            "paxlovid", "zoloft", "lexapro", "meloxicam", "naproxen", "fentanyl", "celexa", "restoril",
            "calcitonin", "latanoprost", "aldactazide", "nicoderm"
        ]
        symptom_kw = [
            "pain", "ache", "swelling", "lump", "dark urine", "dizziness", "lightheaded", "fatigue",
            "muscle aches", "discharge", "sunburn", "hoarder", "smell"
        ]
        interpret_kw = [
            "what does it mean", "what does .* mean", "when should you worry", "should i worry",
        ]
        action_kw = [
            "should i", "do i need", "is it okay", "can i", "how to", "how do i", "stop", "start",
            "continue", "switch", "swap", "get tested", "try", "take", "drink", "use"
        ]

        def contains_any(keys: List[str]) -> bool:
            # Keys containing '.*' are tried as regex patterns against the
            # normalized prompt; every key also gets a plain substring test.
            for k in keys:
                if " .* " in k or ".*" in k:
                    import re
                    if re.search(k, p):
                        return True
                if k in p:
                    return True
            return False

        # Action-seeking first (high precision phrases)
        if contains_any(action_kw):
            return 2

        # Information-seeking: has disease/drug entity cues and info-type query words
        if (contains_any(info_kw) or contains_any(drug_kw)) and ("?" in prompt or contains_any(["what", "why", "signs", "symptom", "life expectancy", "treatment"])):
            return 0

        # Interpretation-seeking: general symptom phrases or interpret patterns
        if contains_any(interpret_kw) or contains_any(symptom_kw):
            return 1

        # Fallback: map generic questions with what/why to information
        if contains_any(["what", "why"]):
            return 0

        # Otherwise treat as action if imperative-like
        if contains_any(["how to", "how do i"]):
            return 2

        # Default to interpretation
        return 1

    def compute_values(self, dataset: List[Dict[str, Any]], **kwargs) -> np.ndarray:
        """Return one float intent label (0/1/2) per sample's prompt."""
        values = []
        for sample in dataset:
            prompt = sample.get('prompt', '')
            values.append(self._classify(prompt))
        return np.array(values, dtype=float)

    def create_bins(self, values: np.ndarray, method: str = 'ignored', custom_bins: List[float] = None) -> List[Tuple[float, float]]:
        """Fixed discrete bins around the three integer labels; `method` is ignored."""
        return [(-0.5, 0.5), (0.5, 1.5), (1.5, 2.5)]
357
+
358
+
359
class ExpertQAFieldGrouper(ConditionalGrouper):
    """ExpertQA official metadata.field based 3-group classifier.

    Labels:
        0 — Biology/Medicine (Healthcare/Medicine, Biology, Chemistry,
            Psychology, Environmental Science)
        1 — Engineering/Technology (Engineering and Technology,
            Physics and Astronomy, Architecture)
        2 — Other (every other field, non-string fields, unknown prompts)

    The prompt→field mapping is read lazily from ``mapping_path``
    ('/expertqa_prompt_to_field.json' by default). If the file is missing or
    unreadable, every sample falls into group 2. Values are integer labels,
    so create_bins returns fixed discrete intervals.
    """

    def __init__(self, mapping_path: str = "/expertqa_prompt_to_field.json"):
        super().__init__(
            name="expertqa_field",
            description="ExpertQA metadata.field → {Bio/Med, Eng/Tech, Other}"
        )
        self.mapping_path = mapping_path
        self._loaded = False
        self._prompt_to_field = {}

        self.bio_med_fields = {
            "Healthcare / Medicine",
            "Biology",
            "Chemistry",
            "Psychology",
            "Environmental Science",
        }
        self.eng_tech_fields = {
            "Engineering and Technology",
            "Physics and Astronomy",
            "Architecture",
        }

    @staticmethod
    def _normalize(text: str) -> str:
        """Collapse whitespace runs; coerce non-strings best-effort ('' on failure)."""
        if not isinstance(text, str):
            try:
                text = str(text)
            except Exception:
                return ""
        return " ".join(text.strip().split())

    def _ensure_loaded(self):
        """Load the prompt→field JSON exactly once; failures leave an empty mapping."""
        if self._loaded:
            return
        mapping = {}
        try:
            if os.path.exists(self.mapping_path):
                with open(self.mapping_path, "r", encoding="utf-8") as f:
                    raw = json.load(f)
                # Normalize keys so lookups survive whitespace differences.
                mapping = {self._normalize(k): v for k, v in raw.items()}
        except Exception:
            mapping = {}
        self._prompt_to_field = mapping
        self._loaded = True

    def _field_to_group(self, field: str) -> int:
        """Map a raw field string to its group id; 2 for unknown/non-string."""
        if not isinstance(field, str):
            return 2
        stripped = field.strip()
        if stripped in self.bio_med_fields:
            return 0
        if stripped in self.eng_tech_fields:
            return 1
        return 2

    def compute_values(self, dataset: List[Dict[str, Any]], **kwargs) -> np.ndarray:
        """One float group label per sample, keyed on 'prompt' then 'question'."""
        self._ensure_loaded()
        labels = []
        for sample in dataset:
            field = self._prompt_to_field.get(self._normalize(sample.get('prompt', '')))
            if field is None:
                # Fall back to the raw question text when the prompt misses.
                field = self._prompt_to_field.get(self._normalize(sample.get('question', '')))
            labels.append(float(self._field_to_group(field)))
        return np.array(labels, dtype=float)

    def create_bins(self, values: np.ndarray, method: str = 'ignored', custom_bins: List[float] = None) -> List[Tuple[float, float]]:
        """Fixed discrete bins around the three integer labels; `method` is ignored."""
        return [(-0.5, 0.5), (0.5, 1.5), (1.5, 2.5)]
444
+
445
+
446
def get_available_groupers() -> Dict[str, ConditionalGrouper]:
    """Build the registry of named grouping strategies selectable by callers."""
    registry: Dict[str, ConditionalGrouper] = {}
    registry['view_count'] = ViewCountGrouper()
    registry['medical_content'] = MedicalContentGrouper()
    registry['false_claim_risk'] = FalseClaimRiskGrouper()
    return registry
452
+
453
+
454
def compute_conditional_coverage_by_grouper(
    filtered_dataset: List[Dict[str, Any]],
    grouping_values: np.ndarray,
    bins: List[Tuple[float, float]]
) -> List[float]:
    """Per-bin marginal coverage for a grouper's value bins.

    A sample is "covered" when none of its retained ('filtered_claims')
    claims is unsupported; each bin's coverage is the mean indicator over
    samples whose finite grouping value lies in [lo, hi]. Empty bins yield
    NaN.
    """

    def _covered(entry: Dict[str, Any]) -> float:
        # 1.0 iff every retained claim is supported (vacuously true if none).
        for claim in entry.get('filtered_claims', []):
            if not claim.get('is_supported', False):
                return 0.0
        return 1.0

    results: List[float] = []
    for lo, hi in bins:
        members = [
            filtered_dataset[idx]
            for idx, v in enumerate(grouping_values)
            if np.isfinite(v) and lo <= v <= hi
        ]
        if members:
            results.append(float(np.mean([_covered(m) for m in members])))
        else:
            results.append(np.nan)
    return results
491
+
492
+
493
def compute_retention_by_grouper(
    filtered_dataset: List[Dict[str, Any]],
    grouping_values: np.ndarray,
    bins: List[Tuple[float, float]]
) -> List[Dict[str, Any]]:
    """Per-bin claim-retention statistics for a grouper's value bins.

    For each [lo, hi] bin, returns a dict with the bin bounds, the member
    sample count, retained vs total claim counts, and the retention rate
    (NaN for empty bins or zero total claims).
    """
    stats: List[Dict[str, Any]] = []

    for lo, hi in bins:
        members = [
            filtered_dataset[idx]
            for idx, v in enumerate(grouping_values)
            if np.isfinite(v) and lo <= v <= hi
        ]

        if not members:
            stats.append({
                'bin': (float(lo), float(hi)),
                'samples': 0,
                'retained': 0,
                'total': 0,
                'rate': np.nan,
            })
            continue

        total = sum(len(m.get('atomic_facts', [])) for m in members)
        kept = sum(len(m.get('filtered_claims', [])) for m in members)
        rate = (kept / total) if total > 0 else np.nan

        stats.append({
            'bin': (float(lo), float(hi)),
            'samples': len(members),
            'retained': int(kept),
            'total': int(total),
            'rate': float(rate) if not np.isnan(rate) else np.nan,
        })

    return stats
MACI-main/experiments/run_experiment.py ADDED
@@ -0,0 +1,1127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pickle
3
+ import os
4
+ import sys
5
+ import json
6
+ import argparse
7
+ import time
8
+ import logging
9
+ import warnings
10
+ from datetime import datetime
11
+ from typing import Optional, Dict, Any, List
12
+ from collections import defaultdict
13
+ warnings.filterwarnings('default')
14
+ warnings.simplefilter('ignore', category=FutureWarning)
15
+ np.seterr(all='warn')
16
+
17
+ sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
18
+
19
+ from conformal.basic_conformal import BasicConformal
20
+ from conformal.adaptive_conformal import MACIAdaptiveConformal, SubgroupOptimizedMACI
21
+ from conditional_groupers import get_available_groupers
22
+ from conditional_groupers import set_view_metadata_csv
23
+
24
# Judge-model identifiers; load_1000_samples uses these keys to locate the
# per-model score files and to order the per-claim LLM-score ensemble.
MODEL_NAMES = ['qwen-2.5-72b-instruct', 'deepseek-chat-v3-0324', 'llama-3.3-70b-instruct']
25
+
26
def setup_logging(log_dir: str):
    """Configure the root logger to write to both console and a timestamped file.

    Existing handlers are removed first so repeated calls (e.g. across
    experiment runs in one process) do not duplicate output.
    """
    os.makedirs(log_dir, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    log_filename = os.path.join(log_dir, f"experiment_log_{stamp}.log")

    root = logging.getLogger()
    root.setLevel(logging.INFO)

    # Drop any stale handlers before installing fresh ones.
    for old_handler in list(root.handlers):
        root.removeHandler(old_handler)

    # Same bare-message format for file and console output.
    for handler in (logging.FileHandler(log_filename), logging.StreamHandler()):
        handler.setFormatter(logging.Formatter('%(message)s'))
        root.addHandler(handler)

    logging.info(f"📝 Logging to {log_filename}")
47
+
48
+
49
def load_1000_samples(data_dir: str, scores_dir: Optional[str] = None, dataset_type: str = "auto", limit_samples: int = 1000):
    """Load up to `limit_samples` samples and attach LLM scores.

    Resolves the dataset type (wikibio/medlfqa, auto-detected from which
    pickle exists under `data_dir`), loads the basic score archives
    (frequencies/logprobs/selfevals) and the per-judge-model .npz score
    files, then aligns everything per sample into dicts with keys
    'sample', 'annotations', 'scores', 'features_4d', 'prompt_features'.
    Samples with no atomic facts, or whose self-evals are all -1, are
    skipped. Missing scores fall back to zeros (basic) or 0.5 (LLM judges).
    """
    logging.info(f"📁 Loading up to {limit_samples} samples with provided scores...")

    # --- Resolve dataset type by probing for the known pickle locations ---
    if dataset_type == "auto":
        wikibio_path = os.path.join(data_dir, "wiki_scores", "wikibio_final_dataset.pkl")
        medlfqa_path = os.path.join(data_dir, "med_scores", "medlfqa_dataset.pkl")

        if os.path.exists(wikibio_path):
            dataset_type = "wikibio"
            logging.info(f" 🔍 Auto-detected dataset type: {dataset_type}")
        elif os.path.exists(medlfqa_path):
            dataset_type = "medlfqa"
            logging.info(f" 🔍 Auto-detected dataset type: {dataset_type}")
        else:
            raise FileNotFoundError(f"Could not find dataset files in {data_dir}")

    # --- Per-dataset file layout ---
    if dataset_type == "wikibio":
        dataset_path = os.path.join(data_dir, "wiki_scores", "wikibio_final_dataset.pkl")
        base_scores_dir = os.path.join(data_dir, "wiki_scores")
        score_prefix = "wikibio_scores"
        basic_scores = {
            'frequencies': os.path.join(base_scores_dir, "wikibio_final_frequencies.npz"),
            'logprobs': os.path.join(base_scores_dir, "wikibio_final_logprobs.npz"),
            'selfevals': os.path.join(base_scores_dir, "wikibio_final_self_evals.npz")
        }
    elif dataset_type == "medlfqa":
        dataset_path = os.path.join(data_dir, "med_scores", "medlfqa_dataset.pkl")
        base_scores_dir = os.path.join(data_dir, "med_scores")
        score_prefix = "medlfqa_scores"
        basic_scores = {
            'frequencies': os.path.join(base_scores_dir, "medlfqa_frequencies.npz"),
            'logprobs': os.path.join(base_scores_dir, "medlfqa_logprobs.npz"),
            'selfevals': os.path.join(base_scores_dir, "medlfqa_selfevals.npz")
        }
    else:
        raise ValueError(f"Unknown dataset type: {dataset_type}")

    logging.info(f" 📊 Dataset: {dataset_path}")
    logging.info(f" 🎯 Score prefix: {score_prefix}")

    with open(dataset_path, 'rb') as f:
        dataset = pickle.load(f)

    dataset_1000 = dataset[:limit_samples]

    # Basic score archives; left as empty dicts when a file is missing.
    frequencies = {}
    logprobs = {}
    selfevals = {}

    for score_type, score_path in basic_scores.items():
        try:
            if score_type == 'frequencies':
                frequencies = np.load(score_path, allow_pickle=True)
            elif score_type == 'logprobs':
                logprobs = np.load(score_path, allow_pickle=True)
            elif score_type == 'selfevals':
                selfevals = np.load(score_path, allow_pickle=True)
            logging.info(f" ✅ Loaded {score_type}: {score_path}")
        except FileNotFoundError:
            logging.warning(f" ⚠️ {score_type} not found: {score_path}")

    # Explicit scores_dir overrides the dataset's default directory.
    if scores_dir is not None and os.path.isdir(scores_dir):
        score_files_dir = scores_dir
    else:
        score_files_dir = base_scores_dir

    logging.info(f" 🎯 Score files directory: {score_files_dir}")

    import glob
    all_npz_files = sorted(glob.glob(os.path.join(score_files_dir, f"{score_prefix}_*.npz")))
    def find_by_tokens(token_options: List[List[str]]):
        # First token set whose tokens all appear in some filename wins;
        # returns None when no candidate file matches any option.
        for tokens in token_options:
            for fp in all_npz_files:
                name = os.path.basename(fp).lower()
                if all(t in name for t in tokens):
                    return fp
        return None

    # Token fallbacks go from most to least specific per judge model.
    score_files = {
        'qwen-2.5-72b-instruct': find_by_tokens([
            ['qwen-2.5-72b','instruct'], ['qwen','instruct'], ['qwen']
        ]),
        'deepseek-chat-v3-0324': find_by_tokens([
            ['deepseek','chat','v3'], ['deepseek','chat'], ['deepseek']
        ]),
        'llama-3.3-70b-instruct': find_by_tokens([
            ['llama-3.3-70b','instruct'], ['llama-3.3','instruct'], ['llama']
        ]),
    }

    # Per-model prompt -> per-claim score list; TypeError covers filename=None.
    llm_scores = {}
    for model_name, filename in score_files.items():
        try:
            model_data = np.load(filename, allow_pickle=True)
            model_prompts = model_data['prompts'].tolist()
            model_scores_list = model_data['scores_list'].tolist()
            llm_scores[model_name] = {p: s for p, s in zip(model_prompts, model_scores_list)}
            logging.info(f" ✅ Loaded {model_name} scores")
        except (FileNotFoundError, TypeError):
            logging.warning(f" ⚠️ {model_name} scores not found or invalid: (unknown)")
            llm_scores[model_name] = {}

    # --- Align every usable sample with its score arrays ---
    aligned_data = []
    for i, sample in enumerate(dataset_1000):
        prompt = sample['prompt']
        atomic_facts = sample.get('atomic_facts', [])
        n_claims = len(atomic_facts)

        if n_claims == 0:
            continue

        # Skip samples whose self-evals are the all -1 sentinel.
        # NOTE(review): this pre-check keys selfevals by prompt, while the
        # wikibio extraction below keys by 'arr_{i}' — confirm both key
        # schemes are intended for the same archive.
        if prompt in selfevals:
            selfeval_vals = selfevals[prompt]
            if hasattr(selfeval_vals, 'ndim') and selfeval_vals.ndim == 1:
                if np.allclose(selfeval_vals, -1):
                    continue
            elif np.allclose(selfeval_vals, -1):
                continue

        annotations = np.array([af.get('is_supported', False) for af in atomic_facts])

        # Frequency scores: wikibio archives use positional 'arr_{i}' keys,
        # medlfqa archives are keyed by prompt; scalars are broadcast and
        # NaNs mapped to 0.0.
        freq_scores = np.zeros(n_claims)
        if dataset_type == 'wikibio':
            key = f'arr_{i}'
            if key in frequencies:
                freq_vals = frequencies[key]
                if hasattr(freq_vals, 'ndim') and freq_vals.ndim == 1:
                    freq_scores = freq_vals[:n_claims]
                else:
                    freq_scores = np.full(n_claims, freq_vals.item() if hasattr(freq_vals, 'item') else freq_vals)
                freq_scores = np.nan_to_num(freq_scores, nan=0.0)
        else:
            if prompt in frequencies:
                freq_vals = frequencies[prompt]
                if hasattr(freq_vals, 'ndim') and freq_vals.ndim == 1:
                    freq_scores = freq_vals[:n_claims]
                else:
                    freq_val = freq_vals.item() if hasattr(freq_vals, 'item') else freq_vals
                    freq_val = 0.0 if np.isnan(freq_val) else freq_val
                    freq_scores = np.full(n_claims, freq_val)
                freq_scores = np.nan_to_num(freq_scores, nan=0.0)

        # Log-prob scores, same key scheme and fallbacks as frequencies.
        if dataset_type == 'wikibio':
            key = f'arr_{i}'
            if key in logprobs:
                lp_vals = logprobs[key]
                if hasattr(lp_vals, 'ndim') and lp_vals.ndim == 1:
                    logprob_scores = np.nan_to_num(lp_vals[:n_claims], nan=0.0)
                else:
                    v = lp_vals.item() if hasattr(lp_vals, 'item') else lp_vals
                    v = 0.0 if np.isnan(v) else v
                    logprob_scores = np.full(n_claims, v)
            else:
                logprob_scores = np.zeros(n_claims)
        else:
            if prompt in logprobs:
                logprob_vals = logprobs[prompt]
                if hasattr(logprob_vals, 'ndim') and logprob_vals.ndim == 1:
                    logprob_scores = logprob_vals[:n_claims]
                    logprob_scores = np.nan_to_num(logprob_scores, nan=0.0)
                else:
                    logprob_val = logprob_vals.item() if hasattr(logprob_vals, 'item') else logprob_vals
                    logprob_val = 0.0 if np.isnan(logprob_val) else logprob_val
                    logprob_scores = np.full(n_claims, logprob_val)
            else:
                logprob_scores = np.zeros(n_claims)

        # Self-eval scores, same key scheme and fallbacks as above.
        if dataset_type == 'wikibio':
            key = f'arr_{i}'
            if key in selfevals:
                se_vals = selfevals[key]
                if hasattr(se_vals, 'ndim') and se_vals.ndim == 1:
                    selfeval_scores = np.nan_to_num(se_vals[:n_claims], nan=0.0)
                else:
                    v = se_vals.item() if hasattr(se_vals, 'item') else se_vals
                    v = 0.0 if np.isnan(v) else v
                    selfeval_scores = np.full(n_claims, v)
            else:
                selfeval_scores = np.zeros(n_claims)
        else:
            if prompt in selfevals:
                selfeval_vals = selfevals[prompt]
                if hasattr(selfeval_vals, 'ndim') and selfeval_vals.ndim == 1:
                    selfeval_scores = selfeval_vals[:n_claims]
                    selfeval_scores = np.nan_to_num(selfeval_scores, nan=0.0)
                else:
                    selfeval_val = selfeval_vals.item() if hasattr(selfeval_vals, 'item') else selfeval_vals
                    selfeval_val = 0.0 if np.isnan(selfeval_val) else selfeval_val
                    selfeval_scores = np.full(n_claims, selfeval_val)
            else:
                selfeval_scores = np.zeros(n_claims)

        # Position of each claim within the response, normalized to [0, 1]
        # (0.5 for a single-claim sample).
        ordinal_scores = np.arange(n_claims)
        if n_claims > 1:
            ordinal_scores = ordinal_scores / (n_claims - 1)
        else:
            ordinal_scores = np.array([0.5])

        # Per-judge-model LLM scores, clipped to [0, 1]; 0.5 when unseen.
        scores_dict = {}
        for model_name, model_data in llm_scores.items():
            if prompt in model_data:
                scores_dict[model_name] = np.array(model_data[prompt][:n_claims])
                scores_dict[model_name] = np.clip(scores_dict[model_name], 0.0, 1.0)
            else:
                scores_dict[model_name] = np.full(n_claims, 0.5)

        valid_llm_scores = []
        for model_name in MODEL_NAMES:
            if model_name in scores_dict:
                valid_llm_scores.append(scores_dict[model_name])

        # Ensemble = mean across judges; the std penalty is currently
        # disabled (lambda_uncertainty = 0.0).
        if valid_llm_scores:
            ensemble_mean = np.mean(valid_llm_scores, axis=0)
            ensemble_std = np.std(valid_llm_scores, axis=0)
            lambda_uncertainty = 0.0
            ensemble_scores = ensemble_mean - lambda_uncertainty * ensemble_std
            ensemble_scores = np.clip(ensemble_scores, 0.0, 1.0)
        else:
            ensemble_scores = np.full(n_claims, 0.5)

        # 4-D per-claim feature matrix: frequency, self-eval,
        # max-normalized logprob, ordinal position.
        features_4d = np.concatenate((
            freq_scores.reshape(-1, 1),
            selfeval_scores.reshape(-1, 1),
            (logprob_scores / (np.max(logprob_scores) + 1e-8)).reshape(-1, 1),
            ordinal_scores.reshape(-1, 1)
        ), axis=1)

        aligned_data.append({
            'sample': sample,
            'annotations': annotations,
            'scores': {
                'frequency': freq_scores,
                'selfeval': selfeval_scores,
                'logprob': logprob_scores,
                'ensemble': ensemble_scores,
                **scores_dict
            },
            'features_4d': features_4d,
            # Prompt-level covariates: intercept, response length, prompt length.
            'prompt_features': np.array([1.0, len(sample.get('response', '')), len(prompt)])
        })

    logging.info(f"✅ Loaded {len(aligned_data)} valid samples")
    return aligned_data
293
+
294
+
295
def create_splits(data, calib_ratio=0.7, test_ratio=0.3, random_seed=42):
    """Randomly split *data* into calibration/test subsets by ratio.

    Returns (calib_data, test_data, calib_indices, test_indices); the test
    size is capped so the two splits never exceed the dataset size.
    """
    n = len(data)
    calib_size = int(n * calib_ratio)
    # Cap the test split so calib + test never overruns the data.
    test_size = min(int(n * test_ratio), n - calib_size)

    logging.info(f"📊 Creating splits: {calib_size} calib ({calib_ratio*100:.0f}%), {test_size} test ({test_ratio*100:.0f}%)")

    # Seeded shuffle keeps the split reproducible across runs.
    np.random.seed(random_seed)
    order = np.random.permutation(n)
    calib_idx = order[:calib_size]
    test_idx = order[calib_size:calib_size + test_size]

    calib_data = [data[i] for i in calib_idx]
    test_data = [data[i] for i in test_idx]

    logging.info(f"🎲 Random split with seed {random_seed}: calib indices {calib_idx[:5]}..., test indices {test_idx[:5]}...")

    return calib_data, test_data, calib_idx, test_idx
318
+
319
+
320
def run_bcp_experiment(calib_data, test_data, score_type='frequency', alpha=0.1, **kwargs):
    """
    Run BCP (Split Conformal) experiment.

    Builds a non-conformity score function from the pre-aligned per-claim
    scores (non-conformity = 1 - score for every score type), calibrates a
    BasicConformal model on the calibration split, then reports marginal
    coverage and claim retention on the test split.

    Returns a dict with 'coverage', 'retention_rate', 'retained_claims',
    'total_claims', and the raw 'filtered_results'.
    """
    logging.info(f"📈 Running BCI (Split Conformal) with {score_type} scores...")

    calib_samples = [item['sample'] for item in calib_data]
    test_samples = [item['sample'] for item in test_data]

    # Hoisted out of score_function: the prompt -> aligned-item mapping is
    # invariant, so build it once instead of on every invocation.
    sample_to_data = {item['sample']['prompt']: item for item in calib_data + test_data}

    def score_function(samples):
        result = []
        for sample in samples:
            item = sample_to_data.get(sample['prompt'])
            scores = item['scores'].get(score_type) if item is not None else None
            if scores is not None:
                # All score types are "higher is better", so non-conformity is
                # uniformly 1 - score. (The original duplicated this in both
                # branches of a dead if/else; collapsed here.)
                result.append(1.0 - scores)
            else:
                # Unknown prompt or missing score type: neutral per-claim fallback.
                n_claims = len(sample.get('atomic_facts', []))
                result.append(np.full(n_claims, 0.5))
        return result

    basic_conformal = BasicConformal(score_function=score_function, random_state=0)
    basic_conformal.fit_on_calib(calib_samples, alpha=alpha)
    filtered_results, _ = basic_conformal.predict(test_samples)

    coverage = compute_marginal_coverage(filtered_results)
    retention = evaluate_retention(filtered_results, "BCP")

    return {
        'coverage': coverage,
        'retention_rate': retention['overall_retention_rate'],
        'retained_claims': retention['retained_claims'],
        'total_claims': retention['total_claims'],
        'filtered_results': filtered_results
    }
367
+
368
def run_as_experiment(calib_data: List[Dict], test_data: List[Dict],
                      model_names: List[str],
                      alpha: float,
                      as_mode: str,
                      subgroup_name: str, **kwargs) -> Dict:
    """Run MACI (Adaptive Subclaims) experiment for a given subgroup.

    Args:
        calib_data: calibration records ({'sample': ..., 'scores': ...}).
        test_data: test records with the same structure.
        model_names: score-model names to ensemble.
        alpha: miscoverage level.
        as_mode: 'subgroup_optimized' uses SubgroupOptimizedMACI with a
            grouper; any other value falls back to MACIAdaptiveConformal
            with a simple mean-of-models score.
        subgroup_name: grouper key (only used in subgroup-optimized mode).
        **kwargs: 'random_state' and 'target_tpr' are honored.

    Returns:
        Dict with coverage/retention metrics, budgets, weights, filtered
        results and a timing breakdown.
    """
    logging.info(f"📊 Running MACI experiment with mode: {as_mode} for subgroup: '{subgroup_name}'...")

    timing: Dict[str, float] = {}

    if as_mode == 'subgroup_optimized':
        available_groupers = get_available_groupers()
        if subgroup_name not in available_groupers:
            raise ValueError(f"Unknown subgroup: {subgroup_name}")
        grouper = available_groupers[subgroup_name]

        as_model = SubgroupOptimizedMACI(
            model_names=model_names,
            grouper=grouper,
            n_bins=3,
            random_state=kwargs.get('random_state', 0),
            solver='osqp',
        )
        # [FIXED] Removed an unused `t0 = time.perf_counter()` here: fit()
        # reports its own timing breakdown via get_timing().
        as_model.fit(calib_data, alpha=alpha, ensemble_train_ratio=0.5, target_tpr=kwargs.get('target_tpr', 0.95))
        timing_details = as_model.get_timing()
        timing['maci_weight_optimization_s'] = timing_details.get('weight_optimization_s', 0.0)
        timing['maci_calibration_s'] = timing_details.get('calibration_s', 0.0)

        t1 = time.perf_counter()
        filtered_results, _ = as_model.predict(test_data)
        timing['maci_inference_s'] = time.perf_counter() - t1
        budgets = as_model.get_budgets()
        weights = as_model.get_weights()
    else:
        # [FIXED] Removed an unused `score_type = kwargs.get("as_score_type", ...)`
        # local: the fallback score is always the unweighted model mean.
        def score_function(data_list: List[Dict]) -> List[np.ndarray]:
            """Average the per-model scores; neutral 0.5 when none exist."""
            scores_list = []
            for item in data_list:
                valid_scores = [item['scores'][m] for m in model_names if m in item['scores']]
                if valid_scores:
                    scores_list.append(np.mean(valid_scores, axis=0))
                else:
                    scores_list.append(np.array([0.5] * len(item.get('sample', {}).get('atomic_facts', []))))
            return scores_list

        as_model = MACIAdaptiveConformal(score_function=score_function, random_state=kwargs.get('random_state', 0))
        t0 = time.perf_counter()
        as_model.fit_on_calib(calib_data, alpha=alpha)
        timing['maci_calibration_s'] = time.perf_counter() - t0
        t1 = time.perf_counter()
        filtered_results, _ = as_model.predict(test_data)
        timing['maci_inference_s'] = time.perf_counter() - t1
        budgets = {'overall': as_model.tau_hat}
        weights = None

    coverage = compute_marginal_coverage(filtered_results)
    retention = evaluate_retention(filtered_results, "MACI")
    return {
        'coverage': coverage,
        'retention_rate': retention['overall_retention_rate'],
        'retained_claims': retention['retained_claims'],
        'total_claims': retention['total_claims'],
        'budgets': budgets,
        'weights': weights,
        'filtered_results': filtered_results,
        'timing': timing
    }
436
+
437
+
438
def run_cci_experiment(
    calib_data,
    test_data,
    alpha=0.1,
    boosting_epochs=1000,
    boosting_lr=0.005,
    calib_split_for_boost=0.3,
    random_seed=0,
    adaptive_alpha: bool = False,
    retention_target: float = 0.7
):
    """
    Two-stage CCI:
      - Stage 1 (Boosting): learn beta on a subset of calib_data
      - Stage 2 (CondConf): calibrate CondConf on the remaining calib_data using learned beta
      - Predict on test_data

    Returns a dict of coverage/retention metrics, the learned beta, split
    sizes and a timing breakdown; returns a stub dict with None metrics when
    the CCI dependencies are unavailable.
    """
    logging.info("🎯 Running CCI (Boosting -> CondConf) with internal calib split...")

    try:
        from conformal.conditional_conformal import ConditionalConformalBoosting, ConditionalConformalInference
    except Exception as e:
        # Best-effort: report the method as skipped rather than crashing the run.
        logging.error(f"CCI unavailable due to missing dependencies: {e}")
        return {
            "coverage": None,
            "retention_rate": None,
            "retained_claims": 0,
            "total_claims": 0,
            "filtered_results": [],
            "timing": {"cci_skipped": True, "error": str(e)}
        }

    # Internal calib split: a boosting subset and a conformal subset.
    rng = np.random.default_rng(random_seed)
    idx = np.arange(len(calib_data))
    rng.shuffle(idx)
    k = int(len(idx) * calib_split_for_boost)
    idx_boost, idx_conf = idx[:k], idx[k:]
    if len(idx_conf) == 0:
        # [FIXED] A stray trailing "[1]" subscript here (idx[-1:] [1]) raised
        # IndexError on the length-1 fallback slice.
        idx_boost, idx_conf = idx[:-1], idx[-1:]
    calib_boost = [calib_data[i] for i in idx_boost]
    calib_conf = [calib_data[i] for i in idx_conf]
    logging.info(f" 🔧 calib split -> boost:{len(calib_boost)} | conf:{len(calib_conf)} (seed={random_seed})")

    # Stage 1: learn beta.
    booster = ConditionalConformalBoosting(random_state=random_seed)
    t_boost_0 = time.perf_counter()
    beta = booster.fit(
        calib_boost,
        boosting_epochs=boosting_epochs,
        boosting_lr=boosting_lr
    )
    t_boost_1 = time.perf_counter()

    # Stage 2: calibrate CondConf with the learned beta.
    infer = ConditionalConformalInference(random_state=random_seed)
    t_fit_0 = time.perf_counter()
    infer.fit(calib_conf, alpha=alpha, beta=beta, adaptive_alpha=adaptive_alpha, retention_target=retention_target)
    t_fit_1 = time.perf_counter()
    # [FIXED] Result was bound to an unused local; call kept for its side
    # effects (AUROC evaluation/logging).
    infer.evaluate_auroc(test_data)
    t_pred_0 = time.perf_counter()
    filtered_results = infer.predict(test_data)
    t_pred_1 = time.perf_counter()

    coverage = compute_marginal_coverage(filtered_results)
    retention = evaluate_retention(filtered_results, "CCI")

    return {
        "coverage": coverage,
        "retention_rate": retention["overall_retention_rate"],
        "retained_claims": retention["retained_claims"],
        "total_claims": retention["total_claims"],
        "filtered_results": filtered_results,
        "beta": beta,
        "calib_sizes": {"boost": len(calib_boost), "conf": len(calib_conf)},
        "split_seed": random_seed,
        "timing": {
            "cci_boost_fit_s": t_boost_1 - t_boost_0,
            "cci_condconf_fit_s": t_fit_1 - t_fit_0,
            "cci_inference_s": t_pred_1 - t_pred_0,
            "cci_adaptive_alpha_enabled": bool(adaptive_alpha)
        }
    }
518
+
519
def evaluate_retention(filtered_dataset: List[Dict], method_name: str = "") -> Dict:
    """Summarize how many atomic claims survived filtering across a dataset.

    Each entry may either wrap its payload under a 'sample' key or be the
    payload itself; non-dict payloads are skipped with a warning.
    """
    if not filtered_dataset:
        return {'overall_retention_rate': 0.0, 'retained_claims': 0, 'total_claims': 0}

    n_original = 0
    n_retained = 0
    for entry in filtered_dataset:
        record = entry.get('sample', entry)
        if not isinstance(record, dict):
            logging.warning(f"Skipping invalid item in retention evaluation: {type(record)}")
            continue
        n_original += len(record.get('atomic_facts', []))
        n_retained += len(record.get('filtered_claims', []))

    rate = n_retained / n_original if n_original > 0 else 0.0
    return {
        'overall_retention_rate': rate,
        'retained_claims': n_retained,
        'total_claims': n_original
    }
548
+
549
def compute_marginal_coverage(filtered_dataset: List[Dict]):
    """Fraction of samples whose retained claims are all supported.

    A sample with no retained claims counts as covered (vacuously true);
    non-dict claims are ignored when checking support.
    """
    flags = []
    for entry in filtered_dataset:
        record = entry.get('sample', entry)
        if not isinstance(record, dict):
            logging.warning(f"Skipping invalid item in coverage calculation: {type(record)}")
            continue

        kept = record.get('filtered_claims', [])
        if not kept:
            flags.append(1.0)
            continue

        all_supported = all(
            claim.get('is_supported', False)
            for claim in kept if isinstance(claim, dict)
        )
        flags.append(1.0 if all_supported else 0.0)

    return np.mean(flags) if flags else 0.0
566
+
567
def compute_conditional_coverage(test_data, filtered_results, grouper, alpha=0.1, binning_method='quartiles'):
    """Compute conditional coverage for subgroups"""
    # Merge each original sample with its filtered claims so the grouper
    # sees scores and filtering outcomes together.
    combined_data = []
    for original, filtered in zip(test_data, filtered_results):
        merged = dict(original['sample'])
        merged['scores'] = original['scores']
        merged['filtered_claims'] = filtered.get('filtered_claims', [])
        combined_data.append(merged)

    # Every requested binning method currently maps onto tertiles.
    method = {
        'quantile': 'tertiles',
        'equal_width': 'tertiles',
        'quartiles': 'tertiles'
    }.get(binning_method, 'tertiles')

    values = grouper.compute_values(combined_data)

    if len(values) == 0:
        logging.warning(f" ⚠️ Warning: {grouper.__class__.__name__} returned no values")
        return {}

    if np.all(values == values[0]):
        logging.warning(f" ⚠️ Warning: {grouper.__class__.__name__} all values identical ({values[0]:.4f})")

    bins = grouper.create_bins(values, method=method)

    labels = ['low', 'medium', 'high'] if len(bins) == 3 else [f'bin_{i}' for i in range(len(bins))]
    groups = {}
    for i, (lo, hi) in enumerate(bins):
        # Closed on the right only for the final bin so the maximum is included.
        if i == len(bins) - 1:
            mask = (values >= lo) & (values <= hi)
        else:
            mask = (values >= lo) & (values < hi)
        label = labels[i] if i < len(labels) else f'bin_{i}'
        groups[label] = np.where(mask)[0].tolist()

    results = {}
    for label, members in groups.items():
        if len(members) == 0:
            continue

        covered_flags = []
        claims_total = 0
        claims_kept = 0
        for idx in members:
            kept = filtered_results[idx].get('filtered_claims', [])
            originals = test_data[idx]['sample'].get('atomic_facts', [])

            bad = any(not claim.get('is_supported', False) for claim in kept)
            covered_flags.append(0.0 if bad else 1.0)

            claims_total += len(originals)
            claims_kept += len(kept)

        results[label] = {
            'size': len(members),
            'coverage': np.mean(covered_flags) if covered_flags else 0.0,
            'retention_rate': claims_kept / claims_total if claims_total > 0 else 0.0,
            'retained_claims': claims_kept,
            'total_claims': claims_total,
            'target_coverage': 1 - alpha,
        }

    return results
640
+
641
+
642
def save_aggregated_results_to_json(results: Dict, args: argparse.Namespace):
    """Serialize aggregated experiment results to a timestamped JSON file.

    Bulky/non-serializable per-method entries (filtered results, weights,
    budgets, ...) are excluded; numpy scalars and arrays are converted to
    native Python types on the fly.
    """
    repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    default_output_dir = os.path.join(repo_root, 'analysis', 'experiment_results')
    output_dir = getattr(args, 'time_out', None) or default_output_dir
    os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    groups_str = "_".join(sorted(args.conditional_groups))
    filename = f"results_{args.dataset_type}_{args.model_set}_{groups_str}_{timestamp}.json"
    filepath = os.path.join(output_dir, filename)

    logging.info(f"\n💾 Saving aggregated results to {filepath}...")

    def convert_to_native_types(obj):
        """json.dumps fallback: numpy scalars/arrays -> native, else str."""
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, defaultdict):
            return dict(obj)
        try:
            json.dumps(obj)
            return obj
        except TypeError:
            return str(obj)

    keys_to_exclude = {'filtered_results', 'beta', 'weights', 'budgets', 'calib_sizes', 'split_seed'}

    serializable_data = {}
    for method_name, method_data in results.items():
        method_payload = {}
        for key, value in method_data.items():
            if key in keys_to_exclude:
                continue
            try:
                # Round-trip through json to force everything into native types.
                method_payload[key] = json.loads(json.dumps(value, default=convert_to_native_types))
            except Exception as e:
                logging.warning(f"Could not serialize key '{key}' for method '{method_name}'. Skipping. Error: {e}")
        serializable_data[method_name] = method_payload

    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(serializable_data, f, indent=4, ensure_ascii=False)
        logging.info(f"✅ Successfully saved results.")
    except Exception as e:
        logging.error(f"❌ Failed to save results to JSON: {e}")
692
+
693
def main():
    """CLI entry point: run BCI/CCI/MACI conformal experiments over repeated
    random calibration/test splits, log per-run and aggregated marginal and
    subgroup results, and persist them to JSON."""
    parser = argparse.ArgumentParser(description="Experiment with three conformal methods")
    parser.add_argument("--random-seed", type=int, default=123, help="Random seed")
    parser.add_argument("--data-dir", type=str, default=None, help="Data directory (defaults to repo_root/data)")
    parser.add_argument("--log-dir", type=str, default=None, help="Directory to save logs (defaults to repo_root/logs)")
    parser.add_argument("--dataset-type", type=str, default="auto", choices=["auto", "wikibio", "medlfqa"],
                        help="Dataset type (auto-detected if not specified)")
    parser.add_argument("--alpha", type=float, default=0.1, help="Significance level (fixed if --adaptive-alpha is false)")
    parser.add_argument("--adaptive-alpha", action='store_true', help="Enable per-sample adaptive alpha (learn q*(z) for retention target)")
    parser.add_argument("--retention-target", type=float, default=0.4, help="Target retention used to learn adaptive alpha")
    parser.add_argument("--scores-dir", type=str, default=None, help="Directory containing final NPZ score files (optional)")
    parser.add_argument("--calib-ratio", type=float, default=0.75, help="Calibration set ratio")
    parser.add_argument("--test-ratio", type=float, default=0.25, help="Test set ratio")
    parser.add_argument("--boosting-epochs", type=int, default=100, help="Boosting epochs")
    parser.add_argument("--n-runs", type=int, default=10, help="Number of repeated runs with different random splits")
    parser.add_argument("--model-set", type=str, default="fixed", choices=["fixed"], help="Model set (fixed 3 models)")
    parser.add_argument("--bcp-score-type", type=str, default="frequency",
                        choices=['frequency', 'selfeval', 'logprob', 'ensemble'],
                        help="Score type for BCI")
    # --as-score-type removed; MACI uses ensemble by default
    parser.add_argument("--as-mode", type=str, default="subgroup_optimized", choices=["standard", "subgroup_optimized"], help="AS variant")
    parser.add_argument("--conditional-groups", type=str, nargs='*',
                        default=['false_claim_risk', 'medicalcontent', 'view_count'],
                        choices=['false_claim_risk', 'medicalcontent', 'view_count'],
                        help="Conditional groups to analyze")
    parser.add_argument("--view-metadata-csv", type=str, default=None,
                        help="Optional CSV for view_count grouper; defaults to repo-relative data path")
    parser.add_argument("--binning-method", type=str, default="quantile",
                        choices=['quantile', 'equal_width'],
                        help="Binning method for conditional groups")
    parser.add_argument("--limit-samples", type=int, default=2000, help="Max number of samples to load")
    parser.add_argument("--target-tpr", type=float, default=0.8, help="Target TPR for subgroup-optimized AS")

    parser.add_argument("--time-profile", action='store_true', help="Enable timing profile output")
    parser.add_argument("--time-out", type=str, default=None, help="Directory to save timing JSON (defaults to repo_root/analysis/experiment_results)")

    args = parser.parse_args()

    # Resolve repo-relative defaults for any path not given on the CLI.
    repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    if not args.data_dir:
        args.data_dir = os.path.join(repo_root, 'data')
    if not args.log_dir:
        args.log_dir = os.path.join(repo_root, 'logs')
    if not getattr(args, 'time_out', None):
        args.time_out = os.path.join(repo_root, 'analysis', 'experiment_results')

    setup_logging(args.log_dir)
    if args.view_metadata_csv:
        set_view_metadata_csv(args.view_metadata_csv)

    logging.info("=" * 80)
    logging.info(f"📊 Setup: {args.calib_ratio*100:.0f}% calibration + {args.test_ratio*100:.0f}% test, α={args.alpha}, adaptive={args.adaptive_alpha}")
    logging.info(f"🔄 Number of runs: {args.n_runs}")
    logging.info(f"🏷️ BCI Score: {args.bcp_score_type}")
    logging.info(f"🧠 CCI: enabled")
    logging.info(f"🎯 MACI: enabled")
    logging.info(f"📊 Conditional groups: {args.conditional_groups}")
    logging.info(f"🔧 Binning Method: {args.binning_method}")
    logging.info(f"🆕 Using provided scores and enhanced features")

    limit_samples = args.limit_samples
    data = load_1000_samples(args.data_dir, scores_dir=args.scores_dir, dataset_type=args.dataset_type, limit_samples=limit_samples)

    from collections import defaultdict
    all_runs_results = defaultdict(lambda: defaultdict(list))

    # Resolve the requested conditional groupers up front.
    groupers = []
    available_groupers = get_available_groupers()
    for group_name in args.conditional_groups:
        if group_name in available_groupers:
            groupers.append(available_groupers[group_name])
        else:
            logging.warning(f"⚠️ Unknown grouper: {group_name}")

    # Auto-detect the dataset type from the score layout when requested.
    detected_dataset_type = args.dataset_type
    if detected_dataset_type == "auto":
        if data and 'scores' in data[0] and isinstance(data[0]['scores'].get('frequency'), np.ndarray):
            detected_dataset_type = 'medlfqa'
        else:
            detected_dataset_type = 'wikibio'
    logging.info(f"➡️ Using detected dataset type: {detected_dataset_type}")

    # NOTE(review): these paths are never consumed downstream in this
    # function — confirm whether they should feed into data loading.
    factscore_npz_path = None
    if detected_dataset_type == 'wikibio':
        wikibio_npz_path = os.path.join(args.data_dir, "wiki_scores", "wikibio_final_frequencies.npz")

    model_names_to_use = MODEL_NAMES
    logging.info(f" MACI Models: {', '.join(model_names_to_use)}")

    for run_idx in range(args.n_runs):
        logging.info(f"\n🔄 Run {run_idx + 1}/{args.n_runs}")
        logging.info("-" * 50)

        random_seed = args.random_seed + run_idx
        calib_data, test_data, calib_idx, test_idx = create_splits(
            data, args.calib_ratio, args.test_ratio, random_seed=random_seed
        )

        logging.info(f"📊 Run {run_idx + 1} sizes: {len(calib_data)} calib, {len(test_data)} test (seed: {random_seed})")

        results = {}

        try:
            results['BCI'] = run_bcp_experiment(calib_data, test_data, score_type=args.bcp_score_type, alpha=args.alpha)
        except Exception as e:
            logging.error(f"❌ BCP failed: {e}")
            import traceback
            logging.error(f"Traceback: {traceback.format_exc()}")
            results['BCI'] = None

        try:
            results['CCI'] = run_cci_experiment(
                calib_data, test_data,
                alpha=args.alpha,
                boosting_epochs=args.boosting_epochs,
                adaptive_alpha=args.adaptive_alpha,
                retention_target=args.retention_target
            )
        except Exception as e:
            logging.error(f"❌ CCI failed: {e}")
            import traceback
            logging.error(f"Traceback: {traceback.format_exc()}")
            results['CCI'] = None

        results['MACI'] = {
            'coverage': [],
            'retention_rate': [],
            'retained_claims': [],
            'total_claims': [],
            'subgroup_results': {}
        }

        logging.info("--- Starting MACI Experiments ---")
        mace_marginal_results_set = False

        for subgroup_name in args.conditional_groups:
            try:
                mace_subgroup_result = run_as_experiment(
                    calib_data, test_data,
                    model_names=model_names_to_use,
                    alpha=args.alpha,
                    as_mode='subgroup_optimized',
                    subgroup_name=subgroup_name,
                    random_state=random_seed,
                    target_tpr=args.target_tpr
                )

                if mace_subgroup_result and 'filtered_results' in mace_subgroup_result:
                    # Flatten {'sample': {...}} wrappers so coverage helpers
                    # see the claims at the top level.
                    flat_filtered_results = []
                    for res in mace_subgroup_result['filtered_results']:
                        flat_item = dict(res.get('sample', {}))
                        flat_item['filtered_claims'] = res.get('sample', {}).get('filtered_claims', [])
                        flat_filtered_results.append(flat_item)
                    mace_subgroup_result['filtered_results'] = flat_filtered_results

                # The first successful subgroup run supplies MACI's marginal metrics.
                if not mace_marginal_results_set:
                    results['MACI']['coverage'] = mace_subgroup_result['coverage']
                    results['MACI']['retention_rate'] = mace_subgroup_result['retention_rate']
                    results['MACI']['retained_claims'] = mace_subgroup_result['retained_claims']
                    results['MACI']['total_claims'] = mace_subgroup_result['total_claims']
                    results['MACI']['filtered_results'] = mace_subgroup_result.get('filtered_results', [])
                    results['MACI']['timing'] = mace_subgroup_result.get('timing', {})
                    mace_marginal_results_set = True

                target_grouper = available_groupers.get(subgroup_name)
                if target_grouper:
                    try:
                        conditional_results = compute_conditional_coverage(
                            test_data,
                            mace_subgroup_result['filtered_results'],
                            target_grouper,
                            args.alpha,
                            args.binning_method
                        )
                        results['MACI']['subgroup_results'][target_grouper.__class__.__name__] = conditional_results
                    except Exception as e:
                        logging.error(f" ❌ MACI subgroup analysis for {target_grouper.__class__.__name__} failed: {e}")

            except Exception as e:
                logging.error(f"❌ MACI ({subgroup_name}) failed: {e}")
                import traceback
                logging.error(f"Traceback: {traceback.format_exc()}")

        # Fold this run's marginal and subgroup metrics into the aggregates.
        for method_name, result in results.items():
            if not result or result.get('coverage') is None:
                continue

            all_runs_results[method_name]['coverage'].append(result['coverage'])
            all_runs_results[method_name]['retention_rate'].append(result['retention_rate'])
            all_runs_results[method_name]['retained_claims'].append(result['retained_claims'])
            all_runs_results[method_name]['total_claims'].append(result['total_claims'])

            run_subgroup_results = {}
            if method_name == 'MACI':
                run_subgroup_results = result.get('subgroup_results', {})
            else:
                for grouper in groupers:
                    try:
                        conditional_results = compute_conditional_coverage(
                            test_data,
                            result['filtered_results'],
                            grouper,
                            args.alpha,
                            args.binning_method
                        )
                        run_subgroup_results[grouper.__class__.__name__] = conditional_results
                    except Exception as e:
                        logging.error(f" ❌ {grouper.__class__.__name__} failed for {method_name}: {e}")

            all_runs_results[method_name]['subgroup_results'].append(run_subgroup_results)

        logging.info(f"\n📊 Run {run_idx + 1} Results:")
        for method_name, result in results.items():
            if not result or result.get('coverage') is None:
                logging.info(f" {method_name}: ❌ FAILED or SKIPPED")
                continue
            logging.info(f" {method_name}: Coverage={result['coverage']:.4f}, Retention={result['retention_rate']:.3f}, Claims={result['retained_claims']}/{result['total_claims']}")

        if args.time_profile:
            timing_payload = {
                'dataset_type': detected_dataset_type,
                'model_set': args.model_set,
                'boosting_epochs': args.boosting_epochs,
                'adaptive_alpha': args.adaptive_alpha,
                'retention_target': args.retention_target,
                'run_idx': run_idx,
                # [FIXED] results['CCI'] is None when the method failed;
                # .get('CCI', {}) would then raise AttributeError.
                'CCI': (results.get('CCI') or {}).get('timing', {}),
                'MACI': {}
            }
            try:
                first_subgroup = next(iter(results['MACI'].get('subgroup_results', {}).keys()), None)
                if first_subgroup:
                    mace_timing = results['MACI'].get('timing')
                    timing_payload['MACI'] = mace_timing if mace_timing else {}
            except Exception:
                pass
            # (Removed a no-op fallback that re-assigned {} to an already-empty entry.)

            os.makedirs(args.time_out, exist_ok=True)
            tstamp = datetime.now().strftime('%Y%m%d-%H%M%S')
            timing_path = os.path.join(args.time_out, f"time_profile_{detected_dataset_type}_{args.model_set}_{tstamp}.json")
            with open(timing_path, 'w', encoding='utf-8') as f:
                json.dump(timing_payload, f, indent=2, ensure_ascii=False)
            logging.info(f"⏱️ Saved timing profile to {timing_path}")

        # --- Optional debug dumps of claims (first run only) ---
        # Helpers are defined once; the original duplicated them per section.
        def _get_claim_text(c: Dict[str, Any]) -> str:
            if not isinstance(c, dict):
                return str(c)
            return c.get('atom') or c.get('text') or c.get('claim') or c.get('fact') or str(c)

        def _get_claim_support(c: Dict[str, Any]) -> str:
            if isinstance(c, dict):
                v = c.get('is_supported')
                if isinstance(v, (bool, np.bool_)):
                    return 'T' if bool(v) else 'F'
            return '?'

        def _filtered_pairs(item):
            if not item:
                return []
            claims = item.get('filtered_claims')
            if claims is None and isinstance(item.get('sample'), dict):
                claims = item['sample'].get('filtered_claims', [])
            return [(_get_claim_text(c), _get_claim_support(c)) for c in (claims or [])]

        def _dump_sample(idx: int) -> None:
            """Log original and per-method filtered claims for test sample idx."""
            sample = test_data[idx]['sample']
            prompt = sample.get('prompt', '')
            original_pairs = [(_get_claim_text(c), _get_claim_support(c)) for c in sample.get('atomic_facts', [])]
            # [FIXED] The lookup key was 'MACE', which never exists in
            # `results` (the method is stored under 'MACI'), so the MACI dump
            # was always empty. Also guard against None entries for failed
            # methods.
            bci_item = (results.get('BCI') or {}).get('filtered_results', [None] * len(test_data))[idx]
            cci_item = (results.get('CCI') or {}).get('filtered_results', [None] * len(test_data))[idx]
            mace_item = (results.get('MACI') or {}).get('filtered_results', [None] * len(test_data))[idx]
            logging.info("\n=== SAMPLE CLAIMS DUMP ===")
            logging.info(f"[Test idx={idx}] Prompt: {prompt}")
            logging.info(f"Original claims ({len(original_pairs)}):")
            for i, (t, lab) in enumerate(original_pairs, 1):
                logging.info(f" {i:2d}. [{lab}] {t}")
            for label, pairs in (("BCI", _filtered_pairs(bci_item)),
                                 ("CCI", _filtered_pairs(cci_item)),
                                 ("MACI", _filtered_pairs(mace_item))):
                logging.info(f"\n[{label}] filtered claims ({len(pairs)}):")
                for i, (t, lab) in enumerate(pairs, 1):
                    logging.info(f" {i:2d}. [{lab}] {t}")

        if run_idx == 0 and getattr(args, 'show_sample_idx', None) is not None and args.show_sample_idx >= 0:
            idx = int(args.show_sample_idx)
            if 0 <= idx < len(test_data):
                _dump_sample(idx)

        if run_idx == 0 and getattr(args, 'show_sample_count', 0) > 0:
            for idx in range(min(int(args.show_sample_count), len(test_data))):
                _dump_sample(idx)

    # --- Aggregated summary across all runs ---
    logging.info("\n" + "=" * 100)
    logging.info("📊 AGGREGATED RESULTS (All Runs)")
    logging.info("=" * 100)
    for method_name in sorted(all_runs_results.keys()):
        method_results = all_runs_results[method_name]

        if not method_results['coverage']:
            logging.info(f"\n{method_name}: ❌ NO SUCCESSFUL RUNS")
            continue

        n_runs = len(method_results['coverage'])
        coverage_mean = np.mean(method_results['coverage'])
        coverage_std = np.std(method_results['coverage'])
        retention_mean = np.mean(method_results['retention_rate'])
        retention_std = np.std(method_results['retention_rate'])
        retained_claims_mean = np.mean(method_results['retained_claims'])
        retained_claims_std = np.std(method_results['retained_claims'])
        total_claims_mean = np.mean(method_results['total_claims'])

        logging.info(f"\n{'='*20} {method_name} ({n_runs} runs) {'='*20}")
        logging.info(f"📈 MARGINAL RESULTS:")
        logging.info(f" Coverage: {coverage_mean:.4f} ± {coverage_std:.4f}")
        logging.info(f" Retention Rate: {retention_mean:.3f} ± {retention_std:.3f}")
        logging.info(f" Claims: {retained_claims_mean:.1f} ± {retained_claims_std:.1f}/{total_claims_mean:.1f}")

        if method_results['subgroup_results']:
            logging.info(f"\n📊 SUBGROUP RESULTS:")

            # Pool per-run subgroup metrics: grouper -> group -> metric lists.
            subgroup_data = {}
            for run_results in method_results['subgroup_results']:
                for grouper_name, grouper_results in run_results.items():
                    if grouper_name not in subgroup_data:
                        subgroup_data[grouper_name] = {}

                    for group_name, group_result in grouper_results.items():
                        if group_name not in subgroup_data[grouper_name]:
                            subgroup_data[grouper_name][group_name] = {
                                'coverage': [], 'retention_rate': [], 'retained_claims': [],
                                'total_claims': [], 'size': []
                            }

                        subgroup_data[grouper_name][group_name]['coverage'].append(group_result['coverage'])
                        subgroup_data[grouper_name][group_name]['retention_rate'].append(group_result['retention_rate'])
                        subgroup_data[grouper_name][group_name]['retained_claims'].append(group_result['retained_claims'])
                        subgroup_data[grouper_name][group_name]['total_claims'].append(group_result['total_claims'])
                        subgroup_data[grouper_name][group_name]['size'].append(group_result['size'])

            for grouper_name, groups in subgroup_data.items():
                logging.info(f"\n 🔍 {grouper_name}:")

                for group_name, group_data in groups.items():
                    if not group_data['coverage']:
                        continue

                    group_coverage_mean = np.mean(group_data['coverage'])
                    group_coverage_std = np.std(group_data['coverage'])
                    group_retention_mean = np.mean(group_data['retention_rate'])
                    group_retention_std = np.std(group_data['retention_rate'])
                    group_retained_claims_mean = np.mean(group_data['retained_claims'])
                    group_retained_claims_std = np.std(group_data['retained_claims'])
                    group_total_claims_mean = np.mean(group_data['total_claims'])
                    group_size_mean = np.mean(group_data['size'])

                    target_coverage = 1 - args.alpha
                    violation_marker = "⚠️ " if abs(group_coverage_mean - target_coverage) > 0.014 else "✅ "

                    logging.info(f" {violation_marker}{group_name}:")
                    logging.info(f" Coverage: {group_coverage_mean:.3f} ± {group_coverage_std:.3f} (target: {target_coverage:.1f})")
                    logging.info(f" Retention: {group_retention_mean:.3f} ± {group_retention_std:.3f}")
                    logging.info(f" Claims: {group_retained_claims_mean:.1f} ± {group_retained_claims_std:.1f}/{group_total_claims_mean:.1f}")
                    logging.info(f" Group size: {group_size_mean:.1f} samples")
                    logging.info(f" Coverage gap: {group_coverage_mean - target_coverage:+.3f}")

    logging.info("\n" + "=" * 100)

    save_aggregated_results_to_json(all_runs_results, args)


if __name__ == "__main__":
    main()
MACI-main/requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy==2.0.2
2
+ scipy==1.13.1
3
+ scikit-learn==1.6.1
4
+ pandas==2.3.1
5
+ matplotlib==3.9.4
6
+ seaborn==0.13.2
7
+ tqdm==4.67.1
8
+ cvxpy==1.7.1
9
+ conditionalconformal==0.0.5
10
+ torch==2.8.0
11
+ torchvision==0.23.0
12
+ torchaudio==2.8.0