Miaode commited on
Commit
8499730
·
verified ·
1 Parent(s): 2917cfc

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +14 -0
  2. DataFlow/OpenSeek_CoT/Openseek_Math_Chain_of_Thoughts_pipeline.py.py +137 -0
  3. DataFlow/dataflow/__init__.py +14 -0
  4. DataFlow/dataflow/cli.py +80 -0
  5. DataFlow/dataflow/core/LLMServing.py +27 -0
  6. DataFlow/dataflow/core/Operator.py +31 -0
  7. DataFlow/dataflow/core/__init__.py +7 -0
  8. DataFlow/dataflow/core/__pycache__/LLMServing.cpython-310.pyc +0 -0
  9. DataFlow/dataflow/core/__pycache__/Operator.cpython-310.pyc +0 -0
  10. DataFlow/dataflow/core/__pycache__/__init__.cpython-310.pyc +0 -0
  11. DataFlow/dataflow/logger.py +38 -0
  12. DataFlow/dataflow/operators/__init__.py +4 -0
  13. DataFlow/dataflow/operators/eval/AgenticRAG/statistics/f1_scorer.py +108 -0
  14. DataFlow/dataflow/operators/eval/GeneralText/APIcaller/__pycache__/perspective_scorer.cpython-310.pyc +0 -0
  15. DataFlow/dataflow/operators/eval/GeneralText/APIcaller/meta_scorer.py +70 -0
  16. DataFlow/dataflow/operators/eval/GeneralText/APIcaller/treeinstruct_scorer.py +53 -0
  17. DataFlow/dataflow/operators/eval/GeneralText/__init__.py +55 -0
  18. DataFlow/dataflow/operators/eval/GeneralText/diversity/__pycache__/task2vec_scorer.cpython-310.pyc +0 -0
  19. DataFlow/dataflow/operators/eval/GeneralText/diversity/__pycache__/vendi_scorer.cpython-310.pyc +0 -0
  20. DataFlow/dataflow/operators/eval/GeneralText/diversity/task2vec/__pycache__/task2vec.cpython-310.pyc +0 -0
  21. DataFlow/dataflow/operators/eval/GeneralText/diversity/task2vec/__pycache__/task_similarity.cpython-310.pyc +0 -0
  22. DataFlow/dataflow/operators/eval/GeneralText/diversity/task2vec/__pycache__/utils.cpython-310.pyc +0 -0
  23. DataFlow/dataflow/operators/eval/GeneralText/diversity/task2vec/task2vec.py +544 -0
  24. DataFlow/dataflow/operators/eval/GeneralText/diversity/task2vec/task_similarity.py +485 -0
  25. DataFlow/dataflow/operators/eval/GeneralText/diversity/task2vec/utils.py +76 -0
  26. DataFlow/dataflow/operators/eval/GeneralText/diversity/task2vec_scorer.py +76 -0
  27. DataFlow/dataflow/operators/eval/GeneralText/diversity/vendi_scorer.py +36 -0
  28. DataFlow/dataflow/operators/eval/GeneralText/gen/__pycache__/bert_scorer.cpython-310.pyc +0 -0
  29. DataFlow/dataflow/operators/eval/GeneralText/gen/__pycache__/bleu_scorer.cpython-310.pyc +0 -0
  30. DataFlow/dataflow/operators/eval/GeneralText/gen/__pycache__/cider_scorer.cpython-310.pyc +0 -0
  31. DataFlow/dataflow/operators/eval/GeneralText/gen/bert_scorer.py +46 -0
  32. DataFlow/dataflow/operators/eval/GeneralText/gen/bleu/__init__.py +0 -0
  33. DataFlow/dataflow/operators/eval/GeneralText/gen/bleu/__pycache__/__init__.cpython-310.pyc +0 -0
  34. DataFlow/dataflow/operators/eval/GeneralText/gen/bleu/__pycache__/bleu.cpython-310.pyc +0 -0
  35. DataFlow/dataflow/operators/eval/GeneralText/gen/bleu/bleu.py +236 -0
  36. DataFlow/dataflow/operators/eval/GeneralText/gen/bleu_scorer.py +47 -0
  37. DataFlow/dataflow/operators/eval/GeneralText/gen/cider/__init__.py +0 -0
  38. DataFlow/dataflow/operators/eval/GeneralText/gen/cider/__pycache__/__init__.cpython-310.pyc +0 -0
  39. DataFlow/dataflow/operators/eval/GeneralText/gen/cider/__pycache__/cider.cpython-310.pyc +0 -0
  40. DataFlow/dataflow/operators/eval/GeneralText/gen/cider/cider.py +134 -0
  41. DataFlow/dataflow/operators/eval/GeneralText/gen/cider_scorer.py +60 -0
  42. DataFlow/dataflow/operators/eval/GeneralText/models/Kenlm/__pycache__/model.cpython-310.pyc +0 -0
  43. DataFlow/dataflow/operators/eval/GeneralText/models/Kenlm/model.py +161 -0
  44. DataFlow/dataflow/operators/eval/GeneralText/models/Qurating/__pycache__/qurater_annotate.cpython-310.pyc +0 -0
  45. DataFlow/dataflow/operators/eval/GeneralText/models/Qurating/modeling/__pycache__/modeling_flash_llama.cpython-310.pyc +0 -0
  46. DataFlow/dataflow/operators/eval/GeneralText/models/Qurating/modeling/modeling_flash_llama.py +853 -0
  47. DataFlow/dataflow/operators/eval/GeneralText/models/Qurating/qurater_annotate.py +190 -0
  48. DataFlow/dataflow/operators/eval/GeneralText/models/Superfiltering/__pycache__/data_analysis.cpython-310.pyc +0 -0
  49. DataFlow/dataflow/operators/eval/GeneralText/models/Superfiltering/data_analysis.py +53 -0
  50. DataFlow/dataflow/operators/eval/GeneralText/models/__pycache__/debertav3_scorer.cpython-310.pyc +0 -0
.gitattributes CHANGED
@@ -34,3 +34,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  baidu.zip filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  baidu.zip filter=lfs diff=lfs merge=lfs -text
37
+ Qwen2.5-Math/evaluation/latex2sympy/antlr-4.11.1-complete.jar filter=lfs diff=lfs merge=lfs -text
38
+ report.pdf filter=lfs diff=lfs merge=lfs -text
39
+ Qwen2.5-Math/evaluation/data/tabmwp/test.jsonl filter=lfs diff=lfs merge=lfs -text
40
+ Reproducibility/evaluation_log/result/ernie_openseek/math/test_qwen25-math-cot_-1_seed0_t0.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
41
+ Reproducibility/cleaned_data/for_erniekit_training/sft_dataflow_finemath.jsonl filter=lfs diff=lfs merge=lfs -text
42
+ Qwen2.5-Math/evaluation/latex2sympy/gen/__pycache__/PSLexer.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
43
+ Reproducibility/cleaned_data/for_erniekit_training/sft_dataflow_dolmino.jsonl filter=lfs diff=lfs merge=lfs -text
44
+ Reproducibility/cleaned_data/dataflow_finemath.jsonl filter=lfs diff=lfs merge=lfs -text
45
+ Qwen2.5-Math/evaluation/latex2sympy/gen/__pycache__/PSLexer.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
46
+ Reproducibility/evaluation_log/result/ernie_dataflow/math/test_qwen25-math-cot_-1_seed0_t0.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
47
+ Qwen2.5-Math/evaluation/latex2sympy/gen/__pycache__/PSParser.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
48
+ Qwen2.5-Math/evaluation/latex2sympy/gen/__pycache__/PSParser.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
49
+ Reproducibility/cleaned_data/dataflow_dolmino.jsonl filter=lfs diff=lfs merge=lfs -text
50
+ ernie/ERNIE/examples/pre-training/demo_data/data-1-part1.idx filter=lfs diff=lfs merge=lfs -text
DataFlow/OpenSeek_CoT/Openseek_Math_Chain_of_Thoughts_pipeline.py.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataflow.operators.generate import (
    QuestionCategoryClassifier,
    QuestionDifficultyClassifier,
    QuestionGenerator,
    AnswerGenerator,
)

from dataflow.operators.filter import (
    QuestionFilter,
    AnswerPipelineRoot,
    AnswerFormatterFilter,
    AnswerTokenLengthFilter,
    AnswerGroundTruthFilter,
    AnswerNgramFilter,
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request, LocalModelLLMServing


class ReasoningPipeline():
    """Chain-of-thought pipeline over a math corpus.

    Steps: filter bad questions, classify difficulty and category,
    generate CoT answers, then filter the answers by format, token
    length and n-gram repetition.  Each step reads from and writes to
    the step-wise file cache held by ``self.storage``.
    """

    def __init__(self):
        # Step-wise jsonl cache over the raw input corpus.
        self.storage = FileStorage(
            first_entry_file_name="/cpfs/user/boyuan/verl_workspace/baidu/data/fulldata/math/second_half_math.jsonl",
            cache_path="./second_half_math",
            file_name_prefix="dataflow_cache_step",
            cache_type="jsonl",
        )

        # Remote API endpoint shared by every LLM-backed operator below.
        llm_serving = APILLMServing_request(
            api_url="http://10.39.1.99:23456/v1/chat/completions",
            model_name="ernie300",
            max_workers=50
        )

        # ---- question-side operators ----
        self.question_filter_step3 = QuestionFilter(
            system_prompt="You are an expert in evaluating mathematical problems. Follow the user's instructions strictly and output your final judgment in the required JSON format.",
            llm_serving=llm_serving
        )
        self.question_difficulty_classifier_step4 = QuestionDifficultyClassifier(
            llm_serving=llm_serving
        )
        self.question_category_classifier_step5 = QuestionCategoryClassifier(
            llm_serving=llm_serving
        )

        # ---- answer-side operators ----
        self.answer_generator_step7 = AnswerGenerator(
            llm_serving=llm_serving
        )
        self.answer_format_filter_step8 = AnswerFormatterFilter()
        self.answer_token_length_filter_step9 = AnswerTokenLengthFilter(
            max_answer_token_length=8192,
            tokenizer_dir="/cpfs/user/boyuan/verl_workspace/baidu/models300/qwen3",
        )
        # Instantiated but not run in forward(); kept for optional
        # ground-truth comparison runs.
        self.answer_groundtruth_filter_step10 = AnswerGroundTruthFilter()
        self.answer_ngram_filter_step11 = AnswerNgramFilter(
            min_score=0.1,
            max_score=1.0,
            ngrams=5
        )

    def forward(self):
        """Run the enabled pipeline steps in order."""
        self.question_filter_step3.run(
            storage=self.storage.step(),
            input_key="instruction",
        )

        self.question_difficulty_classifier_step4.run(
            storage=self.storage.step(),
            input_key="instruction",
            output_key="question_difficulty"
        )

        self.question_category_classifier_step5.run(
            storage=self.storage.step(),
            input_key="instruction",
            output_key="question_category"
        )

        self.answer_generator_step7.run(
            storage=self.storage.step(),
            input_key="instruction",
            output_key="generated_cot"
        )
        self.answer_format_filter_step8.run(
            storage=self.storage.step(),
            input_key="generated_cot",
        )
        self.answer_token_length_filter_step9.run(
            storage=self.storage.step(),
            input_key="generated_cot"
        )
        self.answer_ngram_filter_step11.run(
            storage=self.storage.step(),
            question_key="instruction",
            answer_key="generated_cot"
        )


if __name__ == "__main__":
    model = ReasoningPipeline()
    model.forward()
DataFlow/dataflow/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from .utils import *
from .version import __version__, version_info
from .logger import get_logger
from .operators import *

# BUG FIX: `hello` is a public function defined below but was missing from
# `__all__`, so `from dataflow import *` did not export it.
__all__ = [
    '__version__',
    'version_info',
    'get_logger',
    'hello',
]


def hello():
    """Return a short greeting; used as an install smoke test."""
    return "Hello from open-dataflow!"
DataFlow/dataflow/cli.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import argparse
import shutil
import requests

from colorama import init, Fore, Style

# from dataflow.utils.paths import BencoPath
from dataflow.cli_funcs import cli_env, cli_init
import importlib.metadata

PYPI_API_URL = 'https://pypi.org/pypi/open-dataflow/json'
from dataflow.version import __version__


def _terminal_width() -> int:
    """Width of the attached terminal in columns.

    BUG FIX: the original used ``os.get_terminal_size()``, which raises
    OSError when stdout is not a tty (piped output, CI).  ``shutil``'s
    variant honours the COLUMNS env var and falls back to 80.
    """
    return shutil.get_terminal_size().columns


def version_and_check_for_updates():
    """Print the local version and compare against the newest PyPI release."""
    # Draw a full-width separator bar.
    print(Fore.BLUE + "=" * _terminal_width() + Style.RESET_ALL)
    print(f'open-dataflow codebase version: {__version__}')
    try:
        response = requests.get(PYPI_API_URL, timeout=5)
        response.raise_for_status()  # raise on any non-2xx status code
        pypi_data = response.json()
        cloud_version = pypi_data['info']['version']  # newest published version
        print("\tChecking for updates...")
        print("\tLocal version: ", __version__)
        print("\tPyPI newest version: ", cloud_version)

        local_version = __version__

        if cloud_version != local_version:
            print(Fore.YELLOW + f"New version available: {cloud_version}. Your version: {local_version}." + Style.RESET_ALL)
            print("Run 'pip install --upgrade open-dataflow' to upgrade.")
        else:
            print(Fore.GREEN + f"You are using the latest version: {local_version}." + Style.RESET_ALL)
    except requests.exceptions.RequestException as e:
        # Network failure is non-fatal: report and continue.
        print(Fore.RED + "Failed to check for updates from PyPI. Please check your internet connection." + Style.RESET_ALL)
        print(f"Error: {e}")
    print(Fore.BLUE + "=" * _terminal_width() + Style.RESET_ALL)


def main():
    """Entry point for the DataFlow command line interface."""
    parser = argparse.ArgumentParser(description='Command line interface for DataFlow, with codebase version: ' + __version__)

    # Version flag; the PyPI update check only runs when the user asks for it.
    parser.add_argument('-v', '--version', action='store_true', help="Show the version of the tool")

    subparsers = parser.add_subparsers(dest='command', required=False)

    # `init` command: scaffold scripts and configs into the current directory.
    parser_init = subparsers.add_parser('init', help='Initialize the scripts and configs in a directory')
    init_subparsers = parser_init.add_subparsers(dest='subcommand', required=False)

    # `init all`
    parser_init_all = init_subparsers.add_parser('all', help='Initialize all components')
    parser_init_all.set_defaults(subcommand='all')

    # `init reasoning`
    parser_init_reasoning = init_subparsers.add_parser('reasoning', help='Initialize reasoning components')
    parser_init_reasoning.set_defaults(subcommand='reasoning')

    # `env` command: report environment information.
    parser_env = subparsers.add_parser('env', help='Show environment information')

    args = parser.parse_args()
    if args.version:
        version_and_check_for_updates()

    if args.command == 'init':
        if args.subcommand is None:
            args.subcommand = 'base'  # default component set when none is given
        cli_init(subcommand=args.subcommand)
        from dataflow.cli_funcs.paths import DataFlowPath
    elif args.command == 'env':
        cli_env()


if __name__ == '__main__':
    main()
DataFlow/dataflow/core/LLMServing.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from abc import ABC, abstractmethod
from typing import Any, List


class LLMServingABC(ABC):
    """Interface for LLM serving backends (local models or remote APIs).

    Operators depend on this contract to turn batches of prompts into
    batches of generated text, regardless of where the model runs.
    """

    @abstractmethod
    def generate_from_input(self, user_inputs: List[str], system_prompt: str) -> List[str]:
        """Produce one generated string per entry of ``user_inputs``."""
        pass

    @abstractmethod
    def cleanup(self):
        """Tear the backend down and reclaim any GPU/CPU memory it holds."""
        pass

    def load_model(self, model_name_or_path: str, **kwargs: Any):
        """Load model weights from ``model_name_or_path``.

        Optional hook: backends that manage local weights override this;
        the default signals that loading is unsupported.
        """
        raise NotImplementedError("This method should be implemented by subclasses.")
DataFlow/dataflow/core/Operator.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from abc import ABC, abstractmethod
from dataflow.logger import get_logger


class OperatorABC(ABC):
    """Base class every DataFlow operator derives from."""

    @abstractmethod
    def run(self) -> None:
        """Main function to run the operator."""
        pass


def get_operator(operator_name, args) -> OperatorABC:
    """Look up ``operator_name`` in the operator registry and instantiate it.

    Args:
        operator_name: registered name of the operator class.
        args: constructor arguments forwarded to the operator class.

    Returns:
        The instantiated operator.

    Raises:
        KeyError: if no operator is registered under ``operator_name``.
    """
    # Local import mirrors the original code (avoids a circular import
    # between dataflow.core and dataflow.utils).
    from dataflow.utils import OPERATOR_REGISTRY
    logger = get_logger()
    operator_cls = OPERATOR_REGISTRY.get(operator_name)
    # BUG FIX: the original did ``OPERATOR_REGISTRY.get(name)(args)`` in one
    # step, so an unknown name raised a bare TypeError ("'NoneType' object is
    # not callable") and the error-logging / assert path below it never ran.
    if operator_cls is None:
        logger.error(f"operator {operator_name} is not found")
        raise KeyError(f"operator {operator_name} is not registered")
    operator = operator_cls(args)
    logger.info(f"Successfully get operator {operator_name}, args {args}")
    return operator
DataFlow/dataflow/core/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from .Operator import OperatorABC, get_operator
2
+ from .LLMServing import LLMServingABC
3
+ __all__ = [
4
+ 'OperatorABC',
5
+ 'get_operator',
6
+ 'LLMServingABC',
7
+ ]
DataFlow/dataflow/core/__pycache__/LLMServing.cpython-310.pyc ADDED
Binary file (1.5 kB). View file
 
DataFlow/dataflow/core/__pycache__/Operator.cpython-310.pyc ADDED
Binary file (1.05 kB). View file
 
DataFlow/dataflow/core/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (317 Bytes). View file
 
DataFlow/dataflow/logger.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
import colorlog

# Custom log level sitting between INFO (20) and WARNING (30).
SUCCESS_LEVEL_NUM = 25
logging.addLevelName(SUCCESS_LEVEL_NUM, "SUCCESS")


def success(self, message, *args, **kwargs):
    """Log ``message`` at the custom SUCCESS level."""
    if self.isEnabledFor(SUCCESS_LEVEL_NUM):
        self._log(SUCCESS_LEVEL_NUM, message, args, **kwargs)


# Attach the helper so every Logger instance gains ``.success(...)``.
logging.Logger.success = success


def get_logger(level=logging.INFO) -> logging.Logger:
    """Return the shared "DataFlow" logger, configuring it on first use.

    Handler/formatter setup runs only when the logger has no handlers yet,
    so repeated calls reuse the existing configuration (and later ``level``
    arguments are ignored after the first call).
    """
    logger = logging.getLogger("DataFlow")
    if not logger.handlers:
        logger.setLevel(level)
        # Console handler with a colourised single-line format carrying
        # file/function/line and process/thread context.
        console_handler = logging.StreamHandler()
        console_handler.setLevel(level)
        color_formatter = colorlog.ColoredFormatter(
            '%(log_color)s %(asctime)s | %(filename)-20s- %(module)-20s- %(funcName)-20s- %(lineno)5d - %(name)-10s | %(levelname)8s | Processno %(process)5d - Threadno %(thread)-15d : %(message)s',
            log_colors={
                'DEBUG': 'cyan',
                # 'INFO': 'white',
                'SUCCESS': 'green',
                'WARNING': 'yellow',
                'ERROR': 'red',
                'CRITICAL': 'red,bg_white',
            }
        )
        console_handler.setFormatter(color_formatter)
        logger.addHandler(console_handler)
    return logger
DataFlow/dataflow/operators/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # from .eval import *
2
+ # from .generate import *
3
+ # from .filter import *
4
+ # from .refine import *
DataFlow/dataflow/operators/eval/AgenticRAG/statistics/f1_scorer.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
import string
from collections import Counter
from tqdm import tqdm
import pandas as pd
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger


@OPERATOR_REGISTRY.register()
class F1Scorer(OperatorABC):
    """Token-level F1 between a predicted answer and one or more references.

    Each row scores as the maximum F1 over all of its ground-truth answers,
    using the standard SQuAD/HotpotQA normalization (lowercase, strip
    punctuation and articles, collapse whitespace).
    """

    def __init__(self, prediction_key, ground_truth_key):
        self.logger = get_logger()
        self.logger.info(f"Initializing {self.__class__.__name__}...")
        self.prediction_key = prediction_key        # column holding the model prediction
        self.ground_truth_key = ground_truth_key    # column holding the reference answer(s)
        self.logger.info(f"{self.__class__.__name__} initialized.")

    @staticmethod
    def get_desc(lang: str = "zh"):
        # BUG FIX: the original ignored `lang` and always returned the
        # Chinese description.
        if lang == "zh":
            return "用于评估预测答案与多个参考答案之间的 F1 分数"
        return "Evaluates the F1 score between a predicted answer and multiple reference answers"

    def _validate_dataframe(self, dataframe: pd.DataFrame):
        """Check required input columns exist and the output column does not."""
        required_keys = [self.prediction_key, self.ground_truth_key]
        forbidden_keys = [self.output_key]

        missing = [k for k in required_keys if k not in dataframe.columns]
        conflict = [k for k in forbidden_keys if k in dataframe.columns]

        if missing:
            raise ValueError(f"Missing required column(s): {missing}")
        if conflict:
            raise ValueError(f"The following column(s) already exist and would be overwritten: {conflict}")

    def normalize_answer(self, s: str) -> str:
        """SQuAD-style answer normalization: lowercase, drop punctuation,
        drop articles (a/an/the), collapse whitespace."""
        def remove_articles(text):
            return re.sub(r"\b(a|an|the)\b", " ", text)

        def white_space_fix(text):
            return " ".join(text.split())

        def remove_punc(text):
            exclude = set(string.punctuation)
            return "".join(ch for ch in text if ch not in exclude)

        def lower(text):
            return text.lower()

        return white_space_fix(remove_articles(remove_punc(lower(s))))

    def compute_f1(self, prediction: str, ground_truths) -> float:
        """Best token-level F1 of `prediction` against each reference in
        `ground_truths` (a string or a list of strings); 0.0 when either
        side is missing."""
        if prediction is None or ground_truths is None:
            return 0.0

        if isinstance(ground_truths, str):
            ground_truths = [ground_truths]

        # Loop-invariant hoist: normalize the prediction once instead of
        # once per reference (the original recomputed it in the loop).
        normalized_prediction = self.normalize_answer(prediction)
        pred_tokens = normalized_prediction.split()

        max_f1 = 0.0
        for ground_truth in ground_truths:
            if ground_truth is None:
                continue

            normalized_ground_truth = self.normalize_answer(ground_truth)

            # yes/no/noanswer questions count only as exact matches.
            if normalized_prediction in ["yes", "no", "noanswer"] or normalized_ground_truth in ["yes", "no", "noanswer"]:
                if normalized_prediction != normalized_ground_truth:
                    continue

            gold_tokens = normalized_ground_truth.split()
            common = Counter(pred_tokens) & Counter(gold_tokens)
            num_same = sum(common.values())

            if num_same == 0:
                continue

            precision = num_same / len(pred_tokens)
            recall = num_same / len(gold_tokens)
            f1 = (2 * precision * recall) / (precision + recall)
            max_f1 = max(max_f1, f1)

        return max_f1

    def eval(self, dataframe: pd.DataFrame) -> list:
        """Compute one F1 score per dataframe row."""
        self.logger.info(f"Evaluating {self.output_key}...")
        f1_scores = []

        for _, row in tqdm(dataframe.iterrows(), total=len(dataframe), desc="F1Scorer Evaluating..."):
            prediction = row.get(self.prediction_key, None)
            ground_truths = row.get(self.ground_truth_key, None)
            score = self.compute_f1(prediction, ground_truths)
            f1_scores.append(score)

        self.logger.info("Evaluation complete!")
        return f1_scores

    def run(self, storage: DataFlowStorage, output_key):
        """Read the dataframe from storage, score every row into `output_key`,
        and write the result back."""
        dataframe = storage.read("dataframe")
        self.output_key = output_key
        self._validate_dataframe(dataframe)
        scores = self.eval(dataframe)
        dataframe[self.output_key] = scores
        storage.write(dataframe)
DataFlow/dataflow/operators/eval/GeneralText/APIcaller/__pycache__/perspective_scorer.cpython-310.pyc ADDED
Binary file (2.3 kB). View file
 
DataFlow/dataflow/operators/eval/GeneralText/APIcaller/meta_scorer.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
import pandas as pd
from dataflow.core import LLMServingABC
from dataflow.prompts.general_text import MetaPrompt
import ast


@OPERATOR_REGISTRY.register()
class MetaScorer(OperatorABC):
    """Rates text on six fixed quality dimensions using an LLM judge.

    The judge is expected to end its reply with a Python-style list of six
    numbers; unparseable replies score as NaN on every dimension.
    """

    def __init__(self, llm_serving: LLMServingABC = None):
        self.logger = get_logger()
        self.logger.info(f'Initializing {self.__class__.__name__}...')
        self.llm_serving = llm_serving
        self.score_name = 'MetaScore'
        self.prompt = MetaPrompt()
        self.logger.info(f'{self.__class__.__name__} initialized.')

        # Fixed column order; must line up with the 6-element score lists.
        self.output_columns = [
            "Text Structure",
            "Diversity & Complexity",
            "Fluency & Understandability",
            "Safety",
            "Educational Value",
            "Content Accuracy & Effectiveness"
        ]

    def get_score(self, samples, input_key):
        """Query the judge for every sample and parse the six scores each."""
        system_prompt = self.prompt.build_system_prompt()
        # System prompt is prepended to each user prompt in a single message.
        user_prompts = [
            system_prompt + "\n" + self.prompt.build_user_prompt(sample.get(input_key, ''))
            for sample in samples
        ]

        responses = self.llm_serving.generate_from_input(user_inputs=user_prompts)

        scores = []
        for i, response in enumerate(responses):
            try:
                # The list of six numbers is expected on the reply's last line.
                last_line = response.strip().split("\n")[-1].strip()
                parsed_scores = ast.literal_eval(last_line)
                if not (isinstance(parsed_scores, list) and len(parsed_scores) == 6):
                    raise ValueError("Score format invalid")
                scores.append(parsed_scores)
            except Exception as e:
                # Best-effort: keep row alignment by filling with NaNs.
                self.logger.warning(f"Failed to extract score from response {i}: {e}")
                scores.append([float('nan')] * 6)

        return scores

    def eval(self, dataframe: pd.DataFrame, input_key: str):
        """Score every row of `dataframe`; returns a list of 6-element lists."""
        samples = dataframe.to_dict(orient='records')
        self.logger.info(f"Evaluating {self.score_name}...")
        scores = self.get_score(samples, input_key)
        self.logger.info("Evaluation complete!")
        return scores

    def run(self, storage: DataFlowStorage, input_key: str):
        """Read the dataframe, append the six score columns, write it back."""
        self.input_key = input_key
        dataframe = storage.read("dataframe")
        scores = self.eval(dataframe, self.input_key)
        # Expand the per-row score lists into the six fixed-name columns.
        score_df = pd.DataFrame(scores, columns=self.output_columns)
        dataframe = pd.concat([dataframe, score_df], axis=1)
        storage.write(dataframe)
DataFlow/dataflow/operators/eval/GeneralText/APIcaller/treeinstruct_scorer.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
import pandas as pd
from dataflow.core import LLMServingABC
from dataflow.prompts.general_text import TreeinstructPrompt


@OPERATOR_REGISTRY.register()
class TreeinstructScorer(OperatorABC):
    """Scores instruction complexity via an LLM judge (Tree-Instruct style).

    The judge is expected to put the numeric score as the first token of
    its reply's last line; unparseable replies score as NaN.
    """

    def __init__(self, llm_serving: LLMServingABC = None):
        self.logger = get_logger()
        self.logger.info(f'Initializing {self.__class__.__name__}...')
        self.llm_serving = llm_serving
        self.score_name = 'TreeinstructScore'
        self.prompt = TreeinstructPrompt()
        self.logger.info(f'{self.__class__.__name__} initialized.')

    def get_score(self, samples, input_instruction_key):
        """Build one prompt per sample and parse a numeric score from each reply."""
        system_prompts = []
        user_prompts = []
        for sample in samples:
            # BUG FIX: the original defaulted a missing instruction to the
            # list [''], handing a list to a prompt builder that takes a string.
            instruction = sample.get(input_instruction_key, '')
            system_prompts.append(self.prompt.build_system_prompt(instruction))
            user_prompts.append(self.prompt.build_user_prompt())

        inputs = [system + "\n" + user for system, user in zip(system_prompts, user_prompts)]
        responses = self.llm_serving.generate_from_input(user_inputs=inputs)

        scores = []
        for i, response in enumerate(responses):
            # BUG FIX: the parse was unguarded, so one malformed response
            # aborted the whole batch.  Fall back to NaN and log a warning,
            # consistent with MetaScorer's best-effort handling.
            try:
                score_line = response.strip().split("\n")[-1]
                score = float(score_line.split()[0])
            except Exception as e:
                self.logger.warning(f"Failed to extract score from response {i}: {e}")
                score = float('nan')
            scores.append(score)

        return scores

    def eval(self, dataframe: pd.DataFrame, input_instruction_key: str):
        """Score every row of `dataframe`; returns a list of floats."""
        self.logger.info(f"Evaluating {self.score_name}...")
        samples = dataframe.to_dict(orient='records')
        scores = self.get_score(samples, input_instruction_key)
        self.logger.info("Evaluation complete!")
        return scores

    def run(self, storage: DataFlowStorage, input_instruction_key: str, output_key: str = 'TreeinstructScore'):
        """Read the dataframe, append the score column, and write it back."""
        self.input_instruction_key = input_instruction_key
        self.output_key = output_key
        dataframe = storage.read("dataframe")
        scores = self.eval(dataframe, self.input_instruction_key)
        dataframe[self.output_key] = scores
        storage.write(dataframe)
DataFlow/dataflow/operators/eval/GeneralText/__init__.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from .statistics.ngram_scorer import NgramScorer
2
+ # from .statistics.lexical_diversity_scorer import LexicalDiversityScorer
3
+ # from .statistics.langkit_scorer import LangkitScorer
4
+
5
+ # from .models.deita_quality_scorer import DeitaQualityScorer
6
+ # from .models.instag_scorer import InstagScorer
7
+ # from .models.debertav3_scorer import DebertaV3Scorer
8
+ # from .models.deita_complexity_scorer import DeitaComplexityScorer
9
+ # from .models.fineweb_edu_scorer import FineWebEduScorer
10
+ # from .models.pair_qual_scorer import PairQualScorer
11
+ # from .models.presidio_scorer import PresidioScorer
12
+ # from .models.rm_scorer import RMScorer
13
+ # from .models.textbook_scorer import TextbookScorer
14
+ # from .models.superfiltering_scorer import SuperfilteringScorer
15
+ # from .models.qurating_scorer import QuratingScorer
16
+ # from .models.perplexity_scorer import PerplexityScorer
17
+
18
+ # from .APIcaller.alpagasus_scorer import AlpagasusScorer
19
+ # from .APIcaller.treeinstruct_scorer import TreeinstructScorer
20
+ # from .APIcaller.perspective_scorer import PerspectiveScorer
21
+ # from .APIcaller.meta_scorer import MetaScorer
22
+
23
+ # from .diversity.vendi_scorer import VendiScorer
24
+ # from .diversity.task2vec_scorer import Task2VecScorer
25
+
26
+ # from .gen.bleu_scorer import BleuScorer
27
+ # from .gen.cider_scorer import CiderScorer
28
+ # from .gen.bert_scorer import BERTScorer
29
+
30
+ # __all__ = [
31
+ # 'NgramScorer',
32
+ # 'LexicalDiversityScorer',
33
+ # 'LangkitScorer',
34
+ # 'DeitaQualityScorer',
35
+ # 'InstagScorer',
36
+ # 'DebertaV3Scorer',
37
+ # 'DeitaComplexityScorer',
38
+ # 'FineWebEduScorer',
39
+ # 'PairQualScorer',
40
+ # 'PresidioScorer',
41
+ # 'RMScorer',
42
+ # 'TextbookScorer',
43
+ # 'SuperfilteringScorer',
44
+ # 'QuratingScorer',
45
+ # 'PerplexityScorer',
46
+ # 'AlpagasusScorer',
47
+ # 'TreeinstructScorer',
48
+ # 'PerspectiveScorer',
49
+ # "MetaScorer",
50
+ # 'VendiScorer',
51
+ # 'Task2VecScorer',
52
+ # 'BleuScorer',
53
+ # 'CiderScorer',
54
+ # 'BERTScorer'
55
+ # ]
DataFlow/dataflow/operators/eval/GeneralText/diversity/__pycache__/task2vec_scorer.cpython-310.pyc ADDED
Binary file (4.26 kB). View file
 
DataFlow/dataflow/operators/eval/GeneralText/diversity/__pycache__/vendi_scorer.cpython-310.pyc ADDED
Binary file (1.83 kB). View file
 
DataFlow/dataflow/operators/eval/GeneralText/diversity/task2vec/__pycache__/task2vec.cpython-310.pyc ADDED
Binary file (17.6 kB). View file
 
DataFlow/dataflow/operators/eval/GeneralText/diversity/task2vec/__pycache__/task_similarity.cpython-310.pyc ADDED
Binary file (13.7 kB). View file
 
DataFlow/dataflow/operators/eval/GeneralText/diversity/task2vec/__pycache__/utils.cpython-310.pyc ADDED
Binary file (2.62 kB). View file
 
DataFlow/dataflow/operators/eval/GeneralText/diversity/task2vec/task2vec.py ADDED
@@ -0,0 +1,544 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License"). You
4
+ # may not use this file except in compliance with the License. A copy of
5
+ # the License is located at
6
+ #
7
+ # http://aws.amazon.com/apache2.0/
8
+ #
9
+ # or in the "license" file accompanying this file. This file is
10
+ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11
+ # ANY KIND, either express or implied. See the License for the specific
12
+ # language governing permissions and limitations under the License.
13
+
14
+ import itertools
15
+ import math
16
+ import random
17
+ from abc import ABC, abstractmethod
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+ from torch.optim import Optimizer
23
+ import numpy as np
24
+ from tqdm.auto import tqdm, trange
25
+ import logging
26
+ from torch.utils.data import DataLoader, Dataset
27
+
28
+ from .utils import AverageMeter, get_error, get_device
29
+
30
+ ## LLM DIV
31
def set_seed(seed):
    """Seed every RNG in play (python ``random``, numpy, torch CPU and all CUDA
    devices) so that runs are reproducible."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)
36
+
37
+ ## LLM DIV
38
def get_loss(logits: torch.tensor, targets: torch.tensor, ignore_index=None) -> torch.tensor:
    """
    Computes the cross-entropy loss for either sequence classification or generation.
    """
    # Next-token prediction: drop the last time-step's logits, drop the first
    # target token, then move the vocab axis to where CrossEntropyLoss expects
    # the class dimension.
    assert logits.dim() == 3 and ignore_index is not None
    criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)
    shifted_logits = logits[:, :-1, :].transpose(1, 2)  # (batch, vocab, seq_len - 1)
    shifted_targets = targets[:, 1:]
    return criterion(shifted_logits, shifted_targets)
49
+
50
class Embedding:
    """
    task_embedding = diagonal of the FIM for the filters of size [F_total, 1] total filters for a network.

    Notes:
        - the diagonal of the Fisher Information Matrix for each layer.
        - embedding size should be the size of the total number of filters for the network.
    """

    def __init__(self, hessian, scale, meta=None):
        # Copy the inputs into numpy arrays; `meta` is free-form and kept as-is.
        self.meta = meta
        self.hessian = np.array(hessian)
        self.scale = np.array(scale)

    def __repr__(self):
        return str(self.hessian)
66
+
67
+
68
class ProbeNetwork(ABC, nn.Module):
    """Abstract class that all probe networks should inherit from.

    This is a standard torch.nn.Module but needs to expose a classifier property that returns the final classification
    module (e.g., the last fully connected layer).
    """

    @property
    @abstractmethod
    def classifier(self):
        # Subclasses must return the head module; Task2Vec excludes this module's
        # parameters when extracting the embedding.
        raise NotImplementedError("Override the classifier property to return the submodules of the network that"
                                  " should be interpreted as the classifier")

    @classifier.setter
    @abstractmethod
    def classifier(self, val):
        # Setter counterpart so callers can swap the classification head in place.
        raise NotImplementedError("Override the classifier setter to set the submodules of the network that"
                                  " should be interpreted as the classifier")
86
+
87
+
88
class Task2Vec:
    """Task2Vec embedding generator: a task (dataset) is represented by the
    diagonal of the Fisher Information Matrix (FIM) of a probe network's
    weights, estimated on that dataset.

    Two operating modes:
      - mode='autoregressive' (## LLM DIV fork): the probe is a HuggingFace-style
        causal LM exposing ``lm_head`` and returning an object with ``.logits``;
        only the montecarlo FIM estimator is used.
      - otherwise (classification): the probe is a ProbeNetwork exposing
        ``layers`` and ``classifier``; 'montecarlo' or 'variational' estimators.
    """

    def __init__(self, model: ProbeNetwork, skip_layers=0, max_samples=None, classifier_opts=None,
                 method='montecarlo', method_opts=None, loader_opts=None, bernoulli=False, mode='autoregressive'): ## LLM DIV
        """
        :param model: probe network to embed tasks with.
        :param skip_layers: number of initial layers to skip when caching features
            (classification mode only).
        :param max_samples: cap on the number of samples used when caching features.
        :param classifier_opts: options for fitting/finetuning the classifier head
            (learning_rate, weight_decay, seed, epochs, finetune, break_early, ...).
        :param method: FIM approximation, 'variational' or 'montecarlo'.
        :param method_opts: kwargs forwarded to the chosen fisher routine.
        :param loader_opts: DataLoader options (batch_size, num_workers, ...).
        :param bernoulli: use Bernoulli sampling / BCE loss instead of categorical.
        :param mode: 'autoregressive' for causal LMs, anything else for classification.
        """
        if classifier_opts is None:
            classifier_opts = {}
        if method_opts is None:
            method_opts = {}
        if loader_opts is None:
            loader_opts = {}
        assert method in ('variational', 'montecarlo')
        assert skip_layers >= 0

        self.model = model
        # Fix batch norm running statistics (i.e., put batch_norm layers in eval mode)
        # NOTE(review): train() is called although the comment says eval mode --
        # presumably the probe network overrides train() to freeze BN; confirm.
        self.model.train()
        self.device = get_device(self.model)
        self.skip_layers = skip_layers
        self.max_samples = max_samples
        self.classifier_opts = classifier_opts
        self.method = method
        self.method_opts = method_opts
        self.loader_opts = loader_opts
        self.bernoulli = bernoulli
        self.mode = mode
        if self.mode == "autoregressive":
            # get_loss is a plain function (token-shifted cross-entropy); it has
            # no .to(), so only the nn.Module losses below are moved to device.
            self.loss_fn = get_loss
        else:
            self.loss_fn = nn.CrossEntropyLoss() if not self.bernoulli else nn.BCEWithLogitsLoss()
            self.loss_fn = self.loss_fn.to(self.device)

    def embed(self, dataset: Dataset, epochs: int = 5):
        """Embed ``dataset``: (optionally) fit/finetune the classifier head,
        compute the FIM diagonal, and extract the embedding.

        :returns: autoregressive mode -> ``(embedding, final_finetune_loss)``;
            classification mode -> ``embedding``.
        """
        ## LLM DIV
        # Cache the last layer features (needed to train the classifier) and (if needed) the intermediate layer features
        # so that we can skip the initial layers when computing the embedding
        # dataset.train()
        if self.mode == "autoregressive":
            loss = None
            print(f'{self.classifier_opts=}')
            if self.classifier_opts:  # is it something truthy? e.g., dict with something in it?
                if self.classifier_opts.get('finetune', False):  # finetune only if specified True, else no finetuning if not specified or False.
                    # NOTE(review): setting epochs=0 when finetune is True looks
                    # inverted w.r.t. the comment above and the warning below --
                    # confirm the intended semantics of 'finetune'.
                    epochs = 0
                    print(f'Warning: classifier_opts doesnt specify finetune or break early, thus no finetuning is being done. See: {self.classifier_opts=} {epochs=}')
                    loss = self._finetune_classifier(dataset, loader_opts=self.loader_opts, classifier_opts=self.classifier_opts, max_samples=self.max_samples, epochs=epochs)
                else:
                    loss = self._finetune_classifier(dataset, loader_opts=self.loader_opts, classifier_opts=self.classifier_opts, max_samples=self.max_samples, epochs=epochs)
            else:  # self.classifier_opts might be None or {}
                loss = self._finetune_classifier(dataset, loader_opts=self.loader_opts, classifier_opts=self.classifier_opts, max_samples=self.max_samples, epochs=epochs)
            print(f'{loss=} (after fine tune, if not done it will be None)')
            assert loss is not None, f'Err: {loss=}'
            self.compute_fisher(dataset)
            embedding = self.extract_embedding(self.model)
            return embedding, loss
        else:
            if self.skip_layers > 0:
                self._cache_features(dataset, indexes=(self.skip_layers, -1), loader_opts=self.loader_opts,
                                     max_samples=self.max_samples)
            else:
                self._cache_features(dataset, max_samples=self.max_samples)
            # Fits the last layer classifier using cached features
            self._fit_classifier(**self.classifier_opts)

            if self.skip_layers > 0:
                dataset = torch.utils.data.TensorDataset(self.model.layers[self.skip_layers].input_features,
                                                         self.model.layers[-1].targets)

            # dataset.eval() # I added this so that the embedding is computed on the val set
            self.compute_fisher(dataset)
            embedding = self.extract_embedding(self.model)
            # dataset.train() # returns to using the support set
            return embedding

    ### LLM DIV
    def _finetune_classifier(self, dataset: Dataset, loader_opts: dict = None, classifier_opts: dict = None, max_samples=None, epochs = 5, learning_rate = 5e-5, adam_epsilon = 1e-8):
        """Fits the last layer of the HuggingFace transformer probe network.

        Only ``self.model.lm_head`` parameters are optimized (linear-probe style
        finetuning); values in ``classifier_opts`` override the keyword defaults.
        NOTE(review): ``max_samples`` is accepted but never used in this method.

        :returns: the last batch's loss value (float).
        """
        logging.info("Finetune classifier...")
        if loader_opts is None:
            loader_opts = {}
        if classifier_opts is None:
            classifier_opts = {}
        data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 8),
                                 num_workers=loader_opts.get('num_workers', 0), drop_last=False)

        device = next(self.model.parameters()).device
        print("MODEL DEVICE: ", device)

        # num_examples = int(classifier_opts.get("task_batch_size", 256) / loader_opts.get('batch_size', 8))
        num_examples = len(list(data_loader)) # not ideal but it's quicker in dev time, usually we won't feed the entire data set to task2vec so this should be fine
        n_batches = num_examples

        # Only the LM head is trainable; everything else keeps its weights.
        optimizer_grouped_parameters = [
            {'params': [p for p in self.model.lm_head.parameters()],
             'weight_decay': classifier_opts.get("weight_decay",0.0001)},
        ]

        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=classifier_opts.get("learning_rate",learning_rate), eps=classifier_opts.get("adam_epsilon",adam_epsilon))

        # Train!
        logging.info("***** Running training *****")
        # logging.info("  Num examples = %d", num_examples)
        logging.info("  Num Epochs = %d", epochs)
        logging.info("  Batch size = %d", loader_opts.get('batch_size', 8))

        train_iterator = trange(classifier_opts.get("epochs", epochs), desc="Epoch", leave=False)
        set_seed(classifier_opts.get("seed", 42))  # Added here for reproductibility (even between python 2 and 3)

        self.model.train()
        for epoch in train_iterator:
            metrics = AverageMeter()
            epoch_iterator = tqdm(data_loader, desc="Iteration", total=n_batches, leave=False)
            for step, batch in enumerate(epoch_iterator):
                optimizer.zero_grad()
                inputs = {'input_ids': batch['input_ids'].to(device),
                          'attention_mask': batch['attention_mask'].to(device)}
                logits = self.model(**inputs, labels=inputs["input_ids"]).logits
                # ignore_index=50256 -- presumably the GPT-2 eos/pad token id;
                # TODO confirm it matches the tokenizer in use.
                loss = self.loss_fn(logits, inputs["input_ids"], ignore_index=50256)
                print(f'\nInitial loss {loss.item()} ({step=} {epoch=})') if step == 0 else None
                error = get_error(logits, inputs['input_ids'], ignore_index=50256)
                loss.backward()
                optimizer.step()

                metrics.update(n=batch['input_ids'].shape[0], loss=loss.item(), error=error)
                epoch_iterator.update(1)

                if classifier_opts.get("break_early", False):
                    print("----> breaking early")
                    break
            # break_early also exits the epoch loop, not just the batch loop.
            if classifier_opts.get("break_early", False):
                break
            logging.info(f"[epoch {epoch}]: " + "\t".join(f"{k}: {v}" for k, v in metrics.avg.items()))
        print(f'\nfinal loss {step=} {epoch=} of final layer loss {loss.item()} (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)')
        return loss.item()

    ### LLM DIV
    def montecarlo_fisher_autoregressive(self, dataset: Dataset, epochs: int = 1):
        """Monte-Carlo estimate of the FIM diagonal for a causal LM.

        For each batch, the target sequence is sampled from the model's own
        output distribution (as the FIM definition requires, not from the
        dataset labels); squared per-parameter gradients are accumulated in
        ``p.grad2_acc`` and averaged over ``p.grad_counter`` at the end.
        """
        logging.info("Using montecarlo Fisher")
        if self.loader_opts is None:
            loader_opts = {}
        else:
            loader_opts = self.loader_opts

        data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 8),
                                 num_workers=loader_opts.get('num_workers', 0), drop_last=False)
        device = get_device(self.model)

        # num_examples = int(classifier_opts.get("task_batch_size", 256) / loader_opts.get('batch_size', 8))
        num_examples = len(list(data_loader)) # not idea but it's quicker in dev time, usually we won't feed the entire data set to task2vec so this should be fine
        n_batches = num_examples

        logging.info("Computing Fisher...")
        for p in self.model.parameters():
            p.grad2_acc = torch.zeros_like(p.data)
            p.grad_counter = 0

        for k in range(epochs):
            logging.info(f"\tepoch {k + 1}/{epochs}")

            epoch_iterator = tqdm(data_loader, desc="Iteration", total=n_batches, leave=False)
            for step, batch in enumerate(epoch_iterator):
                inputs = {'input_ids': batch['input_ids'].to(device),
                          'attention_mask': batch['attention_mask'].to(device)}
                logits = self.model(**inputs, labels=inputs["input_ids"]).logits

                # The gradients used to compute the FIM needs to be for y sampled from
                # the model distribution y ~ p_w(y|x), not for y from the dataset
                if self.bernoulli:
                    target = torch.bernoulli(F.sigmoid(logits[:,:-1,:])).detach()
                else:
                    # Sample one token per position, per sequence in the batch.
                    softmax_output = F.softmax(logits, dim=-1)
                    lst = [torch.multinomial(softmax_output[i,:,:], 1).detach().view(-1) for i in range(len(softmax_output))]
                    target = torch.stack(lst, dim=0)

                loss = self.loss_fn(logits, target, ignore_index=50256)
                self.model.zero_grad()
                loss.backward()
                for p in self.model.parameters():
                    if p.grad is not None:
                        p.grad2_acc += p.grad.data ** 2
                        p.grad_counter += 1
                if self.classifier_opts.get("break_early", False):
                    break # for debugging faster, otherwise FIM is really slow
            if self.classifier_opts.get("break_early", False):
                break # for debugging faster, otherwise FIM is really slow
        # Average the accumulated squared gradients; drop params that never got one.
        for p in self.model.parameters():
            if p.grad_counter == 0:
                del p.grad2_acc
            else:
                p.grad2_acc /= p.grad_counter
        logging.info("done")

    def montecarlo_fisher(self, dataset: Dataset, epochs: int = 1):
        """Monte-Carlo FIM diagonal for the classification probe network
        (same accumulation scheme as the autoregressive variant)."""
        logging.info("Using montecarlo Fisher")
        if self.skip_layers > 0:
            # Replace the dataset with the cached intermediate features so that
            # the forward pass can start from self.skip_layers.
            dataset = torch.utils.data.TensorDataset(self.model.layers[self.skip_layers].input_features,
                                                     self.model.layers[-1].targets)
        data_loader = _get_loader(dataset, **self.loader_opts)
        device = get_device(self.model)
        logging.info("Computing Fisher...")

        for p in self.model.parameters():
            p.grad2_acc = torch.zeros_like(p.data)
            p.grad_counter = 0
        for k in range(epochs):
            logging.info(f"\tepoch {k + 1}/{epochs}")
            for i, (data, target) in enumerate(tqdm(data_loader, leave=False, desc="Computing Fisher")):
                data = data.to(device)
                output = self.model(data, start_from=self.skip_layers)
                # The gradients used to compute the FIM needs to be for y sampled from
                # the model distribution y ~ p_w(y|x), not for y from the dataset
                if self.bernoulli:
                    target = torch.bernoulli(F.sigmoid(output)).detach()
                else:
                    target = torch.multinomial(F.softmax(output, dim=-1), 1).detach().view(-1)
                loss = self.loss_fn(output, target)
                self.model.zero_grad()
                loss.backward()
                for p in self.model.parameters():
                    if p.grad is not None:
                        p.grad2_acc += p.grad.data ** 2
                        p.grad_counter += 1
        for p in self.model.parameters():
            if p.grad_counter == 0:
                del p.grad2_acc
            else:
                p.grad2_acc /= p.grad_counter
        logging.info("done")

    def _run_epoch(self, data_loader: DataLoader, model: ProbeNetwork, loss_fn,
                   optimizer: Optimizer, epoch: int, train: bool = True,
                   add_compression_loss: bool = False, skip_layers=0, beta=1.0e-7):
        """One pass over ``data_loader`` for the variational method; optionally
        adds the beta-weighted compression loss and takes optimizer steps.

        NOTE(review): ``variational`` is not imported in this module; this path
        will raise NameError unless a `variational` module is in scope -- confirm.
        NOTE(review): the summary print reads metrics.avg["data_time"/"batch_time"],
        which are never updated here -- verify AverageMeter tolerates missing keys.
        """
        metrics = AverageMeter()
        device = get_device(model)

        for i, (input, target) in enumerate(tqdm(data_loader, leave=False, desc="Computing Fisher")):
            input = input.to(device)
            target = target.to(device)
            output = model(input, start_from=skip_layers)

            loss = loss_fn(output, target)
            lz = beta * variational.get_compression_loss(model) if add_compression_loss else torch.zeros_like(loss)
            loss += lz

            error = get_error(output, target)

            metrics.update(n=input.size(0), loss=loss.item(), lz=lz.item(), error=error)
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        # logging.info(
        print(
            "{}: [{epoch}] ".format('Epoch' if train else '', epoch=epoch) +
            "Data/Batch: {:.3f}/{:.3f} ".format(metrics.avg["data_time"], metrics.avg["batch_time"]) +
            "Loss {:.3f} Lz: {:.3f} ".format(metrics.avg["loss"], metrics.avg["lz"]) +
            "Error: {:.2f}".format(metrics.avg["error"])
        )
        return metrics.avg

    def variational_fisher(self, dataset: Dataset, epochs=1, beta=1e-7):
        """Variational FIM approximation: learns per-parameter noise variances
        (via the ``variational`` helpers) whose inverse approximates the hessian.
        """
        logging.info("Training variational fisher...")
        parameters = []
        for layer in self.model.layers[self.skip_layers:-1]:
            if isinstance(layer, nn.Module):  # Skip lambda functions
                variational.make_variational(layer)
                parameters += variational.get_variational_vars(layer)
        bn_params = []
        # Allows batchnorm parameters to change
        for m in self.model.modules():
            if isinstance(m, nn.BatchNorm2d):
                bn_params += list(m.parameters())
        # Avoids computing the gradients wrt to the weights to save time and memory
        for p in self.model.parameters():
            if p not in set(parameters) and p not in set(self.model.classifier.parameters()):
                p.old_requires_grad = p.requires_grad
                p.requires_grad = False

        optimizer = torch.optim.Adam([
            {'params': parameters},
            {'params': bn_params, 'lr': 5e-4},
            {'params': self.model.classifier.parameters(), 'lr': 5e-4}],
            lr=1e-2, betas=(.9, 0.999))
        if self.skip_layers > 0:
            dataset = torch.utils.data.TensorDataset(self.model.layers[self.skip_layers].input_features,
                                                     self.model.layers[-1].targets)
        train_loader = _get_loader(dataset, **self.loader_opts)

        for epoch in range(epochs):
            self._run_epoch(train_loader, self.model, self.loss_fn, optimizer, epoch, beta=beta,
                            add_compression_loss=True, train=True)

        # Resets original value of requires_grad
        for p in self.model.parameters():
            if hasattr(p, 'old_requires_grad'):
                p.requires_grad = p.old_requires_grad
                del p.old_requires_grad

    def compute_fisher(self, dataset: Dataset):
        """
        Computes the Fisher Information of the weights of the model wrt the model output on the dataset and stores it.

        The Fisher Information Matrix is defined as:
            F = E_{x ~ dataset} E_{y ~ p_w(y|x)} [\nabla_w log p_w(y|x) \nabla_w log p_w(y|x)^t]
        where p_w(y|x) is the output probability vector of the network and w are the weights of the network.
        Notice that the label y is sampled from the model output distribution and not from the dataset.

        This code only approximate the diagonal of F. The result is stored in the model layers and can be extracted
        using the `get_fisher` method. Different approximation methods of the Fisher information matrix are available,
        and can be selected in the __init__.

        :param dataset: dataset with the task to compute the Fisher on
        """
        # Dispatch: autoregressive mode forces the autoregressive montecarlo path.
        if self.mode == 'autoregressive' and self.method == 'montecarlo':
            fisher_fn = self.montecarlo_fisher_autoregressive
        elif self.method == 'variational':
            fisher_fn = self.variational_fisher
        elif self.method == 'montecarlo':
            fisher_fn = self.montecarlo_fisher
        else:
            raise ValueError(f"Invalid Fisher method {self.method}")
        fisher_fn(dataset, **self.method_opts)

    def _cache_features(self, dataset: Dataset, indexes=(-1,), max_samples=None, loader_opts: dict = None):
        """Runs the dataset through the model once, recording the inputs to the
        layers at ``indexes`` (via forward pre-hooks) and the targets, so later
        passes can start from an intermediate layer.
        """
        logging.info("Caching features...")
        if loader_opts is None:
            loader_opts = {}
        data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 64),
                                 num_workers=loader_opts.get('num_workers', 0), drop_last=False)

        device = next(self.model.parameters()).device

        def _hook(layer, inputs):
            # Accumulate (on CPU) every input batch seen by this layer.
            if not hasattr(layer, 'input_features'):
                layer.input_features = []
            layer.input_features.append(inputs[0].data.cpu().clone())

        hooks = [self.model.layers[index].register_forward_pre_hook(_hook)
                 for index in indexes]
        if max_samples is not None:
            n_batches = min(
                math.floor(max_samples / data_loader.batch_size) - 1, len(data_loader))
        else:
            n_batches = len(data_loader)
        targets = []

        for i, (input, target) in tqdm(enumerate(itertools.islice(data_loader, 0, n_batches)), total=n_batches,
                                       leave=False,
                                       desc="Caching features"):
            targets.append(target.clone())
            self.model(input.to(device))
        for hook in hooks:
            hook.remove()
        for index in indexes:
            self.model.layers[index].input_features = torch.cat(self.model.layers[index].input_features)
        self.model.layers[-1].targets = torch.cat(targets)

    def _fit_classifier(self, optimizer='adam', learning_rate=0.0004, weight_decay=0.0001,
                        epochs=10):
        """Fits the last layer of the network using the cached features.

        :raises ValueError: if features were not cached first, or for an
            unsupported optimizer name.
        """
        logging.info("Fitting final classifier...")
        if not hasattr(self.model.classifier, 'input_features'):
            raise ValueError("You need to run `cache_features` on model before running `fit_classifier`")
        targets = self.model.classifier.targets.to(self.device)
        features = self.model.classifier.input_features.to(self.device)

        dataset = torch.utils.data.TensorDataset(features, targets)
        data_loader = _get_loader(dataset, **self.loader_opts)

        if optimizer == 'adam':
            optimizer = torch.optim.Adam(self.model.fc.parameters(), lr=learning_rate, weight_decay=weight_decay)
        elif optimizer == 'sgd':
            optimizer = torch.optim.SGD(self.model.fc.parameters(), lr=learning_rate, weight_decay=weight_decay)
        else:
            raise ValueError(f'Unsupported optimizer {optimizer}')

        loss_fn = nn.CrossEntropyLoss()
        for epoch in tqdm(range(epochs), desc="Fitting classifier", leave=False):
            metrics = AverageMeter()
            for data, target in data_loader:
                optimizer.zero_grad()
                # NOTE(review): the classifier is run twice per batch (once for
                # `output`, once inside the loss) -- redundant forward pass.
                output = self.model.classifier(data)
                loss = loss_fn(self.model.classifier(data), target)
                error = get_error(output, target)
                loss.backward()
                optimizer.step()
                metrics.update(n=data.size(0), loss=loss.item(), error=error)
            logging.info(f"[epoch {epoch}]: " + "\t".join(f"{k}: {v}" for k, v in metrics.avg.items()))
        print(f'\nfinal loss after fitting final layer {loss=}')

    def extract_embedding(self, model: ProbeNetwork):
        """
        Reads the values stored by `compute_fisher` and returns them in a common format that describes the diagonal of the
        Fisher Information Matrix for each layer.

        :param model: probe network whose parameters hold ``grad2_acc`` (and, for
            the variational method, ``logvar0``/``loglambda2``) attributes.
        :return: an Embedding with per-filter hessian diagonals and scales,
            excluding the classifier / lm_head module.
        """
        if self.mode == 'autoregressive':
            hess, scale = [], []
            for name, module in model.named_modules():
                # Skip the LM head: its FIM belongs to the task-specific head,
                # not the shared representation.
                if module is model.lm_head:
                    continue
                # The other Fisher approximation methods directly approximate the hessian at the minimum
                if hasattr(module, 'weight') and hasattr(module.weight, 'grad2_acc'):
                    grad2 = module.weight.grad2_acc.cpu().detach().numpy()
                    # Average each filter's (output row's) squared gradients.
                    filterwise_hess = grad2.reshape(grad2.shape[0], -1).mean(axis=1)
                    hess.append(filterwise_hess)
                    scale.append(np.ones_like(filterwise_hess))
        else:
            hess, scale = [], []
            for name, module in model.named_modules():
                if module is model.classifier:
                    continue
                # The variational Fisher approximation estimates the variance of noise that can be added to the weights
                # without increasing the error more than a threshold. The inverse of this is proportional to an
                # approximation of the hessian in the local minimum.
                if hasattr(module, 'logvar0') and hasattr(module, 'loglambda2'):
                    logvar = module.logvar0.view(-1).detach().cpu().numpy()
                    hess.append(np.exp(-logvar))
                    loglambda2 = module.loglambda2.detach().cpu().numpy()
                    scale.append(np.exp(-loglambda2).repeat(logvar.size))
                # The other Fisher approximation methods directly approximate the hessian at the minimum
                elif hasattr(module, 'weight') and hasattr(module.weight, 'grad2_acc'):
                    grad2 = module.weight.grad2_acc.cpu().detach().numpy()
                    filterwise_hess = grad2.reshape(grad2.shape[0], -1).mean(axis=1)
                    hess.append(filterwise_hess)
                    scale.append(np.ones_like(filterwise_hess))
        return Embedding(hessian=np.concatenate(hess), scale=np.concatenate(scale), meta=None)
515
+
516
+
517
def _get_loader(trainset, testset=None, batch_size=64, num_workers=0, num_samples=10000, drop_last=True):
    """Builds a class-balanced DataLoader for ``trainset`` using a
    WeightedRandomSampler (sample weights inversely proportional to class
    frequency), plus a plain sequential loader for ``testset`` when given.

    :returns: ``trainloader`` or ``(trainloader, testloader)``.
    :raises ValueError: for multi-label datasets.
    """
    if getattr(trainset, 'is_multi_label', False):
        raise ValueError("Multi-label datasets not supported")
    # TODO: Find a way to standardize this
    if hasattr(trainset, 'labels'):
        labels = trainset.labels
    elif hasattr(trainset, 'targets'):
        labels = trainset.targets
    else:
        labels = list(trainset.tensors[1].cpu().numpy())
    num_classes = int(getattr(trainset, 'num_classes', max(labels) + 1))
    # One-hot sum -> per-class counts; each sample weighted by 1/(count * num_classes).
    class_count = np.eye(num_classes)[labels].sum(axis=0)
    weights = 1. / class_count[labels] / num_classes
    weights /= weights.sum()

    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, num_samples=num_samples)
    # No need for mutli-threaded loading if everything is already in memory,
    # and would raise an error if TensorDataset is on CUDA
    num_workers = num_workers if not isinstance(trainset, torch.utils.data.TensorDataset) else 0
    trainloader = torch.utils.data.DataLoader(trainset, sampler=sampler, batch_size=batch_size,
                                              num_workers=num_workers, drop_last=drop_last)

    if testset is None:
        return trainloader
    else:
        testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, pin_memory=True, shuffle=False,
                                                 num_workers=num_workers)
        return trainloader, testloader
DataFlow/dataflow/operators/eval/GeneralText/diversity/task2vec/task_similarity.py ADDED
@@ -0,0 +1,485 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License"). You
6
+ # may not use this file except in compliance with the License. A copy of
7
+ # the License is located at
8
+ #
9
+ # http://aws.amazon.com/apache2.0/
10
+ #
11
+ # or in the "license" file accompanying this file. This file is
12
+ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
13
+ # ANY KIND, either express or implied. See the License for the specific
14
+ # language governing permissions and limitations under the License.
15
+
16
+ import itertools
17
+ from typing import Tuple
18
+
19
+ import scipy.spatial.distance as distance
20
+ import numpy as np
21
+ import copy
22
+ import pickle
23
+
24
+ # import uutils
25
+
26
+ _DISTANCES = {}
27
+
28
+
29
+ # TODO: Remove methods that do not perform well
30
+
31
+ def _register_distance(distance_fn):
32
+ _DISTANCES[distance_fn.__name__] = distance_fn
33
+ return distance_fn
34
+
35
+
36
def is_excluded(k):
    """True when the parameter/layer name ``k`` refers to an excluded
    (classifier-head) layer, i.e. contains 'fc' or 'linear'."""
    return any(token in k for token in ('fc', 'linear'))
39
+
40
+
41
def load_embedding(filename):
    """Unpickle and return a task embedding stored at ``filename``."""
    with open(filename, 'rb') as fh:
        return pickle.load(fh)
45
+
46
+
47
def get_trivial_embedding_from(e):
    """Deep-copy the layer-wise embedding dict ``e`` and flatten every layer's
    per-filter log-variances to that layer's prior value ``filter_lambda2``
    (i.e. the embedding of a 'trivial' task)."""
    trivial_embedding = copy.deepcopy(e)
    for layer in trivial_embedding['layers']:
        filled = np.array(layer['filter_logvar'])
        filled[:] = layer['filter_lambda2']
        layer['filter_logvar'] = list(filled)
    return trivial_embedding
54
+
55
+
56
def binary_entropy(p):
    """Entropy (in nats) of a Bernoulli(p) variable; ``xlogy`` makes the
    endpoints p in {0, 1} evaluate to zero instead of NaN."""
    from scipy.special import xlogy
    return -(xlogy(p, p) + xlogy(1. - p, 1. - p))
59
+
60
+
61
def get_layerwise_variance(e, normalized=False):
    """Per-layer variances ``exp(filter_logvar)`` from a layer-wise embedding
    dict; optionally L2-normalized within each layer."""
    variances = [np.exp(layer['filter_logvar']) for layer in e['layers']]
    if normalized:
        variances = [v / np.linalg.norm(v) for v in variances]
    return variances
66
+
67
+
68
def get_variance(e, normalized=False):
    """Per-parameter variance 1/h from the embedding's Fisher diagonal; when
    ``normalized``, divide by the prior variance 1/scale."""
    var = 1. / np.array(e.hessian)
    if not normalized:
        return var
    prior_var = 1. / np.array(e.scale)
    return var / prior_var
74
+
75
+
76
def get_variances(*embeddings, normalized=False):
    """Apply :func:`get_variance` to each embedding in turn."""
    return [get_variance(emb, normalized=normalized) for emb in embeddings]
78
+
79
+
80
def get_hessian(e, normalized=False):
    """Fisher diagonal of ``e``; when ``normalized``, divide elementwise by the
    embedding's scale."""
    hess = np.array(e.hessian)
    if not normalized:
        return hess
    return hess / np.array(e.scale)
86
+
87
+
88
def get_hessians(*embeddings, normalized=False):
    """Apply :func:`get_hessian` to each embedding in turn."""
    return [get_hessian(emb, normalized=normalized) for emb in embeddings]
90
+
91
+
92
def get_scaled_hessian(e0, e1):
    """Relative magnitudes of the two Fisher diagonals: each element of the
    returned pair sums to ~1 (the epsilon keeps the division finite)."""
    h0, h1 = get_hessians(e0, e1, normalized=False)
    denom = h0 + h1 + 1e-8
    return h0 / denom, h1 / denom
95
+
96
+
97
def get_full_kl(e0, e1):
    """Elementwise directed KL divergences between the per-parameter diagonal
    Gaussians of e0 and e1, returned as (KL(e0||e1), KL(e1||e0)) vectors."""
    def directed(a, b):
        # KL between two zero-mean Gaussians with variances a and b.
        return .5 * (a / b - 1 + np.log(b) - np.log(a))
    var0, var1 = get_variance(e0), get_variance(e1)
    return directed(var0, var1), directed(var1, var0)
102
+
103
+
104
def layerwise_kl(e0, e1):
    """Summed KL(e0 || e1) per layer, over the layer-wise variances."""
    layers0, layers1 = get_layerwise_variance(e0), get_layerwise_variance(e1)
    return [np.sum(.5 * (v0 / v1 - 1 + np.log(v1) - np.log(v0)))
            for v0, v1 in zip(layers0, layers1)]
110
+
111
+
112
def layerwise_cosine(e0, e1):
    """Per-layer cosine distance between the (per-layer normalized) variances."""
    layers0 = get_layerwise_variance(e0, normalized=True)
    layers1 = get_layerwise_variance(e1, normalized=True)
    return [distance.cosine(v0, v1) for v0, v1 in zip(layers0, layers1)]
118
+
119
+
120
@_register_distance
def kl(e0, e1):
    """Symmetrized KL between the tasks' per-parameter Gaussians: the
    elementwise max of the two directed KLs, summed."""
    v0, v1 = get_variance(e0), get_variance(e1)
    d01 = .5 * (v0 / v1 - 1 + np.log(v1) - np.log(v0))
    d10 = .5 * (v1 / v0 - 1 + np.log(v0) - np.log(v1))
    return np.maximum(d01, d10).sum()
126
+
127
+
128
@_register_distance
def asymmetric_kl(e0, e1):
    """Directed KL(e0 || e1), summed over parameters.

    Fix: the reverse direction (``kl1``) was computed but never used; the dead
    computation is removed without changing the returned value.
    """
    var0, var1 = get_variance(e0), get_variance(e1)
    kl0 = .5 * (var0 / var1 - 1 + np.log(var1) - np.log(var0))
    return kl0.sum()
134
+
135
+
136
@_register_distance
def jsd(e0, e1):
    """Jensen-Shannon-style divergence: mean of the two KLs from each task's
    variance vector to the midpoint variance."""
    v0, v1 = get_variance(e0), get_variance(e1)
    mid = .5 * (v0 + v1)
    kl_to_mid0 = .5 * (v0 / mid - 1 + np.log(mid) - np.log(v0))
    kl_to_mid1 = .5 * (v1 / mid - 1 + np.log(mid) - np.log(v1))
    return (.5 * (kl_to_mid0 + kl_to_mid1)).mean()
143
+
144
+
145
@_register_distance
def cosine(e0, e1):
    """Cosine distance between the relative (scaled) Fisher diagonals -- the
    standard Task2Vec distance."""
    s0, s1 = get_scaled_hessian(e0, e1)
    return distance.cosine(s0, s1)
149
+
150
+
151
@_register_distance
def normalized_cosine(e0, e1):
    """Cosine distance between the variances, each normalized by its prior scale."""
    v0, v1 = get_variances(e0, e1, normalized=True)
    return distance.cosine(v0, v1)
155
+
156
+
157
@_register_distance
def correlation(e0, e1):
    """Correlation distance between the raw (unnormalized) variances."""
    v0, v1 = get_variances(e0, e1, normalized=False)
    return distance.correlation(v0, v1)
161
+
162
+
163
@_register_distance
def entropy(e0, e1):
    """log(2) minus the mean binary entropy of the scaled hessian (the scaled
    pair sums to ~1 elementwise, so one component suffices)."""
    h0, h1 = get_scaled_hessian(e0, e1)
    return np.log(2) - binary_entropy(h0).mean()
167
+
168
+
169
def get_normalized_embeddings(embeddings, normalization=None):
    """Stack Fisher precisions (1/variance) into a matrix, substituting a zero
    row for each missing (None) embedding, and divide by an RMS normalization
    (computed over rows here when not supplied).

    :returns: ``(matrix, normalization)``.
    """
    precisions = [1. / get_variance(e, normalized=False) if e is not None else None
                  for e in embeddings]
    template = np.zeros_like([x for x in precisions if x is not None][0])
    mat = np.array([x if x is not None else template for x in precisions])
    # FIXME: compute variance using only valid embeddings
    if normalization is None:
        normalization = np.sqrt((mat ** 2).mean(axis=0, keepdims=True))
    mat /= normalization
    return mat, normalization
178
+
179
+
180
def pdist(embeddings, distance='cosine') -> np.ndarray:
    """Pairwise distance matrix over `embeddings` using the named registered distance.

    Symmetric metrics fill the upper triangle once and mirror it; the
    'asymmetric_kl' metric evaluates every ordered pair (including i == j).
    """
    metric = _DISTANCES[distance]
    n = len(embeddings)
    out = np.zeros([n, n])
    if distance == 'asymmetric_kl':
        for i, e1 in enumerate(embeddings):
            for j, e2 in enumerate(embeddings):
                out[i, j] = metric(e1, e2)
    else:
        for (i, e1), (j, e2) in itertools.combinations(enumerate(embeddings), 2):
            d = metric(e1, e2)
            out[i, j] = d
            out[j, i] = d
    return out
193
+
194
+
195
def cross_pdist(embeddings1, embeddings2, distance='cosine') -> np.ndarray:
    """
    Compute pairwise distance between embeddings1 and embeddings2.

    ref: https://chat.openai.com/share/a5ca38dc-3393-4cfd-971c-4a29b0c56b63
    """
    # The original branched on `distance != 'asymmetric_kl'`, but both branches
    # contained the identical double loop: a cross-distance matrix has no
    # symmetry to exploit, so a single loop handles every metric.
    distance_fn = _DISTANCES[distance]
    distance_matrix = np.zeros([len(embeddings1), len(embeddings2)])
    for i, e1 in enumerate(embeddings1):
        for j, e2 in enumerate(embeddings2):
            distance_matrix[i, j] = distance_fn(e1, e2)
    return distance_matrix
214
+
215
+
216
def cdist(from_embeddings, to_embeddings, distance='cosine'):
    """Distance matrix between two embedding lists; pairs where either side is None stay 0."""
    metric = _DISTANCES[distance]
    out = np.zeros([len(from_embeddings), len(to_embeddings)])
    for i, src in enumerate(from_embeddings):
        if src is None:
            continue
        for j, dst in enumerate(to_embeddings):
            if dst is None:
                continue
            out[i, j] = metric(src, dst)
    return out
225
+
226
+
227
def plot_distance_matrix(embeddings, labels=None, distance='cosine', show_plot=True):
    """Hierarchically-clustered heatmap of the pairwise distance matrix of `embeddings`."""
    import seaborn as sns
    import pandas as pd
    import matplotlib.pyplot as plt
    from scipy.cluster.hierarchy import linkage
    from scipy.spatial.distance import squareform

    dmat = pdist(embeddings, distance=distance)
    # Cluster on the condensed form of the (symmetric) matrix.
    condensed = squareform(dmat, checks=False)
    link = linkage(condensed, method='complete', optimal_ordering=True)
    if labels is not None:
        dmat = pd.DataFrame(dmat, index=labels, columns=labels)
    sns.clustermap(dmat, row_linkage=link, col_linkage=link, cmap='viridis_r')
    if show_plot:
        plt.show()
241
+
242
+ ## LLM DIV
243
def plot_distance_matrix_heatmap_only(embeddings, labels=None, distance='cosine', show_plot=True, title=None, save_file=None):
    """Plain (unclustered) heatmap of the pairwise distance matrix of `embeddings`."""
    import seaborn as sns
    import pandas as pd
    import matplotlib.pyplot as plt

    dmat = pdist(embeddings, distance=distance)
    if labels is not None:
        dmat = pd.DataFrame(dmat, index=labels, columns=labels)
    sns.heatmap(dmat, cmap='viridis_r')
    if title:
        plt.title(title)
    if save_file:
        # Assumes a local `plots/` directory exists.
        _ = plt.savefig("plots/" + save_file + ".png", bbox_inches='tight')
    if show_plot:
        plt.show()
257
+
258
+ ## LLM DIV
259
def plot_distance_matrix_from_distance_matrix(distance_matrix, labels=None, show_plot=True, title=None, save_file=None, cluster=False, plot_multi=False):
    """Plot a precomputed distance matrix as a heatmap (or clustermap).

    Args:
        distance_matrix: square np.ndarray, or a list of them when plot_multi=True.
        labels: optional axis labels (single-matrix case only).
        cluster: hierarchically cluster rows/cols (single-matrix case only).
        plot_multi: render a fixed 3x2 grid of heatmaps from a list of matrices.
    """
    import seaborn as sns
    from scipy.cluster.hierarchy import linkage
    from scipy.spatial.distance import squareform
    import pandas as pd
    import matplotlib.pyplot as plt

    if plot_multi and not cluster:
        # `distance_matrix` is a list of square matrices on this path.
        num_rows, num_cols = 3, 2
        f, ax = plt.subplots(num_rows, num_cols)  # , figsize=(12, 15))
        i = 0
        # BUG FIX: the original iterated range(len(num_rows)) / range(len(num_cols)),
        # which raises TypeError because both are plain ints.
        for row_ind in range(num_rows):
            for col_ind in range(num_cols):
                sns.heatmap(distance_matrix[i], cmap='viridis_r', ax=ax[row_ind, col_ind])
                i += 1
    else:
        # Linkage is only meaningful for a single square matrix; the original
        # computed it unconditionally, which also crashed on the plot_multi path.
        cond_distance_matrix = squareform(distance_matrix, checks=False)
        linkage_matrix = linkage(cond_distance_matrix, method='complete', optimal_ordering=True)
        if labels is not None:
            distance_matrix = pd.DataFrame(distance_matrix, index=labels, columns=labels)
        if cluster:
            sns.clustermap(distance_matrix, row_linkage=linkage_matrix, col_linkage=linkage_matrix, cmap='viridis_r')
        else:
            sns.heatmap(distance_matrix, cmap='viridis_r')

    if title:
        plt.title(title)
    if save_file:
        _ = plt.savefig("plots/" + save_file + ".png", bbox_inches='tight')
    if show_plot:
        plt.show()
293
+
294
+ ## LLM DIV
295
+ # plot multiple subplots in one figure
296
+ # distance_matrix passed in is a list of distance_matrix np.arrays
297
def plot_multi_distance_matrix_from_distance_matrix_list(distance_matrix_lst, title_lst, labels, main_title=None, show_plot=True, title=None, save_file=None, vmin=None, vmax=None):
    """Render a two-column grid of heatmaps from a list of square distance matrices.

    Args:
        distance_matrix_lst: list of np.ndarray distance matrices.
        title_lst: per-subplot titles (parallel to distance_matrix_lst).
        labels: per-matrix axis labels; labels[i] labels matrix i.
        vmin/vmax: shared color-scale bounds; both must be given to take effect.
    """
    import seaborn as sns
    import pandas as pd
    import matplotlib.pyplot as plt
    import math

    num_rows, num_cols = math.ceil(len(distance_matrix_lst) / 2), 2
    # NOTE: the original if/else assigned the same figsize in both branches;
    # collapsed to a single assignment. Unused linkage/squareform imports removed.
    figsize = (12, 10)
    f, ax = plt.subplots(num_rows, num_cols, figsize=figsize)
    i = 0
    for row_ind in range(num_rows):
        for col_ind in range(num_cols):
            if i >= len(distance_matrix_lst):
                break
            distance_matrix = distance_matrix_lst[i]
            distance_matrix = pd.DataFrame(distance_matrix, index=labels[i], columns=labels[i])
            if len(distance_matrix_lst) > 2:
                # Multiple grid rows: plt.subplots returns a 2-D axes array.
                ax[row_ind, col_ind].set_aspect('equal')
                if vmin is not None and vmax is not None:
                    sns.heatmap(distance_matrix, cmap='viridis_r', ax=ax[row_ind, col_ind], vmin=vmin, vmax=vmax)
                else:
                    sns.heatmap(distance_matrix, cmap='viridis_r', ax=ax[row_ind, col_ind])
                ax[row_ind, col_ind].set_title(title_lst[i])
            else:
                # Single grid row: axes array is 1-D.
                ax[col_ind].set_aspect('equal')
                sns.heatmap(distance_matrix, cmap='viridis_r', ax=ax[col_ind])
                ax[col_ind].set_title(title_lst[i])

            i += 1
    if len(distance_matrix_lst) % 2 == 1:
        # Odd matrix count: remove the unused bottom-right axes.
        f.delaxes(ax[num_rows - 1, 1])

    if main_title:
        f.suptitle(main_title)
        f.subplots_adjust(top=0.5)

    if len(distance_matrix_lst) % 2 == 1:
        plt.tight_layout(h_pad=2)
    else:
        plt.tight_layout(h_pad=2, w_pad=5)
    if save_file:
        _ = plt.savefig("plots/" + save_file + ".png", bbox_inches='tight')
    if show_plot:
        plt.show()
345
+
346
+ ## LLM DIV
347
def stats_of_distance_matrix(distance_matrix: np.ndarray,
                             remove_diagonal: bool = True,
                             variance_type: str = 'std',  # TODO: was ci_0.95. Changed to rid uutils call
                             get_total: bool = False,
                             ) -> Tuple[float, float]:
    """Mean and spread of the entries of a (symmetric) distance matrix.

    When remove_diagonal is True, only the strictly-nonzero upper-triangular
    entries are kept (note: this also drops any genuinely-zero distances).
    Returns (mu, var), or (mu, var, total) when get_total is True.
    """
    if remove_diagonal:
        # ref https://stackoverflow.com/questions/46736258/deleting-diagonal-elements-of-a-numpy-array
        upper: np.ndarray = np.triu(distance_matrix)
        lower: np.ndarray = np.tril(distance_matrix)
        # Keep upper-triangle entries, discarding the zeroed half and diagonal.
        values = upper[upper != 0.0]
    else:
        values = distance_matrix
    values = values.flatten()

    if variance_type == 'std':
        mu, var = values.mean(), values.std()
    else:
        raise ValueError(f'Invalid variance type, got: {variance_type=}')

    if remove_diagonal:
        # Sanity output: for a symmetric matrix both halves should agree.
        print('Lower tri sum', lower.sum(), ' / Upper tri sum', upper.sum(), '| These should be approx equal!!')
        print('Total mean', values.mean(), ' / Upper mean', upper[upper != 0.0].mean(), ' / Lower mean', lower[lower != 0.0].mean(), '| These should all be approx equal!!')
        print('mu (div coefficient)', mu, ' / Upper mean', upper[upper != 0.0].mean(), '| These should all be approx equal!!')
    if get_total:
        return mu, var, values.sum()
    return mu, var
387
+
388
+
389
def stats_cross_distance_matrix(distance_matrix: np.ndarray,
                                remove_diagonal: bool = False,
                                variance_type: str = 'std',  # TODO: was ci_0.95. Changed to rid uutils call
                                get_total: bool = False,
                                ) -> Tuple[float, float]:
    """Thin wrapper over stats_of_distance_matrix; remove_diagonal defaults to
    False because a cross-distance matrix has no meaningful diagonal."""
    return stats_of_distance_matrix(
        distance_matrix,
        remove_diagonal=remove_diagonal,
        variance_type=variance_type,
        get_total=get_total,
    )
395
+
396
+
397
def plot_histogram_of_distances(distance_matrix: np.ndarray, title, show_plot=True, save_file=None, bins_width=None, grid=True):
    """Histogram of the nonzero upper-triangular distances with a dashed mean line."""
    import matplotlib.pyplot as plt
    upper = np.triu(distance_matrix)
    distance_values = upper[upper != 0.0].flatten()

    if grid:
        plt.grid(zorder=0)
    plt.axvline(np.mean(distance_values), color='k', linestyle='dashed', linewidth=1, zorder=4)
    if bins_width is not None:
        bins = np.arange(min(distance_values), max(distance_values) + bins_width, bins_width)
        plt.hist(distance_values, edgecolor="black", bins=bins, zorder=3)
    else:
        plt.hist(distance_values, edgecolor="black", zorder=3)
    plt.title(title)
    plt.xlabel("Cosine Distance between Task Pairs")
    plt.ylabel("Frequency")

    plt.tight_layout()
    if save_file:
        _ = plt.savefig("plots/" + save_file + ".png", bbox_inches='tight')

    if show_plot:
        plt.show()
420
+
421
+
422
+ ## LLM DIV
423
+ # plot multiple subplots in one figure
424
+ # distance_matrix passed in is a list of distance_matrix (np.arrays)
425
def plot_multi_histogram_of_distances(distance_matrix_lst, title_lst, main_title=None, show_plot=True, save_file=None,
                                      xlabel="Cosine Distance between Task Pairs", grid=True, bins_width=None,
                                      num_cols=2, figsize=(12, 10)):
    """Grid of histograms of pairwise distances, one subplot per matrix.

    Args:
        distance_matrix_lst: list of square distance matrices.
        title_lst: per-subplot titles.
        bins_width: fixed bin width; when None matplotlib chooses bins.
        num_cols: grid columns (2-D axes indexing is used when the list has >2 entries).
    """
    import matplotlib.pyplot as plt
    import math

    num_rows = math.ceil(len(distance_matrix_lst) / num_cols)

    f, ax = plt.subplots(num_rows, num_cols, figsize=figsize)
    i = 0
    for row_ind in range(num_rows):
        for col_ind in range(num_cols):
            if i >= len(distance_matrix_lst):
                break
            triu = np.triu(distance_matrix_lst[i])
            distance_values = triu[triu != 0.0].flatten()

            if len(distance_matrix_lst) > 2:
                axis = ax[row_ind, col_ind]
                if grid:
                    axis.grid(zorder=0)
                if bins_width is not None:
                    axis.hist(distance_values, edgecolor="black", zorder=3,
                              bins=np.arange(min(distance_values), max(distance_values) + bins_width, bins_width))
                else:
                    axis.hist(distance_values, edgecolor="black", zorder=3)
                axis.axvline(np.mean(distance_values), color='k', linestyle='dashed', linewidth=1, zorder=4)
            else:
                axis = ax[col_ind]
                if grid:
                    axis.grid(zorder=0)
                # BUG FIX: the original drew an unconditional histogram here and
                # then a second one inside the bins_width conditional, overlaying
                # two histograms on the same axes; only one draw is intended.
                if bins_width is not None:
                    axis.hist(distance_values, edgecolor="black", zorder=3,
                              bins=np.arange(min(distance_values), max(distance_values) + bins_width, bins_width))
                else:
                    axis.hist(distance_values, edgecolor="black", zorder=3)
            axis.set_xlabel(xlabel)
            axis.set_ylabel("Frequency")
            axis.set_title(title_lst[i])
            i += 1
    if len(distance_matrix_lst) % 2 == 1 and num_cols == 2:
        f.delaxes(ax[num_rows - 1, 1])

    if main_title:
        f.suptitle(main_title)
        f.subplots_adjust(top=1)

    plt.grid(True)
    plt.tight_layout()
    if save_file:
        _ = plt.savefig("plots/" + save_file + ".png", bbox_inches='tight')
    if show_plot:
        plt.show()
DataFlow/dataflow/operators/eval/GeneralText/diversity/task2vec/utils.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License"). You
4
+ # may not use this file except in compliance with the License. A copy of
5
+ # the License is located at
6
+ #
7
+ # http://aws.amazon.com/apache2.0/
8
+ #
9
+ # or in the "license" file accompanying this file. This file is
10
+ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11
+ # ANY KIND, either express or implied. See the License for the specific
12
+ # language governing permissions and limitations under the License.
13
+
14
+ from collections import defaultdict
15
+ import torch
16
+ import numpy as np
17
+
18
+
19
class AverageMeter(object):
    """Tracks the last value, running sum, count, and mean for named quantities."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all tracked statistics."""
        self.val = defaultdict(int)      # most recent value per key
        self.avg = defaultdict(float)    # running mean per key
        self.sum = defaultdict(int)      # weighted sum per key
        self.count = defaultdict(int)    # total weight per key

    def update(self, n=1, **val):
        """Record new values (each weighted by n) and refresh the running means."""
        for key, value in val.items():
            self.val[key] = value
            self.sum[key] += value * n
            self.count[key] += n
            self.avg[key] = self.sum[key] / self.count[key]
37
+
38
+
39
def set_batchnorm_mode(model, train=True):
    """Set every BatchNorm1d/2d layer to train or eval mode, independently of the model's mode."""
    def _set_mode(module):
        if isinstance(module, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d)):
            # Module.train(bool) covers both train() and eval().
            module.train(train)

    model.apply(_set_mode)
49
+
50
+ ### LLM DIV
51
def get_error(output, target, mode='autoregressive', ignore_index=None):
    """Error rate of predictions.

    autoregressive: `output` holds next-token logits; argmax at step t is
    compared against target token t+1, positions whose target equals
    `ignore_index` are masked out, and 1 - accuracy is returned as a tensor.
    (assumes output is (batch, seq, vocab) and target is (batch, seq) — TODO confirm)

    any other mode: classification error in percent (float) from (N, C) scores.
    """
    if mode == 'autoregressive':  # output = logits here
        assert ignore_index is not None
        logits = output[:, :-1, :]            # final step has no next-token target
        predicted_ids = torch.argmax(logits, dim=-1)
        shifted_target = target[:, 1:]        # targets are the inputs shifted by one
        if ignore_index is not None:
            acc = torch.eq(predicted_ids, shifted_target.unsqueeze(0))[:, shifted_target != ignore_index]
        else:
            acc = torch.eq(predicted_ids, shifted_target.unsqueeze(0))
        return 1 - acc.float().mean()
    else:
        pred = output.argmax(dim=1)
        correct = pred.eq(target).float().sum()
        return float((1. - correct / output.size(0)) * 100.)
67
+
68
+
69
def adjust_learning_rate(optimizer, epoch, optimizer_cfg):
    """Step-decay the learning rate: divide optimizer_cfg.lr by 10 once for each
    milestone in optimizer_cfg.schedule that is strictly less than `epoch`."""
    decay_steps = np.less(optimizer_cfg.schedule, epoch).sum()
    new_lr = optimizer_cfg.lr * (0.1 ** decay_steps)
    for group in optimizer.param_groups:
        group['lr'] = new_lr
73
+
74
+
75
def get_device(model: torch.nn.Module):
    """Return the device of the model's first parameter (assumes all parameters share one device)."""
    first_param = next(model.parameters())
    return first_param.device
DataFlow/dataflow/operators/eval/GeneralText/diversity/task2vec_scorer.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataflow.operators.eval.GeneralText.diversity.task2vec.task2vec import Task2Vec
2
+ from dataflow.operators.eval.GeneralText.diversity.task2vec import task_similarity
3
+ import torch
4
+ import random
5
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel
6
+ from dataflow.utils.storage import DataFlowStorage
7
+ from dataflow.core import OperatorABC
8
+ from dataflow.utils.registry import OPERATOR_REGISTRY
9
+ from torch.utils.data import Dataset
10
+ from dataflow import get_logger
11
+ from typing import Optional
12
+ # Task2Vec dataset diversity evaluation
13
+ # Cited from: Beyond Scale: the Diversity Coefficient as a Data Quality Metric Demonstrates LLMs are Pre-trained on Formally Diverse Data
14
@OPERATOR_REGISTRY.register()
class Task2VecScorer(OperatorABC):
    """Estimates dataset diversity via Task2Vec embeddings of random sample batches.

    Repeatedly draws `sample_size` texts, embeds each batch with a GPT-2 probe
    network, then reports the mean pairwise cosine distance between batch
    embeddings (the diversity coefficient) and its spread.
    """

    def __init__(self, device='cuda', sample_nums=10, sample_size=1, method: Optional[str]='montecarlo', model_cache_dir='./dataflow_cache'):
        self.logger = get_logger()
        self.logger.info(f'Initializing {self.__class__.__name__}...')
        # evaluating diversity by extract sample_nums * sample_size samples
        self.sample_nums = sample_nums    # number of batches to embed
        self.sample_size = sample_size    # texts per batch
        self.device = device
        self.model_cache_dir = model_cache_dir
        self.score_name = 'Task2VecScore'
        self.method = method              # Task2Vec Fisher-estimation method
        if method not in ['montecarlo', 'variational']:
            raise ValueError(f"Invalid method '{method}'. Valid options are 'montecarlo' and 'variational'.")
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=self.model_cache_dir)
        self.probe_network = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir=self.model_cache_dir)
        # Falls back to CPU when CUDA is unavailable.
        self.device = torch.device(self.device if self.device and torch.cuda.is_available() else "cpu")
        self.probe_network = self.probe_network.to(self.device)
        self.logger.info(f'{self.__class__.__name__} initialized.')

    def preprocess(self, texts):
        """Tokenize `texts` (padded/truncated) and move the tensors to the scorer's device."""
        self.tokenizer.pad_token = self.tokenizer.eos_token
        tokenized_outputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        return {key: value.to(self.device) for key, value in tokenized_outputs.items()}

    def get_score(self, sentences):
        """Embed `sample_nums` random batches and return the diversity coefficient stats."""
        embeddings = []
        data_length = len(sentences)
        for sample_num in range(self.sample_nums):
            self.logger.info(f'--> Sample {sample_num + 1}/{self.sample_nums}')
            # NOTE(review): random.sample requires sample_size <= len(sentences).
            indices = random.sample(range(data_length), self.sample_size)
            texts = [sentences[i] for i in indices]
            tokenized_batch = self.preprocess(texts)
            tokenized_dataset = CustomTensorDataset(tokenized_batch)
            embedding, _ = Task2Vec(self.probe_network, method=self.method).embed(tokenized_dataset)
            embeddings.append(embedding)
        distance_matrix = task_similarity.pdist(embeddings, distance='cosine')
        div_coeff, conf_interval = task_similarity.stats_of_distance_matrix(distance_matrix)

        return {
            "Task2VecDiversityScore": div_coeff,
            "ConfidenceInterval": conf_interval
        }

    def run(self, storage: DataFlowStorage, input_key: str):
        """Score the `input_key` column of the stored dataframe and return the result dict."""
        dataframe = storage.read("dataframe")
        samples = dataframe[input_key].to_list()
        self.logger.info(f"Evaluating {self.score_name}...")
        task2vec_score = self.get_score(samples)
        self.logger.info("Evaluation complete!")
        self.logger.info(f"Task2Vec Diversity Score: {task2vec_score}")
        return task2vec_score
66
+
67
+
68
class CustomTensorDataset(Dataset):
    """Wraps a dict of equally-sized tensors (a tokenized batch) as a torch Dataset."""

    def __init__(self, tokenized_batch):
        self.tokenized_batch = tokenized_batch

    def __getitem__(self, index):
        """Return the index-th slice of every tensor, keyed as in the input dict."""
        return {name: tensor[index] for name, tensor in self.tokenized_batch.items()}

    def __len__(self):
        """Row count, taken from an arbitrary entry (all entries share the batch dim)."""
        return len(next(iter(self.tokenized_batch.values())))
DataFlow/dataflow/operators/eval/GeneralText/diversity/vendi_scorer.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from vendi_score import text_utils
2
+ from dataflow.utils.storage import DataFlowStorage
3
+ import pandas as pd
4
+ from dataflow.core import OperatorABC
5
+ from dataflow.utils.registry import OPERATOR_REGISTRY
6
+ from dataflow import get_logger
7
+
8
+ # VendiScore dataset diversity evaluation
9
+ # Cited from: The Vendi Score: A Diversity Evaluation Metric for Machine Learning
10
@OPERATOR_REGISTRY.register()
class VendiScorer(OperatorABC):
    """Dataset diversity via the Vendi Score over BERT and SimCSE sentence embeddings.

    Cited from: "The Vendi Score: A Diversity Evaluation Metric for Machine Learning".
    """
    def __init__(self, device='cuda'):
        self.logger = get_logger()
        self.logger.info(f'Initializing {self.__class__.__name__}...')
        self.bert_model_path = 'bert-base-uncased'
        self.simcse_model_path = 'princeton-nlp/unsup-simcse-bert-base-uncased'
        self.device = device
        self.score_name = 'VendiScore'
        self.logger.info(f'{self.__class__.__name__} initialized.')

    def get_score(self, sentences):
        """Return BERT- and SimCSE-embedding Vendi scores (rounded to 2 decimals)."""
        result = {}
        bert_vs = text_utils.embedding_vendi_score(sentences, model_path=self.bert_model_path, device=self.device)
        result["BERTVendiScore"] = round(bert_vs, 2)
        simcse_vs = text_utils.embedding_vendi_score(sentences, model_path=self.simcse_model_path, device=self.device)
        result["SimCSEVendiScore"] = round(simcse_vs, 2)
        return result

    def run(self, storage: DataFlowStorage, input_key: str):
        """Score the `input_key` column of the stored dataframe and return the score dict."""
        dataframe = storage.read("dataframe")
        samples = dataframe[input_key].to_list()
        self.logger.info(f"Evaluating {self.score_name}...")
        vendiscore = self.get_score(samples)
        self.logger.info("Evaluation complete!")
        self.logger.info(f"VendiScore: {vendiscore}")
        return vendiscore
DataFlow/dataflow/operators/eval/GeneralText/gen/__pycache__/bert_scorer.cpython-310.pyc ADDED
Binary file (2 kB). View file
 
DataFlow/dataflow/operators/eval/GeneralText/gen/__pycache__/bleu_scorer.cpython-310.pyc ADDED
Binary file (2.39 kB). View file
 
DataFlow/dataflow/operators/eval/GeneralText/gen/__pycache__/cider_scorer.cpython-310.pyc ADDED
Binary file (2.67 kB). View file
 
DataFlow/dataflow/operators/eval/GeneralText/gen/bert_scorer.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataflow.core import OperatorABC
2
+ from dataflow.utils.storage import DataFlowStorage
3
+ from dataflow.utils.registry import OPERATOR_REGISTRY
4
+ from dataflow import get_logger
5
+ import evaluate
6
+
7
@OPERATOR_REGISTRY.register()
class BERTScorer(OperatorABC):
    """Reference-based generation quality via HuggingFace `evaluate`'s BERTScore (F1)."""
    def __init__(self, lang='en', model_cache_dir='./dataflow_cache'):
        self.logger = get_logger()
        self.logger.info(f'Initializing {self.__class__.__name__}...')
        self.data_type = "text"
        self.score_name = "BERTScore"
        self.lang = lang
        self.model_type = "distilbert-base-uncased"   # backbone model used for scoring
        self.idf = False                              # no idf re-weighting
        self.rescale_with_baseline = False            # raw (unrescaled) scores
        self.bertscore = evaluate.load("bertscore", cache_dir=model_cache_dir)
        self.logger.info(f'{self.__class__.__name__} initialized.')

    def eval(self, dataframe, input_key, reference_key):
        """Return per-row BERTScore F1 of dataframe[input_key] against dataframe[reference_key]."""
        eval_data = dataframe[input_key].to_list()
        ref_data = dataframe[reference_key].to_list()
        self.logger.info(f"Evaluating {self.score_name}...")
        if ref_data is None:
            raise ValueError("Reference data must be provided for BERTScorer")
        results = self.bertscore.compute(
            predictions=eval_data,
            references=ref_data,
            lang=self.lang,
            model_type=self.model_type,
            idf=self.idf,
            rescale_with_baseline=self.rescale_with_baseline
        )
        scores = results["f1"]
        self.logger.info("Evaluation complete!")
        return scores

    def run(self, storage: DataFlowStorage, input_key: str, reference_key: str, output_key: str='BertScore'):
        """Score the dataframe, attach per-row F1 under `output_key`, and persist it."""
        self.input_key = input_key
        self.reference_key = reference_key
        self.output_key = output_key
        dataframe = storage.read("dataframe")
        scores = self.eval(dataframe, input_key, reference_key)
        dataframe[self.output_key] = scores
        storage.write(dataframe)
DataFlow/dataflow/operators/eval/GeneralText/gen/bleu/__init__.py ADDED
File without changes
DataFlow/dataflow/operators/eval/GeneralText/gen/bleu/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (197 Bytes). View file
 
DataFlow/dataflow/operators/eval/GeneralText/gen/bleu/__pycache__/bleu.cpython-310.pyc ADDED
Binary file (6.82 kB). View file
 
DataFlow/dataflow/operators/eval/GeneralText/gen/bleu/bleu.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import sys, math, re
3
+ from collections import defaultdict
4
+
5
+ import six
6
+ from six.moves import xrange as range
7
+
8
+
9
def precook(s, n=4, out=False):
    """Count all n-grams of orders 1..n in whitespace-tokenized `s`.

    Returns (token_count, ngram_counts). `out` is accepted for API
    compatibility but unused.
    """
    words = s.split()
    counts = defaultdict(int)
    for order in range(1, n + 1):
        for start in range(len(words) - order + 1):
            counts[tuple(words[start:start + order])] += 1
    return (len(words), counts)
18
+
19
def cook_refs(refs, eff=None, n=4):
    """Precompute reference stats for BLEU: reference length(s) and, for each
    n-gram, its maximum count across references.

    eff: 'shortest' or 'average' collapse the length list to a single number;
    any other value (e.g. 'closest') keeps the full list so length selection
    can happen per hypothesis in `cook_test`.
    """
    reflen = []
    maxcounts = {}
    for ref in refs:
        rl, counts = precook(ref, n)
        reflen.append(rl)
        # `six.iteritems` replaced with native dict iteration; this module
        # already runs under Python 3.
        for ngram, count in counts.items():
            maxcounts[ngram] = max(maxcounts.get(ngram, 0), count)

    if eff == "shortest":
        reflen = min(reflen)
    elif eff == "average":
        reflen = float(sum(reflen)) / len(reflen)

    return (reflen, maxcounts)
35
+
36
def cook_test(test, reflen_refmaxcounts, eff=None, n=4):
    """Compute per-hypothesis BLEU components against precooked references.

    Returns a dict with 'testlen', 'reflen' (selected per `eff`), per-order
    candidate n-gram totals ('guess') and clipped matched counts ('correct').
    """
    reflen, refmaxcounts = reflen_refmaxcounts
    testlen, counts = precook(test, n, True)

    result = {}

    # 'closest' picks the reference length nearest the hypothesis length;
    # otherwise the (possibly pre-collapsed) reflen is used as-is.
    if eff == "closest":
        result["reflen"] = min((abs(l - testlen), l) for l in reflen)[1]
    else:
        result["reflen"] = reflen

    result["testlen"] = testlen

    result["guess"] = [max(0, testlen - k + 1) for k in range(1, n + 1)]

    result['correct'] = [0] * n
    # `six.iteritems` replaced with native dict iteration (Python 3 module).
    for ngram, count in counts.items():
        # Clip each match count by the max count seen in any reference.
        result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count)

    return result
58
+
59
class Bleu(object):
    """Bleu scorer.

    Accumulates cooked (hypothesis, references) pairs and computes corpus-level
    BLEU-1..n with a brevity penalty, caching the result until invalidated.
    """

    __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen"

    def copy(self):
        # Shallow-copy cooked refs/tests; the cached score is invalidated.
        new = Bleu(n=self.n)
        new.ctest = copy.copy(self.ctest)
        new.crefs = copy.copy(self.crefs)
        new._score = None
        return new

    def __init__(self, test=None, refs=None, n=4, special_reflen=None):
        # n: max n-gram order; special_reflen overrides the reference length when set.
        self.n = n
        self.crefs = []
        self.ctest = []
        self.cook_append(test, refs)
        self.special_reflen = special_reflen

    def cook_append(self, test, refs):
        """Cook and store one (hypothesis, references) pair; either may be None."""
        if refs is not None:
            self.crefs.append(cook_refs(refs))
            if test is not None:
                cooked_test = cook_test(test, self.crefs[-1])
                self.ctest.append(cooked_test) ## N.B.: -1
            else:
                self.ctest.append(None) # lens of crefs and ctest have to match

        self._score = None ## need to recompute

    def ratio(self, option=None):
        # NOTE(review): compute_score never assigns self._ratio; this would
        # raise AttributeError as written — confirm before relying on it.
        self.compute_score(option=option)
        return self._ratio

    def score_ratio(self, option=None):
        # NOTE(review): `fscore` is not defined on this class; calling
        # score_ratio would raise AttributeError as written.
        return (self.fscore(option=option), self.ratio(option=option))

    def score_ratio_str(self, option=None):
        return "%.4f (%.2f)" % self.score_ratio(option)

    def reflen(self, option=None):
        self.compute_score(option=option)
        return self._reflen

    def testlen(self, option=None):
        self.compute_score(option=option)
        return self._testlen

    def retest(self, new_test):
        """Replace stored hypotheses with `new_test` (str or list), invalidating the cache."""
        if type(new_test) is str:
            new_test = [new_test]
        assert len(new_test) == len(self.crefs), new_test
        self.ctest = []
        for t, rs in zip(new_test, self.crefs):
            self.ctest.append(cook_test(t, rs))
        self._score = None

        return self

    def rescore(self, new_test):
        ''' replace test(s) with new test(s), and returns the new score.'''

        return self.retest(new_test).compute_score()

    def size(self):
        assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
        return len(self.crefs)

    def __iadd__(self, other):
        '''add an instance (e.g., from another sentence).'''

        if type(other) is tuple:
            ## avoid creating new BleuScorer instances
            self.cook_append(other[0], other[1])
        else:
            assert self.compatible(other), "incompatible BLEUs."
            self.ctest.extend(other.ctest)
            self.crefs.extend(other.crefs)
            self._score = None ## need to recompute

        return self

    def compatible(self, other):
        # Two scorers can merge only if they share the same max n-gram order.
        return isinstance(other, Bleu) and self.n == other.n

    def single_reflen(self, option="average"):
        return self._single_reflen(self.crefs[0][0], option)

    def _single_reflen(self, reflens, option=None, testlen=None):
        """Collapse a list of reference lengths to one value per `option`."""
        if option == "shortest":
            reflen = min(reflens)
        elif option == "average":
            reflen = float(sum(reflens))/len(reflens)
        elif option == "closest":
            reflen = min((abs(l-testlen), l) for l in reflens)[1]
        else:
            assert False, "unsupported reflen option %s" % option

        return reflen

    def recompute_score(self, option=None, verbose=0):
        # Drop the cache and recompute.
        self._score = None
        return self.compute_score(option, verbose)

    def compute_score(self, option=None, verbose=0):
        """Compute corpus-level BLEU-1..n with brevity penalty.

        Returns (bleus, bleu_list): corpus scores per order and per-sentence
        scores per order.
        """
        n = self.n
        small = 1e-9
        tiny = 1e-15 ## so that if guess is 0 still return 0
        bleu_list = [[] for _ in range(n)]

        # NOTE(review): a cache hit returns only self._score, while a fresh
        # computation returns (score, bleu_list) — callers unpacking two
        # values must not rely on the cached path.
        if self._score is not None:
            return self._score

        if option is None:
            option = "average" if len(self.crefs) == 1 else "closest"

        self._testlen = 0
        self._reflen = 0
        totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n}

        # for each sentence
        for comps in self.ctest:
            testlen = comps['testlen']
            self._testlen += testlen

            if self.special_reflen is None: ## need computation
                reflen = self._single_reflen(comps['reflen'], option, testlen)
            else:
                reflen = self.special_reflen

            self._reflen += reflen

            for key in ['guess','correct']:
                for k in range(n):
                    totalcomps[key][k] += comps[key][k]

            # append per image bleu score
            bleu = 1.
            for k in range(n):
                bleu *= (float(comps['correct'][k]) + tiny) \
                        /(float(comps['guess'][k]) + small)
                bleu_list[k].append(bleu ** (1./(k+1)))
            ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division
            if ratio < 1:
                # Brevity penalty for hypotheses shorter than the reference.
                for k in range(n):
                    bleu_list[k][-1] *= math.exp(1 - 1/ratio)

            if verbose > 1:
                print(comps, reflen)

        totalcomps['reflen'] = self._reflen
        totalcomps['testlen'] = self._testlen

        bleus = []
        bleu = 1.
        for k in range(n):
            bleu *= float(totalcomps['correct'][k] + tiny) \
                    / (totalcomps['guess'][k] + small)
            bleus.append(bleu ** (1./(k+1)))
        ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division
        if ratio < 1:
            # Corpus-level brevity penalty.
            for k in range(n):
                bleus[k] *= math.exp(1 - 1/ratio)

        if verbose > 0:
            print(totalcomps)
            print("ratio:", ratio)

        self._score = bleus
        return self._score, bleu_list
DataFlow/dataflow/operators/eval/GeneralText/gen/bleu_scorer.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataflow.core import OperatorABC
2
+ from dataflow.utils.storage import DataFlowStorage
3
+ from dataflow.utils.registry import OPERATOR_REGISTRY
4
+ from dataflow import get_logger
5
+ from dataflow.operators.eval.GeneralText.gen.bleu.bleu import Bleu
6
+ from tqdm import tqdm
7
+
8
@OPERATOR_REGISTRY.register()
class BleuScorer(OperatorABC):
    """Operator that scores candidate text against a reference with BLEU (orders 1..n)."""

    def __init__(self, n=4, eff="average", special_reflen=None):
        self.logger = get_logger()
        self.logger.info(f'Initializing {self.__class__.__name__}...')
        self.score_name = 'BleuScore'
        valid_eff_options = ["shortest", "average", "longest"]
        if eff not in valid_eff_options:
            raise ValueError(f"Invalid value for 'eff'. Must be one of {valid_eff_options}, but got '{eff}'.")
        self.n = n                            # maximum n-gram order
        self.eff = eff                        # reference-length strategy: shortest / average / longest
        self.special_reflen = special_reflen  # fixed reference length override, if any
        self.logger.info(f'{self.__class__.__name__} initialized.')

    def _score_func(self, eval_text, ref_text):
        """Score one candidate against a single reference; returns the BLEU-1 component."""
        scorer = Bleu(
            test=eval_text,
            refs=[ref_text],
            n=self.n,
            special_reflen=self.special_reflen,
        )
        scores, _ = scorer.compute_score(option=self.eff)
        return scores[0]

    def eval(self, dataframe, input_key, reference_key):
        """Compute a BLEU score for every (candidate, reference) row pair."""
        candidates = dataframe[input_key]
        references = dataframe[reference_key]
        self.logger.info(f"Evaluating {self.score_name}...")
        scores = [
            self._score_func(candidate, reference)
            for candidate, reference in tqdm(zip(candidates, references), desc="BleuScorer Evaluating...")
        ]
        self.logger.info("Evaluation complete!")
        return scores

    def run(self, storage: DataFlowStorage, input_key: str, reference_key: str, output_key: str='BleuScore'):
        """Read the dataframe from storage, attach scores under output_key, and write it back."""
        self.input_key = input_key
        self.reference_key = reference_key
        self.output_key = output_key
        frame = storage.read("dataframe")
        frame[self.output_key] = self.eval(frame, input_key, reference_key)
        storage.write(frame)
DataFlow/dataflow/operators/eval/GeneralText/gen/cider/__init__.py ADDED
File without changes
DataFlow/dataflow/operators/eval/GeneralText/gen/cider/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (198 Bytes). View file
 
DataFlow/dataflow/operators/eval/GeneralText/gen/cider/__pycache__/cider.cpython-310.pyc ADDED
Binary file (5.54 kB). View file
 
DataFlow/dataflow/operators/eval/GeneralText/gen/cider/cider.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import math
3
+ import pickle
4
+ import numpy as np
5
+ from collections import defaultdict
6
+ import os
7
+ from six.moves import xrange
8
+ import six
9
+
10
def precook(s, n=4, out=False):
    """Count all n-grams of orders 1..n in the whitespace-tokenized string *s*.

    Args:
        s: sentence; tokenized by splitting on whitespace.
        n: maximum n-gram order to count.
        out: unused here; kept for interface compatibility with the
            pycocoevalcap implementation this code derives from.

    Returns:
        defaultdict(int) mapping each n-gram tuple to its occurrence count.
    """
    words = s.split()
    counts = defaultdict(int)
    # Modernization: use the built-in range instead of six.moves.xrange
    # (identical behavior on Python 3, drops the reliance on six).
    for k in range(1, n + 1):
        for i in range(len(words) - k + 1):
            ngram = tuple(words[i:i + k])
            counts[ngram] += 1
    return counts
18
+
19
def cook_refs(refs, n=4):
    """Precompute n-gram counts for each reference sentence."""
    cooked = []
    for ref in refs:
        cooked.append(precook(ref, n))
    return cooked
21
+
22
def cook_test(test, n=4):
    """Precompute n-gram counts for a candidate sentence."""
    return precook(test, n, out=True)
24
+
25
class Cider(object):
    """CIDEr scorer.

    Accumulates cooked n-gram counts for candidate/reference pairs and scores
    them with TF-IDF-weighted cosine similarity per n-gram order, with a
    Gaussian penalty on the length difference (Vedantam et al., CIDEr).
    """

    def copy(self):
        """Return a shallow copy sharing the cooked candidate/reference lists."""
        new = Cider(n=self.n)
        new.ctest = copy.copy(self.ctest)
        new.crefs = copy.copy(self.crefs)
        return new

    def __init__(self, test=None, refs=None, n=4, sigma=6.0, idf=None):
        # n: maximum n-gram order; sigma: std-dev of the Gaussian length penalty.
        self.n = n
        self.sigma = sigma
        self.crefs = []  # per-sample lists of reference n-gram count dicts
        self.ctest = []  # per-sample candidate n-gram count dict (or None)
        self.document_frequency = defaultdict(float)
        self.ref_len = None  # log of the (effective) corpus size

        if idf:
            # Precomputed IDF supplied: assumes idf is a dict with keys 'df'
            # (ngram -> document frequency) and 'ref_len' — TODO confirm this
            # matches the pickled coco-val-df format used by the caller.
            self.document_frequency = idf['df']
            self.ref_len = np.log(float(idf['ref_len']))  # Use reference length from the IDF

        self.cook_append(test, refs)

    def cook_append(self, test, refs):
        """Cook and store one candidate/reference pair (skipped when refs is None)."""
        if refs is not None:
            self.crefs.append(cook_refs(refs))
            if test is not None:
                self.ctest.append(cook_test(test))
            else:
                self.ctest.append(None)

    def size(self):
        """Number of stored samples; crefs and ctest must stay in lockstep."""
        assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
        return len(self.crefs)

    def __iadd__(self, other):
        """Accumulate either a (test, refs) tuple or another Cider instance."""
        if type(other) is tuple:
            self.cook_append(other[0], other[1])
        else:
            self.ctest.extend(other.ctest)
            self.crefs.extend(other.crefs)
        return self

    def compute_doc_freq(self):
        '''Compute term frequency for reference data to generate IDF.'''
        if not self.document_frequency:  # Handle empty DF (for 'corpus' mode)
            for refs in self.crefs:
                # Count each n-gram at most once per sample (document frequency,
                # not term frequency).
                for ngram in set([ngram for ref in refs for (ngram, count) in ref.items()]):
                    self.document_frequency[ngram] += 1

    def compute_cider(self, df_mode):
        """Return per-sample CIDEr scores (mean over n-gram orders, scaled by 10)."""

        def counts2vec(cnts):
            # Convert raw n-gram counts into per-order TF-IDF vectors; also
            # return the per-order L2 norms and a sentence "length" proxy.
            vec = [defaultdict(float) for _ in range(self.n)]
            length = 0
            norm = [0.0 for _ in range(self.n)]
            for (ngram, term_freq) in cnts.items():
                # max(1.0, df) avoids log(0) for unseen n-grams.
                df = np.log(max(1.0, self.document_frequency[ngram]))
                n = len(ngram) - 1  # 0-based n-gram order
                vec[n][ngram] = float(term_freq) * (self.ref_len - df)  # tf * idf (in log space)
                norm[n] += pow(vec[n][ngram], 2)

                if n == 1:
                    # NOTE(review): length is accumulated from second-order
                    # (bigram) counts — this mirrors the upstream
                    # pycocoevalcap implementation; confirm if intentional.
                    length += term_freq
            norm = [np.sqrt(n) for n in norm]
            return vec, norm, length

        def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
            # Per-order cosine similarity with hypothesis counts clipped at
            # the reference counts, times a Gaussian length penalty.
            delta = float(length_hyp - length_ref)
            val = np.array([0.0 for _ in range(self.n)])
            for n in range(self.n):
                for (ngram, count) in vec_hyp[n].items():
                    val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]

                if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
                    val[n] /= (norm_hyp[n] * norm_ref[n])

                val[n] *= np.e**(-(delta**2) / (2 * self.sigma**2))
            return val

        if df_mode == "corpus":
            self.ref_len = np.log(float(len(self.crefs)))  # Use total references in corpus as ref_len

        scores = []
        for test, refs in zip(self.ctest, self.crefs):
            vec, norm, length = counts2vec(test)
            score = np.array([0.0 for _ in range(self.n)])
            for ref in refs:
                vec_ref, norm_ref, length_ref = counts2vec(ref)
                score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
            # Average over n-gram orders, then over references, then scale by 10.
            score_avg = np.mean(score)
            score_avg /= len(refs)
            score_avg *= 10.0
            scores.append(score_avg)
        return scores

    def compute_score(self, df_mode, option=None, verbose=0):
        '''Compute the CIDEr score based on df_mode (corpus or IDF-based).

        Returns a tuple (mean score over all samples, per-sample score array).
        option and verbose are accepted for interface compatibility but unused.
        '''
        self.compute_doc_freq()

        if df_mode == "corpus":
            if not self.document_frequency:  # Handle the case where DF is empty
                raise ValueError("Document frequency is empty. Please check the corpus data.")

            min_required_data = max(self.document_frequency.values())
            # print(min_required_data)# For corpus mode, we require at least one reference
            # if len(self.ctest) < min_required_data:
            #     raise ValueError(f"Insufficient test data: {len(self.ctest)} samples, but at least {min_required_data} are required.")

        score = self.compute_cider(df_mode)
        return np.mean(np.array(score)), np.array(score)
DataFlow/dataflow/operators/eval/GeneralText/gen/cider_scorer.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import pickle
4
+ from tqdm import tqdm
5
+ from dataflow.core import OperatorABC
6
+ from dataflow.utils.storage import DataFlowStorage
7
+ from dataflow.utils.registry import OPERATOR_REGISTRY
8
+ from dataflow import get_logger
9
+ from dataflow.operators.eval.GeneralText.gen.cider.cider import Cider
10
+
11
def load_idf(idf_path):
    """Deserialize a pickled IDF statistics file and return its contents."""
    with open(idf_path, 'rb') as handle:
        return pickle.load(handle, encoding='utf-8')
15
+
16
@OPERATOR_REGISTRY.register()
class CiderScorer(OperatorABC):
    """Operator that scores candidate text against a reference with CIDEr."""

    def __init__(self, n=4, sigma=6.0, df_mode="coco-val-df", idf_path="./dataflow/operators/eval/GeneralText/gen/cider/coco-val-df.p"):
        self.logger = get_logger()
        self.logger.info(f'Initializing {self.__class__.__name__}...')
        self.score_name = 'CiderScore'
        self.n = n          # Max n-gram length (default: 4)
        self.sigma = sigma  # Sigma for the Gaussian length penalty (default: 6.0)
        self.df_mode = df_mode  # "corpus" or the name of a precomputed IDF file
        if self.df_mode != "corpus":
            # The idf file can be downloaded at https://github.com/ramavedantam/coco-caption/blob/master/data/coco-val-df.p
            # Put the file in the correct idf_path
            self.idf = load_idf(idf_path)
        else:
            self.idf = None  # 'corpus' mode derives document frequencies from the references
        self.logger.info(f'{self.__class__.__name__} initialized.')

    def _score_func(self, eval_text, ref_text):
        """Compute the CIDEr score of one candidate against a single reference."""
        cider_scorer = Cider(
            test=eval_text,
            refs=[ref_text],
            n=self.n,
            sigma=self.sigma,
            idf=self.idf  # None when df_mode == "corpus"
        )
        # Consistency fix: forward the configured self.df_mode instead of
        # re-deriving it from self.idf. Cider.compute_score only distinguishes
        # "corpus" from anything else, and self.df_mode == "corpus" exactly
        # when self.idf is None, so behavior is unchanged.
        cider_score, _ = cider_scorer.compute_score(df_mode=self.df_mode)
        return cider_score

    def eval(self, dataframe, input_key, reference_key):
        """Compute a CIDEr score for every (candidate, reference) row pair."""
        eval_data = dataframe[input_key]
        ref_data = dataframe[reference_key]
        self.logger.info(f"Evaluating {self.score_name}...")
        scores = [self._score_func(eval_text, ref_text) for eval_text, ref_text in tqdm(zip(eval_data, ref_data), desc="CiderScorer Evaluating...")]
        self.logger.info("Evaluation complete!")
        return scores

    def run(self, storage: DataFlowStorage, input_key: str, reference_key: str, output_key: str='CiderScore'):
        """Read the dataframe from storage, attach scores under output_key, and write it back."""
        self.input_key = input_key
        self.reference_key = reference_key
        self.output_key = output_key
        dataframe = storage.read("dataframe")
        scores = self.eval(dataframe, input_key, reference_key)
        dataframe[self.output_key] = scores
        storage.write(dataframe)
DataFlow/dataflow/operators/eval/GeneralText/models/Kenlm/__pycache__/model.cpython-310.pyc ADDED
Binary file (5.21 kB). View file
 
DataFlow/dataflow/operators/eval/GeneralText/models/Kenlm/model.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import unicodedata
4
+ from typing import Dict
5
+
6
+ import kenlm
7
+ import sentencepiece
8
+
9
class SentencePiece:
    """Thin wrapper around a SentencePiece model used to tokenize text for KenLM."""

    def __init__(
        self,
        model: str,
    ):
        super().__init__()
        self.sp = sentencepiece.SentencePieceProcessor()
        self.sp.load(str(model))

    def do(self, text: str) -> str:
        """Tokenize *text* and return the pieces joined by single spaces.

        Fix: the original annotations declared ``dict`` for both the argument
        and the return value, but the method clearly operates on strings
        (``encode_as_pieces`` input, ``" ".join`` output).
        """
        tokenized = self.sp.encode_as_pieces(text)
        return " ".join(tokenized)
21
+
22
+
23
class KenlmModel:
    """KenLM-based perplexity scorer with cc_net-style text normalization."""

    # Matches any single decimal digit (for number normalization).
    digit_re: re.Pattern = re.compile(r"\d")
    # Mapping of full-width / typographic punctuation to ASCII equivalents.
    # NOTE(review): the '"1" -> quote' entry looks suspicious but is preserved
    # verbatim from the upstream cc_net table — confirm before changing.
    unicode_punct: Dict[str, str] = {
        ",": ",",
        "。": ".",
        "、": ",",
        "„": '"',
        "”": '"',
        "“": '"',
        "«": '"',
        "»": '"',
        "1": '"',
        "」": '"',
        "「": '"',
        "《": '"',
        "》": '"',
        "´": "'",
        "∶": ":",
        ":": ":",
        "?": "?",
        "!": "!",
        "(": "(",
        ")": ")",
        ";": ";",
        "–": "-",
        "—": " - ",
        ".": ". ",
        "~": "~",
        "’": "'",
        "…": "...",
        "━": "-",
        "〈": "<",
        "〉": ">",
        "【": "[",
        "】": "]",
        "%": "%",
        "►": "-",
    }
    unicode_punct_re = re.compile(f"[{''.join(unicode_punct.keys())}]")
    # C0 (0-31) and C1 (127-159) control characters to strip.
    non_printing_chars_re = re.compile(
        f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
    )
    kenlm_model_dir = None
    sentence_piece_model_dir = None

    def __init__(
        self,
        model_dataset: str,
        language: str,
        lower_case: bool = False,
        remove_accents: bool = False,
        normalize_numbers: bool = True,
        punctuation: int = 1,
    ):
        """Load the KenLM model and SentencePiece tokenizer for *language*.

        Args:
            model_dataset: directory holding ``{language}.arpa.bin`` and
                ``{language}.sp.model``.
            language: language code used to locate the model files.
            lower_case: lowercase text during normalization.
            remove_accents: strip combining accents during normalization.
            normalize_numbers: replace every digit with '0'.
            punctuation: 1 = map unicode punctuation to ASCII, 2 = remove it.
        """
        self.model = kenlm.Model(os.path.join(model_dataset, f"{language}.arpa.bin"))
        self.tokenizer = SentencePiece(os.path.join(model_dataset, f"{language}.sp.model"))
        self.accent = remove_accents
        self.case = lower_case
        self.numbers = normalize_numbers
        self.punct = punctuation

    @classmethod
    def from_pretrained(
        cls,
        model_dataset: str,
        language: str,
    ):
        """Construct a model with the default cc_net normalization settings."""
        return cls(
            model_dataset,
            language,
            False,
            False,
            True,
            1,
        )

    def pp(self, log_score, length):
        """Convert a total log10 score and token count into perplexity."""
        return 10.0 ** (-log_score / length)

    def get_perplexity(self, doc: str, normalize_cc_net: bool = True):
        """Return the document perplexity under the KenLM model, rounded to 0.1."""
        if normalize_cc_net:
            doc = self.normalize(
                doc,
                accent=self.accent,
                case=self.case,
                numbers=self.numbers,
                punct=self.punct,
            )
        # Tokenize (after normalizing): See https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/mine.py#L352 for full pipeline
        doc = self.tokenizer.do(doc)
        doc_log_score, doc_length = 0, 0
        for line in doc.split("\n"):
            log_score = self.model.score(line)
            # +1 accounts for the implicit end-of-sentence token KenLM scores.
            length = len(line.split()) + 1
            doc_log_score += log_score
            doc_length += length
        return round(self.pp(doc_log_score, doc_length), 1)

    def normalize(
        self,
        line: str,
        accent: bool = True,
        case: bool = True,
        numbers: bool = True,
        punct: int = 1,
    ) -> str:
        """Apply the cc_net normalization steps (case, accents, digits, punctuation)."""
        line = line.strip()
        if not line:
            return line
        if case:
            line = line.lower()
        if accent:
            line = self.strip_accents(line)
        if numbers:
            line = self.digit_re.sub("0", line)
        if punct == 1:
            line = self.replace_unicode_punct(line)
        elif punct == 2:
            line = self.remove_unicode_punct(line)
        line = self.remove_non_printing_char(line)
        return line

    def strip_accents(self, line: str) -> str:
        """Strips accents (combining marks) from a piece of text."""
        nfd = unicodedata.normalize("NFD", line)
        output = [c for c in nfd if unicodedata.category(c) != "Mn"]
        # Bug fix: the original fast path tested `len(output) == line`,
        # comparing an int to a str — always False, i.e. dead code. Removing
        # it keeps behavior identical (the join path was always taken) while
        # eliminating the type-confused comparison.
        return "".join(output)

    def replace_unicode_punct(self, text: str) -> str:
        """Map known unicode punctuation characters to ASCII equivalents."""
        return "".join(self.unicode_punct.get(c, c) for c in text)

    def remove_unicode_punct(self, text: str) -> str:
        """More aggressive version of replace_unicode_punct but also faster."""
        return self.unicode_punct_re.sub("", text)

    def remove_non_printing_char(self, text: str) -> str:
        """Drop C0/C1 control characters."""
        return self.non_printing_chars_re.sub("", text)
DataFlow/dataflow/operators/eval/GeneralText/models/Qurating/__pycache__/qurater_annotate.cpython-310.pyc ADDED
Binary file (7.02 kB). View file
 
DataFlow/dataflow/operators/eval/GeneralText/models/Qurating/modeling/__pycache__/modeling_flash_llama.cpython-310.pyc ADDED
Binary file (25 kB). View file
 
DataFlow/dataflow/operators/eval/GeneralText/models/Qurating/modeling/modeling_flash_llama.py ADDED
@@ -0,0 +1,853 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """ PyTorch LLaMA model."""
21
+ from typing import List, Optional, Tuple, Union, Any
22
+
23
+ import torch
24
+ import torch.nn.functional as F
25
+ import torch.utils.checkpoint
26
+ from torch import nn
27
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
28
+
29
+ import torch.distributed as dist
30
+
31
+ from transformers.activations import ACT2FN
32
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
33
+ from transformers.modeling_utils import PreTrainedModel
34
+ from transformers.utils import logging
35
+ from transformers.models.llama.configuration_llama import LlamaConfig
36
+
37
def try_import_flash_attention():
    """Probe that the flash-attention CUDA kernels are importable, raising an
    actionable ImportError when they are not.

    NOTE(review): the imported names are bound in this function's *local*
    scope and discarded on return, while code later in this module references
    flash_attn_kvpacked_func etc. as module-level names — presumably those
    globals are provided elsewhere; confirm how the names actually reach
    module scope.
    """
    try:
        from flash_attn import flash_attn_kvpacked_func, flash_attn_varlen_kvpacked_func, flash_attn_with_kvcache
        from flash_attn.bert_padding import unpad_input, pad_input
        from flash_attn.layers.rotary import apply_rotary_emb_func
    except ImportError as e:
        # The rotary kernel ships as a separate csrc subpackage, so distinguish
        # that failure from a plainly missing flash_attn install.
        if 'flash_attn.layers.rotary' in str(e):
            raise ImportError('Please install RoPE kernels: `pip install git+https://github.com/HazyResearch/flash-attention.git#subdirectory=csrc/rotary`')
        else:
            raise ImportError('Please install flash_attention dependency in GPU environment')
47
+ from dataflow import get_logger
48
+
49
+ logger = logging.get_logger(__name__)
50
+
51
+ # @torch.jit.script
52
def rmsnorm_func(hidden_states, weight, variance_epsilon):
    """RMS-normalize *hidden_states* in fp32, scale by *weight*, cast back."""
    orig_dtype = hidden_states.dtype
    states = hidden_states.to(torch.float32)
    mean_square = states.pow(2).mean(-1, keepdim=True)
    normalized = states * torch.rsqrt(mean_square + variance_epsilon)
    return (weight * normalized).to(orig_dtype)
58
+
59
+
60
class LlamaRMSNorm(nn.Module):
    """RMS normalization layer, numerically equivalent to T5LayerNorm."""

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        # Learnable per-channel scale, initialized to identity.
        self.weight = nn.Parameter(torch.ones(hidden_size))
        # Non-persistent buffer so eps follows the module's device without
        # being written into checkpoints.
        self.register_buffer("variance_epsilon", torch.tensor(eps), persistent=False)

    def forward(self, hidden_states):
        """Normalize *hidden_states* by their RMS and apply the learned scale."""
        return rmsnorm_func(hidden_states, self.weight, self.variance_epsilon)
75
+
76
+
77
class FlashRotaryEmbedding(torch.nn.Module):
    """
    The rotary position embeddings from RoFormer_ (Su et. al).
    A crucial insight from the method is that the query and keys are
    transformed by rotation matrices which depend on the relative positions.

    Other implementations are available in the Rotary Transformer repo_ and in
    GPT-NeoX_, GPT-NeoX was an inspiration

    .. _RoFormer: https://arxiv.org/abs/2104.09864
    .. _repo: https://github.com/ZhuiyiTechnology/roformer
    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox

    If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
    A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96
    Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py
    """

    def __init__(self, dim: int, base=10000.0, interleaved=False, scale_base=None,
                 scaling_factor=1.0, pos_idx_in_fp32=True, device=None):
        """
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
            otherwise they might be in lower precision.
            This option was added because previously (before 2023-07-02), when we construct
            the position indices, we use the dtype of self.inv_freq. In most cases this would
            be fp32, but if the model is trained in pure bf16 (not mixed precision), then
            self.inv_freq would be bf16, and the position indices are also in bf16.
            Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the
            embeddings for some positions will coincide.
            To maintain compatibility with models previously trained in pure bf16,
            we add this option.
        scaling_factor: RotaryEmbedding extended with linear scaling.
        """
        super().__init__()
        self.dim = dim
        self.base = float(base)
        self.pos_idx_in_fp32 = pos_idx_in_fp32
        # Generate and save the inverse frequency buffer (non trainable)
        inv_freq = self._compute_inv_freq(device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.interleaved = interleaved
        self.scale_base = scale_base
        self.scaling_factor = scaling_factor
        # XPos per-dimension decay; None disables the XPos path entirely.
        scale = ((torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim)
                 / (1.4 * dim) if scale_base is not None else None)
        self.register_buffer("scale", scale)

        # Lazily-built cos/sin tables, regenerated when seqlen/device/dtype change.
        self._seq_len_cached = 0
        self._cos_cached = None
        self._sin_cached = None
        self._cos_k_cached = None
        self._sin_k_cached = None

    def _compute_inv_freq(self, device=None):
        # Standard RoPE inverse frequencies: 1 / base^(2i/dim).
        return 1 / (self.base ** (torch.arange(0, self.dim, 2, device=device,
                                               dtype=torch.float32) / self.dim))


    def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
        # Reset the tables if the sequence length has changed,
        # if we're on a new device (possibly due to tracing for instance),
        # or if we're switching from inference mode to training
        # NOTE(review): on the very first call self._cos_cached is None and
        # only the short-circuit on `seqlen > self._seq_len_cached` (0) avoids
        # the attribute access — this would fail if the first call passed
        # seqlen == 0; confirm callers never do.
        if (seqlen > self._seq_len_cached or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
            or (self.training and self._cos_cached.is_inference())):
            self._seq_len_cached = seqlen
            # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
            # And the output of arange can be quite large, so bf16 would lose a lot of precision.
            # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
            if self.pos_idx_in_fp32:
                t = torch.arange(seqlen, device=device, dtype=torch.float32)
                t /= self.scaling_factor
                # We want fp32 here as well since inv_freq will be multiplied with t, and the output
                # will be large. Having it in bf16 will lose a lot of precision and cause the
                # cos & sin output to change significantly.
                # We want to recompute self.inv_freq if it was not loaded in fp32
                if self.inv_freq.dtype != torch.float32:
                    inv_freq = self.inv_freq.to(torch.float32)
                else:
                    inv_freq = self.inv_freq
            else:
                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
                t /= self.scaling_factor
                inv_freq = self.inv_freq
            # Don't do einsum, it converts fp32 to fp16 under AMP
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
            freqs = torch.outer(t, inv_freq)
            if self.scale is None:
                self._cos_cached = torch.cos(freqs).to(dtype)
                self._sin_cached = torch.sin(freqs).to(dtype)
            else:
                # XPos: positions are centered around the middle of the window.
                power = ((torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device)
                          - seqlen // 2) / self.scale_base)
                scale = self.scale.to(device=power.device) ** power.unsqueeze(-1)
                # We want the multiplication by scale to happen in fp32
                self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
                self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
                self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
                self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)

    def forward(self,
                q: torch.Tensor, k: torch.Tensor,
                seqlen_offset: int = 0,
                unpadded_lengths: Optional[Tuple[torch.Tensor]] = None) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        q: (batch, seqlen, nheads, headdim)
        k: (batch, seqlen, nheads, headdim)
        seqlen_offset: can be used in generation where the qkv being passed in is only the last
            token in the batch.
        """
        if unpadded_lengths is not None:
            cu_seqlens, max_seqlen = unpadded_lengths
        else:
            cu_seqlens, max_seqlen = None, q.shape[1]
        self._update_cos_sin_cache(max_seqlen + seqlen_offset, device=q.device, dtype=q.dtype)

        if self.scale is None:
            # NOTE(review): apply_rotary_emb_func is only imported inside
            # try_import_flash_attention() in the visible code — confirm how
            # the name reaches module scope.
            return apply_rotary_emb_func(
                q, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:],
                self.interleaved, True,  # inplace=True,
                cu_seqlens=cu_seqlens, max_seqlen=max_seqlen
            ), apply_rotary_emb_func(
                k, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:],
                self.interleaved, True,  # inplace=True
                cu_seqlens=cu_seqlens, max_seqlen=max_seqlen
            )
        else:
            # The XPos (scale_base is not None) path is not implemented here.
            assert False
207
+
208
+
209
class LlamaMLP(nn.Module):
    """Gated (SwiGLU-style) feed-forward block: down(act(gate(x)) * up(x))."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        # Three bias-free projections forming the gated MLP.
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Apply the gated feed-forward transform."""
        gated = self.act_fn(self.gate_proj(x))
        return self.down_proj(gated * self.up_proj(x))
222
+
223
+
224
@torch.jit.script
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat each entry of the second-to-last axis n_rep times (in place order),
    expanding grouped KV heads to match the number of query heads."""
    if n_rep == 1:
        return hidden_states
    shape = list(hidden_states.shape)
    target_shape = shape[:-2] + [-1] + [shape[-1]]
    expand_dims = [-1] * (len(shape) - 1) + [n_rep] + [-1]
    expanded = hidden_states.unsqueeze(-2).expand(expand_dims)
    return expanded.reshape(target_shape)
232
+
233
+
234
class LlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: LlamaConfig):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        # Grouped-query attention: falls back to full MHA when the config has
        # no num_key_value_heads attribute.
        self.num_key_value_heads = getattr(config, "num_key_value_heads", self.num_heads)
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        # sqrt(head_dim); 1/norm_factor is passed as the softmax scale below.
        # Kept as a non-persistent buffer so it follows device/dtype moves.
        self.register_buffer(
            "norm_factor",
            torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype()),
            persistent=False,
        )

        if not getattr(self.config, "rope_scaling", None):
            scaling_factor = 1
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            # Only linear RoPE scaling is supported by this implementation.
            assert scaling_type == 'linear'
        theta = getattr(self.config, "rope_theta", 10000)
        self.rotary_emb = FlashRotaryEmbedding(
            self.head_dim, base=theta, interleaved=False, scaling_factor=scaling_factor,
        )

        # NOTE(review): flash_attn_kvpacked_func is referenced as a
        # module-level name; in the visible code it is only imported inside
        # try_import_flash_attention() — confirm how it reaches module scope.
        self.distributed_attn_func = flash_attn_kvpacked_func

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        # Reshape (bsz, seq, hidden) -> (bsz, heads, seq, head_dim).
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        unpadded_lengths: Optional[Tuple[torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Compute causal flash attention; returns (output, attn_weights, new_cache)."""
        h_size = hidden_states.size(-1)

        has_layer_past = past_key_value is not None

        # past_key_value is a (cached_kv_tensor, cached_length) pair.
        if has_layer_past:
            past_kv = past_key_value[0]
            past_len = past_key_value[1]
        else:
            past_len = 0

        # NOTE: Hack to include position_ids, assuming they are increasing uniformly per block
        if position_ids is not None:
            past_len += position_ids.min()

        q = self.q_proj(hidden_states)
        k = self.k_proj(hidden_states)
        v = self.v_proj(hidden_states)

        # Split projections into heads: trailing dims become (n_heads, head_dim).
        q = q.view(*q.shape[:-1], self.num_heads, self.head_dim)
        k = k.view(*k.shape[:-1], self.num_key_value_heads, self.head_dim)
        v = v.view(*v.shape[:-1], self.num_key_value_heads, self.head_dim)

        # Apply rotary embeddings, offset by the cached prefix length.
        q, k = self.rotary_emb(q, k, past_len, unpadded_lengths)

        # Pack K and V together and expand KV heads to match query heads (GQA).
        kv = torch.stack([k, v], -3)
        kv = repeat_kv(kv, self.num_key_value_groups)

        # Cache QKV values
        if has_layer_past:
            new_len = past_len+q.size(1)
            if new_len > past_kv.size(1):
                # Grow the cache in 256-token chunks to amortize reallocation.
                past_kv = torch.cat([past_kv, torch.empty(hidden_states.size(0), 256, 2, kv.size(3), kv.size(4), dtype=kv.dtype, device=kv.device)], 1)
            past_kv[:, past_len:new_len] = kv
            kv = past_kv[:, :new_len]
        else:
            past_kv = kv

        if unpadded_lengths is not None:
            # varlen, ignore padding tokens, efficient for large batch with many paddings
            assert attention_mask is not None
            cu_seqlens, max_seqlen = unpadded_lengths

            attn_outputs = flash_attn_varlen_kvpacked_func(
                q, kv,
                cu_seqlens, cu_seqlens,
                max_seqlen, max_seqlen,
                dropout_p=0.0, softmax_scale=1.0/self.norm_factor,
                causal=True, return_attn_probs=output_attentions
            )
        # elif use_cache and past_key_value is not None:
        #     attn_outputs = flash_attn_with_kvcache(
        #         q,
        #         kv[:, :, 0],
        #         kv[:, :, 1],
        #         softmax_scale=1.0/self.norm_factor,
        #         causal=True,
        #     )
        else:
            attn_outputs = flash_attn_kvpacked_func(
                q, kv,
                dropout_p=0.0,
                softmax_scale=1.0/self.norm_factor,
                causal=True,
                return_attn_probs=output_attentions,
            )
        past_key_value = (past_kv, past_len+q.size(1)) if use_cache else None


        # With return_attn_probs the kernel returns a tuple; the code below
        # takes output at index 0 and probabilities at index 2.
        attn_output = attn_outputs[0] if output_attentions else attn_outputs
        attn_output = attn_output.reshape(*attn_output.shape[:-2], h_size)
        attn_weights = attn_outputs[2] if output_attentions else None

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
367
+
368
+
369
class LlamaDecoderLayer(nn.Module):
    """One Llama transformer block: pre-norm self-attention followed by a
    pre-norm MLP, each wrapped in a residual connection."""

    def __init__(self, config: LlamaConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = LlamaAttention(config=config)
        self.mlp = LlamaMLP(config)
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        # Marker consumed by FSDP auto-wrap policies: wrap each layer separately.
        self._fsdp_wrap = True

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        unpadded_lengths: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            position_ids (`torch.LongTensor`, *optional*): position indices forwarded to the attention module.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            unpadded_lengths (`Tuple(torch.Tensor)`, *optional*): `(cu_seqlens, max_seqlen)` for the
                flash-attn varlen path when padding has been stripped.
            output_attentions (`bool`, *optional*): whether to also return the attention tensors.
            use_cache (`bool`, *optional*): if `True`, the updated key/value cache is returned so it
                can be used to speed up decoding.
        """
        # --- self-attention sub-block (pre-norm + residual) ---
        attn_input = self.input_layernorm(hidden_states)
        attn_output, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=attn_input,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            unpadded_lengths=unpadded_lengths,
        )
        hidden_states = hidden_states + attn_output

        # --- feed-forward sub-block (pre-norm + residual) ---
        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = hidden_states + mlp_output

        outputs = (hidden_states,)
        if output_attentions:
            outputs = outputs + (self_attn_weights,)
        if use_cache:
            outputs = outputs + (present_key_value,)
        return outputs
434
+
435
+
436
class LlamaPreTrainedModel(PreTrainedModel):
    """Base class wiring the Llama config/weight-init conventions into
    the HuggingFace `PreTrainedModel` machinery."""

    config_class = LlamaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LlamaDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        """Initialize Linear/Embedding weights from N(0, initializer_range);
        zero out biases and the padding embedding row."""
        std = self.config.initializer_range
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
454
+
455
class LlamaModel(LlamaPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]

    Args:
        config: LlamaConfig
    """

    def __init__(self, config: LlamaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        try_import_flash_attention()
        self.post_init()

    def get_input_embeddings(self):
        # Token embedding table.
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """Run the decoder stack.

        If the batch contains padding and no KV cache is in use, hidden states are
        flattened into flash-attn's packed ("unpadded") layout so the varlen kernels
        skip padding tokens; they are re-padded before the final norm. Flags left as
        `None` fall back to the corresponding `self.config` values.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        # position_ids = None

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        hidden_states = inputs_embeds
        bsz = hidden_states.size(0)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # Strip padding only when there actually is padding and no KV cache is used:
        # the packed layout is incompatible with the cache-append logic in attention.
        if (
            ((attention_mask is not None) and (not attention_mask.all().item()))
            and not use_cache
        ):
            try: # for flash-attn latest version
                hidden_states, unpad_indices, cu_seqlens, max_seqlen, _ = unpad_input(hidden_states, attention_mask)
            except: # for flash-attn 2.3.3 verstion
                hidden_states, unpad_indices, cu_seqlens, max_seqlen = unpad_input(hidden_states, attention_mask)
            unpadded_lengths = (cu_seqlens, max_seqlen)
        else:
            unpadded_lengths = None

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                # Re-pad intermediate states so collected hidden states have a
                # uniform (batch, seq, dim) shape regardless of the packed layout.
                if unpadded_lengths is not None:
                    all_hidden_states += (pad_input(hidden_states, unpad_indices, bsz, max_seqlen),)
                else:
                    all_hidden_states += (hidden_states,)

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                # Positional args mirror LlamaDecoderLayer.forward; past_key_value is
                # None and use_cache is False under checkpointing (cache was disabled above).
                layer_outputs = torch.utils.checkpoint.checkpoint(
                    decoder_layer,
                    hidden_states,
                    attention_mask,
                    position_ids,
                    None,
                    unpadded_lengths,
                    output_attentions,
                    False,
                    use_reentrant=False
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_value,
                    unpadded_lengths=unpadded_lengths,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                # Cache is the last element of the layer tuple; its index shifts
                # by one when attention weights are also returned.
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if unpadded_lengths is not None:
            hidden_states = pad_input(hidden_states, unpad_indices, bsz, max_seqlen)
        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
602
+
603
+
604
class LlamaForCausalLM(LlamaPreTrainedModel):
    """Llama decoder with a language-modeling head for next-token prediction."""

    # lm_head may share weights with the input embedding table.
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = LlamaModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        try_import_flash_attention()
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # NOTE(review): accepted but never used in this body — presumably kept for
        # compatibility with an external training loop; confirm before removing.
        avg_valid_labels_per_chunk: Optional[float] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LlamaForCausalLM

        >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        # `.float()` upcasts logits so the cross-entropy below is computed in fp32
        # even when the backbone runs in a lower precision.
        logits = self.lm_head(hidden_states).float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        # NOTE(review): truthiness check — an empty-but-present cache would be
        # treated the same as no cache; confirm callers never pass an empty tuple.
        if past_key_values:
            # With a cache, only the newest token needs to be fed forward.
            input_ids = input_ids[:, -1:]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        # Reorder each layer's cached states along the batch dim to follow
        # beam-search hypothesis reordering.
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
750
+
751
+
752
class LlamaForSequenceClassification(LlamaPreTrainedModel):
    """Llama decoder with a linear classification head scored on the last
    non-padding token of each sequence."""

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = LlamaModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
        try_import_flash_attention()
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        # Pool the logits at the last real token of each sequence. Without a pad
        # token we cannot locate it per-row, so only batch_size == 1 is supported.
        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # Index of the last non-pad token per row.
                sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
            else:
                # With inputs_embeds there are no token ids to compare against:
                # fall back to the final position.
                sequence_lengths = -1

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            # Infer the problem type on first use and persist it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
DataFlow/dataflow/operators/eval/GeneralText/models/Qurating/qurater_annotate.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_from_disk, load_dataset, concatenate_datasets
2
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
3
+ from .modeling.modeling_flash_llama import LlamaForSequenceClassification
4
+ import torch
5
+ import argparse
6
+ import numpy as np
7
+
8
class TokenizeAndChunk:
    """Callable for `datasets.map(batched=True)`: tokenizes each text (unless
    pre-tokenized ids are present) and splits the token stream into fixed-size
    chunks of at most `tokens` tokens."""

    def __init__(self, tokenizer_name, text_field, tokens_field, tokens, model_cache_dir):
        self.tokens = tokens
        self.tokenizer_name = tokenizer_name
        self.text_field = text_field
        self.tokens_field = tokens_field

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True, cache_dir = model_cache_dir)
        self.tokenizer.pad_token_id = 0

    def __getstate__(self):
        # Pickle only the lightweight config; the tokenizer is rebuilt on unpickle
        # (NOTE(review): model_cache_dir is not pickled, so unpickling reloads the
        # tokenizer with the default cache — confirm that is intended).
        return {
            "tokenizer_name": self.tokenizer_name,
            "text_field": self.text_field,
            "tokens_field": self.tokens_field,
            "tokens": self.tokens,
        }

    def __setstate__(self, state):
        self.__init__(**state)

    def tokenize_and_chunk(self, source_tokens):
        """Split every token sequence into chunks of at most `self.tokens` tokens.

        Returns a pair `(chunk_ids, chunk_counts)` of per-sequence lists.
        """
        split_per_seq = [
            torch.tensor(seq, dtype=torch.long).split(self.tokens)
            for seq in source_tokens
        ]
        chunk_ids = [[piece.tolist() for piece in pieces] for pieces in split_per_seq]
        chunk_counts = [[len(piece) for piece in pieces] for pieces in split_per_seq]
        return chunk_ids, chunk_counts

    def __call__(self, example):
        # Prefer pre-tokenized ids when the batch already carries them.
        if self.tokens_field in example:
            source_tokens = example[self.tokens_field]
        else:
            source_tokens = self.tokenizer(example[self.text_field], truncation=False, padding=False, add_special_tokens=False).input_ids

        chunks_token_ids, chunks_token_counts = self.tokenize_and_chunk(source_tokens)

        assert len(example[self.text_field]) == len(chunks_token_ids)
        assert len(example[self.text_field]) == len(chunks_token_counts)

        return {
            "chunks_token_ids": chunks_token_ids,
            "chunks_token_counts": chunks_token_counts,
        }
55
+
56
+
57
class ModelAnnotator:
    """Callable for `datasets.map(batched=True, with_indices=True)`: scores every
    token chunk with a sequence-classification model and aggregates per-label
    chunk scores back onto each source example."""

    def __init__(self, model_name, labels, device_batch_size, device, model_cache_dir):

        self.model_name = model_name
        self.labels = labels
        self.device_batch_size = device_batch_size

        self.model = LlamaForSequenceClassification.from_pretrained(
            self.model_name,
            torch_dtype=torch.bfloat16,
            cache_dir=model_cache_dir)
        self.model.config.pad_token_id = 0
        self.model.eval()

        self.device = device
        print(f"Using device {self.device}")
        self.model.to(self.device)

        self.num_labels = len(labels)
        assert self.num_labels == self.model.config.num_labels, f"Number of labels ({self.num_labels}) does not match model config ({self.model.config.num_labels})"

    def __getstate__(self):
        # Pickle only the config; the model is re-loaded on unpickle.
        # NOTE(review): device and model_cache_dir are not pickled, so
        # __setstate__ -> __init__(**state) would fail on the missing required
        # args — confirm these objects are never pickled in practice.
        return {
            "model_name": self.model_name,
            "labels": self.labels,
            "device_batch_size": self.device_batch_size,
        }

    def __setstate__(self, state):
        self.__init__(**state)

    @torch.inference_mode()
    def score_chunks(self, chunks_token_ids, chunks_token_counts):
        """Score a flat list of chunks; returns a float32 tensor of shape
        (num_chunks, num_labels).

        `chunks_token_counts` must be a 1-D integer tensor (argsort/max below).
        Chunks are processed in length-sorted order so each device batch pads
        to a similar max length.
        """
        sorted_indices = torch.argsort(chunks_token_counts)

        scores = torch.zeros(len(chunks_token_ids), self.num_labels, dtype=torch.float32)

        for batch_indices in sorted_indices.split(self.device_batch_size):
            max_len = chunks_token_counts[batch_indices].max()

            # Zero-pad each chunk to the batch max length (pad_token_id is 0).
            input_ids = torch.zeros((len(batch_indices), max_len), dtype=torch.long)
            attention_mask = torch.zeros((len(batch_indices), max_len), dtype=torch.long)

            for i, j in enumerate(batch_indices):
                seq = chunks_token_ids[j]
                input_ids[i, :len(seq)] = seq
                attention_mask[i, :len(seq)] = 1

            outputs = self.model(input_ids.to(self.device), attention_mask=attention_mask.to(self.device), use_cache=False)
            # Scatter batch scores back to their original (unsorted) positions.
            scores[batch_indices] = outputs.logits.float().cpu()
        return scores

    def __call__(self, example, indices):
        """Score one mapped batch; returns per-example chunk scores plus a
        token-count-weighted average per label."""
        num_seqs = len(indices)

        # Flatten (example, chunk) nesting; source_ids remembers which example
        # each flattened chunk came from.
        source_ids = [i for i, counts in enumerate(example["chunks_token_counts"]) for _ in range(len(counts))]
        chunks_token_ids = [torch.tensor(chunk, dtype=torch.long) for chunks in example["chunks_token_ids"] for chunk in chunks]
        flattened_chunks_token_counts = torch.tensor([chunk for chunks in example["chunks_token_counts"] for chunk in chunks], dtype=torch.long)

        flattened_scores = self.score_chunks(chunks_token_ids, flattened_chunks_token_counts)

        chunk_token_counts = example["chunks_token_counts"]
        # chunk_scores[label][example] -> list of that example's chunk scores.
        chunk_scores = [[[] for _ in range(num_seqs)] for _ in range(self.num_labels)]

        for source_id, score in zip(source_ids, flattened_scores):
            for label in range(self.num_labels):
                chunk_scores[label][source_id].append(score[label].item())

        output = {
            "index": indices,
            "chunk_lengths": chunk_token_counts,
            "length": [sum(counts) for counts in chunk_token_counts],
        }

        for i, label in enumerate(self.labels):
            output[f"{label}_chunks"] = chunk_scores[i]
            # Weight each chunk's score by its token count.
            output[f"{label}_average"] = [
                np.average(scores, weights=token_counts).item()
                for scores, token_counts in zip(chunk_scores[i], chunk_token_counts)
            ]

        return output
139
+
140
+
141
if __name__ == "__main__":
    # CLI: annotate a dataset with per-label quality scores.
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str)
    parser.add_argument("output", type=str)

    parser.add_argument("-F", "--data_files", type=str, nargs="+", default=[])
    parser.add_argument("-S", "--shard", type=int, nargs=2, default=[0, 1])
    parser.add_argument("-M", "--model", type=str, required=True)
    parser.add_argument("-t", "--tokens", type=int, default=512)
    parser.add_argument("--map_batch_size", type=int, default=512)
    parser.add_argument("-b", "--device_batch_size", type=int, default=16)
    parser.add_argument("-w", "--num_workers", type=int, default=1)
    parser.add_argument("--text_field", type=str, default="text")
    parser.add_argument("--tokens_field", type=str, default="input_ids")
    parser.add_argument("--labels", type=str, nargs="+")
    # Backward-compatible additions so the constructors below receive their
    # required arguments (see BUG FIX notes).
    parser.add_argument("--device", type=str, default=None,
                        help="Torch device for scoring; defaults to cuda when available, else cpu.")
    parser.add_argument("--model_cache_dir", type=str, default=None,
                        help="HuggingFace cache directory for tokenizer/model downloads.")

    args = parser.parse_args()
    print(args)

    if args.input == "json":
        dataset = load_dataset("json", data_files=args.data_files, split="train")
    else:
        dataset = load_from_disk(args.input)

    src_dataset = dataset.shard(args.shard[1], args.shard[0], contiguous=True)
    dataset = src_dataset

    device = args.device if args.device is not None else ("cuda" if torch.cuda.is_available() else "cpu")

    print(dataset)
    print("Total number of examples:", len(dataset))
    # BUG FIX: TokenizeAndChunk.__init__ requires `model_cache_dir`; the original
    # call omitted it and raised TypeError before any work was done.
    dataset = dataset.map(
        TokenizeAndChunk(args.model, args.text_field, args.tokens_field, args.tokens, args.model_cache_dir),
        batched=True,
        batch_size=args.map_batch_size,
        num_proc=args.num_workers,
        remove_columns=dataset.column_names)

    print("After tokenization: Total number of examples:", len(dataset))
    # BUG FIX: ModelAnnotator.__init__ requires `device` and `model_cache_dir`;
    # the original call omitted both.
    dataset = dataset.map(
        ModelAnnotator(args.model, args.labels, args.device_batch_size, device, args.model_cache_dir),
        batched=True,
        with_indices=True,
        batch_size=args.map_batch_size,
        remove_columns=dataset.column_names)

    # Re-attach the original columns next to the new score columns.
    dataset = concatenate_datasets([dataset, src_dataset], axis=1)

    print("After annotation: Total number of examples:", len(dataset))

    print(f"Saving to {args.output}")
    dataset.save_to_disk(args.output)
DataFlow/dataflow/operators/eval/GeneralText/models/Superfiltering/__pycache__/data_analysis.cpython-310.pyc ADDED
Binary file (1.51 kB). View file
 
DataFlow/dataflow/operators/eval/GeneralText/models/Superfiltering/data_analysis.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import torch
4
+ import argparse
5
+ from tqdm import tqdm
6
+
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM
8
+
9
+
10
# Bare prompt templates (no system preamble): the instruction, optionally
# followed by the input, each terminated with a newline.
PROMPT_DICT_NONE = {
    "prompt_input": (
        "{instruction}\n{input}\n"
    ),
    "prompt_no_input": (
        "{instruction}\n"
    ),
}
18
# Used to get the ppl and emb for the whole input
def get_perplexity_and_embedding_whole_text(tokenizer, model, text, max_length, device):
    """Score `text` with `model` as a causal LM over the full input.

    Returns `(perplexity, loss)` as Python floats. (Despite the name, no
    embedding is returned by this implementation.)
    """
    encoded = tokenizer.encode(
        text, return_tensors="pt", truncation=True, max_length=max_length
    ).to(device)

    with torch.no_grad():
        result = model(encoded, labels=encoded.contiguous())
    nll = result.loss
    ppl = torch.exp(nll)

    return ppl.to('cpu').item(), nll.to('cpu').item()
29
+
30
+
31
# Used to get the ppl and emb for part of input, used in conditional version, and token-wise loss
def get_perplexity_and_embedding_part_text(tokenizer, model, text, target_span, max_length, device):
    """Score only the trailing `target_span` of `text` with `model`.

    Tokens before the last occurrence of `target_span` have their labels set to
    -100 so the LM loss covers the span alone. Returns `(perplexity, loss)` as
    Python floats, or `(0, 0)` on any failure (best-effort batch scoring).
    """
    try:
        input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=max_length).to(device)

        # Mask everything before the target span: -100 labels are ignored by
        # the model's cross-entropy loss.
        start_index = text.rfind(target_span)
        start_token = len(tokenizer.encode(text[:start_index]))

        labels = input_ids.clone()
        labels[0, :start_token] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=labels)

        loss = outputs.loss
        perplexity = torch.exp(loss)

        return perplexity.to('cpu').item(), loss.to('cpu').item()

    # BUG FIX: was a bare `except:`, which also swallowed KeyboardInterrupt and
    # SystemExit; keep the best-effort (0, 0) fallback but only for real errors.
    except Exception:
        return 0, 0
DataFlow/dataflow/operators/eval/GeneralText/models/__pycache__/debertav3_scorer.cpython-310.pyc ADDED
Binary file (3.61 kB). View file