diff --git a/EAGLE/lmms_eval/tasks/ferret/ferret.yaml b/EAGLE/lmms_eval/tasks/ferret/ferret.yaml new file mode 100644 index 0000000000000000000000000000000000000000..517649e72f93f74ec0f919b543c6287739267211 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/ferret/ferret.yaml @@ -0,0 +1,39 @@ +dataset_path: lmms-lab/Ferret-Bench +dataset_kwargs: + token: True +task: "ferret" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.ferret_doc_to_visual +doc_to_text: !function utils.ferret_doc_to_text +doc_to_target: "gpt_answer" +generation_kwargs: + until: + - "ASSISTANT:" + image_aspect_ratio: original + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false +process_results: !function utils.ferret_process_results +metric_list: + - metric: gpt_eval_ferret_all + aggregation: !function utils.ferret_all_aggregation + higher_is_better: true + - metric: gpt_eval_ferret_refer_desc + aggregation: !function utils.ferret_refer_desc_aggregation + higher_is_better: true + - metric: gpt_eval_ferret_refer_reason + aggregation: !function utils.ferret_refer_reason_aggregation + higher_is_better: true + - metric: gpt_eval_ferret_ground_conv + aggregation: !function utils.ferret_ground_conv_aggregation + higher_is_better: true +metadata: + version: 0.0 + gpt_eval_model_name: "gpt-4-0314" +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" \ No newline at end of file diff --git a/EAGLE/lmms_eval/tasks/ferret/rule.json b/EAGLE/lmms_eval/tasks/ferret/rule.json new file mode 100644 index 0000000000000000000000000000000000000000..7294372c37a477fa82125f1b2ba3ae7c3df0e000 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/ferret/rule.json @@ -0,0 +1,5 @@ +{ + "refer_desc": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question about specific region of an image. 
For your reference, the visual content in the image is represented with five descriptive sentences describing the same image. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. Also, the relationships between pairs of objects are provided, in the format of object -> relationship -> subject, where the object/subject are indexed by object id from previous object lists as well as the object names. Also, several region description are given, each describing a box region of image, with detailed coordinates. \nPlease rate the spatial correspondence, helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, + "refer_reason": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question about specific region of an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. 
These values correspond to the top left x, top left y, bottom right x, and bottom right y. Also, the relationships between pairs of objects are provided, in the format of object -> relationship -> subject, where the object/subject are indexed by object id from previous object lists as well as the object names. Also, several region description are given, each describing a box region of image, with detailed coordinates. \nPlease rate the spatial correspondence, helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, + "ground_conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question that requires model to predict the coordinates of relevant object. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. Also, the relationships between pairs of objects are provided, in the format of object -> relationship -> subject, where the object/subject are indexed by object id from previous object lists as well as the object names. 
Also, several region description are given, each describing a box region of image, with detailed coordinates. \nPlease rate the predicted coordinates, helpfulness, relevance, accuracy, level of details of their responses. Specifically, pay your attention to the precision of the coordinates and whether it matches the object. Small deviation (<20% of ground-truth box width or height) of coordinates is allowed and shouldn't be punished. More than that, the degree of deviation should be reflected in scoring too. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."} +} \ No newline at end of file diff --git a/EAGLE/lmms_eval/tasks/ferret/utils.py b/EAGLE/lmms_eval/tasks/ferret/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7e86ea474dfb248053c849f91b55ecc81ae383ce --- /dev/null +++ b/EAGLE/lmms_eval/tasks/ferret/utils.py @@ -0,0 +1,206 @@ +import json +import logging +import os +import requests +import numpy as np +import openai +from openai import OpenAI +import time +import yaml +from pathlib import Path +from copy import deepcopy + +eval_logger = logging.getLogger("lmms-eval") +NUM_SECONDS_TO_SLEEP = 0.5 + +FERRET_W_METRICS = ["gpt_eval_ferret_refer_desc", "gpt_eval_ferret_refer_reason", "gpt_eval_ferret_ground_conv"] + +rule_dict = json.load(open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "rule.json"), "r")) + +with open(Path(__file__).parent / "ferret.yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + # remove function definition 
since yaml load cannot handle it + if "!function" not in line: + safe_data.append(line) + + config = yaml.safe_load("".join(safe_data)) + +GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"] + +API_TYPE = os.getenv("API_TYPE", "openai") + +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } +elif API_TYPE == "azure": + API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken") + API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") + headers = { + "api-key": API_KEY, + "Content-Type": "application/json", + } + + +def get_eval(content: str, max_tokens: int, retries: int = 3): + global headers + + messages = [ + { + "role": "system", + "content": "You are a helpful and precise assistant for checking the quality of the answer.", + }, + {"role": "user", "content": content}, + ] + + payload = { + "model": GPT_EVAL_MODEL_NAME, + "messages": messages, + "temperature": 0.2, + "max_tokens": max_tokens, + } + + for attempt in range(retries): + try: + response = requests.post(API_URL, headers=headers, json=payload) + response.raise_for_status() + response_data = response.json() + + content = response_data["choices"][0]["message"]["content"].strip() + if content != "": + return content, response_data["model"] + break # If successful, break out of the loop + + except Exception as e: + eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}") + if attempt < retries - 1: # If we have retries left, sleep and then continue to next attempt + time.sleep(NUM_SECONDS_TO_SLEEP) + else: # If this was the last attempt, log and return empty + eval_logger.error(f"All {retries} attempts failed. 
Last error message: {e}") + return "", "" + return "", "" + + +def parse_score(review): + try: + score_pair = review.split("\n")[0] + score_pair = score_pair.replace(",", " ") + sp = score_pair.split(" ") + if len(sp) == 2: + return [float(sp[0]), float(sp[1])] + else: + eval_logger.debug(f"Can not split: {review}. Returning [-1, -1]") + return [-1, -1] + except Exception as e: + eval_logger.debug(f"Error: {e}. Returning [-1, -1]") + return [-1, -1] + + +def ferret_doc_to_visual(doc): + return [doc["image"].convert("RGB")] + + +def ferret_doc_to_text(doc, model_specific_prompt_kwargs=None): + if model_specific_prompt_kwargs is None: + model_specific_prompt_kwargs = {} + pre_prompt = model_specific_prompt_kwargs.get("pre_prompt", "") + post_prompt = model_specific_prompt_kwargs.get("post_prompt", "") + question = f"{pre_prompt}{doc['question']}{post_prompt}" + return question + + +def ferret_process_results(doc, result): + """ + Args: + doc: a instance of the eval dataset + results: [pred] + Returns: + a dictionary with key: metric name (in this case coco_bleu), value: metric value + """ + try: + question = doc.get("question", "") + ans1 = doc.get("gpt_answer", "") + ans2 = result[0] if result else "" + context = doc.get("context", []) + context = "\n".join(context) if isinstance(context, list) else context + category = doc.get("category", "") + rule = rule_dict.get(category, {}) + prompt = rule.get("prompt", "") + role = rule.get("role", "user") + content = f"[Context]\n{context}\n\n" f"[Question]\n{question}\n\n" f"[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n" f"[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n" f"[System]\n{prompt}\n\n" + review, model_name = get_eval(content, 1024) + scores = parse_score(review) + except Exception as e: + eval_logger.error(f"Error for Question ID: {doc.get('question_id', 'Unknown')}: {e}") + review = "Failed to Get a Proper Review." 
+ model_name = "Failed Request" + scores = [-1, -1] + + metric = f"gpt_eval_ferret_{doc.get('category', 'all')}" + category_review_dict = { + "question": question, + "ans1": ans1, + "ans2": ans2, + "context": context, + "category": category, + "review": review, + "scores": scores, + "eval_model": model_name, + } + + non_category_review_dict = deepcopy(category_review_dict) + non_category_review_dict["scores"] = [-999, -999] + + data_dict = {} + for m in FERRET_W_METRICS: + if m == metric: + data_dict[m] = category_review_dict + else: + data_dict[m] = non_category_review_dict + data_dict["gpt_eval_ferret_all"] = category_review_dict + + # return {"gpt_eval_ferret_all": review_dict} + return data_dict + + +def ferret_refer_desc_aggregation(results): + return ferret_aggregation(results, "refer_desc") + + +def ferret_refer_reason_aggregation(results): + return ferret_aggregation(results, "refer_reason") + + +def ferret_ground_conv_aggregation(results): + return ferret_aggregation(results, "ground_conv") + + +def ferret_all_aggregation(results): + return ferret_aggregation(results, "all") + + +def ferret_aggregation(results, category): + try: + scores = [] + for result in results: + if -999 in result["scores"]: + continue + scores.append(result["scores"]) + + stats = np.asarray(scores).mean(0).tolist() + stats = [round(x, 3) for x in stats] + # gpt4_score_percentage = stats[0] * 10 + # model_score_percentage = stats[1] * 10 + # eval_logger.info(f"Category: {category}") + # eval_logger.info(f"GPT4 Score: {gpt4_score_percentage:.1f}%") + # eval_logger.info(f"Model Score: {model_score_percentage:.1f}%") + # eval_logger.info("=========================") + return round(stats[1] / stats[0] * 100, 1) + except Exception as e: + eval_logger.info(f"Error in ferret_aggregation: {e}, and in category: {category}") + return None diff --git a/EAGLE/lmms_eval/tasks/flickr30k/flickr30k.yaml b/EAGLE/lmms_eval/tasks/flickr30k/flickr30k.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..94f63bcf88e568dfb9a754b42fad40ab456712b7 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/flickr30k/flickr30k.yaml @@ -0,0 +1,3 @@ +group: flickr30k +task: +- flickr30k_test \ No newline at end of file diff --git a/EAGLE/lmms_eval/tasks/flickr30k/flickr30k_test.yaml b/EAGLE/lmms_eval/tasks/flickr30k/flickr30k_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..737d9ff4964a2e7f0208abfd8ec06345ed5706fd --- /dev/null +++ b/EAGLE/lmms_eval/tasks/flickr30k/flickr30k_test.yaml @@ -0,0 +1,44 @@ +dataset_path: lmms-lab/flickr30k +dataset_kwargs: + token: True +task : "flickr30k_test" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.flickr_doc_to_visual +doc_to_text: !function utils.flickr_doc_to_text +doc_to_target: "answer" +generation_kwargs: + max_new_tokens: 64 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false +process_results: !function utils.flickr_process_result +# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results +metric_list: + - metric: flickr_Bleu_4 + aggregation : !function utils.flickr_bleu4 + higher_is_better : true + - metric: flickr_Bleu_3 + aggregation : !function utils.flickr_bleu3 + higher_is_better : true + - metric: flickr_Bleu_2 + aggregation : !function utils.flickr_bleu2 + higher_is_better : true + - metric: flickr_Bleu_1 + aggregation : !function utils.flickr_bleu1 + higher_is_better : true + - metric: flickr_METEOR + aggregation : !function utils.flickr_meteor + higher_is_better : true + - metric: flickr_ROUGE_L + aggregation : !function utils.flickr_rougel + higher_is_better : true + - metric: flickr_CIDEr + aggregation : !function utils.flickr_cider + higher_is_better : true + #- metric: flickr_SPICE + # aggregation : !function utils.flickr_spice + # higher_is_better : true +metadata: + - version: 0.0 \ No newline at end of file diff --git 
a/EAGLE/lmms_eval/tasks/flickr30k/utils.py b/EAGLE/lmms_eval/tasks/flickr30k/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..44bf075454f2272cd487c25de07b4951b69bec49 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/flickr30k/utils.py @@ -0,0 +1,141 @@ +import os +import json +from pycocoevalcap.eval import COCOEvalCap, Bleu, Meteor, Rouge, Cider, Spice +from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer +from pycocotools.coco import COCO +from lmms_eval.tasks._task_utils.file_utils import generate_submission_file +import datetime + +import logging + +eval_logger = logging.getLogger("lmms-eval") + +dir_name = os.path.dirname(os.path.abspath(__file__)) + +FLICKR_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"] # , "SPICE"] + + +def flickr_doc_to_visual(doc): + return [doc["image"].convert("RGB")] + + +def flickr_doc_to_text(doc): + # question = "Please carefully observe the image and come up with a caption for the image" + return f"Provide a one-sentence caption for the provided image." 
+ + +def flickr_process_result(doc, result): + """ + Args: + doc: a instance of the eval dataset + results: [pred] + Returns: + a dictionary with key: metric name, value: metric value + """ + pred = result[0] if len(result) > 0 else "" + image_id = int(doc["img_id"]) + + data_dict = {"answer": doc["caption"], "pred": pred, "image_id": image_id} + + return {f"flickr_{metric}": data_dict for metric in FLICKR_METRICS} + + +def flickr_aggregation_result(results, metric, args): + scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE")] + scorers_dict = {s[1]: s for s in scorers} + + stored_results = [] + # In order to make the coco eval tools to successfully create index + # We need at least two dict in the dataset + # 'annotation' and 'images' + # 'annotation' exactly reproduce the original annotation + # 'images' however only need the image id which is contained in the file name + dataset = {"annotations": [], "images": []} + idx = 0 + for result in results: + stored_results.append({"image_id": int(result["image_id"]), "caption": result["pred"]}) + for a in result["answer"]: + dataset["annotations"].append({"image_id": int(result["image_id"]), "caption": a, "id": idx}) + idx += 1 + dataset["images"].append({"id": int(result["image_id"])}) + + coco = COCO() + # Manually create index here + coco.dataset = dataset + coco.createIndex() + + flickr_result = coco.loadRes(stored_results) + flickr_eval = COCOEvalCap(coco, flickr_result) + + imgIds = flickr_eval.params["image_id"] + gts = {} + res = {} + for imgId in imgIds: + gts[imgId] = flickr_eval.coco.imgToAnns[imgId] + res[imgId] = flickr_eval.cocoRes.imgToAnns[imgId] + + eval_logger.info("tokenization...") + tokenizer = PTBTokenizer() + gts = tokenizer.tokenize(gts) + res = tokenizer.tokenize(res) + + eval_logger.info(f"Computing {metric} scores...") + + score, scores = 
scorers_dict[metric][0].compute_score(gts, res) + # When metric is one of the Bleu, score will be a list + if type(score) == list: + n = int(metric.split("_")[-1]) + score = score[n - 1] + + path = generate_submission_file(f"flickr30k_captions_val2014_alg_results_{metric}.json", args) + + eval_logger.info("Storing prediction that can be submitted to the server ...") + with open(path, "w") as f: + json.dump(stored_results, f, indent=4) + + return score + + +def flickr_bleu4(results, args): + return flickr_aggregation_result(results, "Bleu_4", args) + + +def flickr_bleu3(results, args): + return flickr_aggregation_result(results, "Bleu_3", args) + + +def flickr_bleu2(results, args): + return flickr_aggregation_result(results, "Bleu_2", args) + + +def flickr_bleu1(results, args): + return flickr_aggregation_result(results, "Bleu_1", args) + + +def flickr_meteor(results, args): + return flickr_aggregation_result(results, "METEOR", args) + + +def flickr_rougel(results, args): + return flickr_aggregation_result(results, "ROUGE_L", args) + + +def flickr_cider(results, args): + return flickr_aggregation_result(results, "CIDEr", args) + + +def flickr_spice(results, args): + return flickr_aggregation_result(results, "SPICE", args) + + +def flickr_test_process_result(doc, result): + """ + Args: + doc: a instance of the eval dataset + results: [pred] + Returns: + a dictionary with key: metric name (in this case flickr_passthrough), value: metric value + """ + # The question id in our dataset is the image file itself + image_id = doc["img_id"] + return {"flickr_passthrough": {"pred": result, "image_id": image_id}} diff --git a/EAGLE/lmms_eval/tasks/hallusion_bench/evaluate_hb.py b/EAGLE/lmms_eval/tasks/hallusion_bench/evaluate_hb.py new file mode 100644 index 0000000000000000000000000000000000000000..87c65519d27dc54d706e710e99f8ffc7b28b1ce0 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/hallusion_bench/evaluate_hb.py @@ -0,0 +1,129 @@ +import os +import json +import logging +from 
tqdm import tqdm + +from lmms_eval.tasks.hallusion_bench.utils import evaluate_by_chatgpt, check_same_by_chatgpt, assign_correctness, get_eval_all, get_eval_fig, get_eval_pair_all + +cur_dir = os.path.dirname(os.path.abspath(__file__)) +output_entry = "model_prediction" +correctness_entry = "gpt4v_output_gpt_check" + +metric = ["aAcc", "fAcc", "qAcc"] + +eval_logger = logging.getLogger("lmms-eval") + + +def hb_doc_to_text(doc, model_specific_prompt_kwargs=None): + if model_specific_prompt_kwargs is None: + model_specific_prompt_kwargs = {} + pre_prompt = model_specific_prompt_kwargs.get("pre_prompt", "") + post_prompt = model_specific_prompt_kwargs.get("post_prompt", "") + return f"{pre_prompt}{doc['question']}{post_prompt}" + + +def hb_doc_to_visual(doc): + return [doc["image"].convert("RGB")] + + +def hb_process_results(doc, result): + sample = doc + # doc.pop("image") + sample["model_prediction"] = result[0] + return {k: sample for k in metric} + + +def hb_aggregation_result(results, metric, args): + data_vd = [] + data_vs = [] + for data in tqdm(results, desc="Split vd and vs"): + if data["category"] == "VD": + data_vd.append(data) + if data["category"] == "VS": + data_vs.append(data) + eval_logger.info("Do gpt eval vd ...") + path = os.path.join(args.output_path, "gpt_response") + os.makedirs(path, exist_ok=True) + save_json_path_vd = f"{path}/hallusion_output_vd_model.json" + save_json_path_vs = f"{path}/hallusion_output_vs_model.json" + data_vd = evaluate_by_chatgpt(data_vd, output_entry=output_entry, correctness_entry=correctness_entry, load_json=True, save_json_path=save_json_path_vd) + # data_vd = check_same_by_chatgpt(data_vd, output_entry=output_entry, load_json=True, save_json_path=save_json_path_vd) + data_vd = assign_correctness(data_vd, correctness_entry=correctness_entry) + eval_logger.info("Do gpt eval vs") + data_vs = evaluate_by_chatgpt(data_vs, output_entry=output_entry, correctness_entry=correctness_entry, load_json=True, 
save_json_path=save_json_path_vs) + # data_vs = check_same_by_chatgpt(data_vs, output_entry=output_entry, load_json=True, save_json_path=save_json_path_vs) + data_vs = assign_correctness(data_vs, correctness_entry=correctness_entry) + results = data_vs + data_vd + + if metric == "aAcc": + all_data = get_eval_all(results, model_correctness_entry=correctness_entry) + return round(100 * all_data["correct"] / all_data["total"], 4) + elif metric == "fAcc": + fig_all = get_eval_fig(results) + return round(100 * fig_all["correct"] / fig_all["total"], 4) + elif metric == "qAcc": + all_data = get_eval_pair_all(results, model_correctness_entry=correctness_entry) + return round(100 * all_data["correct"] / all_data["total"], 4) + + +def hb_aggregation_result_qAcc(results, args): + return hb_aggregation_result(results, "qAcc", args) + + +def hb_aggregation_result_fAcc(results, args): + return hb_aggregation_result(results, "fAcc", args) + + +def hb_aggregation_result_aAcc(results, args): + return hb_aggregation_result(results, "aAcc", args) + + +def hb_aggregation_result_intern(results, metric): + scores = [] + for result in results: + ans = "1" if result["model_prediction"].lower().find("yes") != -1 else "0" + scores.append(ans == result["gt_answer"]) + result["answer"] = ans + + if metric == "aAcc": + return sum(scores) / len(scores) + elif metric == "qAcc": + qlist = {} + for r in results: + key = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["question_id"])]) + try: + qlist[key].append(r["answer"] == r["gt_answer"]) + except: + qlist[key] = [r["answer"] == r["gt_answer"]] + out = [] + for q, v in qlist.items(): + out.append(min(v)) + + return sum(out) / len(out) + elif metric == "fAcc": + qlist = {} + for r in results: + key = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["figure_id"])]) + try: + qlist[key].append(r["answer"] == r["gt_answer"]) + except: + qlist[key] = [r["answer"] == r["gt_answer"]] + out = [] + for q, v in 
qlist.items(): + out.append(min(v)) + return sum(out) / len(out) + + +def hb_aggregation_result_qAcc_intern(results): + eval_logger.info("Calculating qAcc ...") + return hb_aggregation_result_intern(results, "qAcc") + + +def hb_aggregation_result_fAcc_intern(results): + eval_logger.info("Calculating fAcc ...") + return hb_aggregation_result_intern(results, "fAcc") + + +def hb_aggregation_result_aAcc_intern(results): + eval_logger.info("Calculating aAcc ...") + return hb_aggregation_result_intern(results, "aAcc") diff --git a/EAGLE/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml b/EAGLE/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml new file mode 100644 index 0000000000000000000000000000000000000000..39a5be4c394c0cf48f0bd28b36349044b9eaa572 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml @@ -0,0 +1,41 @@ +dataset_path: lmms-lab/HallusionBench +dataset_kwargs: + token: True +task: "hallusion_bench_image" +test_split: image +output_type: generate_until +doc_to_visual: !function evaluate_hb.hb_doc_to_visual +doc_to_text: !function evaluate_hb.hb_doc_to_text +doc_to_target: "gt_answer_details" +process_results: !function evaluate_hb.hb_process_results +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" +generation_kwargs: + max_new_tokens: 128 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false +metric_list: + - metric: aAcc + aggregation: !function evaluate_hb.hb_aggregation_result_aAcc + higher_is_better: true + - metric: qAcc + aggregation: !function evaluate_hb.hb_aggregation_result_qAcc + higher_is_better: true + - metric: fAcc + aggregation: !function evaluate_hb.hb_aggregation_result_fAcc + higher_is_better: true + # - metric: aAcc + # aggregation: !function evaluate_hb.hb_aggregation_result_aAcc_intern + # higher_is_better: true + # - metric: qAcc + # aggregation: !function evaluate_hb.hb_aggregation_result_qAcc_intern + # higher_is_better: true + # - metric: fAcc + # 
aggregation: !function evaluate_hb.hb_aggregation_result_fAcc_intern + # higher_is_better: true +metadata: + - version: 0.0 diff --git a/EAGLE/lmms_eval/tasks/hallusion_bench/utils.py b/EAGLE/lmms_eval/tasks/hallusion_bench/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1d8dfdaf0ab54354e0f0c01d77cff9938518fa22 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/hallusion_bench/utils.py @@ -0,0 +1,306 @@ +import csv +import json +from tqdm import tqdm +import numpy as np +import os +import time +import openai +import threading +import requests +import logging + +API_TYPE = os.getenv("API_TYPE", "openai") + +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } +elif API_TYPE == "azure": + API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken") + API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") + headers = { + "api-key": API_KEY, + "Content-Type": "application/json", + } + +eval_logger = logging.getLogger("lmms-eval") + + +def evaluate_by_chatgpt(data, output_entry, correctness_entry, gpt_model="gpt-4", load_json=False, save_json_path="./hallusion_output.json", retries=3): + if load_json and os.path.exists(save_json_path): + with open(save_json_path, "r") as f: + output = json.load(f) + else: + output = [] + for sample in tqdm(data[len(output) :], desc="Eval by GPT"): + prompt = "Imagine you are an intelligent teacher. Thoroughly read the question, reference answer and the prediction answer to ensure a clear understanding of the information provided. Assess the correctness of the predictions. " + prompt += 'If the prediction answer does not conflict with the reference answer, please generate “correct”. If the prediction answer conflict with the reference answer, please generate “incorrect”. 
If the prediction answer is unclear about the answer, please generate "unclear". \n\n Question:' + prompt += sample["question"] + prompt += "\nReference answer: " + prompt += sample["gt_answer_details"] + prompt += "\nPrediction answer:" + prompt += sample[output_entry] + prompt += "\nOutput:" + + # https://github.com/openai/openai-python/issues/322#issuecomment-1767841683 + for attempt in range(retries): + try: + messages = [{"role": "user", "content": prompt}] + payload = { + "messages": messages, + "max_tokens": 16, + } + # set model when using openai api_key. Azure api_key does not need model since the endpoint fixed the model. + if API_TYPE == "openai": + payload["model"] = gpt_model + response = requests.post(API_URL, headers=headers, json=payload, timeout=30) + response.raise_for_status() + response = response.json() + break + except Exception as e: + eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}") + if attempt < retries - 1: # If we have retries left, sleep and then continue to next attempt + time.sleep(5) + else: # If this was the last attempt, log and return empty + eval_logger.error(f"All {retries} attempts failed. 
Last error message: {str(e)}") + try: + output_text = response["choices"][0]["message"]["content"] + except Exception as e: + eval_logger.info(f"Get error {str(e)} when extracting response") + output_text = "unclear" + + if "incorrect" in output_text.lower(): + gpt_correctness = "0" + + elif "correct" in output_text.lower(): + gpt_correctness = "1" + else: + gpt_correctness = "2" + + sample[correctness_entry] = gpt_correctness + sample["gpt_answer"] = prompt + output_text + + output.append(sample) + + with open(save_json_path, "w") as f: + json.dump(output, f, indent=4) + + return output + + +def check_same_by_chatgpt(data, output_entry, gpt_model="gpt-4", load_json=False, save_json_path="./hallusion_output.json", retries=3): + orig_response = {} + + for r in data: + if str(r["figure_id"]) == "0": + key = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["question_id"])]) + orig_response[key] = r[output_entry] + + for sample in tqdm(data, desc="Check same by GPT"): + if "same" not in sample.keys(): + key = "_".join([sample["category"], sample["subcategory"], str(sample["set_id"]), str(sample["question_id"])]) + response2 = orig_response[key] + + prompt = "Imagine you are an intelligent teacher. Thoroughly read the two responses to two different questions. Assess the consistency of the information provided within those two responses. " + prompt += "You do not know the specific questions, but you can asssess the consistency among the two responses by checking for logical conflicts if both responses are correct. " + prompt += 'If response1 does not conflict with response2, please generate “same”. Otherwise, generate "different". 
\n\n response1:' + prompt += sample[output_entry] + prompt += "\nresponse2: " + prompt += response2 + prompt += "\nOutput:" + + # https://github.com/openai/openai-python/issues/322#issuecomment-1767841683 + for attempt in range(retries): + try: + headers = { + "api-key": API_KEY, + "Content-Type": "application/json", + } + + messages = [{"role": "user", "content": prompt}] + + payload = { + "model": gpt_model, + "messages": messages, + "max_tokens": 16, + } + response = requests.post(API_URL, headers=headers, json=payload) + response.raise_for_status() + response = response.json() + + break + except Exception as e: + eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}") + if attempt < retries - 1: # If we have retries left, sleep and then continue to next attempt + time.sleep(5) + else: # If this was the last attempt, log and return empty + eval_logger.error(f"All {retries} attempts failed. Last error message: {str(e)}") + + try: + output_text = response["choices"][0]["message"]["content"] + except Exception as e: + eval_logger.info(f"Get error {str(e)} when extracting response") + output_text = "different" + + gpt_same = "0" + + if "same" in output_text.lower(): + gpt_same = "1" + + elif "different" in output_text.lower(): + gpt_same = "0" + + sample["same"] = gpt_same + + with open(save_json_path, "w") as f: + json.dump(data, f, indent=4) + + return data + + +def assign_correctness(data_arr, correctness_entry): + for r in data_arr: + assert int(r[correctness_entry]) == 0 or int(r[correctness_entry]) == 1 or int(r[correctness_entry]) == 2 + if r["category"] == "VS" and int(r["figure_id"]) == 0: # if there is no visual supplement and the model does not know, count it as correct + r["correct"] = 1 if int(r[correctness_entry]) == 1 or int(r[correctness_entry]) == 2 else 0 + else: + r["correct"] = 1 if int(r[correctness_entry]) == 1 else 0 + return data_arr + + +def get_eval_fig(data): # per figure + eval_fig_dict = dict() + + for r in data: + if 
r["category"] == "VS" and str(r["figure_id"]) == "0": # no figure + continue + name = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["figure_id"])]) + if name in eval_fig_dict: + c, t = eval_fig_dict[name] + eval_fig_dict[name] = (c + r["correct"], t + 1) + else: + eval_fig_dict[name] = (r["correct"], 1) + + eval_fig_stat = {} + eval_fig_stat["note"] = "all accuracy per image (consistency test)" + eval_fig_stat["total"] = len(eval_fig_dict.keys()) + eval_fig_stat["correct"] = 0 + eval_fig_stat["wrong"] = 0 + eval_fig_stat["inconsistent"] = 0 + eval_fig_stat["score"] = 0 + + for v in eval_fig_dict.values(): + if v[0] == v[1]: + eval_fig_stat["correct"] += 1 + elif v[0] == 0: + eval_fig_stat["wrong"] += 1 + else: + eval_fig_stat["inconsistent"] += 1 + eval_fig_stat["score"] += v[0] / v[1] + + eval_fig_stat["score"] = eval_fig_stat["score"] / eval_fig_stat["total"] + return eval_fig_stat + + +def get_eval_all(data, model_correctness_entry): # per question + eval_all_dict = dict() + eval_all_stat = {} + eval_all_stat["LH"] = 0 + eval_all_stat["VI"] = 0 + eval_all_stat["Mix"] = 0 + + for r in data: + name = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["figure_id"]), str(r["question_id"])]) + assert name not in eval_all_dict + + eval_all_dict[name] = r["correct"] + + if str(r["category"]) == "VD": # VD + if str(r["figure_id"]) == "0": + if str(r[model_correctness_entry]) == "0" or str(r[model_correctness_entry]) == "2": + eval_all_stat["VI"] += 1 + else: + if str(r[model_correctness_entry]) == "0": + eval_all_stat["Mix"] += 1 + elif str(r[model_correctness_entry]) == "2": + eval_all_stat["VI"] += 1 + else: # VS + if str(r["visual_input"]) == "0": # no visual + if str(r[model_correctness_entry]) == "0": + eval_all_stat["LH"] += 1 + else: # original visual or modified visual (isual_input == 1 or 2) + if str(r[model_correctness_entry]) == "0": + eval_all_stat["Mix"] += 1 + elif str(r[model_correctness_entry]) == "2": + 
eval_all_stat["VI"] += 1 + + eval_all_stat["note"] = "all accuracy per question" + eval_all_stat["total"] = len(eval_all_dict.keys()) + eval_all_stat["correct"] = np.count_nonzero(list(eval_all_dict.values())) + eval_all_stat["wrong"] = eval_all_stat["total"] - eval_all_stat["correct"] + + return eval_all_stat + + +def get_eval_pair_all(data, model_correctness_entry): # per question pair + orig_correctness = dict() + counter = 0 + lh_counter = 0 + vi_counter = 0 + both_counter = 0 + + for r in data: + if str(r["figure_id"]) == "0": + key = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["question_id"])]) + orig_correctness[key] = r[model_correctness_entry] + + get_eval_pair_dict = dict() + + for r in data: + name = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["question_id"])]) + if name in get_eval_pair_dict: + c, t = get_eval_pair_dict[name] + get_eval_pair_dict[name] = (c + r["correct"], t + 1) + else: + get_eval_pair_dict[name] = (r["correct"], 1) + counter += 1 + + eval_all_pair_stat = {} + eval_all_pair_stat["note"] = "all accuracy per question pair" + eval_all_pair_stat["total"] = len(get_eval_pair_dict.keys()) + eval_all_pair_stat["total_q"] = counter + eval_all_pair_stat["correct"] = 0 + eval_all_pair_stat["wrong"] = 0 + eval_all_pair_stat["LH"] = 0 + eval_all_pair_stat["VI"] = 0 + eval_all_pair_stat["Mix"] = 0 + + eval_all_pair_stat["LH_cg"] = lh_counter + eval_all_pair_stat["VI_cg"] = vi_counter + eval_all_pair_stat["Mix_cg"] = both_counter + + # for v in get_eval_pair_dict.values(): + # if v[0] == v[1]: + # eval_all_pair_stat["correct"] += 1 + # else: + # eval_all_pair_stat["wrong"] += 1 + + # for v in get_analysis_pair_dict.values(): + # if v[0] > 0 and v[1] > 0: + # eval_all_pair_stat["Mix"] += 1 + # elif v[0] > 0: + # eval_all_pair_stat["LH"] += 1 + # elif v[1] > 0: + # eval_all_pair_stat["VI"] += 1 + + for k in get_eval_pair_dict.keys(): + v = get_eval_pair_dict[k] + if v[0] == v[1]: + 
eval_all_pair_stat["correct"] += 1 + else: + eval_all_pair_stat["wrong"] += 1 + + return eval_all_pair_stat diff --git a/EAGLE/lmms_eval/tasks/iconqa/utils.py b/EAGLE/lmms_eval/tasks/iconqa/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..de99e804736e57c81bfd09a3fd78fab4ff3ff370 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/iconqa/utils.py @@ -0,0 +1,57 @@ +import json +import os + + +def options_to_str(options_prompt): + option_prompt_str = "" + for i, option in enumerate(options_prompt): + option_choice = chr(ord("A") + i) + option_prompt_str += f"{option_choice}. {option}\n" + + option_prompt_str = option_prompt_str.rstrip("\n") + return option_prompt_str + + +def doc_to_visual(doc): + image_list = [] + if "query_image" in doc: + image_list.append(doc["query_image"].convert("RGB")) + for i in range(5): + id = f"choice_image_{i}" + if id in doc and doc[id] is not None: + image_list.append(doc[id].convert("RGB")) + assert len(image_list) < 6, "Maximum 5 images allowed for ICON-QA" + return image_list + + +def doc_to_text(doc, model_specific_prompt_kwargs): + question = doc["question"] + ques_type = doc["ques_type"] + options_prompt = [] + + if ques_type == "choose_img": + options_prompt.append("The first image.") + options_prompt.append("The second image.") + + options_str = options_to_str(options_prompt) + full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['options_statement'].format(question=question, options=options_str)}" + + elif ques_type == "choose_txt": + choices = doc["choices"].split(",") + for i, choice in enumerate(choices): + options_prompt.append(f"{choice}") + + options_str = options_to_str(options_prompt) + full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['options_statement'].format(question=question, options=options_str)}" + + elif ques_type == 
"fill_in_blank": + full_prompt = f"{model_specific_prompt_kwargs['pre_prompt']}{model_specific_prompt_kwargs['statement']}{model_specific_prompt_kwargs['freeform_statement'].format(question=question)}" + + return full_prompt + + +def test_process_results(doc, results): + pred = results[0] + questionId = doc["question_id"] + answer = doc["answer"] + return {"anls": {"questionId": int(questionId), "answer": answer, "pred_answer": pred}} diff --git a/EAGLE/lmms_eval/tasks/mme/mme.yaml b/EAGLE/lmms_eval/tasks/mme/mme.yaml new file mode 100644 index 0000000000000000000000000000000000000000..504e6dd0202a5789a9e0eedebaaf24086cc4d04e --- /dev/null +++ b/EAGLE/lmms_eval/tasks/mme/mme.yaml @@ -0,0 +1,37 @@ +dataset_path: lmms-lab/MME +dataset_kwargs: + token: True +task: "mme" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.mme_doc_to_visual +doc_to_text: !function utils.mme_doc_to_text +doc_to_target: "answer" +generation_kwargs: + max_new_tokens: 16 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false +# The return value of process_results will be used by metrics +process_results: !function utils.mme_process_results +# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results +metric_list: + - metric: mme_percetion_score + aggregation: !function utils.mme_aggregate_results + higher_is_better: true + - metric: mme_cognition_score + aggregation: !function utils.mme_aggregate_results + higher_is_better: true +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "\nAnswer the question using a single word or phrase." 
+ qwen_vl: + pre_prompt: "" + post_prompt: " Answer:" + otterhd: + pre_prompt: "" + post_prompt: " Answer:" +metadata: + - version: 0.0 diff --git a/EAGLE/lmms_eval/tasks/mme/utils.py b/EAGLE/lmms_eval/tasks/mme/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b001b2c0392747222d524e44b78524c87043ced4 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/mme/utils.py @@ -0,0 +1,120 @@ +from collections import defaultdict +import os +import datetime +import json +from lmms_eval.tasks._task_utils.file_utils import generate_submission_file + +import logging + +eval_logger = logging.getLogger("lmms-eval") + +dir_name = os.path.dirname(os.path.abspath(__file__)) + +eval_type_dict = { + "Perception": [ + "existence", + "count", + "position", + "color", + "posters", + "celebrity", + "scene", + "landmark", + "artwork", + "OCR", + ], + "Cognition": [ + "commonsense_reasoning", + "numerical_calculation", + "text_translation", + "code_reasoning", + ], +} + + +replace_prompt = " Please answer yes or no." 
+ + +def mme_doc_to_visual(doc): + return [doc["image"].convert("RGB")] + + +def mme_doc_to_text(doc, model_specific_prompt_kwargs=None): + question = doc["question"].strip() + if "pre_prompt" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["pre_prompt"] != "": + question = question.replace(replace_prompt, "") + question = f"{model_specific_prompt_kwargs['pre_prompt']}{question}" + if "post_prompt" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["post_prompt"] != "": + question = question.replace(replace_prompt, "") + question = f"{question}{model_specific_prompt_kwargs['post_prompt']}" + return question + + +def parse_pred_ans(pred_ans): + """Brought from Otter Eval""" + pred_ans = pred_ans.lower().strip().replace(".", "") + pred_label = None + if pred_ans in ["yes", "no"]: + pred_label = pred_ans + else: + prefix_pred_ans = pred_ans[:4] + if "yes" in prefix_pred_ans: + pred_label = "yes" + elif "no" in prefix_pred_ans: + pred_label = "no" + else: + pred_label = "other" + return pred_label + + +def mme_process_results(doc, results): + """ + Args: + doc: a instance of the eval dataset + results: [pred] + Returns: + a dictionary with key: metric name (in this case mme score), value: metric value + """ + pred = results[0] + pred_ans = parse_pred_ans(pred) + gt_ans = doc["answer"].lower().strip().replace(".", "") + assert gt_ans in ["yes", "no"] + assert pred_ans in ["yes", "no", "other"] + score = 1.0 if pred_ans == gt_ans else 0.0 + category = doc["category"] + key_name = "mme_percetion_score" if category in eval_type_dict["Perception"] else "mme_cognition_score" + # Note: the key name here is very important. 
It decides which aggregation function will receive the results + # We note down the question id/category to help us aggregate the results later + return {key_name: {"question_id": doc["question_id"], "category": category, "score": score}} + + +def mme_aggregate_results(results): + """ + Args: + results: a list of values returned by process_results + Returns: + A score + """ + category2score = defaultdict(dict) + for result in results: + question_id = result["question_id"] + score = result["score"] + category = result["category"] + if question_id not in category2score[category]: + category2score[category][question_id] = [] + category2score[category][question_id].append(score) + category2avg_score = {} + for category, question2scores in category2score.items(): + total_score = 0 + for question_id, scores in question2scores.items(): + assert len(scores) == 2 + acc = sum(scores) / len(scores) * 100.0 + acc_plus = (sum(scores) == 2) * 100.0 + score = acc_plus + acc + total_score += score + avg_score = total_score / len(question2scores) + category2avg_score[category] = avg_score + for category, avg_score in category2avg_score.items(): + eval_logger.info(f"{category}: {avg_score:.2f}") + total_score = sum(category2avg_score.values()) + return total_score diff --git a/EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa.yaml b/EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3572b693542385875285526acce57cdf145db7cb --- /dev/null +++ b/EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa.yaml @@ -0,0 +1,4 @@ +group: multidocvqa +task: +- multidocvqa_val +- multidocvqa_test diff --git a/EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml b/EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d33c5a50b8cb813712258a0ae0d718011f61ea5d --- /dev/null +++ b/EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa_test.yaml @@ -0,0 +1,20 @@ +dataset_path: 
lmms-lab/MP-DocVQA +task: "multidocvqa_test" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.multidocvqa_doc_to_visual +doc_to_text: !function utils.multidocvqa_doc_to_text +doc_to_target: "answers" +generation_kwargs: + max_new_tokens: 32 + temperature: 0 + do_sample: False +process_results: !function utils.multidocvqa_process_test_results_for_submission +metric_list: + - metric: submission + aggregation: !function utils.multidocvqa_test_aggregate_results_for_submission +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "\nAnswer the question using a single word or phrase." + \ No newline at end of file diff --git a/EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml b/EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b4f5238bf63609c1789a030aef14ee37e890f079 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/multidocvqa/multidocvqa_val.yaml @@ -0,0 +1,23 @@ +dataset_path: lmms-lab/MP-DocVQA +task: "multidocvqa_val" +test_split: val +output_type: generate_until +doc_to_visual: !function utils.multidocvqa_doc_to_visual +doc_to_text: !function utils.multidocvqa_doc_to_text +doc_to_target: "answers" +generation_kwargs: + max_new_tokens: 32 + temperature: 0 + do_sample: False +process_results: !function utils.multidocvqa_process_results +metric_list: + - metric: anls + aggregation: !function utils.multidocvqa_aggregate_results_anls + higher_is_better: true + - metric: accuracy + aggregation: !function utils.multidocvqa_aggregate_results_accuracy + higher_is_better: true +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "\nAnswer the question using a single word or phrase." 
diff --git a/EAGLE/lmms_eval/tasks/multidocvqa/utils.py b/EAGLE/lmms_eval/tasks/multidocvqa/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..10fd85e62eca88295c41bfe015ce3bb253a7edc3 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/multidocvqa/utils.py @@ -0,0 +1,116 @@ +import os +import re +import ast +import json +import logging +from lmms_eval.api.metrics import levenshtein_distance +from lmms_eval.tasks._task_utils.file_utils import generate_submission_file + +lmms_logger = logging.getLogger("lmms-eval") + + +def multidocvqa_doc_to_text(doc, model_specific_prompt_kwargs): + question = doc["question"] + pre_prompt = model_specific_prompt_kwargs["pre_prompt"] + post_prompt = model_specific_prompt_kwargs["post_prompt"] + + return f"{pre_prompt}{question}{post_prompt}" + + +def multidocvqa_doc_to_visual(doc): + return [doc[f"image_{i}"].convert("RGB") for i in range(1, 21) if doc[f"image_{i}"] is not None] + + +def multidocvqa_process_results(doc, results): + pred_answer = results[0] + answer = ast.literal_eval(doc["answers"]) + + return {"anls": {"questionId": int(doc["questionId"]), "answer": answer, "pred_answer": pred_answer}, "accuracy": {"questionId": int(doc["questionId"]), "answer": answer, "pred_answer": pred_answer}} + + +def multidocvqa_aggregate_results_anls(results): + keys = {k for result in results for k in result} + results = {key: [result.get(key, None) for result in results] for key in keys} + evaluator = Evaluator(case_sensitive=False) + metric = evaluator.get_metrics(results["answer"], results["pred_answer"]) + + return sum(metric["anls"]) / len(metric["anls"]) + + +def multidocvqa_aggregate_results_accuracy(results): + keys = {k for result in results for k in result} + results = {key: [result.get(key, None) for result in results] for key in keys} + evaluator = Evaluator(case_sensitive=False) + metric = evaluator.get_metrics(results["answer"], results["pred_answer"]) + + return sum(metric["accuracy"]) / 
len(metric["accuracy"]) + + +def multidocvqa_process_test_results_for_submission(doc, results): + answer = results[0] + return {"submission": {"questionId": int(doc["questionId"]), "answer": answer, "answer_page": None}} + + +def multidocvqa_test_aggregate_results_for_submission(results, args): + path = generate_submission_file("multidocvqa_test_for_submission.json", args) + with open(path, "w") as f: + json.dump(results, f) + lmms_logger.info(f"Results saved to {path}.") + + +################## +# Helper functions +################## + + +class Evaluator: + def __init__(self, case_sensitive=False): + self.case_sensitive = case_sensitive + self.get_edit_distance = levenshtein_distance + self.anls_threshold = 0.5 + + def get_metrics(self, gt_answers, preds): + batch_accuracy = [] + batch_anls = [] + for batch_idx in range(len(preds)): + gt = [self._preprocess_str(gt_elm) for gt_elm in gt_answers[batch_idx]] + pred = self._preprocess_str(preds[batch_idx]) + + batch_accuracy.append(self._calculate_accuracy(gt, pred)) + batch_anls.append(self._calculate_anls(gt, pred)) + + return {"accuracy": batch_accuracy, "anls": batch_anls} + + def _preprocess_str(self, string): + if not self.case_sensitive: + string = string.lower() + + return string.strip() + + def _calculate_accuracy(self, gt, pred): + if pred == "none": + return 0 + + for gt_elm in gt: + if gt_elm == pred: + return 1 + + return 0 + + def _calculate_anls(self, gt, pred): + if len(pred) == 0: + return 0 + + if pred == "none": + return 0 + + answers_similarity = [1 - self.get_edit_distance(gt_elm, pred) / max(len(gt_elm), len(pred)) for gt_elm in gt] + max_similarity = max(answers_similarity) + + anls = max_similarity if max_similarity >= self.anls_threshold else 0 + return anls + + +if __name__ == "__main__": + print("-----------------") + multidocvqa_aggregate_results_anls([{"questionId": 1, "answer": ["answer"], "pred_answer": "pred_answer"}, {"questionId": 2, "answer": ["nswer"], "pred_answer": "nswer"}]) diff 
--git a/EAGLE/lmms_eval/tasks/nocaps/_default_template_nocaps_yaml b/EAGLE/lmms_eval/tasks/nocaps/_default_template_nocaps_yaml new file mode 100644 index 0000000000000000000000000000000000000000..12fdbbc4db4b62afa343acdf9bbc6580e508fb3b --- /dev/null +++ b/EAGLE/lmms_eval/tasks/nocaps/_default_template_nocaps_yaml @@ -0,0 +1,3 @@ +model_specific_prompt_kwargs: + default: + prompt: "Provide a one-sentence caption for the provided image." \ No newline at end of file diff --git a/EAGLE/lmms_eval/tasks/nocaps/nocaps.yaml b/EAGLE/lmms_eval/tasks/nocaps/nocaps.yaml new file mode 100644 index 0000000000000000000000000000000000000000..579b7aee7d2d360a88d74c9281dd85b1f7980c7e --- /dev/null +++ b/EAGLE/lmms_eval/tasks/nocaps/nocaps.yaml @@ -0,0 +1,4 @@ +group : nocaps +task: + - nocaps_test + - nocaps_val \ No newline at end of file diff --git a/EAGLE/lmms_eval/tasks/nocaps/nocaps_test.yaml b/EAGLE/lmms_eval/tasks/nocaps/nocaps_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9f21ce2b1df1ff7950336d5245df2e0cf106001d --- /dev/null +++ b/EAGLE/lmms_eval/tasks/nocaps/nocaps_test.yaml @@ -0,0 +1,25 @@ +dataset_path: lmms-lab/NoCaps +dataset_kwargs: + token: True +task : "nocaps_test" +group : "nocaps_caption" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.nocaps_doc_to_visual +doc_to_text: !function utils.nocaps_doc_to_text +doc_to_target: "annotations_captions" +generation_kwargs: + max_new_tokens: 64 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false +process_results: !function utils.nocaps_test_process_result +# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results +metric_list: + - metric: nocaps_passthrough + aggregation : !function utils.nocaps_test_aggregation_result + higher_is_better : true +metadata: + - version: 0.0 +include: _default_template_nocaps_yaml \ No newline at end of file diff --git 
a/EAGLE/lmms_eval/tasks/nocaps/nocaps_val.yaml b/EAGLE/lmms_eval/tasks/nocaps/nocaps_val.yaml new file mode 100644 index 0000000000000000000000000000000000000000..048066a69d47bef5b8ff13f7f2adc7aa2b1db536 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/nocaps/nocaps_val.yaml @@ -0,0 +1,46 @@ +dataset_path: lmms-lab/NoCaps +dataset_kwargs: + token: True +task: "nocaps_val" +group : "nocaps_caption" +test_split: validation +output_type: generate_until +doc_to_visual: !function utils.nocaps_doc_to_visual +doc_to_text: !function utils.nocaps_doc_to_text +doc_to_target: "annotations_captions" +generation_kwargs: + max_new_tokens: 64 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false +process_results: !function utils.nocaps_process_result +# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results +metric_list: + - metric: nocaps_Bleu_4 + aggregation : !function utils.nocaps_bleu4 + higher_is_better : true + - metric: nocaps_Bleu_3 + aggregation : !function utils.nocaps_bleu3 + higher_is_better : true + - metric: nocaps_Bleu_2 + aggregation : !function utils.nocaps_bleu2 + higher_is_better : true + - metric: nocaps_Bleu_1 + aggregation : !function utils.nocaps_bleu1 + higher_is_better : true + - metric: nocaps_METEOR + aggregation : !function utils.nocaps_meteor + higher_is_better : true + - metric: nocaps_ROUGE_L + aggregation : !function utils.nocaps_rougel + higher_is_better : true + - metric: nocaps_CIDEr + aggregation : !function utils.nocaps_cider + higher_is_better : true + #- metric: nocaps_SPICE + # aggregation : !function utils.nocaps_spice + # higher_is_better : true +metadata: + - version: 0.0 +include: _default_template_nocaps_yaml \ No newline at end of file diff --git a/EAGLE/lmms_eval/tasks/nocaps/utils.py b/EAGLE/lmms_eval/tasks/nocaps/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..41e47286c484bab2c65848ef9132afb3642d8088 --- /dev/null +++ 
b/EAGLE/lmms_eval/tasks/nocaps/utils.py @@ -0,0 +1,153 @@ +import os +import json +from pycocoevalcap.eval import COCOEvalCap, Bleu, Meteor, Rouge, Cider, Spice +from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer +from pycocotools.coco import COCO + +from lmms_eval.tasks._task_utils.file_utils import generate_submission_file + +import logging + +eval_logger = logging.getLogger("lmms-eval") + +dir_name = os.path.dirname(os.path.abspath(__file__)) + +NOCAPS_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"] # , "SPICE"] + + +def nocaps_doc_to_visual(doc): + return [doc["image"].convert("RGB")] + + +def nocaps_doc_to_text(doc, model_specific_prompt_kwargs=None): + # question = "Please carefully observe the image and come up with a caption for the image" + return model_specific_prompt_kwargs["prompt"] + + +def nocaps_process_result(doc, result): + """ + Args: + doc: a instance of the eval dataset + results: [pred] + Returns: + a dictionary with key: metric name, value: metric value + """ + pred = result[0] + # The question id in our dataset is the image file itself + image_id = doc["image_id"] + + data_dict = {"answer": doc["annotations_captions"], "pred": pred, "image_id": image_id} + + return {f"nocaps_{metric}": data_dict for metric in NOCAPS_METRICS} + + +def nocaps_aggregation_result(results, metric, args=None): + scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE")] + scorers_dict = {s[1]: s for s in scorers} + + stored_results = [] + # In order to make the coco eval tools to successfully create index + # We need at least two dict in the dataset + # 'annotation' and 'images' + # 'annotation' exactly reproduce the original annotation + # 'images' however only need the image id which is contained in the file name + dataset = {"annotations": [], "images": []} + idx = 0 + for result in results: + 
stored_results.append({"image_id": int(result["image_id"]), "caption": result["pred"]}) + for a in result["answer"]: + dataset["annotations"].append({"image_id": int(result["image_id"]), "caption": a, "id": idx}) + idx += 1 + dataset["images"].append({"id": result["image_id"]}) + + coco = COCO() + # Manually create index here + coco.dataset = dataset + coco.createIndex() + + nocaps_result = coco.loadRes(stored_results) + nocaps_eval = COCOEvalCap(coco, nocaps_result) + + imgIds = nocaps_eval.params["image_id"] + gts = {} + res = {} + for imgId in imgIds: + gts[imgId] = nocaps_eval.coco.imgToAnns[imgId] + res[imgId] = nocaps_eval.cocoRes.imgToAnns[imgId] + + eval_logger.info("tokenization...") + tokenizer = PTBTokenizer() + gts = tokenizer.tokenize(gts) + res = tokenizer.tokenize(res) + + eval_logger.info(f"Computing {metric} scores...") + + score, scores = scorers_dict[metric][0].compute_score(gts, res) + # When metric is one of the Bleu, score will be a list + if type(score) == list: + n = int(metric.split("_")[-1]) + score = score[n - 1] + + path = generate_submission_file(f"nocaps_val_{metric}_scores.json", args) + eval_logger.info("Storing prediction that can be submitted to the server ...") + with open(path, "w") as f: + json.dump(stored_results, f, indent=4) + eval_logger.info(f"Your result has been saved to {path}.") + + return score + + +def nocaps_bleu4(results, args=None): + return nocaps_aggregation_result(results, "Bleu_4", args) + + +def nocaps_bleu3(results, args=None): + return nocaps_aggregation_result(results, "Bleu_3", args) + + +def nocaps_bleu2(results, args=None): + return nocaps_aggregation_result(results, "Bleu_2", args) + + +def nocaps_bleu1(results, args=None): + return nocaps_aggregation_result(results, "Bleu_1", args) + + +def nocaps_meteor(results, args=None): + return nocaps_aggregation_result(results, "METEOR", args) + + +def nocaps_rougel(results, args=None): + return nocaps_aggregation_result(results, "ROUGE_L", args) + + +def 
nocaps_cider(results, args=None): + return nocaps_aggregation_result(results, "CIDEr", args) + + +def nocaps_spice(results, args=None): + return nocaps_aggregation_result(results, "SPICE", args) + + +def nocaps_test_process_result(doc, result): + """ + Args: + doc: a instance of the eval dataset + results: [pred] + Returns: + a dictionary with key: metric name (in this case nocaps_passthrough), value: metric value + """ + return {"nocaps_passthrough": {"pred": result[0], "image_id": doc["image_id"]}} + + +def nocaps_test_aggregation_result(results, args=None): + stored_results = [] + for result in results: + stored_results.append({"image_id": int(result["image_id"]), "caption": result["pred"]}) + + path = generate_submission_file("nocaps_captions_nocaps_test_alg_results.json", args) + eval_logger.info("Storing prediction that can be submitted to the server ...") + with open(path, "w") as f: + json.dump(stored_results, f, indent=4) + + eval_logger.info(f"Your test result has been stored in {path}. 
Make sure you also have the val result stored to submit to the server on https://codalab.lisn.upsaclay.fr/competitions/7404#participate.") diff --git a/EAGLE/lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml b/EAGLE/lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml new file mode 100644 index 0000000000000000000000000000000000000000..8d74eb11c83b700b9de409db007a2253685e950f --- /dev/null +++ b/EAGLE/lmms_eval/tasks/ok_vqa/_default_template_vqa_yaml @@ -0,0 +1,24 @@ +dataset_path: lmms-lab/OK-VQA +output_type: generate_until +doc_to_visual: !function utils.ok_vqa_doc_to_visual +doc_to_text: !function utils.ok_vqa_doc_to_text +doc_to_target: "answer" +generation_kwargs: + until: + - "ASSISTANT:" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - metric: submission + aggregation: !function utils.ok_vqa_aggreate_submissions + higher_is_better: true +process_results: !function utils.ok_vqa_process_results +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase." 
+metadata: + - version: 0.0 \ No newline at end of file diff --git a/EAGLE/lmms_eval/tasks/ok_vqa/_generate_config.py b/EAGLE/lmms_eval/tasks/ok_vqa/_generate_config.py new file mode 100644 index 0000000000000000000000000000000000000000..a98f2f4f564ebb3fc885d9565193808796e88b8c --- /dev/null +++ b/EAGLE/lmms_eval/tasks/ok_vqa/_generate_config.py @@ -0,0 +1,25 @@ +import os +import yaml + +splits = ["val2014"] +tasks = ["vqa"] + +if __name__ == "__main__": + dump_tasks = [] + for task in tasks: + for split in splits: + yaml_dict = {"group": f"ok_vqa", "task": f"ok_vqa_{split}", "include": f"_default_template_{task}_yaml", "test_split": split} + if split == "train": + yaml_dict.pop("group") + else: + dump_tasks.append(f"ok_vqa_{split}") + + save_path = f"./ok_vqa_{split}.yaml" + print(f"Saving to {save_path}") + with open(save_path, "w") as f: + yaml.dump(yaml_dict, f, default_flow_style=False, sort_keys=False) + + group_dict = {"group": "ok_vqa", "task": dump_tasks} + + with open("./_ok_vqa.yaml", "w") as f: + yaml.dump(group_dict, f, default_flow_style=False, indent=4) diff --git a/EAGLE/lmms_eval/tasks/ok_vqa/_ok_vqa.yaml b/EAGLE/lmms_eval/tasks/ok_vqa/_ok_vqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3b6376f71704913d3658c72a8f312c462d5553c8 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/ok_vqa/_ok_vqa.yaml @@ -0,0 +1,3 @@ +group: ok_vqa +task: +- ok_vqa_val2014 \ No newline at end of file diff --git a/EAGLE/lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml b/EAGLE/lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6bb1d74571002f1429405b91585b01be0eb13110 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/ok_vqa/ok_vqa_val2014.yaml @@ -0,0 +1,4 @@ +group: ok_vqa +task: ok_vqa_val2014 +test_split: val2014 +include: _default_template_vqa_yaml \ No newline at end of file diff --git a/EAGLE/lmms_eval/tasks/ok_vqa/utils.py b/EAGLE/lmms_eval/tasks/ok_vqa/utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..52faa11dfc4f8b7d56fb78bda720f3d38e520b7a --- /dev/null +++ b/EAGLE/lmms_eval/tasks/ok_vqa/utils.py @@ -0,0 +1,70 @@ +import re +import os +import json +import yaml +import pathlib +import logging +import datetime +import statistics + +from lmms_eval.tasks._task_utils.file_utils import generate_submission_file +from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor + +eval_logger = logging.getLogger("lmms-eval") + + +def ok_vqa_doc_to_visual(doc): + return [doc["image"].convert("RGB")] + + +def ok_vqa_process_results(doc, result): + eval_ai_processor = EvalAIAnswerProcessor() + assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}." + resAns = eval_ai_processor(result[0]) + accuracy = 0 + + if "answers" in doc and doc["answers"] is not None: + gtAcc = [] + + for i in range(len(doc["answers"])): + doc["answers"][i] = eval_ai_processor(doc["answers"][i]) + + for i in range(len(doc["answers"])): + otherGTAns = [doc["answers"][j] for j in range(len(doc["answers"])) if i != j] + matchingAns = [item for item in otherGTAns if item == resAns] + acc = min(1, float(len(matchingAns)) / 3) + gtAcc.append(acc) + if gtAcc: + accuracy = statistics.mean(gtAcc) + else: + accuracy = 0 + + return { + "exact_match": accuracy, + "submission": { + "image": f"{doc['question_id']}.jpg", + "answer": resAns, + }, + } + + +def ok_vqa_doc_to_text(doc, model_specific_prompt_kwargs=None): + question = doc["question"] + if model_specific_prompt_kwargs is None: + model_specific_prompt_kwargs = {} + pre_prompt = "" + post_prompt = "" + if "pre_prompt" in model_specific_prompt_kwargs: + pre_prompt = model_specific_prompt_kwargs["pre_prompt"] + if "post_prompt" in model_specific_prompt_kwargs: + post_prompt = model_specific_prompt_kwargs["post_prompt"] + return f"{pre_prompt}{question}{post_prompt}" + + +def ok_vqa_aggreate_submissions(results, args): + now_date_time = 
datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S") + file = f"ok_vqa-test-submission-{now_date_time}.json" + path = generate_submission_file(file, args) + with open(path, "w") as f: + json.dump(results, f) + print(f"Submission file saved to {path}") diff --git a/EAGLE/lmms_eval/tasks/olympiadbench/cn_utils.py b/EAGLE/lmms_eval/tasks/olympiadbench/cn_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..34e5ce4d45d24d9238d43344711ffbf076aab212 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/olympiadbench/cn_utils.py @@ -0,0 +1,69 @@ +import os +import json +import datetime +from lmms_eval.tasks.olympiadbench.olympiadbench_evals import OlympiadBenchEvaluator +from lmms_eval.tasks._task_utils.file_utils import generate_submission_file + +import logging +eval_logger = logging.getLogger("lmms-eval") +dir_name = os.path.dirname(os.path.abspath(__file__)) + +olympiadbench_evaluator = OlympiadBenchEvaluator() + +def olympiadbench_doc_to_visual(doc): + return [image.convert("RGB") for image in doc["images"]] + +def olympiadbench_doc_to_text(doc): + question = doc["question"] + subject = doc["subfield"] + mul_ans = doc["is_multiple_answer"] + if mul_ans is None: + mul_ans = False + ans_type = doc["answer_type"] + if ans_type == "Need_human_evaluate": + ans_type = "proof based" + + pre_prompt = f"以下是中国{subject}竞赛中的解答题。\n" + + post_prompt = "" + if not mul_ans: + post_prompt += f"答案类型为{ans_type}。\n" + else: + post_prompt += f"题目有多个答案,答案类型均为{ans_type}。\n" + post_prompt += "请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以" + if not mul_ans: + post_prompt += '"所以最终答案是\\boxed{答案}。"\n' + else: + post_prompt += '"所以最终答案是\\boxed{用英⽂逗号连接的多个答案}。"\n' + + final_question = pre_prompt + question + '\n' + post_prompt + return final_question + +def olympiadbench_process_results(doc, results): + precision = doc["error"] + is_proving = "TP" in doc["source"] + if precision is None: + precision = 0 + prediction = results[0].strip() + + if is_proving: + return { + 
def olympiadbench_aggregate_results(results, args):
    """Dump all collected CN predictions into a timestamped submission JSON file."""
    stamp = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
    out_path = generate_submission_file(f"olympiadbench-test-cn-submission-{stamp}.json", args)
    with open(out_path, "w") as handle:
        # ensure_ascii=False keeps Chinese text readable in the output file.
        json.dump(results, handle, ensure_ascii=False)
    print(f"Submission file saved to {out_path}")
def olympiadbench_process_results(doc, results):
    """Score one EN prediction.

    Proof-style problems ("TP" sources) cannot be auto-graded and are returned
    as a submission entry; everything else is normalized and judged against the
    first ground-truth answer using the per-problem error tolerance.
    """
    tolerance = doc["error"]
    if tolerance is None:
        tolerance = 0
    prediction = results[0].strip()

    if "TP" in doc["source"]:
        return {"submission": prediction}

    # Keep only the text after the answer marker, then strip quotes/whitespace
    # and trailing punctuation before judging.
    answer_text = prediction.split("final answer is")[-1]
    answer_text = answer_text.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。")
    verdict = olympiadbench_evaluator.judge(answer_text, doc["final_answer"][0], tolerance)
    return {"exact_match": int(verdict)}
olympiadbench_test_en +- olympiadbench_test_cn +metadata: + - version: 0.0 diff --git a/EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py b/EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py new file mode 100644 index 0000000000000000000000000000000000000000..dd40f6111cfaab4ddcaa8a947afbc9b8f004341d --- /dev/null +++ b/EAGLE/lmms_eval/tasks/olympiadbench/olympiadbench_evals.py @@ -0,0 +1,355 @@ +import re +import sympy as sp +from sympy import simplify, Eq, sympify, Pow +from sympy.parsing.latex import parse_latex +import math + +# how to use +# scorer = OlympiadBenchEvaluator() +# exp1 = "10^{10^{10^{10}}}" +# exp2 = "10^{10}" +# precision = 1e-4 +# res = scorer.judge(exp1, exp2, precision) + +class OlympiadBenchEvaluator: + def __init__(self): + # Map of special symbols to their replacements + self.special_signal_map = { + "\\left": "", + "\\right": "", + "∶": ":", + ",": ",", + "$": "", + "\\approx": "=", + "\\simeq": "=", + "\\sim": "=", + "^\\prime": "'", + "^{\\prime}": "'", + "^\\circ": "", + "%": "", + } + self.pi = parse_latex("\\pi") + self.precision = 1e-8 # Default precision for comparison + + def split_by_comma(self, expr: str): + # Splits expressions by commas outside of brackets + in_bracket_num = 0 + splitted_expr = [] + start_idx = 0 + for i, char in enumerate(expr): + if char in ["(", "["]: + in_bracket_num += 1 + elif char in [")", "]"]: + in_bracket_num -= 1 + elif char == "," and in_bracket_num == 0: + splitted_expr.append(expr[start_idx:i].strip()) + start_idx = i + 1 + + if start_idx < len(expr): + splitted_expr.append(expr[start_idx:].strip()) + + return splitted_expr + + def trans_plus_minus_sign(self, expr_list: list): + # Translates plus-minus signs into separate expressions + new_expr_list = [] + for expr in expr_list: + if "\\pm" in expr: + new_expr_list.append(expr.replace("\\pm", "+")) + new_expr_list.append(expr.replace("\\pm", "-")) + else: + new_expr_list.append(expr) + + return new_expr_list + + def judge(self, 
expression1, expression2, precision=1e-8): + # Judge if two expressions are equal (expression1 is considered as the Ground Truth) + # Default precision is a list for supporting multiple expressions + precision = precision if isinstance(precision, list) else [precision] + + try: + expression1, expression2 = self.preprocess(expression1, expression2) + except: + return False + if expression1 == expression2: + # print("Exactly equal") + return True + + # Remove Chinese characters from the string, as answers like "yes" or "no" in Chinese have been considered + expression1 = re.sub(r'[\u4e00-\u9fff]+', '', expression1) + expression2 = re.sub(r'[\u4e00-\u9fff]+', '', expression2) + + expression1 = self.split_by_comma(expression1) + expression2 = self.split_by_comma(expression2) + + temp_list1 = self.trans_plus_minus_sign(expression1) + temp_list2 = self.trans_plus_minus_sign(expression2) + + # Set up a list for allowed errors + if len(precision) <= 1: + precision = precision * len(temp_list1) + + if len(temp_list1) != len(temp_list2): + return False + + # Check if elements in both lists can be paired and are equal + idx = -1 + while len(temp_list1) != 0: + idx = (idx + 1) % len(temp_list1) + + item1 = temp_list1[idx] + self.precision = precision[idx] + + for item2 in temp_list2: + if self.is_equal(item1, item2): + temp_list1.remove(item1) + temp_list2.remove(item2) + precision.remove(self.precision) + break + else: + # If no match was found, return False + return False + + # If all elements are matched, return True + return True + + def is_interval(self, expr): + # Checks if an expression is an interval + return expr.startswith(("(", "[")) and expr.endswith((")", "]")) + + def sympy_sub_pi(self, expression_sympy): + # Replaces the symbol for pi in sympy expressions with its numerical value + return expression_sympy.subs(self.pi, math.pi) + + def is_equal(self, expression1, expression2): + # Default first expression is ground truth. 
Check if expressions are equal in different aspects + if expression1 == expression2 and expression1 != "" and expression2 != "": + # print("Equivalent natively") + return True + + # First check if both are intervals + if self.is_interval(expression1) and self.is_interval(expression2): + try: + if self.interval_equal(expression1, expression2): + # print("Interval equivalent") + return True + except: + return False + + # Then check for numerical equality + try: + if self.numerical_equal(expression1, expression2): + # print("Numerically equivalent") + return True + except: + pass + + # Then check if expressions are mathematically equal + try: + if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2): + # print("Expression equivalent") + return True + except: + pass + + # Lastly, check for equation equality + try: + if self.equation_equal(expression1, expression2): + # print("Equation equivalent") + return True + except: + pass + + return False + + def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True): + # Check if two numerical values are equal within an allowed error range + # Includes possible percentage cases + reference = float(expression1) + prediction = float(expression2) + + if include_percentage: + gt_result = [reference / 100, reference, reference * 100] + else: + gt_result = [reference] + + for item in gt_result: + if abs(item - prediction) <= self.precision * 1.01: + return True + return False + + + def expression_equal(self, exp1, exp2): + # Check if two expressions are mathematically equivalent + # Extract expression and use sympy for equivalence checking + def extract_expression(expression): + if "=" in expression: + expression = expression.split("=")[1] + return expression.strip() + + exp1 = extract_expression(exp1) + exp2 = extract_expression(exp2) + + expr1_sym = sympify(parse_latex(exp1)) + expr2_sym = sympify(parse_latex(exp2)) + + if expr1_sym == expr2_sym: + 
    def equation_equal(self, expression1, expression2):
        """Check whether two LaTeX equations are mathematically equivalent.

        Each equation "lhs = rhs" is reduced to simplify(lhs - rhs); the two
        residuals are considered equivalent when one divides the other to a
        nonzero integer constant, i.e. the equations differ only by a constant
        nonzero integer factor.

        NOTE(review): split('=') assumes exactly one '=' per equation — a
        multi-'=' input raises ValueError on unpacking (callers wrap this in
        try/except); confirm that is the intended contract.
        """
        def simplify_equation(latex_eq):
            # Parse "lhs = rhs" and return the simplified residual lhs - rhs.
            lhs, rhs = latex_eq.split('=')

            lhs_expr = parse_latex(lhs)
            rhs_expr = parse_latex(rhs)

            equation = Eq(lhs_expr, rhs_expr)

            simplified_eq = simplify(equation.lhs - equation.rhs)

            return simplified_eq

        expr1_sym = simplify_equation(expression1)
        expr2_sym = simplify_equation(expression2)

        # Equivalent iff the residuals differ by a constant nonzero integer
        # ratio; check the division in both directions.
        division_result_1 = simplify(expr1_sym / expr2_sym)
        division_result_2 = simplify(expr2_sym / expr1_sym)

        if (division_result_1.is_Integer and division_result_1 != 0) or (division_result_2.is_Integer and division_result_2 != 0):
            return True
        else:
            return False
    def preprocess(self, expression1, expression2):
        """Extract the boxed answer text from each expression and normalize symbols.

        Returns a pair of cleaned strings ready for comparison. Answers are
        pulled from \\boxed{...} spans when present, otherwise from $...$ spans
        on the last line, otherwise the raw string is used as-is.
        """
        def extract_boxed_content(latex_str):
            # Collect the contents of every \boxed{...} span, joined by commas.
            boxed_matches = re.finditer(r'\\boxed{', latex_str)
            results = ""

            for match in boxed_matches:
                start_index = match.end()
                end_index = start_index
                stack = 1

                # Manual brace matching: regex alone cannot handle nesting.
                while stack > 0 and end_index < len(latex_str):
                    if latex_str[end_index] == '{':
                        stack += 1
                    elif latex_str[end_index] == '}':
                        stack -= 1
                    end_index += 1

                if stack == 0:
                    content = latex_str[start_index:end_index - 1]
                    results += content + ","
                else:
                    raise ValueError("Mismatched braces in LaTeX string.")

            # Fallback 1: $...$ spans on the last line. Fallback 2: raw string.
            if results == "":
                last_line_ans = latex_str.strip().split("\n")[-1]
                dollar_pattern = r"\$(.*?)\$"
                answers = re.findall(dollar_pattern, last_line_ans)

                if answers:
                    for ans in answers:
                        results += ans + ","
                else:
                    results = latex_str

            return results

        def sepcial_symbol_replace(expression):
            # "x \in S" answers: keep only the set/value part after "\in ".
            if "\\in " in expression:
                expression = expression.split("\\in ")[1]

            # Map cosmetic LaTeX tokens to canonical forms (see special_signal_map).
            for signal in self.special_signal_map:
                expression = expression.replace(signal, self.special_signal_map[signal])

            # Trim punctuation/markup noise from both ends.
            expression = expression.strip("\n$,.:;^_=+`!@#$%^&*~,。")

            # Unwrap \mathrm{...} / \mathbf{...} (optionally "~"-prefixed content).
            pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}'
            expression = re.sub(pattern, r'\1', expression)

            return expression

        exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2)
        exp1, exp2 = sepcial_symbol_replace(exp1), sepcial_symbol_replace(exp2)

        return exp1, exp2
+task : "olympiadbench_test_en" +test_split: test_en +output_type: generate_until +doc_to_visual: !function en_utils.olympiadbench_doc_to_visual +doc_to_text: !function en_utils.olympiadbench_doc_to_text +doc_to_target: "answer" +generation_kwargs: + until: + - "ASSISTANT:" + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false +process_results: !function en_utils.olympiadbench_process_results +metric_list: + - metric: submission + aggregation: !function en_utils.olympiadbench_aggregate_results + higher_is_better: true + - metric: exact_match + aggregation: mean + higher_is_better: true \ No newline at end of file diff --git a/EAGLE/lmms_eval/tasks/seedbench/seedbench.yaml b/EAGLE/lmms_eval/tasks/seedbench/seedbench.yaml new file mode 100644 index 0000000000000000000000000000000000000000..371d0ba3c9c1471c475136873d81d3534dd71061 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/seedbench/seedbench.yaml @@ -0,0 +1,28 @@ +dataset_path: lmms-lab/SEED-Bench +dataset_kwargs: + token: True +task: "seedbench" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.seed_doc_to_visual +doc_to_text: !function utils.seed_doc_to_text +doc_to_target: "answer" +generation_kwargs: + until: + - "ASSISTANT:" + image_aspect_ratio: original +# The return value of process_results will be used by metrics +process_results: !function utils.seed_process_result +# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results +metric_list: + - metric: seed_image + aggregation: !function utils.seed_aggregation_result + higher_is_better: true + - metric: seed_video + aggregation: !function utils.seed_aggregation_result + higher_is_better: true + - metric: seed_all + aggregation: !function utils.seed_aggregation_result + higher_is_better: true +metadata: + - version: 0.0 \ No newline at end of file diff --git a/EAGLE/lmms_eval/tasks/seedbench/seedbench_ppl.yaml 
def seed_doc_to_text(doc):
    """Format a SEED-Bench document as a lettered multiple-choice prompt."""
    options = "\n".join(f"{letter}. {doc[f'choice_{letter.lower()}']}" for letter in "ABCD")
    return f"{doc['question']}\n{options}\nAnswer with the option's letter from the given choices directly."
def seed_aggregation_result(results):
    """Return the fraction of records whose prediction matches the answer.

    Raises ZeroDivisionError on an empty result list (same as the original).
    """
    total = 0
    hits = 0
    for record in results:
        total += 1
        hits += record["pred"] == record["answer"]
    return hits / total
+model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "\nAnswer the question using a single word or phrase." + ocr: true + qwen_vl: + pre_prompt: "" + post_prompt: " Answer:" diff --git a/EAGLE/lmms_eval/tasks/textvqa/_textvqa.yaml b/EAGLE/lmms_eval/tasks/textvqa/_textvqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..506a2fb047a7ee38b9cf85fa13e91df0394e2dcb --- /dev/null +++ b/EAGLE/lmms_eval/tasks/textvqa/_textvqa.yaml @@ -0,0 +1,4 @@ +group: textvqa +task: +- textvqa_val +- textvqa_test \ No newline at end of file diff --git a/EAGLE/lmms_eval/tasks/textvqa/textvqa_test.yaml b/EAGLE/lmms_eval/tasks/textvqa/textvqa_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..15b02c4f51d17d902723a0fb595577f131797831 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/textvqa/textvqa_test.yaml @@ -0,0 +1,7 @@ +task: textvqa_test +test_split: test +metric_list: + - metric: submission + aggregation: !function utils.textvqa_aggreate_submissions + higher_is_better: true +include: _default_template_textvqa_yaml diff --git a/EAGLE/lmms_eval/tasks/textvqa/textvqa_val.yaml b/EAGLE/lmms_eval/tasks/textvqa/textvqa_val.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e23e9b2ed6b8ee80e878c9bd9d800efc0016983e --- /dev/null +++ b/EAGLE/lmms_eval/tasks/textvqa/textvqa_val.yaml @@ -0,0 +1,12 @@ +task: textvqa_val +test_split: validation +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - metric: submission + aggregation: !function utils.textvqa_aggreate_submissions + higher_is_better: true +include: _default_template_textvqa_yaml diff --git a/EAGLE/lmms_eval/tasks/textvqa/utils.py b/EAGLE/lmms_eval/tasks/textvqa/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ea3b503bc854d3014786e9904b010ce8d33cfd50 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/textvqa/utils.py @@ -0,0 +1,68 @@ +import re +import 
os +import json +import yaml +import pathlib +import logging +import datetime +import statistics + +from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor +from lmms_eval.tasks._task_utils.file_utils import generate_submission_file + +eval_logger = logging.getLogger("lmms-eval") + + +def textvqa_doc_to_visual(doc): + return [doc["image"].convert("RGB")] + + +def textvqa_process_results(doc, result): + eval_ai_processor = EvalAIAnswerProcessor() + assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}." + resAns = eval_ai_processor(result[0]) + accuracy = 0 + + if "answers" in doc and doc["answers"] is not None: + gtAcc = [] + + for i in range(len(doc["answers"])): + doc["answers"][i] = eval_ai_processor(doc["answers"][i]) + + for i in range(len(doc["answers"])): + otherGTAns = [doc["answers"][j] for j in range(len(doc["answers"])) if i != j] + matchingAns = [item for item in otherGTAns if item == resAns] + acc = min(1, float(len(matchingAns)) / 3) + gtAcc.append(acc) + accuracy = statistics.mean(gtAcc) + + return { + "exact_match": accuracy, + "submission": { + "question_id": doc["question_id"], + "answer": resAns, + }, + } + + +def textvqa_doc_to_text(doc, model_specific_prompt_kwargs=None): + pre_prompt = "" + post_post = "" + ocr_ref = "" + if model_specific_prompt_kwargs: + if "pre_prompt" in model_specific_prompt_kwargs: + pre_prompt = model_specific_prompt_kwargs["pre_prompt"] + if "post_prompt" in model_specific_prompt_kwargs: + post_prompt = model_specific_prompt_kwargs["post_prompt"] + if "ocr" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["ocr"]: + ocr_ref = f"\nReference OCR token: {', '.join(doc['ocr_tokens'])}" + return f"{pre_prompt}{doc['question'].capitalize()}{ocr_ref}{post_prompt}" + + +def textvqa_aggreate_submissions(results, args): + now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + path = 
generate_submission_file(f"textvqa_submission_{now_date_time}.json", args) + with open(path, "w") as f: + json.dump(results, f) + # print(f"Submission file saved to {path}") + eval_logger.info(f"Submission file saved to {path}") diff --git a/EAGLE/lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml b/EAGLE/lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml new file mode 100644 index 0000000000000000000000000000000000000000..b7dd4137ae9963cf6e1a8cc1805bf396dde77976 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/vizwiz_vqa/_default_template_vqa_yaml @@ -0,0 +1,15 @@ +dataset_path: lmms-lab/VizWiz-VQA +output_type: generate_until +doc_to_visual: !function utils.vizwiz_vqa_doc_to_visual +doc_to_text: !function utils.vizwiz_vqa_doc_to_text +doc_to_target: "answer" +generation_kwargs: + until: + - "ASSISTANT:" +metadata: + - version: 0.0 +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "\nWhen the provided information is insufficient, respond with 'Unanswerable'.\nAnswer the question using a single word or phrase." 
import os
import yaml

# Dataset splits and task templates for which per-split YAML configs are generated.
splits = ["val", "test"]
tasks = ["vqa"]

if __name__ == "__main__":
    registered = []
    for task_name in tasks:
        for split_name in splits:
            config = {
                "group": f"vizwiz_{task_name}",
                "task": f"vizwiz_{task_name}_{split_name}",
                "include": f"_default_template_{task_name}_yaml",
                "test_split": split_name,
            }
            # Train splits would be emitted ungrouped; all others are
            # registered under the aggregate group.
            if split_name == "train":
                config.pop("group")
            else:
                registered.append(f"vizwiz_{task_name}_{split_name}")

            target = f"./vizwiz_{task_name}_{split_name}.yaml"
            print(f"Saving to {target}")
            with open(target, "w") as fh:
                yaml.dump(config, fh, default_flow_style=False, sort_keys=False)

    # Aggregate group file listing every registered per-split task.
    with open("./_vizwiz_vqa.yaml", "w") as fh:
        yaml.dump({"group": "vizwiz_vqa", "task": registered}, fh, default_flow_style=False, indent=4)
import generate_submission_file +from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor + +eval_logger = logging.getLogger("lmms-eval") + + +def vizwiz_vqa_doc_to_visual(doc): + return [doc["image"].convert("RGB")] + + +def vizwiz_vqa_process_results(doc, result): + eval_ai_processor = EvalAIAnswerProcessor() + assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}." + resAns = eval_ai_processor(result[0]) + accuracy = 0 + + if "answers" in doc and doc["answers"] is not None: + gtAcc = [] + + for i in range(len(doc["answers"])): + doc["answers"][i] = eval_ai_processor(doc["answers"][i]) + + for i in range(len(doc["answers"])): + otherGTAns = [doc["answers"][j] for j in range(len(doc["answers"])) if i != j] + matchingAns = [item for item in otherGTAns if item == resAns] + acc = min(1, float(len(matchingAns)) / 3) + gtAcc.append(acc) + if gtAcc: + accuracy = statistics.mean(gtAcc) + else: + accuracy = 0 + + return { + "exact_match": accuracy, + "submission": { + "image": f"{doc['question_id']}.jpg", + "answer": resAns, + }, + } + + +def vizwiz_vqa_doc_to_text(doc, model_specific_prompt_kwargs=None): + if model_specific_prompt_kwargs is None: + model_specific_prompt_kwargs = {} + pre_prompt = "" + post_prompt = "" + if "pre_prompt" in model_specific_prompt_kwargs: + pre_prompt = model_specific_prompt_kwargs["pre_prompt"] + if "post_prompt" in model_specific_prompt_kwargs: + post_prompt = model_specific_prompt_kwargs["post_prompt"] + text = f"{pre_prompt}{doc['question'].capitalize()}{post_prompt}" + return text + + +def vizwiz_vqa_aggreate_submissions(results, args): + now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S") + submission_file_name = f"vizwiz_vqa-test-submission-{now_date_time}.json" + path = generate_submission_file(submission_file_name, args) + with open(path, "w") as f: + json.dump(results, f) + print(f"Submission file saved to {path}") diff --git 
a/EAGLE/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml b/EAGLE/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dec140f6629fd7fbc6fb017e07217cfe367e06e0 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_test.yaml @@ -0,0 +1,14 @@ +group: vizwiz_vqa +task: vizwiz_vqa_test +test_split: test +include: _default_template_vqa_yaml +process_results: !function utils.vizwiz_vqa_process_results +metric_list: + # - metric: exact_match + # aggregation: mean + # higher_is_better: true + # ignore_case: true + # ignore_punctuation: true + - metric: submission + aggregation: !function utils.vizwiz_vqa_aggreate_submissions + higher_is_better: true diff --git a/EAGLE/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml b/EAGLE/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac8ecc98c58a096d28e679fd2c7e74a103802111 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/vizwiz_vqa/vizwiz_vqa_val.yaml @@ -0,0 +1,13 @@ +group: vizwiz_vqa +task: vizwiz_vqa_val +test_split: val +include: _default_template_vqa_yaml +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + # - metric: submission + # aggregation: !function utils.vizwiz_vqa_aggreate_submissions + # higher_is_better: true \ No newline at end of file diff --git a/EAGLE/lmms_eval/tasks/vqav2/_default_template_vqav2_yaml b/EAGLE/lmms_eval/tasks/vqav2/_default_template_vqav2_yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3ce20f775468b7b8d936572823ede9a0ee04629 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/vqav2/_default_template_vqav2_yaml @@ -0,0 +1,15 @@ +dataset_path: lmms-lab/VQAv2 +dataset_kwargs: + token: True +output_type: generate_until +doc_to_visual: !function utils.vqav2_doc_to_visual +doc_to_text: !function utils.vqav2_doc_to_text +doc_to_target: "answer" +generation_kwargs: + max_new_tokens: 
16 +metadata: + - version: 0.0 +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "\nAnswer the question using a single word or phrase." \ No newline at end of file diff --git a/EAGLE/lmms_eval/tasks/vqav2/_vqav2.yaml b/EAGLE/lmms_eval/tasks/vqav2/_vqav2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6caddecf343ebbef6c15b62a657c9a6d38450a9 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/vqav2/_vqav2.yaml @@ -0,0 +1,4 @@ +group: vqav2 +task: +- vqav2_val +- vqav2_test \ No newline at end of file diff --git a/EAGLE/lmms_eval/tasks/vqav2/utils.py b/EAGLE/lmms_eval/tasks/vqav2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1a3e90829f1446b2332854672c713c1d470150e6 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/vqav2/utils.py @@ -0,0 +1,89 @@ +import re +import os +import json +import logging +import datetime +import statistics + +import lmms_eval.tasks._task_utils.file_utils as file_utils + +from lmms_eval.tasks._task_utils.vqa_eval_metric import EvalAIAnswerProcessor + + +eval_logger = logging.getLogger("lmms-eval") + + +def vqav2_doc_to_visual(doc): + return [doc["image"].convert("RGB")] + + +def vqav2_process_results(doc, result): + eval_ai_processor = EvalAIAnswerProcessor() + assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}." 
+ resAns = eval_ai_processor(result[0]) + accuracy = 0 + + if "answers" in doc and doc["answers"] is not None: + for ansDic in doc["answers"]: + ansDic["answer"] = ansDic["answer"].replace("\n", " ") + ansDic["answer"] = ansDic["answer"].replace("\t", " ") + ansDic["answer"] = ansDic["answer"].strip() + gtAcc = [] + gtAnswers = [ans["answer"] for ans in doc["answers"]] + + if len(set(gtAnswers)) > 1: + for ansDic in doc["answers"]: + ansDic["answer"] = eval_ai_processor.process_punctuation(ansDic["answer"]) + ansDic["answer"] = eval_ai_processor.process_digit_article(ansDic["answer"]) + resAns = eval_ai_processor.process_punctuation(resAns) + resAns = eval_ai_processor.process_digit_article(resAns) + + for gtAnsDatum in doc["answers"]: + otherGTAns = [item for item in doc["answers"] if item != gtAnsDatum] + matchingAns = [item for item in otherGTAns if item["answer"] == resAns] + acc = min(1, float(len(matchingAns)) / 3) + gtAcc.append(acc) + accuracy = statistics.mean(gtAcc) + + return { + "exact_match": accuracy, + "submission": { + "question_id": doc["question_id"], + "answer": resAns, + }, + } + + +def vqav2_process_results_test(doc, result): + res = vqav2_process_results(doc, result) + return { + "submission": res["submission"], + } + + +def vqav2_process_results_val(doc, result): + res = vqav2_process_results(doc, result) + return { + "exact_match": res["exact_match"], + } + + +def vqav2_doc_to_text(doc, model_specific_prompt_kwargs=None): + if model_specific_prompt_kwargs is None: + model_specific_prompt_kwargs = {} + pre_prompt = "" + post_prompt = "" + if "pre_prompt" in model_specific_prompt_kwargs: + pre_prompt = model_specific_prompt_kwargs["pre_prompt"] + if "post_prompt" in model_specific_prompt_kwargs: + post_prompt = model_specific_prompt_kwargs["post_prompt"] + return f"{pre_prompt}{doc['question']}{post_prompt}" + + +def vqav2_aggreate_submissions(results, args): + now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + 
submission_file_name = f"vqav2-test-submission-{now_date_time}.json" + path = file_utils.generate_submission_file(submission_file_name, args) + with open(path, "w") as f: + json.dump(results, f) + eval_logger.info(f"Submission file saved to {path}") diff --git a/EAGLE/lmms_eval/tasks/vqav2/vqav2_test.yaml b/EAGLE/lmms_eval/tasks/vqav2/vqav2_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..94c69209d09f6c724f2ceea7d9bbe9b760d67a2b --- /dev/null +++ b/EAGLE/lmms_eval/tasks/vqav2/vqav2_test.yaml @@ -0,0 +1,8 @@ +task: "vqav2_test" +include: _default_template_vqav2_yaml +test_split: test +metric_list: + - metric: submission + aggregation: !function utils.vqav2_aggreate_submissions + higher_is_better: true +process_results: !function utils.vqav2_process_results_test diff --git a/EAGLE/lmms_eval/tasks/vqav2/vqav2_val.yaml b/EAGLE/lmms_eval/tasks/vqav2/vqav2_val.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d24870b7d7a3e9952e0f299af289c7eebc71dc51 --- /dev/null +++ b/EAGLE/lmms_eval/tasks/vqav2/vqav2_val.yaml @@ -0,0 +1,10 @@ +task: "vqav2_val" +include: _default_template_vqav2_yaml +test_split: validation +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +process_results: !function utils.vqav2_process_results_val diff --git a/EAGLE/scripts/eval/gqa.sh b/EAGLE/scripts/eval/gqa.sh new file mode 100644 index 0000000000000000000000000000000000000000..cc369c7d3df6edb86ed93f636a34100c9015912c --- /dev/null +++ b/EAGLE/scripts/eval/gqa.sh @@ -0,0 +1,43 @@ +#!/bin/bash +CKPT=$1 +NAME=$2 + +gpu_list="${CUDA_VISIBLE_DEVICES:-0}" +IFS=',' read -ra GPULIST <<< "$gpu_list" + +CHUNKS=${#GPULIST[@]} + +SPLIT="llava_gqa_testdev_balanced" +GQADIR="./playground/data/eval/gqa/data" +LOCAL_ANSWER_DIR="./playground/data/eval_local_files/gqa" + +for IDX in $(seq 0 $((CHUNKS-1))); do + CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m 
eagle.eval.model_vqa_loader \ + --model-path $CKPT \ + --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \ + --image-folder ./playground/data/eval/gqa/data/images \ + --answers-file ${LOCAL_ANSWER_DIR}/$SPLIT/$NAME/${CHUNKS}_${IDX}.jsonl \ + --num-chunks $CHUNKS \ + --chunk-idx $IDX \ + --temperature 0 \ + --conv-mode vicuna_v1 & +done + +wait + +output_file=${LOCAL_ANSWER_DIR}/$SPLIT/$NAME/merge.jsonl + +# Clear out the output file if it exists. +> "$output_file" + +# Loop through the indices and concatenate each file. +for IDX in $(seq 0 $((CHUNKS-1))); do + cat ${LOCAL_ANSWER_DIR}/$SPLIT/$NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" +done + +python scripts/convert_gqa_for_eval.py --src $output_file --dst ${LOCAL_ANSWER_DIR}/$SPLIT/$NAME/testdev_balanced_predictions.json +absolute_path=$(readlink -f "${LOCAL_ANSWER_DIR}/$SPLIT/$NAME") + +cd $GQADIR +# python eval/eval.py --predictions ${LOCAL_ANSWER_DIR}/$SPLIT/$name/{tier}_predictions.json --tier testdev_balanced +python eval.py --predictions ${absolute_path}/{tier}_predictions.json --tier testdev_balanced \ No newline at end of file diff --git a/EAGLE/scripts/eval/mme.sh b/EAGLE/scripts/eval/mme.sh new file mode 100644 index 0000000000000000000000000000000000000000..342bb1908fd513f3342555f0e80db62c8b89e3c7 --- /dev/null +++ b/EAGLE/scripts/eval/mme.sh @@ -0,0 +1,26 @@ +#!/bin/bash +CKPT=$1 +NAME=$2 +MME_DATA_ROOT=$(readlink -f "./playground/data/eval/MME") + +python -m eagle.eval.model_vqa_loader \ + --model-path $CKPT \ + --question-file ./playground/data/eval/MME/llava_mme.jsonl \ + --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ + --answers-file ./playground/data/eval/MME/answers/${NAME}.jsonl \ + --temperature 0 \ + --conv-mode vicuna_v1 + +cd ./playground/data/eval/MME + +# python convert_answer_to_mme.py --experiment ${NAME}.jsonl + +# cd eval_tool + +# python calculation.py --results_dir answers/${NAME} + +python convert_answer_to_mme.py --experiment 
${MME_DATA_ROOT}/answers/${NAME}.jsonl --data_path ${MME_DATA_ROOT}/MME_Benchmark_release_version + +cd eval_tool + +python calculation.py --results_dir ${MME_DATA_ROOT}/answers/${NAME}_mme_results diff --git a/EAGLE/scripts/eval/mmmu.sh b/EAGLE/scripts/eval/mmmu.sh new file mode 100644 index 0000000000000000000000000000000000000000..5c01daa7e04887144c7416e2f08dd8c8a67e2e22 --- /dev/null +++ b/EAGLE/scripts/eval/mmmu.sh @@ -0,0 +1,18 @@ +#!/bin/bash +export CUDA_VISIBLE_DEVICES=0 +MODEL_CKPT=$1 +MODEL_NAME=$2 + +SAVE_DIR=playground/data/eval/mmmu/${MODEL_NAME} +SPLIT=validation +MMMU_DATA_ROOT=./playground/data/eval/MMMU + +python eagle/eval/model_vqa_mmmu.py \ + --model_path ${MODEL_CKPT} \ + --split ${SPLIT} \ + --output_path ${SAVE_DIR}/${SPLIT}_output.json \ + +output_file=${SAVE_DIR}/${SPLIT}_output.json +echo "saving model answer at $output_file" + +python ./eval_utils/mmmu/main_eval_only.py --output_path ${SAVE_DIR}/${SPLIT}_output.json \ No newline at end of file diff --git a/EAGLE/scripts/eval/pope.sh b/EAGLE/scripts/eval/pope.sh new file mode 100644 index 0000000000000000000000000000000000000000..6db733129e366348595cff7b9ebe6c5355bbb477 --- /dev/null +++ b/EAGLE/scripts/eval/pope.sh @@ -0,0 +1,16 @@ +#!/bin/bash +CKPT=$1 +NAME=$2 + +python -m eagle.eval.model_vqa_loader \ + --model-path $CKPT \ + --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ + --image-folder ./playground/data/eval/pope/val2014 \ + --answers-file ./playground/data/eval/pope/answers/${NAME}.jsonl \ + --temperature 0 \ + --conv-mode vicuna_v1 + +python eagle/eval/eval_pope.py \ + --annotation-dir ./playground/data/eval/pope/coco \ + --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ + --result-file ./playground/data/eval/pope/answers/${NAME}.jsonl diff --git a/EAGLE/scripts/eval/sqa.sh b/EAGLE/scripts/eval/sqa.sh new file mode 100644 index 0000000000000000000000000000000000000000..929e96620950e865484cd9d5f07b62262b72cae2 --- /dev/null +++ 
b/EAGLE/scripts/eval/sqa.sh @@ -0,0 +1,18 @@ +#!/bin/bash +CKPT=$1 +NAME=$2 + +python -m eagle.eval.model_vqa_science \ + --model-path $CKPT \ + --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \ + --image-folder ./playground/data/eval/scienceqa/images/test \ + --answers-file ./playground/data/eval/scienceqa/answers/${NAME}.jsonl \ + --single-pred-prompt \ + --temperature 0 \ + --conv-mode vicuna_v1 + +python eagle/eval/eval_science_qa.py \ + --base-dir ./playground/data/eval/scienceqa \ + --result-file ./playground/data/eval/scienceqa/answers/${NAME}.jsonl \ + --output-file ./playground/data/eval/scienceqa/answers/${NAME}_output.jsonl \ + --output-result ./playground/data/eval/scienceqa/answers/${NAME}_result.json diff --git a/EAGLE/scripts/eval/textvqa.sh b/EAGLE/scripts/eval/textvqa.sh new file mode 100644 index 0000000000000000000000000000000000000000..a2084c9805baede9fa8a362b60c273e9eee963ca --- /dev/null +++ b/EAGLE/scripts/eval/textvqa.sh @@ -0,0 +1,16 @@ +#!/bin/bash +CKPT=$1 +NAME=$2 +DATA_ROOT=$(readlink -f "./playground/data/eval/textvqa/") + +python -m eagle.eval.model_vqa_loader \ + --model-path $CKPT \ + --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ + --image-folder ./playground/data/eval/textvqa/train_images \ + --answers-file ./playground/data/eval/textvqa/answers/${NAME}.jsonl \ + --temperature 0 \ + --conv-mode vicuna_v1 + +python -m eagle.eval.eval_textvqa \ + --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ + --result-file ./playground/data/eval/textvqa/answers/${NAME}.jsonl diff --git a/EAGLE/scripts/eval/vizwiz.sh b/EAGLE/scripts/eval/vizwiz.sh new file mode 100644 index 0000000000000000000000000000000000000000..dd95f8b5b2e3c85cb75439f8907dd1b3818d685d --- /dev/null +++ b/EAGLE/scripts/eval/vizwiz.sh @@ -0,0 +1,18 @@ +#!/bin/bash +CKPT=$1 +NAME=$2 +DATA_ROOT=$(readlink -f "./playground/data/eval/vizwiz") 
+LOCAL_ANSWER_DIR="./playground/data/eval_local_files/vizwiz" + +python -m eagle.eval.model_vqa_loader \ + --model-path $CKPT \ + --question-file $DATA_ROOT/llava_test.jsonl \ + --image-folder $DATA_ROOT/test \ + --answers-file $LOCAL_ANSWER_DIR/$NAME/$NAME.jsonl \ + --temperature 0 \ + --conv-mode vicuna_v1 + +python scripts/convert_vizwiz_for_submission.py \ + --annotation-file $DATA_ROOT/llava_test.jsonl \ + --result-file $LOCAL_ANSWER_DIR/$NAME/$NAME.jsonl \ + --result-upload-file $LOCAL_ANSWER_DIR/$NAME/answers_upload/vizwiz_test_$NAME.json diff --git a/EAGLE/scripts/eval/vqav2.sh b/EAGLE/scripts/eval/vqav2.sh new file mode 100644 index 0000000000000000000000000000000000000000..6c1d1cc9a8c3136bce4d8b6ca1fbc1e72e56b0cd --- /dev/null +++ b/EAGLE/scripts/eval/vqav2.sh @@ -0,0 +1,37 @@ +#!/bin/bash +CKPT=$1 +NAME=$2 + +gpu_list="${CUDA_VISIBLE_DEVICES:-0}" +IFS=',' read -ra GPULIST <<< "$gpu_list" + +CHUNKS=${#GPULIST[@]} + +SPLIT="llava_vqav2_mscoco_test-dev2015" +LOCAL_ANSWER_DIR="./playground/data/eval_local_files/vqav2" + +for IDX in $(seq 0 $((CHUNKS-1))); do + CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m eagle.eval.model_vqa_loader \ + --model-path $CKPT \ + --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ + --image-folder ./playground/data/eval/vqav2/test2015 \ + --answers-file ${LOCAL_ANSWER_DIR}/$SPLIT/$NAME/${CHUNKS}_${IDX}.jsonl \ + --num-chunks $CHUNKS \ + --chunk-idx $IDX \ + --temperature 0 \ + --conv-mode vicuna_v1 & +done + +wait + +output_file=${LOCAL_ANSWER_DIR}/$SPLIT/$NAME/merge.jsonl + +# Clear out the output file if it exists. +> "$output_file" + +# Loop through the indices and concatenate each file. 
+for IDX in $(seq 0 $((CHUNKS-1))); do + cat ${LOCAL_ANSWER_DIR}/$SPLIT/$NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" +done + +python scripts/convert_vqav2_for_submission.py --src ${LOCAL_ANSWER_DIR}/$SPLIT/$NAME/merge.jsonl --save_path ${LOCAL_ANSWER_DIR}/$SPLIT/$NAME/vqav2-upload-$NAME.json --split $SPLIT --ckpt $NAME \ No newline at end of file diff --git a/EAGLE/scripts/eval_lmms_eval/eval-mmbench-mathvista.sh b/EAGLE/scripts/eval_lmms_eval/eval-mmbench-mathvista.sh new file mode 100644 index 0000000000000000000000000000000000000000..7636ee35d2fd3404eff1bef3569365ad362f2bcc --- /dev/null +++ b/EAGLE/scripts/eval_lmms_eval/eval-mmbench-mathvista.sh @@ -0,0 +1,13 @@ +MODEL_PATH=$1 +MODEL_NAME=$2 +CONV_MODE=$3 + +accelerate launch --num_processes=8\ + evaluate_lmms_eval.py \ + --model eagle \ + --model_args pretrained=${MODEL_PATH},conv_template=${CONV_MODE} \ + --tasks mmbench_en_dev,mathvista_testmini \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix ${MODEL_NAME}_mmbench_mathvista \ + --output_path ./logs/ \ No newline at end of file diff --git a/EAGLE/scripts/eval_lmms_eval/eval-mme-seed-pope-sqa-gqa-ocrbench-textvqa-chartqa.sh b/EAGLE/scripts/eval_lmms_eval/eval-mme-seed-pope-sqa-gqa-ocrbench-textvqa-chartqa.sh new file mode 100644 index 0000000000000000000000000000000000000000..46ac6c762761560645d4fabd637315f918353550 --- /dev/null +++ b/EAGLE/scripts/eval_lmms_eval/eval-mme-seed-pope-sqa-gqa-ocrbench-textvqa-chartqa.sh @@ -0,0 +1,13 @@ +MODEL_PATH=$1 +MODEL_NAME=$2 +CONV_MODE=$3 + +accelerate launch --num_processes=8\ + evaluate_lmms_eval.py \ + --model eagle \ + --model_args pretrained=${MODEL_PATH},conv_template=${CONV_MODE} \ + --tasks mme,seed_bench,pope,scienceqa_img,gqa,ocrbench,textvqa_val,chartqa \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix ${MODEL_NAME}_mmbench_mathvista_seedbench \ + --output_path ./logs/ \ No newline at end of file diff --git a/EAGLE/scripts/eval_lmms_eval/eval-vizwiz-vqav2.sh 
b/EAGLE/scripts/eval_lmms_eval/eval-vizwiz-vqav2.sh new file mode 100644 index 0000000000000000000000000000000000000000..632d970720b5e957b1c77970c42735c2c2837621 --- /dev/null +++ b/EAGLE/scripts/eval_lmms_eval/eval-vizwiz-vqav2.sh @@ -0,0 +1,13 @@ +MODEL_PATH=$1 +MODEL_NAME=$2 +CONV_MODE=$3 + +accelerate launch --num_processes=8\ + evaluate_lmms_eval.py \ + --model eagle \ + --model_args pretrained=${MODEL_PATH},conv_template=${CONV_MODE} \ + --tasks vizwiz_vqa_test,vqav2_test \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix ${MODEL_NAME}_vizwiz_vqav2 \ + --output_path ./logs/ \ No newline at end of file diff --git a/Emu/Emu1/README.md b/Emu/Emu1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dc3ce1eea8fa568f79166c3d70b71b79b18c88f0 --- /dev/null +++ b/Emu/Emu1/README.md @@ -0,0 +1,173 @@ + + +
+

Generative Pretraining in Multimodality +

Generative Pretraining in Multimodality

+ +[Quan Sun](https://github.com/Quan-Sun)1*, [Qiying Yu](https://yqy2001.github.io)2,1*, [Yufeng Cui]()1*, [Fan Zhang](https://scholar.google.com/citations?user=VsJ39HMAAAAJ)1*, [Xiaosong Zhang](https://github.com/zhangxiaosong18)1*, [Yueze Wang]()1, [Hongcheng Gao](https://hongcheng-gao.github.io/)1,
[Jingjing Liu](https://air.tsinghua.edu.cn/en/info/1046/1194.htm)2, [Tiejun Huang](https://scholar.google.com/citations?user=knvEK4AAAAAJ&hl=en)1,3, [Xinlong Wang](https://www.xloong.wang/)1 + +1 [BAAI](https://www.baai.ac.cn/english.html), 2 [THU](https://air.tsinghua.edu.cn), 3 [PKU](https://english.pku.edu.cn/)
* Equal Contribution + +| [Paper](https://arxiv.org/abs/2307.05222) | [Demo](https://emu.ssi.plus/) | + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/generative-pretraining-in-multimodality/visual-question-answering-on-mm-vet-w-o)](https://paperswithcode.com/sota/visual-question-answering-on-mm-vet?tag_filter=0) +
+ +**Emu is a multimodal generalist that can seamlessly generate images and texts in multimodal context**. **Emu** is trained with a unified autoregressive objective, *i.e.*, predict-the-next-element, including both visual embeddings and textual tokens. Trained under this objective, **Emu** can serve as a generalist interface for both image-to-text and text-to-image tasks. + +![](assets/Emu.png) + +## News + +* `Oct 16, 2023`: **Emu-I** achieves [state-of-the-art performance](https://paperswithcode.com/sota/visual-question-answering-on-mm-vet?tag_filter=0) on the [MM-Vet](https://github.com/yuweihao/MM-Vet) benchmark (w/o external tools like GPT-4), which assesses large multimodal models in real-world, in-the-wild scenarios. +* `Oct 13, 2023`: The code for the zero-shot evaluation of **Emu-I** has been released! +* `Sep 18, 2023`: Tools for processing YT-Storyboard-1b dataset have been released! + +## Generalist Interface + +**Emu** serves as a generalist interface capable of diverse multimodal tasks, such as image captioning, image/video question answering, and text-to-image generation, together with new abilities like in-context text and image generation, and image blending: + +![](assets/generalist.png) + +## Setup + +Clone this repository and install required packages: + +```shell +git clone https://github.com/baaivision/Emu +cd Emu/Emu1 + +pip install -r requirements.txt +``` + +## Model Weights + +We release the pretrained and instruction-tuned weights of **Emu**. Our weights are subject to LLaMA-1's [license](https://github.com/facebookresearch/llama/blob/1076b9c51c77ad06e9d7ba8a4c6df775741732bd/LICENSE). 
+ +| Model name | Weight | +| ------------------ | ------------------------------------------------------- | +| **Emu w/ Decoder** | [🤗 HF link](https://huggingface.co/BAAI/Emu/tree/main/pretrain) (34GB) | +| **Emu-I** | [🤗 HF link](https://huggingface.co/BAAI/Emu/blob/main/Emu-instruct.pt) (27GB) | + +## Inference + +At present, we provide inference code that can process interleaved image-text and **video** as input, and output text and image. + +For instruction-tuned model, we provide examples for image captioning, visual question answering, and interleaved multi-image understanding: + +```sh +python inference.py --instruct --ckpt-path ${INSTRUCT_CKPT_PATH} +``` + +For pretrained model, we provide an example for in-context learning: + +```sh +python inference.py --ckpt-path ${PRETRAIN_CKPT_DIR}/multimodal_encoder/pytorch_model.bin +``` + +For image generation, we provide examples for image blending, text-to-image and in-context generation: + +```sh +python image_inference.py --ckpt-path ${PRETRAIN_CKPT_DIR} +``` + +## Evaluation + +We provide **Emu-I**'s zero-shot evaluation code on MM-Vet, COCO Caption, VQAv2, OKVQA, VizWiz and VisDial benchmarks. For example, evaluating COCO captioning on a node with 8 GPUs: +```sh +python -m torch.distributed.launch \ + --nproc_per_node=8 \ + --use_env \ + eval.py \ + --instruct \ + --batch_size 4 \ + --ckpt_path ${INSTRUCT_CKPT_PATH} \ + --root_path /path/to/benchmark_root \ + --dataset_name coco \ # coco, mmvet, vqav2, okvqa, vizwiz, visdial + --output_path ./output/ +``` +where `/path/to/benchmark_root` should contain the following file structure: +``` +benchmark_root/ + mm-vet/ + mm-vet.json + images/ + v1_0.png + ... + coco/ + images/ + test2015/ + COCO_test2015_{...}.jpg + ... + val2014/ + COCO_val2014_{...}.jpg + ... 
+ annotations/ + coco_karpathy_test.json + coco_karpathy_test_gt.json + coco_karpathy_val.json + coco_karpathy_val_gt.json + v2_OpenEnded_mscoco_val2014_questions.json + v2_mscoco_val2014_annotations.json + vqa_test.json + vqa_val_eval.json + okvqa/ + annotations/ + OpenEnded_mscoco_val2014_questions.json + mscoco_val2014_annotations.json + vqa_val_eval.json + vizwiz/ + images/ + test/ + VizWiz_test_{...}.jpg + ... + val/ + VizWiz_val_{...}.jpg + ... + annotations/ + test.json + val.json + visdial/ + VisualDialog_test2018/ + VisualDialog_test2018_{...}.jpg + ... + VisualDialog_val2018/ + VisualDialog_val2018_{...}.jpg + ... + visdial_1.0_test.json + visdial_1.0_val.json +``` +You can also customize your own file structure and modify the corresponding data loading code. Each dataset file can be found in the `mm_eval/datasets/` directory. All files can be downloaded from the official dataset websites or from [LAVIS](https://github.com/salesforce/LAVIS). + + +## Schedule + +We are committed to open-sourcing all Emu related materials, including: + +- [x] The weights of **Emu** and **Emu-I** +- [x] Inference example for interleaved image-text as input, text as output +- [x] Video inference example +- [x] Weights of image decoder & image generation/blending example +- [x] YT-Storyboard-1B pretraining data +- [ ] Pretraining code +- [ ] Instruction tuning code +- [x] Evaluation code + +We hope to foster the growth of our community through open-sourcing and promoting collaboration👬. Let's step towards multimodal intelligence together🍻. + +## Acknowledgement + +We thank the great work from [LLaMA](https://github.com/facebookresearch/llama), [BLIP-2](https://github.com/salesforce/LAVIS), [Stable Diffusion](https://github.com/CompVis/stable-diffusion), and [FastChat](https://github.com/lm-sys/FastChat). 
+ +## Citation + +If you find Emu useful for your research and applications, please consider starring this repository and citing: + +``` +@article{Emu, + title={Generative Pretraining in Multimodality}, + author={Sun, Quan and Yu, Qiying and Cui, Yufeng and Zhang, Fan and Zhang, Xiaosong and Wang, Yueze and Gao, Hongcheng and Liu, Jingjing and Huang, Tiejun and Wang, Xinlong}, + publisher={arXiv preprint arXiv:2307.05222}, + year={2023}, +} +``` diff --git a/Emu/Emu1/eval.py b/Emu/Emu1/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..81b230b1d89e07af13594fc50fb04369f6fef0fa --- /dev/null +++ b/Emu/Emu1/eval.py @@ -0,0 +1,6 @@ +import os + +from mm_eval import evaluate_engine + +if __name__ == '__main__': + metric = evaluate_engine() diff --git a/Emu/Emu1/image_inference.py b/Emu/Emu1/image_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..1be85ec6522e5fdbc467369c6f55648a0eecd569 --- /dev/null +++ b/Emu/Emu1/image_inference.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- + +import argparse + +from PIL import Image +from models.pipeline import EmuGenerationPipeline + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--instruct", + action='store_true', + default=False, + help="Load Emu-I", + ) + parser.add_argument( + "--ckpt-path", + type=str, + default='', + help="Emu Decoder ckpt path", + ) + args = parser.parse_args() + + return args + + +if __name__ == "__main__": + args = parse_args() + + # NOTE + # Emu Decoder Pipeline only supports pretrain model + # Using instruct tuning model as image encoder may cause unpredicted results + assert args.instruct is False, "Image Generation currently do not support instruct tuning model" + + pipeline = EmuGenerationPipeline.from_pretrained( + path=args.ckpt_path, + args=args, + ) + pipeline = pipeline.bfloat16().cuda() + + # image blend case + # image_1 = Image.open("examples/sunflower.png") + # image_2 = 
Image.open("examples/oil_sunflower.jpg") + image_1 = Image.open("examples/cat.jpg") + image_2 = Image.open("examples/tiger.jpg") + image, safety = pipeline( + [image_1, image_2], + height=512, + width=512, + guidance_scale=7.5, + ) + + if safety is None or not safety: + image.save("image_blend_result.jpg") + else: + print("ImageBlend Generated Image Has Safety Concern!!!") + + # text-to-image case + text = "An image of a dog wearing a pair of glasses." + image, safety = pipeline( + [text], + height=512, + width=512, + guidance_scale=7.5, + ) + + if safety is None or not safety: + image.save("text2image_result.jpg") + else: + print("T2I Generated Image Has Safety Concern!!!") + + # in-context generation + image_1 = Image.open("examples/dog.png") + image_2 = Image.open("examples/sunflower.png") + + image, safety = pipeline( + [ + "This is the first image: ", + image_1, + "This is the second image: ", + image_2, + "The animal in the first image surrounded with the plant in the second image: ", + ], + height=512, + width=512, + guidance_scale=10., + ) + + if safety is None or not safety: + image.save("incontext_result.jpg") + else: + print("In-context Generated Image Has Safety Concern!!!") diff --git a/Emu/Emu1/inference.py b/Emu/Emu1/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..ab353cea1c6375bd5ac3b1e2be8493c0ceafc3b1 --- /dev/null +++ b/Emu/Emu1/inference.py @@ -0,0 +1,181 @@ +import argparse + +import json + +import torch +from models.modeling_emu import Emu +from utils import process_img, process_video + +image_placeholder = "[IMG]" + "" * 32 + "[/IMG]" +image_system_msg = "You will be presented with an image: [IMG]ImageContent[/IMG]. You will be able to see the image after I provide it to you. Please answer my questions based on the given image." +video_system_msg = "You are a helpful assistant and you will be presented with a video consisting of multiple chronological images: [IMG]ImageContent[/IMG]. 
You will be able to see the video after I provide it to you. Please answer my questions based on the given video." + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--instruct", + action='store_true', + default=False, + help="Load Emu-I", + ) + parser.add_argument( + "--ckpt-path", + type=str, + default='', + help="Emu ckpt path", + ) + args = parser.parse_args() + + return args + + +def prepare_model(model_name, args): + with open(f'models/{model_name}.json', "r", encoding="utf8") as f: + model_cfg = json.load(f) + print(f"=====> model_cfg: {model_cfg}") + + model = Emu(**model_cfg, cast_dtype=torch.float, args=args) + + if args.instruct: + print('Patching LoRA...') + from peft import LoraConfig, get_peft_model + lora_config = LoraConfig( + r=16, + lora_alpha=16, + target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + model.decoder.lm = get_peft_model(model.decoder.lm, lora_config) + + print(f"=====> loading from ckpt_path {args.ckpt_path}") + ckpt = torch.load(args.ckpt_path, map_location="cpu") + if 'module' in ckpt: + ckpt = ckpt['module'] + msg = model.load_state_dict(ckpt, strict=False) + model.eval() + print(f"=====> get model.load_state_dict msg: {msg}") + + return model + + +def Emu_inference(image_list, text_sequence, system='', instruct=True, max_new_tokens=128, beam_size=5, length_penalty=0.0): + if instruct: + prompt = f"{system} [USER]: {text_sequence} [ASSISTANT]:".strip() + else: + prompt = text_sequence + + print(f"===> prompt: {prompt}") + + samples = {"image": torch.cat(image_list, dim=0), "prompt": prompt} + + output_text = emu_model.generate( + samples, + max_new_tokens=max_new_tokens, + num_beams=beam_size, + length_penalty=length_penalty, + repetition_penalty=1.0, + )[0].strip() + + print(f"===> output: {output_text}\n") + + +def Emu_instruct_caption(img): + system = image_system_msg + + prompt = f"{system} [USER]: 
{image_placeholder}Please provide an accurate and concise description of the given image. [ASSISTANT]: The image depicts a photo of".strip() + + print(f"===> caption prompt: {prompt}") + + samples = {"image": img, "prompt": prompt} + + output_text = emu_model.generate( + samples, + max_new_tokens=512, + num_beams=5, + length_penalty=0.0, + repetition_penalty=1.0, + )[0].strip() + + print(f"===> caption output: {output_text}\n") + + +def pretrain_example(): + # prepare in-context learning example + image_text_sequence = [ + process_img(img_path='examples/dog.png', device=args.device), + 'There are two dogs.', + process_img(img_path='examples/panda.png', device=args.device), + 'There are three pandas.', + process_img(img_path='examples/sunflower.png', device=args.device), + ] + interleaved_sequence_1 = '' + image_list_1 = [] + for item in image_text_sequence: + if isinstance(item, str): # text + interleaved_sequence_1 += item + else: # image + image_list_1.append(item) + interleaved_sequence_1 += image_placeholder + + # Pretrained Model Inference + # -- in-context learning + Emu_inference(image_list_1, interleaved_sequence_1, instruct=False) + + +def instruct_example(): + # prepare image captioning and vqa examples + image = process_img(img_path='examples/iron_man.jpg', device=args.device) + question = 'what is the man doing?' + + # prepare interleaved image-text input example + image_text_sequence = [ + process_img(img_path='examples/book1.jpeg', device=args.device), + 'This is the first image.', + process_img(img_path='examples/book2.jpeg', device=args.device), + 'This is the second image.', + process_img(img_path='examples/book3.jpeg', device=args.device), + 'This is the third image.', + process_img(img_path='examples/book4.jpeg', device=args.device), + 'This is the fourth image.', + 'Describe all images.' 
+ ] + interleaved_sequence_1 = '' + image_list_1 = [] + for item in image_text_sequence: + if isinstance(item, str): # text + interleaved_sequence_1 += item + else: # image + image_list_1.append(item) + interleaved_sequence_1 += image_placeholder + + # prepare video example + image_list_2, interleaved_sequence_2 = process_video('examples/AppleVR.mp4') + interleaved_sequence_2 += "What's the woman doing in the video?" + + # Instruct Model Inference + # -- image captioning + Emu_instruct_caption(image) + # -- visual question answering + Emu_inference([image], image_placeholder + question, system=image_system_msg) + # -- image-text interleaved input, text output + Emu_inference(image_list_1, interleaved_sequence_1, system='') + # -- video understanding + Emu_inference(image_list_2, interleaved_sequence_2, system=video_system_msg, length_penalty=1.0) + + +if __name__ == '__main__': + + args = parse_args() + + # initialize and load model + args.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + emu_model = prepare_model('Emu-14B', args) + emu_model.to(args.device).to(torch.bfloat16) + + if args.instruct: + instruct_example() + else: + pretrain_example() diff --git a/Emu/Emu1/requirements.txt b/Emu/Emu1/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..0483d98919207f98b88af7b0cbb9abe2ad055217 --- /dev/null +++ b/Emu/Emu1/requirements.txt @@ -0,0 +1,13 @@ +torch +transformers +peft +numpy +Pillow +argparse +einops +timm +xformers +sentencepiece +decord +diffusers==0.15.1 +torchvision diff --git a/Emu/Emu1/utils.py b/Emu/Emu1/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d8c15beef069719ef759c6279fc2b7de6d187696 --- /dev/null +++ b/Emu/Emu1/utils.py @@ -0,0 +1,44 @@ +import torch +from PIL import Image +import numpy as np +from decord import VideoReader + + +def get_index(num_frames, num_segments): + print(f"===> num_frames: {num_frames}, num_segments: {num_segments}") + 
seg_size = float(num_frames - 1) / num_segments + start = int(seg_size / 2) + offsets = np.array([ + start + int(np.round(seg_size * idx)) for idx in range(num_segments) + ]) + return offsets + + +def process_img(img_path=None, img=None, device=torch.device("cuda")): + assert img_path is not None or img is not None, "you should pass either path to an image or a PIL image object" + width, height = 224, 224 + OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) + OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) + if img_path: + img = Image.open(img_path).convert("RGB") + img = img.resize((width, height)) + img = np.array(img) / 255. + img = (img - OPENAI_DATASET_MEAN) / OPENAI_DATASET_STD + img = torch.tensor(img).to(device).to(torch.float) + img = torch.einsum('hwc->chw', img) + img = img.unsqueeze(0) + return img + + +def process_video(video_path=None): + vr = VideoReader(video_path) + frame_indices = get_index(len(vr), 8) + image_list = [] + text_sequence = '' + from inference import image_placeholder + for frame_index in frame_indices: + image = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB') + image = process_img(img=image) + image_list.append(image) + text_sequence += image_placeholder + return image_list, text_sequence diff --git a/Emu/Emu2/.gitignore b/Emu/Emu2/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..478adc9b212ba70f2e0a14a06b2576f228a7f0e4 --- /dev/null +++ b/Emu/Emu2/.gitignore @@ -0,0 +1,3 @@ +logs/ +tools/ +__pycache__/ diff --git a/Emu/Emu2/README.md b/Emu/Emu2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..daa2ff570cac10a881f807029255a965e3723a9b --- /dev/null +++ b/Emu/Emu2/README.md @@ -0,0 +1,665 @@ + +
+

Generative Multimodal Models are In-Context Learners +

Generative Multimodal Models are In-Context Learners

+ +[Quan Sun](https://github.com/Quan-Sun)1*, [Yufeng Cui](https://scholar.google.com/citations?hl=en&user=5Ydha2EAAAAJ)1*, [Xiaosong Zhang](https://zhangxiaosong18.github.io)1*, [Fan Zhang](https://scholar.google.com/citations?user=VsJ39HMAAAAJ)1*, [Qiying Yu](https://yqy2001.github.io)2,1*, [Zhengxiong Luo](https://greatlog.github.io)1, [Yueze Wang]()1, [Yongming Rao](https://raoyongming.github.io)1,
[Jingjing Liu](https://air.tsinghua.edu.cn/en/info/1046/1194.htm)2, [Tiejun Huang](https://scholar.google.com/citations?user=knvEK4AAAAAJ&hl=en)1,3, [Xinlong Wang](https://www.xloong.wang/)1† + +1 [BAAI](https://www.baai.ac.cn/english.html), 2 [THU](https://air.tsinghua.edu.cn), 3 [PKU](https://english.pku.edu.cn/)
* equal contribution project lead + +| [Paper](https://arxiv.org/abs/2312.13286) | [Fast Demo](https://emu.ssi.plus) | [🤗HF Demo](https://huggingface.co/spaces/BAAI/Emu2) | [🤗HF Models](https://huggingface.co/BAAI/Emu2) | [Project Page](https://baaivision.github.io/emu2/) | [Video Demo](https://www.youtube.com/watch?v=nz4BJ7MKtKo&t=9s) | + + +
+ +We introduce **Emu2**, a generative multimodal model with 37 billion parameters, trained on large-scale multimodal sequences with a unified autoregressive objective. +**Emu2** exhibits strong multimodal in-context learning abilities, even emerging to solve tasks that require on-the-fly reasoning. +The model sets a new record on multiple multimodal understanding tasks in few-shot settings. +When instruction-tuned to follow specific instructions, **Emu2** further achieves new state-of-the-art on challenging tasks such as question answering benchmarks for large multimodal models and open-ended subject-driven generation. +These achievements demonstrate that **Emu2** can serve as a base model and general-purpose interface for a wide range of multimodal tasks. + + + +## Emu2 is a strong multimodal few-shot learner + +
+comparison_fewshot. +
+ + + + +## An impressive multimodal generalist + +
+Radar. +
+ + +## A skilled painter + +
+ gen_barchart. + +
+ +
+Zero-shot subject-driven generation +
+ + +## Refer Expression Comprehension +We supplemented the results of Refer Expression Comprehension on RefCOCO, RefCOCO+, RefCOCOg, and compared them with generalist models. + +| Model | RefCOCO
val | RefCOCO
testA | RefCOCO
testB | RefCOCO+
val | RefCOCO+
testA | RefCOCO+
testB | RefCOCOg
val | RefCOCOg
test | +| :-----------: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | +| OFA-L | 79.96 | 83.67 | 76.39 | 68.29 | 76.00 | 61.75 | 67.57 | 67.58 | +| Shikra-13B | 87.83 | 91.11 | 81.81 | 82.89 | 87.79 | 74.41 | 82.64 | 83.16 | +| Qwen-VL-7B | 89.36 | 92.26 | 85.34 | 83.12 | 88.25 | 77.21 | 85.58 | 85.48 | +| **Emu2-Chat** | **90.40** | **93.88** | **85.97** | **87.05** | **91.43** | **80.47** | **87.64** | **88.11** | + + +## Setup + +Clone this repository and install required packages: + +```shell +git clone https://github.com/baaivision/Emu +cd Emu/Emu2 + +pip install -r requirements.txt +``` + +## Model Weights + +| Model name | HF Weight | +| ------------------ | ------------------------------------------------------- | +| **Emu2** | [🤗 HF link](https://huggingface.co/BAAI/Emu2) | +| **Emu2-Chat** | [🤗 HF link](https://huggingface.co/BAAI/Emu2-Chat) | +| **Emu2-Gen** | [🤗 HF link](https://huggingface.co/BAAI/Emu2-Gen) | +- Model type: An auto-regressive multimodal model based on the transformer architecture. +- License: Non-commercial license +- Initialized from model: [LLaMA](https://github.com/facebookresearch/llama/blob/1076b9c51c77ad06e9d7ba8a4c6df775741732bd/LICENSE). + +### Native PyTorch Models +You can also access the native PyTorch models from [Link](https://model.baai.ac.cn/model-detail/220122) +- **Emu2** and **Emu2-Chat** can be loaded using `emu.chat.EmuChatGeneration` +- **Emu2-Gen** can be loaded using `emu.diffusion.EmuVisualGeneration` + + +## Demo +To facilitate the local usage, we provide the demo codes for both huggingface version and native PyTorch version models. 
+ +### Huggingface Version + +```bash +# Before using the HF demo tools, please download Emu2-Chat or Emu2-Gen from +# Emu2-Chat: https://huggingface.co/BAAI/Emu2-Chat +# Emu2-Gen: https://huggingface.co/BAAI/Emu2-Gen + +# launch the frontend +cd demo/frontend +python frontend.py + +# launch the backend +cd demo/backend/hf_model + +# huggingface version of demo supports both multi-GPU deployment and quantization +# launch both generation and chat demo +python backend.py --model-path ${PATH_TO_Emu2-Chat_and_Emu2-Gen} + +# launch chat only demo +python backend.py --disable-generate --model-path ${PATH_TO_Emu2-Chat} +# with N gpus +python backend.py --disable-generate --chat-gpu-per-instance ${N} --model-path ${PATH_TO_Emu2-Chat} +# quantize +python backend.py --disable-generate --chat-quantize --model-path ${PATH_TO_Emu2-Chat} + +# launch generation only demo +python backend.py --disable-chat --model-path ${PATH_TO_Emu2-Gen} +# with N gpus +python backend.py --disable-chat --generate-gpu-per-instance ${N} --model-path ${PATH_TO_Emu2-Gen} +# quantize +python backend.py --disable-chat --generate-quantize --model-path ${PATH_TO_Emu2-Gen} +``` + +### Native PyTorch Version +```bash +# Before using the native PyTorch demo tools, please download Emu2-Chat or Emu2-Gen from +# https://model.baai.ac.cn/model-detail/220122 + +# launch the frontend +cd demo/frontend +python frontend.py + +# launch the backend +cd demo/backend/pytorch_model + +# native PyTorch version of demo only supports a simple multi-GPU deployment strategy +# launch both generation and chat demo +MODEL_PATH="DIR_CONTAINS_Emu2-Gen_pytorch_model_AND_Emu2-Chat_pytorch_model" +python backend.py --model-path ${MODEL_PATH} + +# launch chat only demo +python backend.py --disable-generate --model-path ${DIR_CONTAINS_Emu2-Chat_pytorch_model} +# with N gpus +python backend.py --disable-generate --chat-gpu-per-instance ${N} --model-path ${DIR_CONTAINS_Emu2-Chat_pytorch_model} + +# launch generation only demo +python 
backend.py --disable-chat --model-path ${DIR_CONTAINS_Emu2-Gen_pytorch_model} +# with N gpus +python backend.py --disable-chat --generate-gpu-per-instance ${N} --model-path ${DIR_CONTAINS_Emu2-Gen_pytorch_model} +``` + +## Inference + +### Huggingface Version +#### Emu2 & Emu2-Chat +#### Single GPU + +```python +from PIL import Image +import requests +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + + +tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2") # "BAAI/Emu2-Chat" + +model = AutoModelForCausalLM.from_pretrained( + "BAAI/Emu2", # "BAAI/Emu2-Chat" + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + trust_remote_code=True).to('cuda').eval() + + +# `[]` is the image placeholder which will be replaced by image embeddings. +# the number of `[]` should be equal to the number of input images + +query = '[]Describe the image in details:' +#image = Image.open(requests.get('https://github.com/baaivision/Emu/Emu2/examples/blue_black_1_top_left.jpg?raw=true',stream=True).raw).convert('RGB') + +image = Image.open("./examples/blue_black_1_top_left.jpg").convert('RGB') + +inputs = model.build_input_ids( + text=[query], + tokenizer=tokenizer, + image=[image] +) + +with torch.no_grad(): + outputs = model.generate( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + image=inputs["image"].to(torch.bfloat16), + max_new_tokens=64, + length_penalty=-1) + +output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) +``` + +Interleaved image and text + +```python +from PIL import Image +import requests +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + + +tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2") # "BAAI/Emu2-Chat" + +model = AutoModelForCausalLM.from_pretrained( + "BAAI/Emu2", # "BAAI/Emu2-Chat" + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + trust_remote_code=True).to('cuda').eval() + +# `[]` is the image placeholder which will be replaced by image 
embeddings. +# the number of `[]` should be equal to the number of input images + +query = "[][red, white, 3, bottom left].[][yellow, white, 2, top left].[][green, black, 4, bottom right].[]" + +images = [ + Image.open("./examples/red_white_3_bottom_left.jpg").convert('RGB'), + Image.open("./examples/yellow_white_2_top_right.jpg").convert('RGB'), + Image.open("./examples/green_black_4_bottom_right.jpg").convert('RGB'), + Image.open("./examples/blue_black_1_top_left.jpg").convert('RGB') +] + +inputs = model.build_input_ids( + text=[query], + tokenizer=tokenizer, + image=images + +) + +with torch.no_grad(): + outputs = model.generate( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + image=inputs["image"].to(torch.bfloat16), + max_new_tokens=64, + length_penalty=-1) + +output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) +``` + +#### Multi GPU + + +```python +from PIL import Image +import requests +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch + +tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2") # "BAAI/Emu2-Chat" + +with init_empty_weights(): + model = AutoModelForCausalLM.from_pretrained( + "BAAI/Emu2", # "BAAI/Emu2-Chat" + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + trust_remote_code=True) + +device_map = infer_auto_device_map(model, max_memory={0:'38GiB',1:'38GiB',}, no_split_module_classes=['Block','LlamaDecoderLayer']) +# input and output logits should be on same device +device_map["model.decoder.lm.lm_head"] = 0 + +model = load_checkpoint_and_dispatch( + model, + 'local/path/to/hf/version/Emu2/model', + device_map=device_map).eval() + +# `[]` is the image placeholder which will be replaced by image embeddings. 
+# the number of `[]` should be equal to the number of input images + +query = '[]Describe the image in details:' +image = Image.open("./examples/blue_black_1_top_left.jpg").convert('RGB') + +inputs = model.build_input_ids( + text=[query], + tokenizer=tokenizer, + image=[image] + +) + +with torch.no_grad(): + outputs = model.generate( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + image=inputs["image"].to(torch.bfloat16), + max_new_tokens=64, + length_penalty=-1) + +output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) +``` + +Interleaved image and text + +```python +from PIL import Image +import requests +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch + +tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2") # "BAAI/Emu2-Chat" + +with init_empty_weights(): + model = AutoModelForCausalLM.from_pretrained( + "BAAI/Emu2", # "BAAI/Emu2-Chat" + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + trust_remote_code=True) + +device_map = infer_auto_device_map(model, max_memory={0:'38GiB',1:'38GiB',}, no_split_module_classes=['Block','LlamaDecoderLayer']) +# input and output logits should be on same device +device_map["model.decoder.lm.lm_head"] = 0 + +model = load_checkpoint_and_dispatch( + model, + 'local/path/to/hf/version/Emu2/model', + device_map=device_map).eval() + +# `[]` is the image placeholder which will be replaced by image embeddings. 
+# the number of `[]` should be equal to the number of input images +query = "[][red, white, 3, bottom left].[][yellow, white, 2, top left].[][green, black, 4, bottom right].[]" + +images = [ + Image.open("./examples/red_white_3_bottom_left.jpg").convert('RGB'), + Image.open("./examples/yellow_white_2_top_right.jpg").convert('RGB'), + Image.open("./examples/green_black_4_bottom_right.jpg").convert('RGB'), + Image.open("./examples/blue_black_1_top_left.jpg").convert('RGB') +] + +inputs = model.build_input_ids( + text=[query], + tokenizer=tokenizer, + image=images + +) + +with torch.no_grad(): + outputs = model.generate( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + image=inputs["image"].to(torch.bfloat16), + max_new_tokens=64, + length_penalty=-1) + +output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) +``` + +#### Quantization + +Check quantization guidance at [transformers](https://huggingface.co/docs/transformers/v4.28.0/main_classes/quantization) + + +```python +from PIL import Image +import requests +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + + +tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2") # "BAAI/Emu2-Chat" + +model = AutoModelForCausalLM.from_pretrained( + "BAAI/Emu2", # "BAAI/Emu2-Chat" + load_in_4bit=True, + trust_remote_code=True, + bnb_4bit_compute_dtype=torch.float16).eval() + +query = '[]Describe the image in details:' +image = Image.open("./examples/blue_black_1_top_left.jpg").convert('RGB') + +inputs = model.build_input_ids( + text=[query], + tokenizer=tokenizer, + image=[image] + +) + +with torch.no_grad(): + outputs = model.generate( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + image=inputs["image"].to(torch.float16), # should be torch.float16 + max_new_tokens=64, + length_penalty=-1) + +output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) +``` + +#### Emu2-Gen +```python +import cv2 +from diffusers 
import DiffusionPipeline +import numpy as np +from PIL import Image +from transformers import AutoModelForCausalLM, AutoTokenizer +import torch + +# For the first time of using, +# you need to download the huggingface repo "BAAI/Emu2-GEN" to local first +path = "path to local BAAI/Emu2-GEN" + +multimodal_encoder = AutoModelForCausalLM.from_pretrained( + f"{path}/multimodal_encoder", + trust_remote_code=True, + torch_dtype=torch.bfloat16, + use_safetensors=True, + variant="bf16" +) +tokenizer = AutoTokenizer.from_pretrained(f"{path}/tokenizer") + +pipe = DiffusionPipeline.from_pretrained( + path, + custom_pipeline="pipeline_emu2_gen", + torch_dtype=torch.bfloat16, + use_safetensors=True, + variant="bf16", + multimodal_encoder=multimodal_encoder, + tokenizer=tokenizer, +) + +# For the non-first time of using, you can init the pipeline directly +pipe = DiffusionPipeline.from_pretrained( + path, + custom_pipeline="pipeline_emu2_gen", + torch_dtype=torch.bfloat16, + use_safetensors=True, + variant="bf16", +) + +pipe.to("cuda") + +# text-to-image +prompt = "impressionist painting of an astronaut in a jungle" +ret = pipe(prompt) +ret.image.save("astronaut.png") + +# image editing +image = Image.open("./examples/dog.jpg").convert("RGB") +prompt = [image, "wearing a red hat on the beach."] +ret = pipe(prompt) +ret.image.save("dog_hat_beach.png") + +# grounding generation +def draw_box(left, top, right, bottom): + mask = np.zeros((448, 448, 3), dtype=np.uint8) + mask = cv2.rectangle(mask, (left, top), (right, bottom), (255, 255, 255), 3) + mask = Image.fromarray(mask) + return mask + +dog1 = Image.open("./examples/dog1.jpg").convert("RGB") +dog2 = Image.open("./examples/dog2.jpg").convert("RGB") +dog3 = Image.open("./examples/dog3.jpg").convert("RGB") +dog1_mask = draw_box( 22, 14, 224, 224) +dog2_mask = draw_box(224, 10, 448, 224) +dog3_mask = draw_box(120, 264, 320, 438) + +prompt = [ + "", + "An oil painting of three dogs,", + "the first dog" + "", + dog1_mask, + "", + 
dog1, + "the second dog" + "", + dog2_mask, + "", + dog2, + "the third dog" + "", + dog3_mask, + "", + dog3, +] +ret = pipe(prompt) +ret.image.save("three_dogs.png") + +# Autoencoding +# to enable the autoencoding mode, please pull the latest pipeline_emu2_gen.py first, +# and you can only input exactly one image as prompt +# if you want the model to generate an image, +# please input extra empty text "" besides the image, e.g. +# autoencoding mode: prompt = image or [image] +# generation mode: prompt = ["", image] or [image, ""] +prompt = Image.open("./examples/doodle.jpg").convert("RGB") +ret = pipe(prompt) +ret.image.save("doodle_ae.png") +``` + +### Native PyTorch Version +#### Emu2 & Emu2-Chat +```python +from emu.chat import EmuChatGeneration + +# Emu2 +pipe = EmuChatGeneration.from_pretrained( + "Path to Emu2_pytorch_model.bf16.bin", + dtype=torch.bfloat16, +) + +# Emu2-Chat +pipe = EmuChatGeneration.from_pretrained( + "Path to Emu2-Chat_pytorch_model.bf16.bin", + instruct=True, + dtype=torch.bfloat16, +) + +# Single GPU, e.g. cuda:0 +pipe = pipe.multito(["cuda:0"]) +# Multi GPU, e.g. cuda:0 and cuda:1 +pipe = pipe.multito(["cuda:0", "cuda:1"]) + +# In the context of chat, input must be List[List[str | Image.Image]] +# The length of outer list must be odd, which represents the ROLEs, like [USER, ASSISTANT, USER, ...], +# The first and the last ROLE must denote USER. 
+# The content in the innter list is the input/output of USER/ASSISTANT + +kwargs = { + "do_sample": False, + "max_new_tokens": 1024, + "temperature": 0.7, + "top_k": 3, + "top_p": 0.9, + "length_penalty": 2.0, + "num_beams": 5, + "repetition_penalty": 1.0, +} + +# image caption case +image = Image.open("./examples/blue_black_1_top_left.jpg").convert('RGB') +user_input = [image, "describe the image in details:"] +output = pipe([user_input], **kwargs) + +# video case +# suppose that you have already extracted the video frames +frames: List[Image.Image] = [] +user_input = ["[VIDEO]"] + frames + ["[/VIDEO]", "What happened in the video?"] +output = pipe([user_input], **kwargs) + +# Emu2-Chat also supports generation under the grounding senario +# to enable the grounding generation, please pass +# is_grounding=True and skip_special_tokens=False +# to the generation function +# NOTE: Our grounding coordinates range from 0 to 224 +image = Image.open("./examples/squirrel.jpeg").convert("RGB") +user_input = [image, "Where is the squirrel?"] +output = pipe([user_input], is_grounding=True, skip_special_tokens=False, **kwargs) + +# In the context of non-chat, input must be List[str | Image.Image] +# which represents the input content + +# interleaved case +img1 = Image.open("./examples/red_white_3_bottom_left.jpg").convert('RGB') +img2 = Image.open("./examples/yellow_white_2_top_right.jpg").convert('RGB') +img3 = Image.open("./examples/green_black_4_bottom_right.jpg").convert('RGB') +img4 = Image.open("./examples/blue_black_1_top_left.jpg").convert('RGB') + +input = [ + img1, "[red, white, 3, bottom left].", + img2, "[yellow, white, 2, top left].", + img3, "[green, black, 4, bottom right].", + img4, +] +output = pipe(input, max_new_tokens=15) +``` + +#### Emu2-Gen + +```python +from emu.diffusion import EmuVisualGeneration + +pipe = EmuVisualGeneration.from_pretrained( + "Path to Emu2-Gen_pytorch_model.bf16.safetensors", + dtype=torch.bfloat16, + use_safetensors=True, +) + +# 
Single GPU, e.g. cuda:0 +pipe = pipe.multito(["cuda:0"]) +# Multi GPU, e.g. cuda:0 and cuda:1 +pipe = pipe.multito(["cuda:0", "cuda:1"]) + +# text-to-image +prompt = "impressionist painting of an astronaut in a jungle" +ret = pipe(prompt) +ret.image.save("astronaut.png") + +# image editing +image = Image.open("./examples/dog.jpg").convert("RGB") +prompt = [image, "wearing a red hat on the beach."] +ret = pipe(prompt) +ret.image.save("dog_hat_beach.png") + +# grounding generation +def draw_box(left, top, right, bottom): + mask = np.zeros((448, 448, 3), dtype=np.uint8) + mask = cv2.rectangle(mask, (left, top), (right, bottom), (255, 255, 255), 3) + mask = Image.fromarray(mask) + return mask + +dog1 = Image.open("./examples/dog1.jpg").convert("RGB") +dog2 = Image.open("./examples/dog2.jpg").convert("RGB") +dog3 = Image.open("./examples/dog3.jpg").convert("RGB") +dog1_mask = draw_box( 22, 14, 224, 224) +dog2_mask = draw_box(224, 10, 448, 224) +dog3_mask = draw_box(120, 264, 320, 438) + +prompt = [ + "", + "An oil painting of three dogs,", + "the first dog" + "", + dog1_mask, + "", + dog1, + "the second dog" + "", + dog2_mask, + "", + dog2, + "the third dog" + "", + dog3_mask, + "", + dog3, +] +ret = pipe(prompt) +ret.image.save("three_dogs.png") + +# Autoencoding +# to enable the autoencoding mode, you can only input exactly one image as prompt +# if you want the model to generate an image, +# please input extra empty text "" besides the image, e.g. +# autoencoding mode: prompt = image or [image] +# generation mode: prompt = ["", image] or [image, ""] +prompt = Image.open("./examples/doodle.jpg").convert("RGB") +ret = pipe(prompt) +ret.image.save("doodle_ae.png") +``` + +## Acknowledgement + +We thank the great work from [LLaMA](https://github.com/facebookresearch/llama), [BLIP-2](https://github.com/salesforce/LAVIS), [Stable Diffusion](https://github.com/CompVis/stable-diffusion), and [FastChat](https://github.com/lm-sys/FastChat). 
+ +## Citation + +If you find Emu useful for your research and applications, please consider starring this repository and citing: + +``` +@article{Emu2, + title={Generative Multimodal Models are In-Context Learners}, + author={Quan Sun and Yufeng Cui and Xiaosong Zhang and Fan Zhang and Qiying Yu and Zhengxiong Luo and Yueze Wang and Yongming Rao and Jingjing Liu and Tiejun Huang and Xinlong Wang}, + publisher={arXiv preprint arXiv:2312.13286}, + year={2023}, +} +``` diff --git a/Emu/Emu2/emu/.gitignore b/Emu/Emu2/emu/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..94ee09e7dd9ff54f381e3b047d3b91d25d7834f1 --- /dev/null +++ b/Emu/Emu2/emu/.gitignore @@ -0,0 +1,10 @@ +*.swp +**/__pycache__/** +checkpoints/ + +OUTPUT/* +OUTPUT + +wandb/ +logs/ +tools/ diff --git a/Emu/Emu2/emu/emu.py b/Emu/Emu2/emu/emu.py new file mode 100644 index 0000000000000000000000000000000000000000..47659bb43db217986be0c7aa28896232b3e6e224 --- /dev/null +++ b/Emu/Emu2/emu/emu.py @@ -0,0 +1,235 @@ +from functools import partial +from typing import Any, List, Optional, Mapping +from collections import OrderedDict + +import torch +from torch import nn +import torch.nn.functional as F + +from transformers.generation.configuration_utils import GenerationConfig +GENERATION_CONFIG = GenerationConfig(bos_token_id=1, eos_token_id=2, pad_token_id=32000) + +from .conf.emu_conf import CLIPVisionCfg, TextDecoderCfg + +from .constants import * +from .eva_vit import EVAVisionTransformer +from .lm import EmuForClsAndRegression + + +class EmuModel(nn.Module): + + def __init__( + self, + vision_cfg: CLIPVisionCfg = CLIPVisionCfg(), + text_decoder_cfg: TextDecoderCfg = TextDecoderCfg(), + ): + super().__init__() + + self.visual = EVAVisionTransformer( + img_size=vision_cfg.image_size, + patch_size=vision_cfg.patch_size, + embed_dim=vision_cfg.width, + depth=vision_cfg.layers, + num_heads=vision_cfg.width // vision_cfg.head_width, + mlp_ratio=vision_cfg.mlp_ratio, + 
qkv_bias=vision_cfg.qkv_bias, + drop_path_rate=vision_cfg.drop_path_rate, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + init_values=vision_cfg.init_value, + patch_dropout=vision_cfg.patch_dropout, + rope=vision_cfg.rope, + use_mean_pooling=vision_cfg.global_average_pool, + xattn=vision_cfg.xattn, + postnorm=vision_cfg.postnorm, + pt_hw_seq_len=vision_cfg.pt_hw_seq_len, + intp_freq=vision_cfg.intp_freq, + naiveswiglu=vision_cfg.naiveswiglu, + subln=vision_cfg.subln, + ) + + self.decoder = EmuForClsAndRegression(args=text_decoder_cfg) + + # EVA to LM: 1792 -> 6656 + self.project_up = nn.Linear(vision_cfg.width, self.decoder.lm.config.hidden_size, bias=False) + # LM to EVA: 6656 -> 1792 + self.project_down = nn.Linear(self.decoder.lm.config.hidden_size, vision_cfg.width, bias=False) + + # EmuModel is for inference only, so set padding and truncation to left + self.decoder.tokenizer.truncation_side = self.decoder.tokenizer.padding_side = "left" + + self.n_query = vision_cfg.n_query + self.v_query = vision_cfg.v_query + self.image_placeholder = DEFAULT_IMG_TOKEN + DEFAULT_IMAGE_TOKEN * self.n_query + DEFAULT_IMG_END_TOKEN + + # temporarily borrow [gIMG] as the video frame feature placeholder. 
+ self.video_placeholder = DEFAULT_IMG_TOKEN + DEFAULT_gIMG_TOKEN * self.v_query + DEFAULT_IMG_END_TOKEN + + def device(self, module=None): + if module is None: + return next(self.parameters()).device + return next(module.parameters()).device + + def dtype(self, module=None): + if module is None: + return next(self.parameters()).dtype + return next(module.parameters()).dtype + + @torch.no_grad() + def encode_image(self, image: torch.Tensor, *, n_query=None): + n_query = n_query if n_query is not None else self.n_query + + image_embeds = self.visual(image) + image_embeds = image_embeds[:, 1:, :] + b, n, c = image_embeds.shape + sqrt_n = int(n**0.5) + image_embeds = image_embeds.permute(0, 2, 1).view(b, c, sqrt_n, sqrt_n) + + stride = int(sqrt_n // (n_query ** 0.5)) + image_embeds = F.avg_pool2d(image_embeds, kernel_size=(stride, stride), stride=stride) + image_embeds = image_embeds.view(b, c, -1).permute(0, 2, 1).contiguous() + return image_embeds + + @torch.no_grad() + def generate_image( + self, + text: List[str], + image: Optional[torch.Tensor] = None, + placeholder: str = DEFAULT_IMG_PLACEHOLDER, + ): + IMAGE, BOI = self.decoder.tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_TOKEN, DEFAULT_IMG_TOKEN]) + if image is not None: + prompt_image_embeds = self.encode_image(image) + _, _, c = prompt_image_embeds.shape + prompt_image_embeds = prompt_image_embeds.view(-1, c) + prompt_image_embeds = self.project_up(prompt_image_embeds) + + text = [t.replace(placeholder, self.image_placeholder) for t in text] + + target_image_embeds = None + for num_img_token in range(self.n_query): + if num_img_token == 0: + text = [f"{t}{DEFAULT_IMG_TOKEN}" for t in text] + else: + text = [f"{t}{DEFAULT_IMAGE_TOKEN}" for t in text] + + inputs = self.decoder.tokenizer(text, padding="longest", return_tensors="pt") + + device = self.device(self.decoder.lm.model.embed_tokens) + input_ids = inputs.input_ids.to(device) # B x N + text_embeds = self.decoder.lm.model.embed_tokens(input_ids) + + 
attention_mask = inputs.attention_mask.to(text_embeds.device) + + image_idx = (input_ids == IMAGE) + cumsum_idx = torch.flip(torch.cumsum(torch.flip(image_idx, dims=[1]), dim=1), dims=[1]) + if image is not None: + prompt_idx = torch.logical_and(image_idx, cumsum_idx > num_img_token) + text_embeds[prompt_idx] = prompt_image_embeds.to(text_embeds.device) + + if target_image_embeds is not None: + target_idx = torch.logical_and(image_idx, torch.logical_and(cumsum_idx > 0, cumsum_idx <= num_img_token)) + text_embeds[target_idx] = self.project_up(target_image_embeds).to(text_embeds.device) + + outputs = self.decoder.lm.model( + inputs_embeds=text_embeds, + attention_mask=attention_mask, + output_hidden_states=True, + return_dict=True, + ) + + image_idx = (input_ids == IMAGE) + (input_ids == BOI) + cumsum_idx = torch.flip(torch.cumsum(torch.flip(image_idx, dims=[1]), dim=1), dims=[1]) + target_idx = torch.logical_and(image_idx, torch.logical_and(cumsum_idx > 0, cumsum_idx <= num_img_token+1)) + + hidden_states = outputs.hidden_states[-1] + target_image_embeds = hidden_states[target_idx] + target_image_embeds = target_image_embeds.view(-1, target_image_embeds.shape[-1]) + target_image_embeds = self.project_down(target_image_embeds) + + _, C = target_image_embeds.shape + B = hidden_states.shape[0] + target_image_embeds = target_image_embeds.view(B, -1, C) + + return target_image_embeds + + @torch.no_grad() + def generate( + self, + text: List[str], + image: Optional[torch.Tensor] = None, + video: Optional[torch.Tensor] = None, + image_placeholder: str = DEFAULT_IMG_PLACEHOLDER, + video_placeholder: str = DEFAULT_VID_PLACEHOLDER, + num_beams=5, + max_new_tokens=10, + min_len=1, + do_sample=False, + penalty_alpha=None, + top_p=None, + top_k=None, + temperature=None, + length_penalty=-1, + repetition_penalty=1.0, + synced_gpus=False, + skip_special_tokens=True, + **kwargs + ): + + GENERATION_CONFIG.pad_token_id = self.decoder.tokenizer.pad_token_id + 
GENERATION_CONFIG.bos_token_id = self.decoder.tokenizer.bos_token_id + GENERATION_CONFIG.eos_token_id = self.decoder.tokenizer.eos_token_id + + IMAGE, VIDEO = self.decoder.tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_TOKEN, DEFAULT_gIMG_TOKEN]) + + text = [ + t.replace(image_placeholder, self.image_placeholder).replace(video_placeholder, self.video_placeholder) + for t in text + ] + + inputs = self.decoder.tokenizer(text, padding="longest", return_tensors="pt") + + device = self.device(self.decoder.lm.model.embed_tokens) + input_ids = inputs.input_ids.to(device) # B x N + text_embeds = self.decoder.lm.model.embed_tokens(input_ids) + + attention_mask = inputs.attention_mask.to(text_embeds.device) + + if image is not None: + prompt_image_embeds = self.encode_image(image, n_query=self.n_query) + _, _, c = prompt_image_embeds.shape + prompt_image_embeds = prompt_image_embeds.view(-1, c) + prompt_image_embeds = self.project_up(prompt_image_embeds) + image_idx = (input_ids == IMAGE) + text_embeds[image_idx] = prompt_image_embeds.to(text_embeds.device) + + if video is not None: + prompt_video_embeds = self.encode_image(video, n_query=self.v_query) + _, _, c = prompt_video_embeds.shape + prompt_video_embeds = prompt_video_embeds.view(-1, c) + prompt_video_embeds = self.project_up(prompt_video_embeds) + video_idx = (input_ids == VIDEO) + text_embeds[video_idx] = prompt_video_embeds.to(text_embeds.device) + + outputs = self.decoder.lm.generate( + generation_config=GENERATION_CONFIG, + inputs_embeds=text_embeds, + attention_mask=attention_mask, + do_sample=do_sample, + num_beams=num_beams, + max_new_tokens=max_new_tokens, + min_length=min_len, + length_penalty=length_penalty, + repetition_penalty=repetition_penalty, + penalty_alpha=penalty_alpha, + top_k=top_k, + top_p=top_p, + temperature=temperature, + synced_gpus=synced_gpus or hasattr(next(self.parameters()), 'ds_tensor'), + **kwargs, + ) + + output_text = self.decoder.tokenizer.batch_decode( + outputs, 
skip_special_tokens=skip_special_tokens, + ) + + return output_text diff --git a/Emu/Emu2/emu/eva_vit.py b/Emu/Emu2/emu/eva_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..2729708764d8c5a9808a41d66060b918fe2e6389 --- /dev/null +++ b/Emu/Emu2/emu/eva_vit.py @@ -0,0 +1,445 @@ +# -------------------------------------------------------- +# Adapted from https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- + +import os +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.checkpoint import checkpoint + +try: + from timm.models.layers import drop_path, to_2tuple +except: + from timm.layers import drop_path, to_2tuple + +try: + import xformers.ops as xops +except ImportError: + xops = None + print("Please 'pip install xformers'") + + +class PatchDropout(nn.Module): + """ + https://arxiv.org/abs/2212.00794 + """ + + def __init__(self, prob, exclude_first_token=True): + super().__init__() + assert 0 <= prob < 1. 
class PatchDropout(nn.Module):
    """Randomly drop a fraction of patch tokens during training.

    https://arxiv.org/abs/2212.00794
    """

    def __init__(self, prob, exclude_first_token=True):
        super().__init__()
        assert 0 <= prob < 1.
        self.prob = prob
        self.exclude_first_token = exclude_first_token  # keep the CLS token out of the lottery
        print(f"os.getenv('RoPE')={os.getenv('RoPE')}")

    def forward(self, x):
        # Identity at eval time or when dropout is disabled.
        if not self.training or self.prob == 0.:
            return x

        if self.exclude_first_token:
            cls_tokens, x = x[:, :1], x[:, 1:]
        else:
            cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])

        batch = x.size()[0]
        num_tokens = x.size()[1]
        batch_indices = torch.arange(batch)[..., None]

        # Keep a random subset of tokens (at least one) per sample.
        num_patches_keep = max(1, int(num_tokens * (1 - self.prob)))
        scores = torch.randn(batch, num_tokens)
        patch_indices_keep = scores.topk(num_patches_keep, dim=-1).indices
        x = x[batch_indices, patch_indices_keep]

        if self.exclude_first_token:
            x = torch.cat((cls_tokens, x), dim=1)

        # RoPE needs the kept indices so it can rotate the surviving positions.
        if self.training and os.getenv('RoPE') == '1':
            return x, patch_indices_keep

        return x


class DropPath(nn.Module):
    """Per-sample stochastic depth: drop whole residual branches (main-path only)."""

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return f'p={self.drop_prob}'


class Mlp(nn.Module):
    """Transformer feed-forward block: Linear -> act -> (optional sub-LN) -> Linear -> dropout."""

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        drop=0.,
        subln=False,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features

        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        # Sub-LayerNorm (EVA-02 style); identity when disabled.
        self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        # No dropout after the activation (matches the original BERT-style port).
        hidden = self.act(self.fc1(x))
        hidden = self.ffn_ln(hidden)
        return self.drop(self.fc2(hidden))


class SwiGLU(nn.Module):
    """SwiGLU feed-forward: (SiLU(w1 x) * w2 x) -> (optional sub-LN) -> w3 -> dropout."""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0.,
                 norm_layer=nn.LayerNorm, subln=False):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features

        self.w1 = nn.Linear(in_features, hidden_features)
        self.w2 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity()
        self.w3 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        gated = self.act(self.w1(x)) * self.w2(x)
        return self.drop(self.w3(self.ffn_ln(gated)))
class Attention(nn.Module):
    """Multi-head self-attention used by the EVA ViT blocks.

    Checkpoint-compatible variants: separate q/k/v projections when ``subln`` is
    on (k deliberately carries no bias), a fused qkv projection otherwise, an
    optional rotary embedding (``rope``) applied to every token except the
    leading CLS token, and an optional xformers memory-efficient kernel
    (``xattn``).
    """

    def __init__(
            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
            proj_drop=0., window_size=None, attn_head_dim=None, xattn=False, rope=None, subln=False, norm_layer=nn.LayerNorm):
        super().__init__()
        self.num_heads = num_heads
        head_dim = attn_head_dim if attn_head_dim is not None else dim // num_heads
        all_head_dim = head_dim * self.num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.subln = subln
        if self.subln:
            self.q_proj = nn.Linear(dim, all_head_dim, bias=False)
            self.k_proj = nn.Linear(dim, all_head_dim, bias=False)
            self.v_proj = nn.Linear(dim, all_head_dim, bias=False)
        else:
            self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)

        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.v_bias = None

        # Relative position bias is disabled in this port regardless of window_size.
        self.window_size = None
        self.relative_position_bias_table = None
        self.relative_position_index = None

        self.attn_drop = nn.Dropout(attn_drop)
        self.inner_attn_ln = norm_layer(all_head_dim) if subln else nn.Identity()
        self.proj = nn.Linear(all_head_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.xattn = xattn
        self.xattn_drop = attn_drop
        self.rope = rope

    def forward(self, x, rel_pos_bias=None, attn_mask=None):
        B, N, C = x.shape

        if self.subln:
            q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias)
            k = F.linear(input=x, weight=self.k_proj.weight, bias=None)
            v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias)
            # -> B, num_heads, N, head_dim
            q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
            k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
            v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
        else:
            qkv_bias = None
            if self.q_bias is not None:
                # k keeps a frozen zero bias so the fused projection stays bias-free for k.
                qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
            qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
            qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)  # 3, B, num_heads, N, C
            q, k, v = qkv[0], qkv[1], qkv[2]

        if self.rope:
            # Rotate every token except the CLS token at position 0.
            q = torch.cat((q[:, :, :1, :], self.rope(q[:, :, 1:, :])), -2).type_as(v)
            k = torch.cat((k[:, :, :1, :], self.rope(k[:, :, 1:, :])), -2).type_as(v)

        if self.xattn:
            # xformers expects B, N, num_heads, head_dim.
            q = q.permute(0, 2, 1, 3)
            k = k.permute(0, 2, 1, 3)
            v = v.permute(0, 2, 1, 3)
            out = xops.memory_efficient_attention(
                q, k, v,
                p=self.xattn_drop,
                scale=self.scale,
            )
            out = out.reshape(B, N, -1)
        else:
            attn = (q * self.scale) @ k.transpose(-2, -1)

            if self.relative_position_bias_table is not None:
                span = self.window_size[0] * self.window_size[1] + 1
                relative_position_bias = self.relative_position_bias_table[
                    self.relative_position_index.view(-1)].view(span, span, -1)  # Wh*Ww,Wh*Ww,nH
                relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
                attn = attn + relative_position_bias.unsqueeze(0).type_as(attn)

            if rel_pos_bias is not None:
                attn = attn + rel_pos_bias.type_as(attn)

            if attn_mask is not None:
                attn = attn.masked_fill(~attn_mask.bool()[:, None, None, :], float("-inf"))

            attn = self.attn_drop(attn.softmax(dim=-1))
            out = (attn @ v).transpose(1, 2).reshape(B, N, -1)

        out = self.inner_attn_ln(out)
        out = self.proj(out)
        return self.proj_drop(out)
class Block(nn.Module):
    """EVA transformer block: attention + FFN residual branches.

    Supports pre-/post-norm placement (``postnorm``), LayerScale (``init_values``),
    SwiGLU vs plain MLP (``naiveswiglu``), and stochastic depth (``drop_path``).
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                 window_size=None, attn_head_dim=None, xattn=False, rope=None, postnorm=False,
                 subln=False, naiveswiglu=False):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
            attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim,
            xattn=xattn, rope=rope, subln=subln, norm_layer=norm_layer)
        # Stochastic depth on the residual branches; identity when disabled.
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)

        if naiveswiglu:
            self.mlp = SwiGLU(
                in_features=dim,
                hidden_features=mlp_hidden_dim,
                subln=subln,
                norm_layer=norm_layer,
            )
        else:
            self.mlp = Mlp(
                in_features=dim,
                hidden_features=mlp_hidden_dim,
                act_layer=act_layer,
                subln=subln,
                drop=drop,
            )

        # LayerScale gammas; only created when init_values > 0.
        if init_values is not None and init_values > 0:
            self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
            self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
        else:
            self.gamma_1, self.gamma_2 = None, None

        self.postnorm = postnorm

    def forward(self, x, rel_pos_bias=None, attn_mask=None):
        def attn_branch(t):
            return self.attn(t, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)

        if self.gamma_1 is None:
            if self.postnorm:
                x = x + self.drop_path(self.norm1(attn_branch(x)))
                x = x + self.drop_path(self.norm2(self.mlp(x)))
            else:
                x = x + self.drop_path(attn_branch(self.norm1(x)))
                x = x + self.drop_path(self.mlp(self.norm2(x)))
        else:
            if self.postnorm:
                x = x + self.drop_path(self.gamma_1 * self.norm1(attn_branch(x)))
                x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x)))
            else:
                x = x + self.drop_path(self.gamma_1 * attn_branch(self.norm1(x)))
                x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
        return x


class PatchEmbed(nn.Module):
    """Image-to-patch embedding via a strided convolution (B,C,H,W -> B,N,C)."""

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x, **kwargs):
        B, C, H, W = x.shape
        # FIXME look at relaxing size constraints
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        return self.proj(x).flatten(2).transpose(1, 2)
class EVAVisionTransformer(nn.Module):
    """EVA Vision Transformer: patch embedding + CLS token + transformer blocks.

    Stripped-down port: relative position bias and RoPE are instantiated as
    ``None`` here (their constructor flags are accepted but unused), and
    ``forward`` returns the raw, unpooled token features.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
                 drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None, patch_dropout=0.,
                 use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, rope=False,
                 use_mean_pooling=True, init_scale=0.001, grad_checkpointing=False, xattn=False, postnorm=False,
                 pt_hw_seq_len=16, intp_freq=False, naiveswiglu=False, subln=False,
                 ):
        super().__init__()
        self.image_size = img_size
        # num_features kept for consistency with other timm-style models.
        self.num_features = self.embed_dim = embed_dim

        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        if use_abs_pos_emb:
            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        else:
            self.pos_embed = None
        self.pos_drop = nn.Dropout(p=drop_rate)

        # Disabled in this port regardless of the constructor flags.
        self.rel_pos_bias = None
        self.rope = None

        self.naiveswiglu = naiveswiglu

        # Stochastic depth decay rule: drop prob increases linearly per block.
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
        self.use_rel_pos_bias = use_rel_pos_bias
        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
                init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None,
                xattn=xattn, rope=self.rope, postnorm=postnorm, subln=subln, naiveswiglu=naiveswiglu)
            for i in range(depth)])

        # patch_dropout == 0. disables token dropout (identity).
        self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. else nn.Identity()

        self.grad_checkpointing = grad_checkpointing

    def get_num_layers(self):
        """Number of transformer blocks."""
        return len(self.blocks)

    def lock(self, unlocked_groups=0, freeze_bn_stats=False):
        """Freeze all parameters (partial unlocking unsupported)."""
        assert unlocked_groups == 0, 'partial locking not currently supported for this model'
        for param in self.parameters():
            param.requires_grad = False

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        self.grad_checkpointing = enable

    @torch.jit.ignore
    def no_weight_decay(self):
        # Positional/CLS parameters are excluded from weight decay.
        return {'pos_embed', 'cls_token'}

    def forward_features(self, x):
        """Embed patches, prepend CLS, add positions, run the blocks."""
        x = self.patch_embed(x)
        batch_size, seq_len, _ = x.size()

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
        x = torch.cat((cls_tokens, x), dim=1)
        if self.pos_embed is not None:
            x = x + self.pos_embed
        x = self.pos_drop(x)

        if os.getenv('RoPE') == '1':
            # NOTE(review): self.rope is always None in this port, so this branch
            # would fail if the RoPE env flag is set — confirm before enabling.
            if self.training and not isinstance(self.patch_dropout, nn.Identity):
                x, patch_indices_keep = self.patch_dropout(x)
                self.rope.forward = partial(self.rope.forward, patch_indices_keep=patch_indices_keep)
            else:
                self.rope.forward = partial(self.rope.forward, patch_indices_keep=None)
                x = self.patch_dropout(x)
        else:
            x = self.patch_dropout(x)

        rel_pos_bias = None

        for blk in self.blocks:
            if self.grad_checkpointing:
                # Fix: pass rel_pos_bias itself. The original passed the tuple
                # (rel_pos_bias,), which checkpoint forwards positionally so
                # Block.forward received a tuple and would crash on `.type_as`
                # inside Attention whenever grad checkpointing was enabled.
                x = checkpoint(blk, x, rel_pos_bias)
            else:
                x = blk(x, rel_pos_bias=rel_pos_bias)

        return x

    def forward(self, x):
        """Return the raw (unpooled, unnormalized) ViT token features, [B, n_patch+1, C]."""
        features = self.forward_features(x)
        return features


def add_location_symbols(quantized_size=256, locate_special_token=2, flag_rec_symbol=True):
    """Build the list of extra special tokens used for grounding/location.

    Returns the grounding/region marker symbols followed by ``quantized_size + 1``
    discrete location-bin tokens (Kosmos-2 style ``<patch_index_XXXX>``).
    """
    custom_sp_symbols = []

    if locate_special_token > 0:
        custom_sp_symbols.append(GRD_SYMBOL)

    for symbol in [BOP_SYMBOL, EOP_SYMBOL, BOO_SYMBOL, EOO_SYMBOL, DOM_SYMBOL]:
        custom_sp_symbols.append(symbol)

    if flag_rec_symbol:
        custom_sp_symbols.append(REC_SYMBOL)

    # Fix: the token literal was garbled to f"" (angle-bracketed text stripped);
    # restore the Kosmos-2-style patch-index token name.
    for i in range(quantized_size + 1):
        custom_sp_symbols.append(f"<patch_index_{str(i).zfill(4)}>")
    return custom_sp_symbols
class EmuForClsAndRegression(nn.Module):
    """LLaMA-based decoder wrapper that registers Emu's multimodal special tokens.

    Builds an (untrained) ``LlamaForCausalLM`` from a config file, extends the
    tokenizer with the image/video/grounding special tokens, resizes the
    embedding table accordingly, and caches the token ids the rest of the model
    looks up at runtime.
    """

    def __init__(self, args):
        super(EmuForClsAndRegression, self).__init__()
        self.args = args

        # Instantiate an empty LM from config only (weights are loaded elsewhere).
        llama_config = LlamaConfig.from_pretrained(args.llama_config_path)
        self.lm = transformers.LlamaForCausalLM(config=llama_config)

        self.tokenizer = transformers.LlamaTokenizer.from_pretrained(args.llama_config_path)

        special_tokens_list = [
            DEFAULT_IMG_TOKEN,
            DEFAULT_IMG_END_TOKEN,
            DEFAULT_IMAGE_TOKEN,
            DEFAULT_gIMG_TOKEN,
            DEFAULT_gIMG_END_TOKEN,
            DEFAULT_EOC_TOKEN,
            DEFAULT_VIDEO_TOKEN,
        ] + add_location_symbols()
        if args.instruct:
            special_tokens_list += [USER_TOKEN, ASSISTANT_TOKEN]

        self.num_new_tokens = self.tokenizer.add_special_tokens(dict(
            pad_token=DEFAULT_PAD_TOKEN,
            bos_token=DEFAULT_BOS_TOKEN,
            eos_token=DEFAULT_EOS_TOKEN,
            additional_special_tokens=special_tokens_list,
        ))
        self.lm.resize_token_embeddings(len(self.tokenizer))
        # Point the embedding padding index at the (possibly newly added) pad token.
        self.lm.model.embed_tokens.padding_idx = self.tokenizer.pad_token_id

        self.config = self.lm.config
        self.lm.config.d_model = self.lm.config.hidden_size

        # Cache the ids of the special tokens used downstream.
        self.image_token_id = self._token_id(DEFAULT_IMAGE_TOKEN)
        self.img_token_id = self._token_id(DEFAULT_IMG_TOKEN)
        self.img_end_token_id = self._token_id(DEFAULT_IMG_END_TOKEN)
        self.gimg_token_id = self._token_id(DEFAULT_gIMG_TOKEN)
        self.gimg_end_token_id = self._token_id(DEFAULT_gIMG_END_TOKEN)
        self.eoc_token_id = self._token_id(DEFAULT_EOC_TOKEN)
        self.grounding_token_id = self._token_id(GRD_SYMBOL)
        self.rec_token_id = self._token_id(REC_SYMBOL)

        self.user_token_id = self.assistant_token_id = None
        if args.instruct:
            self.user_token_id = self._token_id(USER_TOKEN)
            self.assistant_token_id = self._token_id(ASSISTANT_TOKEN)

        print(f"Vocab Size: {len(self.tokenizer)}")
        print(f"The Special Tokens: {self.tokenizer.special_tokens_map}")
        print(f"bos_token_id: {self.tokenizer.bos_token_id}")
        print(f"eos_token_id: {self.tokenizer.eos_token_id}")
        print(f"pad_token_id: {self.tokenizer.pad_token_id}")
        print(f"{DEFAULT_IMAGE_TOKEN} token id: {self.image_token_id}")
        print(f"{DEFAULT_IMG_TOKEN} token id: {self.img_token_id}")
        print(f"{DEFAULT_IMG_END_TOKEN} token id: {self.img_end_token_id}")
        print(f"{DEFAULT_gIMG_TOKEN} token id: {self.gimg_token_id}")
        print(f"{DEFAULT_gIMG_END_TOKEN} token id: {self.gimg_end_token_id}")
        print(f"{DEFAULT_EOC_TOKEN} token id: {self.eoc_token_id}")
        print(f"{GRD_SYMBOL} token id: {self.grounding_token_id}")
        print(f"{REC_SYMBOL} token id: {self.rec_token_id}")

        if args.instruct:
            print(f"{USER_TOKEN} token id: {self.user_token_id}")
            print(f"{ASSISTANT_TOKEN} token id: {self.assistant_token_id}")

    def _token_id(self, token):
        # Single-token convenience lookup.
        return self.tokenizer.convert_tokens_to_ids([token])[0]

    def get_num_layers(self):
        """Number of decoder layers in the wrapped LLaMA model."""
        return len(self.lm.model.layers)