# Spaces: jbilcke-hf / SNIPED_grasp-any-region
# Running on Zero
# File size: 7,078 Bytes
# 46861c5

import argparse
import json
import os
import re
import time

import openai
from tqdm import tqdm

# Pause, in seconds, that get_eval sleeps after a failed request before retrying.
NUM_SECONDS_TO_SLEEP = 0.5
# Divisors used by postprocess_answer for "ground_conv" boxes: elements 0 and 2
# of each box are divided by VOCAB_IMAGE_W, elements 1 and 3 by VOCAB_IMAGE_H.
VOCAB_IMAGE_W = 1000
VOCAB_IMAGE_H = 1000

# Define Azure OpenAI details
model_name = "gpt-4o-2024-11-20"
# NOTE(review): this module-level value is not referenced by the code visible
# here — get_eval takes its own max_tokens argument and the CLI supplies
# --max-tokens (default 1024). Confirm whether it is still needed.
max_tokens = 1000  # range: [1, 4095]

# Initialize the Azure client
# The endpoint and key are read from the environment when this module is
# imported; os.getenv returns None for a variable that is not set.
client = openai.AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version="2024-03-01-preview",
)


def get_eval(content: str, max_tokens: int):
    """Ask the judge model to review ``content`` and return its reply text.

    The request is retried indefinitely: after any failure the function
    sleeps ``NUM_SECONDS_TO_SLEEP`` seconds and tries again. Rate-limit
    errors are retried silently; every other exception is printed first.

    Args:
        content: The full user prompt sent to the model.
        max_tokens: Upper bound on the number of tokens in the reply.

    Returns:
        The ``message.content`` of the first choice of the completion.
    """
    while True:
        try:
            completion = client.chat.completions.create(
                model=model_name,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful and precise assistant for checking the quality of the answer.",
                    },
                    {
                        "role": "user",
                        "content": content,
                    },
                ],
                max_tokens=max_tokens,
                temperature=0,
            )
            return completion.choices[0].message.content
        # The client above is ``openai.AzureOpenAI``, which only exists in
        # openai>=1.0. That version exposes the exception at the package top
        # level; the legacy ``openai.error`` namespace is gone, so referring
        # to it here would raise AttributeError while handling any failure.
        except openai.RateLimitError:
            pass
        except Exception as e:
            print(e)
        time.sleep(NUM_SECONDS_TO_SLEEP)


def postprocess_answer(answer, category, image_w=None, image_h=None):
    """Clean up bracketed spans in a predicted answer according to its category.

    * ``"refer_desc"`` / ``"refer_reason"``: every ``[...]`` span that is
      preceded by a space is removed together with that space.
    * ``"ground_conv"``: every ``[...]`` span is parsed as four
      comma-separated numbers; elements 0 and 2 are divided by the image
      width and elements 1 and 3 by the image height, and the span is
      rewritten with three decimals per value. Spans that do not contain
      exactly four numbers are removed from the answer.
    * any other category: the answer is returned unchanged.

    Args:
        answer: The answer text to process.
        category: The question category selecting the rule above.
        image_w: Width divisor; defaults to ``VOCAB_IMAGE_W`` when ``None``.
        image_h: Height divisor; defaults to ``VOCAB_IMAGE_H`` when ``None``.

    Returns:
        The processed answer string.
    """
    bracket_pattern = r"\[.*?\]"

    if category in ("refer_desc", "refer_reason"):
        for match in re.findall(bracket_pattern, answer):
            answer = answer.replace(" " + match, "")
        return answer

    if category != "ground_conv":
        return answer

    width = VOCAB_IMAGE_W if image_w is None else image_w
    height = VOCAB_IMAGE_H if image_h is None else image_h

    def _normalize_box(found):
        """Return the rescaled box text, or "" when the span is not a box."""
        raw = found.group(0).replace("[", "").replace("]", "")
        try:
            coords = [float(part.strip()) for part in raw.split(",")]
        except ValueError:
            # Non-numeric content such as "[left]" is not a coordinate list.
            coords = []
        if len(coords) != 4:
            print("Found a exception when parsing coordinates")
            return ""
        return (
            f"[{coords[0] / width:.3f}, {coords[1] / height:.3f}, "
            f"{coords[2] / width:.3f}, {coords[3] / height:.3f}]"
        )

    # A single substitution pass rewrites each span exactly once, so text
    # produced by one conversion can never be picked up and converted again.
    return re.sub(bracket_pattern, _normalize_box, answer)


def parse_score(review):
    """Extract the two leading scores from the first line of a review.

    The first line is expected to hold two numbers separated by spaces
    and/or commas (for example ``"8 9"``, ``"8,9"`` or ``"8, 9"``). Only the
    first two tokens are used.

    Args:
        review: The review text returned by the judge model.

    Returns:
        ``[score1, score2]`` as floats, or ``[-1, -1]`` when the first line
        cannot be parsed.
    """
    try:
        score_pair = review.split("\n")[0]
        # Split on runs of whitespace so that ", " does not leave an empty
        # token between the two scores.
        sp = score_pair.replace(",", " ").split()
        print("score:", sp)
        return [float(sp[0]), float(sp[1])]
    except (AttributeError, IndexError, TypeError, ValueError) as e:
        print(e)
        print("error", review)
        return [-1, -1]


if __name__ == "__main__":
    # Script entry point. Steps:
    #   1. Convert --source-file (a JSON list of items with "image_path" and
    #      "caption") into a JSON-lines answer file at answer_list[1].
    #   2. Walk the question file and both answer files in lockstep, build a
    #      judging prompt per triple and ask the model for a review.
    #   3. Append each review to --output; rows already present there are
    #      skipped, so an interrupted run can be resumed.
    parser = argparse.ArgumentParser(description="ChatGPT-based QA evaluation.")
    parser.add_argument("-q", "--question")
    parser.add_argument("-c", "--context")
    parser.add_argument("-a", "--answer-list", nargs="+", default=[])
    parser.add_argument("-r", "--rule")
    parser.add_argument("-o", "--output")
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=1024,
        help="maximum number of tokens produced in the output",
    )
    parser.add_argument("--source-file", type=str, required=True)
    args = parser.parse_args()

    if len(args.answer_list) < 2:
        parser.error("--answer-list needs two paths: reference answers and converted predictions")

    question_path = os.path.expanduser(args.question)
    answer1_path = os.path.expanduser(args.answer_list[0])
    target_path = os.path.expanduser(args.answer_list[1])
    # Expand once so the existence check and the append below agree on the path.
    output_path = os.path.expanduser(args.output)

    # convert first
    with open(os.path.expanduser(args.source_file), "r") as file:
        source_file = json.load(file)
    with open(target_path, "w") as file:
        for idx, item in enumerate(source_file):
            info = {
                "question_id": idx,
                "image": item["image_path"].split("/")[-1],
                "category": "refer_desc",
                "text": item["caption"],
            }
            json.dump(info, file, ensure_ascii=False)
            file.write("\n")

    with open(os.path.expanduser(args.rule), "r") as file:
        rule_dict = json.load(file)

    # Reviews written by a previous run; their count decides what to skip.
    if os.path.isfile(output_path):
        with open(output_path) as file:
            cur_reviews = [json.loads(line) for line in file]
    else:
        cur_reviews = []

    with open(os.path.expanduser(args.context)) as file:
        context_list = [json.loads(line) for line in file]
    image_to_context = {context["image"]: context for context in context_list}

    with open(question_path) as f_q, \
            open(answer1_path) as f_ans1, \
            open(target_path) as f_ans2, \
            open(output_path, "a") as review_file:
        for idx, (ques_js, ans1_js, ans2_js) in enumerate(tqdm(zip(f_q, f_ans1, f_ans2))):
            ques = json.loads(ques_js)
            ans1 = json.loads(ans1_js)
            ans2 = json.loads(ans2_js)

            inst = image_to_context[ques["image"]]

            category = ques["category"]
            if category not in rule_dict:
                raise KeyError(f"Visual QA category not found in rule file: {category}.")
            rule = rule_dict[category]

            # Assume ans2 is the predicted one.
            ans2["text"] = postprocess_answer(ans2["text"], category)

            prompt = rule["prompt"]
            role = rule["role"]
            content = (
                f'[Context]\n{inst["text"]}\n\n'
                f'[Question]\n{ques["text"]}\n\n'
                f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
                f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
                f"[System]\n{prompt}\n\n"
            )
            cur_js = {
                "id": idx + 1,
                "question_id": ques["question_id"],
                "answer1_id": ans1.get("answer_id", ans1["question_id"]),
                "answer2_id": ans2.get("answer_id", ans2["question_id"]),
                "category": category,
            }
            if idx >= len(cur_reviews):
                review = get_eval(content, args.max_tokens)
                scores = parse_score(review)
                cur_js["content"] = review
                cur_js["tuple"] = scores
                cur_js["answer1"] = ans1["text"]
                cur_js["answer2"] = ans2["text"]
                review_file.write(json.dumps(cur_js) + "\n")
                # Flush per row so a crash loses at most the in-flight review.
                review_file.flush()
            else:
                print(f"Skipping {idx} as we already have it.")
            print(idx + 1)