import json
import os
import random
import shutil
import tempfile
from collections import defaultdict
from glob import glob
from typing import Literal
import func_argparse
import pytorch_fid.fid_score as fid
import torch
from jinja2 import Template
from pytorch_fid.fid_score import compute_statistics_of_path
from rich import print
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import llms
from presentation import Picture, Presentation, SlidePage
from utils import Config, pexists, pjoin
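# silence pytorch_fid's internal progress bar; tqdm is applied at the presentation level instead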
fid.tqdm = lambda x: x
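# (language model, vision model, label) triples used as judges; eval_experiment uses judges[0]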
judges = [
(llms.gpt4o, llms.gpt4o, "gpt4o"),
(llms.qwen2_5, llms.intern_vl, "qwen+intern"),
(llms.qwen2_5, llms.qwen_vl, "Qwen"),
(llms.qwen_vl, llms.qwen_vl, "qwen_vl"),
(llms.intern_vl, llms.intern_vl, "intern_vl"),
]
DEVICES = torch.cuda.device_count()
def get_ppl(slide: SlidePage, model: GPT2LMHeadModel, tokenizer: GPT2TokenizerFast):
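    """Score the perplexity of a slide's text under GPT-2.

    Returns a single-element list containing the perplexity, or an
    empty list if the slide has no text.
    """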
ppl = []
text = slide.to_text()
if len(text) == 0:
return ppl
tokenized = tokenizer(text, return_tensors="pt").to(model.device)
with torch.no_grad():
outputs = model(tokenized.input_ids, labels=tokenized.input_ids)
loss = outputs.loss
perplexity = torch.exp(loss)
ppl.append(perplexity.item())
return ppl
def eval_general(presentations: list[Presentation], evals: dict[str, dict[str, int]]):
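    """Collect page, character, and figure counts for each presentation."""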
for prs in presentations:
if prs.source_file in evals["pages"]:
continue
evals["pages"][prs.source_file] = len(prs)
evals["characters"][prs.source_file] = sum(
[len(slide.to_text()) for slide in prs.slides]
)
evals["figures"][prs.source_file] = sum(
[len(list(slide.shape_filter(Picture))) for slide in prs.slides]
)
def eval_feature(
    presentations: list[Presentation],
    evals: dict,
    setting: str,
    fid_eval: bool = True,
):
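    """Score model-based features: mean GPT-2 perplexity per presentation,
    and, when fid_eval is True, FID between the rendered result slides and
    the source slides of each template folder."""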
    # pick a random GPU to spread load across devices; fall back to CPU if none
    device = f"cuda:{random.randint(0, DEVICES - 1)}" if DEVICES > 0 else "cpu"
print("start scoring ppl")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
for prs in tqdm(presentations):
try:
if prs.source_file in evals["ppl"]:
continue
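            # skip a known problematic presentation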
if (
prs.source_file
== "data/culture/pptx/ChemBio-in-the-HUB-public/PPTCrew_wo_SchemaInduction/SSRN-id2933553_Management of Systems Engineering and Technical Assistance of DARPA Research Programs/final.pptx"
):
continue
ppl = []
for slide in prs.slides:
ppl.extend(get_ppl(slide, model, tokenizer))
if len(ppl) == 0:
continue
evals["ppl"][prs.source_file] = sum(ppl) / len(ppl)
except Exception as e:
            print(e, "\n", "happened in", prs.source_file)
    if not fid_eval:
        return
    model = fid.InceptionV3([fid.InceptionV3.BLOCK_INDEX_BY_DIM[64]]).to(device)
for ppt_folder in tqdm(sorted(glob(f"data/*/pptx/*/"))):
if ppt_folder in evals["fid"]:
continue
source_folder = pjoin(ppt_folder, "source_slides")
m1, s1 = compute_statistics_of_path(source_folder, model, 128, 64, device)
try:
with tempfile.TemporaryDirectory(prefix="ppteval_fid_") as temp_dir:
for result_folder in glob(
pjoin(ppt_folder, f"final_images/{setting}/*")
):
folder_base = os.path.basename(result_folder)
for image_file in os.listdir(result_folder):
image_path = os.path.join(result_folder, image_file)
temp_image_path = os.path.join(
temp_dir, folder_base + "_" + image_file
).replace(" ", "_")
shutil.copyfile(image_path, temp_image_path)
if len(os.listdir(temp_dir)) < 10:
continue
m2, s2 = compute_statistics_of_path(temp_dir, model, 32, 64, device)
evals["fid"][ppt_folder] = fid.calculate_frechet_distance(
m1, s1, m2, s2
)
except Exception as e:
            print(e, "\n", "happened in", ppt_folder, "on:", setting)
def merge_evals(folders: list[str], evals: dict):
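    """Merge each folder's evals.json into evals, in place, for the
    content, vision, and logic dimensions."""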
for folder in folders:
sub_eval = json.load(open(pjoin(folder, "evals.json")))
for dimension in ["content", "vision", "logic"]:
evals[dimension] |= sub_eval[dimension]
return evals
def slide_score(slide_folder: str):
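    """Describe and score every rendered slide image in slide_folder on
    the content and vision dimensions, caching results in evals.json."""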
eval_file = pjoin(slide_folder, "evals.json")
evals = defaultdict(dict)
if pexists(eval_file):
evals |= json.load(open(eval_file))
text_scorer = Template(open("prompts/ppteval_content.txt", "r").read())
vision_scorer = Template(open("prompts/ppteval_style.txt", "r").read())
style_descriptor = open("prompts/ppteval_describe_style.txt", "r").read()
content_descriptor = open("prompts/ppteval_describe_content.txt", "r").read()
for slide_image in glob(pjoin(slide_folder, "slide_*.jpg")):
slide_descr = slide_image.replace(".jpg", ".json")
if not os.path.exists(slide_descr):
style_descr = llms.vision_model(style_descriptor, slide_image)
content_descr = llms.vision_model(content_descriptor, slide_image)
json.dump(
{"content": content_descr, "style": style_descr},
open(slide_descr, "w"),
indent=4,
)
else:
descr = json.load(open(slide_descr))
style_descr = descr["style"]
content_descr = descr["content"]
if slide_image not in evals["vision"]:
evals["vision"][slide_image] = llms.language_model(
vision_scorer.render(descr=style_descr), return_json=True
)
if slide_image not in evals["content"]:
evals["content"][slide_image] = llms.language_model(
text_scorer.render(descr=content_descr), return_json=True
)
def pres_score(prs_source: str):
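    """Score a presentation on the logic (coherence) dimension and
    persist the result next to its rendered slide images."""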
if "/pptx/" in prs_source: # ours
source, setting, pdf, _ = prs_source.rsplit("/", 3)
slide_folder = os.path.join(source, "final_images", setting, pdf)
else: # baseline
slide_folder = os.path.dirname(prs_source)
eval_file = pjoin(slide_folder, "evals.json")
evals = defaultdict(dict)
    if pexists(eval_file):
        try:
            evals |= json.load(open(eval_file))
        except Exception:
            pass
    evals.pop("logic", None)  # drop any cached logic score so it is re-scored
    slide_descr = pjoin(slide_folder, "extracted.json")
    # load the presentation up front; its source_file is needed below
    # regardless of whether the extraction is cached
    config = Config("/tmp")
    presentation = Presentation.from_file(prs_source, config)
    if not pexists(slide_descr):
        ppt_extractor = Template(open("prompts/ppteval_extract.txt", "r").read())
        extracted = llms.language_model(
            ppt_extractor.render(presentation=presentation.to_text()),
            return_json=True,
        )
        json.dump(extracted, open(slide_descr, "w"), indent=4)
    else:
        extracted = json.load(open(slide_descr))
if presentation.source_file not in evals["logic"]:
        logic_scorer = Template(open("prompts/ppteval_coherence.txt", "r").read())
evals["logic"][presentation.source_file] = llms.language_model(
logic_scorer.render(
background_information=extracted.pop("metadata"),
logical_structure=extracted,
),
return_json=True,
)
json.dump(evals, open(eval_file, "w"), indent=4)
# ppt eval
def eval_experiment(
setting: str,
general_eval: bool = False,
feature_eval: bool = False,
ppt_eval: bool = False,
):
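    """Run the selected evaluations for one experiment setting."""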
    assert setting != "*", "setting must name a concrete experiment"
llms.language_model, llms.vision_model, judge_name = judges[0]
print(f"evaluating {setting} under {judge_name}")
print(
"eval config :",
f"general_eval: {general_eval}, feature_eval: {feature_eval}, ppt_eval: {ppt_eval}",
)
eval_file = f"data/evals/{setting}_{judge_name}.json"
eval_stats = defaultdict(dict)
if pexists(eval_file):
eval_stats |= json.load(open(eval_file))
config = Config("/tmp")
prs_files = glob(f"data/*/pptx/*/{setting}/*/final.pptx")
    # eval_stats maps dimension -> {filename: score}
print("start evaluation")
if general_eval or feature_eval:
presentations = [Presentation.from_file(i, config) for i in prs_files]
if general_eval:
eval_general(presentations, eval_stats)
if feature_eval:
eval_feature(presentations, eval_stats, setting)
    if ppt_eval:
        slide_image_folders = glob(f"data/*/pptx/*/final_images/{setting}/*")
        # score individual slides first so merge_evals finds the content
        # and vision dimensions in each folder's evals.json
        for slide_folder in slide_image_folders:
            slide_score(slide_folder)
        for presentation in prs_files:
            pres_score(presentation)
        eval_stats = merge_evals(slide_image_folders, eval_stats)
json.dump(eval_stats, open(eval_file, "w"), indent=4)
def eval_baseline(
setting: str,
model: Literal["Qwen2.5", "gpt-4o"],
general_eval: bool = False,
feature_eval: bool = False,
ppt_eval: bool = False,
):
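    """Run the selected evaluations for a baseline model's outputs."""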
evals = defaultdict(dict)
prs_files = glob(f"data/*/pdf/*/{setting}/{model}/final.pptx")
slide_folders = [os.path.dirname(i) for i in prs_files]
if general_eval or feature_eval:
config = Config("/tmp")
presentations = [Presentation.from_file(i, config) for i in prs_files]
if general_eval:
eval_general(presentations, evals)
if feature_eval:
eval_feature(presentations, evals, setting, fid_eval=False)
if ppt_eval:
for slide_folder in slide_folders:
slide_score(slide_folder)
for presentation in prs_files:
pres_score(presentation)
merge_evals(slide_folders, evals)
json.dump(evals, open(f"data/evals/{setting}_{model}.json", "w"), indent=4)
if __name__ == "__main__":
func_argparse.main(
eval_experiment,
eval_baseline,
pres_score,
slide_score,
)