Ilia Tambovtsev
feat: update eda
d570714
import logging
from collections import Counter
from math import ceil
from pathlib import Path
from typing import Dict, List, Optional, Set

import fitz  # PyMuPDF
import pandas as pd

from src.chains import PresentationAnalysis
from src.config.navigator import Navigator
class PresentationMetrics:
    """Compute per-page metrics for a PDF presentation opened with PyMuPDF.

    The document is opened in ``__init__`` and stays open for the lifetime of
    the instance. Release it deterministically with :meth:`close` or by using
    the instance as a context manager; ``__del__`` is kept as a fallback for
    callers that rely on garbage collection.
    """

    def __init__(self, pdf_path: Path):
        """Store the path and open the document.

        Args:
            pdf_path: Location of the PDF file to analyse.
        """
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)
        self._closed = False

    def __enter__(self) -> "PresentationMetrics":
        """Return the instance itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the document on leaving the ``with`` block."""
        self.close()

    def close(self) -> None:
        """Close the underlying document; repeated calls are no-ops."""
        doc = getattr(self, "doc", None)
        # ``doc`` is missing when ``__init__`` failed before opening the file.
        if doc is None or getattr(self, "_closed", False):
            return
        # Flip the flag first so a failing close() is never retried.
        self._closed = True
        doc.close()

    def get_page_metrics(self, page_num: int) -> Dict:
        """Get metrics for a specific page.

        Args:
            page_num: Index of the page inside the document.

        Returns:
            Dictionary with ``image_count`` (number of entries returned by
            ``page.get_images()``), ``n_words`` (whitespace-separated tokens
            of the page text) and ``size`` as a ``(width, height)`` tuple.
        """
        page = self.doc[page_num]
        return dict(
            image_count=len(page.get_images()),
            # split() without arguments already ignores surrounding whitespace.
            n_words=len(page.get_text().split()),
            size=(page.rect.width, page.rect.height),
        )

    def get_all_metrics(self) -> List[Dict]:
        """Calculate metrics for all pages in the presentation.

        Returns:
            One dictionary per page: the output of :meth:`get_page_metrics`
            extended with ``page_num`` and ``pdf_path`` (as a string).
        """
        pdf_path = str(self.pdf_path)
        return [
            {**self.get_page_metrics(page_num), "page_num": page_num, "pdf_path": pdf_path}
            for page_num in range(len(self.doc))
        ]

    def __del__(self):
        """Fallback cleanup for instances that were never closed explicitly."""
        self.close()
def parse_pdf_directory(
root_dir: str | Path,
topic_first: bool = True,
include_datasets: Optional[Set[str]] = None,
exclude_datasets: Optional[Set[str]] = None,
) -> pd.DataFrame:
"""Your existing parse_pdf_directory function with metrics integration."""
if include_datasets and exclude_datasets:
raise ValueError("Cannot specify both include_datasets and exclude_datasets")
pdf_files: List[Dict] = []
root = Path(root_dir)
for path in root.rglob("*.pdf"):
rel_path = path.relative_to(root)
parts = list(rel_path.parts)
# Get dataset name for filtering (either first or second part depending on topic_first)
dataset_name = parts[1] if topic_first else parts[0]
# Apply dataset filters
if include_datasets and dataset_name not in include_datasets:
continue
if exclude_datasets and set(parts).intersection(set(exclude_datasets)):
continue
# Initialize empty dict for file info
pdf_info = dict(filename=parts.pop(), relative_path=str(rel_path))
if topic_first:
pdf_info["topic"] = parts.pop(0)
pdf_info["dataset"] = parts.pop(0)
pdf_info["nav"] = "/".join(parts) if parts else ""
try:
metrics = PresentationMetrics(path)
all_metrics = metrics.get_all_metrics()
# Calculate aggregated metrics
pdf_info["num_pages"] = len(all_metrics)
pdf_info["total_images"] = sum(m["image_count"] for m in all_metrics)
pdf_info["total_n_words"] = sum(m["n_words"] for m in all_metrics)
# Get page sizes
page_sizes = [(m["size"][0], m["size"][1]) for m in all_metrics]
common_size = Counter(page_sizes).most_common(1)[0][0]
pdf_info["page_width"] = common_size[0]
pdf_info["page_height"] = common_size[1]
# Handle varying sizes
unique_sizes = set(page_sizes)
pdf_info["varying_sizes"] = str(unique_sizes) if len(unique_sizes) > 1 else ""
except Exception as e:
pdf_info.update(dict(
num_pages=0,
total_images=0,
total_text_length=0,
page_width=0,
page_height=0,
varying_sizes=""
))
pdf_files.append(pdf_info)
return pd.DataFrame(pdf_files)
def get_pres_analysis_df(base: Optional[Path] = None) -> pd.DataFrame:
    """Collect the slide analyses stored as JSON files into one DataFrame.

    Every ``*.json`` file below ``base`` is loaded with
    ``PresentationAnalysis.load`` and contributes one row per slide.

    Args:
        base: Directory searched recursively for JSON files. When omitted,
            ``Navigator().interim`` is used; it is resolved at call time so
            that importing this module does not instantiate ``Navigator``.

    Returns:
        DataFrame with one row per slide holding the presentation path and
        title, the page number, the raw LLM output, the parsed description
        fields and the completion / prompt token counts. Empty when no JSON
        file is found.
    """
    if base is None:
        base = Navigator().interim

    descriptions: List[Dict] = []
    for json_path in base.rglob("*.json"):
        pres = PresentationAnalysis.load(json_path)
        for slide in pres.slides:
            parsed = slide.parsed_output
            general = parsed.general_description
            token_usage = slide.response_metadata["token_usage"]
            descriptions.append(
                dict(
                    pres_path=slide.pdf_path,
                    pres_title=pres.name,
                    page=slide.page_num,
                    # Unparsed text
                    llm_output=slide.llm_output,
                    # Parsed texts
                    text_content=parsed.text_content,
                    visual_content=parsed.visual_content,
                    topic_overview=general.topic_overview,
                    conclusions_and_insights=general.conclusions_and_insights,
                    layout_and_composition=general.layout_and_composition,
                    # Tokens
                    completion_tokens=token_usage["completion_tokens"],
                    prompt_tokens=token_usage["prompt_tokens"],
                )
            )
    return pd.DataFrame(descriptions)
def calculate_image_tokens(width: int, height: int):
    """Estimate the token cost of an image sent to a GPT-4 vision model.

    The image is first shrunk to fit a 2048 x 2048 box, then shrunk again so
    that its shorter side is at most 768, and finally charged 85 base tokens
    plus 170 tokens for every 512 x 512 tile needed to cover it.

    Source: this openai thread:
    https://community.openai.com/t/how-do-i-calculate-image-tokens-in-gpt4-vision/492318/6

    Args:
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        Total number of tokens for the image.
    """
    # Step 1: fit the longer side into 2048 while keeping the aspect ratio.
    if max(width, height) > 2048:
        ratio = width / height
        if ratio > 1:
            width, height = 2048, int(2048 / ratio)
        else:
            width, height = int(2048 * ratio), 2048

    # Step 2: bring the shorter side down to 768 when it exceeds that.
    is_wide_or_square = width >= height
    if is_wide_or_square and height > 768:
        width, height = int((768 / height) * width), 768
    elif not is_wide_or_square and width > 768:
        width, height = 768, int((768 / width) * height)

    # Step 3: count the 512-pixel tiles along each axis.
    n_tiles = ceil(width / 512) * ceil(height / 512)
    return 85 + 170 * n_tiles
def tokens2price(tokens: int, cost_per_1k_tokens: float = 0.00015):
    """Convert a token count into a price.

    Token prices: https://openai.com/api/pricing/

    Args:
        tokens: Number of tokens consumed.
        cost_per_1k_tokens: Price charged for one thousand tokens.

    Returns:
        Price of ``tokens`` tokens, in the currency of ``cost_per_1k_tokens``.
    """
    thousands_of_tokens = tokens / 1000
    return thousands_of_tokens * cost_per_1k_tokens