"""Helpers for collecting PDF presentation metrics and estimating token costs."""

import logging
from collections import Counter
from math import ceil
from pathlib import Path
from typing import Dict, List, Optional, Set

import fitz  # PyMuPDF
import pandas as pd

from src.chains import PresentationAnalysis
from src.config.navigator import Navigator

logger = logging.getLogger(__name__)


class PresentationMetrics:
    """Compute per-page metrics for a PDF presentation.

    The PDF is opened in the constructor and stays open until ``close()`` is
    called, the instance is used as a context manager, or it is garbage
    collected.
    """

    def __init__(self, pdf_path: Path):
        """Open the PDF located at ``pdf_path``."""
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)
        # Set only after a successful open so close() is a no-op otherwise.
        self._closed = False

    def __enter__(self) -> "PresentationMetrics":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying document; safe to call more than once."""
        # The default of True covers instances whose __init__ failed before
        # the document was opened (no ``_closed`` / ``doc`` attributes yet).
        if getattr(self, "_closed", True):
            return
        self._closed = True
        self.doc.close()

    def get_page_metrics(self, page_num: int) -> Dict:
        """Get metrics for a specific page.

        Args:
            page_num: Index of the page inside the opened document.

        Returns:
            Dictionary with ``image_count`` (number of entries returned by
            ``page.get_images()``), ``n_words`` (whitespace-separated tokens
            of the extracted text) and ``size`` as ``(width, height)``.
        """
        page = self.doc[page_num]
        return dict(
            image_count=len(page.get_images()),
            n_words=len(page.get_text().strip().split()),
            size=(page.rect.width, page.rect.height),
        )

    def get_all_metrics(self) -> List[Dict]:
        """Calculate metrics for all pages in the presentation.

        Returns:
            One dictionary per page: the page metrics extended with
            ``page_num`` and ``pdf_path`` (as a string).
        """
        pdf_path = str(self.pdf_path)
        return [
            {
                **self.get_page_metrics(page_num),
                "page_num": page_num,
                "pdf_path": pdf_path,
            }
            for page_num in range(len(self.doc))
        ]

    def __del__(self):
        """Ensure the document is closed when the instance is collected."""
        self.close()


# Values recorded for a PDF whose metrics could not be computed. The keys
# mirror the ones produced by ``_aggregate_pdf_metrics`` so that every row of
# the resulting DataFrame shares the same columns.
_FAILED_PDF_METRICS = dict(
    num_pages=0,
    total_images=0,
    total_n_words=0,
    page_width=0,
    page_height=0,
    varying_sizes="",
)


def _aggregate_pdf_metrics(path: Path) -> Dict:
    """Open the PDF at ``path`` and return its document-level metrics."""
    with PresentationMetrics(path) as metrics:
        all_metrics = metrics.get_all_metrics()

    page_sizes = [m["size"] for m in all_metrics]
    if page_sizes:
        # The most frequent page size represents the whole document.
        page_width, page_height = Counter(page_sizes).most_common(1)[0][0]
    else:
        page_width, page_height = 0, 0

    unique_sizes = set(page_sizes)
    return dict(
        num_pages=len(all_metrics),
        total_images=sum(m["image_count"] for m in all_metrics),
        total_n_words=sum(m["n_words"] for m in all_metrics),
        page_width=page_width,
        page_height=page_height,
        # Only filled in when the document mixes several page sizes.
        varying_sizes=str(unique_sizes) if len(unique_sizes) > 1 else "",
    )


def parse_pdf_directory(
    root_dir: str | Path,
    topic_first: bool = True,
    include_datasets: Optional[Set[str]] = None,
    exclude_datasets: Optional[Set[str]] = None,
) -> pd.DataFrame:
    """Recursively scan ``root_dir`` for PDFs and tabulate their metrics.

    The path of each PDF relative to ``root_dir`` is interpreted as
    ``topic/dataset/<nav...>/file.pdf`` when ``topic_first`` is true and as
    ``dataset/<nav...>/file.pdf`` otherwise. Components missing from a
    shallow path are recorded as empty strings.

    Args:
        root_dir: Directory to search recursively for ``*.pdf`` files.
        topic_first: Whether the first path component is the topic (the
            dataset then being the second one).
        include_datasets: If given, keep only PDFs whose dataset is listed.
        exclude_datasets: If given, drop PDFs having any relative-path
            component (file name included) listed here.

    Returns:
        DataFrame with one row per PDF: ``filename``, ``relative_path``,
        ``topic`` (only when ``topic_first``), ``dataset``, ``nav``,
        ``num_pages``, ``total_images``, ``total_n_words``, ``page_width``,
        ``page_height`` and ``varying_sizes``. PDFs that cannot be processed
        get zero/empty metrics and a warning is logged.

    Raises:
        ValueError: If both ``include_datasets`` and ``exclude_datasets``
            are specified.
    """
    if include_datasets and exclude_datasets:
        raise ValueError("Cannot specify both include_datasets and exclude_datasets")

    root = Path(root_dir)
    excluded = set(exclude_datasets) if exclude_datasets else set()
    dataset_idx = 1 if topic_first else 0

    pdf_files: List[Dict] = []
    for path in root.rglob("*.pdf"):
        rel_path = path.relative_to(root)
        *dirs, filename = rel_path.parts

        # Dataset name used for filtering; empty when the path is too shallow.
        dataset_name = dirs[dataset_idx] if len(dirs) > dataset_idx else ""

        if include_datasets and dataset_name not in include_datasets:
            continue
        if excluded.intersection(rel_path.parts):
            continue

        pdf_info = dict(filename=filename, relative_path=str(rel_path))
        if topic_first:
            pdf_info["topic"] = dirs.pop(0) if dirs else ""
        # NOTE(review): "dataset" is filled for both layouts, matching how
        # dataset_name is derived above - confirm against the intended layout.
        pdf_info["dataset"] = dirs.pop(0) if dirs else ""
        pdf_info["nav"] = "/".join(dirs)

        try:
            pdf_info.update(_aggregate_pdf_metrics(path))
        except Exception:
            # Best effort: one unreadable PDF must not abort the whole scan.
            logger.warning("Could not compute metrics for %s", path, exc_info=True)
            pdf_info.update(_FAILED_PDF_METRICS)

        pdf_files.append(pdf_info)

    return pd.DataFrame(pdf_files)


def get_pres_analysis_df(base: Optional[Path] = None) -> pd.DataFrame:
    """Collect per-slide analysis results from JSON files into a DataFrame.

    Args:
        base: Directory searched recursively for ``*.json`` files. Defaults
            to ``Navigator().interim``, resolved when the function is called
            rather than when the module is imported.

    Returns:
        DataFrame with one row per slide holding the presentation path and
        title, page number, raw LLM output, parsed text fields and the
        completion/prompt token counts.
    """
    if base is None:
        base = Navigator().interim

    descriptions: List[Dict] = []
    for json_path in base.rglob("*.json"):
        pres = PresentationAnalysis.load(json_path)
        for slide in pres.slides:
            parsed = slide.parsed_output
            general = parsed.general_description
            token_usage = slide.response_metadata["token_usage"]
            descriptions.append(
                dict(
                    pres_path=slide.pdf_path,
                    pres_title=pres.name,
                    page=slide.page_num,
                    # Unparsed text
                    llm_output=slide.llm_output,
                    # Parsed texts
                    text_content=parsed.text_content,
                    visual_content=parsed.visual_content,
                    topic_overview=general.topic_overview,
                    conclusions_and_insights=general.conclusions_and_insights,
                    layout_and_composition=general.layout_and_composition,
                    # Tokens
                    completion_tokens=token_usage["completion_tokens"],
                    prompt_tokens=token_usage["prompt_tokens"],
                )
            )

    return pd.DataFrame(descriptions)


def calculate_image_tokens(width: float, height: float) -> int:
    """Estimate the number of tokens an image of the given size consumes.

    The image is first scaled down to fit a 2048 x 2048 square, then scaled
    down so that its shorter side is at most 768, and finally split into
    512 x 512 tiles: 85 base tokens plus 170 tokens per tile.

    Source: this openai thread:
    https://community.openai.com/t/how-do-i-calculate-image-tokens-in-gpt4-vision/492318/6

    Args:
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        Estimated token count; only the 85 base tokens when either
        dimension is not positive.
    """
    base_tokens, tokens_per_tile = 85, 170

    # A degenerate image has no tiles; this also avoids dividing by zero
    # below and negative tile counts for negative sizes.
    if width <= 0 or height <= 0:
        return base_tokens

    # Step 1: fit inside 2048 x 2048 while keeping the aspect ratio.
    if width > 2048 or height > 2048:
        aspect_ratio = width / height
        if aspect_ratio > 1:
            width, height = 2048, int(2048 / aspect_ratio)
        else:
            width, height = int(2048 * aspect_ratio), 2048

    # Step 2: shrink so that the shorter side is at most 768.
    if width >= height and height > 768:
        width, height = int((768 / height) * width), 768
    elif height > width and width > 768:
        width, height = 768, int((768 / width) * height)

    # Step 3: count the 512 x 512 tiles needed to cover the image.
    tiles_width = ceil(width / 512)
    tiles_height = ceil(height / 512)
    return base_tokens + tokens_per_tile * (tiles_width * tiles_height)


def tokens2price(tokens: int, cost_per_1k_tokens: float = 0.00015) -> float:
    """Convert a token count into a price.

    Token prices: https://openai.com/api/pricing/

    Args:
        tokens: Number of tokens.
        cost_per_1k_tokens: Price charged per 1000 tokens.

    Returns:
        ``tokens / 1000 * cost_per_1k_tokens``.
    """
    return tokens / 1000 * cost_per_1k_tokens