"""Utilities for extracting per-page metrics from PDF presentations and
assembling slide-analysis records into pandas DataFrames."""
| from collections import Counter | |
| from math import ceil | |
| from pathlib import Path | |
| from typing import Dict, List, Optional, Set | |
| import fitz # PyMuPDF | |
| import pandas as pd | |
| from src.chains import PresentationAnalysis | |
| from src.config.navigator import Navigator | |
class PresentationMetrics:
    """Compute per-page metrics (image count, word count, page size) for a PDF.

    The document is opened eagerly in ``__init__``. Prefer deterministic
    cleanup via the context-manager protocol::

        with PresentationMetrics(path) as pm:
            rows = pm.get_all_metrics()

    ``__del__`` remains as a best-effort fallback for callers that never
    close explicitly.
    """

    def __init__(self, pdf_path: Path):
        """Open the PDF at *pdf_path* for metric extraction."""
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)

    def __enter__(self) -> "PresentationMetrics":
        """Enter a ``with`` block; returns self."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the document when leaving the ``with`` block."""
        self.close()

    def close(self) -> None:
        """Close the underlying document. Safe to call multiple times."""
        doc = getattr(self, "doc", None)
        if doc is not None:
            try:
                doc.close()
            except Exception:
                # Already closed, or interpreter is shutting down — ignore.
                pass

    def get_page_metrics(self, page_num: int) -> Dict:
        """
        Get comprehensive metrics for a specific page.

        Args:
            page_num: Zero-based page index.

        Returns:
            Dictionary with ``image_count`` (embedded images on the page),
            ``n_words`` (whitespace-separated tokens of the extracted text)
            and ``size`` ((width, height) of the page rectangle).
        """
        page = self.doc[page_num]
        return dict(
            image_count=len(page.get_images()),
            n_words=len(page.get_text().strip().split()),
            size=(page.rect.width, page.rect.height),
        )

    def get_all_metrics(self) -> List[Dict]:
        """
        Calculate metrics for all pages in the presentation.

        Returns:
            List of per-page metric dictionaries, each annotated with
            ``page_num`` and the source ``pdf_path``.
        """
        metrics = []
        for page_num in range(len(self.doc)):
            page_metrics = self.get_page_metrics(page_num)
            page_metrics.update(dict(
                page_num=page_num,
                pdf_path=str(self.pdf_path),
            ))
            metrics.append(page_metrics)
        return metrics

    def __del__(self):
        """Best-effort closure if the caller never closed the document."""
        # Guarded close: the unguarded original raised if the document was
        # already closed or during interpreter teardown.
        self.close()
| def parse_pdf_directory( | |
| root_dir: str | Path, | |
| topic_first: bool = True, | |
| include_datasets: Optional[Set[str]] = None, | |
| exclude_datasets: Optional[Set[str]] = None, | |
| ) -> pd.DataFrame: | |
| """Your existing parse_pdf_directory function with metrics integration.""" | |
| if include_datasets and exclude_datasets: | |
| raise ValueError("Cannot specify both include_datasets and exclude_datasets") | |
| pdf_files: List[Dict] = [] | |
| root = Path(root_dir) | |
| for path in root.rglob("*.pdf"): | |
| rel_path = path.relative_to(root) | |
| parts = list(rel_path.parts) | |
| # Get dataset name for filtering (either first or second part depending on topic_first) | |
| dataset_name = parts[1] if topic_first else parts[0] | |
| # Apply dataset filters | |
| if include_datasets and dataset_name not in include_datasets: | |
| continue | |
| if exclude_datasets and set(parts).intersection(set(exclude_datasets)): | |
| continue | |
| # Initialize empty dict for file info | |
| pdf_info = dict(filename=parts.pop(), relative_path=str(rel_path)) | |
| if topic_first: | |
| pdf_info["topic"] = parts.pop(0) | |
| pdf_info["dataset"] = parts.pop(0) | |
| pdf_info["nav"] = "/".join(parts) if parts else "" | |
| try: | |
| metrics = PresentationMetrics(path) | |
| all_metrics = metrics.get_all_metrics() | |
| # Calculate aggregated metrics | |
| pdf_info["num_pages"] = len(all_metrics) | |
| pdf_info["total_images"] = sum(m["image_count"] for m in all_metrics) | |
| pdf_info["total_n_words"] = sum(m["n_words"] for m in all_metrics) | |
| # Get page sizes | |
| page_sizes = [(m["size"][0], m["size"][1]) for m in all_metrics] | |
| common_size = Counter(page_sizes).most_common(1)[0][0] | |
| pdf_info["page_width"] = common_size[0] | |
| pdf_info["page_height"] = common_size[1] | |
| # Handle varying sizes | |
| unique_sizes = set(page_sizes) | |
| pdf_info["varying_sizes"] = str(unique_sizes) if len(unique_sizes) > 1 else "" | |
| except Exception as e: | |
| pdf_info.update(dict( | |
| num_pages=0, | |
| total_images=0, | |
| total_text_length=0, | |
| page_width=0, | |
| page_height=0, | |
| varying_sizes="" | |
| )) | |
| pdf_files.append(pdf_info) | |
| return pd.DataFrame(pdf_files) | |
def get_pres_analysis_df(base: Optional[Path] = None) -> pd.DataFrame:
    """Collect per-slide analysis records from saved JSON files under *base*.

    Args:
        base: Directory scanned recursively for ``*.json`` presentation
            analyses. Defaults to ``Navigator().interim``, resolved at call
            time. (The original evaluated ``Navigator().interim`` once at
            import time, instantiating Navigator as a def-time side effect
            and freezing the path.)

    Returns:
        DataFrame with one row per slide: presentation info, raw and parsed
        LLM outputs, and token-usage counts.
    """
    if base is None:
        base = Navigator().interim
    rows: List[Dict] = []
    for f in base.rglob("*.json"):
        pres = PresentationAnalysis.load(f)
        for slide in pres.slides:
            parsed = slide.parsed_output
            general = parsed.general_description
            token_usage = slide.response_metadata["token_usage"]
            rows.append(
                dict(
                    pres_path=slide.pdf_path,
                    pres_title=pres.name,
                    page=slide.page_num,
                    # Unparsed text
                    llm_output=slide.llm_output,
                    # Parsed texts
                    text_content=parsed.text_content,
                    visual_content=parsed.visual_content,
                    topic_overview=general.topic_overview,
                    conclusions_and_insights=general.conclusions_and_insights,
                    layout_and_composition=general.layout_and_composition,
                    # Tokens
                    completion_tokens=token_usage["completion_tokens"],
                    prompt_tokens=token_usage["prompt_tokens"],
                )
            )
    return pd.DataFrame(rows)
def calculate_image_tokens(width: int, height: int):
    """Estimate the GPT-4 Vision token cost of an image of the given size.

    Follows the resize-then-tile scheme described in this OpenAI thread:
    https://community.openai.com/t/how-do-i-calculate-image-tokens-in-gpt4-vision/492318/6
    """
    # Step 1: clamp the longest side to 2048, preserving aspect ratio.
    if max(width, height) > 2048:
        aspect_ratio = width / height
        if aspect_ratio > 1:
            height = int(2048 / aspect_ratio)
            width = 2048
        else:
            width = int(2048 * aspect_ratio)
            height = 2048
    # Step 2: shrink so the shortest side is at most 768.
    if width >= height and height > 768:
        width = int((768 / height) * width)
        height = 768
    elif height > width and width > 768:
        height = int((768 / width) * height)
        width = 768
    # Step 3: 170 tokens per 512x512 tile, plus a flat 85-token base cost.
    tile_cols = ceil(width / 512)
    tile_rows = ceil(height / 512)
    return 85 + 170 * (tile_cols * tile_rows)
def tokens2price(tokens: int, cost_per_1k_tokens: float = 0.00015):
    """Convert a token count into a dollar cost.

    Token prices: https://openai.com/api/pricing/
    """
    thousands_of_tokens = tokens / 1000
    return thousands_of_tokens * cost_per_1k_tokens