File size: 6,288 Bytes
3ee94d1
 
40aa9f2
 
 
3ee94d1
40aa9f2
 
 
 
3ee94d1
 
d570714
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ee94d1
d570714
3ee94d1
 
40aa9f2
3ee94d1
d570714
3ee94d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40aa9f2
3ee94d1
 
 
 
 
 
 
 
d570714
 
3ee94d1
d570714
 
 
 
3ee94d1
d570714
 
3ee94d1
 
 
 
d570714
3ee94d1
d570714
3ee94d1
 
d570714
 
 
 
 
 
 
 
3ee94d1
 
 
d570714
40aa9f2
 
 
 
 
 
 
 
 
 
 
647a229
 
40aa9f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ee94d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40aa9f2
3ee94d1
40aa9f2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
from collections import Counter
from math import ceil
from pathlib import Path
from typing import Dict, List, Optional, Set

import fitz  # PyMuPDF
import pandas as pd

from src.chains import PresentationAnalysis
from src.config.navigator import Navigator


class PresentationMetrics:
    """Compute per-page and document-level metrics for a PDF presentation.

    Supports use as a context manager so the underlying PyMuPDF document
    is closed deterministically::

        with PresentationMetrics(path) as pm:
            metrics = pm.get_all_metrics()

    A ``__del__`` safety net is kept for callers that do not close
    explicitly, but explicit ``close()`` / ``with`` is preferred.
    """

    def __init__(self, pdf_path: Path):
        """Open the PDF at *pdf_path* for metric extraction."""
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)

    def __enter__(self) -> "PresentationMetrics":
        """Enter the runtime context; returns self."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Close the document when leaving the ``with`` block."""
        self.close()

    def close(self) -> None:
        """Close the underlying document if it is still open (idempotent)."""
        # getattr guard: __init__ may have failed before self.doc existed.
        doc = getattr(self, "doc", None)
        if doc is not None:
            doc.close()
            self.doc = None

    def get_page_metrics(self, page_num: int) -> Dict:
        """
        Get comprehensive metrics for a specific page.

        Args:
            page_num: Zero-based page index.

        Returns:
            Dictionary containing page metrics:
            ``image_count`` (number of embedded images),
            ``n_words`` (whitespace-separated word count of extracted text),
            ``size`` ((width, height) of the page rectangle).
        """
        page = self.doc[page_num]
        return dict(
            image_count=len(page.get_images()),
            n_words=len(page.get_text().strip().split()),
            size=(page.rect.width, page.rect.height)
        )

    def get_all_metrics(self) -> List[Dict]:
        """
        Calculate metrics for all pages in the presentation.

        Returns:
            List of dictionaries with metrics for each page, each augmented
            with ``page_num`` and ``pdf_path``.
        """
        metrics = []
        for page_num in range(len(self.doc)):
            page_metrics = self.get_page_metrics(page_num)
            page_metrics.update(dict(
                page_num=page_num,
                pdf_path=str(self.pdf_path)
            ))
            metrics.append(page_metrics)
        return metrics

    def __del__(self):
        """Best-effort closure at garbage collection; prefer close()."""
        self.close()


def parse_pdf_directory(
    root_dir: str | Path,
    topic_first: bool = True,
    include_datasets: Optional[Set[str]] = None,
    exclude_datasets: Optional[Set[str]] = None,
) -> pd.DataFrame:
    """Your existing parse_pdf_directory function with metrics integration."""
    if include_datasets and exclude_datasets:
        raise ValueError("Cannot specify both include_datasets and exclude_datasets")

    pdf_files: List[Dict] = []
    root = Path(root_dir)

    for path in root.rglob("*.pdf"):
        rel_path = path.relative_to(root)
        parts = list(rel_path.parts)

        # Get dataset name for filtering (either first or second part depending on topic_first)
        dataset_name = parts[1] if topic_first else parts[0]

        # Apply dataset filters
        if include_datasets and dataset_name not in include_datasets:
            continue
        if exclude_datasets and set(parts).intersection(set(exclude_datasets)):
            continue

        # Initialize empty dict for file info
        pdf_info = dict(filename=parts.pop(), relative_path=str(rel_path))

        if topic_first:
            pdf_info["topic"] = parts.pop(0)

        pdf_info["dataset"] = parts.pop(0)
        pdf_info["nav"] = "/".join(parts) if parts else ""

        try:
            metrics = PresentationMetrics(path)
            all_metrics = metrics.get_all_metrics()

            # Calculate aggregated metrics
            pdf_info["num_pages"] = len(all_metrics)
            pdf_info["total_images"] = sum(m["image_count"] for m in all_metrics)
            pdf_info["total_n_words"] = sum(m["n_words"] for m in all_metrics)

            # Get page sizes
            page_sizes = [(m["size"][0], m["size"][1]) for m in all_metrics]
            common_size = Counter(page_sizes).most_common(1)[0][0]
            pdf_info["page_width"] = common_size[0]
            pdf_info["page_height"] = common_size[1]

            # Handle varying sizes
            unique_sizes = set(page_sizes)
            pdf_info["varying_sizes"] = str(unique_sizes) if len(unique_sizes) > 1 else ""

        except Exception as e:
            pdf_info.update(dict(
                num_pages=0,
                total_images=0,
                total_text_length=0,
                page_width=0,
                page_height=0,
                varying_sizes=""
            ))

        pdf_files.append(pdf_info)

    return pd.DataFrame(pdf_files)

def get_pres_analysis_df(base: Optional[Path] = None) -> pd.DataFrame:
    """Collect slide-level LLM analysis results into a flat DataFrame.

    Args:
        base: Directory scanned recursively for ``*.json`` analysis files.
            Defaults to ``Navigator().interim``, resolved lazily at call
            time. (The previous eager default instantiated Navigator as an
            import side effect and froze the path at import time.)

    Returns:
        DataFrame with one row per slide: presentation path/title, page
        number, raw LLM output, parsed content fields, and token usage.
    """
    if base is None:
        base = Navigator().interim

    descriptions: List[Dict] = []
    for f in base.rglob("*.json"):
        pres = PresentationAnalysis.load(f)
        for slide in pres.slides:
            parsed = slide.parsed_output
            general = parsed.general_description
            usage = slide.response_metadata["token_usage"]
            descriptions.append(
                dict(
                    pres_path=slide.pdf_path,
                    pres_title=pres.name,
                    page=slide.page_num,
                    # Unparsed text
                    llm_output=slide.llm_output,
                    # Parsed texts
                    text_content=parsed.text_content,
                    visual_content=parsed.visual_content,
                    topic_overview=general.topic_overview,
                    conclusions_and_insights=general.conclusions_and_insights,
                    layout_and_composition=general.layout_and_composition,
                    # Tokens
                    completion_tokens=usage["completion_tokens"],
                    prompt_tokens=usage["prompt_tokens"],
                )
            )
    return pd.DataFrame(descriptions)


def calculate_image_tokens(width: int, height: int):
    """Estimate the vision token cost of an image with the given pixel size.

    Implements the two-stage downscale + 512px tiling scheme described in
    this OpenAI community thread:
    https://community.openai.com/t/how-do-i-calculate-image-tokens-in-gpt4-vision/492318/6
    """
    # Stage 1: fit the image inside a 2048x2048 square, keeping aspect ratio.
    if max(width, height) > 2048:
        ratio = width / height
        if ratio > 1:
            width = 2048
            height = int(2048 / ratio)
        else:
            height = 2048
            width = int(2048 * ratio)

    # Stage 2: shrink so the shorter side does not exceed 768 pixels.
    if width >= height and height > 768:
        width = int((768 / height) * width)
        height = 768
    elif height > width and width > 768:
        height = int((768 / width) * height)
        width = 768

    # Cost: 85 base tokens plus 170 per 512x512 tile covering the image.
    n_tiles = ceil(width / 512) * ceil(height / 512)
    return 85 + 170 * n_tiles


def tokens2price(tokens: int, cost_per_1k_tokens: float = 0.00015):
    """Convert a token count into a dollar price.

    Pricing reference: https://openai.com/api/pricing/
    """
    thousands_of_tokens = tokens / 1000
    return thousands_of_tokens * cost_per_1k_tokens