from collections import Counter
from math import ceil
from pathlib import Path
from typing import Dict, List, Optional, Set
import fitz # PyMuPDF
import pandas as pd
from src.chains import PresentationAnalysis
from src.config.navigator import Navigator
class PresentationMetrics:
    """Compute per-page and document-level metrics for a PDF presentation.

    Wraps a PyMuPDF document and exposes per-page counts (images, words,
    page size). Usable as a context manager so the underlying document is
    closed deterministically instead of relying on garbage collection:

        with PresentationMetrics(path) as pm:
            metrics = pm.get_all_metrics()
    """

    def __init__(self, pdf_path: Path):
        """Open the PDF at *pdf_path* for metric extraction."""
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)

    def __enter__(self) -> "PresentationMetrics":
        """Enter a `with` block; returns self."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the document on context exit; exceptions propagate."""
        self.close()

    def close(self) -> None:
        """Close the underlying document. Idempotent and safe to call even
        if __init__ failed before `self.doc` was assigned."""
        doc = getattr(self, "doc", None)
        if doc is not None:
            doc.close()
            self.doc = None  # guard against double-close

    def get_page_metrics(self, page_num: int) -> Dict:
        """
        Get comprehensive metrics for a specific page.

        Returns:
            Dictionary with `image_count` (embedded images on the page),
            `n_words` (whitespace-separated word count of extracted text),
            and `size` as a (width, height) tuple in points.
        """
        page = self.doc[page_num]
        return dict(
            image_count=len(page.get_images()),
            n_words=len(page.get_text().strip().split()),
            size=(page.rect.width, page.rect.height)
        )

    def get_all_metrics(self) -> List[Dict]:
        """
        Calculate metrics for all pages in the presentation.

        Returns:
            List of per-page metric dictionaries, each augmented with
            `page_num` and the source `pdf_path`.
        """
        metrics = []
        for page_num in range(len(self.doc)):
            page_metrics = self.get_page_metrics(page_num)
            page_metrics.update(dict(
                page_num=page_num,
                pdf_path=str(self.pdf_path)
            ))
            metrics.append(page_metrics)
        return metrics

    def __del__(self):
        """Best-effort fallback closure; prefer `close()` or a `with` block."""
        self.close()
def parse_pdf_directory(
root_dir: str | Path,
topic_first: bool = True,
include_datasets: Optional[Set[str]] = None,
exclude_datasets: Optional[Set[str]] = None,
) -> pd.DataFrame:
"""Your existing parse_pdf_directory function with metrics integration."""
if include_datasets and exclude_datasets:
raise ValueError("Cannot specify both include_datasets and exclude_datasets")
pdf_files: List[Dict] = []
root = Path(root_dir)
for path in root.rglob("*.pdf"):
rel_path = path.relative_to(root)
parts = list(rel_path.parts)
# Get dataset name for filtering (either first or second part depending on topic_first)
dataset_name = parts[1] if topic_first else parts[0]
# Apply dataset filters
if include_datasets and dataset_name not in include_datasets:
continue
if exclude_datasets and set(parts).intersection(set(exclude_datasets)):
continue
# Initialize empty dict for file info
pdf_info = dict(filename=parts.pop(), relative_path=str(rel_path))
if topic_first:
pdf_info["topic"] = parts.pop(0)
pdf_info["dataset"] = parts.pop(0)
pdf_info["nav"] = "/".join(parts) if parts else ""
try:
metrics = PresentationMetrics(path)
all_metrics = metrics.get_all_metrics()
# Calculate aggregated metrics
pdf_info["num_pages"] = len(all_metrics)
pdf_info["total_images"] = sum(m["image_count"] for m in all_metrics)
pdf_info["total_n_words"] = sum(m["n_words"] for m in all_metrics)
# Get page sizes
page_sizes = [(m["size"][0], m["size"][1]) for m in all_metrics]
common_size = Counter(page_sizes).most_common(1)[0][0]
pdf_info["page_width"] = common_size[0]
pdf_info["page_height"] = common_size[1]
# Handle varying sizes
unique_sizes = set(page_sizes)
pdf_info["varying_sizes"] = str(unique_sizes) if len(unique_sizes) > 1 else ""
except Exception as e:
pdf_info.update(dict(
num_pages=0,
total_images=0,
total_text_length=0,
page_width=0,
page_height=0,
varying_sizes=""
))
pdf_files.append(pdf_info)
return pd.DataFrame(pdf_files)
def get_pres_analysis_df(base: Optional[Path] = None) -> pd.DataFrame:
    """Collect slide-level LLM analysis results into a flat DataFrame.

    Loads every ``*.json`` PresentationAnalysis file under *base* and emits
    one row per slide with the raw LLM output, its parsed sections, and
    token-usage metadata.

    Args:
        base: Root directory to scan. Defaults to ``Navigator().interim``,
            resolved at call time. (BUGFIX: the original evaluated
            ``Navigator().interim`` in the signature, i.e. once at module
            import time, as an import side effect.)

    Returns:
        DataFrame with one row per slide across all analysis files.
    """
    if base is None:
        base = Navigator().interim
    descriptions: List[Dict] = []
    for f in base.rglob("*.json"):
        pres = PresentationAnalysis.load(f)
        for slide in pres.slides:
            descriptions.append(
                dict(
                    pres_path=slide.pdf_path,
                    pres_title=pres.name,
                    page=slide.page_num,
                    # Unparsed text
                    llm_output=slide.llm_output,
                    # Parsed texts
                    text_content=slide.parsed_output.text_content,
                    visual_content=slide.parsed_output.visual_content,
                    topic_overview=slide.parsed_output.general_description.topic_overview,
                    conclusions_and_insights=slide.parsed_output.general_description.conclusions_and_insights,
                    layout_and_composition=slide.parsed_output.general_description.layout_and_composition,
                    # Tokens
                    completion_tokens=slide.response_metadata["token_usage"]["completion_tokens"],
                    prompt_tokens=slide.response_metadata["token_usage"]["prompt_tokens"],
                )
            )
    df = pd.DataFrame(descriptions)
    return df
def calculate_image_tokens(width: int, height: int) -> int:
    """Estimate GPT-4 Vision token usage for an image of the given pixel size.

    Source: this openai thread:
    https://community.openai.com/t/how-do-i-calculate-image-tokens-in-gpt4-vision/492318/6

    Args:
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        Estimated token count: a fixed 85-token base plus 170 tokens per
        512x512 tile after rescaling.
    """
    # Step 1: fit within a 2048x2048 square, preserving aspect ratio.
    if width > 2048 or height > 2048:
        aspect_ratio = width / height
        if aspect_ratio > 1:
            width, height = 2048, int(2048 / aspect_ratio)
        else:
            width, height = int(2048 * aspect_ratio), 2048
    # Step 2: shrink so the shortest side is at most 768 pixels.
    if width >= height and height > 768:
        width, height = int((768 / height) * width), 768
    elif height > width and width > 768:
        width, height = 768, int((768 / width) * height)
    # Step 3: count the 512x512 tiles needed to cover the rescaled image.
    n_tiles = ceil(width / 512) * ceil(height / 512)
    return 85 + 170 * n_tiles
def tokens2price(tokens: int, cost_per_1k_tokens: float = 0.00015):
    """Convert a token count into a dollar cost.

    Token prices: https://openai.com/api/pricing/

    Args:
        tokens: Number of tokens consumed.
        cost_per_1k_tokens: Price per 1,000 tokens in dollars.

    Returns:
        The cost in dollars for *tokens* tokens.
    """
    thousands_of_tokens = tokens / 1000
    return thousands_of_tokens * cost_per_1k_tokens