Spaces:
Build error
Build error
Ilia Tambovtsev committed on
Commit ·
40aa9f2
1
Parent(s): 43554ac
feat: calculate statistics for descriptions
Browse files- pyproject.toml +3 -0
- src/eda/__init__.py +6 -1
- src/eda/explore.py +42 -13
pyproject.toml
CHANGED
|
@@ -23,6 +23,9 @@ chromadb = "^0.5.20"
|
|
| 23 |
gradio = "^5.6.0"
|
| 24 |
gradio-pdf = "^0.0.19"
|
| 25 |
tabulate = "^0.9.0"
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
[build-system]
|
|
|
|
| 23 |
gradio = "^5.6.0"
|
| 24 |
gradio-pdf = "^0.0.19"
|
| 25 |
tabulate = "^0.9.0"
|
| 26 |
+
plotly = "^5.24.1"
|
| 27 |
+
seaborn = "^0.13.2"
|
| 28 |
+
tiktoken = "^0.8.0"
|
| 29 |
|
| 30 |
|
| 31 |
[build-system]
|
src/eda/__init__.py
CHANGED
|
@@ -1 +1,6 @@
|
|
| 1 |
-
from src.eda.explore import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.eda.explore import (
|
| 2 |
+
calculate_image_tokens,
|
| 3 |
+
get_pres_analysis_df,
|
| 4 |
+
parse_pdf_directory,
|
| 5 |
+
tokens2price,
|
| 6 |
+
)
|
src/eda/explore.py
CHANGED
|
@@ -1,17 +1,23 @@
|
|
| 1 |
-
from pathlib import Path
|
| 2 |
from collections import Counter
|
| 3 |
from math import ceil
|
| 4 |
-
|
| 5 |
-
from typing import List,
|
|
|
|
| 6 |
import fitz # PyMuPDF
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
def parse_pdf_directory(
|
| 11 |
root_dir: str,
|
| 12 |
topic_first: bool = True,
|
| 13 |
include_datasets: Optional[Set[str]] = None,
|
| 14 |
-
exclude_datasets: Optional[Set[str]] = None
|
| 15 |
) -> pd.DataFrame:
|
| 16 |
"""
|
| 17 |
Parse directory of PDFs into a DataFrame using PyMuPDF (fitz).
|
|
@@ -46,10 +52,7 @@ def parse_pdf_directory(
|
|
| 46 |
continue
|
| 47 |
|
| 48 |
# Initialize empty dict for file info
|
| 49 |
-
pdf_info = dict(
|
| 50 |
-
filename=parts.pop(),
|
| 51 |
-
relative_path=str(rel_path)
|
| 52 |
-
)
|
| 53 |
|
| 54 |
if topic_first:
|
| 55 |
pdf_info["topic"] = parts.pop(0)
|
|
@@ -69,8 +72,7 @@ def parse_pdf_directory(
|
|
| 69 |
pdf_info["keywords"] = metadata.get("keywords", "")
|
| 70 |
|
| 71 |
# Get all page sizes
|
| 72 |
-
page_sizes = [(page.rect.width, page.rect.height)
|
| 73 |
-
for page in doc]
|
| 74 |
|
| 75 |
# Get most common page size
|
| 76 |
common_size = Counter(page_sizes).most_common(1)[0][0]
|
|
@@ -105,6 +107,32 @@ def parse_pdf_directory(
|
|
| 105 |
# df = df.sort_values(sort_cols)
|
| 106 |
return df
|
| 107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
def calculate_image_tokens(width: int, height: int):
|
| 109 |
# Source: this openai thread: https://community.openai.com/t/how-do-i-calculate-image-tokens-in-gpt4-vision/492318/6
|
| 110 |
if width > 2048 or height > 2048:
|
|
@@ -125,6 +153,7 @@ def calculate_image_tokens(width: int, height: int):
|
|
| 125 |
|
| 126 |
return total_tokens
|
| 127 |
|
|
|
|
| 128 |
def tokens2price(tokens: int, cost_per_1k_tokens: float = 0.00015):
|
| 129 |
-
|
| 130 |
-
|
|
|
|
|
|
|
| 1 |
from collections import Counter
|
| 2 |
from math import ceil
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Dict, List, Optional, Set
|
| 5 |
+
|
| 6 |
import fitz # PyMuPDF
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from sqlalchemy import text
|
| 9 |
+
from sqlalchemy.sql.elements import CompilerElement
|
| 10 |
+
from sqlalchemy.sql.expression import desc
|
| 11 |
+
|
| 12 |
+
from src.chains import PresentationAnalysis
|
| 13 |
+
from src.config.navigator import Navigator
|
| 14 |
|
| 15 |
|
| 16 |
def parse_pdf_directory(
|
| 17 |
root_dir: str,
|
| 18 |
topic_first: bool = True,
|
| 19 |
include_datasets: Optional[Set[str]] = None,
|
| 20 |
+
exclude_datasets: Optional[Set[str]] = None,
|
| 21 |
) -> pd.DataFrame:
|
| 22 |
"""
|
| 23 |
Parse directory of PDFs into a DataFrame using PyMuPDF (fitz).
|
|
|
|
| 52 |
continue
|
| 53 |
|
| 54 |
# Initialize empty dict for file info
|
| 55 |
+
pdf_info = dict(filename=parts.pop(), relative_path=str(rel_path))
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
if topic_first:
|
| 58 |
pdf_info["topic"] = parts.pop(0)
|
|
|
|
| 72 |
pdf_info["keywords"] = metadata.get("keywords", "")
|
| 73 |
|
| 74 |
# Get all page sizes
|
| 75 |
+
page_sizes = [(page.rect.width, page.rect.height) for page in doc]
|
|
|
|
| 76 |
|
| 77 |
# Get most common page size
|
| 78 |
common_size = Counter(page_sizes).most_common(1)[0][0]
|
|
|
|
| 107 |
# df = df.sort_values(sort_cols)
|
| 108 |
return df
|
| 109 |
|
| 110 |
+
|
| 111 |
+
def get_pres_analysis_df(base: Optional[Path] = None) -> pd.DataFrame:
    """Collect per-slide analysis records from saved presentation JSONs.

    Recursively scans ``base`` for ``*.json`` files, loads each one as a
    ``PresentationAnalysis``, and flattens every slide into a single row.

    Args:
        base: Root directory to scan. Defaults to ``Navigator().interim``,
            resolved lazily at call time. (Previously the default was
            evaluated at import time, which instantiated ``Navigator`` as a
            module-import side effect.)

    Returns:
        ``pd.DataFrame`` with one row per slide: identity columns
        (``pres_path``, ``pres_title``, ``page``), parsed-text columns
        (``text_content``, ``visual_content``, ``topic_overview``,
        ``conclusions_and_insights``, ``layout_and_composition``), and
        token-usage columns (``completion_tokens``, ``prompt_tokens``).
        Empty DataFrame if no JSON files are found.
    """
    if base is None:
        base = Navigator().interim

    descriptions: List[Dict] = []
    for f in base.rglob("*.json"):
        pres = PresentationAnalysis.load(f)
        for slide in pres.slides:
            # Hoist the repeated attribute chains for readability.
            parsed = slide.parsed_output
            general = parsed.general_description
            token_usage = slide.response_metadata["token_usage"]
            descriptions.append(
                dict(
                    pres_path=slide.pdf_path,
                    pres_title=pres.name,
                    page=slide.page_num,
                    # Parsed texts
                    text_content=parsed.text_content,
                    visual_content=parsed.visual_content,
                    topic_overview=general.topic_overview,
                    conclusions_and_insights=general.conclusions_and_insights,
                    layout_and_composition=general.layout_and_composition,
                    # Tokens
                    completion_tokens=token_usage["completion_tokens"],
                    prompt_tokens=token_usage["prompt_tokens"],
                )
            )
    return pd.DataFrame(descriptions)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
def calculate_image_tokens(width: int, height: int):
|
| 137 |
# Source: this openai thread: https://community.openai.com/t/how-do-i-calculate-image-tokens-in-gpt4-vision/492318/6
|
| 138 |
if width > 2048 or height > 2048:
|
|
|
|
| 153 |
|
| 154 |
return total_tokens
|
| 155 |
|
| 156 |
+
|
| 157 |
def tokens2price(tokens: int, cost_per_1k_tokens: float = 0.00015):
    """Estimate the dollar cost of a token count.

    Token prices: https://openai.com/api/pricing/

    Args:
        tokens: Number of tokens consumed.
        cost_per_1k_tokens: Price in dollars per 1000 tokens.

    Returns:
        Estimated price in dollars.
    """
    thousands_of_tokens = tokens / 1000
    return thousands_of_tokens * cost_per_1k_tokens
|