Ilia Tambovtsev committed on
Commit
40aa9f2
·
1 Parent(s): 43554ac

feat: calculate statistics for descriptions

Browse files
Files changed (3) hide show
  1. pyproject.toml +3 -0
  2. src/eda/__init__.py +6 -1
  3. src/eda/explore.py +42 -13
pyproject.toml CHANGED
@@ -23,6 +23,9 @@ chromadb = "^0.5.20"
23
  gradio = "^5.6.0"
24
  gradio-pdf = "^0.0.19"
25
  tabulate = "^0.9.0"
 
 
 
26
 
27
 
28
  [build-system]
 
23
  gradio = "^5.6.0"
24
  gradio-pdf = "^0.0.19"
25
  tabulate = "^0.9.0"
26
+ plotly = "^5.24.1"
27
+ seaborn = "^0.13.2"
28
+ tiktoken = "^0.8.0"
29
 
30
 
31
  [build-system]
src/eda/__init__.py CHANGED
@@ -1 +1,6 @@
1
- from src.eda.explore import calculate_image_tokens, parse_pdf_directory, tokens2price
 
 
 
 
 
 
1
+ from src.eda.explore import (
2
+ calculate_image_tokens,
3
+ get_pres_analysis_df,
4
+ parse_pdf_directory,
5
+ tokens2price,
6
+ )
src/eda/explore.py CHANGED
@@ -1,17 +1,23 @@
1
- from pathlib import Path
2
  from collections import Counter
3
  from math import ceil
4
- import pandas as pd
5
- from typing import List, Dict
 
6
  import fitz # PyMuPDF
7
- from typing import List, Dict, Optional, Set
 
 
 
 
 
 
8
 
9
 
10
  def parse_pdf_directory(
11
  root_dir: str,
12
  topic_first: bool = True,
13
  include_datasets: Optional[Set[str]] = None,
14
- exclude_datasets: Optional[Set[str]] = None
15
  ) -> pd.DataFrame:
16
  """
17
  Parse directory of PDFs into a DataFrame using PyMuPDF (fitz).
@@ -46,10 +52,7 @@ def parse_pdf_directory(
46
  continue
47
 
48
  # Initialize empty dict for file info
49
- pdf_info = dict(
50
- filename=parts.pop(),
51
- relative_path=str(rel_path)
52
- )
53
 
54
  if topic_first:
55
  pdf_info["topic"] = parts.pop(0)
@@ -69,8 +72,7 @@ def parse_pdf_directory(
69
  pdf_info["keywords"] = metadata.get("keywords", "")
70
 
71
  # Get all page sizes
72
- page_sizes = [(page.rect.width, page.rect.height)
73
- for page in doc]
74
 
75
  # Get most common page size
76
  common_size = Counter(page_sizes).most_common(1)[0][0]
@@ -105,6 +107,32 @@ def parse_pdf_directory(
105
  # df = df.sort_values(sort_cols)
106
  return df
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  def calculate_image_tokens(width: int, height: int):
109
  # Source: this openai thread: https://community.openai.com/t/how-do-i-calculate-image-tokens-in-gpt4-vision/492318/6
110
  if width > 2048 or height > 2048:
@@ -125,6 +153,7 @@ def calculate_image_tokens(width: int, height: int):
125
 
126
  return total_tokens
127
 
 
128
  def tokens2price(tokens: int, cost_per_1k_tokens: float = 0.00015):
129
- # Token prices: https://openai.com/api/pricing/
130
- return tokens / 1000 * cost_per_1k_tokens
 
 
1
  from collections import Counter
2
  from math import ceil
3
+ from pathlib import Path
4
+ from typing import Dict, List, Optional, Set
5
+
6
  import fitz # PyMuPDF
7
+ import pandas as pd
8
+ from sqlalchemy import text
9
+ from sqlalchemy.sql.elements import CompilerElement
10
+ from sqlalchemy.sql.expression import desc
11
+
12
+ from src.chains import PresentationAnalysis
13
+ from src.config.navigator import Navigator
14
 
15
 
16
  def parse_pdf_directory(
17
  root_dir: str,
18
  topic_first: bool = True,
19
  include_datasets: Optional[Set[str]] = None,
20
+ exclude_datasets: Optional[Set[str]] = None,
21
  ) -> pd.DataFrame:
22
  """
23
  Parse directory of PDFs into a DataFrame using PyMuPDF (fitz).
 
52
  continue
53
 
54
  # Initialize empty dict for file info
55
+ pdf_info = dict(filename=parts.pop(), relative_path=str(rel_path))
 
 
 
56
 
57
  if topic_first:
58
  pdf_info["topic"] = parts.pop(0)
 
72
  pdf_info["keywords"] = metadata.get("keywords", "")
73
 
74
  # Get all page sizes
75
+ page_sizes = [(page.rect.width, page.rect.height) for page in doc]
 
76
 
77
  # Get most common page size
78
  common_size = Counter(page_sizes).most_common(1)[0][0]
 
107
  # df = df.sort_values(sort_cols)
108
  return df
109
 
110
+
111
+ def get_pres_analysis_df(base: Path = Navigator().interim) -> pd.DataFrame:
112
+ descriptions: List[Dict] = []
113
+ for f in base.rglob("*.json"):
114
+ pres = PresentationAnalysis.load(f)
115
+ for slide in pres.slides:
116
+ descriptions.append(
117
+ dict(
118
+ pres_path=slide.pdf_path,
119
+ pres_title=pres.name,
120
+ page=slide.page_num,
121
+ # Parsed texts
122
+ text_content=slide.parsed_output.text_content,
123
+ visual_content=slide.parsed_output.visual_content,
124
+ topic_overview=slide.parsed_output.general_description.topic_overview,
125
+ conclusions_and_insights=slide.parsed_output.general_description.conclusions_and_insights,
126
+ layout_and_composition=slide.parsed_output.general_description.layout_and_composition,
127
+ # Tokens
128
+ completion_tokens=slide.response_metadata["token_usage"]["completion_tokens"],
129
+ prompt_tokens=slide.response_metadata["token_usage"]["prompt_tokens"],
130
+ )
131
+ )
132
+ df = pd.DataFrame(descriptions)
133
+ return df
134
+
135
+
136
  def calculate_image_tokens(width: int, height: int):
137
  # Source: this openai thread: https://community.openai.com/t/how-do-i-calculate-image-tokens-in-gpt4-vision/492318/6
138
  if width > 2048 or height > 2048:
 
153
 
154
  return total_tokens
155
 
156
+
157
  def tokens2price(tokens: int, cost_per_1k_tokens: float = 0.00015):
158
+ # Token prices: https://openai.com/api/pricing/
159
+ return tokens / 1000 * cost_per_1k_tokens