Ilia Tambovtsev committed on
Commit
d570714
·
1 Parent(s): 0802329

feat: update eda

Browse files
Files changed (1) hide show
  1. src/eda/explore.py +66 -51
src/eda/explore.py CHANGED
@@ -5,32 +5,63 @@ from typing import Dict, List, Optional, Set
5
 
6
  import fitz # PyMuPDF
7
  import pandas as pd
8
- from sqlalchemy import text
9
- from sqlalchemy.sql.elements import CompilerElement
10
- from sqlalchemy.sql.expression import desc
11
 
12
  from src.chains import PresentationAnalysis
13
  from src.config.navigator import Navigator
14
 
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def parse_pdf_directory(
17
- root_dir: str,
18
  topic_first: bool = True,
19
  include_datasets: Optional[Set[str]] = None,
20
  exclude_datasets: Optional[Set[str]] = None,
21
  ) -> pd.DataFrame:
22
- """
23
- Parse directory of PDFs into a DataFrame using PyMuPDF (fitz).
24
-
25
- Args:
26
- root_dir: Path to root directory containing PDF files
27
- topic_first: If True, first folder is topic. If False, topic is not stored
28
- include_datasets: Set of dataset names to include. If None, include all
29
- exclude_datasets: Set of dataset names to exclude. If None, exclude none
30
-
31
- Returns:
32
- DataFrame with columns: [topic (optional)], dataset, nav, filename, relative_path
33
- """
34
  if include_datasets and exclude_datasets:
35
  raise ValueError("Cannot specify both include_datasets and exclude_datasets")
36
 
@@ -44,7 +75,6 @@ def parse_pdf_directory(
44
  # Get dataset name for filtering (either first or second part depending on topic_first)
45
  dataset_name = parts[1] if topic_first else parts[0]
46
 
47
- # import pdb; pdb.set_trace()
48
  # Apply dataset filters
49
  if include_datasets and dataset_name not in include_datasets:
50
  continue
@@ -61,52 +91,37 @@ def parse_pdf_directory(
61
  pdf_info["nav"] = "/".join(parts) if parts else ""
62
 
63
  try:
64
- doc = fitz.open(path)
65
- pdf_info["num_pages"] = doc.page_count
66
 
67
- # Get metadata
68
- metadata = doc.metadata
69
- pdf_info["title"] = metadata.get("title", "")
70
- pdf_info["author"] = metadata.get("author", "")
71
- pdf_info["subject"] = metadata.get("subject", "")
72
- pdf_info["keywords"] = metadata.get("keywords", "")
73
 
74
- # Get all page sizes
75
- page_sizes = [(page.rect.width, page.rect.height) for page in doc]
76
-
77
- # Get most common page size
78
  common_size = Counter(page_sizes).most_common(1)[0][0]
79
  pdf_info["page_width"] = common_size[0]
80
  pdf_info["page_height"] = common_size[1]
81
 
82
- # If there are different page sizes, store them as a set
83
  unique_sizes = set(page_sizes)
84
- if len(unique_sizes) > 1:
85
- pdf_info["varying_sizes"] = str(unique_sizes)
86
- else:
87
- pdf_info["varying_sizes"] = ""
88
-
89
- doc.close()
90
 
91
  except Exception as e:
92
- pdf_info["num_pages"] = 0
93
- pdf_info["title"] = ""
94
- pdf_info["author"] = ""
95
- pdf_info["subject"] = ""
96
- pdf_info["keywords"] = ""
97
- pdf_info["page_width"] = 0
98
- pdf_info["page_height"] = 0
99
- pdf_info["varying_sizes"] = ""
100
 
101
  pdf_files.append(pdf_info)
102
 
103
- # Convert to DataFrame
104
- df = pd.DataFrame(pdf_files)
105
-
106
- # sort_cols = ["dataset", "nav"]
107
- # df = df.sort_values(sort_cols)
108
- return df
109
-
110
 
111
  def get_pres_analysis_df(base: Path = Navigator().interim) -> pd.DataFrame:
112
  descriptions: List[Dict] = []
 
5
 
6
  import fitz # PyMuPDF
7
  import pandas as pd
 
 
 
8
 
9
  from src.chains import PresentationAnalysis
10
  from src.config.navigator import Navigator
11
 
12
 
13
class PresentationMetrics:
    """Compute per-page metrics (image count, word count, page size) for a PDF.

    Wraps a PyMuPDF (``fitz``) document. Prefer using it as a context manager
    so the underlying document is closed deterministically::

        with PresentationMetrics(path) as pm:
            rows = pm.get_all_metrics()
    """

    def __init__(self, pdf_path: Path):
        """Open the PDF at *pdf_path* for metric extraction.

        Args:
            pdf_path: Path to the PDF file to analyze.
        """
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)

    def get_page_metrics(self, page_num: int) -> Dict:
        """
        Get comprehensive metrics for a specific page.

        Args:
            page_num: Zero-based page index.

        Returns:
            Dictionary with keys ``image_count``, ``n_words`` and ``size``
            (width, height in points) for the requested page.
        """
        page = self.doc[page_num]
        return dict(
            image_count=len(page.get_images()),
            n_words=len(page.get_text().strip().split()),
            size=(page.rect.width, page.rect.height),
        )

    def get_all_metrics(self) -> List[Dict]:
        """
        Calculate metrics for all pages in the presentation.

        Returns:
            List of per-page metric dictionaries; each also carries
            ``page_num`` and ``pdf_path`` so rows stay self-describing
            when aggregated downstream.
        """
        metrics = []
        for page_num in range(len(self.doc)):
            page_metrics = self.get_page_metrics(page_num)
            page_metrics.update(dict(
                page_num=page_num,
                pdf_path=str(self.pdf_path),
            ))
            metrics.append(page_metrics)
        return metrics

    def close(self) -> None:
        """Close the underlying document; safe to call more than once."""
        doc = getattr(self, "doc", None)
        if doc is not None:
            try:
                doc.close()
            except Exception:
                # The document may already be closed, or fitz may be partially
                # torn down at interpreter shutdown -- cleanup must not raise.
                pass
            # Mark as closed so repeated close() calls are no-ops.
            self.doc = None

    def __enter__(self) -> "PresentationMetrics":
        """Support ``with PresentationMetrics(path) as pm:`` usage."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Close the document when leaving the ``with`` block."""
        self.close()

    def __del__(self):
        """Best-effort safety net; deterministic cleanup should use close()."""
        # __del__ timing is unspecified (GC-dependent), so this is only a
        # fallback for callers that forget to close explicitly.
        self.close()
56
+
57
+
58
  def parse_pdf_directory(
59
+ root_dir: str | Path,
60
  topic_first: bool = True,
61
  include_datasets: Optional[Set[str]] = None,
62
  exclude_datasets: Optional[Set[str]] = None,
63
  ) -> pd.DataFrame:
64
+ """Your existing parse_pdf_directory function with metrics integration."""
 
 
 
 
 
 
 
 
 
 
 
65
  if include_datasets and exclude_datasets:
66
  raise ValueError("Cannot specify both include_datasets and exclude_datasets")
67
 
 
75
  # Get dataset name for filtering (either first or second part depending on topic_first)
76
  dataset_name = parts[1] if topic_first else parts[0]
77
 
 
78
  # Apply dataset filters
79
  if include_datasets and dataset_name not in include_datasets:
80
  continue
 
91
  pdf_info["nav"] = "/".join(parts) if parts else ""
92
 
93
  try:
94
+ metrics = PresentationMetrics(path)
95
+ all_metrics = metrics.get_all_metrics()
96
 
97
+ # Calculate aggregated metrics
98
+ pdf_info["num_pages"] = len(all_metrics)
99
+ pdf_info["total_images"] = sum(m["image_count"] for m in all_metrics)
100
+ pdf_info["total_n_words"] = sum(m["n_words"] for m in all_metrics)
 
 
101
 
102
+ # Get page sizes
103
+ page_sizes = [(m["size"][0], m["size"][1]) for m in all_metrics]
 
 
104
  common_size = Counter(page_sizes).most_common(1)[0][0]
105
  pdf_info["page_width"] = common_size[0]
106
  pdf_info["page_height"] = common_size[1]
107
 
108
+ # Handle varying sizes
109
  unique_sizes = set(page_sizes)
110
+ pdf_info["varying_sizes"] = str(unique_sizes) if len(unique_sizes) > 1 else ""
 
 
 
 
 
111
 
112
  except Exception as e:
113
+ pdf_info.update(dict(
114
+ num_pages=0,
115
+ total_images=0,
116
+ total_text_length=0,
117
+ page_width=0,
118
+ page_height=0,
119
+ varying_sizes=""
120
+ ))
121
 
122
  pdf_files.append(pdf_info)
123
 
124
+ return pd.DataFrame(pdf_files)
 
 
 
 
 
 
125
 
126
  def get_pres_analysis_df(base: Path = Navigator().interim) -> pd.DataFrame:
127
  descriptions: List[Dict] = []