gnlui committed
Commit 0bad002 · 1 Parent(s): 6cfad53
requirements.txt CHANGED
@@ -1,3 +1,7 @@
- altair
- pandas
- streamlit
+ setuptools
+ PyMuPDF>=1.22.0
+ numpy
+ streamlit>=1.28.0
+ scikit-learn
+ python-docx>=0.8.11
+ urllib3>=2.0.0
src/PDF_highlight_extractor.py ADDED
@@ -0,0 +1,152 @@
+ import fitz  # PyMuPDF
+ import numpy as np
+ import re
+ from utils.decompose import Decomposer
+
+
+ def clean_chinese_text(text):
+     """Clean up text by removing spaces between Chinese characters."""
+     # Remove spaces between Chinese characters
+     text = re.sub(r'([\u4e00-\u9fff])\s+([\u4e00-\u9fff])', r'\1\2', text)
+     # Remove spaces before and after Chinese punctuation
+     text = re.sub(r'\s+([\u3000-\u303f\uff00-\uffef])', r'\1', text)
+     text = re.sub(r'([\u3000-\u303f\uff00-\uffef])\s+', r'\1', text)
+     return text.strip()
+
+
+ def categorize_highlight(color):
+     """Categorize a highlight by the closest color match using Euclidean distance."""
+     # Customize the highlight categories as needed
+     color_mapping = {
+         (0.5608, 0.8706, 0.9765): "Ideas & Insights",           # Light Blue
+         (1.0, 0.9412, 0.4): "General Notes",                    # Yellow
+         (0.4902, 0.9412, 0.4): "Action Items / To-Do",          # Green
+         (0.9686, 0.6, 0.8196): "Quotes & References",           # Pink
+         (0.9216, 0.2863, 0.2863): "Critical Issues / Warnings"  # Red
+     }
+
+     # Convert the color to a NumPy array for the distance calculation
+     color_array = np.array(color)
+
+     # Find the closest reference color using Euclidean distance
+     best_match = min(
+         color_mapping.keys(),
+         key=lambda ref_color: sum(
+             (color_array[i] - ref_color[i]) ** 2 for i in range(len(color_array))
+         ) ** 0.5
+     )
+
+     return color_mapping[best_match]
+
+
+ def clean_text_by_punctuation(text):
+     """Clean text by removing content after the last proper punctuation mark."""
+     # Proper ending punctuation marks (both Chinese and English)
+     ending_punctuation = {'.', '。', '?', '?', '!', '!'}
+
+     # Find the last occurrence of any ending punctuation
+     last_punct_index = -1
+     for i, char in enumerate(text):
+         if char in ending_punctuation:
+             last_punct_index = i
+
+     # If no proper ending punctuation is found, return an empty string
+     if last_punct_index == -1:
+         return ""
+
+     # Return text up to and including the last punctuation mark
+     return text[:last_punct_index + 1]
+
+
+ def extract_highlights(pdf_path):
+     """
+     Extract all highlights from a PDF file.
+
+     Args:
+         pdf_path (str): Path to the PDF file
+
+     Returns:
+         list: List of dictionaries containing highlight information
+     """
+     highlights = []
+     try:
+         pdf_document = fitz.open(pdf_path)
+
+         # Page layout statistics used to detect highlights that continue across pages
+         min_y, max_y, single_y = Decomposer(pdf_document).run()
+
+         for page_num in range(pdf_document.page_count):
+             page = pdf_document[page_num]
+
+             for annot in page.annots():
+                 if annot.type[0] == 8:  # Highlight annotation
+                     # Extract the highlighted text
+                     highlight_text = page.get_text("text", clip=annot.rect, sort=True, flags=1).strip()
+                     highlight_text = highlight_text.encode("utf-8", "ignore").decode("utf-8")
+
+                     # Extract the annotation color
+                     color_rgb = annot.colors.get("stroke", [0, 0, 0])  # Default to black if undefined
+                     category = categorize_highlight(color_rgb)
+
+                     # Extract the popup comment if it exists
+                     comment = annot.info.get("content", "").strip() if annot.has_popup else ""
+
+                     # Store structured highlight data
+                     if highlight_text:
+                         for chunk in highlight_text.split("\n\n"):  # Handle consecutive highlights
+                             cleaned_chunk = clean_text_by_punctuation(clean_chinese_text(chunk))
+                             if not cleaned_chunk:  # Skip if no valid text remains after cleaning
+                                 continue
+
+                             if (highlights and
+                                     highlights[-1]['page'] == page_num and
+                                     highlights[-1]['rect'][3] > (max_y - single_y) and
+                                     annot.rect[1] < (min_y + single_y)):
+                                 # Merge a highlight that continues from the previous page
+                                 highlights[-1]['text'] += cleaned_chunk
+                             else:
+                                 highlights.append({
+                                     "page": page_num + 1,
+                                     "text": cleaned_chunk,
+                                     "category": category,
+                                     "comment": comment,
+                                     "rect": annot.rect
+                                 })
+
+         pdf_document.close()
+         return highlights
+     except Exception as e:
+         print(f"Error processing PDF: {str(e)}")
+         return []
+
+
+ def main():
+     # Example usage
+     pdf_path = "OneDrive-2025-04-15/专题讨论_线下研讨会DeepSeek效应追踪AI产业的持续变革-12Mar2025_zho.pdf"
+     highlights = extract_highlights(pdf_path)
+
+     # Create markdown content
+     markdown_content = "# PDF Highlights Summary\n\n"
+     markdown_content += f"Total highlights found: {len(highlights)}\n\n"
+
+     # Group highlights by category
+     highlights_by_category = {}
+     for highlight in highlights:
+         category = highlight['category']
+         if category not in highlights_by_category:
+             highlights_by_category[category] = []
+         highlights_by_category[category].append(highlight)
+
+     # Add highlights grouped by category
+     for category, category_highlights in highlights_by_category.items():
+         markdown_content += f"## {category}\n\n"
+         for highlight in category_highlights:
+             markdown_content += f"### Page {highlight['page']}\n\n"
+             markdown_content += f"{highlight['text']}\n\n"
+             if highlight['comment']:
+                 markdown_content += f"> {highlight['comment']}\n\n"
+             markdown_content += "---\n\n"
+
+     # Save to a markdown file
+     output_file = "highlights_summary.md"
+     with open(output_file, "w", encoding="utf-8") as f:
+         f.write(markdown_content)
+
+     print(f"Highlights summary has been saved to {output_file}")
+
+
+ if __name__ == "__main__":
+     main()
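
Note: a minimal usage sketch of the nearest-color categorization above, assuming the script is run from inside src/ so that PDF_highlight_extractor is importable; the sample RGB value is hypothetical, not taken from this commit:

    from PDF_highlight_extractor import categorize_highlight

    # An off-palette, slightly faded yellow; Euclidean nearest-color matching
    # still maps it to the closest reference color, so this prints "General Notes".
    print(categorize_highlight((0.98, 0.92, 0.45)))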
src/streamlit_app.py CHANGED
@@ -1,40 +1,209 @@
- import altair as alt
- import numpy as np
- import pandas as pd
  import streamlit as st

- """
- # Welcome to Streamlit!
-
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).
-
- In the meantime, below is an example of what you can do with just a few lines of code:
- """
-
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
- indices = np.linspace(0, 1, num_points)
- theta = 2 * np.pi * num_turns * indices
- radius = indices
-
- x = radius * np.cos(theta)
- y = radius * np.sin(theta)
-
- df = pd.DataFrame({
-     "x": x,
-     "y": y,
-     "idx": indices,
-     "rand": np.random.randn(num_points),
- })
-
- st.altair_chart(alt.Chart(df, height=700, width=700)
-     .mark_point(filled=True)
-     .encode(
-         x=alt.X("x", axis=None),
-         y=alt.Y("y", axis=None),
-         color=alt.Color("idx", legend=None, scale=alt.Scale()),
-         size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-     ))
  import streamlit as st
+ import os
+ import logging
+ from PDF_highlight_extractor import extract_highlights, clean_chinese_text
+ from docx import Document
+ import urllib.parse
+ import tempfile
+ import shutil
+ import re
+ import traceback
+ import fitz  # PyMuPDF

+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ def sanitize_filename(filename):
+     """Sanitize filename for web use while preserving Chinese characters."""
+     try:
+         filename = urllib.parse.unquote(filename)
+         filename = os.path.basename(filename)
+         # Use a safer regex for filename sanitization
+         filename = re.sub(r'[^\w\u4e00-\u9fff\-\.]', '_', filename)
+         name, ext = os.path.splitext(filename)
+         if len(name) > 100:
+             name = name[:100]
+         return name + ext
+     except Exception as e:
+         logger.warning(f"Error sanitizing filename: {str(e)}")
+         # Fall back to a simple safe name
+         return f"upload_{hash(str(filename))}.pdf"
+
+ st.set_page_config(
+     page_title="PDF Highlight Extractor",
+     page_icon="📄",
+     layout="wide"
+ )
+
+ st.title("📄 PDF Highlight Extractor")
+ st.markdown("Upload a PDF file to extract and categorize highlights.")
+
+ # Display the PyMuPDF version for debugging
+ st.sidebar.text(f"PyMuPDF version: {fitz.__version__}")
+
+ uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
+
+ if uploaded_file is not None:
+     try:
+         temp_dir = tempfile.mkdtemp()
+         try:
+             # Add file size check
+             file_size = len(uploaded_file.getvalue()) / (1024 * 1024)  # Size in MB
+             if file_size > 200:  # Streamlit Cloud's default limit
+                 st.error(f"File size ({file_size:.1f}MB) exceeds the 200MB limit. Please upload a smaller file.")
+                 st.stop()
+
+             original_filename = uploaded_file.name
+             logger.info(f"Processing file: {original_filename}")
+
+             safe_filename = sanitize_filename(original_filename)
+             logger.info(f"Sanitized filename: {safe_filename}")
+
+             file_extension = os.path.splitext(safe_filename)[1]
+             temp_file_path = os.path.join(temp_dir, f"temp_upload{file_extension}")
+
+             with open(temp_file_path, "wb") as f:
+                 f.write(uploaded_file.getvalue())
+
+             logger.info(f"Saved to temp file: {temp_file_path}")
+
+             try:
+                 with st.spinner("Extracting highlights..."):
+                     # Test if we can open the PDF first
+                     try:
+                         test_pdf = fitz.open(temp_file_path)
+                         logger.info(f"Successfully opened PDF with {test_pdf.page_count} pages")
+                         test_pdf.close()
+                     except Exception as pdf_error:
+                         logger.error(f"Failed to open PDF: {str(pdf_error)}")
+                         st.error(f"Failed to open PDF: {str(pdf_error)}")
+                         raise
+
+                     # Extract highlights
+                     highlights = extract_highlights(temp_file_path)
+                     logger.info(f"Extracted {len(highlights)} highlights")
+
+                     if highlights:
+                         st.success(f"Found {len(highlights)} highlights!")
+
+                         highlights_by_category = {}
+                         for highlight in highlights:
+                             category = highlight['category']
+                             if category not in highlights_by_category:
+                                 highlights_by_category[category] = []
+                             highlights_by_category[category].append(highlight)
+
+                         for category, category_highlights in highlights_by_category.items():
+                             with st.expander(f"📌 {category} ({len(category_highlights)} highlights)"):
+                                 for highlight in category_highlights:
+                                     st.markdown(f"**Page {highlight['page']}**")
+                                     try:
+                                         st.markdown(highlight['text'])
+                                     except Exception as text_error:
+                                         clean_text = highlight['text'].encode('utf-8', 'ignore').decode('utf-8')
+                                         st.text(clean_text)  # Fall back to plain text
+
+                                     if highlight['comment']:
+                                         try:
+                                             st.markdown(f"> {highlight['comment']}")
+                                         except Exception:
+                                             st.text(f"Comment: {highlight['comment']}")
+                                     st.markdown("---")
+
+                         file_format = st.selectbox("Select file format for download:", ["Markdown", "Txt", "Word"])
+
+                         if file_format == "Markdown":
+                             markdown_content = "# PDF Highlights Summary\n\n"
+                             markdown_content += f"Total highlights found: {len(highlights)}\n\n"
+
+                             for category, category_highlights in highlights_by_category.items():
+                                 markdown_content += f"## {category}\n\n"
+                                 for highlight in category_highlights:
+                                     markdown_content += f"### Page {highlight['page']}\n\n"
+                                     markdown_content += f"{highlight['text']}\n\n"
+                                     if highlight['comment']:
+                                         markdown_content += f"> {highlight['comment']}\n\n"
+                                     markdown_content += "---\n\n"
+
+                             download_filename = f"highlights_{os.path.splitext(safe_filename)[0]}.md"
+
+                             st.download_button(
+                                 label="Download Highlights Summary (Markdown)",
+                                 data=markdown_content.encode('utf-8'),  # Ensure proper encoding
+                                 file_name=download_filename,
+                                 mime="text/markdown"
+                             )
+
+                         elif file_format == "Txt":
+                             text_content = f"PDF Highlights Summary\n\nTotal highlights found: {len(highlights)}\n\n"
+
+                             for category, category_highlights in highlights_by_category.items():
+                                 text_content += f"{category}\n\n"
+                                 for highlight in category_highlights:
+                                     text_content += f"Page {highlight['page']}: {highlight['text']}\n"
+                                     if highlight['comment']:
+                                         text_content += f"Comment: {highlight['comment']}\n"
+                                     text_content += "---\n\n"
+
+                             download_filename = f"highlights_{os.path.splitext(safe_filename)[0]}.txt"
+
+                             st.download_button(
+                                 label="Download Highlights Summary (Text)",
+                                 data=text_content.encode('utf-8'),  # Ensure proper encoding
+                                 file_name=download_filename,
+                                 mime="text/plain"
+                             )
+
+                         elif file_format == "Word":
+                             doc = Document()
+                             doc.add_heading('PDF Highlights Summary', level=1)
+                             doc.add_paragraph(f'Total highlights found: {len(highlights)}')
+
+                             for category, category_highlights in highlights_by_category.items():
+                                 doc.add_heading(category, level=2)
+                                 for highlight in category_highlights:
+                                     doc.add_heading(f'Page {highlight["page"]}', level=3)
+                                     doc.add_paragraph(highlight['text'])
+                                     if highlight['comment']:
+                                         doc.add_paragraph(f'Comment: {highlight["comment"]}')
+                                     doc.add_paragraph('---')
+
+                             word_file_path = os.path.join(temp_dir, f"highlights_{os.path.splitext(safe_filename)[0]}.docx")
+                             doc.save(word_file_path)
+
+                             with open(word_file_path, "rb") as f:
+                                 docx_bytes = f.read()
+
+                             st.download_button(
+                                 label="Download Highlights Summary (Word)",
+                                 data=docx_bytes,
+                                 file_name=f"highlights_{os.path.splitext(safe_filename)[0]}.docx",
+                                 mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                             )
+                     else:
+                         st.warning("No highlights found in the PDF.")
+
+             except Exception as e:
+                 logger.error(f"Error processing PDF: {str(e)}")
+                 logger.error(traceback.format_exc())
+                 st.error(f"Error processing PDF: {str(e)}")
+
+         finally:
+             shutil.rmtree(temp_dir, ignore_errors=True)
+
+     except Exception as e:
+         logger.error(f"Error processing file: {str(e)}")
+         logger.error(traceback.format_exc())
+         st.error(f"Error processing file: {str(e)}")
+
+ st.sidebar.markdown("""
+ ### About
+ This app extracts and categorizes highlights from PDF files based on their colors:
+
+ - 💡 Light Blue: Ideas & Insights
+ - 📝 Yellow: General Notes
+ - ✅ Green: Action Items / To-Do
+ - 📖 Pink: Quotes & References
+ - ⚠️ Red: Critical Issues / Warnings
+ """)
src/utils/decompose.py ADDED
@@ -0,0 +1,199 @@
+
+ from sklearn.cluster import DBSCAN
+ import numpy as np
+ from itertools import islice
+ from collections import Counter
+ import logging
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ class DBSCAN_helper:
+     def __init__(self, blocks):
+         self.blocks = blocks
+
+     def run(self):
+         try:
+             if not self.blocks:
+                 logger.warning("No blocks provided to DBSCAN_helper")
+                 # Return default values
+                 self.n_clusters = 0
+                 self.labels = np.array([])
+                 return
+
+             # Extract features from blocks
+             X = np.array(
+                 [(x0, y0, x1, y1, len(text)) for x0, y0, x1, y1, text in self.blocks]
+             )
+
+             # Handle empty array
+             if X.size == 0:
+                 logger.warning("Empty feature array for DBSCAN")
+                 self.n_clusters = 0
+                 self.labels = np.array([])
+                 return
+
+             # Configure DBSCAN with explicit parameters for better control
+             dbscan = DBSCAN(eps=0.5, min_samples=2, metric='euclidean')
+             dbscan.fit(X)
+             labels = dbscan.labels_
+
+             # Count the number of clusters (excluding noise points marked as -1)
+             unique_labels = set(labels)
+             if -1 in unique_labels:
+                 unique_labels.remove(-1)
+             self.n_clusters = len(unique_labels)
+             self.labels = labels
+
+             logger.info(f"{self.n_clusters} clusters for {len(self.blocks)} blocks")
+         except Exception as e:
+             logger.error(f"Error in DBSCAN_helper: {str(e)}")
+             # Set default values on error
+             self.n_clusters = 0
+             self.labels = np.array([-1] * len(self.blocks)) if self.blocks else np.array([])
+
+
+ class Decomposer:
+     def __init__(self, pdf_document=None):
+         if not pdf_document:
+             raise ValueError("PDF document must be provided")
+         self.pdf_doc = pdf_document
+
+     def calc_rect_center(self, rect, reverse_y=False):
+         try:
+             if reverse_y:
+                 x0, y0, x1, y1 = rect[0], -rect[1], rect[2], -rect[3]
+             else:
+                 x0, y0, x1, y1 = rect
+
+             x_center = (x0 + x1) / 2
+             y_center = (y0 + y1) / 2
+             return (x_center, y_center)
+         except Exception as e:
+             logger.error(f"Error calculating rectangle center: {str(e)}")
+             return (0, 0)  # Return default values on error
+
+     def get_rect_labels(self):
+         try:
+             rect_centers = []
+             rects = []
+             visual_label_texts = []
+             categorize_vectors = []
+
+             for page_idx, page in islice(enumerate(self.pdf_doc), len(self.pdf_doc)):
+                 try:
+                     blocks = page.get_text("blocks")
+                     page_cnt = page_idx + 1
+                     logger.debug(f"=== Start Page {page_cnt}: {len(blocks)} blocks ===")
+                     block_cnt = 0
+
+                     for block in blocks:
+                         try:
+                             block_rect = block[:4]  # (x0, y0, x1, y1)
+                             x0, y0, x1, y1 = block_rect
+                             rects.append(block_rect)
+
+                             # Handle possible encoding issues with block text
+                             block_text = block[4]
+                             if isinstance(block_text, bytes):
+                                 block_text = block_text.decode('utf-8', errors='ignore')
+
+                             block_num = block[5]
+                             block_cnt = block_num + 1
+
+                             rect_center = self.calc_rect_center(block_rect, reverse_y=True)
+                             rect_centers.append(rect_center)
+                             visual_label_text = f"({page_cnt}.{block_cnt})"
+                             visual_label_texts.append(visual_label_text)
+
+                             # block_type = "text" if block[6] == 0 else "image"
+                             categorize_vectors.append((*block_rect, block_text))
+                         except Exception as block_error:
+                             logger.warning(f"Error processing block {block_cnt} on page {page_cnt}: {str(block_error)}")
+                             continue
+                 except Exception as page_error:
+                     logger.warning(f"Error processing page {page_idx + 1}: {str(page_error)}")
+                     continue
+
+             if not categorize_vectors:
+                 logger.warning("No categorize vectors generated")
+                 return []
+
+             categorizer = DBSCAN_helper(categorize_vectors)
+             categorizer.run()
+
+             # Make sure the lengths match
+             if len(rects) != len(categorizer.labels):
+                 logger.warning(f"Length mismatch: rects={len(rects)}, labels={len(categorizer.labels)}")
+                 # Handle mismatch by creating default labels
+                 if categorizer.labels.size == 0:  # If labels array is empty
+                     result = [(rect, -1) for rect in rects]  # Assign all to noise (-1)
+                 else:
+                     # Truncate to the shorter length
+                     min_len = min(len(rects), len(categorizer.labels))
+                     result = [(rects[i], categorizer.labels[i]) for i in range(min_len)]
+                 return result
+
+             return [(rects[i], categorizer.labels[i]) for i in range(len(rects))]
+         except Exception as e:
+             logger.error(f"Error in get_rect_labels: {str(e)}")
+             return []  # Return empty result on error
+
+     def get_page_stats(self, res):
+         try:
+             if not res:
+                 logger.warning("Empty input to get_page_stats")
+                 return None, None, None  # Handle empty input
+
+             x_counter = Counter(x for _, x in res)
+             y_diffs = Counter(i[3] - i[1] for i, _ in res)
+
+             # Handle empty counters
+             if not x_counter or not y_diffs:
+                 logger.warning("Empty counters in get_page_stats")
+                 return None, None, None
+
+             most_common_x = x_counter.most_common(1)[0][0]
+             threshold = float('inf')
+             min_x = float('inf')
+
+             for i, x in res:
+                 min_x = min(i[0], min_x)
+                 if x != most_common_x and i[0] < threshold:
+                     threshold = i[0]
+
+             if threshold == float('inf'):  # Fallback
+                 threshold = min_x
+
+             min_y, max_y = float('inf'), -float('inf')  # Changed from 0 to -inf
+             for i, x in res:
+                 if x == -1 and i[0] <= threshold:
+                     min_y = min(min_y, i[1])
+                     max_y = max(max_y, i[-1])
+
+             single_y = y_diffs.most_common(1)[0][0] if y_diffs else 0
+
+             # Additional validity checks
+             if min_y == float('inf'):
+                 min_y = None
+             if max_y == -float('inf'):
+                 max_y = None
+
+             # Ensure single_y is positive
+             single_y = abs(single_y) if single_y else 0
+
+             return min_y, max_y, single_y
+         except Exception as e:
+             logger.error(f"Error in get_page_stats: {str(e)}")
+             return None, None, None  # Return default values on error
+
+     def run(self):
+         try:
+             rect_labels = self.get_rect_labels()
+             stats = self.get_page_stats(rect_labels)
+             logger.info(f"Page stats: min_y={stats[0]}, max_y={stats[1]}, single_y={stats[2]}")
+             return stats
+         except Exception as e:
+             logger.error(f"Error in Decomposer.run: {str(e)}")
+             return None, None, None  # Return default values on error
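
Note: a minimal sketch of how the Decomposer output is consumed, assuming PyMuPDF is installed and the code is run from src/; the input path is hypothetical:

    import fitz  # PyMuPDF
    from utils.decompose import Decomposer

    doc = fitz.open("example.pdf")  # hypothetical highlighted PDF
    min_y, max_y, single_y = Decomposer(doc).run()
    # min_y/max_y approximate the vertical extent of the page's text blocks and
    # single_y is the most common block height; extract_highlights() compares a
    # highlight's rectangle against these values to merge highlights that
    # continue across a page break.
    doc.close()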