Marthee committed on
Commit
e7bb4bc
·
verified ·
1 Parent(s): 99d63bf

Create findInitialMarkups.py

Browse files
Files changed (1) hide show
  1. findInitialMarkups.py +572 -0
findInitialMarkups.py ADDED
@@ -0,0 +1,572 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from collections import defaultdict, Counter
3
+ import fitz # PyMuPDF
4
+ import requests
5
+ from io import BytesIO
6
+
7
def normalize_text(text):
    """Return *text* trimmed, lower-cased and with whitespace runs collapsed.

    Args:
        text: The string to normalize, or ``None``.

    Returns:
        The normalized string; ``""`` when *text* is ``None``.
    """
    if text is None:
        return ""
    # Any run of whitespace (spaces, tabs, newlines) becomes one space.
    return re.sub(r'\s+', ' ', text.strip().lower())
11
+
12
def get_spaced_text_from_spans(spans):
    """Join the text of *spans* with single spaces and normalize the result.

    Args:
        spans: Iterable of span dicts, each with a ``"text"`` string.

    Returns:
        The joined text after ``normalize_text`` has been applied.
    """
    joined = " ".join(span["text"].strip() for span in spans)
    return normalize_text(joined)
14
+
15
def is_header(span, most_common_font_size, most_common_color, most_common_font):
    """Decide whether *span* is styled differently from the body text.

    A span counts as a header candidate when it is larger than the baseline
    font size, uses a different font than the baseline font, or is bold.

    Args:
        span: Span dict; ``"size"``, ``"font"`` and ``"bold"`` are read.
        most_common_font_size: Baseline font size, or ``None`` if unknown.
        most_common_color: Unused; kept so existing callers keep working.
        most_common_font: Baseline font name, or ``None`` if unknown.

    Returns:
        ``True`` if any of the three criteria holds, otherwise ``False``.
    """
    fontname = (span.get("font") or "").lower()
    # Bold is detected from the font name or an explicit "bold" key.
    # NOTE(review): PyMuPDF reports bold through span["flags"], not a "bold"
    # key - confirm whether the flags bit should be honoured here as well.
    is_bold = "bold" in fontname or bool(span.get("bold", False))

    size = span.get("size")
    # A missing baseline (e.g. a document without text) disables a criterion
    # instead of raising on a comparison with None.
    is_larger = (
        size is not None
        and most_common_font_size is not None
        and size > most_common_font_size
    )
    uses_other_font = (
        most_common_font is not None
        and fontname != most_common_font.lower()
    )
    return bool(is_larger or uses_other_font or is_bold)
26
+
27
def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
    """Find the existing ``(page, y)`` group key that *span_y* belongs to.

    Args:
        span_y: Vertical position of the span.
        grouped_dict: Mapping (or iterable) of ``(page, y)`` keys.
        pageNum: When given, only keys on this page are considered.
        threshold: Maximum absolute y distance for a key to match.

    Returns:
        The first matching key in iteration order, or a new key
        ``(pageNum, span_y)`` when none is close enough.
    """
    for page, y in grouped_dict:
        if pageNum is not None and page != pageNum:
            continue
        if abs(y - span_y) <= threshold:
            return (page, y)
    return (pageNum, span_y)
34
+
35
+
36
def get_regular_font_size_and_color(doc):
    """Return the most frequent span font size, colour and font name in *doc*.

    Every span of every text block on every page is counted once.

    Args:
        doc: Document object supporting ``len(doc)`` and ``load_page(n)``,
            whose pages provide ``get_text("dict")``.

    Returns:
        A tuple ``(font_size, color, font)``; each element is ``None`` when
        the document contains no spans.
    """
    size_counts = Counter()
    color_counts = Counter()
    font_counts = Counter()

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        for block in page.get_text("dict")["blocks"]:
            # Blocks without a "lines" entry carry no spans and are skipped.
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    size_counts[span['size']] += 1
                    color_counts[span['color']] += 1
                    font_counts[span['font']] += 1

    most_common_font_size = size_counts.most_common(1)[0][0] if size_counts else None
    most_common_color = color_counts.most_common(1)[0][0] if color_counts else None
    most_common_font = font_counts.most_common(1)[0][0] if font_counts else None

    return most_common_font_size, most_common_color, most_common_font
58
+
59
# Patterns used while screening spans; compiled once instead of per span.
_HEADER_ALNUM_RE = re.compile(r'[a-z0-9]')
_HEADER_DATE_RE = re.compile(r'\b(?:\d{1,2}[/-])?\d{1,2}[/-]\d{2,4}\b')
_HEADER_DOT_LEADER_RE = re.compile(r'[.\-]{4,}.*$')


def _is_non_header_text(span_text):
    """Return True when normalized *span_text* can never be a header."""
    return (
        span_text.startswith(('http://www', 'www'))
        # Any mention of "page" is rejected; this also covers "page N of M".
        or 'page' in span_text
        or not _HEADER_ALNUM_RE.search(span_text)
        or 'end of section' in span_text
        or bool(_HEADER_DATE_RE.search(span_text))
        or 'specification:' in span_text
    )


def extract_headers(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin):
    """Collect header candidates from every non-TOC page of *doc*.

    Spans are screened by position (page margins), by text (URLs, page
    footers, dates, ...) and by style (``is_header``). Surviving spans on a
    page are sorted top to bottom, and vertically adjacent spans of similar
    size are merged into one header.

    Args:
        doc: Document supporting ``len(doc)`` and ``load_page(n)``.
        toc_pages: Page indices to skip entirely.
        most_common_font_size: Baseline font size passed to ``is_header``.
        most_common_color: Baseline colour passed to ``is_header``.
        most_common_font: Baseline font name passed to ``is_header``.
        top_margin: Spans starting above this y value are ignored.
        bottom_margin: Spans ending within this distance of the page bottom
            are ignored.

    Returns:
        A tuple ``(headers, top_3_font_sizes, smallest_font_size, spans)``:
        ``headers`` is a list of ``[text, size, pageNum, y]`` sorted by page
        and y; ``top_3_font_sizes`` holds the largest header sizes that occur
        at least 3 times (padded to three entries by repeating the last one,
        empty if there is none); ``smallest_font_size`` is the smallest such
        size or ``None``; ``spans`` is the flat list of raw span dicts used.
    """
    print("Font baseline:", most_common_font_size, most_common_color, most_common_font)

    grouped_headers = defaultdict(list)
    spans = []
    line_merge_threshold = 1.5  # Maximum vertical distance between lines to consider as part of same header

    for pageNum in range(len(doc)):
        if pageNum in toc_pages:
            continue
        page = doc.load_page(pageNum)
        page_height = page.rect.height
        text_instances = page.get_text("dict")

        # First pass: collect all potential header spans of this page.
        potential_header_spans = []
        for block in text_instances['blocks']:
            if block['type'] != 0:
                continue

            for line in block['lines']:
                for span in line['spans']:
                    span_y0 = span['bbox'][1]
                    span_y1 = span['bbox'][3]

                    # Ignore running headers/footers inside the margins.
                    if span_y0 < top_margin or span_y1 > (page_height - bottom_margin):
                        continue

                    span_text = normalize_text(span.get('text', ''))
                    if not span_text or _is_non_header_text(span_text):
                        continue

                    # Drop dot/hyphen leaders and everything after them.
                    cleaned_text = normalize_text(
                        _HEADER_DOT_LEADER_RE.sub('', span_text).strip()
                    )

                    if is_header(span, most_common_font_size, most_common_color, most_common_font):
                        potential_header_spans.append({
                            'text': cleaned_text,
                            'size': span['size'],
                            'pageNum': pageNum,
                            'y0': span_y0,
                            'y1': span_y1,
                            'x0': span['bbox'][0],
                            'x1': span['bbox'][2],
                            'span': span
                        })

        # Sort spans by vertical position (top to bottom). The sort is
        # stable, so spans sharing a y0 keep their extraction order.
        potential_header_spans.sort(key=lambda s: (s['pageNum'], s['y0']))

        # Second pass: merge spans that are vertically close and of similar
        # size into one header.
        i = 0
        while i < len(potential_header_spans):
            current = potential_header_spans[i]
            header_text = current['text']
            header_size = current['size']
            header_page = current['pageNum']
            min_y = current['y0']
            max_y = current['y1']
            spans_group = [current['span']]

            # Look ahead for adjacent lines that continue the same header.
            j = i + 1
            while j < len(potential_header_spans):
                next_span = potential_header_spans[j]
                if (next_span['pageNum'] == header_page and
                        next_span['y0'] - max_y < line_merge_threshold and
                        abs(next_span['size'] - header_size) < 0.5):
                    header_text += " " + next_span['text']
                    max_y = next_span['y1']
                    spans_group.append(next_span['span'])
                    j += 1
                else:
                    break

            grouped_headers[(header_page, min_y)].append({
                "text": header_text.strip(),
                "size": header_size,
                "pageNum": header_page,
                "spans": spans_group
            })
            spans.extend(spans_group)
            i = j  # Skip the spans that were merged into this header

    # Flatten the groups into the final list, ordered by (page, y).
    headers = [
        [group['text'], group['size'], group['pageNum'], y]
        for (_page, y), header_groups in sorted(grouped_headers.items())
        for group in header_groups
    ]

    font_size_counts = Counter(size for _, size, _, _ in headers)

    # Only font sizes used by at least 3 headers are considered reliable.
    valid_font_sizes = [size for size, count in font_size_counts.items() if count >= 3]

    # Largest first; pad to three entries by repeating the last size so that
    # callers unpacking three values also work with one or two sizes.
    top_3_font_sizes = sorted(valid_font_sizes, reverse=True)[:3]
    while 0 < len(top_3_font_sizes) < 3:
        top_3_font_sizes.append(top_3_font_sizes[-1])

    smallest_font_size = min(valid_font_sizes) if valid_font_sizes else None

    print("Smallest font size in headers:", smallest_font_size)

    return headers, top_3_font_sizes, smallest_font_size, spans
189
+
190
+ import re
191
+ import difflib
192
+
193
def is_numbered(text):
    """Return True when *text*, ignoring surrounding whitespace, starts with a digit."""
    return bool(re.match(r'^\d', text.strip()))
195
+
196
def is_similar(a, b, threshold=0.85):
    """Return True when the ``difflib`` similarity ratio of *a* and *b* exceeds *threshold*.

    Args:
        a: First sequence (typically a string).
        b: Second sequence.
        threshold: Ratio that must be strictly exceeded.
    """
    ratio = difflib.SequenceMatcher(None, a, b).ratio()
    return ratio > threshold
198
+
199
def normalize(text):
    """Return *text* lower-cased, without dot leaders, with single spaces.

    Runs of two or more dots are removed, whitespace runs are collapsed to
    one space, and the result is stripped.
    """
    lowered = text.lower()
    without_dots = re.sub(r'\.{2,}', '', lowered)  # remove long dots
    single_spaced = re.sub(r'\s+', ' ', without_dots)  # replace multiple spaces with one
    return single_spaced.strip()
204
+
205
def clean_toc_entry(toc_text):
    """Remove page numbers and formatting from TOC entries.

    Everything from the first run of dots/whitespace that is followed by
    digits up to the end of the string is cut off, then leading and trailing
    dots and spaces are stripped. Because the cut happens at the first such
    run, a digit group in the middle of a title also truncates it (for
    example ``"part 2 products"`` becomes ``"part"``). The function is
    idempotent.
    """
    return re.sub(r'[\.\s]+\d+.*$', '', toc_text).strip('. ')
209
+
210
def _enforce_nesting(node_list, parent_level=-1):
    """Make every node's level strictly greater than its parent's level."""
    for node in node_list:
        if node['level'] <= parent_level:
            node['level'] = parent_level + 1
        _enforce_nesting(node['children'], node['level'])


def _filter_installation_headers(node_list):
    """Drop level 1 nodes about installation/execution/misc items, with their subtrees."""
    kept = []
    for node in node_list:
        if node['level'] == 1 and (
                'installation' in node['text'].lower()
                or 'execution' in node['text'].lower()
                or 'miscellaneous items' in node['text'].lower()):
            continue
        node['children'] = _filter_installation_headers(node['children'])
        kept.append(node)
    return kept


def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin=70, bottom_margin=70):
    """Build a nested header tree for *doc*.

    Headers come from ``extract_headers``. Headers of the largest reliable
    font size whose text matches a line of the TOC pages become level 0;
    the remaining headers get levels from their position and font size and
    are nested below the preceding header of a lower level.

    Args:
        doc: Document supporting ``len(doc)`` and ``load_page(n)``.
        toc_pages: Page indices that make up the table of contents.
        most_common_font_size: Baseline (body) font size, or ``None``.
        most_common_color: Baseline colour, forwarded to ``extract_headers``.
        most_common_font: Baseline font, forwarded to ``extract_headers``.
        top_margin: Headers above this y value are ignored.
        bottom_margin: Headers within this distance of the page bottom are
            ignored.

    Returns:
        A list of root header dicts. Each dict has, among others, the keys
        ``text``, ``page``, ``y``, ``size``, ``level`` and ``children``
        (a list of nested header dicts). Level 0 roots without children and
        level 1 installation/execution/miscellaneous sections are removed.
    """
    headers_list, top_3_font_sizes, smallest_font_size, spans = extract_headers(
        doc,
        toc_pages=toc_pages,
        most_common_font_size=most_common_font_size,
        most_common_color=most_common_color,
        most_common_font=most_common_font,
        top_margin=top_margin,
        bottom_margin=bottom_margin
    )

    # Collect the TOC lines, keyed by normalized text, to recognise level 0
    # headers later on.
    # NOTE(review): a TOC line consisting only of dots normalizes to "" and
    # "" is a substring of every header text - confirm whether such lines
    # should be skipped here.
    toc_entries = {}
    for pno in toc_pages:
        page = doc.load_page(pno)
        for line in page.get_text().split('\n'):
            clean_line = line.strip()
            if clean_line:
                toc_entries[normalize(clean_line)] = clean_line  # Store original text

    # Step 1: Collect and filter potential headers
    headers = []
    for h in headers_list:
        text, size, pageNum, y = h[:4]
        page_height = doc.load_page(pageNum).rect.height

        # Skip margin areas
        if y < top_margin or y > (page_height - bottom_margin):
            continue

        norm_text = normalize(text)
        # Without a baseline size (document without text) no size filter
        # can be applied, so it is skipped instead of comparing with None.
        size_ok = most_common_font_size is None or size >= most_common_font_size
        if len(norm_text) > 2 and size_ok:
            headers.append({
                "text": text,
                "page": pageNum,
                "y": y,
                "size": size,
                # NOTE(review): entries produced by extract_headers in this
                # file have four fields, so bold/color/font fall back to
                # their defaults - confirm whether style should be carried.
                "bold": h[4] if len(h) > 4 else False,
                "color": h[6] if len(h) > 6 else None,
                "font": h[7] if len(h) > 7 else None,
                "children": [],
                "is_numbered": is_numbered(text),
                "original_size": size,
                "norm_text": norm_text,
                "level": -1  # Initialize as unassigned
            })

    # Sort by page and vertical position
    headers.sort(key=lambda h: (h['page'], h['y']))

    # Step 2: Detect consecutive headers and assign levels
    i = 0
    while i < len(headers) - 1:
        current = headers[i]
        next_header = headers[i + 1]

        # Same page and less than 20pt apart: treat as parent/child pair.
        if (current['page'] == next_header['page'] and
                abs(current['y'] - next_header['y']) < 20):

            # Case 1: Both unassigned - make current level 1 and next level 2
            if current['level'] == -1 and next_header['level'] == -1:
                current['level'] = 1
                next_header['level'] = 2
                i += 1  # Skip next header since we processed it

            # Case 2: Current unassigned, next assigned - make current one level above
            elif current['level'] == -1 and next_header['level'] != -1:
                current['level'] = max(1, next_header['level'] - 1)

            # Case 3: Current assigned, next unassigned - make next one level below
            elif current['level'] != -1 and next_header['level'] == -1:
                next_header['level'] = current['level'] + 1
                i += 1  # Skip next header since we processed it
        i += 1

    # Step 3: Identify level 0 headers (largest size and present in the TOC).
    # When no font size is frequent enough, fall back to the largest header
    # size instead of failing on an empty list.
    if top_3_font_sizes:
        max_size = top_3_font_sizes[0]
    else:
        max_size = max((h['size'] for h in headers), default=0)
    print(max_size)

    toc_text_match = set()
    toc_matches = []
    for h in headers:
        # Only headers of exactly the largest size can match a TOC entry.
        if h['size'] != max_size:
            continue
        norm_text = h['norm_text']

        # A TOC line matches when either text contains the other (an exact
        # match is a special case of that).
        matching_toc_texts = [
            toc_text
            for toc_norm, toc_text in toc_entries.items()
            if len(toc_text) > 4 and (norm_text in toc_norm or toc_norm in norm_text)
        ]
        if not matching_toc_texts:
            continue

        best_match = max(matching_toc_texts,
                         key=lambda x: (len(x), -len(x.replace(norm_text, ''))))
        h['text'] = normalize_text(clean_toc_entry(best_match))
        h['level'] = 0
        if h['text'] not in toc_text_match:
            toc_matches.append(h)
            toc_text_match.add(h['text'])

    # Remove duplicates - keep only first occurrence of each level 0 header
    unique_level0 = []
    seen_level0 = set()
    for h in toc_matches:
        # Use the cleaned text for duplicate checking
        cleaned_text = clean_toc_entry(h['text'])
        norm_cleaned_text = normalize(cleaned_text)

        if norm_cleaned_text not in seen_level0:
            seen_level0.add(norm_cleaned_text)
            # Update the header text with cleaned version
            h['text'] = cleaned_text
            unique_level0.append(h)
            print(f"Added unique header: {cleaned_text} (normalized: {norm_cleaned_text})")

    # Step 4: Within each level 0 section, the first unassigned header
    # defines the level 1 format; headers matching it become level 1, the
    # others level 2. Positions are taken by identity via enumerate, so
    # headers that merely compare equal cannot be confused.
    level0_positions = [idx for idx, h in enumerate(headers) if h['level'] == 0]
    section_ends = level0_positions[1:] + [len(headers)]

    for start_idx, end_idx in zip(level0_positions, section_ends):
        group = headers[start_idx:end_idx]
        level1_candidates = [h for h in group[1:] if h['level'] == -1]

        if not level1_candidates:
            continue

        # The first candidate is our reference level 1
        reference = level1_candidates[0]
        reference_numbered = is_numbered(reference['text'])

        for h in level1_candidates:
            same_format = (
                h['font'] == reference['font'] and
                h['color'] == reference['color'] and
                is_numbered(h['text']) == reference_numbered and
                abs(h['size'] - reference['size']) <= 0.1 and
                h['bold'] == reference['bold']
            )
            h['level'] = 1 if same_format else 2

    # Step 5: Assign levels to remaining unassigned headers
    unassigned = [h for h in headers if h['level'] == -1]
    if unassigned:
        # Cluster by size with a 10% tolerance
        sizes = sorted({h['size'] for h in unassigned}, reverse=True)
        clusters = []

        for size in sizes:
            found_cluster = False
            for cluster in clusters:
                if abs(size - cluster['size']) <= max(size, cluster['size']) * 0.1:
                    cluster['headers'].extend([h for h in unassigned if abs(h['size'] - size) <= size * 0.1])
                    found_cluster = True
                    break
            if not found_cluster:
                clusters.append({
                    'size': size,
                    'headers': [h for h in unassigned if abs(h['size'] - size) <= size * 0.1]
                })

        # Assign levels starting from 1; a header that appears in several
        # clusters ends up with the level of the last one.
        clusters.sort(key=lambda x: -x['size'])
        for i, cluster in enumerate(clusters):
            for h in cluster['headers']:
                base_level = i + 1
                if h['bold']:
                    base_level = max(1, base_level - 1)
                h['level'] = base_level

    # Step 6: Build hierarchy
    root = []
    stack = []

    # Headers repeating the text of a unique level 0 header are level 0 too;
    # their repeats are skipped below via added_level0.
    unique_level0_texts = {h['norm_text'] for h in unique_level0}
    for h in headers:
        if h['norm_text'] in unique_level0_texts and h not in unique_level0:
            h['level'] = 0

    # unique_level0 entries are also part of headers, so they occur twice
    # here; the second occurrence is dropped by the added_level0 check.
    all_headers = unique_level0 + headers
    all_headers.sort(key=lambda h: (h['page'], h['y']))

    # Track which level 0 headers we've already added
    added_level0 = set()

    for header in all_headers:
        if header['level'] < 0:
            continue

        if header['level'] == 0:
            norm_text = header['norm_text']
            if norm_text in added_level0:
                continue
            added_level0.add(norm_text)

        # Pop stack until we find a parent
        while stack and stack[-1]['level'] >= header['level']:
            stack.pop()

        if stack:
            stack[-1]['children'].append(header)
        else:
            root.append(header)

        stack.append(header)

    # Step 7: Enforce proper nesting, then drop childless level 0 roots and
    # the excluded level 1 sections.
    _enforce_nesting(root)
    root = [h for h in root if not (h['level'] == 0 and not h['children'])]
    root = _filter_installation_headers(root)
    return root
492
+
493
def adjust_levels_if_level0_not_in_toc(doc, toc_pages, root):
    """Shift all levels down by one when no level 0 root occurs in the TOC.

    The text of all *toc_pages* is concatenated and normalized. If at least
    one root header of level 0 has its normalized text in that TOC text,
    nothing changes. Otherwise the level of every node in the tree is
    increased by one, in place.

    Args:
        doc: Document providing ``load_page(n)``; only used for *toc_pages*.
        toc_pages: Page indices of the table of contents.
        root: List of header dicts with ``level``, ``text`` and ``children``.

    Returns:
        None. *root* is modified in place.
    """
    def normalize(text):
        return re.sub(r'\s+', ' ', text.strip().lower())

    toc_text = "".join(doc.load_page(pno).get_text() for pno in toc_pages)
    toc_text_normalized = normalize(toc_text)

    def is_level0_in_toc_text(header):
        return header['level'] == 0 and normalize(header['text']) in toc_text_normalized

    if any(is_level0_in_toc_text(h) for h in root):
        return  # No change needed

    def increase_levels(node_list):
        for node in node_list:
            node['level'] += 1
            increase_levels(node['children'])

    # The helper was defined but never invoked, which made this function a
    # no-op; apply the shift to the whole tree.
    increase_levels(root)
513
+
514
def assign_numbers_to_headers(headers, prefix=None):
    """Store a dotted outline number (``"1"``, ``"1.1"``, ...) on each header.

    Args:
        headers: List of header dicts with a ``"children"`` list; each dict
            receives a ``"number"`` key, in place.
        prefix: Number of the parent header, or ``None`` for the top level.
    """
    for position, header in enumerate(headers, 1):
        number = f"{prefix}.{position}" if prefix else str(position)
        header["number"] = number
        assign_numbers_to_headers(header["children"], number)
519
+
520
def print_tree_with_numbers(headers, listofheaders, indent=0):
    """Print the header tree depth-first and collect the printed lines.

    Args:
        headers: List of header dicts with ``text``, ``level``, ``page`` and
            ``children``; ``number`` and ``original_size`` are optional.
        listofheaders: List that every printed line is appended to.
        indent: Nesting depth; one leading space is emitted per level.

    Returns:
        The same *listofheaders* object, extended in place.
    """
    for header in headers:
        if 'original_size' in header:
            size_info = f"size:{header['original_size']:.1f}"
        else:
            size_info = ""
        line = (
            " " * indent +
            f"{header.get('number', '?')} {header['text']} " +
            f"(Level {header['level']}, p:{header['page']+1}, {size_info})"
        )
        print(line)
        listofheaders.append(line)
        print_tree_with_numbers(header["children"], listofheaders, indent + 1)
    return listofheaders
532
+
533
# Fallback for the dot-leader pattern, which this file references as
# ``dot_pattern`` but never defines.
# NOTE(review): three or more consecutive dots is an assumption about the
# intended pattern - confirm against the original definition.
_DEFAULT_DOT_LEADER_PATTERN = re.compile(r'\.{3,}')


def get_toc_page_numbers(doc, max_pages_to_check=15):
    """Return the page indices that make up the table of contents.

    A page counts as a TOC page when at least 3 of its text lines contain a
    dot leader. Only the first *max_pages_to_check* pages are inspected.

    Args:
        doc: Document supporting ``len(doc)`` and ``load_page(n)``.
        max_pages_to_check: Number of leading pages to inspect.

    Returns:
        ``[0, 1, ..., last_toc_page]`` (every page up to and including the
        last detected TOC page), or an empty list when none is detected.
    """
    # Prefer a module-level ``dot_pattern`` if one has been provided, so an
    # externally configured pattern keeps working.
    pattern = globals().get("dot_pattern") or _DEFAULT_DOT_LEADER_PATTERN

    toc_pages = []
    for page_num in range(min(len(doc), max_pages_to_check)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]

        dot_line_count = 0
        for block in blocks:
            for line in block.get("lines", []):
                line_text = get_spaced_text_from_spans(line["spans"]).strip()
                if pattern.search(line_text):
                    dot_line_count += 1

        if dot_line_count >= 3:
            toc_pages.append(page_num)

    return list(range(0, toc_pages[-1] + 1)) if toc_pages else toc_pages
550
+
551
+
552
def headersfrompdf(filePath):
    """Download a PDF and return its numbered header outline as text lines.

    Args:
        filePath: URL of the PDF. In URLs containing ``http`` or
            ``dropbox``, ``dl=0`` is replaced by ``dl=1``.

    Returns:
        The list of formatted header lines produced by
        ``print_tree_with_numbers``.

    Raises:
        requests.RequestException: If the download fails, times out or
            returns an HTTP error status.
        ValueError: If the response body is empty.
    """
    pdf_path = filePath
    if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
        pdf_path = pdf_path.replace('dl=0', 'dl=1')

    # A timeout keeps a stalled server from blocking the caller forever.
    response = requests.get(pdf_path, timeout=60)
    # Fail on 4xx/5xx instead of handing an error page to the PDF parser.
    response.raise_for_status()
    # Check the payload itself: a BytesIO object is truthy even when empty.
    if not response.content:
        raise ValueError("No valid PDF content found.")

    pdf_content = BytesIO(response.content)
    # The context manager closes the document even if processing fails.
    with fitz.open(stream=pdf_content, filetype="pdf") as doc:
        most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
        toc_pages = get_toc_page_numbers(doc)
        hierarchy = build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font)
        assign_numbers_to_headers(hierarchy)
        listofheaders = print_tree_with_numbers(hierarchy, listofheaders=[])
    print(listofheaders)
    return listofheaders
570
+
571
+
572
+