Spaces:

Marthee
/

HeadersFilteringFind1

Sleeping

App Files Files Community

HeadersFilteringFind1 / findInitialMarkups.py

Marthee

Update findInitialMarkups.py

0b34f32 verified 8 months ago

raw

history blame contribute delete

22.8 kB

	import re
	from collections import defaultdict, Counter
	import fitz # PyMuPDF
	import requests
	from io import BytesIO

	def normalize_text(text):
	    """Return *text* trimmed, lower-cased and with whitespace runs collapsed.

	    ``None`` is mapped to the empty string so callers can pass optional
	    values straight through.
	    """
	    if text is None:
	        return ""
	    lowered = text.strip().lower()
	    # Any run of whitespace (spaces, tabs, newlines) becomes one space.
	    return re.sub(r'\s+', ' ', lowered)

	def get_spaced_text_from_spans(spans):
	    """Join the stripped ``"text"`` of every span with single spaces, then normalize.

	    Each element of *spans* must be a mapping with a ``"text"`` key; the
	    joined string is passed through ``normalize_text``.
	    """
	    pieces = [span["text"].strip() for span in spans]
	    return normalize_text(" ".join(pieces))

	def is_header(span, most_common_font_size, most_common_color, most_common_font):
	    """Tell whether *span* stands out from the document's body text.

	    A span counts as a header when any of these holds:
	    - its size is larger than ``most_common_font_size``;
	    - its font name differs (case-insensitively) from ``most_common_font``;
	    - its font name contains "bold" or it carries a truthy ``"bold"`` entry.

	    ``most_common_color`` is accepted but not used by the check.
	    """
	    font_lower = span.get("font", "").lower()
	    bold_flag = "bold" in font_lower or span.get("bold", False)

	    if span["size"] > most_common_font_size:
	        return True
	    if span["font"].lower() != most_common_font.lower():
	        return True
	    return bold_flag

	def add_span_to_nearest_group(span_y, grouped_dict, pageNum=None, threshold=0.5):
	    """Pick the ``(page, y)`` group key that *span_y* belongs to.

	    The keys of *grouped_dict* are scanned in iteration order; the first key
	    whose ``y`` lies within *threshold* of *span_y* is returned (keys on other
	    pages are ignored when *pageNum* is given).  When nothing matches, a new
	    key ``(pageNum, span_y)`` is returned.  *grouped_dict* is only read.
	    """
	    matching_keys = (
	        (p, y)
	        for (p, y) in grouped_dict
	        if (pageNum is None or p == pageNum) and abs(y - span_y) <= threshold
	    )
	    return next(matching_keys, (pageNum, span_y))


	def get_regular_font_size_and_color(doc):
	    """Find the dominant (body-text) font size, color and font name of *doc*.

	    Every span of every page is inspected via ``page.get_text("dict")``;
	    blocks without a ``"lines"`` entry are ignored.

	    Returns ``(size, color, font)``; each element is ``None`` when the
	    document contains no spans at all.
	    """
	    def _iter_spans():
	        # Walk pages -> blocks -> lines -> spans in document order.
	        for page_index in range(len(doc)):
	            blocks = doc.load_page(page_index).get_text("dict")["blocks"]
	            for block in blocks:
	                if "lines" not in block:
	                    continue
	                for line in block["lines"]:
	                    yield from line["spans"]

	    def _dominant(values):
	        return Counter(values).most_common(1)[0][0] if values else None

	    seen_sizes, seen_colors, seen_fonts = [], [], []
	    for text_span in _iter_spans():
	        seen_sizes.append(text_span['size'])
	        seen_colors.append(text_span['color'])
	        seen_fonts.append(text_span['font'])

	    return _dominant(seen_sizes), _dominant(seen_colors), _dominant(seen_fonts)

	def extract_headers(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin):
	    """Collect header candidates from every non-TOC page of *doc*.

	    A span becomes a candidate when it lies between the top and bottom
	    margins, survives the noise filters below and ``is_header`` accepts it.
	    Per page, candidates are sorted top to bottom and vertically adjacent
	    candidates of (nearly) the same size are merged into one header.

	    Args:
	        doc: document object offering ``len()``, ``load_page(n)``, and pages
	            with ``rect.height`` and ``get_text("dict")``.
	        toc_pages: page indices to skip entirely.
	        most_common_font_size, most_common_color, most_common_font: body-text
	            baseline forwarded to ``is_header``.
	        top_margin, bottom_margin: vertical bands (in page units) that are
	            ignored at the top and bottom of each page.

	    Returns:
	        ``(headers, top_3_font_sizes, smallest_font_size, spans)`` where
	        - ``headers`` is a list of ``[text, size, pageNum, y]`` ordered by
	          ``(pageNum, y)``;
	        - ``top_3_font_sizes`` holds the largest header sizes that occur at
	          least three times, in descending order.  When only one or two sizes
	          qualify the last one is repeated so the list always has three
	          entries; it is empty when no size qualifies;
	        - ``smallest_font_size`` is the smallest qualifying size or ``None``;
	        - ``spans`` is the flat list of raw span dicts used for the headers.
	    """
	    print("Font baseline:", most_common_font_size, most_common_color, most_common_font)

	    # Patterns are compiled once instead of on every span.
	    date_pattern = re.compile(r'\b(?:\d{1,2}[/-])?\d{1,2}[/-]\d{2,4}\b')
	    leader_pattern = re.compile(r'[.\-]{4,}.*$')
	    url_prefixes = ('http://', 'https://', 'www')
	    # Maximum vertical distance between lines to consider as part of same header
	    line_merge_threshold = 1.5

	    grouped_headers = defaultdict(list)
	    spans = []

	    for pageNum in range(len(doc)):
	        if pageNum in toc_pages:
	            continue
	        page = doc.load_page(pageNum)
	        page_height = page.rect.height

	        # First pass: collect all potential header spans of this page.
	        potential_header_spans = []
	        for block in page.get_text("dict")['blocks']:
	            # Only blocks of type 0 are scanned; other block types are skipped.
	            if block['type'] != 0:
	                continue

	            for line in block['lines']:
	                for span in line['spans']:
	                    span_y0 = span['bbox'][1]
	                    span_y1 = span['bbox'][3]

	                    # Ignore running headers/footers inside the margins.
	                    if span_y0 < top_margin or span_y1 > (page_height - bottom_margin):
	                        continue

	                    span_text = normalize_text(span.get('text', ''))
	                    if not span_text:
	                        continue
	                    if span_text.startswith(url_prefixes):
	                        continue
	                    # Noise: page markers, text without any letter/digit,
	                    # section terminators, dates and "specification:" lines.
	                    # ('page' in span_text already covers "page N of M".)
	                    if (
	                        'page' in span_text
	                        or not re.search(r'[a-z0-9]', span_text)
	                        or 'end of section' in span_text
	                        or date_pattern.search(span_text)
	                        or 'specification:' in span_text
	                    ):
	                        continue

	                    if not is_header(span, most_common_font_size, most_common_color, most_common_font):
	                        continue

	                    potential_header_spans.append({
	                        # Drop dot/dash leaders and whatever follows them.
	                        'text': normalize_text(leader_pattern.sub('', span_text)),
	                        'size': span['size'],
	                        'y0': span_y0,
	                        'y1': span_y1,
	                        'span': span,
	                    })

	        # Sort spans by vertical position (top to bottom).
	        potential_header_spans.sort(key=lambda s: s['y0'])

	        # Second pass: merge spans that are vertically close and similarly sized.
	        i = 0
	        while i < len(potential_header_spans):
	            current = potential_header_spans[i]
	            header_size = current['size']
	            min_y = current['y0']
	            max_y = current['y1']
	            text_parts = [current['text']]
	            spans_group = [current['span']]

	            j = i + 1
	            while j < len(potential_header_spans):
	                next_span = potential_header_spans[j]
	                too_far = next_span['y0'] - max_y >= line_merge_threshold
	                different_size = abs(next_span['size'] - header_size) >= 0.5
	                if too_far or different_size:
	                    break
	                text_parts.append(next_span['text'])
	                max_y = next_span['y1']
	                spans_group.append(next_span['span'])
	                j += 1

	            grouped_headers[(pageNum, min_y)].append({
	                "text": " ".join(text_parts).strip(),
	                "size": header_size,
	                "pageNum": pageNum,
	                "spans": spans_group,
	            })
	            spans.extend(spans_group)
	            i = j  # Skip the spans that were merged into this header.

	    # Prepare final headers list, ordered by (page, y).
	    headers = [
	        [group['text'], group['size'], group['pageNum'], y]
	        for (_page_num, y), header_groups in sorted(grouped_headers.items())
	        for group in header_groups
	    ]

	    font_size_counts = Counter(size for _, size, _, _ in headers)

	    # Keep font sizes that appear at least 3 times.
	    valid_font_sizes = [size for size, count in font_size_counts.items() if count >= 3]

	    top_3_font_sizes = sorted(valid_font_sizes, reverse=True)[:3]
	    # Callers unpack three values, so pad by repeating the smallest size when
	    # only one or two sizes qualify.
	    while top_3_font_sizes and len(top_3_font_sizes) < 3:
	        top_3_font_sizes.append(top_3_font_sizes[-1])

	    smallest_font_size = min(valid_font_sizes) if valid_font_sizes else None

	    print("Smallest font size in headers:", smallest_font_size)

	    return headers, top_3_font_sizes, smallest_font_size, spans

	import re
	import difflib

	def is_numbered(text):
	    """Return True when *text*, ignoring leading whitespace, starts with a digit."""
	    stripped = text.strip()
	    return re.match(r'^\d', stripped) is not None

	def is_similar(a, b, threshold=0.85):
	    """Return True when the ``difflib`` similarity ratio of *a* and *b* is strictly above *threshold*."""
	    matcher = difflib.SequenceMatcher(None, a, b)
	    return matcher.ratio() > threshold

	def normalize(text):
	    """Lower-case *text*, drop runs of two or more dots, collapse whitespace and trim."""
	    result = text.lower()
	    substitutions = (
	        (r'\.{2,}', ''),   # remove long dots
	        (r'\s+', ' '),     # replace multiple spaces with one
	    )
	    for pattern, replacement in substitutions:
	        result = re.sub(pattern, replacement, result)
	    return result.strip()

	def clean_toc_entry(toc_text):
	    """Remove the trailing page number and dot leaders from a TOC entry.

	    Only a final "dots/whitespace + digits" group is stripped, so numbering
	    inside the entry survives: ``"1.2 General ..... 5"`` becomes
	    ``"1.2 General"``.  (The previous pattern was not anchored to the end of
	    the entry and cut at the first digit group, reducing that example to
	    ``"1"``.)  Leftover leading/trailing dots and spaces are trimmed.
	    """
	    without_page_number = re.sub(r'[\.\s]+\d+\s*$', '', toc_text)
	    return without_page_number.strip('. ')

	# Level-1 headers whose text contains one of these words are dropped
	# (together with their children) from the final tree.
	_EXCLUDED_LEVEL1_KEYWORDS = ('installation', 'execution', 'miscellaneous items')


	def _collect_toc_entries(doc, toc_pages):
	    """Map every normalized TOC line to its original (stripped) text."""
	    toc_entries = {}
	    for pno in toc_pages:
	        for line in doc.load_page(pno).get_text().split('\n'):
	            clean_line = line.strip()
	            norm_line = normalize(clean_line)
	            # A line made only of dot leaders normalizes to "", and "" is a
	            # substring of every header text, so it must never become a key.
	            if norm_line:
	                toc_entries[norm_line] = clean_line  # Store original text
	    return toc_entries


	def _collect_candidate_headers(doc, headers_list, most_common_font_size, top_margin, bottom_margin):
	    """Turn extracted header rows into header dicts sorted by (page, y)."""
	    page_heights = {}  # page number -> height, so each page is loaded once
	    headers = []
	    for h in headers_list:
	        text, size, pageNum, y = h[:4]
	        if pageNum not in page_heights:
	            page_heights[pageNum] = doc.load_page(pageNum).rect.height

	        # Skip margin areas
	        if y < top_margin or y > (page_heights[pageNum] - bottom_margin):
	            continue

	        norm_text = normalize(text)
	        if len(norm_text) > 2 and size >= most_common_font_size:
	            headers.append({
	                "text": text,
	                "page": pageNum,
	                "y": y,
	                "size": size,
	                # Rows with only four fields get the defaults below.
	                "bold": h[4] if len(h) > 4 else False,
	                "color": h[6] if len(h) > 6 else None,
	                "font": h[7] if len(h) > 7 else None,
	                "children": [],
	                "is_numbered": is_numbered(text),
	                "original_size": size,
	                "norm_text": norm_text,
	                "level": -1,  # -1 means "not assigned yet"
	            })

	    headers.sort(key=lambda h: (h['page'], h['y']))
	    return headers


	def _assign_consecutive_levels(headers):
	    """Give provisional levels to headers that sit within 20 units of each other on a page."""
	    i = 0
	    while i < len(headers) - 1:
	        current = headers[i]
	        next_header = headers[i + 1]

	        if (current['page'] == next_header['page'] and
	                abs(current['y'] - next_header['y']) < 20):
	            if current['level'] == -1 and next_header['level'] == -1:
	                # Both unassigned: first is level 1, second level 2.
	                current['level'] = 1
	                next_header['level'] = 2
	                i += 1  # Skip next header since we processed it
	            elif current['level'] == -1:
	                # Next already assigned: current goes one level above it.
	                current['level'] = max(1, next_header['level'] - 1)
	            elif next_header['level'] == -1:
	                # Current already assigned: next goes one level below it.
	                next_header['level'] = current['level'] + 1
	                i += 1  # Skip next header since we processed it
	        i += 1


	def _mark_level0_from_toc(headers, toc_entries, max_size):
	    """Promote largest-size headers that match a TOC line to level 0.

	    A header matches a TOC line when one normalized text contains the other
	    and the TOC line is longer than four characters.  The matched header takes
	    the cleaned TOC text.  Returns the promoted headers, first occurrence per
	    resulting text only.
	    """
	    toc_matches = []
	    seen_texts = set()
	    for h in headers:
	        if h['size'] != max_size:
	            continue
	        norm_text = h['norm_text']
	        matching_toc_texts = [
	            toc_text
	            for toc_norm, toc_text in toc_entries.items()
	            if len(toc_text) > 4 and (norm_text in toc_norm or toc_norm in norm_text)
	        ]
	        if not matching_toc_texts:
	            continue

	        best_match = max(
	            matching_toc_texts,
	            key=lambda x: (len(x), -len(x.replace(norm_text, ''))),
	        )
	        h['text'] = normalize_text(clean_toc_entry(best_match))
	        h['level'] = 0
	        if h['text'] not in seen_texts:
	            seen_texts.add(h['text'])
	            toc_matches.append(h)
	    return toc_matches


	def _dedupe_level0(toc_matches):
	    """Keep the first level-0 header per cleaned, normalized text."""
	    unique_level0 = []
	    seen_level0 = set()
	    for h in toc_matches:
	        cleaned_text = clean_toc_entry(h['text'])
	        norm_cleaned_text = normalize(cleaned_text)
	        if norm_cleaned_text in seen_level0:
	            continue
	        seen_level0.add(norm_cleaned_text)
	        h['text'] = cleaned_text
	        unique_level0.append(h)
	        print(f"Added unique header: {cleaned_text} (normalized: {norm_cleaned_text})")
	    return unique_level0


	def _format_signature(header):
	    """Return the formatting traits used to compare a header with the level-1 reference."""
	    return {
	        'font': header['font'],
	        'color': header['color'],
	        'starts_with_number': is_numbered(header['text']),
	        'size': header['size'],
	        'bold': header['bold'],
	    }


	def _assign_levels_from_first_child(headers):
	    """Within each level-0 section, use the first unassigned header as the level-1 template.

	    Unassigned headers formatted like the template become level 1, the other
	    unassigned headers of the section become level 2.
	    """
	    # Positions are taken by index rather than list.index(), which compares
	    # dicts by value and could land on an earlier, equal-looking header.
	    level0_positions = [i for i, h in enumerate(headers) if h['level'] == 0]
	    section_ends = level0_positions[1:] + [len(headers)]

	    for start, end in zip(level0_positions, section_ends):
	        level1_candidates = [h for h in headers[start + 1:end] if h['level'] == -1]
	        if not level1_candidates:
	            continue

	        reference = _format_signature(level1_candidates[0])
	        for h in level1_candidates:
	            current = _format_signature(h)
	            same_format = (
	                current['font'] == reference['font'] and
	                current['color'] == reference['color'] and
	                current['starts_with_number'] == reference['starts_with_number'] and
	                abs(current['size'] - reference['size']) <= 0.1 and
	                current['bold'] == reference['bold']
	            )
	            h['level'] = 1 if same_format else 2


	def _assign_levels_by_size_cluster(headers):
	    """Assign levels (from 1) to still-unassigned headers by clustering font sizes.

	    Sizes within 10% of a cluster's seed size join that cluster.  A header may
	    end up in more than one cluster; the level of the last cluster processed
	    wins, as before.
	    """
	    unassigned = [h for h in headers if h['level'] == -1]
	    if not unassigned:
	        return

	    sizes = sorted({h['size'] for h in unassigned}, reverse=True)
	    clusters = []
	    for size in sizes:
	        near_size = [h for h in unassigned if abs(h['size'] - size) <= size * 0.1]
	        for cluster in clusters:
	            if abs(size - cluster['size']) <= max(size, cluster['size']) * 0.1:
	                cluster['headers'].extend(near_size)
	                break
	        else:
	            clusters.append({'size': size, 'headers': near_size})

	    clusters.sort(key=lambda c: -c['size'])
	    for i, cluster in enumerate(clusters):
	        for h in cluster['headers']:
	            base_level = i + 1
	            if h['bold']:
	                base_level = max(1, base_level - 1)
	            h['level'] = base_level


	def _build_tree(headers, unique_level0):
	    """Nest headers by level, in (page, y) order, and return the root list."""
	    # Later repeats of a level-0 title are marked level 0 too so the
	    # duplicate check below drops them instead of nesting them.
	    unique_level0_texts = {h['norm_text'] for h in unique_level0}
	    for h in headers:
	        if h['norm_text'] in unique_level0_texts:
	            h['level'] = 0

	    # unique_level0 goes first so that, on equal (page, y), the stable sort
	    # keeps the TOC-matched header ahead of its duplicates.
	    all_headers = unique_level0 + headers
	    all_headers.sort(key=lambda h: (h['page'], h['y']))

	    root = []
	    stack = []
	    added_level0 = set()
	    for header in all_headers:
	        if header['level'] < 0:
	            continue

	        if header['level'] == 0:
	            if header['norm_text'] in added_level0:
	                continue
	            added_level0.add(header['norm_text'])

	        # Pop until the top of the stack is a strictly shallower header.
	        while stack and stack[-1]['level'] >= header['level']:
	            stack.pop()

	        if stack:
	            stack[-1]['children'].append(header)
	        else:
	            root.append(header)
	        stack.append(header)
	    return root


	def _enforce_nesting(node_list, parent_level=-1):
	    """Make every node at least one level deeper than its parent."""
	    for node in node_list:
	        if node['level'] <= parent_level:
	            node['level'] = parent_level + 1
	        _enforce_nesting(node['children'], node['level'])


	def _drop_excluded_level1(node_list):
	    """Recursively remove level-1 nodes whose text contains an excluded keyword."""
	    kept = []
	    for node in node_list:
	        lowered = node['text'].lower()
	        if node['level'] == 1 and any(word in lowered for word in _EXCLUDED_LEVEL1_KEYWORDS):
	            continue
	        node['children'] = _drop_excluded_level1(node['children'])
	        kept.append(node)
	    return kept


	def build_header_hierarchy(doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin=70, bottom_margin=70):
	    """Build a nested header tree for *doc*.

	    Steps: extract header candidates, match the largest ones against the TOC
	    pages to find level-0 headers, derive deeper levels from formatting and
	    font-size clusters, nest the result by level, then prune childless level-0
	    roots and the excluded level-1 sections.

	    Args:
	        doc: document object offering ``len()``, ``load_page(n)`` and pages
	            with ``rect.height`` and ``get_text()``.
	        toc_pages: indices of the table-of-contents pages.
	        most_common_font_size, most_common_color, most_common_font: body-text
	            baseline forwarded to ``extract_headers``.
	        top_margin, bottom_margin: vertical bands ignored on every page.

	    Returns:
	        A list of root header dicts; each has (among others) ``text``,
	        ``page``, ``y``, ``size``, ``level`` and a ``children`` list.
	    """
	    headers_list, top_3_font_sizes, _smallest_font_size, _spans = extract_headers(
	        doc,
	        toc_pages=toc_pages,
	        most_common_font_size=most_common_font_size,
	        most_common_color=most_common_color,
	        most_common_font=most_common_font,
	        top_margin=top_margin,
	        bottom_margin=bottom_margin,
	    )

	    toc_entries = _collect_toc_entries(doc, toc_pages)
	    headers = _collect_candidate_headers(
	        doc, headers_list, most_common_font_size, top_margin, bottom_margin
	    )
	    _assign_consecutive_levels(headers)

	    # Only the largest recurring size is needed.  Reading it by index avoids
	    # the ValueError that a three-way unpack raised whenever fewer than three
	    # sizes qualified; with no qualifying size nothing is promoted to level 0.
	    max_size = top_3_font_sizes[0] if top_3_font_sizes else None
	    print(max_size)

	    toc_matches = _mark_level0_from_toc(headers, toc_entries, max_size)
	    unique_level0 = _dedupe_level0(toc_matches)

	    _assign_levels_from_first_child(headers)
	    _assign_levels_by_size_cluster(headers)

	    root = _build_tree(headers, unique_level0)
	    _enforce_nesting(root)
	    # A level-0 header without any children carries no structure; drop it.
	    root = [h for h in root if not (h['level'] == 0 and not h['children'])]
	    return _drop_excluded_level1(root)

	def adjust_levels_if_level0_not_in_toc(doc, toc_pages, root):
	    """Push the whole tree one level down when no level-0 root appears in the TOC.

	    The text of all *toc_pages* is concatenated and whitespace-normalized.
	    If at least one level-0 node of *root* occurs in that text the tree is
	    left alone; otherwise ``level`` of every node (recursively) is increased
	    by one.  The tree is modified in place and nothing is returned.

	    The helper that shifts the levels used to be defined but never called,
	    which made the function a no-op; it is now invoked.
	    """
	    def _squash(text):
	        return re.sub(r'\s+', ' ', text.strip().lower())

	    toc_text = "".join(doc.load_page(pno).get_text() for pno in toc_pages)
	    toc_text_normalized = _squash(toc_text)

	    def is_level0_in_toc_text(header):
	        return header['level'] == 0 and _squash(header['text']) in toc_text_normalized

	    if any(is_level0_in_toc_text(h) for h in root):
	        return  # No change needed

	    def increase_levels(node_list):
	        for node in node_list:
	            node['level'] += 1
	            increase_levels(node['children'])

	    increase_levels(root)

	def assign_numbers_to_headers(headers, prefix=None):
	    """Write an outline number ("1", "1.1", "1.2.3", ...) into each header's ``"number"`` key.

	    Siblings are numbered from 1 in list order; a child's number is its
	    parent's number, a dot, and its own position.  The dicts are modified in
	    place, recursing through ``"children"``.
	    """
	    for position, node in enumerate(headers, start=1):
	        if prefix:
	            label = f"{prefix}.{position}"
	        else:
	            label = str(position)
	        node["number"] = label
	        assign_numbers_to_headers(node["children"], label)

	def print_tree_with_numbers(headers, listofheaders, indent=0):
	    """Print the header tree depth-first and collect the printed lines.

	    Each line is indented by *indent* spaces and shows the header's number
	    ("?" when missing), text, level, 1-based page and, when available, its
	    original font size.  Lines are appended to *listofheaders*, which is
	    also returned.
	    """
	    padding = " " * indent
	    for node in headers:
	        if 'original_size' in node:
	            size_info = f"size:{node['original_size']:.1f}"
	        else:
	            size_info = ""
	        number = node.get('number', '?')
	        details = f"(Level {node['level']}, p:{node['page']+1}, {size_info})"
	        line = f"{padding}{number} {node['text']} {details}"
	        print(line)
	        listofheaders.append(line)
	        print_tree_with_numbers(node["children"], listofheaders, indent + 1)
	    return listofheaders

	def get_toc_page_numbers(doc, max_pages_to_check=15):
	    """Return the page indices that make up the table of contents.

	    The first *max_pages_to_check* pages are scanned; a page counts as a TOC
	    page when at least three of its lines contain a run of three or more
	    dots (dot leaders).  The result is every index from 0 up to and including
	    the last such page, or an empty list when no page qualifies.
	    """
	    dot_pattern = re.compile(r'\.{3,}')

	    last_toc_page = None
	    for page_num in range(min(len(doc), max_pages_to_check)):
	        blocks = doc.load_page(page_num).get_text("dict")["blocks"]
	        dot_line_count = sum(
	            1
	            for block in blocks
	            for line in block.get("lines", [])
	            if dot_pattern.search(get_spaced_text_from_spans(line["spans"]))
	        )
	        if dot_line_count >= 3:
	            last_toc_page = page_num

	    if last_toc_page is None:
	        return []
	    # Everything before the last detected TOC page is treated as TOC/front matter.
	    return list(range(last_toc_page + 1))


	def headersfrompdf(filePath):
	    """Download a PDF and return its numbered header outline as text.

	    Args:
	        filePath: URL of the PDF.  A ``dl=0`` query flag (Dropbox share
	            links) is rewritten to ``dl=1`` so the file itself is served.

	    Returns:
	        One line per header, joined with newlines, as produced by
	        ``print_tree_with_numbers``.

	    Raises:
	        ValueError: when the download fails or returns an empty body.
	    """
	    pdf_path = filePath
	    if pdf_path and ('http' in pdf_path or 'dropbox' in pdf_path):
	        pdf_path = pdf_path.replace('dl=0', 'dl=1')

	    # Without a timeout a stalled server would block this call forever.
	    response = requests.get(pdf_path, timeout=60)
	    # The previous check tested the BytesIO wrapper, which is always truthy;
	    # test the HTTP status and the payload itself instead.
	    if not response.ok or not response.content:
	        raise ValueError("No valid PDF content found.")

	    doc = fitz.open(stream=BytesIO(response.content), filetype="pdf")
	    try:
	        most_common_font_size, most_common_color, most_common_font = get_regular_font_size_and_color(doc)
	        toc_pages = get_toc_page_numbers(doc)
	        hierarchy = build_header_hierarchy(
	            doc, toc_pages, most_common_font_size, most_common_color, most_common_font
	        )
	    finally:
	        # Release the document even when header extraction fails.
	        doc.close()

	    assign_numbers_to_headers(hierarchy)
	    listofheaders = print_tree_with_numbers(hierarchy, listofheaders=[])
	    return "\n".join(listofheaders)