Spaces:

davidepanza
/

test2text

Sleeping

App Files Files Community

test2text / app /backend /toc_cleaning.py

davidepanza

Upload toc_cleaning.py

79fcbeb verified 7 months ago

raw

history blame contribute delete

4.35 kB

	import fitz
	import re
	import streamlit as st
	from app.utils import debug_log, breaks


	def extract_font_info(pdf_bytes, content_page_ranges, header_margin=70, footer_margin=100):
	try:
	# If pdf_bytes is a BytesIO object
	if hasattr(pdf_bytes, 'read'):
	pdf_bytes.seek(0) # Reset pointer
	pdf_bytes = pdf_bytes.read()

	# Ensure it's bytes, not string
	if isinstance(pdf_bytes, str):
	pdf_bytes = pdf_bytes.encode()

	doc = fitz.open(stream=pdf_bytes, filetype="pdf")
	debug_log(f"PDF opened successfully. Pages: {len(doc)}")

	except Exception as e:
	st.error(f"Error opening PDF: {e}")
	return []

	font_data = []

	for page_num in content_page_ranges:
	page = doc.load_page(page_num)
	page_height = page.rect.height
	blocks = page.get_text("dict")["blocks"]

	for block in blocks:
	if "lines" in block:
	for line in block["lines"]:
	for span in line["spans"]:
	y = span["origin"][1]
	# Skip headers/footers using defaults
	if y < header_margin or y > (page_height - footer_margin):
	continue
	font_data.append({
	"text": span["text"],
	# "font_name": span["font"],
	# "font_size": span["size"],
	# "color": span["color"], # RGB tuple (e.g., (0, 0, 0) for black)
	# "is_bold": "bold" in span["font"].lower(),
	# "is_italic": "italic" in span["font"].lower(),
	"page": page_num + 1,
	"coordinates": (span["origin"][0], span["origin"][1])
	})
	return font_data


	def extract_lines_from_font_info(font_info):
	"""
	Extracts lines of text from font information based on y-coordinates.
	This function assumes that text elements with the same y-coordinate belong to the same line.
	"""
	if not font_info:
	return []
	lines = []
	prev_y = None
	cur_line = ""

	for element in font_info:
	cur_y = element['coordinates'][1]
	if prev_y is None or cur_y == prev_y:
	cur_line += " " + element['text']
	else:
	if cur_line.strip():
	lines.append(cur_line.strip())
	cur_line = element['text']
	prev_y = cur_y

	# Don't forget the last line
	if cur_line.strip():
	lines.append(cur_line.strip())

	return lines


	class TextCleaner:
	def __init__(self):
	self.patterns = {
	# patterns to filter out unwanted lines
	'numbered_lines': re.compile(r'^\d+\.\d+\b'),
	'symbol_only': re.compile(r'^[\W_]+$'),
	'copyright_pattern': re.compile(r'(©\|ⓒ\|$c$\|$C$\|c\s*⃝)', re.IGNORECASE),
	'exercises_pattern': re.compile(r'^\sExercises?\b[\s\d.:!?-]$', re.IGNORECASE),
	# noise patterns
	'dotted_noise': re.compile(r'(?<!\w)([.\s]){3,}(?!\w)'),
	'symbol_noise': re.compile(r'(?<!\w)([\W]\s?){3,}(?!\w)')
	}

	def filter_lines(self, lines):
	"""Remove unwanted lines while keeping the structure"""
	return [
	line for line in lines
	if not (self.patterns['numbered_lines'].match(line.strip()) or
	self.patterns['symbol_only'].match(line.strip()) or
	self.patterns['copyright_pattern'].search(line.strip()) or
	self.patterns['exercises_pattern'].match(line.strip()))
	]

	def filter_noise(self, lines):
	"""Remove noise patterns from lines"""
	cleaned = []
	for line in lines:
	# Remove standalone noise sequences (not between words)
	line = self.patterns['dotted_noise'].sub('', line)
	line = self.patterns['symbol_noise'].sub('', line)
	cleaned.append(line.strip())
	return cleaned

	def process(self, lines):
	"""Complete processing pipeline"""
	filtered = self.filter_lines(lines)
	cleaned = self.filter_noise(filtered)
	return cleaned