import fitz import re import streamlit as st from app.utils import debug_log, breaks def extract_font_info(pdf_bytes, content_page_ranges, header_margin=70, footer_margin=100): try: # If pdf_bytes is a BytesIO object if hasattr(pdf_bytes, 'read'): pdf_bytes.seek(0) # Reset pointer pdf_bytes = pdf_bytes.read() # Ensure it's bytes, not string if isinstance(pdf_bytes, str): pdf_bytes = pdf_bytes.encode() doc = fitz.open(stream=pdf_bytes, filetype="pdf") debug_log(f"PDF opened successfully. Pages: {len(doc)}") except Exception as e: st.error(f"Error opening PDF: {e}") return [] font_data = [] for page_num in content_page_ranges: page = doc.load_page(page_num) page_height = page.rect.height blocks = page.get_text("dict")["blocks"] for block in blocks: if "lines" in block: for line in block["lines"]: for span in line["spans"]: y = span["origin"][1] # Skip headers/footers using defaults if y < header_margin or y > (page_height - footer_margin): continue font_data.append({ "text": span["text"], # "font_name": span["font"], # "font_size": span["size"], # "color": span["color"], # RGB tuple (e.g., (0, 0, 0) for black) # "is_bold": "bold" in span["font"].lower(), # "is_italic": "italic" in span["font"].lower(), "page": page_num + 1, "coordinates": (span["origin"][0], span["origin"][1]) }) return font_data def extract_lines_from_font_info(font_info): """ Extracts lines of text from font information based on y-coordinates. This function assumes that text elements with the same y-coordinate belong to the same line. """ if not font_info: return [] lines = [] prev_y = None cur_line = "" for element in font_info: cur_y = element['coordinates'][1] if prev_y is None or cur_y == prev_y: cur_line += " " + element['text'] else: if cur_line.strip(): lines.append(cur_line.strip()) cur_line = element['text'] prev_y = cur_y # Don't forget the last line if cur_line.strip(): lines.append(cur_line.strip()) return lines class TextCleaner: def __init__(self): self.patterns = { # patterns to filter out unwanted lines 'numbered_lines': re.compile(r'^\d+\.\d+\b'), 'symbol_only': re.compile(r'^[\W_]+$'), 'copyright_pattern': re.compile(r'(©|ⓒ|\(c\)|\(C\)|c\s*⃝)', re.IGNORECASE), 'exercises_pattern': re.compile(r'^\s*Exercises?\b[\s\d.:!?-]*$', re.IGNORECASE), # noise patterns 'dotted_noise': re.compile(r'(?