Spaces:

davidepanza
/

test2text

Sleeping

App Files Files Community

davidepanza commited on Jun 19, 2025

Commit

79fcbeb

verified ·

1 Parent(s): 468a9aa

Upload toc_cleaning.py

Browse files

Files changed (1) hide show

app/backend/toc_cleaning.py +118 -0

app/backend/toc_cleaning.py ADDED Viewed

	@@ -0,0 +1,118 @@

+import fitz
+import re
+import streamlit as st
+from app.utils import debug_log, breaks
+def extract_font_info(pdf_bytes, content_page_ranges, header_margin=70, footer_margin=100):
+    try:
+        # If pdf_bytes is a BytesIO object
+        if hasattr(pdf_bytes, 'read'):
+            pdf_bytes.seek(0)  # Reset pointer
+            pdf_bytes = pdf_bytes.read()
+        # Ensure it's bytes, not string
+        if isinstance(pdf_bytes, str):
+            pdf_bytes = pdf_bytes.encode()
+        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        debug_log(f"PDF opened successfully. Pages: {len(doc)}")
+    except Exception as e:
+        st.error(f"Error opening PDF: {e}")
+        return []
+    font_data = []
+    for page_num in content_page_ranges:
+        page = doc.load_page(page_num)
+        page_height = page.rect.height
+        blocks = page.get_text("dict")["blocks"]
+        for block in blocks:
+            if "lines" in block:
+                for line in block["lines"]:
+                    for span in line["spans"]:
+                        y = span["origin"][1]
+                        # Skip headers/footers using defaults
+                        if y < header_margin or y > (page_height - footer_margin):
+                            continue
+                        font_data.append({
+                            "text": span["text"],
+                            # "font_name": span["font"],
+                            # "font_size": span["size"],
+                            # "color": span["color"],  # RGB tuple (e.g., (0, 0, 0) for black)
+                            # "is_bold": "bold" in span["font"].lower(),
+                            # "is_italic": "italic" in span["font"].lower(),
+                            "page": page_num + 1,
+                            "coordinates": (span["origin"][0], span["origin"][1])
+                        })
+    return font_data
+def extract_lines_from_font_info(font_info):
+    """
+    Extracts lines of text from font information based on y-coordinates.
+    This function assumes that text elements with the same y-coordinate belong to the same line.
+    """
+    if not font_info:
+        return []
+    lines = []
+    prev_y = None
+    cur_line = ""
+    for element in font_info:
+        cur_y = element['coordinates'][1]
+        if prev_y is None or cur_y == prev_y:
+            cur_line += " " + element['text']
+        else:
+            if cur_line.strip():
+                lines.append(cur_line.strip())
+            cur_line = element['text']
+        prev_y = cur_y
+    # Don't forget the last line
+    if cur_line.strip():
+        lines.append(cur_line.strip())
+    return lines
+class TextCleaner:
+    def __init__(self):
+        self.patterns = {
+            # patterns to filter out unwanted lines
+            'numbered_lines': re.compile(r'^\d+\.\d+\b'),
+            'symbol_only': re.compile(r'^[\W_]+$'),
+            'copyright_pattern': re.compile(r'(©|ⓒ|\(c\)|\(C\)|c\s*⃝)', re.IGNORECASE),
+            'exercises_pattern': re.compile(r'^\s*Exercises?\b[\s\d.:!?-]*$', re.IGNORECASE),
+            # noise patterns
+            'dotted_noise': re.compile(r'(?<!\w)([.\s]){3,}(?!\w)'),
+            'symbol_noise': re.compile(r'(?<!\w)([\W]\s?){3,}(?!\w)')
+            }
+    def filter_lines(self, lines):
+        """Remove unwanted lines while keeping the structure"""
+        return [
+            line for line in lines
+            if not (self.patterns['numbered_lines'].match(line.strip()) or
+                   self.patterns['symbol_only'].match(line.strip()) or
+                   self.patterns['copyright_pattern'].search(line.strip()) or
+                   self.patterns['exercises_pattern'].match(line.strip()))
+        ]
+    def filter_noise(self, lines):
+        """Remove noise patterns from lines"""
+        cleaned = []
+        for line in lines:
+            # Remove standalone noise sequences (not between words)
+            line = self.patterns['dotted_noise'].sub('', line)
+            line = self.patterns['symbol_noise'].sub('', line)
+            cleaned.append(line.strip())
+        return cleaned
+    def process(self, lines):
+        """Complete processing pipeline"""
+        filtered = self.filter_lines(lines)
+        cleaned = self.filter_noise(filtered)
+        return cleaned