File size: 4,351 Bytes
79fcbeb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import fitz  
import re
import streamlit as st
from app.utils import debug_log, breaks


def extract_font_info(pdf_bytes, content_page_ranges, header_margin=70, footer_margin=100):
    """Collect the text spans of selected PDF pages, skipping header/footer bands.

    Args:
        pdf_bytes: The PDF content. May be raw bytes, a file-like object
            exposing ``read``/``seek`` (it is rewound and read in full), or a
            ``str`` (encoded with ``str.encode()`` before opening).
        content_page_ranges: Iterable of page numbers handed directly to
            ``doc.load_page``; presumably 0-based indices, since the reported
            ``"page"`` value is ``page_num + 1`` -- confirm against the caller.
        header_margin: Spans whose origin y is smaller than this are skipped.
        footer_margin: Spans whose origin y is greater than
            ``page height - footer_margin`` are skipped.

    Returns:
        A list of dicts with keys ``"text"`` (span text), ``"page"``
        (``page_num + 1``) and ``"coordinates"`` (the span origin as an
        ``(x, y)`` tuple), in the order the spans are reported by the page.
        An empty list is returned (after showing a Streamlit error) when the
        PDF cannot be opened.
    """
    try:
        # File-like input (e.g. BytesIO): rewind first so a previously
        # consumed stream is still read from the beginning.
        if hasattr(pdf_bytes, 'read'):
            pdf_bytes.seek(0)
            pdf_bytes = pdf_bytes.read()

        # Ensure it's bytes, not string
        if isinstance(pdf_bytes, str):
            pdf_bytes = pdf_bytes.encode()

        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        debug_log(f"PDF opened successfully. Pages: {len(doc)}")

    except Exception as e:
        # UI boundary: report the problem to the user instead of crashing.
        st.error(f"Error opening PDF: {e}")
        return []

    font_data = []

    try:
        for page_num in content_page_ranges:
            page = doc.load_page(page_num)
            page_height = page.rect.height
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                # Blocks without a "lines" entry carry no text spans.
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        x, y = span["origin"][0], span["origin"][1]
                        # Skip headers/footers using the configured margins
                        if y < header_margin or y > (page_height - footer_margin):
                            continue
                        # Only text and position are collected; the span's
                        # "font", "size" and "color" entries are deliberately
                        # not recorded.
                        font_data.append({
                            "text": span["text"],
                            "page": page_num + 1,
                            "coordinates": (x, y)
                        })
    finally:
        # Always release the document, even if a page fails to load or parse.
        doc.close()

    return font_data


def extract_lines_from_font_info(font_info):
    """Merge consecutive text spans into lines based on page and y-coordinate.

    Consecutive elements are treated as one line when they share the same
    y-coordinate (``element['coordinates'][1]``) and the same ``'page'`` value.
    Elements without a ``'page'`` key are grouped by y-coordinate alone.
    Comparing the page as well prevents the last span of one page from being
    glued to the first span of the next page when both sit at the same y.

    Args:
        font_info: Sequence of dicts with at least ``'text'`` and
            ``'coordinates'`` (an ``(x, y)`` pair); ``'page'`` is optional.
            ``None`` or an empty sequence yields an empty list.

    Returns:
        A list of line strings. The fragments of a line are joined with a
        single space and the result is stripped; lines that are empty after
        stripping are omitted.
    """
    if not font_info:
        return []

    groups = []  # one list of text fragments per detected line
    prev_key = None

    for element in font_info:
        key = (element.get('page'), element['coordinates'][1])
        # Start a new line whenever the page or the y-coordinate changes.
        if not groups or key != prev_key:
            groups.append([])
        groups[-1].append(element['text'])
        prev_key = key

    joined = (" ".join(fragments).strip() for fragments in groups)
    return [line for line in joined if line]


class TextCleaner:
    """Regex-driven cleaner for lists of extracted text lines.

    ``filter_lines`` drops whole lines that match one of the rejection
    patterns, ``filter_noise`` strips noise sequences inside the remaining
    lines, and ``process`` runs both steps in that order.
    """

    def __init__(self):
        """Compile the patterns and expose them as ``self.patterns``."""
        self.patterns = {
            # patterns to filter out unwanted lines
            'numbered_lines': re.compile(r'^\d+\.\d+\b'),
            'symbol_only': re.compile(r'^[\W_]+$'),
            'copyright_pattern': re.compile(r'(©|ⓒ|\(c\)|\(C\)|c\s*⃝)', re.IGNORECASE),
            'exercises_pattern': re.compile(r'^\s*Exercises?\b[\s\d.:!?-]*$', re.IGNORECASE),
            # noise patterns
            'dotted_noise': re.compile(r'(?<!\w)([.\s]){3,}(?!\w)'),
            'symbol_noise': re.compile(r'(?<!\w)([\W]\s?){3,}(?!\w)'),
        }

    def filter_lines(self, lines):
        """Return the lines that match none of the rejection patterns.

        Matching is done on the stripped line, but surviving lines are
        returned unmodified and in their original order.
        """
        rules = self.patterns
        survivors = []
        for raw in lines:
            text = raw.strip()
            if rules['numbered_lines'].match(text):
                continue
            if rules['symbol_only'].match(text):
                continue
            # The copyright marker may appear anywhere in the line.
            if rules['copyright_pattern'].search(text):
                continue
            if rules['exercises_pattern'].match(text):
                continue
            survivors.append(raw)
        return survivors

    def filter_noise(self, lines):
        """Delete dotted and symbol noise sequences from each line.

        The dotted pattern is applied first, then the symbol pattern, and the
        result is stripped. One output entry is produced per input line.
        """
        dotted = self.patterns['dotted_noise']
        symbols = self.patterns['symbol_noise']
        return [
            symbols.sub('', dotted.sub('', entry)).strip()
            for entry in lines
        ]

    def process(self, lines):
        """Run the full pipeline: line filtering followed by noise removal."""
        return self.filter_noise(self.filter_lines(lines))