davidepanza committed on
Commit
79fcbeb
·
verified ·
1 Parent(s): 468a9aa

Upload toc_cleaning.py

Browse files
Files changed (1) hide show
  1. app/backend/toc_cleaning.py +118 -0
app/backend/toc_cleaning.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz
2
+ import re
3
+ import streamlit as st
4
+ from app.utils import debug_log, breaks
5
+
6
+
7
def extract_font_info(pdf_bytes, content_page_ranges, header_margin=70, footer_margin=100):
    """Collect the text spans found on the selected pages of a PDF.

    Args:
        pdf_bytes: The PDF content as ``bytes``, as a ``str`` (which is
            encoded with ``str.encode()``), or as a file-like object exposing
            ``seek`` and ``read``.
        content_page_ranges: Iterable of zero-based page indices to scan.
        header_margin: Spans whose origin y-coordinate is smaller than this
            value are skipped (treated as page header).
        footer_margin: Spans whose origin y-coordinate is greater than
            ``page height - footer_margin`` are skipped (treated as footer).

    Returns:
        A list of dicts, one per kept span, with the keys ``"text"``,
        ``"page"`` (one-based page number) and ``"coordinates"`` (the
        ``(x, y)`` origin of the span). An empty list is returned, after
        reporting the problem through ``st.error``, when the PDF cannot be
        opened.
    """
    try:
        # File-like input (e.g. BytesIO): rewind and read the raw content.
        if hasattr(pdf_bytes, 'read'):
            pdf_bytes.seek(0)
            pdf_bytes = pdf_bytes.read()

        # fitz expects a bytes stream, not text.
        if isinstance(pdf_bytes, str):
            pdf_bytes = pdf_bytes.encode()

        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        debug_log(f"PDF opened successfully. Pages: {len(doc)}")

    except Exception as e:
        st.error(f"Error opening PDF: {e}")
        return []

    font_data = []

    try:
        for page_num in content_page_ranges:
            page = doc.load_page(page_num)
            footer_start = page.rect.height - footer_margin

            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry carry no text spans.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        x, y = span["origin"][0], span["origin"][1]
                        # Skip spans located in the header or footer band.
                        if y < header_margin or y > footer_start:
                            continue
                        # Font name, size and colour are also present on the
                        # span but are intentionally not collected here.
                        font_data.append({
                            "text": span["text"],
                            "page": page_num + 1,
                            "coordinates": (x, y)
                        })
    finally:
        # Always release the document, even if a page fails to load.
        doc.close()

    return font_data
51
+
52
+
53
def extract_lines_from_font_info(font_info, y_tolerance=0.0):
    """Group consecutive text spans into lines of text.

    Consecutive elements are joined (separated by a single space) into one
    line while they stay on the same page and their y-coordinates differ by
    at most ``y_tolerance``. A change of page or of y-coordinate starts a new
    line, so spans at the same height on two different pages are never merged.

    Args:
        font_info: Sequence of dicts with a ``'text'`` string and a
            ``'coordinates'`` pair ``(x, y)``. An optional ``'page'`` key is
            used to separate pages; elements without it are all treated as
            belonging to the same page.
        y_tolerance: Maximum absolute difference between the y-coordinates of
            two consecutive spans for them to count as the same line. The
            default ``0.0`` requires exactly equal y-coordinates.

    Returns:
        A list of stripped, non-empty line strings in input order.
    """
    if not font_info:
        return []

    lines = []
    parts = []          # text fragments of the line being assembled
    prev_y = None
    prev_page = None

    def flush():
        # Emit the pending line unless it is empty or whitespace-only.
        text = " ".join(parts).strip()
        if text:
            lines.append(text)
        parts.clear()

    for element in font_info:
        cur_y = element['coordinates'][1]
        cur_page = element.get('page')

        if prev_y is not None:
            same_line = (
                cur_page == prev_page
                and abs(cur_y - prev_y) <= y_tolerance
            )
            if not same_line:
                flush()

        parts.append(element['text'])
        prev_y = cur_y
        prev_page = cur_page

    # Don't forget the last line.
    flush()

    return lines
79
+
80
+
81
class TextCleaner:
    """Filter unwanted lines and strip filler sequences from text lines.

    ``self.patterns`` maps a pattern name to its compiled regular expression.
    The first four entries decide whether a whole line is discarded; the last
    two describe noise that is cut out of the lines that are kept.
    """

    def __init__(self):
        # Patterns that cause a whole line to be dropped.
        line_filters = {
            'numbered_lines': re.compile(r'^\d+\.\d+\b'),
            'symbol_only': re.compile(r'^[\W_]+$'),
            'copyright_pattern': re.compile(r'(©|ⓒ|\(c\)|\(C\)|c\s*⃝)', re.IGNORECASE),
            'exercises_pattern': re.compile(r'^\s*Exercises?\b[\s\d.:!?-]*$', re.IGNORECASE),
        }
        # Patterns for noise removed from inside a line.
        noise_filters = {
            'dotted_noise': re.compile(r'(?<!\w)([.\s]){3,}(?!\w)'),
            'symbol_noise': re.compile(r'(?<!\w)([\W]\s?){3,}(?!\w)'),
        }
        self.patterns = {**line_filters, **noise_filters}

    def filter_lines(self, lines):
        """Return the lines that match none of the line-dropping patterns.

        Matching is done on the stripped text, but surviving lines are
        returned exactly as they were passed in.
        """
        kept = []
        for original in lines:
            text = original.strip()
            if self.patterns['numbered_lines'].match(text):
                continue
            if self.patterns['symbol_only'].match(text):
                continue
            if self.patterns['copyright_pattern'].search(text):
                continue
            if self.patterns['exercises_pattern'].match(text):
                continue
            kept.append(original)
        return kept

    def filter_noise(self, lines):
        """Return every line with noise sequences removed and ends stripped."""

        def scrub(text):
            # Dotted runs are removed first, then the remaining symbol runs.
            without_dots = self.patterns['dotted_noise'].sub('', text)
            without_symbols = self.patterns['symbol_noise'].sub('', without_dots)
            return without_symbols.strip()

        return [scrub(entry) for entry in lines]

    def process(self, lines):
        """Run the full pipeline: drop unwanted lines, then remove noise."""
        return self.filter_noise(self.filter_lines(lines))