fromozu committed on
Commit
75e2413
·
verified ·
1 Parent(s): 4cdcd43

Upload bilingual_book_maker/book_maker/backmatter.py with huggingface_hub

Browse files
bilingual_book_maker/book_maker/backmatter.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from collections.abc import Iterable
5
+
6
+
7
# Built-in heading keywords used when the caller supplies no usable keywords
# of their own. Every entry is already lower-case with single spaces and no
# punctuation, i.e. in the same shape that heading normalisation produces, so
# entries can be compared against normalised titles directly.
DEFAULT_BACKMATTER_TITLES: tuple[str, ...] = (
    # Author biography pages.
    "about the author",
    "notes about the author",
    "notes on the author",
    "biography",
    "biographical note",
    # Credits and legal pages (both spellings of "acknowledgements").
    "credits",
    "acknowledgments",
    "acknowledgements",
    "copyright",
    # Reference apparatus.
    "bibliography",
    "index",
    "endnotes",
    "notes",
    "further reading",
    # Publisher / promotional pages.
    "also by the author",
    "about this book",
    "about this ebook",
)
26
+
27
# Matches each run of characters that are neither letters nor digits.
# ``\W`` is Unicode-aware for ``str`` patterns, so letters and digits of any
# script survive; ``_`` is listed explicitly because ``\w`` treats it as a
# word character. For pure-ASCII lower-case input this matches exactly what
# the narrower ``[^a-z0-9]+`` pattern matched.
NON_ALNUM_RE = re.compile(r"[\W_]+")


def normalize_heading(text: str | None) -> str:
    """Return *text* reduced to a canonical form for heading comparison.

    The heading is case-folded, every run of punctuation, symbols,
    underscores or whitespace is collapsed into a single space, and leading
    and trailing separators are removed. Letters and digits of any script
    are kept, so non-English headings and keywords remain comparable instead
    of collapsing to an empty string.

    Args:
        text: Raw heading text; ``None`` is treated like an empty string.

    Returns:
        The normalised heading, or ``""`` when nothing but separators (or
        nothing at all) was supplied.
    """
    if not text:
        return ""
    # casefold() is the caseless-matching form of lower(); the two agree for
    # ASCII input.
    folded = text.casefold()
    # split()/join() both trims the ends and squeezes any repeated spaces
    # left behind by the substitution.
    return " ".join(NON_ALNUM_RE.sub(" ", folded).split())
36
+
37
+
38
def split_backmatter_titles(value: str | Iterable[str] | None) -> tuple[str, ...]:
    """Turn a keyword specification into a tuple of normalised headings.

    Args:
        value: ``None``, a single comma-separated string, or an iterable of
            individual keyword strings.

    Returns:
        Each keyword passed through ``normalize_heading``, with entries that
        normalise to an empty string removed. ``DEFAULT_BACKMATTER_TITLES``
        is returned when *value* is ``None`` or when no keyword survives
        normalisation.
    """
    if value is None:
        return DEFAULT_BACKMATTER_TITLES

    # A plain string is a comma-separated list; anything else is iterated
    # item by item.
    raw_items = value.split(",") if isinstance(value, str) else value

    # filter(None, ...) discards keywords that normalised to "".
    cleaned = tuple(filter(None, map(normalize_heading, raw_items)))
    return cleaned if cleaned else DEFAULT_BACKMATTER_TITLES
49
+
50
+
51
def matches_backmatter_title(
    title: str | None,
    keywords: str | Iterable[str] | None = None,
) -> bool:
    """Report whether *title* equals or begins with a back-matter keyword.

    Both the title and the keywords are normalised before comparison. A
    keyword matches when the normalised title is exactly that keyword, or
    starts with the keyword followed by a space (so the keyword must end on
    a word boundary rather than in the middle of a longer word).

    Args:
        title: Heading text to test; ``None`` or a heading that normalises
            to an empty string never matches.
        keywords: Keyword specification forwarded to
            ``split_backmatter_titles``.

    Returns:
        ``True`` if any keyword matches, otherwise ``False``.
    """
    heading = normalize_heading(title)
    if heading == "":
        return False

    for candidate in split_backmatter_titles(keywords):
        if heading == candidate or heading.startswith(candidate + " "):
            return True
    return False
62
+
63
+
64
def should_skip_backmatter(
    *,
    title: str | None,
    current_percent: float,
    skip_after_percent: int | float,
    keywords: str | Iterable[str] | None = None,
) -> bool:
    """Decide whether a section qualifies as skippable back matter.

    All arguments are keyword-only.

    Args:
        title: Heading of the section under consideration.
        current_percent: Position of the section, compared directly against
            *skip_after_percent*.
        skip_after_percent: Threshold; a section positioned at or before it
            is never skipped.
        keywords: Keyword specification forwarded to
            ``matches_backmatter_title``.

    Returns:
        ``False`` when ``current_percent <= skip_after_percent``; otherwise
        the result of ``matches_backmatter_title(title, keywords)``.
    """
    # Written as a negated "<=" (not as ">") so the comparison behaves the
    # same for every operand, including values that compare false both ways.
    beyond_threshold = not (current_percent <= skip_after_percent)
    return beyond_threshold and matches_backmatter_title(title, keywords)