LisaMegaWatts committed on
Commit
c7180df
·
verified ·
1 Parent(s): d184fb7

Upload chunker.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. chunker.py +107 -0
chunker.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Text chunker - splits cleaned text into training-sized chunks."""
2
+
3
+ import logging
4
+ import re
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
class TextChunker:
    """Splits text into chunks suitable for character-level GPT training.

    Each chunk becomes one line in the training file. Chunks are split
    at sentence boundaries when possible, otherwise at word boundaries,
    and as a last resort by a hard cut at ``max_chars``. Pieces shorter
    than ``min_chars`` are discarded rather than emitted.
    """

    # Sentence-ending punctuation, an optional closing quote, then whitespace.
    SENTENCE_END = re.compile(r"[.!?]['\"]?\s+")

    def __init__(self, config: dict):
        """Read chunking limits from ``config``.

        Args:
            config: Mapping with optional keys ``max_chars`` (default 256),
                ``min_chars`` (default 40) and ``break_on_sentence``
                (default True).

        Raises:
            ValueError: If ``max_chars`` is smaller than 1 or ``min_chars``
                is negative.
        """
        max_chars = config.get("max_chars", 256)
        min_chars = config.get("min_chars", 40)

        # A non-positive limit can never make progress when cutting, which
        # would turn the splitting loop into an infinite loop.
        if max_chars < 1:
            raise ValueError(f"max_chars must be >= 1, got {max_chars!r}")
        # A negative minimum would let str.rfind's -1 "not found" result
        # pass the "> min_chars" test and be used as a cut position.
        if min_chars < 0:
            raise ValueError(f"min_chars must be >= 0, got {min_chars!r}")

        self.max_chars = max_chars
        self.min_chars = min_chars
        self.break_on_sentence = config.get("break_on_sentence", True)

    def chunk(self, text: str) -> list[str]:
        """Split text into chunks of at most max_chars characters.

        Every input line is treated as its own paragraph. Paragraphs that
        fit within ``max_chars`` are kept whole; longer ones are split.
        Anything shorter than ``min_chars`` is dropped.

        Args:
            text: Cleaned text to chunk.

        Returns:
            List of text chunks, each a single line with no newlines.
        """
        if not text.strip():
            return []

        chunks: list[str] = []
        # splitlines() breaks on every line-boundary character ("\n", "\r",
        # "\r\n", form feed, ...), so no chunk can contain a stray line
        # break that would split it in the one-chunk-per-line output file.
        for line in text.splitlines():
            para = line.strip()
            if not para:
                continue

            if len(para) > self.max_chars:
                chunks.extend(self._split_long_text(para))
            elif len(para) >= self.min_chars:
                chunks.append(para)

        logger.info(
            "Chunked text into %d chunks (max %d chars)",
            len(chunks),
            self.max_chars,
        )
        return chunks

    def _split_long_text(self, text: str) -> list[str]:
        """Split text longer than max_chars into sentence-aware chunks.

        Repeatedly cuts a piece off the front of ``text`` at the position
        chosen by ``_find_break_point``. Pieces (including the final
        remainder) shorter than ``min_chars`` are dropped.

        Raises:
            ValueError: If no forward progress is possible because
                ``max_chars`` is not positive.
        """
        chunks: list[str] = []
        remaining = text.strip()

        while remaining:
            if len(remaining) <= self.max_chars:
                if len(remaining) >= self.min_chars:
                    chunks.append(remaining)
                break

            cut = self._find_break_point(remaining)
            if cut < 1:
                # Only reachable if the limits were changed to invalid values
                # after construction; fail loudly instead of looping forever.
                raise ValueError(
                    f"cannot split text with max_chars={self.max_chars!r}"
                )

            piece = remaining[:cut].strip()
            remaining = remaining[cut:].strip()

            if len(piece) >= self.min_chars:
                chunks.append(piece)

        return chunks

    def _find_break_point(self, text: str) -> int:
        """Find the best position to break text at, within max_chars.

        Priority: sentence boundary > word boundary > hard cut. A sentence
        or word boundary is only used when it lies beyond ``min_chars``.

        Returns:
            Index at which to cut ``text``; never greater than ``max_chars``.
        """
        limit = self.max_chars
        # Look one character past the limit: whitespace at index ``limit``
        # is what proves that a sentence or word ends exactly at the limit.
        # That extra character is never part of the returned cut.
        window = text[: limit + 1]

        if self.break_on_sentence:
            last_sentence_end = -1
            for match in self.SENTENCE_END.finditer(window):
                last_sentence_end = min(match.end(), limit)

            if last_sentence_end > self.min_chars:
                return last_sentence_end

        # Fall back to the last space at or before the limit.
        last_space = window.rfind(" ")
        if last_space > self.min_chars:
            return last_space

        # Hard cut at max_chars (shouldn't happen often with natural text)
        return limit