OliverPerrin committed on
Commit
8573220
·
1 Parent(s): e3422d2

Add English language filter to all data downloads

Browse files

- Filter BookSum, arXiv, and Gutenberg for English only
- Common English word ratio check (min 8%)
- Non-English pattern detection (French, German, Spanish, Italian, Latin)
- Skip non-English content at download time, not discovery time
- Reports skipped count for each source

scripts/build_discovery_dataset.py CHANGED
@@ -45,37 +45,85 @@ GARBAGE_PATTERNS = [
45
  r"transcriber", # Transcriber notes
46
  r"eBook", # eBook references
47
  r"©|copyright", # Copyright notices
 
 
 
 
 
48
  ]
49
 
50
- # Non-English indicators
51
  NON_ENGLISH_PATTERNS = [
52
- r"\b(le|la|les|un|une|des|du|de la|au|aux)\b", # French articles
53
- r"\b(der|die|das|ein|eine|und|ist|nicht)\b", # German
54
- r"\b(el|la|los|las|un|una|que|por|para)\b", # Spanish
55
- r"\b(il|lo|la|gli|le|un|una|che|per|con)\b", # Italian
56
- r"[àâäéèêëïîôùûüÿœæ]{3,}", # Multiple French accents
 
 
 
57
  ]
58
 
 
 
 
 
 
 
 
 
 
 
 
59
  def is_english(text: str) -> bool:
60
  """Check if text appears to be English."""
61
  text_lower = text.lower()
62
 
63
- # Check for non-English patterns
64
  for pattern in NON_ENGLISH_PATTERNS:
65
  matches = len(re.findall(pattern, text_lower, re.IGNORECASE))
66
- if matches > 5: # Too many non-English words
67
  return False
68
 
69
  # Check English word ratio
70
- english_words = ["the", "and", "of", "to", "a", "in", "that", "is", "was", "he", "she", "it", "for", "with", "as", "his", "her", "they", "be", "at", "on", "have", "had", "this", "but", "not", "from", "by", "or", "an"]
71
  words = text_lower.split()
72
- if len(words) < 20:
73
  return False
74
 
75
  english_count = sum(1 for w in words if w in english_words)
76
  ratio = english_count / len(words)
77
 
78
- return ratio > 0.05 # At least 5% common English words
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
 
81
  def is_quality_text(text: str) -> bool:
@@ -86,17 +134,21 @@ def is_quality_text(text: str) -> bool:
86
  return False
87
 
88
  # Must have reasonable length
89
- if len(text) < 200:
90
  return False
91
 
92
  # Must have sentences (not just fragments)
93
  sentences = re.split(r'[.!?]+', text)
94
- if len(sentences) < 3:
95
  return False
96
 
97
  # Check for too many special characters
98
  special_ratio = len(re.findall(r'[^\w\s.,!?\'"()-]', text)) / len(text)
99
- if special_ratio > 0.1:
 
 
 
 
100
  return False
101
 
102
  return True
 
45
  r"transcriber", # Transcriber notes
46
  r"eBook", # eBook references
47
  r"©|copyright", # Copyright notices
48
+ r"^INDEX", # Index pages
49
+ r"^\d+\.\s+\w+,\s+\d+", # Index entries like "1. Name, 234"
50
+ r"(syn\.|var\.|sp\.)", # Botanical abbreviations
51
+ r"[A-Z][a-z]+aceae", # Botanical family names
52
+ r"\(\s*syn\s+", # Synonym references
53
  ]
54
 
55
# Non-English indicators (expanded).
# One regex per language (or orthographic cue). is_english() counts the
# matches of each pattern separately against the lower-cased text.
NON_ENGLISH_PATTERNS = [
    r"\b(le|la|les|un|une|des|du|de la|au|aux|et|est|sont|dans|pour|avec|sur|qui|que)\b",  # French
    r"\b(der|die|das|ein|eine|und|ist|nicht|mit|von|zu|den|dem|auf|für|als|auch|oder|nach|bei|nur|noch|wie|mehr|aber|wenn|so|hat|kann|ich|sie|er|wir|ihr|es|sich|sein)\b",  # German (expanded)
    r"\b(el|la|los|las|un|una|que|por|para|con|del|al|es|en|se|no|más|como|pero|su|sus)\b",  # Spanish
    r"\b(il|lo|la|gli|le|un|una|che|per|con|del|della|di|da|non|sono|è|anche|più|ma|se)\b",  # Italian
    r"[àâäéèêëïîôùûüÿœæäöüß]{2,}",  # Accented chars (German ß, umlauts)
    r"\b[A-Z][a-z]+ü[a-z]+\b",  # German words with ü
    r"\b[A-Z][a-z]+ö[a-z]+\b",  # German words with ö
    r"\b[A-Z][a-z]+ä[a-z]+\b",  # German words with ä
]

# Patterns that indicate index/glossary/list content (not narrative).
# Each pattern is applied to a single line with re.search().
INDEX_PATTERNS = [
    r"^\s*\d+\s*$",  # Just numbers
    r"^[A-Z][a-z]+,\s+\d+",  # "Word, 123" index entries
    r"(\d+,\s*)+\d+",  # Lists of page numbers
    r"^[A-Z]{2,}\s+",  # ALL CAPS words at start
    r"^\s*[-•]\s+",  # Bullet points
    # Page references ("p. 12", "pp. 40"). The leading \b keeps the pattern
    # from firing on a sentence that merely ends in "p" ("the ship. 40 men").
    r"\bpp?\.\s*\d+",
]

# Common English function words used for the English-ratio check.
_COMMON_ENGLISH_WORDS = frozenset({
    "the", "and", "of", "to", "a", "in", "that", "is", "was", "he", "she",
    "it", "for", "with", "as", "his", "her", "they", "be", "at", "on", "have",
    "had", "this", "but", "not", "from", "by", "or", "an", "said", "were",
    "been", "would", "could", "which", "their", "there", "what", "when",
    "who", "will", "more", "if", "no", "out", "so", "up", "into", "than",
    "them", "can", "only", "other", "new", "some", "very", "just", "over",
    "such", "also", "its", "then", "two", "first", "any", "these", "may",
    "after", "most", "made", "before", "should", "now", "where", "those",
    "being", "has", "between", "own", "under",
})

# Words that appear in NON_ENGLISH_PATTERNS but are also ordinary English
# words ("die", "hat", "den", "per", "non-", ...). A match on one of these is
# not evidence of a foreign language.
_ENGLISH_HOMOGRAPHS = frozenset({
    "die", "den", "hat", "con", "per", "non", "ma", "pour",
})

# Punctuation removed from both ends of a token before the common-word lookup.
_TOKEN_PUNCTUATION = ".,!?;:'\"()[]"

# Common verbs whose presence suggests running prose rather than a list.
_VERB_REGEX = re.compile(
    r"\b(is|are|was|were|have|has|had|do|does|did|will|would|could|should|may"
    r"|might|can|said|says|went|came|made|took|saw|knew|thought|found|gave"
    r"|told|asked|seemed|felt|looked|heard|began|kept|left|called|turned"
    r"|wanted|tried|needed|used|believe|think|know|see|want|need|find|give"
    r"|tell|become|leave|put|mean|keep|let|begin|seem|help|show|hear|play|run"
    r"|move|live|read|write|learn|speak|bring|hold|stand|set|pay|meet|lead"
    r"|understand|watch|follow|stop|create|allow|add|spend|grow|open|walk"
    r"|offer|remember|consider|appear|buy|wait|serve|die|send|build|stay|fall"
    r"|cut|reach|kill|remain|suggest|raise|pass|sell|require|report|decide"
    r"|pull)\b"
)


def is_english(text: str) -> bool:
    """Check if text appears to be English.

    The text is rejected when any single pattern in NON_ENGLISH_PATTERNS
    yields more than 3 foreign hits, when it has fewer than 30
    whitespace-separated words, or when 8% or fewer of its words are common
    English function words.

    Args:
        text: Text to classify.

    Returns:
        True if the text looks like English, False otherwise.
    """
    text_lower = text.lower()

    # Check for non-English patterns. Matches that are themselves English
    # words ("so", "no", "die", "hat", ...) are ignored; otherwise plain
    # English prose trips the threshold on its own vocabulary.
    for pattern in NON_ENGLISH_PATTERNS:
        foreign_hits = sum(
            1
            for hit in re.findall(pattern, text_lower, re.IGNORECASE)
            if hit not in _COMMON_ENGLISH_WORDS and hit not in _ENGLISH_HOMOGRAPHS
        )
        if foreign_hits > 3:
            return False

    # Check English word ratio
    words = text_lower.split()
    if len(words) < 30:
        return False

    # Strip surrounding punctuation so "the," and "it." still count.
    english_count = sum(
        1 for w in words if w.strip(_TOKEN_PUNCTUATION) in _COMMON_ENGLISH_WORDS
    )
    ratio = english_count / len(words)

    return ratio > 0.08


def is_narrative_text(text: str) -> bool:
    """Check if text is actual narrative (not index/glossary/list).

    Args:
        text: Text to classify.

    Returns:
        False when more than 30% of the non-blank lines match one of
        INDEX_PATTERNS, or when the text has fewer than one common verb per
        50 words; True otherwise (including for empty text).
    """
    # Blank lines are ignored: counting them in the denominator lets an index
    # whose entries are separated by empty lines slip under the threshold.
    lines = [line for line in text.strip().split('\n') if line.strip()]

    # Count lines that look like index entries
    index_lines = sum(
        1
        for line in lines
        if any(re.search(pattern, line) for pattern in INDEX_PATTERNS)
    )

    # If more than 30% are index-like, reject
    if lines and index_lines / len(lines) > 0.3:
        return False

    # Must have actual sentences with verbs
    verb_count = len(_VERB_REGEX.findall(text.lower()))

    # Should have at least 1 verb per 50 words
    words = len(text.split())
    if words > 0 and verb_count / words < 0.02:
        return False

    return True
127
 
128
 
129
  def is_quality_text(text: str) -> bool:
 
134
  return False
135
 
136
  # Must have reasonable length
137
+ if len(text) < 300: # Stricter: was 200
138
  return False
139
 
140
  # Must have sentences (not just fragments)
141
  sentences = re.split(r'[.!?]+', text)
142
+ if len(sentences) < 4: # Stricter: was 3
143
  return False
144
 
145
  # Check for too many special characters
146
  special_ratio = len(re.findall(r'[^\w\s.,!?\'"()-]', text)) / len(text)
147
+ if special_ratio > 0.08: # Stricter: was 0.1
148
+ return False
149
+
150
+ # Must be narrative, not index/list
151
+ if not is_narrative_text(text):
152
  return False
153
 
154
  return True
scripts/download_data.py CHANGED
@@ -113,10 +113,71 @@ def write_jsonl(records: list[dict[str, Any]], path: Path, desc: str = "Writing"
113
  print(f" ✓ {len(records):,} samples → {path}")
114
 
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  # ============== SUMMARIZATION: BOOKS + ARXIV ==============
117
 
118
  def download_booksum(max_samples: int = 40000) -> list[dict[str, Any]]:
119
- """Download BookSum - literary chapter summarization."""
120
  print("\n📖 Loading BookSum (literary summarization)...")
121
 
122
  all_records: list[dict[str, Any]] = []
@@ -129,19 +190,28 @@ def download_booksum(max_samples: int = 40000) -> list[dict[str, Any]]:
129
  indices = random.sample(range(len(data)), min(len(data), limit))
130
 
131
  records = []
 
132
  for i in tqdm(indices, desc=f"BookSum {split}", leave=False):
133
  item = data[i]
134
  chapter = item.get("chapter", "")
135
  summary = item.get("summary_text") or item.get("summary", "")
136
- if chapter and summary and len(chapter) > 300:
137
- records.append({
138
- "source": chapter[:4000],
139
- "summary": summary,
140
- "type": "literary",
141
- "split": split,
142
- })
 
 
 
 
 
 
 
 
143
  all_records.extend(records)
144
- print(f" {split}: {len(records):,}")
145
 
146
  return all_records
147
 
@@ -162,7 +232,7 @@ def clean_arxiv_text(text: str) -> str:
162
 
163
  def download_arxiv_summarization(max_samples: int = 50000) -> list[dict[str, Any]]:
164
  """
165
- Download arXiv papers for academic summarization only.
166
  Note: This dataset doesn't have categories, so can't be used for topic classification.
167
 
168
  Returns: summarization_records
@@ -173,6 +243,7 @@ def download_arxiv_summarization(max_samples: int = 50000) -> list[dict[str, Any
173
  arxiv = load_dataset("ccdv/arxiv-summarization", split="train")
174
 
175
  summ_records: list[dict[str, Any]] = []
 
176
 
177
  indices = list(range(len(arxiv)))
178
  random.shuffle(indices)
@@ -199,6 +270,11 @@ def download_arxiv_summarization(max_samples: int = 50000) -> list[dict[str, Any
199
  if '@' in abstract or '@' in article[:500]:
200
  continue
201
 
 
 
 
 
 
202
  # Summarization: article → abstract
203
  if article and len(article) > 500:
204
  summ_records.append({
@@ -207,7 +283,7 @@ def download_arxiv_summarization(max_samples: int = 50000) -> list[dict[str, Any
207
  "type": "academic",
208
  })
209
 
210
- print(f" Summarization: {len(summ_records):,}")
211
 
212
  return summ_records
213
 
@@ -402,7 +478,7 @@ def download_topics(max_samples: int = 50000) -> None:
402
 
403
 
404
  def download_gutenberg_topics(max_samples: int = 30000) -> list[dict[str, Any]]:
405
- """Extract topic-labeled samples from Gutenberg books."""
406
  print("\n📚 Loading Gutenberg for topic classification...")
407
 
408
  try:
@@ -412,6 +488,7 @@ def download_gutenberg_topics(max_samples: int = 30000) -> list[dict[str, Any]]:
412
  gutenberg = load_dataset("pg19", split="train")
413
 
414
  records: list[dict[str, Any]] = []
 
415
 
416
  indices = list(range(len(gutenberg)))
417
  random.shuffle(indices)
@@ -450,6 +527,11 @@ def download_gutenberg_topics(max_samples: int = 30000) -> list[dict[str, Any]]:
450
  for para in paragraphs[5:]: # Skip front matter
451
  para = para.strip()
452
  if 200 < len(para) < 1500 and para.count('.') >= 2:
 
 
 
 
 
453
  records.append({
454
  "text": para,
455
  "topic": topic,
@@ -457,7 +539,7 @@ def download_gutenberg_topics(max_samples: int = 30000) -> list[dict[str, Any]]:
457
  })
458
  break
459
 
460
- print(f" Gutenberg topics: {len(records):,}")
461
  return records
462
 
463
 
@@ -502,7 +584,7 @@ GUTENBERG_JUNK_REGEX = re.compile("|".join(GUTENBERG_JUNK_PATTERNS), re.IGNORECA
502
 
503
 
504
  def is_clean_prose(text: str) -> bool:
505
- """Check if text is clean literary prose."""
506
  if len(text) < 300 or len(text) > 3000:
507
  return False
508
  if GUTENBERG_JUNK_REGEX.search(text):
@@ -515,12 +597,15 @@ def is_clean_prose(text: str) -> bool:
515
  digit_ratio = sum(1 for c in text if c.isdigit()) / max(len(text), 1)
516
  if digit_ratio > 0.1:
517
  return False
 
 
 
518
  return True
519
 
520
 
521
  def download_gutenberg(max_samples: int = 30000) -> None:
522
- """Download Gutenberg books for language modeling."""
523
- print("\n📚 Downloading Gutenberg Books...")
524
  out_dir = OUTPUT_DIR / "books"
525
  out_dir.mkdir(parents=True, exist_ok=True)
526
 
 
113
  print(f" ✓ {len(records):,} samples → {path}")
114
 
115
 
116
+ # ============== ENGLISH LANGUAGE FILTER ==============
117
+
118
# Common English words for detection
ENGLISH_WORDS = {
    "the", "and", "of", "to", "a", "in", "that", "is", "was", "he", "she", "it",
    "for", "with", "as", "his", "her", "they", "be", "at", "on", "have", "had",
    "this", "but", "not", "from", "by", "or", "an", "said", "were", "been",
    "would", "could", "which", "their", "there", "what", "when", "who", "will",
    "more", "if", "no", "out", "so", "up", "into", "than", "them", "can", "only",
    "other", "new", "some", "very", "just", "over", "such", "also", "its", "then",
}

# Non-English language patterns (one regex per language)
NON_ENGLISH_PATTERNS = [
    # French
    r"\b(le|la|les|un|une|des|du|et|est|sont|dans|pour|avec|sur|qui|que|ce|cette|nous|vous|ils|elles|je|tu|il|elle|être|avoir)\b",
    # German
    r"\b(der|die|das|ein|eine|und|ist|nicht|mit|von|zu|den|dem|auf|für|als|auch|oder|nach|bei|nur|noch|wie|mehr|aber|wenn|hat|kann|ich|sie|er|wir|ihr|es|sich|sein)\b",
    # Spanish
    r"\b(el|la|los|las|un|una|que|por|para|con|del|al|es|en|se|no|más|como|pero|su|sus|le|lo|te|me|nos)\b",
    # Italian
    r"\b(il|lo|la|gli|le|che|per|con|del|della|di|da|non|sono|anche|più|ma|se|mi|ti|ci)\b",
    # Latin
    r"\b(et|in|ad|cum|de|ex|per|pro|sub|ab|ante|post|inter|contra|super|trans|apud)\b",
]

# Words listed in NON_ENGLISH_PATTERNS that are also everyday English words.
# Together with ENGLISH_WORDS (which already covers "in" and "no") these are
# not counted as foreign evidence.
_ENGLISH_HOMOGRAPHS = frozenset({
    "me", "die", "den", "hat", "con", "per", "non", "ma", "pour",
    "ad", "ex", "pro", "sub", "post", "super",
})


def is_english_text(text: str, min_ratio: float = 0.08, max_foreign: int = 5) -> bool:
    """
    Check if text is primarily English.

    Args:
        text: Text to check
        min_ratio: Minimum ratio of common English words
        max_foreign: Maximum number of foreign word matches (per language
            pattern) before rejecting. Matches that are also English words
            are not counted.

    Returns:
        True if text appears to be English. Empty text, text shorter than
        100 characters, and text with fewer than 20 words return False.
    """
    if not text or len(text) < 100:
        return False

    text_lower = text.lower()
    words = text_lower.split()

    if len(words) < 20:
        return False

    # Check for excessive non-English words. Hits such as "in", "no" or "me"
    # are skipped: they occur in the foreign patterns but are among the most
    # frequent English words, so counting them rejects ordinary English text.
    for pattern in NON_ENGLISH_PATTERNS:
        foreign_hits = sum(
            1
            for hit in re.findall(pattern, text_lower)
            if hit not in ENGLISH_WORDS and hit not in _ENGLISH_HOMOGRAPHS
        )
        if foreign_hits > max_foreign:
            return False

    # Check for sufficient English words (punctuation stripped per token)
    english_count = sum(1 for w in words if w.strip(".,!?;:'\"") in ENGLISH_WORDS)
    ratio = english_count / len(words)

    return ratio >= min_ratio
175
+
176
+
177
  # ============== SUMMARIZATION: BOOKS + ARXIV ==============
178
 
179
  def download_booksum(max_samples: int = 40000) -> list[dict[str, Any]]:
180
+ """Download BookSum - literary chapter summarization (English only)."""
181
  print("\n📖 Loading BookSum (literary summarization)...")
182
 
183
  all_records: list[dict[str, Any]] = []
 
190
  indices = random.sample(range(len(data)), min(len(data), limit))
191
 
192
  records = []
193
+ skipped_language = 0
194
  for i in tqdm(indices, desc=f"BookSum {split}", leave=False):
195
  item = data[i]
196
  chapter = item.get("chapter", "")
197
  summary = item.get("summary_text") or item.get("summary", "")
198
+
199
+ if not (chapter and summary and len(chapter) > 300):
200
+ continue
201
+
202
+ # Filter: English only
203
+ if not is_english_text(chapter):
204
+ skipped_language += 1
205
+ continue
206
+
207
+ records.append({
208
+ "source": chapter[:4000],
209
+ "summary": summary,
210
+ "type": "literary",
211
+ "split": split,
212
+ })
213
  all_records.extend(records)
214
+ print(f" {split}: {len(records):,} (skipped {skipped_language} non-English)")
215
 
216
  return all_records
217
 
 
232
 
233
  def download_arxiv_summarization(max_samples: int = 50000) -> list[dict[str, Any]]:
234
  """
235
+ Download arXiv papers for academic summarization only (English only).
236
  Note: This dataset doesn't have categories, so can't be used for topic classification.
237
 
238
  Returns: summarization_records
 
243
  arxiv = load_dataset("ccdv/arxiv-summarization", split="train")
244
 
245
  summ_records: list[dict[str, Any]] = []
246
+ skipped_language = 0
247
 
248
  indices = list(range(len(arxiv)))
249
  random.shuffle(indices)
 
270
  if '@' in abstract or '@' in article[:500]:
271
  continue
272
 
273
+ # Filter: English only
274
+ if not is_english_text(article[:1000]):
275
+ skipped_language += 1
276
+ continue
277
+
278
  # Summarization: article → abstract
279
  if article and len(article) > 500:
280
  summ_records.append({
 
283
  "type": "academic",
284
  })
285
 
286
+ print(f" Summarization: {len(summ_records):,} (skipped {skipped_language} non-English)")
287
 
288
  return summ_records
289
 
 
478
 
479
 
480
  def download_gutenberg_topics(max_samples: int = 30000) -> list[dict[str, Any]]:
481
+ """Extract topic-labeled samples from Gutenberg books (English only)."""
482
  print("\n📚 Loading Gutenberg for topic classification...")
483
 
484
  try:
 
488
  gutenberg = load_dataset("pg19", split="train")
489
 
490
  records: list[dict[str, Any]] = []
491
+ skipped_language = 0
492
 
493
  indices = list(range(len(gutenberg)))
494
  random.shuffle(indices)
 
527
  for para in paragraphs[5:]: # Skip front matter
528
  para = para.strip()
529
  if 200 < len(para) < 1500 and para.count('.') >= 2:
530
+ # Filter: English only
531
+ if not is_english_text(para):
532
+ skipped_language += 1
533
+ break
534
+
535
  records.append({
536
  "text": para,
537
  "topic": topic,
 
539
  })
540
  break
541
 
542
+ print(f" Gutenberg topics: {len(records):,} (skipped {skipped_language} non-English)")
543
  return records
544
 
545
 
 
584
 
585
 
586
  def is_clean_prose(text: str) -> bool:
587
+ """Check if text is clean literary prose (English only)."""
588
  if len(text) < 300 or len(text) > 3000:
589
  return False
590
  if GUTENBERG_JUNK_REGEX.search(text):
 
597
  digit_ratio = sum(1 for c in text if c.isdigit()) / max(len(text), 1)
598
  if digit_ratio > 0.1:
599
  return False
600
+ # English filter
601
+ if not is_english_text(text):
602
+ return False
603
  return True
604
 
605
 
606
  def download_gutenberg(max_samples: int = 30000) -> None:
607
+ """Download Gutenberg books for language modeling (English only)."""
608
+ print("\n📚 Downloading Gutenberg Books (English only)...")
609
  out_dir = OUTPUT_DIR / "books"
610
  out_dir.mkdir(parents=True, exist_ok=True)
611