Spaces:
Sleeping
Sleeping
OliverPerrin committed on
Commit ·
8573220
1
Parent(s): e3422d2
Add English language filter to all data downloads
Browse files
- Filter BookSum, arXiv, and Gutenberg for English only
- Common English word ratio check (min 8%)
- Non-English pattern detection (French, German, Spanish, Italian, Latin)
- Skip non-English content at download time, not discovery time
- Reports skipped count for each source
- scripts/build_discovery_dataset.py +66 -14
- scripts/download_data.py +101 -16
scripts/build_discovery_dataset.py
CHANGED
|
@@ -45,37 +45,85 @@ GARBAGE_PATTERNS = [
|
|
| 45 |
r"transcriber", # Transcriber notes
|
| 46 |
r"eBook", # eBook references
|
| 47 |
r"©|copyright", # Copyright notices
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
]
|
| 49 |
|
| 50 |
-
# Non-English indicators
|
| 51 |
NON_ENGLISH_PATTERNS = [
|
| 52 |
-
r"\b(le|la|les|un|une|des|du|de la|au|aux)\b", # French
|
| 53 |
-
r"\b(der|die|das|ein|eine|und|ist|nicht)\b",
|
| 54 |
-
r"\b(el|la|los|las|un|una|que|por|para)\b",
|
| 55 |
-
r"\b(il|lo|la|gli|le|un|una|che|per|con)\b",
|
| 56 |
-
r"[
|
|
|
|
|
|
|
|
|
|
| 57 |
]
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
def is_english(text: str) -> bool:
|
| 60 |
"""Check if text appears to be English."""
|
| 61 |
text_lower = text.lower()
|
| 62 |
|
| 63 |
-
# Check for non-English patterns
|
| 64 |
for pattern in NON_ENGLISH_PATTERNS:
|
| 65 |
matches = len(re.findall(pattern, text_lower, re.IGNORECASE))
|
| 66 |
-
if matches >
|
| 67 |
return False
|
| 68 |
|
| 69 |
# Check English word ratio
|
| 70 |
-
english_words = ["the", "and", "of", "to", "a", "in", "that", "is", "was", "he", "she", "it", "for", "with", "as", "his", "her", "they", "be", "at", "on", "have", "had", "this", "but", "not", "from", "by", "or", "an"]
|
| 71 |
words = text_lower.split()
|
| 72 |
-
if len(words) <
|
| 73 |
return False
|
| 74 |
|
| 75 |
english_count = sum(1 for w in words if w in english_words)
|
| 76 |
ratio = english_count / len(words)
|
| 77 |
|
| 78 |
-
return ratio > 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
|
| 81 |
def is_quality_text(text: str) -> bool:
|
|
@@ -86,17 +134,21 @@ def is_quality_text(text: str) -> bool:
|
|
| 86 |
return False
|
| 87 |
|
| 88 |
# Must have reasonable length
|
| 89 |
-
if len(text) <
|
| 90 |
return False
|
| 91 |
|
| 92 |
# Must have sentences (not just fragments)
|
| 93 |
sentences = re.split(r'[.!?]+', text)
|
| 94 |
-
if len(sentences) <
|
| 95 |
return False
|
| 96 |
|
| 97 |
# Check for too many special characters
|
| 98 |
special_ratio = len(re.findall(r'[^\w\s.,!?\'"()-]', text)) / len(text)
|
| 99 |
-
if special_ratio > 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
return False
|
| 101 |
|
| 102 |
return True
|
|
|
|
| 45 |
r"transcriber", # Transcriber notes
|
| 46 |
r"eBook", # eBook references
|
| 47 |
r"©|copyright", # Copyright notices
|
| 48 |
+
r"^INDEX", # Index pages
|
| 49 |
+
r"^\d+\.\s+\w+,\s+\d+", # Index entries like "1. Name, 234"
|
| 50 |
+
r"(syn\.|var\.|sp\.)", # Botanical abbreviations
|
| 51 |
+
r"[A-Z][a-z]+aceae", # Botanical family names
|
| 52 |
+
r"\(\s*syn\s+", # Synonym references
|
| 53 |
]
|
| 54 |
|
| 55 |
+
# Non-English indicators (expanded).
# NOTE: several alternatives ("so", "no", ...) are also everyday English
# words; is_english() ignores such hits instead of counting them as foreign.
NON_ENGLISH_PATTERNS = [
    r"\b(le|la|les|un|une|des|du|de la|au|aux|et|est|sont|dans|pour|avec|sur|qui|que)\b",  # French
    r"\b(der|die|das|ein|eine|und|ist|nicht|mit|von|zu|den|dem|auf|für|als|auch|oder|nach|bei|nur|noch|wie|mehr|aber|wenn|so|hat|kann|ich|sie|er|wir|ihr|es|sich|sein)\b",  # German (expanded)
    r"\b(el|la|los|las|un|una|que|por|para|con|del|al|es|en|se|no|más|como|pero|su|sus)\b",  # Spanish
    r"\b(il|lo|la|gli|le|un|una|che|per|con|del|della|di|da|non|sono|è|anche|più|ma|se)\b",  # Italian
    r"[àâäéèêëïîôùûüÿœæäöüß]{2,}",  # Accented chars (German ß, umlauts)
    r"\b[A-Z][a-z]+ü[a-z]+\b",  # German words with ü
    r"\b[A-Z][a-z]+ö[a-z]+\b",  # German words with ö
    r"\b[A-Z][a-z]+ä[a-z]+\b",  # German words with ä
]

# Patterns that indicate index/glossary/list content (not narrative)
INDEX_PATTERNS = [
    r"^\s*\d+\s*$",  # Just numbers
    r"^[A-Z][a-z]+,\s+\d+",  # "Word, 123" index entries
    r"(\d+,\s*)+\d+",  # Lists of page numbers
    r"^[A-Z]{2,}\s+",  # ALL CAPS words at start
    r"^\s*[-•]\s+",  # Bullet points
    r"p\.\s*\d+",  # Page references
]

# Compiled once at import time so is_english() does not re-resolve the
# patterns on every call.
_NON_ENGLISH_REGEXES = [re.compile(p, re.IGNORECASE) for p in NON_ENGLISH_PATTERNS]

# Frequent English function words; a set gives O(1) membership tests.
_COMMON_ENGLISH_WORDS = frozenset({
    "the", "and", "of", "to", "a", "in", "that", "is", "was", "he", "she", "it",
    "for", "with", "as", "his", "her", "they", "be", "at", "on", "have", "had",
    "this", "but", "not", "from", "by", "or", "an", "said", "were", "been",
    "would", "could", "which", "their", "there", "what", "when", "who", "will",
    "more", "if", "no", "out", "so", "up", "into", "than", "them", "can", "only",
    "other", "new", "some", "very", "just", "over", "such", "also", "its", "then",
    "two", "first", "any", "these", "may", "after", "most", "made", "before",
    "should", "now", "where", "those", "being", "has", "between", "own", "under",
})

# Punctuation removed from token edges before the English-word lookup, so
# "the," and "and." still count as English words.
_TOKEN_PUNCTUATION = ".,!?;:'\""

_MIN_WORDS = 30  # Texts shorter than this are rejected outright
_MAX_FOREIGN_PER_100_WORDS = 3  # Foreign hits tolerated per pattern
_MIN_ENGLISH_RATIO = 0.08  # Required share of common English words


def is_english(text: str) -> bool:
    """Check if text appears to be English.

    The text is rejected when any single pattern in NON_ENGLISH_PATTERNS
    matches too often, or when too few tokens are common English words.

    Matches that are themselves common English words (e.g. "so", "no") are
    not counted as foreign, and the foreign-hit allowance grows with the
    text length (3 hits for up to 100 words, proportionally more beyond
    that), so long English passages are not rejected merely for being long.

    Args:
        text: Text to classify.

    Returns:
        True if the text looks like English, False otherwise (including
        texts with fewer than 30 whitespace-separated words).
    """
    text_lower = text.lower()
    words = text_lower.split()
    if len(words) < _MIN_WORDS:
        return False

    # Scale the allowance with length; short texts keep the base threshold.
    allowed_foreign = _MAX_FOREIGN_PER_100_WORDS * max(1.0, len(words) / 100)
    for regex in _NON_ENGLISH_REGEXES:
        foreign_hits = sum(
            1
            for match in regex.finditer(text_lower)
            if match.group(0) not in _COMMON_ENGLISH_WORDS
        )
        if foreign_hits > allowed_foreign:
            return False

    # Check English word ratio
    english_count = sum(
        1 for word in words if word.strip(_TOKEN_PUNCTUATION) in _COMMON_ENGLISH_WORDS
    )
    return english_count / len(words) > _MIN_ENGLISH_RATIO
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
# Common English verbs used as a cheap signal that the text contains real
# sentences; compiled once instead of on every call.
_COMMON_VERB_REGEX = re.compile(
    r"\b(is|are|was|were|have|has|had|do|does|did|will|would|could|should|may"
    r"|might|can|said|says|went|came|made|took|saw|knew|thought|found|gave"
    r"|told|asked|seemed|felt|looked|heard|began|kept|left|called|turned"
    r"|wanted|tried|needed|used|believe|think|know|see|want|need|find|give"
    r"|tell|become|leave|put|mean|keep|let|begin|seem|help|show|hear|play|run"
    r"|move|live|read|write|learn|speak|bring|hold|stand|set|pay|meet|lead"
    r"|understand|watch|follow|stop|create|allow|add|spend|grow|open|walk"
    r"|offer|remember|consider|appear|buy|wait|serve|die|send|build|stay|fall"
    r"|cut|reach|kill|remain|suggest|raise|pass|sell|require|report|decide"
    r"|pull)\b"
)


def is_narrative_text(text: str) -> bool:
    """Check if text is actual narrative (not index/glossary/list).

    Two heuristics are applied:

    1. At most 30% of the non-blank lines may match one of INDEX_PATTERNS.
       Blank lines are ignored so paragraph spacing cannot dilute the ratio.
    2. The text must contain at least one common verb per 50 words.

    Args:
        text: Text to classify.

    Returns:
        True if the text looks like narrative prose; False for index-like
        content, verb-poor content, and empty or whitespace-only input.
    """
    lines = [line for line in text.strip().split('\n') if line.strip()]
    if not lines:
        # Nothing to analyse; empty input is not narrative.
        return False

    # Count lines that look like index entries
    index_lines = sum(
        1
        for line in lines
        if any(re.search(pattern, line) for pattern in INDEX_PATTERNS)
    )

    # If more than 30% are index-like, reject
    if index_lines / len(lines) > 0.3:
        return False

    # Should have at least 1 verb per 50 words. At least one non-blank line
    # exists here, so the word count is never zero.
    word_count = len(text.split())
    verb_count = len(_COMMON_VERB_REGEX.findall(text.lower()))
    return verb_count / word_count >= 0.02
|
| 127 |
|
| 128 |
|
| 129 |
def is_quality_text(text: str) -> bool:
|
|
|
|
| 134 |
return False
|
| 135 |
|
| 136 |
# Must have reasonable length
|
| 137 |
+
if len(text) < 300: # Stricter: was 200
|
| 138 |
return False
|
| 139 |
|
| 140 |
# Must have sentences (not just fragments)
|
| 141 |
sentences = re.split(r'[.!?]+', text)
|
| 142 |
+
if len(sentences) < 4: # Stricter: was 3
|
| 143 |
return False
|
| 144 |
|
| 145 |
# Check for too many special characters
|
| 146 |
special_ratio = len(re.findall(r'[^\w\s.,!?\'"()-]', text)) / len(text)
|
| 147 |
+
if special_ratio > 0.08: # Stricter: was 0.1
|
| 148 |
+
return False
|
| 149 |
+
|
| 150 |
+
# Must be narrative, not index/list
|
| 151 |
+
if not is_narrative_text(text):
|
| 152 |
return False
|
| 153 |
|
| 154 |
return True
|
scripts/download_data.py
CHANGED
|
@@ -113,10 +113,71 @@ def write_jsonl(records: list[dict[str, Any]], path: Path, desc: str = "Writing"
|
|
| 113 |
print(f" ✓ {len(records):,} samples → {path}")
|
| 114 |
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
# ============== SUMMARIZATION: BOOKS + ARXIV ==============
|
| 117 |
|
| 118 |
def download_booksum(max_samples: int = 40000) -> list[dict[str, Any]]:
|
| 119 |
-
"""Download BookSum - literary chapter summarization."""
|
| 120 |
print("\n📖 Loading BookSum (literary summarization)...")
|
| 121 |
|
| 122 |
all_records: list[dict[str, Any]] = []
|
|
@@ -129,19 +190,28 @@ def download_booksum(max_samples: int = 40000) -> list[dict[str, Any]]:
|
|
| 129 |
indices = random.sample(range(len(data)), min(len(data), limit))
|
| 130 |
|
| 131 |
records = []
|
|
|
|
| 132 |
for i in tqdm(indices, desc=f"BookSum {split}", leave=False):
|
| 133 |
item = data[i]
|
| 134 |
chapter = item.get("chapter", "")
|
| 135 |
summary = item.get("summary_text") or item.get("summary", "")
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
all_records.extend(records)
|
| 144 |
-
print(f" {split}: {len(records):,}")
|
| 145 |
|
| 146 |
return all_records
|
| 147 |
|
|
@@ -162,7 +232,7 @@ def clean_arxiv_text(text: str) -> str:
|
|
| 162 |
|
| 163 |
def download_arxiv_summarization(max_samples: int = 50000) -> list[dict[str, Any]]:
|
| 164 |
"""
|
| 165 |
-
Download arXiv papers for academic summarization only.
|
| 166 |
Note: This dataset doesn't have categories, so can't be used for topic classification.
|
| 167 |
|
| 168 |
Returns: summarization_records
|
|
@@ -173,6 +243,7 @@ def download_arxiv_summarization(max_samples: int = 50000) -> list[dict[str, Any
|
|
| 173 |
arxiv = load_dataset("ccdv/arxiv-summarization", split="train")
|
| 174 |
|
| 175 |
summ_records: list[dict[str, Any]] = []
|
|
|
|
| 176 |
|
| 177 |
indices = list(range(len(arxiv)))
|
| 178 |
random.shuffle(indices)
|
|
@@ -199,6 +270,11 @@ def download_arxiv_summarization(max_samples: int = 50000) -> list[dict[str, Any
|
|
| 199 |
if '@' in abstract or '@' in article[:500]:
|
| 200 |
continue
|
| 201 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
# Summarization: article → abstract
|
| 203 |
if article and len(article) > 500:
|
| 204 |
summ_records.append({
|
|
@@ -207,7 +283,7 @@ def download_arxiv_summarization(max_samples: int = 50000) -> list[dict[str, Any
|
|
| 207 |
"type": "academic",
|
| 208 |
})
|
| 209 |
|
| 210 |
-
print(f" Summarization: {len(summ_records):,}")
|
| 211 |
|
| 212 |
return summ_records
|
| 213 |
|
|
@@ -402,7 +478,7 @@ def download_topics(max_samples: int = 50000) -> None:
|
|
| 402 |
|
| 403 |
|
| 404 |
def download_gutenberg_topics(max_samples: int = 30000) -> list[dict[str, Any]]:
|
| 405 |
-
"""Extract topic-labeled samples from Gutenberg books."""
|
| 406 |
print("\n📚 Loading Gutenberg for topic classification...")
|
| 407 |
|
| 408 |
try:
|
|
@@ -412,6 +488,7 @@ def download_gutenberg_topics(max_samples: int = 30000) -> list[dict[str, Any]]:
|
|
| 412 |
gutenberg = load_dataset("pg19", split="train")
|
| 413 |
|
| 414 |
records: list[dict[str, Any]] = []
|
|
|
|
| 415 |
|
| 416 |
indices = list(range(len(gutenberg)))
|
| 417 |
random.shuffle(indices)
|
|
@@ -450,6 +527,11 @@ def download_gutenberg_topics(max_samples: int = 30000) -> list[dict[str, Any]]:
|
|
| 450 |
for para in paragraphs[5:]: # Skip front matter
|
| 451 |
para = para.strip()
|
| 452 |
if 200 < len(para) < 1500 and para.count('.') >= 2:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 453 |
records.append({
|
| 454 |
"text": para,
|
| 455 |
"topic": topic,
|
|
@@ -457,7 +539,7 @@ def download_gutenberg_topics(max_samples: int = 30000) -> list[dict[str, Any]]:
|
|
| 457 |
})
|
| 458 |
break
|
| 459 |
|
| 460 |
-
print(f" Gutenberg topics: {len(records):,}")
|
| 461 |
return records
|
| 462 |
|
| 463 |
|
|
@@ -502,7 +584,7 @@ GUTENBERG_JUNK_REGEX = re.compile("|".join(GUTENBERG_JUNK_PATTERNS), re.IGNORECA
|
|
| 502 |
|
| 503 |
|
| 504 |
def is_clean_prose(text: str) -> bool:
|
| 505 |
-
"""Check if text is clean literary prose."""
|
| 506 |
if len(text) < 300 or len(text) > 3000:
|
| 507 |
return False
|
| 508 |
if GUTENBERG_JUNK_REGEX.search(text):
|
|
@@ -515,12 +597,15 @@ def is_clean_prose(text: str) -> bool:
|
|
| 515 |
digit_ratio = sum(1 for c in text if c.isdigit()) / max(len(text), 1)
|
| 516 |
if digit_ratio > 0.1:
|
| 517 |
return False
|
|
|
|
|
|
|
|
|
|
| 518 |
return True
|
| 519 |
|
| 520 |
|
| 521 |
def download_gutenberg(max_samples: int = 30000) -> None:
|
| 522 |
-
"""Download Gutenberg books for language modeling."""
|
| 523 |
-
print("\n📚 Downloading Gutenberg Books...")
|
| 524 |
out_dir = OUTPUT_DIR / "books"
|
| 525 |
out_dir.mkdir(parents=True, exist_ok=True)
|
| 526 |
|
|
|
|
| 113 |
print(f" ✓ {len(records):,} samples → {path}")
|
| 114 |
|
| 115 |
|
| 116 |
+
# ============== ENGLISH LANGUAGE FILTER ==============

# Common English words for detection
ENGLISH_WORDS = {
    "the", "and", "of", "to", "a", "in", "that", "is", "was", "he", "she", "it",
    "for", "with", "as", "his", "her", "they", "be", "at", "on", "have", "had",
    "this", "but", "not", "from", "by", "or", "an", "said", "were", "been",
    "would", "could", "which", "their", "there", "what", "when", "who", "will",
    "more", "if", "no", "out", "so", "up", "into", "than", "them", "can", "only",
    "other", "new", "some", "very", "just", "over", "such", "also", "its", "then",
}

# Non-English language patterns.
# NOTE: some alternatives ("in", "no", ...) are also members of ENGLISH_WORDS;
# is_english_text() does not count those hits as foreign.
NON_ENGLISH_PATTERNS = [
    # French
    r"\b(le|la|les|un|une|des|du|et|est|sont|dans|pour|avec|sur|qui|que|ce|cette|nous|vous|ils|elles|je|tu|il|elle|être|avoir)\b",
    # German
    r"\b(der|die|das|ein|eine|und|ist|nicht|mit|von|zu|den|dem|auf|für|als|auch|oder|nach|bei|nur|noch|wie|mehr|aber|wenn|hat|kann|ich|sie|er|wir|ihr|es|sich|sein)\b",
    # Spanish
    r"\b(el|la|los|las|un|una|que|por|para|con|del|al|es|en|se|no|más|como|pero|su|sus|le|lo|te|me|nos)\b",
    # Italian
    r"\b(il|lo|la|gli|le|che|per|con|del|della|di|da|non|sono|anche|più|ma|se|mi|ti|ci)\b",
    # Latin
    r"\b(et|in|ad|cum|de|ex|per|pro|sub|ab|ante|post|inter|contra|super|trans|apud)\b",
]

# Compiled once at import time; is_english_text() runs once per sample.
_NON_ENGLISH_REGEXES = [re.compile(pattern) for pattern in NON_ENGLISH_PATTERNS]

# Punctuation removed from token edges before the ENGLISH_WORDS lookup.
_TOKEN_PUNCTUATION = ".,!?;:'\""


def is_english_text(text: str, min_ratio: float = 0.08, max_foreign: int = 5) -> bool:
    """
    Check if text is primarily English.

    Foreign-word hits are counted separately for each language pattern.
    Hits that are themselves common English words (e.g. "in", "no") are
    ignored, and the allowance scales with text length, so a full book
    chapter is not rejected just because it contains more than a handful
    of incidental matches.

    Args:
        text: Text to check
        min_ratio: Minimum ratio of common English words
        max_foreign: Maximum number of foreign word matches (per language
            pattern) tolerated per 100 words before rejecting; texts of up
            to 100 words get exactly this allowance

    Returns:
        True if text appears to be English; False for empty input, text
        shorter than 100 characters or 20 words, or text that fails
        either check
    """
    if not text or len(text) < 100:
        return False

    text_lower = text.lower()
    words = text_lower.split()

    if len(words) < 20:
        return False

    # Check for excessive non-English words, relative to the text length.
    allowed_foreign = max_foreign * max(1.0, len(words) / 100)
    for regex in _NON_ENGLISH_REGEXES:
        foreign_hits = sum(
            1
            for match in regex.finditer(text_lower)
            if match.group(0) not in ENGLISH_WORDS
        )
        if foreign_hits > allowed_foreign:
            return False

    # Check for sufficient English words
    english_count = sum(
        1 for word in words if word.strip(_TOKEN_PUNCTUATION) in ENGLISH_WORDS
    )
    return english_count / len(words) >= min_ratio
|
| 175 |
+
|
| 176 |
+
|
| 177 |
# ============== SUMMARIZATION: BOOKS + ARXIV ==============
|
| 178 |
|
| 179 |
def download_booksum(max_samples: int = 40000) -> list[dict[str, Any]]:
|
| 180 |
+
"""Download BookSum - literary chapter summarization (English only)."""
|
| 181 |
print("\n📖 Loading BookSum (literary summarization)...")
|
| 182 |
|
| 183 |
all_records: list[dict[str, Any]] = []
|
|
|
|
| 190 |
indices = random.sample(range(len(data)), min(len(data), limit))
|
| 191 |
|
| 192 |
records = []
|
| 193 |
+
skipped_language = 0
|
| 194 |
for i in tqdm(indices, desc=f"BookSum {split}", leave=False):
|
| 195 |
item = data[i]
|
| 196 |
chapter = item.get("chapter", "")
|
| 197 |
summary = item.get("summary_text") or item.get("summary", "")
|
| 198 |
+
|
| 199 |
+
if not (chapter and summary and len(chapter) > 300):
|
| 200 |
+
continue
|
| 201 |
+
|
| 202 |
+
# Filter: English only
|
| 203 |
+
if not is_english_text(chapter):
|
| 204 |
+
skipped_language += 1
|
| 205 |
+
continue
|
| 206 |
+
|
| 207 |
+
records.append({
|
| 208 |
+
"source": chapter[:4000],
|
| 209 |
+
"summary": summary,
|
| 210 |
+
"type": "literary",
|
| 211 |
+
"split": split,
|
| 212 |
+
})
|
| 213 |
all_records.extend(records)
|
| 214 |
+
print(f" {split}: {len(records):,} (skipped {skipped_language} non-English)")
|
| 215 |
|
| 216 |
return all_records
|
| 217 |
|
|
|
|
| 232 |
|
| 233 |
def download_arxiv_summarization(max_samples: int = 50000) -> list[dict[str, Any]]:
|
| 234 |
"""
|
| 235 |
+
Download arXiv papers for academic summarization only (English only).
|
| 236 |
Note: This dataset doesn't have categories, so can't be used for topic classification.
|
| 237 |
|
| 238 |
Returns: summarization_records
|
|
|
|
| 243 |
arxiv = load_dataset("ccdv/arxiv-summarization", split="train")
|
| 244 |
|
| 245 |
summ_records: list[dict[str, Any]] = []
|
| 246 |
+
skipped_language = 0
|
| 247 |
|
| 248 |
indices = list(range(len(arxiv)))
|
| 249 |
random.shuffle(indices)
|
|
|
|
| 270 |
if '@' in abstract or '@' in article[:500]:
|
| 271 |
continue
|
| 272 |
|
| 273 |
+
# Filter: English only
|
| 274 |
+
if not is_english_text(article[:1000]):
|
| 275 |
+
skipped_language += 1
|
| 276 |
+
continue
|
| 277 |
+
|
| 278 |
# Summarization: article → abstract
|
| 279 |
if article and len(article) > 500:
|
| 280 |
summ_records.append({
|
|
|
|
| 283 |
"type": "academic",
|
| 284 |
})
|
| 285 |
|
| 286 |
+
print(f" Summarization: {len(summ_records):,} (skipped {skipped_language} non-English)")
|
| 287 |
|
| 288 |
return summ_records
|
| 289 |
|
|
|
|
| 478 |
|
| 479 |
|
| 480 |
def download_gutenberg_topics(max_samples: int = 30000) -> list[dict[str, Any]]:
|
| 481 |
+
"""Extract topic-labeled samples from Gutenberg books (English only)."""
|
| 482 |
print("\n📚 Loading Gutenberg for topic classification...")
|
| 483 |
|
| 484 |
try:
|
|
|
|
| 488 |
gutenberg = load_dataset("pg19", split="train")
|
| 489 |
|
| 490 |
records: list[dict[str, Any]] = []
|
| 491 |
+
skipped_language = 0
|
| 492 |
|
| 493 |
indices = list(range(len(gutenberg)))
|
| 494 |
random.shuffle(indices)
|
|
|
|
| 527 |
for para in paragraphs[5:]: # Skip front matter
|
| 528 |
para = para.strip()
|
| 529 |
if 200 < len(para) < 1500 and para.count('.') >= 2:
|
| 530 |
+
# Filter: English only
|
| 531 |
+
if not is_english_text(para):
|
| 532 |
+
skipped_language += 1
|
| 533 |
+
break
|
| 534 |
+
|
| 535 |
records.append({
|
| 536 |
"text": para,
|
| 537 |
"topic": topic,
|
|
|
|
| 539 |
})
|
| 540 |
break
|
| 541 |
|
| 542 |
+
print(f" Gutenberg topics: {len(records):,} (skipped {skipped_language} non-English)")
|
| 543 |
return records
|
| 544 |
|
| 545 |
|
|
|
|
| 584 |
|
| 585 |
|
| 586 |
def is_clean_prose(text: str) -> bool:
|
| 587 |
+
"""Check if text is clean literary prose (English only)."""
|
| 588 |
if len(text) < 300 or len(text) > 3000:
|
| 589 |
return False
|
| 590 |
if GUTENBERG_JUNK_REGEX.search(text):
|
|
|
|
| 597 |
digit_ratio = sum(1 for c in text if c.isdigit()) / max(len(text), 1)
|
| 598 |
if digit_ratio > 0.1:
|
| 599 |
return False
|
| 600 |
+
# English filter
|
| 601 |
+
if not is_english_text(text):
|
| 602 |
+
return False
|
| 603 |
return True
|
| 604 |
|
| 605 |
|
| 606 |
def download_gutenberg(max_samples: int = 30000) -> None:
|
| 607 |
+
"""Download Gutenberg books for language modeling (English only)."""
|
| 608 |
+
print("\n📚 Downloading Gutenberg Books (English only)...")
|
| 609 |
out_dir = OUTPUT_DIR / "books"
|
| 610 |
out_dir.mkdir(parents=True, exist_ok=True)
|
| 611 |
|