Spaces:

OliverPerrin
/

LexiMind

Sleeping

OliverPerrin committed on Jan 14

Commit

8573220

1 Parent(s): e3422d2

Add English language filter to all data downloads

- Filter BookSum, arXiv, and Gutenberg for English only
- Common English word ratio check (min 8%)
- Non-English pattern detection (French, German, Spanish, Italian, Latin)
- Skip non-English content at download time, not discovery time
- Reports skipped count for each source

Files changed (2) hide show

scripts/build_discovery_dataset.py +66 -14
scripts/download_data.py +101 -16

scripts/build_discovery_dataset.py CHANGED Viewed

@@ -45,37 +45,85 @@ GARBAGE_PATTERNS = [
     r"transcriber",          # Transcriber notes
     r"eBook",                # eBook references
     r"©|copyright",          # Copyright notices
 ]
-# Non-English indicators
 NON_ENGLISH_PATTERNS = [
-    r"\b(le|la|les|un|une|des|du|de la|au|aux)\b",  # French articles
-    r"\b(der|die|das|ein|eine|und|ist|nicht)\b",     # German
-    r"\b(el|la|los|las|un|una|que|por|para)\b",      # Spanish
-    r"\b(il|lo|la|gli|le|un|una|che|per|con)\b",     # Italian
-    r"[àâäéèêëïîôùûüÿœæ]{3,}",                       # Multiple French accents
 ]
 def is_english(text: str) -> bool:
     """Check if text appears to be English."""
     text_lower = text.lower()
-    # Check for non-English patterns
     for pattern in NON_ENGLISH_PATTERNS:
         matches = len(re.findall(pattern, text_lower, re.IGNORECASE))
-        if matches > 5:  # Too many non-English words
             return False
     # Check English word ratio
-    english_words = ["the", "and", "of", "to", "a", "in", "that", "is", "was", "he", "she", "it", "for", "with", "as", "his", "her", "they", "be", "at", "on", "have", "had", "this", "but", "not", "from", "by", "or", "an"]
     words = text_lower.split()
-    if len(words) < 20:
         return False
     english_count = sum(1 for w in words if w in english_words)
     ratio = english_count / len(words)
-    return ratio > 0.05  # At least 5% common English words
 def is_quality_text(text: str) -> bool:
@@ -86,17 +134,21 @@ def is_quality_text(text: str) -> bool:
             return False
     # Must have reasonable length
-    if len(text) < 200:
         return False
     # Must have sentences (not just fragments)
     sentences = re.split(r'[.!?]+', text)
-    if len(sentences) < 3:
         return False
     # Check for too many special characters
     special_ratio = len(re.findall(r'[^\w\s.,!?\'"()-]', text)) / len(text)
-    if special_ratio > 0.1:
         return False
     return True

     r"transcriber",          # Transcriber notes
     r"eBook",                # eBook references
     r"©|copyright",          # Copyright notices
+    r"^INDEX",               # Index pages
+    r"^\d+\.\s+\w+,\s+\d+",  # Index entries like "1. Name, 234"
+    r"(syn\.|var\.|sp\.)",   # Botanical abbreviations
+    r"[A-Z][a-z]+aceae",     # Botanical family names
+    r"\(\s*syn\s+",          # Synonym references
 ]
# Non-English indicators (expanded)
NON_ENGLISH_PATTERNS = [
    r"\b(le|la|les|un|une|des|du|de la|au|aux|et|est|sont|dans|pour|avec|sur|qui|que)\b",  # French
    r"\b(der|die|das|ein|eine|und|ist|nicht|mit|von|zu|den|dem|auf|für|als|auch|oder|nach|bei|nur|noch|wie|mehr|aber|wenn|so|hat|kann|ich|sie|er|wir|ihr|es|sich|sein)\b",  # German (expanded)
    r"\b(el|la|los|las|un|una|que|por|para|con|del|al|es|en|se|no|más|como|pero|su|sus)\b",  # Spanish
    r"\b(il|lo|la|gli|le|un|una|che|per|con|del|della|di|da|non|sono|è|anche|più|ma|se)\b",  # Italian
    r"[àâäéèêëïîôùûüÿœæäöüß]{2,}",  # Accented chars (German ß, umlauts)
    r"\b[A-Z][a-z]+ü[a-z]+\b",  # German words with ü
    r"\b[A-Z][a-z]+ö[a-z]+\b",  # German words with ö
    r"\b[A-Z][a-z]+ä[a-z]+\b",  # German words with ä
]

# Patterns that indicate index/glossary/list content (not narrative)
INDEX_PATTERNS = [
    r"^\s*\d+\s*$",           # Just numbers
    r"^[A-Z][a-z]+,\s+\d+",   # "Word, 123" index entries
    r"(\d+,\s*)+\d+",         # Lists of page numbers
    r"^[A-Z]{2,}\s+",         # ALL CAPS words at start
    r"^\s*[-•]\s+",           # Bullet points
    r"p\.\s*\d+",             # Page references
]

# Common English words used by is_english(). Kept as a module-level frozenset
# so membership tests are O(1) and the collection is built only once.
_COMMON_ENGLISH_WORDS = frozenset({
    "the", "and", "of", "to", "a", "in", "that", "is", "was", "he", "she", "it",
    "for", "with", "as", "his", "her", "they", "be", "at", "on", "have", "had",
    "this", "but", "not", "from", "by", "or", "an", "said", "were", "been",
    "would", "could", "which", "their", "there", "what", "when", "who", "will",
    "more", "if", "no", "out", "so", "up", "into", "than", "them", "can", "only",
    "other", "new", "some", "very", "just", "over", "such", "also", "its", "then",
    "two", "first", "any", "these", "may", "after", "most", "made", "before",
    "should", "now", "where", "those", "being", "has", "between", "own", "under",
})


def is_english(text: str) -> bool:
    """Check if text appears to be English.

    Two heuristics are applied in order:

    1. For each entry of NON_ENGLISH_PATTERNS, count its matches in the
       lower-cased text. Matches that are themselves common English words
       (for example "so" and "no", which also appear in the German and Spanish
       patterns) are not counted. More than 3 remaining matches for any single
       pattern rejects the text.
    2. At least 30 whitespace-separated tokens are required, and more than 8%
       of them (after stripping surrounding punctuation) must be common
       English words.

    Args:
        text: Text to classify.

    Returns:
        True if the text passes both heuristics, False otherwise.
    """
    text_lower = text.lower()

    # Check for non-English patterns - stricter threshold
    for pattern in NON_ENGLISH_PATTERNS:
        # Every grouped pattern has exactly one group, so findall yields plain
        # strings that can be compared against the English word set.
        foreign_hits = sum(
            1
            for match in re.findall(pattern, text_lower, re.IGNORECASE)
            if match not in _COMMON_ENGLISH_WORDS
        )
        if foreign_hits > 3:  # Stricter: was 5
            return False

    # Check English word ratio
    words = text_lower.split()
    if len(words) < 30:  # Stricter: was 20
        return False

    # Strip surrounding punctuation so tokens like "the," or '"and' count.
    english_count = sum(
        1 for w in words if w.strip(".,!?;:'\"") in _COMMON_ENGLISH_WORDS
    )
    ratio = english_count / len(words)
    return ratio > 0.08  # Stricter: was 0.05
def is_narrative_text(text: str) -> bool:
    """Check if text is actual narrative (not index/glossary/list).

    The text is rejected when more than 30% of its lines match any entry of
    INDEX_PATTERNS, or when fewer than 2% of its whitespace-separated words
    are matches of the common-verb pattern below.

    Args:
        text: Text to inspect.

    Returns:
        True if the text passes both checks, False otherwise.
    """
    all_lines = text.strip().split('\n')

    # A line counts once, no matter how many index patterns it matches.
    index_like = sum(
        1
        for candidate in all_lines
        if any(re.search(index_pattern, candidate) for index_pattern in INDEX_PATTERNS)
    )

    # If more than 30% are index-like, reject
    if len(all_lines) > 0 and index_like / len(all_lines) > 0.3:
        return False

    # Narrative prose is expected to contain common verbs.
    verb_patterns = r"\b(is|are|was|were|have|has|had|do|does|did|will|would|could|should|may|might|can|said|says|went|came|made|took|saw|knew|thought|found|gave|told|asked|seemed|felt|looked|heard|began|kept|left|called|turned|wanted|tried|needed|used|believe|think|know|see|want|need|find|give|tell|become|leave|put|mean|keep|let|begin|seem|help|show|hear|play|run|move|live|read|write|learn|speak|bring|hold|stand|set|pay|meet|lead|understand|watch|follow|stop|create|speak|allow|add|spend|grow|open|walk|offer|remember|consider|appear|buy|wait|serve|die|send|build|stay|fall|cut|reach|kill|remain|suggest|raise|pass|sell|require|report|decide|pull)\b"
    verb_hits = len(re.findall(verb_patterns, text.lower()))

    # Should have at least 1 verb per 50 words
    word_total = len(text.split())
    too_few_verbs = word_total > 0 and verb_hits / word_total < 0.02
    return not too_few_verbs
 def is_quality_text(text: str) -> bool:
             return False
     # Must have reasonable length
+    if len(text) < 300:  # Stricter: was 200
         return False
     # Must have sentences (not just fragments)
     sentences = re.split(r'[.!?]+', text)
+    if len(sentences) < 4:  # Stricter: was 3
         return False
     # Check for too many special characters
     special_ratio = len(re.findall(r'[^\w\s.,!?\'"()-]', text)) / len(text)
+    if special_ratio > 0.08:  # Stricter: was 0.1
+        return False
+    # Must be narrative, not index/list
+    if not is_narrative_text(text):
         return False
     return True

scripts/download_data.py CHANGED Viewed

@@ -113,10 +113,71 @@ def write_jsonl(records: list[dict[str, Any]], path: Path, desc: str = "Writing"
     print(f"  ✓ {len(records):,} samples → {path}")
 # ============== SUMMARIZATION: BOOKS + ARXIV ==============
 def download_booksum(max_samples: int = 40000) -> list[dict[str, Any]]:
-    """Download BookSum - literary chapter summarization."""
     print("\n📖 Loading BookSum (literary summarization)...")
     all_records: list[dict[str, Any]] = []
@@ -129,19 +190,28 @@ def download_booksum(max_samples: int = 40000) -> list[dict[str, Any]]:
         indices = random.sample(range(len(data)), min(len(data), limit))
         records = []
         for i in tqdm(indices, desc=f"BookSum {split}", leave=False):
             item = data[i]
             chapter = item.get("chapter", "")
             summary = item.get("summary_text") or item.get("summary", "")
-            if chapter and summary and len(chapter) > 300:
-                records.append({
-                    "source": chapter[:4000],
-                    "summary": summary,
-                    "type": "literary",
-                    "split": split,
-                })
         all_records.extend(records)
-        print(f"    {split}: {len(records):,}")
     return all_records
@@ -162,7 +232,7 @@ def clean_arxiv_text(text: str) -> str:
 def download_arxiv_summarization(max_samples: int = 50000) -> list[dict[str, Any]]:
     """
-    Download arXiv papers for academic summarization only.
     Note: This dataset doesn't have categories, so can't be used for topic classification.
     Returns: summarization_records
@@ -173,6 +243,7 @@ def download_arxiv_summarization(max_samples: int = 50000) -> list[dict[str, Any
     arxiv = load_dataset("ccdv/arxiv-summarization", split="train")
     summ_records: list[dict[str, Any]] = []
     indices = list(range(len(arxiv)))
     random.shuffle(indices)
@@ -199,6 +270,11 @@ def download_arxiv_summarization(max_samples: int = 50000) -> list[dict[str, Any
         if '@' in abstract or '@' in article[:500]:
             continue
         # Summarization: article → abstract
         if article and len(article) > 500:
             summ_records.append({
@@ -207,7 +283,7 @@ def download_arxiv_summarization(max_samples: int = 50000) -> list[dict[str, Any
                 "type": "academic",
             })
-    print(f"    Summarization: {len(summ_records):,}")
     return summ_records
@@ -402,7 +478,7 @@ def download_topics(max_samples: int = 50000) -> None:
 def download_gutenberg_topics(max_samples: int = 30000) -> list[dict[str, Any]]:
-    """Extract topic-labeled samples from Gutenberg books."""
     print("\n📚 Loading Gutenberg for topic classification...")
     try:
@@ -412,6 +488,7 @@ def download_gutenberg_topics(max_samples: int = 30000) -> list[dict[str, Any]]:
         gutenberg = load_dataset("pg19", split="train")
     records: list[dict[str, Any]] = []
     indices = list(range(len(gutenberg)))
     random.shuffle(indices)
@@ -450,6 +527,11 @@ def download_gutenberg_topics(max_samples: int = 30000) -> list[dict[str, Any]]:
             for para in paragraphs[5:]:  # Skip front matter
                 para = para.strip()
                 if 200 < len(para) < 1500 and para.count('.') >= 2:
                     records.append({
                         "text": para,
                         "topic": topic,
@@ -457,7 +539,7 @@ def download_gutenberg_topics(max_samples: int = 30000) -> list[dict[str, Any]]:
                     })
                     break
-    print(f"    Gutenberg topics: {len(records):,}")
     return records
@@ -502,7 +584,7 @@ GUTENBERG_JUNK_REGEX = re.compile("|".join(GUTENBERG_JUNK_PATTERNS), re.IGNORECA
 def is_clean_prose(text: str) -> bool:
-    """Check if text is clean literary prose."""
     if len(text) < 300 or len(text) > 3000:
         return False
     if GUTENBERG_JUNK_REGEX.search(text):
@@ -515,12 +597,15 @@ def is_clean_prose(text: str) -> bool:
     digit_ratio = sum(1 for c in text if c.isdigit()) / max(len(text), 1)
     if digit_ratio > 0.1:
         return False
     return True
 def download_gutenberg(max_samples: int = 30000) -> None:
-    """Download Gutenberg books for language modeling."""
-    print("\n📚 Downloading Gutenberg Books...")
     out_dir = OUTPUT_DIR / "books"
     out_dir.mkdir(parents=True, exist_ok=True)

     print(f"  ✓ {len(records):,} samples → {path}")
# ============== ENGLISH LANGUAGE FILTER ==============
# Common English words for detection
ENGLISH_WORDS = {
    "the", "and", "of", "to", "a", "in", "that", "is", "was", "he", "she", "it",
    "for", "with", "as", "his", "her", "they", "be", "at", "on", "have", "had",
    "this", "but", "not", "from", "by", "or", "an", "said", "were", "been",
    "would", "could", "which", "their", "there", "what", "when", "who", "will",
    "more", "if", "no", "out", "so", "up", "into", "than", "them", "can", "only",
    "other", "new", "some", "very", "just", "over", "such", "also", "its", "then",
}

# Non-English language patterns
NON_ENGLISH_PATTERNS = [
    # French
    r"\b(le|la|les|un|une|des|du|et|est|sont|dans|pour|avec|sur|qui|que|ce|cette|nous|vous|ils|elles|je|tu|il|elle|être|avoir)\b",
    # German
    r"\b(der|die|das|ein|eine|und|ist|nicht|mit|von|zu|den|dem|auf|für|als|auch|oder|nach|bei|nur|noch|wie|mehr|aber|wenn|hat|kann|ich|sie|er|wir|ihr|es|sich|sein)\b",
    # Spanish
    r"\b(el|la|los|las|un|una|que|por|para|con|del|al|es|en|se|no|más|como|pero|su|sus|le|lo|te|me|nos)\b",
    # Italian
    r"\b(il|lo|la|gli|le|che|per|con|del|della|di|da|non|sono|anche|più|ma|se|mi|ti|ci)\b",
    # Latin
    r"\b(et|in|ad|cum|de|ex|per|pro|sub|ab|ante|post|inter|contra|super|trans|apud)\b",
]

# Number of words that the ``max_foreign`` budget of is_english_text() covers.
# Texts up to this length get exactly ``max_foreign``; longer texts get a
# proportionally larger budget.
_FOREIGN_WINDOW_WORDS = 200


def is_english_text(text: str, min_ratio: float = 0.08, max_foreign: int = 5) -> bool:
    """
    Check if text is primarily English.

    The text is rejected when it is empty, shorter than 100 characters or
    shorter than 20 whitespace-separated words. Otherwise two heuristics run:

    1. For each entry of NON_ENGLISH_PATTERNS, count the matches that are not
       also members of ENGLISH_WORDS ("in" and "no" are in both, so they are
       not evidence of a foreign language). The text is rejected when any
       single pattern exceeds the allowed count. The allowed count is
       ``max_foreign`` for texts of up to 200 words and grows linearly with
       length beyond that, so long inputs are not rejected merely because
       they are long.
    2. The share of words (surrounding punctuation stripped) found in
       ENGLISH_WORDS must be at least ``min_ratio``.

    Args:
        text: Text to check
        min_ratio: Minimum ratio of common English words
        max_foreign: Maximum number of foreign word matches (per pattern, per
            200 words of text, with a floor of one window) before rejecting

    Returns:
        True if text appears to be English
    """
    if not text or len(text) < 100:
        return False

    text_lower = text.lower()
    words = text_lower.split()
    if len(words) < 20:
        return False

    # Scale the absolute budget with text length; never below max_foreign.
    allowed_foreign = max_foreign * max(1.0, len(words) / _FOREIGN_WINDOW_WORDS)

    # Check for excessive non-English words
    for pattern in NON_ENGLISH_PATTERNS:
        # Each pattern has exactly one capture group, so findall yields the
        # matched word itself as a plain string.
        foreign_hits = sum(
            1 for match in re.findall(pattern, text_lower) if match not in ENGLISH_WORDS
        )
        if foreign_hits > allowed_foreign:
            return False

    # Check for sufficient English words
    english_count = sum(1 for w in words if w.strip(".,!?;:'\"") in ENGLISH_WORDS)
    ratio = english_count / len(words)
    return ratio >= min_ratio
 # ============== SUMMARIZATION: BOOKS + ARXIV ==============
 def download_booksum(max_samples: int = 40000) -> list[dict[str, Any]]:
+    """Download BookSum - literary chapter summarization (English only)."""
     print("\n📖 Loading BookSum (literary summarization)...")
     all_records: list[dict[str, Any]] = []
         indices = random.sample(range(len(data)), min(len(data), limit))
         records = []
+        skipped_language = 0
         for i in tqdm(indices, desc=f"BookSum {split}", leave=False):
             item = data[i]
             chapter = item.get("chapter", "")
             summary = item.get("summary_text") or item.get("summary", "")
+            if not (chapter and summary and len(chapter) > 300):
+                continue
+            # Filter: English only
+            if not is_english_text(chapter):
+                skipped_language += 1
+                continue
+            records.append({
+                "source": chapter[:4000],
+                "summary": summary,
+                "type": "literary",
+                "split": split,
+            })
         all_records.extend(records)
+        print(f"    {split}: {len(records):,} (skipped {skipped_language} non-English)")
     return all_records
 def download_arxiv_summarization(max_samples: int = 50000) -> list[dict[str, Any]]:
     """
+    Download arXiv papers for academic summarization only (English only).
     Note: This dataset doesn't have categories, so can't be used for topic classification.
     Returns: summarization_records
     arxiv = load_dataset("ccdv/arxiv-summarization", split="train")
     summ_records: list[dict[str, Any]] = []
+    skipped_language = 0
     indices = list(range(len(arxiv)))
     random.shuffle(indices)
         if '@' in abstract or '@' in article[:500]:
             continue
+        # Filter: English only
+        if not is_english_text(article[:1000]):
+            skipped_language += 1
+            continue
         # Summarization: article → abstract
         if article and len(article) > 500:
             summ_records.append({
                 "type": "academic",
             })
+    print(f"    Summarization: {len(summ_records):,} (skipped {skipped_language} non-English)")
     return summ_records
 def download_gutenberg_topics(max_samples: int = 30000) -> list[dict[str, Any]]:
+    """Extract topic-labeled samples from Gutenberg books (English only)."""
     print("\n📚 Loading Gutenberg for topic classification...")
     try:
         gutenberg = load_dataset("pg19", split="train")
     records: list[dict[str, Any]] = []
+    skipped_language = 0
     indices = list(range(len(gutenberg)))
     random.shuffle(indices)
             for para in paragraphs[5:]:  # Skip front matter
                 para = para.strip()
                 if 200 < len(para) < 1500 and para.count('.') >= 2:
+                    # Filter: English only
+                    if not is_english_text(para):
+                        skipped_language += 1
+                        break
                     records.append({
                         "text": para,
                         "topic": topic,
                     })
                     break
+    print(f"    Gutenberg topics: {len(records):,} (skipped {skipped_language} non-English)")
     return records
 def is_clean_prose(text: str) -> bool:
+    """Check if text is clean literary prose (English only)."""
     if len(text) < 300 or len(text) > 3000:
         return False
     if GUTENBERG_JUNK_REGEX.search(text):
     digit_ratio = sum(1 for c in text if c.isdigit()) / max(len(text), 1)
     if digit_ratio > 0.1:
         return False
+    # English filter
+    if not is_english_text(text):
+        return False
     return True
 def download_gutenberg(max_samples: int = 30000) -> None:
+    """Download Gutenberg books for language modeling (English only)."""
+    print("\n📚 Downloading Gutenberg Books (English only)...")
     out_dir = OUTPUT_DIR / "books"
     out_dir.mkdir(parents=True, exist_ok=True)