Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import re | |
| import json | |
| import os | |
| import random | |
| import requests | |
| from datasets import load_dataset | |
| from sklearn.model_selection import train_test_split | |
class DataProcessor:
    """
    Runs the complete data pipeline for collecting sonnets:
    Loading -> Splitting -> Cleaning -> Validation -> Deduplication -> JSONL Export

    Approved Data Sources:
    1. HuggingFace: zhyncs/sonnet (Shakespeare's collected sonnets)
    2. Kaggle: Poetry Foundation CSV (mixed poetry, filtered to 14-line poems)
    3. Gutenberg pg1041: Shakespeare's Sonnets
    4. Gutenberg pg2002: Sonnets from the Portuguese (Elizabeth Barrett Browning)

    Typical usage: call one or more ``load_*`` methods, then
    ``process_all_data()``, then ``export_to_jsonl()``.
    """

    # Author label -> URL of the Gutenberg HTML edition to download.
    GUTENBERG_SOURCES = {
        "Shakespeare": "https://www.gutenberg.org/cache/epub/1041/pg1041-images.html",
        "Browning": "https://www.gutenberg.org/cache/epub/2002/pg2002-images.html",
    }

    # Maximum characters allowed per verse line.
    # A real sonnet verse is roughly 30-90 characters.
    # Prose paragraphs that happen to have 14 line-breaks are typically 200+ chars.
    MAX_LINE_LENGTH = 120

    # Varied prompts so the model learns the CONCEPT of "produce a sonnet"
    # rather than memorizing one exact string.
    PROMPT_VARIATIONS = [
        "Write a classic sonnet.\nSonnet:",
        "Compose a sonnet.\nSonnet:",
        "Write a sonnet in 14 lines.\nSonnet:",
        "Create a beautiful sonnet.\nSonnet:",
        "Write an English sonnet.\nSonnet:",
        "Compose a 14-line sonnet.\nSonnet:",
        "Write a poetic sonnet.\nSonnet:",
        "Produce a sonnet.\nSonnet:",
    ]

    def __init__(self, data_output_dir):
        """
        Initializes the Data Processor.

        data_output_dir: The directory where train.jsonl and valid.jsonl will
            be saved. It is created if it does not exist yet.
        """
        self.data_output_dir = data_output_dir
        os.makedirs(self.data_output_dir, exist_ok=True)
        # Raw candidate poems accumulated by the load_* methods.
        self.all_sonnets_raw = []
        # Validated, deduplicated sonnets produced by process_all_data().
        self.cleaned_sonnets = []

    # ----------------------------------------------
    # DATA LOADERS
    # ----------------------------------------------

    def load_kaggle_csv(self, csv_filepath):
        """
        Loads raw poems from the Kaggle Poetry Foundation CSV.

        Reads the 'Poem' column, drops missing values and appends the rest to
        ``self.all_sonnets_raw``. Failures are reported on stdout instead of
        being raised, so one broken source does not abort the pipeline.
        """
        print(f"π Loading Kaggle dataset from {csv_filepath}...")
        try:
            df = pd.read_csv(csv_filepath)
            if 'Poem' not in df.columns:
                print("β 'Poem' column not found in Kaggle CSV!")
                return
            poems = df['Poem'].dropna().tolist()
            self.all_sonnets_raw.extend(poems)
            print(f" β Loaded {len(poems)} raw poems from Kaggle.")
        except Exception as e:
            # Deliberate best-effort: report and continue with other sources.
            print(f" β Error loading Kaggle CSV: {e}")

    def load_huggingface_dataset(self, dataset_name="zhyncs/sonnet"):
        """
        Loads sonnets from the HuggingFace Hub.

        Reads the 'text' column of the dataset's "train" split. Each row may
        contain one or more sonnets, so every row is split on blank lines and
        the resulting chunks are appended to ``self.all_sonnets_raw``.
        Failures are reported on stdout instead of being raised.
        """
        print(f"π€ Loading HuggingFace dataset: {dataset_name}...")
        try:
            dataset = load_dataset(dataset_name, split="train")
            individual_poems = []
            for text_blob in dataset['text']:
                individual_poems.extend(self._split_text_blob_into_poems(text_blob))
            self.all_sonnets_raw.extend(individual_poems)
            print(f" β Loaded {len(individual_poems)} individual poem chunks from {dataset_name}.")
        except Exception as e:
            # Deliberate best-effort: report and continue with other sources.
            print(f" β Error loading HuggingFace dataset: {e}")

    def load_gutenberg_sources(self):
        """
        Downloads and parses the Gutenberg collections in GUTENBERG_SOURCES.

        Each page is fetched as HTML; the text following every <h2>/<h3>
        heading that consists solely of a Roman numeral is treated as one
        candidate sonnet and appended to ``self.all_sonnets_raw``.
        A failure for one source is reported on stdout and does not stop the
        remaining sources from being loaded.
        """
        for author, url in self.GUTENBERG_SOURCES.items():
            print(f"π Loading Gutenberg ({author}) from {url}...")
            try:
                response = requests.get(url, timeout=30)
                response.raise_for_status()
                sonnets = self._parse_gutenberg_html(response.text)
                self.all_sonnets_raw.extend(sonnets)
                print(f" β Extracted {len(sonnets)} poem chunks from Gutenberg ({author}).")
            except Exception as e:
                # Deliberate best-effort: report and continue with other sources.
                print(f" β Error loading Gutenberg ({author}): {e}")

    # ----------------------------------------------
    # TEXT SPLITTING & PARSING
    # ----------------------------------------------

    def _split_text_blob_into_poems(self, text_blob):
        """
        Splits a text blob that may contain several concatenated sonnets.

        Strategy: split on blank-line gaps (two or more consecutive newlines).
        Returns the non-empty chunks, each stripped of surrounding whitespace.
        """
        # Normalize all line endings to plain \n
        text_blob = text_blob.replace('\r\n', '\n').replace('\r', '\n')
        # Two or more newlines in a row mark the gap between poems.
        chunks = (chunk.strip() for chunk in re.split(r'\n{2,}', text_blob))
        return [chunk for chunk in chunks if chunk]

    def _parse_gutenberg_html(self, html_text):
        """
        Extracts individual sonnets from a Gutenberg HTML page.

        The HTML is flattened to text while <h2>/<h3> headings are wrapped in
        sentinel markers and <p>/<br> tags become newlines. The text is then
        split on those markers; a section is kept when its heading is purely a
        Roman numeral (the sonnet number) and its content is non-empty.
        Headings such as the index or the license are therefore skipped
        because they are not Roman numerals.

        Returns a list of raw sonnet strings (whitespace-stripped).
        """
        # Local import: only this method needs the stdlib HTML parser.
        from html.parser import HTMLParser

        class _HeadingMarker(HTMLParser):
            """Collects page text, tagging headings with sentinel markers."""

            def __init__(self):
                super().__init__()
                self.result = []

            def handle_starttag(self, tag, attrs):
                if tag in ('h2', 'h3'):
                    self.result.append('\n##HEADING##')
                elif tag in ('p', 'br'):
                    # Paragraphs and line breaks delimit verse lines.
                    self.result.append('\n')

            def handle_endtag(self, tag):
                if tag in ('h2', 'h3'):
                    self.result.append('##/HEADING##\n')

            def handle_data(self, data):
                self.result.append(data)

        extractor = _HeadingMarker()
        extractor.feed(html_text)
        # Flush any text the parser is still buffering at end of input.
        extractor.close()
        full_text = ''.join(extractor.result)

        # Because the pattern has one capture group, the split alternates:
        # [before_first_heading, heading1, content1, heading2, content2, ...]
        sections = re.split(r'##HEADING##(.*?)##/HEADING##', full_text, flags=re.DOTALL)

        poems = []
        for heading, content in zip(sections[1::2], sections[2::2]):
            content = content.strip()
            if content and re.fullmatch(r'[MDCLXVI]+', heading.strip()):
                poems.append(content)
        return poems

    # ----------------------------------------------
    # CLEANING & VALIDATION
    # ----------------------------------------------

    def _clean_single_line(self, line):
        """
        Cleans a single line of poem text:
        - Strips whitespace & carriage returns
        - Drops standalone title/header lines like "Sonnet XIV"
        - Removes leading Arabic numerals (e.g. "1.", "14 -")
        - Removes leading Roman numerals ONLY when followed by a period
          (protects real words like "I wandered" or "Did he")

        Returns the cleaned line, or None when nothing is left.
        """
        line = line.replace('\r', '').strip()
        if not line:
            return None

        # Skip standalone title lines: "Sonnet XIV", "SONNET 12", "Sonnet", etc.
        if re.match(r'^sonnet\s*[MDCLXVI\d]*\.?\s*$', line, flags=re.IGNORECASE):
            return None

        # Remove leading Arabic number prefixes: "1.", "14)", "3 -", "12 "
        line = re.sub(r'^\d+[\.\)\-]?\s+', '', line)

        # Remove leading Roman numerals ONLY if followed by a period.
        # This protects real words: "I wandered", "Did he", "Civil war"
        # but catches: "XIV.", "II.", "ix."
        # The case-insensitive flag is passed via `flags=`: an inline "(?i)"
        # placed after "^" is rejected with re.error on Python 3.11+.
        line = re.sub(r'^[MDCLXVI]+\.\s*', '', line, flags=re.IGNORECASE)

        line = line.strip()
        return line if line else None

    def _split_long_line_into_verses(self, text):
        """
        Expands a sonnet stored as one long line into separate verse lines.

        If the text already has more than one non-empty line it is returned
        unchanged (apart from carriage-return removal and stripping).
        Otherwise the single line is split wherever punctuation
        (, ; : . ! ?) is followed by whitespace and a capital letter, and
        additionally on runs of 4+ whitespace characters, which are taken to
        mark indented couplets. The pieces are joined with newlines.
        """
        text = text.replace('\r', '').strip()
        non_empty = [part.strip() for part in text.split('\n') if part.strip()]

        # Already multi-line (or empty): the normal cleaning will handle it.
        if len(non_empty) != 1:
            return text

        single_line = non_empty[0]
        # Lookbehind/lookahead keep the punctuation and the capital letter;
        # only the whitespace between them is consumed.
        verse_lines = re.split(r'(?<=[,;:.!?])\s+(?=[A-Z])', single_line)

        expanded = []
        for verse in verse_lines:
            # 4+ whitespace characters indicate couplet indentation.
            expanded.extend(re.split(r'\s{4,}', verse))
        return '\n'.join(expanded)

    def clean_and_validate_sonnet(self, raw_poem):
        """
        Cleans the poem text and validates it as a sonnet.

        A poem is accepted when, after cleaning, it has exactly 14 non-empty
        lines and no line is longer than MAX_LINE_LENGTH characters (which
        rejects prose paragraphs disguised as poems).

        Returns the cleaned, newline-joined string if valid, otherwise None.
        """
        # str() tolerates non-string cells coming from tabular sources.
        raw_poem = str(raw_poem).replace('\r\n', '\n').replace('\r', '\n')
        # Try to expand single-line Gutenberg sonnets into multi-line text.
        raw_poem = self._split_long_line_into_verses(raw_poem)

        cleaned_lines = (self._clean_single_line(line) for line in raw_poem.split('\n'))
        valid_lines = [line for line in cleaned_lines if line is not None]

        # STRICT VALIDATION 1: A sonnet must be exactly 14 lines of text.
        if len(valid_lines) != 14:
            return None
        # STRICT VALIDATION 2: Any over-long line means this is not verse.
        if any(len(line) > self.MAX_LINE_LENGTH for line in valid_lines):
            return None
        return '\n'.join(valid_lines)

    # ----------------------------------------------
    # MASTER PROCESSING PIPELINE
    # ----------------------------------------------

    def process_all_data(self):
        """
        Runs the raw data through cleaning, validation and deduplication.

        Reads ``self.all_sonnets_raw`` and replaces ``self.cleaned_sonnets``
        with the unique valid sonnets in a reproducibly shuffled order. The
        result list is rebuilt from scratch on every call, so calling this
        method again does not accumulate stale entries.
        """
        print("\nβββ Starting Data Processing βββ")
        print(f"π¦ Total raw poem chunks to process: {len(self.all_sonnets_raw)}")

        candidates = (self.clean_and_validate_sonnet(poem) for poem in self.all_sonnets_raw)
        validated = [poem for poem in candidates if poem]
        print(f"π Found {len(validated)} valid 14-line sonnets across all sources.")

        # Deduplication using Pandas (exact string matches only).
        master_df = pd.DataFrame({"Poem": validated})
        initial_count = len(master_df)
        master_df = master_df.drop_duplicates(subset=['Poem'])
        final_count = len(master_df)
        print(f"ποΈ Removed {initial_count - final_count} identical clones.")

        # Shuffle randomly (random_state ensures reproducibility)
        master_df = master_df.sample(frac=1, random_state=42).reset_index(drop=True)
        self.cleaned_sonnets = master_df['Poem'].tolist()
        print(f"β¨ Master Dataset finalized with {len(self.cleaned_sonnets)} pure, unique sonnets.")

    # ----------------------------------------------
    # JSONL EXPORT (for Apple MLX)
    # ----------------------------------------------

    def export_to_jsonl(self):
        """
        Splits the cleaned sonnets 80/20 and writes train.jsonl / valid.jsonl.

        Each output line is a JSON object ``{"text": "<prompt>\\n<poem>"}``
        where the prompt is drawn from PROMPT_VARIATIONS with a fixed seed.
        Prints a message and returns without writing when there is no data.

        NOTE(review): train_test_split is expected to raise ValueError when
        there are too few sonnets to populate both splits - confirm whether
        callers need that handled.
        """
        if not self.cleaned_sonnets:
            print("β No data to export! Run process_all_data() first.")
            return

        # Split 80% train, 20% validation
        train_data, valid_data = train_test_split(
            self.cleaned_sonnets, test_size=0.2, random_state=42
        )

        # One shared generator across both files keeps prompt assignment
        # reproducible for a given dataset.
        rng = random.Random(42)

        def write_file(filename, data):
            filepath = os.path.join(self.data_output_dir, filename)
            with open(filepath, 'w', encoding='utf-8') as f:
                for poem in data:
                    # Pick a random prompt variation for each example
                    prompt = rng.choice(self.PROMPT_VARIATIONS)
                    json_obj = {"text": f"{prompt}\n{poem}"}
                    f.write(json.dumps(json_obj, ensure_ascii=False) + '\n')
            print(f"πΎ Saved {len(data)} sonnets to {filepath}")

        write_file("train.jsonl", train_data)
        write_file("valid.jsonl", valid_data)
        print("\nπ Pre-processing Complete! Your data is ready for Apple MLX Training.")
# ==================================================
# EXECUTION BLOCK
# ==================================================
if __name__ == "__main__":
    # The output folder is "<two levels above this file>/data".
    this_file = os.path.abspath(__file__)
    repo_root = os.path.dirname(os.path.dirname(this_file))
    pipeline = DataProcessor(data_output_dir=os.path.join(repo_root, "data"))

    # -- Step 1: gather raw poems from every approved source --
    # HuggingFace: zhyncs/sonnet
    pipeline.load_huggingface_dataset("zhyncs/sonnet")

    # Kaggle Poetry Foundation CSV, expected in the user's Downloads folder;
    # this source is optional and skipped when the file is absent.
    csv_path = os.path.join(os.path.expanduser("~"), "Downloads", "PoetryFoundationData.csv")
    if not os.path.exists(csv_path):
        print(f"β οΈ Kaggle CSV not found at {csv_path}, skipping.")
    else:
        pipeline.load_kaggle_csv(csv_path)

    # Project Gutenberg collections listed on the processor class
    pipeline.load_gutenberg_sources()

    # -- Step 2: clean, validate and deduplicate --
    pipeline.process_all_data()

    # -- Step 3: write train.jsonl / valid.jsonl for MLX --
    pipeline.export_to_jsonl()