r"""
Download and prepare training data from the SQuAD dataset.

OVERVIEW:
This script downloads SQuAD (the Stanford Question Answering Dataset) from its official source,
extracts the Wikipedia context passages from the JSON files, and saves the cleaned text to disk.
SQuAD is built from high-quality Wikipedia articles, which makes its passages well suited to
training language models.

DATA FLOW:
1. Downloads 4 JSON files from Stanford (SQuAD v1.1 & v2.0, train & dev splits)
2. Parses the JSON structure: data -> articles -> paragraphs -> context (see the sketch below)
3. Extracts only the 'context' fields (Wikipedia passages, not questions/answers)
4. Cleans the text: normalizes whitespace, filters by minimum word count
5. Writes one passage per line to a single text file

The output is a single text file containing roughly 40k Wikipedia paragraphs (the v1.1 and v2.0
splits share many contexts, and no deduplication is performed), suitable for training tokenizers
and language models.
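
For reference, an abridged sketch of the SQuAD JSON layout that step 2 of the DATA FLOW above
walks; the title and context shown here are illustrative and heavily shortened:

```python
{
    "version": "1.1",
    "data": [
        {
            "title": "University_of_Notre_Dame",
            "paragraphs": [
                {
                    "context": "Architecturally, the school has a Catholic character...",
                    "qas": [{"id": "...", "question": "...", "answers": [...]}],
                },
            ],
        },
    ],
}
```

Only the 'context' strings are kept; the 'qas' entries (questions and answers) are discarded.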

DATASET INFO:
- SQuAD v1.1: ~87k train + ~10k dev question/answer examples
- SQuAD v2.0: ~130k train + ~12k dev question/answer examples
- Source: high-quality Wikipedia articles across diverse topics
- Total download size: ~80MB
- Final processed size: roughly 30-40MB of clean text

Usage:
    python core/src/download_and_prepare.py

Output:
    data/clean/training_data.txt - cleaned Wikipedia passages from the SQuAD dataset,
    one passage per line (see the read-back sketch below)
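
A minimal sketch of how a downstream step might read the output back, one passage per line
(illustrative only; nothing in this script depends on it):

```python
with open("data/clean/training_data.txt", encoding="utf-8") as f:
    passages = [line.rstrip("\n") for line in f if line.strip()]
print(f"Loaded {len(passages)} passages")
```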

Requirements:
    pip install requests tqdm

Example setup:

Windows PowerShell:
```powershell
python -m venv venv
.\venv\Scripts\Activate.ps1
pip install requests tqdm
python core/src/download_and_prepare.py
```

Linux/macOS:
```bash
python -m venv venv
source venv/bin/activate
pip install requests tqdm
python core/src/download_and_prepare.py
```
"""

import json
import os

import requests
from tqdm import tqdm


def download_file(url, filename):
    """
    Download a file from a URL with a progress bar.

    Args:
        url (str): URL to download from.
        filename (str): Local path where the file should be saved.
    """
    response = requests.get(url, stream=True, timeout=30)
    # Fail early on HTTP errors instead of silently saving an error page to disk.
    response.raise_for_status()
    total_size = int(response.headers.get("content-length", 0))

    # Stream the download in 1 KiB chunks, updating the progress bar as bytes are written.
    with open(filename, "wb") as file, tqdm(
        desc=filename,
        total=total_size,
        unit="iB",
        unit_scale=True,
        unit_divisor=1024,
    ) as pbar:
        for data in response.iter_content(chunk_size=1024):
            size = file.write(data)
            pbar.update(size)

def prepare_training_data(output_path="data/clean/training_data.txt", min_words=10):
    """
    Downloads the SQuAD dataset and extracts Wikipedia context passages for training.

    SQuAD dataset structure (see the module docstring for an abridged sketch):
    - Each JSON file contains a 'data' array of Wikipedia articles.
    - Each article has 'paragraphs' containing 'context' (Wikipedia text) and 'qas' (questions/answers).
    - Only the 'context' fields, which hold the high-quality Wikipedia passages, are extracted.

    Args:
        output_path (str): Path to save the cleaned text data.
        min_words (int): Minimum number of words required for a passage to be included.
    """
    print("Downloading SQUAD dataset...")

    # Official download locations for both SQuAD versions, train and dev splits.
    squad_urls = [
        "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json",
        "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json",
        "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json",
        "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json",
    ]

    os.makedirs("data/raw", exist_ok=True)

    downloaded_files = []

    print("Step 1: Downloading SQUAD JSON files...")
    for i, url in enumerate(squad_urls):
        filename = f"data/raw/squad_{i+1}.json"
        try:
            print(f"Downloading {url}...")
            download_file(url, filename)
            downloaded_files.append(filename)
            print(f"Successfully downloaded {filename}")
        except Exception as e:
            print(f"Failed to download {url}: {e}")
            continue

    if not downloaded_files:
        print("ERROR: No files were downloaded successfully.")
        return

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    print(f"\nStep 2: Processing SQUAD files and saving to {output_path}...")

    with open(output_path, "w", encoding="utf-8") as f:
        total_contexts = 0
        skipped_short = 0

        for file_path in downloaded_files:
            print(f"Processing {file_path}...")

            try:
                with open(file_path, "r", encoding="utf-8") as json_file:
                    squad_data = json.load(json_file)

                # Collect every paragraph-level 'context' (the Wikipedia passage text).
                contexts = []
                for article in squad_data.get("data", []):
                    for paragraph in article.get("paragraphs", []):
                        context = paragraph.get("context", "").strip()
                        if context:
                            contexts.append(context)

                print(f"Found {len(contexts)} Wikipedia passages in {os.path.basename(file_path)}")

                for context in tqdm(contexts, desc=f"Processing {os.path.basename(file_path)}"):
                    # Normalize whitespace: collapse newlines, tabs and repeated spaces into
                    # single spaces so each passage fits on one output line.
                    cleaned_text = " ".join(context.split())

                    if not cleaned_text:
                        continue

                    # Keep only passages long enough to be useful as training text.
                    word_count = len(cleaned_text.split())
                    if word_count >= min_words:
                        f.write(cleaned_text + "\n")
                        total_contexts += 1
                    else:
                        # Log only an occasional sample of the skipped short passages.
                        skipped_short += 1
                        if skipped_short % 1000 == 0:
                            print(
                                f"⚠️ Skipped short passage ({word_count} words): {cleaned_text[:50]}..."
                            )

            except Exception as e:
                print(f"Error processing {file_path}: {e}")
                continue

    print(f"\nStep 3: Successfully saved {total_contexts} Wikipedia passages from the SQUAD dataset.")
    print(f"Output file: {output_path}")

    print("Step 4: Cleaning up temporary files...")
    for file in downloaded_files:
        try:
            os.remove(file)
            print(f"Removed {file}")
        except Exception as e:
            print(f"Warning: Could not remove {file}: {e}")


if __name__ == "__main__":
    """
    Main execution block - runs when the script is called directly.

    This will:
    1. Download the SQuAD v1.1 and v2.0 datasets (~80MB total)
    2. Extract the Wikipedia context passages (roughly 40k) from the JSON files
    3. Clean and filter the text (drop passages shorter than 10 words)
    4. Save all passages to data/clean/training_data.txt (one per line)
    5. Clean up the temporary files

    Expected output: roughly 40k high-quality Wikipedia passages suitable for LM training.
    """
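    # The call below uses the defaults (output to data/clean/training_data.txt,
    # min_words=10). An illustrative alternative invocation, overriding both
    # parameters of prepare_training_data (the path shown here is hypothetical):
    #   prepare_training_data(output_path="data/clean/wiki_passages.txt", min_words=20)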
    prepare_training_data()