import bz2
import re
import os
import sqlite3
from pathlib import Path
from xml.sax import make_parser, handler
import time

class WikiContentHandler(handler.ContentHandler):
    def __init__(self, db_conn, batch_size=1000, max_articles=None):
        self.db_conn = db_conn
        self.cursor = db_conn.cursor()
        self.batch_size = batch_size
        self.article_count = 0
        self.max_articles = max_articles
        self.article_batch = []
        self.links_batch = []

        # Current elements
        self.current_title = None
        self.current_text = None
        self.current_ns = None
        self.in_page = False
        self.in_title = False
        self.in_text = False
        self.in_ns = False
        self.buffer = []

    def startElement(self, name, attrs):
        if name == 'page':
            self.in_page = True
            self.current_title = None
            self.current_text = None
            self.current_ns = None
        elif self.in_page and name == 'title':
            self.in_title = True
            self.buffer = []
        elif self.in_page and name == 'ns':
            self.in_ns = True
            self.buffer = []
        elif self.in_page and name == 'text':
            self.in_text = True
            self.buffer = []

    def endElement(self, name):
        if name == 'page':
            self.in_page = False
            # Only process main namespace articles (ns = 0)
            if self.current_ns == '0' and self.current_title and self.current_text:
                # Extract links
                links = self.extract_links(self.current_text)
                # Add to batch
                self.article_batch.append(
                    (self.current_title, self.current_text)
                )
                # Add links to batch
                for link in links:
                    self.links_batch.append(
                        (self.current_title, link)
                    )
                self.article_count += 1
                # Print progress
                if self.article_count % 100 == 0:
                    print(f"Processed {self.article_count} articles...")
                # Insert batch if reached batch size
                if len(self.article_batch) >= self.batch_size:
                    self._insert_batch()
                # Check if we've reached the maximum number of articles
                if self.max_articles and self.article_count >= self.max_articles:
                    self._insert_batch()  # Insert any remaining items
                    raise StopIteration("Reached maximum number of articles")
        elif name == 'title':
            self.in_title = False
            self.current_title = ''.join(self.buffer)
        elif name == 'ns':
            self.in_ns = False
            self.current_ns = ''.join(self.buffer)
        elif name == 'text':
            self.in_text = False
            self.current_text = ''.join(self.buffer)

    def characters(self, content):
        if self.in_title:
            self.buffer.append(content)
        elif self.in_ns:
            self.buffer.append(content)
        elif self.in_text:
            self.buffer.append(content)

    def extract_links(self, text):
        """Extract links from article wikitext"""
        # Pattern to match [[Link]] or [[Link|Text]] format
        links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)

        # Process links
        processed_links = []
        for link in links:
            # Skip non-article links (except categories which might be useful)
            if ':' in link and not link.startswith('Category:'):
                continue
            # Remove any section links (with #)
            link = link.split('#')[0].strip()
            # Skip empty links
            if not link:
                continue
            processed_links.append(link)

        # Remove duplicates and return
        return list(set(processed_links))
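
    # Illustrative example (not in the original script): given the wikitext
    #   "See [[Python (programming language)|Python]], [[File:Logo.png]] and [[Regex#History]]"
    # the pattern captures 'Python (programming language)', 'File:Logo.png' and
    # 'Regex#History'; the File: link is dropped by the ':' filter, the section
    # anchor is stripped, and extract_links returns 'Python (programming language)'
    # and 'Regex' (in arbitrary order, since duplicates are removed via a set).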

    def _insert_batch(self):
        """Insert batched data into the database"""
        if self.article_batch:
            self.cursor.executemany(
                "INSERT OR IGNORE INTO articles (title, text) VALUES (?, ?)",
                self.article_batch
            )
        if self.links_batch:
            self.cursor.executemany(
                "INSERT OR IGNORE INTO links (source_title, target_title) VALUES (?, ?)",
                self.links_batch
            )
        self.db_conn.commit()
        self.article_batch = []
        self.links_batch = []


def create_db_schema(db_conn):
    """Create the database schema"""
    cursor = db_conn.cursor()

    # Create articles table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS articles (
            title TEXT PRIMARY KEY,
            text TEXT
        )
    ''')

    # Create links table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS links (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source_title TEXT,
            target_title TEXT,
            FOREIGN KEY (source_title) REFERENCES articles (title),
            UNIQUE (source_title, target_title)
        )
    ''')

    # Create indexes on links for faster queries
    cursor.execute('''
        CREATE INDEX IF NOT EXISTS idx_links_source ON links (source_title)
    ''')
    cursor.execute('''
        CREATE INDEX IF NOT EXISTS idx_links_target ON links (target_title)
    ''')

    db_conn.commit()
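

# Illustrative sketch (not part of the original script): with the schema above,
# an article's outgoing links can be looked up directly from the links table.
# The helper name and its use here are assumptions for demonstration only.
def example_outgoing_links(db_path, title):
    """Return the titles linked from `title` (illustrative helper)."""
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            "SELECT target_title FROM links WHERE source_title = ?", (title,)
        ).fetchall()
        return [row[0] for row in rows]
    finally:
        conn.close()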


def parse_wiki_dump(dump_path, db_path, batch_size=1000, max_articles=None):
    """
    Parse the Wikipedia XML dump and extract articles with their links into a SQLite database.

    Args:
        dump_path: Path to the bz2 Wikipedia dump
        db_path: Path to save the SQLite database
        batch_size: Number of articles to process before committing to the database
        max_articles: Maximum number of articles to extract (None for all)

    Returns:
        The path to the created SQLite database
    """
    start_time = time.time()
    print(f"Parsing Wikipedia dump: {dump_path}")

    # Create or connect to SQLite database
    db_conn = sqlite3.connect(db_path)

    # Create schema
    create_db_schema(db_conn)

    # Create SAX parser with custom content handler
    parser = make_parser()
    content_handler = WikiContentHandler(db_conn, batch_size, max_articles)
    parser.setContentHandler(content_handler)

    # Parse the dump
    try:
        parser.parse(bz2.BZ2File(dump_path))
        # Insert any remaining items in the batch
        content_handler._insert_batch()
    except StopIteration:
        print("Reached maximum number of articles")
    except Exception as e:
        print(f"Error parsing dump: {e}")
        raise
    finally:
        db_conn.commit()
        db_conn.close()

    duration = time.time() - start_time
    print(f"Extracted {content_handler.article_count} articles in {duration:.2f} seconds.")
    print(f"Data saved to {db_path}")
    return db_path


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Parse Wikipedia XML dump to SQLite')
    parser.add_argument('dump_path', help='Path to the Wikipedia XML dump (bz2 file)')
    parser.add_argument('output_path', help='Path to save the SQLite database')
    parser.add_argument('--batch-size', type=int, default=1000,
                        help='Batch size for database inserts (default: 1000)')
    parser.add_argument('--max-articles', type=int, default=None,
                        help='Maximum number of articles to extract (default: all)')
    args = parser.parse_args()

    # Create output directory if it doesn't exist
    output_dir = os.path.dirname(args.output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Parse the dump
    parse_wiki_dump(args.dump_path, args.output_path, args.batch_size, args.max_articles)
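
# Example invocation (illustrative: the script and dump file names are assumptions):
#   python parse_wiki_dump.py enwiki-latest-pages-articles.xml.bz2 wiki.db --max-articles 10000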