import bz2
import re
import os
import sqlite3
from pathlib import Path
from xml.sax import make_parser, handler
import time

class WikiContentHandler(handler.ContentHandler):
    def __init__(self, db_conn, batch_size=1000, max_articles=None):
        self.db_conn = db_conn
        self.cursor = db_conn.cursor()
        self.batch_size = batch_size
        self.article_count = 0
        self.max_articles = max_articles
        self.article_batch = []
        self.links_batch = []

        # Current elements
        self.current_title = None
        self.current_text = None
        self.current_ns = None
        self.in_page = False
        self.in_title = False
        self.in_text = False
        self.in_ns = False
        self.buffer = []

    def startElement(self, name, attrs):
        if name == 'page':
            self.in_page = True
            self.current_title = None
            self.current_text = None
            self.current_ns = None
        elif self.in_page and name == 'title':
            self.in_title = True
            self.buffer = []
        elif self.in_page and name == 'ns':
            self.in_ns = True
            self.buffer = []
        elif self.in_page and name == 'text':
            self.in_text = True
            self.buffer = []

    def endElement(self, name):
        if name == 'page':
            self.in_page = False
            # Only process main namespace articles (ns = 0)
            if self.current_ns == '0' and self.current_title and self.current_text:
                # Extract links
                links = self.extract_links(self.current_text)
                # Add to batch
                self.article_batch.append(
                    (self.current_title, self.current_text)
                )
                # Add links to batch
                for link in links:
                    self.links_batch.append(
                        (self.current_title, link)
                    )
                self.article_count += 1
                # Print progress
                if self.article_count % 100 == 0:
                    print(f"Processed {self.article_count} articles...")
                # Insert batch if reached batch size
                if len(self.article_batch) >= self.batch_size:
                    self._insert_batch()
                # Check if we've reached the maximum number of articles
                if self.max_articles and self.article_count >= self.max_articles:
                    self._insert_batch()  # Insert any remaining items
                    raise StopIteration("Reached maximum number of articles")
        elif name == 'title':
            self.in_title = False
            self.current_title = ''.join(self.buffer)
        elif name == 'ns':
            self.in_ns = False
            self.current_ns = ''.join(self.buffer)
        elif name == 'text':
            self.in_text = False
            self.current_text = ''.join(self.buffer)

    def characters(self, content):
        if self.in_title:
            self.buffer.append(content)
        elif self.in_ns:
            self.buffer.append(content)
        elif self.in_text:
            self.buffer.append(content)

    def extract_links(self, text):
        """Extract links from article wikitext"""
        # Pattern to match [[Link]] or [[Link|Text]] format
        links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)

        # Process links
        processed_links = []
        for link in links:
            # Skip non-article links (except categories which might be useful)
            if ':' in link and not link.startswith('Category:'):
                continue
            # Remove any section links (with #)
            link = link.split('#')[0].strip()
            # Skip empty links
            if not link:
                continue
            processed_links.append(link)

        # Remove duplicates and return
        return list(set(processed_links))
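
    # Illustrative example (not in the original script): given the wikitext
    #   "See [[Python (programming language)|Python]], [[File:Logo.png]] and [[Regex#History]]"
    # the pattern captures 'Python (programming language)', 'File:Logo.png' and
    # 'Regex#History'; the File: link is dropped by the ':' filter, the section
    # anchor is stripped, and extract_links returns 'Python (programming language)'
    # and 'Regex' (in arbitrary order, since duplicates are removed via a set).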

    def _insert_batch(self):
        """Insert batched data into the database"""
        if self.article_batch:
            self.cursor.executemany(
                "INSERT OR IGNORE INTO articles (title, text) VALUES (?, ?)",
                self.article_batch
            )
        if self.links_batch:
            self.cursor.executemany(
                "INSERT OR IGNORE INTO links (source_title, target_title) VALUES (?, ?)",
                self.links_batch
            )
        self.db_conn.commit()
        self.article_batch = []
        self.links_batch = []


def create_db_schema(db_conn):
    """Create the database schema"""
    cursor = db_conn.cursor()

    # Create articles table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS articles (
            title TEXT PRIMARY KEY,
            text TEXT
        )
    ''')

    # Create links table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS links (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source_title TEXT,
            target_title TEXT,
            FOREIGN KEY (source_title) REFERENCES articles (title),
            UNIQUE (source_title, target_title)
        )
    ''')

    # Create indexes on links for faster queries
    cursor.execute('''
        CREATE INDEX IF NOT EXISTS idx_links_source ON links (source_title)
    ''')
    cursor.execute('''
        CREATE INDEX IF NOT EXISTS idx_links_target ON links (target_title)
    ''')

    db_conn.commit()
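

# Illustrative sketch (not part of the original script): with the schema above,
# an article's outgoing links can be looked up directly from the links table.
# The helper name and its use here are assumptions for demonstration only.
def example_outgoing_links(db_path, title):
    """Return the titles linked from `title` (illustrative helper)."""
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            "SELECT target_title FROM links WHERE source_title = ?", (title,)
        ).fetchall()
        return [row[0] for row in rows]
    finally:
        conn.close()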


def parse_wiki_dump(dump_path, db_path, batch_size=1000, max_articles=None):
    """
    Parse the Wikipedia XML dump and extract articles with their links into a SQLite database.

    Args:
        dump_path: Path to the bz2 Wikipedia dump
        db_path: Path to save the SQLite database
        batch_size: Number of articles to process before committing to the database
        max_articles: Maximum number of articles to extract (None for all)

    Returns:
        The path to the created SQLite database
    """
    start_time = time.time()
    print(f"Parsing Wikipedia dump: {dump_path}")

    # Create or connect to SQLite database
    db_conn = sqlite3.connect(db_path)

    # Create schema
    create_db_schema(db_conn)

    # Create SAX parser with custom content handler
    parser = make_parser()
    content_handler = WikiContentHandler(db_conn, batch_size, max_articles)
    parser.setContentHandler(content_handler)

    # Parse the dump
    try:
        parser.parse(bz2.BZ2File(dump_path))
        # Insert any remaining items in the batch
        content_handler._insert_batch()
    except StopIteration:
        print("Reached maximum number of articles")
    except Exception as e:
        print(f"Error parsing dump: {e}")
        raise
    finally:
        db_conn.commit()
        db_conn.close()

    duration = time.time() - start_time
    print(f"Extracted {content_handler.article_count} articles in {duration:.2f} seconds.")
    print(f"Data saved to {db_path}")
    return db_path


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Parse Wikipedia XML dump to SQLite')
    parser.add_argument('dump_path', help='Path to the Wikipedia XML dump (bz2 file)')
    parser.add_argument('output_path', help='Path to save the SQLite database')
    parser.add_argument('--batch-size', type=int, default=1000,
                        help='Batch size for database inserts (default: 1000)')
    parser.add_argument('--max-articles', type=int, default=None,
                        help='Maximum number of articles to extract (default: all)')
    args = parser.parse_args()

    # Create output directory if it doesn't exist
    output_dir = os.path.dirname(args.output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Parse the dump
    parse_wiki_dump(args.dump_path, args.output_path, args.batch_size, args.max_articles)
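
# Example invocation (illustrative: the script and dump file names are assumptions):
#   python parse_wiki_dump.py enwiki-latest-pages-articles.xml.bz2 wiki.db --max-articles 10000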