Spaces:

tm23hgf
/

rust_algo_reasoning

Sleeping

rust_algo_reasoning / complexity_reasoning_data /extract_explanations.py

TM23-sanji

initial clean commit

c987dd2 about 2 months ago

11.2 kB

	#!/usr/bin/env python3
	"""
	Extract solution explanations from LeetCode solution README files.

	Output format per problem:
	{
	"problem_id": 15,
	"explanations": {
	"1": "We sort the array first...",
	"2": "..."
	},
	"is_english": true,
	"time_complexity": "O(n^2)",
	"space_complexity": "O(log n)"
	}

	Strategy:
	1. Read English README_EN.md first
	2. If a solution has no explanation text, fall back to Chinese README.md
	3. Extract time/space complexity from explanation text via regex
	"""

	import os
	import re
	import json
	from pathlib import Path

	SOLUTION_BASE = "/teamspace/studios/this_studio/dataset/leetcode/solution"


	def parse_explanation_from_block(block):
	"""Extract explanation text from a single solution block.

	The block starts right after the '### Solution N' header line.
	Explanation is everything before the first code block marker:
	- #### (language header like #### Python3)
	- <!-- tabs:start -->
	"""
	lines = block.strip().split("\n")
	explanation_lines = []

	for line in lines:
	stripped = line.strip()
	if stripped.startswith("####") or stripped.startswith("<!-- tabs:start -->"):
	break
	explanation_lines.append(line)

	explanation = "\n".join(explanation_lines).strip()
	# Clean up HTML comments that sometimes leak in
	explanation = re.sub(r"<!--.*?-->", "", explanation, flags=re.DOTALL).strip()
	return explanation


	def _extract_first_o_expr(text):
	r"""Find the first O(...) with balanced parentheses.

	Returns (inner_content, end_pos) or None.
	Validates that the character after the closing paren is an
	expected end-of-match delimiter (e.g. $, comma, period, backtick,
	whitespace, or end of string).
	"""
	i = 0
	while i < len(text) - 2:
	if text[i] == "O" and text[i + 1] == "(":
	depth = 1
	j = i + 2
	while j < len(text) and depth > 0:
	if text[j] == "(":
	depth += 1
	elif text[j] == ")":
	depth -= 1
	j += 1
	if depth == 0:
	after = text[j] if j < len(text) else ""
	if after == "" or after in "$`,); \t\n，。；":
	return text[i + 2 : j - 1], j
	i += 2
	else:
	i += 1
	return None


	def _extract_complexity_after_keyword(text, kw_regex):
	"""Find the first O(...) after a complexity keyword match.

	Returns 'O(inner)' or None.
	"""
	for m in re.finditer(kw_regex, text):
	result = _extract_first_o_expr(text[m.end() :])
	if result:
	inner = result[0]
	if inner:
	return f"O({inner})"
	return None


	def extract_complexities(text):
	r"""Extract time and space complexity from explanation text.

	Handles variations like:
	- 'The time complexity is $O(n^2)$, and the space complexity is $O(log n)$'
	- 'Time complexity is $O(n^2)$, and space complexity is $O(log n)$'
	- 'Time complexity $O(n)$, space complexity $O(n)$' (no "is")
	- '时间复杂度 $O(n^2)$，空间复杂度 $O(log n)$'
	- '时间复杂度：$O(logN)$' (with colon)
	- '时间复杂度为 $O(1)' (with 为)
	- '时间复杂度 O(logn)' (no $ delimiters)
	- 'O(\max (m, n))' (nested parens)
	- 'O((n-m) \times m)' (double parens)
	"""
	# Keyword patterns that precede O-notation
	time_kw = r"(?:[Tt]ime\s+[Cc]omplexity\s+(?:is\s)?\|(?:时间复杂度)[：:]?\s(?:为\s)?)\s(?:\\?\$)?\s*"
	space_kw = r"(?:[Ss]pace\s+[Cc]omplexity\s+(?:is\s)?\|(?:空间复杂度)[：:]?\s(?:为\s)?)\s(?:\\?\$)?\s*"

	return {
	"time_complexity": _extract_complexity_after_keyword(text, time_kw),
	"space_complexity": _extract_complexity_after_keyword(text, space_kw),
	}


	def parse_solutions_section(content, is_english):
	"""Parse the ## Solutions section and extract all solution explanations.

	Returns:
	explanations: dict mapping solution number -> explanation text
	first_time: first non-null time complexity found
	first_space: first non-null space complexity found
	all_from_primary: whether all explanations came from the primary language
	"""
	# Find the Solutions section header
	if is_english:
	match = re.search(r"## Solutions\s*\n", content)
	else:
	match = re.search(r"## 解法\s*\n", content)

	if not match:
	return {}, None, None, True

	solutions_section = content[match.end() :]

	# Split into solution blocks on ### Solution N or ### 方法N
	if is_english:
	parts = re.split(r"(?=### Solution \d+)", solutions_section)
	else:
	parts = re.split(r"(?=### 方法[一二三四五六七八九十\d]+)", solutions_section)

	explanations = {}
	first_time = None
	first_space = None
	all_have_text = True

	for part in parts:
	part = part.strip()
	if not part:
	continue

	# Extract solution number
	if is_english:
	num_match = re.match(r"### Solution (\d+)", part)
	if not num_match:
	continue
	sol_num = num_match.group(1)
	else:
	num_match = re.match(r"### 方法([一二三四五六七八九十\d]+)", part)
	if not num_match:
	continue
	raw_num = num_match.group(1)
	# Convert Chinese numerals to digits
	cn_map = {
	"一": "1",
	"二": "2",
	"三": "3",
	"四": "4",
	"五": "5",
	"六": "6",
	"七": "7",
	"八": "8",
	"九": "9",
	"十": "10",
	}
	sol_num = cn_map.get(raw_num, raw_num)

	# Get the block after the header line
	header_end = part.find("\n")
	if header_end == -1:
	explanations[sol_num] = ""
	all_have_text = False
	continue

	block = part[header_end + 1 :]
	explanation = parse_explanation_from_block(block)

	if not explanation:
	all_have_text = False

	explanations[sol_num] = explanation

	# Extract complexities from the first solution that has them
	if explanation and (first_time is None or first_space is None):
	complexities = extract_complexities(explanation)
	if complexities["time_complexity"] and first_time is None:
	first_time = complexities["time_complexity"]
	if complexities["space_complexity"] and first_space is None:
	first_space = complexities["space_complexity"]

	return explanations, first_time, first_space, all_have_text


	def process_problem(problem_dir):
	"""Process a single problem directory and return the explanation entry."""
	folder_name = os.path.basename(problem_dir)

	# Extract problem_id from folder name (e.g., "0015.3Sum" -> 15)
	id_match = re.match(r"(\d+)\.", folder_name)
	if not id_match:
	return None
	problem_id = int(id_match.group(1))

	en_path = os.path.join(problem_dir, "README_EN.md")
	zh_path = os.path.join(problem_dir, "README.md")

	# Read English README
	en_content = ""
	if os.path.exists(en_path):
	with open(en_path, "r", encoding="utf-8") as f:
	en_content = f.read()

	if not en_content:
	return None

	# Parse English solutions
	en_explanations, time_c, space_c, all_en = parse_solutions_section(
	en_content, is_english=True
	)

	# is_english is True if we got any English explanation text
	# Only False if all solutions had to fall back to Chinese
	is_english = (
	any(v.strip() for v in en_explanations.values()) if en_explanations else False
	)

	# If any solution has empty explanation, try Chinese fallback
	if not all_en and os.path.exists(zh_path):
	with open(zh_path, "r", encoding="utf-8") as f:
	zh_content = f.read()

	zh_explanations, zh_time, zh_space, _ = parse_solutions_section(
	zh_content, is_english=False
	)

	# Merge: use Chinese where English is empty
	for sol_num in en_explanations:
	if not en_explanations[sol_num].strip() and sol_num in zh_explanations:
	en_explanations[sol_num] = zh_explanations[sol_num]

	# Fill complexity from Chinese if still missing
	if time_c is None and zh_time:
	time_c = zh_time
	if space_c is None and zh_space:
	space_c = zh_space

	# Handle case where English had no solutions section at all
	if not en_explanations and os.path.exists(zh_path):
	with open(zh_path, "r", encoding="utf-8") as f:
	zh_content = f.read()
	en_explanations, time_c, space_c, _ = parse_solutions_section(
	zh_content, is_english=False
	)
	is_english = False

	# Skip if we found absolutely nothing
	if not en_explanations:
	return None

	return {
	"problem_id": problem_id,
	"explanations": en_explanations,
	"is_english": is_english,
	"time_complexity": time_c,
	"space_complexity": space_c,
	}


	def main():
	"""Main entry point: iterate all range directories and process every problem."""
	all_entries = []

	# Get all range directories sorted
	range_dirs = sorted(
	[
	d
	for d in os.listdir(SOLUTION_BASE)
	if os.path.isdir(os.path.join(SOLUTION_BASE, d))
	and re.match(r"\d{4}-\d{4}$", d)
	]
	)

	print(f"Found {len(range_dirs)} range directories")

	for range_dir in range_dirs:
	range_path = os.path.join(SOLUTION_BASE, range_dir)

	problem_dirs = sorted(
	[
	os.path.join(range_path, d)
	for d in os.listdir(range_path)
	if os.path.isdir(os.path.join(range_path, d))
	]
	)

	print(f"Processing {range_dir}: {len(problem_dirs)} problems")

	for problem_dir in problem_dirs:
	entry = process_problem(problem_dir)
	if entry:
	all_entries.append(entry)

	# Sort by problem_id
	all_entries.sort(key=lambda x: x["problem_id"])

	output_path = "/teamspace/studios/this_studio/dataset/explanations.json"
	with open(output_path, "w", encoding="utf-8") as f:
	json.dump(all_entries, f, indent=2, ensure_ascii=False)

	print(f"\nDone! Wrote {len(all_entries)} entries to {output_path}")

	# Summary stats
	en_count = sum(1 for e in all_entries if e["is_english"])
	zh_count = len(all_entries) - en_count
	total_solutions = sum(len(e["explanations"]) for e in all_entries)
	empty_count = sum(
	1 for e in all_entries for sol in e["explanations"].values() if not sol.strip()
	)

	print(f" English explanations: {en_count}")
	print(f" Chinese fallback: {zh_count}")
	print(f" Total solutions: {total_solutions}")
	print(f" Empty explanations: {empty_count}")


	if __name__ == "__main__":
	main()