Spaces:
Running
Running
File size: 11,174 Bytes
c987dd2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 | #!/usr/bin/env python3
"""
Extract solution explanations from LeetCode solution README files.
Output format per problem:
{
"problem_id": 15,
"explanations": {
"1": "We sort the array first...",
"2": "..."
},
"is_english": true,
"time_complexity": "O(n^2)",
"space_complexity": "O(log n)"
}
Strategy:
1. Read English README_EN.md first
2. If a solution has no explanation text, fall back to Chinese README.md
3. Extract time/space complexity from explanation text via regex
"""
import os
import re
import json
from pathlib import Path
SOLUTION_BASE = "/teamspace/studios/this_studio/dataset/leetcode/solution"
def parse_explanation_from_block(block):
"""Extract explanation text from a single solution block.
The block starts right after the '### Solution N' header line.
Explanation is everything before the first code block marker:
- #### (language header like #### Python3)
- <!-- tabs:start -->
"""
lines = block.strip().split("\n")
explanation_lines = []
for line in lines:
stripped = line.strip()
if stripped.startswith("####") or stripped.startswith("<!-- tabs:start -->"):
break
explanation_lines.append(line)
explanation = "\n".join(explanation_lines).strip()
# Clean up HTML comments that sometimes leak in
explanation = re.sub(r"<!--.*?-->", "", explanation, flags=re.DOTALL).strip()
return explanation
def _extract_first_o_expr(text):
r"""Find the first O(...) with balanced parentheses.
Returns (inner_content, end_pos) or None.
Validates that the character after the closing paren is an
expected end-of-match delimiter (e.g. $, comma, period, backtick,
whitespace, or end of string).
"""
i = 0
while i < len(text) - 2:
if text[i] == "O" and text[i + 1] == "(":
depth = 1
j = i + 2
while j < len(text) and depth > 0:
if text[j] == "(":
depth += 1
elif text[j] == ")":
depth -= 1
j += 1
if depth == 0:
after = text[j] if j < len(text) else ""
if after == "" or after in "$`,); \t\n,。;":
return text[i + 2 : j - 1], j
i += 2
else:
i += 1
return None
def _extract_complexity_after_keyword(text, kw_regex):
"""Find the first O(...) after a complexity keyword match.
Returns 'O(inner)' or None.
"""
for m in re.finditer(kw_regex, text):
result = _extract_first_o_expr(text[m.end() :])
if result:
inner = result[0]
if inner:
return f"O({inner})"
return None
def extract_complexities(text):
r"""Extract time and space complexity from explanation text.
Handles variations like:
- 'The time complexity is $O(n^2)$, and the space complexity is $O(log n)$'
- 'Time complexity is $O(n^2)$, and space complexity is $O(log n)$'
- 'Time complexity $O(n)$, space complexity $O(n)$' (no "is")
- '时间复杂度 $O(n^2)$,空间复杂度 $O(log n)$'
- '时间复杂度:$O(logN)$' (with colon)
- '时间复杂度为 $O(1)' (with 为)
- '时间复杂度 O(logn)' (no $ delimiters)
- 'O(\max (m, n))' (nested parens)
- 'O((n-m) \times m)' (double parens)
"""
# Keyword patterns that precede O-notation
time_kw = r"(?:[Tt]ime\s+[Cc]omplexity\s+(?:is\s*)?|(?:时间复杂度)[::]?\s*(?:为\s*)?)\s*(?:\\?\$)?\s*"
space_kw = r"(?:[Ss]pace\s+[Cc]omplexity\s+(?:is\s*)?|(?:空间复杂度)[::]?\s*(?:为\s*)?)\s*(?:\\?\$)?\s*"
return {
"time_complexity": _extract_complexity_after_keyword(text, time_kw),
"space_complexity": _extract_complexity_after_keyword(text, space_kw),
}
def parse_solutions_section(content, is_english):
"""Parse the ## Solutions section and extract all solution explanations.
Returns:
explanations: dict mapping solution number -> explanation text
first_time: first non-null time complexity found
first_space: first non-null space complexity found
all_from_primary: whether all explanations came from the primary language
"""
# Find the Solutions section header
if is_english:
match = re.search(r"## Solutions\s*\n", content)
else:
match = re.search(r"## 解法\s*\n", content)
if not match:
return {}, None, None, True
solutions_section = content[match.end() :]
# Split into solution blocks on ### Solution N or ### 方法N
if is_english:
parts = re.split(r"(?=### Solution \d+)", solutions_section)
else:
parts = re.split(r"(?=### 方法[一二三四五六七八九十\d]+)", solutions_section)
explanations = {}
first_time = None
first_space = None
all_have_text = True
for part in parts:
part = part.strip()
if not part:
continue
# Extract solution number
if is_english:
num_match = re.match(r"### Solution (\d+)", part)
if not num_match:
continue
sol_num = num_match.group(1)
else:
num_match = re.match(r"### 方法([一二三四五六七八九十\d]+)", part)
if not num_match:
continue
raw_num = num_match.group(1)
# Convert Chinese numerals to digits
cn_map = {
"一": "1",
"二": "2",
"三": "3",
"四": "4",
"五": "5",
"六": "6",
"七": "7",
"八": "8",
"九": "9",
"十": "10",
}
sol_num = cn_map.get(raw_num, raw_num)
# Get the block after the header line
header_end = part.find("\n")
if header_end == -1:
explanations[sol_num] = ""
all_have_text = False
continue
block = part[header_end + 1 :]
explanation = parse_explanation_from_block(block)
if not explanation:
all_have_text = False
explanations[sol_num] = explanation
# Extract complexities from the first solution that has them
if explanation and (first_time is None or first_space is None):
complexities = extract_complexities(explanation)
if complexities["time_complexity"] and first_time is None:
first_time = complexities["time_complexity"]
if complexities["space_complexity"] and first_space is None:
first_space = complexities["space_complexity"]
return explanations, first_time, first_space, all_have_text
def process_problem(problem_dir):
"""Process a single problem directory and return the explanation entry."""
folder_name = os.path.basename(problem_dir)
# Extract problem_id from folder name (e.g., "0015.3Sum" -> 15)
id_match = re.match(r"(\d+)\.", folder_name)
if not id_match:
return None
problem_id = int(id_match.group(1))
en_path = os.path.join(problem_dir, "README_EN.md")
zh_path = os.path.join(problem_dir, "README.md")
# Read English README
en_content = ""
if os.path.exists(en_path):
with open(en_path, "r", encoding="utf-8") as f:
en_content = f.read()
if not en_content:
return None
# Parse English solutions
en_explanations, time_c, space_c, all_en = parse_solutions_section(
en_content, is_english=True
)
# is_english is True if we got any English explanation text
# Only False if all solutions had to fall back to Chinese
is_english = (
any(v.strip() for v in en_explanations.values()) if en_explanations else False
)
# If any solution has empty explanation, try Chinese fallback
if not all_en and os.path.exists(zh_path):
with open(zh_path, "r", encoding="utf-8") as f:
zh_content = f.read()
zh_explanations, zh_time, zh_space, _ = parse_solutions_section(
zh_content, is_english=False
)
# Merge: use Chinese where English is empty
for sol_num in en_explanations:
if not en_explanations[sol_num].strip() and sol_num in zh_explanations:
en_explanations[sol_num] = zh_explanations[sol_num]
# Fill complexity from Chinese if still missing
if time_c is None and zh_time:
time_c = zh_time
if space_c is None and zh_space:
space_c = zh_space
# Handle case where English had no solutions section at all
if not en_explanations and os.path.exists(zh_path):
with open(zh_path, "r", encoding="utf-8") as f:
zh_content = f.read()
en_explanations, time_c, space_c, _ = parse_solutions_section(
zh_content, is_english=False
)
is_english = False
# Skip if we found absolutely nothing
if not en_explanations:
return None
return {
"problem_id": problem_id,
"explanations": en_explanations,
"is_english": is_english,
"time_complexity": time_c,
"space_complexity": space_c,
}
def main():
"""Main entry point: iterate all range directories and process every problem."""
all_entries = []
# Get all range directories sorted
range_dirs = sorted(
[
d
for d in os.listdir(SOLUTION_BASE)
if os.path.isdir(os.path.join(SOLUTION_BASE, d))
and re.match(r"\d{4}-\d{4}$", d)
]
)
print(f"Found {len(range_dirs)} range directories")
for range_dir in range_dirs:
range_path = os.path.join(SOLUTION_BASE, range_dir)
problem_dirs = sorted(
[
os.path.join(range_path, d)
for d in os.listdir(range_path)
if os.path.isdir(os.path.join(range_path, d))
]
)
print(f"Processing {range_dir}: {len(problem_dirs)} problems")
for problem_dir in problem_dirs:
entry = process_problem(problem_dir)
if entry:
all_entries.append(entry)
# Sort by problem_id
all_entries.sort(key=lambda x: x["problem_id"])
output_path = "/teamspace/studios/this_studio/dataset/explanations.json"
with open(output_path, "w", encoding="utf-8") as f:
json.dump(all_entries, f, indent=2, ensure_ascii=False)
print(f"\nDone! Wrote {len(all_entries)} entries to {output_path}")
# Summary stats
en_count = sum(1 for e in all_entries if e["is_english"])
zh_count = len(all_entries) - en_count
total_solutions = sum(len(e["explanations"]) for e in all_entries)
empty_count = sum(
1 for e in all_entries for sol in e["explanations"].values() if not sol.strip()
)
print(f" English explanations: {en_count}")
print(f" Chinese fallback: {zh_count}")
print(f" Total solutions: {total_solutions}")
print(f" Empty explanations: {empty_count}")
if __name__ == "__main__":
main()
|