Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Extract solution explanations from LeetCode solution README files. | |
| Output format per problem: | |
| { | |
| "problem_id": 15, | |
| "explanations": { | |
| "1": "We sort the array first...", | |
| "2": "..." | |
| }, | |
| "is_english": true, | |
| "time_complexity": "O(n^2)", | |
| "space_complexity": "O(log n)" | |
| } | |
| Strategy: | |
| 1. Read English README_EN.md first | |
| 2. If a solution has no explanation text, fall back to Chinese README.md | |
| 3. Extract time/space complexity from explanation text via regex | |
| """ | |
| import os | |
| import re | |
| import json | |
| from pathlib import Path | |
| SOLUTION_BASE = "/teamspace/studios/this_studio/dataset/leetcode/solution" | |
| def parse_explanation_from_block(block): | |
| """Extract explanation text from a single solution block. | |
| The block starts right after the '### Solution N' header line. | |
| Explanation is everything before the first code block marker: | |
| - #### (language header like #### Python3) | |
| - <!-- tabs:start --> | |
| """ | |
| lines = block.strip().split("\n") | |
| explanation_lines = [] | |
| for line in lines: | |
| stripped = line.strip() | |
| if stripped.startswith("####") or stripped.startswith("<!-- tabs:start -->"): | |
| break | |
| explanation_lines.append(line) | |
| explanation = "\n".join(explanation_lines).strip() | |
| # Clean up HTML comments that sometimes leak in | |
| explanation = re.sub(r"<!--.*?-->", "", explanation, flags=re.DOTALL).strip() | |
| return explanation | |
| def _extract_first_o_expr(text): | |
| r"""Find the first O(...) with balanced parentheses. | |
| Returns (inner_content, end_pos) or None. | |
| Validates that the character after the closing paren is an | |
| expected end-of-match delimiter (e.g. $, comma, period, backtick, | |
| whitespace, or end of string). | |
| """ | |
| i = 0 | |
| while i < len(text) - 2: | |
| if text[i] == "O" and text[i + 1] == "(": | |
| depth = 1 | |
| j = i + 2 | |
| while j < len(text) and depth > 0: | |
| if text[j] == "(": | |
| depth += 1 | |
| elif text[j] == ")": | |
| depth -= 1 | |
| j += 1 | |
| if depth == 0: | |
| after = text[j] if j < len(text) else "" | |
| if after == "" or after in "$`,); \t\n,。;": | |
| return text[i + 2 : j - 1], j | |
| i += 2 | |
| else: | |
| i += 1 | |
| return None | |
| def _extract_complexity_after_keyword(text, kw_regex): | |
| """Find the first O(...) after a complexity keyword match. | |
| Returns 'O(inner)' or None. | |
| """ | |
| for m in re.finditer(kw_regex, text): | |
| result = _extract_first_o_expr(text[m.end() :]) | |
| if result: | |
| inner = result[0] | |
| if inner: | |
| return f"O({inner})" | |
| return None | |
| def extract_complexities(text): | |
| r"""Extract time and space complexity from explanation text. | |
| Handles variations like: | |
| - 'The time complexity is $O(n^2)$, and the space complexity is $O(log n)$' | |
| - 'Time complexity is $O(n^2)$, and space complexity is $O(log n)$' | |
| - 'Time complexity $O(n)$, space complexity $O(n)$' (no "is") | |
| - '时间复杂度 $O(n^2)$,空间复杂度 $O(log n)$' | |
| - '时间复杂度:$O(logN)$' (with colon) | |
| - '时间复杂度为 $O(1)' (with 为) | |
| - '时间复杂度 O(logn)' (no $ delimiters) | |
| - 'O(\max (m, n))' (nested parens) | |
| - 'O((n-m) \times m)' (double parens) | |
| """ | |
| # Keyword patterns that precede O-notation | |
| time_kw = r"(?:[Tt]ime\s+[Cc]omplexity\s+(?:is\s*)?|(?:时间复杂度)[::]?\s*(?:为\s*)?)\s*(?:\\?\$)?\s*" | |
| space_kw = r"(?:[Ss]pace\s+[Cc]omplexity\s+(?:is\s*)?|(?:空间复杂度)[::]?\s*(?:为\s*)?)\s*(?:\\?\$)?\s*" | |
| return { | |
| "time_complexity": _extract_complexity_after_keyword(text, time_kw), | |
| "space_complexity": _extract_complexity_after_keyword(text, space_kw), | |
| } | |
| def parse_solutions_section(content, is_english): | |
| """Parse the ## Solutions section and extract all solution explanations. | |
| Returns: | |
| explanations: dict mapping solution number -> explanation text | |
| first_time: first non-null time complexity found | |
| first_space: first non-null space complexity found | |
| all_from_primary: whether all explanations came from the primary language | |
| """ | |
| # Find the Solutions section header | |
| if is_english: | |
| match = re.search(r"## Solutions\s*\n", content) | |
| else: | |
| match = re.search(r"## 解法\s*\n", content) | |
| if not match: | |
| return {}, None, None, True | |
| solutions_section = content[match.end() :] | |
| # Split into solution blocks on ### Solution N or ### 方法N | |
| if is_english: | |
| parts = re.split(r"(?=### Solution \d+)", solutions_section) | |
| else: | |
| parts = re.split(r"(?=### 方法[一二三四五六七八九十\d]+)", solutions_section) | |
| explanations = {} | |
| first_time = None | |
| first_space = None | |
| all_have_text = True | |
| for part in parts: | |
| part = part.strip() | |
| if not part: | |
| continue | |
| # Extract solution number | |
| if is_english: | |
| num_match = re.match(r"### Solution (\d+)", part) | |
| if not num_match: | |
| continue | |
| sol_num = num_match.group(1) | |
| else: | |
| num_match = re.match(r"### 方法([一二三四五六七八九十\d]+)", part) | |
| if not num_match: | |
| continue | |
| raw_num = num_match.group(1) | |
| # Convert Chinese numerals to digits | |
| cn_map = { | |
| "一": "1", | |
| "二": "2", | |
| "三": "3", | |
| "四": "4", | |
| "五": "5", | |
| "六": "6", | |
| "七": "7", | |
| "八": "8", | |
| "九": "9", | |
| "十": "10", | |
| } | |
| sol_num = cn_map.get(raw_num, raw_num) | |
| # Get the block after the header line | |
| header_end = part.find("\n") | |
| if header_end == -1: | |
| explanations[sol_num] = "" | |
| all_have_text = False | |
| continue | |
| block = part[header_end + 1 :] | |
| explanation = parse_explanation_from_block(block) | |
| if not explanation: | |
| all_have_text = False | |
| explanations[sol_num] = explanation | |
| # Extract complexities from the first solution that has them | |
| if explanation and (first_time is None or first_space is None): | |
| complexities = extract_complexities(explanation) | |
| if complexities["time_complexity"] and first_time is None: | |
| first_time = complexities["time_complexity"] | |
| if complexities["space_complexity"] and first_space is None: | |
| first_space = complexities["space_complexity"] | |
| return explanations, first_time, first_space, all_have_text | |
| def process_problem(problem_dir): | |
| """Process a single problem directory and return the explanation entry.""" | |
| folder_name = os.path.basename(problem_dir) | |
| # Extract problem_id from folder name (e.g., "0015.3Sum" -> 15) | |
| id_match = re.match(r"(\d+)\.", folder_name) | |
| if not id_match: | |
| return None | |
| problem_id = int(id_match.group(1)) | |
| en_path = os.path.join(problem_dir, "README_EN.md") | |
| zh_path = os.path.join(problem_dir, "README.md") | |
| # Read English README | |
| en_content = "" | |
| if os.path.exists(en_path): | |
| with open(en_path, "r", encoding="utf-8") as f: | |
| en_content = f.read() | |
| if not en_content: | |
| return None | |
| # Parse English solutions | |
| en_explanations, time_c, space_c, all_en = parse_solutions_section( | |
| en_content, is_english=True | |
| ) | |
| # is_english is True if we got any English explanation text | |
| # Only False if all solutions had to fall back to Chinese | |
| is_english = ( | |
| any(v.strip() for v in en_explanations.values()) if en_explanations else False | |
| ) | |
| # If any solution has empty explanation, try Chinese fallback | |
| if not all_en and os.path.exists(zh_path): | |
| with open(zh_path, "r", encoding="utf-8") as f: | |
| zh_content = f.read() | |
| zh_explanations, zh_time, zh_space, _ = parse_solutions_section( | |
| zh_content, is_english=False | |
| ) | |
| # Merge: use Chinese where English is empty | |
| for sol_num in en_explanations: | |
| if not en_explanations[sol_num].strip() and sol_num in zh_explanations: | |
| en_explanations[sol_num] = zh_explanations[sol_num] | |
| # Fill complexity from Chinese if still missing | |
| if time_c is None and zh_time: | |
| time_c = zh_time | |
| if space_c is None and zh_space: | |
| space_c = zh_space | |
| # Handle case where English had no solutions section at all | |
| if not en_explanations and os.path.exists(zh_path): | |
| with open(zh_path, "r", encoding="utf-8") as f: | |
| zh_content = f.read() | |
| en_explanations, time_c, space_c, _ = parse_solutions_section( | |
| zh_content, is_english=False | |
| ) | |
| is_english = False | |
| # Skip if we found absolutely nothing | |
| if not en_explanations: | |
| return None | |
| return { | |
| "problem_id": problem_id, | |
| "explanations": en_explanations, | |
| "is_english": is_english, | |
| "time_complexity": time_c, | |
| "space_complexity": space_c, | |
| } | |
| def main(): | |
| """Main entry point: iterate all range directories and process every problem.""" | |
| all_entries = [] | |
| # Get all range directories sorted | |
| range_dirs = sorted( | |
| [ | |
| d | |
| for d in os.listdir(SOLUTION_BASE) | |
| if os.path.isdir(os.path.join(SOLUTION_BASE, d)) | |
| and re.match(r"\d{4}-\d{4}$", d) | |
| ] | |
| ) | |
| print(f"Found {len(range_dirs)} range directories") | |
| for range_dir in range_dirs: | |
| range_path = os.path.join(SOLUTION_BASE, range_dir) | |
| problem_dirs = sorted( | |
| [ | |
| os.path.join(range_path, d) | |
| for d in os.listdir(range_path) | |
| if os.path.isdir(os.path.join(range_path, d)) | |
| ] | |
| ) | |
| print(f"Processing {range_dir}: {len(problem_dirs)} problems") | |
| for problem_dir in problem_dirs: | |
| entry = process_problem(problem_dir) | |
| if entry: | |
| all_entries.append(entry) | |
| # Sort by problem_id | |
| all_entries.sort(key=lambda x: x["problem_id"]) | |
| output_path = "/teamspace/studios/this_studio/dataset/explanations.json" | |
| with open(output_path, "w", encoding="utf-8") as f: | |
| json.dump(all_entries, f, indent=2, ensure_ascii=False) | |
| print(f"\nDone! Wrote {len(all_entries)} entries to {output_path}") | |
| # Summary stats | |
| en_count = sum(1 for e in all_entries if e["is_english"]) | |
| zh_count = len(all_entries) - en_count | |
| total_solutions = sum(len(e["explanations"]) for e in all_entries) | |
| empty_count = sum( | |
| 1 for e in all_entries for sol in e["explanations"].values() if not sol.strip() | |
| ) | |
| print(f" English explanations: {en_count}") | |
| print(f" Chinese fallback: {zh_count}") | |
| print(f" Total solutions: {total_solutions}") | |
| print(f" Empty explanations: {empty_count}") | |
| if __name__ == "__main__": | |
| main() | |