#!/usr/bin/env python3 """ Extract solution explanations from LeetCode solution README files. Output format per problem: { "problem_id": 15, "explanations": { "1": "We sort the array first...", "2": "..." }, "is_english": true, "time_complexity": "O(n^2)", "space_complexity": "O(log n)" } Strategy: 1. Read English README_EN.md first 2. If a solution has no explanation text, fall back to Chinese README.md 3. Extract time/space complexity from explanation text via regex """ import os import re import json from pathlib import Path SOLUTION_BASE = "/teamspace/studios/this_studio/dataset/leetcode/solution" def parse_explanation_from_block(block): """Extract explanation text from a single solution block. The block starts right after the '### Solution N' header line. Explanation is everything before the first code block marker: - #### (language header like #### Python3) - """ lines = block.strip().split("\n") explanation_lines = [] for line in lines: stripped = line.strip() if stripped.startswith("####") or stripped.startswith(""): break explanation_lines.append(line) explanation = "\n".join(explanation_lines).strip() # Clean up HTML comments that sometimes leak in explanation = re.sub(r"", "", explanation, flags=re.DOTALL).strip() return explanation def _extract_first_o_expr(text): r"""Find the first O(...) with balanced parentheses. Returns (inner_content, end_pos) or None. Validates that the character after the closing paren is an expected end-of-match delimiter (e.g. $, comma, period, backtick, whitespace, or end of string). """ i = 0 while i < len(text) - 2: if text[i] == "O" and text[i + 1] == "(": depth = 1 j = i + 2 while j < len(text) and depth > 0: if text[j] == "(": depth += 1 elif text[j] == ")": depth -= 1 j += 1 if depth == 0: after = text[j] if j < len(text) else "" if after == "" or after in "$`,); \t\n,。;": return text[i + 2 : j - 1], j i += 2 else: i += 1 return None def _extract_complexity_after_keyword(text, kw_regex): """Find the first O(...) after a complexity keyword match. Returns 'O(inner)' or None. """ for m in re.finditer(kw_regex, text): result = _extract_first_o_expr(text[m.end() :]) if result: inner = result[0] if inner: return f"O({inner})" return None def extract_complexities(text): r"""Extract time and space complexity from explanation text. Handles variations like: - 'The time complexity is $O(n^2)$, and the space complexity is $O(log n)$' - 'Time complexity is $O(n^2)$, and space complexity is $O(log n)$' - 'Time complexity $O(n)$, space complexity $O(n)$' (no "is") - '时间复杂度 $O(n^2)$,空间复杂度 $O(log n)$' - '时间复杂度:$O(logN)$' (with colon) - '时间复杂度为 $O(1)' (with 为) - '时间复杂度 O(logn)' (no $ delimiters) - 'O(\max (m, n))' (nested parens) - 'O((n-m) \times m)' (double parens) """ # Keyword patterns that precede O-notation time_kw = r"(?:[Tt]ime\s+[Cc]omplexity\s+(?:is\s*)?|(?:时间复杂度)[::]?\s*(?:为\s*)?)\s*(?:\\?\$)?\s*" space_kw = r"(?:[Ss]pace\s+[Cc]omplexity\s+(?:is\s*)?|(?:空间复杂度)[::]?\s*(?:为\s*)?)\s*(?:\\?\$)?\s*" return { "time_complexity": _extract_complexity_after_keyword(text, time_kw), "space_complexity": _extract_complexity_after_keyword(text, space_kw), } def parse_solutions_section(content, is_english): """Parse the ## Solutions section and extract all solution explanations. Returns: explanations: dict mapping solution number -> explanation text first_time: first non-null time complexity found first_space: first non-null space complexity found all_from_primary: whether all explanations came from the primary language """ # Find the Solutions section header if is_english: match = re.search(r"## Solutions\s*\n", content) else: match = re.search(r"## 解法\s*\n", content) if not match: return {}, None, None, True solutions_section = content[match.end() :] # Split into solution blocks on ### Solution N or ### 方法N if is_english: parts = re.split(r"(?=### Solution \d+)", solutions_section) else: parts = re.split(r"(?=### 方法[一二三四五六七八九十\d]+)", solutions_section) explanations = {} first_time = None first_space = None all_have_text = True for part in parts: part = part.strip() if not part: continue # Extract solution number if is_english: num_match = re.match(r"### Solution (\d+)", part) if not num_match: continue sol_num = num_match.group(1) else: num_match = re.match(r"### 方法([一二三四五六七八九十\d]+)", part) if not num_match: continue raw_num = num_match.group(1) # Convert Chinese numerals to digits cn_map = { "一": "1", "二": "2", "三": "3", "四": "4", "五": "5", "六": "6", "七": "7", "八": "8", "九": "9", "十": "10", } sol_num = cn_map.get(raw_num, raw_num) # Get the block after the header line header_end = part.find("\n") if header_end == -1: explanations[sol_num] = "" all_have_text = False continue block = part[header_end + 1 :] explanation = parse_explanation_from_block(block) if not explanation: all_have_text = False explanations[sol_num] = explanation # Extract complexities from the first solution that has them if explanation and (first_time is None or first_space is None): complexities = extract_complexities(explanation) if complexities["time_complexity"] and first_time is None: first_time = complexities["time_complexity"] if complexities["space_complexity"] and first_space is None: first_space = complexities["space_complexity"] return explanations, first_time, first_space, all_have_text def process_problem(problem_dir): """Process a single problem directory and return the explanation entry.""" folder_name = os.path.basename(problem_dir) # Extract problem_id from folder name (e.g., "0015.3Sum" -> 15) id_match = re.match(r"(\d+)\.", folder_name) if not id_match: return None problem_id = int(id_match.group(1)) en_path = os.path.join(problem_dir, "README_EN.md") zh_path = os.path.join(problem_dir, "README.md") # Read English README en_content = "" if os.path.exists(en_path): with open(en_path, "r", encoding="utf-8") as f: en_content = f.read() if not en_content: return None # Parse English solutions en_explanations, time_c, space_c, all_en = parse_solutions_section( en_content, is_english=True ) # is_english is True if we got any English explanation text # Only False if all solutions had to fall back to Chinese is_english = ( any(v.strip() for v in en_explanations.values()) if en_explanations else False ) # If any solution has empty explanation, try Chinese fallback if not all_en and os.path.exists(zh_path): with open(zh_path, "r", encoding="utf-8") as f: zh_content = f.read() zh_explanations, zh_time, zh_space, _ = parse_solutions_section( zh_content, is_english=False ) # Merge: use Chinese where English is empty for sol_num in en_explanations: if not en_explanations[sol_num].strip() and sol_num in zh_explanations: en_explanations[sol_num] = zh_explanations[sol_num] # Fill complexity from Chinese if still missing if time_c is None and zh_time: time_c = zh_time if space_c is None and zh_space: space_c = zh_space # Handle case where English had no solutions section at all if not en_explanations and os.path.exists(zh_path): with open(zh_path, "r", encoding="utf-8") as f: zh_content = f.read() en_explanations, time_c, space_c, _ = parse_solutions_section( zh_content, is_english=False ) is_english = False # Skip if we found absolutely nothing if not en_explanations: return None return { "problem_id": problem_id, "explanations": en_explanations, "is_english": is_english, "time_complexity": time_c, "space_complexity": space_c, } def main(): """Main entry point: iterate all range directories and process every problem.""" all_entries = [] # Get all range directories sorted range_dirs = sorted( [ d for d in os.listdir(SOLUTION_BASE) if os.path.isdir(os.path.join(SOLUTION_BASE, d)) and re.match(r"\d{4}-\d{4}$", d) ] ) print(f"Found {len(range_dirs)} range directories") for range_dir in range_dirs: range_path = os.path.join(SOLUTION_BASE, range_dir) problem_dirs = sorted( [ os.path.join(range_path, d) for d in os.listdir(range_path) if os.path.isdir(os.path.join(range_path, d)) ] ) print(f"Processing {range_dir}: {len(problem_dirs)} problems") for problem_dir in problem_dirs: entry = process_problem(problem_dir) if entry: all_entries.append(entry) # Sort by problem_id all_entries.sort(key=lambda x: x["problem_id"]) output_path = "/teamspace/studios/this_studio/dataset/explanations.json" with open(output_path, "w", encoding="utf-8") as f: json.dump(all_entries, f, indent=2, ensure_ascii=False) print(f"\nDone! Wrote {len(all_entries)} entries to {output_path}") # Summary stats en_count = sum(1 for e in all_entries if e["is_english"]) zh_count = len(all_entries) - en_count total_solutions = sum(len(e["explanations"]) for e in all_entries) empty_count = sum( 1 for e in all_entries for sol in e["explanations"].values() if not sol.strip() ) print(f" English explanations: {en_count}") print(f" Chinese fallback: {zh_count}") print(f" Total solutions: {total_solutions}") print(f" Empty explanations: {empty_count}") if __name__ == "__main__": main()