Spaces:

tm23hgf
/

rust_algo_reasoning

Running

File size: 11,174 Bytes

c987dd2

#!/usr/bin/env python3
"""
Extract solution explanations from LeetCode solution README files.

Output format per problem:
{
    "problem_id": 15,
    "explanations": {
        "1": "We sort the array first...",
        "2": "..."
    },
    "is_english": true,
    "time_complexity": "O(n^2)",
    "space_complexity": "O(log n)"
}

Strategy:
1. Read English README_EN.md first
2. If a solution has no explanation text, fall back to Chinese README.md
3. Extract time/space complexity from explanation text via regex
"""

import os
import re
import json
from pathlib import Path

SOLUTION_BASE = "/teamspace/studios/this_studio/dataset/leetcode/solution"


def parse_explanation_from_block(block):
    """Extract explanation text from a single solution block.

    The block starts right after the '### Solution N' header line.
    Explanation is everything before the first code block marker:
    - #### (language header like #### Python3)
    - <!-- tabs:start -->
    """
    lines = block.strip().split("\n")
    explanation_lines = []

    for line in lines:
        stripped = line.strip()
        if stripped.startswith("####") or stripped.startswith("<!-- tabs:start -->"):
            break
        explanation_lines.append(line)

    explanation = "\n".join(explanation_lines).strip()
    # Clean up HTML comments that sometimes leak in
    explanation = re.sub(r"<!--.*?-->", "", explanation, flags=re.DOTALL).strip()
    return explanation


def _extract_first_o_expr(text):
    r"""Find the first O(...) with balanced parentheses.

    Returns (inner_content, end_pos) or None.
    Validates that the character after the closing paren is an
    expected end-of-match delimiter (e.g. $, comma, period, backtick,
    whitespace, or end of string).
    """
    i = 0
    while i < len(text) - 2:
        if text[i] == "O" and text[i + 1] == "(":
            depth = 1
            j = i + 2
            while j < len(text) and depth > 0:
                if text[j] == "(":
                    depth += 1
                elif text[j] == ")":
                    depth -= 1
                j += 1
            if depth == 0:
                after = text[j] if j < len(text) else ""
                if after == "" or after in "$`,); \t\n，。；":
                    return text[i + 2 : j - 1], j
            i += 2
        else:
            i += 1
    return None


def _extract_complexity_after_keyword(text, kw_regex):
    """Find the first O(...) after a complexity keyword match.

    Returns 'O(inner)' or None.
    """
    for m in re.finditer(kw_regex, text):
        result = _extract_first_o_expr(text[m.end() :])
        if result:
            inner = result[0]
            if inner:
                return f"O({inner})"
    return None


def extract_complexities(text):
    r"""Extract time and space complexity from explanation text.

    Handles variations like:
    - 'The time complexity is $O(n^2)$, and the space complexity is $O(log n)$'
    - 'Time complexity is $O(n^2)$, and space complexity is $O(log n)$'
    - 'Time complexity $O(n)$, space complexity $O(n)$'  (no "is")
    - '时间复杂度 $O(n^2)$，空间复杂度 $O(log n)$'
    - '时间复杂度：$O(logN)$'  (with colon)
    - '时间复杂度为 $O(1)'  (with 为)
    - '时间复杂度 O(logn)'  (no $ delimiters)
    - 'O(\max (m, n))'  (nested parens)
    - 'O((n-m) \times m)'  (double parens)
    """
    # Keyword patterns that precede O-notation
    time_kw = r"(?:[Tt]ime\s+[Cc]omplexity\s+(?:is\s*)?|(?:时间复杂度)[：:]?\s*(?:为\s*)?)\s*(?:\\?\$)?\s*"
    space_kw = r"(?:[Ss]pace\s+[Cc]omplexity\s+(?:is\s*)?|(?:空间复杂度)[：:]?\s*(?:为\s*)?)\s*(?:\\?\$)?\s*"

    return {
        "time_complexity": _extract_complexity_after_keyword(text, time_kw),
        "space_complexity": _extract_complexity_after_keyword(text, space_kw),
    }


def parse_solutions_section(content, is_english):
    """Parse the ## Solutions section and extract all solution explanations.

    Returns:
        explanations: dict mapping solution number -> explanation text
        first_time: first non-null time complexity found
        first_space: first non-null space complexity found
        all_from_primary: whether all explanations came from the primary language
    """
    # Find the Solutions section header
    if is_english:
        match = re.search(r"## Solutions\s*\n", content)
    else:
        match = re.search(r"## 解法\s*\n", content)

    if not match:
        return {}, None, None, True

    solutions_section = content[match.end() :]

    # Split into solution blocks on ### Solution N or ### 方法N
    if is_english:
        parts = re.split(r"(?=### Solution \d+)", solutions_section)
    else:
        parts = re.split(r"(?=### 方法[一二三四五六七八九十\d]+)", solutions_section)

    explanations = {}
    first_time = None
    first_space = None
    all_have_text = True

    for part in parts:
        part = part.strip()
        if not part:
            continue

        # Extract solution number
        if is_english:
            num_match = re.match(r"### Solution (\d+)", part)
            if not num_match:
                continue
            sol_num = num_match.group(1)
        else:
            num_match = re.match(r"### 方法([一二三四五六七八九十\d]+)", part)
            if not num_match:
                continue
            raw_num = num_match.group(1)
            # Convert Chinese numerals to digits
            cn_map = {
                "一": "1",
                "二": "2",
                "三": "3",
                "四": "4",
                "五": "5",
                "六": "6",
                "七": "7",
                "八": "8",
                "九": "9",
                "十": "10",
            }
            sol_num = cn_map.get(raw_num, raw_num)

        # Get the block after the header line
        header_end = part.find("\n")
        if header_end == -1:
            explanations[sol_num] = ""
            all_have_text = False
            continue

        block = part[header_end + 1 :]
        explanation = parse_explanation_from_block(block)

        if not explanation:
            all_have_text = False

        explanations[sol_num] = explanation

        # Extract complexities from the first solution that has them
        if explanation and (first_time is None or first_space is None):
            complexities = extract_complexities(explanation)
            if complexities["time_complexity"] and first_time is None:
                first_time = complexities["time_complexity"]
            if complexities["space_complexity"] and first_space is None:
                first_space = complexities["space_complexity"]

    return explanations, first_time, first_space, all_have_text


def process_problem(problem_dir):
    """Process a single problem directory and return the explanation entry."""
    folder_name = os.path.basename(problem_dir)

    # Extract problem_id from folder name (e.g., "0015.3Sum" -> 15)
    id_match = re.match(r"(\d+)\.", folder_name)
    if not id_match:
        return None
    problem_id = int(id_match.group(1))

    en_path = os.path.join(problem_dir, "README_EN.md")
    zh_path = os.path.join(problem_dir, "README.md")

    # Read English README
    en_content = ""
    if os.path.exists(en_path):
        with open(en_path, "r", encoding="utf-8") as f:
            en_content = f.read()

    if not en_content:
        return None

    # Parse English solutions
    en_explanations, time_c, space_c, all_en = parse_solutions_section(
        en_content, is_english=True
    )

    # is_english is True if we got any English explanation text
    # Only False if all solutions had to fall back to Chinese
    is_english = (
        any(v.strip() for v in en_explanations.values()) if en_explanations else False
    )

    # If any solution has empty explanation, try Chinese fallback
    if not all_en and os.path.exists(zh_path):
        with open(zh_path, "r", encoding="utf-8") as f:
            zh_content = f.read()

        zh_explanations, zh_time, zh_space, _ = parse_solutions_section(
            zh_content, is_english=False
        )

        # Merge: use Chinese where English is empty
        for sol_num in en_explanations:
            if not en_explanations[sol_num].strip() and sol_num in zh_explanations:
                en_explanations[sol_num] = zh_explanations[sol_num]

        # Fill complexity from Chinese if still missing
        if time_c is None and zh_time:
            time_c = zh_time
        if space_c is None and zh_space:
            space_c = zh_space

    # Handle case where English had no solutions section at all
    if not en_explanations and os.path.exists(zh_path):
        with open(zh_path, "r", encoding="utf-8") as f:
            zh_content = f.read()
        en_explanations, time_c, space_c, _ = parse_solutions_section(
            zh_content, is_english=False
        )
        is_english = False

    # Skip if we found absolutely nothing
    if not en_explanations:
        return None

    return {
        "problem_id": problem_id,
        "explanations": en_explanations,
        "is_english": is_english,
        "time_complexity": time_c,
        "space_complexity": space_c,
    }


def main():
    """Main entry point: iterate all range directories and process every problem."""
    all_entries = []

    # Get all range directories sorted
    range_dirs = sorted(
        [
            d
            for d in os.listdir(SOLUTION_BASE)
            if os.path.isdir(os.path.join(SOLUTION_BASE, d))
            and re.match(r"\d{4}-\d{4}$", d)
        ]
    )

    print(f"Found {len(range_dirs)} range directories")

    for range_dir in range_dirs:
        range_path = os.path.join(SOLUTION_BASE, range_dir)

        problem_dirs = sorted(
            [
                os.path.join(range_path, d)
                for d in os.listdir(range_path)
                if os.path.isdir(os.path.join(range_path, d))
            ]
        )

        print(f"Processing {range_dir}: {len(problem_dirs)} problems")

        for problem_dir in problem_dirs:
            entry = process_problem(problem_dir)
            if entry:
                all_entries.append(entry)

    # Sort by problem_id
    all_entries.sort(key=lambda x: x["problem_id"])

    output_path = "/teamspace/studios/this_studio/dataset/explanations.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_entries, f, indent=2, ensure_ascii=False)

    print(f"\nDone! Wrote {len(all_entries)} entries to {output_path}")

    # Summary stats
    en_count = sum(1 for e in all_entries if e["is_english"])
    zh_count = len(all_entries) - en_count
    total_solutions = sum(len(e["explanations"]) for e in all_entries)
    empty_count = sum(
        1 for e in all_entries for sol in e["explanations"].values() if not sol.strip()
    )

    print(f"  English explanations: {en_count}")
    print(f"  Chinese fallback:     {zh_count}")
    print(f"  Total solutions:      {total_solutions}")
    print(f"  Empty explanations:   {empty_count}")


if __name__ == "__main__":
    main()