Spaces:

donut12345
/

java_summarization

Running

File size: 4,022 Bytes

4465cb6

from __future__ import annotations

import re

MIN_FRAGMENT_TOKENS = 3

STOPWORDS = {
    "public", "private", "protected", "void", "new", "null", "int", "string",
    "static", "final", "return", "class", "this", "super", "true", "false",
    "boolean", "long", "double", "float", "byte", "char", "short", "object",
    "list", "map", "set", "if", "else", "for", "while", "do", "try", "catch",
    "finally", "throw", "throws", "import", "package", "extends", "implements",
    "instanceof", "interface", "abstract", "synchronized", "volatile",
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "of",
    "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
    "do", "does", "did", "will", "would", "could", "should", "may", "might",
    "it", "its", "with", "for", "not", "by", "from", "as", "that", "this",
    "which", "who", "when", "where", "how", "all", "each", "both", "more",
    "s", "e",
}

_SUBWORD_RE = re.compile(r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+")


def split_identifier(token: str) -> list[str]:
    out: list[str] = []
    for part in re.split(r"[_\s]+", token):
        out.extend(_SUBWORD_RE.findall(part))
    return [w.lower() for w in out if w]


def tokenize(text: str) -> list[str]:
    """Sub-tokenize identifiers, lowercase, and drop stopwords."""
    toks: list[str] = []
    for raw in text.split():
        for sub in split_identifier(raw):
            if sub and sub not in STOPWORDS:
                toks.append(sub)
    return toks


_METHOD_HEAD = re.compile(
    r"(?m)^[ \t]*"
    r"(?:@(?:\w+\.)*\w+(?:\([^)]*\))?\s+)*"
    r"(?:(?:public|private|protected)\s+)?"
    r"(?:static\s+)?(?:final\s+)?(?:synchronized\s+)?"
    r"(?:<[^>]+>\s+)?"
    r"[\w\[\]<>,.\s?]+\s+"
    r"(\w+)\s*"
    r"\(",
)

_NON_METHOD_NAMES = frozenset({
    "if", "for", "while", "switch", "catch", "do", "try", "else", "return", "new",
})


def split_java_methods(code: str) -> list[dict[str, str]]:
    source = code.strip()
    if not source:
        return []

    spans: list[tuple[int, int, str]] = []

    for match in _METHOD_HEAD.finditer(source):
        name = match.group(1)
        if name in _NON_METHOD_NAMES:
            continue

        start = match.start()
        brace_start = source.find("{", match.end())
        if brace_start == -1:
            continue

        header = source[match.end():brace_start]
        if ";" in header.split("//", 1)[0]:
            continue

        depth = 0
        end = brace_start
        for idx in range(brace_start, len(source)):
            ch = source[idx]
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    end = idx + 1
                    break
        else:
            continue

        if any(start < existing_end and end > existing_start for existing_start, existing_end, _ in spans):
            continue

        spans.append((start, end, name))

    spans.sort(key=lambda item: item[0])

    methods = [
        {"name": name, "code": source[start:end].strip()}
        for start, end, name in spans
    ]

    if methods:
        return methods

    return [{"name": "(entire file)", "code": source}]


def format_method_summaries(parts: list[tuple[str, str]]) -> str:
    lines: list[str] = []
    for name, text in parts:
        cleaned = text.strip()
        if cleaned:
            lines.append(f"• {name}: {cleaned}")
        else:
            lines.append(f"• {name}: (no output)")
    return "\n".join(lines)


def split_code_statements(code: str) -> list[str]:
    parts = re.split(r"[;{}\n]", code)
    frags = [re.sub(r"\s+", " ", p).strip() for p in parts]
    frags = [f for f in frags if f]

    merged: list[str] = []
    for frag in frags:
        if len(frag.split()) < MIN_FRAGMENT_TOKENS and merged:
            merged[-1] += " " + frag
        else:
            merged.append(frag)

    return merged if merged else [re.sub(r"\s+", " ", code).strip()]