File size: 4,022 Bytes
4465cb6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from __future__ import annotations

import re

# Fragments with fewer whitespace-separated tokens than this are glued onto
# the preceding fragment (see split_code_statements).
MIN_FRAGMENT_TOKENS = 3

# Lower-cased sub-tokens that tokenize() throws away.
STOPWORDS = {
    # Java keywords, literals and frequently used type names.
    "abstract", "boolean", "byte", "catch", "char", "class", "do", "double",
    "else", "extends", "false", "final", "finally", "float", "for", "if",
    "implements", "import", "instanceof", "int", "interface", "list", "long",
    "map", "new", "null", "object", "package", "private", "protected",
    "public", "return", "set", "short", "static", "string", "super",
    "synchronized", "this", "throw", "throws", "true", "try", "void",
    "volatile", "while",
    # English function words.
    "a", "all", "an", "and", "are", "as", "at", "be", "been", "being", "both",
    "but", "by", "could", "did", "does", "each", "from", "had", "has", "have",
    "how", "in", "is", "it", "its", "may", "might", "more", "not", "of", "on",
    "or", "should", "that", "the", "to", "was", "were", "when", "where",
    "which", "who", "will", "with", "would",
    # Single-letter fragments.
    "e", "s",
}

# Sub-word alternatives, tried left to right at each position:
#   1. an upper-case run directly followed by a capitalised word
#      ("HTTP" in "HTTPResponse")
#   2. a lower-case word with an optional leading capital ("parse", "Response")
#   3. any other upper-case run ("MAX")
#   4. a run of digits
_SUBWORD_RE = re.compile(r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+")


def split_identifier(token: str) -> list[str]:
    """Break *token* into its lower-cased sub-words.

    The token is first cut on runs of underscores/whitespace, then every
    chunk is scanned with ``_SUBWORD_RE``; characters the pattern does not
    cover (punctuation, etc.) are simply skipped.

    >>> split_identifier("parseHTTPResponse")
    ['parse', 'http', 'response']
    """
    # Every alternative of _SUBWORD_RE consumes at least one character, so
    # findall() never yields empty strings and no extra filtering is needed.
    return [
        piece.lower()
        for chunk in re.split(r"[_\s]+", token)
        for piece in _SUBWORD_RE.findall(chunk)
    ]


def tokenize(text: str) -> list[str]:
    """Turn *text* into lower-cased sub-word tokens with stopwords removed.

    Each whitespace-separated word is passed through ``split_identifier``;
    the resulting pieces are kept, in order, unless they are empty or listed
    in ``STOPWORDS``.
    """
    return [
        piece
        for word in text.split()
        for piece in split_identifier(word)
        if piece and piece not in STOPWORDS
    ]


# Heuristic matcher for the start of a Java method/constructor declaration.
# It is anchored at a line start and captures the identifier immediately
# before the opening parenthesis as group 1.
_METHOD_HEAD = re.compile(
    r"(?m)^[ \t]*"
    r"(?:@(?:\w+\.)*\w+(?:\([^)]*\))?\s+)*"
    r"(?:(?:public|private|protected)\s+)?"
    r"(?:static\s+)?(?:final\s+)?(?:synchronized\s+)?"
    r"(?:<[^>]+>\s+)?"
    r"[\w\[\]<>,.\s?]+\s+"
    r"(\w+)\s*"
    r"\(",
)

# Identifiers that _METHOD_HEAD can capture in front of "(" but that are
# control-flow keywords rather than method names.
_NON_METHOD_NAMES = frozenset({
    "if", "for", "while", "switch", "catch", "do", "try", "else", "return", "new",
})


def _skip_java_literal(source: str, idx: int) -> int:
    """Return the index just past the string/char literal that opens at *idx*.

    Handles ``"..."``, ``'...'`` and ``\"\"\"...\"\"\"`` text blocks, honouring
    backslash escapes. An unterminated single-line literal ends at the next
    newline so that one stray quote cannot swallow the rest of the source.
    """
    is_text_block = source.startswith('"""', idx)
    delim = '"""' if is_text_block else source[idx]
    size = len(source)
    pos = idx + len(delim)
    while pos < size:
        ch = source[pos]
        if ch == "\\":
            pos += 2  # skip the escaped character as well
        elif source.startswith(delim, pos):
            return pos + len(delim)
        elif ch == "\n" and not is_text_block:
            return pos
        else:
            pos += 1
    return size


def _find_block_end(source: str, brace_start: int) -> int:
    """Return the index just past the ``}`` matching the ``{`` at *brace_start*.

    Braces that appear inside string/char literals, ``//`` comments or
    ``/* */`` comments are ignored. Returns ``-1`` when the block is never
    closed.
    """
    depth = 0
    size = len(source)
    idx = brace_start
    while idx < size:
        ch = source[idx]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return idx + 1
        elif ch in "\"'":
            idx = _skip_java_literal(source, idx)
            continue
        elif source.startswith("//", idx):
            newline = source.find("\n", idx)
            if newline == -1:
                return -1
            idx = newline
            continue
        elif source.startswith("/*", idx):
            close = source.find("*/", idx + 2)
            if close == -1:
                return -1
            idx = close + 2
            continue
        idx += 1
    return -1


def split_java_methods(code: str) -> list[dict[str, str]]:
    """Split Java source text into its top-most method bodies.

    Candidate declarations are located with ``_METHOD_HEAD``; for each one
    the text from the start of the match up to the brace that closes its
    body is extracted. Candidates are dropped when the captured name is a
    control-flow keyword, when a ``;`` appears between the ``(`` and the
    first ``{`` (a call or a body-less declaration), when the braces never
    balance, or when they start inside a method that was already accepted.

    Args:
        code: Java source text.

    Returns:
        A list of ``{"name": ..., "code": ...}`` dicts in source order.
        Blank input yields ``[]``; input in which no method is recognised
        yields a single entry named ``"(entire file)"`` holding the stripped
        source.
    """
    source = code.strip()
    if not source:
        return []

    methods: list[dict[str, str]] = []
    # finditer() yields matches in increasing start order, so a candidate
    # overlaps an accepted method exactly when it starts before the end of
    # the most recently accepted one; the result is already sorted.
    last_end = 0

    for match in _METHOD_HEAD.finditer(source):
        name = match.group(1)
        if name in _NON_METHOD_NAMES:
            continue

        start = match.start()
        if start < last_end:
            continue

        brace_start = source.find("{", match.end())
        if brace_start == -1:
            continue

        # A ";" before the opening brace (ignoring a trailing // comment)
        # means this "(" belongs to a statement, not to a method with a body.
        header = source[match.end():brace_start]
        if ";" in header.split("//", 1)[0]:
            continue

        end = _find_block_end(source, brace_start)
        if end == -1:
            continue

        methods.append({"name": name, "code": source[start:end].strip()})
        last_end = end

    if methods:
        return methods

    return [{"name": "(entire file)", "code": source}]


def format_method_summaries(parts: list[tuple[str, str]]) -> str:
    """Render ``(name, text)`` pairs as a newline-separated bullet list.

    Each pair becomes one ``• name: text`` line with the text stripped of
    surrounding whitespace; a text that is empty after stripping is shown as
    ``(no output)``. An empty *parts* list gives an empty string.
    """
    return "\n".join(
        f"• {name}: {text.strip() or '(no output)'}"
        for name, text in parts
    )


def split_code_statements(code: str, *, min_tokens: int | None = None) -> list[str]:
    """Split *code* into whitespace-normalised statement fragments.

    The text is cut at every ``;``, ``{``, ``}`` and newline. Each piece has
    its whitespace runs collapsed to single spaces and is stripped; empty
    pieces are dropped. A fragment with fewer than *min_tokens*
    whitespace-separated tokens is appended (space-joined) to the previous
    fragment instead of standing alone, unless it is the first fragment.

    Args:
        code: Source text to split.
        min_tokens: Minimum token count for a fragment to stand on its own.
            ``None`` (the default) uses the module-level
            ``MIN_FRAGMENT_TOKENS``, looked up at call time.

    Returns:
        The list of fragments. When nothing survives the split, a
        one-element list holding the whitespace-normalised input (which may
        be the empty string).
    """
    threshold = MIN_FRAGMENT_TOKENS if min_tokens is None else min_tokens

    merged: list[str] = []
    for part in re.split(r"[;{}\n]", code):
        frag = re.sub(r"\s+", " ", part).strip()
        if not frag:
            continue
        if merged and len(frag.split()) < threshold:
            merged[-1] += " " + frag
        else:
            merged.append(frag)

    # Never return an empty list: fall back to the normalised input itself.
    return merged if merged else [re.sub(r"\s+", " ", code).strip()]