# Scraped file-viewer header (not part of the module): Spaces: Running | File size: 4,022 Bytes | 4465cb6
from __future__ import annotations
import re
# Fragments with fewer whitespace-separated tokens than this are appended to
# the preceding fragment by split_code_statements().
MIN_FRAGMENT_TOKENS = 3

# Java keywords, primitive type names and a few very common type names,
# all lowercase.
_JAVA_STOPWORDS = {
    "public", "private", "protected", "void", "new", "null", "int", "string",
    "static", "final", "return", "class", "this", "super", "true", "false",
    "boolean", "long", "double", "float", "byte", "char", "short", "object",
    "list", "map", "set", "if", "else", "for", "while", "do", "try", "catch",
    "finally", "throw", "throws", "import", "package", "extends", "implements",
    "instanceof", "interface", "abstract", "synchronized", "volatile",
}

# English function words, plus the single-letter tokens "s" and "e".
# ("do", "for" and "this" also appear in the Java group above.)
_ENGLISH_STOPWORDS = {
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "of",
    "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
    "does", "did", "will", "would", "could", "should", "may", "might",
    "it", "its", "with", "not", "by", "from", "as", "that",
    "which", "who", "when", "where", "how", "all", "each", "both", "more",
    "s", "e",
}

# Lowercase tokens that tokenize() discards.
STOPWORDS = _JAVA_STOPWORDS | _ENGLISH_STOPWORDS
# One sub-word of an identifier, tried in this order:
#   1. a run of capitals that is directly followed by a capitalised word
#      (the "HTTP" in "HTTPResponse"),
#   2. an optional capital followed by lowercase letters ("Response", "parse"),
#   3. any remaining run of capitals ("URL"),
#   4. a run of digits.
_SUBWORD_RE = re.compile(r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+")


def split_identifier(token: str) -> list[str]:
    """Break an identifier into lowercase sub-words.

    The token is first cut on underscores and whitespace; each piece is then
    split at camelCase / acronym / digit boundaries by ``_SUBWORD_RE``.
    Characters the pattern does not match (punctuation, non-ASCII letters)
    are dropped.

    Args:
        token: A single identifier or any short piece of text.

    Returns:
        The lowercase sub-words in order of appearance; empty if the token
        contains none.
    """
    pieces = re.split(r"[_\s]+", token)
    return [
        word.lower()
        for piece in pieces
        for word in _SUBWORD_RE.findall(piece)
        if word
    ]
def tokenize(text: str) -> list[str]:
    """Turn free text or code into lowercase sub-word tokens without stopwords.

    The text is cut on whitespace, each chunk is passed through
    ``split_identifier``, and every resulting sub-word that is listed in
    ``STOPWORDS`` is discarded.

    Args:
        text: Source code or natural-language text.

    Returns:
        The surviving tokens, in their original order.
    """
    return [
        piece
        for chunk in text.split()
        for piece in split_identifier(chunk)
        if piece and piece not in STOPWORDS
    ]
# Matches the head of a Java method or constructor declaration, up to and
# including the "(" that opens the parameter list. Group 1 is the name.
# The pattern is deliberately loose (the "type" part may even be plain
# indentation), so it also hits ordinary calls and control statements;
# split_java_methods() filters those out afterwards.
_METHOD_HEAD = re.compile(
    r"(?m)^[ \t]*"                              # start of line + indentation
    r"(?:@(?:\w+\.)*\w+(?:\([^)]*\))?\s+)*"     # any number of annotations
    r"(?:(?:public|private|protected)\s+)?"     # visibility
    r"(?:static\s+)?(?:final\s+)?(?:synchronized\s+)?"
    r"(?:<[^>]+>\s+)?"                          # generic type parameters
    r"[\w\[\]<>,.\s?]+\s+"                      # return type
    r"(\w+)\s*"                                 # method name
    r"\(",
)

# Keywords that can directly precede "(" and therefore look like a method
# name to _METHOD_HEAD.
_NON_METHOD_NAMES = frozenset({
    "if", "for", "while", "switch", "catch", "do", "try", "else", "return", "new",
    "synchronized",
})


def _skip_quoted(source: str, start: int) -> int:
    """Return the index just past the string/char literal opening at ``start``.

    Backslash escapes are honoured. A literal that is still open at the end
    of its line is treated as ending there, so a stray quote cannot swallow
    the rest of the file.
    """
    quote = source[start]
    idx = start + 1
    length = len(source)
    while idx < length:
        ch = source[idx]
        if ch == "\\":
            idx += 2  # skip the escaped character
            continue
        if ch == quote:
            return idx + 1
        if ch == "\n":
            return idx
        idx += 1
    return length


def _find_block_end(source: str, brace_start: int) -> int:
    """Return the index just past the ``}`` closing the ``{`` at ``brace_start``.

    Braces inside ``//`` and ``/* */`` comments, string literals, text blocks
    and char literals are ignored. Returns -1 when the block is never closed.
    """
    depth = 0
    idx = brace_start
    length = len(source)
    while idx < length:
        ch = source[idx]
        if ch == "/" and source.startswith("//", idx):
            newline = source.find("\n", idx)
            if newline == -1:
                return -1
            idx = newline + 1
            continue
        if ch == "/" and source.startswith("/*", idx):
            close = source.find("*/", idx + 2)
            if close == -1:
                return -1
            idx = close + 2
            continue
        if ch == '"' and source.startswith('"""', idx):
            close = source.find('"""', idx + 3)
            if close == -1:
                return -1
            idx = close + 3
            continue
        if ch == '"' or ch == "'":
            idx = _skip_quoted(source, idx)
            continue
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return idx + 1
        idx += 1
    return -1


def split_java_methods(code: str) -> list[dict[str, str]]:
    """Split Java source into its outermost method bodies.

    Each candidate found by ``_METHOD_HEAD`` is kept only if its name is not
    a control keyword, a ``{`` follows before any ``;`` (which rules out
    calls and body-less declarations), the braces balance, and it does not
    lie inside a method that was already accepted.

    Args:
        code: Java source text; leading/trailing whitespace is ignored.

    Returns:
        A list of ``{"name": ..., "code": ...}`` dicts in source order.
        If no method is found, a single entry named ``"(entire file)"``
        holding the stripped source. Empty or blank input gives ``[]``.
    """
    source = code.strip()
    if not source:
        return []

    methods: list[dict[str, str]] = []
    last_end = 0  # end offset of the most recently accepted method
    for match in _METHOD_HEAD.finditer(source):
        name = match.group(1)
        start = match.start()
        # Matches arrive in increasing start order, so a candidate overlaps an
        # accepted method exactly when it starts before the last one ended.
        if name in _NON_METHOD_NAMES or start < last_end:
            continue
        brace_start = source.find("{", match.end())
        if brace_start == -1:
            continue
        # A ";" between the "(" and the "{" means this was a call or an
        # abstract/interface declaration, and the "{" belongs to later code.
        header = source[match.end():brace_start]
        if ";" in header.split("//", 1)[0]:
            continue
        end = _find_block_end(source, brace_start)
        if end == -1:
            continue
        methods.append({"name": name, "code": source[start:end].strip()})
        last_end = end

    return methods or [{"name": "(entire file)", "code": source}]
def format_method_summaries(parts: list[tuple[str, str]]) -> str:
    """Render ``(name, text)`` pairs as a bulleted list, one pair per line.

    Each text is stripped of surrounding whitespace; a text that is empty
    after stripping is shown as ``(no output)``.

    Args:
        parts: Pairs of method name and its summary text.

    Returns:
        The bullet lines joined with newlines; ``""`` for an empty list.
    """
    return "\n".join(
        f"• {name}: {text.strip() or '(no output)'}" for name, text in parts
    )
def split_code_statements(code: str, *, min_tokens: int | None = None) -> list[str]:
    """Split code into statement-like fragments.

    The code is cut at every ``;``, ``{``, ``}`` and newline. Whitespace runs
    inside each fragment are collapsed to a single space and empty fragments
    are dropped. A fragment with fewer than ``min_tokens`` whitespace-separated
    tokens is appended to the previous fragment instead of standing alone
    (the very first fragment is always kept as is).

    Args:
        code: Source text to split.
        min_tokens: Minimum token count for a fragment to stand on its own.
            Defaults to the module-level ``MIN_FRAGMENT_TOKENS``.

    Returns:
        The merged fragments in source order. If nothing survives the split,
        a one-element list holding the whitespace-normalised input (which is
        ``[""]`` for empty input).
    """
    threshold = MIN_FRAGMENT_TOKENS if min_tokens is None else min_tokens

    merged: list[str] = []
    for piece in re.split(r"[;{}\n]", code):
        fragment = re.sub(r"\s+", " ", piece).strip()
        if not fragment:
            continue
        if merged and len(fragment.split()) < threshold:
            merged[-1] += " " + fragment
        else:
            merged.append(fragment)

    return merged or [re.sub(r"\s+", " ", code).strip()]
|