java_summarization / engine /preprocessing.py
dunkindonuts123
NLP java summarization experiment
4465cb6
Raw
History Blame Contribute Delete
4.02 kB
from __future__ import annotations
import re
# Fragments with fewer whitespace-separated tokens than this are appended to
# the preceding fragment by split_code_statements().
MIN_FRAGMENT_TOKENS = 3

# Lowercased tokens that tokenize() discards. Grouped by origin for
# readability; membership is all that matters at runtime.
STOPWORDS = {
    # Java modifiers and declaration keywords.
    "public", "private", "protected", "static", "final", "abstract",
    "synchronized", "volatile", "class", "interface", "extends", "implements",
    "import", "package", "throws",
    # Java primitive and common library type names (lowercased).
    "void", "int", "long", "short", "byte", "char", "float", "double",
    "boolean", "string", "object", "list", "map", "set",
    # Java statement keywords, operators and literals.
    "if", "else", "for", "while", "do", "try", "catch", "finally", "throw",
    "return", "new", "instanceof", "this", "super", "null", "true", "false",
    # English function words.
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "of",
    "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
    "does", "did", "will", "would", "could", "should", "may", "might",
    "it", "its", "with", "not", "by", "from", "as", "that",
    "which", "who", "when", "where", "how", "all", "each", "both", "more",
    # Single-letter tokens.
    "s", "e",
}
# One sub-word of an identifier, tried in this order: an upper-case run that
# stops just before a capitalised word ("HTTP" in "HTTPResponse"), a lower-case
# word with an optional leading capital, any remaining upper-case run, or a
# run of digits. Only ASCII letters are recognised.
_SUBWORD_RE = re.compile(r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+")


def split_identifier(token: str) -> list[str]:
    """Break an identifier into its lowercased sub-words.

    The token is first cut on underscores and whitespace, then each piece is
    split on camelCase / acronym / digit boundaries. Characters matched by no
    sub-word alternative (punctuation, non-ASCII letters) are dropped.

    Args:
        token: Identifier or arbitrary text chunk to split.

    Returns:
        The sub-words in order of appearance, all lowercased. Empty when the
        token contains no ASCII letters or digits.
    """
    return [
        piece.lower()
        for chunk in re.split(r"[_\s]+", token)
        for piece in _SUBWORD_RE.findall(chunk)
    ]
def tokenize(text: str) -> list[str]:
    """Turn raw text into lowercased sub-word tokens with stopwords removed.

    The text is split on whitespace, each chunk is passed through
    split_identifier(), and any resulting sub-word found in STOPWORDS is
    discarded.

    Args:
        text: Source code or natural-language text.

    Returns:
        The surviving tokens in their original order.
    """
    return [
        word
        for chunk in text.split()
        for word in split_identifier(chunk)
        if word not in STOPWORDS
    ]
# Matches the start of a Java method declaration at the beginning of a line:
# optional annotations, optional modifiers, optional generic type parameters,
# a return type, the method name (group 1) and the opening parenthesis.
_METHOD_HEAD = re.compile(
    r"(?m)^[ \t]*"
    r"(?:@(?:\w+\.)*\w+(?:\([^)]*\))?\s+)*"
    r"(?:(?:public|private|protected)\s+)?"
    r"(?:static\s+)?(?:final\s+)?(?:synchronized\s+)?"
    r"(?:<[^>]+>\s+)?"
    r"[\w\[\]<>,.\s?]+\s+"
    r"(\w+)\s*"
    r"\(",
)

# Keywords the pattern above can capture in the "name" position
# (e.g. "else if (") that never denote a method.
_NON_METHOD_NAMES = frozenset({
    "if", "for", "while", "switch", "catch", "do", "try", "else", "return", "new",
})


def _skip_java_literal(source: str, start: int) -> int:
    """Return the index just past the string/char literal opening at *start*.

    Handles text blocks, double-quoted strings and single-quoted char
    literals, honouring backslash escapes. An ordinary literal that is still
    open at a line break is treated as ending there, so one stray quote cannot
    swallow the rest of the file.
    """
    length = len(source)
    if source.startswith('"""', start):
        idx = start + 3
        while idx < length:
            if source[idx] == "\\":
                idx += 2
            elif source.startswith('"""', idx):
                return idx + 3
            else:
                idx += 1
        return length

    quote = source[start]
    idx = start + 1
    while idx < length:
        ch = source[idx]
        if ch == "\\":
            idx += 2
        elif ch == quote:
            return idx + 1
        elif ch == "\n":
            return idx
        else:
            idx += 1
    return length


def _find_block_end(source: str, brace_start: int) -> int:
    """Return the index just past the brace closing ``source[brace_start]``.

    Braces inside comments, string literals and char literals are ignored.
    Returns -1 when the block is never closed.
    """
    depth = 0
    idx = brace_start
    length = len(source)
    while idx < length:
        ch = source[idx]
        if ch == "/" and source.startswith("//", idx):
            newline = source.find("\n", idx)
            if newline == -1:
                return -1
            idx = newline + 1
        elif ch == "/" and source.startswith("/*", idx):
            close = source.find("*/", idx + 2)
            if close == -1:
                return -1
            idx = close + 2
        elif ch == '"' or ch == "'":
            idx = _skip_java_literal(source, idx)
        else:
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    return idx + 1
            idx += 1
    return -1


def split_java_methods(code: str) -> list[dict[str, str]]:
    """Split Java source into its individual method definitions.

    Method heads are located with ``_METHOD_HEAD``; each body is delimited by
    matching the first ``{`` after the head with its closing ``}``. Braces that
    appear inside comments or string/char literals do not take part in the
    matching. Candidates are discarded when the captured name is a control
    keyword, when a ``;`` appears between the opening parenthesis and the
    brace (a call or an abstract declaration), when the braces never balance,
    or when the span overlaps one already accepted (nested definitions).

    Args:
        code: Java source text.

    Returns:
        One ``{"name": ..., "code": ...}`` dict per method, in source order.
        If no method is found the whole stripped source is returned as a
        single entry named ``"(entire file)"``. Empty or whitespace-only input
        yields an empty list.
    """
    source = code.strip()
    if not source:
        return []

    spans: list[tuple[int, int, str]] = []
    for match in _METHOD_HEAD.finditer(source):
        name = match.group(1)
        if name in _NON_METHOD_NAMES:
            continue

        brace_start = source.find("{", match.end())
        if brace_start == -1:
            continue

        # A ';' before the body brace means this was a statement or a
        # body-less declaration, and the brace belongs to something else.
        header = source[match.end():brace_start]
        if ";" in header.split("//", 1)[0]:
            continue

        end = _find_block_end(source, brace_start)
        if end == -1:
            continue

        start = match.start()
        if any(start < taken_end and end > taken_start for taken_start, taken_end, _ in spans):
            continue
        spans.append((start, end, name))

    if not spans:
        return [{"name": "(entire file)", "code": source}]

    spans.sort(key=lambda item: item[0])
    return [
        {"name": name, "code": source[start:end].strip()}
        for start, end, name in spans
    ]
def format_method_summaries(parts: list[tuple[str, str]]) -> str:
    """Render (method name, summary) pairs as a bulleted list.

    Each pair becomes one line of the form ``• name: summary`` with the
    summary stripped of surrounding whitespace. A summary that is empty after
    stripping is shown as ``(no output)``.

    Args:
        parts: Pairs of method name and generated summary text.

    Returns:
        The lines joined with newlines; an empty string when *parts* is empty.
    """
    return "\n".join(
        f"• {name}: {text.strip() or '(no output)'}" for name, text in parts
    )
def split_code_statements(code: str) -> list[str]:
    """Cut code into statement-like fragments.

    The text is split at every ``;``, ``{``, ``}`` and newline. Each piece has
    its whitespace runs collapsed to single spaces and is trimmed; empty
    pieces are dropped. A fragment with fewer than MIN_FRAGMENT_TOKENS
    whitespace-separated tokens is appended to the previous fragment (when
    there is one) instead of standing alone.

    Args:
        code: Source text to split.

    Returns:
        The fragments in order. If nothing survives, a one-element list with
        the whitespace-normalised input (possibly the empty string).
    """
    statements: list[str] = []
    for piece in re.split(r"[;{}\n]", code):
        fragment = re.sub(r"\s+", " ", piece).strip()
        if not fragment:
            continue
        if statements and len(fragment.split()) < MIN_FRAGMENT_TOKENS:
            # Too short to stand alone: glue it onto its predecessor.
            statements[-1] += " " + fragment
        else:
            statements.append(fragment)

    if statements:
        return statements
    return [re.sub(r"\s+", " ", code).strip()]