Spaces:

donut12345
/

java_summarization

Running

java_summarization / engine /preprocessing.py

dunkindonuts123

NLP java summarization experiment

4465cb6 6 days ago

4.02 kB

	from __future__ import annotations

	import re

	# Code fragments with fewer whitespace-separated tokens than this are considered
	# too short to stand alone when statements are split.
	MIN_FRAGMENT_TOKENS = 3

	# Lowercase tokens that are discarded during tokenization.
	STOPWORDS = {
	    # Java keywords, modifiers and literals.
	    "abstract", "catch", "class", "do", "else", "extends", "false", "final",
	    "finally", "for", "if", "implements", "import", "instanceof", "interface",
	    "new", "null", "package", "private", "protected", "public", "return",
	    "static", "super", "synchronized", "this", "throw", "throws", "true",
	    "try", "void", "volatile", "while",
	    # Primitive and very common type names.
	    "boolean", "byte", "char", "double", "float", "int", "list", "long",
	    "map", "object", "set", "short", "string",
	    # English function words.
	    "a", "all", "an", "and", "are", "as", "at", "be", "been", "being",
	    "both", "but", "by", "could", "did", "does", "each", "from", "had",
	    "has", "have", "how", "in", "is", "it", "its", "may", "might", "more",
	    "not", "of", "on", "or", "should", "that", "the", "to", "was", "were",
	    "when", "where", "which", "who", "will", "with", "would",
	    # Single-letter fragments.
	    "e", "s",
	}

	# Matches one sub-word of a camelCase / PascalCase identifier. Alternatives are
	# tried in this order:
	#   1. a run of capitals that is followed by a capitalized word
	#      ("HTTP" in "HTTPResponse"),
	#   2. an optionally capitalized lowercase word ("get", "Response"),
	#   3. any remaining run of capitals ("URL" in "parseURL"),
	#   4. a run of digits.
	_SUBWORD_RE = re.compile(r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+")


	def split_identifier(token: str) -> list[str]:
	    """Break an identifier into its lowercase sub-words.

	    The token is first split on underscores and whitespace; each piece is then
	    split on case and letter/digit boundaries. Characters that are neither
	    ASCII letters nor digits (punctuation, operators) are dropped.

	    Args:
	        token: An identifier or other short piece of text, e.g.
	            ``"getHTTPResponse2"`` or ``"MAX_VALUE"``.

	    Returns:
	        The sub-words in order of appearance, lowercased. Empty when the token
	        contains no ASCII letters or digits.
	    """
	    return [
	        word.lower()
	        for part in re.split(r"[_\s]+", token)
	        for word in _SUBWORD_RE.findall(part)
	    ]


	def tokenize(text: str) -> list[str]:
	    """Convert text into lowercase sub-word tokens with stopwords removed.

	    The text is split on whitespace, every chunk is broken into sub-words by
	    ``split_identifier``, and any sub-word listed in ``STOPWORDS`` is discarded.

	    Args:
	        text: Source code or natural-language text.

	    Returns:
	        The surviving tokens in their original order.
	    """
	    kept: list[str] = []
	    for word in text.split():
	        kept.extend(
	            piece
	            for piece in split_identifier(word)
	            if piece and piece not in STOPWORDS
	        )
	    return kept


	# Heuristic pattern for the start of a Java method declaration. Anchored at the
	# beginning of a line, it accepts in order: indentation, any number of
	# annotations (optionally package-qualified and optionally with an argument
	# list), an access modifier, `static` / `final` / `synchronized`, a generic
	# type-parameter list, the return type, and the method name (capture group 1)
	# followed by the opening parenthesis.
	_METHOD_HEAD = re.compile(
	    r"(?m)^[ \t]*"
	    r"(?:@(?:\w+\.)*\w+(?:\([^)]*\))?\s+)*"
	    r"(?:(?:public|private|protected)\s+)?"
	    r"(?:static\s+)?(?:final\s+)?(?:synchronized\s+)?"
	    r"(?:<[^>]+>\s+)?"
	    r"[\w\[\]<>,.\s?]+\s+"
	    r"(\w+)\s*"
	    r"\(",
	)

	# Keywords the pattern above can capture in the "method name" position
	# (for example the `if` in `else if (...)`); such matches are not methods.
	_NON_METHOD_NAMES = frozenset({
	    "if", "for", "while", "switch", "catch", "do", "try", "else", "return", "new",
	})


	def _skip_quoted(source: str, start: int) -> int:
	    """Return the index just past the string/char literal opening at ``start``.

	    Backslash escapes are honoured. An unterminated literal ends at the next
	    newline so that a stray quote cannot swallow the rest of the file.
	    """
	    quote = source[start]
	    size = len(source)
	    idx = start + 1
	    while idx < size:
	        ch = source[idx]
	        if ch == "\\":
	            idx += 2  # skip the escaped character as well
	            continue
	        if ch == quote or ch == "\n":
	            return idx + 1
	        idx += 1
	    return size


	def _find_block_end(source: str, brace_start: int) -> int:
	    """Return the index just past the brace matching ``source[brace_start]``.

	    Braces inside string literals, char literals, text blocks and comments are
	    ignored. Returns -1 when the block is never closed.
	    """
	    depth = 0
	    size = len(source)
	    idx = brace_start
	    while idx < size:
	        ch = source[idx]
	        if source.startswith('"""', idx):
	            # Java text block: runs to the next triple quote.
	            close = source.find('"""', idx + 3)
	            if close == -1:
	                return -1
	            idx = close + 3
	        elif ch == '"' or ch == "'":
	            idx = _skip_quoted(source, idx)
	        elif source.startswith("//", idx):
	            newline = source.find("\n", idx)
	            if newline == -1:
	                return -1
	            idx = newline + 1
	        elif source.startswith("/*", idx):
	            close = source.find("*/", idx + 2)
	            if close == -1:
	                return -1
	            idx = close + 2
	        else:
	            if ch == "{":
	                depth += 1
	            elif ch == "}":
	                depth -= 1
	                if depth == 0:
	                    return idx + 1
	            idx += 1
	    return -1


	def split_java_methods(code: str) -> list[dict[str, str]]:
	    """Split Java source text into its outermost method definitions.

	    Method heads are located with ``_METHOD_HEAD``; each body extends from the
	    head to the brace that closes the first ``{`` after it. Candidates are
	    skipped when the captured name is a control-flow keyword, when a ``;``
	    appears between the parameter list and the brace (a declaration or call
	    without a body), when the braces never balance, or when the candidate
	    starts inside a method that was already accepted.

	    Args:
	        code: Java source, e.g. a whole file or a class body.

	    Returns:
	        One ``{"name": ..., "code": ...}`` dict per method in source order.
	        If no method is found, a single entry named ``"(entire file)"``
	        holding the stripped input. An empty list for blank input.
	    """
	    source = code.strip()
	    if not source:
	        return []

	    methods: list[dict[str, str]] = []
	    # End offset of the most recently accepted method. Matches arrive in
	    # increasing start order, so a candidate overlaps an accepted method
	    # exactly when it starts before this offset.
	    covered_until = 0

	    for match in _METHOD_HEAD.finditer(source):
	        name = match.group(1)
	        if name in _NON_METHOD_NAMES:
	            continue

	        start = match.start()
	        if start < covered_until:
	            continue

	        brace_start = source.find("{", match.end())
	        if brace_start == -1:
	            # No brace after this head means none after any later head either.
	            break

	        # Text between "(" and "{"; a ";" outside a trailing line comment
	        # means the statement ended before any body began.
	        header = source[match.end():brace_start]
	        if ";" in header.split("//", 1)[0]:
	            continue

	        end = _find_block_end(source, brace_start)
	        if end == -1:
	            continue

	        methods.append({"name": name, "code": source[start:end].strip()})
	        covered_until = end

	    if methods:
	        return methods

	    return [{"name": "(entire file)", "code": source}]


	def format_method_summaries(parts: list[tuple[str, str]]) -> str:
	    """Render per-method summaries as a bulleted, newline-separated list.

	    Args:
	        parts: ``(method name, summary text)`` pairs in display order.

	    Returns:
	        One ``• name: text`` line per pair, joined with newlines. A summary
	        that is empty after stripping whitespace is shown as ``(no output)``.
	        An empty string when ``parts`` is empty.
	    """
	    stripped = ((name, text.strip()) for name, text in parts)
	    return "\n".join(
	        f"• {name}: {body}" if body else f"• {name}: (no output)"
	        for name, body in stripped
	    )


	def split_code_statements(code: str) -> list[str]:
	    """Cut code into statement-like fragments.

	    The text is split at ``;``, ``{``, ``}`` and newlines. Each piece has its
	    whitespace collapsed to single spaces and empty pieces are dropped. A piece
	    with fewer than ``MIN_FRAGMENT_TOKENS`` whitespace-separated tokens is
	    appended to the preceding fragment, unless it is the first one.

	    Args:
	        code: Source text to split.

	    Returns:
	        The fragments in source order. If nothing survives, a one-element list
	        holding the whitespace-normalized input.
	    """
	    pieces = (
	        re.sub(r"\s+", " ", chunk).strip()
	        for chunk in re.split(r"[;{}\n]", code)
	    )

	    statements: list[str] = []
	    for piece in pieces:
	        if not piece:
	            continue
	        too_short = len(piece.split()) < MIN_FRAGMENT_TOKENS
	        if too_short and statements:
	            # Glue short leftovers onto the statement they trail.
	            statements[-1] = statements[-1] + " " + piece
	        else:
	            statements.append(piece)

	    if statements:
	        return statements
	    return [re.sub(r"\s+", " ", code).strip()]