Spaces:
Running
Running
| from __future__ import annotations | |
| import re | |
# Fragments with fewer whitespace-separated tokens than this are glued onto
# the preceding fragment by split_code_statements().
MIN_FRAGMENT_TOKENS = 3

# Lowercase sub-tokens that tokenize() discards.
STOPWORDS = {
    # --- Java modifiers ---
    "public", "private", "protected", "static", "final", "abstract",
    "synchronized", "volatile",
    # --- Java declaration keywords ---
    "class", "interface", "extends", "implements", "import", "package",
    # --- Java control flow ---
    "if", "else", "for", "while", "do", "try", "catch", "finally", "throw",
    "throws", "return",
    # --- Java expression keywords and literals ---
    "new", "instanceof", "this", "super", "null", "true", "false",
    # --- Primitive and very common type names (lowercased) ---
    "void", "int", "long", "short", "byte", "char", "float", "double",
    "boolean", "string", "object", "list", "map", "set",
    # --- English articles, conjunctions, prepositions ---
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "of",
    "with", "not", "by", "from", "as",
    # --- English auxiliary verbs ---
    "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
    "does", "did", "will", "would", "could", "should", "may", "might",
    # --- English pronouns, determiners, question words ---
    "it", "its", "that", "which", "who", "when", "where", "how", "all",
    "each", "both", "more",
    # --- Single-letter fragments ---
    "s", "e",
}
# One sub-word of an identifier.  Alternatives, tried in order:
#   1. an uppercase run that is followed by a capitalised word ("HTTP" in
#      "HTTPResponse"),
#   2. a lowercase word with an optional leading capital ("parse", "Response"),
#   3. any remaining uppercase run,
#   4. a run of digits.
_SUBWORD_RE = re.compile(r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+")


def split_identifier(token: str) -> list[str]:
    """Break *token* into lowercase sub-words.

    The token is first cut on underscores and whitespace; every resulting
    chunk is then scanned with ``_SUBWORD_RE``.  Characters the pattern does
    not match (punctuation, for example) are dropped.

    Returns:
        The sub-words in order of appearance, all lowercased.
    """
    return [
        piece.lower()
        for chunk in re.split(r"[_\s]+", token)
        for piece in _SUBWORD_RE.findall(chunk)
        if piece
    ]
def tokenize(text: str) -> list[str]:
    """Turn *text* into a list of lowercase, non-stopword sub-tokens.

    The text is split on whitespace, each chunk is passed through
    ``split_identifier``, and any sub-token found in ``STOPWORDS`` is
    discarded.  Order of appearance is preserved.
    """
    return [
        word
        for chunk in text.split()
        for word in split_identifier(chunk)
        if word and word not in STOPWORDS
    ]
# Matches the head of a Java method/constructor declaration that begins at the
# start of a line, up to and including the "(" that opens the parameter list.
# Group 1 is the method name.
_METHOD_HEAD = re.compile(
    r"(?m)^[ \t]*"  # leading indentation
    r"(?:@(?:\w+\.)*\w+(?:\([^)]*\))?\s+)*"  # zero or more annotations
    r"(?:(?:public|private|protected)\s+)?"  # access modifier
    r"(?:static\s+)?(?:final\s+)?(?:synchronized\s+)?"  # other modifiers
    r"(?:<[^>]+>\s+)?"  # generic type parameters
    r"[\w\[\]<>,.\s?]+\s+"  # type-like text preceding the name
    r"(\w+)\s*"  # method name
    r"\(",
)

# Names the pattern above can capture that are keywords, not method names.
_NON_METHOD_NAMES = frozenset({
    "if", "for", "while", "switch", "catch", "do", "try", "else", "return", "new",
})


def _skip_java_literal(source: str, idx: int) -> int:
    """Return the index just past the string/char literal opening at *idx*.

    ``source[idx]`` must be ``"`` or ``'``.  Backslash escapes are honoured.
    A triple-quoted text block runs to the next unescaped ``\"\"\"``; any other
    literal ends at its closing quote or at the end of the line, so a stray
    quote cannot swallow the rest of the file.  ``len(source)`` is returned
    when the literal is never closed.
    """
    length = len(source)
    if source.startswith('"""', idx):
        pos = idx + 3
        while pos < length:
            if source[pos] == "\\":
                pos += 2
            elif source.startswith('"""', pos):
                return pos + 3
            else:
                pos += 1
        return length

    quote = source[idx]
    pos = idx + 1
    while pos < length:
        ch = source[pos]
        if ch == "\\":
            pos += 2
        elif ch == quote or ch == "\n":
            return pos + 1
        else:
            pos += 1
    return length


def _find_block_end(source: str, brace_start: int) -> int:
    """Return the index just past the ``}`` closing the block at *brace_start*.

    Braces inside string literals, char literals, ``//`` comments and
    ``/* */`` comments are ignored.  Returns ``-1`` when the block is never
    closed.
    """
    depth = 0
    idx = brace_start
    length = len(source)
    while idx < length:
        ch = source[idx]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return idx + 1
        elif ch in "\"'":
            idx = _skip_java_literal(source, idx)
            continue
        elif ch == "/":
            if source.startswith("//", idx):
                newline = source.find("\n", idx)
                if newline == -1:
                    return -1
                idx = newline
            elif source.startswith("/*", idx):
                close = source.find("*/", idx + 2)
                if close == -1:
                    return -1
                idx = close + 1
        idx += 1
    return -1


def split_java_methods(code: str) -> list[dict[str, str]]:
    """Split Java source text into its outermost method bodies.

    Each candidate found by ``_METHOD_HEAD`` is extended from its first ``{``
    to the matching ``}``.  Candidates that start inside an already accepted
    method (nested or anonymous-class methods, call statements) are skipped.

    Args:
        code: Java source text; surrounding whitespace is ignored.

    Returns:
        ``[]`` for blank input.  Otherwise one ``{"name", "code"}`` dict per
        detected method, in source order, or a single entry named
        ``"(entire file)"`` holding the stripped source when no method is
        detected.
    """
    source = code.strip()
    if not source:
        return []

    spans: list[tuple[int, int, str]] = []
    # finditer yields matches in ascending order and accepted spans never
    # overlap, so comparing against the end of the latest accepted span is
    # enough to detect overlap with any of them.
    last_end = 0
    for match in _METHOD_HEAD.finditer(source):
        name = match.group(1)
        if name in _NON_METHOD_NAMES:
            continue
        start = match.start()
        if start < last_end:
            continue
        brace_start = source.find("{", match.end())
        if brace_start == -1:
            continue
        # A ";" between "(" and the next "{" (outside a // comment) means this
        # is a statement or a body-less declaration, not a definition.
        header = source[match.end():brace_start]
        if ";" in header.split("//", 1)[0]:
            continue
        end = _find_block_end(source, brace_start)
        if end == -1:
            continue
        spans.append((start, end, name))
        last_end = end

    if not spans:
        return [{"name": "(entire file)", "code": source}]
    return [
        {"name": name, "code": source[start:end].strip()}
        for start, end, name in spans
    ]
def format_method_summaries(parts: list[tuple[str, str]]) -> str:
    """Render ``(name, text)`` pairs as a newline-separated bullet list.

    Each pair becomes ``"• name: text"`` with the text stripped of
    surrounding whitespace; a pair whose text is blank is rendered with the
    placeholder ``(no output)``.  An empty *parts* list yields ``""``.
    """
    bullets = (
        f"• {name}: {text.strip() or '(no output)'}"
        for name, text in parts
    )
    return "\n".join(bullets)
def split_code_statements(code: str) -> list[str]:
    """Cut *code* into whitespace-normalised statement fragments.

    The text is split at every ``;``, ``{``, ``}`` and newline.  Each piece
    has its internal whitespace collapsed to single spaces and is dropped if
    nothing remains.  A fragment with fewer than ``MIN_FRAGMENT_TOKENS``
    whitespace-separated tokens is appended to the previous fragment (when
    there is one) instead of standing alone.

    Returns:
        The fragments in order.  When no fragment survives, a one-element
        list holding the whitespace-normalised input.
    """
    statements: list[str] = []
    for piece in re.split(r"[;{}\n]", code):
        fragment = re.sub(r"\s+", " ", piece).strip()
        if not fragment:
            continue
        too_short = len(fragment.split()) < MIN_FRAGMENT_TOKENS
        if statements and too_short:
            # Too small to be meaningful alone: attach to its predecessor.
            statements[-1] = f"{statements[-1]} {fragment}"
        else:
            statements.append(fragment)

    if statements:
        return statements
    return [re.sub(r"\s+", " ", code).strip()]