import re
from typing import Dict, Optional

# Python-style `def name(` or JavaScript-style `function name(`; exactly one
# of the two capture groups is populated per match.
FUNC_RE = re.compile(r"\bdef\s+([a-zA-Z_]\w*)\s*\(|\bfunction\s+([a-zA-Z_]\w*)\s*\(")
CLASS_RE = re.compile(r"\bclass\s+([a-zA-Z_]\w*)")
# Triple-quoted string using either quote style; DOTALL lets it span lines.
DOCSTRING_RE = re.compile(r'"""(.*?)"""|\'\'\'(.*?)\'\'\'', re.DOTALL)
# A whole-line `#` or `//` comment. Only spaces/tabs are allowed around the
# marker (not `\s`) so the match can never run across a newline and capture
# the following line of code; `\S` requires the comment to have real content.
COMMENT_RE = re.compile(r"^[ \t]*(#|//)[ \t]*(\S.*)$", re.MULTILINE)

# Recognised file suffixes and the language name reported for each.
_EXTENSION_LANGUAGES = (
    (".py", "python"),
    (".js", "javascript"),
    (".ts", "typescript"),
    (".java", "java"),
)


def normalize_spaces(text: str) -> str:
    """Normalize line endings to ``\\n`` and strip surrounding whitespace.

    Interior spacing is left untouched. Falsy input (``None`` or ``""``)
    yields an empty string.
    """
    if not text:
        return ""
    return text.replace("\r\n", "\n").replace("\r", "\n").strip()


def _first_non_empty(*vals: Optional[str]) -> str:
    """Return the first value that is non-blank after stripping, else ``""``."""
    for v in vals:
        if v and str(v).strip():
            return str(v).strip()
    return ""


def infer_language(lang: str = "", path: str = "") -> str:
    """Return a lowercase language name for a code sample.

    An explicit ``lang`` wins once stripped and lowercased; a blank or
    whitespace-only ``lang`` is treated as missing. Otherwise the language
    is derived from the suffix of ``path``, and ``"code"`` is returned when
    the suffix is not recognised.
    """
    explicit = (lang or "").strip().lower()
    if explicit:
        return explicit
    lowered_path = (path or "").strip().lower()
    for suffix, name in _EXTENSION_LANGUAGES:
        if lowered_path.endswith(suffix):
            return name
    return "code"


def extract_function_name(code: str) -> str:
    """Return the first function name found in ``code``.

    Falls back to the first class name when no function definition is
    present, and to ``""`` when neither is found.
    """
    if not code:
        return ""
    m = FUNC_RE.search(code)
    if m:
        return m.group(1) or m.group(2) or ""
    c = CLASS_RE.search(code)
    if c:
        return c.group(1) or ""
    return ""


def extract_doc_or_comment(code: str) -> str:
    """Return a short description taken from ``code``.

    The first triple-quoted string is used when it has content; if it is
    empty or absent, the first non-empty ``#`` / ``//`` line comment is used
    instead. Returns ``""`` when nothing usable is found.
    """
    if not code:
        return ""
    doc = DOCSTRING_RE.search(code)
    if doc:
        text = _first_non_empty(doc.group(1), doc.group(2))
        # An empty docstring carries no hint, so keep looking for a comment.
        if text:
            return text
    com = COMMENT_RE.search(code)
    if com:
        return com.group(2).strip()
    return ""


def code_to_instruction(code: str, *, language: str = "", path: str = "", title: str = "") -> str:
    """Build a natural-language instruction describing ``code``.

    The wording depends on what can be recovered: a function/class name, a
    hint (``title`` takes precedence over a docstring or comment found in the
    code), or, failing both, the file ``path``. A generic instruction is
    returned when none of these is available.
    """
    code = normalize_spaces(code)
    lang = infer_language(language, path)
    func = extract_function_name(code)
    hint = _first_non_empty(title, extract_doc_or_comment(code))
    if func and hint:
        return f"Write a {lang} implementation of `{func}`. Requirements: {hint}"
    if func:
        return f"Write a {lang} function `{func}`."
    if hint:
        return f"Implement this {lang} code task: {hint}"
    if path:
        return f"Implement or refactor the {lang} code from `{path}`."
    return f"Write a correct and production-ready {lang} code snippet."
def build_instruction_sample(
    *,
    instruction: str = "",
    response: str = "",
    code: str = "",
    language: str = "",
    path: str = "",
    title: str = "",
    source: str,
    category: str,
) -> Dict[str, str]:
    """Assemble one instruction/response record.

    All arguments are keyword-only; ``source`` and ``category`` have no
    default and must be supplied. When ``instruction`` is empty it is
    generated from ``code`` via ``code_to_instruction``; when ``response`` is
    empty, ``code`` is used as the response. Both text fields are passed
    through ``normalize_spaces`` before being stored.

    Returns:
        A dict with the keys ``instruction``, ``response``, ``_source`` and
        ``_category``.
    """
    prompt = instruction or code_to_instruction(
        code, language=language, path=path, title=title
    )
    answer = response or code

    sample: Dict[str, str] = {}
    sample["instruction"] = normalize_spaces(prompt)
    sample["response"] = normalize_spaces(answer)
    # Provenance fields are stored exactly as given.
    sample["_source"] = source
    sample["_category"] = category
    return sample