| import re |
| from typing import Dict, Optional |
|
|
|
|
# Matches a Python-style ``def name(`` (name in group 1) or a
# JavaScript-style ``function name(`` (name in group 2).
FUNC_RE = re.compile(r"\bdef\s+([a-zA-Z_]\w*)\s*\(|\bfunction\s+([a-zA-Z_]\w*)\s*\(")

# Matches ``class Name``; the class name is group 1.
CLASS_RE = re.compile(r"\bclass\s+([a-zA-Z_]\w*)")

# Matches a triple-quoted string; the body is group 1 for double quotes and
# group 2 for single quotes. DOTALL lets the body span several lines.
DOCSTRING_RE = re.compile(r'"""(.*?)"""|\'\'\'(.*?)\'\'\'', re.DOTALL)

# Matches a whole-line ``#`` or ``//`` comment; the marker is group 1 and the
# comment text is group 2.
# - ``[ \t]*`` (rather than ``\s*``) keeps the match on a single line, so an
#   empty ``#`` line can no longer swallow the line that follows it.
# - ``(\S.*)`` requires real text, so blank comments are skipped entirely.
# - ``#(?!!)`` skips ``#!`` lines (shebangs), which are not descriptions.
COMMENT_RE = re.compile(r"^[ \t]*(#(?!!)|//)[ \t]*(\S.*)$", re.MULTILINE)
|
|
|
|
def normalize_spaces(text: str) -> str:
    """Return *text* with Unix line endings and no surrounding whitespace.

    Args:
        text: The string to clean up. Any falsy value (``""``, ``None``)
            is treated as empty.

    Returns:
        The text with every ``\\r\\n`` and lone ``\\r`` replaced by ``\\n``
        and leading/trailing whitespace stripped, or ``""`` for falsy input.
    """
    if text:
        # Handle CRLF first so it does not turn into two newlines.
        unix_text = text.replace("\r\n", "\n")
        unix_text = unix_text.replace("\r", "\n")
        return unix_text.strip()
    return ""
|
|
|
|
| def _first_non_empty(*vals: Optional[str]) -> str: |
| for v in vals: |
| if v and str(v).strip(): |
| return str(v).strip() |
| return "" |
|
|
|
|
# Ordered (suffix, language) pairs consulted when no explicit language is given.
_SUFFIX_LANGUAGES = (
    (".py", "python"),
    (".js", "javascript"),
    (".ts", "typescript"),
    (".java", "java"),
)


def infer_language(lang: str = "", path: str = "") -> str:
    """Pick a language label from an explicit hint or a file path.

    Args:
        lang: Explicit language name. When it contains anything other than
            whitespace it wins, and is returned trimmed and lower-cased.
        path: File path whose suffix is used when ``lang`` is blank. The
            comparison is case-insensitive and ignores surrounding whitespace.

    Returns:
        The trimmed, lower-cased ``lang``; otherwise the language mapped to
        the path suffix; otherwise the generic label ``"code"``.
    """
    # Strip before testing so a padded or whitespace-only hint is not
    # returned verbatim (it would end up inside generated sentences).
    explicit = (lang or "").strip().lower()
    if explicit:
        return explicit

    lowered_path = (path or "").strip().lower()
    for suffix, language in _SUFFIX_LANGUAGES:
        if lowered_path.endswith(suffix):
            return language
    return "code"
|
|
|
|
# Keywords that may legitimately precede ``class`` on a declaration line.
_CLASS_MODIFIERS = frozenset(
    {"export", "default", "abstract", "public", "private", "protected", "final", "static"}
)


def _is_declaration_prefix(line_prefix: str) -> bool:
    """Return True when *line_prefix* is blank or holds only class modifiers."""
    return all(word in _CLASS_MODIFIERS for word in line_prefix.split())


def extract_function_name(code: str) -> str:
    """Return the name of the leading function or class definition in *code*.

    A class declared before the first function is preferred over that
    function, so a class with methods is reported by its own name rather
    than by its first method. To avoid picking up the word "class" from
    comments or docstrings, a class only takes precedence when the keyword
    starts its line (optionally after modifiers such as ``export``).

    Args:
        code: Source text to scan. Falsy input yields ``""``.

    Returns:
        The class name when a class declaration precedes the first function,
        otherwise the first name matched by ``FUNC_RE``; when no function is
        found, the first name matched by ``CLASS_RE``; ``""`` if neither
        pattern matches.
    """
    if not code:
        return ""

    func_match = FUNC_RE.search(code)
    if func_match is None:
        # No function at all: fall back to the first class-like match.
        class_match = CLASS_RE.search(code)
        return (class_match.group(1) or "") if class_match else ""

    for class_match in CLASS_RE.finditer(code):
        if class_match.start() > func_match.start():
            break  # Every remaining class comes after the first function.
        line_start = code.rfind("\n", 0, class_match.start()) + 1
        if _is_declaration_prefix(code[line_start:class_match.start()]):
            return class_match.group(1) or ""

    # Group 1 is the ``def`` form, group 2 the ``function`` form.
    return func_match.group(1) or func_match.group(2) or ""
|
|
|
|
def extract_doc_or_comment(code: str) -> str:
    """Return the first non-blank docstring or line comment found in *code*.

    Triple-quoted strings (``DOCSTRING_RE``) are tried first, in order of
    appearance; line comments (``COMMENT_RE``) are tried only when no
    triple-quoted string has any content.

    Args:
        code: Source text to scan. Falsy input yields ``""``.

    Returns:
        The stripped text of the first non-blank match, or ``""`` when the
        code has no usable docstring or comment.
    """
    if not code:
        return ""

    # Keep scanning instead of stopping at the first match: an empty
    # docstring such as ``\"\"\" \"\"\"`` must not hide later documentation.
    for doc_match in DOCSTRING_RE.finditer(code):
        doc_text = _first_non_empty(doc_match.group(1), doc_match.group(2))
        if doc_text:
            return doc_text

    for comment_match in COMMENT_RE.finditer(code):
        comment_text = comment_match.group(2).strip()
        if comment_text:
            return comment_text
    return ""
|
|
|
|
def code_to_instruction(code: str, *, language: str = "", path: str = "", title: str = "") -> str:
    """Build a natural-language instruction describing *code*.

    Args:
        code: Source text the instruction should describe.
        language: Optional explicit language name, passed to
            ``infer_language``.
        path: Optional file path, used to infer the language and as a last
            resort to describe the task.
        title: Optional human-written title; it takes precedence over any
            docstring or comment extracted from the code.

    Returns:
        A one-sentence instruction. The most specific template available is
        used, in this order: definition name plus hint, name only, hint
        only, path only, and finally a generic request.
    """
    code = normalize_spaces(code)
    lang = infer_language(language, path)
    func = extract_function_name(code)
    hint = _first_non_empty(title, extract_doc_or_comment(code))

    # ``infer_language`` falls back to the literal "code"; without this the
    # templates below would read "code code task" / "code code snippet".
    lang_code = lang if lang == "code" else f"{lang} code"

    if func and hint:
        return f"Write a {lang} implementation of `{func}`. Requirements: {hint}"
    if func:
        return f"Write a {lang} function `{func}`."
    if hint:
        return f"Implement this {lang_code} task: {hint}"
    if path:
        return f"Implement or refactor the {lang_code} from `{path}`."
    return f"Write a correct and production-ready {lang_code} snippet."
|
|
|
|
def build_instruction_sample(
    *,
    instruction: str = "",
    response: str = "",
    code: str = "",
    language: str = "",
    path: str = "",
    title: str = "",
    source: str,
    category: str,
) -> Dict[str, str]:
    """Assemble one instruction/response record.

    Args:
        instruction: Ready-made instruction text. When blank, one is
            generated from ``code`` via ``code_to_instruction``.
        response: Ready-made response text. When blank, ``code`` is used.
        code: Source code backing the sample.
        language: Optional explicit language name for the generated
            instruction.
        path: Optional file path for the generated instruction.
        title: Optional title for the generated instruction.
        source: Value stored under the ``"_source"`` key.
        category: Value stored under the ``"_category"`` key.

    Returns:
        A dict with the normalized ``"instruction"`` and ``"response"``
        texts plus the ``"_source"`` and ``"_category"`` values as given.
    """
    # Normalize before testing for emptiness so a whitespace-only value
    # triggers the fallback instead of producing an empty field.
    instruction = normalize_spaces(instruction)
    if not instruction:
        instruction = normalize_spaces(
            code_to_instruction(code, language=language, path=path, title=title)
        )
    response = normalize_spaces(response) or normalize_spaces(code)
    return {
        "instruction": instruction,
        "response": response,
        "_source": source,
        "_category": category,
    }
|
|
|
|