mindi-backup / dataset_formatter.py
Mindigenous
Sync latest workspace state: data/scripts updates and archive cleanup
5ae3e12
import re
from typing import Dict, Optional
# Name of the first Python-style `def NAME(` or JavaScript-style
# `function NAME(`; group 1 holds the `def` name, group 2 the `function` name.
FUNC_RE = re.compile(r"\bdef\s+([a-zA-Z_]\w*)\s*\(|\bfunction\s+([a-zA-Z_]\w*)\s*\(")
# Identifier that follows a `class` keyword (group 1).
CLASS_RE = re.compile(r"\bclass\s+([a-zA-Z_]\w*)")
# Body of a triple-quoted string: group 1 for """...""", group 2 for '''...'''.
# DOTALL lets the body span several lines.
DOCSTRING_RE = re.compile(r'"""(.*?)"""|\'\'\'(.*?)\'\'\'', re.DOTALL)
# Text of a whole-line `#` or `//` comment (group 2). Only spaces and tabs are
# accepted around the marker: `\s` also matches newlines, which allowed a bare
# `#` line to capture the following line of code as the comment text.
COMMENT_RE = re.compile(r"^[ \t]*(#|//)[ \t]*(.+)$", re.MULTILINE)
def normalize_spaces(text: str) -> str:
    """Return *text* with LF-only line endings and no surrounding whitespace.

    Falsy input (``None`` or ``""``) yields an empty string. Interior
    whitespace other than carriage returns is left untouched.
    """
    if text:
        # Collapse CRLF first so a pair does not turn into two newlines.
        unified = text.replace("\r\n", "\n")
        unified = unified.replace("\r", "\n")
        return unified.strip()
    return ""
def _first_non_empty(*vals: Optional[str]) -> str:
    """Return the first argument that is non-blank after stripping.

    Falsy arguments are skipped; the rest are converted with ``str`` and
    stripped. Returns ``""`` when no argument qualifies.
    """
    stripped = (str(item).strip() for item in vals if item)
    return next((text for text in stripped if text), "")
def infer_language(lang: str = "", path: str = "") -> str:
    """Pick a language label from an explicit name or a file path.

    Args:
        lang: Explicit language name; when non-empty it wins and is
            returned lower-cased.
        path: File path whose suffix is checked (case-insensitively) when
            no explicit language is given.

    Returns:
        The lower-cased ``lang``, or the label mapped from the path suffix,
        or ``"code"`` when neither identifies a language.
    """
    explicit = (lang or "").lower()
    lowered_path = (path or "").lower()
    if explicit:
        return explicit

    # Checked in order; the first matching suffix decides the label.
    suffix_labels = (
        (".py", "python"),
        (".js", "javascript"),
        (".ts", "typescript"),
        (".java", "java"),
    )
    for suffix, label in suffix_labels:
        if lowered_path.endswith(suffix):
            return label
    return "code"
def extract_function_name(code: str) -> str:
    """Return the first function name in *code*, else the first class name.

    Function definitions are looked up with ``FUNC_RE``; only when none is
    found is ``CLASS_RE`` tried. Returns ``""`` for empty input or when
    neither pattern matches.
    """
    if code:
        # Order matters: a function match takes precedence over a class match.
        for pattern in (FUNC_RE, CLASS_RE):
            found = pattern.search(code)
            if found:
                # Whichever capture group participated holds the identifier.
                return next((name for name in found.groups() if name), "")
    return ""
def extract_doc_or_comment(code: str) -> str:
    """Return a short description taken from a docstring or a line comment.

    The first triple-quoted string matched by ``DOCSTRING_RE`` is preferred.
    If there is none, or its body is blank, the first comment matched by
    ``COMMENT_RE`` is used instead.

    Args:
        code: Source text to inspect.

    Returns:
        The stripped docstring or comment text, or ``""`` when *code* is
        empty or contains neither.
    """
    if not code:
        return ""
    doc = DOCSTRING_RE.search(code)
    if doc:
        doc_text = _first_non_empty(doc.group(1), doc.group(2))
        if doc_text:
            return doc_text
        # A blank docstring carries no information, so keep looking rather
        # than returning "" and hiding a usable comment.
    com = COMMENT_RE.search(code)
    if com:
        return com.group(2).strip()
    return ""
def code_to_instruction(code: str, *, language: str = "", path: str = "", title: str = "") -> str:
    """Build a natural-language instruction describing *code*.

    Args:
        code: Source text the instruction should describe.
        language: Explicit language name, passed to ``infer_language``.
        path: File path, used for language inference and as a last-resort
            reference in the instruction.
        title: Optional description; takes precedence over any docstring or
            comment found in the code.

    Returns:
        An instruction sentence. The wording depends on whether a
        function/class name and a description hint could be determined.
    """
    cleaned = normalize_spaces(code)
    lang = infer_language(language, path)
    name = extract_function_name(cleaned)
    description = _first_non_empty(title, extract_doc_or_comment(cleaned))

    if name:
        if description:
            return f"Write a {lang} implementation of `{name}`. Requirements: {description}"
        return f"Write a {lang} function `{name}`."
    if description:
        return f"Implement this {lang} code task: {description}"
    if path:
        return f"Implement or refactor the {lang} code from `{path}`."
    # Nothing identifiable in the input: fall back to a generic request.
    return f"Write a correct and production-ready {lang} code snippet."
def build_instruction_sample(
    *,
    instruction: str = "",
    response: str = "",
    code: str = "",
    language: str = "",
    path: str = "",
    title: str = "",
    source: str,
    category: str,
) -> Dict[str, str]:
    """Assemble one instruction/response record.

    Args:
        instruction: Instruction text; when empty or whitespace-only it is
            generated from ``code`` via ``code_to_instruction``.
        response: Response text; when empty or whitespace-only ``code`` is
            used instead.
        code: Source text used for both fallbacks.
        language: Forwarded to ``code_to_instruction``.
        path: Forwarded to ``code_to_instruction``.
        title: Forwarded to ``code_to_instruction``.
        source: Stored unchanged under ``"_source"``.
        category: Stored unchanged under ``"_category"``.

    Returns:
        A dict with the keys ``"instruction"``, ``"response"``, ``"_source"``
        and ``"_category"``; the first two are passed through
        ``normalize_spaces``.
    """
    # Normalize before testing for emptiness: a whitespace-only value is
    # truthy, so it used to skip the fallback and then collapse to "".
    instruction = normalize_spaces(instruction)
    if not instruction:
        instruction = normalize_spaces(
            code_to_instruction(code, language=language, path=path, title=title)
        )
    response = normalize_spaces(response) or normalize_spaces(code)
    return {
        "instruction": instruction,
        "response": response,
        "_source": source,
        "_category": category,
    }