File size: 2,797 Bytes
5ae3e12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import re
from typing import Dict, Optional


# Matches `def name(` or `function name(`; the identifier is captured in
# group 1 for the `def` form and in group 2 for the `function` form.
FUNC_RE = re.compile(r"\bdef\s+([a-zA-Z_]\w*)\s*\(|\bfunction\s+([a-zA-Z_]\w*)\s*\(")
# Matches `class Name`; the identifier is captured in group 1.
CLASS_RE = re.compile(r"\bclass\s+([a-zA-Z_]\w*)")
# Matches a triple-quoted string in either quote style. DOTALL lets the body
# span several lines; the body is group 1 (double quotes) or group 2 (single).
DOCSTRING_RE = re.compile(r'"""(.*?)"""|\'\'\'(.*?)\'\'\'', re.DOTALL)
# Matches a line that starts (after optional whitespace) with `#` or `//`.
# Group 1 is the comment marker and group 2 is the text that follows it.
COMMENT_RE = re.compile(r"^\s*(#|//)\s*(.+)$", re.MULTILINE)


def normalize_spaces(text: str) -> str:
    """Return *text* with ``\\n`` line endings and no surrounding whitespace.

    Args:
        text: Raw text; any falsy value (``None``, ``""``) is accepted.

    Returns:
        The text with ``\\r\\n`` and lone ``\\r`` converted to ``\\n`` and
        leading/trailing whitespace removed, or ``""`` for falsy input.
    """
    if text:
        # Collapse Windows endings first so the remaining "\r" are lone ones.
        without_crlf = text.replace("\r\n", "\n")
        unix_text = "\n".join(without_crlf.split("\r"))
        return unix_text.strip()
    return ""


def _first_non_empty(*vals: Optional[str]) -> str:
    """Return the first argument that is non-blank, stripped of whitespace.

    Args:
        *vals: Candidate values, checked in order. Falsy values are skipped
            and the rest are converted with ``str()`` before stripping.

    Returns:
        The stripped text of the first candidate that is not blank, or ``""``
        when every candidate is falsy or whitespace-only.
    """
    # Lazy pipeline: candidates are only stringified until one qualifies.
    stripped_candidates = (str(candidate).strip() for candidate in vals if candidate)
    return next((text for text in stripped_candidates if text), "")


def infer_language(lang: str = "", path: str = "") -> str:
    """Return a lowercase language label for a code snippet.

    An explicit *lang* hint takes precedence. Without one, the label is
    derived from the file extension of *path*.

    Args:
        lang: Optional language hint. ``None`` and whitespace-only values are
            treated as "no hint".
        path: Optional file path whose extension is inspected.

    Returns:
        The stripped, lowercased hint when one is given; otherwise
        ``"python"``, ``"javascript"``, ``"typescript"`` or ``"java"`` for
        ``.py``, ``.js``, ``.ts`` and ``.java`` paths; otherwise ``"code"``.
    """
    # Strip as well as lowercase: a blank hint such as "  " must not be
    # returned as the language, and " Python " must not keep its padding.
    lang = (lang or "").strip().lower()
    if lang:
        return lang

    # Trailing whitespace on the path would otherwise hide the extension.
    path = (path or "").strip().lower()
    suffix_to_language = (
        (".py", "python"),
        (".js", "javascript"),
        (".ts", "typescript"),
        (".java", "java"),
    )
    for suffix, language in suffix_to_language:
        if path.endswith(suffix):
            return language
    return "code"


def extract_function_name(code: str) -> str:
    """Return the name of the first function, or else first class, in *code*.

    A function match (``FUNC_RE``) always wins over a class match
    (``CLASS_RE``), regardless of which one appears first in the text.

    Args:
        code: Source text to scan; falsy input is accepted.

    Returns:
        The matched identifier, or ``""`` when *code* is empty or contains
        neither pattern.
    """
    if code:
        func_match = FUNC_RE.search(code)
        if func_match is not None:
            # Exactly one of the two alternation groups is populated.
            def_name, function_name = func_match.groups()
            return def_name or function_name or ""
        class_match = CLASS_RE.search(code)
        if class_match is not None:
            return class_match.group(1) or ""
    return ""


def extract_doc_or_comment(code: str) -> str:
    """Return a short description pulled from *code*.

    The first triple-quoted string (``DOCSTRING_RE``) is preferred. If there
    is none, or it is blank, the first whole-line ``#`` / ``//`` comment
    (``COMMENT_RE``) is used instead.

    Args:
        code: Source text to scan; falsy input is accepted.

    Returns:
        The stripped docstring or comment text, or ``""`` when neither is
        found.
    """
    if not code:
        return ""
    doc = DOCSTRING_RE.search(code)
    if doc:
        text = _first_non_empty(doc.group(1), doc.group(2))
        # An empty or whitespace-only docstring carries no information, so
        # keep looking rather than returning "" and skipping the comments.
        if text:
            return text
    com = COMMENT_RE.search(code)
    if com:
        return com.group(2).strip()
    return ""


def code_to_instruction(code: str, *, language: str = "", path: str = "", title: str = "") -> str:
    """Build a natural-language instruction that describes *code*.

    The wording depends on which details can be recovered: a function or
    class name, a hint (the *title*, else a docstring/comment found in the
    code), and finally the *path*.

    Args:
        code: Source text the instruction should describe.
        language: Optional explicit language hint.
        path: Optional file path, used for language inference and as a
            last-resort reference in the instruction.
        title: Optional description that takes priority over any docstring
            or comment found in the code.

    Returns:
        A one-sentence instruction string.
    """
    code = normalize_spaces(code)
    lang = infer_language(language, path)
    func = extract_function_name(code)
    doc_hint = extract_doc_or_comment(code)
    hint = _first_non_empty(title, doc_hint)

    if func:
        # A name was found; add the requirements only when a hint exists.
        if hint:
            return f"Write a {lang} implementation of `{func}`. Requirements: {hint}"
        return f"Write a {lang} function `{func}`."

    if hint:
        return f"Implement this {lang} code task: {hint}"

    if not path:
        return f"Write a correct and production-ready {lang} code snippet."
    return f"Implement or refactor the {lang} code from `{path}`."


def build_instruction_sample(
    *,
    instruction: str = "",
    response: str = "",
    code: str = "",
    language: str = "",
    path: str = "",
    title: str = "",
    source: str,
    category: str,
) -> Dict[str, str]:
    """Assemble one instruction/response record.

    Args:
        instruction: Instruction text; when falsy, one is generated from
            *code* via ``code_to_instruction``.
        response: Response text; when falsy, *code* is used instead.
        code: Source text used for both fallbacks.
        language: Optional language hint passed to the generator.
        path: Optional file path passed to the generator.
        title: Optional description passed to the generator.
        source: Value stored unchanged under ``"_source"``.
        category: Value stored unchanged under ``"_category"``.

    Returns:
        A dict with the keys ``"instruction"``, ``"response"``, ``"_source"``
        and ``"_category"``; the two text fields are passed through
        ``normalize_spaces``.
    """
    prompt = instruction or code_to_instruction(
        code, language=language, path=path, title=title
    )
    answer = response or code

    sample: Dict[str, str] = {}
    sample["instruction"] = normalize_spaces(prompt)
    sample["response"] = normalize_spaces(answer)
    sample["_source"] = source
    sample["_category"] = category
    return sample