File size: 8,277 Bytes
8643b59
 
 
 
4394ee2
8643b59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3be90dc
 
 
 
 
 
 
8643b59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4394ee2
 
 
 
 
 
 
 
 
 
 
 
3be90dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8643b59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ece7dd
8643b59
 
6ece7dd
8643b59
 
 
 
 
 
 
6ece7dd
8643b59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4394ee2
8643b59
 
 
 
 
 
 
4394ee2
 
 
 
 
 
 
 
 
 
8643b59
4394ee2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a628163
 
 
 
 
 
 
 
 
 
 
 
8643b59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import json
import os
import re
import sys
import ast

def load_template(template_name):
    """
    Locate and read a template file, returning its contents as a string.

    Lookup order: a fixed list of known model directories first, then a
    recursive walk of the current tree as a last resort.

    Raises:
        FileNotFoundError: if no file named ``template_name`` exists anywhere.
    """
    known_dirs = (
        ".",
        "Regression",
        "Logistic_Regression",
        "Random_Forest",
        "Decision_Tree",
        "ANN",
    )

    for directory in known_dirs:
        candidate = os.path.join(directory, template_name)
        if os.path.exists(candidate):
            with open(candidate, "r") as fh:
                return fh.read()

    # Last resort: scan the whole directory tree for a matching filename.
    for root, _subdirs, filenames in os.walk("."):
        if template_name in filenames:
            with open(os.path.join(root, template_name), "r") as fh:
                return fh.read()

    raise FileNotFoundError(f"Template '{template_name}' not found in search paths.")

def is_number(s):
    """
    Return True if *s* can be converted with ``float()``, False otherwise.

    Accepts numbers and numeric strings (including "inf"/"nan").  Returns
    False both for non-numeric strings and for values float() cannot take
    at all (e.g. None, lists) -- previously those raised TypeError because
    only ValueError was caught.
    """
    try:
        float(s)
        return True
    except (ValueError, TypeError):
        # ValueError: non-numeric string; TypeError: unsupported input type.
        return False

def format_data_map(data_dict):
    """
    Convert the JSON data dictionary into the source text of a Python
    ``data_map`` dict literal compatible with AdvancedAnalytics (DT type
    constants plus value tuples).

    Each entry of *data_dict* maps a column name to a two-item list
    ``[type_string, value_range]`` where ``type_string`` is e.g.
    "DT.Interval" (emitted verbatim, unquoted) and ``value_range`` is either
    a tuple/list of values or their string representation, e.g. "(0, 100)".

    Returns:
        str: a multi-line ``data_map = {...}`` code block.
    """

    def _looks_numeric(value):
        # True when the item is already a number, or a string float() accepts.
        if isinstance(value, (int, float)):
            return True
        if isinstance(value, str):
            try:
                float(value)
                return True
            except ValueError:
                return False
        return False

    lines = ["data_map = {"]

    for key, value in data_dict.items():
        dtype_str = value[0]   # e.g. "DT.Interval" -- written as bare code
        val_range = value[1]   # e.g. "(0, 100)" or ["Yes", "No"]

        # If the range arrived as a string like "(0, 100)", try to parse it
        # into a real tuple/list so items can be normalized below.
        if isinstance(val_range, str) and val_range.strip().startswith(('(', '[')):
            try:
                parsed = ast.literal_eval(val_range)
                if isinstance(parsed, (list, tuple)):
                    val_range = parsed
            except (ValueError, SyntaxError):
                # Not a valid Python literal -- keep the raw string as-is.
                # (Was a bare except; narrowed to what literal_eval raises.)
                pass

        if isinstance(val_range, (list, tuple)):
            if all(_looks_numeric(item) for item in val_range):
                # Emit numbers unquoted; numeric strings become int where
                # possible, float otherwise (e.g. "1.5").
                clean_items = []
                for x in val_range:
                    if isinstance(x, str):
                        try:
                            clean_items.append(int(x))
                        except ValueError:
                            clean_items.append(float(x))
                    else:
                        clean_items.append(x)
                val_tuple_str = "(" + ", ".join(str(x) for x in clean_items) + ")"
            else:
                # Mixed or non-numeric items: force every item to a quoted
                # string so the generated tuple is type-consistent.
                val_tuple_str = "(" + ", ".join(repr(str(x)) for x in val_range) + ")"
        elif isinstance(val_range, str):
            # Plain/unparseable string form -- trust it to already be code.
            val_tuple_str = val_range
        else:
            val_tuple_str = str(val_range)

        # repr() quotes the key safely even if it contains apostrophes
        # (the old manual f"'{key}'" broke on such keys).
        lines.append(f"    {repr(str(key))}: [{dtype_str}, {val_tuple_str}],")

    lines.append("}")
    return "\n".join(lines)

def generate_code(json_path, output_filename="solution.py", model_override=None):
    """
    Read a JSON "prescription" and generate a runnable Python solution file.

    The prescription supplies the template name, target variable, data file
    and data dictionary; the matching template is loaded and its data_map,
    target and data-read statements are rewritten to match the prescription.

    Args:
        json_path: path to the prescription JSON file.
        output_filename: path of the generated script (default "solution.py").
        model_override: template filename to use instead of the JSON's
            "suggested_model", if given.

    Returns:
        str: the path to the generated file (``output_filename``).
    """
    print(f"Reading prescription from: {json_path}")
    with open(json_path, "r") as f:
        prescription = json.load(f)

    # 1. Extract details from the prescription.
    model_template = model_override or prescription.get("suggested_model")
    target_var = prescription.get("target_variable")
    data_file = prescription.get("data_file")
    data_dictionary = prescription.get("data_dictionary")

    print(f"Target: {target_var}")
    print(f"Model: {model_template}")
    print(f"Data: {data_file}")

    # 2. Load the template, falling back to a default when the name is wrong.
    try:
        template_code = load_template(model_template)
    except FileNotFoundError:
        print(f"Warning: Template {model_template} not found. Using generic placeholder.")
        template_code = load_template("BinaryRandomForest_Template.py")  # default fallback

    # 3. Build the replacement data_map code block.
    data_map_code = format_data_map(data_dictionary)

    new_code = template_code

    # 4a. Replace the template's data_map literal (matched non-greedily
    # across lines up to the first closing brace).
    data_map_pattern = r"data_map\s*=\s*\{.*?\}"
    new_code = re.sub(data_map_pattern, data_map_code, new_code, flags=re.DOTALL)

    # 4b. Replace the target assignment: target = "..."
    target_pattern = r'target\s*=\s*".*?"'
    new_code = re.sub(target_pattern, f'target = "{target_var}"', new_code)

    # 4c. Replace the data-file read.  The generated code must ALWAYS use a
    # relative path regardless of what path was in the JSON (which might be
    # something like /tmp/gradio/...), so keep only the basename.
    filename = "your_data_file.csv"
    if data_file:
        filename = os.path.basename(data_file)

    delimiter = prescription.get("delimiter", ",")
    if not delimiter or delimiter == "comma":
        delimiter = ","
    # Normalize common delimiter names to their literal characters.
    if delimiter.lower() == "tab":
        delimiter = "\t"
    if delimiter.lower() == "space":
        delimiter = " "
    if delimiter.lower() == "semicolon":
        delimiter = ";"

    # Build the pandas read call for the generated code.  BUG FIX: the
    # basename computed above is now interpolated into the command -- the
    # previous code emitted a constant placeholder string instead of
    # {filename}, so `filename` was computed but never used and every
    # generated script pointed at a nonexistent file.
    if filename.lower().endswith(('.xls', '.xlsx')):
        read_cmd = f'pd.read_excel("{filename}")'
    elif delimiter == ",":
        read_cmd = f'pd.read_csv("{filename}")'
    elif delimiter == "\t":
        # Explicitly handle tab so it appears as \t in the generated code.
        read_cmd = f'pd.read_csv("{filename}", sep="\\t")'
    else:
        # repr() safely encodes special characters (quotes included).
        read_cmd = f'pd.read_csv("{filename}", sep={repr(delimiter)})'

    # Escape backslashes so re.sub does not treat them as escapes or group
    # references (e.g. \t turning into a literal tab in the output).
    safe_read_cmd = read_cmd.replace("\\", "\\\\")

    # The template normally has pd.read_csv("...") or pd.read_csv('...');
    # match the opening quote and require the same quote to close.
    read_csv_pattern = r"pd\.read_csv\(([\"']).*?\1\)"
    new_code = re.sub(read_csv_pattern, safe_read_cmd, new_code)

    # Also handle templates that were already set up for Excel input.
    read_excel_pattern = r"pd\.read_excel\(([\"']).*?\1\)"
    new_code = re.sub(read_excel_pattern, safe_read_cmd, new_code)

    # 5. Write the generated script.
    with open(output_filename, "w") as f:
        f.write(new_code)

    print(f"Successfully generated: {output_filename}")
    return output_filename

if __name__ == "__main__":
    # Standalone smoke test: run against the newest project-context JSON.
    candidates = [
        name for name in os.listdir('.')
        if name.startswith('project_context_') and name.endswith('.json')
    ]
    if candidates:
        # Lexicographically greatest name == head of a reverse sort.
        generate_code(max(candidates))