import json
import os
import re
import sys
import ast
def load_template(template_name):
    """
    Locate and read a template file by name, returning its text content.

    Checks a fixed list of known model directories first; if the template
    is not there, falls back to walking the entire tree rooted at the
    current directory.  Raises FileNotFoundError when nothing matches.
    """
    known_dirs = (
        ".",
        "Regression",
        "Logistic_Regression",
        "Random_Forest",
        "Decision_Tree",
        "ANN",
    )
    # Fast path: probe each known directory directly.
    for directory in known_dirs:
        candidate = os.path.join(directory, template_name)
        if os.path.exists(candidate):
            with open(candidate, "r") as handle:
                return handle.read()
    # Slow path: scan the whole tree for a file with that exact name.
    for root, _dirs, filenames in os.walk("."):
        if template_name in filenames:
            with open(os.path.join(root, template_name), "r") as handle:
                return handle.read()
    raise FileNotFoundError(f"Template '{template_name}' not found in search paths.")
def is_number(s):
    """
    Return True if *s* can be converted to a float, else False.

    Accepts ints, floats, and numeric strings (including "inf"/"nan").
    BUG FIX: also catches TypeError so non-string/non-numeric inputs
    (e.g. None, lists) return False instead of raising.
    """
    try:
        float(s)
        return True
    except (ValueError, TypeError):
        return False
def format_data_map(data_dict):
    """
    Convert the JSON data dictionary into a Python source snippet that
    defines a `data_map` dict compatible with AdvancedAnalytics
    (DT type classes plus value tuples).

    Each value is expected to be [TypeString, RangeSpec], e.g.
    ["DT.Interval", "(0, 100)"] or ["DT.Binary", ["Yes", "No"]].
    The type string (e.g. "DT.Interval") is emitted verbatim, unquoted,
    so it appears as code in the generated file.

    BUG FIXES: the two bare `except:` clauses are narrowed to the
    exceptions actually raised, and keys are quoted with repr() so a key
    containing an apostrophe no longer produces invalid generated code.
    """
    def _numeric(item):
        # True for ints/floats and for strings that parse as floats.
        if isinstance(item, (int, float)):
            return True
        if isinstance(item, str):
            try:
                float(item)
                return True
            except ValueError:
                return False
        return False

    lines = ["data_map = {"]
    for key, value in data_dict.items():
        dtype_str = value[0]  # e.g. "DT.Interval" -- written as code, not a string
        val_range = value[1]  # e.g. "(0, 100)" or ["Yes", "No"]

        # If the range arrived as a string like "('0', '1')", parse it into
        # a real tuple/list so its contents can be normalized below.
        if isinstance(val_range, str) and val_range.strip().startswith(('(', '[')):
            try:
                parsed = ast.literal_eval(val_range)
            except (ValueError, SyntaxError):
                pass  # leave val_range as the raw string
            else:
                if isinstance(parsed, (list, tuple)):
                    val_range = parsed

        if isinstance(val_range, (list, tuple)):
            if all(_numeric(item) for item in val_range):
                # Every item is numeric: emit without quotes, preferring
                # int for whole numbers and falling back to float.
                clean_items = []
                for x in val_range:
                    if isinstance(x, str):
                        try:
                            clean_items.append(int(x))
                        except ValueError:
                            clean_items.append(float(x))
                    else:
                        clean_items.append(x)
                val_tuple_str = "(" + ", ".join(str(x) for x in clean_items) + ")"
            else:
                # Mixed or non-numeric items: force ALL to quoted strings
                # for consistency in the generated code.
                val_tuple_str = "(" + ", ".join(repr(str(x)) for x in val_range) + ")"
        elif isinstance(val_range, str):
            # Unparseable string -- assume it already looks like "(0, 100)".
            val_tuple_str = val_range
        else:
            val_tuple_str = str(val_range)

        # repr() quotes the key safely even if it contains quote characters.
        lines.append(f"    {repr(str(key))}: [{dtype_str}, {val_tuple_str}],")
    lines.append("}")
    return "\n".join(lines)
def generate_code(json_path, output_filename="solution.py", model_override=None):
    """
    Read a JSON "prescription" and generate a runnable Python solution file.

    The prescription supplies the template name, target variable, data file
    path, optional delimiter, and data dictionary.  If *model_override* is
    given, it replaces the template named in the JSON.

    Returns the path to the generated file.
    Raises FileNotFoundError if neither the requested template nor the
    default fallback template can be located.
    """
    print(f"Reading prescription from: {json_path}")
    with open(json_path, "r") as f:
        prescription = json.load(f)

    # 1. Extract details from the prescription.
    model_template = model_override or prescription.get("suggested_model")
    target_var = prescription.get("target_variable")
    data_file = prescription.get("data_file")
    data_dictionary = prescription.get("data_dictionary")
    print(f"Target: {target_var}")
    print(f"Model: {model_template}")
    print(f"Data: {data_file}")

    # 2. Load the template, falling back to a known default if missing.
    try:
        template_code = load_template(model_template)
    except FileNotFoundError:
        print(f"Warning: Template {model_template} not found. Using generic placeholder.")
        template_code = load_template("BinaryRandomForest_Template.py")  # Default fallback

    # 3. Generate the data_map code block.
    data_map_code = format_data_map(data_dictionary)

    # 4. Perform replacements in the template source.
    new_code = template_code

    # Replace the 'data_map = { ... }' block (non-greedy across lines).
    data_map_pattern = r"data_map\s*=\s*\{.*?\}"
    new_code = re.sub(data_map_pattern, data_map_code, new_code, flags=re.DOTALL)

    # Replace the target assignment: target = "..."
    target_pattern = r'target\s*=\s*".*?"'
    new_code = re.sub(target_pattern, f'target = "{target_var}"', new_code)

    # The generated code should ALWAYS use a relative path, regardless of
    # the (possibly absolute, e.g. /tmp/gradio/...) path stored in the JSON.
    filename = "your_data_file.csv"
    if data_file:
        filename = os.path.basename(data_file)

    # Normalize the delimiter; common names map to their literal characters.
    delimiter = prescription.get("delimiter", ",")
    if not delimiter or delimiter == "comma":
        delimiter = ","
    if delimiter.lower() == "tab":
        delimiter = "\t"
    if delimiter.lower() == "space":
        delimiter = " "
    if delimiter.lower() == "semicolon":
        delimiter = ";"

    # Build the read command for the generated code.
    # BUG FIX: `filename` was computed above but never interpolated -- the
    # generated code contained a literal placeholder instead of the actual
    # data file name.  Interpolate it into every branch here.
    if filename.lower().endswith(('.xls', '.xlsx')):
        read_cmd = f'pd.read_excel("{filename}")'
    else:
        if delimiter == ",":
            read_cmd = f'pd.read_csv("{filename}")'
        elif delimiter == "\t":
            # Explicitly emit \t so it appears escaped in the generated code.
            read_cmd = f'pd.read_csv("{filename}", sep="\\t")'
        else:
            # repr() safely encodes special characters (\t, \n, quotes, ...).
            read_cmd = f'pd.read_csv("{filename}", sep={repr(delimiter)})'

    # Escape backslashes so re.sub does not treat them as group references
    # or escapes (e.g. "\\t" would otherwise become a literal tab).
    safe_read_cmd = read_cmd.replace("\\", "\\\\")

    # The template likely has pd.read_csv("...") or pd.read_csv('...').
    # Regex matches: pd.read_csv( + quote + content + matching quote + ).
    read_csv_pattern = r"pd\.read_csv\(([\"']).*?\1\)"
    new_code = re.sub(read_csv_pattern, safe_read_cmd, new_code)
    # Just in case the template was already set to read_excel.
    read_excel_pattern = r"pd\.read_excel\(([\"']).*?\1\)"
    new_code = re.sub(read_excel_pattern, safe_read_cmd, new_code)

    # 5. Write the generated solution to disk.
    with open(output_filename, "w") as f:
        f.write(new_code)
    print(f"Successfully generated: {output_filename}")
    return output_filename
if __name__ == "__main__":
    # Standalone testing: run against the newest project_context_*.json
    # in the current directory (names sort lexicographically, so max()
    # picks the latest timestamped file).
    candidates = [
        name for name in os.listdir('.')
        if name.startswith('project_context_') and name.endswith('.json')
    ]
    if candidates:
        generate_code(max(candidates))
|