# dr_jones / code_generator.py
# anly656's picture
# Upload 2 files
# 6ece7dd verified
import json
import os
import re
import sys
import ast
def load_template(template_name):
    """
    Locate *template_name* and return its full text.

    The file is searched first in a fixed list of known model directories
    (in order), then — as a fallback — anywhere under the current working
    directory via a full tree walk.

    Raises FileNotFoundError if no file with that name exists anywhere.
    """
    known_dirs = (
        ".",
        "Regression",
        "Logistic_Regression",
        "Random_Forest",
        "Decision_Tree",
        "ANN",
    )
    for candidate in (os.path.join(d, template_name) for d in known_dirs):
        if os.path.exists(candidate):
            with open(candidate, "r") as handle:
                return handle.read()
    # Fallback: scan the entire tree for any file with the requested name.
    for root, _dirs, filenames in os.walk("."):
        if template_name not in filenames:
            continue
        with open(os.path.join(root, template_name), "r") as handle:
            return handle.read()
    raise FileNotFoundError(f"Template '{template_name}' not found in search paths.")
def is_number(s):
    """
    Return True if *s* parses as a float.

    Fix: also catches TypeError so non-string, non-numeric inputs
    (e.g. None, lists) return False instead of raising.
    """
    try:
        float(s)
        return True
    except (ValueError, TypeError):
        return False


def _render_range_tuple(val_range):
    """
    Render a list/tuple of range items as tuple source code, e.g. "(0, 100)".

    Numeric items (including numeric-looking strings, which are converted to
    int where possible, else float) are emitted unquoted. If ANY item is
    non-numeric, every item is emitted as a quoted string so the generated
    tuple is homogeneous.
    """
    all_numbers = all(
        isinstance(item, (int, float))
        or (isinstance(item, str) and is_number(item))
        for item in val_range
    )
    if all_numbers:
        clean_items = []
        for x in val_range:
            if isinstance(x, str):
                try:
                    clean_items.append(int(x))
                except ValueError:
                    # Not an integer literal (e.g. "0.5"); fall back to float.
                    clean_items.append(float(x))
            else:
                clean_items.append(x)
        rendered = ", ".join(str(x) for x in clean_items)
    else:
        # Force ALL items to quoted strings for consistency with mixed types.
        rendered = ", ".join(repr(str(x)) for x in val_range)
    # BUG FIX: a one-element tuple needs a trailing comma; without it the
    # generated code would be a parenthesized scalar, e.g. "(5)" not "(5,)".
    if len(val_range) == 1:
        rendered += ","
    return "(" + rendered + ")"


def format_data_map(data_dict):
    """
    Convert the JSON data dictionary into the source text of a Python
    dictionary compatible with AdvancedAnalytics (DT class and tuples).

    Each value is expected to be [type_string, range] where range is a
    tuple/list or its string representation, e.g.
    ["DT.Interval", "(0, 100)"] or ["DT.Binary", ["Yes", "No"]].

    Returns the complete ``data_map = {...}`` block as one string.
    """
    lines = ["data_map = {"]
    for key, value in data_dict.items():
        dtype_str = value[0]   # e.g. "DT.Interval" — written verbatim, unquoted
        val_range = value[1]   # "(0, 100)" or ["Yes", "No"]

        # If the range arrived as a string like "('0', '1')", try to parse it
        # into a real tuple/list so the items can be normalized below.
        if isinstance(val_range, str) and val_range.strip().startswith(('(', '[')):
            try:
                parsed = ast.literal_eval(val_range)
                if isinstance(parsed, (list, tuple)):
                    val_range = parsed
            except (ValueError, SyntaxError):
                # Not valid literal syntax; keep the raw string untouched.
                pass

        if isinstance(val_range, (list, tuple)):
            val_tuple_str = _render_range_tuple(val_range)
        elif isinstance(val_range, str):
            # Already a code-like string such as "(0, 100)"; emit as-is.
            val_tuple_str = val_range
        else:
            val_tuple_str = str(val_range)

        # Construct the entry line: 'key': [DT.Interval, (0, 100)],
        lines.append(f"    '{key}': [{dtype_str}, {val_tuple_str}],")
    lines.append("}")
    return "\n".join(lines)
def generate_code(json_path, output_filename="solution.py", model_override=None):
    """
    Read a JSON prescription and generate a runnable Python solution script.

    Parameters
    ----------
    json_path : str
        Path to the prescription JSON. Expected keys: "suggested_model",
        "target_variable", "data_file", "data_dictionary", and optionally
        "delimiter".
    output_filename : str
        Where the generated code is written (default "solution.py").
    model_override : str or None
        If provided, use this template file name instead of the
        prescription's "suggested_model".

    Returns
    -------
    str
        The path to the generated file (== output_filename).
    """
    print(f"Reading prescription from: {json_path}")
    with open(json_path, "r") as f:
        prescription = json.load(f)

    # 1. Extract details from the prescription.
    model_template = model_override or prescription.get("suggested_model")
    target_var = prescription.get("target_variable")
    data_file = prescription.get("data_file")
    data_dictionary = prescription.get("data_dictionary")
    print(f"Target: {target_var}")
    print(f"Model: {model_template}")
    print(f"Data: {data_file}")

    # 2. Load the template; fall back to a default if the name is wrong.
    try:
        template_code = load_template(model_template)
    except FileNotFoundError:
        print(f"Warning: Template {model_template} not found. Using generic placeholder.")
        template_code = load_template("BinaryRandomForest_Template.py")  # Default fallback

    # 3. Generate the data_map code block.
    data_map_code = format_data_map(data_dictionary)

    # 4. Perform replacements in the template text.
    new_code = template_code

    # Replace the template's 'data_map = { ... }' block: match 'data_map',
    # '=', '{', then anything non-greedy up to the first '}' (DOTALL spans lines).
    data_map_pattern = r"data_map\s*=\s*\{.*?\}"
    new_code = re.sub(data_map_pattern, data_map_code, new_code, flags=re.DOTALL)

    # Replace the target assignment: target = "..."
    target_pattern = r'target\s*=\s*".*?"'
    new_code = re.sub(target_pattern, f'target = "{target_var}"', new_code)

    # The generated code must ALWAYS use a relative path, regardless of the
    # (possibly absolute, e.g. /tmp/gradio/...) path stored in the JSON.
    filename = "your_data_file.csv"
    if data_file:
        filename = os.path.basename(data_file)

    # Normalize the delimiter: accept a literal character or a common name.
    delimiter = prescription.get("delimiter", ",")
    if not delimiter or delimiter == "comma":
        delimiter = ","
    if delimiter.lower() == "tab":
        delimiter = "\t"
    if delimiter.lower() == "space":
        delimiter = " "
    if delimiter.lower() == "semicolon":
        delimiter = ";"

    # Build the read command that replaces pd.read_csv(...)/pd.read_excel(...).
    # BUG FIX: these f-strings previously emitted a literal placeholder text
    # instead of interpolating the data file name — `filename` was computed
    # above but never used. Restored the {filename} interpolation.
    if filename.lower().endswith(('.xls', '.xlsx')):
        read_cmd = f'pd.read_excel("{filename}")'
    else:
        if delimiter == ",":
            read_cmd = f'pd.read_csv("{filename}")'
        elif delimiter == "\t":
            # Explicitly handle tab so it appears as the escape \t in the code.
            read_cmd = f'pd.read_csv("{filename}", sep="\\t")'
        else:
            # repr() safely encodes special characters (and supplies quotes).
            read_cmd = f'pd.read_csv("{filename}", sep={repr(delimiter)})'

    # Escape backslashes so re.sub does not interpret them in the replacement
    # string (e.g. \t would otherwise become a literal tab character).
    safe_read_cmd = read_cmd.replace("\\", "\\\\")

    # Replace pd.read_csv(<quoted path>), matching either quote style via
    # a backreference to the opening quote...
    read_csv_pattern = r"pd\.read_csv\(([\"']).*?\1\)"
    new_code = re.sub(read_csv_pattern, safe_read_cmd, new_code)
    # ...and pd.read_excel(<quoted path>) in case the template already used it.
    read_excel_pattern = r"pd\.read_excel\(([\"']).*?\1\)"
    new_code = re.sub(read_excel_pattern, safe_read_cmd, new_code)

    # 5. Write the generated solution to disk.
    with open(output_filename, "w") as f:
        f.write(new_code)
    print(f"Successfully generated: {output_filename}")
    return output_filename
if __name__ == "__main__":
    # Standalone testing: generate code from the newest prescription JSON
    # found in the current directory (lexicographically greatest filename,
    # equivalent to reverse-sorting and taking the first element).
    candidates = [
        name for name in os.listdir('.')
        if name.startswith('project_context_') and name.endswith('.json')
    ]
    if candidates:
        generate_code(max(candidates))