"""Generate a runnable model script from a JSON "prescription".

Reads a prescription file (target variable, data file, data dictionary,
suggested model template), loads the matching template, and rewrites the
template's ``data_map`` / ``target`` / ``pd.read_csv`` lines to match the
prescription.
"""

import ast
import json
import os
import re

# Directories searched (in order) for model template files.
TEMPLATE_SEARCH_DIRS = [
    ".",
    "Regression",
    "Logistic_Regression",
    "Random_Forest",
    "Decision_Tree",
    "ANN",
]


def load_template(template_name):
    """Locate and return the text of a template file.

    Searches TEMPLATE_SEARCH_DIRS first; falls back to walking the whole
    tree under the current directory.

    Raises:
        FileNotFoundError: if the template exists nowhere.
    """
    for directory in TEMPLATE_SEARCH_DIRS:
        path = os.path.join(directory, template_name)
        if os.path.exists(path):
            with open(path, "r") as f:
                return f.read()
    # Fallback: try to find any file with that name in the whole tree.
    for root, _dirs, files in os.walk("."):
        if template_name in files:
            with open(os.path.join(root, template_name), "r") as f:
                return f.read()
    raise FileNotFoundError(f"Template '{template_name}' not found in search paths.")


def is_number(s):
    """Return True if *s* can be parsed as a float."""
    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # TypeError covers non-string/non-numeric input (e.g. None).
        return False


def _format_range_tuple(val_range):
    """Format a list/tuple of range values as Python tuple source code.

    Numeric entries (including numeric strings like '0') are emitted
    unquoted; otherwise every entry is forced to a quoted string so the
    generated tuple has consistent types.
    """
    all_numbers = all(
        isinstance(item, (int, float))
        or (isinstance(item, str) and is_number(item))
        for item in val_range
    )
    if all_numbers:
        # Convert numeric strings to real numbers so no quotes appear.
        clean_items = []
        for x in val_range:
            if isinstance(x, str):
                try:
                    clean_items.append(int(x))
                except ValueError:
                    clean_items.append(float(x))
            else:
                clean_items.append(x)
        body = ", ".join(str(x) for x in clean_items)
    else:
        # Mixed or non-numeric: force ALL entries to quoted strings.
        body = ", ".join(repr(str(x)) for x in val_range)
    # A single-element tuple needs a trailing comma to remain a tuple.
    if len(val_range) == 1:
        body += ","
    return f"({body})"


def format_data_map(data_dict):
    """Render the JSON data dictionary as a ``data_map = {...}`` code block.

    Each value is expected to be ``[dtype_string, range]``, e.g.
    ``["DT.Interval", "(0, 100)"]`` or ``["DT.Binary", ["Yes", "No"]]``.
    The dtype string is emitted unquoted — it is code (e.g. ``DT.Interval``)
    in the generated script.
    """
    lines = ["data_map = {"]
    for key, value in data_dict.items():
        dtype_str = value[0]   # e.g. "DT.Interval" — written verbatim as code
        val_range = value[1]   # "(0, 100)" or ["Yes", "No"]

        # If the range arrived as a string that looks like a tuple/list
        # literal (e.g. "('0', '1')"), parse it into a real sequence.
        if isinstance(val_range, str) and val_range.strip().startswith(("(", "[")):
            try:
                parsed = ast.literal_eval(val_range)
                if isinstance(parsed, (list, tuple)):
                    val_range = parsed
            except (ValueError, SyntaxError):
                pass  # leave the raw string untouched

        if isinstance(val_range, (list, tuple)):
            val_tuple_str = _format_range_tuple(val_range)
        elif isinstance(val_range, str):
            # Already a code-like string such as "(0, 100)" — use verbatim.
            val_tuple_str = val_range
        else:
            val_tuple_str = str(val_range)

        lines.append(f"    '{key}': [{dtype_str}, {val_tuple_str}],")
    lines.append("}")
    return "\n".join(lines)


def _normalize_delimiter(delimiter):
    """Map human-friendly delimiter names ('tab', 'comma', ...) to characters."""
    if not delimiter or delimiter.lower() == "comma":
        return ","
    return {"tab": "\t", "space": " ", "semicolon": ";"}.get(
        delimiter.lower(), delimiter
    )


def _build_read_command(filename, delimiter):
    """Return the pandas read call (as source text) for the generated script.

    Note: the previous version emitted the literal placeholder "(unknown)"
    instead of interpolating *filename*, so generated scripts could never
    load their data — fixed here.
    """
    if filename.lower().endswith((".xls", ".xlsx")):
        return f'pd.read_excel("{filename}")'
    if delimiter == ",":
        return f'pd.read_csv("{filename}")'
    if delimiter == "\t":
        # Emit a literal \t escape so the generated code reads sep="\t".
        return f'pd.read_csv("{filename}", sep="\\t")'
    # repr() safely encodes any other special characters in the separator.
    return f'pd.read_csv("{filename}", sep={repr(delimiter)})'


def generate_code(json_path, output_filename="solution.py", model_override=None):
    """Read a JSON prescription and generate a Python model script.

    If *model_override* is provided, that template is used instead of the
    one named in the JSON. Returns the path of the generated file.
    """
    print(f"Reading prescription from: {json_path}")
    with open(json_path, "r") as f:
        prescription = json.load(f)

    # 1. Extract details.
    model_template = model_override or prescription.get("suggested_model")
    target_var = prescription.get("target_variable")
    data_file = prescription.get("data_file")
    data_dictionary = prescription.get("data_dictionary")

    print(f"Target: {target_var}")
    print(f"Model: {model_template}")
    print(f"Data: {data_file}")

    # 2. Load the template, falling back if the name in the JSON is wrong.
    try:
        template_code = load_template(model_template)
    except FileNotFoundError:
        print(f"Warning: Template {model_template} not found. Using generic placeholder.")
        template_code = load_template("BinaryRandomForest_Template.py")

    # 3. Build the data_map code block.
    data_map_code = format_data_map(data_dictionary)

    # 4. Perform replacements on the template text. All substitutions use a
    # callable replacement so re.sub never interprets backslashes or group
    # references inside the generated code.
    new_code = template_code

    # Replace the template's 'data_map = { ... }' block (multi-line, non-greedy).
    new_code = re.sub(
        r"data_map\s*=\s*\{.*?\}",
        lambda _m: data_map_code,
        new_code,
        flags=re.DOTALL,
    )

    # Replace: target = "..."
    new_code = re.sub(
        r'target\s*=\s*".*?"',
        lambda _m: f'target = "{target_var}"',
        new_code,
    )

    # The generated script must always use a relative path, regardless of
    # the (possibly absolute, e.g. /tmp/gradio/...) path in the JSON.
    filename = os.path.basename(data_file) if data_file else "your_data_file.csv"

    delimiter = _normalize_delimiter(prescription.get("delimiter", ","))
    read_cmd = _build_read_command(filename, delimiter)

    # Replace pd.read_csv("...") / pd.read_excel('...') whatever the quote
    # style; \1 backreference requires the closing quote to match the opener.
    quoted_arg = r"\(([\"']).*?\1\)"
    new_code = re.sub(r"pd\.read_csv" + quoted_arg, lambda _m: read_cmd, new_code)
    new_code = re.sub(r"pd\.read_excel" + quoted_arg, lambda _m: read_cmd, new_code)

    # 5. Write the generated script.
    with open(output_filename, "w") as f:
        f.write(new_code)

    print(f"Successfully generated: {output_filename}")
    return output_filename


if __name__ == "__main__":
    # Standalone testing: pick the lexicographically latest prescription file.
    candidates = sorted(
        (
            f
            for f in os.listdir(".")
            if f.startswith("project_context_") and f.endswith(".json")
        ),
        reverse=True,
    )
    if candidates:
        generate_code(candidates[0])