# dr_jones / code_generator.py
# anly656's picture
# Upload 2 files
# 6ece7dd verified
import json
import os
import re
import sys
import ast
def load_template(template_name):
    """
    Locate *template_name* and return its full text.

    The file is searched first in a fixed list of known model directories
    (in order), then — as a fallback — anywhere under the current working
    directory via a full tree walk.

    Raises FileNotFoundError if no file with that name exists anywhere.
    """
    known_dirs = (
        ".",
        "Regression",
        "Logistic_Regression",
        "Random_Forest",
        "Decision_Tree",
        "ANN",
    )
    for candidate in (os.path.join(d, template_name) for d in known_dirs):
        if os.path.exists(candidate):
            with open(candidate, "r") as handle:
                return handle.read()
    # Fallback: scan the entire tree for any file with the requested name.
    for root, _dirs, filenames in os.walk("."):
        if template_name not in filenames:
            continue
        with open(os.path.join(root, template_name), "r") as handle:
            return handle.read()
    raise FileNotFoundError(f"Template '{template_name}' not found in search paths.")
def is_number(s):
    """
    Return True if *s* parses as a float.

    Fix: also catches TypeError so non-string, non-numeric inputs
    (e.g. None, lists) return False instead of raising.
    """
    try:
        float(s)
        return True
    except (ValueError, TypeError):
        return False


def _render_range_tuple(val_range):
    """
    Render a list/tuple of range items as tuple source code, e.g. "(0, 100)".

    Numeric items (including numeric-looking strings, which are converted to
    int where possible, else float) are emitted unquoted. If ANY item is
    non-numeric, every item is emitted as a quoted string so the generated
    tuple is homogeneous.
    """
    all_numbers = all(
        isinstance(item, (int, float))
        or (isinstance(item, str) and is_number(item))
        for item in val_range
    )
    if all_numbers:
        clean_items = []
        for x in val_range:
            if isinstance(x, str):
                try:
                    clean_items.append(int(x))
                except ValueError:
                    # Not an integer literal (e.g. "0.5"); fall back to float.
                    clean_items.append(float(x))
            else:
                clean_items.append(x)
        rendered = ", ".join(str(x) for x in clean_items)
    else:
        # Force ALL items to quoted strings for consistency with mixed types.
        rendered = ", ".join(repr(str(x)) for x in val_range)
    # BUG FIX: a one-element tuple needs a trailing comma; without it the
    # generated code would be a parenthesized scalar, e.g. "(5)" not "(5,)".
    if len(val_range) == 1:
        rendered += ","
    return "(" + rendered + ")"


def format_data_map(data_dict):
    """
    Convert the JSON data dictionary into the source text of a Python
    dictionary compatible with AdvancedAnalytics (DT class and tuples).

    Each value is expected to be [type_string, range] where range is a
    tuple/list or its string representation, e.g.
    ["DT.Interval", "(0, 100)"] or ["DT.Binary", ["Yes", "No"]].

    Returns the complete ``data_map = {...}`` block as one string.
    """
    lines = ["data_map = {"]
    for key, value in data_dict.items():
        dtype_str = value[0]   # e.g. "DT.Interval" — written verbatim, unquoted
        val_range = value[1]   # "(0, 100)" or ["Yes", "No"]

        # If the range arrived as a string like "('0', '1')", try to parse it
        # into a real tuple/list so the items can be normalized below.
        if isinstance(val_range, str) and val_range.strip().startswith(('(', '[')):
            try:
                parsed = ast.literal_eval(val_range)
                if isinstance(parsed, (list, tuple)):
                    val_range = parsed
            except (ValueError, SyntaxError):
                # Not valid literal syntax; keep the raw string untouched.
                pass

        if isinstance(val_range, (list, tuple)):
            val_tuple_str = _render_range_tuple(val_range)
        elif isinstance(val_range, str):
            # Already a code-like string such as "(0, 100)"; emit as-is.
            val_tuple_str = val_range
        else:
            val_tuple_str = str(val_range)

        # Construct the entry line: 'key': [DT.Interval, (0, 100)],
        lines.append(f"    '{key}': [{dtype_str}, {val_tuple_str}],")
    lines.append("}")
    return "\n".join(lines)
def generate_code(json_path, output_filename="solution.py", model_override=None):
    """
    Read a JSON prescription and generate a runnable Python solution script.

    Parameters
    ----------
    json_path : str
        Path to the prescription JSON. Expected keys: "suggested_model",
        "target_variable", "data_file", "data_dictionary", and optionally
        "delimiter".
    output_filename : str
        Where the generated code is written (default "solution.py").
    model_override : str or None
        If provided, use this template file name instead of the
        prescription's "suggested_model".

    Returns
    -------
    str
        The path to the generated file (== output_filename).
    """
    print(f"Reading prescription from: {json_path}")
    with open(json_path, "r") as f:
        prescription = json.load(f)

    # 1. Extract details from the prescription.
    model_template = model_override or prescription.get("suggested_model")
    target_var = prescription.get("target_variable")
    data_file = prescription.get("data_file")
    data_dictionary = prescription.get("data_dictionary")
    print(f"Target: {target_var}")
    print(f"Model: {model_template}")
    print(f"Data: {data_file}")

    # 2. Load the template; fall back to a default if the name is wrong.
    try:
        template_code = load_template(model_template)
    except FileNotFoundError:
        print(f"Warning: Template {model_template} not found. Using generic placeholder.")
        template_code = load_template("BinaryRandomForest_Template.py")  # Default fallback

    # 3. Generate the data_map code block.
    data_map_code = format_data_map(data_dictionary)

    # 4. Perform replacements in the template text.
    new_code = template_code

    # Replace the template's 'data_map = { ... }' block: match 'data_map',
    # '=', '{', then anything non-greedy up to the first '}' (DOTALL spans lines).
    data_map_pattern = r"data_map\s*=\s*\{.*?\}"
    new_code = re.sub(data_map_pattern, data_map_code, new_code, flags=re.DOTALL)

    # Replace the target assignment: target = "..."
    target_pattern = r'target\s*=\s*".*?"'
    new_code = re.sub(target_pattern, f'target = "{target_var}"', new_code)

    # The generated code must ALWAYS use a relative path, regardless of the
    # (possibly absolute, e.g. /tmp/gradio/...) path stored in the JSON.
    filename = "your_data_file.csv"
    if data_file:
        filename = os.path.basename(data_file)

    # Normalize the delimiter: accept a literal character or a common name.
    delimiter = prescription.get("delimiter", ",")
    if not delimiter or delimiter == "comma":
        delimiter = ","
    if delimiter.lower() == "tab":
        delimiter = "\t"
    if delimiter.lower() == "space":
        delimiter = " "
    if delimiter.lower() == "semicolon":
        delimiter = ";"

    # Build the read command that replaces pd.read_csv(...)/pd.read_excel(...).
    # BUG FIX: these f-strings previously emitted a literal placeholder text
    # instead of interpolating the data file name — `filename` was computed
    # above but never used. Restored the {filename} interpolation.
    if filename.lower().endswith(('.xls', '.xlsx')):
        read_cmd = f'pd.read_excel("{filename}")'
    else:
        if delimiter == ",":
            read_cmd = f'pd.read_csv("{filename}")'
        elif delimiter == "\t":
            # Explicitly handle tab so it appears as the escape \t in the code.
            read_cmd = f'pd.read_csv("{filename}", sep="\\t")'
        else:
            # repr() safely encodes special characters (and supplies quotes).
            read_cmd = f'pd.read_csv("{filename}", sep={repr(delimiter)})'

    # Escape backslashes so re.sub does not interpret them in the replacement
    # string (e.g. \t would otherwise become a literal tab character).
    safe_read_cmd = read_cmd.replace("\\", "\\\\")

    # Replace pd.read_csv(<quoted path>), matching either quote style via
    # a backreference to the opening quote...
    read_csv_pattern = r"pd\.read_csv\(([\"']).*?\1\)"
    new_code = re.sub(read_csv_pattern, safe_read_cmd, new_code)
    # ...and pd.read_excel(<quoted path>) in case the template already used it.
    read_excel_pattern = r"pd\.read_excel\(([\"']).*?\1\)"
    new_code = re.sub(read_excel_pattern, safe_read_cmd, new_code)

    # 5. Write the generated solution to disk.
    with open(output_filename, "w") as f:
        f.write(new_code)
    print(f"Successfully generated: {output_filename}")
    return output_filename
if __name__ == "__main__":
    # Standalone testing: generate code from the newest prescription JSON
    # found in the current directory (lexicographically greatest filename,
    # equivalent to reverse-sorting and taking the first element).
    candidates = [
        name for name in os.listdir('.')
        if name.startswith('project_context_') and name.endswith('.json')
    ]
    if candidates:
        generate_code(max(candidates))