File size: 8,277 Bytes
8643b59
 
 
 
4394ee2
8643b59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3be90dc
 
 
 
 
 
 
8643b59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4394ee2
 
 
 
 
 
 
 
 
 
 
 
3be90dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8643b59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ece7dd
8643b59
 
6ece7dd
8643b59
 
 
 
 
 
 
6ece7dd
8643b59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4394ee2
8643b59
 
 
 
 
 
 
4394ee2
 
 
 
 
 
 
 
 
 
8643b59
4394ee2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a628163
 
 
 
 
 
 
 
 
 
 
 
8643b59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import json
import os
import re
import sys
import ast

def load_template(template_name):
    """
    Locate and read a template file, returning its contents as a string.

    Lookup order: a fixed list of known model directories first, then a
    recursive walk of the current tree as a last resort.

    Raises:
        FileNotFoundError: if no file named ``template_name`` exists anywhere.
    """
    known_dirs = (
        ".",
        "Regression",
        "Logistic_Regression",
        "Random_Forest",
        "Decision_Tree",
        "ANN",
    )

    for directory in known_dirs:
        candidate = os.path.join(directory, template_name)
        if os.path.exists(candidate):
            with open(candidate, "r") as fh:
                return fh.read()

    # Last resort: scan the whole directory tree for a matching filename.
    for root, _subdirs, filenames in os.walk("."):
        if template_name in filenames:
            with open(os.path.join(root, template_name), "r") as fh:
                return fh.read()

    raise FileNotFoundError(f"Template '{template_name}' not found in search paths.")

def is_number(s):
    """
    Return True if *s* can be converted with ``float()``, False otherwise.

    Accepts numbers and numeric strings (including "inf"/"nan").  Returns
    False both for non-numeric strings and for values float() cannot take
    at all (e.g. None, lists) -- previously those raised TypeError because
    only ValueError was caught.
    """
    try:
        float(s)
        return True
    except (ValueError, TypeError):
        # ValueError: non-numeric string; TypeError: unsupported input type.
        return False

def format_data_map(data_dict):
    """
    Convert the JSON data dictionary into the source text of a Python
    ``data_map`` dict literal compatible with AdvancedAnalytics (DT type
    constants plus value tuples).

    Each entry of *data_dict* maps a column name to a two-item list
    ``[type_string, value_range]`` where ``type_string`` is e.g.
    "DT.Interval" (emitted verbatim, unquoted) and ``value_range`` is either
    a tuple/list of values or their string representation, e.g. "(0, 100)".

    Returns:
        str: a multi-line ``data_map = {...}`` code block.
    """

    def _looks_numeric(value):
        # True when the item is already a number, or a string float() accepts.
        if isinstance(value, (int, float)):
            return True
        if isinstance(value, str):
            try:
                float(value)
                return True
            except ValueError:
                return False
        return False

    lines = ["data_map = {"]

    for key, value in data_dict.items():
        dtype_str = value[0]   # e.g. "DT.Interval" -- written as bare code
        val_range = value[1]   # e.g. "(0, 100)" or ["Yes", "No"]

        # If the range arrived as a string like "(0, 100)", try to parse it
        # into a real tuple/list so items can be normalized below.
        if isinstance(val_range, str) and val_range.strip().startswith(('(', '[')):
            try:
                parsed = ast.literal_eval(val_range)
                if isinstance(parsed, (list, tuple)):
                    val_range = parsed
            except (ValueError, SyntaxError):
                # Not a valid Python literal -- keep the raw string as-is.
                # (Was a bare except; narrowed to what literal_eval raises.)
                pass

        if isinstance(val_range, (list, tuple)):
            if all(_looks_numeric(item) for item in val_range):
                # Emit numbers unquoted; numeric strings become int where
                # possible, float otherwise (e.g. "1.5").
                clean_items = []
                for x in val_range:
                    if isinstance(x, str):
                        try:
                            clean_items.append(int(x))
                        except ValueError:
                            clean_items.append(float(x))
                    else:
                        clean_items.append(x)
                val_tuple_str = "(" + ", ".join(str(x) for x in clean_items) + ")"
            else:
                # Mixed or non-numeric items: force every item to a quoted
                # string so the generated tuple is type-consistent.
                val_tuple_str = "(" + ", ".join(repr(str(x)) for x in val_range) + ")"
        elif isinstance(val_range, str):
            # Plain/unparseable string form -- trust it to already be code.
            val_tuple_str = val_range
        else:
            val_tuple_str = str(val_range)

        # repr() quotes the key safely even if it contains apostrophes
        # (the old manual f"'{key}'" broke on such keys).
        lines.append(f"    {repr(str(key))}: [{dtype_str}, {val_tuple_str}],")

    lines.append("}")
    return "\n".join(lines)

def generate_code(json_path, output_filename="solution.py", model_override=None):
    """
    Read a JSON "prescription" and generate a runnable Python solution file.

    The prescription supplies the template name, target variable, data file
    and data dictionary; the matching template is loaded and its data_map,
    target and data-read statements are rewritten to match the prescription.

    Args:
        json_path: path to the prescription JSON file.
        output_filename: path of the generated script (default "solution.py").
        model_override: template filename to use instead of the JSON's
            "suggested_model", if given.

    Returns:
        str: the path to the generated file (``output_filename``).
    """
    print(f"Reading prescription from: {json_path}")
    with open(json_path, "r") as f:
        prescription = json.load(f)

    # 1. Extract details from the prescription.
    model_template = model_override or prescription.get("suggested_model")
    target_var = prescription.get("target_variable")
    data_file = prescription.get("data_file")
    data_dictionary = prescription.get("data_dictionary")

    print(f"Target: {target_var}")
    print(f"Model: {model_template}")
    print(f"Data: {data_file}")

    # 2. Load the template, falling back to a default when the name is wrong.
    try:
        template_code = load_template(model_template)
    except FileNotFoundError:
        print(f"Warning: Template {model_template} not found. Using generic placeholder.")
        template_code = load_template("BinaryRandomForest_Template.py")  # default fallback

    # 3. Build the replacement data_map code block.
    data_map_code = format_data_map(data_dictionary)

    new_code = template_code

    # 4a. Replace the template's data_map literal (matched non-greedily
    # across lines up to the first closing brace).
    data_map_pattern = r"data_map\s*=\s*\{.*?\}"
    new_code = re.sub(data_map_pattern, data_map_code, new_code, flags=re.DOTALL)

    # 4b. Replace the target assignment: target = "..."
    target_pattern = r'target\s*=\s*".*?"'
    new_code = re.sub(target_pattern, f'target = "{target_var}"', new_code)

    # 4c. Replace the data-file read.  The generated code must ALWAYS use a
    # relative path regardless of what path was in the JSON (which might be
    # something like /tmp/gradio/...), so keep only the basename.
    filename = "your_data_file.csv"
    if data_file:
        filename = os.path.basename(data_file)

    delimiter = prescription.get("delimiter", ",")
    if not delimiter or delimiter == "comma":
        delimiter = ","
    # Normalize common delimiter names to their literal characters.
    if delimiter.lower() == "tab":
        delimiter = "\t"
    if delimiter.lower() == "space":
        delimiter = " "
    if delimiter.lower() == "semicolon":
        delimiter = ";"

    # Build the pandas read call for the generated code.  BUG FIX: the
    # basename computed above is now interpolated into the command -- the
    # previous code emitted a constant placeholder string instead of
    # {filename}, so `filename` was computed but never used and every
    # generated script pointed at a nonexistent file.
    if filename.lower().endswith(('.xls', '.xlsx')):
        read_cmd = f'pd.read_excel("{filename}")'
    elif delimiter == ",":
        read_cmd = f'pd.read_csv("{filename}")'
    elif delimiter == "\t":
        # Explicitly handle tab so it appears as \t in the generated code.
        read_cmd = f'pd.read_csv("{filename}", sep="\\t")'
    else:
        # repr() safely encodes special characters (quotes included).
        read_cmd = f'pd.read_csv("{filename}", sep={repr(delimiter)})'

    # Escape backslashes so re.sub does not treat them as escapes or group
    # references (e.g. \t turning into a literal tab in the output).
    safe_read_cmd = read_cmd.replace("\\", "\\\\")

    # The template normally has pd.read_csv("...") or pd.read_csv('...');
    # match the opening quote and require the same quote to close.
    read_csv_pattern = r"pd\.read_csv\(([\"']).*?\1\)"
    new_code = re.sub(read_csv_pattern, safe_read_cmd, new_code)

    # Also handle templates that were already set up for Excel input.
    read_excel_pattern = r"pd\.read_excel\(([\"']).*?\1\)"
    new_code = re.sub(read_excel_pattern, safe_read_cmd, new_code)

    # 5. Write the generated script.
    with open(output_filename, "w") as f:
        f.write(new_code)

    print(f"Successfully generated: {output_filename}")
    return output_filename

if __name__ == "__main__":
    # Standalone smoke test: run against the newest project-context JSON.
    candidates = [
        name for name in os.listdir('.')
        if name.startswith('project_context_') and name.endswith('.json')
    ]
    if candidates:
        # Lexicographically greatest name == head of a reverse sort.
        generate_code(max(candidates))