import time

class CommandProcessor:
    """Buffer driver commands and submit them to a chip through the HAL.

    Commands are queued with :meth:`add_command` and executed in insertion
    order by :meth:`submit_commands`, which returns exactly one result entry
    per queued command.

    Attributes:
        hal: Hardware abstraction layer object. This class reads its
            ``initialized`` attribute and calls ``v2_core_step``,
            ``execute_sm_warp``, ``v2_tensor_matmul`` and
            ``execute_tensor_core_matmul`` on it.
        memory_manager: Object providing ``write_data`` and ``read_data``.
        command_buffer: List of pending commands, each a dict with the keys
            ``"type"`` and ``"args"``.
    """

    # Maps a command type to the name of the private method that runs it.
    # Names (not bound methods) are stored so the table can live on the class.
    _HANDLERS = {
        "execute_kernel": "_run_execute_kernel",
        "matmul": "_run_matmul",
        "draw_arrays": "_run_draw_arrays",
        "draw_indexed": "_run_draw_indexed",
        "write_memory": "_run_write_memory",
        "read_memory": "_run_read_memory",
        "global_barrier": "_run_global_barrier",
        "shared_memory_barrier": "_run_shared_memory_barrier",
        "atomic_operation": "_run_atomic_operation",
    }

    def __init__(self, hal, memory_manager):
        """Store the collaborators and start with an empty command buffer."""
        self.hal = hal
        self.memory_manager = memory_manager
        self.command_buffer = []

    def add_command(self, command_type, **kwargs):
        """Queue a command for the next :meth:`submit_commands` call.

        Args:
            command_type: Name of the command (e.g. ``"matmul"``). It is not
                validated here; unknown types are reported at submission.
            **kwargs: Command arguments, stored as the command's ``"args"``.
        """
        self.command_buffer.append({"type": command_type, "args": kwargs})
        print(f"Added command: {command_type} with args {kwargs}")

    def submit_commands(self, chip_id=0):
        """Execute every buffered command in order and clear the buffer.

        A failing command does not abort the submission: its error is printed
        and recorded, and the remaining commands still run.

        Args:
            chip_id: Chip identifier forwarded to the HAL / memory manager.

        Returns:
            A list with exactly one entry per submitted command: the value
            produced by the command, ``None`` for commands that produce no
            value (or whose type is unknown), or the string ``"Error: <msg>"``
            when executing or reporting the command raised an exception.

        Raises:
            RuntimeError: If ``hal.initialized`` is falsy. The buffer is left
                untouched in that case.
        """
        if not self.hal.initialized:
            raise RuntimeError("HAL not initialized. Cannot submit commands.")

        print(f"Submitting {len(self.command_buffer)} commands to Chip {chip_id}...")
        results = []
        for command in self.command_buffer:
            command_type = command["type"]
            args = command["args"]
            try:
                outcome = self._dispatch(command_type, args, chip_id)
            except Exception as exc:
                print(f"Error executing command {command_type}: {exc}")
                outcome = f"Error: {exc}"
            # Append in exactly one place so the results always line up
            # one-to-one with the submitted commands.
            results.append(outcome)

        self.command_buffer = []  # Clear buffer after submission
        print("Command submission complete.")
        return results

    def clear_commands(self):
        """Discard all buffered commands without executing them."""
        self.command_buffer = []
        print("Command buffer cleared.")

    def _dispatch(self, command_type, args, chip_id):
        """Run the handler registered for ``command_type`` and return its result."""
        try:
            handler_name = self._HANDLERS.get(command_type)
        except TypeError:
            # An unhashable command type cannot match any known command.
            handler_name = None
        if handler_name is None:
            print(f"  Unknown command type: {command_type}")
            return None
        return getattr(self, handler_name)(chip_id, args)

    def _run_execute_kernel(self, chip_id, args):
        """Run a kernel step on the v2 core, falling back to the legacy SM warp."""
        sm_id = args.get("sm_id")
        operands = (
            args.get("a"),
            args.get("b"),
            args.get("cin"),
            args.get("opcode"),
            args.get("reg_sel"),
        )
        # Only the v2 HAL call is guarded: a failure while reporting a
        # successful v2 result must not re-run the kernel on the legacy path.
        try:
            result = self.hal.v2_core_step(chip_id, *operands)
        except Exception:
            # Deliberate best-effort: any v2 failure (including a HAL without
            # a v2 core) falls back to the legacy SM warp.
            result = self.hal.execute_sm_warp(chip_id, sm_id, *operands)
            print(f"  Executed kernel on SM {sm_id}. Result: {result}")
        else:
            print(f"  [v2] Executed kernel on chip {chip_id} (AdvancedCore). Result: {result}")
        return result

    def _run_matmul(self, chip_id, args):
        """Run a matmul on the v2 tensor core, falling back to the legacy one."""
        sm_id = args.get("sm_id")
        lhs = args.get("A")
        rhs = args.get("B")
        try:
            result = self.hal.v2_tensor_matmul(chip_id, lhs, rhs)
        except Exception:
            # Deliberate best-effort fallback, same policy as execute_kernel.
            result = self.hal.execute_tensor_core_matmul(chip_id, sm_id, lhs, rhs)
            print(f"  Executed matmul on SM {sm_id}. Result: {result}")
        else:
            print(f"  [v2] Executed tensor matmul on chip {chip_id}. Result: {result}")
        return result

    def _run_draw_arrays(self, chip_id, args):
        """Acknowledge a draw_arrays command; nothing is executed for it."""
        print("  [v2] draw_arrays command received (not yet fully simulated in v2 core).")
        return None

    def _run_draw_indexed(self, chip_id, args):
        """Acknowledge a draw_indexed command; nothing is executed for it."""
        print("  [v2] draw_indexed command received (not yet fully simulated in v2 core).")
        return None

    def _run_write_memory(self, chip_id, args):
        """Write ``data`` at ``virtual_address`` through the memory manager."""
        virtual_address = args.get("virtual_address")
        self.memory_manager.write_data(virtual_address, args.get("data"), chip_id)
        print(f"  Wrote data to memory at virtual address {virtual_address}.")
        return None

    def _run_read_memory(self, chip_id, args):
        """Read ``size_bytes`` from ``virtual_address`` and return the data."""
        virtual_address = args.get("virtual_address")
        data = self.memory_manager.read_data(virtual_address, args.get("size_bytes"), chip_id)
        print(f"  Read data from memory at virtual address {virtual_address}. Data: {data}")
        return data

    def _run_global_barrier(self, chip_id, args):
        """Report a chip-wide barrier and pause briefly."""
        print(f"  Executing global barrier on Chip {chip_id}. All pending operations on this chip will complete.")
        time.sleep(0.01)  # Simulate a small delay for synchronization
        return None

    def _run_shared_memory_barrier(self, chip_id, args):
        """Report a per-SM shared memory barrier and pause briefly."""
        sm_id = args.get("sm_id")
        print(f"  Executing shared memory barrier on Chip {chip_id}, SM {sm_id}. All pending shared memory operations on this SM will complete.")
        time.sleep(0.001)  # Simulate a very small delay
        return None

    def _run_atomic_operation(self, chip_id, args):
        """Report an atomic operation; no HAL or memory call is made for it."""
        sm_id = args.get("sm_id")
        address = args.get("address")
        operation = args.get("operation")  # e.g., 'add', 'compare_and_swap'
        value = args.get("value")
        print(f"  Executing atomic operation '{operation}' at address {address} on Chip {chip_id}, SM {sm_id} with value {value}.")
        # In a real driver, this would involve a hardware atomic instruction
        return None