// Source path: catalyst-n1 / rtl / multi_chip_router.v
// Uploader: mrwabbit
// Commit message: Initial upload: Catalyst N1 open source neuromorphic processor RTL
// Commit: e4cdd5f (verified)
// ============================================================================
// Multi-Chip Router
// ============================================================================
//
// Copyright 2026 Henry Arthur Shulayev Barnes / Catalyst Neuromorphic Ltd
// Company No. 17054540 — UK Patent Application No. 2602902.6
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ============================================================================
`timescale 1ns/1ps
// ----------------------------------------------------------------------------
// multi_chip_router
//
// Bridges the local chip to NUM_LINKS byte-wide inter-chip links.
//
// Transmit side
//   * Spike (tx_push) and management (mgmt_tx_push) requests are queued in a
//     shared TX FIFO. When both are asserted in the same cycle only the spike
//     is queued; the management request is not stored.
//   * Queued packets are serialised MSB-first, one byte per cycle, on link
//     (destination chip % NUM_LINKS).
//   * barrier_tx_send / preempt_request start a broadcast that sends the same
//     packet on link 0, 1, ... NUM_LINKS-1 in turn. Barrier wins over preempt,
//     and both win over queued packets.
//
// Receive side
//   * Each link has its own byte deserialiser. A packet starts on a byte whose
//     MSB is 1 and is TX_NUM_BYTES bytes long.
//   * Spike packets are written to the RX FIFO (dropped when it is full).
//     Barrier, preempt and management packets are reported as one-cycle
//     pulses on barrier_rx / preempt_rx / mgmt_rx_valid.
//
// Wire format (MSB first, TX_PAD_W bits):
//   {1'b1, msg_type[1:0], dest_chip, src_chip, core, neuron, payload[7:0], 0-pad}
//
// Parameters
//   TX_DEPTH / RX_DEPTH must be powers of two and >= 2: the FIFO pointers
//   are $clog2(DEPTH)+1 bits wide and rely on natural wrap-around.
//
// NOTE(review): management data is carried as {is_write, data[6:0]}, so
//   mgmt_tx_data[7] is not transmitted and mgmt_rx_data[7] is always 0.
// NOTE(review): barrier_tx_send / preempt_request are only sampled while both
//   serialisers are idle; a request asserted only while one is busy is not
//   remembered. Confirm callers hold or re-issue the request as needed.
// NOTE(review): the TX serialisers put the next byte on the link every cycle
//   they are active and only use link_tx_ready to decide whether to advance
//   their shift register. This looks correct only when link_tx_ready stays
//   high for the whole packet - confirm against the link module.
// ----------------------------------------------------------------------------
module multi_chip_router #(
    parameter NUM_LINKS    = 1,
    parameter CHIP_ID_BITS = 14,
    parameter CORE_ID_BITS = 7,
    parameter NEURON_BITS  = 10,
    parameter DATA_WIDTH   = 16,
    parameter TX_DEPTH     = 256,
    parameter RX_DEPTH     = 256
)(
    input  wire                          clk,
    input  wire                          rst_n,
    input  wire [CHIP_ID_BITS-1:0]       my_chip_id,

    // Spike transmit request
    input  wire                          tx_push,
    input  wire [CHIP_ID_BITS-1:0]       tx_dest_chip,
    input  wire [CORE_ID_BITS-1:0]       tx_core,
    input  wire [NEURON_BITS-1:0]        tx_neuron,
    input  wire [7:0]                    tx_payload,
    output wire                          tx_full,

    // Received spike FIFO (head entry is always visible; rx_pop advances it)
    output wire [CHIP_ID_BITS-1:0]       rx_src_chip,
    output wire [CORE_ID_BITS-1:0]       rx_core,
    output wire [NEURON_BITS-1:0]        rx_neuron,
    output wire signed [DATA_WIDTH-1:0]  rx_current,
    input  wire                          rx_pop,
    output wire                          rx_empty,

    // Barrier broadcast
    input  wire                          barrier_tx_send,
    output reg                           barrier_rx,

    // Management transmit request / receive pulse
    input  wire                          mgmt_tx_push,
    input  wire [CORE_ID_BITS-1:0]       mgmt_tx_core,
    input  wire [NEURON_BITS-1:0]        mgmt_tx_neuron,
    input  wire [7:0]                    mgmt_tx_data,
    input  wire                          mgmt_tx_is_write,
    input  wire [CHIP_ID_BITS-1:0]       mgmt_tx_dest_chip,
    output reg                           mgmt_rx_valid,
    output reg  [CHIP_ID_BITS-1:0]       mgmt_rx_src_chip,
    output reg  [CORE_ID_BITS-1:0]       mgmt_rx_core,
    output reg  [NEURON_BITS-1:0]        mgmt_rx_neuron,
    output reg  [7:0]                    mgmt_rx_data,
    output reg                           mgmt_rx_is_write,

    // Preempt broadcast
    input  wire                          preempt_request,
    output reg                           preempt_rx,

    // Byte-wide links, 8 bits per link packed into one vector
    output wire [NUM_LINKS*8-1:0]        link_tx_data,
    output wire [NUM_LINKS-1:0]          link_tx_valid,
    input  wire [NUM_LINKS-1:0]          link_tx_ready,
    input  wire [NUM_LINKS*8-1:0]        link_rx_data,
    input  wire [NUM_LINKS-1:0]          link_rx_valid,
    output wire [NUM_LINKS-1:0]          link_rx_ready
);

    // ------------------------------------------------------------------
    // Message types (2-bit field that follows the start bit)
    // ------------------------------------------------------------------
    localparam MSG_SPIKE   = 2'b00;
    localparam MSG_BARRIER = 2'b01;
    localparam MSG_MGMT    = 2'b10;
    localparam MSG_PREEMPT = 2'b11;

    // ------------------------------------------------------------------
    // Wire-format geometry
    // ------------------------------------------------------------------
    // Unpadded packet: start bit + type + dest chip + src chip + core +
    // neuron + 8-bit payload, rounded up to a whole number of bytes.
    localparam TX_FLAT_W    = 1 + 2 + 2*CHIP_ID_BITS + CORE_ID_BITS + NEURON_BITS + 8;
    localparam TX_NUM_BYTES = (TX_FLAT_W + 7) / 8;
    localparam TX_PAD_W     = TX_NUM_BYTES * 8;

    // MSB index of each field inside a TX_PAD_W-bit packet (for "-:" selects).
    localparam MSGTYPE_OFFSET = TX_PAD_W - 1 - 1;
    localparam DEST_OFFSET    = MSGTYPE_OFFSET - 2;
    localparam SRC_OFFSET     = DEST_OFFSET - CHIP_ID_BITS;
    localparam CORE_OFFSET    = SRC_OFFSET - CHIP_ID_BITS;
    localparam NRN_OFFSET     = CORE_OFFSET - CORE_ID_BITS;
    localparam PAY_OFFSET     = NRN_OFFSET - NEURON_BITS;

    // TX FIFO entry: type + dest chip + core + neuron + payload (the source
    // chip and start bit are added when the packet is serialised).
    localparam PKT_W    = 2 + CHIP_ID_BITS + CORE_ID_BITS + NEURON_BITS + 8;
    // RX FIFO entry: src chip + core + neuron + DATA_WIDTH-bit current.
    localparam RX_PKT_W = CHIP_ID_BITS + CORE_ID_BITS + NEURON_BITS + DATA_WIDTH;

    // FIFO address widths. Pointers carry one extra wrap bit so that
    // (wr - rd) gives the occupancy directly.
    localparam TX_AW = $clog2(TX_DEPTH);
    localparam RX_AW = $clog2(RX_DEPTH);

    // ------------------------------------------------------------------
    // TX FIFO (write side; the read pointer is owned by the serialiser)
    // ------------------------------------------------------------------
    reg  [PKT_W-1:0] tx_fifo [0:TX_DEPTH-1];
    reg  [TX_AW:0]   tx_wr_ptr, tx_rd_ptr;
    wire [TX_AW:0]   tx_count      = tx_wr_ptr - tx_rd_ptr;
    wire             tx_fifo_empty = (tx_wr_ptr == tx_rd_ptr);

    assign tx_full = (tx_count >= TX_DEPTH);

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            tx_wr_ptr <= 0;
        end else if (tx_push && !tx_full) begin
            // Spikes have priority over management requests.
            tx_fifo[tx_wr_ptr[TX_AW-1:0]] <= {MSG_SPIKE, tx_dest_chip, tx_core,
                                              tx_neuron, tx_payload};
            tx_wr_ptr <= tx_wr_ptr + 1;
        end else if (mgmt_tx_push && !tx_full) begin
            // Payload byte is {is_write, data[6:0]}.
            tx_fifo[tx_wr_ptr[TX_AW-1:0]] <= {MSG_MGMT, mgmt_tx_dest_chip, mgmt_tx_core,
                                              mgmt_tx_neuron,
                                              mgmt_tx_is_write, mgmt_tx_data[6:0]};
            tx_wr_ptr <= tx_wr_ptr + 1;
        end
    end

    // Head-of-queue decode
    wire [PKT_W-1:0]        tx_head         = tx_fifo[tx_rd_ptr[TX_AW-1:0]];
    wire [1:0]              tx_head_msgtype = tx_head[PKT_W-1 -: 2];
    wire [CHIP_ID_BITS-1:0] tx_head_chip    = tx_head[PKT_W-3 -: CHIP_ID_BITS];
    wire [CHIP_ID_BITS-1:0] tx_link_sel     = tx_head_chip % NUM_LINKS;

    // ------------------------------------------------------------------
    // TX serialisers (unicast from the FIFO, broadcast for barrier/preempt)
    // ------------------------------------------------------------------
    // Unicast state
    reg [TX_PAD_W-1:0]               txs_shift;
    reg [$clog2(TX_NUM_BYTES+1)-1:0] txs_cnt;
    reg                              txs_active;
    reg [CHIP_ID_BITS-1:0]           txs_link;

    // Registered link outputs
    reg [NUM_LINKS*8-1:0] ltx_data;
    reg [NUM_LINKS-1:0]   ltx_valid;
    assign link_tx_data  = ltx_data;
    assign link_tx_valid = ltx_valid;

    // Fully formed packets, left-aligned in TX_PAD_W bits.
    wire [TX_PAD_W-1:0] tx_flat = {1'b1, tx_head_msgtype, tx_head_chip, my_chip_id,
                                   tx_head[CORE_ID_BITS+NEURON_BITS+7 : 0],
                                   {(TX_PAD_W - TX_FLAT_W){1'b0}}};
    // Broadcast packets use an all-ones destination and zeroed body.
    wire [TX_PAD_W-1:0] barrier_flat = {1'b1, MSG_BARRIER, {CHIP_ID_BITS{1'b1}}, my_chip_id,
                                        {(CORE_ID_BITS+NEURON_BITS+8){1'b0}},
                                        {(TX_PAD_W - TX_FLAT_W){1'b0}}};
    wire [TX_PAD_W-1:0] preempt_flat = {1'b1, MSG_PREEMPT, {CHIP_ID_BITS{1'b1}}, my_chip_id,
                                        {(CORE_ID_BITS+NEURON_BITS+8){1'b0}},
                                        {(TX_PAD_W - TX_FLAT_W){1'b0}}};

    // Broadcast state
    reg                              bcast_active;
    reg [TX_PAD_W-1:0]               bcast_shift;
    reg [$clog2(TX_NUM_BYTES+1)-1:0] bcast_cnt;
    reg [CHIP_ID_BITS-1:0]           bcast_link;
    reg [1:0]                        bcast_msg_type;   // written only; not read by any logic here
    reg [TX_PAD_W-1:0]               bcast_flat_save;  // copy reloaded for each successive link

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            txs_active      <= 0;
            txs_cnt         <= 0;
            txs_shift       <= 0;
            txs_link        <= 0;
            tx_rd_ptr       <= 0;
            ltx_data        <= 0;
            ltx_valid       <= 0;
            bcast_active    <= 0;
            bcast_shift     <= 0;
            bcast_cnt       <= 0;
            bcast_link      <= 0;
            bcast_msg_type  <= 0;
            bcast_flat_save <= 0;
        end else begin
            // Default: no link is driving valid unless a branch below sets it.
            ltx_valid <= 0;

            if (bcast_active) begin
                // ---- broadcast in progress: one link at a time ----
                ltx_data[bcast_link*8 +: 8] <= bcast_shift[TX_PAD_W-1 -: 8];
                ltx_valid[bcast_link]       <= 1;
                if (link_tx_ready[bcast_link]) begin
                    bcast_shift <= bcast_shift << 8;
                    if (bcast_cnt == TX_NUM_BYTES - 1) begin
                        if (bcast_link < NUM_LINKS - 1) begin
                            // Move to the next link and restart the packet.
                            // This later assignment overrides the shift above.
                            bcast_link  <= bcast_link + 1;
                            bcast_shift <= bcast_flat_save;
                            bcast_cnt   <= 0;
                        end else begin
                            bcast_active <= 0;
                        end
                    end else begin
                        bcast_cnt <= bcast_cnt + 1;
                    end
                end
            end else if (!txs_active) begin
                // ---- idle: pick the next thing to send ----
                if (barrier_tx_send) begin
                    bcast_active    <= 1;
                    bcast_flat_save <= barrier_flat;
                    bcast_shift     <= barrier_flat;
                    bcast_cnt       <= 0;
                    bcast_link      <= 0;
                    bcast_msg_type  <= MSG_BARRIER;
                end else if (preempt_request) begin
                    bcast_active    <= 1;
                    bcast_flat_save <= preempt_flat;
                    bcast_shift     <= preempt_flat;
                    bcast_cnt       <= 0;
                    bcast_link      <= 0;
                    bcast_msg_type  <= MSG_PREEMPT;
                end else if (!tx_fifo_empty) begin
                    // Emit the first byte immediately and pop the FIFO; the
                    // remaining bytes come from txs_shift.
                    ltx_data[tx_link_sel*8 +: 8] <= tx_flat[TX_PAD_W-1 -: 8];
                    ltx_valid[tx_link_sel]       <= 1;
                    txs_shift  <= tx_flat << 8;
                    txs_link   <= tx_link_sel;
                    txs_cnt    <= 1;
                    txs_active <= 1;
                    tx_rd_ptr  <= tx_rd_ptr + 1;
                end
            end else begin
                // ---- unicast in progress ----
                ltx_data[txs_link*8 +: 8] <= txs_shift[TX_PAD_W-1 -: 8];
                ltx_valid[txs_link]       <= 1;
                if (link_tx_ready[txs_link]) begin
                    txs_shift <= txs_shift << 8;
                    if (txs_cnt == TX_NUM_BYTES - 1)
                        txs_active <= 0;
                    else
                        txs_cnt <= txs_cnt + 1;
                end
            end
        end
    end

    // ------------------------------------------------------------------
    // RX FIFO storage and occupancy
    // (declared ahead of link_rx_ready, which reads rx_count)
    // ------------------------------------------------------------------
    reg  [RX_PKT_W-1:0] rx_fifo [0:RX_DEPTH-1];
    reg  [RX_AW:0]      rx_wr_ptr, rx_rd_ptr;
    wire [RX_AW:0]      rx_count = rx_wr_ptr - rx_rd_ptr;

    assign rx_empty = (rx_wr_ptr == rx_rd_ptr);

    // All links share one ready: deasserted when fewer than 4 slots plus
    // one remain free in the RX FIFO.
    assign link_rx_ready = (rx_count < RX_DEPTH - 4) ? {NUM_LINKS{1'b1}}
                                                     : {NUM_LINKS{1'b0}};

    // ------------------------------------------------------------------
    // Per-link byte deserialisers
    // ------------------------------------------------------------------
    reg [TX_PAD_W-1:0]               rxs_accum [0:NUM_LINKS-1];
    reg [$clog2(TX_NUM_BYTES+1)-1:0] rxs_cnt   [0:NUM_LINKS-1];
    reg [NUM_LINKS-1:0]              rxs_push;   // one-cycle pulse: rxs_accum holds a full packet

    genvar li;
    generate
        for (li = 0; li < NUM_LINKS; li = li + 1) begin : gen_rx
            always @(posedge clk or negedge rst_n) begin
                if (!rst_n) begin
                    rxs_cnt[li]   <= 0;
                    rxs_push[li]  <= 0;
                    rxs_accum[li] <= 0;
                end else begin
                    rxs_push[li] <= 0;
                    if (link_rx_valid[li]) begin
                        // Default: shift the new byte in at the bottom.
                        rxs_accum[li] <= {rxs_accum[li][TX_PAD_W-9:0], link_rx_data[li*8 +: 8]};
                        if (rxs_cnt[li] == 0) begin
                            // Waiting for a start byte (MSB set). On a start
                            // byte the accumulator is re-seeded, overriding
                            // the default shift above.
                            if (link_rx_data[li*8 + 7]) begin
                                rxs_accum[li] <= {{(TX_PAD_W-8){1'b0}}, link_rx_data[li*8 +: 8]};
                                rxs_cnt[li]   <= 1;
                            end
                        end else begin
                            if (rxs_cnt[li] == TX_NUM_BYTES - 1) begin
                                // Last byte: packet is complete next cycle.
                                rxs_push[li] <= 1;
                                rxs_cnt[li]  <= 0;
                            end else begin
                                rxs_cnt[li] <= rxs_cnt[li] + 1;
                            end
                        end
                    end
                end
            end
        end
    endgenerate

    // ------------------------------------------------------------------
    // Packet dispatch: spikes -> RX FIFO, everything else -> pulse outputs
    // ------------------------------------------------------------------
    always @(posedge clk or negedge rst_n) begin : rx_fifo_wr
        integer       k;
        reg [1:0]     rx_msg_type;
        reg [RX_AW:0] wr_next;   // running write pointer within this cycle
        reg [RX_AW:0] fill;      // occupancy seen by the current link
        if (!rst_n) begin
            rx_wr_ptr        <= 0;
            barrier_rx       <= 0;
            preempt_rx       <= 0;
            mgmt_rx_valid    <= 0;
            mgmt_rx_src_chip <= 0;
            mgmt_rx_core     <= 0;
            mgmt_rx_neuron   <= 0;
            mgmt_rx_data     <= 0;
            mgmt_rx_is_write <= 0;
        end else begin
            // Pulse outputs default low each cycle.
            barrier_rx    <= 0;
            preempt_rx    <= 0;
            mgmt_rx_valid <= 0;

            // Several links may complete a spike packet in the same cycle.
            // wr_next is advanced with blocking assignments so each of them
            // gets its own FIFO slot and the pointer moves by the number of
            // spikes actually stored.
            wr_next = rx_wr_ptr;
            for (k = 0; k < NUM_LINKS; k = k + 1) begin
                if (rxs_push[k]) begin
                    rx_msg_type = rxs_accum[k][MSGTYPE_OFFSET -: 2];
                    case (rx_msg_type)
                        MSG_SPIKE: begin
                            // Sized temporary keeps the subtraction modulo
                            // 2^(RX_AW+1), like rx_count.
                            fill = wr_next - rx_rd_ptr;
                            if (fill < RX_DEPTH) begin
                                // 8-bit payload is zero-extended to DATA_WIDTH.
                                rx_fifo[wr_next[RX_AW-1:0]] <= {
                                    rxs_accum[k][SRC_OFFSET  -: CHIP_ID_BITS],
                                    rxs_accum[k][CORE_OFFSET -: CORE_ID_BITS],
                                    rxs_accum[k][NRN_OFFSET  -: NEURON_BITS],
                                    {{(DATA_WIDTH-8){1'b0}},
                                     rxs_accum[k][PAY_OFFSET -: 8]}
                                };
                                wr_next = wr_next + 1;
                            end
                            // else: FIFO full, spike is dropped.
                        end
                        MSG_BARRIER: begin
                            barrier_rx <= 1;
                        end
                        MSG_MGMT: begin
                            // If several links deliver a management packet in
                            // the same cycle the highest-numbered link wins.
                            mgmt_rx_valid    <= 1;
                            mgmt_rx_src_chip <= rxs_accum[k][SRC_OFFSET  -: CHIP_ID_BITS];
                            mgmt_rx_core     <= rxs_accum[k][CORE_OFFSET -: CORE_ID_BITS];
                            mgmt_rx_neuron   <= rxs_accum[k][NRN_OFFSET  -: NEURON_BITS];
                            mgmt_rx_is_write <= rxs_accum[k][PAY_OFFSET];
                            mgmt_rx_data     <= {1'b0, rxs_accum[k][PAY_OFFSET-1 -: 7]};
                        end
                        MSG_PREEMPT: begin
                            preempt_rx <= 1;
                        end
                    endcase
                end
            end
            rx_wr_ptr <= wr_next;
        end
    end

    // ------------------------------------------------------------------
    // RX FIFO read side
    // ------------------------------------------------------------------
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n)
            rx_rd_ptr <= 0;
        else if (rx_pop && !rx_empty)
            rx_rd_ptr <= rx_rd_ptr + 1;
    end

    // Head entry is decoded combinationally.
    wire [RX_PKT_W-1:0] rx_top = rx_fifo[rx_rd_ptr[RX_AW-1:0]];

    assign rx_src_chip = rx_top[RX_PKT_W-1 -: CHIP_ID_BITS];
    assign rx_core     = rx_top[NEURON_BITS+DATA_WIDTH +: CORE_ID_BITS];
    assign rx_neuron   = rx_top[DATA_WIDTH +: NEURON_BITS];
    assign rx_current  = rx_top[DATA_WIDTH-1:0];

endmodule