From 01fb84c6e8e2cecbedbce08d9acba7e1538e70bf Mon Sep 17 00:00:00 2001
From: Ryan Zezeski <ryan@zinascii.com>
Date: Wed, 7 Jan 2026 10:19:36 -0500
Subject: [PATCH] 17526 cxgbe could use more queues 17527 cxgbe should be
 smatch clean 17528 cxgbe should always be mac ring capable Portions
 contributed by: Patrick Mooney <pmooney@pfmooney.com> Reviewed by: Robert
 Mustacchi <rm@fingolfin.org>

Change-Id: I719cb8b599b7de95d65055d65367a704e22b8d9c
---
 usr/src/uts/common/io/cxgbe/common/common.h   |    2 +-
 usr/src/uts/common/io/cxgbe/common/t4_hw.c    |   30 +-
 usr/src/uts/common/io/cxgbe/shared/shared.h   |    4 +-
 usr/src/uts/common/io/cxgbe/t4nex/adapter.h   |  966 +++---
 usr/src/uts/common/io/cxgbe/t4nex/cudbg.h     |    2 +-
 usr/src/uts/common/io/cxgbe/t4nex/cudbg_lib.c |    2 +-
 usr/src/uts/common/io/cxgbe/t4nex/t4_ioctl.c  |    1 -
 usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c    |  371 +--
 usr/src/uts/common/io/cxgbe/t4nex/t4_nexus.c  | 2357 ++++++++------
 usr/src/uts/common/io/cxgbe/t4nex/t4_sge.c    | 2894 +++++++++--------
 usr/src/uts/intel/cxgbe/t4nex/Makefile        |    9 +-
 11 files changed, 3820 insertions(+), 2818 deletions(-)

diff --git a/usr/src/uts/common/io/cxgbe/common/common.h b/usr/src/uts/common/io/cxgbe/common/common.h
index 22a28e4a3d..cabe280784 100644
--- a/usr/src/uts/common/io/cxgbe/common/common.h
+++ b/usr/src/uts/common/io/cxgbe/common/common.h
@@ -583,7 +583,7 @@ unsigned int t4_link_fwcap_to_speed(fw_port_cap32_t caps);
 fw_port_cap32_t t4_link_fwcap_to_fwspeed(fw_port_cap32_t acaps);
 int t4_link_set_autoneg(struct port_info *pi, u8 autoneg,
 			fw_port_cap32_t *new_caps);
-int t4_link_set_pause(struct port_info *pi, cc_pause_t pause,
+void t4_link_set_pause(struct port_info *pi, cc_pause_t pause,
 		      fw_port_cap32_t *new_caps);
 int t4_link_set_fec(struct port_info *pi, cc_fec_t fec,
 		    fw_port_cap32_t *new_caps);
diff --git a/usr/src/uts/common/io/cxgbe/common/t4_hw.c b/usr/src/uts/common/io/cxgbe/common/t4_hw.c
index 9fb9f9301c..7757fd0b51 100644
--- a/usr/src/uts/common/io/cxgbe/common/t4_hw.c
+++ b/usr/src/uts/common/io/cxgbe/common/t4_hw.c
@@ -449,7 +449,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
 	u32 ctl_reg = PF_REG(mbox, A_CIM_PF_MAILBOX_CTRL);
 	u32 ctl;
 	__be64 cmd_rpl[MBOX_LEN/8];
-	struct t4_mbox_list entry;
+	t4_mbox_waiter_t entry;
 	u32 pcie_fw;
 
 	if ((size & 15) || size > MBOX_LEN)
@@ -469,7 +469,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
 	 * wait [for a while] till we're at the front [or bail out with an
 	 * EBUSY] ...
 	 */
-	t4_mbox_list_add(adap, &entry);
+	t4_mbox_waiter_add(adap, &entry);
 
 	for (i = 0; ; i++) {
 		/*
@@ -481,28 +481,15 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
 		 */
 		pcie_fw = t4_read_reg(adap, A_PCIE_FW);
 		if (i > 4*timeout || (pcie_fw & F_PCIE_FW_ERR)) {
-			t4_mbox_list_del(adap, &entry);
+			t4_mbox_waiter_remove(adap, &entry);
 			t4_report_fw_error(adap);
 			ret = (pcie_fw & F_PCIE_FW_ERR) ? -ENXIO : -EBUSY;
 			T4_RECORD_MBOX(adap, cmd, size, ret, 0);
 			return ret;
 		}
 
-		/*
-		 * If we're at the head, break out and start the mailbox
-		 * protocol.
-		 */
-		if (t4_mbox_list_first_entry(adap) == &entry)
+		if (t4_mbox_wait_owner(adap, MBOX_CMD_DELAY, sleep_ok)) {
 			break;
-
-		/*
-		 * Delay for a bit before checking again ...
-		 */
-		if (sleep_ok) {
-			usleep_range(MIN_MBOX_CMD_DELAY, MBOX_CMD_DELAY);
-		} else {
-			T4_OS_TOUCH_NMI_WATCHDOG();
-			udelay(MBOX_CMD_DELAY);
 		}
 	}
 #ifdef T4_OS_LOG_MBOX_CMDS
@@ -524,7 +511,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
 	 * mailbox atomic access list and report the error to our caller.
 	 */
 	if (v != X_MBOWNER_PL) {
-		t4_mbox_list_del(adap, &entry);
+		t4_mbox_waiter_remove(adap, &entry);
 		t4_report_fw_error(adap);
 		ret = (v == X_MBOWNER_FW) ? -EBUSY : -ETIMEDOUT;
 		T4_RECORD_MBOX(adap, cmd, size, access, ret);
@@ -597,7 +584,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
 			 */
 			get_mbox_rpl(adap, cmd_rpl, size/8, data_reg);
 			t4_write_reg(adap, ctl_reg, V_MBOWNER(X_MBOWNER_NONE));
-			t4_mbox_list_del(adap, &entry);
+			t4_mbox_waiter_remove(adap, &entry);
 
 			T4_RECORD_MBOX(adap, cmd_rpl, size, access, i + 1);
 
@@ -625,7 +612,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
 	 * the error and also check to see if the firmware reported any
 	 * errors ...
 	 */
-	t4_mbox_list_del(adap, &entry);
+	t4_mbox_waiter_remove(adap, &entry);
 
 	ret = (pcie_fw & F_PCIE_FW_ERR) ? -ENXIO : -ETIMEDOUT;
 	T4_RECORD_MBOX(adap, cmd, size, access, ret);
@@ -9099,7 +9086,7 @@ int t4_link_set_autoneg(struct port_info *pi, u8 autoneg,
 	return 0;
 }
 
-int t4_link_set_pause(struct port_info *pi, cc_pause_t pause,
+void t4_link_set_pause(struct port_info *pi, cc_pause_t pause,
 		      fw_port_cap32_t *new_caps)
 {
 	struct link_config *lc = &pi->link_cfg;
@@ -9128,7 +9115,6 @@ int t4_link_set_pause(struct port_info *pi, cc_pause_t pause,
 		caps |= FW_PORT_CAP32_FORCE_PAUSE;
 
 	*new_caps = caps;
-	return 0;
 }
 
 #define T4_LINK_FEC_MASK V_FW_PORT_CAP32_FEC(M_FW_PORT_CAP32_FEC)
diff --git a/usr/src/uts/common/io/cxgbe/shared/shared.h b/usr/src/uts/common/io/cxgbe/shared/shared.h
index 8a1f682be0..158728427f 100644
--- a/usr/src/uts/common/io/cxgbe/shared/shared.h
+++ b/usr/src/uts/common/io/cxgbe/shared/shared.h
@@ -21,7 +21,7 @@
  */
 
 /*
- * Copyright 2024 Oxide Computer Company
+ * Copyright 2025 Oxide Computer Company
  */
 
 #ifndef __CXGBE_SHARED_H
@@ -51,7 +51,7 @@
 #define	CH_DBG(sc, category, fmt, ...)		do {} while (0)
 #endif
 
-extern int cxgb_printf(dev_info_t *dip, int level, char *f, ...);
+extern void cxgb_printf(dev_info_t *dip, int level, char *f, ...);
 
 /* Attach/detach logic used by cxgbe, calling into t4nex */
 struct port_info;
diff --git a/usr/src/uts/common/io/cxgbe/t4nex/adapter.h b/usr/src/uts/common/io/cxgbe/t4nex/adapter.h
index ea147b19da..34659a32c6 100644
--- a/usr/src/uts/common/io/cxgbe/t4nex/adapter.h
+++ b/usr/src/uts/common/io/cxgbe/t4nex/adapter.h
@@ -30,129 +30,122 @@
 #include <sys/ddi.h>
 #include <sys/mac_provider.h>
 #include <sys/ethernet.h>
-#include <sys/queue.h>
+#include <sys/list.h>
 #include <sys/containerof.h>
 #include <sys/ddi_ufm.h>
+#include <sys/mac_provider.h>
 
 #include "firmware/t4fw_interface.h"
 #include "shared.h"
 
 struct adapter;
+struct port_info;
 typedef struct adapter adapter_t;
+struct sge_fl;
+
+/* See the _Ingress Context Contents_ section of the T4 Programmers Guide. */
+typedef enum t4_iq_esize {
+	T4_IQ_ESIZE_16B = 0,
+	T4_IQ_ESIZE_32B = 1,
+	T4_IQ_ESIZE_64B = 2,
+	T4_IQ_ESIZE_128B = 3,
+} t4_iq_esize_t;
 
 #define	FW_IQ_QSIZE	256
-#define	FW_IQ_ESIZE	64	/* At least 64 mandated by the firmware spec */
+/* At least 64 bytes mandated by the firmware spec */
+#define	FW_IQ_ESIZE	T4_IQ_ESIZE_64B
+
+#define	T4_RX_DEF_QSIZE	1024
+/* At least 64 so CPL_RX_PKT will fit */
+#define	RX_IQ_ESIZE	T4_IQ_ESIZE_64B
+
+/* A flit is an 8 byte quantity. */
+#define	FLIT_NUM_BYTES		8
+#define	FLITS_TO_BYTES(nflits)	((nflits) * FLIT_NUM_BYTES)
 
-#define	RX_IQ_QSIZE	1024
-#define	RX_IQ_ESIZE	64	/* At least 64 so CPL_RX_PKT will fit */
+/*
+ * Egress Queues (EQ) are made up of units called "host credits". Each credit is
+ * always 8 flits (64 bytes) in size. The number of entries in the queue as well
+ * as the producer and consumer indexes (pidx/cidx) are phrased in units of
+ * credits.
+ *
+ * A freelist (FL) is a type of EQ. It consists of 16-byte aligned, 8-byte
+ * pointers to data buffers meant to hold the data of incoming packets. Since an
+ * EQ host credit is always 8 flits, and an FL buffer pointer is a single flit,
+ * each credit holds 8 FL buffer pointers.
+ *
+ */
+#define	FLITS_PER_EQ_HC		8
+#define	EQ_HC_SIZE		(FLITS_PER_EQ_HC * FLIT_NUM_BYTES)
+#define	FL_BUF_PTR_PER_HC	FLITS_PER_EQ_HC
 
-#define	EQ_ESIZE	64	/* All egress queues use this entry size */
+/*
+ * Given a number of host credits, calculate the total number of flits
+ * contained in those credits.
+ */
+#define	EQ_HC_TO_FLITS(num_credits)	((num_credits) * FLITS_PER_EQ_HC)
 
-#define	RX_FL_ESIZE	64	/* 8 64bit addresses */
+/*
+ * Given a number of flits, calculate how many host credits are needed to hold
+ * them.
+ */
+#define	EQ_FLITS_TO_HC(num_flits)	(howmany(num_flits, FLITS_PER_EQ_HC))
 
-#define	FL_BUF_SIZES	4
+/*
+ * We constrain the max "usable" EQ size so that there is always room for the
+ * status page, which may require 1-2 host credits.
+ */
+#define	T4_MAX_EQ_SIZE		(UINT16_MAX - 2)
+#define	T4_TX_DEF_QSIZE		1024
+#define	TX_SGL_SEGS		36
 
-#define	CTRL_EQ_QSIZE	128
+/* The maximum number of flits/credits a single WR may consume. */
+#define	TX_WR_MAX_FLITS		(SGE_MAX_WR_LEN / FLIT_NUM_BYTES)
+#define	TX_WR_MAX_CREDITS	(TX_WR_MAX_FLITS / FLITS_PER_EQ_HC)
 
-#define	TX_EQ_QSIZE	1024
-#define	TX_SGL_SEGS	36
-#define	TX_WR_FLITS	(SGE_MAX_WR_LEN / 8)
+CTASSERT(TX_WR_MAX_FLITS == 64);
+CTASSERT(TX_WR_MAX_CREDITS == 8);
 
 #define	UDBS_SEG_SHIFT	7	/* log2(UDBS_SEG_SIZE) */
 #define	UDBS_DB_OFFSET	8	/* offset of the 4B doorbell in a segment */
 #define	UDBS_WR_OFFSET	64	/* offset of the work request in a segment */
 
-typedef enum t4_port_flags {
-	TPF_INIT_DONE	= (1 << 0),
-	TPF_OPEN	= (1 << 1),
-} t4_port_flags_t;
-
-typedef enum t4_port_feat {
-	CXGBE_HW_LSO	= (1 << 0),
-	CXGBE_HW_CSUM	= (1 << 1),
-} t4_port_feat_t;
-
-struct port_info {
-	dev_info_t *dip;
-	mac_handle_t mh;
-	mac_callbacks_t *mc;
-	int mtu;
-	uint8_t hw_addr[ETHERADDRL];
-
-	kmutex_t lock;
-	struct adapter *adapter;
-
-	t4_port_flags_t flags;
-
-	uint16_t viid;
-	int16_t  xact_addr_filt; /* index of exact MAC address filter */
-	uint16_t rss_size;	/* size of VI's RSS table slice */
-	uint16_t ntxq;		/* # of tx queues */
-	uint16_t first_txq;	/* index of first tx queue */
-	uint16_t nrxq;		/* # of rx queues */
-	uint16_t first_rxq;	/* index of first rx queue */
-	uint8_t  lport;		/* associated offload logical port */
-	int8_t   mdio_addr;
-	uint8_t  port_type;
-	uint8_t  mod_type;
-	uint8_t  port_id;
-	uint8_t  tx_chan;
-	uint8_t  rx_chan;
-	uint8_t  rx_cchan;
-	uint8_t instance; /* Associated adapter instance */
-	uint8_t child_inst; /* Associated child instance */
-
-	uint8_t	tmr_idx;
-	int8_t	pktc_idx;
-	uint8_t	dbq_timer_idx;
-
-	struct link_config link_cfg;
-	struct port_stats stats;
-	t4_port_feat_t features;
-	uint8_t macaddr_cnt;
-	u8 rss_mode;
-	u16 viid_mirror;
-	kstat_t *ksp_config;
-	kstat_t *ksp_info;
-	kstat_t *ksp_fec;
-
-	u8 vivld;
-	u8 vin;
-	u8 smt_idx;
+/*
+ * A sentinel to mark when the interrupts for an IQ are being forwarded from
+ * another IQ which is receiving the actual interrupt.
+ */
+#define	INTR_FORWARDED	UINT_MAX
 
-	u8 vivld_mirror;
-	u8 vin_mirror;
-	u8 smt_idx_mirror;
+struct fl_desc {
+	uint64_t dptr[FL_BUF_PTR_PER_HC];
 };
 
 struct fl_sdesc {
 	struct rxbuf *rxb;
 };
 
-struct tx_desc {
-	__be64 flit[8];
-};
+typedef struct t4_eq_host_credit {
+	uint64_t flit[8];
+} t4_eq_host_credit_t;
 
 struct tx_sdesc {
 	mblk_t *mp_head;
 	mblk_t *mp_tail;
 	uint32_t txb_used;	/* # of bytes of tx copy buffer used */
 	uint16_t hdls_used;	/* # of dma handles used */
-	uint16_t desc_used;	/* # of hardware descriptors used */
+	uint16_t credits_used;	/* # of EQ host credits used */
 	uint64_t _pad;
 };
 
 typedef enum t4_iq_flags {
-	IQ_ALLOCATED	= (1 << 0),	/* firmware resources allocated */
-	IQ_INTR		= (1 << 1),	/* iq takes direct interrupt */
-	IQ_HAS_FL	= (1 << 2),	/* iq has fl */
-} t4_iq_flags_t;
+	IQ_ALLOC_HOST	= (1 << 0),	/* host-side resources allocated */
+	IQ_ALLOC_DEV	= (1 << 1),	/* device-side resource allocated */
+	IQ_INTR		= (1 << 2),	/* iq takes direct interrupt */
 
-typedef enum t4_iq_state {
-	IQS_DISABLED	= 0,
-	IQS_BUSY	= 1,
-	IQS_IDLE	= 2,
-} t4_iq_state_t;
+	/* Runtime state flags: */
+	IQ_ENABLED	= (1 << 3),
+	IQ_POLLING	= (1 << 4),
+} t4_iq_flags_t;
 
 struct rxbuf_cache_params {
 	dev_info_t		*dip;
@@ -162,8 +155,8 @@ struct rxbuf_cache_params {
 };
 
 struct sge_iq_stats {
-	uint64_t sis_overflow;
-	uint64_t sis_processed;
+	uint64_t sis_processed;	/* # entries processed from IQ */
+	uint64_t sis_overflow;	/* # entries bearing overflow flag */
 };
 
 /*
@@ -172,141 +165,330 @@ struct sge_iq_stats {
  *
  * See: t4_iq_update_intr_cfg() and t4_iq_gts_update().
  */
-typedef enum t4_intr_config {
-	TIC_SE_INTR_ARM		= 1,
-	TIC_TIMER0		= (0 << 1),
-	TIC_TIMER1		= (1 << 1),
-	TIC_TIMER2		= (2 << 1),
-	TIC_TIMER3		= (3 << 1),
-	TIC_TIMER4		= (4 << 1),
-	TIC_TIMER5		= (5 << 1),
-	TIC_START_COUNTER	= (6 << 1),
-} t4_intr_config_t;
+typedef enum t4_gts_config {
+	TGC_SE_INTR_ARM		= 1,
+	TGC_TIMER0		= (0 << 1),
+	TGC_TIMER1		= (1 << 1),
+	TGC_TIMER2		= (2 << 1),
+	TGC_TIMER3		= (3 << 1),
+	TGC_TIMER4		= (4 << 1),
+	TGC_TIMER5		= (5 << 1),
+	TGC_START_COUNTER	= (6 << 1),
+} t4_gts_config_t;
 
 /*
- * Ingress Queue: T4 is producer, driver is consumer.
+ * Event IQs are used for firmware events, Tx EGR updates, and IQ forwarded
+ * interrupts.
+ *
+ * Ethernet Rx IQs are used for receiving incoming packets.
  */
-struct sge_iq {
-	t4_iq_state_t state;
-	t4_iq_flags_t flags;
-	t4_intr_config_t intr_params;
-
-	ddi_dma_handle_t dhdl;
-	ddi_acc_handle_t ahdl;
-
-	__be64 *desc;		/* KVA of descriptor ring */
-	uint64_t ba;		/* bus address of descriptor ring */
-	const __be64 *cdesc;	/* current descriptor */
-	struct adapter *adapter; /* associated  adapter */
-	uint8_t  gen;		/* generation bit */
-	int8_t   intr_pktc_idx;	/* packet count threshold index */
-	uint8_t  esize;		/* size (bytes) of each entry in the queue */
-	uint16_t qsize;		/* size (# of entries) of the queue */
-	uint16_t cidx;		/* consumer index */
-	uint16_t pending;	/* # of descs processed since last doorbell */
-	uint16_t cntxt_id;	/* SGE context id  for the iq */
-	uint16_t abs_id;	/* absolute SGE id for the iq */
-	kmutex_t lock;		/* Rx access lock */
-	uint8_t polling;
-
-	struct sge_iq_stats stats;
-
-	STAILQ_ENTRY(sge_iq) link;
+typedef enum t4_iq_type {
+	TIQT_UNINIT,
+	TIQT_EVENT,
+	TIQT_ETH_RX,
+} t4_iq_type_t;
+
+/* Ingress Queue: T4 is producer, driver is consumer. */
+typedef struct t4_sge_iq {
+	kmutex_t tsi_lock;
+
+	t4_iq_type_t tsi_iqtype; /* Write Once */
+	t4_iq_flags_t tsi_flags; /* tsi_lock */
+
+	/*
+	 * This field is non-NULL only for Rx queues. It points to the event
+	 * queue which receives interrupts on its behalf. The event queue
+	 * processes these "forwarded interrupts" in t4_process_event_iq() and
+	 * calls into t4_process_rx_iq() for each Rx queue with an interrupt
+	 * notification message.
+	 */
+	struct t4_sge_iq *tsi_intr_evtq; /* Write Once */
+	/*
+	 * This field is only used by the event queues.
+	 *
+	 * As the event queue processes forwarded interrupt notification
+	 * messages it adds the destination rx queue receiving the notification
+	 * to this list. After the event queue finishes processing its own
+	 * messages, it then uses this list to process the rx queues which have
+	 * pending notifications.
+	 */
+	list_node_t tsi_intr_fwd_node; /* tsi_lock */
+	/*
+	 * This field is used by both event queues and rx queues.
+	 *
+	 * For event queues this field holds the interrupt vector assigned to
+	 * this queue.
+	 *
+	 * For rx queues it holds the sentinel value INTR_FORWARDED to indicate
+	 * it has its interrupts forwarded by the event queue. In the current
+	 * iteration of this driver all rx queues will always have their
+	 * interrupts forwarded.
+	 */
+	uint_t tsi_intr_idx;	/* Write Once */
+
+	ddi_dma_handle_t tsi_desc_dhdl; /* Write Once */
+	ddi_acc_handle_t tsi_desc_ahdl; /* Write Once */
+
+	/* KVA of descriptor ring */
+	void *tsi_desc;		/* Write Once */
+	/* bus address of descriptor ring */
+	uint64_t tsi_desc_ba;	/* Write Once */
+	/* current descriptor (at CIDX) */
+	const void *tsi_cdesc;	/* tsi_lock */
+
+	/* Sizing and status */
+	/* size of each entry in the queue */
+	t4_iq_esize_t tsi_esize;  /* Write Once */
+	/* entry size in bytes */
+	uint16_t tsi_esize_bytes; /* Write Once */
+	/* number of entries in the queue */
+	uint16_t tsi_qsize;	/* Write Once */
+	/* number of usable entries in the queue */
+	uint16_t tsi_cap;	/* Write Once */
+	/* consumer index */
+	uint16_t tsi_cidx;	/* tsi_lock */
+	/* generation bit */
+	uint8_t tsi_gen;	/* tsi_lock */
+
+	/* GTS config to re-arm queue notification */
+	t4_gts_config_t tsi_gts_rearm; /* tsi_lock */
+	/* packet count threshold index */
+	int8_t tsi_intr_pktc_idx; /* tsi_lock */
+
+	/* SGE context ID for IQ */
+	uint16_t tsi_cntxt_id;	/* Write Once */
+	/* absolute SGE ID for IQ */
+	uint16_t tsi_abs_id;	/* Write Once */
+
+	/* associated adapter */
+	struct adapter *tsi_adapter; /* Write Once */
+	/* associated freelist (if any) */
+	struct sge_fl *tsi_fl;	/* Write Once */
+
+	struct sge_iq_stats tsi_stats; /* tsi_lock */
+} t4_sge_iq_t;
+
+/* Result of servicing IQ in t4_iq_service() call */
+typedef enum t4_iq_result {
+	TIR_SUCCESS,	/* All available entries processed successfully */
+	TIR_DISABLED,	/* IQ is disabled */
+	TIR_POLLING,	/* non-polling service req'd on polling-cfg'd IQ */
+	TIR_ALLOC_FAIL,	/* could not allocate packet buffer(s) */
+	TIR_BUDGET_MAX,	/* hit budget limit while processing entries */
+} t4_iq_result_t;
+
+/*
+ * Details used when servicing an IQ as part of polling.
+ */
+struct t4_poll_req {
+	mblk_t	*tpr_mp;
+	uint_t	tpr_byte_budget;
 };
 
 typedef enum t4_eq_flags {
 	/* Initialization state flags: */
-	EQ_ALLOCATED	= (1 << 0),	/* firmware resources allocated */
-	EQ_MTX		= (1 << 1),	/* mutex has been initialized */
+	EQ_ALLOC_HOST	= (1 << 0),	/* host-side resources allocated */
+	EQ_ALLOC_DEV	= (1 << 1),	/* EQ allocated in device firmware */
+	EQ_ALLOC_DESC	= (1 << 2),	/* descriptor inputs allocated */
 
 	/* Runtime state flags: */
 
+	EQ_ENABLED	= (1 << 3),	/* ready for submitted work requests */
 	/*
 	 * Short on resources (memory and/or descriptors) while attempting to
 	 * enqueue work in EQ
 	 */
-	EQ_CORKED	= (1 << 2),
+	EQ_CORKED	= (1 << 4),
 } t4_eq_flags_t;
 
-/* Listed in order of preference. */
+/*
+ * These are the Egress Queue doorbell methods. They are listed in order of
+ * preference (WCWR most preferred, KDB least). This ordering is important as
+ * the datapath uses ffs (find first set) to pick the preferred method.
+ *
+ * The first three are "user space" doorbells. They are mapped in BAR2 and are
+ * provided to allow kernel-bypass network stacks. However, they can also be
+ * used in the kernel and provide benefits such as write combining and per-queue
+ * registers (versus KDB which is a single register).
+ *
+ * The WCWR, Write Combining Work Request, is the preferred method. It allows
+ * the driver to push a WR directly to the device without the need for it to
+ * perform a DMA read of the hardware ring (to read the EQ host credit). Instead
+ * it comes in via the BAR2/UDB memory space and the device increments the pidx
+ * accordingly. However, the WCWR is limited to a single WR. Its use is
+ * intended for low latency situations or low rate of work, not for throughput.
+ *
+ * The maximum WCWR for T4 is 256 bytes. For T5/T6 it is 64-128 bytes, depending
+ * on the write-combining size of the platform.
+ *
+ * T4 Firmware Interface Specification, §9.2 Egress Queues and Work Requests.
+ */
 typedef enum t4_doorbells {
-	DOORBELL_UDB	= (1 << 0),
-	DOORBELL_WCWR	= (1 << 1),
-	DOORBELL_UDBWC	= (1 << 2),
+	DOORBELL_WCWR	= (1 << 0),
+	DOORBELL_UDBWC	= (1 << 1),
+	DOORBELL_UDB	= (1 << 2),
 	DOORBELL_KDB	= (1 << 3),
 } t4_doorbells_t;
 
-/*
- * Egress Queue: driver is producer, T4 is consumer.
- *
- * Note: A free list is an egress queue (driver produces the buffers and T4
- * consumes them) but it's special enough to have its own struct (see sge_fl).
- */
-struct sge_eq {
-	ddi_dma_handle_t desc_dhdl;
-	ddi_acc_handle_t desc_ahdl;
-	t4_eq_flags_t flags;
-	kmutex_t lock;
+/* Egress Queue: driver is producer, T4 is consumer. */
+typedef struct t4_sge_eq {
+	kmutex_t tse_lock;
 
-	struct tx_desc *desc;	/* KVA of descriptor ring */
-	uint64_t ba;		/* bus address of descriptor ring */
-	struct sge_qstat *spg;	/* status page, for convenience */
-	t4_doorbells_t doorbells;
-	caddr_t udb;		/* KVA of doorbell (lies within BAR2) */
-	uint_t udb_qid;		/* relative qid within the doorbell page */
-	uint16_t cap;		/* max # of desc, for convenience */
-	uint16_t avail;		/* available descriptors, for convenience */
-	uint16_t qsize;		/* size (# of entries) of the queue */
-	uint16_t cidx;		/* consumer idx (desc idx) */
-	uint16_t pidx;		/* producer idx (desc idx) */
-	uint16_t pending;	/* # of descriptors used since last doorbell */
-	uint16_t iqid;		/* iq that gets egr_update for the eq */
-	uint8_t tx_chan;	/* tx channel used by the eq */
-	uint32_t cntxt_id;	/* SGE context id for the eq */
-};
+	t4_eq_flags_t tse_flags;	/* tse_lock */
 
-typedef enum t4_fl_flags {
-	FL_MTX		= (1 << 0),	/* mutex has been initialized */
-	FL_STARVING	= (1 << 1),	/* on the list of starving fl's */
-	FL_DOOMED	= (1 << 2),	/* about to be destroyed */
-} t4_fl_flags_t;
+	ddi_dma_handle_t tse_ring_dhdl; /* Write Once */
+	ddi_acc_handle_t tse_ring_ahdl; /* Write Once */
+
+	/*
+	 * The ring type is pointer to void because the ring does not consist of
+	 * descriptors but rather host credits. These host credits carry
+	 * variable length work requests (WR) as well as the status page (SP) at
+	 * the end of the ring. We use void* to facilitate the type punning
+	 * required to work with these various types of EQ entries.
+	 *
+	 * In order to access credits and their individual flits we make use of
+	 * the t4_eq_host_credit_t type.
+	 */
+	void *tse_ring;		/* KVA of ring - Write Once */
+	uint64_t tse_ring_ba;	/* bus address of ring - Write Once */
+
+	/*
+	 * tse_qsize: The number of host credits that may be used for data. This
+	 * value is static for the lifetime of the queue.
+	 *
+	 * tse_qsize_spg: The total number of host credits in the queue. This is
+	 * 1-2 more credits than tse_qsize to account for the status page at the
+	 * end of the queue. The status page credits cannot be used for sending
+	 * data, rather the beginning of the status page is considered the end
+	 * of the queue as far as the datapath is concerned.
+	 *
+	 * tse_avail: The number of host credits that are currently available
+	 * for use by the host. This is never more than 'tse_qsize - 1' in order
+	 * to avoid 'tse_pidx==tse_cidx' which we use to indicate an empty
+	 * queue. This number is updated as credits are used/recycled.
+	 *
+	 * tse_pending: The number of credits that have been written by the host
+	 * but still require a doorbell before the device can consume them. Said
+	 * another way, it's the number of credits the host's pidx is ahead of
+	 * the device's cidx.
+	 */
+	uint16_t tse_qsize;	/* Write Once */
+	uint16_t tse_qsize_spg;	/* Write Once */
+	uint16_t tse_avail;	/* tse_lock */
+	uint16_t tse_pending;	/* tse_lock */
 
-#define	FL_RUNNING_LOW(fl)	(fl->cap - fl->needed <= fl->lowat)
-#define	FL_NOT_RUNNING_LOW(fl)	(fl->cap - fl->needed >= 2 * fl->lowat)
+	/*
+	 * The pidx is the driver's position in the queue, pointing to the next
+	 * credit to consume. The cidx is the device's position in the queue,
+	 * pointing to the last credit it has consumed as of the last status
+	 * update.
+	 */
+	uint16_t tse_cidx;	/* tse_lock */
+	uint16_t tse_pidx;	/* tse_lock */
+
+	/* Doorbell bits */
+	t4_doorbells_t tse_doorbells; /* Write Once */
+	/* KVA of doorbell (lies within BAR2) */
+	caddr_t tse_udb;	/* Write Once */
+	/* relative qid within the doorbell page */
+	uint_t tse_udb_qid;	/* Write Once */
+
+	struct sge_qstat *tse_spg;	/* status page - Write Once */
+	/* IQ that gets egr_update msg for EQ */
+	uint16_t tse_iqid;		/* Write Once */
+	/* tx channel used by the EQ */
+	uint8_t tse_tx_chan;		/* Write Once */
+	/* SGE context id for the EQ */
+	uint32_t tse_cntxt_id;		/* Write Once */
+} t4_sge_eq_t;
+
+typedef enum t4_sfl_flags {
+	SFL_STARVING	= (1 << 0),	/* on the list of starving fl's */
+	SFL_DOOMED	= (1 << 1),	/* about to be destroyed */
+} t4_sfl_flags_t;
+
+struct sge_fl_stats {
+	/* These stats describe the receiving of data. */
+	uint64_t copy;		/* # of frames copied (allocb) */
+	uint64_t copy_fail;	/* # of allocb failures */
+	uint64_t wrap;		/* # of frames wrapped (desballoc) */
+	uint64_t wrap_fail;	/* # of desballoc failures */
+
+	/* These stats describe the refilling of rx (FL) buffers. */
+	uint64_t rxb_recycle;	/* # of rx buffers recycled */
+	uint64_t rxb_alloc;	/* # of rx buffers allocated */
+	uint64_t rxb_alloc_fail; /* # of rx buffers that failed to allocb */
+};
 
 struct sge_fl {
-	t4_fl_flags_t flags;
-	kmutex_t lock;
-	ddi_dma_handle_t dhdl;
-	ddi_acc_handle_t ahdl;
-
-	__be64 *desc;		/* KVA of descriptor ring, ptr to addresses */
-	uint64_t ba;		/* bus address of descriptor ring */
-	struct fl_sdesc *sdesc;	/* KVA of software descriptor ring */
-	uint32_t cap;		/* max # of buffers, for convenience */
-	uint16_t qsize;		/* size (# of entries) of the queue */
-	uint16_t cntxt_id;	/* SGE context id for the freelist */
-	uint32_t cidx;		/* consumer idx (buffer idx, NOT hw desc idx) */
-	uint32_t pidx;		/* producer idx (buffer idx, NOT hw desc idx) */
-	uint32_t needed;	/* # of buffers needed to fill up fl. */
-	uint32_t lowat;		/* # of buffers <= this means fl needs help */
-	uint32_t pending;	/* # of bufs allocated since last doorbell */
-	uint32_t offset;	/* current packet within the larger buffer */
-	uint16_t copy_threshold; /* anything this size or less is copied up */
+	/*
+	 * EQ for passing freelist entries to adapter.
+	 * Must be first field in struct
+	 */
+	t4_sge_eq_t eq;		/* Write Once */
+
+	/*
+	 * Index at which new buffers are to be placed in the FL descriptor
+	 * which is currently being produced for the device.
+	 */
+	uint8_t cidx_sdesc;	/* FL_LOCK */
+	uint8_t pidx_sdesc;	/* FL_LOCK */
+
+	/* KVA of the software descriptor ring. */
+	struct fl_sdesc *sdesc;	/* Write Once */
+	/* Total number of buffers in the FL.  */
+	uint32_t bufs_cap;	/* Write Once */
+	/*
+	 * Number of buffers available to receive data, buffers owned by the
+	 * device.
+	 */
+	uint32_t bufs_avail;	 /* FL_LOCK */
+	/* Number of buffers at which the FL is considered "starving". */
+	uint32_t bufs_lowat;	/* Write Once */
+	/* The byte offset in the current FL buffer. */
+	uint32_t offset;	/* FL_LOCK */
+	/* Any packet smaller or equal to this is copied (allocb). */
+	uint16_t copy_threshold; /* Write Once */
+
+	/* Starvation-related state for this freelist. */
+	t4_sfl_flags_t sfl_flags; /* adapter->sfl_lock */
+	list_node_t sfl_node;	  /* adapter->sfl_lock */
+
+	struct sge_fl_stats stats; /* FL_LOCK */
+};
 
-	uint64_t copied_up;	/* # of frames copied into mblk and handed up */
-	uint64_t passed_up;	/* # of frames wrapped in mblk and handed up */
-	uint64_t allocb_fail;	/* # of mblk allocation failures */
+struct sge_txq_stats {
+	/* stats for common events first */
+	uint64_t txpkts;	/* # of ethernet packets */
+	uint64_t txbytes;	/* # of ethernet bytes */
+	uint64_t txcsum;	/* # of times hardware assisted with checksum */
+	uint64_t tso_wrs;	/* # of IPv4 TSO work requests */
+	uint64_t imm_wrs;	/* # of work requests with immediate data */
+	uint64_t sgl_wrs;	/* # of work requests with direct SGL */
+	uint64_t txpkt_wrs;	/* # of txpkt work requests (not coalesced) */
+	uint64_t txpkts_wrs;	/* # of coalesced tx work requests */
+	uint64_t txpkts_pkts;	/* # of frames in coalesced tx work requests */
+	uint64_t txb_used;	/* # of tx copy buffers used (64 byte each) */
+	uint64_t hdl_used;	/* # of DMA handles used */
 
-	TAILQ_ENTRY(sge_fl) link; /* All starving freelists */
+	/* stats for not-that-common events */
+	uint32_t txb_full;	/* txb ran out of space */
+	uint32_t dma_hdl_failed; /* couldn't obtain DMA handle */
+	uint32_t dma_map_failed; /* couldn't obtain DMA mapping */
+	uint32_t qfull;		/* out of hardware descriptors */
+	uint32_t pullup_early;	/* # of pullups before starting frame's SGL */
+	uint32_t pullup_late;	/* # of pullups while building frame's SGL */
+	uint32_t pullup_failed;	/* # of failed pullups */
+	uint32_t csum_failed;	/* # of csum reqs we failed to fulfill */
 };
 
-/* txq: SGE egress queue + miscellaneous items */
+/* Ethernet packet transmission queue */
 struct sge_txq {
-	struct sge_eq eq;	/* MUST be first */
+	t4_sge_eq_t eq;
 
-	struct port_info *port;	/* the port this txq belongs to */
+	struct port_info *port;
 	struct tx_sdesc *sdesc;	/* KVA of software descriptor ring */
+
 	mac_ring_handle_t ring_handle;
 
 	/* DMA handles used for tx */
@@ -326,137 +508,234 @@ struct sge_txq {
 	uint32_t txb_avail;	/* # of bytes available */
 	uint16_t copy_threshold; /* anything this size or less is copied up */
 
-	uint64_t txpkts;	/* # of ethernet packets */
-	uint64_t txbytes;	/* # of ethernet bytes */
 	kstat_t *ksp;
+	struct sge_txq_stats stats;
+};
 
+struct sge_rxq_stats {
 	/* stats for common events first */
-
-	uint64_t txcsum;	/* # of times hardware assisted with checksum */
-	uint64_t tso_wrs;	/* # of IPv4 TSO work requests */
-	uint64_t imm_wrs;	/* # of work requests with immediate data */
-	uint64_t sgl_wrs;	/* # of work requests with direct SGL */
-	uint64_t txpkt_wrs;	/* # of txpkt work requests (not coalesced) */
-	uint64_t txpkts_wrs;	/* # of coalesced tx work requests */
-	uint64_t txpkts_pkts;	/* # of frames in coalesced tx work requests */
-	uint64_t txb_used;	/* # of tx copy buffers used (64 byte each) */
-	uint64_t hdl_used;	/* # of DMA handles used */
-
-	/* stats for not-that-common events */
-
-	uint32_t txb_full;	/* txb ran out of space */
-	uint32_t dma_hdl_failed; /* couldn't obtain DMA handle */
-	uint32_t dma_map_failed; /* couldn't obtain DMA mapping */
-	uint32_t qfull;		/* out of hardware descriptors */
-	uint32_t pullup_early;	/* # of pullups before starting frame's SGL */
-	uint32_t pullup_late;	/* # of pullups while building frame's SGL */
-	uint32_t pullup_failed;	/* # of failed pullups */
-	uint32_t csum_failed;	/* # of csum reqs we failed to fulfill */
+	uint64_t rxcsum;	/* # of times hardware assisted with checksum */
+	uint64_t rxpkts;	/* # of ethernet packets */
+	uint64_t rxbytes;	/* # of ethernet bytes */
 };
 
-/* rxq: SGE ingress queue + SGE free list + miscellaneous items */
+/* Ethernet packet receive queue */
 struct sge_rxq {
-	struct sge_iq iq;	/* MUST be first */
-	struct sge_fl fl;
+	t4_sge_iq_t iq;
+	struct sge_fl fl;	/* Freelist for packet receive buffers */
 
-	struct port_info *port;	/* the port this rxq belongs to */
-	kstat_t *ksp;
+	struct port_info *port;
 
 	mac_ring_handle_t ring_handle;
 	uint64_t ring_gen_num;
 
-	/* stats for common events first */
+	kstat_t *ksp;
+	struct sge_rxq_stats stats;
+};
 
-	uint64_t rxcsum;	/* # of times hardware assisted with checksum */
-	uint64_t rxpkts;	/* # of ethernet packets */
-	uint64_t rxbytes;	/* # of ethernet bytes */
+typedef enum t4_port_flags {
+	TPF_INIT_DONE	= (1 << 0),
+	TPF_OPEN	= (1 << 1),
+	TPF_VI_ENABLED	= (1 << 2),
+} t4_port_flags_t;
 
-	/* stats for not-that-common events */
+typedef enum t4_port_feat {
+	CXGBE_HW_LSO	= (1 << 0),
+	CXGBE_HW_CSUM	= (1 << 1),
+} t4_port_feat_t;
+
+
+struct port_info {
+	kmutex_t	lock;
+	dev_info_t	*dip;
+	struct adapter	*adapter;
+	uint8_t		port_id;
+
+	t4_port_flags_t	flags;
+	t4_port_feat_t	features;
+
+	mac_handle_t	mh;
+	int		mtu;
+	uint8_t		hw_addr[ETHERADDRL];
+	int16_t		xact_addr_filt; /* index of exact MAC address filter */
+
+	uint16_t	rxq_count;	/* # of RX queues */
+	uint16_t	rxq_start;	/* index of first RX queue */
+	uint16_t	txq_count;	/* # of TX queues */
+	uint16_t	txq_start;	/* index of first TX queue */
+
+	/*
+	 * Array of IQs for queue events, such as interrupt forward events
+	 * for Rx queue processing and completion events for Tx queues.
+	 * Only available when TIP_PER_PORT is selected. The size is based
+	 * on adapter.intr_queue_cfg.intr_per_port.
+	 */
+	t4_sge_iq_t	*intr_iqs;
+
+	kstat_t *ksp_config;
+	kstat_t *ksp_info;
+	kstat_t *ksp_fec;
+
+	/* Port attributes/data set by common code: */
+	uint16_t	viid;
+	uint16_t	rss_size;	/* size of VI's RSS table slice */
+
+	uint8_t		port_type;
+	int8_t		mdio_addr;
+	uint8_t		mod_type;
+
+	uint8_t		lport;
+	uint8_t		tx_chan;
+	uint8_t		rx_chan;
+	uint8_t		rx_cchan;
+
+	uint8_t		rss_mode;
+
+	uint8_t		tmr_idx;
+	int8_t		pktc_idx;
+	uint8_t		dbq_timer_idx;
 
-	uint32_t nomem;		/* mblk allocation during rx failed */
+	struct link_config link_cfg;
+	uint8_t		macaddr_cnt;
+
+	u8 vivld;
+	u8 vin;
+	u8 smt_idx;
+
+	/* Mirroring bits utilized by common code (unused by our driver) */
+	u16 viid_mirror;
+	u8 vivld_mirror;
+	u8 vin_mirror;
 };
 
-struct sge {
-	int fl_starve_threshold;
-	int s_qpp;
+struct sge_info {
+	uint_t fl_starve_threshold;
 	uint64_t dbq_timer_tick;
 	uint16_t dbq_timers[SGE_NDBQTIMERS];
 
-	int nrxq;	/* total rx queues (all ports and the rest) */
-	int ntxq;	/* total tx queues (all ports and the rest) */
-	int niq;	/* total ingress queues */
-	int neq;	/* total egress queues */
-	int stat_len;	/* length of status page at ring end */
-	int pktshift;	/* padding between CPL & packet data */
-	int fl_align;	/* response queue message alignment */
+	uint_t eq_spg_len;	/* EQ status page length in host credits */
+	uint_t pktshift;	/* padding between CPL & packet data */
+	uint_t fl_align;	/* response queue message alignment */
 	uint8_t fwq_tmr_idx;	/* Intr. coalesce timer for FWQ */
 	int8_t fwq_pktc_idx;	/* Intr. coalesce count for FWQ */
 
-	struct sge_iq fwq;	/* Firmware event queue */
-	struct sge_txq *txq;	/* NIC tx queues */
-	struct sge_rxq *rxq;	/* NIC rx queues */
+	t4_sge_iq_t fwq;	/* Firmware event queue */
 
-	uint_t iq_start; /* iq context id map start index */
-	uint_t eq_start; /* eq context id map start index */
-	uint_t iqmap_sz; /* size of iq context id map */
-	uint_t eqmap_sz; /* size of eq context id map */
-	struct sge_iq **iqmap;	/* iq->cntxt_id to iq mapping */
-	struct sge_eq **eqmap;	/* eq->cntxt_id to eq mapping */
+	uint_t rxq_count;	/* total RX queues (all ports and the rest) */
+	uint_t txq_count;	/* total TX queues (all ports and the rest) */
+	struct sge_txq *txq;	/* NIC TX queues */
+	struct sge_rxq *rxq;	/* NIC RX queues */
+
+	/*
+	 * The adapter uses 16-bit "context IDs" to uniquely identify queues.
+	 *
+	 * References to the queues, indexed by said context IDs are maintained
+	 * here, using the start/end values queried from the adapter.
+	 */
+	uint_t iqmap_start;	/* IQ context id map start index */
+	uint_t rxqmap_start;	/* IQ context id map start index */
+	uint_t eqmap_start;	/* EQ context id map start index */
+	uint_t iqmap_sz;	/* size of IQ context id map */
+	uint_t eqmap_sz;	/* size of EQ context id map */
+	t4_sge_iq_t **iqmap;	/* iq->cntxt_id to IQ mapping */
+	t4_sge_eq_t **eqmap;	/* eq->cntxt_id to EQ mapping */
 
 	/* Device access and DMA attributes for all the descriptor rings */
 	ddi_device_acc_attr_t acc_attr_desc;
 	ddi_dma_attr_t	dma_attr_desc;
 
-	/* Device access and DMA attributes for tx buffers */
+	/* Device access and DMA attributes for TX buffers */
 	ddi_device_acc_attr_t acc_attr_tx;
 	ddi_dma_attr_t	dma_attr_tx;
 
-	/* Device access and DMA attributes for rx buffers are in rxb_params */
+	/* Device access and DMA attributes for RX buffers are in rxb_params */
 	kmem_cache_t *rxbuf_cache;
 	struct rxbuf_cache_params rxb_params;
 };
 
 struct driver_properties {
-	int max_ntxq_10g;
-	int max_nrxq_10g;
-	int max_ntxq_1g;
-	int max_nrxq_1g;
-	int intr_types;
-	int tmr_idx_10g;
-	int pktc_idx_10g;
-	int tmr_idx_1g;
-	int pktc_idx_1g;
+	uint8_t ethq_tmr_idx;
+	int8_t ethq_pktc_idx;
 	uint8_t dbq_timer_idx;
 	uint8_t fwq_tmr_idx;
 	int8_t fwq_pktc_idx;
-	int qsize_txq;
-	int qsize_rxq;
+	uint16_t qsize_txq;
+	uint16_t qsize_rxq;
 
 	uint_t holdoff_timer_us[SGE_NTIMERS];
 	uint_t holdoff_pktcnt[SGE_NCOUNTERS];
 
-	int wc;
-
-	int multi_rings;
+	bool write_combine;
 	int t4_fw_install;
 };
 
-struct t4_mbox_list {
-	STAILQ_ENTRY(t4_mbox_list) link;
-};
+typedef struct t4_mbox_waiter {
+	list_node_t node;
+	kthread_t *thread;
+} t4_mbox_waiter_t;
 
 typedef enum t4_adapter_flags {
 	/* Initialization progress status bits */
 	TAF_INIT_DONE	= (1 << 0),
 	TAF_FW_OK	= (1 << 1),
-	TAF_INTR_FWD	= (1 << 2),
-	TAF_INTR_ALLOC	= (1 << 3),
+	TAF_INTR_ALLOC	= (1 << 2),
 
 	/* State & capability bits */
-	TAF_MASTER_PF	= (1 << 4),
-	TAF_DBQ_TIMER	= (1 << 5),
+	TAF_MASTER_PF	= (1 << 8),
+	TAF_DBQ_TIMER	= (1 << 9),
 } t4_adapter_flags_t;
 
+/* Plan for interrupt allocation */
+typedef enum t4_intr_plan {
+	/* Everything on a single interrupt */
+	TIP_SINGLE,
+	/* One for device errors, one FWQ (including forwarded intrs) */
+	TIP_ERR_QUEUES,
+	/* 1 + 1 for errors and FWQ, with rest divided evenly between ports */
+	TIP_PER_PORT,
+} t4_intr_plan_t;
+
+struct t4_intrs_queues {
+	/* The DDI_INTR_TYPE_* value negotiated. */
+	int intr_type;
+
+	/*
+	 * The plan for interrupt allocation, based on the interrupt type
+	 * and number of interrupts available. See the block comment in
+	 * t4_nexus.c for more information.
+	 */
+	t4_intr_plan_t intr_plan;
+
+	/*
+	 * The number of interrupts available (intr_avail) for use vs. the
+	 * number of interrupts the driver has decided to make use of
+	 * (intr_count). These values may be different depending on the
+	 * number available and the port count of the attached part.
+	 */
+	int intr_avail;
+	int intr_count;
+
+	/*
+	 * The number of interrupts per port for use with event queues.
+	 * These interrupts are used to take delivery of Tx recycling
+	 * messages and Rx packet delivery.
+	 */
+	uint_t intr_per_port;
+
+	/*
+	 * Track the number of IQs allocated for use with interrupts. We track
+	 * this to know how many IQs we have leftover for Rx queue usage.
+	 */
+	uint_t num_iqs;
+
+	/* The maximum number of RX/TX queues per port. */
+	uint_t port_max_rxq;
+	uint_t port_max_txq;
+};
+
+/*
+ * WO - Write Once at initialization time.
+ */
 struct adapter {
 	list_node_t node;
 	dev_info_t *dip;
@@ -483,17 +762,15 @@ struct adapter {
 	caddr_t bar2_ptr;
 
 	/* Interrupt information */
-	int intr_type;
-	int intr_count;
+	ddi_intr_handle_t *intr_handle;
 	int intr_cap;
 	uint_t intr_pri;
-	ddi_intr_handle_t *intr_handle;
 
 	struct driver_properties props;
 	kstat_t *ksp;
 	kstat_t *ksp_stat;
 
-	struct sge sge;
+	struct sge_info sge;
 
 	struct port_info *port[MAX_NPORTS];
 	uint8_t chan_map[NCHAN];
@@ -504,13 +781,19 @@ struct adapter {
 
 	unsigned int cfcsum;
 	struct adapter_params params;
+	struct t4_intrs_queues intr_queue_cfg;
 
 	kmutex_t lock;
 	kcondvar_t cv;
 
-	/* Starving free lists */
-	kmutex_t sfl_lock;	/* same cache-line as sc_lock? but that's ok */
-	TAILQ_HEAD(, sge_fl) sfl;
+	/*
+	 * Starving freelist state
+	 *
+	 * sfl_lock protects the `sfl_flags` and `sfl_node` fields in all sge_fl
+	 * structs owned by this adapter.
+	 */
+	kmutex_t sfl_lock;
+	list_t sfl_list;
 	timeout_id_t sfl_timer;
 
 	/* Sensors */
@@ -521,12 +804,8 @@ struct adapter {
 
 	/* support for single-threading access to adapter mailbox registers */
 	kmutex_t mbox_lock;
-	STAILQ_HEAD(, t4_mbox_list) mbox_list;
-};
-
-struct memwin {
-	uint32_t base;
-	uint32_t aperture;
+	kcondvar_t mbox_cv;
+	list_t mbox_list;
 };
 
 #define	ADAPTER_LOCK(sc)		mutex_enter(&(sc)->lock)
@@ -539,30 +818,21 @@ struct memwin {
 #define	PORT_LOCK_ASSERT_OWNED(pi)	ASSERT(mutex_owned(&(pi)->lock))
 #define	PORT_LOCK_ASSERT_NOTOWNED(pi)	ASSERT(!mutex_owned(&(pi)->lock))
 
-#define	IQ_LOCK(iq)			mutex_enter(&(iq)->lock)
-#define	IQ_UNLOCK(iq)			mutex_exit(&(iq)->lock)
-#define	IQ_LOCK_ASSERT_OWNED(iq)	ASSERT(mutex_owned(&(iq)->lock))
-#define	IQ_LOCK_ASSERT_NOTOWNED(iq)	ASSERT(!mutex_owned(&(iq)->lock))
-
-#define	FL_LOCK(fl)			mutex_enter(&(fl)->lock)
-#define	FL_UNLOCK(fl)			mutex_exit(&(fl)->lock)
-#define	FL_LOCK_ASSERT_OWNED(fl)	ASSERT(mutex_owned(&(fl)->lock))
-#define	FL_LOCK_ASSERT_NOTOWNED(fl)	ASSERT(!mutex_owned(&(fl)->lock))
+#define	IQ_LOCK(iq)			mutex_enter(&(iq)->tsi_lock)
+#define	IQ_UNLOCK(iq)			mutex_exit(&(iq)->tsi_lock)
+#define	IQ_LOCK_ASSERT_OWNED(iq)	ASSERT(mutex_owned(&(iq)->tsi_lock))
+#define	IQ_LOCK_ASSERT_NOTOWNED(iq)	ASSERT(!mutex_owned(&(iq)->tsi_lock))
 
-#define	RXQ_LOCK(rxq)			IQ_LOCK(&(rxq)->iq)
-#define	RXQ_UNLOCK(rxq)			IQ_UNLOCK(&(rxq)->iq)
-#define	RXQ_LOCK_ASSERT_OWNED(rxq)	IQ_LOCK_ASSERT_OWNED(&(rxq)->iq)
-#define	RXQ_LOCK_ASSERT_NOTOWNED(rxq)	IQ_LOCK_ASSERT_NOTOWNED(&(rxq)->iq)
+#define	EQ_LOCK(eq)			mutex_enter(&(eq)->tse_lock)
+#define	EQ_UNLOCK(eq)			mutex_exit(&(eq)->tse_lock)
+#define	EQ_LOCK_ASSERT_OWNED(eq)	ASSERT(mutex_owned(&(eq)->tse_lock))
+#define	EQ_LOCK_ASSERT_NOTOWNED(eq)	ASSERT(!mutex_owned(&(eq)->tse_lock))
 
-#define	RXQ_FL_LOCK(rxq)		FL_LOCK(&(rxq)->fl)
-#define	RXQ_FL_UNLOCK(rxq)		FL_UNLOCK(&(rxq)->fl)
-#define	RXQ_FL_LOCK_ASSERT_OWNED(rxq)	FL_LOCK_ASSERT_OWNED(&(rxq)->fl)
-#define	RXQ_FL_LOCK_ASSERT_NOTOWNED(rxq) FL_LOCK_ASSERT_NOTOWNED(&(rxq)->fl)
-
-#define	EQ_LOCK(eq)			mutex_enter(&(eq)->lock)
-#define	EQ_UNLOCK(eq)			mutex_exit(&(eq)->lock)
-#define	EQ_LOCK_ASSERT_OWNED(eq)	ASSERT(mutex_owned(&(eq)->lock))
-#define	EQ_LOCK_ASSERT_NOTOWNED(eq)	ASSERT(!mutex_owned(&(eq)->lock))
+/* Freelist state is protected by its EQ lock */
+#define	FL_LOCK(fl)			EQ_LOCK(&(fl)->eq)
+#define	FL_UNLOCK(fl)			EQ_UNLOCK(&(fl)->eq)
+#define	FL_LOCK_ASSERT_OWNED(fl)	EQ_LOCK_ASSERT_OWNED(&(fl)->eq)
+#define	FL_LOCK_ASSERT_NOTOWNED(fl)	EQ_LOCK_ASSERT_NOTOWNED(&(fl)->eq)
 
 #define	TXQ_LOCK(txq)			EQ_LOCK(&(txq)->eq)
 #define	TXQ_UNLOCK(txq)			EQ_UNLOCK(&(txq)->eq)
@@ -570,38 +840,11 @@ struct memwin {
 #define	TXQ_LOCK_ASSERT_NOTOWNED(txq)	EQ_LOCK_ASSERT_NOTOWNED(&(txq)->eq)
 
 #define	for_each_txq(pi, iter, txq) \
-	txq = &pi->adapter->sge.txq[pi->first_txq]; \
-	for (iter = 0; iter < pi->ntxq; ++iter, ++txq)
+	txq = &pi->adapter->sge.txq[pi->txq_start]; \
+	for (iter = 0; iter < pi->txq_count; ++iter, ++txq)
 #define	for_each_rxq(pi, iter, rxq) \
-	rxq = &pi->adapter->sge.rxq[pi->first_rxq]; \
-	for (iter = 0; iter < pi->nrxq; ++iter, ++rxq)
-
-#define	NFIQ(sc) ((sc)->intr_count > 1 ? (sc)->intr_count - 1 : 1)
-
-/* One for errors, one for firmware events */
-#define	T4_EXTRA_INTR 2
-
-static inline void t4_mbox_list_add(struct adapter *adap,
-				    struct t4_mbox_list *entry)
-{
-	mutex_enter(&adap->mbox_lock);
-	STAILQ_INSERT_TAIL(&adap->mbox_list, entry, link);
-	mutex_exit(&adap->mbox_lock);
-}
-
-static inline void t4_mbox_list_del(struct adapter *adap,
-				    struct t4_mbox_list *entry)
-{
-	mutex_enter(&adap->mbox_lock);
-	STAILQ_REMOVE(&adap->mbox_list, entry, t4_mbox_list, link);
-	mutex_exit(&adap->mbox_lock);
-}
-
-static inline struct t4_mbox_list *
-t4_mbox_list_first_entry(struct adapter *adap)
-{
-	return (STAILQ_FIRST(&adap->mbox_list));
-}
+	rxq = &pi->adapter->sge.rxq[pi->rxq_start]; \
+	for (iter = 0; iter < pi->rxq_count; ++iter, ++rxq)
 
 static inline struct port_info *
 adap2pinfo(struct adapter *sc, int idx)
@@ -609,28 +852,9 @@ adap2pinfo(struct adapter *sc, int idx)
 	return (sc->port[idx]);
 }
 
-static inline struct sge_rxq *
-iq_to_rxq(struct sge_iq *iq)
-{
-	return (__containerof(iq, struct sge_rxq, iq));
-}
-
-static inline bool
-t4_port_is_10xg(const struct port_info *pi)
-{
-	return (pi->link_cfg.pcaps &
-	    (FW_PORT_CAP32_SPEED_400G |
-	    FW_PORT_CAP32_SPEED_200G |
-	    FW_PORT_CAP32_SPEED_100G |
-	    FW_PORT_CAP32_SPEED_50G |
-	    FW_PORT_CAP32_SPEED_40G |
-	    FW_PORT_CAP32_SPEED_25G |
-	    FW_PORT_CAP32_SPEED_10G));
-}
-
 static inline unsigned int t4_use_ldst(struct adapter *adap)
 {
-	return (adap->flags & FW_OK);
+	return (adap->flags & TAF_FW_OK);
 }
 
 static inline void t4_db_full(struct adapter *adap) {}
@@ -652,46 +876,50 @@ t4_cver_ge(const adapter_t *adap, uint8_t ver)
 
 /* t4_nexus.c */
 int t4_port_full_init(struct port_info *);
-void t4_port_queues_enable(struct port_info *pi);
-void t4_port_queues_disable(struct port_info *pi);
 
 uint32_t t4_read_reg(struct adapter *, uint32_t);
 void t4_write_reg(struct adapter *, uint32_t, uint32_t);
 uint64_t t4_read_reg64(struct adapter *, uint32_t);
 void t4_write_reg64(struct adapter *, uint32_t, uint64_t);
 
+void t4_mbox_waiter_add(struct adapter *, t4_mbox_waiter_t *);
+void t4_mbox_waiter_remove(struct adapter *, t4_mbox_waiter_t *);
+bool t4_mbox_wait_owner(struct adapter *, uint_t, bool);
+
 /* t4_debug.c */
 void t4_debug_init(void);
 void t4_debug_fini(void);
 
 /* t4_sge.c */
-void t4_sge_init(struct adapter *sc);
-int t4_alloc_fwq(struct adapter *);
-int t4_free_fwq(struct adapter *);
-int t4_setup_port_queues(struct port_info *pi);
-int t4_teardown_port_queues(struct port_info *pi);
-uint_t t4_intr_all(caddr_t arg1, caddr_t arg2);
-uint_t t4_intr(caddr_t arg1, caddr_t arg2);
-uint_t t4_intr_err(caddr_t arg1, caddr_t arg2);
-void t4_iq_gts_update(struct sge_iq *, t4_intr_config_t, uint16_t);
-void t4_iq_update_intr_cfg(struct sge_iq *, uint8_t, int8_t);
-void t4_eq_update_dbq_timer(struct sge_eq *, struct port_info *);
-int t4_mgmt_tx(struct adapter *sc, mblk_t *m);
+void t4_sge_init(struct adapter *);
+int t4_alloc_evt_iqs(struct adapter *);
+void t4_free_evt_iqs(struct adapter *);
+void t4_port_kstats_init(struct port_info *);
+void t4_port_kstats_fini(struct port_info *);
+int t4_port_queues_init(struct port_info *);
+void t4_port_queues_fini(struct port_info *);
+void t4_port_queues_enable(struct port_info *pi);
+void t4_port_queues_disable(struct port_info *pi);
+uint_t t4_intr_all(caddr_t, caddr_t);
+uint_t t4_intr_err(caddr_t, caddr_t);
+uint_t t4_intr_fwq(caddr_t, caddr_t);
+uint_t t4_intr_port_queue(caddr_t, caddr_t);
+void t4_iq_gts_update(t4_sge_iq_t *, t4_gts_config_t, uint16_t);
+void t4_iq_update_intr_cfg(t4_sge_iq_t *, uint8_t, int8_t);
+void t4_eq_update_dbq_timer(t4_sge_eq_t *, struct port_info *);
 
 mblk_t *t4_eth_tx(void *, mblk_t *);
-mblk_t *t4_mc_tx(void *arg, mblk_t *m);
-mblk_t *t4_ring_rx(struct sge_rxq *rxq, int poll_bytes);
+t4_iq_result_t t4_process_rx_iq(t4_sge_iq_t *, uint_t, struct t4_poll_req *);
 
 /* t4_mac.c */
-void t4_mc_cb_init(struct port_info *);
 void t4_os_link_changed(struct adapter *sc, int idx, int link_stat);
-void t4_mac_rx(struct port_info *pi, struct sge_rxq *rxq, mblk_t *m);
 void t4_mac_tx_update(struct port_info *pi, struct sge_txq *txq);
 int t4_addmac(void *arg, const uint8_t *ucaddr);
 const char **t4_get_priv_props(struct port_info *, size_t *);
 uint8_t t4_choose_holdoff_timer(struct adapter *, uint_t);
 int8_t t4_choose_holdoff_pktcnt(struct adapter *, int);
 uint_t t4_choose_dbq_timer(struct adapter *, uint_t);
+extern mac_callbacks_t t4_mac_callbacks;
 
 /* t4_ioctl.c */
 int t4_ioctl(struct adapter *sc, int cmd, void *data, int mode);
diff --git a/usr/src/uts/common/io/cxgbe/t4nex/cudbg.h b/usr/src/uts/common/io/cxgbe/t4nex/cudbg.h
index e86de21085..64cdbedd93 100644
--- a/usr/src/uts/common/io/cxgbe/t4nex/cudbg.h
+++ b/usr/src/uts/common/io/cxgbe/t4nex/cudbg.h
@@ -318,7 +318,7 @@ static struct el ATTRIBUTE_UNUSED entity_list[] = {
 };
 
 #ifdef _KERNEL
-typedef int (*cudbg_print_cb) (dev_info_t *dip, int, char *, ...);
+typedef void (*cudbg_print_cb) (dev_info_t *dip, int, char *, ...);
 #else
 typedef int (*cudbg_print_cb) (char *, ...);
 #endif
diff --git a/usr/src/uts/common/io/cxgbe/t4nex/cudbg_lib.c b/usr/src/uts/common/io/cxgbe/t4nex/cudbg_lib.c
index e6b5b16667..1b1caa64f5 100644
--- a/usr/src/uts/common/io/cxgbe/t4nex/cudbg_lib.c
+++ b/usr/src/uts/common/io/cxgbe/t4nex/cudbg_lib.c
@@ -3389,7 +3389,7 @@ collect_tid(struct cudbg_init *pdbg_init,
 	rc = compress_buff(&scratch_buff, dbg_buff);
 
 err1:
-	ADAPTER_UNLOCK(padap);
+	ADAPTER_UNLOCK(padap);
 	release_scratch_buff(&scratch_buff, dbg_buff);
 err:
 	return rc;
diff --git a/usr/src/uts/common/io/cxgbe/t4nex/t4_ioctl.c b/usr/src/uts/common/io/cxgbe/t4nex/t4_ioctl.c
index 6ca43b52b9..4263ad691d 100644
--- a/usr/src/uts/common/io/cxgbe/t4nex/t4_ioctl.c
+++ b/usr/src/uts/common/io/cxgbe/t4nex/t4_ioctl.c
@@ -22,7 +22,6 @@
 
 #include <sys/ddi.h>
 #include <sys/sunddi.h>
-#include <sys/queue.h>
 
 #include "t4nex.h"
 #include "common/common.h"
diff --git a/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c b/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c
index 107feaa51f..f6e8a8a609 100644
--- a/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c
+++ b/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c
@@ -31,7 +31,6 @@
 #include <sys/mac_provider.h>
 #include <sys/mac_ether.h>
 #include <sys/strsubr.h>
-#include <sys/queue.h>
 
 #include "common/common.h"
 #include "common/t4_regs.h"
@@ -50,37 +49,20 @@ static int t4_mc_getprop(void *arg, const char *name, mac_prop_id_t id,
 static void t4_mc_propinfo(void *arg, const char *name, mac_prop_id_t id,
     mac_prop_info_handle_t ph);
 
-static int t4_init_synchronized(struct port_info *pi);
-static int t4_uninit_synchronized(struct port_info *pi);
+static int t4_port_enable(struct port_info *pi);
+static int t4_port_disable(struct port_info *pi);
 static void t4_propinfo_priv(struct port_info *, const char *,
     mac_prop_info_handle_t);
 static int t4_getprop_priv(struct port_info *, const char *, uint_t, void *);
 static int t4_setprop_priv(struct port_info *, const char *, const void *);
 
-mac_callbacks_t t4_m_callbacks = {
+mac_callbacks_t t4_mac_callbacks = {
 	.mc_callbacks	= MC_GETCAPAB | MC_PROPERTIES,
 	.mc_getstat	= t4_mc_getstat,
 	.mc_start	= t4_mc_start,
 	.mc_stop	= t4_mc_stop,
 	.mc_setpromisc	= t4_mc_setpromisc,
 	.mc_multicst	= t4_mc_multicst,
-	.mc_unicst	= t4_mc_unicst,
-	.mc_tx		= t4_mc_tx,
-	.mc_getcapab	= t4_mc_getcapab,
-	.mc_setprop	= t4_mc_setprop,
-	.mc_getprop	= t4_mc_getprop,
-	.mc_propinfo	= t4_mc_propinfo,
-};
-
-mac_callbacks_t t4_m_ring_callbacks = {
-	.mc_callbacks	= MC_GETCAPAB | MC_PROPERTIES,
-	.mc_getstat	= t4_mc_getstat,
-	.mc_start	= t4_mc_start,
-	.mc_stop	= t4_mc_stop,
-	.mc_setpromisc	= t4_mc_setpromisc,
-	.mc_multicst	= t4_mc_multicst,
-	.mc_unicst	= NULL, /* t4_addmac */
-	.mc_tx		= NULL, /* t4_eth_tx */
 	.mc_getcapab	= t4_mc_getcapab,
 	.mc_setprop	= t4_mc_setprop,
 	.mc_getprop	= t4_mc_getprop,
@@ -401,7 +383,8 @@ t4_mc_getstat(void *arg, uint_t stat, uint64_t *val)
 		break;
 
 	case MAC_STAT_NORCVBUF:
-		*val = 0;	/* TODO should come from rxq->nomem */
+		/* TODO: pull from freelist stats? */
+		*val = 0;
 		break;
 
 	case MAC_STAT_IERRORS:
@@ -725,7 +708,7 @@ t4_mc_start(void *arg)
 	struct port_info *pi = arg;
 
 	ADAPTER_LOCK(pi->adapter);
-	const int rc = t4_init_synchronized(pi);
+	const int rc = t4_port_enable(pi);
 	ADAPTER_UNLOCK(pi->adapter);
 
 	return (rc);
@@ -737,7 +720,7 @@ t4_mc_stop(void *arg)
 	struct port_info *pi = arg;
 
 	ADAPTER_LOCK(pi->adapter);
-	(void) t4_uninit_synchronized(pi);
+	(void) t4_port_disable(pi);
 	ADAPTER_UNLOCK(pi->adapter);
 }
 
@@ -746,11 +729,10 @@ t4_mc_setpromisc(void *arg, boolean_t on)
 {
 	struct port_info *pi = arg;
 	struct adapter *sc = pi->adapter;
-	int rc;
 
 	ADAPTER_LOCK(sc);
-	rc = -t4_set_rxmode(sc, sc->mbox, pi->viid, -1, on ? 1 : 0, -1, -1, -1,
-	    false);
+	const int rc = -t4_set_rxmode(sc, sc->mbox, pi->viid, -1, on ? 1 : 0,
+	    -1, -1, -1, false);
 	ADAPTER_UNLOCK(sc);
 
 	return (rc);
@@ -766,10 +748,10 @@ t4_mc_multicst(void *arg, boolean_t add, const uint8_t *mcaddr)
 	struct port_info *pi = arg;
 	struct adapter *sc = pi->adapter;
 	struct fw_vi_mac_cmd c;
-	int len16, rc;
+	int rc = 0;
+	int len16 = howmany(sizeof (c.op_to_viid) +
+	    sizeof (c.freemacs_to_len16) + sizeof (c.u.exact[0]), 16);
 
-	len16 = howmany(sizeof (c.op_to_viid) + sizeof (c.freemacs_to_len16) +
-	    sizeof (c.u.exact[0]), 16);
 	c.op_to_viid = htonl(V_FW_CMD_OP(FW_VI_MAC_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | V_FW_VI_MAC_CMD_VIID(pi->viid));
 	c.freemacs_to_len16 = htonl(V_FW_CMD_LEN16(len16));
@@ -781,26 +763,8 @@ t4_mc_multicst(void *arg, boolean_t add, const uint8_t *mcaddr)
 	ADAPTER_LOCK(sc);
 	rc = -t4_wr_mbox_meat(sc, sc->mbox, &c, len16 * 16, &c, true);
 	ADAPTER_UNLOCK(sc);
-	if (rc != 0)
-		return (rc);
-#ifdef DEBUG
-	/*
-	 * TODO: Firmware doesn't seem to return the correct index on removal
-	 * (it gives back 0x3fd FW_VI_MAC_MAC_BASED_FREE unchanged. Remove this
-	 * code once it is fixed.
-	 */
-	else {
-		uint16_t idx;
-
-		idx = G_FW_VI_MAC_CMD_IDX(ntohs(c.u.exact[0].valid_to_idx));
-		cxgb_printf(pi->dip, CE_NOTE,
-		    "%02x:%02x:%02x:%02x:%02x:%02x %s %d", mcaddr[0],
-		    mcaddr[1], mcaddr[2], mcaddr[3], mcaddr[4], mcaddr[5],
-		    add ? "added at index" : "removed from index", idx);
-	}
-#endif
 
-	return (0);
+	return (rc);
 }
 
 int
@@ -808,30 +772,31 @@ t4_mc_unicst(void *arg, const uint8_t *ucaddr)
 {
 	struct port_info *pi = arg;
 	struct adapter *sc = pi->adapter;
-	int rc;
 
-	if (ucaddr == NULL)
+	if (ucaddr == NULL) {
 		return (EINVAL);
+	}
 
 	ADAPTER_LOCK(sc);
 
 	/* We will support adding only one mac address */
-	if (pi->adapter->props.multi_rings && pi->macaddr_cnt) {
+	if (pi->macaddr_cnt) {
 		ADAPTER_UNLOCK(sc);
 		return (ENOSPC);
 	}
-	rc = t4_change_mac(sc, sc->mbox, pi->viid, pi->xact_addr_filt, ucaddr,
-	    true, &pi->smt_idx);
+
+	const int rc = t4_change_mac(sc, sc->mbox, pi->viid, pi->xact_addr_filt,
+	    ucaddr, true, &pi->smt_idx);
 	if (rc < 0) {
-		rc = -rc;
-	} else {
-		pi->macaddr_cnt++;
-		pi->xact_addr_filt = rc;
-		rc = 0;
+		ADAPTER_UNLOCK(sc);
+		return (-rc);
 	}
+
+	pi->macaddr_cnt++;
+	pi->xact_addr_filt = rc;
 	ADAPTER_UNLOCK(sc);
 
-	return (rc);
+	return (0);
 }
 
 int
@@ -845,9 +810,9 @@ t4_remmac(void *arg, const uint8_t *mac_addr)
 {
 	struct port_info *pi = arg;
 
-	ADAPTER_LOCK(pi->adapter);
+	ADAPTER_LOCK(pi->adapter);
 	pi->macaddr_cnt--;
-	ADAPTER_UNLOCK(pi->adapter);
+	ADAPTER_UNLOCK(pi->adapter);
 
 	return (0);
 }
@@ -868,7 +833,7 @@ t4_fill_group(void *arg, mac_ring_type_t rtype, const int rg_index,
 		infop->mgi_stop = NULL;
 		infop->mgi_addmac = t4_addmac;
 		infop->mgi_remmac = t4_remmac;
-		infop->mgi_count = pi->nrxq;
+		infop->mgi_count = pi->rxq_count;
 		break;
 	}
 	case MAC_RING_TYPE_TX:
@@ -879,52 +844,45 @@ t4_fill_group(void *arg, mac_ring_type_t rtype, const int rg_index,
 }
 
 static int
-t4_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
+t4_ring_rx_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
 {
 	struct sge_rxq *rxq = (struct sge_rxq *)rh;
+	t4_sge_iq_t *iq = &rxq->iq;
 
-	RXQ_LOCK(rxq);
+	IQ_LOCK(iq);
 	rxq->ring_gen_num = mr_gen_num;
-	RXQ_UNLOCK(rxq);
+	IQ_UNLOCK(iq);
+
 	return (0);
 }
 
-/*
- * Enable interrupt on the specificed rx ring.
- */
 int
 t4_ring_intr_enable(mac_intr_handle_t intrh)
 {
 	struct sge_rxq *rxq = (struct sge_rxq *)intrh;
-	struct sge_iq *iq = &rxq->iq;
+	t4_sge_iq_t *iq = &rxq->iq;
+
+	IQ_LOCK(iq);
+	iq->tsi_flags &= ~IQ_POLLING;
+	t4_iq_gts_update(iq, iq->tsi_gts_rearm, 0);
+	IQ_UNLOCK(iq);
 
-	RXQ_LOCK(rxq);
-	iq->polling = 0;
-	iq->state = IQS_IDLE;
-	t4_iq_gts_update(iq, iq->intr_params, 0);
-	RXQ_UNLOCK(rxq);
 	return (0);
 }
 
-/*
- * Disable interrupt on the specificed rx ring.
- */
 int
 t4_ring_intr_disable(mac_intr_handle_t intrh)
 {
 	struct sge_rxq *rxq = (struct sge_rxq *)intrh;
-	struct sge_iq *iq;
+	t4_sge_iq_t *iq = &rxq->iq;
 
+	IQ_LOCK(iq);
 	/*
 	 * Nothing to be done here WRT the interrupt, as it will not fire until
 	 * re-enabled through the t4_iq_gts_update() in t4_ring_intr_enable().
 	 */
-
-	iq = &rxq->iq;
-	RXQ_LOCK(rxq);
-	iq->polling = 1;
-	iq->state = IQS_BUSY;
-	RXQ_UNLOCK(rxq);
+	iq->tsi_flags |= IQ_POLLING;
+	IQ_UNLOCK(iq);
 
 	return (0);
 }
@@ -933,17 +891,17 @@ mblk_t *
 t4_poll_ring(void *arg, int n_bytes)
 {
 	struct sge_rxq *rxq = (struct sge_rxq *)arg;
-	mblk_t *mp = NULL;
 
 	ASSERT(n_bytes >= 0);
 	if (n_bytes == 0)
 		return (NULL);
 
-	RXQ_LOCK(rxq);
-	mp = t4_ring_rx(rxq, n_bytes);
-	RXQ_UNLOCK(rxq);
-
-	return (mp);
+	struct t4_poll_req req = {
+		.tpr_byte_budget = n_bytes,
+		.tpr_mp = NULL,
+	};
+	(void) t4_process_rx_iq(&rxq->iq, 0, &req);
+	return (req.tpr_mp);
 }
 
 /*
@@ -956,11 +914,11 @@ t4_rx_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
 
 	switch (stat) {
 	case MAC_STAT_RBYTES:
-		*val = rxq->rxbytes;
+		*val = rxq->stats.rxbytes;
 		break;
 
 	case MAC_STAT_IPACKETS:
-		*val = rxq->rxpkts;
+		*val = rxq->stats.rxpkts;
 		break;
 
 	default:
@@ -980,12 +938,12 @@ t4_tx_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
 	struct sge_txq *txq = (struct sge_txq *)rh;
 
 	switch (stat) {
-	case MAC_STAT_RBYTES:
-		*val = txq->txbytes;
+	case MAC_STAT_OBYTES:
+		*val = txq->stats.txbytes;
 		break;
 
-	case MAC_STAT_IPACKETS:
-		*val = txq->txpkts;
+	case MAC_STAT_OPACKETS:
+		*val = txq->stats.txpkts;
 		break;
 
 	default:
@@ -997,9 +955,8 @@ t4_tx_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
 }
 
 /*
- * Callback funtion for MAC layer to register all rings
- * for given ring_group, noted by group_index.
- * Since we have only one group, ring index becomes
+ * Callback function for MAC layer to register all rings for given ring_group,
+ * noted by group_index. Since we have only one group, ring index becomes
  * absolute index.
  */
 void
@@ -1007,22 +964,25 @@ t4_fill_ring(void *arg, mac_ring_type_t rtype, const int group_index,
     const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
 {
 	struct port_info *pi = arg;
-	mac_intr_t *mintr;
+
+	ASSERT3S(ring_index, >=, 0);
 
 	switch (rtype) {
 	case MAC_RING_TYPE_RX: {
-		struct sge_rxq *rxq;
+		struct sge_rxq *rxq =
+		    &pi->adapter->sge.rxq[pi->rxq_start + ring_index];
+		mac_intr_t *mintr = &infop->mri_intr;
+
+		ASSERT3S(ring_index, <, pi->rxq_count);
 
-		rxq = &pi->adapter->sge.rxq[pi->first_rxq + ring_index];
 		rxq->ring_handle = rh;
 
 		infop->mri_driver = (mac_ring_driver_t)rxq;
-		infop->mri_start = t4_ring_start;
+		infop->mri_start = t4_ring_rx_start;
 		infop->mri_stop = NULL;
 		infop->mri_poll = t4_poll_ring;
 		infop->mri_stat = t4_rx_stat;
 
-		mintr = &infop->mri_intr;
 		mintr->mi_handle = (mac_intr_handle_t)rxq;
 		mintr->mi_enable = t4_ring_intr_enable;
 		mintr->mi_disable = t4_ring_intr_disable;
@@ -1031,8 +991,12 @@ t4_fill_ring(void *arg, mac_ring_type_t rtype, const int group_index,
 	}
 	case MAC_RING_TYPE_TX: {
 		struct sge_txq *txq =
-		    &pi->adapter->sge.txq[pi->first_txq + ring_index];
+		    &pi->adapter->sge.txq[pi->txq_start + ring_index];
+
+		ASSERT3S(ring_index, <, pi->txq_count);
+
 		txq->ring_handle = rh;
+
 		infop->mri_driver = (mac_ring_driver_t)txq;
 		infop->mri_start = NULL;
 		infop->mri_stop = NULL;
@@ -1041,21 +1005,11 @@ t4_fill_ring(void *arg, mac_ring_type_t rtype, const int group_index,
 		break;
 	}
 	default:
-		ASSERT(0);
+		panic("unexpected ring type: %d", rtype);
 		break;
 	}
 }
 
-mblk_t *
-t4_mc_tx(void *arg, mblk_t *m)
-{
-	struct port_info *pi = arg;
-	struct adapter *sc = pi->adapter;
-	struct sge_txq *txq = &sc->sge.txq[pi->first_txq];
-
-	return (t4_eth_tx(txq, m));
-}
-
 static int
 t4_mc_transceiver_info(void *arg, uint_t id, mac_transceiver_info_t *infop)
 {
@@ -1140,10 +1094,7 @@ t4_port_led_set(void *arg, mac_led_mode_t mode, uint_t flags)
 		return (ENOTSUP);
 	}
 
-	ADAPTER_LOCK(sc);
 	rc = -t4_identify_port(sc, sc->mbox, pi->viid, val);
-	ADAPTER_UNLOCK(sc);
-
 	return (rc);
 }
 
@@ -1152,8 +1103,6 @@ t4_mc_getcapab(void *arg, mac_capab_t cap, void *data)
 {
 	struct port_info *pi = arg;
 	boolean_t status = B_TRUE;
-	mac_capab_transceiver_t *mct;
-	mac_capab_led_t *mcl;
 
 	switch (cap) {
 	case MAC_CAPAB_HCKSUM:
@@ -1161,8 +1110,9 @@ t4_mc_getcapab(void *arg, mac_capab_t cap, void *data)
 			uint32_t *d = data;
 			*d = HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM |
 			    HCKSUM_INET_FULL_V6;
-		} else
+		} else {
 			status = B_FALSE;
+		}
 		break;
 
 	case MAC_CAPAB_LSO:
@@ -1175,21 +1125,18 @@ t4_mc_getcapab(void *arg, mac_capab_t cap, void *data)
 			    LSO_TX_BASIC_TCP_IPV6;
 			d->lso_basic_tcp_ipv4.lso_max = 65535;
 			d->lso_basic_tcp_ipv6.lso_max = 65535;
-		} else
+		} else {
 			status = B_FALSE;
+		}
 		break;
 
 	case MAC_CAPAB_RINGS: {
 		mac_capab_rings_t *cap_rings = data;
 
-		if (!pi->adapter->props.multi_rings) {
-			status = B_FALSE;
-			break;
-		}
 		switch (cap_rings->mr_type) {
 		case MAC_RING_TYPE_RX:
 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
-			cap_rings->mr_rnum = pi->nrxq;
+			cap_rings->mr_rnum = pi->rxq_count;
 			cap_rings->mr_gnum = 1;
 			cap_rings->mr_rget = t4_fill_ring;
 			cap_rings->mr_gget = t4_fill_group;
@@ -1198,7 +1145,7 @@ t4_mc_getcapab(void *arg, mac_capab_t cap, void *data)
 			break;
 		case MAC_RING_TYPE_TX:
 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
-			cap_rings->mr_rnum = pi->ntxq;
+			cap_rings->mr_rnum = pi->txq_count;
 			cap_rings->mr_gnum = 0;
 			cap_rings->mr_rget = t4_fill_ring;
 			cap_rings->mr_gget = NULL;
@@ -1207,20 +1154,24 @@ t4_mc_getcapab(void *arg, mac_capab_t cap, void *data)
 		break;
 	}
 
-	case MAC_CAPAB_TRANSCEIVER:
-		mct = data;
+	case MAC_CAPAB_TRANSCEIVER: {
+		mac_capab_transceiver_t *mct = data;
 
 		mct->mct_flags = 0;
 		mct->mct_ntransceivers = 1;
 		mct->mct_info = t4_mc_transceiver_info;
 		mct->mct_read = t4_mc_transceiver_read;
 		break;
-	case MAC_CAPAB_LED:
-		mcl = data;
+	}
+
+	case MAC_CAPAB_LED: {
+		mac_capab_led_t *mcl = data;
+
 		mcl->mcl_flags = 0;
 		mcl->mcl_modes = MAC_LED_DEFAULT | MAC_LED_IDENT;
 		mcl->mcl_set = t4_port_led_set;
 		break;
+	}
 
 	default:
 		status = B_FALSE; /* cap not supported */
@@ -1229,28 +1180,22 @@ t4_mc_getcapab(void *arg, mac_capab_t cap, void *data)
 	return (status);
 }
 
-static void
-t4_mac_link_caps_to_flowctrl(fw_port_cap32_t caps, link_flowctrl_t *fc)
+static link_flowctrl_t
+t4_mac_link_caps_to_flowctrl(fw_port_cap32_t caps)
 {
-	u8 pause_tx = 0, pause_rx = 0;
-
-	if (caps & FW_PORT_CAP32_FC_TX)
-		pause_tx = 1;
-
-	if (caps & FW_PORT_CAP32_FC_RX)
-		pause_rx = 1;
-
-	if (pause_rx & pause_tx)
-		*fc = LINK_FLOWCTRL_BI;
-	else if (pause_tx)
-		*fc = LINK_FLOWCTRL_TX;
-	else if (pause_rx)
-		*fc = LINK_FLOWCTRL_RX;
-	else
-		*fc = LINK_FLOWCTRL_NONE;
+	switch (caps & (FW_PORT_CAP32_FC_TX | FW_PORT_CAP32_FC_RX)) {
+	case (FW_PORT_CAP32_FC_TX | FW_PORT_CAP32_FC_RX):
+		return (LINK_FLOWCTRL_BI);
+	case FW_PORT_CAP32_FC_TX:
+		return (LINK_FLOWCTRL_TX);
+	case FW_PORT_CAP32_FC_RX:
+		return (LINK_FLOWCTRL_RX);
+	default:
+		return (LINK_FLOWCTRL_NONE);
+	}
 }
 
-static int
+static void
 t4_mac_flowctrl_to_link_caps(struct port_info *pi, link_flowctrl_t fc,
     fw_port_cap32_t *new_caps)
 {
@@ -1273,7 +1218,7 @@ t4_mac_flowctrl_to_link_caps(struct port_info *pi, link_flowctrl_t fc,
 	if (pi->link_cfg.admin_caps & FW_PORT_CAP32_ANEG)
 		pause |= PAUSE_AUTONEG;
 
-	return (t4_link_set_pause(pi, pause, new_caps));
+	t4_link_set_pause(pi, pause, new_caps);
 }
 
 static link_fec_t
@@ -1297,20 +1242,13 @@ t4_mac_port_caps_to_fec_cap(fw_port_cap32_t caps)
 	return (link_fec);
 }
 
-static void
-t4_mac_admin_caps_to_fec_cap(fw_port_cap32_t caps, link_fec_t *fec)
-{
-	*fec = t4_mac_port_caps_to_fec_cap(caps);
-}
-
-static void
-t4_mac_link_caps_to_fec_cap(fw_port_cap32_t caps, link_fec_t *fec)
+static link_fec_t
+t4_mac_link_caps_to_fec_cap(fw_port_cap32_t caps)
 {
-	link_fec_t link_fec;
+	const link_fec_t link_fec =
+	    t4_mac_port_caps_to_fec_cap(caps & ~FW_PORT_CAP32_FEC_NO_FEC);
 
-	caps &= ~FW_PORT_CAP32_FEC_NO_FEC;
-	link_fec = t4_mac_port_caps_to_fec_cap(caps);
-	*fec = link_fec ? link_fec : LINK_FEC_NONE;
+	return (link_fec ? link_fec : LINK_FEC_NONE);
 }
 
 static int
@@ -1350,7 +1288,6 @@ out:
 	return (t4_link_set_fec(pi, fec, new_caps));
 }
 
-/* ARGSUSED */
 static int
 t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size,
     const void *val)
@@ -1380,8 +1317,8 @@ t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size,
 		break;
 
 	case MAC_PROP_FLOWCTRL:
-		rc = t4_mac_flowctrl_to_link_caps(pi,
-		    *(const link_flowctrl_t *)val, &new_caps);
+		t4_mac_flowctrl_to_link_caps(pi, *(const link_flowctrl_t *)val,
+		    &new_caps);
 		relink = 1;
 		break;
 
@@ -1441,8 +1378,9 @@ t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size,
 		break;
 	}
 
-	if (rc != 0)
+	if (rc != 0) {
 		return (rc);
+	}
 
 	if ((pi->flags & TPF_OPEN) != 0) {
 		if (relink != 0) {
@@ -1453,6 +1391,7 @@ t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size,
 			if (rc != 0) {
 				cxgb_printf(pi->dip, CE_WARN,
 				    "%s link config failed: %d", __func__, rc);
+				PORT_UNLOCK(pi);
 				return (rc);
 			}
 		}
@@ -1465,6 +1404,7 @@ t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size,
 			if (rc != 0) {
 				cxgb_printf(pi->dip, CE_WARN,
 				    "set_rxmode failed: %d", rc);
+				PORT_UNLOCK(pi);
 				return (rc);
 			}
 		}
@@ -1519,15 +1459,18 @@ t4_mc_getprop(void *arg, const char *name, mac_prop_id_t id, uint_t size,
 		break;
 
 	case MAC_PROP_FLOWCTRL:
-		t4_mac_link_caps_to_flowctrl(lc->link_caps, val);
+		*(link_flowctrl_t *)val =
+		    t4_mac_link_caps_to_flowctrl(lc->link_caps);
 		break;
 
 	case MAC_PROP_ADV_FEC_CAP:
-		t4_mac_link_caps_to_fec_cap(lc->link_caps, val);
+		*(link_fec_t *)val =
+		    t4_mac_link_caps_to_fec_cap(lc->link_caps);
 		break;
 
 	case MAC_PROP_EN_FEC_CAP:
-		t4_mac_admin_caps_to_fec_cap(lc->admin_caps, val);
+		*(link_fec_t *)val =
+		    t4_mac_port_caps_to_fec_cap(lc->admin_caps);
 		break;
 
 	case MAC_PROP_ADV_100GFDX_CAP:
@@ -1703,7 +1646,7 @@ t4_mc_propinfo(void *arg, const char *name, mac_prop_id_t id,
 }
 
 static int
-t4_init_synchronized(struct port_info *pi)
+t4_port_enable(struct port_info *pi)
 {
 	struct adapter *sc = pi->adapter;
 	int rc = 0;
@@ -1724,9 +1667,8 @@ t4_init_synchronized(struct port_info *pi)
 			PORT_UNLOCK(pi);
 			return (rc); /* error message displayed already */
 		}
-	} else {
-		t4_port_queues_enable(pi);
 	}
+	t4_port_queues_enable(pi);
 
 	rc = -t4_set_rxmode(sc, sc->mbox, pi->viid, pi->mtu, 0, 0, 1, 0, false);
 	if (rc != 0) {
@@ -1755,41 +1697,41 @@ t4_init_synchronized(struct port_info *pi)
 		cxgb_printf(pi->dip, CE_WARN, "enable_vi failed: %d", rc);
 		goto done;
 	}
+	pi->flags |= TPF_VI_ENABLED;
 
 	/* all ok */
 	pi->flags |= TPF_OPEN;
 done:
 	PORT_UNLOCK(pi);
 	if (rc != 0)
-		(void) t4_uninit_synchronized(pi);
+		(void) t4_port_disable(pi);
 
 	return (rc);
 }
 
-/*
- * Idempotent.
- */
 static int
-t4_uninit_synchronized(struct port_info *pi)
+t4_port_disable(struct port_info *pi)
 {
 	struct adapter *sc = pi->adapter;
-	int rc;
 
 	ADAPTER_LOCK_ASSERT_OWNED(pi->adapter);
 
 	PORT_LOCK(pi);
 	/*
 	 * Disable the VI so that all its data in either direction is discarded
-	 * by the MPS.  Leave everything else (the queues, interrupts, and 1Hz
-	 * tick) intact as the TP can deliver negative advice or data that it's
-	 * holding in its RAM (for an offloaded connection) even after the VI is
-	 * disabled.
+	 * by the MPS.  Leave everything else (queues, interrupts, etc) so any
+	 * straggling work in flight has a safe place to land.
 	 */
-	rc = -t4_enable_vi(sc, sc->mbox, pi->viid, false, false);
-	if (rc != 0) {
-		cxgb_printf(pi->dip, CE_WARN, "disable_vi failed: %d", rc);
-		PORT_UNLOCK(pi);
-		return (rc);
+	if (pi->flags & TPF_VI_ENABLED) {
+		const int rc =
+		    -t4_enable_vi(sc, sc->mbox, pi->viid, false, false);
+		if (rc != 0) {
+			cxgb_printf(pi->dip, CE_WARN,
+			    "disable_vi failed: %d", rc);
+			PORT_UNLOCK(pi);
+			return (rc);
+		}
+		pi->flags &= ~TPF_VI_ENABLED;
 	}
 
 	t4_port_queues_disable(pi);
@@ -1833,8 +1775,8 @@ t4_propinfo_priv(struct port_info *pi, const char *name,
     mac_prop_info_handle_t ph)
 {
 	struct adapter *sc = pi->adapter;
-	struct driver_properties *dp = &sc->props;
-	struct link_config *lc = &pi->link_cfg;
+	const struct driver_properties *dp = &sc->props;
+	const struct link_config *lc = &pi->link_cfg;
 
 	const t4_priv_prop_t *prop = t4_priv_prop_match(name);
 	if (prop == NULL || !t4_priv_prop_supported(pi, prop)) {
@@ -1844,18 +1786,16 @@ t4_propinfo_priv(struct port_info *pi, const char *name,
 	int v = 0;
 	switch (prop->tpp_id) {
 	case T4PROP_FW_TMR:
-		v = t4_convert_holdoff_timer(sc, sc->props.fwq_tmr_idx);
+		v = t4_convert_holdoff_timer(sc, dp->fwq_tmr_idx);
 		break;
 	case T4PROP_FW_PKTC:
-		v = t4_convert_holdoff_pktcnt(sc, sc->props.fwq_pktc_idx);
+		v = t4_convert_holdoff_pktcnt(sc, dp->fwq_pktc_idx);
 		break;
 	case T4PROP_RX_TMR:
-		v = t4_convert_holdoff_timer(sc, t4_port_is_10xg(pi) ?
-		    dp->tmr_idx_10g : dp->tmr_idx_1g);
+		v = t4_convert_holdoff_timer(sc, dp->ethq_tmr_idx);
 		break;
 	case T4PROP_RX_PKTC:
-		v = t4_convert_holdoff_pktcnt(sc, t4_port_is_10xg(pi) ?
-		    dp->pktc_idx_10g : dp->pktc_idx_1g);
+		v = t4_convert_holdoff_pktcnt(sc, dp->ethq_pktc_idx);
 		break;
 	case T4PROP_TX_TMR:
 		v = t4_convert_dbq_timer(sc, dp->dbq_timer_idx);
@@ -1892,7 +1832,6 @@ t4_getprop_priv(struct port_info *pi, const char *name, uint_t size, void *val)
 		return (ENOTSUP);
 	}
 
-	PORT_LOCK(pi);
 	int v = 0;
 	switch (prop->tpp_id) {
 	case T4PROP_FW_TMR:
@@ -1923,10 +1862,8 @@ t4_getprop_priv(struct port_info *pi, const char *name, uint_t size, void *val)
 		v = (lc->link_caps & FW_PORT_CAP32_FC_RX) ? 1 : 0;
 		break;
 	default:
-		PORT_UNLOCK(pi);
 		return (ENOTSUP);
 	}
-	PORT_UNLOCK(pi);
 
 	(void) snprintf(val, size, "%d", v);
 	return (0);
@@ -2000,7 +1937,6 @@ t4_choose_dbq_timer(struct adapter *sc, uint_t target_us)
 	return (chosen_idx);
 }
 
-
 static int
 t4_setprop_priv(struct port_info *pi, const char *name, const void *val)
 {
@@ -2028,7 +1964,7 @@ t4_setprop_priv(struct port_info *pi, const char *name, const void *val)
 
 	switch (prop->tpp_id) {
 	case T4PROP_FW_TMR: {
-		struct sge_iq *fwq = &sc->sge.fwq;
+		t4_sge_iq_t *fwq = &sc->sge.fwq;
 		const uint8_t idx = t4_choose_holdoff_timer(sc, MAX(0, v));
 
 		IQ_LOCK(fwq);
@@ -2039,7 +1975,7 @@ t4_setprop_priv(struct port_info *pi, const char *name, const void *val)
 		break;
 	}
 	case T4PROP_FW_PKTC: {
-		struct sge_iq *fwq = &sc->sge.fwq;
+		t4_sge_iq_t *fwq = &sc->sge.fwq;
 		const int8_t idx = t4_choose_holdoff_pktcnt(sc, (int)v);
 
 		IQ_LOCK(fwq);
@@ -2079,7 +2015,7 @@ t4_setprop_priv(struct port_info *pi, const char *name, const void *val)
 			int i;
 			struct sge_txq *txq;
 			for_each_txq(pi, i, txq) {
-				struct sge_eq *eq = &txq->eq;
+				t4_sge_eq_t *eq = &txq->eq;
 
 				EQ_LOCK(eq);
 				t4_eq_update_dbq_timer(eq, pi);
@@ -2133,7 +2069,7 @@ t4_setprop_priv(struct port_info *pi, const char *name, const void *val)
 		PORT_LOCK(pi);
 		if ((pi->flags & TPF_OPEN) != 0) {
 			for_each_rxq(pi, i, rxq) {
-				struct sge_iq *iq = &rxq->iq;
+				t4_sge_iq_t *iq = &rxq->iq;
 
 				IQ_LOCK(iq);
 				t4_iq_update_intr_cfg(iq, pi->tmr_idx,
@@ -2170,15 +2106,6 @@ t4_setprop_priv(struct port_info *pi, const char *name, const void *val)
 	return (0);
 }
 
-void
-t4_mc_cb_init(struct port_info *pi)
-{
-	if (pi->adapter->props.multi_rings)
-		pi->mc = &t4_m_ring_callbacks;
-	else
-		pi->mc = &t4_m_callbacks;
-}
-
 void
 t4_os_link_changed(struct adapter *sc, int idx, int link_stat)
 {
@@ -2187,18 +2114,8 @@ t4_os_link_changed(struct adapter *sc, int idx, int link_stat)
 	mac_link_update(pi->mh, link_stat ? LINK_STATE_UP : LINK_STATE_DOWN);
 }
 
-/* ARGSUSED */
-void
-t4_mac_rx(struct port_info *pi, struct sge_rxq *rxq, mblk_t *m)
-{
-	mac_rx(pi->mh, NULL, m);
-}
-
 void
 t4_mac_tx_update(struct port_info *pi, struct sge_txq *txq)
 {
-	if (pi->adapter->props.multi_rings)
-		mac_tx_ring_update(pi->mh, txq->ring_handle);
-	else
-		mac_tx_update(pi->mh);
+	mac_tx_ring_update(pi->mh, txq->ring_handle);
 }
diff --git a/usr/src/uts/common/io/cxgbe/t4nex/t4_nexus.c b/usr/src/uts/common/io/cxgbe/t4nex/t4_nexus.c
index ad2fda1caa..83b4c8f94f 100644
--- a/usr/src/uts/common/io/cxgbe/t4nex/t4_nexus.c
+++ b/usr/src/uts/common/io/cxgbe/t4nex/t4_nexus.c
@@ -39,59 +39,526 @@
 #include <sys/cred.h>
 #include <sys/stat.h>
 #include <sys/mkdev.h>
-#include <sys/queue.h>
 #include <sys/containerof.h>
 #include <sys/sensors.h>
 #include <sys/firmload.h>
 #include <sys/mac_provider.h>
 #include <sys/mac_ether.h>
 #include <sys/vlan.h>
+#include <sys/cpuvar.h>
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_extra_regs.h"
 
+/*
+ * Nexus driver for Chelsio Terminator Network Adapters (T4/T5/T6)
+ *
+ * This driver supports the Chelsio Terminator series of network adapters
+ * starting with the T4 generation and onward. These adapters present a "unified
+ * wire" for managing traditional L2 Ethernet traffic alongside a variety of
+ * stateful offloads including the usual TCP/UDP protocols along with storage
+ * technology like iSCSI, FCoE, NVMe over fabrics, and others. All of these
+ * features coexist on a single ASIC controlled by a single firmware image, thus
+ * the "unified wire". While these adapters provide many offload technologies,
+ * this driver remains focused on providing L2 Ethernet services as presented by
+ * the GLDv3/mac framework. In short, this consists of presenting the device as
+ * groups of rings with filtering and steering capabilities along with stateless
+ * offloads including checksums and LSO. This nexus driver does not preclude the
+ * support of the stateful offload features, but supporting them requires
+ * additional work both inside this driver along with general operating system
+ * enhancements.
+ *
+ * Naming & Terminology
+ * --------------------
+ *
+ * CPL:
+ *
+ *     Chelsio Protocol Language messages. We use these to wrap network data for
+ *     Tx and Rx, this wrapping of packets in CPL is referred to by Chelsio as
+ *     "tunneled" data. Not to be confused with the more general network
+ *     tunneling also known as encapsulation (e.g. IP tunneling, VXLAN, etc).
+ *
+ * Flit:
+ *
+ *     A 64-bit (8 byte) quantity. The Chelsio documentation and code divides
+ *     communication structures into units of flits. For example, a firmware
+ *     command may consist of up to 8 flits (8-bytes x 8 = 64 bytes) where the
+ *     command header is always made up of the first two flits and the remaining
+ *     6 may be used for variable payload data.
+ *
+ * Module/Block:
+ *
+ *     The T4 is comprised of various modules (also referred to as a "block" or
+ *     "engine" in some contexts) which work together to provide the services
+ *     offered by the chip. For example, the Scatter-Gather Engine (SGE) module
+ *     provides the DMA communications used to send and receive traffic.
+ *
+ * T4:
+ *
+ *     The short name to represent any Chelsio Terminator ASIC from the T4 and
+ *     onward. This includes the T4, T5, and T6 line of parts.
+ *
+ * Tunneled Traffic:
+ *
+ *     The Chelsio documentation often refers to sending or receiving "tunneled"
+ *     traffic, but it's not referring to the traditional networking terminology
+ *     of encapsulated data. Rather, it is referring to traffic that is
+ *     sent/received in a non-offload capacity. It's called "tunneled" because
+ *     the data is wrapped/"tunneled" in Work Requests and CPL messages. This
+ *     driver deals purely in tunneled traffic as it makes no use of stateful
+ *     offloads.
+ *
+ * ULPTX
+ *
+ *    The Upper Layer Processing Transmit module handles DMA access related to
+ *    egress traffic.
+ *
+ * Work Request (WR)
+ *
+ *    Work Requests are commands and data descriptors used to send Tx packets.
+ *
+ * Communication
+ * -------------
+ *
+ * Before any requests can be made or any data can be transmitted we must first
+ * establish communication with the device. The Chelsio Terminator ASIC, or T4
+ * for short, presents four primary methods of communication between the driver
+ * and itself.
+ *
+ * 1. Registers: read/write simple values or bitwise data over PIO
+ *
+ * 2. Mailboxes: synchronized request/reply structured data over PIO
+ *
+ * 3. Queues: DMA memory of structured data for control and data plane
+ *
+ * 4. Interrupts: MSI/MSI-X interrupts for indicating queue status updates or
+ *    asynchronous events from the firmware
+ *
+ * The first access we have is to the registers via our BAR0 mapping. These
+ * registers provide control and configuration over many aspects of the
+ * different modules that make up the T4.
+ *
+ * Using the registers we then establish a mailbox which provides structured
+ * communication in the form of request/reply commands to the firmware. Both of
+ * these methods use Programmed I/O which is fine for administrative control,
+ * but inadequate for the latency and throughput demands of the datapath and its
+ * associated control plane.
+ *
+ * For the datapath we use the registers and mailbox to establish queues of DMA
+ * memory for transmitting and receiving data. Queues deal in Work Requests
+ * (WR), Chelsio Protocol Language messages (CPL), and Freelist buffer pointers
+ * (FL). These data structures may subsequently point to other DMA memory
+ * (buffers) that hold the data to be transmitted or received along with its
+ * associated software descriptors.
+ *
+ * Finally, the T4 provides various types of interrupt control to asynchronously
+ * signal the driver of conditions such as errors, firmware events, and datapath
+ * (queue) synchronization via status updates (cidx/pidx).
+ *
+ * While nothing precludes the driver from consuming directly these forms of
+ * communication, most of the interface with the T4 is currently provided by the
+ * "common code" interfaces. This common code is, nominally, code shared between
+ * the various operating systems for interacting with the T4.
+ *
+ * Queues (Rings)
+ * --------------
+ *
+ * Queues are circular buffers of DMA memory used to share structured data often
+ * referred to as a "descriptor". These circular buffers are also commonly
+ * called rings, where each entry in the ring is a descriptor used for locating
+ * and describing data that is meant to be transmitted across or received from
+ * the network device.
+ *
+ * The T4 queues are used in this manner, to share descriptors between the
+ * driver and device, but their level of synchronization is not technically a
+ * descriptor. Rather, a queue is made up of a number of "host credits". The
+ * size of a host credit (sometimes also called an "entry" or "descriptor" in
+ * the code) depends on the type of queue and how it is configured. This
+ * difference in terminology between "host credit" vs. "descriptor" is mostly
+ * pertinent to Egress Queues, which always have 64-byte (8-flit) host credits.
+ * Those host credits are used to pass variable-sized Work Requests (WR), the
+ * structure which actually acts as the "descriptor", which may be smaller or
+ * larger than a single credit. Ingress Queues (IQ) also have variable-sized
+ * entries, but the size is determined at queue creation time and is uniform for
+ * each entry; therefore IQ entries can be called credits, entries, or
+ * descriptors without any real confusion. The official Chelsio documentation
+ * also uses mixed terminology, so it's important to keep that in mind. However,
+ * regardless of how many credits a descriptor requires, communication always
+ * occurs in units of whole credits. A good way to frame this is that queues
+ * provide logical rings of descriptors (WRs, CPLs, FLs) on top of physical
+ * units of host credits.
+ *
+ * There are different types of queues for different purposes, but they are all
+ * variations of either an Ingress Queue (IQ) or Egress Queue (EQ). As the names
+ * suggest, a queue is a unidirectional communication channel: one is the
+ * producer and the other side is a consumer. The Ingress Queue provides
+ * communication from T4 (producer) to driver (consumer), and the Egress Queue
+ * provides communication from the driver (producer) to the T4 (consumer).
+ *
+ * The producer/consumer synchronize communication in units of host credits. The
+ * producer tracks its next host credit to write under the producer index
+ * (pidx), and the consumer tracks its next host credit to read under the
+ * consumer index (cidx). These values are kept in sync through means such as
+ * doorbells (DB), Go-To-Sleep updates (GTS), and interrupts carrying CPL
+ * messages (e.g. CPL_SGE_EGR_UPDATE).
+ *
+ * If you read the Terminator Programmer's Guide you will find discussion about
+ * the queue's "context". This is described as an area of memory that dictates
+ * various features and behavior of the queue. While this queue context may at
+ * one point have been programmed directly, it no longer is. Rather, the various
+ * aspects of queue behavior are controlled by parameters passed during the
+ * queue creation firmware commands, along with other mechanisms such as
+ * registers.
+ *
+ * Each type of queue also has a "status page" which may optionally be updated
+ * with cidx or pidx updates. For EQs this page consumes 1 or 2 credits at the
+ * end of the queue. For IQs it consumes 1 entry at the end of the queue.
+ *
+ * We use EQs to create Tx rings and IQs (plus FLs) to create Rx rings. We
+ * create the same number of Tx and Rx queues. So if we have 32 Tx queues, we
+ * will also have 32 Rx queues. The number of queues created is based on the
+ * port speed. The association from speed to queue count can be found in the
+ * t4_queue_counts array.
+ *
+ * Egress Queues (EQ)
+ * ------------------
+ *
+ * Egress Queues (EQ) provide communication from driver to T4. The driver writes
+ * (produces) descriptors to the queue using one or more host credits. It
+ * notifies the T4 of these new outstanding host credits by updating its pidx
+ * via a doorbell. As new outstanding credits arrive via the doorbell the T4
+ * reads (consumes) them to determine what types of descriptors have been sent
+ * along with their content. As the T4 consumes host credits it notifies the
+ * driver with a programmable combination of status page updates, CPL messages,
+ * and interrupts.
+ *
+ * All EQs use a host credit size of 8 flits (64 bytes). The driver uses these
+ * host credits to send Work Requests (WR) to the T4.
+ *
+ * A WR is variable in size and may be smaller or larger than a single host
+ * credit, but communication is always in whole units of credits. It is legal
+ * for a WR to span across the end of the queue and wrap around, but the
+ * contents of the WR may dictate that the wrap-around happens only at certain
+ * offsets within the descriptor. A WR may be 16 to 512 bytes long, but must
+ * always begin at the start of a host credit, thus all WRs must start at a
+ * 64-byte aligned address.
+ *
+ * At this time the only WRs we use are FW_ETH_TX_PKT_WR and FW_ETH_TX_PKTS_WR.
+ *
+ * Ingress Queues (IQ)
+ * -------------------
+ *
+ * Ingress Queues (IQ) provide communication from T4 to driver. The T4 produces
+ * queue entries for the driver to consume. Unlike EQs, data passed in IQs is
+ * always done as fixed-size entries. That is, each entry in the IQ takes up
+ * exactly one credit, and that credit size is determined at creation time. So
+ * in that sense you could think of an IQ entry as a descriptor. However, these
+ * entries contain different types of data of variable lengths (within the
+ * bounds of the entry/credit size). There are four possible entry sizes, and
+ * the entry size dictates the possible messages an IQ can hold. The possible
+ * sizes are 2 flits (16 bytes), 4 flits (32 bytes), 8 flits (64 bytes), and 16
+ * flits (128 bytes). Depending on the size, each entry may contain a Freelist
+ * buffer completion, CPL message, or a forwarded interrupt destined for another
+ * IQ. Which size to use depends on the use case of the IQ.
+ *
+ * Currently we make use of the 64-byte entry size exclusively.
+ *
+ * Freelists (FL)
+ * --------------
+ *
+ * A freelist (FL) is a type of EQ used for providing (producing) buffers for
+ * the purpose of holding received network data for an associated IQ. The driver
+ * produces pointers to DMA data buffers and the associated IQ consumes them as
+ * data is received by the device. A freelist is always associated with an IQ; a
+ * freelist is never used on its own. An IQ, however, may have no FL associated
+ * with it; such is the case for event IQs and interrupt forwarding IQs. An Rx
+ * IQ must have one or two FLs associated with it used to store the incoming
+ * packet headers and payload. The use of two FLs is for when "header splitting"
+ * is enabled: where the headers are placed in one buffer and the payload is
+ * placed in the other. Only the first 1024 IQs may have FLs associated with
+ * them.
+ *
+ * A freelist is always made up of buffer "pointers". Each buffer pointer is 1
+ * flit (8 bytes) in size and points to DMA memory used to hold packet data. The
+ * lowest four bits of the pointer are used as an index into the freelist buffer
+ * size array, allowing up to 16 different buffer sizes. This implies that each
+ * FL buffer pointer must be at least 16-byte aligned. Each pointer may use a
+ * different size. Since EQ communication must happen in units of host credits,
+ * and an EQ host credit is 8 flits, it means that the driver must always
+ * produce 8 FL buffer pointers per credit. If the driver cannot produce 8
+ * buffer pointers, the rest of the credit may be filled with zero-sized
+ * pointers ("null" or "zero" buffer) which is to say their size index points to
+ * a zero-value entry in the array.
+ *
+ * The diagram below depicts how the FL buffer pointer indexes into the
+ * SGE_FL_BUFFER_SZ[N] array.
+ *
+ * +-------------------+-------------------------+
+ * | Buffer Ptr [63:4] | SGE_FL_BUFFER_SIZE[3:0] |
+ * +-------------------+-------------------------+
+ *                                  |
+ *            +---------------------+
+ *            v
+ * +--------------------+--------------------+
+ * | SGE_FL_BUFFER_SZ0  |         0          |  "zero" buffer
+ * +--------------------+--------------------+
+ * | SGE_FL_BUFFER_SZ1  |        4096        |  4K buffer
+ * +--------------------+--------------------+
+ *                      .
+ *                      .
+ *                      .
+ * +--------------------+--------------------+
+ * | SGE_FL_BUFFER_SZ15 |       16384        |  16K buffer
+ * +--------------------+--------------------+
+ *
+ * FL buffers may have "packing" enabled where a single buffer may be used for
+ * multiple packets. This requires that the driver keep track of the current
+ * offset within the current FL buffer. When a new buffer is required by the
+ * device, because the next packet will not fit in the remaining space of the
+ * current buffer, it will consume a new buffer and set a bit in the IQ
+ * completion entry to notify the driver. At this point the driver updates its
+ * cidx and restarts the offset at zero.
+ *
+ * If packing is not enabled each new packet starts at a new buffer.
+ *
+ * This driver currently sets the FL buffer size to 8192 (rx_buf_size) and
+ * enables packing.
+ *
+ * Doorbells, GTS messages, and Interrupts
+ * ---------------------------------------
+ *
+ * The driver and T4 need some way to communicate updates to the pidx/cidx
+ * values of their queues. To achieve this goal, the driver uses a combination
+ * of doorbells, GTS messages, status pages, and interrupts.
+ *
+ * Doorbells
+ * ---------
+ *
+ * The driver informs the T4 of new EQ credits by way of a "doorbell" (DB). A
+ * doorbell is a register write directed towards a single queue. The doorbell
+ * carries a priority and an incremental update to the pidx value. There are two
+ * types of doorbells:
+ *
+ * 1. Kernel Space doorbells (KDB) which use BAR0.
+ * 2. User Space doorbells (UDB) which use BAR2.
+ *
+ * The "user space" doorbells, while useful for kernel-bypass networking, are
+ * also used for regular in-kernel networking. They divide the queue doorbell
+ * space into multiple 128 byte segments versus KDB's single address for all
+ * queues. They also provide the ability to perform Write-Combining Work
+ * Requests (DOORBELL_WCWR) and Write-Combining Doorbells (DOORBELL_UDBWC). The
+ * WCWR allows you to send a single credit as one write and avoid the need for
+ * the T4 to DMA the credit's contents (a WR or FL buffer pointers) from host
+ * memory. We currently make use of WCWR for the Tx datapath, but not for
+ * writing freelist descriptors.
+ *
+ * There is some more discussion of doorbells at the t4_doorbells_t definition
+ * in adapter.h.
+ *
+ * EQ Status Updates
+ * -----------------
+ *
+ * This section covers how EQ status updates work. While an FL is technically an
+ * EQ it makes no use of these mechanisms because the use of FL buffers (cidx)
+ * is tracked implicitly as CPL Rx messages arrive on the associated IQ.
+ *
+ * The driver can track the EQ cidx either by reading the EQ status page or by
+ * asking for a notification via an IQ. This is delivered by way of a
+ * CPL_SGE_EGR_UPDATE message. Furthermore, if the IQ this message is destined
+ * for has interrupts enabled, an interrupt is generated upon delivery of the
+ * message. The EQ status page update and the delivery of this message is
+ * controlled by several factors.
+ *
+ * 1. The EQ context field 'CIDXFlushThresh' (FW_EQ_ETH_CMD.cidxfthresh)
+ *    indicates how many consumed credits must be outstanding before the T4
+ *    generates a cidx update (both status page update and CPL message).
+ *
+ * 2. The EQ context field 'FCThreshOverride' (FW_EQ_ETH_CMD.cidxfthresho) tells
+ *    the T4 to generate a cidx update anytime cidx==pidx; i.e., when the T4 has
+ *    consumed all outstanding credits. This happens regardless if the cidx
+ *    flush threshold has been reached or not (thus the "override"). This is
+ *    useful for dealing with cases of intermittent transmission where the
+ *    threshold may not be reached in a timely manner.
+ *
+ * 3. The DBQ Timer (see TAF_DBQ_TIMER) provides for sending a cidx notification
+ *    anytime the EQ has sat idle (no pidx updates) for a period of time. This
+ *    is preferred to method (2) as it allows batching cidx updates while also
+ *    recycling consumed credits in a timely manner. This is available starting
+ *    with the T6 chip.
+ *
+ * 4. The FW_EQ_FLUSH_WR (its own WR on the EQ) allows the driver to request
+ *    either a status page update, EGR update, or both.
+ *
+ * 5. The FW_ETH_TX_PKT_WR and FW_ETH_TX_PKTS_WR, used to send packets, allow
+ *    the driver to request either a status page update, EGR update, or both as
+ *    part of sending the packet.
+ *
+ * This driver utilizes both the status page and CPL updates as well as all the
+ * methods listed above to generate these updates.
+ *
+ * GTS Messages
+ * ------------
+ *
+ * The driver sends a GTS (Go To Sleep) message to the T4 to update the SGE
+ * about a specific IQ. The message conveys four pieces of information.
+ *
+ * 1. The Ingress Queue the update is for.
+ *
+ * 2. The current cidx of the driver.
+ *
+ * 3. The new timer value for pidx update scheduling (see IQ context
+ *    'Update_Scheduling' field).
+ *
+ * 4. Either a) arming the "Solicited Event" Interrupt or b) setting the new
+ *    value for the IQ context 'Update_Scheduling' field. Which one depends on
+ *    the IQ context 'GTS_Mode' value.
+ *
+ * We currently always set 'GTS_Mode=1' which indicates that the GTS 'SEIntArm'
+ * value (number 4 above) is used to dictate the new value for the
+ * 'Update_Scheduling' field.
+ *
+ * As the driver processes outstanding IQ credits it uses GTS messages to notify
+ * the T4 of how many credits it has consumed and optionally re-arm the
+ * timer and packet counter notifications.
+ *
+ * The GTS messages, like the EQ Doorbells, have both kernel and user space
+ * registers. We currently only make use of the kernel space register.
+ *
+ * Ingress Queue Generation Bit
+ * ----------------------------
+ *
+ * Ingress Queues have an alternative method for pidx updates beyond the status
+ * page update or an explicit CPL message like is done for Ethernet EQs. They
+ * also provide a generation bit as part of each queue entry (credit) which can
+ * be used by the driver, after it has received an interrupt indicating new data
+ * is available, to determine which entries are newly produced by the device.
+ * This method allows you to eschew IQ status page updates altogether, and that
+ * is how we use IQs both for our firmware queue as well as our Rx data queues.
+ *
+ * Freelist Updates
+ * ----------------
+ *
+ * While an FL is technically an EQ we do not make use of explicit EQ status
+ * updates to track the FL cidx. Rather, the current FL buffer is tracked
+ * implicitly by way of the Rx IQ CPL messages generated as part of incoming
+ * traffic. As new packets come in the SGE writes the data in the current FL
+ * buffer and writes a new CPL message onto the Rx IQ. These CPL messages allow
+ * the driver to track which FL buffer is currently in use by the device and
+ * when to move onto the next FL buffer.
+ *
+ * Interrupts
+ * ----------
+ *
+ * The T4 provides interrupt capability for support of asynchronous
+ * notifications. The primary uses of interrupts consist of the following.
+ *
+ * 1. Notification of new IQ entries (credits) available for consumption by the
+ *    driver. That is, the T4 notifies the host of its latest IQ pidx value
+ *    indicating that there are new credits for host consumption.
+ *
+ * 2. Notification of new EQ credits available for production by the driver.
+ *    That is, the T4 notifies the host of its latest EQ cidx value indicating
+ *    that there are new credits available for host production.
+ *
+ * 3. Notification of firmware events (also referred to as the "firmware queue"
+ *    or "asynchronous event queue").
+ *
+ * This driver employs three different strategies for assigning interrupts
+ * depending on the type and number of interrupts available. These strategies
+ * are listed in order of preference. The solution is chosen by
+ * t4_cfg_intrs_queues() and the setup is done by t4_setup_intrs().
+ *
+ * TIP_PER_PORT
+ *
+ *     The first strategy is used when we have enough MSI/MSI-X interrupts to
+ *     dedicate one to error conditions, one for asynchronous firmware events,
+ *     and at least one for Tx/Rx events on each network port on the adapter. A
+ *     port may have more than one interrupt, in which case its Tx/Rx queue
+ *     events are distributed across those interrupts as evenly as possible. For
+ *     example, given a two-port adapter with eight interrupts, one interrupt
+ *     would be consumed for error conditions, one for firmware events, and the
+ *     remaining six would be divided as three interrupts per port. If each port
+ *     has 32 Rx queues, then two interrupts would be responsible for 11 queues,
+ *     and the third interrupt would be responsible for 10.
+ *
+ *     The error interrupt vector points to the t4_intr_err() function. Errors
+ *     are delivered via registers and are handled by t4_slow_intr_handler().
+ *
+ *     The asynchronous firmware event interrupt points to the t4_intr_fwq()
+ *     function and the events arrive on the firmware queue (sc->sge.fwq).
+ *
+ *     The per port interrupts point to t4_intr_port_queue() and each port's
+ *     events land on one of the per port event queues (port->intr_iqs).
+ *
+ * TIP_ERR_QUEUES
+ *
+ *     The second strategy is used when we have only two interrupts. In this
+ *     case one of the interrupts is dedicated to errors and the other one is
+ *     shared between the firmware events and the port events (Rx/Tx
+ *     notifications).
+ *
+ *     In this case the firmware and port events all land on the firmware queue
+ *     which is processed by t4_intr_fwq().
+ *
+ * TIP_SINGLE
+ *
+ *     The last strategy is for when we have a single interrupt and everything
+ *     needs to share it. In this case the interrupt lands on t4_intr_all() and
+ *     all firmware and port events go to the firmware queue.
+ *
+ * The per-port event queues (port->intr_iqs) do not receive any network data
+ * themselves. Rather, they are used for two purposes:
+ *
+ * 1. To handle CPL_SGE_EGR_UPDATE messages; used to notify the driver about the
+ *    device's current cidx in a particular EQ. This is how Tx queues know when
+ *    they can reclaim credits used for sending packets.
+ *
+ * 2. To handle "forwarded interrupt" notifications; used to notify the driver
+ *    that a particular receive IQ has outstanding credits to read. This is how
+ *    Rx queues know when there are new packets available to read.
+ */
+
 static void *t4_soft_state;
 
 static kmutex_t t4_adapter_list_lock;
 static list_t t4_adapter_list;
 
-struct intrs_and_queues {
-	int intr_type;		/* DDI_INTR_TYPE_* */
-	int nirq;		/* Number of vectors */
-	int intr_fwd;		/* Interrupts forwarded */
-	int ntxq10g;		/* # of NIC txq's for each 10G port */
-	int nrxq10g;		/* # of NIC rxq's for each 10G port */
-	int ntxq1g;		/* # of NIC txq's for each 1G port */
-	int nrxq1g;		/* # of NIC rxq's for each 1G port */
-};
-
-static unsigned int getpf(struct adapter *sc);
-static int prep_firmware(struct adapter *sc);
-static int upload_config_file(struct adapter *sc, uint32_t *mt, uint32_t *ma);
-static int partition_resources(struct adapter *sc);
-static int adap__pre_init_tweaks(struct adapter *sc);
-static int get_params__pre_init(struct adapter *sc);
-static int get_params__post_init(struct adapter *sc);
-static int set_params__post_init(struct adapter *);
-static void t4_setup_adapter_memwin(struct adapter *sc);
-static int validate_mt_off_len(struct adapter *, int, uint32_t, int,
-    uint32_t *);
+typedef enum t4_port_speed {
+	TPS_1G,
+	TPS_10G,
+	TPS_25G,
+	TPS_40G,
+	TPS_50G,
+	TPS_100G,
+	TPS_200G,
+	TPS_400G,
+} t4_port_speed_t;
+
+static uint_t t4_getpf(struct adapter *);
+static int t4_prep_firmware(struct adapter *);
+static int t4_upload_config_file(struct adapter *, uint32_t *, uint32_t *);
+static int t4_partition_resources(struct adapter *);
+static int t4_init_adap_tweaks(struct adapter *);
+static int t4_init_get_params_pre(struct adapter *);
+static int t4_init_get_params_post(struct adapter *);
+static int t4_init_set_params(struct adapter *);
+static void t4_setup_adapter_memwin(struct adapter *);
 static uint32_t t4_position_memwin(struct adapter *, int, uint32_t);
-static int init_driver_props(struct adapter *sc, struct driver_properties *p);
-static int remove_extra_props(struct adapter *sc, int n10g, int n1g);
-static int cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g,
-    struct intrs_and_queues *iaq);
-static int add_child_node(struct adapter *sc, int idx);
-static int remove_child_node(struct adapter *sc, int idx);
-static kstat_t *setup_kstats(struct adapter *sc);
-static kstat_t *setup_wc_kstats(struct adapter *);
-static int update_wc_kstats(kstat_t *, int);
-static int t4_port_full_uninit(struct port_info *);
+static void t4_init_driver_props(struct adapter *);
+static int t4_cfg_intrs_queues(struct adapter *);
+static int t4_setup_intrs(struct adapter *);
+static int t4_add_child_node(struct adapter *, uint_t);
+static int t4_remove_child_node(struct adapter *, uint_t);
+static kstat_t *t4_setup_kstats(struct adapter *);
+static kstat_t *t4_setup_wc_kstats(struct adapter *);
+static void t4_port_full_uninit(struct port_info *);
+static t4_port_speed_t t4_port_speed(const struct port_info *);
 
 static int t4_temperature_read(void *, sensor_ioctl_scalar_t *);
 static int t4_voltage_read(void *, sensor_ioctl_scalar_t *);
+
 static const ksensor_ops_t t4_temp_ops = {
 	.kso_kind = ksensor_kind_temperature,
 	.kso_scalar = t4_temperature_read
@@ -113,7 +580,7 @@ static ddi_ufm_ops_t t4_ufm_ops = {
 	.ddi_ufm_op_getcaps = t4_ufm_getcaps
 };
 
-/* ARGSUSED */
+
 static int
 t4_devo_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
 {
@@ -168,13 +635,9 @@ static int t4_devo_detach(dev_info_t *, ddi_detach_cmd_t);
 static int
 t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 {
-	struct adapter *sc = NULL;
-	struct sge *s;
-	int i, instance, rc = DDI_SUCCESS, rqidx, tqidx, q;
-	int irq = 0, nxg = 0, n1g = 0;
+	int i = 0;
+	int rc = DDI_SUCCESS;
 	char name[16];
-	struct driver_properties *prp;
-	struct intrs_and_queues iaq;
 	ddi_device_acc_attr_t da = {
 		.devacc_attr_version = DDI_DEVICE_ATTR_V0,
 		.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
@@ -192,7 +655,7 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 	/*
 	 * Allocate space for soft state.
 	 */
-	instance = ddi_get_instance(dip);
+	const int instance = ddi_get_instance(dip);
 	rc = ddi_soft_state_zalloc(t4_soft_state, instance);
 	if (rc != DDI_SUCCESS) {
 		cxgb_printf(dip, CE_WARN,
@@ -200,21 +663,23 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 		return (DDI_FAILURE);
 	}
 
-	sc = ddi_get_soft_state(t4_soft_state, instance);
+	struct adapter *sc = ddi_get_soft_state(t4_soft_state, instance);
 	sc->dip = dip;
 	sc->dev = makedevice(ddi_driver_major(dip), instance);
 	mutex_init(&sc->lock, NULL, MUTEX_DRIVER, NULL);
 	cv_init(&sc->cv, NULL, CV_DRIVER, NULL);
 	mutex_init(&sc->sfl_lock, NULL, MUTEX_DRIVER, NULL);
-	TAILQ_INIT(&sc->sfl);
+	list_create(&sc->sfl_list, sizeof (struct sge_fl),
+	    offsetof(struct sge_fl, sfl_node));
 	mutex_init(&sc->mbox_lock, NULL, MUTEX_DRIVER, NULL);
-	STAILQ_INIT(&sc->mbox_list);
+	list_create(&sc->mbox_list, sizeof (t4_mbox_waiter_t),
+	    offsetof(t4_mbox_waiter_t, node));
 
 	mutex_enter(&t4_adapter_list_lock);
 	list_insert_tail(&t4_adapter_list, sc);
 	mutex_exit(&t4_adapter_list_lock);
 
-	sc->pf = getpf(sc);
+	sc->pf = t4_getpf(sc);
 	if (sc->pf > 8) {
 		rc = EINVAL;
 		cxgb_printf(dip, CE_WARN,
@@ -224,8 +689,8 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 	sc->mbox = sc->pf;
 
 	/* Initialize the driver properties */
-	prp = &sc->props;
-	(void) init_driver_props(sc, prp);
+	t4_init_driver_props(sc);
+	struct driver_properties *prp = &sc->props;
 
 	/*
 	 * Enable access to the PCI config space.
@@ -273,7 +738,7 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 	} else {
 		if (t4_cver_ge(sc, CHELSIO_T5)) {
 			sc->doorbells |= DOORBELL_UDB;
-			if (prp->wc) {
+			if (prp->write_combine) {
 				/*
 				 * Enable write combining on BAR2.  This is the
 				 * userspace doorbell BAR and is split into 128B
@@ -312,15 +777,15 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 	t4_setup_adapter_memwin(sc);
 
 	/* Prepare the firmware for operation */
-	rc = prep_firmware(sc);
+	rc = t4_prep_firmware(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
-	rc = adap__pre_init_tweaks(sc);
+	rc = t4_init_adap_tweaks(sc);
 	if (rc != 0)
 		goto done;
 
-	rc = get_params__pre_init(sc);
+	rc = t4_init_get_params_pre(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
@@ -336,11 +801,11 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 		}
 	}
 
-	rc = get_params__post_init(sc);
+	rc = t4_init_get_params_post(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
-	rc = set_params__post_init(sc);
+	rc = t4_init_set_params(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
 
@@ -348,7 +813,6 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 	 * TODO: This is the place to call t4_set_filter_mode()
 	 */
 
-	/* tweak some settings */
 	t4_write_reg(sc, A_TP_SHIFT_CNT,
 	    V_SYNSHIFTMAX(6) |
 	    V_RXTSHIFTMAXR1(4) |
@@ -400,140 +864,44 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 		mutex_init(&pi->lock, NULL, MUTEX_DRIVER, NULL);
 		pi->mtu = ETHERMTU;
 
-		if (t4_port_is_10xg(pi)) {
-			nxg++;
-			pi->tmr_idx = prp->tmr_idx_10g;
-			pi->pktc_idx = prp->pktc_idx_10g;
-		} else {
-			n1g++;
-			pi->tmr_idx = prp->tmr_idx_1g;
-			pi->pktc_idx = prp->pktc_idx_1g;
-		}
+		pi->tmr_idx = prp->ethq_tmr_idx;
+		pi->pktc_idx = prp->ethq_pktc_idx;
 		pi->dbq_timer_idx = prp->dbq_timer_idx;
 
 		pi->xact_addr_filt = -1;
 	}
 
-	(void) remove_extra_props(sc, nxg, n1g);
-
-	rc = cfg_itype_and_nqueues(sc, nxg, n1g, &iaq);
-	if (rc != 0)
+	if ((rc = t4_cfg_intrs_queues(sc)) != 0) {
 		goto done; /* error message displayed already */
-
-	sc->intr_type = iaq.intr_type;
-	sc->intr_count = iaq.nirq;
-
-	if (sc->props.multi_rings && (sc->intr_type != DDI_INTR_TYPE_MSIX)) {
-		sc->props.multi_rings = 0;
-		cxgb_printf(dip, CE_WARN,
-		    "Multiple rings disabled as interrupt type is not MSI-X");
 	}
 
-	if (sc->props.multi_rings && iaq.intr_fwd) {
-		sc->props.multi_rings = 0;
-		cxgb_printf(dip, CE_WARN,
-		    "Multiple rings disabled as interrupts are forwarded");
-	}
-
-	if (!sc->props.multi_rings) {
-		iaq.ntxq10g = 1;
-		iaq.ntxq1g = 1;
-	}
-	s = &sc->sge;
-	s->nrxq = nxg * iaq.nrxq10g + n1g * iaq.nrxq1g;
-	s->ntxq = nxg * iaq.ntxq10g + n1g * iaq.ntxq1g;
-	s->neq = s->ntxq + s->nrxq;	/* the fl in an rxq is an eq */
-	s->niq = s->nrxq + 1;		/* 1 extra for firmware event queue */
-	if (iaq.intr_fwd != 0)
-		sc->flags |= TAF_INTR_FWD;
-	s->rxq = kmem_zalloc(s->nrxq * sizeof (struct sge_rxq), KM_SLEEP);
-	s->txq = kmem_zalloc(s->ntxq * sizeof (struct sge_txq), KM_SLEEP);
-	s->iqmap =
-	    kmem_zalloc(s->iqmap_sz * sizeof (struct sge_iq *), KM_SLEEP);
-	s->eqmap =
-	    kmem_zalloc(s->eqmap_sz * sizeof (struct sge_eq *), KM_SLEEP);
+	const struct t4_intrs_queues *iaq = &sc->intr_queue_cfg;
+	struct sge_info *sge = &sc->sge;
+	sge->rxq =
+	    kmem_zalloc(sge->rxq_count * sizeof (struct sge_rxq), KM_SLEEP);
+	sge->txq =
+	    kmem_zalloc(sge->txq_count * sizeof (struct sge_txq), KM_SLEEP);
+	sge->iqmap =
+	    kmem_zalloc(sge->iqmap_sz * sizeof (struct sge_iq *), KM_SLEEP);
+	sge->eqmap =
+	    kmem_zalloc(sge->eqmap_sz * sizeof (struct sge_eq *), KM_SLEEP);
 
 	sc->intr_handle =
-	    kmem_zalloc(sc->intr_count * sizeof (ddi_intr_handle_t), KM_SLEEP);
+	    kmem_zalloc(iaq->intr_count * sizeof (ddi_intr_handle_t),
+	    KM_SLEEP);
 
 	/*
-	 * Second pass over the ports.  This time we know the number of rx and
-	 * tx queues that each port should get.
+	 * Enable hw checksumming and LSO for all ports by default.
+	 * They can be disabled using ndd (hw_csum and hw_lso).
 	 */
-	rqidx = tqidx = 0;
 	for_each_port(sc, i) {
-		struct port_info *pi = sc->port[i];
-
-		if (pi == NULL)
-			continue;
-
-		t4_mc_cb_init(pi);
-		pi->first_rxq = rqidx;
-		pi->nrxq = (t4_port_is_10xg(pi)) ? iaq.nrxq10g : iaq.nrxq1g;
-		pi->first_txq = tqidx;
-		pi->ntxq = (t4_port_is_10xg(pi)) ? iaq.ntxq10g : iaq.ntxq1g;
-
-		rqidx += pi->nrxq;
-		tqidx += pi->ntxq;
-
-		/*
-		 * Enable hw checksumming and LSO for all ports by default.
-		 * They can be disabled using ndd (hw_csum and hw_lso).
-		 */
-		pi->features |= (CXGBE_HW_CSUM | CXGBE_HW_LSO);
+		sc->port[i]->features |= (CXGBE_HW_CSUM | CXGBE_HW_LSO);
 	}
 
-	/*
-	 * Setup Interrupts.
-	 */
-
-	i = 0;
-	rc = ddi_intr_alloc(dip, sc->intr_handle, sc->intr_type, 0,
-	    sc->intr_count, &i, DDI_INTR_ALLOC_STRICT);
-	if (rc != DDI_SUCCESS) {
-		cxgb_printf(dip, CE_WARN,
-		    "failed to allocate %d interrupt(s) of type %d: %d, %d",
-		    sc->intr_count, sc->intr_type, rc, i);
+	/* Setup Interrupts. */
+	if ((rc = t4_setup_intrs(sc)) != DDI_SUCCESS) {
 		goto done;
 	}
-	ASSERT(sc->intr_count == i); /* allocation was STRICT */
-	(void) ddi_intr_get_cap(sc->intr_handle[0], &sc->intr_cap);
-	(void) ddi_intr_get_pri(sc->intr_handle[0], &sc->intr_pri);
-	if (sc->intr_count == 1) {
-		ASSERT(sc->flags & TAF_INTR_FWD);
-		(void) ddi_intr_add_handler(sc->intr_handle[0], t4_intr_all, sc,
-		    &s->fwq);
-	} else {
-		/* Multiple interrupts.  The first one is always error intr */
-		(void) ddi_intr_add_handler(sc->intr_handle[0], t4_intr_err, sc,
-		    NULL);
-		irq++;
-
-		/* The second one is always the firmware event queue */
-		(void) ddi_intr_add_handler(sc->intr_handle[1], t4_intr, sc,
-		    &s->fwq);
-		irq++;
-		/*
-		 * Note that if TAF_INTR_FWD is set then either the NIC rx
-		 * queues or (exclusive or) the TOE rx queueus will be taking
-		 * direct interrupts.
-		 *
-		 * There is no need to check for is_offload(sc) as nofldrxq
-		 * will be 0 if offload is disabled.
-		 */
-		for_each_port(sc, i) {
-			struct port_info *pi = sc->port[i];
-			struct sge_rxq *rxq;
-			rxq = &s->rxq[pi->first_rxq];
-			for (q = 0; q < pi->nrxq; q++, rxq++) {
-				(void) ddi_intr_add_handler(
-				    sc->intr_handle[irq], t4_intr, sc,
-				    &rxq->iq);
-				irq++;
-			}
-		}
-
-	}
 	sc->flags |= TAF_INTR_ALLOC;
 
 	if ((rc = ksensor_create_scalar_pcidev(dip, SENSOR_KIND_TEMPERATURE,
@@ -561,17 +929,31 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 	}
 	ddi_ufm_update(sc->ufm_hdl);
 
-	if ((rc = t4_alloc_fwq(sc)) != 0) {
+	if ((rc = t4_alloc_evt_iqs(sc)) != 0) {
 		cxgb_printf(dip, CE_WARN, "failed to alloc FWQ: %d", rc);
 		rc = DDI_FAILURE;
 		goto done;
 	}
 
 	if (sc->intr_cap & DDI_INTR_FLAG_BLOCK) {
-		(void) ddi_intr_block_enable(sc->intr_handle, sc->intr_count);
+		rc = ddi_intr_block_enable(sc->intr_handle, iaq->intr_count);
+
+		if (rc != DDI_SUCCESS) {
+			cxgb_printf(dip, CE_WARN, "failed to enable intr "
+			    "block: %d", rc);
+			rc = DDI_FAILURE;
+			goto done;
+		}
 	} else {
-		for (i = 0; i < sc->intr_count; i++)
-			(void) ddi_intr_enable(sc->intr_handle[i]);
+		for (i = 0; i < iaq->intr_count; i++) {
+			rc = ddi_intr_enable(sc->intr_handle[i]);
+			if (rc != DDI_SUCCESS) {
+				cxgb_printf(dip, CE_WARN, "failed to enable "
+				    "intr %d: %d", i, rc);
+				rc = DDI_FAILURE;
+				goto done;
+			}
+		}
 	}
 	t4_intr_enable(sc);
 
@@ -588,14 +970,8 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 	 */
 	t4_dump_version_info(sc);
 
-	cxgb_printf(dip, CE_NOTE, "(%d rxq, %d txq total) %d %s.",
-	    rqidx, tqidx, sc->intr_count,
-	    sc->intr_type == DDI_INTR_TYPE_MSIX ? "MSI-X interrupts" :
-	    sc->intr_type == DDI_INTR_TYPE_MSI ? "MSI interrupts" :
-	    "fixed interrupt");
-
-	sc->ksp = setup_kstats(sc);
-	sc->ksp_stat = setup_wc_kstats(sc);
+	sc->ksp = t4_setup_kstats(sc);
+	sc->ksp_stat = t4_setup_wc_kstats(sc);
 	sc->params.drv_memwin = MEMWIN_NIC;
 
 done:
@@ -612,36 +988,37 @@ done:
 static int
 t4_devo_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 {
-	int instance, i;
-	struct adapter *sc;
+	int i = 0;
 	struct port_info *pi;
-	struct sge *s;
+	struct sge_info *s;
 
 	if (cmd != DDI_DETACH)
 		return (DDI_FAILURE);
 
-	instance = ddi_get_instance(dip);
-	sc = ddi_get_soft_state(t4_soft_state, instance);
+	const int instance = ddi_get_instance(dip);
+	struct adapter *sc = ddi_get_soft_state(t4_soft_state, instance);
 	if (sc == NULL)
 		return (DDI_SUCCESS);
 
+	struct t4_intrs_queues *iaq = &sc->intr_queue_cfg;
+
 	if (sc->flags & TAF_INIT_DONE) {
 		t4_intr_disable(sc);
 		for_each_port(sc, i) {
 			pi = sc->port[i];
 			if (pi && pi->flags & TPF_INIT_DONE)
-				(void) t4_port_full_uninit(pi);
+				t4_port_full_uninit(pi);
 		}
 
 		if (sc->intr_cap & DDI_INTR_FLAG_BLOCK) {
 			(void) ddi_intr_block_disable(sc->intr_handle,
-			    sc->intr_count);
+			    iaq->intr_count);
 		} else {
-			for (i = 0; i < sc->intr_count; i++)
+			for (i = 0; i < iaq->intr_count; i++)
 				(void) ddi_intr_disable(sc->intr_handle[i]);
 		}
 
-		(void) t4_free_fwq(sc);
+		t4_free_evt_iqs(sc);
 
 		sc->flags &= ~TAF_INIT_DONE;
 	}
@@ -662,9 +1039,9 @@ t4_devo_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 
 	s = &sc->sge;
 	if (s->rxq != NULL)
-		kmem_free(s->rxq, s->nrxq * sizeof (struct sge_rxq));
+		kmem_free(s->rxq, s->rxq_count * sizeof (struct sge_rxq));
 	if (s->txq != NULL)
-		kmem_free(s->txq, s->ntxq * sizeof (struct sge_txq));
+		kmem_free(s->txq, s->txq_count * sizeof (struct sge_txq));
 	if (s->iqmap != NULL)
 		kmem_free(s->iqmap, s->iqmap_sz * sizeof (struct sge_iq *));
 	if (s->eqmap != NULL)
@@ -674,21 +1051,39 @@ t4_devo_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 		kmem_cache_destroy(s->rxbuf_cache);
 
 	if (sc->flags & TAF_INTR_ALLOC) {
-		for (i = 0; i < sc->intr_count; i++) {
-			(void) ddi_intr_remove_handler(sc->intr_handle[i]);
-			(void) ddi_intr_free(sc->intr_handle[i]);
+		for (int i = 0; i < iaq->intr_count; i++) {
+			int rc = ddi_intr_remove_handler(sc->intr_handle[i]);
+			if (rc != DDI_SUCCESS) {
+				cxgb_printf(sc->dip, CE_WARN, "failed to "
+				    "remove interrupt handler %d for type: %d "
+				    "plan: %d: %d", i, iaq->intr_type,
+				    iaq->intr_plan, rc);
+			}
+
+			rc = ddi_intr_free(sc->intr_handle[i]);
+			if (rc != DDI_SUCCESS) {
+				cxgb_printf(sc->dip, CE_WARN, "failed to free "
+				    "interrupt %d for type: %d plan: %d: %d", i,
+				    iaq->intr_type, iaq->intr_plan, rc);
+
+			}
 		}
 		sc->flags &= ~TAF_INTR_ALLOC;
 	}
 
 	if (sc->intr_handle != NULL) {
 		kmem_free(sc->intr_handle,
-		    sc->intr_count * sizeof (*sc->intr_handle));
+		    iaq->intr_count * sizeof (*sc->intr_handle));
 	}
 
 	for_each_port(sc, i) {
 		pi = sc->port[i];
 		if (pi != NULL) {
+			if (pi->intr_iqs != NULL) {
+				kmem_free(pi->intr_iqs,
+				    sizeof (pi->intr_iqs[0]) *
+				    sc->intr_queue_cfg.intr_per_port);
+			}
 			mutex_destroy(&pi->lock);
 			kmem_free(pi, sizeof (*pi));
 		}
@@ -758,9 +1153,10 @@ t4_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t op, void *arg,
 
 	switch (op) {
 	case DDI_CTLOPS_REPORTDEV:
-		pi = ddi_get_parent_data(rdip);
-		pi->instance = ddi_get_instance(dip);
-		pi->child_inst = ddi_get_instance(rdip);
+		if (rdip == NULL)
+			return (DDI_FAILURE);
+		cmn_err(CE_CONT, "?t4nexus: %s%d\n",
+		    ddi_driver_name(rdip), ddi_get_instance(rdip));
 		return (DDI_SUCCESS);
 
 	case DDI_CTLOPS_INITCHILD:
@@ -784,43 +1180,58 @@ t4_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t op, void *arg,
 	}
 }
 
+/* From a provided "cxgbe@0" string, parse the device number */
+static bool
+t4_parse_devnum(const char *devname, uint_t *inst_nump)
+{
+	const size_t name_sz = strlen(devname) + 1;
+	char *name_copy = i_ddi_strdup(devname, KM_SLEEP);
+
+	bool res = false;
+	char *nodename, *addrname = NULL;
+	i_ddi_parse_name(name_copy, &nodename, &addrname, NULL);
+	if (addrname == NULL || strcmp(T4_PORT_NAME, nodename) != 0) {
+		goto done;
+	}
+
+	ulong_t num;
+	if (ddi_strtoul(addrname, NULL, 10, &num) != 0 || num > UINT_MAX) {
+		goto done;
+	}
+	*inst_nump = (uint_t)num;
+	res = true;
+
+done:
+	kmem_free(name_copy, name_sz);
+	return (res);
+}
+
 static int
 t4_bus_config(dev_info_t *dip, uint_t flags, ddi_bus_config_op_t op, void *arg,
     dev_info_t **cdipp)
 {
-	int instance, i;
-	struct adapter *sc;
-
-	instance = ddi_get_instance(dip);
-	sc = ddi_get_soft_state(t4_soft_state, instance);
+	struct adapter *sc =
+	    ddi_get_soft_state(t4_soft_state, ddi_get_instance(dip));
 
 	if (op == BUS_CONFIG_ONE) {
-		char *c;
-
-		/*
-		 * arg is something like "cxgb@0" where 0 is the port_id hanging
-		 * off this nexus.
-		 */
+		uint_t dev_num;
 
-		c = arg;
-		while (*(c + 1))
-			c++;
-
-		/* There should be exactly 1 digit after '@' */
-		if (*(c - 1) != '@')
+		if (!t4_parse_devnum((const char *)arg, &dev_num)) {
 			return (NDI_FAILURE);
-
-		i = *c - '0';
-
-		if (add_child_node(sc, i) != 0)
+		}
+		if (t4_add_child_node(sc, dev_num) != 0) {
 			return (NDI_FAILURE);
+		}
 
 		flags |= NDI_ONLINE_ATTACH;
 
 	} else if (op == BUS_CONFIG_ALL || op == BUS_CONFIG_DRIVER) {
+		int i;
+
 		/* Allocate and bind all child device nodes */
-		for_each_port(sc, i)
-		    (void) add_child_node(sc, i);
+		for_each_port(sc, i) {
+			(void) t4_add_child_node(sc, (uint_t)i);
+		}
 		flags |= NDI_ONLINE_ATTACH;
 	}
 
@@ -831,107 +1242,97 @@ static int
 t4_bus_unconfig(dev_info_t *dip, uint_t flags, ddi_bus_config_op_t op,
     void *arg)
 {
-	int instance, i, rc;
-	struct adapter *sc;
-
-	instance = ddi_get_instance(dip);
-	sc = ddi_get_soft_state(t4_soft_state, instance);
+	struct adapter *sc
+	    = ddi_get_soft_state(t4_soft_state, ddi_get_instance(dip));
 
-	if (op == BUS_CONFIG_ONE || op == BUS_UNCONFIG_ALL ||
-	    op == BUS_UNCONFIG_DRIVER)
+	if (op == BUS_UNCONFIG_ONE ||
+	    op == BUS_UNCONFIG_ALL ||
+	    op == BUS_UNCONFIG_DRIVER) {
 		flags |= NDI_UNCONFIG;
+	}
 
-	rc = ndi_busop_bus_unconfig(dip, flags, op, arg);
+	int rc = ndi_busop_bus_unconfig(dip, flags, op, arg);
 	if (rc != 0)
 		return (rc);
 
 	if (op == BUS_UNCONFIG_ONE) {
-		char *c;
-
-		c = arg;
-		while (*(c + 1))
-			c++;
+		uint_t dev_num;
 
-		if (*(c - 1) != '@')
-			return (NDI_SUCCESS);
-
-		i = *c - '0';
-
-		rc = remove_child_node(sc, i);
+		if (!t4_parse_devnum((const char *)arg, &dev_num)) {
+			return (NDI_FAILURE);
+		}
 
+		rc = t4_remove_child_node(sc, dev_num);
 	} else if (op == BUS_UNCONFIG_ALL || op == BUS_UNCONFIG_DRIVER) {
+		uint_t i;
 
-		for_each_port(sc, i)
-		    (void) remove_child_node(sc, i);
+		for_each_port(sc, i) {
+			(void) t4_remove_child_node(sc, i);
+		}
 	}
 
 	return (rc);
 }
 
-/* ARGSUSED */
 static int
 t4_cb_open(dev_t *devp, int flag, int otyp, cred_t *credp)
 {
 	struct adapter *sc;
 
-	if (otyp != OTYP_CHR)
+	if (otyp != OTYP_CHR) {
 		return (EINVAL);
+	}
 
 	sc = ddi_get_soft_state(t4_soft_state, getminor(*devp));
-	if (sc == NULL)
+	if (sc == NULL) {
 		return (ENXIO);
+	}
 
 	return (atomic_cas_uint(&sc->open, 0, EBUSY));
 }
 
-/* ARGSUSED */
 static int
 t4_cb_close(dev_t dev, int flag, int otyp, cred_t *credp)
 {
-	struct adapter *sc;
+	struct adapter *sc = ddi_get_soft_state(t4_soft_state, getminor(dev));
 
-	sc = ddi_get_soft_state(t4_soft_state, getminor(dev));
-	if (sc == NULL)
+	if (sc == NULL) {
 		return (EINVAL);
+	}
 
 	(void) atomic_swap_uint(&sc->open, 0);
 	return (0);
 }
 
-/* ARGSUSED */
 static int
 t4_cb_ioctl(dev_t dev, int cmd, intptr_t d, int mode, cred_t *credp, int *rp)
 {
-	int instance;
-	struct adapter *sc;
-	void *data = (void *)d;
-
-	if (crgetuid(credp) != 0)
+	if (crgetuid(credp) != 0) {
 		return (EPERM);
+	}
 
-	instance = getminor(dev);
-	sc = ddi_get_soft_state(t4_soft_state, instance);
-	if (sc == NULL)
+	struct adapter *sc = ddi_get_soft_state(t4_soft_state, getminor(dev));
+
+	if (sc == NULL) {
 		return (EINVAL);
+	}
 
-	return (t4_ioctl(sc, cmd, data, mode));
+	return (t4_ioctl(sc, cmd, (void *)d, mode));
 }
 
-static unsigned int
-getpf(struct adapter *sc)
+static uint_t
+t4_getpf(struct adapter *sc)
 {
-	int rc, *data;
-	uint_t n, pf;
+	int *data;
+	uint_t n;
 
-	rc = ddi_prop_lookup_int_array(DDI_DEV_T_ANY, sc->dip,
+	const int rc = ddi_prop_lookup_int_array(DDI_DEV_T_ANY, sc->dip,
 	    DDI_PROP_DONTPASS, "reg", &data, &n);
 	if (rc != DDI_SUCCESS) {
-		cxgb_printf(sc->dip, CE_WARN,
-		    "failed to lookup \"reg\" property: %d", rc);
-		return (0xff);
+		return (UINT_MAX);
 	}
 
-	pf = PCI_REG_FUNC_G(data[0]);
+	const uint_t pf = PCI_REG_FUNC_G(data[0]);
 	ddi_prop_free(data);
 
 	return (pf);
@@ -942,21 +1343,12 @@ getpf(struct adapter *sc)
  * become the master, and reset the device.
  */
 static int
-prep_firmware(struct adapter *sc)
+t4_prep_firmware(struct adapter *sc)
 {
 	int rc;
-	size_t fw_size;
-	int reset = 1;
-	enum dev_state state;
-	unsigned char *fw_data;
-	struct fw_hdr *card_fw, *hdr;
-	const char *fw_file = NULL;
-	firmware_handle_t fw_hdl;
-	struct fw_info fi, *fw_info = &fi;
-
-	struct driver_properties *p = &sc->props;
 
 	/* Contact firmware, request master */
+	enum dev_state state;
 	rc = t4_fw_hello(sc, sc->mbox, sc->mbox, MASTER_MUST, &state);
 	if (rc < 0) {
 		rc = -rc;
@@ -969,8 +1361,9 @@ prep_firmware(struct adapter *sc)
 		sc->flags |= TAF_MASTER_PF;
 
 	/* We may need FW version info for later reporting */
-	t4_get_version_info(sc);
+	(void) t4_get_version_info(sc);
 
+	const char *fw_file = NULL;
 	switch (CHELSIO_CHIP_VERSION(sc->params.chip)) {
 	case CHELSIO_T4:
 		fw_file = "t4fw.bin";
@@ -986,58 +1379,58 @@ prep_firmware(struct adapter *sc)
 		return (EINVAL);
 	}
 
+	firmware_handle_t fw_hdl;
 	if (firmware_open(T4_PORT_NAME, fw_file, &fw_hdl) != 0) {
 		cxgb_printf(sc->dip, CE_WARN, "Could not open %s\n", fw_file);
 		return (EINVAL);
 	}
 
-	fw_size = firmware_get_size(fw_hdl);
-
+	const size_t fw_size = firmware_get_size(fw_hdl);
 	if (fw_size < sizeof (struct fw_hdr)) {
-		cxgb_printf(sc->dip, CE_WARN, "%s is too small (%ld bytes)\n",
+		cxgb_printf(sc->dip, CE_WARN, "%s is too small (%lu bytes)\n",
 		    fw_file, fw_size);
-		firmware_close(fw_hdl);
+		(void) firmware_close(fw_hdl);
 		return (EINVAL);
 	}
-
 	if (fw_size > FLASH_FW_MAX_SIZE) {
 		cxgb_printf(sc->dip, CE_WARN,
-		    "%s is too large (%ld bytes, max allowed is %ld)\n",
+		    "%s is too large (%lu bytes, max allowed is %lu)\n",
 		    fw_file, fw_size, FLASH_FW_MAX_SIZE);
-		firmware_close(fw_hdl);
+		(void) firmware_close(fw_hdl);
 		return (EFBIG);
 	}
 
-	fw_data = kmem_zalloc(fw_size, KM_SLEEP);
+	unsigned char *fw_data = kmem_zalloc(fw_size, KM_SLEEP);
 	if (firmware_read(fw_hdl, 0, fw_data, fw_size) != 0) {
 		cxgb_printf(sc->dip, CE_WARN, "Failed to read from %s\n",
 		    fw_file);
-		firmware_close(fw_hdl);
+		(void) firmware_close(fw_hdl);
 		kmem_free(fw_data, fw_size);
 		return (EINVAL);
 	}
-	firmware_close(fw_hdl);
-
-	bzero(fw_info, sizeof (*fw_info));
-	fw_info->chip = CHELSIO_CHIP_VERSION(sc->params.chip);
-
-	hdr = (struct fw_hdr *)fw_data;
-	fw_info->fw_hdr.fw_ver = hdr->fw_ver;
-	fw_info->fw_hdr.chip = hdr->chip;
-	fw_info->fw_hdr.intfver_nic = hdr->intfver_nic;
-	fw_info->fw_hdr.intfver_vnic = hdr->intfver_vnic;
-	fw_info->fw_hdr.intfver_ofld = hdr->intfver_ofld;
-	fw_info->fw_hdr.intfver_ri = hdr->intfver_ri;
-	fw_info->fw_hdr.intfver_iscsipdu = hdr->intfver_iscsipdu;
-	fw_info->fw_hdr.intfver_iscsi = hdr->intfver_iscsi;
-	fw_info->fw_hdr.intfver_fcoepdu = hdr->intfver_fcoepdu;
-	fw_info->fw_hdr.intfver_fcoe = hdr->intfver_fcoe;
+	(void) firmware_close(fw_hdl);
+
+	const struct fw_hdr *hdr = (struct fw_hdr *)fw_data;
+	struct fw_info fi;
+	bzero(&fi, sizeof (fi));
+	fi.chip				= CHELSIO_CHIP_VERSION(sc->params.chip);
+	fi.fw_hdr.fw_ver		= hdr->fw_ver;
+	fi.fw_hdr.chip			= hdr->chip;
+	fi.fw_hdr.intfver_nic		= hdr->intfver_nic;
+	fi.fw_hdr.intfver_vnic		= hdr->intfver_vnic;
+	fi.fw_hdr.intfver_ofld		= hdr->intfver_ofld;
+	fi.fw_hdr.intfver_ri		= hdr->intfver_ri;
+	fi.fw_hdr.intfver_iscsipdu	= hdr->intfver_iscsipdu;
+	fi.fw_hdr.intfver_iscsi		= hdr->intfver_iscsi;
+	fi.fw_hdr.intfver_fcoepdu	= hdr->intfver_fcoepdu;
+	fi.fw_hdr.intfver_fcoe		= hdr->intfver_fcoe;
 
 	/* allocate memory to read the header of the firmware on the card */
-	card_fw = kmem_zalloc(sizeof (*card_fw), KM_SLEEP);
+	struct fw_hdr *card_fw = kmem_zalloc(sizeof (struct fw_hdr), KM_SLEEP);
 
-	rc = -t4_prep_fw(sc, fw_info, fw_data, fw_size, card_fw,
-	    p->t4_fw_install, state, &reset);
+	int reset = 1;
+	rc = -t4_prep_fw(sc, &fi, fw_data, fw_size, card_fw,
+	    sc->props.t4_fw_install, state, &reset);
 
 	kmem_free(card_fw, sizeof (*card_fw));
 	kmem_free(fw_data, fw_size);
@@ -1065,18 +1458,21 @@ prep_firmware(struct adapter *sc)
 	if (sc->flags & TAF_MASTER_PF) {
 		/* Handle default vs special T4 config file */
 
-		rc = partition_resources(sc);
-		if (rc != 0)
-			goto err;	/* error message displayed already */
+		rc = t4_partition_resources(sc);
+		if (rc != 0) {
+			return (rc);
+		}
 	}
 
 	sc->flags |= FW_OK;
 	return (0);
-err:
-	return (rc);
-
 }
 
+struct memwin {
+	uint32_t base;
+	uint32_t aperture;
+};
+
 static const struct memwin t4_memwin[] = {
 	{ MEMWIN0_BASE, MEMWIN0_APERTURE },
 	{ MEMWIN1_BASE, MEMWIN1_APERTURE },
@@ -1101,8 +1497,8 @@ static const struct memwin t5_memwin[] = {
  * valid and lies entirely within the memtype specified.  The global address of
  * the start of the range is returned in addr.
  */
-int
-validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, int len,
+static int
+t4_validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, int len,
     uint32_t *addr)
 {
 	uint32_t em, addr_len, maddr, mlen;
@@ -1156,7 +1552,7 @@ validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, int len,
 }
 
 static void
-memwin_info(struct adapter *sc, int win, uint32_t *base, uint32_t *aperture)
+t4_memwin_info(struct adapter *sc, int win, uint32_t *base, uint32_t *aperture)
 {
 	const struct memwin *mw;
 
@@ -1176,7 +1572,7 @@ memwin_info(struct adapter *sc, int win, uint32_t *base, uint32_t *aperture)
  * Upload configuration file to card's memory.
  */
 static int
-upload_config_file(struct adapter *sc, uint32_t *mt, uint32_t *ma)
+t4_upload_config_file(struct adapter *sc, uint32_t *mt, uint32_t *ma)
 {
 	int rc = 0;
 	size_t cflen, cfbaselen;
@@ -1231,17 +1627,17 @@ upload_config_file(struct adapter *sc, uint32_t *mt, uint32_t *ma)
 		cxgb_printf(sc->dip, CE_WARN,
 		    "config file too long (%d, max allowed is %d).  ",
 		    cflen, FLASH_CFG_MAX_SIZE);
-		firmware_close(fw_hdl);
+		(void) firmware_close(fw_hdl);
 		return (EFBIG);
 	}
 
-	rc = validate_mt_off_len(sc, mtype, maddr, cflen, &addr);
+	rc = t4_validate_mt_off_len(sc, mtype, maddr, cflen, &addr);
 	if (rc != 0) {
 		cxgb_printf(sc->dip, CE_WARN,
 		    "%s: addr (%d/0x%x) or len %d is not valid: %d.  "
 		    "Will try to use the config on the card, if any.\n",
 		    __func__, mtype, maddr, cflen, rc);
-		firmware_close(fw_hdl);
+		(void) firmware_close(fw_hdl);
 		return (EFAULT);
 	}
 
@@ -1250,13 +1646,13 @@ upload_config_file(struct adapter *sc, uint32_t *mt, uint32_t *ma)
 	if (firmware_read(fw_hdl, 0, cfdata, cflen) != 0) {
 		cxgb_printf(sc->dip, CE_WARN, "Failed to read from %s\n",
 		    cfg_file);
-		firmware_close(fw_hdl);
+		(void) firmware_close(fw_hdl);
 		kmem_free(cfbase, cfbaselen);
 		return (EINVAL);
 	}
-	firmware_close(fw_hdl);
+	(void) firmware_close(fw_hdl);
 
-	memwin_info(sc, 2, &mw_base, &mw_aperture);
+	t4_memwin_info(sc, 2, &mw_base, &mw_aperture);
 	while (cflen) {
 		off = t4_position_memwin(sc, 2, addr);
 		n = min(cflen, mw_aperture - off);
@@ -1277,24 +1673,26 @@ upload_config_file(struct adapter *sc, uint32_t *mt, uint32_t *ma)
  * the firmware to process it.
  */
 static int
-partition_resources(struct adapter *sc)
+t4_partition_resources(struct adapter *sc)
 {
 	int rc;
-	struct fw_caps_config_cmd caps;
-	uint32_t mtype, maddr, finicsum, cfcsum;
+	uint32_t mtype, maddr;
 
-	rc = upload_config_file(sc, &mtype, &maddr);
+	rc = t4_upload_config_file(sc, &mtype, &maddr);
 	if (rc != 0) {
 		mtype = FW_MEMTYPE_CF_FLASH;
 		maddr = t4_flash_cfg_addr(sc);
 	}
 
+	struct fw_caps_config_cmd caps;
 	bzero(&caps, sizeof (caps));
 	caps.op_to_write = BE_32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_READ);
 	caps.cfvalid_to_len16 = BE_32(F_FW_CAPS_CONFIG_CMD_CFVALID |
 	    V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) |
-	    V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(maddr >> 16) | FW_LEN16(caps));
+	    V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(maddr >> 16) |
+	    FW_LEN16(struct fw_caps_config_cmd));
+
 	rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof (caps), &caps);
 	if (rc != 0) {
 		cxgb_printf(sc->dip, CE_WARN,
@@ -1302,26 +1700,26 @@ partition_resources(struct adapter *sc)
 		return (rc);
 	}
 
-	finicsum = ntohl(caps.finicsum);
-	cfcsum = ntohl(caps.cfcsum);
-	if (finicsum != cfcsum) {
+	if (caps.finicsum != caps.cfcsum) {
 		cxgb_printf(sc->dip, CE_WARN,
 		    "WARNING: config file checksum mismatch: %08x %08x\n",
-		    finicsum, cfcsum);
+		    BE_32(caps.finicsum), BE_32(caps.cfcsum));
 	}
-	sc->cfcsum = cfcsum;
+	sc->cfcsum = BE_32(caps.cfcsum);	/* keep host byte order */
 
-	/* TODO: Need to configure this correctly */
-	caps.toecaps = htons(FW_CAPS_CONFIG_TOE);
+	/* Disable unused offloads and features */
+	caps.toecaps = 0;
 	caps.iscsicaps = 0;
 	caps.rdmacaps = 0;
 	caps.fcoecaps = 0;
+	caps.cryptocaps = 0;
+
 	/* TODO: Disable VNIC cap for now */
-	caps.niccaps ^= htons(FW_CAPS_CONFIG_NIC_VM);
+	caps.niccaps &= BE_16(~FW_CAPS_CONFIG_NIC_VM);
 
-	caps.op_to_write = htonl(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
+	caps.op_to_write = BE_32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_WRITE);
-	caps.cfvalid_to_len16 = htonl(FW_LEN16(caps));
+	caps.cfvalid_to_len16 = BE_32(FW_LEN16(caps));
 	rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof (caps), NULL);
 	if (rc != 0) {
 		cxgb_printf(sc->dip, CE_WARN,
@@ -1342,7 +1740,7 @@ partition_resources(struct adapter *sc)
  * Configuration Files and hard-coded initialization ...
  */
 static int
-adap__pre_init_tweaks(struct adapter *sc)
+t4_init_adap_tweaks(struct adapter *sc)
 {
 	int rx_dma_offset = 2; /* Offset of RX packets into DMA buffers */
 
@@ -1364,12 +1762,10 @@ adap__pre_init_tweaks(struct adapter *sc)
  * t4_sge_init and t4_fw_initialize.
  */
 static int
-get_params__pre_init(struct adapter *sc)
+t4_init_get_params_pre(struct adapter *sc)
 {
 	int rc;
 	uint32_t param[2], val[2];
-	struct fw_devlog_cmd cmd;
-	struct devlog_params *dlog = &sc->params.devlog;
 
 	/*
 	 * Grab the raw VPD parameters.
@@ -1401,25 +1797,31 @@ get_params__pre_init(struct adapter *sc)
 		sc->params.nports++;
 		val[0] &= val[0] - 1;
 	}
-
 	sc->params.vpd.cclk = val[1];
 
 	/* Read device log parameters. */
+	struct fw_devlog_cmd cmd;
 	bzero(&cmd, sizeof (cmd));
-	cmd.op_to_write = htonl(V_FW_CMD_OP(FW_DEVLOG_CMD) |
+	cmd.op_to_write = BE_32(V_FW_CMD_OP(FW_DEVLOG_CMD) |
 	    F_FW_CMD_REQUEST | F_FW_CMD_READ);
-	cmd.retval_len16 = htonl(FW_LEN16(cmd));
+	cmd.retval_len16 = BE_32(FW_LEN16(struct fw_devlog_cmd));
+
 	rc = -t4_wr_mbox(sc, sc->mbox, &cmd, sizeof (cmd), &cmd);
 	if (rc != 0) {
 		cxgb_printf(sc->dip, CE_WARN,
 		    "failed to get devlog parameters: %d.\n", rc);
-		bzero(dlog, sizeof (*dlog));
-		rc = 0;	/* devlog isn't critical for device operation */
+
+		/* devlog isn't critical for device operation */
+		bzero(&sc->params.devlog, sizeof (sc->params.devlog));
+		rc = 0;
 	} else {
-		val[0] = ntohl(cmd.memtype_devlog_memaddr16_devlog);
-		dlog->memtype = G_FW_DEVLOG_CMD_MEMTYPE_DEVLOG(val[0]);
-		dlog->start = G_FW_DEVLOG_CMD_MEMADDR16_DEVLOG(val[0]) << 4;
-		dlog->size = ntohl(cmd.memsize_devlog);
+		const uint32_t info =
+		    BE_32(cmd.memtype_devlog_memaddr16_devlog);
+		struct devlog_params *dlog = &sc->params.devlog;
+
+		dlog->memtype = G_FW_DEVLOG_CMD_MEMTYPE_DEVLOG(info);
+		dlog->start = G_FW_DEVLOG_CMD_MEMADDR16_DEVLOG(info) << 4;
+		dlog->size = BE_32(cmd.memsize_devlog);
 	}
 
 	return (rc);
@@ -1430,11 +1832,10 @@ get_params__pre_init(struct adapter *sc)
  * has been initialized by the firmware at this point.
  */
 static int
-get_params__post_init(struct adapter *sc)
+t4_init_get_params_post(struct adapter *sc)
 {
 	int rc;
 	uint32_t param[4], val[4];
-	struct fw_caps_config_cmd caps;
 
 	param[0] = FW_PARAM_PFVF(IQFLINT_START);
 	param[1] = FW_PARAM_PFVF(EQ_START);
@@ -1447,27 +1848,10 @@ get_params__post_init(struct adapter *sc)
 		return (rc);
 	}
 
-	sc->sge.iq_start = val[0];
-	sc->sge.eq_start = val[1];
-	sc->sge.iqmap_sz = val[2] - sc->sge.iq_start + 1;
-	sc->sge.eqmap_sz = val[3] - sc->sge.eq_start + 1;
-
-	uint32_t r = t4_read_reg(sc, A_SGE_EGRESS_QUEUES_PER_PAGE_PF);
-	r >>= S_QUEUESPERPAGEPF0 +
-	    (S_QUEUESPERPAGEPF1 - S_QUEUESPERPAGEPF0) * sc->pf;
-	sc->sge.s_qpp = r & M_QUEUESPERPAGEPF0;
-
-	/* get capabilites */
-	bzero(&caps, sizeof (caps));
-	caps.op_to_write = htonl(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
-	    F_FW_CMD_REQUEST | F_FW_CMD_READ);
-	caps.cfvalid_to_len16 = htonl(FW_LEN16(caps));
-	rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof (caps), &caps);
-	if (rc != 0) {
-		cxgb_printf(sc->dip, CE_WARN,
-		    "failed to get card capabilities: %d.\n", rc);
-		return (rc);
-	}
+	sc->sge.iqmap_start = val[0];
+	sc->sge.eqmap_start = val[1];
+	sc->sge.iqmap_sz = (val[2] - sc->sge.iqmap_start) + 1;
+	sc->sge.eqmap_sz = (val[3] - sc->sge.eqmap_start) + 1;
 
 	/* Check if DBQ timer is available for tracking egress completions */
 	param[0] = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
@@ -1479,6 +1863,19 @@ get_params__post_init(struct adapter *sc)
 		    ARRAY_SIZE(sc->sge.dbq_timers), sc->sge.dbq_timers);
 		if (rc == 0) {
 			sc->flags |= TAF_DBQ_TIMER;
+
+			/*
+			 * Expose DBQ timer values as property, converting them
+			 * to plain `int` as required.
+			 */
+			int tmp_encode[ARRAY_SIZE(sc->sge.dbq_timers)];
+			for (uint_t i = 0; i < ARRAY_SIZE(tmp_encode); i++) {
+				tmp_encode[i] = sc->sge.dbq_timers[i];
+			}
+			/* Element count must match the array being passed. */
+			(void) ddi_prop_update_int_array(sc->dev, sc->dip,
+			    "tx-reclaim-timer-us-values",
+			    tmp_encode, ARRAY_SIZE(tmp_encode));
 		} else {
 			sc->sge.dbq_timer_tick = 0;
 		}
@@ -1533,7 +1930,7 @@ get_params__post_init(struct adapter *sc)
 }
 
 static int
-set_params__post_init(struct adapter *sc)
+t4_init_set_params(struct adapter *sc)
 {
 	uint32_t param, val;
 
@@ -1643,15 +2040,34 @@ prop_lookup_int(struct adapter *sc, char *name, int defval)
 	    name, defval));
 }
 
+static bool
+prop_lookup_bool(struct adapter *sc, char *name, bool defval)
+{
+	/* -1 doubles as the "property absent" sentinel for both lookups. */
+	int val = ddi_prop_get_int(sc->dev, sc->dip, DDI_PROP_DONTPASS, name,
+	    -1);
+	if (val == -1) {
+		/* Nothing found for sc->dev; retry with DDI_DEV_T_ANY. */
+		val = ddi_prop_get_int(DDI_DEV_T_ANY, sc->dip,
+		    DDI_PROP_DONTPASS, name, -1);
+	}
+
+	if (val == -1) {
+		return (defval);
+	}
+	return (val != 0);
+}
+
 const uint_t t4_holdoff_timer_default[SGE_NTIMERS] = {5, 10, 20, 50, 100, 200};
 const uint_t t4_holdoff_pktcnt_default[SGE_NCOUNTERS] = {1, 8, 16, 32};
 
-static int
-init_driver_props(struct adapter *sc, struct driver_properties *p)
+static void
+t4_init_driver_props(struct adapter *sc)
 {
+	struct driver_properties *p = &sc->props;
 	dev_t dev = sc->dev;
 	dev_info_t *dip = sc->dip;
-	int i;
+	int val;
 
 	/*
 	 * For now, just use the defaults for the hold-off timers and counters.
@@ -1670,392 +2086,518 @@ init_driver_props(struct adapter *sc, struct driver_properties *p)
 	(void) ddi_prop_update_int_array(dev, dip, "holdoff-pkt-counter-values",
 	    (int *)p->holdoff_pktcnt, SGE_NCOUNTERS);
 
-	/*
-	 * Maximum # of tx and rx queues to use for each
-	 * 100G, 40G, 25G, 10G and 1G port.
-	 */
-	p->max_ntxq_10g = prop_lookup_int(sc, "max-ntxq-10G-port", 8);
-	(void) ddi_prop_update_int(dev, dip, "max-ntxq-10G-port",
-	    p->max_ntxq_10g);
-
-	p->max_nrxq_10g = prop_lookup_int(sc, "max-nrxq-10G-port", 8);
-	(void) ddi_prop_update_int(dev, dip, "max-nrxq-10G-port",
-	    p->max_nrxq_10g);
-
-	p->max_ntxq_1g = prop_lookup_int(sc, "max-ntxq-1G-port", 2);
-	(void) ddi_prop_update_int(dev, dip, "max-ntxq-1G-port",
-	    p->max_ntxq_1g);
-
-	p->max_nrxq_1g = prop_lookup_int(sc, "max-nrxq-1G-port", 2);
-	(void) ddi_prop_update_int(dev, dip, "max-nrxq-1G-port",
-	    p->max_nrxq_1g);
-
-	/*
-	 * Holdoff parameters for 10G and 1G ports.
-	 */
-	p->tmr_idx_10g = prop_lookup_int(sc, "holdoff-timer-idx-10G", 0);
-	(void) ddi_prop_update_int(dev, dip, "holdoff-timer-idx-10G",
-	    p->tmr_idx_10g);
-
-	p->pktc_idx_10g = prop_lookup_int(sc, "holdoff-pktc-idx-10G", 2);
-	(void) ddi_prop_update_int(dev, dip, "holdoff-pktc-idx-10G",
-	    p->pktc_idx_10g);
+	p->ethq_tmr_idx = prop_lookup_int(sc, "holdoff-timer-idx", 0);
+	p->ethq_pktc_idx = prop_lookup_int(sc, "holdoff-pktc-idx", 2);
 
-	p->tmr_idx_1g = prop_lookup_int(sc, "holdoff-timer-idx-1G", 0);
-	(void) ddi_prop_update_int(dev, dip, "holdoff-timer-idx-1G",
-	    p->tmr_idx_1g);
+	(void) ddi_prop_update_int(dev, dip, "holdoff-timer-idx",
+	    p->ethq_tmr_idx);
+	(void) ddi_prop_update_int(dev, dip, "holdoff-pktc-idx",
+	    p->ethq_pktc_idx);
 
-	p->pktc_idx_1g = prop_lookup_int(sc, "holdoff-pktc-idx-1G", 2);
-	(void) ddi_prop_update_int(dev, dip, "holdoff-pktc-idx-1G",
-	    p->pktc_idx_1g);
-
-	/*
-	 * Size (number of entries) of each tx and rx queue.
-	 */
-	i = prop_lookup_int(sc, "qsize-txq", TX_EQ_QSIZE);
-	p->qsize_txq = max(i, 128);
-	if (p->qsize_txq != i) {
+	/* The size (number of host credits) of the tx queue. */
+	val = prop_lookup_int(sc, "qsize-txq", T4_TX_DEF_QSIZE);
+	p->qsize_txq = MAX(val, 128);
+	p->qsize_txq = MIN(p->qsize_txq, T4_MAX_EQ_SIZE);
+	if (p->qsize_txq != val) {
 		cxgb_printf(dip, CE_WARN,
 		    "using %d instead of %d as the tx queue size",
-		    p->qsize_txq, i);
+		    p->qsize_txq, val);
 	}
 	(void) ddi_prop_update_int(dev, dip, "qsize-txq", p->qsize_txq);
 
-	i = prop_lookup_int(sc, "qsize-rxq", RX_IQ_QSIZE);
-	p->qsize_rxq = max(i, 128);
-	while (p->qsize_rxq & 7)
-		p->qsize_rxq--;
-	if (p->qsize_rxq != i) {
-		cxgb_printf(dip, CE_WARN,
-		    "using %d instead of %d as the rx queue size",
-		    p->qsize_rxq, i);
-	}
-	(void) ddi_prop_update_int(dev, dip, "qsize-rxq", p->qsize_rxq);
-
 	/*
-	 * Interrupt types allowed.
-	 * Bits 0, 1, 2 = INTx, MSI, MSI-X respectively.  See sys/ddi_intr.h
+	 * The size (number of entries/host credits) of the rx queue. The device
+	 * requires that all IQs be sized to a multiple of 16.
 	 */
-	p->intr_types = prop_lookup_int(sc, "interrupt-types",
-	    DDI_INTR_TYPE_MSIX | DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_FIXED);
-	(void) ddi_prop_update_int(dev, dip, "interrupt-types", p->intr_types);
-
-	/*
-	 * Write combining
-	 * 0 to disable, 1 to enable
-	 */
-	p->wc = prop_lookup_int(sc, "write-combine", 1);
-	cxgb_printf(dip, CE_WARN, "write-combine: using of %d", p->wc);
-	if (p->wc != 0 && p->wc != 1) {
+	val = prop_lookup_int(sc, "qsize-rxq", T4_RX_DEF_QSIZE);
+	p->qsize_rxq = MAX(val, 128) & ~15;
+	p->qsize_rxq = MIN(p->qsize_rxq, SGE_MAX_IQ_SIZE);
+	if (p->qsize_rxq != val) {
 		cxgb_printf(dip, CE_WARN,
-		    "write-combine: using 1 instead of %d", p->wc);
-		p->wc = 1;
+		    "using %u instead of %d as the rx queue size",
+		    p->qsize_rxq, val);
 	}
-	(void) ddi_prop_update_int(dev, dip, "write-combine", p->wc);
+	(void) ddi_prop_update_int(dev, dip, "qsize-rxq", p->qsize_rxq);
+
+	p->write_combine = prop_lookup_bool(sc, "write-combine", true);
+	(void) ddi_prop_update_int(dev, dip, "write-combine",
+	    p->write_combine ? 1 : 0);
 
 	p->t4_fw_install = prop_lookup_int(sc, "t4_fw_install", 1);
 	if (p->t4_fw_install != 0 && p->t4_fw_install != 2)
 		p->t4_fw_install = 1;
 	(void) ddi_prop_update_int(dev, dip, "t4_fw_install", p->t4_fw_install);
+}
 
-	/* Multiple Rings */
-	p->multi_rings = prop_lookup_int(sc, "multi-rings", 1);
-	if (p->multi_rings != 0 && p->multi_rings != 1) {
-		cxgb_printf(dip, CE_NOTE,
-		    "multi-rings: using value 1 instead of %d", p->multi_rings);
-		p->multi_rings = 1;
-	}
-
-	(void) ddi_prop_update_int(dev, dip, "multi-rings", p->multi_rings);
+/*
+ * Permit artificial clamping of interrupts for device.
+ * Provided mainly for development and testing purposes.
+ */
+static int t4_intr_count_clamp = 0;
 
-	return (0);
-}
+/*
+ * Queue counts to allocate per-port based on device speed.
+ *
+ * These have been picked somewhat arbitrarily, and should be further
+ * scrutinized with additional testing.
+ */
+#define	T4_QCNT(speed, num)	[speed] = { speed, num, num }
+static const struct t4_queue_count {
+	t4_port_speed_t tqc_speed;
+	uint_t		tqc_rxq_count;
+	uint_t		tqc_txq_count;
+} t4_queue_counts[] = {
+	T4_QCNT(TPS_1G, 2),
+	T4_QCNT(TPS_10G, 8),
+	T4_QCNT(TPS_25G, 16),
+	T4_QCNT(TPS_40G, 24),
+	T4_QCNT(TPS_50G, 24),
+	T4_QCNT(TPS_100G, 32),
+	T4_QCNT(TPS_200G, 48),
+	T4_QCNT(TPS_400G, 64),
+};
 
 static int
-remove_extra_props(struct adapter *sc, int n10g, int n1g)
+t4_cfg_intrs_queues(struct adapter *sc)
 {
-	if (n10g == 0) {
-		(void) ddi_prop_remove(sc->dev, sc->dip, "max-ntxq-10G-port");
-		(void) ddi_prop_remove(sc->dev, sc->dip, "max-nrxq-10G-port");
-		(void) ddi_prop_remove(sc->dev, sc->dip,
-		    "holdoff-timer-idx-10G");
-		(void) ddi_prop_remove(sc->dev, sc->dip,
-		    "holdoff-pktc-idx-10G");
+	struct t4_intrs_queues *iaq = &sc->intr_queue_cfg;
+	int rc;
+
+	bzero(iaq, sizeof (*iaq));
+
+	int supported_itypes;
+	rc = ddi_intr_get_supported_types(sc->dip, &supported_itypes);
+	if (rc != DDI_SUCCESS) {
+		cxgb_printf(sc->dip, CE_WARN,
+		    "failed to determine supported interrupt types: %d", rc);
+		return (rc);
 	}
 
-	if (n1g == 0) {
-		(void) ddi_prop_remove(sc->dev, sc->dip, "max-ntxq-1G-port");
-		(void) ddi_prop_remove(sc->dev, sc->dip, "max-nrxq-1G-port");
-		(void) ddi_prop_remove(sc->dev, sc->dip,
-		    "holdoff-timer-idx-1G");
-		(void) ddi_prop_remove(sc->dev, sc->dip, "holdoff-pktc-idx-1G");
+	const int intr_types[] = {
+		DDI_INTR_TYPE_MSIX, DDI_INTR_TYPE_MSI, DDI_INTR_TYPE_FIXED,
+	};
+	const char *intr_str[] = { "MSI-X", "MSI", "Fixed" };
+	int itype = -1;
+
+	for (uint_t i = 0; i < ARRAY_SIZE(intr_types); i++) {
+		itype = intr_types[i];
+		if ((itype & supported_itypes) == 0) {
+			continue;
+		}
+
+		rc = ddi_intr_get_navail(sc->dip, itype, &iaq->intr_avail);
+		if (rc != DDI_SUCCESS || iaq->intr_avail < 0) {
+			cxgb_printf(sc->dip, CE_WARN, "failed to query "
+			    "available interrupts for type %s: %d", intr_str[i],
+			    rc);
+			continue;
+		}
+
+		/*
+		 * The device error and FWQ interrupts are hard-coded to indexes
+		 * 0 and 1, respectively.  We require at least two interrupts be
+		 * available for MSI(-X) in order to cover both of those cases.
+		 */
+		if (iaq->intr_avail >= 2 ||
+		    (iaq->intr_avail == 1 && itype == DDI_INTR_TYPE_FIXED)) {
+			break;
+		}
 	}
 
-	return (0);
-}
+	if (iaq->intr_avail <= 0) {
+		cxgb_printf(sc->dip, CE_WARN, "failed to get any interrupts "
+		    "after querying all types");
+		return (DDI_FAILURE);	/* rc may still be DDI_SUCCESS here */
+	}
 
-static int
-cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g,
-    struct intrs_and_queues *iaq)
-{
-	struct driver_properties *p = &sc->props;
-	int rc, itype, itypes, navail, nc, n;
-	int pfres_rxq, pfres_txq, pfresq;
+	ASSERT3S(iaq->intr_avail, >, 0);
+	iaq->intr_type = itype;
+	iaq->intr_count = iaq->intr_avail;
 
-	bzero(iaq, sizeof (*iaq));
-	nc = ncpus;	/* our snapshot of the number of CPUs */
-	iaq->ntxq10g = min(nc, p->max_ntxq_10g);
-	iaq->ntxq1g = min(nc, p->max_ntxq_1g);
-	iaq->nrxq10g = min(nc, p->max_nrxq_10g);
-	iaq->nrxq1g = min(nc, p->max_nrxq_1g);
+	/* Permit artificial clamping of consumed interrupts. */
+	if (t4_intr_count_clamp > 1) {
+		iaq->intr_count = MIN(iaq->intr_avail, t4_intr_count_clamp);
+	}
+
+	const uint_t port_count = sc->params.nports;
+
+	iaq->intr_per_port = 0;
+	/* One IQ for the FWQ */
+	iaq->num_iqs = 1;
 
-	pfres_rxq = iaq->nrxq10g * n10g + iaq->nrxq1g * n1g;
-	pfres_txq = iaq->ntxq10g * n10g + iaq->ntxq1g * n1g;
+	if (iaq->intr_count == 1) {
+		iaq->intr_plan = TIP_SINGLE;
+	} else if (iaq->intr_count == 2 || iaq->intr_count < (port_count + 2)) {
+		/* t4_setup_intrs() requires exactly 2 for this plan. */
+		iaq->intr_plan = TIP_ERR_QUEUES;
+		iaq->intr_count = 2;
+	} else {
+		/*
+		 * Enough for at least one event interrupt per port.
+		 */
+		VERIFY(iaq->intr_count >= (port_count + 2));
+		iaq->intr_plan = TIP_PER_PORT;
+		iaq->intr_per_port = (iaq->intr_count - 2) / port_count;
+		VERIFY3U(iaq->intr_per_port, >, 0);
+		iaq->num_iqs += iaq->intr_per_port * port_count;
+	}
 
+	const struct pf_resources *pfres = &sc->params.pfres;
+	if (pfres->niqflint <= iaq->num_iqs) {
+		/* No IQs left for RX queues; avoid unsigned wrap below */
+		cxgb_printf(sc->dip, CE_WARN,
+		    "inadequate IQ resources available");
+		return (DDI_FAILURE);
+	}
+
+	const uint_t port_iqs = pfres->niqflint - iaq->num_iqs;
 	/*
-	 * If current configuration of max number of Rxqs and Txqs exceed
-	 * the max available for all the ports under this PF, then shrink
-	 * the queues to max available. Reduce them in a way that each
-	 * port under this PF has equally distributed number of queues.
-	 * Must guarantee at least 1 queue for each port for both NIC
-	 * and Offload queues.
-	 *
-	 * neq - fixed max number of Egress queues on Tx path and Free List
-	 * queues that hold Rx payload data on Rx path. Half are reserved
-	 * for Egress queues and the other half for Free List queues.
-	 * Hence, the division by 2.
+	 * Every RX queue needs an IQ capable of interrupts (for the receive
+	 * notifications) as well as an EQ (for posting the freelist entries to
+	 * the device).  Half of the total EQs are left for TXQs.
+	 */
+	const uint_t max_rxq = MIN(port_iqs, pfres->neq / 2);
+
+	/* Every TX queue needs an ethernet-capable EQ. */
+	const uint_t max_txq = MIN(pfres->nethctrl, pfres->neq / 2);
+
+	if ((max_rxq / port_count) == 0) {
+		cxgb_printf(sc->dip, CE_WARN,
+		    "inadequate RX queue resources available");
+		return (DDI_FAILURE);
+	} else if ((max_txq / port_count) == 0) {
+		cxgb_printf(sc->dip, CE_WARN,
+		    "inadequate TX queue resources available");
+		return (DDI_FAILURE);
+	}
+
+	/* Split evenly among ports, then clamp to the number of CPUs */
+	iaq->port_max_rxq = MIN(max_rxq / port_count, ncpus);
+	iaq->port_max_txq = MIN(max_txq / port_count, ncpus);
+
+	VERIFY(iaq->intr_count > 0);
+	VERIFY(iaq->port_max_rxq != 0);
+	VERIFY(iaq->port_max_txq != 0);
+	VERIFY(iaq->num_iqs != 0);
+
+	/*
+	 * Determine per-port queue counts based on maximum port speed.
 	 *
-	 * niqflint - max number of Ingress queues with interrupts on Rx
-	 * path to receive completions that indicate Rx payload has been
-	 * posted in its associated Free List queue. Also handles Tx
-	 * completions for packets successfully transmitted on Tx path.
+	 * This is a bit unfortunate, since there does not seem to be a way to
+	 * query the maximum possible speed for a port independent of any
+	 * installed transceiver.  If a transceiver of lesser speed capability
+	 * is installed in a port, that port will clamp its own reported
+	 * capabilities to those of the transceiver.
 	 *
-	 * nethctrl - max number of Egress queues only for Tx path. This
-	 * number is usually half of neq. However, if it became less than
-	 * neq due to lack of resources based on firmware configuration,
-	 * then take the lower value.
+	 * Our compromise is to size queue allocations based on the fastest port
+	 * we can find.  This will be less than ideal for adapters with
+	 * heterogeneous port configurations or systems where transceivers of
+	 * differing speed capabilities are swapped in after the driver
+	 * initializes the adapter(s).
 	 */
-	const uint_t max_rxq =
-	    MIN(sc->params.pfres.neq / 2, sc->params.pfres.niqflint);
-	while (pfres_rxq > max_rxq) {
-		pfresq = pfres_rxq;
-
-		if (iaq->nrxq10g > 1) {
-			iaq->nrxq10g--;
-			pfres_rxq -= n10g;
-		}
+	t4_port_speed_t max_speed = TPS_1G;
+	for (uint_t i = 0; i < port_count; i++) {
+		max_speed = MAX(max_speed, t4_port_speed(sc->port[i]));
+	}
+	ASSERT(max_speed < ARRAY_SIZE(t4_queue_counts));
+	const struct t4_queue_count *qc = &t4_queue_counts[max_speed];
 
-		if (iaq->nrxq1g > 1) {
-			iaq->nrxq1g--;
-			pfres_rxq -= n1g;
-		}
+	uint_t rxq_idx = 0, txq_idx = 0;
+	for (uint_t i = 0; i < port_count; i++) {
+		struct port_info *pi = sc->port[i];
 
-		/* Break if nothing changed */
-		if (pfresq == pfres_rxq)
-			break;
+		/* Clamp to per-port maximums */
+		pi->rxq_count = MIN(qc->tqc_rxq_count, iaq->port_max_rxq);
+		pi->txq_count = MIN(qc->tqc_txq_count, iaq->port_max_txq);
+
+		pi->rxq_start = rxq_idx;
+		pi->txq_start = txq_idx;
+		rxq_idx += pi->rxq_count;
+		txq_idx += pi->txq_count;
 	}
 
-	const uint_t max_txq =
-	    MIN(sc->params.pfres.neq / 2, sc->params.pfres.nethctrl);
-	while (pfres_txq > max_txq) {
-		pfresq = pfres_txq;
+	struct sge_info *sge = &sc->sge;
+	sge->rxq_count = rxq_idx;
+	sge->txq_count = txq_idx;
 
-		if (iaq->ntxq10g > 1) {
-			iaq->ntxq10g--;
-			pfres_txq -= n10g;
-		}
+	cxgb_printf(sc->dip, CE_NOTE, "(%u rxq, %u txq total) %d %s.",
+	    rxq_idx, txq_idx, iaq->intr_count,
+	    iaq->intr_type == DDI_INTR_TYPE_MSIX ? "MSI-X interrupts" :
+	    iaq->intr_type == DDI_INTR_TYPE_MSI ? "MSI interrupts" :
+	    "fixed interrupt");
 
-		if (iaq->ntxq1g > 1) {
-			iaq->ntxq1g--;
-			pfres_txq -= n1g;
-		}
+	return (DDI_SUCCESS);
+}
 
-		/* Break if nothing changed */
-		if (pfresq == pfres_txq)
-			break;
+static int
+t4_setup_port_intrs(struct adapter *sc, int *handlers)
+{
+	int rc = 0;
+	const struct t4_intrs_queues *iaq = &sc->intr_queue_cfg;
+
+	for (uint_t i = 0; i < sc->params.nports; i++) {
+		struct port_info *port = sc->port[i];
+
+		port->intr_iqs = kmem_zalloc(iaq->intr_per_port *
+		    sizeof (t4_sge_iq_t), KM_SLEEP);
+
+		for (uint_t j = 0; j < iaq->intr_per_port; j++) {
+			uint_t intr_idx = 2 + (i * iaq->intr_per_port) + j;
+			VERIFY3S(intr_idx, <, iaq->intr_count);
+			ddi_intr_handle_t ihdl = sc->intr_handle[intr_idx];
+			rc = ddi_intr_add_handler(ihdl, t4_intr_port_queue,
+			    &port->intr_iqs[j], NULL);
+			if (rc != DDI_SUCCESS) {
+				/*
+				 * Previously installed handlers are cleaned up
+				 * by the parent function.
+				 */
+				cxgb_printf(sc->dip, CE_WARN, "failed to add "
+				    "interrupt handler %u for type: %d plan: "
+				    "%d: %d", intr_idx, iaq->intr_type,
+				    iaq->intr_plan, rc);
+				return (rc);
+			}
+			*handlers += 1;
+		}
 	}
 
-	rc = ddi_intr_get_supported_types(sc->dip, &itypes);
+	return (DDI_SUCCESS);
+}
+
+static int
+t4_setup_intrs(struct adapter *sc)
+{
+	const struct t4_intrs_queues *iaq = &sc->intr_queue_cfg;
+	const int intr_count = iaq->intr_count;
+	const int intr_type = iaq->intr_type;
+	int allocated = 0;
+	int handlers = 0;
+
+	int rc = ddi_intr_alloc(sc->dip, sc->intr_handle, intr_type, 0,
+	    intr_count, &allocated, DDI_INTR_ALLOC_STRICT);
 	if (rc != DDI_SUCCESS) {
 		cxgb_printf(sc->dip, CE_WARN,
-		    "failed to determine supported interrupt types: %d", rc);
-		return (rc);
+		    "failed to allocate %d interrupt(s) of type %d: %d, %d",
+		    intr_count, intr_type, rc, allocated);
+		goto fail;
 	}
 
-	for (itype = DDI_INTR_TYPE_MSIX; itype; itype >>= 1) {
-		ASSERT(itype == DDI_INTR_TYPE_MSIX ||
-		    itype == DDI_INTR_TYPE_MSI ||
-		    itype == DDI_INTR_TYPE_FIXED);
+	VERIFY3U(intr_count, ==, allocated); /* allocation was STRICT */
 
-		if ((itype & itypes & p->intr_types) == 0)
-			continue;	/* not supported or not allowed */
+	rc = ddi_intr_get_cap(sc->intr_handle[0], &sc->intr_cap);
+	if (rc != DDI_SUCCESS) {
+		cxgb_printf(sc->dip, CE_WARN, "failed to get interrupt "
+		    "capabilities for type %d: %d", intr_type, rc);
+		goto fail;
+	}
 
-		navail = 0;
-		rc = ddi_intr_get_navail(sc->dip, itype, &navail);
-		if (rc != DDI_SUCCESS || navail == 0) {
-			cxgb_printf(sc->dip, CE_WARN,
-			    "failed to get # of interrupts for type %d: %d",
-			    itype, rc);
-			continue;	/* carry on */
+	rc = ddi_intr_get_pri(sc->intr_handle[0], &sc->intr_pri);
+	if (rc != DDI_SUCCESS) {
+		cxgb_printf(sc->dip, CE_WARN, "failed to get interrupt "
+		    "priority for type %d: %d", intr_type, rc);
+		goto fail;
+	}
+
+	switch (iaq->intr_plan) {
+	case TIP_SINGLE:
+		ASSERT3U(intr_count, ==, 1);
+		rc = ddi_intr_add_handler(sc->intr_handle[0], t4_intr_all, sc,
+		    NULL);
+		if (rc != DDI_SUCCESS) {
+			cxgb_printf(sc->dip, CE_WARN, "failed to add interrupt "
+			    "handler %u for type: %d plan: %d: %d", handlers,
+			    intr_type, iaq->intr_plan, rc);
+			goto fail;
 		}
+		handlers++;
+		break;
 
-		iaq->intr_type = itype;
-		if (navail == 0)
-			continue;
+	case TIP_ERR_QUEUES:
+		VERIFY3U(intr_count, ==, 2);
+		rc = ddi_intr_add_handler(sc->intr_handle[0], t4_intr_err, sc,
+		    NULL);
+		if (rc != DDI_SUCCESS) {
+			cxgb_printf(sc->dip, CE_WARN, "failed to add interrupt "
+			    "handler %u for type: %d plan: %d: %d", handlers,
+			    intr_type, iaq->intr_plan, rc);
+			goto fail;
+		}
+		handlers++;
 
-		/*
-		 * Best option: an interrupt vector for errors, one for the
-		 * firmware event queue, and one each for each rxq (NIC as well
-		 * as offload).
-		 */
-		iaq->nirq = T4_EXTRA_INTR;
-		iaq->nirq += n10g * iaq->nrxq10g;
-		iaq->nirq += n1g * iaq->nrxq1g;
-
-		if (iaq->nirq <= navail &&
-		    (itype != DDI_INTR_TYPE_MSI || ISP2(iaq->nirq))) {
-			iaq->intr_fwd = 0;
-			goto allocate;
+		rc = ddi_intr_add_handler(sc->intr_handle[1], t4_intr_fwq, sc,
+		    NULL);
+		if (rc != DDI_SUCCESS) {
+			cxgb_printf(sc->dip, CE_WARN, "failed to add interrupt "
+			    "handler %u for type: %d plan: %d: %d", handlers,
+			    intr_type, iaq->intr_plan, rc);
+			goto fail;
 		}
+		handlers++;
+		break;
 
-		/*
-		 * Second best option: an interrupt vector for errors, one for
-		 * the firmware event queue, and one each for either NIC or
-		 * offload rxq's.
-		 */
-		iaq->nirq = T4_EXTRA_INTR;
-		iaq->nirq += n10g * iaq->nrxq10g;
-		iaq->nirq += n1g * iaq->nrxq1g;
-		if (iaq->nirq <= navail &&
-		    (itype != DDI_INTR_TYPE_MSI || ISP2(iaq->nirq))) {
-			iaq->intr_fwd = 1;
-			goto allocate;
+	case TIP_PER_PORT:
+		VERIFY3U(intr_count, >=, 2 + sc->params.nports);
+		rc = ddi_intr_add_handler(sc->intr_handle[0], t4_intr_err, sc,
+		    NULL);
+		if (rc != DDI_SUCCESS) {
+			cxgb_printf(sc->dip, CE_WARN, "failed to add interrupt "
+			    "handler %u for type: %d plan: %d: %d", handlers,
+			    intr_type, iaq->intr_plan, rc);
+			goto fail;
 		}
+		handlers++;
 
-		/*
-		 * Next best option: an interrupt vector for errors, one for the
-		 * firmware event queue, and at least one per port.  At this
-		 * point we know we'll have to downsize nrxq or nofldrxq to fit
-		 * what's available to us.
-		 */
-		iaq->nirq = T4_EXTRA_INTR;
-		iaq->nirq += n10g + n1g;
-		if (iaq->nirq <= navail) {
-			int leftover = navail - iaq->nirq;
-
-			if (n10g > 0) {
-				int target = iaq->nrxq10g;
-
-				n = 1;
-				while (n < target && leftover >= n10g) {
-					leftover -= n10g;
-					iaq->nirq += n10g;
-					n++;
-				}
-				iaq->nrxq10g = min(n, iaq->nrxq10g);
-			}
+		rc =  ddi_intr_add_handler(sc->intr_handle[1], t4_intr_fwq, sc,
+		    NULL);
+		if (rc != DDI_SUCCESS) {
+			cxgb_printf(sc->dip, CE_WARN, "failed to add interrupt "
+			    "handler %u for type: %d plan: %d: %d", handlers,
+			    intr_type, iaq->intr_plan, rc);
+			goto fail;
+		}
+		handlers++;
 
-			if (n1g > 0) {
-				int target = iaq->nrxq1g;
+		rc = t4_setup_port_intrs(sc, &handlers);
 
-				n = 1;
-				while (n < target && leftover >= n1g) {
-					leftover -= n1g;
-					iaq->nirq += n1g;
-					n++;
-				}
-				iaq->nrxq1g = min(n, iaq->nrxq1g);
-			}
+		if (rc != DDI_SUCCESS) {
+			goto fail;
+		}
 
+		break;
+	}
+
+	return (DDI_SUCCESS);
+
+fail:
+	for (int i = 0; i < handlers; i++) {
+		rc = ddi_intr_remove_handler(sc->intr_handle[i]);
+		if (rc != DDI_SUCCESS) {
 			/*
-			 * We have arrived at a minimum value required to enable
-			 * per queue irq(either NIC or offload). Thus for non-
-			 * offload case, we will get a vector per queue, while
-			 * offload case, we will get a vector per offload/NIC q.
-			 * Hence enable Interrupt forwarding only for offload
-			 * case.
+			 * We tried our best, the only thing left is to log the
+			 * failure and move on.
 			 */
-			if (itype != DDI_INTR_TYPE_MSI) {
-				goto allocate;
-			}
+			cxgb_printf(sc->dip, CE_WARN, "failed to remove "
+			    "interrupt handler %d for type: %d plan: %d: %d", i,
+			    intr_type, iaq->intr_plan, rc);
 		}
+	}
 
-		/*
-		 * Least desirable option: one interrupt vector for everything.
-		 */
-		iaq->nirq = iaq->nrxq10g = iaq->nrxq1g = 1;
-		iaq->intr_fwd = 1;
-
-allocate:
-		return (0);
+	for (int i = 0; i < allocated; i++) {
+		rc = ddi_intr_free(sc->intr_handle[i]);
+		if (rc != DDI_SUCCESS) {
+			cxgb_printf(sc->dip, CE_WARN, "failed to free "
+			    "interrupt %d for type: %d plan: %d: %d", i,
+			    intr_type, iaq->intr_plan, rc);
+		}
 	}
 
-	cxgb_printf(sc->dip, CE_WARN,
-	    "failed to find a usable interrupt type.  supported=%d, allowed=%d",
-	    itypes, p->intr_types);
 	return (DDI_FAILURE);
 }
 
 static int
-add_child_node(struct adapter *sc, int idx)
+t4_add_child_node(struct adapter *sc, uint_t idx)
 {
-	int rc;
-	struct port_info *pi;
 
-	if (idx < 0 || idx >= sc->params.nports)
+	if (idx >= sc->params.nports)
 		return (EINVAL);
 
-	pi = sc->port[idx];
-	if (pi == NULL)
-		return (ENODEV);	/* t4_port_init failed earlier */
+	struct port_info *pi = sc->port[idx];
+	if (pi == NULL) {
+		/* t4_port_init failed earlier */
+		return (ENODEV);
+	}
 
 	PORT_LOCK(pi);
 	if (pi->dip != NULL) {
-		rc = 0;		/* EEXIST really, but then bus_config fails */
-		goto done;
+		PORT_UNLOCK(pi);
+		/* EEXIST really, but then bus_config fails */
+		return (0);
 	}
 
-	rc = ndi_devi_alloc(sc->dip, T4_PORT_NAME, DEVI_SID_NODEID, &pi->dip);
+	const int rc =
+	    ndi_devi_alloc(sc->dip, T4_PORT_NAME, DEVI_SID_NODEID, &pi->dip);
 	if (rc != DDI_SUCCESS || pi->dip == NULL) {
-		rc = ENOMEM;
-		goto done;
+		PORT_UNLOCK(pi);
+		return (ENOMEM);
 	}
 
 	(void) ddi_set_parent_data(pi->dip, pi);
 	(void) ndi_devi_bind_driver(pi->dip, 0);
-	rc = 0;
-done:
+
 	PORT_UNLOCK(pi);
-	return (rc);
+	return (0);
 }
 
 static int
-remove_child_node(struct adapter *sc, int idx)
+t4_remove_child_node(struct adapter *sc, uint_t idx)
 {
-	int rc;
-	struct port_info *pi;
-
-	if (idx < 0 || idx >= sc->params.nports)
+	if (idx >= sc->params.nports)
 		return (EINVAL);
 
-	pi = sc->port[idx];
+	struct port_info *pi = sc->port[idx];
 	if (pi == NULL)
 		return (ENODEV);
 
 	PORT_LOCK(pi);
 	if (pi->dip == NULL) {
-		rc = ENODEV;
-		goto done;
+		PORT_UNLOCK(pi);
+		return (ENODEV);
 	}
 
-	rc = ndi_devi_free(pi->dip);
+	const int rc = ndi_devi_free(pi->dip);
 	if (rc == 0)
 		pi->dip = NULL;
-done:
+
 	PORT_UNLOCK(pi);
 	return (rc);
 }
 
+struct t4_port_speed_def {
+	uint32_t	tpsd_cap;
+	t4_port_speed_t	tpsd_speed;
+	const char	*tpsd_name;
+};
+#define	T4_PORT_SPEED_DEF(speed)			\
+{							\
+	.tpsd_cap = FW_PORT_CAP32_SPEED_ ## speed,	\
+	.tpsd_speed = TPS_ ## speed,			\
+	.tpsd_name = #speed,				\
+}
+
+static const struct t4_port_speed_def t4_port_speeds[] = {
+	T4_PORT_SPEED_DEF(400G),
+	T4_PORT_SPEED_DEF(200G),
+	T4_PORT_SPEED_DEF(100G),
+	T4_PORT_SPEED_DEF(50G),
+	T4_PORT_SPEED_DEF(40G),
+	T4_PORT_SPEED_DEF(25G),
+	T4_PORT_SPEED_DEF(10G),
+	T4_PORT_SPEED_DEF(1G),
+};
+
+/*
+ * Get maximum advertised speed of this port.
+ *
+ * This is, unfortunately, impacted by the installed transceiver at the time of
+ * query.
+ */
+static t4_port_speed_t
+t4_port_speed(const struct port_info *pi)
+{
+	ASSERT(pi != NULL);
+
+	/* Entries are ordered fastest-first, so the first hit is the max. */
+	const struct t4_port_speed_def *def = t4_port_speeds;
+	for (uint_t n = ARRAY_SIZE(t4_port_speeds); n != 0; n--, def++) {
+		if ((pi->link_cfg.pcaps & def->tpsd_cap) != 0) {
+			return (def->tpsd_speed);
+		}
+	}
+	/* No recognized speed bit was advertised: report 1G. */
+	return (TPS_1G);
+}
+
 static const char *
 t4_port_speed_name(const struct port_info *pi)
 {
@@ -2063,28 +2605,27 @@ t4_port_speed_name(const struct port_info *pi)
 		return ("-");
 	}
 
-	const uint32_t pcaps = pi->link_cfg.pcaps;
-	if (pcaps & FW_PORT_CAP32_SPEED_100G) {
-		return ("100G");
-	} else if (pcaps & FW_PORT_CAP32_SPEED_50G) {
-		return ("50G");
-	} else if (pcaps & FW_PORT_CAP32_SPEED_40G) {
-		return ("40G");
-	} else if (pcaps & FW_PORT_CAP32_SPEED_25G) {
-		return ("25G");
-	} else if (pcaps & FW_PORT_CAP32_SPEED_10G) {
-		return ("10G");
-	} else {
-		return ("1G");
+	const uint32_t pcap = pi->link_cfg.pcaps;
+	for (uint_t i = 0; i < ARRAY_SIZE(t4_port_speeds); i++) {
+		if (t4_port_speeds[i].tpsd_cap & pcap) {
+			return (t4_port_speeds[i].tpsd_name);
+		}
 	}
+
+	return ("-");
 }
 
-#define	KS_UINIT(x)	kstat_named_init(&kstatp->x, #x, KSTAT_DATA_ULONG)
-#define	KS_CINIT(x)	kstat_named_init(&kstatp->x, #x, KSTAT_DATA_CHAR)
-#define	KS_U64INIT(x)	kstat_named_init(&kstatp->x, #x, KSTAT_DATA_UINT64)
-#define	KS_U_SET(x, y)	kstatp->x.value.ul = (y)
-#define	KS_C_SET(x, ...)	\
-			(void) snprintf(kstatp->x.value.c, 16,  __VA_ARGS__)
+#define	KS_INIT_U64(kstatp,  n)	\
+	kstat_named_init(&kstatp->n, #n, KSTAT_DATA_UINT64)
+#define	KS_INIT_CHAR(kstatp, n)	\
+	kstat_named_init(&kstatp->n, #n, KSTAT_DATA_CHAR)
+#define	KS_INIT_STR(kstatp, n)	\
+	kstat_named_init(&kstatp->n, #n, KSTAT_DATA_STRING)
+#define	KS_SET_U64(kstatp, n, v)	kstatp->n.value.ui64 = (v)
+#define	KS_SET_CHAR(kstatp, n, ...)	\
+	(void) snprintf(kstatp->n.value.c, 16,  __VA_ARGS__)
+#define	KS_SET_STR(kstatp, n, v)	\
+	kstat_named_setstr(&kstatp->n, v)
 
 /*
  * t4nex:X:config
@@ -2097,80 +2638,57 @@ struct t4_kstats {
 	kstat_named_t serial_number;
 	kstat_named_t ec_level;
 	kstat_named_t id;
-	kstat_named_t bus_type;
-	kstat_named_t bus_width;
-	kstat_named_t bus_speed;
 	kstat_named_t core_clock;
 	kstat_named_t port_cnt;
 	kstat_named_t port_type;
-	kstat_named_t pci_vendor_id;
-	kstat_named_t pci_device_id;
 };
+
 static kstat_t *
-setup_kstats(struct adapter *sc)
+t4_setup_kstats(struct adapter *sc)
 {
-	kstat_t *ksp;
-	struct t4_kstats *kstatp;
-	int ndata;
-	struct pci_params *p = &sc->params.pci;
-	struct vpd_params *v = &sc->params.vpd;
-	uint16_t pci_vendor, pci_device;
-
-	ndata = sizeof (struct t4_kstats) / sizeof (kstat_named_t);
-
-	ksp = kstat_create(T4_NEXUS_NAME, ddi_get_instance(sc->dip), "config",
-	    "nexus", KSTAT_TYPE_NAMED, ndata, 0);
+	const ulong_t ndata = sizeof (struct t4_kstats) /
+	    sizeof (kstat_named_t);
+	kstat_t *ksp = kstat_create(T4_NEXUS_NAME, ddi_get_instance(sc->dip),
+	    "config", "nexus", KSTAT_TYPE_NAMED, ndata, 0);
 	if (ksp == NULL) {
 		cxgb_printf(sc->dip, CE_WARN, "failed to initialize kstats.");
 		return (NULL);
 	}
 
-	kstatp = (struct t4_kstats *)ksp->ks_data;
-
-	KS_UINIT(chip_ver);
-	KS_CINIT(fw_vers);
-	KS_CINIT(tp_vers);
-	KS_CINIT(driver_version);
-	KS_CINIT(serial_number);
-	KS_CINIT(ec_level);
-	KS_CINIT(id);
-	KS_CINIT(bus_type);
-	KS_CINIT(bus_width);
-	KS_CINIT(bus_speed);
-	KS_UINIT(core_clock);
-	KS_UINIT(port_cnt);
-	KS_CINIT(port_type);
-	KS_CINIT(pci_vendor_id);
-	KS_CINIT(pci_device_id);
-
-	KS_U_SET(chip_ver, sc->params.chip);
-	KS_C_SET(fw_vers, "%d.%d.%d.%d",
+	struct t4_kstats *kstatp = (struct t4_kstats *)ksp->ks_data;
+
+	KS_INIT_U64(kstatp, chip_ver);
+	KS_INIT_CHAR(kstatp, fw_vers);
+	KS_INIT_CHAR(kstatp, tp_vers);
+	KS_INIT_CHAR(kstatp, driver_version);
+	KS_INIT_STR(kstatp, serial_number);
+	KS_INIT_STR(kstatp, ec_level);
+	KS_INIT_STR(kstatp, id);
+	KS_INIT_U64(kstatp, core_clock);
+	KS_INIT_U64(kstatp, port_cnt);
+	KS_INIT_CHAR(kstatp, port_type);
+
+	KS_SET_U64(kstatp, chip_ver, sc->params.chip);
+	KS_SET_CHAR(kstatp, fw_vers, "%d.%d.%d.%d",
 	    G_FW_HDR_FW_VER_MAJOR(sc->params.fw_vers),
 	    G_FW_HDR_FW_VER_MINOR(sc->params.fw_vers),
 	    G_FW_HDR_FW_VER_MICRO(sc->params.fw_vers),
 	    G_FW_HDR_FW_VER_BUILD(sc->params.fw_vers));
-	KS_C_SET(tp_vers, "%d.%d.%d.%d",
+	KS_SET_CHAR(kstatp, tp_vers, "%d.%d.%d.%d",
 	    G_FW_HDR_FW_VER_MAJOR(sc->params.tp_vers),
 	    G_FW_HDR_FW_VER_MINOR(sc->params.tp_vers),
 	    G_FW_HDR_FW_VER_MICRO(sc->params.tp_vers),
 	    G_FW_HDR_FW_VER_BUILD(sc->params.tp_vers));
-	KS_C_SET(driver_version, DRV_VERSION);
-	KS_C_SET(serial_number, "%s", v->sn);
-	KS_C_SET(ec_level, "%s", v->ec);
-	KS_C_SET(id, "%s", v->id);
-	KS_C_SET(bus_type, "pci-express");
-	KS_C_SET(bus_width, "x%d lanes", p->width);
-	KS_C_SET(bus_speed, "%d", p->speed);
-	KS_U_SET(core_clock, v->cclk);
-	KS_U_SET(port_cnt, sc->params.nports);
-
-	pci_vendor = pci_config_get16(sc->pci_regh, PCI_CONF_VENID);
-	KS_C_SET(pci_vendor_id, "0x%x", pci_vendor);
-
-	pci_device = pci_config_get16(sc->pci_regh, PCI_CONF_DEVID);
-	KS_C_SET(pci_device_id, "0x%x", pci_device);
-
-	KS_C_SET(port_type, "%s/%s/%s/%s",
+	KS_SET_CHAR(kstatp, driver_version, DRV_VERSION);
+
+	const struct vpd_params *vpd = &sc->params.vpd;
+	KS_SET_STR(kstatp, serial_number, (const char *)vpd->sn);
+	KS_SET_STR(kstatp, ec_level, (const char *)vpd->ec);
+	KS_SET_STR(kstatp, id, (const char *)vpd->id);
+	KS_SET_U64(kstatp, core_clock, vpd->cclk);
+	KS_SET_U64(kstatp, port_cnt, sc->params.nports);
+
+	KS_SET_CHAR(kstatp, port_type, "%s/%s/%s/%s",
 	    t4_port_speed_name(sc->port[0]),
 	    t4_port_speed_name(sc->port[1]),
 	    t4_port_speed_name(sc->port[2]),
@@ -2192,8 +2710,28 @@ struct t4_wc_kstats {
 	kstat_named_t write_coal_success;
 	kstat_named_t write_coal_failure;
 };
+
+static int
+t4_update_wc_kstats(kstat_t *ksp, int rw)
+{
+	struct t4_wc_kstats *kstatp = (struct t4_wc_kstats *)ksp->ks_data;
+	struct adapter *sc = ksp->ks_private;
+
+	if (rw == KSTAT_WRITE)
+		return (0);
+
+	if (t4_cver_ge(sc, CHELSIO_T5)) {
+		const uint32_t wc_total = t4_read_reg(sc, A_SGE_STAT_TOTAL);
+		const uint32_t wc_failure = t4_read_reg(sc, A_SGE_STAT_MATCH);
+		KS_SET_U64(kstatp, write_coal_success, wc_total - wc_failure);
+		KS_SET_U64(kstatp, write_coal_failure, wc_failure);
+	}
+
+	return (0);
+}
+
 static kstat_t *
-setup_wc_kstats(struct adapter *sc)
+t4_setup_wc_kstats(struct adapter *sc)
 {
 	kstat_t *ksp;
 	struct t4_wc_kstats *kstatp;
@@ -2209,10 +2747,10 @@ setup_wc_kstats(struct adapter *sc)
 
 	kstatp = (struct t4_wc_kstats *)ksp->ks_data;
 
-	KS_UINIT(write_coal_success);
-	KS_UINIT(write_coal_failure);
+	KS_INIT_U64(kstatp, write_coal_success);
+	KS_INIT_U64(kstatp, write_coal_failure);
 
-	ksp->ks_update = update_wc_kstats;
+	ksp->ks_update = t4_update_wc_kstats;
 	/* Install the kstat */
 	ksp->ks_private = (void *)sc;
 	kstat_install(ksp);
@@ -2220,31 +2758,6 @@ setup_wc_kstats(struct adapter *sc)
 	return (ksp);
 }
 
-static int
-update_wc_kstats(kstat_t *ksp, int rw)
-{
-	struct t4_wc_kstats *kstatp = (struct t4_wc_kstats *)ksp->ks_data;
-	struct adapter *sc = ksp->ks_private;
-	uint32_t wc_total, wc_success, wc_failure;
-
-	if (rw == KSTAT_WRITE)
-		return (0);
-
-	if (t4_cver_ge(sc, CHELSIO_T5)) {
-		wc_total = t4_read_reg(sc, A_SGE_STAT_TOTAL);
-		wc_failure = t4_read_reg(sc, A_SGE_STAT_MATCH);
-		wc_success = wc_total - wc_failure;
-	} else {
-		wc_success = 0;
-		wc_failure = 0;
-	}
-
-	KS_U_SET(write_coal_success, wc_success);
-	KS_U_SET(write_coal_failure, wc_failure);
-
-	return (0);
-}
-
 /*
  * cxgbe:X:fec
  *
@@ -2272,21 +2785,18 @@ struct cxgbe_port_fec_kstats {
 };
 
 static uint32_t
-read_fec_pair(struct port_info *pi, uint32_t lo_reg, uint32_t high_reg)
+t4_read_fec_pair(struct port_info *pi, uint32_t lo_reg, uint32_t high_reg)
 {
 	struct adapter *sc = pi->adapter;
-	uint8_t port = pi->tx_chan;
-	uint32_t low, high, ret;
+	const uint8_t port = pi->tx_chan;
 
-	low = t4_read_reg(sc, T5_PORT_REG(port, lo_reg));
-	high = t4_read_reg(sc, T5_PORT_REG(port, high_reg));
-	ret = low & 0xffff;
-	ret |= (high & 0xffff) << 16;
-	return (ret);
+	const uint32_t low = t4_read_reg(sc, T5_PORT_REG(port, lo_reg));
+	const uint32_t high = t4_read_reg(sc, T5_PORT_REG(port, high_reg));
+	return ((low & 0xffff) | ((high & 0xffff) << 16));
 }
 
 static int
-update_port_fec_kstats(kstat_t *ksp, int rw)
+t4_update_fec_kstats(kstat_t *ksp, int rw)
 {
 	struct cxgbe_port_fec_kstats *fec = ksp->ks_data;
 	struct port_info *pi = ksp->ks_private;
@@ -2298,44 +2808,44 @@ update_port_fec_kstats(kstat_t *ksp, int rw)
 	/*
 	 * First go ahead and gather RS related stats.
 	 */
-	fec->rs_corr.value.ui64 += read_fec_pair(pi, T6_RS_FEC_CCW_LO,
-	    T6_RS_FEC_CCW_HI);
-	fec->rs_uncorr.value.ui64 += read_fec_pair(pi, T6_RS_FEC_NCCW_LO,
-	    T6_RS_FEC_NCCW_HI);
-	fec->rs_sym0_corr.value.ui64 += read_fec_pair(pi, T6_RS_FEC_SYMERR0_LO,
-	    T6_RS_FEC_SYMERR0_HI);
-	fec->rs_sym1_corr.value.ui64 += read_fec_pair(pi, T6_RS_FEC_SYMERR1_LO,
-	    T6_RS_FEC_SYMERR1_HI);
-	fec->rs_sym2_corr.value.ui64 += read_fec_pair(pi, T6_RS_FEC_SYMERR2_LO,
-	    T6_RS_FEC_SYMERR2_HI);
-	fec->rs_sym3_corr.value.ui64 += read_fec_pair(pi, T6_RS_FEC_SYMERR3_LO,
-	    T6_RS_FEC_SYMERR3_HI);
+	fec->rs_corr.value.ui64 +=
+	    t4_read_fec_pair(pi, T6_RS_FEC_CCW_LO, T6_RS_FEC_CCW_HI);
+	fec->rs_uncorr.value.ui64 +=
+	    t4_read_fec_pair(pi, T6_RS_FEC_NCCW_LO, T6_RS_FEC_NCCW_HI);
+	fec->rs_sym0_corr.value.ui64 +=
+	    t4_read_fec_pair(pi, T6_RS_FEC_SYMERR0_LO, T6_RS_FEC_SYMERR0_HI);
+	fec->rs_sym1_corr.value.ui64 +=
+	    t4_read_fec_pair(pi, T6_RS_FEC_SYMERR1_LO, T6_RS_FEC_SYMERR1_HI);
+	fec->rs_sym2_corr.value.ui64 +=
+	    t4_read_fec_pair(pi, T6_RS_FEC_SYMERR2_LO, T6_RS_FEC_SYMERR2_HI);
+	fec->rs_sym3_corr.value.ui64 +=
+	    t4_read_fec_pair(pi, T6_RS_FEC_SYMERR3_LO, T6_RS_FEC_SYMERR3_HI);
 
 	/*
 	 * Now go through and try to grab Firecode/BASE-R stats.
 	 */
-	fec->fc_lane0_corr.value.ui64 += read_fec_pair(pi, T6_FC_FEC_L0_CERR_LO,
-	    T6_FC_FEC_L0_CERR_HI);
-	fec->fc_lane0_uncorr.value.ui64 += read_fec_pair(pi,
-	    T6_FC_FEC_L0_NCERR_LO, T6_FC_FEC_L0_NCERR_HI);
-	fec->fc_lane1_corr.value.ui64 += read_fec_pair(pi, T6_FC_FEC_L1_CERR_LO,
-	    T6_FC_FEC_L1_CERR_HI);
-	fec->fc_lane1_uncorr.value.ui64 += read_fec_pair(pi,
-	    T6_FC_FEC_L1_NCERR_LO, T6_FC_FEC_L1_NCERR_HI);
-	fec->fc_lane2_corr.value.ui64 += read_fec_pair(pi, T6_FC_FEC_L2_CERR_LO,
-	    T6_FC_FEC_L2_CERR_HI);
-	fec->fc_lane2_uncorr.value.ui64 += read_fec_pair(pi,
-	    T6_FC_FEC_L2_NCERR_LO, T6_FC_FEC_L2_NCERR_HI);
-	fec->fc_lane3_corr.value.ui64 += read_fec_pair(pi, T6_FC_FEC_L3_CERR_LO,
-	    T6_FC_FEC_L3_CERR_HI);
-	fec->fc_lane3_uncorr.value.ui64 += read_fec_pair(pi,
-	    T6_FC_FEC_L3_NCERR_LO, T6_FC_FEC_L3_NCERR_HI);
+	fec->fc_lane0_corr.value.ui64 +=
+	    t4_read_fec_pair(pi, T6_FC_FEC_L0_CERR_LO, T6_FC_FEC_L0_CERR_HI);
+	fec->fc_lane0_uncorr.value.ui64 +=
+	    t4_read_fec_pair(pi, T6_FC_FEC_L0_NCERR_LO, T6_FC_FEC_L0_NCERR_HI);
+	fec->fc_lane1_corr.value.ui64 +=
+	    t4_read_fec_pair(pi, T6_FC_FEC_L1_CERR_LO, T6_FC_FEC_L1_CERR_HI);
+	fec->fc_lane1_uncorr.value.ui64 +=
+	    t4_read_fec_pair(pi, T6_FC_FEC_L1_NCERR_LO, T6_FC_FEC_L1_NCERR_HI);
+	fec->fc_lane2_corr.value.ui64 +=
+	    t4_read_fec_pair(pi, T6_FC_FEC_L2_CERR_LO, T6_FC_FEC_L2_CERR_HI);
+	fec->fc_lane2_uncorr.value.ui64 +=
+	    t4_read_fec_pair(pi, T6_FC_FEC_L2_NCERR_LO, T6_FC_FEC_L2_NCERR_HI);
+	fec->fc_lane3_corr.value.ui64 +=
+	    t4_read_fec_pair(pi, T6_FC_FEC_L3_CERR_LO, T6_FC_FEC_L3_CERR_HI);
+	fec->fc_lane3_uncorr.value.ui64 +=
+	    t4_read_fec_pair(pi, T6_FC_FEC_L3_NCERR_LO, T6_FC_FEC_L3_NCERR_HI);
 
 	return (0);
 }
 
 static kstat_t *
-setup_port_fec_kstats(struct port_info *pi)
+t4_init_fec_kstats(struct port_info *pi)
 {
 	kstat_t *ksp;
 	struct cxgbe_port_fec_kstats *kstatp;
@@ -2354,22 +2864,22 @@ setup_port_fec_kstats(struct port_info *pi)
 	}
 
 	kstatp = ksp->ks_data;
-	KS_U64INIT(rs_corr);
-	KS_U64INIT(rs_uncorr);
-	KS_U64INIT(rs_sym0_corr);
-	KS_U64INIT(rs_sym1_corr);
-	KS_U64INIT(rs_sym2_corr);
-	KS_U64INIT(rs_sym3_corr);
-	KS_U64INIT(fc_lane0_corr);
-	KS_U64INIT(fc_lane0_uncorr);
-	KS_U64INIT(fc_lane1_corr);
-	KS_U64INIT(fc_lane1_uncorr);
-	KS_U64INIT(fc_lane2_corr);
-	KS_U64INIT(fc_lane2_uncorr);
-	KS_U64INIT(fc_lane3_corr);
-	KS_U64INIT(fc_lane3_uncorr);
-
-	ksp->ks_update = update_port_fec_kstats;
+	KS_INIT_U64(kstatp, rs_corr);
+	KS_INIT_U64(kstatp, rs_uncorr);
+	KS_INIT_U64(kstatp, rs_sym0_corr);
+	KS_INIT_U64(kstatp, rs_sym1_corr);
+	KS_INIT_U64(kstatp, rs_sym2_corr);
+	KS_INIT_U64(kstatp, rs_sym3_corr);
+	KS_INIT_U64(kstatp, fc_lane0_corr);
+	KS_INIT_U64(kstatp, fc_lane0_uncorr);
+	KS_INIT_U64(kstatp, fc_lane1_corr);
+	KS_INIT_U64(kstatp, fc_lane1_uncorr);
+	KS_INIT_U64(kstatp, fc_lane2_corr);
+	KS_INIT_U64(kstatp, fc_lane2_uncorr);
+	KS_INIT_U64(kstatp, fc_lane3_corr);
+	KS_INIT_U64(kstatp, fc_lane3_uncorr);
+
+	ksp->ks_update = t4_update_fec_kstats;
 	ksp->ks_private = pi;
 	kstat_install(ksp);
 
@@ -2380,43 +2890,42 @@ int
 t4_port_full_init(struct port_info *pi)
 {
 	struct adapter *sc = pi->adapter;
-	uint16_t *rss;
 	struct sge_rxq *rxq;
 	int rc, i;
 
 	ASSERT((pi->flags & TPF_INIT_DONE) == 0);
 
-	/*
-	 * Allocate tx/rx/fl queues for this port.
-	 */
-	rc = t4_setup_port_queues(pi);
-	if (rc != 0)
-		goto done;	/* error message displayed already */
+	/* Allocate TX/RX/FL queues for this port. */
+	if ((rc = t4_port_queues_init(pi)) != 0) {
+		goto done;
+	}
 
-	/*
-	 * Setup RSS for this port.
-	 */
-	rss = kmem_zalloc(pi->nrxq * sizeof (*rss), KM_SLEEP);
+	/* Setup RSS for this port. */
+	uint16_t *rss = kmem_zalloc(pi->rxq_count * sizeof (*rss), KM_SLEEP);
 	for_each_rxq(pi, i, rxq) {
-		rss[i] = rxq->iq.abs_id;
+		rss[i] = rxq->iq.tsi_abs_id;
 	}
 	rc = -t4_config_rss_range(sc, sc->mbox, pi->viid, 0,
-	    pi->rss_size, rss, pi->nrxq);
-	kmem_free(rss, pi->nrxq * sizeof (*rss));
+	    pi->rss_size, rss, pi->rxq_count);
+	kmem_free(rss, pi->rxq_count * sizeof (*rss));
 	if (rc != 0) {
 		cxgb_printf(pi->dip, CE_WARN, "rss_config failed: %d", rc);
 		goto done;
 	}
 
-	/*
-	 * Initialize our per-port FEC kstats.
-	 */
-	pi->ksp_fec = setup_port_fec_kstats(pi);
+	t4_port_kstats_init(pi);
+	pi->ksp_fec = t4_init_fec_kstats(pi);
 
 	pi->flags |= TPF_INIT_DONE;
+
 done:
-	if (rc != 0)
-		(void) t4_port_full_uninit(pi);
+	if (rc != 0) {
+		/*
+		 * Clean up any state which may be lingering due to a failure
+		 * part way through initialization.
+		 */
+		t4_port_full_uninit(pi);
+	}
 
 	return (rc);
 }
@@ -2424,83 +2933,16 @@ done:
 /*
  * Idempotent.
  */
-static int
+static void
 t4_port_full_uninit(struct port_info *pi)
 {
-
-	ASSERT(pi->flags & TPF_INIT_DONE);
-
 	if (pi->ksp_fec != NULL) {
 		kstat_delete(pi->ksp_fec);
 		pi->ksp_fec = NULL;
 	}
-	(void) t4_teardown_port_queues(pi);
+	t4_port_kstats_fini(pi);
+	t4_port_queues_fini(pi);
 	pi->flags &= ~TPF_INIT_DONE;
-
-	return (0);
-}
-
-void
-t4_port_queues_enable(struct port_info *pi)
-{
-	ASSERT(pi->flags & TPF_INIT_DONE);
-
-	/*
-	 * TODO: whatever was queued up after we set iq->state to IQS_DISABLED
-	 * back in t4_port_queues_disable will be processed now, after an
-	 * unbounded delay.  This can't be good.
-	 */
-
-	int i;
-	struct adapter *sc = pi->adapter;
-	struct sge_rxq *rxq;
-
-	mutex_enter(&sc->sfl_lock);
-	for_each_rxq(pi, i, rxq) {
-		struct sge_iq *iq = &rxq->iq;
-
-		if (atomic_cas_uint(&iq->state, IQS_DISABLED, IQS_IDLE) !=
-		    IQS_DISABLED)
-			panic("%s: iq %p wasn't disabled", __func__,
-			    (void *) iq);
-
-		/*
-		 * Freelists which were marked "doomed" by a previous
-		 * t4_port_queues_disable() call should clear that status.
-		 */
-		rxq->fl.flags &= ~FL_DOOMED;
-
-		t4_iq_gts_update(iq, iq->intr_params, 0);
-
-	}
-	mutex_exit(&sc->sfl_lock);
-}
-
-void
-t4_port_queues_disable(struct port_info *pi)
-{
-	int i;
-	struct adapter *sc = pi->adapter;
-	struct sge_rxq *rxq;
-
-	ASSERT(pi->flags & TPF_INIT_DONE);
-
-	/*
-	 * TODO: need proper implementation for all tx queues (ctrl, eth, ofld).
-	 */
-
-	for_each_rxq(pi, i, rxq) {
-		while (atomic_cas_uint(&rxq->iq.state, IQS_IDLE,
-		    IQS_DISABLED) != IQS_IDLE)
-			msleep(1);
-	}
-
-	mutex_enter(&sc->sfl_lock);
-	for_each_rxq(pi, i, rxq) {
-		rxq->fl.flags |= FL_DOOMED;
-	}
-	mutex_exit(&sc->sfl_lock);
-	/* TODO: need to wait for all fl's to be removed from sc->sfl */
 }
 
 void
@@ -2568,6 +3010,91 @@ t4_os_set_hw_addr(struct adapter *sc, int idx, const uint8_t *hw_addr)
 	bcopy(hw_addr, sc->port[idx]->hw_addr, ETHERADDRL);
 }
 
+/* Add thread to list of consumers waiting to access adapter mailbox */
+void
+t4_mbox_waiter_add(struct adapter *sc, t4_mbox_waiter_t *ent)
+{
+	mutex_enter(&sc->mbox_lock);
+	ent->thread = curthread;
+	list_insert_tail(&sc->mbox_list, ent);
+	mutex_exit(&sc->mbox_lock);
+}
+
+/* Remove thread from list of consumers waiting to access adapter mailbox */
+void
+t4_mbox_waiter_remove(struct adapter *sc, t4_mbox_waiter_t *ent)
+{
+	ASSERT(ent->thread == curthread);
+
+	mutex_enter(&sc->mbox_lock);
+	const bool was_owner = (list_head(&sc->mbox_list) == ent);
+	list_remove(&sc->mbox_list, ent);
+
+	if (was_owner && !list_is_empty(&sc->mbox_list)) {
+		/*
+		 * Wake the other threads waiting on the mbox as we are vacating
+		 * the "owner" slot.
+		 */
+		cv_broadcast(&sc->mbox_cv);
+	}
+	mutex_exit(&sc->mbox_lock);
+}
+
+/*
+ * Wait for the current thread, which has called t4_mbox_waiter_add(), to become
+ * the "owner" of the adapter mailbox (head of the waiter list).
+ *
+ * Returns true if current thread is the owner, else false if we slept/spun for
+ * `wait_us` and are not yet owner (and thus should recheck adapter status).
+ */
+bool
+t4_mbox_wait_owner(struct adapter *sc, uint_t wait_us, bool sleep_ok)
+{
+	mutex_enter(&sc->mbox_lock);
+	t4_mbox_waiter_t *head = list_head(&sc->mbox_list);
+	ASSERT(head != NULL);
+
+	if (head->thread == curthread) {
+		mutex_exit(&sc->mbox_lock);
+		return (true);
+	}
+
+	if (!sleep_ok) {
+		mutex_exit(&sc->mbox_lock);
+		drv_usecwait(wait_us);
+
+		mutex_enter(&sc->mbox_lock);
+		head = list_head(&sc->mbox_list);
+		ASSERT(head != NULL);
+		bool is_owner = head->thread == curthread;
+		mutex_exit(&sc->mbox_lock);
+		return (is_owner);
+	}
+
+	/*
+	 * Using a signal-aware wait would be more courteous here, but much of
+	 * the logic which ultimately accesses the device mbox is ill-equipped
+	 * to handle EINTR failures gracefully.
+	 */
+	const int res = cv_reltimedwait(&sc->mbox_cv, &sc->mbox_lock,
+	    USEC_TO_TICK(wait_us), TR_MICROSEC);
+	if (res > 0) {
+		head = list_head(&sc->mbox_list);
+		ASSERT(head != NULL);
+		if (head->thread == curthread) {
+			/*
+			 * CV was signaled and this thread now occupies the head
+			 * of the list (indicating mbox ownership).
+			 */
+			mutex_exit(&sc->mbox_lock);
+			return (true);
+		}
+	}
+	mutex_exit(&sc->mbox_lock);
+	return (false);
+}
+
+
 uint32_t
 t4_read_reg(struct adapter *sc, uint32_t reg)
 {
@@ -2790,16 +3317,12 @@ t4_cxgbe_attach(struct port_info *pi, dev_info_t *dip)
 	mac->m_driver = pi;
 	mac->m_dip = dip;
 	mac->m_src_addr = pi->hw_addr;
-	mac->m_callbacks = pi->mc;
+	mac->m_callbacks = &t4_mac_callbacks;
 	mac->m_max_sdu = pi->mtu;
 	/* mac_register() treats this as const, so we can cast it away */
 	mac->m_priv_props = (char **)props;
 	mac->m_margin = VLAN_TAGSZ;
-
-	if (!mac->m_callbacks->mc_unicst) {
-		/* Multiple rings enabled */
-		mac->m_v12n = MAC_VIRT_LEVEL1;
-	}
+	mac->m_v12n = MAC_VIRT_LEVEL1;
 
 	mac_handle_t mh = NULL;
 	const int rc = mac_register(mac, &mh);
diff --git a/usr/src/uts/common/io/cxgbe/t4nex/t4_sge.c b/usr/src/uts/common/io/cxgbe/t4nex/t4_sge.c
index 06d0e3f6d5..0c1e03f044 100644
--- a/usr/src/uts/common/io/cxgbe/t4nex/t4_sge.c
+++ b/usr/src/uts/common/io/cxgbe/t4nex/t4_sge.c
@@ -69,10 +69,6 @@ struct txinfo {
 	struct ulptx_sge_pair reserved[TX_SGL_SEGS / 2];
 };
 
-struct mblk_pair {
-	mblk_t *head, *tail;
-};
-
 struct rxbuf {
 	kmem_cache_t *cache;		/* the kmem_cache this rxb came from */
 	ddi_dma_handle_t dhdl;
@@ -84,24 +80,33 @@ struct rxbuf {
 	volatile uint_t ref_cnt;
 };
 
-static int service_iq(struct sge_iq *iq, int budget);
-static inline void init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx,
-    int8_t pktc_idx, int qsize, uint8_t esize);
-static inline void init_fl(struct sge_fl *fl, uint16_t qsize);
-static int alloc_iq_fl(struct port_info *pi, struct sge_iq *iq,
-    struct sge_fl *fl, int intr_idx, int cong);
-static int free_iq_fl(struct port_info *pi, struct sge_iq *iq,
-    struct sge_fl *fl);
-static int alloc_rxq(struct port_info *pi, struct sge_rxq *rxq, int intr_idx,
-    int i);
-static int free_rxq(struct port_info *pi, struct sge_rxq *rxq);
-static int eth_eq_alloc(struct adapter *sc, struct port_info *pi,
-    struct sge_eq *eq);
-static int alloc_eq(struct adapter *sc, struct port_info *pi,
-    struct sge_eq *eq);
-static int free_eq(struct adapter *sc, struct sge_eq *eq);
-static int alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx);
-static int free_txq(struct port_info *pi, struct sge_txq *txq);
+static const uint16_t t4_iq_esize_bytes[] = {
+	[T4_IQ_ESIZE_16B] = 16,
+	[T4_IQ_ESIZE_32B] = 32,
+	[T4_IQ_ESIZE_64B] = 64,
+	[T4_IQ_ESIZE_128B] = 128,
+};
+
+typedef struct t4_iq_params {
+	t4_iq_type_t	tip_iq_type;
+	uint8_t		tip_tmr_idx;
+	int8_t		tip_pktc_idx;
+	uint16_t	tip_qsize;
+	t4_iq_esize_t	tip_esize;
+	uint16_t	tip_fl_qsize;
+	int		tip_cong_chan;
+	t4_sge_iq_t	*tip_intr_evtq;
+	uint_t		tip_intr_idx;
+} t4_iq_params_t;
+
+static int t4_alloc_eq_base(struct port_info *, t4_sge_eq_t *);
+static void t4_free_iq(struct port_info *, t4_sge_iq_t *);
+static int t4_alloc_rxq(struct port_info *, struct sge_rxq *, uint_t);
+static void t4_free_rxq(struct port_info *, struct sge_rxq *);
+static void t4_free_eq(struct port_info *, t4_sge_eq_t *);
+static void t4_alloc_eq_post(struct port_info *, t4_sge_eq_t *);
+static int t4_alloc_txq(struct port_info *, struct sge_txq *, int);
+static void t4_free_txq(struct port_info *, struct sge_txq *);
 static int alloc_dma_memory(struct adapter *sc, size_t len, int flags,
     ddi_device_acc_attr_t *acc_attr, ddi_dma_attr_t *dma_attr,
     ddi_dma_handle_t *dma_hdl, ddi_acc_handle_t *acc_hdl, uint64_t *pba,
@@ -114,15 +119,14 @@ static int free_desc_ring(ddi_dma_handle_t *dhdl, ddi_acc_handle_t *ahdl);
 static int alloc_tx_copybuffer(struct adapter *sc, size_t len,
     ddi_dma_handle_t *dma_hdl, ddi_acc_handle_t *acc_hdl, uint64_t *pba,
     caddr_t *pva);
-static inline bool is_new_response(const struct sge_iq *iq,
-    struct rsp_ctrl **ctrl);
-static inline void iq_next(struct sge_iq *iq);
-static int refill_fl(struct adapter *sc, struct sge_fl *fl, int nbufs);
-static void refill_sfl(void *arg);
-static void add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl);
-static void free_fl_bufs(struct sge_fl *fl);
-static mblk_t *get_fl_payload(struct adapter *sc, struct sge_fl *fl,
-    uint32_t len_newbuf, int *fl_bufs_used);
+static inline bool t4_get_new_rsp(const t4_sge_iq_t *, struct rsp_ctrl *);
+static inline void t4_iq_next_entry(t4_sge_iq_t *iq);
+static t4_iq_result_t t4_process_event_iq(t4_sge_iq_t *event_iq);
+static bool t4_fl_refill(struct sge_fl *, uint_t);
+static void t4_sfl_enqueue(struct adapter *, struct sge_fl *);
+static void t4_sfl_process(void *);
+static void t4_fl_free_bufs(struct sge_fl *fl);
+static mblk_t *t4_fl_get_payload(struct sge_fl *, uint32_t, bool);
 static int get_frame_txinfo(struct sge_txq *txq, mblk_t **fp,
     struct txinfo *txinfo, int sgl_only);
 static inline int fits_in_txb(struct sge_txq *txq, int len, int *waste);
@@ -140,29 +144,27 @@ static int write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, mblk_t *m,
 static void t4_write_flush_wr(struct sge_txq *);
 static inline void write_ulp_cpl_sgl(struct port_info *pi, struct sge_txq *txq,
     struct txpkts *txpkts, struct txinfo *txinfo);
-static inline void copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to,
-    int len);
+static inline void copy_to_txd(t4_sge_eq_t *eq, caddr_t from, caddr_t *to,
+    size_t len);
 static void t4_tx_ring_db(struct sge_txq *);
-static uint_t t4_tx_reclaim_descs(struct sge_txq *, uint_t, mblk_t **);
-static int t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss,
-    mblk_t *m);
-static inline void ring_fl_db(struct adapter *sc, struct sge_fl *fl);
+static uint16_t t4_tx_reclaim_credits(struct sge_txq *, uint16_t, mblk_t **);
+static void t4_fl_ring_db(struct sge_fl *fl);
 static kstat_t *setup_port_config_kstats(struct port_info *pi);
 static kstat_t *setup_port_info_kstats(struct port_info *pi);
 static kstat_t *setup_rxq_kstats(struct port_info *pi, struct sge_rxq *rxq,
-    int idx);
+    uint_t idx);
 static int update_rxq_kstats(kstat_t *ksp, int rw);
 static int update_port_info_kstats(kstat_t *ksp, int rw);
 static kstat_t *setup_txq_kstats(struct port_info *pi, struct sge_txq *txq,
     int idx);
 static int update_txq_kstats(kstat_t *ksp, int rw);
-static void t4_sge_egr_update(struct sge_iq *, const struct rss_header *);
-static int t4_handle_cpl_msg(struct sge_iq *, const struct rss_header *,
+static void t4_sge_egr_update(t4_sge_iq_t *, const struct rss_header *);
+static int t4_handle_cpl_msg(t4_sge_iq_t *, const struct rss_header *,
     mblk_t *);
-static int t4_handle_fw_msg(struct sge_iq *, const struct rss_header *);
+static int t4_handle_fw_msg(t4_sge_iq_t *, const struct rss_header *);
 
 static kmem_cache_t *rxbuf_cache_create(struct rxbuf_cache_params *);
-static struct rxbuf *rxbuf_alloc(kmem_cache_t *, int, uint_t);
+static struct rxbuf *rxbuf_alloc(kmem_cache_t *, int);
 static void rxbuf_free(struct rxbuf *);
 static int rxbuf_ctor(void *, void *, int);
 static void rxbuf_dtor(void *, void *);
@@ -173,34 +175,54 @@ t4_rss_payload(const struct rss_header *rss)
 	return ((void *)(&rss[1]));
 }
 
-static inline struct sge_iq **
+static inline t4_sge_iq_t **
 t4_iqmap_slot(struct adapter *sc, uint_t cntxt_id)
 {
-	const uint_t idx = cntxt_id - sc->sge.iq_start;
+	const uint_t idx = cntxt_id - sc->sge.iqmap_start;
 	VERIFY3U(idx, <, sc->sge.iqmap_sz);
 	return (&sc->sge.iqmap[idx]);
 }
 
-static inline struct sge_eq **
+static inline t4_sge_eq_t **
 t4_eqmap_slot(struct adapter *sc, uint_t cntxt_id)
 {
-	const uint_t idx = cntxt_id - sc->sge.eq_start;
+	const uint_t idx = cntxt_id - sc->sge.eqmap_start;
 	VERIFY3U(idx, <, sc->sge.eqmap_sz);
 	return (&sc->sge.eqmap[idx]);
 }
 
-static inline int
-reclaimable(struct sge_eq *eq)
+/*
+ * Get the address of the EQ host credit at the provided index.
+ */
+static inline void *
+t4_eq_credit(t4_sge_eq_t *eq, uint16_t idx)
 {
-	unsigned int cidx;
+	ASSERT3U(idx, <, eq->tse_qsize_spg);
+	uint8_t *credits = eq->tse_ring;
+	return (&credits[idx * EQ_HC_SIZE]);
+}
+
+static inline struct sge_rxq *
+t4_iq_to_rxq(t4_sge_iq_t *iq)
+{
+	if (iq->tsi_iqtype == TIQT_ETH_RX) {
+		return (__containerof(iq, struct sge_rxq, iq));
+	} else {
+		return (NULL);
+	}
+}
 
-	cidx = eq->spg->cidx;   /* stable snapshot */
-	cidx = be16_to_cpu(cidx);
+static inline t4_sge_iq_t *
+t4_fl_to_iq(struct sge_fl *fl)
+{
+	/*
+	 * Currently, RXQs are the only consumer of sge_fl, and are thus the
+	 * only case we need to worry about.
+	 */
+	struct sge_rxq *rxq = __containerof(fl, struct sge_rxq, fl);
+	ASSERT(rxq->iq.tsi_iqtype == TIQT_ETH_RX);
 
-	if (cidx >= eq->cidx)
-		return (cidx - eq->cidx);
-	else
-		return (cidx + eq->cap - eq->cidx);
+	return (&rxq->iq);
 }
 
 void
@@ -209,8 +231,7 @@ t4_sge_init(struct adapter *sc)
 	struct driver_properties *p = &sc->props;
 	ddi_dma_attr_t *dma_attr;
 	ddi_device_acc_attr_t *acc_attr;
-	uint32_t sge_control, sge_conm_ctrl;
-	int egress_threshold;
+	uint32_t sge_control;
 
 	/*
 	 * Device access and DMA attributes for descriptor rings
@@ -261,16 +282,25 @@ t4_sge_init(struct adapter *sc)
 	 */
 	sge_control = t4_read_reg(sc, A_SGE_CONTROL);
 	sc->sge.pktshift = G_PKTSHIFT(sge_control);
-	sc->sge.stat_len = (sge_control & F_EGRSTATUSPAGESIZE) ? 128 : 64;
+	sc->sge.eq_spg_len = (sge_control & F_EGRSTATUSPAGESIZE) ? 2 : 1;
 
 	/* t4_nex uses FLM packed mode */
-	sc->sge.fl_align = t4_fl_pkt_align(sc, true);
+	const int fl_align = t4_fl_pkt_align(sc, true);
+	VERIFY3S(fl_align, >=, 0);
+	/*
+	 * Minimum alignment for freelist buffer sizes is stated as 16, but in
+	 * order to keep bits [3:0] clear for identifying the buffer size
+	 * register, we use a minimum of 32.
+	 *
+	 * See A_SGE_FL_BUFFER_SIZE0 setting below.
+	 */
+	sc->sge.fl_align = MAX(fl_align, 32);
 
 	/*
-	 * Device access and DMA attributes for rx buffers
+	 * Device access and DMA attributes for RX buffers
 	 */
 	sc->sge.rxb_params.dip = sc->dip;
-	sc->sge.rxb_params.buf_size = rx_buf_size;
+	sc->sge.rxb_params.buf_size = P2ROUNDUP(rx_buf_size, fl_align);
 
 	acc_attr = &sc->sge.rxb_params.acc_attr_rx;
 	acc_attr->devacc_attr_version = DDI_DEVICE_ATTR_V0;
@@ -281,11 +311,6 @@ t4_sge_init(struct adapter *sc)
 	dma_attr->dma_attr_addr_lo = 0;
 	dma_attr->dma_attr_addr_hi = UINT64_MAX;
 	dma_attr->dma_attr_count_max = UINT64_MAX;
-	/*
-	 * Low 4 bits of an rx buffer address have a special meaning to the SGE
-	 * and an rx buf cannot have an address with any of these bits set.
-	 * FL_ALIGN is >= 32 so we're sure things are ok.
-	 */
 	dma_attr->dma_attr_align = sc->sge.fl_align;
 	dma_attr->dma_attr_burstsizes = 0xfff;
 	dma_attr->dma_attr_minxfer = 1;
@@ -298,7 +323,7 @@ t4_sge_init(struct adapter *sc)
 	sc->sge.rxbuf_cache = rxbuf_cache_create(&sc->sge.rxb_params);
 
 	/*
-	 * A FL with <= fl_starve_thres buffers is starving and a periodic
+	 * A FL with <= fl_starve_threshold buffers is starving and a periodic
 	 * timer will attempt to refill it.  This needs to be larger than the
 	 * SGE's Egress Congestion Threshold.  If it isn't, then we can get
 	 * stuck waiting for new packets while the SGE is waiting for us to
@@ -310,7 +335,8 @@ t4_sge_init(struct adapter *sc)
 	 * buffers.
 	 */
 
-	sge_conm_ctrl = t4_read_reg(sc, A_SGE_CONM_CTRL);
+	const uint32_t sge_conm_ctrl = t4_read_reg(sc, A_SGE_CONM_CTRL);
+	uint_t egress_threshold;
 	switch (CHELSIO_CHIP_VERSION(sc->params.chip)) {
 	case CHELSIO_T4:
 		egress_threshold = G_EGRTHRESHOLD(sge_conm_ctrl);
@@ -322,9 +348,20 @@ t4_sge_init(struct adapter *sc)
 	default:
 		egress_threshold = G_T6_EGRTHRESHOLDPACKING(sge_conm_ctrl);
 	}
-	sc->sge.fl_starve_threshold = 2*egress_threshold + 1;
+	sc->sge.fl_starve_threshold = 2 * egress_threshold + 1;
 
-	t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0, rx_buf_size);
+	/*
+	 * Set the size of buffers submitted through freelists.
+	 *
+	 * Strictly speaking, this is setting one of sixteen possible buffer
+	 * sizes, with bits [3:0] of freelist entries designating the size
+	 * register (0-15) which contains its corresponding size.
+	 *
+	 * Our driver does not currently make use of multiple sizes.  Submitted
+	 * buffers are at least 16-byte aligned, thus bits [3:0] are 0,
+	 * selecting this size register.
+	 */
+	t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0, sc->sge.rxb_params.buf_size);
 
 	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD,
 	    V_THRESHOLD_0(p->holdoff_pktcnt[0]) |
@@ -343,201 +380,216 @@ t4_sge_init(struct adapter *sc)
 	    V_TIMERVALUE5(us_to_core_ticks(sc, p->holdoff_timer_us[5])));
 }
 
-static inline int
-first_vector(struct port_info *pi)
+static uint_t
+t4_queue_to_intrq(struct adapter *sc, uint_t q_idx)
 {
-	struct adapter *sc = pi->adapter;
-	int rc = T4_EXTRA_INTR, i;
-
-	if (sc->intr_count == 1)
-		return (0);
-
-	for_each_port(sc, i) {
-		struct port_info *p = sc->port[i];
-
-		if (i == pi->port_id)
-			break;
-
-		/*
-		 * Not compiled with offload support and intr_count > 1.  Only
-		 * NIC queues exist and they'd better be taking direct
-		 * interrupts.
-		 */
-		ASSERT(!(sc->flags & TAF_INTR_FWD));
-		rc += p->nrxq;
-	}
-	return (rc);
+	return (q_idx % sc->intr_queue_cfg.intr_per_port);
 }
 
 /*
- * Given an arbitrary "index," come up with an iq that can be used by other
- * queues (of this port) for interrupt forwarding, SGE egress updates, etc.
- * The iq returned is guaranteed to be something that takes direct interrupts.
+ * Assign an interrupt event queue to the Rx queue specified by rxq_idx. If
+ * we are in TIP_PER_PORT mode, this is done by multiplexing the Rx queues
+ * across the port's interrupt queues. Otherwise, all events are directed
+ * to the adapter-wide firmware queue.
  */
-static struct sge_iq *
-port_intr_iq(struct port_info *pi, int idx)
+static void
+t4_rxq_intr_assign(struct port_info *pi, uint_t rxq_idx,
+    struct t4_iq_params *iqp)
 {
 	struct adapter *sc = pi->adapter;
-	struct sge *s = &sc->sge;
-	struct sge_iq *iq = NULL;
+	const struct t4_intrs_queues *iqc = &sc->intr_queue_cfg;
 
-	if (sc->intr_count == 1)
-		return (&sc->sge.fwq);
+	switch (iqc->intr_plan) {
+	case TIP_PER_PORT: {
+		uint_t intr_iq_idx = t4_queue_to_intrq(sc, rxq_idx);
+		iqp->tip_intr_evtq = &pi->intr_iqs[intr_iq_idx];
+		iqp->tip_intr_idx = INTR_FORWARDED;
+		break;
+	}
+	case TIP_SINGLE:
+	case TIP_ERR_QUEUES:
+	default:
+		/* Forward all RXQ interrupts to FWQ */
+		iqp->tip_intr_evtq = &sc->sge.fwq;
+		iqp->tip_intr_idx = INTR_FORWARDED;
+		break;
+	}
+}
 
-	/*
-	 * Not compiled with offload support and intr_count > 1.  Only NIC
-	 * queues exist and they'd better be taking direct interrupts.
-	 */
-	ASSERT(!(sc->flags & TAF_INTR_FWD));
+void
+t4_port_kstats_init(struct port_info *pi)
+{
+	ASSERT(pi->ksp_config == NULL);
+	ASSERT(pi->ksp_info == NULL);
 
-	idx %= pi->nrxq;
-	iq = &s->rxq[pi->first_rxq + idx].iq;
+	pi->ksp_config = setup_port_config_kstats(pi);
+	pi->ksp_info = setup_port_info_kstats(pi);
+}
 
-	return (iq);
+void
+t4_port_kstats_fini(struct port_info *pi)
+{
+	if (pi->ksp_config != NULL) {
+		kstat_delete(pi->ksp_config);
+		pi->ksp_config = NULL;
+	}
+	if (pi->ksp_info != NULL) {
+		kstat_delete(pi->ksp_info);
+		pi->ksp_info = NULL;
+	}
 }
 
 int
-t4_setup_port_queues(struct port_info *pi)
+t4_port_queues_init(struct port_info *pi)
 {
-	int rc = 0, i, intr_idx, j;
-	struct sge_rxq *rxq;
-	struct sge_txq *txq;
+	int rc = 0;
+	uint_t q_idx;
 	struct adapter *sc = pi->adapter;
-	struct driver_properties *p = &sc->props;
 
-	pi->ksp_config = setup_port_config_kstats(pi);
-	pi->ksp_info   = setup_port_info_kstats(pi);
+	struct sge_rxq *rxq;
+	for_each_rxq(pi, q_idx, rxq) {
+		if ((rc = t4_alloc_rxq(pi, rxq, q_idx)) != 0) {
+			goto cleanup;
+		}
+	}
 
-	/* Interrupt vector to start from (when using multiple vectors) */
-	intr_idx = first_vector(pi);
+	struct sge_txq *txq;
+	for_each_txq(pi, q_idx, txq) {
+		txq->eq.tse_flags = 0;
+		txq->eq.tse_tx_chan = pi->tx_chan;
+		txq->eq.tse_qsize = sc->props.qsize_txq;
 
-	/*
-	 * First pass over all rx queues (NIC and TOE):
-	 * a) initialize iq and fl
-	 * b) allocate queue iff it will take direct interrupts.
-	 */
+		if (sc->intr_queue_cfg.intr_plan == TIP_PER_PORT) {
+			/*
+			 * If we have per port interrupts, then multiplex
+			 * TX completion events across them.
+			 */
+			uint_t intr_iq_idx = t4_queue_to_intrq(sc, q_idx);
+			txq->eq.tse_iqid =
+			    pi->intr_iqs[intr_iq_idx].tsi_cntxt_id;
+		} else {
+			/*
+			 * Otherwise, handle all TX completion events in
+			 * the firmware queue.
+			 */
+			txq->eq.tse_iqid = sc->sge.fwq.tsi_cntxt_id;
+		}
 
-	for_each_rxq(pi, i, rxq) {
+		if ((rc = t4_alloc_txq(pi, txq, q_idx)) != 0) {
+			goto cleanup;
+		}
+	}
 
-		init_iq(&rxq->iq, sc, pi->tmr_idx, pi->pktc_idx, p->qsize_rxq,
-		    RX_IQ_ESIZE);
+	return (0);
 
-		init_fl(&rxq->fl, p->qsize_rxq / 8); /* 8 bufs in each entry */
+cleanup:
+	t4_port_queues_fini(pi);
+	return (rc);
+}
 
-		if ((!(sc->flags & TAF_INTR_FWD)) ||
-		    (sc->intr_count > 1 && pi->nrxq)) {
-			rxq->iq.flags |= IQ_INTR;
-			rc = alloc_rxq(pi, rxq, intr_idx, i);
-			if (rc != 0)
-				goto done;
-			intr_idx++;
-		}
+void
+t4_port_queues_fini(struct port_info *pi)
+{
+	uint_t i;
 
+	struct sge_txq *txq;
+	for_each_txq(pi, i, txq) {
+		t4_free_txq(pi, txq);
 	}
 
-	/*
-	 * Second pass over all rx queues (NIC and TOE).  The queues forwarding
-	 * their interrupts are allocated now.
-	 */
-	j = 0;
+	struct sge_rxq *rxq;
 	for_each_rxq(pi, i, rxq) {
-		if (rxq->iq.flags & IQ_INTR)
-			continue;
+		t4_free_rxq(pi, rxq);
+	}
+}
 
-		intr_idx = port_intr_iq(pi, j)->abs_id;
+void
+t4_port_queues_enable(struct port_info *pi)
+{
+	ASSERT(pi->flags & TPF_INIT_DONE);
 
-		rc = alloc_rxq(pi, rxq, intr_idx, i);
-		if (rc != 0)
-			goto done;
-		j++;
-	}
+	uint_t i;
+	struct adapter *sc = pi->adapter;
+	struct sge_rxq *rxq;
 
-	/*
-	 * Now the tx queues.  Only one pass needed.
-	 */
-	j = 0;
-	for_each_txq(pi, i, txq) {
-		txq->eq.flags = 0;
-		txq->eq.tx_chan = pi->tx_chan;
-		txq->eq.qsize = p->qsize_txq;
+	mutex_enter(&sc->sfl_lock);
+	for_each_rxq(pi, i, rxq) {
+		t4_sge_iq_t *iq = &rxq->iq;
 
-		/* For now, direct all TX queue notifications to the FW IQ. */
-		txq->eq.iqid = sc->sge.fwq.cntxt_id;
+		IQ_LOCK(iq);
+		VERIFY0(iq->tsi_flags & IQ_ENABLED);
+		iq->tsi_flags |= IQ_ENABLED;
+
+		/*
+		 * Freelists which were marked "doomed" by a previous
+		 * t4_port_queues_disable() call should clear that status.
+		 */
+		rxq->fl.sfl_flags &= ~SFL_DOOMED;
 
-		rc = alloc_txq(pi, txq, i);
-		if (rc != 0)
-			goto done;
+		t4_iq_gts_update(iq, iq->tsi_gts_rearm, 0);
+		IQ_UNLOCK(iq);
 	}
+	mutex_exit(&sc->sfl_lock);
 
-done:
-	if (rc != 0)
-		(void) t4_teardown_port_queues(pi);
+	struct sge_txq *txq;
+	for_each_txq(pi, i, txq) {
+		t4_sge_eq_t *eq = &txq->eq;
 
-	return (rc);
+		EQ_LOCK(eq);
+		eq->tse_flags |= EQ_ENABLED;
+		EQ_UNLOCK(eq);
+	}
 }
 
-/*
- * Idempotent
- */
-int
-t4_teardown_port_queues(struct port_info *pi)
+void
+t4_port_queues_disable(struct port_info *pi)
 {
-	int i;
+	uint_t i;
+	struct adapter *sc = pi->adapter;
 	struct sge_rxq *rxq;
-	struct sge_txq *txq;
-
-	if (pi->ksp_config != NULL) {
-		kstat_delete(pi->ksp_config);
-		pi->ksp_config = NULL;
-	}
-	if (pi->ksp_info != NULL) {
-		kstat_delete(pi->ksp_info);
-		pi->ksp_info = NULL;
-	}
 
-	for_each_txq(pi, i, txq) {
-		(void) free_txq(pi, txq);
-	}
+	ASSERT(pi->flags & TPF_INIT_DONE);
 
 	for_each_rxq(pi, i, rxq) {
-		if ((rxq->iq.flags & IQ_INTR) == 0)
-			(void) free_rxq(pi, rxq);
-	}
+		t4_sge_iq_t *iq = &rxq->iq;
 
-	/*
-	 * Then take down the rx queues that take direct interrupts.
-	 */
+		IQ_LOCK(iq);
+		iq->tsi_flags &= ~IQ_ENABLED;
+		IQ_UNLOCK(iq);
+	}
 
+	mutex_enter(&sc->sfl_lock);
 	for_each_rxq(pi, i, rxq) {
-		if (rxq->iq.flags & IQ_INTR)
-			(void) free_rxq(pi, rxq);
+		rxq->fl.sfl_flags |= SFL_DOOMED;
 	}
+	mutex_exit(&sc->sfl_lock);
+	/* TODO: need to wait for all fl's to be removed from sc->sfl */
 
-	return (0);
-}
-
-/* Deals with errors and forwarded interrupts */
-uint_t
-t4_intr_all(caddr_t arg1, caddr_t arg2)
-{
-
-	(void) t4_intr_err(arg1, arg2);
-	(void) t4_intr(arg1, arg2);
+	struct sge_txq *txq;
+	for_each_txq(pi, i, txq) {
+		t4_sge_eq_t *eq = &txq->eq;
 
-	return (DDI_INTR_CLAIMED);
+		EQ_LOCK(eq);
+		eq->tse_flags &= ~EQ_ENABLED;
+		EQ_UNLOCK(eq);
+	}
+	/*
+	 * TODO: issue flush WR to EQs and wait for EGR update to ensure that
+	 * all processing has completed.
+	 */
 }
 
 /*
- * We are counting on the values of t4_intr_config_t matching the register
+ * We are counting on the values of t4_gts_config_t matching the register
  * definitions from the shared code.
  */
-CTASSERT(TIC_SE_INTR_ARM == F_QINTR_CNT_EN);
-CTASSERT(TIC_TIMER0 == V_QINTR_TIMER_IDX(X_TIMERREG_COUNTER0));
-CTASSERT(TIC_TIMER5 == V_QINTR_TIMER_IDX(X_TIMERREG_COUNTER5));
-CTASSERT(TIC_START_COUNTER == V_QINTR_TIMER_IDX(X_TIMERREG_RESTART_COUNTER));
+CTASSERT(TGC_SE_INTR_ARM == F_QINTR_CNT_EN);
+CTASSERT(TGC_TIMER0 == V_QINTR_TIMER_IDX(X_TIMERREG_COUNTER0));
+CTASSERT(TGC_TIMER5 == V_QINTR_TIMER_IDX(X_TIMERREG_COUNTER5));
+CTASSERT(TGC_START_COUNTER == V_QINTR_TIMER_IDX(X_TIMERREG_RESTART_COUNTER));
 
 void
-t4_iq_update_intr_cfg(struct sge_iq *iq, uint8_t tmr_idx, int8_t pktc_idx)
+t4_iq_update_intr_cfg(t4_sge_iq_t *iq, uint8_t tmr_idx, int8_t pktc_idx)
 {
 	ASSERT((pktc_idx >= 0 && pktc_idx < SGE_NCOUNTERS) || pktc_idx == -1);
 	IQ_LOCK_ASSERT_OWNED(iq);
@@ -550,37 +602,37 @@ t4_iq_update_intr_cfg(struct sge_iq *iq, uint8_t tmr_idx, int8_t pktc_idx)
 	 */
 	ASSERT3U(tmr_idx, <, SGE_NTIMERS);
 
-	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx) |
-	    ((pktc_idx != -1) ? TIC_SE_INTR_ARM : 0);
+	iq->tsi_gts_rearm = V_QINTR_TIMER_IDX(tmr_idx) |
+	    ((pktc_idx != -1) ? TGC_SE_INTR_ARM : 0);
 
 	/* Update IQ for new packet count threshold, but only if enabled */
-	if (pktc_idx != iq->intr_pktc_idx && pktc_idx >= 0) {
+	if (pktc_idx != iq->tsi_intr_pktc_idx && pktc_idx >= 0) {
 		const uint32_t param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_IQ_INTCNTTHRESH) |
-		    V_FW_PARAMS_PARAM_YZ(iq->cntxt_id);
+		    V_FW_PARAMS_PARAM_YZ(iq->tsi_cntxt_id);
 		const uint32_t val = pktc_idx;
 
-		struct adapter *sc = iq->adapter;
+		struct adapter *sc = iq->tsi_adapter;
 		int rc =
 		    -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 		if (rc != 0) {
 			/* report error but carry on */
 			cxgb_printf(sc->dip, CE_WARN,
 			    "failed to set intr pktcnt index for IQ %d: %d",
-			    iq->cntxt_id, rc);
+			    iq->tsi_cntxt_id, rc);
 		}
 	}
-	iq->intr_pktc_idx = pktc_idx;
+	iq->tsi_intr_pktc_idx = pktc_idx;
 }
 
 void
-t4_eq_update_dbq_timer(struct sge_eq *eq, struct port_info *pi)
+t4_eq_update_dbq_timer(t4_sge_eq_t *eq, struct port_info *pi)
 {
 	struct adapter *sc = pi->adapter;
 
 	const uint32_t param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_TIMERIX) |
-	    V_FW_PARAMS_PARAM_YZ(eq->cntxt_id);
+	    V_FW_PARAMS_PARAM_YZ(eq->tse_cntxt_id);
 	const uint32_t val = pi->dbq_timer_idx;
 
 	int rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
@@ -588,7 +640,7 @@ t4_eq_update_dbq_timer(struct sge_eq *eq, struct port_info *pi)
 		/* report error but carry on */
 		cxgb_printf(sc->dip, CE_WARN,
 		    "failed to set DBQ timer index for EQ %d: %d",
-		    eq->cntxt_id, rc);
+		    eq->tse_cntxt_id, rc);
 	}
 }
 
@@ -597,13 +649,13 @@ t4_eq_update_dbq_timer(struct sge_eq *eq, struct port_info *pi)
  * ingress queue.
  */
 void
-t4_iq_gts_update(struct sge_iq *iq, t4_intr_config_t cfg, uint16_t cidx_incr)
+t4_iq_gts_update(t4_sge_iq_t *iq, t4_gts_config_t cfg, uint16_t cidx_incr)
 {
 	const uint32_t value =
-	    V_INGRESSQID((uint32_t)iq->cntxt_id) |
+	    V_INGRESSQID((uint32_t)iq->tsi_cntxt_id) |
 	    V_CIDXINC((uint32_t)cidx_incr) |
 	    V_SEINTARM((uint32_t)cfg);
-	t4_write_reg(iq->adapter, MYPF_REG(A_SGE_PF_GTS), value);
+	t4_write_reg(iq->tsi_adapter, MYPF_REG(A_SGE_PF_GTS), value);
 }
 
 /*
@@ -613,376 +665,466 @@ t4_iq_gts_update(struct sge_iq *iq, t4_intr_config_t cfg, uint16_t cidx_incr)
  * associated with the IQ.
  */
 static void
-t4_iq_gts_incr(struct sge_iq *iq, uint16_t cidx_incr)
+t4_iq_gts_incr(t4_sge_iq_t *iq, uint16_t cidx_incr)
 {
 	if (cidx_incr == 0) {
 		return;
 	}
 
 	const uint32_t value =
-	    V_INGRESSQID((uint32_t)iq->cntxt_id) |
+	    V_INGRESSQID((uint32_t)iq->tsi_cntxt_id) |
 	    V_CIDXINC((uint32_t)cidx_incr) |
 	    V_SEINTARM((uint32_t)V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX));
-	t4_write_reg(iq->adapter, MYPF_REG(A_SGE_PF_GTS), value);
+	t4_write_reg(iq->tsi_adapter, MYPF_REG(A_SGE_PF_GTS), value);
 }
 
-static void
-t4_intr_rx_work(struct sge_iq *iq)
-{
-	mblk_t *mp = NULL;
-	struct sge_rxq *rxq = iq_to_rxq(iq);	/* Use iff iq is part of rxq */
-	RXQ_LOCK(rxq);
-	if (!iq->polling) {
-		mp = t4_ring_rx(rxq, iq->qsize/8);
-		t4_iq_gts_update(iq, iq->intr_params, 0);
-	}
-	RXQ_UNLOCK(rxq);
-	if (mp != NULL) {
-		mac_rx_ring(rxq->port->mh, rxq->ring_handle, mp,
-		    rxq->ring_gen_num);
-	}
-}
-
-/* Deals with interrupts on the given ingress queue */
-/* ARGSUSED */
 uint_t
-t4_intr(caddr_t arg1, caddr_t arg2)
+t4_intr_all(caddr_t arg1, caddr_t arg2)
 {
-	struct sge_iq *iq = (struct sge_iq *)arg2;
-	int state;
+	struct adapter *sc = (struct adapter *)arg1;
+
+	/* handle any device errors */
+	t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
+	(void) t4_slow_intr_handler(sc);
+
+	/* process fwq */
+	(void) t4_process_event_iq(&sc->sge.fwq);
 
-	/*
-	 * Right now receive polling is only enabled for MSI-X and
-	 * when we have enough msi-x vectors i.e no interrupt forwarding.
-	 */
-	if (iq->adapter->props.multi_rings) {
-		t4_intr_rx_work(iq);
-	} else {
-		state = atomic_cas_uint(&iq->state, IQS_IDLE, IQS_BUSY);
-		if (state == IQS_IDLE) {
-			(void) service_iq(iq, 0);
-			(void) atomic_cas_uint(&iq->state, IQS_BUSY, IQS_IDLE);
-		}
-	}
 	return (DDI_INTR_CLAIMED);
 }
 
-/* Deals with error interrupts */
-/* ARGSUSED */
 uint_t
 t4_intr_err(caddr_t arg1, caddr_t arg2)
 {
 	struct adapter *sc = (struct adapter *)arg1;
 
+	/* handle any device errors */
 	t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
 	(void) t4_slow_intr_handler(sc);
 
 	return (DDI_INTR_CLAIMED);
 }
 
-/*
- * t4_ring_rx - Process responses from an SGE response queue.
- *
- * This function processes responses from an SGE response queue up to the
- * supplied budget.  Responses include received packets as well as control
- * messages from FW or HW.
- *
- * It returns a chain of mblks containing the received data, to be
- * passed up to mac_rx_ring().
- */
-mblk_t *
-t4_ring_rx(struct sge_rxq *rxq, int budget)
-{
-	struct sge_iq *iq = &rxq->iq;
-	struct sge_fl *fl = &rxq->fl;		/* Use iff IQ_HAS_FL */
-	struct adapter *sc = iq->adapter;
-	struct rsp_ctrl *ctrl;
-	int ndescs = 0, fl_bufs_used = 0;
-	mblk_t *mblk_head = NULL, **mblk_tail = &mblk_head;
-	uint32_t received_bytes = 0, pkt_len = 0;
-	uint16_t err_vec;
-
-	while (is_new_response(iq, &ctrl)) {
-		membar_consumer();
-
-		const uint8_t type_gen = ctrl->u.type_gen;
-		const uint8_t rsp_type = G_RSPD_TYPE(type_gen);
-		const bool overflowed = (type_gen & F_RSPD_QOVFL) != 0;
-		const uint32_t data_len = BE_32(ctrl->pldbuflen_qid);
-
-		iq->stats.sis_processed++;
-		if (overflowed) {
-			iq->stats.sis_overflow++;
-		}
+uint_t
+t4_intr_fwq(caddr_t arg1, caddr_t arg2)
+{
+	struct adapter *sc = (struct adapter *)arg1;
 
-		const struct rss_header *rss =
-		    (const struct rss_header *)iq->cdesc;
-		mblk_t *m = NULL;
+	(void) t4_process_event_iq(&sc->sge.fwq);
 
-		switch (rsp_type) {
-		case X_RSPD_TYPE_FLBUF:
+	return (DDI_INTR_CLAIMED);
+}
 
-			ASSERT(iq->flags & IQ_HAS_FL);
+uint_t
+t4_intr_port_queue(caddr_t arg1, caddr_t arg2)
+{
+	t4_sge_iq_t *iq = (t4_sge_iq_t *)arg1;
 
-			if (CPL_RX_PKT == rss->opcode) {
-				const struct cpl_rx_pkt *cpl =
-				    t4_rss_payload(rss);
-				pkt_len = be16_to_cpu(cpl->len);
+	(void) t4_process_event_iq(iq);
 
-				if (iq->polling &&
-				    ((received_bytes + pkt_len) > budget))
-					goto done;
+	return (DDI_INTR_CLAIMED);
+}
 
-				m = get_fl_payload(sc, fl, data_len,
-				    &fl_bufs_used);
-				if (m == NULL)
-					goto done;
+static bool
+t4_fl_periodic_refill(struct sge_fl *fl)
+{
+	FL_LOCK(fl);
+	const bool starved = t4_fl_refill(fl, fl->bufs_cap / 8);
+	FL_UNLOCK(fl);
 
-				m->b_rptr += sc->sge.pktshift;
-				if (sc->params.tp.rx_pkt_encap) {
-					/* Enabled only in T6 config file */
-					err_vec = G_T6_COMPR_RXERR_VEC(
-					    ntohs(cpl->err_vec));
-				} else {
-					err_vec = ntohs(cpl->err_vec);
-				}
+	return (starved);
+}
 
-				const bool csum_ok = cpl->csum_calc && !err_vec;
+/*
+ * Convenience struct for tracking entry types while servicing an IQ.
+ * Used to communicate said counts through the t4-*-iq-processed probes.
+ */
+struct sge_iq_totals {
+	uint_t sit_desc;
+	uint_t sit_flbuf;
+	uint_t sit_cpl;
+	uint_t sit_intr;
+	uint_t sit_rx_bytes;
+};
 
-				/* TODO: what about cpl->ip_frag? */
-				if (csum_ok && !cpl->ip_frag) {
-					mac_hcksum_set(m, 0, 0, 0, 0xffff,
-					    HCK_FULLCKSUM_OK | HCK_FULLCKSUM |
-					    HCK_IPV4_HDRCKSUM_OK);
-					rxq->rxcsum++;
-				}
-				rxq->rxpkts++;
-				rxq->rxbytes += pkt_len;
-				received_bytes += pkt_len;
+/*
+ * Process entries on an event Ingress Queue. This type of queue receives
+ * firmware events, Tx EGR messages, and Rx forwarded interrupts only. It is
+ * used by the firmware queue and the individual port queues.
+ */
+static t4_iq_result_t
+t4_process_event_iq(t4_sge_iq_t *event_iq)
+{
+	int rc = TIR_SUCCESS;
+	struct adapter *sc = event_iq->tsi_adapter;
+
+	const uint_t desc_limit = event_iq->tsi_qsize / 8;
+	struct sge_iq_totals totals = { 0 };
+	uint_t cidx_incr = 0;
+	struct rsp_ctrl ctrl;
+	list_t iql_fwd;
+
+	ASSERT3S(event_iq->tsi_iqtype, ==, TIQT_EVENT);
+	ASSERT3P(event_iq->tsi_intr_evtq, ==, NULL);
+
+	IQ_LOCK(event_iq);
+	if ((event_iq->tsi_flags & IQ_ENABLED) == 0) {
+		IQ_UNLOCK(event_iq);
+		return (TIR_DISABLED);
+	}
 
-				*mblk_tail = m;
-				mblk_tail = &m->b_next;
+	list_create(&iql_fwd, sizeof (t4_sge_iq_t),
+	    offsetof(t4_sge_iq_t, tsi_intr_fwd_node));
 
-				break;
-			}
+	while (t4_get_new_rsp(event_iq, &ctrl)) {
+		const uint8_t rsp_type = G_RSPD_TYPE(ctrl.u.type_gen);
+		const bool overflowed = (ctrl.u.type_gen & F_RSPD_QOVFL) != 0;
 
-			m = get_fl_payload(sc, fl, data_len, &fl_bufs_used);
-			if (m == NULL)
-				goto done;
-			/* FALLTHROUGH */
+		if (overflowed) {
+			event_iq->tsi_stats.sis_overflow++;
+		}
+
+		const struct rss_header *rss =
+		    (const struct rss_header *)event_iq->tsi_cdesc;
+
+		DTRACE_PROBE3(t4__event__iq__entry, t4_sge_iq_t *, event_iq,
+		    struct rsp_ctrl *, &ctrl, struct rss_header *, rss);
+		ASSERT((rsp_type & (X_RSPD_TYPE_CPL | X_RSPD_TYPE_INTR)) != 0);
 
+		switch (rsp_type) {
 		case X_RSPD_TYPE_CPL:
-			(void) t4_handle_cpl_msg(iq, rss, m);
+			totals.sit_cpl++;
+			(void) t4_handle_cpl_msg(event_iq, rss, NULL);
+			break;
+
+		case X_RSPD_TYPE_INTR:
+			totals.sit_intr++;
+			const uint32_t tgt_qid = BE_32(ctrl.pldbuflen_qid);
+
+			t4_sge_iq_t *tgt_iq = *t4_iqmap_slot(sc, tgt_qid);
+			/*
+			 * Make sure the forwarded interrupt was sent to the
+			 * expected event queue.
+			 */
+			ASSERT3P(tgt_iq->tsi_intr_evtq, ==, event_iq);
+
+			if (!list_link_active(&tgt_iq->tsi_intr_fwd_node)) {
+				list_insert_tail(&iql_fwd, tgt_iq);
+			}
 			break;
 
 		default:
+			cxgb_printf(sc->dip, CE_WARN, "unexpected IQ entry "
+			    "type %d on IQ %u of type %d", rsp_type,
+			    event_iq->tsi_cntxt_id, event_iq->tsi_iqtype);
 			break;
 		}
-		iq_next(iq);
-		++ndescs;
-		if (!iq->polling && (ndescs == budget))
+
+		t4_iq_next_entry(event_iq);
+		cidx_incr++;
+		totals.sit_desc++;
+		event_iq->tsi_stats.sis_processed++;
+
+		if (cidx_incr == desc_limit) {
+			rc = TIR_BUDGET_MAX;
 			break;
+		}
 	}
 
-done:
+	/*
+	 * At this point we may have collected a number of interrupt forwarding
+	 * entries for Rx IQs, indicating that they have outstanding data ready
+	 * for consumption. We process those now while still in interrupt
+	 * context. We remain holding the event IQ's mutex while doing this
+	 * work. No additional interrupts should be generated for this event IQ
+	 * until after we have finished processing and re-armed the interrupt
+	 * via t4_iq_gts_update().
+	 *
+	 * There is a finite budget for processing each rx queue, and not all
+	 * data is guaranteed to be processed as part of this interrupt. Each rx
+	 * queue should re-arm its interrupt to trigger a fresh interrupt later
+	 * if polling mode has not been enabled.
+	 */
+	t4_sge_iq_t *rx_iq = NULL;
+	while ((rx_iq = list_remove_head(&iql_fwd)) != NULL) {
+		(void) t4_process_rx_iq(rx_iq, rx_iq->tsi_qsize / 8, NULL);
+	}
 
-	t4_iq_gts_incr(iq, ndescs);
+	/*
+	 * Send an update to the device about the event queue's new cidx and
+	 * re-arm its interrupt.
+	 */
+	ASSERT3U(cidx_incr, >, 0);
+	t4_iq_gts_update(event_iq, event_iq->tsi_gts_rearm, cidx_incr);
+	IQ_UNLOCK(event_iq);
 
-	if ((fl_bufs_used > 0) || (iq->flags & IQ_HAS_FL)) {
-		int starved;
-		FL_LOCK(fl);
-		fl->needed += fl_bufs_used;
-		starved = refill_fl(sc, fl, fl->cap / 8);
-		FL_UNLOCK(fl);
-		if (starved)
-			add_fl_to_sfl(sc, fl);
-	}
-	return (mblk_head);
+	DTRACE_PROBE3(t4__event__iq__processed, t4_sge_iq_t *, event_iq,
+	    struct sge_iq_totals *, &totals, t4_iq_result_t, rc);
+	return (rc);
 }
 
 /*
- * Deals with anything and everything on the given ingress queue.
+ * Process entries on an Rx Ingress Queue. When called from interrupt context
+ * 'desc_budget' should be non-zero and 'tpr' should be NULL. When called from
+ * polling context 'desc_budget' should be zero and 'tpr' should be non-NULL.
  */
-static int
-service_iq(struct sge_iq *iq, int budget)
+t4_iq_result_t
+t4_process_rx_iq(t4_sge_iq_t *rx_iq, uint_t desc_budget,
+    struct t4_poll_req *tpr)
 {
-	struct sge_iq *q;
-	struct sge_rxq *rxq = iq_to_rxq(iq);	/* Use iff iq is part of rxq */
-	struct sge_fl *fl = &rxq->fl;		/* Use iff IQ_HAS_FL */
-	struct adapter *sc = iq->adapter;
-	struct rsp_ctrl *ctrl;
-	int ndescs = 0, fl_bufs_used = 0;
-	int starved;
-	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
-
-	const uint_t limit = (budget != 0) ? budget : iq->qsize / 8;
-
+	struct adapter *sc = rx_iq->tsi_adapter;
+	struct sge_fl *fl = rx_iq->tsi_fl;
+	struct sge_rxq *rxq = t4_iq_to_rxq(rx_iq);
+	const uint_t byte_limit = (tpr != NULL) ? tpr->tpr_byte_budget : 0;
+	mblk_t *mp_head = NULL, **mp_tail = &mp_head;
+	struct sge_iq_totals totals = { 0 };
+	uint_t cidx_incr = 0;
+	struct rsp_ctrl ctrl;
+	t4_iq_result_t rc = TIR_SUCCESS;
+
+	ASSERT3S(rx_iq->tsi_iqtype, ==, TIQT_ETH_RX);
+	ASSERT3P(rx_iq->tsi_intr_evtq, !=, NULL);
+	ASSERT3P(rxq, !=, NULL);
+	/* Rx queues require an FL. */
+	ASSERT3P(fl, !=, NULL);
 	/*
-	 * We always come back and check the descriptor ring for new indirect
-	 * interrupts and other responses after running a single handler.
+	 * The desc_budget is used only when processing in interrupt context.
+	 * The tpr is used only when processing in polling context.
 	 */
-	for (;;) {
-		while (is_new_response(iq, &ctrl)) {
-			membar_consumer();
-
-			const uint8_t type_gen = ctrl->u.type_gen;
-			const uint8_t rsp_type = G_RSPD_TYPE(type_gen);
-			const uint32_t dlen_qid = BE_32(ctrl->pldbuflen_qid);
-
-			mblk_t *m = NULL;
-			const struct rss_header *rss =
-			    (const struct rss_header *)iq->cdesc;
-
-			switch (rsp_type) {
-			case X_RSPD_TYPE_FLBUF:
-
-				ASSERT(iq->flags & IQ_HAS_FL);
-
-				m = get_fl_payload(sc, fl, dlen_qid,
-				    &fl_bufs_used);
-				if (m == NULL) {
-					/*
-					 * Rearm the iq with a
-					 * longer-than-default timer
-					 */
-					t4_iq_gts_update(iq, TIC_TIMER5,
-					    ndescs);
-					if (fl_bufs_used > 0) {
-						ASSERT(iq->flags & IQ_HAS_FL);
-						FL_LOCK(fl);
-						fl->needed += fl_bufs_used;
-						starved = refill_fl(sc, fl,
-						    fl->cap / 8);
-						FL_UNLOCK(fl);
-						if (starved)
-							add_fl_to_sfl(sc, fl);
-					}
-					return (0);
-				}
+	ASSERT(desc_budget == 0 || tpr == NULL);
+
+	IQ_LOCK(rx_iq);
+	const bool is_polling = (rx_iq->tsi_flags & IQ_POLLING) != 0;
+	if ((rx_iq->tsi_flags & IQ_ENABLED) == 0) {
+		IQ_UNLOCK(rx_iq);
+		return (TIR_DISABLED);
+	} else if (is_polling && tpr == NULL) {
+		/*
+		 * Skip IQ processing driven from interrupt when port is
+		 * configured for polling.
+		 */
+		IQ_UNLOCK(rx_iq);
+		return (TIR_POLLING);
+	}
 
-			/* FALLTHRU */
-			case X_RSPD_TYPE_CPL:
-				(void) t4_handle_cpl_msg(iq, rss, m);
-				break;
+	while (t4_get_new_rsp(rx_iq, &ctrl)) {
+		const uint8_t rsp_type = G_RSPD_TYPE(ctrl.u.type_gen);
+		const bool overflowed = (ctrl.u.type_gen & F_RSPD_QOVFL) != 0;
 
-			case X_RSPD_TYPE_INTR:
+		if (overflowed) {
+			rx_iq->tsi_stats.sis_overflow++;
+		}
 
-				/*
-				 * Interrupts should be forwarded only to queues
-				 * that are not forwarding their interrupts.
-				 * This means service_iq can recurse but only 1
-				 * level deep.
-				 */
-				ASSERT(budget == 0);
-
-				q = *t4_iqmap_slot(sc, dlen_qid);
-				if (atomic_cas_uint(&q->state, IQS_IDLE,
-				    IQS_BUSY) == IQS_IDLE) {
-					if (service_iq(q, q->qsize / 8) == 0) {
-						(void) atomic_cas_uint(
-						    &q->state, IQS_BUSY,
-						    IQS_IDLE);
-					} else {
-						STAILQ_INSERT_TAIL(&iql, q,
-						    link);
-					}
+		const struct rss_header *rss =
+		    (const struct rss_header *)rx_iq->tsi_cdesc;
+
+		DTRACE_PROBE3(t4__rx__iq__entry, t4_sge_iq_t *, rx_iq,
+		    struct rsp_ctrl *, &ctrl, struct rss_header *, rss);
+
+		switch (rsp_type) {
+		case X_RSPD_TYPE_FLBUF: {
+			const uint32_t dlen_nb = BE_32(ctrl.pldbuflen_qid);
+			const struct cpl_rx_pkt *cpl = t4_rss_payload(rss);
+
+			if (rss->opcode == CPL_RX_PKT) {
+				const uint16_t pkt_len = BE_16(cpl->len);
+				const uint_t new_total =
+				    totals.sit_rx_bytes + pkt_len;
+
+				if (byte_limit != 0 && new_total > byte_limit) {
+					rc = TIR_BUDGET_MAX;
+					goto bail;
 				}
-				break;
+			}
 
-			default:
-				break;
+			const bool newbuf = (dlen_nb & F_RSPD_NEWBUF) != 0;
+			const uint32_t data_len = G_RSPD_LEN(dlen_nb);
+			mblk_t *mp = t4_fl_get_payload(fl, data_len, newbuf);
+			if (mp == NULL) {
+				/* Rearm IQ with longer-than-default timer */
+				t4_iq_gts_update(rx_iq, TGC_TIMER5, cidx_incr);
+				cidx_incr = 0;
+				rc = TIR_ALLOC_FAIL;
+				goto bail;
 			}
 
-			iq_next(iq);
-			if (++ndescs == limit) {
-				t4_iq_gts_incr(iq, ndescs);
-				ndescs = 0;
-
-				if (fl_bufs_used > 0) {
-					ASSERT(iq->flags & IQ_HAS_FL);
-					FL_LOCK(fl);
-					fl->needed += fl_bufs_used;
-					(void) refill_fl(sc, fl, fl->cap / 8);
-					FL_UNLOCK(fl);
-					fl_bufs_used = 0;
+			/*
+			 * Add this entry to the totals once we are past the
+			 * possible bail-outs above.
+			 */
+			totals.sit_flbuf++;
+
+			if (rss->opcode == CPL_RX_PKT) {
+				mp->b_rptr += sc->sge.pktshift;
+
+				uint16_t err_vec;
+				if (sc->params.tp.rx_pkt_encap) {
+					/* Enabled only in T6 config file */
+					err_vec = G_T6_COMPR_RXERR_VEC(
+					    ntohs(cpl->err_vec));
+				} else {
+					err_vec = ntohs(cpl->err_vec);
 				}
 
-				if (budget != 0)
-					return (EINPROGRESS);
+				const bool csum_ok = cpl->csum_calc && !err_vec;
+
+				if (csum_ok && !cpl->ip_frag) {
+					mac_hcksum_set(mp, 0, 0, 0, 0xffff,
+					    HCK_FULLCKSUM_OK | HCK_FULLCKSUM |
+					    HCK_IPV4_HDRCKSUM_OK);
+					rxq->stats.rxcsum++;
+				}
+
+				const uint16_t pkt_len = BE_16(cpl->len);
+				rxq->stats.rxpkts++;
+				rxq->stats.rxbytes += pkt_len;
+				totals.sit_rx_bytes += pkt_len;
+
+				*mp_tail = mp;
+				mp_tail = &mp->b_next;
+			} else {
+				(void) t4_handle_cpl_msg(rx_iq, rss, mp);
 			}
+			break;
 		}
 
-		if (STAILQ_EMPTY(&iql) != 0)
+		default:
+			cxgb_printf(sc->dip, CE_WARN, "unexpected IQ entry "
+			    "type %d on IQ %u of type %d", rsp_type,
+			    rx_iq->tsi_cntxt_id, rx_iq->tsi_iqtype);
+#ifdef DEBUG
+			panic("unexpected IQ entry on rx queue");
+#endif
 			break;
+		}
+
+		t4_iq_next_entry(rx_iq);
+		cidx_incr++;
+		totals.sit_desc++;
+		rx_iq->tsi_stats.sis_processed++;
 
 		/*
-		 * Process the head only, and send it to the back of the list if
-		 * it's still not done.
+		 * The desc_budget value is non-zero only when processing in
+		 * interrupt context. In this case we honor the desc_budget. In
+		 * polling mode we are passed a byte-based budget and disregard
+		 * the desc_budget.
 		 */
-		q = STAILQ_FIRST(&iql);
-		STAILQ_REMOVE_HEAD(&iql, link);
-		if (service_iq(q, q->qsize / 8) == 0)
-			(void) atomic_cas_uint(&q->state, IQS_BUSY, IQS_IDLE);
-		else
-			STAILQ_INSERT_TAIL(&iql, q, link);
+		if (desc_budget != 0 && cidx_incr == desc_budget) {
+			rc = TIR_BUDGET_MAX;
+			goto bail;
+		}
 	}
 
-	t4_iq_gts_update(iq, iq->intr_params, ndescs);
+bail:
+	if (tpr != NULL) {
+		/*
+		 * Do not re-arm interrupts while this IQ is being polled.
+		 * Just update the CIDX as necessary.
+		 */
+		if (cidx_incr != 0) {
+			t4_iq_gts_incr(rx_iq, cidx_incr);
+		}
+	} else {
+		/*
+		 * Just being extra sure that any future code changes keep this
+		 * code path to interrupt processing only.
+		 */
+		ASSERT3U(desc_budget, >, 0);
+		ASSERT3P(tpr, ==, NULL);
 
-	if (iq->flags & IQ_HAS_FL) {
-		FL_LOCK(fl);
-		fl->needed += fl_bufs_used;
-		starved = refill_fl(sc, fl, fl->cap / 4);
-		FL_UNLOCK(fl);
-		if (starved != 0)
-			add_fl_to_sfl(sc, fl);
+		/*
+		 * Make sure to re-arm the interrupt for this rx queue.
+		 * Remember, the actual interrupt is delivered to the event
+		 * queue (rx_iq->tsi_intr_evtq), but the generation of the
+		 * forwarded interrupt event requires arming the interrupt on
+		 * this rx queue.
+		 */
+		t4_iq_gts_update(rx_iq, rx_iq->tsi_gts_rearm, cidx_incr);
 	}
 
-	return (0);
+	/*
+	 * Take a snapshot of the ring generation number prior to dropping the
+	 * IQ/RXQ lock, in case we need it to pass packets into the mac RX path.
+	 */
+	const uint64_t ring_gen_num = rxq->ring_gen_num;
+	IQ_UNLOCK(rx_iq);
+
+	/*
+	 * First we deliver the packets up to mac to give the client a chance to
+	 * consume these mblks before the driver attempts to refill them.
+	 */
+	if (mp_head != NULL) {
+		if (tpr != NULL) {
+			tpr->tpr_mp = mp_head;
+		} else {
+			mac_rx_ring(rxq->port->mh, rxq->ring_handle, mp_head,
+			    ring_gen_num);
+		}
+	}
+
+	/*
+	 * Next we refill some FL buffers. If the FL is "starving", we enqueue
+	 * it on the starving list for further refilling on a background
+	 * thread.
+	 */
+	if (fl != NULL && t4_fl_periodic_refill(fl)) {
+		t4_sfl_enqueue(sc, fl);
+	}
+	DTRACE_PROBE3(t4__rx__iq__processed, t4_sge_iq_t *, rx_iq,
+	    struct sge_iq_totals *, &totals, t4_iq_result_t, rc);
+	return (rc);
 }
 
 /* Per-packet header in a coalesced tx WR, before the SGL starts (in flits) */
-#define	TXPKTS_PKT_HDR ((\
+#define	TXPKTS_PKT_HDR_FLITS ((\
 	sizeof (struct ulp_txpkt) + \
 	sizeof (struct ulptx_idata) + \
-	sizeof (struct cpl_tx_pkt_core)) / 8)
+	sizeof (struct cpl_tx_pkt_core)) / FLIT_NUM_BYTES)
 
 /* Header of a coalesced tx WR, before SGL of first packet (in flits) */
-#define	TXPKTS_WR_HDR (\
-	sizeof (struct fw_eth_tx_pkts_wr) / 8 + \
-	TXPKTS_PKT_HDR)
+#define	TXPKTS_WR_HDR_FLITS (\
+	sizeof (struct fw_eth_tx_pkts_wr) / FLIT_NUM_BYTES + \
+	TXPKTS_PKT_HDR_FLITS)
 
 /* Header of a tx WR, before SGL of first packet (in flits) */
-#define	TXPKT_WR_HDR ((\
+#define	TXPKT_WR_HDR_FLITS ((\
 	sizeof (struct fw_eth_tx_pkt_wr) + \
-	sizeof (struct cpl_tx_pkt_core)) / 8)
+	sizeof (struct cpl_tx_pkt_core)) / FLIT_NUM_BYTES)
 
 /* Header of a tx LSO WR, before SGL of first packet (in flits) */
-#define	TXPKT_LSO_WR_HDR ((\
+#define	TXPKT_LSO_WR_HDR_FLITS ((\
 	sizeof (struct fw_eth_tx_pkt_wr) + \
 	sizeof (struct cpl_tx_pkt_lso_core) + \
-	sizeof (struct cpl_tx_pkt_core)) / 8)
+	sizeof (struct cpl_tx_pkt_core)) / FLIT_NUM_BYTES)
 
 mblk_t *
 t4_eth_tx(void *arg, mblk_t *frame)
 {
-	struct sge_txq *txq = (struct sge_txq *)arg;
+	struct sge_txq *txq = arg;
 	struct port_info *pi = txq->port;
-	struct sge_eq *eq = &txq->eq;
-	mblk_t *next_frame;
-	int rc, coalescing;
-	struct txpkts txpkts;
-	struct txinfo txinfo;
+	t4_sge_eq_t *eq = &txq->eq;
+	mblk_t *next_frame = NULL;
+	int coalescing = 0;
+	struct txpkts txpkts = {};
+	struct txinfo txinfo = {};
 
 	txpkts.npkt = 0; /* indicates there's nothing in txpkts */
-	coalescing = 0;
 
 	TXQ_LOCK(txq);
-	if (eq->avail < 8)
-		(void) t4_tx_reclaim_descs(txq, 8, NULL);
-	for (; frame; frame = next_frame) {
+	if ((eq->tse_flags & EQ_ENABLED) == 0) {
+		/* Apply flow control until EQ is enabled. */
+		TXQ_UNLOCK(txq);
+		return (frame);
+	}
+
+	/* We always strive to send the maximum size WR. */
+	if (eq->tse_avail < TX_WR_MAX_CREDITS) {
+		(void) t4_tx_reclaim_credits(txq, TX_WR_MAX_CREDITS, NULL);
+	}
+	for (; frame != NULL; frame = next_frame) {
+		int rc = 0;
 
-		if (eq->avail < 8)
+		if (eq->tse_avail < TX_WR_MAX_CREDITS)
 			break;
 
 		next_frame = frame->b_next;
@@ -1006,7 +1148,6 @@ t4_eth_tx(void *arg, mblk_t *frame)
 				 * state in mac to continue transmissions.
 				 */
 				t4_write_flush_wr(txq);
-
 				break;
 			}
 
@@ -1020,9 +1161,7 @@ t4_eth_tx(void *arg, mblk_t *frame)
 
 		if (coalescing != 0 &&
 		    add_to_txpkts(txq, &txpkts, frame, &txinfo) == 0) {
-
 			/* Successfully absorbed into txpkts */
-
 			write_ulp_cpl_sgl(pi, txq, &txpkts, &txinfo);
 			goto doorbell;
 		}
@@ -1038,8 +1177,11 @@ t4_eth_tx(void *arg, mblk_t *frame)
 		/* We're sending out individual frames now */
 		coalescing = 0;
 
-		if (eq->avail < 8)
-			(void) t4_tx_reclaim_descs(txq, 8, NULL);
+		if (eq->tse_avail < TX_WR_MAX_CREDITS) {
+			(void) t4_tx_reclaim_credits(txq, TX_WR_MAX_CREDITS,
+			    NULL);
+		}
+
 		rc = write_txpkt_wr(pi, txq, frame, &txinfo);
 		if (rc != 0) {
 
@@ -1051,7 +1193,7 @@ t4_eth_tx(void *arg, mblk_t *frame)
 			 * can't send out the frame.  What's worse, we have to
 			 * spend even more time freeing up everything in txinfo.
 			 */
-			txq->qfull++;
+			txq->stats.qfull++;
 			free_txinfo_resources(txq, &txinfo);
 
 			frame->b_next = next_frame;
@@ -1060,104 +1202,86 @@ t4_eth_tx(void *arg, mblk_t *frame)
 
 doorbell:
 		/* Fewer and fewer doorbells as the queue fills up */
-		if (eq->pending >= (1 << (fls(eq->qsize - eq->avail) / 2))) {
-			txq->txbytes += txinfo.len;
-			txq->txpkts++;
+		if (eq->tse_pending >=
+		    (1 << (fls(eq->tse_qsize - eq->tse_avail) / 2))) {
+			txq->stats.txbytes += txinfo.len;
+			txq->stats.txpkts++;
 			t4_tx_ring_db(txq);
 		}
-		(void) t4_tx_reclaim_descs(txq, 32, NULL);
+		(void) t4_tx_reclaim_credits(txq, 32, NULL);
 	}
 
 	if (txpkts.npkt > 0) {
 		write_txpkts_wr(txq, &txpkts);
 	}
 
-	if (eq->pending != 0) {
+	if (eq->tse_pending != 0) {
 		t4_tx_ring_db(txq);
 	}
 
 	if (frame != NULL) {
-		eq->flags |= EQ_CORKED;
+		eq->tse_flags |= EQ_CORKED;
 	}
 
-	(void) t4_tx_reclaim_descs(txq, eq->qsize, NULL);
+	(void) t4_tx_reclaim_credits(txq, eq->tse_qsize, NULL);
 	TXQ_UNLOCK(txq);
 
 	return (frame);
 }
 
-static inline void
-init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int8_t pktc_idx,
-    int qsize, uint8_t esize)
-{
-	ASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS);
-	ASSERT(pktc_idx < SGE_NCOUNTERS);	/* -ve is ok, means don't use */
-
-	iq->flags = 0;
-	iq->adapter = sc;
-	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
-	iq->intr_pktc_idx = -1;
-	if (pktc_idx >= 0) {
-		iq->intr_params |= TIC_SE_INTR_ARM;
-		iq->intr_pktc_idx = pktc_idx;
-	}
-	iq->qsize = roundup(qsize, 16);		/* See FW_IQ_CMD/iqsize */
-	iq->esize = max(esize, 16);		/* See FW_IQ_CMD/iqesize */
-}
-
-static inline void
-init_fl(struct sge_fl *fl, uint16_t qsize)
+static int
+t4_alloc_iq(struct port_info *pi, const t4_iq_params_t *tip, t4_sge_iq_t *iq,
+    struct sge_fl *fl)
 {
+	struct adapter *sc = pi->adapter;
+	int rc;
 
-	fl->qsize = qsize;
-	fl->allocb_fail = 0;
-}
-
-/*
- * Allocates the ring for an ingress queue and an optional freelist.  If the
- * freelist is specified it will be allocated and then associated with the
- * ingress queue.
- *
- * Returns errno on failure.  Resources allocated up to that point may still be
- * allocated.  Caller is responsible for cleanup in case this function fails.
- *
- * If the ingress queue will take interrupts directly (iq->flags & IQ_INTR) then
- * the intr_idx specifies the vector, starting from 0.  Otherwise it specifies
- * the index of the queue to which its interrupts will be forwarded.
- */
-static int
-alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl,
-    int intr_idx, int cong)
-{
-	int rc, i;
-	size_t len;
-	struct fw_iq_cmd c;
-	struct adapter *sc = iq->adapter;
-	uint32_t v = 0;
-
-	len = iq->qsize * iq->esize;
-	rc = alloc_desc_ring(sc, len, DDI_DMA_READ, &iq->dhdl, &iq->ahdl,
-	    &iq->ba, (caddr_t *)&iq->desc);
-	if (rc != 0)
-		return (rc);
+	ASSERT(tip->tip_tmr_idx >= 0 && tip->tip_tmr_idx < SGE_NTIMERS);
+	ASSERT(tip->tip_pktc_idx < SGE_NCOUNTERS);
+	ASSERT(tip->tip_cong_chan == -1 || tip->tip_cong_chan > 0);
 
-	bzero(&c, sizeof (c));
-	c.op_to_vfn = cpu_to_be32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
-	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
-	    V_FW_IQ_CMD_VFN(0));
+	const bool intr_fwd = (tip->tip_intr_evtq != NULL);
+	const uint_t intr_idx =
+	    intr_fwd ? tip->tip_intr_evtq->tsi_cntxt_id : tip->tip_intr_idx;
 
-	c.alloc_to_len16 = cpu_to_be32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
-	    FW_LEN16(c));
+	ASSERT(intr_fwd || intr_idx < sc->intr_queue_cfg.intr_count);
 
-	/* Special handling for firmware event queue */
-	if (iq == &sc->sge.fwq)
-		v |= F_FW_IQ_CMD_IQASYNCH;
+	mutex_init(&iq->tsi_lock, NULL, MUTEX_DRIVER,
+	    DDI_INTR_PRI(DDI_INTR_PRI(sc->intr_pri)));
+	iq->tsi_flags = 0;
+	iq->tsi_iqtype = tip->tip_iq_type;
+	iq->tsi_adapter = sc;
+	iq->tsi_gts_rearm = V_QINTR_TIMER_IDX(tip->tip_tmr_idx);
+	iq->tsi_intr_pktc_idx = -1;
+	if (tip->tip_pktc_idx >= 0) {
+		iq->tsi_gts_rearm |= TGC_SE_INTR_ARM;
+		iq->tsi_intr_pktc_idx = tip->tip_pktc_idx;
+	}
 
-	if (iq->flags & IQ_INTR)
-		ASSERT(intr_idx < sc->intr_count);
-	else
-		v |= F_FW_IQ_CMD_IQANDST;
-	v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx);
+	/*
+	 * The tsi_qsize holds the total number of entries in the queue, but the
+	 * device requires that this number be a multiple of 16. See the
+	 * documentation for FW_IQ_CMD in the Firmware Interface Book.
+	 */
+	iq->tsi_qsize = P2ROUNDUP(tip->tip_qsize, 16);
+	/*
+	 * The last entry is always reserved for the status page, even if status
+	 * page updates are not being utilized.
+	 */
+	iq->tsi_cap = iq->tsi_qsize - 1;
+	iq->tsi_esize = tip->tip_esize;
+	iq->tsi_esize_bytes = t4_iq_esize_bytes[iq->tsi_esize];
+	iq->tsi_intr_evtq = intr_fwd ? tip->tip_intr_evtq : NULL;
+	iq->tsi_intr_idx = intr_fwd ? INTR_FORWARDED : intr_idx;
+
+	const size_t len = iq->tsi_qsize * iq->tsi_esize_bytes;
+	rc = alloc_desc_ring(sc, len, DDI_DMA_READ, &iq->tsi_desc_dhdl,
+	    &iq->tsi_desc_ahdl, &iq->tsi_desc_ba, (caddr_t *)&iq->tsi_desc);
+	if (rc != 0) {
+		mutex_destroy(&iq->tsi_lock);
+		return (rc);
+	}
+	iq->tsi_flags |= IQ_ALLOC_HOST;
 
 	/*
 	 * If the coalescing counter is not enabled for this IQ, use the 0
@@ -1166,53 +1290,65 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl,
 	 * The selected index does not matter when the counter is not enabled
 	 * through the GTS flags.
 	 */
-	const uint_t pktc_idx = (iq->intr_pktc_idx < 0) ? 0 : iq->intr_pktc_idx;
+	const uint_t pktc_idx = (iq->tsi_intr_pktc_idx < 0) ? 0 :
+	    iq->tsi_intr_pktc_idx;
+	const bool is_fwq = (iq == &sc->sge.fwq);
+
+	struct fw_iq_cmd iq_cmd;
+	bzero(&iq_cmd, sizeof (iq_cmd));
 
-	c.type_to_iqandstindex = cpu_to_be32(v |
+	iq_cmd.op_to_vfn = BE_32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
+	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
+	    V_FW_IQ_CMD_VFN(0));
+
+	iq_cmd.alloc_to_len16 = BE_32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
+	    FW_LEN16(struct fw_iq_cmd));
+
+	iq_cmd.type_to_iqandstindex = BE_32(
+	    /* Special handling for firmware event queue */
+	    (is_fwq ? F_FW_IQ_CMD_IQASYNCH : 0) |
+	    (intr_fwd ? F_FW_IQ_CMD_IQANDST : 0) |
+	    V_FW_IQ_CMD_IQANDSTINDEX(intr_idx) |
 	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
 	    V_FW_IQ_CMD_VIID(pi->viid) |
 	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
-	c.iqdroprss_to_iqesize = cpu_to_be16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
-	    F_FW_IQ_CMD_IQGTSMODE |
-	    V_FW_IQ_CMD_IQINTCNTTHRESH(pktc_idx) |
-	    V_FW_IQ_CMD_IQESIZE(ilog2(iq->esize) - 4));
-	c.iqsize = cpu_to_be16(iq->qsize);
-	c.iqaddr = cpu_to_be64(iq->ba);
-	if (cong >= 0) {
-		const uint32_t iq_type =
-		    cong ? FW_IQ_IQTYPE_NIC : FW_IQ_IQTYPE_OFLD;
-		c.iqns_to_fl0congen = BE_32(F_FW_IQ_CMD_IQFLINTCONGEN |
-		    V_FW_IQ_CMD_IQTYPE(iq_type));
-	}
+
+	iq_cmd.iqdroprss_to_iqesize = BE_16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
+	    F_FW_IQ_CMD_IQGTSMODE | V_FW_IQ_CMD_IQINTCNTTHRESH(pktc_idx) |
+	    V_FW_IQ_CMD_IQESIZE(iq->tsi_esize));
+
+	iq_cmd.iqsize = BE_16(iq->tsi_qsize);
+	iq_cmd.iqaddr = BE_64(iq->tsi_desc_ba);
+	iq_cmd.iqns_to_fl0congen = tip->tip_cong_chan == -1 ? 0 :
+	    BE_32(F_FW_IQ_CMD_IQFLINTCONGEN);
+
+	/*
+	 * This setting currently only pertains to T4/T5 parts with 2 ports, and
+	 * its only effect is to correct a bug in setting the IQPCIECH related
+	 * to offload queues (Chelsio bug#34516). Therefore, setting it is
+	 * irrelevant for our driver. However, we set it anyways in case a
+	 * future part or firmware revision decides to use this information for
+	 * other purposes relevant to the behavior of our driver.
+	 */
+	iq_cmd.iqns_to_fl0congen |= BE_32(V_FW_IQ_CMD_IQTYPE(FW_IQ_IQTYPE_NIC));
 
 	if (fl != NULL) {
-		mutex_init(&fl->lock, NULL, MUTEX_DRIVER,
-		    DDI_INTR_PRI(sc->intr_pri));
-		fl->flags |= FL_MTX;
-
-		len = fl->qsize * RX_FL_ESIZE;
-		rc = alloc_desc_ring(sc, len, DDI_DMA_WRITE, &fl->dhdl,
-		    &fl->ahdl, &fl->ba, (caddr_t *)&fl->desc);
-		if (rc != 0)
-			return (rc);
+		t4_sge_eq_t *eq = &fl->eq;
 
-		/* Allocate space for one software descriptor per buffer. */
-		fl->cap = (fl->qsize - sc->sge.stat_len / RX_FL_ESIZE) * 8;
-		fl->sdesc = kmem_zalloc(sizeof (struct fl_sdesc) * fl->cap,
-		    KM_SLEEP);
-		fl->needed = fl->cap;
-		fl->lowat = roundup(sc->sge.fl_starve_threshold, 8);
-
-		c.iqns_to_fl0congen |=
-		    cpu_to_be32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
-		    F_FW_IQ_CMD_FL0PACKEN | F_FW_IQ_CMD_FL0PADEN);
-		if (cong >= 0) {
-			c.iqns_to_fl0congen |=
-			    BE_32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
-			    F_FW_IQ_CMD_FL0CONGCIF |
-			    F_FW_IQ_CMD_FL0CONGEN);
+		iq->tsi_fl = fl;
+		bzero(&fl->stats, sizeof (fl->stats));
+
+		fl->bufs_cap = tip->tip_fl_qsize;
+		eq->tse_flags = 0;
+		eq->tse_qsize = EQ_FLITS_TO_HC(fl->bufs_cap);
+
+		if ((rc = t4_alloc_eq_base(pi, eq)) != 0) {
+			t4_free_iq(pi, iq);
+			return (rc);
 		}
 
+		fl->bufs_lowat = P2ROUNDUP(sc->sge.fl_starve_threshold, 8);
+
 		/*
 		 * In T6, for egress queue type FL there is internal overhead
 		 * of 16B for header going into FLM module.  Hence the maximum
@@ -1226,60 +1362,82 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl,
 		    X_FETCHBURSTMIN_64B_T6: X_FETCHBURSTMIN_128B;
 		const uint_t fbmax = t4_cver_ge(sc, CHELSIO_T6) ?
 		    X_FETCHBURSTMAX_256B : X_FETCHBURSTMAX_512B;
-		c.fl0dcaen_to_fl0cidxfthresh = cpu_to_be16(
+		const uint32_t fl_cong = (tip->tip_cong_chan == -1) ? 0 :
+		    (V_FW_IQ_CMD_FL0CNGCHMAP(tip->tip_cong_chan) |
+		    F_FW_IQ_CMD_FL0CONGCIF |
+		    F_FW_IQ_CMD_FL0CONGEN);
+
+		iq_cmd.iqns_to_fl0congen |= BE_32(
+		    V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
+		    F_FW_IQ_CMD_FL0PACKEN |
+		    F_FW_IQ_CMD_FL0PADEN |
+		    fl_cong);
+		/*
+		 * We do not set cidx flushing because we choose to have no cidx
+		 * updates for an FL. Instead we track FL usage implicitly by
+		 * the incoming CPL messages on the Rx IQ.
+		 */
+		iq_cmd.fl0dcaen_to_fl0cidxfthresh |= BE_16(
 		    V_FW_IQ_CMD_FL0FBMIN(fbmin) |
 		    V_FW_IQ_CMD_FL0FBMAX(fbmax));
-		c.fl0size = cpu_to_be16(fl->qsize);
-		c.fl0addr = cpu_to_be64(fl->ba);
+		iq_cmd.fl0size |= BE_16(eq->tse_qsize_spg);
+		iq_cmd.fl0addr |= BE_64(eq->tse_ring_ba);
+	}
+	if (!intr_fwd) {
+		iq->tsi_flags |= IQ_INTR;
 	}
 
-	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof (c), &c);
+	rc = -t4_wr_mbox(sc, sc->mbox, &iq_cmd, sizeof (iq_cmd), &iq_cmd);
 	if (rc != 0) {
 		cxgb_printf(sc->dip, CE_WARN,
 		    "failed to create ingress queue: %d", rc);
+		t4_free_iq(pi, iq);
 		return (rc);
 	}
+	iq->tsi_cntxt_id = BE_16(iq_cmd.iqid);
+	iq->tsi_abs_id = BE_16(iq_cmd.physiqid);
+	iq->tsi_flags |= IQ_ALLOC_DEV;
 
-	iq->cdesc = iq->desc;
-	iq->cidx = 0;
-	iq->gen = 1;
-	iq->adapter = sc;
-	iq->cntxt_id = be16_to_cpu(c.iqid);
-	iq->abs_id = be16_to_cpu(c.physiqid);
-	iq->flags |= IQ_ALLOCATED;
-	mutex_init(&iq->lock, NULL, MUTEX_DRIVER,
-	    DDI_INTR_PRI(DDI_INTR_PRI(sc->intr_pri)));
-	iq->polling = 0;
+	iq->tsi_cdesc = iq->tsi_desc;
+	iq->tsi_cidx = 0;
+	iq->tsi_gen = F_RSPD_GEN;
+	iq->tsi_adapter = sc;
 
-	*t4_iqmap_slot(sc, iq->cntxt_id) = iq;
+	*t4_iqmap_slot(sc, iq->tsi_cntxt_id) = iq;
 
 	if (fl != NULL) {
-		fl->cntxt_id = be16_to_cpu(c.fl0id);
-		fl->pidx = fl->cidx = 0;
+		t4_sge_eq_t *eq = &fl->eq;
+
+		eq->tse_cntxt_id = BE_16(iq_cmd.fl0id);
+
+		CTASSERT(offsetof(struct sge_fl, eq) == 0);
+		*t4_eqmap_slot(sc, eq->tse_cntxt_id) = (t4_sge_eq_t *)fl;
+		eq->tse_flags |= EQ_ALLOC_DEV;
+		eq->tse_pidx = eq->tse_cidx = 0;
+		t4_alloc_eq_post(pi, eq);
 		fl->copy_threshold = rx_copy_threshold;
 
-		*t4_eqmap_slot(sc, fl->cntxt_id) = (struct sge_eq *)fl;
+		/* Allocate space for one software descriptor per buffer. */
+		const size_t sdesc_sz = fl->bufs_cap * sizeof (struct fl_sdesc);
+		fl->sdesc = kmem_zalloc(sdesc_sz, KM_SLEEP);
+		eq->tse_flags |= EQ_ALLOC_DESC;
 
 		FL_LOCK(fl);
-		(void) refill_fl(sc, fl, fl->lowat);
+		(void) t4_fl_refill(fl, fl->bufs_lowat);
 		FL_UNLOCK(fl);
-
-		iq->flags |= IQ_HAS_FL;
 	}
 
-	if (t4_cver_ge(sc, CHELSIO_T5) && cong >= 0) {
-		uint32_t param, val;
-
-		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
+	if (t4_cver_ge(sc, CHELSIO_T5) && tip->tip_cong_chan != -1) {
+		const uint32_t param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
-		    V_FW_PARAMS_PARAM_YZ(iq->cntxt_id);
-		if (cong == 0)
-			val = 1 << 19;
-		else {
-			val = 2 << 19;
-			for (i = 0; i < 4; i++) {
-				if (cong & (1 << i))
-					val |= 1 << (i << 2);
+		    V_FW_PARAMS_PARAM_YZ(iq->tsi_cntxt_id);
+
+		const uint_t congmap_log = sc->params.arch.cng_ch_bits_log;
+		uint32_t val =
+		    V_CONMCTXT_CNGTPMODE(X_CONMCTXT_CNGTPMODE_CHANNEL);
+		for (uint_t i = 0; i < 4; i++) {
+			if (tip->tip_cong_chan & (1 << i)) {
+				val |= (1 << (i << congmap_log));
 			}
 		}
 
@@ -1288,137 +1446,283 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl,
 			/* report error but carry on */
 			cxgb_printf(sc->dip, CE_WARN,
 			    "failed to set congestion manager context for "
-			    "ingress queue %d: %d", iq->cntxt_id, rc);
+			    "ingress queue %d: %d", iq->tsi_cntxt_id, rc);
 		}
 	}
 
-	/* Enable IQ interrupts */
-	iq->state = IQS_IDLE;
-	t4_iq_gts_update(iq, iq->intr_params, 0);
+	/* Enable event (and firmware) IQs immediately */
+	if (iq->tsi_iqtype == TIQT_EVENT) {
+		iq->tsi_flags |= IQ_ENABLED;
+		t4_iq_gts_update(iq, iq->tsi_gts_rearm, 0);
+	}
 
 	return (0);
 }
 
-static int
-free_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl)
+static void
+t4_free_iq(struct port_info *pi, t4_sge_iq_t *iq)
 {
-	int rc;
+	struct adapter *sc = iq->tsi_adapter;
+	struct sge_fl *fl = iq->tsi_fl;
+	t4_sge_eq_t *eq = fl != NULL ? &fl->eq : NULL;
 
-	if (iq != NULL) {
-		struct adapter *sc = iq->adapter;
-		dev_info_t *dip;
-
-		dip = pi ? pi->dip : sc->dip;
-		if (iq->flags & IQ_ALLOCATED) {
-			rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
-			    FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id,
-			    fl ? fl->cntxt_id : 0xffff, 0xffff);
-			if (rc != 0) {
-				cxgb_printf(dip, CE_WARN,
-				    "failed to free queue %p: %d", iq, rc);
-				return (rc);
-			}
-			mutex_destroy(&iq->lock);
-			iq->flags &= ~IQ_ALLOCATED;
-		}
+	/*
+	 * The onus is placed on the caller to ensure that no further activity
+	 * will occur on this IQ.
+	 */
+	iq->tsi_flags &= ~IQ_ENABLED;
 
-		if (iq->desc != NULL) {
-			(void) free_desc_ring(&iq->dhdl, &iq->ahdl);
-			iq->desc = NULL;
-		}
+	if (iq->tsi_flags & IQ_ALLOC_DEV) {
+		/*
+		 * Device-side resources of freelists are allocated in concert
+		 * with the device-side resources of their associated IQ.
+		 */
+		ASSERT(fl == NULL || (eq->tse_flags & EQ_ALLOC_DEV));
 
-		bzero(iq, sizeof (*iq));
+		const uint16_t eq_cntxid = fl ? eq->tse_cntxt_id : 0xffff;
+		int rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
+		    FW_IQ_TYPE_FL_INT_CAP, iq->tsi_cntxt_id, eq_cntxid, 0xffff);
+		if (rc != 0) {
+			cxgb_printf(sc->dip, CE_WARN,
+			    "failed to free IQ/FL (%x/%x): %d",
+			    iq->tsi_cntxt_id, eq_cntxid, rc);
+			/* attempt to complete the rest of clean-up */
+		}
+		iq->tsi_flags &= ~IQ_ALLOC_DEV;
+		if (fl != NULL) {
+			eq->tse_flags &= ~EQ_ALLOC_DEV;
+		}
 	}
+	if (iq->tsi_flags & IQ_ALLOC_HOST) {
+		(void) free_desc_ring(&iq->tsi_desc_dhdl, &iq->tsi_desc_ahdl);
+		iq->tsi_desc = NULL;
+		iq->tsi_cdesc = NULL;
+		iq->tsi_desc_ba = 0;
+		mutex_destroy(&iq->tsi_lock);
+		iq->tsi_flags &= ~IQ_ALLOC_HOST;
+	}
+	iq->tsi_flags &= ~IQ_INTR;
+	ASSERT0(iq->tsi_flags);
+
+	iq->tsi_intr_idx = 0;
+	iq->tsi_intr_evtq = NULL;
+	iq->tsi_iqtype = TIQT_UNINIT;
 
 	if (fl != NULL) {
-		if (fl->sdesc != NULL) {
+		if (eq->tse_flags & EQ_ALLOC_DESC) {
 			FL_LOCK(fl);
-			free_fl_bufs(fl);
+			t4_fl_free_bufs(fl);
 			FL_UNLOCK(fl);
 
-			kmem_free(fl->sdesc, sizeof (struct fl_sdesc) *
-			    fl->cap);
+			kmem_free(fl->sdesc, fl->bufs_cap *
+			    sizeof (struct fl_sdesc));
 			fl->sdesc = NULL;
+
+			eq->tse_flags &= ~EQ_ALLOC_DESC;
 		}
+		t4_free_eq(pi, eq);
+		iq->tsi_fl = NULL;
+
+		ASSERT0(eq->tse_flags);
+	}
+}
+
+int
+t4_alloc_evt_iqs(struct adapter *sc)
+{
+	const t4_intr_plan_t plan = sc->intr_queue_cfg.intr_plan;
+
+	const t4_iq_params_t fwq_iqp = {
+		.tip_iq_type	= TIQT_EVENT,
+		.tip_tmr_idx	= sc->sge.fwq_tmr_idx,
+		.tip_pktc_idx	= sc->sge.fwq_pktc_idx,
+		.tip_qsize	= FW_IQ_QSIZE,
+		.tip_esize	= FW_IQ_ESIZE,
+		.tip_cong_chan	= -1,
+		.tip_intr_evtq	= NULL,
+		/*
+		 * The device error-handling interrupt always occupies the 0th
+		 * slot, which the firmware queue will share if no additional
+		 * interrupts are available.  Otherwise it uses the next slot
+		 * after that.
+		 */
+		.tip_intr_idx	= (plan == TIP_SINGLE) ? 0 : 1,
+	};
+	const int rc = t4_alloc_iq(sc->port[0], &fwq_iqp, &sc->sge.fwq, NULL);
+	if (rc != 0) {
+		cxgb_printf(sc->dip, CE_WARN,
+		    "failed to create firmware event queue: %d.", rc);
+		return (rc);
+	}
 
-		if (fl->desc != NULL) {
-			(void) free_desc_ring(&fl->dhdl, &fl->ahdl);
-			fl->desc = NULL;
+	if (plan == TIP_PER_PORT) {
+		const uint_t ipp = sc->intr_queue_cfg.intr_per_port;
+		const uint_t port_count = sc->params.nports;
+
+		for (uint_t i = 0; i < port_count; i++) {
+			struct port_info *port = sc->port[i];
+
+			for (uint_t j = 0; j < ipp; j++) {
+				const t4_iq_params_t iqp = {
+					.tip_iq_type	= TIQT_EVENT,
+					.tip_tmr_idx	= sc->sge.fwq_tmr_idx,
+					.tip_pktc_idx	= sc->sge.fwq_pktc_idx,
+					.tip_qsize	= FW_IQ_QSIZE,
+					.tip_esize	= FW_IQ_ESIZE,
+					.tip_cong_chan	= -1,
+					.tip_intr_evtq	= NULL,
+					.tip_intr_idx	= 2 + (i * ipp) + j,
+				};
+
+				const int rc = t4_alloc_iq(port, &iqp,
+				    &port->intr_iqs[j], NULL);
+				if (rc != 0) {
+					cxgb_printf(sc->dip, CE_WARN,
+					    "failed to create interrupt event "
+					    "queue %u for port %u: %d.", j, i,
+					    rc);
+					t4_free_evt_iqs(sc);
+					return (rc);
+				}
+			}
 		}
+	}
+
+	return (0);
+}
+
+void
+t4_free_evt_iqs(struct adapter *sc)
+{
+	const uint_t port_count = sc->params.nports;
 
-		if (fl->flags & FL_MTX) {
-			mutex_destroy(&fl->lock);
-			fl->flags &= ~FL_MTX;
+	for (uint_t i = 0; i < port_count; i++) {
+		struct port_info *port = sc->port[i];
+
+		for (uint_t j = 0; j < sc->intr_queue_cfg.intr_per_port; j++) {
+			t4_free_iq(port, &port->intr_iqs[j]);
 		}
+	}
+
+	t4_free_iq(sc->port[0], &sc->sge.fwq);
+}
 
-		bzero(fl, sizeof (struct sge_fl));
+static int
+t4_alloc_rxq(struct port_info *pi, struct sge_rxq *rxq, uint_t q_idx)
+{
+	struct adapter *sc = pi->adapter;
+
+	rxq->port = pi;
+
+	t4_iq_params_t iqp = {
+		.tip_iq_type	= TIQT_ETH_RX,
+		.tip_tmr_idx	= pi->tmr_idx,
+		.tip_pktc_idx	= pi->pktc_idx,
+		.tip_qsize	= sc->props.qsize_rxq,
+		.tip_esize	= RX_IQ_ESIZE,
+		.tip_fl_qsize	= sc->props.qsize_rxq,
+		.tip_cong_chan	= t4_get_tp_ch_map(sc, pi->tx_chan),
+	};
+	t4_rxq_intr_assign(pi, q_idx, &iqp);
+	const int rc = t4_alloc_iq(pi, &iqp, &rxq->iq, &rxq->fl);
+	if (rc != 0) {
+		return (rc);
+	}
+
+	rxq->ksp = setup_rxq_kstats(pi, rxq, q_idx);
+	return (0);
+}
+
+static void
+t4_free_rxq(struct port_info *pi, struct sge_rxq *rxq)
+{
+	if (rxq->ksp != NULL) {
+		kstat_delete(rxq->ksp);
+		rxq->ksp = NULL;
 	}
 
-	return (0);
+	t4_free_iq(pi, &rxq->iq);
 }
 
-int
-t4_alloc_fwq(struct adapter *sc)
+static int
+t4_alloc_eq_base(struct port_info *pi, t4_sge_eq_t *eq)
 {
-	int rc, intr_idx;
-	struct sge_iq *fwq = &sc->sge.fwq;
+	struct adapter *sc = pi->adapter;
+	ASSERT0(eq->tse_flags);
+	mutex_init(&eq->tse_lock, NULL, MUTEX_DRIVER,
+	    DDI_INTR_PRI(sc->intr_pri));
+
+	/*
+	 * Make sure to account for the status page which sits at the end of the
+	 * hardware ring and may consume one or two credits.
+	 */
+	ASSERT3U(eq->tse_qsize, <=, T4_MAX_EQ_SIZE);
+	eq->tse_qsize_spg = eq->tse_qsize + sc->sge.eq_spg_len;
 
-	init_iq(fwq, sc, sc->sge.fwq_tmr_idx, sc->sge.fwq_pktc_idx,
-	    FW_IQ_QSIZE, FW_IQ_ESIZE);
-	fwq->flags |= IQ_INTR;	/* always */
-	intr_idx = sc->intr_count > 1 ? 1 : 0;
-	rc = alloc_iq_fl(sc->port[0], fwq, NULL, intr_idx, -1);
+	/*
+	 * We are allocating the "hardware" ring to hold the host credits, make
+	 * sure to use tse_qsize_spg to include the status page credits.
+	 */
+	const size_t len = eq->tse_qsize_spg * EQ_HC_SIZE;
+	int rc = alloc_desc_ring(sc, len, DDI_DMA_WRITE, &eq->tse_ring_dhdl,
+	    &eq->tse_ring_ahdl, &eq->tse_ring_ba, (caddr_t *)&eq->tse_ring);
 	if (rc != 0) {
-		cxgb_printf(sc->dip, CE_WARN,
-		    "failed to create firmware event queue: %d.", rc);
+		mutex_destroy(&eq->tse_lock);
 		return (rc);
 	}
+	eq->tse_flags |= EQ_ALLOC_HOST;
+
+	/*
+	 * We always use one credit less than the technical capacity to avoid
+	 * the situation where pidx == cidx which would indicate to the hardware
+	 * that the queue is empty.
+	 */
+	eq->tse_avail = eq->tse_qsize - 1;
+	eq->tse_pending = 0;
+	eq->tse_pidx = 0;
+	eq->tse_cidx = 0;
+	eq->tse_spg = t4_eq_credit(eq, eq->tse_qsize);
 
 	return (0);
 }
 
-int
-t4_free_fwq(struct adapter *sc)
-{
-	return (free_iq_fl(NULL, &sc->sge.fwq, NULL));
-}
+#define	UDB_DBS	(DOORBELL_UDB | DOORBELL_UDBWC | DOORBELL_WCWR)
 
-static int
-alloc_rxq(struct port_info *pi, struct sge_rxq *rxq, int intr_idx, int i)
+static void
+t4_alloc_eq_post(struct port_info *pi, t4_sge_eq_t *eq)
 {
-	int rc;
-
-	rxq->port = pi;
-	rc = alloc_iq_fl(pi, &rxq->iq, &rxq->fl, intr_idx,
-	    t4_get_tp_ch_map(pi->adapter, pi->tx_chan));
-	if (rc != 0)
-		return (rc);
+	struct adapter *sc = pi->adapter;
+	const boolean_t udb = (sc->doorbells & UDB_DBS) != 0;
+	ASSERT(eq->tse_flags & EQ_ALLOC_DEV);
 
-	rxq->ksp = setup_rxq_kstats(pi, rxq, i);
+	eq->tse_doorbells = sc->doorbells;
+	if (udb) {
+		uint64_t udb_offset;
+		uint_t udb_qid;
 
-	return (rc);
+		const int rc = t4_bar2_sge_qregs(sc, eq->tse_cntxt_id,
+		    T4_BAR2_QTYPE_EGRESS, 0, &udb_offset, &udb_qid);
+		if (rc == 0) {
+			eq->tse_udb = sc->bar2_ptr + udb_offset;
+			eq->tse_udb_qid = udb_qid;
+		} else {
+			eq->tse_doorbells &= ~UDB_DBS;
+			eq->tse_udb = NULL;
+			eq->tse_udb_qid = 0;
+		}
+	}
 }
 
 static int
-free_rxq(struct port_info *pi, struct sge_rxq *rxq)
+t4_eq_alloc_eth(struct port_info *pi, t4_sge_eq_t *eq)
 {
+	struct adapter *sc = pi->adapter;
 	int rc;
 
-	if (rxq->ksp != NULL) {
-		kstat_delete(rxq->ksp);
-		rxq->ksp = NULL;
+	if ((rc = t4_alloc_eq_base(pi, eq)) != 0) {
+		return (rc);
 	}
 
-	rc = free_iq_fl(pi, &rxq->iq, &rxq->fl);
-	if (rc == 0)
-		bzero(&rxq->fl, sizeof (*rxq) - offsetof(struct sge_rxq, fl));
-
-	return (rc);
-}
-
-static int
-eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
-{
 	struct fw_eq_eth_cmd c = {
 		.op_to_vfn = BE_32(
 		    V_FW_CMD_OP(FW_EQ_ETH_CMD) |
@@ -1435,36 +1739,37 @@ eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
 		    V_FW_EQ_ETH_CMD_VIID(pi->viid)),
 		.fetchszm_to_iqid = BE_32(
 		    V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_BOTH) |
-		    V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) |
+		    V_FW_EQ_ETH_CMD_PCIECHN(eq->tse_tx_chan) |
 		    F_FW_EQ_ETH_CMD_FETCHRO |
-		    V_FW_EQ_ETH_CMD_IQID(eq->iqid)),
+		    V_FW_EQ_ETH_CMD_IQID(eq->tse_iqid)),
 		.dcaen_to_eqsize = BE_32(
 		    V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
 		    V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		    V_FW_EQ_ETH_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) |
-		    V_FW_EQ_ETH_CMD_EQSIZE(eq->qsize)),
-		.eqaddr = BE_64(eq->ba),
+		    V_FW_EQ_ETH_CMD_EQSIZE(eq->tse_qsize_spg)),
+		.eqaddr = BE_64(eq->tse_ring_ba),
 	};
 
 	/*
-	 * The EQ is configured to send a notification for every 32 consumed
-	 * entries (X_CIDXFLUSHTHRESH_32).  In order to ensure timely
-	 * notification of entry consumption during slow periods when that
-	 * threshold may not be reached with regularity, two mechanisms exist:
+	 * The T4 is configured to send a notification for every 32 consumed
+	 * host credits (X_CIDXFLUSHTHRESH_32). During times of periodic Tx
+	 * traffic that threshold may not be reached with regularity, leaving
+	 * outstanding credits that cannot be reclaimed until more traffic is
+	 * sent. This can result in a situation where the device driver is
+	 * unable to shut down and detach. To alleviate this problem two methods
+	 * may be employed:
 	 *
-	 * 1. The DBQ timer can be configured to fire (and send a notification)
-	 *    after a period when the EQ has gone idle.  This is available on T6
-	 *    and later adapters.
+	 * 1. The DBQ timer can be configured to arm and deliver a notification
+	 *    after the EQ has gone idle for a period of time. This is available
+	 *    on T6 and later adapters.
 	 *
-	 * 2. The CIDXFlushThresholdOverride flag will send a notification
-	 *    whenever a consumed entry causes CDIX==PIDX, even if the
-	 *    CIDXFlushThreshold has not been reached.
+	 * 2. The CIDXFlushThresholdOverride flag (also documented under
+	 *    FCThreshOverride flag in the T6 Programmers Guide) will send a
+	 *    notification whenever a consumed credit causes CIDX==PIDX, even if
+	 *    the CIDXFlushThreshold has not been reached.
 	 *
-	 * The DBQ timer is preferred, as it results in no additional
-	 * notifications when the EQ is kept busy with small transmissions.
-	 * Comparatively, flows of many short packets (like frequent ACKs) can
-	 * cause the CIDXFlushThresholdOverride mechanism to induce a
-	 * notification for every transmitted packet.
+	 * The DBQ timer is preferred, as it results in fewer notifications when
+	 * the EQ is kept busy with frequent single-credit transmissions.
 	 */
 	if (sc->flags & TAF_DBQ_TIMER) {
 		/* Configure the DBQ timer when it is available */
@@ -1476,117 +1781,78 @@ eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
 		c.dcaen_to_eqsize |= BE_32(F_FW_EQ_ETH_CMD_CIDXFTHRESHO);
 	}
 
-	int rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof (c), &c);
+	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof (c), &c);
 	if (rc != 0) {
 		cxgb_printf(pi->dip, CE_WARN,
 		    "failed to create Ethernet egress queue: %d", rc);
 		return (rc);
 	}
-	eq->flags |= EQ_ALLOCATED;
-
-	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(BE_32(c.eqid_pkd));
-
-	*t4_eqmap_slot(sc, eq->cntxt_id) = eq;
-
-	return (rc);
-}
-
-static int
-alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
-{
-	int rc;
-	size_t len;
-
-	mutex_init(&eq->lock, NULL, MUTEX_DRIVER, DDI_INTR_PRI(sc->intr_pri));
-	eq->flags |= EQ_MTX;
-
-	len = eq->qsize * EQ_ESIZE;
-	rc = alloc_desc_ring(sc, len, DDI_DMA_WRITE, &eq->desc_dhdl,
-	    &eq->desc_ahdl, &eq->ba, (caddr_t *)&eq->desc);
-	if (rc != 0)
-		return (rc);
-
-	eq->cap = eq->qsize - sc->sge.stat_len / EQ_ESIZE;
-	eq->spg = (void *)&eq->desc[eq->cap];
-	eq->avail = eq->cap - 1;	/* one less to avoid cidx = pidx */
-	eq->pidx = eq->cidx = 0;
-	eq->doorbells = sc->doorbells;
-
-	rc = eth_eq_alloc(sc, pi, eq);
-	if (rc != 0) {
-		cxgb_printf(sc->dip, CE_WARN,
-		    "failed to allocate egress queue: %d", rc);
-	}
-
-	if (eq->doorbells & (DOORBELL_UDB | DOORBELL_UDBWC | DOORBELL_WCWR)) {
-		uint64_t udb_offset;
-		uint_t udb_qid;
+	eq->tse_cntxt_id = G_FW_EQ_ETH_CMD_EQID(BE_32(c.eqid_pkd));
+	*t4_eqmap_slot(sc, eq->tse_cntxt_id) = eq;
+	eq->tse_flags |= EQ_ALLOC_DEV;
 
-		rc = t4_bar2_sge_qregs(sc, eq->cntxt_id, T4_BAR2_QTYPE_EGRESS,
-		    0, &udb_offset, &udb_qid);
-
-		if (rc == 0) {
-			eq->udb = sc->bar2_ptr + udb_offset;
-			eq->udb_qid = udb_qid;
-		} else {
-			eq->doorbells &=
-			    ~(DOORBELL_UDB | DOORBELL_UDBWC | DOORBELL_WCWR);
-			eq->udb = NULL;
-			eq->udb_qid = 0;
-		}
-	}
+	t4_alloc_eq_post(pi, eq);
 
-	return (rc);
+	return (0);
 }
 
-static int
-free_eq(struct adapter *sc, struct sge_eq *eq)
+static void
+t4_free_eq(struct port_info *pi, t4_sge_eq_t *eq)
 {
-	int rc;
+	struct adapter *sc = pi->adapter;
 
-	if (eq->flags & EQ_ALLOCATED) {
-		rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id);
+	if (eq->tse_flags & EQ_ALLOC_DEV) {
+		int rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0,
+		    eq->tse_cntxt_id);
 		if (rc != 0) {
 			cxgb_printf(sc->dip, CE_WARN,
 			    "failed to free egress queue: %d", rc);
-			return (rc);
+			/*
+			 * Continue on with freeing operation, even though the
+			 * device resource will be effectively leaked.
+			 */
 		}
-		eq->flags &= ~EQ_ALLOCATED;
+		eq->tse_flags &= ~EQ_ALLOC_DEV;
 	}
 
-	if (eq->desc != NULL) {
-		(void) free_desc_ring(&eq->desc_dhdl, &eq->desc_ahdl);
-		eq->desc = NULL;
+	if (eq->tse_flags & EQ_ALLOC_HOST) {
+		(void) free_desc_ring(&eq->tse_ring_dhdl, &eq->tse_ring_ahdl);
+		eq->tse_ring = NULL;
+		eq->tse_ring_ba = 0;
+		eq->tse_spg = NULL;
+		mutex_destroy(&eq->tse_lock);
+		eq->tse_flags &= ~EQ_ALLOC_HOST;
 	}
 
-	if (eq->flags & EQ_MTX)
-		mutex_destroy(&eq->lock);
-
 	bzero(eq, sizeof (*eq));
-	return (0);
 }
 
 static int
-alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx)
+t4_alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx)
 {
-	int rc, i;
 	struct adapter *sc = pi->adapter;
-	struct sge_eq *eq = &txq->eq;
+	t4_sge_eq_t *eq = &txq->eq;
+	int rc;
 
-	rc = alloc_eq(sc, pi, eq);
-	if (rc != 0)
+	if ((rc = t4_eq_alloc_eth(pi, eq)) != 0) {
 		return (rc);
+	}
 
 	txq->port = pi;
-	txq->sdesc = kmem_zalloc(sizeof (struct tx_sdesc) * eq->cap, KM_SLEEP);
+	txq->sdesc = kmem_zalloc(sizeof (struct tx_sdesc) * eq->tse_qsize,
+	    KM_SLEEP);
 	txq->copy_threshold = tx_copy_threshold;
-	txq->txb_size = eq->qsize * txq->copy_threshold;
+	txq->txb_size = eq->tse_qsize * txq->copy_threshold;
 	rc = alloc_tx_copybuffer(sc, txq->txb_size, &txq->txb_dhdl,
 	    &txq->txb_ahdl, &txq->txb_ba, &txq->txb_va);
-	if (rc == 0)
+	if (rc != 0) {
+		txq->txb_size = 0;
+		txq->txb_avail = 0;
+		return (ENOMEM);
+	} else {
 		txq->txb_avail = txq->txb_size;
-	else
-		txq->txb_avail = txq->txb_size = 0;
+		eq->tse_flags |= EQ_ALLOC_DESC;
+	}
 
 	/*
 	 * TODO: is this too low?  Worst case would need around 4 times qsize
@@ -1594,10 +1860,10 @@ alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx)
 	 * the SGL coming from a distinct DMA handle).  Increase tx_dhdl_total
 	 * if you see too many dma_hdl_failed.
 	 */
-	txq->tx_dhdl_total = eq->qsize * 2;
+	txq->tx_dhdl_total = eq->tse_qsize * 2;
 	txq->tx_dhdl = kmem_zalloc(sizeof (ddi_dma_handle_t) *
 	    txq->tx_dhdl_total, KM_SLEEP);
-	for (i = 0; i < txq->tx_dhdl_total; i++) {
+	for (uint_t i = 0; i < txq->tx_dhdl_total; i++) {
 		rc = ddi_dma_alloc_handle(sc->dip, &sc->sge.dma_attr_tx,
 		    DDI_DMA_SLEEP, 0, &txq->tx_dhdl[i]);
 		if (rc != DDI_SUCCESS) {
@@ -1611,15 +1877,13 @@ alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx)
 
 	txq->ksp = setup_txq_kstats(pi, txq, idx);
 
-	return (rc);
+	return (0);
 }
 
-static int
-free_txq(struct port_info *pi, struct sge_txq *txq)
+static void
+t4_free_txq(struct port_info *pi, struct sge_txq *txq)
 {
-	int i;
-	struct adapter *sc = pi->adapter;
-	struct sge_eq *eq = &txq->eq;
+	t4_sge_eq_t *eq = &txq->eq;
 
 	if (txq->ksp != NULL) {
 		kstat_delete(txq->ksp);
@@ -1632,14 +1896,13 @@ free_txq(struct port_info *pi, struct sge_txq *txq)
 	}
 
 	if (txq->sdesc != NULL) {
-		struct tx_sdesc *sd;
 		ddi_dma_handle_t hdl;
 
 		TXQ_LOCK(txq);
-		while (eq->cidx != eq->pidx) {
-			sd = &txq->sdesc[eq->cidx];
+		while (eq->tse_cidx != eq->tse_pidx) {
+			struct tx_sdesc *sd = &txq->sdesc[eq->tse_cidx];
 
-			for (i = sd->hdls_used; i; i--) {
+			for (uint_t i = sd->hdls_used; i != 0; i--) {
 				hdl = txq->tx_dhdl[txq->tx_dhdl_cidx];
 				(void) ddi_dma_unbind_handle(hdl);
 				if (++txq->tx_dhdl_cidx == txq->tx_dhdl_total)
@@ -1650,22 +1913,23 @@ free_txq(struct port_info *pi, struct sge_txq *txq)
 			freemsgchain(sd->mp_head);
 			sd->mp_head = sd->mp_tail = NULL;
 
-			eq->cidx += sd->desc_used;
-			if (eq->cidx >= eq->cap)
-				eq->cidx -= eq->cap;
+			eq->tse_cidx += sd->credits_used;
+			if (eq->tse_cidx >= eq->tse_qsize)
+				eq->tse_cidx -= eq->tse_qsize;
 
-			txq->txb_avail += txq->txb_used;
+			txq->txb_avail += sd->txb_used;
 		}
 		ASSERT(txq->tx_dhdl_cidx == txq->tx_dhdl_pidx);
 		ASSERT(txq->txb_avail == txq->txb_size);
 		TXQ_UNLOCK(txq);
 
-		kmem_free(txq->sdesc, sizeof (struct tx_sdesc) * eq->cap);
+		kmem_free(txq->sdesc, sizeof (struct tx_sdesc) * eq->tse_qsize);
 		txq->sdesc = NULL;
+		eq->tse_flags &= ~EQ_ALLOC_DESC;
 	}
 
 	if (txq->tx_dhdl != NULL) {
-		for (i = 0; i < txq->tx_dhdl_total; i++) {
+		for (uint_t i = 0; i < txq->tx_dhdl_total; i++) {
 			if (txq->tx_dhdl[i] != NULL)
 				ddi_dma_free_handle(&txq->tx_dhdl[i]);
 		}
@@ -1674,10 +1938,9 @@ free_txq(struct port_info *pi, struct sge_txq *txq)
 		txq->tx_dhdl = NULL;
 	}
 
-	(void) free_eq(sc, &txq->eq);
+	t4_free_eq(pi, &txq->eq);
 
 	bzero(txq, sizeof (*txq));
-	return (0);
 }
 
 /*
@@ -1799,27 +2062,80 @@ alloc_tx_copybuffer(struct adapter *sc, size_t len,
 	    acc_attr, dma_attr, dma_hdl, acc_hdl, pba, pva));
 }
 
+/*
+ * Fetch next valid (if any) response from adapter in IQ.  Returns `true` if
+ * rsp_ctrl data read into `ctrl` has generation bit state matching IQ
+ * expectation for a new entry.
+ *
+ * This does not advance cidx, which is left to a subsequent call to
+ * t4_iq_next_entry().
+ */
 static inline bool
-is_new_response(const struct sge_iq *iq, struct rsp_ctrl **ctrl)
+t4_get_new_rsp(const t4_sge_iq_t *iq, struct rsp_ctrl *ctrl)
 {
-	(void) ddi_dma_sync(iq->dhdl, (uintptr_t)iq->cdesc -
-	    (uintptr_t)iq->desc, iq->esize, DDI_DMA_SYNC_FORKERNEL);
+	(void) ddi_dma_sync(iq->tsi_desc_dhdl, 0, 0, DDI_DMA_SYNC_FORKERNEL);
 
-	*ctrl = (void *)((uintptr_t)iq->cdesc +
-	    (iq->esize - sizeof (struct rsp_ctrl)));
-
-	return ((((*ctrl)->u.type_gen >> S_RSPD_GEN) == iq->gen));
+	*ctrl = *(struct rsp_ctrl *)
+	    ((caddr_t)iq->tsi_cdesc + (iq->tsi_esize_bytes -
+	    sizeof (struct rsp_ctrl)));
+	return ((ctrl->u.type_gen & F_RSPD_GEN) == iq->tsi_gen);
 }
 
+/*
+ * Advance IQ consumer index, wrapping (and toggling generation bit) when the
+ * end of the ring is reached.
+ */
 static inline void
-iq_next(struct sge_iq *iq)
+t4_iq_next_entry(t4_sge_iq_t *iq)
+{
+	iq->tsi_cdesc = (void *) ((caddr_t)iq->tsi_cdesc + iq->tsi_esize_bytes);
+	if (++iq->tsi_cidx == iq->tsi_cap) {
+		iq->tsi_cidx = 0;
+		iq->tsi_gen ^= F_RSPD_GEN;
+		iq->tsi_cdesc = iq->tsi_desc;
+	}
+}
+
+static inline bool
+t4_fl_running_low(const struct sge_fl *fl)
+{
+	return (fl->bufs_avail <= fl->bufs_lowat);
+}
+
+static inline bool
+t4_fl_not_running_low(const struct sge_fl *fl)
+{
+	return (fl->bufs_avail >= (2 * fl->bufs_lowat));
+}
+
+static inline uint_t
+t4_fl_advance_cidx(struct sge_fl *fl)
 {
-	iq->cdesc = (void *) ((uintptr_t)iq->cdesc + iq->esize);
-	if (++iq->cidx == iq->qsize - 1) {
-		iq->cidx = 0;
-		iq->gen ^= 1;
-		iq->cdesc = iq->desc;
+	t4_sge_eq_t *eq = &fl->eq;
+
+	FL_LOCK_ASSERT_OWNED(fl);
+	ASSERT3U(fl->cidx_sdesc, <, FL_BUF_PTR_PER_HC);
+	ASSERT3U(eq->tse_cidx, <, eq->tse_qsize);
+
+	fl->cidx_sdesc++;
+	if (fl->cidx_sdesc == FL_BUF_PTR_PER_HC) {
+		fl->cidx_sdesc = 0;
+		eq->tse_cidx++;
+		if (eq->tse_cidx == eq->tse_qsize) {
+			eq->tse_cidx = 0;
+		}
+		return (1);
 	}
+	return (0);
+}
+
+static inline struct fl_sdesc *
+t4_fl_sdesc(struct sge_fl *fl, uint_t eq_idx, uint_t sdesc_idx)
+{
+	ASSERT(sdesc_idx < FL_BUF_PTR_PER_HC);
+	const uint_t idx = (eq_idx * FL_BUF_PTR_PER_HC) + sdesc_idx;
+
+	return (&fl->sdesc[idx]);
 }
 
 /*
@@ -1828,19 +2144,24 @@ iq_next(struct sge_iq *iq)
  * Returns non-zero to indicate that it should be added to the list of starving
  * freelists.
  */
-static int
-refill_fl(struct adapter *sc, struct sge_fl *fl, int nbufs)
+static bool
+t4_fl_refill(struct sge_fl *fl, uint_t nbufs)
 {
-	uint64_t *d = &fl->desc[fl->pidx];
-	struct fl_sdesc *sd = &fl->sdesc[fl->pidx];
+	struct adapter *sc = t4_fl_to_iq(fl)->tsi_adapter;
+	t4_sge_eq_t *eq = &fl->eq;
 
 	FL_LOCK_ASSERT_OWNED(fl);
-	ASSERT(nbufs >= 0);
 
-	if (nbufs > fl->needed)
-		nbufs = fl->needed;
+	/*
+	 * We refill up to nbufs, but maybe fewer if there are not that many
+	 * outstanding.
+	 */
+	nbufs = MIN(nbufs, fl->bufs_cap - fl->bufs_avail);
+	while (nbufs != 0 && eq->tse_avail != 0) {
+		struct fl_desc *fld = t4_eq_credit(eq, eq->tse_pidx);
+		struct fl_sdesc *sd = t4_fl_sdesc(fl, eq->tse_pidx,
+		    fl->pidx_sdesc);
 
-	while (nbufs--) {
 		if (sd->rxb != NULL) {
 			if (sd->rxb->ref_cnt == 1) {
 				/*
@@ -1860,9 +2181,9 @@ refill_fl(struct adapter *sc, struct sge_fl *fl, int nbufs)
 				 * Either way the bus address in the descriptor
 				 * ring is already valid.
 				 */
-				ASSERT(*d == cpu_to_be64(sd->rxb->ba));
-				d++;
-				goto recycled;
+				ASSERT3U(fld->dptr[fl->pidx_sdesc], ==,
+				    BE_64(sd->rxb->ba));
+				fl->stats.rxb_recycle++;
 			} else {
 				/*
 				 * Buffer still in use and we need a
@@ -1870,89 +2191,113 @@ refill_fl(struct adapter *sc, struct sge_fl *fl, int nbufs)
 				 * on the existing buffer.
 				 */
 				rxbuf_free(sd->rxb);
+				sd->rxb = NULL;
 			}
 		}
 
-		sd->rxb = rxbuf_alloc(sc->sge.rxbuf_cache, KM_NOSLEEP, 1);
-		if (sd->rxb == NULL)
-			break;
-		*d++ = cpu_to_be64(sd->rxb->ba);
-
-recycled:	fl->pending++;
-		sd++;
-		fl->needed--;
-		if (++fl->pidx == fl->cap) {
-			fl->pidx = 0;
-			sd = fl->sdesc;
-			d = fl->desc;
+		if (sd->rxb == NULL) {
+			sd->rxb = rxbuf_alloc(sc->sge.rxbuf_cache, KM_NOSLEEP);
+			if (sd->rxb == NULL) {
+				fl->stats.rxb_alloc_fail++;
+				break;
+			}
+			fl->stats.rxb_alloc++;
+		}
+		fld->dptr[fl->pidx_sdesc] = BE_64(sd->rxb->ba);
+
+		nbufs--;
+		fl->bufs_avail++;
+		fl->pidx_sdesc++;
+		if (fl->pidx_sdesc == FL_BUF_PTR_PER_HC) {
+			/*
+			 * The host credit is filled. It is now ready to be
+			 * posted to the device.
+			 */
+			fl->pidx_sdesc = 0;
+			eq->tse_pending++;
+			eq->tse_avail--;
+			eq->tse_pidx++;
+			if (eq->tse_pidx == eq->tse_qsize) {
+				eq->tse_pidx = 0;
+			}
 		}
 	}
 
-	if (fl->pending >= 8)
-		ring_fl_db(sc, fl);
+	if (eq->tse_pending != 0) {
+		t4_fl_ring_db(fl);
+	}
 
-	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
+	return (t4_fl_running_low(fl));
 }
 
-#ifndef TAILQ_FOREACH_SAFE
-#define	TAILQ_FOREACH_SAFE(var, head, field, tvar)			\
-	for ((var) = TAILQ_FIRST((head));				\
-	    (var) && ((tvar) = TAILQ_NEXT((var), field), 1);		\
-	    (var) = (tvar))
-#endif
+static clock_t t4_sfl_period_us = 100000;
+
+static void
+t4_sfl_reschedule(struct adapter *sc)
+{
+	ASSERT(MUTEX_HELD(&sc->sfl_lock));
+	ASSERT(!list_is_empty(&sc->sfl_list));
+
+	sc->sfl_timer = timeout(t4_sfl_process, sc,
+	    drv_usectohz(t4_sfl_period_us));
+}
 
 /*
  * Attempt to refill all starving freelists.
  */
 static void
-refill_sfl(void *arg)
+t4_sfl_process(void *arg)
 {
 	struct adapter *sc = arg;
-	struct sge_fl *fl, *fl_temp;
 
 	mutex_enter(&sc->sfl_lock);
-	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
+	struct sge_fl *fl = list_head(&sc->sfl_list);
+	while (fl != NULL) {
+		struct sge_fl *next = list_next(&sc->sfl_list, fl);
+
 		FL_LOCK(fl);
-		(void) refill_fl(sc, fl, 64);
-		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
-			TAILQ_REMOVE(&sc->sfl, fl, link);
-			fl->flags &= ~FL_STARVING;
+		(void) t4_fl_refill(fl, 64);
+		if (t4_fl_not_running_low(fl) || fl->sfl_flags & SFL_DOOMED) {
+			list_remove(&sc->sfl_list, fl);
+			fl->sfl_flags &= ~SFL_STARVING;
 		}
 		FL_UNLOCK(fl);
+		fl = next;
 	}
 
-	if (!TAILQ_EMPTY(&sc->sfl) != 0)
-		sc->sfl_timer =  timeout(refill_sfl, sc, drv_usectohz(100000));
+	if (!list_is_empty(&sc->sfl_list)) {
+		t4_sfl_reschedule(sc);
+	}
 	mutex_exit(&sc->sfl_lock);
 }
 
 static void
-add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
+t4_sfl_enqueue(struct adapter *sc, struct sge_fl *fl)
 {
 	mutex_enter(&sc->sfl_lock);
 	FL_LOCK(fl);
-	if ((fl->flags & FL_DOOMED) == 0) {
-		if (TAILQ_EMPTY(&sc->sfl) != 0) {
-			sc->sfl_timer = timeout(refill_sfl, sc,
-			    drv_usectohz(100000));
+	if ((fl->sfl_flags & (SFL_DOOMED | SFL_STARVING)) == 0) {
+		const bool was_empty = list_is_empty(&sc->sfl_list);
+
+		fl->sfl_flags |= SFL_STARVING;
+		list_insert_tail(&sc->sfl_list, fl);
+		if (was_empty) {
+			t4_sfl_reschedule(sc);
 		}
-		fl->flags |= FL_STARVING;
-		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
 	}
 	FL_UNLOCK(fl);
 	mutex_exit(&sc->sfl_lock);
 }
 
 static void
-free_fl_bufs(struct sge_fl *fl)
+t4_fl_free_bufs(struct sge_fl *fl)
 {
-	struct fl_sdesc *sd;
-	unsigned int i;
+	t4_sge_eq_t *eq = &fl->eq;
 
-	FL_LOCK_ASSERT_OWNED(fl);
+	EQ_LOCK_ASSERT_OWNED(eq);
 
-	for (i = 0; i < fl->cap; i++) {
-		sd = &fl->sdesc[i];
+	for (uint_t i = 0; i < eq->tse_qsize * FL_BUF_PTR_PER_HC; i++) {
+		struct fl_sdesc *sd = &fl->sdesc[i];
 
 		if (sd->rxb != NULL) {
 			rxbuf_free(sd->rxb);
@@ -1962,94 +2307,114 @@ free_fl_bufs(struct sge_fl *fl)
 }
 
 /*
- * Note that fl->cidx and fl->offset are left unchanged in case of failure.
+ * Attempt to create an mblk representing the payload stored at the current
+ * offset (fl->offset) in the current FL buffer (fl->cidx_sdesc). If the length
+ * of the payload is at most fl->copy_threshold, then allocate a new
+ * mblk/dblk to hold the contents and copy it over. Otherwise, attempt to
+ * desballoc() the payload. If there is a failure to allocate, then restore the
+ * eq->tse_cidx, fl->cidx_sdesc, and fl->offset to the values they had upon
+ * entering this function.
  */
 static mblk_t *
-get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
-    int *fl_bufs_used)
+t4_fl_get_payload(struct sge_fl *fl, uint32_t len, bool newbuf)
 {
-	struct mblk_pair frame = {0};
-	struct rxbuf *rxb;
-	mblk_t *m = NULL;
-	uint_t nbuf = 0, len, copy, n;
-	uint32_t cidx, offset, rcidx, roffset;
+	struct adapter *sc = t4_fl_to_iq(fl)->tsi_adapter;
+	t4_sge_eq_t *eq = &fl->eq;
+	mblk_t *mp = NULL;
+	mblk_t *head = NULL, **tailp = &head;
+	uint_t bufs_consumed = 0;
 
+	FL_LOCK(fl);
 	/*
 	 * The SGE won't pack a new frame into the current buffer if the entire
 	 * payload doesn't fit in the remaining space.  Move on to the next buf
 	 * in that case.
 	 */
-	rcidx = fl->cidx;
-	roffset = fl->offset;
-	if (fl->offset > 0 && len_newbuf & F_RSPD_NEWBUF) {
+	const uint16_t rcidx = eq->tse_cidx;
+	const uint_t rcidx_sdesc = fl->cidx_sdesc;
+	const uint32_t roffset = fl->offset;
+	uint_t credits_avail = 0;
+
+	if (fl->offset > 0 && newbuf) {
+		/*
+		 * The device has moved onto the next buffer. Reset our offset
+		 * into the current buffer and advance the driver's cidx, which
+		 * may have freed up an EQ host credit to be refilled by the
+		 * driver.
+		 */
 		fl->offset = 0;
-		if (++fl->cidx == fl->cap)
-			fl->cidx = 0;
-		nbuf++;
+		credits_avail += t4_fl_advance_cidx(fl);
+		bufs_consumed++;
 	}
-	cidx = fl->cidx;
-	offset = fl->offset;
 
-	len = G_RSPD_LEN(len_newbuf);	/* pktshift + payload length */
-	copy = (len <= fl->copy_threshold);
-	if (copy != 0) {
-		frame.head = m = allocb(len, BPRI_HI);
-		if (m == NULL) {
-			fl->allocb_fail++;
+	const bool do_copy = (len <= fl->copy_threshold);
+	if (do_copy) {
+		mp = allocb(len, 0);
+		if (mp == NULL) {
+			fl->stats.copy_fail++;
 			DTRACE_PROBE1(t4__fl_alloc_fail, struct sge_fl *, fl);
-			fl->cidx = rcidx;
-			fl->offset = roffset;
-			return (NULL);
+			goto restore;
 		}
+		*tailp = mp;
+		tailp = &mp->b_cont;
 	}
 
-	while (len) {
-		rxb = fl->sdesc[cidx].rxb;
-		n = min(len, rxb->buf_size - offset);
+	uint_t offset = fl->offset;
+	while (len != 0) {
+		struct rxbuf *rxb =
+		    t4_fl_sdesc(fl, eq->tse_cidx, fl->cidx_sdesc)->rxb;
+		const uint_t copy_len = MIN(len, rxb->buf_size - offset);
 
-		(void) ddi_dma_sync(rxb->dhdl, offset, n,
-		    DDI_DMA_SYNC_FORKERNEL);
+		(void) ddi_dma_sync(rxb->dhdl, 0, 0, DDI_DMA_SYNC_FORKERNEL);
 
-		if (copy != 0)
-			bcopy(rxb->va + offset, m->b_wptr, n);
-		else {
-			m = desballoc((unsigned char *)rxb->va + offset, n,
-			    BPRI_HI, &rxb->freefunc);
-			if (m == NULL) {
-				fl->allocb_fail++;
+		if (do_copy) {
+			bcopy(rxb->va + offset, mp->b_wptr, copy_len);
+			fl->stats.copy++;
+		} else {
+			mp = desballoc((unsigned char *)rxb->va + offset,
+			    copy_len, 0, &rxb->freefunc);
+			if (mp == NULL) {
+				fl->stats.wrap_fail++;
 				DTRACE_PROBE1(t4__fl_alloc_fail,
 				    struct sge_fl *, fl);
-				if (frame.head)
-					freemsgchain(frame.head);
-				fl->cidx = rcidx;
-				fl->offset = roffset;
-				return (NULL);
+				goto restore;
 			}
 			atomic_inc_uint(&rxb->ref_cnt);
-			if (frame.head != NULL)
-				frame.tail->b_cont = m;
-			else
-				frame.head = m;
-			frame.tail = m;
+			*tailp = mp;
+			tailp = &mp->b_cont;
+			fl->stats.wrap++;
 		}
-		m->b_wptr += n;
-		len -= n;
-		offset += roundup(n, sc->sge.fl_align);
-		ASSERT(offset <= rxb->buf_size);
+		mp->b_wptr += copy_len;
+		len -= copy_len;
+		offset += roundup(copy_len, sc->sge.fl_align);
+
+		ASSERT3U(offset, <=, rxb->buf_size);
 		if (offset == rxb->buf_size) {
 			offset = 0;
-			if (++cidx == fl->cap)
-				cidx = 0;
-			nbuf++;
+			credits_avail += t4_fl_advance_cidx(fl);
+			bufs_consumed++;
 		}
 	}
-
-	fl->cidx = cidx;
 	fl->offset = offset;
-	(*fl_bufs_used) += nbuf;
+	ASSERT3U(credits_avail, <=, eq->tse_qsize);
+	eq->tse_avail += credits_avail;
+	/* We can't consume more than are available. */
+	ASSERT3U(bufs_consumed, <=, fl->bufs_avail);
+	fl->bufs_avail -= bufs_consumed;
+
+	FL_UNLOCK(fl);
+
+	ASSERT(head != NULL);
+	return (head);
+
+restore:
+	eq->tse_cidx = rcidx;
+	fl->cidx_sdesc = rcidx_sdesc;
+	fl->offset = roffset;
+	FL_UNLOCK(fl);
+	freemsgchain(head);
 
-	ASSERT(frame.head != NULL);
-	return (frame.head);
+	return (NULL);
 }
 
 /*
@@ -2058,7 +2423,7 @@ get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
  * of immediate data.
  */
 #define	IMM_LEN ( \
-	2 * EQ_ESIZE \
+	2 * EQ_HC_SIZE \
 	- sizeof (struct fw_eth_tx_pkt_wr) \
 	- sizeof (struct cpl_tx_pkt_core))
 
@@ -2100,7 +2465,8 @@ get_frame_txinfo(struct sge_txq *txq, mblk_t **fp, struct txinfo *txinfo,
 		bzero(&txinfo->meoi, sizeof (txinfo->meoi));
 	}
 
-start:	txinfo->nsegs = 0;
+start:
+	txinfo->nsegs = 0;
 	txinfo->hdls_used = 0;
 	txinfo->txb_used = 0;
 	txinfo->len = 0;
@@ -2115,10 +2481,10 @@ start:	txinfo->nsegs = 0;
 	m = *fp;
 
 	if (n >= TX_SGL_SEGS || ((flags & HW_LSO) && MBLKL(m) < 50)) {
-		txq->pullup_early++;
+		txq->stats.pullup_early++;
 		m = msgpullup(*fp, -1);
 		if (m == NULL) {
-			txq->pullup_failed++;
+			txq->stats.pullup_failed++;
 			return (E2BIG);	/* (*fp) left as it was */
 		}
 		freemsg(*fp);
@@ -2150,7 +2516,7 @@ start:	txinfo->nsegs = 0;
 		if (rc == E2BIG ||
 		    (txinfo->nsegs == TX_SGL_SEGS && m->b_cont)) {
 
-			txq->pullup_late++;
+			txq->stats.pullup_late++;
 			m = msgpullup(*fp, -1);
 			if (m != NULL) {
 				free_txinfo_resources(txq, txinfo);
@@ -2160,7 +2526,7 @@ start:	txinfo->nsegs = 0;
 				goto start;
 			}
 
-			txq->pullup_failed++;
+			txq->stats.pullup_failed++;
 			rc = E2BIG;
 		}
 
@@ -2170,9 +2536,8 @@ start:	txinfo->nsegs = 0;
 		}
 	}
 
-	ASSERT(txinfo->nsegs > 0 && txinfo->nsegs <= TX_SGL_SEGS);
-
 done:
+	ASSERT(txinfo->nsegs > 0 && txinfo->nsegs <= TX_SGL_SEGS);
 
 	/*
 	 * Store the # of flits required to hold this frame's SGL in nflits.  An
@@ -2222,7 +2587,7 @@ copy_into_txb(struct sge_txq *txq, mblk_t *m, int len, struct txinfo *txinfo)
 	TXQ_LOCK_ASSERT_OWNED(txq);	/* will manipulate txb */
 
 	if (!fits_in_txb(txq, len, &waste)) {
-		txq->txb_full++;
+		txq->stats.txb_full++;
 		return (ENOMEM);
 	}
 
@@ -2293,7 +2658,7 @@ add_mblk(struct sge_txq *txq, struct txinfo *txinfo, mblk_t *m, int len)
 	TXQ_LOCK_ASSERT_OWNED(txq);	/* will manipulate dhdls */
 
 	if (txq->tx_dhdl_avail == 0) {
-		txq->dma_hdl_failed++;
+		txq->stats.dma_hdl_failed++;
 		return (ENOMEM);
 	}
 
@@ -2302,7 +2667,7 @@ add_mblk(struct sge_txq *txq, struct txinfo *txinfo, mblk_t *m, int len)
 	    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, NULL, &cookie,
 	    &ccount);
 	if (rc != DDI_DMA_MAPPED) {
-		txq->dma_map_failed++;
+		txq->stats.dma_map_failed++;
 
 		ASSERT(rc != DDI_DMA_INUSE && rc != DDI_DMA_PARTIAL_MAP);
 
@@ -2372,7 +2737,7 @@ static int
 add_to_txpkts(struct sge_txq *txq, struct txpkts *txpkts, mblk_t *m,
     struct txinfo *txinfo)
 {
-	struct sge_eq *eq = &txq->eq;
+	t4_sge_eq_t *eq = &txq->eq;
 	int can_coalesce;
 	struct tx_sdesc *txsd;
 	uint8_t flits;
@@ -2381,10 +2746,10 @@ add_to_txpkts(struct sge_txq *txq, struct txpkts *txpkts, mblk_t *m,
 	ASSERT(m->b_next == NULL);
 
 	if (txpkts->npkt > 0) {
-		flits = TXPKTS_PKT_HDR + txinfo->nflits;
+		flits = TXPKTS_PKT_HDR_FLITS + txinfo->nflits;
 		can_coalesce = (txinfo->flags & HW_LSO) == 0 &&
-		    txpkts->nflits + flits <= TX_WR_FLITS &&
-		    txpkts->nflits + flits <= eq->avail * 8 &&
+		    txpkts->nflits + flits <= TX_WR_MAX_FLITS &&
+		    txpkts->nflits + flits <= EQ_HC_TO_FLITS(eq->tse_avail) &&
 		    txpkts->plen + txinfo->len < 65536;
 
 		if (can_coalesce != 0) {
@@ -2394,7 +2759,7 @@ add_to_txpkts(struct sge_txq *txq, struct txpkts *txpkts, mblk_t *m,
 			txpkts->nflits += flits;
 			txpkts->plen += txinfo->len;
 
-			txsd = &txq->sdesc[eq->pidx];
+			txsd = &txq->sdesc[eq->tse_pidx];
 			txsd->txb_used += txinfo->txb_used;
 			txsd->hdls_used += txinfo->hdls_used;
 
@@ -2424,9 +2789,15 @@ add_to_txpkts(struct sge_txq *txq, struct txpkts *txpkts, mblk_t *m,
 	ASSERT(txpkts->npkt == 0);
 	ASSERT(txinfo->len < 65536);
 
-	flits = TXPKTS_WR_HDR + txinfo->nflits;
+	flits = TXPKTS_WR_HDR_FLITS + txinfo->nflits;
+
+	/*
+	 * We can coalesce if this is non-LSO and the number of flits required
+	 * is both less than or equal to the maximum flits allowed for a single
+	 * WR and less than or equal to the number of flits currently available.
+	 */
 	can_coalesce = (txinfo->flags & HW_LSO) == 0 &&
-	    flits <= eq->avail * 8 && flits <= TX_WR_FLITS;
+	    flits <= EQ_HC_TO_FLITS(eq->tse_avail) && flits <= TX_WR_MAX_FLITS;
 
 	if (can_coalesce == 0)
 		return (EINVAL);
@@ -2434,13 +2805,14 @@ add_to_txpkts(struct sge_txq *txq, struct txpkts *txpkts, mblk_t *m,
 	/*
 	 * Start a fresh coalesced tx WR with m as the first frame in it.
 	 */
+	t4_eq_host_credit_t *hc = t4_eq_credit(eq, eq->tse_pidx);
 	txpkts->tail = m;
 	txpkts->npkt = 1;
 	txpkts->nflits = flits;
-	txpkts->flitp = &eq->desc[eq->pidx].flit[2];
+	txpkts->flitp = &hc->flit[2];
 	txpkts->plen = txinfo->len;
 
-	txsd = &txq->sdesc[eq->pidx];
+	txsd = &txq->sdesc[eq->tse_pidx];
 	txsd->mp_head = txsd->mp_tail = m;
 	txsd->txb_used = txinfo->txb_used;
 	txsd->hdls_used = txinfo->hdls_used;
@@ -2449,63 +2821,72 @@ add_to_txpkts(struct sge_txq *txq, struct txpkts *txpkts, mblk_t *m,
 }
 
 static inline void
-t4_tx_incr_pending(struct sge_txq *txq, uint_t ndesc)
+t4_tx_incr_pending(struct sge_txq *txq, uint16_t ncredits)
 {
-	struct sge_eq *eq = &txq->eq;
+	t4_sge_eq_t *eq = &txq->eq;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
-	ASSERT3U(ndesc, !=, 0);
-	ASSERT3U(eq->avail, >=, ndesc);
-
-	eq->pending += ndesc;
-	eq->avail -= ndesc;
-	eq->pidx += ndesc;
-	if (eq->pidx >= eq->cap) {
-		eq->pidx -= eq->cap;
+	ASSERT3U(ncredits, !=, 0);
+	ASSERT3U(eq->tse_avail, >=, ncredits);
+
+	eq->tse_pending += ncredits;
+	eq->tse_avail -= ncredits;
+	eq->tse_pidx += ncredits;
+	if (eq->tse_pidx >= eq->tse_qsize) {
+		eq->tse_pidx -= eq->tse_qsize;
 	}
+
+	ASSERT3U(eq->tse_pidx, <, eq->tse_qsize);
+	ASSERT3U(eq->tse_pending, <=, eq->tse_qsize - 1);
 }
 
 /*
- * Note that write_txpkts_wr can never run out of hardware descriptors (but
- * write_txpkt_wr can).  add_to_txpkts ensures that a frame is accepted for
- * coalescing only if sufficient hardware descriptors are available.
+ * Note that write_txpkts_wr() can never run out of host credits (but
+ * write_txpkt_wr() can). add_to_txpkts() ensures that a frame is accepted for
+ * coalescing only if sufficient host credits are available.
  */
 static void
 write_txpkts_wr(struct sge_txq *txq, struct txpkts *txpkts)
 {
-	struct sge_eq *eq = &txq->eq;
-	struct fw_eth_tx_pkts_wr *wr;
-	struct tx_sdesc *txsd;
-	uint32_t ctrl;
-	uint16_t ndesc;
+	t4_sge_eq_t *eq = &txq->eq;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);	/* pidx, avail */
 
-	ndesc = howmany(txpkts->nflits, 8);
+	struct fw_eth_tx_pkts_wr *wr = t4_eq_credit(eq, eq->tse_pidx);
+	const uint16_t ncredits = EQ_FLITS_TO_HC(txpkts->nflits);
+	ASSERT3U(ncredits, <=, eq->tse_avail);
 
-	wr = (void *)&eq->desc[eq->pidx];
-	wr->op_pkd = cpu_to_be32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR) |
-	    V_FW_WR_IMMDLEN(0)); /* immdlen does not matter in this WR */
-	ctrl = V_FW_WR_LEN16(howmany(txpkts->nflits, 2));
-	if (eq->avail == ndesc)
-		ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ;
-	wr->equiq_to_len16 = cpu_to_be32(ctrl);
-	wr->plen = cpu_to_be16(txpkts->plen);
+	/* The immdlen value does not matter for this WR. */
+	wr->op_pkd = BE_32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR) | V_FW_WR_IMMDLEN(0));
+
+	/*
+	 * If all remaining credits are consumed by this WR, then request an EQ
+	 * status update to both the EQ status page and the associated ingress
+	 * queue entry.
+	 *
+	 * See §29.10 FW_ETH_TX_PKTS_WR of the T4 Firmware Interface
+	 * Specification.
+	 */
+	const uint32_t update_bits = (eq->tse_avail == ncredits) ?
+	    (F_FW_WR_EQUEQ | F_FW_WR_EQUIQ) : 0;
+	wr->equiq_to_len16 = BE_32(V_FW_WR_LEN16(howmany(txpkts->nflits, 2)) |
+	    update_bits);
+	wr->r3 = 0;
+	wr->plen = BE_16(txpkts->plen);
 	wr->npkt = txpkts->npkt;
-	wr->r3 = wr->type = 0;
+	wr->type = 0;
 
 	/* Everything else already written */
+	struct tx_sdesc *txsd = &txq->sdesc[eq->tse_pidx];
+	txsd->credits_used = ncredits;
 
-	txsd = &txq->sdesc[eq->pidx];
-	txsd->desc_used = ndesc;
-
-	txq->txb_used += txsd->txb_used / TXB_CHUNK;
-	txq->hdl_used += txsd->hdls_used;
+	txq->stats.txb_used += txsd->txb_used / TXB_CHUNK;
+	txq->stats.hdl_used += txsd->hdls_used;
 
-	t4_tx_incr_pending(txq, ndesc);
+	t4_tx_incr_pending(txq, ncredits);
 
-	txq->txpkts_pkts += txpkts->npkt;
-	txq->txpkts_wrs++;
+	txq->stats.txpkts_pkts += txpkts->npkt;
+	txq->stats.txpkts_wrs++;
 	txpkts->npkt = 0;	/* emptied */
 }
 
@@ -2621,12 +3002,11 @@ static int
 write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, mblk_t *m,
     struct txinfo *txinfo)
 {
-	struct sge_eq *eq = &txq->eq;
-	struct fw_eth_tx_pkt_wr *wr;
+	t4_sge_eq_t *eq = &txq->eq;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;	/* used in many unrelated places */
 	uint64_t ctrl1;
-	int nflits, ndesc;
+	uint16_t nflits = 0;
 	struct tx_sdesc *txsd;
 	caddr_t dst;
 	const mac_ether_offload_info_t *meoi = &txinfo->meoi;
@@ -2638,28 +3018,38 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, mblk_t *m,
 	 */
 	ctrl = sizeof (struct cpl_tx_pkt_core);
 	if (txinfo->flags & HW_LSO) {
-		nflits = TXPKT_LSO_WR_HDR;
+		nflits = TXPKT_LSO_WR_HDR_FLITS;
 		ctrl += sizeof (struct cpl_tx_pkt_lso_core);
 	} else {
-		nflits = TXPKT_WR_HDR;
+		nflits = TXPKT_WR_HDR_FLITS;
 	}
 	if (txinfo->nsegs > 0)
 		nflits += txinfo->nflits;
 	else {
-		nflits += howmany(txinfo->len, 8);
+		nflits += howmany(txinfo->len, FLIT_NUM_BYTES);
 		ctrl += txinfo->len;
 	}
-	ndesc = howmany(nflits, 8);
-	if (ndesc > eq->avail)
+
+	ASSERT3U(nflits, >, 0);
+
+	const uint16_t ncredits = EQ_FLITS_TO_HC(nflits);
+	if (ncredits > eq->tse_avail)
 		return (ENOMEM);
 
 	/* Firmware work request header */
-	wr = (void *)&eq->desc[eq->pidx];
+	struct fw_eth_tx_pkt_wr *wr = t4_eq_credit(eq, eq->tse_pidx);
 	wr->op_immdlen = cpu_to_be32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_WR_IMMDLEN(ctrl));
 	ctrl = V_FW_WR_LEN16(howmany(nflits, 2));
-	if (eq->avail == ndesc)
+
+	/*
+	 * If all remaining credits are consumed by this WR, then request an EQ
+	 * status update to both the EQ status page and the associated ingress
+	 * queue entry.
+	 */
+	if (ncredits == eq->tse_avail)
 		ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ;
+
 	wr->equiq_to_len16 = cpu_to_be32(ctrl);
 	wr->r3 = 0;
 
@@ -2703,7 +3093,7 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, mblk_t *m,
 
 		cpl = (void *)(lso + 1);
 
-		txq->tso_wrs++;
+		txq->stats.tso_wrs++;
 	} else {
 		cpl = (void *)(wr + 1);
 	}
@@ -2712,14 +3102,14 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, mblk_t *m,
 	switch (csum_to_ctrl(txinfo,
 	    CHELSIO_CHIP_VERSION(pi->adapter->params.chip), &ctrl1)) {
 	case COS_SUCCESS:
-		txq->txcsum++;
+		txq->stats.txcsum++;
 		break;
 	case COS_FAIL:
 		/*
 		 * Packet will be going out with checksums which are probably
 		 * wrong but there is little we can do now.
 		 */
-		txq->csum_failed++;
+		txq->stats.csum_failed++;
 		break;
 	default:
 		break;
@@ -2733,21 +3123,21 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, mblk_t *m,
 	cpl->ctrl1 = cpu_to_be64(ctrl1);
 
 	/* Software descriptor */
-	txsd = &txq->sdesc[eq->pidx];
+	txsd = &txq->sdesc[eq->tse_pidx];
 	txsd->mp_head = txsd->mp_tail = m;
 	txsd->txb_used = txinfo->txb_used;
 	txsd->hdls_used = txinfo->hdls_used;
-	txsd->desc_used = ndesc;
+	txsd->credits_used = ncredits;
 
-	txq->txb_used += txinfo->txb_used / TXB_CHUNK;
-	txq->hdl_used += txinfo->hdls_used;
+	txq->stats.txb_used += txinfo->txb_used / TXB_CHUNK;
+	txq->stats.hdl_used += txinfo->hdls_used;
 
-	t4_tx_incr_pending(txq, ndesc);
+	t4_tx_incr_pending(txq, ncredits);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 	if (txinfo->nsegs > 0) {
-		txq->sgl_wrs++;
+		txq->stats.sgl_wrs++;
 		copy_to_txd(eq, (void *)&txinfo->sgl, &dst, txinfo->nflits * 8);
 
 		/* Need to zero-pad to a 16 byte boundary if not on one */
@@ -2755,7 +3145,7 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, mblk_t *m,
 			*(uint64_t *)dst = 0;
 
 	} else {
-		txq->imm_wrs++;
+		txq->stats.imm_wrs++;
 #ifdef DEBUG
 		ctrl = txinfo->len;
 #endif
@@ -2768,17 +3158,17 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, mblk_t *m,
 		ASSERT(ctrl == 0);
 	}
 
-	txq->txpkt_wrs++;
+	txq->stats.txpkt_wrs++;
 	return (0);
 }
 
 static void
 t4_write_flush_wr(struct sge_txq *txq)
 {
-	struct sge_eq *eq = &txq->eq;
+	t4_sge_eq_t *eq = &txq->eq;
 
 	EQ_LOCK_ASSERT_OWNED(eq);
-	ASSERT(eq->avail > 0);
+	ASSERT3U(eq->tse_avail, >, 0);
 
 	const struct fw_eq_flush_wr wr = {
 		.opcode = FW_EQ_FLUSH_WR,
@@ -2786,20 +3176,31 @@ t4_write_flush_wr(struct sge_txq *txq)
 		    V_FW_WR_LEN16(sizeof (struct fw_eq_flush_wr) / 16) |
 		    F_FW_WR_EQUEQ | F_FW_WR_EQUIQ),
 	};
-	*(struct fw_eq_flush_wr *)&eq->desc[eq->pidx] = wr;
+	*(struct fw_eq_flush_wr *)t4_eq_credit(eq, eq->tse_pidx) = wr;
 
 	const struct tx_sdesc txsd = {
 		.mp_head = NULL,
 		.mp_tail = NULL,
 		.txb_used = 0,
 		.hdls_used = 0,
-		.desc_used = 1,
+		.credits_used = 1,
 	};
-	txq->sdesc[eq->pidx] = txsd;
+	txq->sdesc[eq->tse_pidx] = txsd;
 
 	t4_tx_incr_pending(txq, 1);
 }
 
+/*
+ * Increment the flit pointer by the given number of bytes.
+ */
+static inline void *
+t4_incr_flit(void *flitp, size_t num_bytes)
+{
+	/* A flit should always start on an 8-byte boundary. */
+	ASSERT0(((uintptr_t)flitp + num_bytes) & 0x7);
+	return ((void *)((caddr_t)(flitp) + (num_bytes)));
+}
+
 static inline void
 write_ulp_cpl_sgl(struct port_info *pi, struct sge_txq *txq,
     struct txpkts *txpkts, struct txinfo *txinfo)
@@ -2807,27 +3208,25 @@ write_ulp_cpl_sgl(struct port_info *pi, struct sge_txq *txq,
 	struct ulp_txpkt *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct cpl_tx_pkt_core *cpl;
-	uintptr_t flitp, start, end;
+	void *flitp = txpkts->flitp;
 	uint64_t ctrl;
 	caddr_t dst;
+	const uintptr_t end = (uintptr_t)txq->eq.tse_spg;
 
-	ASSERT(txpkts->npkt > 0);
-
-	start = (uintptr_t)txq->eq.desc;
-	end = (uintptr_t)txq->eq.spg;
+	ASSERT3U(txpkts->npkt, >, 0);
 
 	/* Checksum offload */
 	switch (csum_to_ctrl(txinfo,
 	    CHELSIO_CHIP_VERSION(pi->adapter->params.chip), &ctrl)) {
 	case COS_SUCCESS:
-		txq->txcsum++;
+		txq->stats.txcsum++;
 		break;
 	case COS_FAIL:
 		/*
 		 * Packet will be going out with checksums which are probably
 		 * wrong but there is little we can do now.
 		 */
-		txq->csum_failed++;
+		txq->stats.csum_failed++;
 		break;
 	default:
 		break;
@@ -2840,113 +3239,131 @@ write_ulp_cpl_sgl(struct port_info *pi, struct sge_txq *txq,
 	 * bytes each), and that it can not wrap around in the middle of the
 	 * cpl_tx_pkt_core either.
 	 */
-	flitp = (uintptr_t)txpkts->flitp;
-	ASSERT((flitp & 0xf) == 0);
+	ASSERT0((uintptr_t)flitp & 0xf);
+	ASSERT3U((uintptr_t)flitp + sizeof (*ulpmc), <=, end);
 
 	/* ULP master command */
-	ulpmc = (void *)flitp;
+	ulpmc = flitp;
 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
 	ulpmc->len = htonl(howmany(sizeof (*ulpmc) + sizeof (*ulpsc) +
-	    sizeof (*cpl) + 8 * txinfo->nflits, 16));
+	    sizeof (*cpl) + FLITS_TO_BYTES(txinfo->nflits), 16));
+
+	flitp = t4_incr_flit(flitp, sizeof (*ulpmc));
+
+	/* We cannot wrap-around between the ULPTX master and subcommand. */
+	ASSERT3U((uintptr_t)flitp, <, end);
+	ASSERT3U((uintptr_t)flitp + sizeof (*ulpsc), <=, end);
 
 	/* ULP subcommand */
-	ulpsc = (void *)(ulpmc + 1);
+	ulpsc = flitp;
 	ulpsc->cmd_more = cpu_to_be32(V_ULPTX_CMD((u32)ULP_TX_SC_IMM) |
 	    F_ULP_TX_SC_MORE);
 	ulpsc->len = cpu_to_be32(sizeof (struct cpl_tx_pkt_core));
 
-	flitp += sizeof (*ulpmc) + sizeof (*ulpsc);
-	if (flitp == end)
-		flitp = start;
+	flitp = t4_incr_flit(flitp, sizeof (*ulpsc));
+
+	/* If we have reached the end, go back to the start of the ring. */
+	if ((uintptr_t)flitp == end)
+		flitp = txq->eq.tse_ring;
 
 	/* CPL_TX_PKT_XT */
-	cpl = (void *)flitp;
+	cpl = flitp;
 	cpl->ctrl0 = cpu_to_be32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
 	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(pi->adapter->pf));
 	cpl->pack = 0;
 	cpl->len = cpu_to_be16(txinfo->len);
 	cpl->ctrl1 = cpu_to_be64(ctrl);
 
-	flitp += sizeof (*cpl);
-	if (flitp == end)
-		flitp = start;
+	flitp = t4_incr_flit(flitp, sizeof (*cpl));
+
+	/* The CPL cannot wrap-around the end. */
+	ASSERT3U((uintptr_t)flitp, <=, end);
+
+	if ((uintptr_t)flitp == end)
+		flitp = txq->eq.tse_ring;
 
 	/* SGL for this frame */
 	dst = (caddr_t)flitp;
-	copy_to_txd(&txq->eq, (void *)&txinfo->sgl, &dst, txinfo->nflits * 8);
-	flitp = (uintptr_t)dst;
+	copy_to_txd(&txq->eq, (void *)&txinfo->sgl, &dst,
+	    FLITS_TO_BYTES(txinfo->nflits));
+	flitp = (void *)dst;
 
 	/* Zero pad and advance to a 16 byte boundary if not already at one. */
-	if (flitp & 0xf) {
-
-		/* no matter what, flitp should be on an 8 byte boundary */
-		ASSERT((flitp & 0x7) == 0);
+	if (((uintptr_t)flitp & 0xf) != 0) {
+		/* A flit should always be on an 8 byte boundary. */
+		ASSERT(((uintptr_t)flitp & 0x7) == 0);
 
 		*(uint64_t *)flitp = 0;
-		flitp += sizeof (uint64_t);
+		flitp = t4_incr_flit(flitp, FLIT_NUM_BYTES);
 		txpkts->nflits++;
 	}
 
-	if (flitp == end)
-		flitp = start;
+	ASSERT0((uintptr_t)flitp & 0xf);
+
+	/*
+	 * The SGL can wrap-around, but let's make sure we stayed within the
+	 * ring.
+	 */
+	ASSERT3U((uintptr_t)flitp, <=, end);
+
+	if ((uintptr_t)flitp == end)
+		flitp = txq->eq.tse_ring;
 
-	txpkts->flitp = (void *)flitp;
+	txpkts->flitp = flitp;
 }
 
 static inline void
-copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
+copy_to_txd(t4_sge_eq_t *eq, caddr_t from, caddr_t *to, size_t len)
 {
-	if ((uintptr_t)(*to) + len <= (uintptr_t)eq->spg) {
+	/*
+	 * Technically the maximum WR size is lower, but this assert is just to
+	 * make sure nothing funky is going on with len. We subtract one from
+	 * the qsize because you can never totally fill the queue.
+	 */
+	ASSERT3U(len, <=, FLITS_TO_BYTES(EQ_HC_TO_FLITS(eq->tse_qsize - 1)));
+
+	if ((uintptr_t)(*to) + len <= (uintptr_t)eq->tse_spg) {
 		bcopy(from, *to, len);
 		(*to) += len;
 	} else {
-		int portion = (uintptr_t)eq->spg - (uintptr_t)(*to);
+		/*
+		 * The number of bytes left before the end of the ring (which is
+		 * the status page).
+		 */
+		size_t portion = (uintptr_t)eq->tse_spg - (uintptr_t)(*to);
 
+		ASSERT3U(portion, <, len);
 		bcopy(from, *to, portion);
 		from += portion;
 		portion = len - portion;	/* remaining */
-		bcopy(from, (void *)eq->desc, portion);
-		(*to) = (caddr_t)eq->desc + portion;
+		bcopy(from, eq->tse_ring, portion);
+		(*to) = (caddr_t)eq->tse_ring + portion;
 	}
 }
 
 static void
 t4_tx_ring_db(struct sge_txq *txq)
 {
-	struct sge_eq *eq = &txq->eq;
+	t4_sge_eq_t *eq = &txq->eq;
 	struct adapter *sc = txq->port->adapter;
 	int val, db_mode;
-	t4_doorbells_t db = eq->doorbells;
+	t4_doorbells_t db = eq->tse_doorbells;
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 
-	if (eq->pending > 1)
+	/*
+	 * A Write-Combining Work Request implicitly uses exactly one credit.
+	 * If we have produced more than one credit, then fall back to the
+	 * Write-Combining UDB, then plain UDB, and finally KDB.
+	 */
+	if (eq->tse_pending > 1)
 		db &= ~DOORBELL_WCWR;
 
-	if (eq->pending > eq->pidx) {
-		int offset = eq->cap - (eq->pending - eq->pidx);
-
-		/* pidx has wrapped around since last doorbell */
-
-		(void) ddi_dma_sync(eq->desc_dhdl,
-		    offset * sizeof (struct tx_desc), 0,
-		    DDI_DMA_SYNC_FORDEV);
-		(void) ddi_dma_sync(eq->desc_dhdl,
-		    0, eq->pidx * sizeof (struct tx_desc),
-		    DDI_DMA_SYNC_FORDEV);
-	} else if (eq->pending > 0) {
-		(void) ddi_dma_sync(eq->desc_dhdl,
-		    (eq->pidx - eq->pending) * sizeof (struct tx_desc),
-		    eq->pending * sizeof (struct tx_desc),
-		    DDI_DMA_SYNC_FORDEV);
-	}
+	(void) ddi_dma_sync(eq->tse_ring_dhdl, 0, 0, DDI_DMA_SYNC_FORDEV);
 
 	membar_producer();
 
-	if (t4_cver_eq(sc, CHELSIO_T4))
-		val = V_PIDX(eq->pending);
-	else
-		val = V_PIDX_T5(eq->pending);
+	val = V_PIDX(eq->tse_pending);
 
 	db_mode = (1 << (ffs(db) - 1));
 	switch (db_mode) {
@@ -2957,17 +3374,20 @@ t4_tx_ring_db(struct sge_txq *txq)
 			 * (udb_qid is always 0).  Only queues with
 			 * doorbell segments can do WCWR.
 			 */
-			ASSERT(eq->udb_qid == 0 && eq->pending == 1);
+			ASSERT(eq->tse_udb_qid == 0 && eq->tse_pending == 1);
 
-			const uint_t desc_idx =
-			    eq->pidx != 0 ? eq->pidx - 1 : eq->cap - 1;
-			uint64_t *src = (uint64_t *)&eq->desc[desc_idx];
+			const uint16_t credit_idx = eq->tse_pidx != 0 ?
+			    eq->tse_pidx - 1 : eq->tse_qsize - 1;
+			uint64_t *src = t4_eq_credit(eq, credit_idx);
 			volatile uint64_t *dst =
-			    (uint64_t *)(eq->udb + UDBS_WR_OFFSET);
+			    (uint64_t *)(eq->tse_udb + UDBS_WR_OFFSET);
 
-			/* Copy the 8 flits of the TX descriptor to the DB */
+			/*
+			 * Copy the 8 flits of the host credit to the UDB WCWR
+			 * space (the second 64 bytes of the 128 byte segment).
+			 */
 			const uint_t flit_count =
-			    sizeof (struct tx_desc) / sizeof (uint64_t);
+			    sizeof (t4_eq_host_credit_t) / sizeof (uint64_t);
 			for (uint_t i = 0; i < flit_count; i++) {
 				/*
 				 * Perform the copy directly through the BAR
@@ -2986,50 +3406,59 @@ t4_tx_ring_db(struct sge_txq *txq)
 		case DOORBELL_UDB:
 		case DOORBELL_UDBWC:
 			ddi_put32(sc->bar2_hdl,
-			    (uint32_t *)(eq->udb + UDBS_DB_OFFSET),
-			    LE_32(V_QID(eq->udb_qid) | val));
+			    (uint32_t *)(eq->tse_udb + UDBS_DB_OFFSET),
+			    LE_32(V_QID(eq->tse_udb_qid) | val));
 			membar_producer();
 			break;
 
 		case DOORBELL_KDB:
 			t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL),
-			    V_QID(eq->cntxt_id) | val);
+			    V_QID(eq->tse_cntxt_id) | val);
 			break;
 	}
 
-	eq->pending = 0;
+	eq->tse_pending = 0;
 }
 
 /*
- * Reclaim consumed descriptors from egress queue.  This will be capped at an
- * upper bound of `howmany`.  The corresponding mblks will be freed inline,
- * unless a non-NULL `defer_freemp` is provided, in which case the to-be-freed
- * mblk chain will be provided to the caller.
+ * Attempt to reclaim consumed host credits from the given Tx EQ. Reclamation
+ * proceeds one tx_sdesc entry at a time until either no more credits are
+ * available or at least 'howmany' credits have been reclaimed. The mblks
+ * associated with the reclaimed credits are freed inline unless a non-NULL
+ * 'defer_freemp' is provided; in that case the mblk chain is handed to the
+ * caller, who is then responsible for freeing it.
  *
- * Returns the number of descriptors which underwent reclamation.
+ * Returns the number of reclaimed host credits.
+ *
+ * When debugging/analyzing this code it is important to remember that host
+ * credits != mblks.
  */
-static uint_t
-t4_tx_reclaim_descs(struct sge_txq *txq, uint_t howmany, mblk_t **defer_freemp)
+static uint16_t
+t4_tx_reclaim_credits(struct sge_txq *txq, uint16_t howmany,
+    mblk_t **defer_freemp)
 {
-	struct sge_eq *eq = &txq->eq;
+	t4_sge_eq_t *eq = &txq->eq;
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 
-	const uint_t cur_cidx = BE_16(eq->spg->cidx);
-	const uint_t reclaim_avail = (cur_cidx >= eq->cidx) ?
-	    (cur_cidx - eq->cidx) : (cur_cidx + eq->cap - eq->cidx);
+	const uint16_t cur_cidx = BE_16(eq->tse_spg->cidx);
+	const uint16_t reclaim_avail = (cur_cidx >= eq->tse_cidx) ?
+	    (cur_cidx - eq->tse_cidx) :
+	    (cur_cidx + eq->tse_qsize - eq->tse_cidx);
 
 	if (reclaim_avail == 0) {
 		return (0);
 	}
 
-	uint_t txb_freed = 0, hdl_freed = 0, reclaimed = 0;
+	uint_t txb_freed = 0, hdl_freed = 0;
+	uint16_t reclaimed = 0;
+
 	do {
-		struct tx_sdesc *txsd = &txq->sdesc[eq->cidx];
-		const uint_t ndesc = txsd->desc_used;
+		struct tx_sdesc *txsd = &txq->sdesc[eq->tse_cidx];
+		const uint16_t ncredits = txsd->credits_used;
 
 		/* Firmware doesn't return "partial" credits. */
-		ASSERT3U(reclaimed + ndesc, <=, reclaim_avail);
+		ASSERT3U(reclaimed + ncredits, <=, reclaim_avail);
 
 		if (txsd->mp_head != NULL) {
 			/*
@@ -3060,24 +3489,24 @@ t4_tx_reclaim_descs(struct sge_txq *txq, uint_t howmany, mblk_t **defer_freemp)
 			 */
 			ASSERT0(txsd->txb_used);
 			ASSERT0(txsd->hdls_used);
-			ASSERT3U(ndesc, ==, 1);
+			ASSERT3U(ncredits, ==, 1);
 		}
 
 		txb_freed += txsd->txb_used;
 		hdl_freed += txsd->hdls_used;
-		reclaimed += ndesc;
+		reclaimed += ncredits;
 
-		eq->cidx += ndesc;
-		if (eq->cidx >= eq->cap) {
-			eq->cidx -= eq->cap;
+		eq->tse_cidx += ncredits;
+		if (eq->tse_cidx >= eq->tse_qsize) {
+			eq->tse_cidx -= eq->tse_qsize;
 		}
 	} while (reclaimed < reclaim_avail && reclaimed < howmany);
 
-	eq->avail += reclaimed;
+	eq->tse_avail += reclaimed;
 	txq->txb_avail += txb_freed;
 	txq->tx_dhdl_avail += hdl_freed;
 
-	ASSERT3U(eq->avail, <, eq->cap);
+	ASSERT3U(eq->tse_avail, <, eq->tse_qsize);
 	ASSERT3U(txq->tx_dhdl_avail, <=, txq->tx_dhdl_total);
 
 	for (; hdl_freed; hdl_freed--) {
@@ -3090,11 +3519,11 @@ t4_tx_reclaim_descs(struct sge_txq *txq, uint_t howmany, mblk_t **defer_freemp)
 }
 
 static int
-t4_handle_cpl_msg(struct sge_iq *iq, const struct rss_header *rss, mblk_t *mp)
+t4_handle_cpl_msg(t4_sge_iq_t *iq, const struct rss_header *rss, mblk_t *mp)
 {
 	const uint8_t opcode = rss->opcode;
 
-	DTRACE_PROBE4(t4__cpl_msg, struct sge_iq *, iq, uint8_t, opcode,
+	DTRACE_PROBE4(t4__cpl_msg, t4_sge_iq_t *, iq, uint8_t, opcode,
 	    const struct rss_header *, rss, mblk_t *, mp);
 
 	switch (opcode) {
@@ -3107,9 +3536,16 @@ t4_handle_cpl_msg(struct sge_iq *iq, const struct rss_header *rss, mblk_t *mp)
 		t4_sge_egr_update(iq, rss);
 		return (0);
 	case CPL_RX_PKT:
-		return (t4_eth_rx(iq, rss, mp));
+		/*
+		 * Packet RX is expected to be handled in t4_process_rx_iq().
+		 * CPL messages of such a type should not make it here.
+		 */
+		cxgb_printf(iq->tsi_adapter->dip, CE_WARN,
+		    "unexpected unhandled CPL_RX_PKT msg");
+		freemsg(mp);
+		return (0);
 	default:
-		cxgb_printf(iq->adapter->dip, CE_WARN,
+		cxgb_printf(iq->tsi_adapter->dip, CE_WARN,
 		    "unhandled CPL opcode 0x%02x", opcode);
 		if (mp != NULL) {
 			freemsg(mp);
@@ -3119,14 +3555,14 @@ t4_handle_cpl_msg(struct sge_iq *iq, const struct rss_header *rss, mblk_t *mp)
 }
 
 static int
-t4_handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss)
+t4_handle_fw_msg(t4_sge_iq_t *iq, const struct rss_header *rss)
 {
 	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
 	const uint8_t msg_type = cpl->type;
 	const struct rss_header *rss2;
-	struct adapter *sc = iq->adapter;
+	struct adapter *sc = iq->tsi_adapter;
 
-	DTRACE_PROBE3(t4__fw_msg, struct sge_iq *, iq, uint8_t, msg_type,
+	DTRACE_PROBE3(t4__fw_msg, t4_sge_iq_t *, iq, uint8_t, msg_type,
 	    const struct rss_header *, rss);
 
 	switch (msg_type) {
@@ -3142,115 +3578,34 @@ t4_handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss)
 	}
 }
 
-static int
-t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, mblk_t *m)
-{
-	bool csum_ok;
-	uint16_t err_vec;
-	struct sge_rxq *rxq = (void *)iq;
-	struct mblk_pair chain = {0};
-	struct adapter *sc = iq->adapter;
-	const struct cpl_rx_pkt *cpl = t4_rss_payload(rss);
-
-	m->b_rptr += sc->sge.pktshift;
-
-	/* Compressed error vector is enabled for T6 only */
-	if (sc->params.tp.rx_pkt_encap)
-		/* It is enabled only in T6 config file */
-		err_vec = G_T6_COMPR_RXERR_VEC(ntohs(cpl->err_vec));
-	else
-		err_vec = ntohs(cpl->err_vec);
-
-	csum_ok = cpl->csum_calc && !err_vec;
-	/* TODO: what about cpl->ip_frag? */
-	if (csum_ok && !cpl->ip_frag) {
-		mac_hcksum_set(m, 0, 0, 0, 0xffff,
-		    HCK_FULLCKSUM_OK | HCK_FULLCKSUM |
-		    HCK_IPV4_HDRCKSUM_OK);
-		rxq->rxcsum++;
-	}
-
-	/* Add to the chain that we'll send up */
-	if (chain.head != NULL)
-		chain.tail->b_next = m;
-	else
-		chain.head = m;
-	chain.tail = m;
-
-	t4_mac_rx(rxq->port, rxq, chain.head);
-
-	rxq->rxpkts++;
-	rxq->rxbytes  += be16_to_cpu(cpl->len);
-	return (0);
-}
-
-#define	FL_HW_IDX(idx)	((idx) >> 3)
-
-static inline void
-ring_fl_db(struct adapter *sc, struct sge_fl *fl)
+static void
+t4_fl_ring_db(struct sge_fl *fl)
 {
-	int desc_start, desc_last, ndesc;
-	uint32_t v = sc->params.arch.sge_fl_db;
-
-	ndesc = FL_HW_IDX(fl->pending);
-
-	/* Hold back one credit if pidx = cidx */
-	if (FL_HW_IDX(fl->pidx) == FL_HW_IDX(fl->cidx))
-		ndesc--;
-
-	/*
-	 * There are chances of ndesc modified above (to avoid pidx = cidx).
-	 * If there is nothing to post, return.
-	 */
-	if (ndesc <= 0)
-		return;
-
-	desc_last = FL_HW_IDX(fl->pidx);
+	struct adapter *sc = t4_fl_to_iq(fl)->tsi_adapter;
+	t4_sge_eq_t *eq = &fl->eq;
 
-	if (fl->pidx < fl->pending) {
-		/* There was a wrap */
-		desc_start = FL_HW_IDX(fl->pidx + fl->cap - fl->pending);
-
-		/* From desc_start to the end of list */
-		(void) ddi_dma_sync(fl->dhdl, desc_start * RX_FL_ESIZE, 0,
-		    DDI_DMA_SYNC_FORDEV);
-
-		/* From start of list to the desc_last */
-		if (desc_last != 0)
-			(void) ddi_dma_sync(fl->dhdl, 0, desc_last *
-			    RX_FL_ESIZE, DDI_DMA_SYNC_FORDEV);
-	} else {
-		/* There was no wrap, sync from start_desc to last_desc */
-		desc_start = FL_HW_IDX(fl->pidx - fl->pending);
-		(void) ddi_dma_sync(fl->dhdl, desc_start * RX_FL_ESIZE,
-		    ndesc * RX_FL_ESIZE, DDI_DMA_SYNC_FORDEV);
-	}
+	EQ_LOCK_ASSERT_OWNED(eq);
 
-	if (t4_cver_eq(sc, CHELSIO_T4))
-		v |= V_PIDX(ndesc);
-	else
-		v |= V_PIDX_T5(ndesc);
-	v |= V_QID(fl->cntxt_id) | V_PIDX(ndesc);
+	(void) ddi_dma_sync(eq->tse_ring_dhdl, 0, 0, DDI_DMA_SYNC_FORDEV);
 
 	membar_producer();
 
-	t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), v);
+	t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL),
+	    sc->params.arch.sge_fl_db |
+	    V_QID(eq->tse_cntxt_id) |
+	    V_PIDX(eq->tse_pending));
 
-	/*
-	 * Update pending count:
-	 * Deduct the number of descriptors posted
-	 */
-	fl->pending -= ndesc * 8;
+	eq->tse_pending = 0;
 }
 
 static void
-t4_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss)
+t4_sge_egr_update(t4_sge_iq_t *iq, const struct rss_header *rss)
 {
-	struct adapter *sc = iq->adapter;
+	struct adapter *sc = iq->tsi_adapter;
 	const struct cpl_sge_egr_update *cpl = t4_rss_payload(rss);
 	const uint_t qid = G_EGR_QID(BE_32(cpl->opcode_qid));
 	struct sge_txq *txq = (struct sge_txq *)(*t4_eqmap_slot(sc, qid));
-	struct sge_eq *eq = &txq->eq;
+	t4_sge_eq_t *eq = &txq->eq;
 
 	/*
 	 * Get a "live" snapshot of the flags and PIDX state from the TXQ.
@@ -3260,8 +3615,8 @@ t4_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss)
 	 * reclaim.
 	 */
 	membar_consumer();
-	const uint16_t live_pidx = BE_16(eq->pidx);
-	const t4_eq_flags_t live_flags = eq->flags;
+	const uint16_t live_pidx = BE_16(eq->tse_pidx);
+	const t4_eq_flags_t live_flags = eq->tse_flags;
 
 	if ((live_flags & EQ_CORKED) == 0 &&
 	    (cpl->pidx != cpl->cidx || live_pidx != cpl->cidx)) {
@@ -3272,7 +3627,7 @@ t4_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss)
 		 *    allocate descriptors (or memory) while attempting to place
 		 *    a packet in the TXQ.
 		 *
-		 * 2. There are additional transmit descriptors in the EQ which
+		 * 2. There are outstanding transmit descriptors in the EQ which
 		 *    will trigger a subsequent SGE_EGR_UPDATE notification.
 		 *
 		 * When those conditions are met, it is safe to skip performing
@@ -3288,10 +3643,10 @@ t4_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss)
 	bool do_mac_update = false;
 
 	TXQ_LOCK(txq);
-	(void) t4_tx_reclaim_descs(txq, eq->qsize, &freemp);
-	if (eq->flags & EQ_CORKED && eq->avail != 0) {
+	(void) t4_tx_reclaim_credits(txq, eq->tse_qsize, &freemp);
+	if (eq->tse_flags & EQ_CORKED && eq->tse_avail != 0) {
 		do_mac_update = true;
-		eq->flags &= ~EQ_CORKED;
+		eq->tse_flags &= ~EQ_CORKED;
 	}
 	TXQ_UNLOCK(txq);
 
@@ -3304,7 +3659,7 @@ t4_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss)
 #define	KS_UINIT(x)	kstat_named_init(&kstatp->x, #x, KSTAT_DATA_ULONG)
 #define	KS_CINIT(x)	kstat_named_init(&kstatp->x, #x, KSTAT_DATA_CHAR)
 #define	KS_U_SET(x, y)	kstatp->x.value.ul = (y)
-#define	KS_U_FROM(x, y)	kstatp->x.value.ul = (y)->x
+#define	KS_U_FROM(x, y)	kstatp->x.value.ul = (y)->stats.x
 #define	KS_C_SET(x, ...)	\
 			(void) snprintf(kstatp->x.value.c, 16,  __VA_ARGS__)
 
@@ -3313,10 +3668,10 @@ t4_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss)
  */
 struct cxgbe_port_config_kstats {
 	kstat_named_t idx;
-	kstat_named_t nrxq;
-	kstat_named_t ntxq;
-	kstat_named_t first_rxq;
-	kstat_named_t first_txq;
+	kstat_named_t rxq_count;
+	kstat_named_t txq_count;
+	kstat_named_t rxq_start;
+	kstat_named_t txq_start;
 	kstat_named_t controller;
 	kstat_named_t factory_mac_address;
 };
@@ -3360,18 +3715,18 @@ setup_port_config_kstats(struct port_info *pi)
 	kstatp = (struct cxgbe_port_config_kstats *)ksp->ks_data;
 
 	KS_UINIT(idx);
-	KS_UINIT(nrxq);
-	KS_UINIT(ntxq);
-	KS_UINIT(first_rxq);
-	KS_UINIT(first_txq);
+	KS_UINIT(rxq_count);
+	KS_UINIT(txq_count);
+	KS_UINIT(rxq_start);
+	KS_UINIT(txq_start);
 	KS_CINIT(controller);
 	KS_CINIT(factory_mac_address);
 
 	KS_U_SET(idx, pi->port_id);
-	KS_U_SET(nrxq, pi->nrxq);
-	KS_U_SET(ntxq, pi->ntxq);
-	KS_U_SET(first_rxq, pi->first_rxq);
-	KS_U_SET(first_txq, pi->first_txq);
+	KS_U_SET(rxq_count, pi->rxq_count);
+	KS_U_SET(txq_count, pi->txq_count);
+	KS_U_SET(rxq_start, pi->rxq_start);
+	KS_U_SET(txq_start, pi->txq_start);
 	KS_C_SET(controller, "%s%d", ddi_driver_name(pdip),
 	    ddi_get_instance(pdip));
 	KS_C_SET(factory_mac_address, "%02X%02X%02X%02X%02X%02X",
@@ -3492,11 +3847,10 @@ struct rxq_kstats {
 	kstat_named_t rxcsum;
 	kstat_named_t rxpkts;
 	kstat_named_t rxbytes;
-	kstat_named_t nomem;
 };
 
 static kstat_t *
-setup_rxq_kstats(struct port_info *pi, struct sge_rxq *rxq, int idx)
+setup_rxq_kstats(struct port_info *pi, struct sge_rxq *rxq, uint_t q_idx)
 {
 	struct kstat *ksp;
 	struct rxq_kstats *kstatp;
@@ -3504,14 +3858,14 @@ setup_rxq_kstats(struct port_info *pi, struct sge_rxq *rxq, int idx)
 	char str[16];
 
 	ndata = sizeof (struct rxq_kstats) / sizeof (kstat_named_t);
-	(void) snprintf(str, sizeof (str), "rxq%u", idx);
+	(void) snprintf(str, sizeof (str), "rxq%u", q_idx);
 
 	ksp = kstat_create(T4_PORT_NAME, ddi_get_instance(pi->dip), str, "rxq",
 	    KSTAT_TYPE_NAMED, ndata, 0);
 	if (ksp == NULL) {
 		cxgb_printf(pi->dip, CE_WARN,
-		    "%s: failed to initialize rxq kstats for queue %d.",
-		    __func__, idx);
+		    "%s: failed to initialize rxq kstats for queue %u.",
+		    __func__, q_idx);
 		return (NULL);
 	}
 
@@ -3520,7 +3874,6 @@ setup_rxq_kstats(struct port_info *pi, struct sge_rxq *rxq, int idx)
 	KS_UINIT(rxcsum);
 	KS_UINIT(rxpkts);
 	KS_UINIT(rxbytes);
-	KS_UINIT(nomem);
 
 	ksp->ks_update = update_rxq_kstats;
 	ksp->ks_private = (void *)rxq;
@@ -3541,7 +3894,6 @@ update_rxq_kstats(kstat_t *ksp, int rw)
 	KS_U_FROM(rxcsum, rxq);
 	KS_U_FROM(rxpkts, rxq);
 	KS_U_FROM(rxbytes, rxq);
-	KS_U_FROM(nomem, rxq);
 
 	return (0);
 }
@@ -3661,20 +4013,14 @@ rxbuf_cache_create(struct rxbuf_cache_params *p)
 	    rxbuf_ctor, rxbuf_dtor, NULL, p, NULL, 0);
 }
 
-/*
- * If ref_cnt is more than 1 then those many calls to rxbuf_free will
- * have to be made before the rxb is released back to the kmem_cache.
- */
 static struct rxbuf *
-rxbuf_alloc(kmem_cache_t *cache, int kmflags, uint_t ref_cnt)
+rxbuf_alloc(kmem_cache_t *cache, int kmflags)
 {
 	struct rxbuf *rxb;
 
-	ASSERT(ref_cnt > 0);
-
 	rxb = kmem_cache_alloc(cache, kmflags);
 	if (rxb != NULL) {
-		rxb->ref_cnt = ref_cnt;
+		rxb->ref_cnt = 1;
 		rxb->cache = cache;
 	}
 
diff --git a/usr/src/uts/intel/cxgbe/t4nex/Makefile b/usr/src/uts/intel/cxgbe/t4nex/Makefile
index 5353461f36..eda61150f1 100644
--- a/usr/src/uts/intel/cxgbe/t4nex/Makefile
+++ b/usr/src/uts/intel/cxgbe/t4nex/Makefile
@@ -13,7 +13,7 @@
 # Copyright (c) 2013 by Chelsio Communications, Inc. All rights reserved.
 #
 # Copyright (c) 2018, Joyent, Inc.
-# Copyright 2023 Oxide Computer Company
+# Copyright 2025 Oxide Computer Company
 
 #
 # This makefile drives the production of the Chelsio Terminator 4 10G Ethernet
@@ -53,8 +53,11 @@ CFLAGS += -I$(UTSBASE)/common/io/cxgbe -I$(UTSBASE)/common/io/cxgbe/common \
 #
 LDFLAGS += -N misc/mac -N drv/ip
 
-# needs work
-SMOFF += all_func_returns,snprintf_overflow
+# "common" code is still not smatch clean
+T4NEX_SMOFF_OBJS	= t4_hw.o cudbg_lib.o cudbg_wtp.o cudbg_flash_utils.o \
+			fastlz_api.o
+T4NEX_SMOFF_PATHS	= $(T4NEX_SMOFF_OBJS:%=$(OBJS_DIR)/%)
+$(T4NEX_SMOFF_PATHS)	:= SMOFF += all_func_returns
 
 #
 # Default build targets.
-- 
2.51.2