From 01fb84c6e8e2cecbedbce08d9acba7e1538e70bf Mon Sep 17 00:00:00 2001 From: Ryan Zezeski Date: Wed, 7 Jan 2026 10:19:36 -0500 Subject: [PATCH] 17526 cxgbe could use more queues 17527 cxgbe should be smatch clean 17528 cxgbe should always be mac ring capable Portions contributed by: Patrick Mooney Reviewed by: Robert Mustacchi Change-Id: I719cb8b599b7de95d65055d65367a704e22b8d9c --- usr/src/uts/common/io/cxgbe/common/common.h | 2 +- usr/src/uts/common/io/cxgbe/common/t4_hw.c | 30 +- usr/src/uts/common/io/cxgbe/shared/shared.h | 4 +- usr/src/uts/common/io/cxgbe/t4nex/adapter.h | 966 +++--- usr/src/uts/common/io/cxgbe/t4nex/cudbg.h | 2 +- usr/src/uts/common/io/cxgbe/t4nex/cudbg_lib.c | 2 +- usr/src/uts/common/io/cxgbe/t4nex/t4_ioctl.c | 1 - usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c | 371 +-- usr/src/uts/common/io/cxgbe/t4nex/t4_nexus.c | 2357 ++++++++------ usr/src/uts/common/io/cxgbe/t4nex/t4_sge.c | 2894 +++++++++-------- usr/src/uts/intel/cxgbe/t4nex/Makefile | 9 +- 11 files changed, 3820 insertions(+), 2818 deletions(-) diff --git a/usr/src/uts/common/io/cxgbe/common/common.h b/usr/src/uts/common/io/cxgbe/common/common.h index 22a28e4a3d..cabe280784 100644 --- a/usr/src/uts/common/io/cxgbe/common/common.h +++ b/usr/src/uts/common/io/cxgbe/common/common.h @@ -583,7 +583,7 @@ unsigned int t4_link_fwcap_to_speed(fw_port_cap32_t caps); fw_port_cap32_t t4_link_fwcap_to_fwspeed(fw_port_cap32_t acaps); int t4_link_set_autoneg(struct port_info *pi, u8 autoneg, fw_port_cap32_t *new_caps); -int t4_link_set_pause(struct port_info *pi, cc_pause_t pause, +void t4_link_set_pause(struct port_info *pi, cc_pause_t pause, fw_port_cap32_t *new_caps); int t4_link_set_fec(struct port_info *pi, cc_fec_t fec, fw_port_cap32_t *new_caps); diff --git a/usr/src/uts/common/io/cxgbe/common/t4_hw.c b/usr/src/uts/common/io/cxgbe/common/t4_hw.c index 9fb9f9301c..7757fd0b51 100644 --- a/usr/src/uts/common/io/cxgbe/common/t4_hw.c +++ b/usr/src/uts/common/io/cxgbe/common/t4_hw.c @@ -449,7 +449,7 
@@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd, u32 ctl_reg = PF_REG(mbox, A_CIM_PF_MAILBOX_CTRL); u32 ctl; __be64 cmd_rpl[MBOX_LEN/8]; - struct t4_mbox_list entry; + t4_mbox_waiter_t entry; u32 pcie_fw; if ((size & 15) || size > MBOX_LEN) @@ -469,7 +469,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd, * wait [for a while] till we're at the front [or bail out with an * EBUSY] ... */ - t4_mbox_list_add(adap, &entry); + t4_mbox_waiter_add(adap, &entry); for (i = 0; ; i++) { /* @@ -481,28 +481,15 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd, */ pcie_fw = t4_read_reg(adap, A_PCIE_FW); if (i > 4*timeout || (pcie_fw & F_PCIE_FW_ERR)) { - t4_mbox_list_del(adap, &entry); + t4_mbox_waiter_remove(adap, &entry); t4_report_fw_error(adap); ret = (pcie_fw & F_PCIE_FW_ERR) ? -ENXIO : -EBUSY; T4_RECORD_MBOX(adap, cmd, size, ret, 0); return ret; } - /* - * If we're at the head, break out and start the mailbox - * protocol. - */ - if (t4_mbox_list_first_entry(adap) == &entry) + if (t4_mbox_wait_owner(adap, MBOX_CMD_DELAY, sleep_ok)) { break; - - /* - * Delay for a bit before checking again ... - */ - if (sleep_ok) { - usleep_range(MIN_MBOX_CMD_DELAY, MBOX_CMD_DELAY); - } else { - T4_OS_TOUCH_NMI_WATCHDOG(); - udelay(MBOX_CMD_DELAY); } } #ifdef T4_OS_LOG_MBOX_CMDS @@ -524,7 +511,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd, * mailbox atomic access list and report the error to our caller. */ if (v != X_MBOWNER_PL) { - t4_mbox_list_del(adap, &entry); + t4_mbox_waiter_remove(adap, &entry); t4_report_fw_error(adap); ret = (v == X_MBOWNER_FW) ? 
-EBUSY : -ETIMEDOUT; T4_RECORD_MBOX(adap, cmd, size, access, ret); @@ -597,7 +584,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd, */ get_mbox_rpl(adap, cmd_rpl, size/8, data_reg); t4_write_reg(adap, ctl_reg, V_MBOWNER(X_MBOWNER_NONE)); - t4_mbox_list_del(adap, &entry); + t4_mbox_waiter_remove(adap, &entry); T4_RECORD_MBOX(adap, cmd_rpl, size, access, i + 1); @@ -625,7 +612,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd, * the error and also check to see if the firmware reported any * errors ... */ - t4_mbox_list_del(adap, &entry); + t4_mbox_waiter_remove(adap, &entry); ret = (pcie_fw & F_PCIE_FW_ERR) ? -ENXIO : -ETIMEDOUT; T4_RECORD_MBOX(adap, cmd, size, access, ret); @@ -9099,7 +9086,7 @@ int t4_link_set_autoneg(struct port_info *pi, u8 autoneg, return 0; } -int t4_link_set_pause(struct port_info *pi, cc_pause_t pause, +void t4_link_set_pause(struct port_info *pi, cc_pause_t pause, fw_port_cap32_t *new_caps) { struct link_config *lc = &pi->link_cfg; @@ -9128,7 +9115,6 @@ int t4_link_set_pause(struct port_info *pi, cc_pause_t pause, caps |= FW_PORT_CAP32_FORCE_PAUSE; *new_caps = caps; - return 0; } #define T4_LINK_FEC_MASK V_FW_PORT_CAP32_FEC(M_FW_PORT_CAP32_FEC) diff --git a/usr/src/uts/common/io/cxgbe/shared/shared.h b/usr/src/uts/common/io/cxgbe/shared/shared.h index 8a1f682be0..158728427f 100644 --- a/usr/src/uts/common/io/cxgbe/shared/shared.h +++ b/usr/src/uts/common/io/cxgbe/shared/shared.h @@ -21,7 +21,7 @@ */ /* - * Copyright 2024 Oxide Computer Company + * Copyright 2025 Oxide Computer Company */ #ifndef __CXGBE_SHARED_H @@ -51,7 +51,7 @@ #define CH_DBG(sc, category, fmt, ...) 
do {} while (0) #endif -extern int cxgb_printf(dev_info_t *dip, int level, char *f, ...); +extern void cxgb_printf(dev_info_t *dip, int level, char *f, ...); /* Attach/detach logic used by cxgbe, calling into t4nex */ struct port_info; diff --git a/usr/src/uts/common/io/cxgbe/t4nex/adapter.h b/usr/src/uts/common/io/cxgbe/t4nex/adapter.h index ea147b19da..34659a32c6 100644 --- a/usr/src/uts/common/io/cxgbe/t4nex/adapter.h +++ b/usr/src/uts/common/io/cxgbe/t4nex/adapter.h @@ -30,129 +30,122 @@ #include #include #include -#include +#include #include #include +#include #include "firmware/t4fw_interface.h" #include "shared.h" struct adapter; +struct port_info; typedef struct adapter adapter_t; +struct sge_fl; + +/* See the _Ingress Context Contents_ section of the T4 Programmers Guide. */ +typedef enum t4_iq_esize { + T4_IQ_ESIZE_16B = 0, + T4_IQ_ESIZE_32B = 1, + T4_IQ_ESIZE_64B = 2, + T4_IQ_ESIZE_128B = 3, +} t4_iq_esize_t; #define FW_IQ_QSIZE 256 -#define FW_IQ_ESIZE 64 /* At least 64 mandated by the firmware spec */ +/* At least 64 bytes mandated by the firmware spec */ +#define FW_IQ_ESIZE T4_IQ_ESIZE_64B + +#define T4_RX_DEF_QSIZE 1024 +/* At least 64 so CPL_RX_PKT will fit */ +#define RX_IQ_ESIZE T4_IQ_ESIZE_64B + +/* A flit is an 8 byte quantity. */ +#define FLIT_NUM_BYTES 8 +#define FLITS_TO_BYTES(nflits) ((nflits) * FLIT_NUM_BYTES) -#define RX_IQ_QSIZE 1024 -#define RX_IQ_ESIZE 64 /* At least 64 so CPL_RX_PKT will fit */ +/* + * Egress Queues (EQ) are made up of units called "host credits". Each credit is + * always 8 flits (64 bytes) in size. The number of entries in the queue as well + * as the producer and consumer indexes (pidx/cidx) are phrased in units of + * credits. + * + * A freelist (FL) is a type of EQ. It consists of 16-byte aligned, 8-byte + * pointers to data buffers meant to hold the data of incoming packets. Since an + * EQ host credit is always 8 flits, and an FL buffer pointer is a single flit, + * each credit holds 8 FL buffer pointers. 
+ * + */ +#define FLITS_PER_EQ_HC 8 +#define EQ_HC_SIZE (FLITS_PER_EQ_HC * FLIT_NUM_BYTES) +#define FL_BUF_PTR_PER_HC FLITS_PER_EQ_HC -#define EQ_ESIZE 64 /* All egress queues use this entry size */ +/* + * Given a number of host credits, calculate the total number of flits + * contained in those credits. + */ +#define EQ_HC_TO_FLITS(num_credits) ((num_credits) * FLITS_PER_EQ_HC) -#define RX_FL_ESIZE 64 /* 8 64bit addresses */ +/* + * Given a number of flits, calculate how many host credits are needed to hold + * them. + */ +#define EQ_FLITS_TO_HC(num_flits) (howmany(num_flits, FLITS_PER_EQ_HC)) -#define FL_BUF_SIZES 4 +/* + * We constrain the max "usable" EQ size so that there is always room for the + * status page, which may require 1-2 host credits. + */ +#define T4_MAX_EQ_SIZE (UINT16_MAX - 2) +#define T4_TX_DEF_QSIZE 1024 +#define TX_SGL_SEGS 36 -#define CTRL_EQ_QSIZE 128 +/* The maximum number of flits/credits a single WR may consume. */ +#define TX_WR_MAX_FLITS (SGE_MAX_WR_LEN / FLIT_NUM_BYTES) +#define TX_WR_MAX_CREDITS (TX_WR_MAX_FLITS / FLITS_PER_EQ_HC) -#define TX_EQ_QSIZE 1024 -#define TX_SGL_SEGS 36 -#define TX_WR_FLITS (SGE_MAX_WR_LEN / 8) +CTASSERT(TX_WR_MAX_FLITS == 64); +CTASSERT(TX_WR_MAX_CREDITS == 8); #define UDBS_SEG_SHIFT 7 /* log2(UDBS_SEG_SIZE) */ #define UDBS_DB_OFFSET 8 /* offset of the 4B doorbell in a segment */ #define UDBS_WR_OFFSET 64 /* offset of the work request in a segment */ -typedef enum t4_port_flags { - TPF_INIT_DONE = (1 << 0), - TPF_OPEN = (1 << 1), -} t4_port_flags_t; - -typedef enum t4_port_feat { - CXGBE_HW_LSO = (1 << 0), - CXGBE_HW_CSUM = (1 << 1), -} t4_port_feat_t; - -struct port_info { - dev_info_t *dip; - mac_handle_t mh; - mac_callbacks_t *mc; - int mtu; - uint8_t hw_addr[ETHERADDRL]; - - kmutex_t lock; - struct adapter *adapter; - - t4_port_flags_t flags; - - uint16_t viid; - int16_t xact_addr_filt; /* index of exact MAC address filter */ - uint16_t rss_size; /* size of VI's RSS table slice */ - uint16_t ntxq; /* # 
of tx queues */ - uint16_t first_txq; /* index of first tx queue */ - uint16_t nrxq; /* # of rx queues */ - uint16_t first_rxq; /* index of first rx queue */ - uint8_t lport; /* associated offload logical port */ - int8_t mdio_addr; - uint8_t port_type; - uint8_t mod_type; - uint8_t port_id; - uint8_t tx_chan; - uint8_t rx_chan; - uint8_t rx_cchan; - uint8_t instance; /* Associated adapter instance */ - uint8_t child_inst; /* Associated child instance */ - - uint8_t tmr_idx; - int8_t pktc_idx; - uint8_t dbq_timer_idx; - - struct link_config link_cfg; - struct port_stats stats; - t4_port_feat_t features; - uint8_t macaddr_cnt; - u8 rss_mode; - u16 viid_mirror; - kstat_t *ksp_config; - kstat_t *ksp_info; - kstat_t *ksp_fec; - - u8 vivld; - u8 vin; - u8 smt_idx; +/* + * A sentinel to mark when the interrupts for an IQ are being forwarded from + * another IQ which is receiving the actual interrupt. + */ +#define INTR_FORWARDED UINT_MAX - u8 vivld_mirror; - u8 vin_mirror; - u8 smt_idx_mirror; +struct fl_desc { + uint64_t dptr[FL_BUF_PTR_PER_HC]; }; struct fl_sdesc { struct rxbuf *rxb; }; -struct tx_desc { - __be64 flit[8]; -}; +typedef struct t4_eq_host_credit { + uint64_t flit[8]; +} t4_eq_host_credit_t; struct tx_sdesc { mblk_t *mp_head; mblk_t *mp_tail; uint32_t txb_used; /* # of bytes of tx copy buffer used */ uint16_t hdls_used; /* # of dma handles used */ - uint16_t desc_used; /* # of hardware descriptors used */ + uint16_t credits_used; /* # of EQ host credits used */ uint64_t _pad; }; typedef enum t4_iq_flags { - IQ_ALLOCATED = (1 << 0), /* firmware resources allocated */ - IQ_INTR = (1 << 1), /* iq takes direct interrupt */ - IQ_HAS_FL = (1 << 2), /* iq has fl */ -} t4_iq_flags_t; + IQ_ALLOC_HOST = (1 << 0), /* host-side resources allocated */ + IQ_ALLOC_DEV = (1 << 1), /* device-side resource allocated */ + IQ_INTR = (1 << 2), /* iq takes direct interrupt */ -typedef enum t4_iq_state { - IQS_DISABLED = 0, - IQS_BUSY = 1, - IQS_IDLE = 2, -} t4_iq_state_t; + /* 
Runtime state flags: */ + IQ_ENABLED = (1 << 3), + IQ_POLLING = (1 << 4), +} t4_iq_flags_t; struct rxbuf_cache_params { dev_info_t *dip; @@ -162,8 +155,8 @@ struct rxbuf_cache_params { }; struct sge_iq_stats { - uint64_t sis_overflow; - uint64_t sis_processed; + uint64_t sis_processed; /* # entries processed from IQ */ + uint64_t sis_overflow; /* # entries bearing overflow flag */ }; /* @@ -172,141 +165,330 @@ struct sge_iq_stats { * * See: t4_iq_update_intr_cfg() and t4_iq_gts_update(). */ -typedef enum t4_intr_config { - TIC_SE_INTR_ARM = 1, - TIC_TIMER0 = (0 << 1), - TIC_TIMER1 = (1 << 1), - TIC_TIMER2 = (2 << 1), - TIC_TIMER3 = (3 << 1), - TIC_TIMER4 = (4 << 1), - TIC_TIMER5 = (5 << 1), - TIC_START_COUNTER = (6 << 1), -} t4_intr_config_t; +typedef enum t4_gts_config { + TGC_SE_INTR_ARM = 1, + TGC_TIMER0 = (0 << 1), + TGC_TIMER1 = (1 << 1), + TGC_TIMER2 = (2 << 1), + TGC_TIMER3 = (3 << 1), + TGC_TIMER4 = (4 << 1), + TGC_TIMER5 = (5 << 1), + TGC_START_COUNTER = (6 << 1), +} t4_gts_config_t; /* - * Ingress Queue: T4 is producer, driver is consumer. + * Event IQs are used for firmware events, Tx EGR updates, and IQ forwarded + * interrupts. + * + * Ethernet Rx IQs are used for receiving incoming packets. 
*/ -struct sge_iq { - t4_iq_state_t state; - t4_iq_flags_t flags; - t4_intr_config_t intr_params; - - ddi_dma_handle_t dhdl; - ddi_acc_handle_t ahdl; - - __be64 *desc; /* KVA of descriptor ring */ - uint64_t ba; /* bus address of descriptor ring */ - const __be64 *cdesc; /* current descriptor */ - struct adapter *adapter; /* associated adapter */ - uint8_t gen; /* generation bit */ - int8_t intr_pktc_idx; /* packet count threshold index */ - uint8_t esize; /* size (bytes) of each entry in the queue */ - uint16_t qsize; /* size (# of entries) of the queue */ - uint16_t cidx; /* consumer index */ - uint16_t pending; /* # of descs processed since last doorbell */ - uint16_t cntxt_id; /* SGE context id for the iq */ - uint16_t abs_id; /* absolute SGE id for the iq */ - kmutex_t lock; /* Rx access lock */ - uint8_t polling; - - struct sge_iq_stats stats; - - STAILQ_ENTRY(sge_iq) link; +typedef enum t4_iq_type { + TIQT_UNINIT, + TIQT_EVENT, + TIQT_ETH_RX, +} t4_iq_type_t; + +/* Ingress Queue: T4 is producer, driver is consumer. */ +typedef struct t4_sge_iq { + kmutex_t tsi_lock; + + t4_iq_type_t tsi_iqtype; /* Write Once */ + t4_iq_flags_t tsi_flags; /* tsi_lock */ + + /* + * This field is non-NULL only for Rx queues. It points to the event + * queue which receives interrupts on its behalf. The event queue + * processes these "forwarded interrupts" in t4_process_event_iq() and + * calls into t4_process_rx_iq() for each Rx queue with an interrupt + * notification message. + */ + struct t4_sge_iq *tsi_intr_evtq; /* Write Once */ + /* + * This field is only used by the event queues. + * + * As the event queue processes forwarded interrupt notification + * messages it adds the destination rx queue receiving the notification + * to this list. After the event queue finishes processing its own + * messages, it then uses this list to process the rx queues which have + * pending notifications. 
+ */ + list_node_t tsi_intr_fwd_node; /* tsi_lock */ + /* + * This field is used by both event queues and rx queues. + * + * For event queues this field holds the interrupt vector assigned to + * this queue. + * + * For rx queues it holds the sentinel value INTR_FORWARDED to indicate + * it has its interrupts forwarded by the event queue. In the current + * iteration of this driver all rx queues will always have their + * interrupts forwarded. + */ + uint_t tsi_intr_idx; /* Write Once */ + + ddi_dma_handle_t tsi_desc_dhdl; /* Write Once */ + ddi_acc_handle_t tsi_desc_ahdl; /* Write Once */ + + /* KVA of descriptor ring */ + void *tsi_desc; /* Write Once */ + /* bus address of descriptor ring */ + uint64_t tsi_desc_ba; /* Write Once */ + /* current descriptor (at CIDX) */ + const void *tsi_cdesc; /* tsi_lock */ + + /* Sizing and status */ + /* size of each entry in the queue */ + t4_iq_esize_t tsi_esize; /* Write Once */ + /* entry size in bytes */ + uint16_t tsi_esize_bytes; /* Write Once */ + /* number of entries in the queue */ + uint16_t tsi_qsize; /* Write Once */ + /* number of usable entries in the queue */ + uint16_t tsi_cap; /* Write Once */ + /* consumer index */ + uint16_t tsi_cidx; /* tsi_lock */ + /* generation bit */ + uint8_t tsi_gen; /* tsi_lock */ + + /* GTS config to re-arm queue notification */ + t4_gts_config_t tsi_gts_rearm; /* tsi_lock */ + /* packet count threshold index */ + int8_t tsi_intr_pktc_idx; /* tsi_lock */ + + /* SGE context ID for IQ */ + uint16_t tsi_cntxt_id; /* Write Once */ + /* absolute SGE ID for IQ */ + uint16_t tsi_abs_id; /* Write Once */ + + /* associated adapter */ + struct adapter *tsi_adapter; /* Write Once */ + /* associated freelist (if any) */ + struct sge_fl *tsi_fl; /* Write Once */ + + struct sge_iq_stats tsi_stats; /* tsi_lock */ +} t4_sge_iq_t; + +/* Result of servicing IQ in t4_iq_service() call */ +typedef enum t4_iq_result { + TIR_SUCCESS, /* All available entries processed successfully */ + TIR_DISABLED, /* 
IQ is disabled */ + TIR_POLLING, /* non-polling service req'd on polling-cfg'd IQ */ + TIR_ALLOC_FAIL, /* could not allocate packet buffer(s) */ + TIR_BUDGET_MAX, /* hit budget limit while processing entries */ +} t4_iq_result_t; + +/* + * Details used when servicing an IQ as part of polling. + */ +struct t4_poll_req { + mblk_t *tpr_mp; + uint_t tpr_byte_budget; }; typedef enum t4_eq_flags { /* Initialization state flags: */ - EQ_ALLOCATED = (1 << 0), /* firmware resources allocated */ - EQ_MTX = (1 << 1), /* mutex has been initialized */ + EQ_ALLOC_HOST = (1 << 0), /* host-side resources allocated */ + EQ_ALLOC_DEV = (1 << 1), /* EQ allocated in device firmware */ + EQ_ALLOC_DESC = (1 << 2), /* descriptor inputs allocated */ /* Runtime state flags: */ + EQ_ENABLED = (1 << 3), /* ready for submitted work requests */ /* * Short on resources (memory and/or descriptors) while attempting to * enqueue work in EQ */ - EQ_CORKED = (1 << 2), + EQ_CORKED = (1 << 4), } t4_eq_flags_t; -/* Listed in order of preference. */ +/* + * These are the Egress Queue doorbell methods. They are listed in order of + * preference (WCWR most preferred, KDB least). This ordering is important as + * the datapath uses ffs (find first set) to pick the preferred method. + * + * The first three are "user space" doorbells. They are mapped in BAR2 and are + * provided to allow kernel-bypass network stacks. However, they can also be + * used in the kernel and provide benefits such as write combining and per-queue + * registers (versus KDB which is a single register). + * + * The WCWR, Write Combining Work Request, is the preferred method. It allows + * the driver to push a WR directly to the device without the need for it to + * perform a DMA read of the hardware ring (to read the EQ host credit). Instead + * it comes in via the BAR2/UDB memory space and the device increments the pidx + * accordingly. However, the WCWR is limited to a single WR. 
Its use is + * intended for low latency situations or low rate of work, not for throughput. + * + * The maximum WCWR for T4 is 256 bytes. For T5/T6 it is 64-128 bytes, depending + * on the write-combining size of the platform. + * + * T4 Firmware Interface Specification, §9.2 Egress Queues and Work Requests. + */ typedef enum t4_doorbells { - DOORBELL_UDB = (1 << 0), - DOORBELL_WCWR = (1 << 1), - DOORBELL_UDBWC = (1 << 2), + DOORBELL_WCWR = (1 << 0), + DOORBELL_UDBWC = (1 << 1), + DOORBELL_UDB = (1 << 2), DOORBELL_KDB = (1 << 3), } t4_doorbells_t; -/* - * Egress Queue: driver is producer, T4 is consumer. - * - * Note: A free list is an egress queue (driver produces the buffers and T4 - * consumes them) but it's special enough to have its own struct (see sge_fl). - */ -struct sge_eq { - ddi_dma_handle_t desc_dhdl; - ddi_acc_handle_t desc_ahdl; - t4_eq_flags_t flags; - kmutex_t lock; +/* Egress Queue: driver is producer, T4 is consumer. */ +typedef struct t4_sge_eq { + kmutex_t tse_lock; - struct tx_desc *desc; /* KVA of descriptor ring */ - uint64_t ba; /* bus address of descriptor ring */ - struct sge_qstat *spg; /* status page, for convenience */ - t4_doorbells_t doorbells; - caddr_t udb; /* KVA of doorbell (lies within BAR2) */ - uint_t udb_qid; /* relative qid within the doorbell page */ - uint16_t cap; /* max # of desc, for convenience */ - uint16_t avail; /* available descriptors, for convenience */ - uint16_t qsize; /* size (# of entries) of the queue */ - uint16_t cidx; /* consumer idx (desc idx) */ - uint16_t pidx; /* producer idx (desc idx) */ - uint16_t pending; /* # of descriptors used since last doorbell */ - uint16_t iqid; /* iq that gets egr_update for the eq */ - uint8_t tx_chan; /* tx channel used by the eq */ - uint32_t cntxt_id; /* SGE context id for the eq */ -}; + t4_eq_flags_t tse_flags; /* tse_lock */ -typedef enum t4_fl_flags { - FL_MTX = (1 << 0), /* mutex has been initialized */ - FL_STARVING = (1 << 1), /* on the list of starving fl's */ 
- FL_DOOMED = (1 << 2), /* about to be destroyed */ -} t4_fl_flags_t; + ddi_dma_handle_t tse_ring_dhdl; /* Write Once */ + ddi_acc_handle_t tse_ring_ahdl; /* Write Once */ + + /* + * The ring type is pointer to void because the ring does not consist of + * descriptors but rather host credits. These host credits carry + * variable length work requests (WR) as well as the status page (SP) at + * the end of the ring. We use void* to facilitate the type punning + * required to work with these various types of EQ entries. + * + * In order to access credits and their individual flits we make use of + * the t4_eq_host_credit_t type. + */ + void *tse_ring; /* KVA of ring - Write Once */ + uint64_t tse_ring_ba; /* bus address of ring - Write Once */ + + /* + * tse_qsize: The number of host credits that may be used for data. This + * value is static for the lifetime of the queue. + * + * tse_qsize_spg: The total number of host credits in the queue. This is + * 1-2 more credits than tse_qsize to account for the status page at the + * end of the queue. The status page credits cannot be used for sending + * data, rather the beginning of the status page is considered the end + * of the queue as far as the datapath is concerned. + * + * tse_avail: The number of host credits that are currently available + * for use by the host. This is never more than 'tse_qsize - 1' in order + * to avoid 'tse_pidx==tse_cidx' which we use to indicate an empty + * queue. This number is updated as credits are used/recycled. + * + * tse_pending: The number of credits that have been written by the host + * but still require a doorbell before the device can consume them. Said + * another way, it's the number of credits the host's pidx is ahead of + * the device's cidx. 
+ */ + uint16_t tse_qsize; /* Write Once */ + uint16_t tse_qsize_spg; /* Write Once */ + uint16_t tse_avail; /* tse_lock */ + uint16_t tse_pending; /* tse_lock */ -#define FL_RUNNING_LOW(fl) (fl->cap - fl->needed <= fl->lowat) -#define FL_NOT_RUNNING_LOW(fl) (fl->cap - fl->needed >= 2 * fl->lowat) + /* + * The pidx is the driver's position in the queue, pointing to the next + * credit to consume. The cidx is the device's position in the queue, + * pointing to the last credit it has consumed as of the last status + * update. + */ + uint16_t tse_cidx; /* tse_lock */ + uint16_t tse_pidx; /* tse_lock */ + + /* Doorbell bits */ + t4_doorbells_t tse_doorbells; /* Write Once */ + /* KVA of doorbell (lies within BAR2) */ + caddr_t tse_udb; /* Write Once */ + /* relative qid within the doorbell page */ + uint_t tse_udb_qid; /* Write Once */ + + struct sge_qstat *tse_spg; /* status page - Write Once */ + /* IQ that gets egr_update msg for EQ */ + uint16_t tse_iqid; /* Write Once */ + /* tx channel used by the EQ */ + uint8_t tse_tx_chan; /* Write Once */ + /* SGE context id for the EQ */ + uint32_t tse_cntxt_id; /* Write Once */ +} t4_sge_eq_t; + +typedef enum t4_sfl_flags { + SFL_STARVING = (1 << 0), /* on the list of starving fl's */ + SFL_DOOMED = (1 << 1), /* about to be destroyed */ +} t4_sfl_flags_t; + +struct sge_fl_stats { + /* These stats describe the receiving of data. */ + uint64_t copy; /* # of frames copied (allocb) */ + uint64_t copy_fail; /* # of allocb failures */ + uint64_t wrap; /* # of frames wrapped (desballoc) */ + uint64_t wrap_fail; /* # of desballoc failures */ + + /* These stats describe the refilling of rx (FL) buffers. 
*/ + uint64_t rxb_recycle; /* # of rx buffers recycled */ + uint64_t rxb_alloc; /* # of rx buffers allocated */ + uint64_t rxb_alloc_fail; /* # of rx buffers that failed to allocb */ +}; struct sge_fl { - t4_fl_flags_t flags; - kmutex_t lock; - ddi_dma_handle_t dhdl; - ddi_acc_handle_t ahdl; - - __be64 *desc; /* KVA of descriptor ring, ptr to addresses */ - uint64_t ba; /* bus address of descriptor ring */ - struct fl_sdesc *sdesc; /* KVA of software descriptor ring */ - uint32_t cap; /* max # of buffers, for convenience */ - uint16_t qsize; /* size (# of entries) of the queue */ - uint16_t cntxt_id; /* SGE context id for the freelist */ - uint32_t cidx; /* consumer idx (buffer idx, NOT hw desc idx) */ - uint32_t pidx; /* producer idx (buffer idx, NOT hw desc idx) */ - uint32_t needed; /* # of buffers needed to fill up fl. */ - uint32_t lowat; /* # of buffers <= this means fl needs help */ - uint32_t pending; /* # of bufs allocated since last doorbell */ - uint32_t offset; /* current packet within the larger buffer */ - uint16_t copy_threshold; /* anything this size or less is copied up */ + /* + * EQ for passing freelist entries to adapter. + * Must be first field in struct + */ + t4_sge_eq_t eq; /* Write Once */ + + /* + * Index at which new buffers are to be placed in the FL descriptor + * which is currently being produced for the device. + */ + uint8_t cidx_sdesc; /* FL_LOCK */ + uint8_t pidx_sdesc; /* FL_LOCK */ + + /* KVA of the software descriptor ring. */ + struct fl_sdesc *sdesc; /* Write Once */ + /* Total number of buffers in the FL. */ + uint32_t bufs_cap; /* Write Once */ + /* + * Number of buffers available to receive data, buffers owned by the + * device. + */ + uint32_t bufs_avail; /* FL_LOCK */ + /* Number of buffers at which the FL is considered "starving". */ + uint32_t bufs_lowat; /* Write Once */ + /* The byte offset in the current FL buffer. */ + uint32_t offset; /* FL_LOCK */ + /* Any packet smaller or equal to this is copied (allocb). 
*/ + uint16_t copy_threshold; /* Write Once */ + + /* Starvation-related state for this freelist. */ + t4_sfl_flags_t sfl_flags; /* adapter->sfl_lock */ + list_node_t sfl_node; /* adapter->sfl_lock */ + + struct sge_fl_stats stats; /* FL_LOCK */ +}; - uint64_t copied_up; /* # of frames copied into mblk and handed up */ - uint64_t passed_up; /* # of frames wrapped in mblk and handed up */ - uint64_t allocb_fail; /* # of mblk allocation failures */ +struct sge_txq_stats { + /* stats for common events first */ + uint64_t txpkts; /* # of ethernet packets */ + uint64_t txbytes; /* # of ethernet bytes */ + uint64_t txcsum; /* # of times hardware assisted with checksum */ + uint64_t tso_wrs; /* # of IPv4 TSO work requests */ + uint64_t imm_wrs; /* # of work requests with immediate data */ + uint64_t sgl_wrs; /* # of work requests with direct SGL */ + uint64_t txpkt_wrs; /* # of txpkt work requests (not coalesced) */ + uint64_t txpkts_wrs; /* # of coalesced tx work requests */ + uint64_t txpkts_pkts; /* # of frames in coalesced tx work requests */ + uint64_t txb_used; /* # of tx copy buffers used (64 byte each) */ + uint64_t hdl_used; /* # of DMA handles used */ - TAILQ_ENTRY(sge_fl) link; /* All starving freelists */ + /* stats for not-that-common events */ + uint32_t txb_full; /* txb ran out of space */ + uint32_t dma_hdl_failed; /* couldn't obtain DMA handle */ + uint32_t dma_map_failed; /* couldn't obtain DMA mapping */ + uint32_t qfull; /* out of hardware descriptors */ + uint32_t pullup_early; /* # of pullups before starting frame's SGL */ + uint32_t pullup_late; /* # of pullups while building frame's SGL */ + uint32_t pullup_failed; /* # of failed pullups */ + uint32_t csum_failed; /* # of csum reqs we failed to fulfill */ }; -/* txq: SGE egress queue + miscellaneous items */ +/* Ethernet packet transmission queue */ struct sge_txq { - struct sge_eq eq; /* MUST be first */ + t4_sge_eq_t eq; - struct port_info *port; /* the port this txq belongs to */ + struct 
port_info *port; struct tx_sdesc *sdesc; /* KVA of software descriptor ring */ + mac_ring_handle_t ring_handle; /* DMA handles used for tx */ @@ -326,137 +508,234 @@ struct sge_txq { uint32_t txb_avail; /* # of bytes available */ uint16_t copy_threshold; /* anything this size or less is copied up */ - uint64_t txpkts; /* # of ethernet packets */ - uint64_t txbytes; /* # of ethernet bytes */ kstat_t *ksp; + struct sge_txq_stats stats; +}; +struct sge_rxq_stats { /* stats for common events first */ - - uint64_t txcsum; /* # of times hardware assisted with checksum */ - uint64_t tso_wrs; /* # of IPv4 TSO work requests */ - uint64_t imm_wrs; /* # of work requests with immediate data */ - uint64_t sgl_wrs; /* # of work requests with direct SGL */ - uint64_t txpkt_wrs; /* # of txpkt work requests (not coalesced) */ - uint64_t txpkts_wrs; /* # of coalesced tx work requests */ - uint64_t txpkts_pkts; /* # of frames in coalesced tx work requests */ - uint64_t txb_used; /* # of tx copy buffers used (64 byte each) */ - uint64_t hdl_used; /* # of DMA handles used */ - - /* stats for not-that-common events */ - - uint32_t txb_full; /* txb ran out of space */ - uint32_t dma_hdl_failed; /* couldn't obtain DMA handle */ - uint32_t dma_map_failed; /* couldn't obtain DMA mapping */ - uint32_t qfull; /* out of hardware descriptors */ - uint32_t pullup_early; /* # of pullups before starting frame's SGL */ - uint32_t pullup_late; /* # of pullups while building frame's SGL */ - uint32_t pullup_failed; /* # of failed pullups */ - uint32_t csum_failed; /* # of csum reqs we failed to fulfill */ + uint64_t rxcsum; /* # of times hardware assisted with checksum */ + uint64_t rxpkts; /* # of ethernet packets */ + uint64_t rxbytes; /* # of ethernet bytes */ }; -/* rxq: SGE ingress queue + SGE free list + miscellaneous items */ +/* Ethernet packet receive queue */ struct sge_rxq { - struct sge_iq iq; /* MUST be first */ - struct sge_fl fl; + t4_sge_iq_t iq; + struct sge_fl fl; /* Freelist for 
packet receive buffers */ - struct port_info *port; /* the port this rxq belongs to */ - kstat_t *ksp; + struct port_info *port; mac_ring_handle_t ring_handle; uint64_t ring_gen_num; - /* stats for common events first */ + kstat_t *ksp; + struct sge_rxq_stats stats; +}; - uint64_t rxcsum; /* # of times hardware assisted with checksum */ - uint64_t rxpkts; /* # of ethernet packets */ - uint64_t rxbytes; /* # of ethernet bytes */ +typedef enum t4_port_flags { + TPF_INIT_DONE = (1 << 0), + TPF_OPEN = (1 << 1), + TPF_VI_ENABLED = (1 << 2), +} t4_port_flags_t; - /* stats for not-that-common events */ +typedef enum t4_port_feat { + CXGBE_HW_LSO = (1 << 0), + CXGBE_HW_CSUM = (1 << 1), +} t4_port_feat_t; + + +struct port_info { + kmutex_t lock; + dev_info_t *dip; + struct adapter *adapter; + uint8_t port_id; + + t4_port_flags_t flags; + t4_port_feat_t features; + + mac_handle_t mh; + int mtu; + uint8_t hw_addr[ETHERADDRL]; + int16_t xact_addr_filt; /* index of exact MAC address filter */ + + uint16_t rxq_count; /* # of RX queues */ + uint16_t rxq_start; /* index of first RX queue */ + uint16_t txq_count; /* # of TX queues */ + uint16_t txq_start; /* index of first TX queue */ + + /* + * Array of IQs for queue events, such as interrupt forward events + * for Rx queue processing and completion events for Tx queues. + * Only available when TIP_PER_PORT is selected. The size is based + * on adapter.intr_queue_cfg.intr_per_port. 
+ */ + t4_sge_iq_t *intr_iqs; + + kstat_t *ksp_config; + kstat_t *ksp_info; + kstat_t *ksp_fec; + + /* Port attributes/data set by common code: */ + uint16_t viid; + uint16_t rss_size; /* size of VI's RSS table slice */ + + uint8_t port_type; + int8_t mdio_addr; + uint8_t mod_type; + + uint8_t lport; + uint8_t tx_chan; + uint8_t rx_chan; + uint8_t rx_cchan; + + uint8_t rss_mode; + + uint8_t tmr_idx; + int8_t pktc_idx; + uint8_t dbq_timer_idx; - uint32_t nomem; /* mblk allocation during rx failed */ + struct link_config link_cfg; + uint8_t macaddr_cnt; + + u8 vivld; + u8 vin; + u8 smt_idx; + + /* Mirroring bits utilized by common code (unused by our driver) */ + u16 viid_mirror; + u8 vivld_mirror; + u8 vin_mirror; }; -struct sge { - int fl_starve_threshold; - int s_qpp; +struct sge_info { + uint_t fl_starve_threshold; uint64_t dbq_timer_tick; uint16_t dbq_timers[SGE_NDBQTIMERS]; - int nrxq; /* total rx queues (all ports and the rest) */ - int ntxq; /* total tx queues (all ports and the rest) */ - int niq; /* total ingress queues */ - int neq; /* total egress queues */ - int stat_len; /* length of status page at ring end */ - int pktshift; /* padding between CPL & packet data */ - int fl_align; /* response queue message alignment */ + uint_t eq_spg_len; /* EQ status page length in host credits */ + uint_t pktshift; /* padding between CPL & packet data */ + uint_t fl_align; /* response queue message alignment */ uint8_t fwq_tmr_idx; /* Intr. coalesce timer for FWQ */ int8_t fwq_pktc_idx; /* Intr. 
coalesce count for FWQ */ - struct sge_iq fwq; /* Firmware event queue */ - struct sge_txq *txq; /* NIC tx queues */ - struct sge_rxq *rxq; /* NIC rx queues */ + t4_sge_iq_t fwq; /* Firmware event queue */ - uint_t iq_start; /* iq context id map start index */ - uint_t eq_start; /* eq context id map start index */ - uint_t iqmap_sz; /* size of iq context id map */ - uint_t eqmap_sz; /* size of eq context id map */ - struct sge_iq **iqmap; /* iq->cntxt_id to iq mapping */ - struct sge_eq **eqmap; /* eq->cntxt_id to eq mapping */ + uint_t rxq_count; /* total RX queues (all ports and the rest) */ + uint_t txq_count; /* total TX queues (all ports and the rest) */ + struct sge_txq *txq; /* NIC TX queues */ + struct sge_rxq *rxq; /* NIC RX queues */ + + /* + * Adapters use 16-bit "context IDs" to uniquely identify queues. + * + * References to the queues, indexed by said context IDs are maintained + * here, using the start/end values queried from the adapter. + */ + uint_t iqmap_start; /* IQ context id map start index */ + uint_t rxqmap_start; /* IQ context id map start index */ + uint_t eqmap_start; /* EQ context id map start index */ + uint_t iqmap_sz; /* size of IQ context id map */ + uint_t eqmap_sz; /* size of EQ context id map */ + t4_sge_iq_t **iqmap; /* iq->cntxt_id to IQ mapping */ + t4_sge_eq_t **eqmap; /* eq->cntxt_id to EQ mapping */ /* Device access and DMA attributes for all the descriptor rings */ ddi_device_acc_attr_t acc_attr_desc; ddi_dma_attr_t dma_attr_desc; - /* Device access and DMA attributes for tx buffers */ + /* Device access and DMA attributes for TX buffers */ ddi_device_acc_attr_t acc_attr_tx; ddi_dma_attr_t dma_attr_tx; - /* Device access and DMA attributes for rx buffers are in rxb_params */ + /* Device access and DMA attributes for RX buffers are in rxb_params */ kmem_cache_t *rxbuf_cache; struct rxbuf_cache_params rxb_params; }; struct driver_properties { - int max_ntxq_10g; - int max_nrxq_10g; - int max_ntxq_1g; - int max_nrxq_1g; - int
intr_types; - int tmr_idx_10g; - int pktc_idx_10g; - int tmr_idx_1g; - int pktc_idx_1g; + uint8_t ethq_tmr_idx; + int8_t ethq_pktc_idx; uint8_t dbq_timer_idx; uint8_t fwq_tmr_idx; int8_t fwq_pktc_idx; - int qsize_txq; - int qsize_rxq; + uint16_t qsize_txq; + uint16_t qsize_rxq; uint_t holdoff_timer_us[SGE_NTIMERS]; uint_t holdoff_pktcnt[SGE_NCOUNTERS]; - int wc; - - int multi_rings; + bool write_combine; int t4_fw_install; }; -struct t4_mbox_list { - STAILQ_ENTRY(t4_mbox_list) link; -}; +typedef struct t4_mbox_waiter { + list_node_t node; + kthread_t *thread; +} t4_mbox_waiter_t; typedef enum t4_adapter_flags { /* Initialization progress status bits */ TAF_INIT_DONE = (1 << 0), TAF_FW_OK = (1 << 1), - TAF_INTR_FWD = (1 << 2), - TAF_INTR_ALLOC = (1 << 3), + TAF_INTR_ALLOC = (1 << 2), /* State & capability bits */ - TAF_MASTER_PF = (1 << 4), - TAF_DBQ_TIMER = (1 << 5), + TAF_MASTER_PF = (1 << 8), + TAF_DBQ_TIMER = (1 << 9), } t4_adapter_flags_t; +/* Plan for interrupt allocation */ +typedef enum t4_intr_plan { + /* Everything on a single interrupt */ + TIP_SINGLE, + /* One for device errors, one FWQ (including forwarded intrs) */ + TIP_ERR_QUEUES, + /* 1 + 1 for errors and FWQ, with rest divided evenly between ports */ + TIP_PER_PORT, +} t4_intr_plan_t; + +struct t4_intrs_queues { + /* The DDI_INTR_TYPE_* value negotiated. */ + int intr_type; + + /* + * The plan for interrupt allocation, based on the interrupt type + * and number of interrupts available. See the block comment in + * t4_nexus.c for more information. + */ + t4_intr_plan_t intr_plan; + + /* + * The number of interrupts available (intr_avail) for use vs. the + * number of interrupts the driver has decided to make use of + * (intr_count). These values may be different depending on the + * number available and the port count of the attached part. + */ + int intr_avail; + int intr_count; + + /* + * The number of interrupts per port for use with event queues. 
+ * These interrupts are used to take delivery of Tx recycling + * messages and Rx packet delivery. + */ + uint_t intr_per_port; + + /* + * Track the number of IQs allocated for use with interrupts. We track + * this to know how many IQs we have leftover for Rx queue usage. + */ + uint_t num_iqs; + + /* The maximum number of RX/TX queues per port. */ + uint_t port_max_rxq; + uint_t port_max_txq; +}; + +/* + * WO - Write Once at initialization time. + */ struct adapter { list_node_t node; dev_info_t *dip; @@ -483,17 +762,15 @@ struct adapter { caddr_t bar2_ptr; /* Interrupt information */ - int intr_type; - int intr_count; + ddi_intr_handle_t *intr_handle; int intr_cap; uint_t intr_pri; - ddi_intr_handle_t *intr_handle; struct driver_properties props; kstat_t *ksp; kstat_t *ksp_stat; - struct sge sge; + struct sge_info sge; struct port_info *port[MAX_NPORTS]; uint8_t chan_map[NCHAN]; @@ -504,13 +781,19 @@ struct adapter { unsigned int cfcsum; struct adapter_params params; + struct t4_intrs_queues intr_queue_cfg; kmutex_t lock; kcondvar_t cv; - /* Starving free lists */ - kmutex_t sfl_lock; /* same cache-line as sc_lock? but that's ok */ - TAILQ_HEAD(, sge_fl) sfl; + /* + * Starving freelist state + * + * sfl_lock protects the `sfl_flags` and `sfl_node` fields in all sge_fl + * structs owned by this adapter. 
+ */ + kmutex_t sfl_lock; + list_t sfl_list; timeout_id_t sfl_timer; /* Sensors */ @@ -521,12 +804,8 @@ struct adapter { /* support for single-threading access to adapter mailbox registers */ kmutex_t mbox_lock; - STAILQ_HEAD(, t4_mbox_list) mbox_list; -}; - -struct memwin { - uint32_t base; - uint32_t aperture; + kcondvar_t mbox_cv; + list_t mbox_list; }; #define ADAPTER_LOCK(sc) mutex_enter(&(sc)->lock) @@ -539,30 +818,21 @@ struct memwin { #define PORT_LOCK_ASSERT_OWNED(pi) ASSERT(mutex_owned(&(pi)->lock)) #define PORT_LOCK_ASSERT_NOTOWNED(pi) ASSERT(!mutex_owned(&(pi)->lock)) -#define IQ_LOCK(iq) mutex_enter(&(iq)->lock) -#define IQ_UNLOCK(iq) mutex_exit(&(iq)->lock) -#define IQ_LOCK_ASSERT_OWNED(iq) ASSERT(mutex_owned(&(iq)->lock)) -#define IQ_LOCK_ASSERT_NOTOWNED(iq) ASSERT(!mutex_owned(&(iq)->lock)) - -#define FL_LOCK(fl) mutex_enter(&(fl)->lock) -#define FL_UNLOCK(fl) mutex_exit(&(fl)->lock) -#define FL_LOCK_ASSERT_OWNED(fl) ASSERT(mutex_owned(&(fl)->lock)) -#define FL_LOCK_ASSERT_NOTOWNED(fl) ASSERT(!mutex_owned(&(fl)->lock)) +#define IQ_LOCK(iq) mutex_enter(&(iq)->tsi_lock) +#define IQ_UNLOCK(iq) mutex_exit(&(iq)->tsi_lock) +#define IQ_LOCK_ASSERT_OWNED(iq) ASSERT(mutex_owned(&(iq)->tsi_lock)) +#define IQ_LOCK_ASSERT_NOTOWNED(iq) ASSERT(!mutex_owned(&(iq)->tsi_lock)) -#define RXQ_LOCK(rxq) IQ_LOCK(&(rxq)->iq) -#define RXQ_UNLOCK(rxq) IQ_UNLOCK(&(rxq)->iq) -#define RXQ_LOCK_ASSERT_OWNED(rxq) IQ_LOCK_ASSERT_OWNED(&(rxq)->iq) -#define RXQ_LOCK_ASSERT_NOTOWNED(rxq) IQ_LOCK_ASSERT_NOTOWNED(&(rxq)->iq) +#define EQ_LOCK(eq) mutex_enter(&(eq)->tse_lock) +#define EQ_UNLOCK(eq) mutex_exit(&(eq)->tse_lock) +#define EQ_LOCK_ASSERT_OWNED(eq) ASSERT(mutex_owned(&(eq)->tse_lock)) +#define EQ_LOCK_ASSERT_NOTOWNED(eq) ASSERT(!mutex_owned(&(eq)->tse_lock)) -#define RXQ_FL_LOCK(rxq) FL_LOCK(&(rxq)->fl) -#define RXQ_FL_UNLOCK(rxq) FL_UNLOCK(&(rxq)->fl) -#define RXQ_FL_LOCK_ASSERT_OWNED(rxq) FL_LOCK_ASSERT_OWNED(&(rxq)->fl) -#define RXQ_FL_LOCK_ASSERT_NOTOWNED(rxq) 
FL_LOCK_ASSERT_NOTOWNED(&(rxq)->fl) - -#define EQ_LOCK(eq) mutex_enter(&(eq)->lock) -#define EQ_UNLOCK(eq) mutex_exit(&(eq)->lock) -#define EQ_LOCK_ASSERT_OWNED(eq) ASSERT(mutex_owned(&(eq)->lock)) -#define EQ_LOCK_ASSERT_NOTOWNED(eq) ASSERT(!mutex_owned(&(eq)->lock)) +/* Freelist state is protected by its EQ lock */ +#define FL_LOCK(fl) EQ_LOCK(&(fl)->eq) +#define FL_UNLOCK(fl) EQ_UNLOCK(&(fl)->eq) +#define FL_LOCK_ASSERT_OWNED(fl) EQ_LOCK_ASSERT_OWNED(&(fl)->eq) +#define FL_LOCK_ASSERT_NOTOWNED(fl) EQ_LOCK_ASSERT_NOTOWNED(&(fl)->eq) #define TXQ_LOCK(txq) EQ_LOCK(&(txq)->eq) #define TXQ_UNLOCK(txq) EQ_UNLOCK(&(txq)->eq) @@ -570,38 +840,11 @@ struct memwin { #define TXQ_LOCK_ASSERT_NOTOWNED(txq) EQ_LOCK_ASSERT_NOTOWNED(&(txq)->eq) #define for_each_txq(pi, iter, txq) \ - txq = &pi->adapter->sge.txq[pi->first_txq]; \ - for (iter = 0; iter < pi->ntxq; ++iter, ++txq) + txq = &pi->adapter->sge.txq[pi->txq_start]; \ + for (iter = 0; iter < pi->txq_count; ++iter, ++txq) #define for_each_rxq(pi, iter, rxq) \ - rxq = &pi->adapter->sge.rxq[pi->first_rxq]; \ - for (iter = 0; iter < pi->nrxq; ++iter, ++rxq) - -#define NFIQ(sc) ((sc)->intr_count > 1 ? 
(sc)->intr_count - 1 : 1) - -/* One for errors, one for firmware events */ -#define T4_EXTRA_INTR 2 - -static inline void t4_mbox_list_add(struct adapter *adap, - struct t4_mbox_list *entry) -{ - mutex_enter(&adap->mbox_lock); - STAILQ_INSERT_TAIL(&adap->mbox_list, entry, link); - mutex_exit(&adap->mbox_lock); -} - -static inline void t4_mbox_list_del(struct adapter *adap, - struct t4_mbox_list *entry) -{ - mutex_enter(&adap->mbox_lock); - STAILQ_REMOVE(&adap->mbox_list, entry, t4_mbox_list, link); - mutex_exit(&adap->mbox_lock); -} - -static inline struct t4_mbox_list * -t4_mbox_list_first_entry(struct adapter *adap) -{ - return (STAILQ_FIRST(&adap->mbox_list)); -} + rxq = &pi->adapter->sge.rxq[pi->rxq_start]; \ + for (iter = 0; iter < pi->rxq_count; ++iter, ++rxq) static inline struct port_info * adap2pinfo(struct adapter *sc, int idx) @@ -609,28 +852,9 @@ adap2pinfo(struct adapter *sc, int idx) return (sc->port[idx]); } -static inline struct sge_rxq * -iq_to_rxq(struct sge_iq *iq) -{ - return (__containerof(iq, struct sge_rxq, iq)); -} - -static inline bool -t4_port_is_10xg(const struct port_info *pi) -{ - return (pi->link_cfg.pcaps & - (FW_PORT_CAP32_SPEED_400G | - FW_PORT_CAP32_SPEED_200G | - FW_PORT_CAP32_SPEED_100G | - FW_PORT_CAP32_SPEED_50G | - FW_PORT_CAP32_SPEED_40G | - FW_PORT_CAP32_SPEED_25G | - FW_PORT_CAP32_SPEED_10G)); -} - static inline unsigned int t4_use_ldst(struct adapter *adap) { - return (adap->flags & FW_OK); + return (adap->flags & TAF_FW_OK); } static inline void t4_db_full(struct adapter *adap) {} @@ -652,46 +876,50 @@ t4_cver_ge(const adapter_t *adap, uint8_t ver) /* t4_nexus.c */ int t4_port_full_init(struct port_info *); -void t4_port_queues_enable(struct port_info *pi); -void t4_port_queues_disable(struct port_info *pi); uint32_t t4_read_reg(struct adapter *, uint32_t); void t4_write_reg(struct adapter *, uint32_t, uint32_t); uint64_t t4_read_reg64(struct adapter *, uint32_t); void t4_write_reg64(struct adapter *, uint32_t, uint64_t); 
+void t4_mbox_waiter_add(struct adapter *, t4_mbox_waiter_t *); +void t4_mbox_waiter_remove(struct adapter *, t4_mbox_waiter_t *); +bool t4_mbox_wait_owner(struct adapter *, uint_t, bool); + /* t4_debug.c */ void t4_debug_init(void); void t4_debug_fini(void); /* t4_sge.c */ -void t4_sge_init(struct adapter *sc); -int t4_alloc_fwq(struct adapter *); -int t4_free_fwq(struct adapter *); -int t4_setup_port_queues(struct port_info *pi); -int t4_teardown_port_queues(struct port_info *pi); -uint_t t4_intr_all(caddr_t arg1, caddr_t arg2); -uint_t t4_intr(caddr_t arg1, caddr_t arg2); -uint_t t4_intr_err(caddr_t arg1, caddr_t arg2); -void t4_iq_gts_update(struct sge_iq *, t4_intr_config_t, uint16_t); -void t4_iq_update_intr_cfg(struct sge_iq *, uint8_t, int8_t); -void t4_eq_update_dbq_timer(struct sge_eq *, struct port_info *); -int t4_mgmt_tx(struct adapter *sc, mblk_t *m); +void t4_sge_init(struct adapter *); +int t4_alloc_evt_iqs(struct adapter *); +void t4_free_evt_iqs(struct adapter *); +void t4_port_kstats_init(struct port_info *); +void t4_port_kstats_fini(struct port_info *); +int t4_port_queues_init(struct port_info *); +void t4_port_queues_fini(struct port_info *); +void t4_port_queues_enable(struct port_info *pi); +void t4_port_queues_disable(struct port_info *pi); +uint_t t4_intr_all(caddr_t, caddr_t); +uint_t t4_intr_err(caddr_t, caddr_t); +uint_t t4_intr_fwq(caddr_t, caddr_t); +uint_t t4_intr_port_queue(caddr_t, caddr_t); +void t4_iq_gts_update(t4_sge_iq_t *, t4_gts_config_t, uint16_t); +void t4_iq_update_intr_cfg(t4_sge_iq_t *, uint8_t, int8_t); +void t4_eq_update_dbq_timer(t4_sge_eq_t *, struct port_info *); mblk_t *t4_eth_tx(void *, mblk_t *); -mblk_t *t4_mc_tx(void *arg, mblk_t *m); -mblk_t *t4_ring_rx(struct sge_rxq *rxq, int poll_bytes); +t4_iq_result_t t4_process_rx_iq(t4_sge_iq_t *, uint_t, struct t4_poll_req *); /* t4_mac.c */ -void t4_mc_cb_init(struct port_info *); void t4_os_link_changed(struct adapter *sc, int idx, int link_stat); -void 
t4_mac_rx(struct port_info *pi, struct sge_rxq *rxq, mblk_t *m); void t4_mac_tx_update(struct port_info *pi, struct sge_txq *txq); int t4_addmac(void *arg, const uint8_t *ucaddr); const char **t4_get_priv_props(struct port_info *, size_t *); uint8_t t4_choose_holdoff_timer(struct adapter *, uint_t); int8_t t4_choose_holdoff_pktcnt(struct adapter *, int); uint_t t4_choose_dbq_timer(struct adapter *, uint_t); +extern mac_callbacks_t t4_mac_callbacks; /* t4_ioctl.c */ int t4_ioctl(struct adapter *sc, int cmd, void *data, int mode); diff --git a/usr/src/uts/common/io/cxgbe/t4nex/cudbg.h b/usr/src/uts/common/io/cxgbe/t4nex/cudbg.h index e86de21085..64cdbedd93 100644 --- a/usr/src/uts/common/io/cxgbe/t4nex/cudbg.h +++ b/usr/src/uts/common/io/cxgbe/t4nex/cudbg.h @@ -318,7 +318,7 @@ static struct el ATTRIBUTE_UNUSED entity_list[] = { }; #ifdef _KERNEL -typedef int (*cudbg_print_cb) (dev_info_t *dip, int, char *, ...); +typedef void (*cudbg_print_cb) (dev_info_t *dip, int, char *, ...); #else typedef int (*cudbg_print_cb) (char *, ...); #endif diff --git a/usr/src/uts/common/io/cxgbe/t4nex/cudbg_lib.c b/usr/src/uts/common/io/cxgbe/t4nex/cudbg_lib.c index e6b5b16667..1b1caa64f5 100644 --- a/usr/src/uts/common/io/cxgbe/t4nex/cudbg_lib.c +++ b/usr/src/uts/common/io/cxgbe/t4nex/cudbg_lib.c @@ -3389,7 +3389,7 @@ collect_tid(struct cudbg_init *pdbg_init, rc = compress_buff(&scratch_buff, dbg_buff); err1: - ADAPTER_LOCK(padap); + ADAPTER_UNLOCK(padap); release_scratch_buff(&scratch_buff, dbg_buff); err: return rc; diff --git a/usr/src/uts/common/io/cxgbe/t4nex/t4_ioctl.c b/usr/src/uts/common/io/cxgbe/t4nex/t4_ioctl.c index 6ca43b52b9..4263ad691d 100644 --- a/usr/src/uts/common/io/cxgbe/t4nex/t4_ioctl.c +++ b/usr/src/uts/common/io/cxgbe/t4nex/t4_ioctl.c @@ -22,7 +22,6 @@ #include #include -#include #include "t4nex.h" #include "common/common.h" diff --git a/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c b/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c index 107feaa51f..f6e8a8a609 100644 ---
a/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c +++ b/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c @@ -31,7 +31,6 @@ #include #include #include -#include #include "common/common.h" #include "common/t4_regs.h" @@ -50,37 +49,20 @@ static int t4_mc_getprop(void *arg, const char *name, mac_prop_id_t id, static void t4_mc_propinfo(void *arg, const char *name, mac_prop_id_t id, mac_prop_info_handle_t ph); -static int t4_init_synchronized(struct port_info *pi); -static int t4_uninit_synchronized(struct port_info *pi); +static int t4_port_enable(struct port_info *pi); +static int t4_port_disable(struct port_info *pi); static void t4_propinfo_priv(struct port_info *, const char *, mac_prop_info_handle_t); static int t4_getprop_priv(struct port_info *, const char *, uint_t, void *); static int t4_setprop_priv(struct port_info *, const char *, const void *); -mac_callbacks_t t4_m_callbacks = { +mac_callbacks_t t4_mac_callbacks = { .mc_callbacks = MC_GETCAPAB | MC_PROPERTIES, .mc_getstat = t4_mc_getstat, .mc_start = t4_mc_start, .mc_stop = t4_mc_stop, .mc_setpromisc = t4_mc_setpromisc, .mc_multicst = t4_mc_multicst, - .mc_unicst = t4_mc_unicst, - .mc_tx = t4_mc_tx, - .mc_getcapab = t4_mc_getcapab, - .mc_setprop = t4_mc_setprop, - .mc_getprop = t4_mc_getprop, - .mc_propinfo = t4_mc_propinfo, -}; - -mac_callbacks_t t4_m_ring_callbacks = { - .mc_callbacks = MC_GETCAPAB | MC_PROPERTIES, - .mc_getstat = t4_mc_getstat, - .mc_start = t4_mc_start, - .mc_stop = t4_mc_stop, - .mc_setpromisc = t4_mc_setpromisc, - .mc_multicst = t4_mc_multicst, - .mc_unicst = NULL, /* t4_addmac */ - .mc_tx = NULL, /* t4_eth_tx */ .mc_getcapab = t4_mc_getcapab, .mc_setprop = t4_mc_setprop, .mc_getprop = t4_mc_getprop, @@ -401,7 +383,8 @@ t4_mc_getstat(void *arg, uint_t stat, uint64_t *val) break; case MAC_STAT_NORCVBUF: - *val = 0; /* TODO should come from rxq->nomem */ + /* TODO: pull from freelist stats? 
*/ + *val = 0; break; case MAC_STAT_IERRORS: @@ -725,7 +708,7 @@ t4_mc_start(void *arg) struct port_info *pi = arg; ADAPTER_LOCK(pi->adapter); - const int rc = t4_init_synchronized(pi); + const int rc = t4_port_enable(pi); ADAPTER_UNLOCK(pi->adapter); return (rc); @@ -737,7 +720,7 @@ t4_mc_stop(void *arg) struct port_info *pi = arg; ADAPTER_LOCK(pi->adapter); - (void) t4_uninit_synchronized(pi); + (void) t4_port_disable(pi); ADAPTER_UNLOCK(pi->adapter); } @@ -746,11 +729,10 @@ t4_mc_setpromisc(void *arg, boolean_t on) { struct port_info *pi = arg; struct adapter *sc = pi->adapter; - int rc; ADAPTER_LOCK(sc); - rc = -t4_set_rxmode(sc, sc->mbox, pi->viid, -1, on ? 1 : 0, -1, -1, -1, - false); + const int rc = -t4_set_rxmode(sc, sc->mbox, pi->viid, -1, on ? 1 : 0, + -1, -1, -1, false); ADAPTER_UNLOCK(sc); return (rc); @@ -766,10 +748,10 @@ t4_mc_multicst(void *arg, boolean_t add, const uint8_t *mcaddr) struct port_info *pi = arg; struct adapter *sc = pi->adapter; struct fw_vi_mac_cmd c; - int len16, rc; + int rc = 0; + int len16 = howmany(sizeof (c.op_to_viid) + + sizeof (c.freemacs_to_len16) + sizeof (c.u.exact[0]), 16); - len16 = howmany(sizeof (c.op_to_viid) + sizeof (c.freemacs_to_len16) + - sizeof (c.u.exact[0]), 16); c.op_to_viid = htonl(V_FW_CMD_OP(FW_VI_MAC_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | V_FW_VI_MAC_CMD_VIID(pi->viid)); c.freemacs_to_len16 = htonl(V_FW_CMD_LEN16(len16)); @@ -781,26 +763,8 @@ t4_mc_multicst(void *arg, boolean_t add, const uint8_t *mcaddr) ADAPTER_LOCK(sc); rc = -t4_wr_mbox_meat(sc, sc->mbox, &c, len16 * 16, &c, true); ADAPTER_UNLOCK(sc); - if (rc != 0) - return (rc); -#ifdef DEBUG - /* - * TODO: Firmware doesn't seem to return the correct index on removal - * (it gives back 0x3fd FW_VI_MAC_MAC_BASED_FREE unchanged. Remove this - * code once it is fixed. 
- */ - else { - uint16_t idx; - - idx = G_FW_VI_MAC_CMD_IDX(ntohs(c.u.exact[0].valid_to_idx)); - cxgb_printf(pi->dip, CE_NOTE, - "%02x:%02x:%02x:%02x:%02x:%02x %s %d", mcaddr[0], - mcaddr[1], mcaddr[2], mcaddr[3], mcaddr[4], mcaddr[5], - add ? "added at index" : "removed from index", idx); - } -#endif - return (0); + return (rc); } int @@ -808,30 +772,31 @@ t4_mc_unicst(void *arg, const uint8_t *ucaddr) { struct port_info *pi = arg; struct adapter *sc = pi->adapter; - int rc; - if (ucaddr == NULL) + if (ucaddr == NULL) { return (EINVAL); + } ADAPTER_LOCK(sc); /* We will support adding only one mac address */ - if (pi->adapter->props.multi_rings && pi->macaddr_cnt) { + if (pi->macaddr_cnt) { ADAPTER_UNLOCK(sc); return (ENOSPC); } - rc = t4_change_mac(sc, sc->mbox, pi->viid, pi->xact_addr_filt, ucaddr, - true, &pi->smt_idx); + + const int rc = t4_change_mac(sc, sc->mbox, pi->viid, pi->xact_addr_filt, + ucaddr, true, &pi->smt_idx); if (rc < 0) { - rc = -rc; - } else { - pi->macaddr_cnt++; - pi->xact_addr_filt = rc; - rc = 0; + ADAPTER_UNLOCK(sc); + return (-rc); } + + pi->macaddr_cnt++; + pi->xact_addr_filt = rc; ADAPTER_UNLOCK(sc); - return (rc); + return (0); } int @@ -845,9 +810,9 @@ t4_remmac(void *arg, const uint8_t *mac_addr) { struct port_info *pi = arg; - ADAPTER_LOCK(pi->adapter); + PORT_LOCK(pi); pi->macaddr_cnt--; - ADAPTER_UNLOCK(pi->adapter); + PORT_UNLOCK(pi); return (0); } @@ -868,7 +833,7 @@ t4_fill_group(void *arg, mac_ring_type_t rtype, const int rg_index, infop->mgi_stop = NULL; infop->mgi_addmac = t4_addmac; infop->mgi_remmac = t4_remmac; - infop->mgi_count = pi->nrxq; + infop->mgi_count = pi->rxq_count; break; } case MAC_RING_TYPE_TX: @@ -879,52 +844,45 @@ t4_fill_group(void *arg, mac_ring_type_t rtype, const int rg_index, } static int -t4_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num) +t4_ring_rx_start(mac_ring_driver_t rh, uint64_t mr_gen_num) { struct sge_rxq *rxq = (struct sge_rxq *)rh; + t4_sge_iq_t *iq = &rxq->iq; - RXQ_LOCK(rxq); +
IQ_LOCK(iq); rxq->ring_gen_num = mr_gen_num; - RXQ_UNLOCK(rxq); + IQ_UNLOCK(iq); + return (0); } -/* - * Enable interrupt on the specificed rx ring. - */ int t4_ring_intr_enable(mac_intr_handle_t intrh) { struct sge_rxq *rxq = (struct sge_rxq *)intrh; - struct sge_iq *iq = &rxq->iq; + t4_sge_iq_t *iq = &rxq->iq; + + IQ_LOCK(iq); + iq->tsi_flags &= ~IQ_POLLING; + t4_iq_gts_update(iq, iq->tsi_gts_rearm, 0); + IQ_UNLOCK(iq); - RXQ_LOCK(rxq); - iq->polling = 0; - iq->state = IQS_IDLE; - t4_iq_gts_update(iq, iq->intr_params, 0); - RXQ_UNLOCK(rxq); return (0); } -/* - * Disable interrupt on the specificed rx ring. - */ int t4_ring_intr_disable(mac_intr_handle_t intrh) { struct sge_rxq *rxq = (struct sge_rxq *)intrh; - struct sge_iq *iq; + t4_sge_iq_t *iq = &rxq->iq; + IQ_LOCK(iq); /* * Nothing to be done here WRT the interrupt, as it will not fire until * re-enabled through the t4_iq_gts_update() in t4_ring_intr_enable(). */ - - iq = &rxq->iq; - RXQ_LOCK(rxq); - iq->polling = 1; - iq->state = IQS_BUSY; - RXQ_UNLOCK(rxq); + iq->tsi_flags |= IQ_POLLING; + IQ_UNLOCK(iq); return (0); } @@ -933,17 +891,17 @@ mblk_t * t4_poll_ring(void *arg, int n_bytes) { struct sge_rxq *rxq = (struct sge_rxq *)arg; - mblk_t *mp = NULL; ASSERT(n_bytes >= 0); if (n_bytes == 0) return (NULL); - RXQ_LOCK(rxq); - mp = t4_ring_rx(rxq, n_bytes); - RXQ_UNLOCK(rxq); - - return (mp); + struct t4_poll_req req = { + .tpr_byte_budget = n_bytes, + .tpr_mp = NULL, + }; + (void) t4_process_rx_iq(&rxq->iq, 0, &req); + return (req.tpr_mp); } /* @@ -956,11 +914,11 @@ t4_rx_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val) switch (stat) { case MAC_STAT_RBYTES: - *val = rxq->rxbytes; + *val = rxq->stats.rxbytes; break; case MAC_STAT_IPACKETS: - *val = rxq->rxpkts; + *val = rxq->stats.rxpkts; break; default: @@ -980,12 +938,12 @@ t4_tx_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val) struct sge_txq *txq = (struct sge_txq *)rh; switch (stat) { - case MAC_STAT_RBYTES: - *val = txq->txbytes; + case 
MAC_STAT_OBYTES: + *val = txq->stats.txbytes; break; - case MAC_STAT_IPACKETS: - *val = txq->txpkts; + case MAC_STAT_OPACKETS: + *val = txq->stats.txpkts; break; default: @@ -997,9 +955,8 @@ t4_tx_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val) } /* - * Callback funtion for MAC layer to register all rings - * for given ring_group, noted by group_index. - * Since we have only one group, ring index becomes + * Callback function for MAC layer to register all rings for given ring_group, + * noted by group_index. Since we have only one group, ring index becomes * absolute index. */ void @@ -1007,22 +964,25 @@ t4_fill_ring(void *arg, mac_ring_type_t rtype, const int group_index, const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh) { struct port_info *pi = arg; - mac_intr_t *mintr; + + ASSERT3S(ring_index, >=, 0); switch (rtype) { case MAC_RING_TYPE_RX: { - struct sge_rxq *rxq; + struct sge_rxq *rxq = + &pi->adapter->sge.rxq[pi->rxq_start + ring_index]; + mac_intr_t *mintr = &infop->mri_intr; + + ASSERT3S(ring_index, <, pi->rxq_count); - rxq = &pi->adapter->sge.rxq[pi->first_rxq + ring_index]; rxq->ring_handle = rh; infop->mri_driver = (mac_ring_driver_t)rxq; - infop->mri_start = t4_ring_start; + infop->mri_start = t4_ring_rx_start; infop->mri_stop = NULL; infop->mri_poll = t4_poll_ring; infop->mri_stat = t4_rx_stat; - mintr = &infop->mri_intr; mintr->mi_handle = (mac_intr_handle_t)rxq; mintr->mi_enable = t4_ring_intr_enable; mintr->mi_disable = t4_ring_intr_disable; @@ -1031,8 +991,12 @@ t4_fill_ring(void *arg, mac_ring_type_t rtype, const int group_index, } case MAC_RING_TYPE_TX: { struct sge_txq *txq = - &pi->adapter->sge.txq[pi->first_txq + ring_index]; + &pi->adapter->sge.txq[pi->txq_start + ring_index]; + + ASSERT3S(ring_index, <, pi->txq_count); + txq->ring_handle = rh; + infop->mri_driver = (mac_ring_driver_t)txq; infop->mri_start = NULL; infop->mri_stop = NULL; @@ -1041,21 +1005,11 @@ t4_fill_ring(void *arg, mac_ring_type_t rtype, const int
group_index, break; } default: - ASSERT(0); + panic("unexpected ring type: %d", rtype); break; } } -mblk_t * -t4_mc_tx(void *arg, mblk_t *m) -{ - struct port_info *pi = arg; - struct adapter *sc = pi->adapter; - struct sge_txq *txq = &sc->sge.txq[pi->first_txq]; - - return (t4_eth_tx(txq, m)); -} - static int t4_mc_transceiver_info(void *arg, uint_t id, mac_transceiver_info_t *infop) { @@ -1140,10 +1094,7 @@ t4_port_led_set(void *arg, mac_led_mode_t mode, uint_t flags) return (ENOTSUP); } - ADAPTER_LOCK(sc); rc = -t4_identify_port(sc, sc->mbox, pi->viid, val); - ADAPTER_UNLOCK(sc); - return (rc); } @@ -1152,8 +1103,6 @@ t4_mc_getcapab(void *arg, mac_capab_t cap, void *data) { struct port_info *pi = arg; boolean_t status = B_TRUE; - mac_capab_transceiver_t *mct; - mac_capab_led_t *mcl; switch (cap) { case MAC_CAPAB_HCKSUM: @@ -1161,8 +1110,9 @@ t4_mc_getcapab(void *arg, mac_capab_t cap, void *data) uint32_t *d = data; *d = HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM | HCKSUM_INET_FULL_V6; - } else + } else { status = B_FALSE; + } break; case MAC_CAPAB_LSO: @@ -1175,21 +1125,18 @@ t4_mc_getcapab(void *arg, mac_capab_t cap, void *data) LSO_TX_BASIC_TCP_IPV6; d->lso_basic_tcp_ipv4.lso_max = 65535; d->lso_basic_tcp_ipv6.lso_max = 65535; - } else + } else { status = B_FALSE; + } break; case MAC_CAPAB_RINGS: { mac_capab_rings_t *cap_rings = data; - if (!pi->adapter->props.multi_rings) { - status = B_FALSE; - break; - } switch (cap_rings->mr_type) { case MAC_RING_TYPE_RX: cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; - cap_rings->mr_rnum = pi->nrxq; + cap_rings->mr_rnum = pi->rxq_count; cap_rings->mr_gnum = 1; cap_rings->mr_rget = t4_fill_ring; cap_rings->mr_gget = t4_fill_group; @@ -1198,7 +1145,7 @@ t4_mc_getcapab(void *arg, mac_capab_t cap, void *data) break; case MAC_RING_TYPE_TX: cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; - cap_rings->mr_rnum = pi->ntxq; + cap_rings->mr_rnum = pi->txq_count; cap_rings->mr_gnum = 0; cap_rings->mr_rget = t4_fill_ring; 
cap_rings->mr_gget = NULL; @@ -1207,20 +1154,24 @@ t4_mc_getcapab(void *arg, mac_capab_t cap, void *data) break; } - case MAC_CAPAB_TRANSCEIVER: - mct = data; + case MAC_CAPAB_TRANSCEIVER: { + mac_capab_transceiver_t *mct = data; mct->mct_flags = 0; mct->mct_ntransceivers = 1; mct->mct_info = t4_mc_transceiver_info; mct->mct_read = t4_mc_transceiver_read; break; - case MAC_CAPAB_LED: - mcl = data; + } + + case MAC_CAPAB_LED: { + mac_capab_led_t *mcl = data; + mcl->mcl_flags = 0; mcl->mcl_modes = MAC_LED_DEFAULT | MAC_LED_IDENT; mcl->mcl_set = t4_port_led_set; break; + } default: status = B_FALSE; /* cap not supported */ @@ -1229,28 +1180,22 @@ t4_mc_getcapab(void *arg, mac_capab_t cap, void *data) return (status); } -static void -t4_mac_link_caps_to_flowctrl(fw_port_cap32_t caps, link_flowctrl_t *fc) +static link_flowctrl_t +t4_mac_link_caps_to_flowctrl(fw_port_cap32_t caps) { - u8 pause_tx = 0, pause_rx = 0; - - if (caps & FW_PORT_CAP32_FC_TX) - pause_tx = 1; - - if (caps & FW_PORT_CAP32_FC_RX) - pause_rx = 1; - - if (pause_rx & pause_tx) - *fc = LINK_FLOWCTRL_BI; - else if (pause_tx) - *fc = LINK_FLOWCTRL_TX; - else if (pause_rx) - *fc = LINK_FLOWCTRL_RX; - else - *fc = LINK_FLOWCTRL_NONE; + switch (caps & (FW_PORT_CAP32_FC_TX | FW_PORT_CAP32_FC_RX)) { + case (FW_PORT_CAP32_FC_TX | FW_PORT_CAP32_FC_RX): + return (LINK_FLOWCTRL_BI); + case FW_PORT_CAP32_FC_TX: + return (LINK_FLOWCTRL_TX); + case FW_PORT_CAP32_FC_RX: + return (LINK_FLOWCTRL_RX); + default: + return (LINK_FLOWCTRL_NONE); + } } -static int +static void t4_mac_flowctrl_to_link_caps(struct port_info *pi, link_flowctrl_t fc, fw_port_cap32_t *new_caps) { @@ -1273,7 +1218,7 @@ t4_mac_flowctrl_to_link_caps(struct port_info *pi, link_flowctrl_t fc, if (pi->link_cfg.admin_caps & FW_PORT_CAP32_ANEG) pause |= PAUSE_AUTONEG; - return (t4_link_set_pause(pi, pause, new_caps)); + t4_link_set_pause(pi, pause, new_caps); } static link_fec_t @@ -1297,20 +1242,13 @@ t4_mac_port_caps_to_fec_cap(fw_port_cap32_t caps) 
return (link_fec); } -static void -t4_mac_admin_caps_to_fec_cap(fw_port_cap32_t caps, link_fec_t *fec) -{ - *fec = t4_mac_port_caps_to_fec_cap(caps); -} - -static void -t4_mac_link_caps_to_fec_cap(fw_port_cap32_t caps, link_fec_t *fec) +static link_fec_t +t4_mac_link_caps_to_fec_cap(fw_port_cap32_t caps) { - link_fec_t link_fec; + const link_fec_t link_fec = + t4_mac_port_caps_to_fec_cap(caps & ~FW_PORT_CAP32_FEC_NO_FEC); - caps &= ~FW_PORT_CAP32_FEC_NO_FEC; - link_fec = t4_mac_port_caps_to_fec_cap(caps); - *fec = link_fec ? link_fec : LINK_FEC_NONE; + return (link_fec ? link_fec : LINK_FEC_NONE); } static int @@ -1350,7 +1288,6 @@ out: return (t4_link_set_fec(pi, fec, new_caps)); } -/* ARGSUSED */ static int t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size, const void *val) @@ -1380,8 +1317,8 @@ t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size, break; case MAC_PROP_FLOWCTRL: - rc = t4_mac_flowctrl_to_link_caps(pi, - *(const link_flowctrl_t *)val, &new_caps); + t4_mac_flowctrl_to_link_caps(pi, *(const link_flowctrl_t *)val, + &new_caps); relink = 1; break; @@ -1441,8 +1378,9 @@ t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size, break; } - if (rc != 0) + if (rc != 0) { return (rc); + } if ((pi->flags & TPF_OPEN) != 0) { if (relink != 0) { @@ -1453,6 +1391,7 @@ t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size, if (rc != 0) { cxgb_printf(pi->dip, CE_WARN, "%s link config failed: %d", __func__, rc); + PORT_UNLOCK(pi); return (rc); } } @@ -1465,6 +1404,7 @@ t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size, if (rc != 0) { cxgb_printf(pi->dip, CE_WARN, "set_rxmode failed: %d", rc); + PORT_UNLOCK(pi); return (rc); } } @@ -1519,15 +1459,18 @@ t4_mc_getprop(void *arg, const char *name, mac_prop_id_t id, uint_t size, break; case MAC_PROP_FLOWCTRL: - t4_mac_link_caps_to_flowctrl(lc->link_caps, val); + *(link_flowctrl_t *)val = + 
t4_mac_link_caps_to_flowctrl(lc->link_caps); break; case MAC_PROP_ADV_FEC_CAP: - t4_mac_link_caps_to_fec_cap(lc->link_caps, val); + *(link_fec_t *)val = + t4_mac_link_caps_to_fec_cap(lc->link_caps); break; case MAC_PROP_EN_FEC_CAP: - t4_mac_admin_caps_to_fec_cap(lc->admin_caps, val); + *(link_fec_t *)val = + t4_mac_port_caps_to_fec_cap(lc->admin_caps); break; case MAC_PROP_ADV_100GFDX_CAP: @@ -1703,7 +1646,7 @@ t4_mc_propinfo(void *arg, const char *name, mac_prop_id_t id, } static int -t4_init_synchronized(struct port_info *pi) +t4_port_enable(struct port_info *pi) { struct adapter *sc = pi->adapter; int rc = 0; @@ -1724,9 +1667,8 @@ t4_init_synchronized(struct port_info *pi) PORT_UNLOCK(pi); return (rc); /* error message displayed already */ } - } else { - t4_port_queues_enable(pi); } + t4_port_queues_enable(pi); rc = -t4_set_rxmode(sc, sc->mbox, pi->viid, pi->mtu, 0, 0, 1, 0, false); if (rc != 0) { @@ -1755,41 +1697,41 @@ t4_init_synchronized(struct port_info *pi) cxgb_printf(pi->dip, CE_WARN, "enable_vi failed: %d", rc); goto done; } + pi->flags |= TPF_VI_ENABLED; /* all ok */ pi->flags |= TPF_OPEN; done: PORT_UNLOCK(pi); if (rc != 0) - (void) t4_uninit_synchronized(pi); + (void) t4_port_disable(pi); return (rc); } -/* - * Idempotent. - */ static int -t4_uninit_synchronized(struct port_info *pi) +t4_port_disable(struct port_info *pi) { struct adapter *sc = pi->adapter; - int rc; ADAPTER_LOCK_ASSERT_OWNED(pi->adapter); PORT_LOCK(pi); /* * Disable the VI so that all its data in either direction is discarded - * by the MPS. Leave everything else (the queues, interrupts, and 1Hz - * tick) intact as the TP can deliver negative advice or data that it's - * holding in its RAM (for an offloaded connection) even after the VI is - * disabled. + * by the MPS. Leave everything else (queues, interrupts, etc) so any + * straggling work in flight has a safe place to land. 
*/ - rc = -t4_enable_vi(sc, sc->mbox, pi->viid, false, false); - if (rc != 0) { - cxgb_printf(pi->dip, CE_WARN, "disable_vi failed: %d", rc); - PORT_UNLOCK(pi); - return (rc); + if (pi->flags & TPF_VI_ENABLED) { + const int rc = + -t4_enable_vi(sc, sc->mbox, pi->viid, false, false); + if (rc != 0) { + cxgb_printf(pi->dip, CE_WARN, + "disable_vi failed: %d", rc); + PORT_UNLOCK(pi); + return (rc); + } + pi->flags &= ~TPF_VI_ENABLED; } t4_port_queues_disable(pi); @@ -1833,8 +1775,8 @@ t4_propinfo_priv(struct port_info *pi, const char *name, mac_prop_info_handle_t ph) { struct adapter *sc = pi->adapter; - struct driver_properties *dp = &sc->props; - struct link_config *lc = &pi->link_cfg; + const struct driver_properties *dp = &sc->props; + const struct link_config *lc = &pi->link_cfg; const t4_priv_prop_t *prop = t4_priv_prop_match(name); if (prop == NULL || !t4_priv_prop_supported(pi, prop)) { @@ -1844,18 +1786,16 @@ t4_propinfo_priv(struct port_info *pi, const char *name, int v = 0; switch (prop->tpp_id) { case T4PROP_FW_TMR: - v = t4_convert_holdoff_timer(sc, sc->props.fwq_tmr_idx); + v = t4_convert_holdoff_timer(sc, dp->fwq_tmr_idx); break; case T4PROP_FW_PKTC: - v = t4_convert_holdoff_pktcnt(sc, sc->props.fwq_pktc_idx); + v = t4_convert_holdoff_pktcnt(sc, dp->fwq_pktc_idx); break; case T4PROP_RX_TMR: - v = t4_convert_holdoff_timer(sc, t4_port_is_10xg(pi) ? - dp->tmr_idx_10g : dp->tmr_idx_1g); + v = t4_convert_holdoff_timer(sc, dp->ethq_tmr_idx); break; case T4PROP_RX_PKTC: - v = t4_convert_holdoff_pktcnt(sc, t4_port_is_10xg(pi) ? 
- dp->pktc_idx_10g : dp->pktc_idx_1g); + v = t4_convert_holdoff_pktcnt(sc, dp->ethq_pktc_idx); break; case T4PROP_TX_TMR: v = t4_convert_dbq_timer(sc, dp->dbq_timer_idx); @@ -1892,7 +1832,6 @@ t4_getprop_priv(struct port_info *pi, const char *name, uint_t size, void *val) return (ENOTSUP); } - PORT_LOCK(pi); int v = 0; switch (prop->tpp_id) { case T4PROP_FW_TMR: @@ -1923,10 +1862,8 @@ t4_getprop_priv(struct port_info *pi, const char *name, uint_t size, void *val) v = (lc->link_caps & FW_PORT_CAP32_FC_RX) ? 1 : 0; break; default: - PORT_UNLOCK(pi); return (ENOTSUP); } - PORT_UNLOCK(pi); (void) snprintf(val, size, "%d", v); return (0); @@ -2000,7 +1937,6 @@ t4_choose_dbq_timer(struct adapter *sc, uint_t target_us) return (chosen_idx); } - static int t4_setprop_priv(struct port_info *pi, const char *name, const void *val) { @@ -2028,7 +1964,7 @@ t4_setprop_priv(struct port_info *pi, const char *name, const void *val) switch (prop->tpp_id) { case T4PROP_FW_TMR: { - struct sge_iq *fwq = &sc->sge.fwq; + t4_sge_iq_t *fwq = &sc->sge.fwq; const uint8_t idx = t4_choose_holdoff_timer(sc, MAX(0, v)); IQ_LOCK(fwq); @@ -2039,7 +1975,7 @@ t4_setprop_priv(struct port_info *pi, const char *name, const void *val) break; } case T4PROP_FW_PKTC: { - struct sge_iq *fwq = &sc->sge.fwq; + t4_sge_iq_t *fwq = &sc->sge.fwq; const int8_t idx = t4_choose_holdoff_pktcnt(sc, (int)v); IQ_LOCK(fwq); @@ -2079,7 +2015,7 @@ t4_setprop_priv(struct port_info *pi, const char *name, const void *val) int i; struct sge_txq *txq; for_each_txq(pi, i, txq) { - struct sge_eq *eq = &txq->eq; + t4_sge_eq_t *eq = &txq->eq; EQ_LOCK(eq); t4_eq_update_dbq_timer(eq, pi); @@ -2133,7 +2069,7 @@ t4_setprop_priv(struct port_info *pi, const char *name, const void *val) PORT_LOCK(pi); if ((pi->flags & TPF_OPEN) != 0) { for_each_rxq(pi, i, rxq) { - struct sge_iq *iq = &rxq->iq; + t4_sge_iq_t *iq = &rxq->iq; IQ_LOCK(iq); t4_iq_update_intr_cfg(iq, pi->tmr_idx, @@ -2170,15 +2106,6 @@ t4_setprop_priv(struct port_info *pi, const 
char *name, const void *val) return (0); } -void -t4_mc_cb_init(struct port_info *pi) -{ - if (pi->adapter->props.multi_rings) - pi->mc = &t4_m_ring_callbacks; - else - pi->mc = &t4_m_callbacks; -} - void t4_os_link_changed(struct adapter *sc, int idx, int link_stat) { @@ -2187,18 +2114,8 @@ t4_os_link_changed(struct adapter *sc, int idx, int link_stat) mac_link_update(pi->mh, link_stat ? LINK_STATE_UP : LINK_STATE_DOWN); } -/* ARGSUSED */ -void -t4_mac_rx(struct port_info *pi, struct sge_rxq *rxq, mblk_t *m) -{ - mac_rx(pi->mh, NULL, m); -} - void t4_mac_tx_update(struct port_info *pi, struct sge_txq *txq) { - if (pi->adapter->props.multi_rings) - mac_tx_ring_update(pi->mh, txq->ring_handle); - else - mac_tx_update(pi->mh); + mac_tx_ring_update(pi->mh, txq->ring_handle); } diff --git a/usr/src/uts/common/io/cxgbe/t4nex/t4_nexus.c b/usr/src/uts/common/io/cxgbe/t4nex/t4_nexus.c index ad2fda1caa..83b4c8f94f 100644 --- a/usr/src/uts/common/io/cxgbe/t4nex/t4_nexus.c +++ b/usr/src/uts/common/io/cxgbe/t4nex/t4_nexus.c @@ -39,59 +39,526 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_extra_regs.h" +/* + * Nexus driver for Chelsio Terminator Network Adapters (T4/T5/T6) + * + * This driver supports the Chelsio Terminator series of network adapters + * starting with the T4 generation and onward. These adapters present a "unified + * wire" for managing traditional L2 Ethernet traffic alongside a variety of + * stateful offloads including the usual TCP/UDP protocols along with storage + * technology like iSCSI, FCoE, NVMe over fabrics, and others. All of these + * features coexist on a single ASIC controlled by a single firmware image, thus + * the "unified wire". 
While these adapters provide many offload technologies, + * this driver remains focused on providing L2 Ethernet services as presented by + * the GLDv3/mac framework. In short, this consists of presenting the device as + * groups of rings with filtering and steering capabilities along with stateless + * offloads including checksums and LSO. This nexus driver does not preclude the + * support of the stateful offload features, but supporting them requires + * additional work both inside this driver along with general operating system + * enhancements. + * + * Naming & Terminology + * -------------------- + * + * CPL: + * + * Chelsio Protocol Language messages. We use these to wrap network data for + * Tx and Rx, this wrapping of packets in CPL is referred to by Chelsio as + * "tunneled" data. Not to be confused with the more general network + * tunneling also known as encapsulation (e.g. IP tunneling, VXLAN, etc). + * + * Flit: + * + * A 64-bit (8 byte) quantity. The Chelsio documentation and code divides + * communication structures into units of flits. For example, a firmware + * command may consist of up to 8 flits (8-bytes x 8 = 64 bytes) where the + * command header is always made up of the first two flits and the remaining + * 6 may be used for variable payload data. + * + * Module/Block: + * + * The T4 is comprised of various modules (also referred to as a "block" or + * "engine" in some contexts) which work together to provide the services + * offered by the chip. For example, the Scatter-Gather Engine (SGE) module + * provides the DMA communications used to send and receive traffic. + * + * T4: + * + * The short name to represent any Chelsio Terminator ASIC from the T4 and + * onward. This includes the T4, T5, and T6 line of parts. + * + * Tunneled Traffic: + * + * The Chelsio documentation often refers to sending or receiving "tunneled" + * traffic, but it's not referring to the traditional networking terminology + * of encapsulated data.
Rather, it is referring to traffic that is + * sent/received in a non-offload capacity. It's called "tunneled" because + * the data is wrapped/"tunneled" in Work Requests and CPL messages. This + * driver deals purely in tunneled traffic as it makes no use of stateful + * offloads. + * + * ULPTX + * + * The Upper Layer Processing Transmit module handles DMA access related to + * egress traffic. + * + * Work Request (WR) + * + * Work Requests are commands and data descriptors used to send Tx packets. + * + * Communication + * ------------- + * + * Before any requests can be made or any data can be transmitted we must first + * establish communication with the device. The Chelsio Terminator ASIC, or T4 + * for short, presents four primary methods of communication between the driver + * and itself. + * + * 1. Registers: read/write simple values or bitwise data over PIO + * + * 2. Mailboxes: synchronized request/reply structured data over PIO + * + * 3. Queues: DMA memory of structured data for control and data plane + * + * 4. Interrupts: MSI/MSI-X interrupts for indicating queue status updates or + * asynchronous events from the firmware + * + * The first access we have is to the registers via our BAR0 mapping. These + * registers provide control and configuration over many aspects of the + * different modules that make up the T4. + * + * Using the registers we then establish a mailbox which provides structured + * communication in the form of request/reply commands to the firmware. Both of + * these methods use Programmed I/O which is fine for administrative control, + * but inadequate for the latency and throughput demands of the datapath and its + * associated control plane. + * + * For the datapath we use the registers and mailbox to establish queues of DMA + * memory for transmitting and receiving data. Queues deal in Work Requests + * (WR), Chelsio Protocol Language messages (CPL), and Freelist buffer pointers + * (FL).
These data structures may subsequently point to other DMA memory + * (buffers) that hold the data to be transmitted or received along with its + * associated software descriptors. + * + * Finally, the T4 provides various types of interrupt control to asynchronously + * signal the driver of conditions such as errors, firmware events, and datapath + * (queue) synchronization via status updates (cidx/pidx). + * + * While nothing precludes the driver from directly consuming these forms of + * communication, most of the interface with the T4 is currently provided by the + * "common code" interfaces. This common code is, nominally, code shared between + * the various operating systems for interacting with the T4. + * + * Queues (Rings) + * -------------- + * + * Queues are circular buffers of DMA memory used to share structured data often + * referred to as a "descriptor". These circular buffers are also commonly + * called rings, where each entry in the ring is a descriptor used for locating + * and describing data that is meant to be transmitted across or received from + * the network device. + * + * The T4 queues are used in this manner, to share descriptors between the + * driver and device, but their unit of synchronization is not technically a + * descriptor. Rather, a queue is made up of a number of "host credits". The + * size of a host credit (sometimes also called an "entry" or "descriptor" in + * the code) depends on the type of queue and how it is configured. This + * difference in terminology between "host credit" vs. "descriptor" is mostly + * pertinent to Egress Queues, which always have 64-byte (8-flit) host credits. + * Those host credits are used to pass variable-sized Work Requests (WR), the + * structure which actually acts as the "descriptor", which may be smaller or + * larger than a single credit.
Ingress Queues (IQ) also have variable-sized + * entries, but the size is determined at queue creation time and is uniform for + * each entry; therefore IQ entries can be called credits, entries, or + * descriptors without any real confusion. The official Chelsio documentation + * also uses mixed terminology, so it's important to keep that in mind. However, + * regardless of how many credits a descriptor requires, communication always + * occurs in units of whole credits. A good way to frame this is that queues + * provide logical rings of descriptors (WRs, CPLs, FLs) on top of physical + * units of host credits. + * + * There are different types of queues for different purposes, but they are all + * variations of either an Ingress Queue (IQ) or Egress Queue (EQ). As the names + * suggest, a queue is a unidirectional communication channel: one is the + * producer and the other side is a consumer. The Ingress Queue provides + * communication from T4 (producer) to driver (consumer), and the Egress Queue + * provides communication from the driver (producer) to the T4 (consumer). + * + * The producer/consumer synchronize communication in units of host credits. The + * producer tracks its next host credit to write under the producer index + * (pidx), and the consumer tracks its next host credit to read under the + * consumer index (cidx). These values are kept in sync through means such as + * doorbells (DB), Go-To-Sleep updates (GTS), and interrupts carrying CPL + * messages (e.g. CPL_SGE_EGR_UPDATE). + * + * If you read the Terminator Programmer's Guide you will find discussion about + * the queue's "context". This is described as an area of memory that dictates + * various features and behavior of the queue. While this queue context may at + * one point have been programmed directly, it no longer is.
Rather, the various + * aspects of queue behavior are controlled by parameters passed during the + * queue creation firmware commands, along with other mechanisms such as + * registers. + * + * Each type of queue also has a "status page" which may optionally be updated + * with cidx or pidx updates. For EQs this page consumes 1 or 2 credits at the + * end of the queue. For IQs it consumes 1 entry at the end of the queue. + * + * We use EQs to create Tx rings and IQs (plus FLs) to create Rx rings. We + * create the same number of Tx and Rx queues. So if we have 32 Tx queues, we + * will also have 32 Rx queues. The number of queues created is based on the + * port speed. The association from speed to queue count can be found in the + * t4_queue_counts array. + * + * Egress Queues (EQ) + * ------------------ + * + * Egress Queues (EQ) provide communication from driver to T4. The driver writes + * (produces) descriptors to the queue using one or more host credits. It + * notifies the T4 of these new outstanding host credits by updating its pidx + * via a doorbell. As new outstanding credits arrive via the doorbell the T4 + * reads (consumes) them to determine what types of descriptors have been sent + * along with their content. As the T4 consumes host credits it notifies the + * driver with a programmable combination of status page updates, CPL messages, + * and interrupts. + * + * All EQs use a host credit size of 8 flits (64 bytes). The driver uses these + * host credits to send Work Requests (WR) to the T4. + * + * A WR is variable in size and may be smaller or larger than a single host + * credit, but communication is always in whole units of credits. It is legal + * for a WR to span across the end of the queue and wrap around, but the + * contents of the WR may dictate that the wrap-around happens only at certain + * offsets within the descriptor.
A WR may be 16 to 512 bytes long, but must + * always begin at the start of a host credit, thus all WRs must start at a + * 64-byte aligned address. + * + * At this time the only WRs we use are FW_ETH_TX_PKT_WR and FW_ETH_TX_PKTS_WR. + * + * Ingress Queues (IQ) + * ------------------- + * + * Ingress Queues (IQ) provide communication from T4 to driver. The T4 produces + * queue entries for the driver to consume. Unlike EQs, data passed in IQs is + * always done as fixed-size entries. That is, each entry in the IQ takes up + * exactly one credit, and that credit size is determined at creation time. So + * in that sense you could think of an IQ entry as a descriptor. However, these + * entries contain different types of data of variable lengths (within the + * bounds of the entry/credit size). There are four possible entry sizes, and + * the entry size dictates the possible messages an IQ can hold. The possible + * sizes are 2 flits (16 bytes), 4 flits (32 bytes), 8 flits (64 bytes), and 16 + * flits (128 bytes). Depending on the size, each entry may contain a Freelist + * buffer completion, CPL message, or a forwarded interrupt destined for another + * IQ. Which size to use depends on the use case of the IQ. + * + * Currently we make use of the 64-byte entry size exclusively. + * + * Freelists (FL) + * -------------- + * + * A freelist (FL) is a type of EQ used for providing (producing) buffers for + * the purpose of holding received network data for an associated IQ. The driver + * produces pointers to DMA data buffers and the associated IQ consumes them as + * data is received by the device. A freelist is always associated with an IQ; a + * freelist is never used on its own. An IQ, however, may have no FL associated + * with it; such is the case for event IQs and interrupt forwarding IQs. An Rx + * IQ must have one or two FLs associated with it used to store the incoming + * packet headers and payload.
The use of two FLs is for when "header splitting" + * is enabled: where the headers are placed in one buffer and the payload is + * placed in the other. Only the first 1024 IQs may have FLs associated with + * them. + * + * A freelist is always made up of buffer "pointers". Each buffer pointer is 1 + * flit (8 bytes) in size and points to DMA memory used to hold packet data. The + * lowest four bits of the pointer are used as an index into the freelist buffer + * size array, allowing up to 16 different buffer sizes. This implies that each + * FL buffer pointer must be at least 16-byte aligned. Each pointer may use a + * different size. Since EQ communication must happen in units of host credits, + * and an EQ host credit is 8 flits, it means that the driver must always + * produce 8 FL buffer pointers per credit. If the driver cannot produce 8 + * buffer pointers, the rest of the credit may be filled with zero-sized + * pointers ("null" or "zero" buffer) which is to say their size index points to + * a zero-value entry in the array. + * + * The diagram below depicts how the FL buffer pointer indexes into the + * SGE_FL_BUFFER_SZ[N] array. + * + * +-------------------+-------------------------+ + * | Buffer Ptr [63:4] | SGE_FL_BUFFER_SIZE[3:0] | + * +-------------------+-------------------------+ + * | + * +---------------------+ + * v + * +--------------------+--------------------+ + * | SGE_FL_BUFFER_SZ0 | 0 | "zero" buffer + * +--------------------+--------------------+ + * | SGE_FL_BUFFER_SZ1 | 4096 | 4K buffer + * +--------------------+--------------------+ + * . + * . + * . + * +--------------------+--------------------+ + * | SGE_FL_BUFFER_SZ15 | 16384 | 16K buffer + * +--------------------+--------------------+ + * + * FL buffers may have "packing" enabled where a single buffer may be used for + * multiple packets. This requires that the driver keep track of the current + * offset within the current FL buffer.
When a new buffer is required by the + * device, because the next packet will not fit in the remaining space of the + * current buffer, it will consume a new buffer and set a bit in the IQ + * completion entry to notify the driver. At this point the driver updates its + * cidx and restarts the offset at zero. + * + * If packing is not enabled each new packet starts at a new buffer. + * + * This driver currently sets the FL buffer size to 8192 (rx_buf_size) and + * enables packing. + * + * Doorbells, GTS messages, and Interrupts + * --------------------------------------- + * + * The driver and T4 need some way to communicate updates to the pidx/cidx + * values of their queues. To achieve this goal, the driver uses a combination + * of doorbells, GTS messages, status pages, and interrupts. + * + * Doorbells + * --------- + * + * The driver informs the T4 of new EQ credits by way of a "doorbell" (DB). A + * doorbell is a register write directed towards a single queue. The doorbell + * carries a priority and an incremental update to the pidx value. There are two + * types of doorbells: + * + * 1. Kernel Space doorbells (KDB) which use BAR0. + * 2. User Space doorbells (UDB) which use BAR2. + * + * The "user space" doorbells, while useful for kernel-bypass networking, are + * also used for regular in-kernel networking. They divide the queue doorbell + * space into multiple 128 byte segments versus KDB's single address for all + * queues. They also provide the ability to perform Write-Combining Work + * Requests (DOORBELL_WCWR) and Write-Combining Doorbells (DOORBELL_UDBWC). The + * WCWR allows you to send a single credit as one write and avoid the need for + * the T4 to DMA the credit's contents (a WR or FL buffer pointers) from host + * memory. We currently make use of WCWR for the Tx datapath, but not for + * writing freelist descriptors. + * + * There is some more discussion of doorbells at the t4_doorbells_t definition + * in adapter.h.
+ * + * EQ Status Updates + * ----------------- + * + * This section covers how EQ status updates work. While an FL is technically an + * EQ it makes no use of these mechanisms because the use of FL buffers (cidx) + * is tracked implicitly as CPL Rx messages arrive on the associated IQ. + * + * The driver can track the EQ cidx either by reading the EQ status page or by + * asking for a notification via an IQ. This is delivered by way of a + * CPL_SGE_EGR_UPDATE message. Furthermore, if the IQ this message is destined + * for has interrupts enabled, an interrupt is generated upon delivery of the + * message. The EQ status page update and the delivery of this message are + * controlled by several factors. + * + * 1. The EQ context field 'CIDXFlushThresh' (FW_EQ_ETH_CMD.cidxfthresh) + * indicates how many consumed credits must be outstanding before the T4 + * generates a cidx update (both status page update and CPL message). + * + * 2. The EQ context field 'FCThreshOverride' (FW_EQ_ETH_CMD.cidxfthresho) tells + * the T4 to generate a cidx update anytime cidx==pidx; i.e., when the T4 has + * consumed all outstanding credits. This happens regardless if the cidx + * flush threshold has been reached or not (thus the "override"). This is + * useful for dealing with cases of intermittent transmission where the + * threshold may not be reached in a timely manner. + * + * 3. The DBQ Timer (see TAF_DBQ_TIMER) provides for sending a cidx notification + * anytime the EQ has sat idle (no pidx updates) for a period of time. This + * is preferred to method (2) as it allows batching cidx updates while also + * recycling consumed credits in a timely manner. This is available starting + * with the T6 chip. + * + * 4. The FW_EQ_FLUSH_WR (its own WR on the EQ) allows the driver to request + * either a status page update, EGR update, or both. + * + * 5.
The FW_ETH_TX_PKT_WR and FW_ETH_TX_PKTS_WR, used to send packets, allow + * the driver to request either a status page update, EGR update, or both as + * part of sending the packet. + * + * This driver utilizes both the status page and CPL updates as well as all the + * methods listed above to generate these updates. + * + * GTS Messages + * ------------ + * + * The driver sends a GTS (Go To Sleep) message to the T4 to update the SGE + * about a specific IQ. The message conveys four pieces of information. + * + * 1. The Ingress Queue the update is for. + * + * 2. The current cidx of the driver. + * + * 3. The new timer value for pidx update scheduling (see IQ context + * 'Update_Scheduling' field). + * + * 4. Either a) arming the "Solicited Event" Interrupt or b) setting the new + * value for the IQ context 'Update_Scheduling' field. Which one depends on + * the IQ context 'GTS_Mode' value. + * + * We currently always set 'GTS_Mode=1' which indicates that the GTS 'SEIntArm' + * value (number 4 above) is used to dictate the new value for the + * 'Update_Scheduling' field. + * + * As the driver processes outstanding IQ credits it uses GTS messages to notify + * the T4 of how many credits it has consumed and optionally re-arm the + * timer and packet counter notifications. + * + * The GTS messages, like the EQ Doorbells, have both kernel and user space + * registers. We currently only make use of the kernel space register. + * + * Ingress Queue Generation Bit + * ---------------------------- + * + * Ingress Queues have an alternative method for pidx updates beyond the status + * page update or an explicit CPL message like is done for Ethernet EQs. They + * also provide a generation bit as part of each queue entry (credit) which can + * be used by the driver, after it has received an interrupt indicating new data + * is available, to determine which entries are newly produced by the device.
+ * This method allows you to eschew IQ status page updates altogether, and that + * is how we use IQs both for our firmware queue as well as our Rx data queues. + * + * Freelist Updates + * ---------------- + * + * While an FL is technically an EQ we do not make use of explicit EQ status + * updates to track the FL cidx. Rather, the current FL buffer is tracked + * implicitly by way of the Rx IQ CPL messages generated as part of incoming + * traffic. As new packets come in the SGE writes the data in the current FL + * buffer and writes a new CPL message onto the Rx IQ. These CPL messages allow + * the driver to track which FL buffer is currently in use by the device and + * when to move onto the next FL buffer. + * + * Interrupts + * ---------- + * + * The T4 provides interrupt capability for support of asynchronous + * notifications. The primary uses of interrupts consist of the following. + * + * 1. Notification of new IQ entries (credits) available for consumption by the + * driver. That is, the T4 notifies the host of its latest IQ pidx value + * indicating that there are new credits for host consumption. + * + * 2. Notification of new EQ credits available for production by the driver. + * That is, the T4 notifies the host of its latest EQ cidx value indicating + * that there are new credits available for host production. + * + * 3. Notification of firmware events (also referred to as the "firmware queue" + * or "asynchronous event queue"). + * + * This driver employs three different strategies for assigning interrupts + * depending on the type and number of interrupts available. These strategies + * are listed in order of preference. The solution is chosen by + * t4_cfg_intrs_queues() and the setup is done by t4_setup_intrs().
+ * + * TIP_PER_PORT + * + * The first strategy is used when we have enough MSI/MSI-X interrupts to + * dedicate one to error conditions, one for asynchronous firmware events, + * and at least one for Tx/Rx events on each network port on the adapter. A + * port may have more than one interrupt, in which case its Tx/Rx queue + * events are distributed across those interrupts as evenly as possible. For + * example, given a two-port adapter with eight interrupts, one interrupt + * would be consumed for error conditions, one for firmware events, and the + * remaining six would be divided as three interrupts per port. If each port + * has 32 Rx queues, then two interrupts would be responsible for 11 queues, + * and the third interrupt would be responsible for 10. + * + * The error interrupt vector points to the t4_intr_err() function. Errors + * are delivered via registers and are handled by t4_slow_intr_handler(). + * + * The asynchronous firmware event interrupt points to the t4_intr_fwq() + * function and the events arrive on the firmware queue (sc->sge.fwq). + * + * The per port interrupts point to t4_intr_port_queue() and each port's + * events land on one of the per port event queues (port->intr_iqs). + * + * TIP_ERR_QUEUES + * + * The second strategy is used when we have only two interrupts. In this + * case one of the interrupts is dedicated to errors and the other one is + * shared between the firmware events and the port events (Rx/Tx + * notifications). + * + * In this case the firmware and port events all land on the firmware queue + * which is processed by t4_intr_fwq(). + * + * TIP_SINGLE + * + * The last strategy is for when we have a single interrupt and everything + * needs to share it. In this case the interrupt lands on t4_intr_all() and + * all firmware and port events go to the firmware queue. + * + * The per-port event queues (port->intr_iq) do not receive any network data + * themselves. Rather, they are used for two purposes: + * + * 1.
To handle CPL_SGE_EGR_UPDATE messages; used to notify the driver about the + * device's current cidx in a particular EQ. This is how Tx queues know when + * they reclaim credits used for sending packets. + * + * 2. To handle "forwarded interrupt" notifications; used to notify the driver + * that a particular receive IQ has outstanding credits to read. This is how + * Rx queues know when there are new packets available to read. + */ + static void *t4_soft_state; static kmutex_t t4_adapter_list_lock; static list_t t4_adapter_list; -struct intrs_and_queues { - int intr_type; /* DDI_INTR_TYPE_* */ - int nirq; /* Number of vectors */ - int intr_fwd; /* Interrupts forwarded */ - int ntxq10g; /* # of NIC txq's for each 10G port */ - int nrxq10g; /* # of NIC rxq's for each 10G port */ - int ntxq1g; /* # of NIC txq's for each 1G port */ - int nrxq1g; /* # of NIC rxq's for each 1G port */ -}; - -static unsigned int getpf(struct adapter *sc); -static int prep_firmware(struct adapter *sc); -static int upload_config_file(struct adapter *sc, uint32_t *mt, uint32_t *ma); -static int partition_resources(struct adapter *sc); -static int adap__pre_init_tweaks(struct adapter *sc); -static int get_params__pre_init(struct adapter *sc); -static int get_params__post_init(struct adapter *sc); -static int set_params__post_init(struct adapter *); -static void t4_setup_adapter_memwin(struct adapter *sc); -static int validate_mt_off_len(struct adapter *, int, uint32_t, int, - uint32_t *); +typedef enum t4_port_speed { + TPS_1G, + TPS_10G, + TPS_25G, + TPS_40G, + TPS_50G, + TPS_100G, + TPS_200G, + TPS_400G, +} t4_port_speed_t; + +static uint_t t4_getpf(struct adapter *); +static int t4_prep_firmware(struct adapter *); +static int t4_upload_config_file(struct adapter *, uint32_t *, uint32_t *); +static int t4_partition_resources(struct adapter *); +static int t4_init_adap_tweaks(struct adapter *); +static int t4_init_get_params_pre(struct adapter *); +static int t4_init_get_params_post(struct 
adapter *); +static int t4_init_set_params(struct adapter *); +static void t4_setup_adapter_memwin(struct adapter *); static uint32_t t4_position_memwin(struct adapter *, int, uint32_t); -static int init_driver_props(struct adapter *sc, struct driver_properties *p); -static int remove_extra_props(struct adapter *sc, int n10g, int n1g); -static int cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, - struct intrs_and_queues *iaq); -static int add_child_node(struct adapter *sc, int idx); -static int remove_child_node(struct adapter *sc, int idx); -static kstat_t *setup_kstats(struct adapter *sc); -static kstat_t *setup_wc_kstats(struct adapter *); -static int update_wc_kstats(kstat_t *, int); -static int t4_port_full_uninit(struct port_info *); +static void t4_init_driver_props(struct adapter *); +static int t4_cfg_intrs_queues(struct adapter *); +static int t4_setup_intrs(struct adapter *); +static int t4_add_child_node(struct adapter *, uint_t); +static int t4_remove_child_node(struct adapter *, uint_t); +static kstat_t *t4_setup_kstats(struct adapter *); +static kstat_t *t4_setup_wc_kstats(struct adapter *); +static void t4_port_full_uninit(struct port_info *); +static t4_port_speed_t t4_port_speed(const struct port_info *); static int t4_temperature_read(void *, sensor_ioctl_scalar_t *); static int t4_voltage_read(void *, sensor_ioctl_scalar_t *); + static const ksensor_ops_t t4_temp_ops = { .kso_kind = ksensor_kind_temperature, .kso_scalar = t4_temperature_read @@ -113,7 +580,7 @@ static ddi_ufm_ops_t t4_ufm_ops = { .ddi_ufm_op_getcaps = t4_ufm_getcaps }; -/* ARGSUSED */ + static int t4_devo_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp) { @@ -168,13 +635,9 @@ static int t4_devo_detach(dev_info_t *, ddi_detach_cmd_t); static int t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { - struct adapter *sc = NULL; - struct sge *s; - int i, instance, rc = DDI_SUCCESS, rqidx, tqidx, q; - int irq = 0, nxg = 0, n1g = 0; + int i = 0; + int 
rc = DDI_SUCCESS; char name[16]; - struct driver_properties *prp; - struct intrs_and_queues iaq; ddi_device_acc_attr_t da = { .devacc_attr_version = DDI_DEVICE_ATTR_V0, .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC, @@ -192,7 +655,7 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) /* * Allocate space for soft state. */ - instance = ddi_get_instance(dip); + const int instance = ddi_get_instance(dip); rc = ddi_soft_state_zalloc(t4_soft_state, instance); if (rc != DDI_SUCCESS) { cxgb_printf(dip, CE_WARN, @@ -200,21 +663,23 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) return (DDI_FAILURE); } - sc = ddi_get_soft_state(t4_soft_state, instance); + struct adapter *sc = ddi_get_soft_state(t4_soft_state, instance); sc->dip = dip; sc->dev = makedevice(ddi_driver_major(dip), instance); mutex_init(&sc->lock, NULL, MUTEX_DRIVER, NULL); cv_init(&sc->cv, NULL, CV_DRIVER, NULL); mutex_init(&sc->sfl_lock, NULL, MUTEX_DRIVER, NULL); - TAILQ_INIT(&sc->sfl); + list_create(&sc->sfl_list, sizeof (struct sge_fl), + offsetof(struct sge_fl, sfl_node)); mutex_init(&sc->mbox_lock, NULL, MUTEX_DRIVER, NULL); - STAILQ_INIT(&sc->mbox_list); + list_create(&sc->mbox_list, sizeof (t4_mbox_waiter_t), + offsetof(t4_mbox_waiter_t, node)); mutex_enter(&t4_adapter_list_lock); list_insert_tail(&t4_adapter_list, sc); mutex_exit(&t4_adapter_list_lock); - sc->pf = getpf(sc); + sc->pf = t4_getpf(sc); if (sc->pf > 8) { rc = EINVAL; cxgb_printf(dip, CE_WARN, @@ -224,8 +689,8 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) sc->mbox = sc->pf; /* Initialize the driver properties */ - prp = &sc->props; - (void) init_driver_props(sc, prp); + t4_init_driver_props(sc); + struct driver_properties *prp = &sc->props; /* * Enable access to the PCI config space. @@ -273,7 +738,7 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) } else { if (t4_cver_ge(sc, CHELSIO_T5)) { sc->doorbells |= DOORBELL_UDB; - if (prp->wc) { + if (prp->write_combine) { /* * Enable write combining on BAR2. 
This is the * userspace doorbell BAR and is split into 128B @@ -312,15 +777,15 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) t4_setup_adapter_memwin(sc); /* Prepare the firmware for operation */ - rc = prep_firmware(sc); + rc = t4_prep_firmware(sc); if (rc != 0) goto done; /* error message displayed already */ - rc = adap__pre_init_tweaks(sc); + rc = t4_init_adap_tweaks(sc); if (rc != 0) goto done; - rc = get_params__pre_init(sc); + rc = t4_init_get_params_pre(sc); if (rc != 0) goto done; /* error message displayed already */ @@ -336,11 +801,11 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) } } - rc = get_params__post_init(sc); + rc = t4_init_get_params_post(sc); if (rc != 0) goto done; /* error message displayed already */ - rc = set_params__post_init(sc); + rc = t4_init_set_params(sc); if (rc != 0) goto done; /* error message displayed already */ @@ -348,7 +813,6 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) * TODO: This is the place to call t4_set_filter_mode() */ - /* tweak some settings */ t4_write_reg(sc, A_TP_SHIFT_CNT, V_SYNSHIFTMAX(6) | V_RXTSHIFTMAXR1(4) | @@ -400,140 +864,44 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) mutex_init(&pi->lock, NULL, MUTEX_DRIVER, NULL); pi->mtu = ETHERMTU; - if (t4_port_is_10xg(pi)) { - nxg++; - pi->tmr_idx = prp->tmr_idx_10g; - pi->pktc_idx = prp->pktc_idx_10g; - } else { - n1g++; - pi->tmr_idx = prp->tmr_idx_1g; - pi->pktc_idx = prp->pktc_idx_1g; - } + pi->tmr_idx = prp->ethq_tmr_idx; + pi->pktc_idx = prp->ethq_pktc_idx; pi->dbq_timer_idx = prp->dbq_timer_idx; pi->xact_addr_filt = -1; } - (void) remove_extra_props(sc, nxg, n1g); - - rc = cfg_itype_and_nqueues(sc, nxg, n1g, &iaq); - if (rc != 0) + if ((rc = t4_cfg_intrs_queues(sc)) != 0) { goto done; /* error message displayed already */ - - sc->intr_type = iaq.intr_type; - sc->intr_count = iaq.nirq; - - if (sc->props.multi_rings && (sc->intr_type != DDI_INTR_TYPE_MSIX)) { - sc->props.multi_rings = 0; - cxgb_printf(dip, CE_WARN, 
- "Multiple rings disabled as interrupt type is not MSI-X"); } - if (sc->props.multi_rings && iaq.intr_fwd) { - sc->props.multi_rings = 0; - cxgb_printf(dip, CE_WARN, - "Multiple rings disabled as interrupts are forwarded"); - } - - if (!sc->props.multi_rings) { - iaq.ntxq10g = 1; - iaq.ntxq1g = 1; - } - s = &sc->sge; - s->nrxq = nxg * iaq.nrxq10g + n1g * iaq.nrxq1g; - s->ntxq = nxg * iaq.ntxq10g + n1g * iaq.ntxq1g; - s->neq = s->ntxq + s->nrxq; /* the fl in an rxq is an eq */ - s->niq = s->nrxq + 1; /* 1 extra for firmware event queue */ - if (iaq.intr_fwd != 0) - sc->flags |= TAF_INTR_FWD; - s->rxq = kmem_zalloc(s->nrxq * sizeof (struct sge_rxq), KM_SLEEP); - s->txq = kmem_zalloc(s->ntxq * sizeof (struct sge_txq), KM_SLEEP); - s->iqmap = - kmem_zalloc(s->iqmap_sz * sizeof (struct sge_iq *), KM_SLEEP); - s->eqmap = - kmem_zalloc(s->eqmap_sz * sizeof (struct sge_eq *), KM_SLEEP); + const struct t4_intrs_queues *iaq = &sc->intr_queue_cfg; + struct sge_info *sge = &sc->sge; + sge->rxq = + kmem_zalloc(sge->rxq_count * sizeof (struct sge_rxq), KM_SLEEP); + sge->txq = + kmem_zalloc(sge->txq_count * sizeof (struct sge_txq), KM_SLEEP); + sge->iqmap = + kmem_zalloc(sge->iqmap_sz * sizeof (struct sge_iq *), KM_SLEEP); + sge->eqmap = + kmem_zalloc(sge->eqmap_sz * sizeof (struct sge_eq *), KM_SLEEP); sc->intr_handle = - kmem_zalloc(sc->intr_count * sizeof (ddi_intr_handle_t), KM_SLEEP); + kmem_zalloc(iaq->intr_count * sizeof (ddi_intr_handle_t), + KM_SLEEP); /* - * Second pass over the ports. This time we know the number of rx and - * tx queues that each port should get. + * Enable hw checksumming and LSO for all ports by default. + * They can be disabled using ndd (hw_csum and hw_lso). */ - rqidx = tqidx = 0; for_each_port(sc, i) { - struct port_info *pi = sc->port[i]; - - if (pi == NULL) - continue; - - t4_mc_cb_init(pi); - pi->first_rxq = rqidx; - pi->nrxq = (t4_port_is_10xg(pi)) ? iaq.nrxq10g : iaq.nrxq1g; - pi->first_txq = tqidx; - pi->ntxq = (t4_port_is_10xg(pi)) ? 
iaq.ntxq10g : iaq.ntxq1g; - - rqidx += pi->nrxq; - tqidx += pi->ntxq; - - /* - * Enable hw checksumming and LSO for all ports by default. - * They can be disabled using ndd (hw_csum and hw_lso). - */ - pi->features |= (CXGBE_HW_CSUM | CXGBE_HW_LSO); + sc->port[i]->features |= (CXGBE_HW_CSUM | CXGBE_HW_LSO); } - /* - * Setup Interrupts. - */ - - i = 0; - rc = ddi_intr_alloc(dip, sc->intr_handle, sc->intr_type, 0, - sc->intr_count, &i, DDI_INTR_ALLOC_STRICT); - if (rc != DDI_SUCCESS) { - cxgb_printf(dip, CE_WARN, - "failed to allocate %d interrupt(s) of type %d: %d, %d", - sc->intr_count, sc->intr_type, rc, i); + /* Setup Interrupts. */ + if ((rc = t4_setup_intrs(sc)) != DDI_SUCCESS) { goto done; } - ASSERT(sc->intr_count == i); /* allocation was STRICT */ - (void) ddi_intr_get_cap(sc->intr_handle[0], &sc->intr_cap); - (void) ddi_intr_get_pri(sc->intr_handle[0], &sc->intr_pri); - if (sc->intr_count == 1) { - ASSERT(sc->flags & TAF_INTR_FWD); - (void) ddi_intr_add_handler(sc->intr_handle[0], t4_intr_all, sc, - &s->fwq); - } else { - /* Multiple interrupts. The first one is always error intr */ - (void) ddi_intr_add_handler(sc->intr_handle[0], t4_intr_err, sc, - NULL); - irq++; - - /* The second one is always the firmware event queue */ - (void) ddi_intr_add_handler(sc->intr_handle[1], t4_intr, sc, - &s->fwq); - irq++; - /* - * Note that if TAF_INTR_FWD is set then either the NIC rx - * queues or (exclusive or) the TOE rx queueus will be taking - * direct interrupts. - * - * There is no need to check for is_offload(sc) as nofldrxq - * will be 0 if offload is disabled. 
- */ - for_each_port(sc, i) { - struct port_info *pi = sc->port[i]; - struct sge_rxq *rxq; - rxq = &s->rxq[pi->first_rxq]; - for (q = 0; q < pi->nrxq; q++, rxq++) { - (void) ddi_intr_add_handler( - sc->intr_handle[irq], t4_intr, sc, - &rxq->iq); - irq++; - } - } - - } sc->flags |= TAF_INTR_ALLOC; if ((rc = ksensor_create_scalar_pcidev(dip, SENSOR_KIND_TEMPERATURE, @@ -561,17 +929,31 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) } ddi_ufm_update(sc->ufm_hdl); - if ((rc = t4_alloc_fwq(sc)) != 0) { + if ((rc = t4_alloc_evt_iqs(sc)) != 0) { cxgb_printf(dip, CE_WARN, "failed to alloc FWQ: %d", rc); rc = DDI_FAILURE; goto done; } if (sc->intr_cap & DDI_INTR_FLAG_BLOCK) { - (void) ddi_intr_block_enable(sc->intr_handle, sc->intr_count); + rc = ddi_intr_block_enable(sc->intr_handle, iaq->intr_count); + + if (rc != DDI_SUCCESS) { + cxgb_printf(dip, CE_WARN, "failed to enable intr " + "block: %d", rc); + rc = DDI_FAILURE; + goto done; + } } else { - for (i = 0; i < sc->intr_count; i++) - (void) ddi_intr_enable(sc->intr_handle[i]); + for (i = 0; i < iaq->intr_count; i++) { + rc = ddi_intr_enable(sc->intr_handle[i]); + if (rc != DDI_SUCCESS) { + cxgb_printf(dip, CE_WARN, "failed to enable " + "intr %d: %d", i, rc); + rc = DDI_FAILURE; + goto done; + } + } } t4_intr_enable(sc); @@ -588,14 +970,8 @@ t4_devo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) */ t4_dump_version_info(sc); - cxgb_printf(dip, CE_NOTE, "(%d rxq, %d txq total) %d %s.", - rqidx, tqidx, sc->intr_count, - sc->intr_type == DDI_INTR_TYPE_MSIX ? "MSI-X interrupts" : - sc->intr_type == DDI_INTR_TYPE_MSI ? 
"MSI interrupts" : - "fixed interrupt"); - - sc->ksp = setup_kstats(sc); - sc->ksp_stat = setup_wc_kstats(sc); + sc->ksp = t4_setup_kstats(sc); + sc->ksp_stat = t4_setup_wc_kstats(sc); sc->params.drv_memwin = MEMWIN_NIC; done: @@ -612,36 +988,37 @@ done: static int t4_devo_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { - int instance, i; - struct adapter *sc; + int i = 0; struct port_info *pi; - struct sge *s; + struct sge_info *s; if (cmd != DDI_DETACH) return (DDI_FAILURE); - instance = ddi_get_instance(dip); - sc = ddi_get_soft_state(t4_soft_state, instance); + const int instance = ddi_get_instance(dip); + struct adapter *sc = ddi_get_soft_state(t4_soft_state, instance); if (sc == NULL) return (DDI_SUCCESS); + struct t4_intrs_queues *iaq = &sc->intr_queue_cfg; + if (sc->flags & TAF_INIT_DONE) { t4_intr_disable(sc); for_each_port(sc, i) { pi = sc->port[i]; if (pi && pi->flags & TPF_INIT_DONE) - (void) t4_port_full_uninit(pi); + t4_port_full_uninit(pi); } if (sc->intr_cap & DDI_INTR_FLAG_BLOCK) { (void) ddi_intr_block_disable(sc->intr_handle, - sc->intr_count); + iaq->intr_count); } else { - for (i = 0; i < sc->intr_count; i++) + for (i = 0; i < iaq->intr_count; i++) (void) ddi_intr_disable(sc->intr_handle[i]); } - (void) t4_free_fwq(sc); + t4_free_evt_iqs(sc); sc->flags &= ~TAF_INIT_DONE; } @@ -662,9 +1039,9 @@ t4_devo_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) s = &sc->sge; if (s->rxq != NULL) - kmem_free(s->rxq, s->nrxq * sizeof (struct sge_rxq)); + kmem_free(s->rxq, s->rxq_count * sizeof (struct sge_rxq)); if (s->txq != NULL) - kmem_free(s->txq, s->ntxq * sizeof (struct sge_txq)); + kmem_free(s->txq, s->txq_count * sizeof (struct sge_txq)); if (s->iqmap != NULL) kmem_free(s->iqmap, s->iqmap_sz * sizeof (struct sge_iq *)); if (s->eqmap != NULL) @@ -674,21 +1051,39 @@ t4_devo_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) kmem_cache_destroy(s->rxbuf_cache); if (sc->flags & TAF_INTR_ALLOC) { - for (i = 0; i < sc->intr_count; i++) { - (void) 
ddi_intr_remove_handler(sc->intr_handle[i]); - (void) ddi_intr_free(sc->intr_handle[i]); + for (int i = 0; i < iaq->intr_count; i++) { + int rc = ddi_intr_remove_handler(sc->intr_handle[i]); + if (rc != DDI_SUCCESS) { + cxgb_printf(sc->dip, CE_WARN, "failed to " + "remove interrupt handler %d for type: %d " + "plan: %d: %d", i, iaq->intr_type, + iaq->intr_plan, rc); + } + + rc = ddi_intr_free(sc->intr_handle[i]); + if (rc != DDI_SUCCESS) { + cxgb_printf(sc->dip, CE_WARN, "failed to free " + "interrupt %d for type: %d plan: %d: %d", i, + iaq->intr_type, iaq->intr_plan, rc); + + } } sc->flags &= ~TAF_INTR_ALLOC; } if (sc->intr_handle != NULL) { kmem_free(sc->intr_handle, - sc->intr_count * sizeof (*sc->intr_handle)); + iaq->intr_count * sizeof (*sc->intr_handle)); } for_each_port(sc, i) { pi = sc->port[i]; if (pi != NULL) { + if (pi->intr_iqs != NULL) { + kmem_free(pi->intr_iqs, + sizeof (pi->intr_iqs[0]) * + sc->intr_queue_cfg.intr_per_port); + } mutex_destroy(&pi->lock); kmem_free(pi, sizeof (*pi)); } @@ -758,9 +1153,10 @@ t4_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t op, void *arg, switch (op) { case DDI_CTLOPS_REPORTDEV: - pi = ddi_get_parent_data(rdip); - pi->instance = ddi_get_instance(dip); - pi->child_inst = ddi_get_instance(rdip); + if (rdip == NULL) + return (DDI_FAILURE); + cmn_err(CE_CONT, "?t4nexus: %s%d\n", + ddi_driver_name(rdip), ddi_get_instance(rdip)); return (DDI_SUCCESS); case DDI_CTLOPS_INITCHILD: @@ -784,43 +1180,58 @@ t4_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t op, void *arg, } } +/* From a provided "cxgbe@0" string, parse the device number */ +static bool +t4_parse_devnum(const char *devname, uint_t *inst_nump) +{ + const size_t name_sz = strlen(devname) + 1; + char *name_copy = i_ddi_strdup(devname, KM_SLEEP); + + bool res = false; + char *nodename, *addrname = NULL; + i_ddi_parse_name(name_copy, &nodename, &addrname, NULL); + if (addrname == NULL || strcmp(T4_PORT_NAME, nodename) != 0) { + goto done; + } + + 
ulong_t num; + if (ddi_strtoul(addrname, NULL, 10, &num) != 0 || num > UINT_MAX) { + goto done; + } + *inst_nump = (uint_t)num; + res = true; + +done: + kmem_free(name_copy, name_sz); + return (res); +} + static int t4_bus_config(dev_info_t *dip, uint_t flags, ddi_bus_config_op_t op, void *arg, dev_info_t **cdipp) { - int instance, i; - struct adapter *sc; - - instance = ddi_get_instance(dip); - sc = ddi_get_soft_state(t4_soft_state, instance); + struct adapter *sc = + ddi_get_soft_state(t4_soft_state, ddi_get_instance(dip)); if (op == BUS_CONFIG_ONE) { - char *c; - - /* - * arg is something like "cxgb@0" where 0 is the port_id hanging - * off this nexus. - */ + uint_t dev_num; - c = arg; - while (*(c + 1)) - c++; - - /* There should be exactly 1 digit after '@' */ - if (*(c - 1) != '@') + if (!t4_parse_devnum((const char *)arg, &dev_num)) { return (NDI_FAILURE); - - i = *c - '0'; - - if (add_child_node(sc, i) != 0) + } + if (t4_add_child_node(sc, dev_num) != 0) { return (NDI_FAILURE); + } flags |= NDI_ONLINE_ATTACH; } else if (op == BUS_CONFIG_ALL || op == BUS_CONFIG_DRIVER) { + int i; + /* Allocate and bind all child device nodes */ - for_each_port(sc, i) - (void) add_child_node(sc, i); + for_each_port(sc, i) { + (void) t4_add_child_node(sc, (uint_t)i); + } flags |= NDI_ONLINE_ATTACH; } @@ -831,107 +1242,97 @@ static int t4_bus_unconfig(dev_info_t *dip, uint_t flags, ddi_bus_config_op_t op, void *arg) { - int instance, i, rc; - struct adapter *sc; - - instance = ddi_get_instance(dip); - sc = ddi_get_soft_state(t4_soft_state, instance); + struct adapter *sc + = ddi_get_soft_state(t4_soft_state, ddi_get_instance(dip)); - if (op == BUS_CONFIG_ONE || op == BUS_UNCONFIG_ALL || - op == BUS_UNCONFIG_DRIVER) + if (op == BUS_UNCONFIG_ONE || + op == BUS_UNCONFIG_ALL || + op == BUS_UNCONFIG_DRIVER) { flags |= NDI_UNCONFIG; + } - rc = ndi_busop_bus_unconfig(dip, flags, op, arg); + int rc = ndi_busop_bus_unconfig(dip, flags, op, arg); if (rc != 0) return (rc); if (op == 
BUS_UNCONFIG_ONE) { - char *c; - - c = arg; - while (*(c + 1)) - c++; + uint_t dev_num; - if (*(c - 1) != '@') - return (NDI_SUCCESS); - - i = *c - '0'; - - rc = remove_child_node(sc, i); + if (!t4_parse_devnum((const char *)arg, &dev_num)) { + return (NDI_FAILURE); + } + rc = t4_remove_child_node(sc, dev_num); } else if (op == BUS_UNCONFIG_ALL || op == BUS_UNCONFIG_DRIVER) { + uint_t i; - for_each_port(sc, i) - (void) remove_child_node(sc, i); + for_each_port(sc, i) { + (void) t4_remove_child_node(sc, i); + } } return (rc); } -/* ARGSUSED */ static int t4_cb_open(dev_t *devp, int flag, int otyp, cred_t *credp) { struct adapter *sc; - if (otyp != OTYP_CHR) + if (otyp != OTYP_CHR) { return (EINVAL); + } sc = ddi_get_soft_state(t4_soft_state, getminor(*devp)); - if (sc == NULL) + if (sc == NULL) { return (ENXIO); + } return (atomic_cas_uint(&sc->open, 0, EBUSY)); } -/* ARGSUSED */ static int t4_cb_close(dev_t dev, int flag, int otyp, cred_t *credp) { - struct adapter *sc; + struct adapter *sc = ddi_get_soft_state(t4_soft_state, getminor(dev)); - sc = ddi_get_soft_state(t4_soft_state, getminor(dev)); - if (sc == NULL) + if (sc == NULL) { return (EINVAL); + } (void) atomic_swap_uint(&sc->open, 0); return (0); } -/* ARGSUSED */ static int t4_cb_ioctl(dev_t dev, int cmd, intptr_t d, int mode, cred_t *credp, int *rp) { - int instance; - struct adapter *sc; - void *data = (void *)d; - - if (crgetuid(credp) != 0) + if (crgetuid(credp) != 0) { return (EPERM); + } - instance = getminor(dev); - sc = ddi_get_soft_state(t4_soft_state, instance); - if (sc == NULL) + struct adapter *sc = ddi_get_soft_state(t4_soft_state, getminor(dev)); + + if (sc == NULL) { return (EINVAL); + } - return (t4_ioctl(sc, cmd, data, mode)); + return (t4_ioctl(sc, cmd, (void *)d, mode)); } -static unsigned int -getpf(struct adapter *sc) +static uint_t +t4_getpf(struct adapter *sc) { - int rc, *data; - uint_t n, pf; + int *data; + uint_t n; - rc = ddi_prop_lookup_int_array(DDI_DEV_T_ANY, sc->dip, + 
const int rc = ddi_prop_lookup_int_array(DDI_DEV_T_ANY, sc->dip, DDI_PROP_DONTPASS, "reg", &data, &n); if (rc != DDI_SUCCESS) { - cxgb_printf(sc->dip, CE_WARN, - "failed to lookup \"reg\" property: %d", rc); - return (0xff); + return (UINT_MAX); } - pf = PCI_REG_FUNC_G(data[0]); + const uint_t pf = PCI_REG_FUNC_G(data[0]); ddi_prop_free(data); return (pf); @@ -942,21 +1343,12 @@ getpf(struct adapter *sc) * become the master, and reset the device. */ static int -prep_firmware(struct adapter *sc) +t4_prep_firmware(struct adapter *sc) { int rc; - size_t fw_size; - int reset = 1; - enum dev_state state; - unsigned char *fw_data; - struct fw_hdr *card_fw, *hdr; - const char *fw_file = NULL; - firmware_handle_t fw_hdl; - struct fw_info fi, *fw_info = &fi; - - struct driver_properties *p = &sc->props; /* Contact firmware, request master */ + enum dev_state state; rc = t4_fw_hello(sc, sc->mbox, sc->mbox, MASTER_MUST, &state); if (rc < 0) { rc = -rc; @@ -969,8 +1361,9 @@ prep_firmware(struct adapter *sc) sc->flags |= TAF_MASTER_PF; /* We may need FW version info for later reporting */ - t4_get_version_info(sc); + (void) t4_get_version_info(sc); + const char *fw_file = NULL; switch (CHELSIO_CHIP_VERSION(sc->params.chip)) { case CHELSIO_T4: fw_file = "t4fw.bin"; @@ -986,58 +1379,58 @@ prep_firmware(struct adapter *sc) return (EINVAL); } + firmware_handle_t fw_hdl; if (firmware_open(T4_PORT_NAME, fw_file, &fw_hdl) != 0) { cxgb_printf(sc->dip, CE_WARN, "Could not open %s\n", fw_file); return (EINVAL); } - fw_size = firmware_get_size(fw_hdl); - + const size_t fw_size = firmware_get_size(fw_hdl); if (fw_size < sizeof (struct fw_hdr)) { - cxgb_printf(sc->dip, CE_WARN, "%s is too small (%ld bytes)\n", + cxgb_printf(sc->dip, CE_WARN, "%s is too small (%lu bytes)\n", fw_file, fw_size); - firmware_close(fw_hdl); + (void) firmware_close(fw_hdl); return (EINVAL); } - if (fw_size > FLASH_FW_MAX_SIZE) { cxgb_printf(sc->dip, CE_WARN, - "%s is too large (%ld bytes, max allowed is %ld)\n", + 
"%s is too large (%lu bytes, max allowed is %lu)\n", fw_file, fw_size, FLASH_FW_MAX_SIZE); - firmware_close(fw_hdl); + (void) firmware_close(fw_hdl); return (EFBIG); } - fw_data = kmem_zalloc(fw_size, KM_SLEEP); + unsigned char *fw_data = kmem_zalloc(fw_size, KM_SLEEP); if (firmware_read(fw_hdl, 0, fw_data, fw_size) != 0) { cxgb_printf(sc->dip, CE_WARN, "Failed to read from %s\n", fw_file); - firmware_close(fw_hdl); + (void) firmware_close(fw_hdl); kmem_free(fw_data, fw_size); return (EINVAL); } - firmware_close(fw_hdl); - - bzero(fw_info, sizeof (*fw_info)); - fw_info->chip = CHELSIO_CHIP_VERSION(sc->params.chip); - - hdr = (struct fw_hdr *)fw_data; - fw_info->fw_hdr.fw_ver = hdr->fw_ver; - fw_info->fw_hdr.chip = hdr->chip; - fw_info->fw_hdr.intfver_nic = hdr->intfver_nic; - fw_info->fw_hdr.intfver_vnic = hdr->intfver_vnic; - fw_info->fw_hdr.intfver_ofld = hdr->intfver_ofld; - fw_info->fw_hdr.intfver_ri = hdr->intfver_ri; - fw_info->fw_hdr.intfver_iscsipdu = hdr->intfver_iscsipdu; - fw_info->fw_hdr.intfver_iscsi = hdr->intfver_iscsi; - fw_info->fw_hdr.intfver_fcoepdu = hdr->intfver_fcoepdu; - fw_info->fw_hdr.intfver_fcoe = hdr->intfver_fcoe; + (void) firmware_close(fw_hdl); + + const struct fw_hdr *hdr = (struct fw_hdr *)fw_data; + struct fw_info fi; + bzero(&fi, sizeof (fi)); + fi.chip = CHELSIO_CHIP_VERSION(sc->params.chip); + fi.fw_hdr.fw_ver = hdr->fw_ver; + fi.fw_hdr.chip = hdr->chip; + fi.fw_hdr.intfver_nic = hdr->intfver_nic; + fi.fw_hdr.intfver_vnic = hdr->intfver_vnic; + fi.fw_hdr.intfver_ofld = hdr->intfver_ofld; + fi.fw_hdr.intfver_ri = hdr->intfver_ri; + fi.fw_hdr.intfver_iscsipdu = hdr->intfver_iscsipdu; + fi.fw_hdr.intfver_iscsi = hdr->intfver_iscsi; + fi.fw_hdr.intfver_fcoepdu = hdr->intfver_fcoepdu; + fi.fw_hdr.intfver_fcoe = hdr->intfver_fcoe; /* allocate memory to read the header of the firmware on the card */ - card_fw = kmem_zalloc(sizeof (*card_fw), KM_SLEEP); + struct fw_hdr *card_fw = kmem_zalloc(sizeof (struct fw_hdr), KM_SLEEP); - rc = 
-t4_prep_fw(sc, fw_info, fw_data, fw_size, card_fw, - p->t4_fw_install, state, &reset); + int reset = 1; + rc = -t4_prep_fw(sc, &fi, fw_data, fw_size, card_fw, + sc->props.t4_fw_install, state, &reset); kmem_free(card_fw, sizeof (*card_fw)); kmem_free(fw_data, fw_size); @@ -1065,18 +1458,21 @@ prep_firmware(struct adapter *sc) if (sc->flags & TAF_MASTER_PF) { /* Handle default vs special T4 config file */ - rc = partition_resources(sc); - if (rc != 0) - goto err; /* error message displayed already */ + rc = t4_partition_resources(sc); + if (rc != 0) { + return (rc); + } } sc->flags |= FW_OK; return (0); -err: - return (rc); - } +struct memwin { + uint32_t base; + uint32_t aperture; +}; + static const struct memwin t4_memwin[] = { { MEMWIN0_BASE, MEMWIN0_APERTURE }, { MEMWIN1_BASE, MEMWIN1_APERTURE }, @@ -1101,8 +1497,8 @@ static const struct memwin t5_memwin[] = { * valid and lies entirely within the memtype specified. The global address of * the start of the range is returned in addr. */ -int -validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, int len, +static int +t4_validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, int len, uint32_t *addr) { uint32_t em, addr_len, maddr, mlen; @@ -1156,7 +1552,7 @@ validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, int len, } static void -memwin_info(struct adapter *sc, int win, uint32_t *base, uint32_t *aperture) +t4_memwin_info(struct adapter *sc, int win, uint32_t *base, uint32_t *aperture) { const struct memwin *mw; @@ -1176,7 +1572,7 @@ memwin_info(struct adapter *sc, int win, uint32_t *base, uint32_t *aperture) * Upload configuration file to card's memory. 
*/ static int -upload_config_file(struct adapter *sc, uint32_t *mt, uint32_t *ma) +t4_upload_config_file(struct adapter *sc, uint32_t *mt, uint32_t *ma) { int rc = 0; size_t cflen, cfbaselen; @@ -1231,17 +1627,17 @@ upload_config_file(struct adapter *sc, uint32_t *mt, uint32_t *ma) cxgb_printf(sc->dip, CE_WARN, "config file too long (%d, max allowed is %d). ", cflen, FLASH_CFG_MAX_SIZE); - firmware_close(fw_hdl); + (void) firmware_close(fw_hdl); return (EFBIG); } - rc = validate_mt_off_len(sc, mtype, maddr, cflen, &addr); + rc = t4_validate_mt_off_len(sc, mtype, maddr, cflen, &addr); if (rc != 0) { cxgb_printf(sc->dip, CE_WARN, "%s: addr (%d/0x%x) or len %d is not valid: %d. " "Will try to use the config on the card, if any.\n", __func__, mtype, maddr, cflen, rc); - firmware_close(fw_hdl); + (void) firmware_close(fw_hdl); return (EFAULT); } @@ -1250,13 +1646,13 @@ upload_config_file(struct adapter *sc, uint32_t *mt, uint32_t *ma) if (firmware_read(fw_hdl, 0, cfdata, cflen) != 0) { cxgb_printf(sc->dip, CE_WARN, "Failed to read from %s\n", cfg_file); - firmware_close(fw_hdl); + (void) firmware_close(fw_hdl); kmem_free(cfbase, cfbaselen); return (EINVAL); } - firmware_close(fw_hdl); + (void) firmware_close(fw_hdl); - memwin_info(sc, 2, &mw_base, &mw_aperture); + t4_memwin_info(sc, 2, &mw_base, &mw_aperture); while (cflen) { off = t4_position_memwin(sc, 2, addr); n = min(cflen, mw_aperture - off); @@ -1277,24 +1673,26 @@ upload_config_file(struct adapter *sc, uint32_t *mt, uint32_t *ma) * the firmware to process it. 
*/ static int -partition_resources(struct adapter *sc) +t4_partition_resources(struct adapter *sc) { int rc; - struct fw_caps_config_cmd caps; - uint32_t mtype, maddr, finicsum, cfcsum; + uint32_t mtype, maddr; - rc = upload_config_file(sc, &mtype, &maddr); + rc = t4_upload_config_file(sc, &mtype, &maddr); if (rc != 0) { mtype = FW_MEMTYPE_CF_FLASH; maddr = t4_flash_cfg_addr(sc); } + struct fw_caps_config_cmd caps; bzero(&caps, sizeof (caps)); caps.op_to_write = BE_32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); caps.cfvalid_to_len16 = BE_32(F_FW_CAPS_CONFIG_CMD_CFVALID | V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) | - V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(maddr >> 16) | FW_LEN16(caps)); + V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(maddr >> 16) | + FW_LEN16(struct fw_caps_config_cmd)); + rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof (caps), &caps); if (rc != 0) { cxgb_printf(sc->dip, CE_WARN, @@ -1302,26 +1700,26 @@ partition_resources(struct adapter *sc) return (rc); } - finicsum = ntohl(caps.finicsum); - cfcsum = ntohl(caps.cfcsum); - if (finicsum != cfcsum) { + if (caps.finicsum != caps.cfcsum) { cxgb_printf(sc->dip, CE_WARN, "WARNING: config file checksum mismatch: %08x %08x\n", - finicsum, cfcsum); + caps.finicsum, caps.cfcsum); } - sc->cfcsum = cfcsum; + sc->cfcsum = caps.cfcsum; - /* TODO: Need to configure this correctly */ - caps.toecaps = htons(FW_CAPS_CONFIG_TOE); + /* Disable unused offloads and features */ + caps.toecaps = 0; caps.iscsicaps = 0; caps.rdmacaps = 0; caps.fcoecaps = 0; + caps.cryptocaps = 0; + /* TODO: Disable VNIC cap for now */ - caps.niccaps ^= htons(FW_CAPS_CONFIG_NIC_VM); + caps.niccaps &= BE_16(~FW_CAPS_CONFIG_NIC_VM); - caps.op_to_write = htonl(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | + caps.op_to_write = BE_32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE); - caps.cfvalid_to_len16 = htonl(FW_LEN16(caps)); + caps.cfvalid_to_len16 = BE_32(FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof (caps), 
NULL); if (rc != 0) { cxgb_printf(sc->dip, CE_WARN, @@ -1342,7 +1740,7 @@ partition_resources(struct adapter *sc) * Configuration Files and hard-coded initialization ... */ static int -adap__pre_init_tweaks(struct adapter *sc) +t4_init_adap_tweaks(struct adapter *sc) { int rx_dma_offset = 2; /* Offset of RX packets into DMA buffers */ @@ -1364,12 +1762,10 @@ adap__pre_init_tweaks(struct adapter *sc) * t4_sge_init and t4_fw_initialize. */ static int -get_params__pre_init(struct adapter *sc) +t4_init_get_params_pre(struct adapter *sc) { int rc; uint32_t param[2], val[2]; - struct fw_devlog_cmd cmd; - struct devlog_params *dlog = &sc->params.devlog; /* * Grab the raw VPD parameters. @@ -1401,25 +1797,31 @@ get_params__pre_init(struct adapter *sc) sc->params.nports++; val[0] &= val[0] - 1; } - sc->params.vpd.cclk = val[1]; /* Read device log parameters. */ + struct fw_devlog_cmd cmd; bzero(&cmd, sizeof (cmd)); - cmd.op_to_write = htonl(V_FW_CMD_OP(FW_DEVLOG_CMD) | + cmd.op_to_write = BE_32(V_FW_CMD_OP(FW_DEVLOG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); - cmd.retval_len16 = htonl(FW_LEN16(cmd)); + cmd.retval_len16 = BE_32(FW_LEN16(struct fw_devlog_cmd)); + rc = -t4_wr_mbox(sc, sc->mbox, &cmd, sizeof (cmd), &cmd); if (rc != 0) { cxgb_printf(sc->dip, CE_WARN, "failed to get devlog parameters: %d.\n", rc); - bzero(dlog, sizeof (*dlog)); - rc = 0; /* devlog isn't critical for device operation */ + + /* devlog isn't critical for device operation */ + bzero(&sc->params.devlog, sizeof (sc->params.devlog)); + rc = 0; } else { - val[0] = ntohl(cmd.memtype_devlog_memaddr16_devlog); - dlog->memtype = G_FW_DEVLOG_CMD_MEMTYPE_DEVLOG(val[0]); - dlog->start = G_FW_DEVLOG_CMD_MEMADDR16_DEVLOG(val[0]) << 4; - dlog->size = ntohl(cmd.memsize_devlog); + const uint32_t info = + BE_32(cmd.memtype_devlog_memaddr16_devlog); + struct devlog_params *dlog = &sc->params.devlog; + + dlog->memtype = G_FW_DEVLOG_CMD_MEMTYPE_DEVLOG(info); + dlog->start = G_FW_DEVLOG_CMD_MEMADDR16_DEVLOG(info) << 4; + 
dlog->size = BE_32(cmd.memsize_devlog); } return (rc); @@ -1430,11 +1832,10 @@ get_params__pre_init(struct adapter *sc) * has been initialized by the firmware at this point. */ static int -get_params__post_init(struct adapter *sc) +t4_init_get_params_post(struct adapter *sc) { int rc; uint32_t param[4], val[4]; - struct fw_caps_config_cmd caps; param[0] = FW_PARAM_PFVF(IQFLINT_START); param[1] = FW_PARAM_PFVF(EQ_START); @@ -1447,27 +1848,10 @@ get_params__post_init(struct adapter *sc) return (rc); } - sc->sge.iq_start = val[0]; - sc->sge.eq_start = val[1]; - sc->sge.iqmap_sz = val[2] - sc->sge.iq_start + 1; - sc->sge.eqmap_sz = val[3] - sc->sge.eq_start + 1; - - uint32_t r = t4_read_reg(sc, A_SGE_EGRESS_QUEUES_PER_PAGE_PF); - r >>= S_QUEUESPERPAGEPF0 + - (S_QUEUESPERPAGEPF1 - S_QUEUESPERPAGEPF0) * sc->pf; - sc->sge.s_qpp = r & M_QUEUESPERPAGEPF0; - - /* get capabilites */ - bzero(&caps, sizeof (caps)); - caps.op_to_write = htonl(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | - F_FW_CMD_REQUEST | F_FW_CMD_READ); - caps.cfvalid_to_len16 = htonl(FW_LEN16(caps)); - rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof (caps), &caps); - if (rc != 0) { - cxgb_printf(sc->dip, CE_WARN, - "failed to get card capabilities: %d.\n", rc); - return (rc); - } + sc->sge.iqmap_start = val[0]; + sc->sge.eqmap_start = val[1]; + sc->sge.iqmap_sz = (val[2] - sc->sge.iqmap_start) + 1; + sc->sge.eqmap_sz = (val[3] - sc->sge.eqmap_start) + 1; /* Check if DBQ timer is available for tracking egress completions */ param[0] = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | @@ -1479,6 +1863,19 @@ get_params__post_init(struct adapter *sc) ARRAY_SIZE(sc->sge.dbq_timers), sc->sge.dbq_timers); if (rc == 0) { sc->flags |= TAF_DBQ_TIMER; + + /* + * Expose DBQ timer values as property, converting them + * to plain `int` as required. 
+ */ + int tmp_encode[ARRAY_SIZE(sc->sge.dbq_timers)]; + for (uint_t i = 0; i < ARRAY_SIZE(sc->sge.dbq_timers); + i++) { + tmp_encode[i] = sc->sge.dbq_timers[i]; + }; + (void) ddi_prop_update_int_array(sc->dev, sc->dip, + "tx-reclaim-timer-us-values", + tmp_encode, SGE_NTIMERS); } else { sc->sge.dbq_timer_tick = 0; } @@ -1533,7 +1930,7 @@ get_params__post_init(struct adapter *sc) } static int -set_params__post_init(struct adapter *sc) +t4_init_set_params(struct adapter *sc) { uint32_t param, val; @@ -1643,15 +2040,34 @@ prop_lookup_int(struct adapter *sc, char *name, int defval) name, defval)); } +static bool +prop_lookup_bool(struct adapter *sc, char *name, bool defval) +{ + int rc; + + rc = ddi_prop_get_int(sc->dev, sc->dip, DDI_PROP_DONTPASS, name, -1); + if (rc == -1) { + rc = ddi_prop_get_int(DDI_DEV_T_ANY, sc->dip, DDI_PROP_DONTPASS, + name, -1); + } + + if (rc != -1) { + return (rc != 0); + } else { + return (defval); + } +} + const uint_t t4_holdoff_timer_default[SGE_NTIMERS] = {5, 10, 20, 50, 100, 200}; const uint_t t4_holdoff_pktcnt_default[SGE_NCOUNTERS] = {1, 8, 16, 32}; -static int -init_driver_props(struct adapter *sc, struct driver_properties *p) +static void +t4_init_driver_props(struct adapter *sc) { + struct driver_properties *p = &sc->props; dev_t dev = sc->dev; dev_info_t *dip = sc->dip; - int i; + int val; /* * For now, just use the defaults for the hold-off timers and counters. @@ -1670,392 +2086,518 @@ init_driver_props(struct adapter *sc, struct driver_properties *p) (void) ddi_prop_update_int_array(dev, dip, "holdoff-pkt-counter-values", (int *)p->holdoff_pktcnt, SGE_NCOUNTERS); - /* - * Maximum # of tx and rx queues to use for each - * 100G, 40G, 25G, 10G and 1G port. 
- */ - p->max_ntxq_10g = prop_lookup_int(sc, "max-ntxq-10G-port", 8); - (void) ddi_prop_update_int(dev, dip, "max-ntxq-10G-port", - p->max_ntxq_10g); - - p->max_nrxq_10g = prop_lookup_int(sc, "max-nrxq-10G-port", 8); - (void) ddi_prop_update_int(dev, dip, "max-nrxq-10G-port", - p->max_nrxq_10g); - - p->max_ntxq_1g = prop_lookup_int(sc, "max-ntxq-1G-port", 2); - (void) ddi_prop_update_int(dev, dip, "max-ntxq-1G-port", - p->max_ntxq_1g); - - p->max_nrxq_1g = prop_lookup_int(sc, "max-nrxq-1G-port", 2); - (void) ddi_prop_update_int(dev, dip, "max-nrxq-1G-port", - p->max_nrxq_1g); - - /* - * Holdoff parameters for 10G and 1G ports. - */ - p->tmr_idx_10g = prop_lookup_int(sc, "holdoff-timer-idx-10G", 0); - (void) ddi_prop_update_int(dev, dip, "holdoff-timer-idx-10G", - p->tmr_idx_10g); - - p->pktc_idx_10g = prop_lookup_int(sc, "holdoff-pktc-idx-10G", 2); - (void) ddi_prop_update_int(dev, dip, "holdoff-pktc-idx-10G", - p->pktc_idx_10g); + p->ethq_tmr_idx = prop_lookup_int(sc, "holdoff-timer-idx", 0); + p->ethq_pktc_idx = prop_lookup_int(sc, "holdoff-pktc-idx", 2); - p->tmr_idx_1g = prop_lookup_int(sc, "holdoff-timer-idx-1G", 0); - (void) ddi_prop_update_int(dev, dip, "holdoff-timer-idx-1G", - p->tmr_idx_1g); + (void) ddi_prop_update_int(dev, dip, "holdoff-timer-idx", + p->ethq_tmr_idx); + (void) ddi_prop_update_int(dev, dip, "holdoff-pktc-idx", + p->ethq_pktc_idx); - p->pktc_idx_1g = prop_lookup_int(sc, "holdoff-pktc-idx-1G", 2); - (void) ddi_prop_update_int(dev, dip, "holdoff-pktc-idx-1G", - p->pktc_idx_1g); - - /* - * Size (number of entries) of each tx and rx queue. - */ - i = prop_lookup_int(sc, "qsize-txq", TX_EQ_QSIZE); - p->qsize_txq = max(i, 128); - if (p->qsize_txq != i) { + /* The size (number of host credits) of the tx queue. 
*/ + val = prop_lookup_int(sc, "qsize-txq", T4_TX_DEF_QSIZE); + p->qsize_txq = MAX(val, 128); + p->qsize_txq = MIN(p->qsize_txq, T4_MAX_EQ_SIZE); + if (p->qsize_txq != val) { cxgb_printf(dip, CE_WARN, "using %d instead of %d as the tx queue size", - p->qsize_txq, i); + p->qsize_txq, val); } (void) ddi_prop_update_int(dev, dip, "qsize-txq", p->qsize_txq); - i = prop_lookup_int(sc, "qsize-rxq", RX_IQ_QSIZE); - p->qsize_rxq = max(i, 128); - while (p->qsize_rxq & 7) - p->qsize_rxq--; - if (p->qsize_rxq != i) { - cxgb_printf(dip, CE_WARN, - "using %d instead of %d as the rx queue size", - p->qsize_rxq, i); - } - (void) ddi_prop_update_int(dev, dip, "qsize-rxq", p->qsize_rxq); - /* - * Interrupt types allowed. - * Bits 0, 1, 2 = INTx, MSI, MSI-X respectively. See sys/ddi_intr.h + * The size (number of entries/host credits) of the rx queue. The device + * requires that all IQs be sized to a multiple of 16. */ - p->intr_types = prop_lookup_int(sc, "interrupt-types", - DDI_INTR_TYPE_MSIX | DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_FIXED); - (void) ddi_prop_update_int(dev, dip, "interrupt-types", p->intr_types); - - /* - * Write combining - * 0 to disable, 1 to enable - */ - p->wc = prop_lookup_int(sc, "write-combine", 1); - cxgb_printf(dip, CE_WARN, "write-combine: using of %d", p->wc); - if (p->wc != 0 && p->wc != 1) { + val = prop_lookup_int(sc, "qsize-rxq", T4_RX_DEF_QSIZE); + p->qsize_rxq = MAX(val, 128) & ~15; + p->qsize_rxq = MIN(p->qsize_rxq, SGE_MAX_IQ_SIZE); + if (p->qsize_rxq != val) { cxgb_printf(dip, CE_WARN, - "write-combine: using 1 instead of %d", p->wc); - p->wc = 1; + "using %u instead of %d as the rx queue size", + p->qsize_rxq, val); } - (void) ddi_prop_update_int(dev, dip, "write-combine", p->wc); + (void) ddi_prop_update_int(dev, dip, "qsize-rxq", p->qsize_rxq); + + p->write_combine = prop_lookup_bool(sc, "write-combine", true); + (void) ddi_prop_update_int(dev, dip, "write-combine", + p->write_combine ? 
1 : 0); p->t4_fw_install = prop_lookup_int(sc, "t4_fw_install", 1); if (p->t4_fw_install != 0 && p->t4_fw_install != 2) p->t4_fw_install = 1; (void) ddi_prop_update_int(dev, dip, "t4_fw_install", p->t4_fw_install); +} - /* Multiple Rings */ - p->multi_rings = prop_lookup_int(sc, "multi-rings", 1); - if (p->multi_rings != 0 && p->multi_rings != 1) { - cxgb_printf(dip, CE_NOTE, - "multi-rings: using value 1 instead of %d", p->multi_rings); - p->multi_rings = 1; - } - - (void) ddi_prop_update_int(dev, dip, "multi-rings", p->multi_rings); +/* + * Permit artificial clamping of interrupts for device. + * Provided mainly for development and testing purposes. + */ +static int t4_intr_count_clamp = 0; - return (0); -} +/* + * Queue counts to allocate per-port based on device speed. + * + * These have been picked somewhat arbitrarily, and should be further + * scrutinized with additional testing. + */ +#define T4_QCNT(speed, num) [speed] = { speed, num, num } +static const struct t4_queue_count { + t4_port_speed_t tqc_speed; + uint_t tqc_rxq_count; + uint_t tqc_txq_count; +} t4_queue_counts[] = { + T4_QCNT(TPS_1G, 2), + T4_QCNT(TPS_10G, 8), + T4_QCNT(TPS_25G, 16), + T4_QCNT(TPS_40G, 24), + T4_QCNT(TPS_50G, 24), + T4_QCNT(TPS_100G, 32), + T4_QCNT(TPS_200G, 48), + T4_QCNT(TPS_400G, 64), +}; static int -remove_extra_props(struct adapter *sc, int n10g, int n1g) +t4_cfg_intrs_queues(struct adapter *sc) { - if (n10g == 0) { - (void) ddi_prop_remove(sc->dev, sc->dip, "max-ntxq-10G-port"); - (void) ddi_prop_remove(sc->dev, sc->dip, "max-nrxq-10G-port"); - (void) ddi_prop_remove(sc->dev, sc->dip, - "holdoff-timer-idx-10G"); - (void) ddi_prop_remove(sc->dev, sc->dip, - "holdoff-pktc-idx-10G"); + struct t4_intrs_queues *iaq = &sc->intr_queue_cfg; + int rc; + + bzero(iaq, sizeof (*iaq)); + + int supported_itypes; + rc = ddi_intr_get_supported_types(sc->dip, &supported_itypes); + if (rc != DDI_SUCCESS) { + cxgb_printf(sc->dip, CE_WARN, + "failed to determine supported interrupt types: 
%d", rc); + return (rc); } - if (n1g == 0) { - (void) ddi_prop_remove(sc->dev, sc->dip, "max-ntxq-1G-port"); - (void) ddi_prop_remove(sc->dev, sc->dip, "max-nrxq-1G-port"); - (void) ddi_prop_remove(sc->dev, sc->dip, - "holdoff-timer-idx-1G"); - (void) ddi_prop_remove(sc->dev, sc->dip, "holdoff-pktc-idx-1G"); + const int intr_types[] = { + DDI_INTR_TYPE_MSIX, DDI_INTR_TYPE_MSI, DDI_INTR_TYPE_FIXED, + }; + const char *intr_str[] = { "MSI-X", "MSI", "Fixed" }; + int itype = -1; + + for (uint_t i = 0; i < ARRAY_SIZE(intr_types); i++) { + itype = intr_types[i]; + if ((itype & supported_itypes) == 0) { + continue; + } + + rc = ddi_intr_get_navail(sc->dip, itype, &iaq->intr_avail); + if (rc != DDI_SUCCESS || iaq->intr_avail < 0) { + cxgb_printf(sc->dip, CE_WARN, "failed to query " + "available interrupts for type %s: %d", intr_str[i], + rc); + continue; + } + + /* + * The device error and FWQ interrupts are hard-coded to indexes + * 0 and 1, respectively. We require at least two interrupts be + * available for MSI(-X) in order to cover both of those cases. + */ + if (iaq->intr_avail >= 2 || + (iaq->intr_avail == 1 && itype == DDI_INTR_TYPE_FIXED)) { + break; + } } - return (0); -} + if (iaq->intr_avail == 0) { + cxgb_printf(sc->dip, CE_WARN, "failed to get any interrupts " + "after querying all types"); + return (rc); + } -static int -cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, - struct intrs_and_queues *iaq) -{ - struct driver_properties *p = &sc->props; - int rc, itype, itypes, navail, nc, n; - int pfres_rxq, pfres_txq, pfresq; + ASSERT3S(iaq->intr_avail, >, 0); + iaq->intr_type = itype; + iaq->intr_count = iaq->intr_avail; - bzero(iaq, sizeof (*iaq)); - nc = ncpus; /* our snapshot of the number of CPUs */ - iaq->ntxq10g = min(nc, p->max_ntxq_10g); - iaq->ntxq1g = min(nc, p->max_ntxq_1g); - iaq->nrxq10g = min(nc, p->max_nrxq_10g); - iaq->nrxq1g = min(nc, p->max_nrxq_1g); + /* Permit artificial clamping of consumed interrupts. 
+ */ + if (t4_intr_count_clamp > 1) { + iaq->intr_count = MIN(iaq->intr_avail, t4_intr_count_clamp); + } + + const uint_t port_count = sc->params.nports; + + iaq->intr_per_port = 0; + /* One IQ for the FWQ */ + iaq->num_iqs = 1; - pfres_rxq = iaq->nrxq10g * n10g + iaq->nrxq1g * n1g; - pfres_txq = iaq->ntxq10g * n10g + iaq->ntxq1g * n1g; + if (iaq->intr_count == 1) { + iaq->intr_plan = TIP_SINGLE; + } else if (iaq->intr_count == 2 || iaq->intr_count < (port_count + 2)) { + iaq->intr_plan = TIP_ERR_QUEUES; + } else { + /* + * We know the interrupt count is at least equal to + * port_count+2, and thus we should always have at least + * one event interrupt per port. + */ + VERIFY(iaq->intr_count >= (port_count + 2)); + iaq->intr_plan = TIP_PER_PORT; + iaq->intr_per_port = (iaq->intr_count - 2) / port_count; + VERIFY3U(iaq->intr_per_port, >, 0); + iaq->num_iqs += iaq->intr_per_port * port_count; + } + const struct pf_resources *pfres = &sc->params.pfres; + if (pfres->niqflint <= 1) { + /* We cannot achieve much with a single IQ */ + cxgb_printf(sc->dip, CE_WARN, + "inadequate IQ resources available"); + return (DDI_FAILURE); + } + + const uint_t port_iqs = pfres->niqflint - iaq->num_iqs; /* - * If current configuration of max number of Rxqs and Txqs exceed - * the max available for all the ports under this PF, then shrink - * the queues to max available. Reduce them in a way that each - * port under this PF has equally distributed number of queues. - * Must guarantee at least 1 queue for each port for both NIC - * and Offload queues. - * - * neq - fixed max number of Egress queues on Tx path and Free List - * queues that hold Rx payload data on Rx path. Half are reserved - * for Egress queues and the other half for Free List queues. - * Hence, the division by 2. + * Every RX queue needs an IQ capable of interrupts (for the receive + * notifications) as well as an EQ (for posting the freelist entries to + * the device). Half of the total EQs are left for TXQs.
+ */ + const uint_t max_rxq = MIN(port_iqs, pfres->neq / 2); + + /* Every TX queue needs an ethernet-capable EQ. */ + const uint_t max_txq = MIN(pfres->nethctrl, pfres->neq / 2); + + if ((max_rxq / port_count) == 0) { + cxgb_printf(sc->dip, CE_WARN, + "inadequate RX queue resources available"); + return (DDI_FAILURE); + } else if ((max_txq / port_count) == 0) { + cxgb_printf(sc->dip, CE_WARN, + "inadequate TX queue resources available"); + return (DDI_FAILURE); + } + + /* Clamp max queue counts to number of CPUs */ + iaq->port_max_rxq = MIN(max_rxq, ncpus); + iaq->port_max_txq = MIN(max_txq, ncpus); + + VERIFY(iaq->intr_count > 0); + VERIFY(iaq->port_max_rxq != 0); + VERIFY(iaq->port_max_txq != 0); + VERIFY(iaq->num_iqs != 0); + + /* + * Determine per-port queue counts based on maximum port speed. * - * niqflint - max number of Ingress queues with interrupts on Rx - * path to receive completions that indicate Rx payload has been - * posted in its associated Free List queue. Also handles Tx - * completions for packets successfully transmitted on Tx path. + * This is a bit unfortunate, since there does not seem to be a way to + * query the maximum possible speed for a port independent of any + * installed transceiver. If a transceiver of lesser speed capability + * is installed in a port, that port will clamp its own reported + * capabilities to those of the transceiver. * - * nethctrl - max number of Egress queues only for Tx path. This - * number is usually half of neq. However, if it became less than - * neq due to lack of resources based on firmware configuration, - * then take the lower value. + * Our compromise is to size queue allocations based on the fastest port + * we can find. This will be less than ideal for adapters with + * heterogeneous port configurations or systems where transceivers of + * differing speed capabilities are swapped in after the driver + * initializes the adapter(s). 
*/ - const uint_t max_rxq = - MIN(sc->params.pfres.neq / 2, sc->params.pfres.niqflint); - while (pfres_rxq > max_rxq) { - pfresq = pfres_rxq; - - if (iaq->nrxq10g > 1) { - iaq->nrxq10g--; - pfres_rxq -= n10g; - } + t4_port_speed_t max_speed = TPS_1G; + for (uint_t i = 0; i < port_count; i++) { + max_speed = MAX(max_speed, t4_port_speed(sc->port[i])); + } + ASSERT(max_speed < ARRAY_SIZE(t4_queue_counts)); + const struct t4_queue_count *qc = &t4_queue_counts[max_speed]; - if (iaq->nrxq1g > 1) { - iaq->nrxq1g--; - pfres_rxq -= n1g; - } + uint_t rxq_idx = 0, txq_idx = 0; + for (uint_t i = 0; i < port_count; i++) { + struct port_info *pi = sc->port[i]; - /* Break if nothing changed */ - if (pfresq == pfres_rxq) - break; + /* Clamp to per-port maximums */ + pi->rxq_count = MIN(qc->tqc_rxq_count, iaq->port_max_rxq); + pi->txq_count = MIN(qc->tqc_txq_count, iaq->port_max_txq); + + pi->rxq_start = rxq_idx; + pi->txq_start = txq_idx; + rxq_idx += pi->rxq_count; + txq_idx += pi->txq_count; } - const uint_t max_txq = - MIN(sc->params.pfres.neq / 2, sc->params.pfres.nethctrl); - while (pfres_txq > max_txq) { - pfresq = pfres_txq; + struct sge_info *sge = &sc->sge; + sge->rxq_count = rxq_idx; + sge->txq_count = txq_idx; - if (iaq->ntxq10g > 1) { - iaq->ntxq10g--; - pfres_txq -= n10g; - } + cxgb_printf(sc->dip, CE_NOTE, "(%u rxq, %u txq total) %d %s.", + rxq_idx, txq_idx, iaq->intr_count, + iaq->intr_type == DDI_INTR_TYPE_MSIX ? "MSI-X interrupts" : + iaq->intr_type == DDI_INTR_TYPE_MSI ? 
"MSI interrupts" : + "fixed interrupt"); - if (iaq->ntxq1g > 1) { - iaq->ntxq1g--; - pfres_txq -= n1g; - } + return (DDI_SUCCESS); +} - /* Break if nothing changed */ - if (pfresq == pfres_txq) - break; +static int +t4_setup_port_intrs(struct adapter *sc, int *handlers) +{ + int rc = 0; + const struct t4_intrs_queues *iaq = &sc->intr_queue_cfg; + + for (uint_t i = 0; i < sc->params.nports; i++) { + struct port_info *port = sc->port[i]; + + port->intr_iqs = kmem_zalloc(iaq->intr_per_port * + sizeof (t4_sge_iq_t), KM_SLEEP); + + for (uint_t j = 0; j < iaq->intr_per_port; j++) { + uint_t intr_idx = 2 + (i * iaq->intr_per_port) + j; + VERIFY3S(intr_idx, <, iaq->intr_count); + ddi_intr_handle_t ihdl = sc->intr_handle[intr_idx]; + rc = ddi_intr_add_handler(ihdl, t4_intr_port_queue, + &port->intr_iqs[j], NULL); + if (rc != DDI_SUCCESS) { + /* + * Previously installed handlers are cleaned up + * by the parent function. + */ + cxgb_printf(sc->dip, CE_WARN, "failed to add " + "interrupt handler %u for type: %d plan: " + "%d: %d", intr_idx, iaq->intr_type, + iaq->intr_plan, rc); + return (rc); + } + *handlers += 1; + } } - rc = ddi_intr_get_supported_types(sc->dip, &itypes); + return (DDI_SUCCESS); +} + +static int +t4_setup_intrs(struct adapter *sc) +{ + const struct t4_intrs_queues *iaq = &sc->intr_queue_cfg; + const int intr_count = iaq->intr_count; + const int intr_type = iaq->intr_type; + int allocated = 0; + int handlers = 0; + + int rc = ddi_intr_alloc(sc->dip, sc->intr_handle, intr_type, 0, + intr_count, &allocated, DDI_INTR_ALLOC_STRICT); if (rc != DDI_SUCCESS) { cxgb_printf(sc->dip, CE_WARN, - "failed to determine supported interrupt types: %d", rc); - return (rc); + "failed to allocate %d interrupt(s) of type %d: %d, %d", + intr_count, intr_type, rc, allocated); + goto fail; } - for (itype = DDI_INTR_TYPE_MSIX; itype; itype >>= 1) { - ASSERT(itype == DDI_INTR_TYPE_MSIX || - itype == DDI_INTR_TYPE_MSI || - itype == DDI_INTR_TYPE_FIXED); + VERIFY3U(intr_count, ==, 
allocated); /* allocation was STRICT */ - if ((itype & itypes & p->intr_types) == 0) - continue; /* not supported or not allowed */ + rc = ddi_intr_get_cap(sc->intr_handle[0], &sc->intr_cap); + if (rc != DDI_SUCCESS) { + cxgb_printf(sc->dip, CE_WARN, "failed to get interrupt " + "capabilities for type %d: %d", intr_type, rc); + goto fail; + } - navail = 0; - rc = ddi_intr_get_navail(sc->dip, itype, &navail); - if (rc != DDI_SUCCESS || navail == 0) { - cxgb_printf(sc->dip, CE_WARN, - "failed to get # of interrupts for type %d: %d", - itype, rc); - continue; /* carry on */ + rc = ddi_intr_get_pri(sc->intr_handle[0], &sc->intr_pri); + if (rc != DDI_SUCCESS) { + cxgb_printf(sc->dip, CE_WARN, "failed to get interrupt " + "priority for type %d: %d", intr_type, rc); + goto fail; + } + + switch (iaq->intr_plan) { + case TIP_SINGLE: + ASSERT3U(intr_count, ==, 1); + rc = ddi_intr_add_handler(sc->intr_handle[0], t4_intr_all, sc, + NULL); + if (rc != DDI_SUCCESS) { + cxgb_printf(sc->dip, CE_WARN, "failed to add interrupt " + "handler %u for type: %d plan: %d: %d", handlers, + intr_type, iaq->intr_plan, rc); + goto fail; } + handlers++; + break; - iaq->intr_type = itype; - if (navail == 0) - continue; + case TIP_ERR_QUEUES: + VERIFY3U(intr_count, ==, 2); + rc = ddi_intr_add_handler(sc->intr_handle[0], t4_intr_err, sc, + NULL); + if (rc != DDI_SUCCESS) { + cxgb_printf(sc->dip, CE_WARN, "failed to add interrupt " + "handler %u for type: %d plan: %d: %d", handlers, + intr_type, iaq->intr_plan, rc); + goto fail; + } + handlers++; - /* - * Best option: an interrupt vector for errors, one for the - * firmware event queue, and one each for each rxq (NIC as well - * as offload). 
- */ - iaq->nirq = T4_EXTRA_INTR; - iaq->nirq += n10g * iaq->nrxq10g; - iaq->nirq += n1g * iaq->nrxq1g; - - if (iaq->nirq <= navail && - (itype != DDI_INTR_TYPE_MSI || ISP2(iaq->nirq))) { - iaq->intr_fwd = 0; - goto allocate; + rc = ddi_intr_add_handler(sc->intr_handle[1], t4_intr_fwq, sc, + NULL); + if (rc != DDI_SUCCESS) { + cxgb_printf(sc->dip, CE_WARN, "failed to add interrupt " + "handler %u for type: %d plan: %d: %d", handlers, + intr_type, iaq->intr_plan, rc); + goto fail; } + handlers++; + break; - /* - * Second best option: an interrupt vector for errors, one for - * the firmware event queue, and one each for either NIC or - * offload rxq's. - */ - iaq->nirq = T4_EXTRA_INTR; - iaq->nirq += n10g * iaq->nrxq10g; - iaq->nirq += n1g * iaq->nrxq1g; - if (iaq->nirq <= navail && - (itype != DDI_INTR_TYPE_MSI || ISP2(iaq->nirq))) { - iaq->intr_fwd = 1; - goto allocate; + case TIP_PER_PORT: + VERIFY3U(intr_count, >=, 2 + sc->params.nports); + rc = ddi_intr_add_handler(sc->intr_handle[0], t4_intr_err, sc, + NULL); + if (rc != DDI_SUCCESS) { + cxgb_printf(sc->dip, CE_WARN, "failed to add interrupt " + "handler %u for type: %d plan: %d: %d", handlers, + intr_type, iaq->intr_plan, rc); + goto fail; } + handlers++; - /* - * Next best option: an interrupt vector for errors, one for the - * firmware event queue, and at least one per port. At this - * point we know we'll have to downsize nrxq or nofldrxq to fit - * what's available to us. 
- */ - iaq->nirq = T4_EXTRA_INTR; - iaq->nirq += n10g + n1g; - if (iaq->nirq <= navail) { - int leftover = navail - iaq->nirq; - - if (n10g > 0) { - int target = iaq->nrxq10g; - - n = 1; - while (n < target && leftover >= n10g) { - leftover -= n10g; - iaq->nirq += n10g; - n++; - } - iaq->nrxq10g = min(n, iaq->nrxq10g); - } + rc = ddi_intr_add_handler(sc->intr_handle[1], t4_intr_fwq, sc, + NULL); + if (rc != DDI_SUCCESS) { + cxgb_printf(sc->dip, CE_WARN, "failed to add interrupt " + "handler %u for type: %d plan: %d: %d", handlers, + intr_type, iaq->intr_plan, rc); + goto fail; + } + handlers++; - if (n1g > 0) { - int target = iaq->nrxq1g; + rc = t4_setup_port_intrs(sc, &handlers); - n = 1; - while (n < target && leftover >= n1g) { - leftover -= n1g; - iaq->nirq += n1g; - n++; - } - iaq->nrxq1g = min(n, iaq->nrxq1g); - } + if (rc != DDI_SUCCESS) { + goto fail; + } + break; + } + + return (DDI_SUCCESS); + +fail: + for (int i = 0; i < handlers; i++) { + rc = ddi_intr_remove_handler(sc->intr_handle[i]); + if (rc != DDI_SUCCESS) { /* - * We have arrived at a minimum value required to enable - * per queue irq(either NIC or offload). Thus for non- - * offload case, we will get a vector per queue, while - * offload case, we will get a vector per offload/NIC q. - * Hence enable Interrupt forwarding only for offload - * case. + * We tried our best, the only thing left is to log the + * failure and move on. */ - if (itype != DDI_INTR_TYPE_MSI) { - goto allocate; - } + cxgb_printf(sc->dip, CE_WARN, "failed to remove " + "interrupt handler %d for type: %d plan: %d: %d", i, + intr_type, iaq->intr_plan, rc); } + } - /* - * Least desirable option: one interrupt vector for everything. 
- */ - iaq->nirq = iaq->nrxq10g = iaq->nrxq1g = 1; - iaq->intr_fwd = 1; - -allocate: - return (0); + for (int i = 0; i < allocated; i++) { + rc = ddi_intr_free(sc->intr_handle[i]); + if (rc != DDI_SUCCESS) { + cxgb_printf(sc->dip, CE_WARN, "failed to free " + "interrupt %d for type: %d plan: %d: %d", i, + intr_type, iaq->intr_plan, rc); + } } - cxgb_printf(sc->dip, CE_WARN, - "failed to find a usable interrupt type. supported=%d, allowed=%d", - itypes, p->intr_types); return (DDI_FAILURE); } static int -add_child_node(struct adapter *sc, int idx) +t4_add_child_node(struct adapter *sc, uint_t idx) { - int rc; - struct port_info *pi; - if (idx < 0 || idx >= sc->params.nports) + if (idx >= sc->params.nports) return (EINVAL); - pi = sc->port[idx]; - if (pi == NULL) - return (ENODEV); /* t4_port_init failed earlier */ + struct port_info *pi = sc->port[idx]; + if (pi == NULL) { + /* t4_port_init failed earlier */ + return (ENODEV); + } PORT_LOCK(pi); if (pi->dip != NULL) { - rc = 0; /* EEXIST really, but then bus_config fails */ - goto done; + PORT_UNLOCK(pi); + /* EEXIST really, but then bus_config fails */ + return (0); } - rc = ndi_devi_alloc(sc->dip, T4_PORT_NAME, DEVI_SID_NODEID, &pi->dip); + const int rc = + ndi_devi_alloc(sc->dip, T4_PORT_NAME, DEVI_SID_NODEID, &pi->dip); if (rc != DDI_SUCCESS || pi->dip == NULL) { - rc = ENOMEM; - goto done; + PORT_UNLOCK(pi); + return (ENOMEM); } (void) ddi_set_parent_data(pi->dip, pi); (void) ndi_devi_bind_driver(pi->dip, 0); - rc = 0; -done: + PORT_UNLOCK(pi); - return (rc); + return (0); } static int -remove_child_node(struct adapter *sc, int idx) +t4_remove_child_node(struct adapter *sc, uint_t idx) { - int rc; - struct port_info *pi; - - if (idx < 0 || idx >= sc->params.nports) + if (idx >= sc->params.nports) return (EINVAL); - pi = sc->port[idx]; + struct port_info *pi = sc->port[idx]; if (pi == NULL) return (ENODEV); PORT_LOCK(pi); if (pi->dip == NULL) { - rc = ENODEV; - goto done; + PORT_UNLOCK(pi); + return (ENODEV); } 
- rc = ndi_devi_free(pi->dip); + const int rc = ndi_devi_free(pi->dip); if (rc == 0) pi->dip = NULL; -done: + PORT_UNLOCK(pi); return (rc); } +struct t4_port_speed_def { + uint32_t tpsd_cap; + t4_port_speed_t tpsd_speed; + const char *tpsd_name; +}; +#define T4_PORT_SPEED_DEF(speed) \ +{ \ + .tpsd_cap = FW_PORT_CAP32_SPEED_ ## speed, \ + .tpsd_speed = TPS_ ## speed, \ + .tpsd_name = #speed, \ +} + +static const struct t4_port_speed_def t4_port_speeds[] = { + T4_PORT_SPEED_DEF(400G), + T4_PORT_SPEED_DEF(200G), + T4_PORT_SPEED_DEF(100G), + T4_PORT_SPEED_DEF(50G), + T4_PORT_SPEED_DEF(40G), + T4_PORT_SPEED_DEF(25G), + T4_PORT_SPEED_DEF(10G), + T4_PORT_SPEED_DEF(1G), +}; + +/* + * Get maximum advertised speed of this port. + * + * This is, unfortunately, impacted by the installed transceiver at the time of + * query. + */ +static t4_port_speed_t +t4_port_speed(const struct port_info *pi) +{ + ASSERT(pi != NULL); + + const uint32_t pcap = pi->link_cfg.pcaps; + for (uint_t i = 0; i < ARRAY_SIZE(t4_port_speeds); i++) { + if (t4_port_speeds[i].tpsd_cap & pcap) { + return (t4_port_speeds[i].tpsd_speed); + } + } + + /* Fall back to 1G for unknown speeds */ + return (TPS_1G); +} + static const char * t4_port_speed_name(const struct port_info *pi) { @@ -2063,28 +2605,27 @@ t4_port_speed_name(const struct port_info *pi) return ("-"); } - const uint32_t pcaps = pi->link_cfg.pcaps; - if (pcaps & FW_PORT_CAP32_SPEED_100G) { - return ("100G"); - } else if (pcaps & FW_PORT_CAP32_SPEED_50G) { - return ("50G"); - } else if (pcaps & FW_PORT_CAP32_SPEED_40G) { - return ("40G"); - } else if (pcaps & FW_PORT_CAP32_SPEED_25G) { - return ("25G"); - } else if (pcaps & FW_PORT_CAP32_SPEED_10G) { - return ("10G"); - } else { - return ("1G"); + const uint32_t pcap = pi->link_cfg.pcaps; + for (uint_t i = 0; i < ARRAY_SIZE(t4_port_speeds); i++) { + if (t4_port_speeds[i].tpsd_cap & pcap) { + return (t4_port_speeds[i].tpsd_name); + } } + + return ("-"); } -#define KS_UINIT(x) 
kstat_named_init(&kstatp->x, #x, KSTAT_DATA_ULONG) -#define KS_CINIT(x) kstat_named_init(&kstatp->x, #x, KSTAT_DATA_CHAR) -#define KS_U64INIT(x) kstat_named_init(&kstatp->x, #x, KSTAT_DATA_UINT64) -#define KS_U_SET(x, y) kstatp->x.value.ul = (y) -#define KS_C_SET(x, ...) \ - (void) snprintf(kstatp->x.value.c, 16, __VA_ARGS__) +#define KS_INIT_U64(kstatp, n) \ + kstat_named_init(&kstatp->n, #n, KSTAT_DATA_UINT64) +#define KS_INIT_CHAR(kstatp, n) \ + kstat_named_init(&kstatp->n, #n, KSTAT_DATA_CHAR) +#define KS_INIT_STR(kstatp, n) \ + kstat_named_init(&kstatp->n, #n, KSTAT_DATA_STRING) +#define KS_SET_U64(kstatp, n, v) kstatp->n.value.ul = (v) +#define KS_SET_CHAR(kstatp, n, ...) \ + (void) snprintf(kstatp->n.value.c, 16, __VA_ARGS__) +#define KS_SET_STR(kstatp, n, v) \ + kstat_named_setstr(&kstatp->n, v) /* * t4nex:X:config @@ -2097,80 +2638,57 @@ struct t4_kstats { kstat_named_t serial_number; kstat_named_t ec_level; kstat_named_t id; - kstat_named_t bus_type; - kstat_named_t bus_width; - kstat_named_t bus_speed; kstat_named_t core_clock; kstat_named_t port_cnt; kstat_named_t port_type; - kstat_named_t pci_vendor_id; - kstat_named_t pci_device_id; }; + static kstat_t * -setup_kstats(struct adapter *sc) +t4_setup_kstats(struct adapter *sc) { - kstat_t *ksp; - struct t4_kstats *kstatp; - int ndata; - struct pci_params *p = &sc->params.pci; - struct vpd_params *v = &sc->params.vpd; - uint16_t pci_vendor, pci_device; - - ndata = sizeof (struct t4_kstats) / sizeof (kstat_named_t); - - ksp = kstat_create(T4_NEXUS_NAME, ddi_get_instance(sc->dip), "config", - "nexus", KSTAT_TYPE_NAMED, ndata, 0); + const ulong_t ndata = sizeof (struct t4_kstats) / + sizeof (kstat_named_t); + kstat_t *ksp = kstat_create(T4_NEXUS_NAME, ddi_get_instance(sc->dip), + "config", "nexus", KSTAT_TYPE_NAMED, ndata, 0); if (ksp == NULL) { cxgb_printf(sc->dip, CE_WARN, "failed to initialize kstats."); return (NULL); } - kstatp = (struct t4_kstats *)ksp->ks_data; - - KS_UINIT(chip_ver); - 
KS_CINIT(fw_vers); - KS_CINIT(tp_vers); - KS_CINIT(driver_version); - KS_CINIT(serial_number); - KS_CINIT(ec_level); - KS_CINIT(id); - KS_CINIT(bus_type); - KS_CINIT(bus_width); - KS_CINIT(bus_speed); - KS_UINIT(core_clock); - KS_UINIT(port_cnt); - KS_CINIT(port_type); - KS_CINIT(pci_vendor_id); - KS_CINIT(pci_device_id); - - KS_U_SET(chip_ver, sc->params.chip); - KS_C_SET(fw_vers, "%d.%d.%d.%d", + struct t4_kstats *kstatp = (struct t4_kstats *)ksp->ks_data; + + KS_INIT_U64(kstatp, chip_ver); + KS_INIT_CHAR(kstatp, fw_vers); + KS_INIT_CHAR(kstatp, tp_vers); + KS_INIT_CHAR(kstatp, driver_version); + KS_INIT_STR(kstatp, serial_number); + KS_INIT_STR(kstatp, ec_level); + KS_INIT_STR(kstatp, id); + KS_INIT_U64(kstatp, core_clock); + KS_INIT_U64(kstatp, port_cnt); + KS_INIT_CHAR(kstatp, port_type); + + KS_SET_U64(kstatp, chip_ver, sc->params.chip); + KS_SET_CHAR(kstatp, fw_vers, "%d.%d.%d.%d", G_FW_HDR_FW_VER_MAJOR(sc->params.fw_vers), G_FW_HDR_FW_VER_MINOR(sc->params.fw_vers), G_FW_HDR_FW_VER_MICRO(sc->params.fw_vers), G_FW_HDR_FW_VER_BUILD(sc->params.fw_vers)); - KS_C_SET(tp_vers, "%d.%d.%d.%d", + KS_SET_CHAR(kstatp, tp_vers, "%d.%d.%d.%d", G_FW_HDR_FW_VER_MAJOR(sc->params.tp_vers), G_FW_HDR_FW_VER_MINOR(sc->params.tp_vers), G_FW_HDR_FW_VER_MICRO(sc->params.tp_vers), G_FW_HDR_FW_VER_BUILD(sc->params.tp_vers)); - KS_C_SET(driver_version, DRV_VERSION); - KS_C_SET(serial_number, "%s", v->sn); - KS_C_SET(ec_level, "%s", v->ec); - KS_C_SET(id, "%s", v->id); - KS_C_SET(bus_type, "pci-express"); - KS_C_SET(bus_width, "x%d lanes", p->width); - KS_C_SET(bus_speed, "%d", p->speed); - KS_U_SET(core_clock, v->cclk); - KS_U_SET(port_cnt, sc->params.nports); - - pci_vendor = pci_config_get16(sc->pci_regh, PCI_CONF_VENID); - KS_C_SET(pci_vendor_id, "0x%x", pci_vendor); - - pci_device = pci_config_get16(sc->pci_regh, PCI_CONF_DEVID); - KS_C_SET(pci_device_id, "0x%x", pci_device); - - KS_C_SET(port_type, "%s/%s/%s/%s", + KS_SET_CHAR(kstatp, driver_version, DRV_VERSION); + + const 
struct vpd_params *vpd = &sc->params.vpd; + KS_SET_STR(kstatp, serial_number, (const char *)vpd->sn); + KS_SET_STR(kstatp, ec_level, (const char *)vpd->ec); + KS_SET_STR(kstatp, id, (const char *)vpd->id); + KS_SET_U64(kstatp, core_clock, vpd->cclk); + KS_SET_U64(kstatp, port_cnt, sc->params.nports); + + KS_SET_CHAR(kstatp, port_type, "%s/%s/%s/%s", t4_port_speed_name(sc->port[0]), t4_port_speed_name(sc->port[1]), t4_port_speed_name(sc->port[2]), @@ -2192,8 +2710,28 @@ struct t4_wc_kstats { kstat_named_t write_coal_success; kstat_named_t write_coal_failure; }; + +static int +t4_update_wc_kstats(kstat_t *ksp, int rw) +{ + struct t4_wc_kstats *kstatp = (struct t4_wc_kstats *)ksp->ks_data; + struct adapter *sc = ksp->ks_private; + + if (rw == KSTAT_WRITE) + return (0); + + if (t4_cver_ge(sc, CHELSIO_T5)) { + const uint32_t wc_total = t4_read_reg(sc, A_SGE_STAT_TOTAL); + const uint32_t wc_failure = t4_read_reg(sc, A_SGE_STAT_MATCH); + KS_SET_U64(kstatp, write_coal_success, wc_total - wc_failure); + KS_SET_U64(kstatp, write_coal_failure, wc_failure); + } + + return (0); +} + static kstat_t * -setup_wc_kstats(struct adapter *sc) +t4_setup_wc_kstats(struct adapter *sc) { kstat_t *ksp; struct t4_wc_kstats *kstatp; @@ -2209,10 +2747,10 @@ setup_wc_kstats(struct adapter *sc) kstatp = (struct t4_wc_kstats *)ksp->ks_data; - KS_UINIT(write_coal_success); - KS_UINIT(write_coal_failure); + KS_INIT_U64(kstatp, write_coal_success); + KS_INIT_U64(kstatp, write_coal_failure); - ksp->ks_update = update_wc_kstats; + ksp->ks_update = t4_update_wc_kstats; /* Install the kstat */ ksp->ks_private = (void *)sc; kstat_install(ksp); @@ -2220,31 +2758,6 @@ setup_wc_kstats(struct adapter *sc) return (ksp); } -static int -update_wc_kstats(kstat_t *ksp, int rw) -{ - struct t4_wc_kstats *kstatp = (struct t4_wc_kstats *)ksp->ks_data; - struct adapter *sc = ksp->ks_private; - uint32_t wc_total, wc_success, wc_failure; - - if (rw == KSTAT_WRITE) - return (0); - - if (t4_cver_ge(sc, CHELSIO_T5)) { - 
wc_total = t4_read_reg(sc, A_SGE_STAT_TOTAL); - wc_failure = t4_read_reg(sc, A_SGE_STAT_MATCH); - wc_success = wc_total - wc_failure; - } else { - wc_success = 0; - wc_failure = 0; - } - - KS_U_SET(write_coal_success, wc_success); - KS_U_SET(write_coal_failure, wc_failure); - - return (0); -} - /* * cxgbe:X:fec * @@ -2272,21 +2785,18 @@ struct cxgbe_port_fec_kstats { }; static uint32_t -read_fec_pair(struct port_info *pi, uint32_t lo_reg, uint32_t high_reg) +t4_read_fec_pair(struct port_info *pi, uint32_t lo_reg, uint32_t high_reg) { struct adapter *sc = pi->adapter; - uint8_t port = pi->tx_chan; - uint32_t low, high, ret; + const uint8_t port = pi->tx_chan; - low = t4_read_reg(sc, T5_PORT_REG(port, lo_reg)); - high = t4_read_reg(sc, T5_PORT_REG(port, high_reg)); - ret = low & 0xffff; - ret |= (high & 0xffff) << 16; - return (ret); + const uint32_t low = t4_read_reg(sc, T5_PORT_REG(port, lo_reg)); + const uint32_t high = t4_read_reg(sc, T5_PORT_REG(port, high_reg)); + return ((low & 0xffff) | ((high & 0xffff) << 16)); } static int -update_port_fec_kstats(kstat_t *ksp, int rw) +t4_update_fec_kstats(kstat_t *ksp, int rw) { struct cxgbe_port_fec_kstats *fec = ksp->ks_data; struct port_info *pi = ksp->ks_private; @@ -2298,44 +2808,44 @@ update_port_fec_kstats(kstat_t *ksp, int rw) /* * First go ahead and gather RS related stats. 
*/ - fec->rs_corr.value.ui64 += read_fec_pair(pi, T6_RS_FEC_CCW_LO, - T6_RS_FEC_CCW_HI); - fec->rs_uncorr.value.ui64 += read_fec_pair(pi, T6_RS_FEC_NCCW_LO, - T6_RS_FEC_NCCW_HI); - fec->rs_sym0_corr.value.ui64 += read_fec_pair(pi, T6_RS_FEC_SYMERR0_LO, - T6_RS_FEC_SYMERR0_HI); - fec->rs_sym1_corr.value.ui64 += read_fec_pair(pi, T6_RS_FEC_SYMERR1_LO, - T6_RS_FEC_SYMERR1_HI); - fec->rs_sym2_corr.value.ui64 += read_fec_pair(pi, T6_RS_FEC_SYMERR2_LO, - T6_RS_FEC_SYMERR2_HI); - fec->rs_sym3_corr.value.ui64 += read_fec_pair(pi, T6_RS_FEC_SYMERR3_LO, - T6_RS_FEC_SYMERR3_HI); + fec->rs_corr.value.ui64 += + t4_read_fec_pair(pi, T6_RS_FEC_CCW_LO, T6_RS_FEC_CCW_HI); + fec->rs_uncorr.value.ui64 += + t4_read_fec_pair(pi, T6_RS_FEC_NCCW_LO, T6_RS_FEC_NCCW_HI); + fec->rs_sym0_corr.value.ui64 += + t4_read_fec_pair(pi, T6_RS_FEC_SYMERR0_LO, T6_RS_FEC_SYMERR0_HI); + fec->rs_sym1_corr.value.ui64 += + t4_read_fec_pair(pi, T6_RS_FEC_SYMERR1_LO, T6_RS_FEC_SYMERR1_HI); + fec->rs_sym2_corr.value.ui64 += + t4_read_fec_pair(pi, T6_RS_FEC_SYMERR2_LO, T6_RS_FEC_SYMERR2_HI); + fec->rs_sym3_corr.value.ui64 += + t4_read_fec_pair(pi, T6_RS_FEC_SYMERR3_LO, T6_RS_FEC_SYMERR3_HI); /* * Now go through and try to grab Firecode/BASE-R stats. 
*/ - fec->fc_lane0_corr.value.ui64 += read_fec_pair(pi, T6_FC_FEC_L0_CERR_LO, - T6_FC_FEC_L0_CERR_HI); - fec->fc_lane0_uncorr.value.ui64 += read_fec_pair(pi, - T6_FC_FEC_L0_NCERR_LO, T6_FC_FEC_L0_NCERR_HI); - fec->fc_lane1_corr.value.ui64 += read_fec_pair(pi, T6_FC_FEC_L1_CERR_LO, - T6_FC_FEC_L1_CERR_HI); - fec->fc_lane1_uncorr.value.ui64 += read_fec_pair(pi, - T6_FC_FEC_L1_NCERR_LO, T6_FC_FEC_L1_NCERR_HI); - fec->fc_lane2_corr.value.ui64 += read_fec_pair(pi, T6_FC_FEC_L2_CERR_LO, - T6_FC_FEC_L2_CERR_HI); - fec->fc_lane2_uncorr.value.ui64 += read_fec_pair(pi, - T6_FC_FEC_L2_NCERR_LO, T6_FC_FEC_L2_NCERR_HI); - fec->fc_lane3_corr.value.ui64 += read_fec_pair(pi, T6_FC_FEC_L3_CERR_LO, - T6_FC_FEC_L3_CERR_HI); - fec->fc_lane3_uncorr.value.ui64 += read_fec_pair(pi, - T6_FC_FEC_L3_NCERR_LO, T6_FC_FEC_L3_NCERR_HI); + fec->fc_lane0_corr.value.ui64 += + t4_read_fec_pair(pi, T6_FC_FEC_L0_CERR_LO, T6_FC_FEC_L0_CERR_HI); + fec->fc_lane0_uncorr.value.ui64 += + t4_read_fec_pair(pi, T6_FC_FEC_L0_NCERR_LO, T6_FC_FEC_L0_NCERR_HI); + fec->fc_lane1_corr.value.ui64 += + t4_read_fec_pair(pi, T6_FC_FEC_L1_CERR_LO, T6_FC_FEC_L1_CERR_HI); + fec->fc_lane1_uncorr.value.ui64 += + t4_read_fec_pair(pi, T6_FC_FEC_L1_NCERR_LO, T6_FC_FEC_L1_NCERR_HI); + fec->fc_lane2_corr.value.ui64 += + t4_read_fec_pair(pi, T6_FC_FEC_L2_CERR_LO, T6_FC_FEC_L2_CERR_HI); + fec->fc_lane2_uncorr.value.ui64 += + t4_read_fec_pair(pi, T6_FC_FEC_L2_NCERR_LO, T6_FC_FEC_L2_NCERR_HI); + fec->fc_lane3_corr.value.ui64 += + t4_read_fec_pair(pi, T6_FC_FEC_L3_CERR_LO, T6_FC_FEC_L3_CERR_HI); + fec->fc_lane3_uncorr.value.ui64 += + t4_read_fec_pair(pi, T6_FC_FEC_L3_NCERR_LO, T6_FC_FEC_L3_NCERR_HI); return (0); } static kstat_t * -setup_port_fec_kstats(struct port_info *pi) +t4_init_fec_kstats(struct port_info *pi) { kstat_t *ksp; struct cxgbe_port_fec_kstats *kstatp; @@ -2354,22 +2864,22 @@ setup_port_fec_kstats(struct port_info *pi) } kstatp = ksp->ks_data; - KS_U64INIT(rs_corr); - KS_U64INIT(rs_uncorr); - KS_U64INIT(rs_sym0_corr); 
- KS_U64INIT(rs_sym1_corr); - KS_U64INIT(rs_sym2_corr); - KS_U64INIT(rs_sym3_corr); - KS_U64INIT(fc_lane0_corr); - KS_U64INIT(fc_lane0_uncorr); - KS_U64INIT(fc_lane1_corr); - KS_U64INIT(fc_lane1_uncorr); - KS_U64INIT(fc_lane2_corr); - KS_U64INIT(fc_lane2_uncorr); - KS_U64INIT(fc_lane3_corr); - KS_U64INIT(fc_lane3_uncorr); - - ksp->ks_update = update_port_fec_kstats; + KS_INIT_U64(kstatp, rs_corr); + KS_INIT_U64(kstatp, rs_uncorr); + KS_INIT_U64(kstatp, rs_sym0_corr); + KS_INIT_U64(kstatp, rs_sym1_corr); + KS_INIT_U64(kstatp, rs_sym2_corr); + KS_INIT_U64(kstatp, rs_sym3_corr); + KS_INIT_U64(kstatp, fc_lane0_corr); + KS_INIT_U64(kstatp, fc_lane0_uncorr); + KS_INIT_U64(kstatp, fc_lane1_corr); + KS_INIT_U64(kstatp, fc_lane1_uncorr); + KS_INIT_U64(kstatp, fc_lane2_corr); + KS_INIT_U64(kstatp, fc_lane2_uncorr); + KS_INIT_U64(kstatp, fc_lane3_corr); + KS_INIT_U64(kstatp, fc_lane3_uncorr); + + ksp->ks_update = t4_update_fec_kstats; ksp->ks_private = pi; kstat_install(ksp); @@ -2380,43 +2890,42 @@ int t4_port_full_init(struct port_info *pi) { struct adapter *sc = pi->adapter; - uint16_t *rss; struct sge_rxq *rxq; int rc, i; ASSERT((pi->flags & TPF_INIT_DONE) == 0); - /* - * Allocate tx/rx/fl queues for this port. - */ - rc = t4_setup_port_queues(pi); - if (rc != 0) - goto done; /* error message displayed already */ + /* Allocate TX/RX/FL queues for this port. */ + if ((rc = t4_port_queues_init(pi)) != 0) { + goto done; + } - /* - * Setup RSS for this port. - */ - rss = kmem_zalloc(pi->nrxq * sizeof (*rss), KM_SLEEP); + /* Setup RSS for this port. 
*/ + uint16_t *rss = kmem_zalloc(pi->rxq_count * sizeof (*rss), KM_SLEEP); for_each_rxq(pi, i, rxq) { - rss[i] = rxq->iq.abs_id; + rss[i] = rxq->iq.tsi_abs_id; } rc = -t4_config_rss_range(sc, sc->mbox, pi->viid, 0, - pi->rss_size, rss, pi->nrxq); - kmem_free(rss, pi->nrxq * sizeof (*rss)); + pi->rss_size, rss, pi->rxq_count); + kmem_free(rss, pi->rxq_count * sizeof (*rss)); if (rc != 0) { cxgb_printf(pi->dip, CE_WARN, "rss_config failed: %d", rc); goto done; } - /* - * Initialize our per-port FEC kstats. - */ - pi->ksp_fec = setup_port_fec_kstats(pi); + t4_port_kstats_init(pi); + pi->ksp_fec = t4_init_fec_kstats(pi); pi->flags |= TPF_INIT_DONE; + done: - if (rc != 0) - (void) t4_port_full_uninit(pi); + if (rc != 0) { + /* + * Clean up any state which may be lingering due to + * failure part way through initialization. + */ + t4_port_full_uninit(pi); + } return (rc); } @@ -2424,83 +2933,16 @@ done: /* * Idempotent.
- */ - rxq->fl.flags &= ~FL_DOOMED; - - t4_iq_gts_update(iq, iq->intr_params, 0); - - } - mutex_exit(&sc->sfl_lock); -} - -void -t4_port_queues_disable(struct port_info *pi) -{ - int i; - struct adapter *sc = pi->adapter; - struct sge_rxq *rxq; - - ASSERT(pi->flags & TPF_INIT_DONE); - - /* - * TODO: need proper implementation for all tx queues (ctrl, eth, ofld). - */ - - for_each_rxq(pi, i, rxq) { - while (atomic_cas_uint(&rxq->iq.state, IQS_IDLE, - IQS_DISABLED) != IQS_IDLE) - msleep(1); - } - - mutex_enter(&sc->sfl_lock); - for_each_rxq(pi, i, rxq) { - rxq->fl.flags |= FL_DOOMED; - } - mutex_exit(&sc->sfl_lock); - /* TODO: need to wait for all fl's to be removed from sc->sfl */ } void @@ -2568,6 +3010,91 @@ t4_os_set_hw_addr(struct adapter *sc, int idx, const uint8_t *hw_addr) bcopy(hw_addr, sc->port[idx]->hw_addr, ETHERADDRL); } +/* Add thread to list of consumers waiting to access adapter mailbox */ +void +t4_mbox_waiter_add(struct adapter *sc, t4_mbox_waiter_t *ent) +{ + mutex_enter(&sc->mbox_lock); + ent->thread = curthread; + list_insert_tail(&sc->mbox_list, ent); + mutex_exit(&sc->mbox_lock); +} + +/* Remove thread from list of consumers waiting to access adapter mailbox */ +void +t4_mbox_waiter_remove(struct adapter *sc, t4_mbox_waiter_t *ent) +{ + ASSERT(ent->thread == curthread); + + mutex_enter(&sc->mbox_lock); + const bool was_owner = (list_head(&sc->mbox_list) == ent); + list_remove(&sc->mbox_list, ent); + + if (was_owner && !list_is_empty(&sc->mbox_list)) { + /* + * Wake the other threads waiting on the mbox as we are vacating + * the "owner" slot. + */ + cv_broadcast(&sc->mbox_cv); + } + mutex_exit(&sc->mbox_lock); +} + +/* + * Wait for the current thread, which has called t4_mbox_waiter_add(), to become + * the "owner" of the adapter mailbox (head of the waiter list). + * + * Returns true if current thread is the owner, else false if we slept/spun for + * `wait_us` and are not yet owner (and thus should recheck adapter status). 
+ */ +bool +t4_mbox_wait_owner(struct adapter *sc, uint_t wait_us, bool sleep_ok) +{ + mutex_enter(&sc->mbox_lock); + t4_mbox_waiter_t *head = list_head(&sc->mbox_list); + ASSERT(head != NULL); + + if (head->thread == curthread) { + mutex_exit(&sc->mbox_lock); + return (true); + } + + if (!sleep_ok) { + mutex_exit(&sc->mbox_lock); + drv_usecwait(wait_us); + + mutex_enter(&sc->mbox_lock); + head = list_head(&sc->mbox_list); + ASSERT(head != NULL); + bool is_owner = head->thread == curthread; + mutex_exit(&sc->mbox_lock); + return (is_owner); + } + + /* + * Using a signal-aware wait would be more courteous here, but much of + * the logic which ultimately accesses the device mbox is ill-equipped + * to gracefully handle EINTR failures. + */ + const int res = cv_reltimedwait(&sc->mbox_cv, &sc->mbox_lock, + USEC_TO_TICK(wait_us), TR_MICROSEC); + if (res > 0) { + head = list_head(&sc->mbox_list); + ASSERT(head != NULL); + if (head->thread == curthread) { + /* + * CV was signaled and this thread now occupies the head + * of the list (indicating mbox ownership). 
+ */ + mutex_exit(&sc->mbox_lock); + return (true); + } + } + mutex_exit(&sc->mbox_lock); + return (false); +} + + uint32_t t4_read_reg(struct adapter *sc, uint32_t reg) { @@ -2790,16 +3317,12 @@ t4_cxgbe_attach(struct port_info *pi, dev_info_t *dip) mac->m_driver = pi; mac->m_dip = dip; mac->m_src_addr = pi->hw_addr; - mac->m_callbacks = pi->mc; + mac->m_callbacks = &t4_mac_callbacks; mac->m_max_sdu = pi->mtu; /* mac_register() treats this as const, so we can cast it away */ mac->m_priv_props = (char **)props; mac->m_margin = VLAN_TAGSZ; - - if (!mac->m_callbacks->mc_unicst) { - /* Multiple rings enabled */ - mac->m_v12n = MAC_VIRT_LEVEL1; - } + mac->m_v12n = MAC_VIRT_LEVEL1; mac_handle_t mh = NULL; const int rc = mac_register(mac, &mh); diff --git a/usr/src/uts/common/io/cxgbe/t4nex/t4_sge.c b/usr/src/uts/common/io/cxgbe/t4nex/t4_sge.c index 06d0e3f6d5..0c1e03f044 100644 --- a/usr/src/uts/common/io/cxgbe/t4nex/t4_sge.c +++ b/usr/src/uts/common/io/cxgbe/t4nex/t4_sge.c @@ -69,10 +69,6 @@ struct txinfo { struct ulptx_sge_pair reserved[TX_SGL_SEGS / 2]; }; -struct mblk_pair { - mblk_t *head, *tail; -}; - struct rxbuf { kmem_cache_t *cache; /* the kmem_cache this rxb came from */ ddi_dma_handle_t dhdl; @@ -84,24 +80,33 @@ struct rxbuf { volatile uint_t ref_cnt; }; -static int service_iq(struct sge_iq *iq, int budget); -static inline void init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, - int8_t pktc_idx, int qsize, uint8_t esize); -static inline void init_fl(struct sge_fl *fl, uint16_t qsize); -static int alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, - struct sge_fl *fl, int intr_idx, int cong); -static int free_iq_fl(struct port_info *pi, struct sge_iq *iq, - struct sge_fl *fl); -static int alloc_rxq(struct port_info *pi, struct sge_rxq *rxq, int intr_idx, - int i); -static int free_rxq(struct port_info *pi, struct sge_rxq *rxq); -static int eth_eq_alloc(struct adapter *sc, struct port_info *pi, - struct sge_eq *eq); -static int alloc_eq(struct 
adapter *sc, struct port_info *pi, - struct sge_eq *eq); -static int free_eq(struct adapter *sc, struct sge_eq *eq); -static int alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx); -static int free_txq(struct port_info *pi, struct sge_txq *txq); +static const uint16_t t4_iq_esize_bytes[] = { + [T4_IQ_ESIZE_16B] = 16, + [T4_IQ_ESIZE_32B] = 32, + [T4_IQ_ESIZE_64B] = 64, + [T4_IQ_ESIZE_128B] = 128, +}; + +typedef struct t4_iq_params { + t4_iq_type_t tip_iq_type; + uint8_t tip_tmr_idx; + int8_t tip_pktc_idx; + uint16_t tip_qsize; + t4_iq_esize_t tip_esize; + uint16_t tip_fl_qsize; + int tip_cong_chan; + t4_sge_iq_t *tip_intr_evtq; + uint_t tip_intr_idx; +} t4_iq_params_t; + +static int t4_alloc_eq_base(struct port_info *, t4_sge_eq_t *); +static void t4_free_iq(struct port_info *, t4_sge_iq_t *); +static int t4_alloc_rxq(struct port_info *, struct sge_rxq *, uint_t); +static void t4_free_rxq(struct port_info *, struct sge_rxq *); +static void t4_free_eq(struct port_info *, t4_sge_eq_t *); +static void t4_alloc_eq_post(struct port_info *, t4_sge_eq_t *); +static int t4_alloc_txq(struct port_info *, struct sge_txq *, int); +static void t4_free_txq(struct port_info *, struct sge_txq *); static int alloc_dma_memory(struct adapter *sc, size_t len, int flags, ddi_device_acc_attr_t *acc_attr, ddi_dma_attr_t *dma_attr, ddi_dma_handle_t *dma_hdl, ddi_acc_handle_t *acc_hdl, uint64_t *pba, @@ -114,15 +119,14 @@ static int free_desc_ring(ddi_dma_handle_t *dhdl, ddi_acc_handle_t *ahdl); static int alloc_tx_copybuffer(struct adapter *sc, size_t len, ddi_dma_handle_t *dma_hdl, ddi_acc_handle_t *acc_hdl, uint64_t *pba, caddr_t *pva); -static inline bool is_new_response(const struct sge_iq *iq, - struct rsp_ctrl **ctrl); -static inline void iq_next(struct sge_iq *iq); -static int refill_fl(struct adapter *sc, struct sge_fl *fl, int nbufs); -static void refill_sfl(void *arg); -static void add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl); -static void free_fl_bufs(struct 
sge_fl *fl); -static mblk_t *get_fl_payload(struct adapter *sc, struct sge_fl *fl, - uint32_t len_newbuf, int *fl_bufs_used); +static inline bool t4_get_new_rsp(const t4_sge_iq_t *, struct rsp_ctrl *); +static inline void t4_iq_next_entry(t4_sge_iq_t *iq); +static t4_iq_result_t t4_process_event_iq(t4_sge_iq_t *event_iq); +static bool t4_fl_refill(struct sge_fl *, uint_t); +static void t4_sfl_enqueue(struct adapter *, struct sge_fl *); +static void t4_sfl_process(void *); +static void t4_fl_free_bufs(struct sge_fl *fl); +static mblk_t *t4_fl_get_payload(struct sge_fl *, uint32_t, bool); static int get_frame_txinfo(struct sge_txq *txq, mblk_t **fp, struct txinfo *txinfo, int sgl_only); static inline int fits_in_txb(struct sge_txq *txq, int len, int *waste); @@ -140,29 +144,27 @@ static int write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, mblk_t *m, static void t4_write_flush_wr(struct sge_txq *); static inline void write_ulp_cpl_sgl(struct port_info *pi, struct sge_txq *txq, struct txpkts *txpkts, struct txinfo *txinfo); -static inline void copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, - int len); +static inline void copy_to_txd(t4_sge_eq_t *eq, caddr_t from, caddr_t *to, + size_t len); static void t4_tx_ring_db(struct sge_txq *); -static uint_t t4_tx_reclaim_descs(struct sge_txq *, uint_t, mblk_t **); -static int t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, - mblk_t *m); -static inline void ring_fl_db(struct adapter *sc, struct sge_fl *fl); +static uint16_t t4_tx_reclaim_credits(struct sge_txq *, uint16_t, mblk_t **); +static void t4_fl_ring_db(struct sge_fl *fl); static kstat_t *setup_port_config_kstats(struct port_info *pi); static kstat_t *setup_port_info_kstats(struct port_info *pi); static kstat_t *setup_rxq_kstats(struct port_info *pi, struct sge_rxq *rxq, - int idx); + uint_t idx); static int update_rxq_kstats(kstat_t *ksp, int rw); static int update_port_info_kstats(kstat_t *ksp, int rw); static kstat_t 
*setup_txq_kstats(struct port_info *pi, struct sge_txq *txq, int idx); static int update_txq_kstats(kstat_t *ksp, int rw); -static void t4_sge_egr_update(struct sge_iq *, const struct rss_header *); -static int t4_handle_cpl_msg(struct sge_iq *, const struct rss_header *, +static void t4_sge_egr_update(t4_sge_iq_t *, const struct rss_header *); +static int t4_handle_cpl_msg(t4_sge_iq_t *, const struct rss_header *, mblk_t *); -static int t4_handle_fw_msg(struct sge_iq *, const struct rss_header *); +static int t4_handle_fw_msg(t4_sge_iq_t *, const struct rss_header *); static kmem_cache_t *rxbuf_cache_create(struct rxbuf_cache_params *); -static struct rxbuf *rxbuf_alloc(kmem_cache_t *, int, uint_t); +static struct rxbuf *rxbuf_alloc(kmem_cache_t *, int); static void rxbuf_free(struct rxbuf *); static int rxbuf_ctor(void *, void *, int); static void rxbuf_dtor(void *, void *); @@ -173,34 +175,54 @@ t4_rss_payload(const struct rss_header *rss) return ((void *)(&rss[1])); } -static inline struct sge_iq ** +static inline t4_sge_iq_t ** t4_iqmap_slot(struct adapter *sc, uint_t cntxt_id) { - const uint_t idx = cntxt_id - sc->sge.iq_start; + const uint_t idx = cntxt_id - sc->sge.iqmap_start; VERIFY3U(idx, <, sc->sge.iqmap_sz); return (&sc->sge.iqmap[idx]); } -static inline struct sge_eq ** +static inline t4_sge_eq_t ** t4_eqmap_slot(struct adapter *sc, uint_t cntxt_id) { - const uint_t idx = cntxt_id - sc->sge.eq_start; + const uint_t idx = cntxt_id - sc->sge.eqmap_start; VERIFY3U(idx, <, sc->sge.eqmap_sz); return (&sc->sge.eqmap[idx]); } -static inline int -reclaimable(struct sge_eq *eq) +/* + * Get the address of the EQ host credit at the provided index. 
+ */ +static inline void * +t4_eq_credit(t4_sge_eq_t *eq, uint16_t idx) { - unsigned int cidx; + ASSERT3U(idx, <, eq->tse_qsize_spg); + uint8_t *credits = eq->tse_ring; + return (&credits[idx * EQ_HC_SIZE]); +} + +static inline struct sge_rxq * +t4_iq_to_rxq(t4_sge_iq_t *iq) +{ + if (iq->tsi_iqtype == TIQT_ETH_RX) { + return (__containerof(iq, struct sge_rxq, iq)); + } else { + return (NULL); + } +} - cidx = eq->spg->cidx; /* stable snapshot */ - cidx = be16_to_cpu(cidx); +static inline t4_sge_iq_t * +t4_fl_to_iq(struct sge_fl *fl) +{ + /* + * Currently, RXQs are the only consumer of sge_fl, and are thus the + * only case we need to worry about. + */ + struct sge_rxq *rxq = __containerof(fl, struct sge_rxq, fl); + ASSERT(rxq->iq.tsi_iqtype == TIQT_ETH_RX); - if (cidx >= eq->cidx) - return (cidx - eq->cidx); - else - return (cidx + eq->cap - eq->cidx); + return (&rxq->iq); } void @@ -209,8 +231,7 @@ t4_sge_init(struct adapter *sc) struct driver_properties *p = &sc->props; ddi_dma_attr_t *dma_attr; ddi_device_acc_attr_t *acc_attr; - uint32_t sge_control, sge_conm_ctrl; - int egress_threshold; + uint32_t sge_control; /* * Device access and DMA attributes for descriptor rings @@ -261,16 +282,25 @@ t4_sge_init(struct adapter *sc) */ sge_control = t4_read_reg(sc, A_SGE_CONTROL); sc->sge.pktshift = G_PKTSHIFT(sge_control); - sc->sge.stat_len = (sge_control & F_EGRSTATUSPAGESIZE) ? 128 : 64; + sc->sge.eq_spg_len = (sge_control & F_EGRSTATUSPAGESIZE) ? 2 : 1; /* t4_nex uses FLM packed mode */ - sc->sge.fl_align = t4_fl_pkt_align(sc, true); + const int fl_align = t4_fl_pkt_align(sc, true); + VERIFY3S(fl_align, >=, 0); + /* + * Minimum alignment for freelist buffer sizes is stated as 16, but in + * order to keep bits [3:0] clear for identifying the buffer size + * register, we use a minimum of 32. + * + * See A_SGE_FL_BUFFER_SIZE0 setting below. 
+ */ + sc->sge.fl_align = MAX(fl_align, 32); /* - * Device access and DMA attributes for rx buffers + * Device access and DMA attributes for RX buffers */ sc->sge.rxb_params.dip = sc->dip; - sc->sge.rxb_params.buf_size = rx_buf_size; + sc->sge.rxb_params.buf_size = P2ROUNDUP(rx_buf_size, fl_align); acc_attr = &sc->sge.rxb_params.acc_attr_rx; acc_attr->devacc_attr_version = DDI_DEVICE_ATTR_V0; @@ -281,11 +311,6 @@ t4_sge_init(struct adapter *sc) dma_attr->dma_attr_addr_lo = 0; dma_attr->dma_attr_addr_hi = UINT64_MAX; dma_attr->dma_attr_count_max = UINT64_MAX; - /* - * Low 4 bits of an rx buffer address have a special meaning to the SGE - * and an rx buf cannot have an address with any of these bits set. - * FL_ALIGN is >= 32 so we're sure things are ok. - */ dma_attr->dma_attr_align = sc->sge.fl_align; dma_attr->dma_attr_burstsizes = 0xfff; dma_attr->dma_attr_minxfer = 1; @@ -298,7 +323,7 @@ t4_sge_init(struct adapter *sc) sc->sge.rxbuf_cache = rxbuf_cache_create(&sc->sge.rxb_params); /* - * A FL with <= fl_starve_thres buffers is starving and a periodic + * A FL with <= fl_starve_threshold buffers is starving and a periodic * timer will attempt to refill it. This needs to be larger than the * SGE's Egress Congestion Threshold. If it isn't, then we can get * stuck waiting for new packets while the SGE is waiting for us to @@ -310,7 +335,8 @@ t4_sge_init(struct adapter *sc) * buffers. 
*/ - sge_conm_ctrl = t4_read_reg(sc, A_SGE_CONM_CTRL); + const uint32_t sge_conm_ctrl = t4_read_reg(sc, A_SGE_CONM_CTRL); + uint_t egress_threshold; switch (CHELSIO_CHIP_VERSION(sc->params.chip)) { case CHELSIO_T4: egress_threshold = G_EGRTHRESHOLD(sge_conm_ctrl); @@ -322,9 +348,20 @@ t4_sge_init(struct adapter *sc) default: egress_threshold = G_T6_EGRTHRESHOLDPACKING(sge_conm_ctrl); } - sc->sge.fl_starve_threshold = 2*egress_threshold + 1; + sc->sge.fl_starve_threshold = 2 * egress_threshold + 1; - t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0, rx_buf_size); + /* + * Set the size of buffers submitted through freelists. + * + * Strictly speaking, this is setting one of sixteen possible buffer + * sizes, with bits [3:0] of freelist entries designating the size + * register (0-15) which contains its corresponding size. + * + * Our driver does not currently make use of multiple sizes. Submitted + * buffers are at least 16-byte aligned, thus bits [3:0] are 0, + * selecting this size register. + */ + t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0, sc->sge.rxb_params.buf_size); t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, V_THRESHOLD_0(p->holdoff_pktcnt[0]) | @@ -343,201 +380,216 @@ t4_sge_init(struct adapter *sc) V_TIMERVALUE5(us_to_core_ticks(sc, p->holdoff_timer_us[5]))); } -static inline int -first_vector(struct port_info *pi) +static uint_t +t4_queue_to_intrq(struct adapter *sc, uint_t q_idx) { - struct adapter *sc = pi->adapter; - int rc = T4_EXTRA_INTR, i; - - if (sc->intr_count == 1) - return (0); - - for_each_port(sc, i) { - struct port_info *p = sc->port[i]; - - if (i == pi->port_id) - break; - - /* - * Not compiled with offload support and intr_count > 1. Only - * NIC queues exist and they'd better be taking direct - * interrupts. 
- */ - ASSERT(!(sc->flags & TAF_INTR_FWD)); - rc += p->nrxq; - } - return (rc); + return (q_idx % sc->intr_queue_cfg.intr_per_port); } /* - * Given an arbitrary "index," come up with an iq that can be used by other - * queues (of this port) for interrupt forwarding, SGE egress updates, etc. - * The iq returned is guaranteed to be something that takes direct interrupts. + * Assign an interrupt event queue to the Rx queue specified by q_idx. If + * we are in TIP_PER_PORT mode, this is done by multiplexing the Rx queues + * across the port's interrupt queues. Otherwise, all events are directed + * to the adapter-wide firmware queue. */ -static struct sge_iq * -port_intr_iq(struct port_info *pi, int idx) +static void +t4_rxq_intr_assign(struct port_info *pi, uint_t rxq_idx, + struct t4_iq_params *iqp) { struct adapter *sc = pi->adapter; - struct sge *s = &sc->sge; - struct sge_iq *iq = NULL; + const struct t4_intrs_queues *iqc = &sc->intr_queue_cfg; - if (sc->intr_count == 1) - return (&sc->sge.fwq); + switch (iqc->intr_plan) { + case TIP_PER_PORT: { + uint_t intr_iq_idx = t4_queue_to_intrq(sc, rxq_idx); + iqp->tip_intr_evtq = &pi->intr_iqs[intr_iq_idx]; + iqp->tip_intr_idx = INTR_FORWARDED; + break; + } + case TIP_SINGLE: + case TIP_ERR_QUEUES: + default: + /* Forward all RXQ interrupts to FWQ */ + iqp->tip_intr_evtq = &sc->sge.fwq; + iqp->tip_intr_idx = INTR_FORWARDED; + break; + } +} - /* - * Not compiled with offload support and intr_count > 1. Only NIC - * queues exist and they'd better be taking direct interrupts. 
- */ - ASSERT(!(sc->flags & TAF_INTR_FWD)); +void +t4_port_kstats_init(struct port_info *pi) +{ + ASSERT(pi->ksp_config == NULL); + ASSERT(pi->ksp_info == NULL); - idx %= pi->nrxq; - iq = &s->rxq[pi->first_rxq + idx].iq; + pi->ksp_config = setup_port_config_kstats(pi); + pi->ksp_info = setup_port_info_kstats(pi); +} - return (iq); +void +t4_port_kstats_fini(struct port_info *pi) +{ + if (pi->ksp_config != NULL) { + kstat_delete(pi->ksp_config); + pi->ksp_config = NULL; + } + if (pi->ksp_info != NULL) { + kstat_delete(pi->ksp_info); + pi->ksp_info = NULL; + } } int -t4_setup_port_queues(struct port_info *pi) +t4_port_queues_init(struct port_info *pi) { - int rc = 0, i, intr_idx, j; - struct sge_rxq *rxq; - struct sge_txq *txq; + int rc = 0; + uint_t q_idx; struct adapter *sc = pi->adapter; - struct driver_properties *p = &sc->props; - pi->ksp_config = setup_port_config_kstats(pi); - pi->ksp_info = setup_port_info_kstats(pi); + struct sge_rxq *rxq; + for_each_rxq(pi, q_idx, rxq) { + if ((rc = t4_alloc_rxq(pi, rxq, q_idx)) != 0) { + goto cleanup; + } + } - /* Interrupt vector to start from (when using multiple vectors) */ - intr_idx = first_vector(pi); + struct sge_txq *txq; + for_each_txq(pi, q_idx, txq) { + txq->eq.tse_flags = 0; + txq->eq.tse_tx_chan = pi->tx_chan; + txq->eq.tse_qsize = sc->props.qsize_txq; - /* - * First pass over all rx queues (NIC and TOE): - * a) initialize iq and fl - * b) allocate queue iff it will take direct interrupts. - */ + if (sc->intr_queue_cfg.intr_plan == TIP_PER_PORT) { + /* + * If we have per port interrupts, then multiplex + * TX completion events across them. + */ + uint_t intr_iq_idx = t4_queue_to_intrq(sc, q_idx); + txq->eq.tse_iqid = + pi->intr_iqs[intr_iq_idx].tsi_cntxt_id; + } else { + /* + * Otherwise, handle all TX completion events in + * the firmware queue. 
+ */ + txq->eq.tse_iqid = sc->sge.fwq.tsi_cntxt_id; + } - for_each_rxq(pi, i, rxq) { + if ((rc = t4_alloc_txq(pi, txq, q_idx)) != 0) { + goto cleanup; + } + } - init_iq(&rxq->iq, sc, pi->tmr_idx, pi->pktc_idx, p->qsize_rxq, - RX_IQ_ESIZE); + return (0); - init_fl(&rxq->fl, p->qsize_rxq / 8); /* 8 bufs in each entry */ +cleanup: + t4_port_queues_fini(pi); + return (rc); +} - if ((!(sc->flags & TAF_INTR_FWD)) || - (sc->intr_count > 1 && pi->nrxq)) { - rxq->iq.flags |= IQ_INTR; - rc = alloc_rxq(pi, rxq, intr_idx, i); - if (rc != 0) - goto done; - intr_idx++; - } +void +t4_port_queues_fini(struct port_info *pi) +{ + uint_t i; + struct sge_txq *txq; + for_each_txq(pi, i, txq) { + t4_free_txq(pi, txq); } - /* - * Second pass over all rx queues (NIC and TOE). The queues forwarding - * their interrupts are allocated now. - */ - j = 0; + struct sge_rxq *rxq; for_each_rxq(pi, i, rxq) { - if (rxq->iq.flags & IQ_INTR) - continue; + t4_free_rxq(pi, rxq); + } +} - intr_idx = port_intr_iq(pi, j)->abs_id; +void +t4_port_queues_enable(struct port_info *pi) +{ + ASSERT(pi->flags & TPF_INIT_DONE); - rc = alloc_rxq(pi, rxq, intr_idx, i); - if (rc != 0) - goto done; - j++; - } + uint_t i; + struct adapter *sc = pi->adapter; + struct sge_rxq *rxq; - /* - * Now the tx queues. Only one pass needed. - */ - j = 0; - for_each_txq(pi, i, txq) { - txq->eq.flags = 0; - txq->eq.tx_chan = pi->tx_chan; - txq->eq.qsize = p->qsize_txq; + mutex_enter(&sc->sfl_lock); + for_each_rxq(pi, i, rxq) { + t4_sge_iq_t *iq = &rxq->iq; - /* For now, direct all TX queue notifications to the FW IQ. */ - txq->eq.iqid = sc->sge.fwq.cntxt_id; + IQ_LOCK(iq); + VERIFY0(iq->tsi_flags & IQ_ENABLED); + iq->tsi_flags |= IQ_ENABLED; + + /* + * Freelists which were marked "doomed" by a previous + * t4_port_queues_disable() call should clear that status. 
+ */ + rxq->fl.sfl_flags &= ~SFL_DOOMED; - rc = alloc_txq(pi, txq, i); - if (rc != 0) - goto done; + t4_iq_gts_update(iq, iq->tsi_gts_rearm, 0); + IQ_UNLOCK(iq); } + mutex_exit(&sc->sfl_lock); -done: - if (rc != 0) - (void) t4_teardown_port_queues(pi); + struct sge_txq *txq; + for_each_txq(pi, i, txq) { + t4_sge_eq_t *eq = &txq->eq; - return (rc); + EQ_LOCK(eq); + eq->tse_flags |= EQ_ENABLED; + EQ_UNLOCK(eq); + } } -/* - * Idempotent - */ -int -t4_teardown_port_queues(struct port_info *pi) +void +t4_port_queues_disable(struct port_info *pi) { - int i; + uint_t i; + struct adapter *sc = pi->adapter; struct sge_rxq *rxq; - struct sge_txq *txq; - - if (pi->ksp_config != NULL) { - kstat_delete(pi->ksp_config); - pi->ksp_config = NULL; - } - if (pi->ksp_info != NULL) { - kstat_delete(pi->ksp_info); - pi->ksp_info = NULL; - } - for_each_txq(pi, i, txq) { - (void) free_txq(pi, txq); - } + ASSERT(pi->flags & TPF_INIT_DONE); for_each_rxq(pi, i, rxq) { - if ((rxq->iq.flags & IQ_INTR) == 0) - (void) free_rxq(pi, rxq); - } + t4_sge_iq_t *iq = &rxq->iq; - /* - * Then take down the rx queues that take direct interrupts. - */ + IQ_LOCK(iq); + iq->tsi_flags &= ~IQ_ENABLED; + IQ_UNLOCK(iq); + } + mutex_enter(&sc->sfl_lock); for_each_rxq(pi, i, rxq) { - if (rxq->iq.flags & IQ_INTR) - (void) free_rxq(pi, rxq); + rxq->fl.sfl_flags |= SFL_DOOMED; } + mutex_exit(&sc->sfl_lock); + /* TODO: need to wait for all fl's to be removed from sc->sfl */ - return (0); -} - -/* Deals with errors and forwarded interrupts */ -uint_t -t4_intr_all(caddr_t arg1, caddr_t arg2) -{ - - (void) t4_intr_err(arg1, arg2); - (void) t4_intr(arg1, arg2); + struct sge_txq *txq; + for_each_txq(pi, i, txq) { + t4_sge_eq_t *eq = &txq->eq; - return (DDI_INTR_CLAIMED); + EQ_LOCK(eq); + eq->tse_flags &= ~EQ_ENABLED; + EQ_UNLOCK(eq); + } + /* + * TODO: issue flush WR to EQs and wait for EGR update to ensure that + * all processing has completed. 
+ */ } /* - * We are counting on the values of t4_intr_config_t matching the register + * We are counting on the values of t4_gts_config_t matching the register * definitions from the shared code. */ -CTASSERT(TIC_SE_INTR_ARM == F_QINTR_CNT_EN); -CTASSERT(TIC_TIMER0 == V_QINTR_TIMER_IDX(X_TIMERREG_COUNTER0)); -CTASSERT(TIC_TIMER5 == V_QINTR_TIMER_IDX(X_TIMERREG_COUNTER5)); -CTASSERT(TIC_START_COUNTER == V_QINTR_TIMER_IDX(X_TIMERREG_RESTART_COUNTER)); +CTASSERT(TGC_SE_INTR_ARM == F_QINTR_CNT_EN); +CTASSERT(TGC_TIMER0 == V_QINTR_TIMER_IDX(X_TIMERREG_COUNTER0)); +CTASSERT(TGC_TIMER5 == V_QINTR_TIMER_IDX(X_TIMERREG_COUNTER5)); +CTASSERT(TGC_START_COUNTER == V_QINTR_TIMER_IDX(X_TIMERREG_RESTART_COUNTER)); void -t4_iq_update_intr_cfg(struct sge_iq *iq, uint8_t tmr_idx, int8_t pktc_idx) +t4_iq_update_intr_cfg(t4_sge_iq_t *iq, uint8_t tmr_idx, int8_t pktc_idx) { ASSERT((pktc_idx >= 0 && pktc_idx < SGE_NCOUNTERS) || pktc_idx == -1); IQ_LOCK_ASSERT_OWNED(iq); @@ -550,37 +602,37 @@ t4_iq_update_intr_cfg(struct sge_iq *iq, uint8_t tmr_idx, int8_t pktc_idx) */ ASSERT3U(tmr_idx, <, SGE_NTIMERS); - iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx) | - ((pktc_idx != -1) ? TIC_SE_INTR_ARM : 0); + iq->tsi_gts_rearm = V_QINTR_TIMER_IDX(tmr_idx) | + ((pktc_idx != -1) ? 
TGC_SE_INTR_ARM : 0); /* Update IQ for new packet count threshold, but only if enabled */ - if (pktc_idx != iq->intr_pktc_idx && pktc_idx >= 0) { + if (pktc_idx != iq->tsi_intr_pktc_idx && pktc_idx >= 0) { const uint32_t param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_IQ_INTCNTTHRESH) | - V_FW_PARAMS_PARAM_YZ(iq->cntxt_id); + V_FW_PARAMS_PARAM_YZ(iq->tsi_cntxt_id); const uint32_t val = pktc_idx; - struct adapter *sc = iq->adapter; + struct adapter *sc = iq->tsi_adapter; int rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val); if (rc != 0) { /* report error but carry on */ cxgb_printf(sc->dip, CE_WARN, "failed to set intr pktcnt index for IQ %d: %d", - iq->cntxt_id, rc); + iq->tsi_cntxt_id, rc); } } - iq->intr_pktc_idx = pktc_idx; + iq->tsi_intr_pktc_idx = pktc_idx; } void -t4_eq_update_dbq_timer(struct sge_eq *eq, struct port_info *pi) +t4_eq_update_dbq_timer(t4_sge_eq_t *eq, struct port_info *pi) { struct adapter *sc = pi->adapter; const uint32_t param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_TIMERIX) | - V_FW_PARAMS_PARAM_YZ(eq->cntxt_id); + V_FW_PARAMS_PARAM_YZ(eq->tse_cntxt_id); const uint32_t val = pi->dbq_timer_idx; int rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val); @@ -588,7 +640,7 @@ t4_eq_update_dbq_timer(struct sge_eq *eq, struct port_info *pi) /* report error but carry on */ cxgb_printf(sc->dip, CE_WARN, "failed to set DBQ timer index for EQ %d: %d", - eq->cntxt_id, rc); + eq->tse_cntxt_id, rc); } } @@ -597,13 +649,13 @@ t4_eq_update_dbq_timer(struct sge_eq *eq, struct port_info *pi) * ingress queue. 
*/ void -t4_iq_gts_update(struct sge_iq *iq, t4_intr_config_t cfg, uint16_t cidx_incr) +t4_iq_gts_update(t4_sge_iq_t *iq, t4_gts_config_t cfg, uint16_t cidx_incr) { const uint32_t value = - V_INGRESSQID((uint32_t)iq->cntxt_id) | + V_INGRESSQID((uint32_t)iq->tsi_cntxt_id) | V_CIDXINC((uint32_t)cidx_incr) | V_SEINTARM((uint32_t)cfg); - t4_write_reg(iq->adapter, MYPF_REG(A_SGE_PF_GTS), value); + t4_write_reg(iq->tsi_adapter, MYPF_REG(A_SGE_PF_GTS), value); } /* @@ -613,376 +665,466 @@ t4_iq_gts_update(struct sge_iq *iq, t4_intr_config_t cfg, uint16_t cidx_incr) * associated with the IQ. */ static void -t4_iq_gts_incr(struct sge_iq *iq, uint16_t cidx_incr) +t4_iq_gts_incr(t4_sge_iq_t *iq, uint16_t cidx_incr) { if (cidx_incr == 0) { return; } const uint32_t value = - V_INGRESSQID((uint32_t)iq->cntxt_id) | + V_INGRESSQID((uint32_t)iq->tsi_cntxt_id) | V_CIDXINC((uint32_t)cidx_incr) | V_SEINTARM((uint32_t)V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)); - t4_write_reg(iq->adapter, MYPF_REG(A_SGE_PF_GTS), value); + t4_write_reg(iq->tsi_adapter, MYPF_REG(A_SGE_PF_GTS), value); } -static void -t4_intr_rx_work(struct sge_iq *iq) -{ - mblk_t *mp = NULL; - struct sge_rxq *rxq = iq_to_rxq(iq); /* Use iff iq is part of rxq */ - RXQ_LOCK(rxq); - if (!iq->polling) { - mp = t4_ring_rx(rxq, iq->qsize/8); - t4_iq_gts_update(iq, iq->intr_params, 0); - } - RXQ_UNLOCK(rxq); - if (mp != NULL) { - mac_rx_ring(rxq->port->mh, rxq->ring_handle, mp, - rxq->ring_gen_num); - } -} - -/* Deals with interrupts on the given ingress queue */ -/* ARGSUSED */ uint_t -t4_intr(caddr_t arg1, caddr_t arg2) +t4_intr_all(caddr_t arg1, caddr_t arg2) { - struct sge_iq *iq = (struct sge_iq *)arg2; - int state; + struct adapter *sc = (struct adapter *)arg1; + + /* handle any device errors */ + t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0); + (void) t4_slow_intr_handler(sc); + + /* process fwq */ + (void) t4_process_event_iq(&sc->sge.fwq); - /* - * Right now receive polling is only enabled for MSI-X and - * when we have 
enough msi-x vectors i.e no interrupt forwarding. - */ - if (iq->adapter->props.multi_rings) { - t4_intr_rx_work(iq); - } else { - state = atomic_cas_uint(&iq->state, IQS_IDLE, IQS_BUSY); - if (state == IQS_IDLE) { - (void) service_iq(iq, 0); - (void) atomic_cas_uint(&iq->state, IQS_BUSY, IQS_IDLE); - } - } return (DDI_INTR_CLAIMED); } -/* Deals with error interrupts */ -/* ARGSUSED */ uint_t t4_intr_err(caddr_t arg1, caddr_t arg2) { struct adapter *sc = (struct adapter *)arg1; + /* handle any device errors */ t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0); (void) t4_slow_intr_handler(sc); return (DDI_INTR_CLAIMED); } -/* - * t4_ring_rx - Process responses from an SGE response queue. - * - * This function processes responses from an SGE response queue up to the - * supplied budget. Responses include received packets as well as control - * messages from FW or HW. - * - * It returns a chain of mblks containing the received data, to be - * passed up to mac_rx_ring(). - */ -mblk_t * -t4_ring_rx(struct sge_rxq *rxq, int budget) -{ - struct sge_iq *iq = &rxq->iq; - struct sge_fl *fl = &rxq->fl; /* Use iff IQ_HAS_FL */ - struct adapter *sc = iq->adapter; - struct rsp_ctrl *ctrl; - int ndescs = 0, fl_bufs_used = 0; - mblk_t *mblk_head = NULL, **mblk_tail = &mblk_head; - uint32_t received_bytes = 0, pkt_len = 0; - uint16_t err_vec; - - while (is_new_response(iq, &ctrl)) { - membar_consumer(); - - const uint8_t type_gen = ctrl->u.type_gen; - const uint8_t rsp_type = G_RSPD_TYPE(type_gen); - const bool overflowed = (type_gen & F_RSPD_QOVFL) != 0; - const uint32_t data_len = BE_32(ctrl->pldbuflen_qid); - - iq->stats.sis_processed++; - if (overflowed) { - iq->stats.sis_overflow++; - } +uint_t +t4_intr_fwq(caddr_t arg1, caddr_t arg2) +{ + struct adapter *sc = (struct adapter *)arg1; - const struct rss_header *rss = - (const struct rss_header *)iq->cdesc; - mblk_t *m = NULL; + (void) t4_process_event_iq(&sc->sge.fwq); - switch (rsp_type) { - case X_RSPD_TYPE_FLBUF: + return 
(DDI_INTR_CLAIMED); +} - ASSERT(iq->flags & IQ_HAS_FL); +uint_t +t4_intr_port_queue(caddr_t arg1, caddr_t arg2) +{ + t4_sge_iq_t *iq = (t4_sge_iq_t *)arg1; - if (CPL_RX_PKT == rss->opcode) { - const struct cpl_rx_pkt *cpl = - t4_rss_payload(rss); - pkt_len = be16_to_cpu(cpl->len); + (void) t4_process_event_iq(iq); - if (iq->polling && - ((received_bytes + pkt_len) > budget)) - goto done; + return (DDI_INTR_CLAIMED); +} - m = get_fl_payload(sc, fl, data_len, - &fl_bufs_used); - if (m == NULL) - goto done; +static bool +t4_fl_periodic_refill(struct sge_fl *fl) +{ + FL_LOCK(fl); + const bool starved = t4_fl_refill(fl, fl->bufs_cap / 8); + FL_UNLOCK(fl); - m->b_rptr += sc->sge.pktshift; - if (sc->params.tp.rx_pkt_encap) { - /* Enabled only in T6 config file */ - err_vec = G_T6_COMPR_RXERR_VEC( - ntohs(cpl->err_vec)); - } else { - err_vec = ntohs(cpl->err_vec); - } + return (starved); +} - const bool csum_ok = cpl->csum_calc && !err_vec; +/* + * Convenience struct for tracking entry types while servicing an IQ. + * Used to communicate said counts through the t4-process-* probes. + */ +struct sge_iq_totals { + uint_t sit_desc; + uint_t sit_flbuf; + uint_t sit_cpl; + uint_t sit_intr; + uint_t sit_rx_bytes; +}; - /* TODO: what about cpl->ip_frag? */ - if (csum_ok && !cpl->ip_frag) { - mac_hcksum_set(m, 0, 0, 0, 0xffff, - HCK_FULLCKSUM_OK | HCK_FULLCKSUM | - HCK_IPV4_HDRCKSUM_OK); - rxq->rxcsum++; - } - rxq->rxpkts++; - rxq->rxbytes += pkt_len; - received_bytes += pkt_len; +/* + * Process entries on an event Ingress Queue. This type of queue receives + * firmware events, Tx EGR messages, and Rx forwarded interrupts only. It is + * used by the firmware queue and the individual port queues. 
+ */ +static t4_iq_result_t +t4_process_event_iq(t4_sge_iq_t *event_iq) +{ + int rc = TIR_SUCCESS; + struct adapter *sc = event_iq->tsi_adapter; + + const uint_t desc_limit = event_iq->tsi_qsize / 8; + struct sge_iq_totals totals = { 0 }; + uint_t cidx_incr = 0; + struct rsp_ctrl ctrl; + list_t iql_fwd; + + ASSERT3S(event_iq->tsi_iqtype, ==, TIQT_EVENT); + ASSERT3P(event_iq->tsi_intr_evtq, ==, NULL); + + IQ_LOCK(event_iq); + if ((event_iq->tsi_flags & IQ_ENABLED) == 0) { + IQ_UNLOCK(event_iq); + return (TIR_DISABLED); + } - *mblk_tail = m; - mblk_tail = &m->b_next; + list_create(&iql_fwd, sizeof (t4_sge_iq_t), + offsetof(t4_sge_iq_t, tsi_intr_fwd_node)); - break; - } + while (t4_get_new_rsp(event_iq, &ctrl)) { + const uint8_t rsp_type = G_RSPD_TYPE(ctrl.u.type_gen); + const bool overflowed = (ctrl.u.type_gen & F_RSPD_QOVFL) != 0; - m = get_fl_payload(sc, fl, data_len, &fl_bufs_used); - if (m == NULL) - goto done; - /* FALLTHROUGH */ + if (overflowed) { + event_iq->tsi_stats.sis_overflow++; + } + + const struct rss_header *rss = + (const struct rss_header *)event_iq->tsi_cdesc; + + DTRACE_PROBE3(t4__event__iq__entry, t4_sge_iq_t *, event_iq, + struct rsp_ctrl *, &ctrl, struct rss_header *, rss); + ASSERT((rsp_type & (X_RSPD_TYPE_CPL | X_RSPD_TYPE_INTR)) != 0); + switch (rsp_type) { case X_RSPD_TYPE_CPL: - (void) t4_handle_cpl_msg(iq, rss, m); + totals.sit_cpl++; + (void) t4_handle_cpl_msg(event_iq, rss, NULL); + break; + + case X_RSPD_TYPE_INTR: + totals.sit_intr++; + const uint32_t tgt_qid = BE_32(ctrl.pldbuflen_qid); + + t4_sge_iq_t *tgt_iq = *t4_iqmap_slot(sc, tgt_qid); + /* + * Make sure the forwarded interrupt was sent to the + * expected event queue. 
+ */ + ASSERT3P(tgt_iq->tsi_intr_evtq, ==, event_iq); + + if (!list_link_active(&tgt_iq->tsi_intr_fwd_node)) { + list_insert_tail(&iql_fwd, tgt_iq); + } break; default: + cxgb_printf(sc->dip, CE_WARN, "unexpected IQ entry " + "type %d on IQ %u of type %d", rsp_type, + event_iq->tsi_cntxt_id, event_iq->tsi_iqtype); break; } - iq_next(iq); - ++ndescs; - if (!iq->polling && (ndescs == budget)) + + t4_iq_next_entry(event_iq); + cidx_incr++; + totals.sit_desc++; + event_iq->tsi_stats.sis_processed++; + + if (cidx_incr == desc_limit) { + rc = TIR_BUDGET_MAX; break; + } } -done: + /* + * At this point we may have collected a number of interrupt forwarding + * entries for Rx IQs, indicating that they have outstanding data ready + * for consumption. We process those now while still in interrupt + * context. We remain holding the event IQ's mutex while doing this + * work. No additional interrupts should be generated for this event IQ + * until after we have finished processing and re-armed the interrupt + * via t4_iq_gts_update(). + * + * There is a finite budget for processing each rx queue, and not all + * data is guaranteed to be processed as part of this interrupt. Each rx + * queue should re-arm its interrupt to trigger a fresh interrupt later + * if polling mode has not been enabled. + */ + t4_sge_iq_t *rx_iq = NULL; + while ((rx_iq = list_remove_head(&iql_fwd)) != NULL) { + (void) t4_process_rx_iq(rx_iq, rx_iq->tsi_qsize / 8, NULL); + } - t4_iq_gts_incr(iq, ndescs); + /* + * Send an update to the device about the event queue's new cidx and + * re-arm its interrupt. 
+ */ + ASSERT3U(cidx_incr, >, 0); + t4_iq_gts_update(event_iq, event_iq->tsi_gts_rearm, cidx_incr); + IQ_UNLOCK(event_iq); - if ((fl_bufs_used > 0) || (iq->flags & IQ_HAS_FL)) { - int starved; - FL_LOCK(fl); - fl->needed += fl_bufs_used; - starved = refill_fl(sc, fl, fl->cap / 8); - FL_UNLOCK(fl); - if (starved) - add_fl_to_sfl(sc, fl); - } - return (mblk_head); + DTRACE_PROBE3(t4__event__iq__processed, t4_sge_iq_t *, event_iq, + struct sge_iq_totals *, &totals, t4_iq_result_t, rc); + return (rc); } /* - * Deals with anything and everything on the given ingress queue. + * Process entries on an Rx Ingress Queue. When called from interrupt context + * 'desc_budget' should be non-zero and 'tpr' should be NULL. When called from + * polling context 'desc_budget' should be zero and 'tpr' should be non-NULL. */ -static int -service_iq(struct sge_iq *iq, int budget) +t4_iq_result_t +t4_process_rx_iq(t4_sge_iq_t *rx_iq, uint_t desc_budget, + struct t4_poll_req *tpr) { - struct sge_iq *q; - struct sge_rxq *rxq = iq_to_rxq(iq); /* Use iff iq is part of rxq */ - struct sge_fl *fl = &rxq->fl; /* Use iff IQ_HAS_FL */ - struct adapter *sc = iq->adapter; - struct rsp_ctrl *ctrl; - int ndescs = 0, fl_bufs_used = 0; - int starved; - STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql); - - const uint_t limit = (budget != 0) ? budget : iq->qsize / 8; - + struct adapter *sc = rx_iq->tsi_adapter; + struct sge_fl *fl = rx_iq->tsi_fl; + struct sge_rxq *rxq = t4_iq_to_rxq(rx_iq); + const uint_t byte_limit = (tpr != NULL) ? tpr->tpr_byte_budget : 0; + mblk_t *mp_head = NULL, **mp_tail = &mp_head; + struct sge_iq_totals totals = { 0 }; + uint_t cidx_incr = 0; + struct rsp_ctrl ctrl; + t4_iq_result_t rc = TIR_SUCCESS; + + ASSERT3S(rx_iq->tsi_iqtype, ==, TIQT_ETH_RX); + ASSERT3P(rx_iq->tsi_intr_evtq, !=, NULL); + ASSERT3P(rxq, !=, NULL); + /* Rx queues require an FL. 
*/ + ASSERT3P(fl, !=, NULL); /* - * We always come back and check the descriptor ring for new indirect - * interrupts and other responses after running a single handler. + * The desc_budget is used only when processing in interrupt context. + * The tpr is used only when processing in polling context. */ - for (;;) { - while (is_new_response(iq, &ctrl)) { - membar_consumer(); - - const uint8_t type_gen = ctrl->u.type_gen; - const uint8_t rsp_type = G_RSPD_TYPE(type_gen); - const uint32_t dlen_qid = BE_32(ctrl->pldbuflen_qid); - - mblk_t *m = NULL; - const struct rss_header *rss = - (const struct rss_header *)iq->cdesc; - - switch (rsp_type) { - case X_RSPD_TYPE_FLBUF: - - ASSERT(iq->flags & IQ_HAS_FL); - - m = get_fl_payload(sc, fl, dlen_qid, - &fl_bufs_used); - if (m == NULL) { - /* - * Rearm the iq with a - * longer-than-default timer - */ - t4_iq_gts_update(iq, TIC_TIMER5, - ndescs); - if (fl_bufs_used > 0) { - ASSERT(iq->flags & IQ_HAS_FL); - FL_LOCK(fl); - fl->needed += fl_bufs_used; - starved = refill_fl(sc, fl, - fl->cap / 8); - FL_UNLOCK(fl); - if (starved) - add_fl_to_sfl(sc, fl); - } - return (0); - } + ASSERT(desc_budget == 0 || tpr == NULL); + + IQ_LOCK(rx_iq); + const bool is_polling = (rx_iq->tsi_flags & IQ_POLLING) != 0; + if ((rx_iq->tsi_flags & IQ_ENABLED) == 0) { + IQ_UNLOCK(rx_iq); + return (TIR_DISABLED); + } else if (is_polling && tpr == NULL) { + /* + * Skip IQ processing driven from interrupt when port is + * configured for polling. + */ + IQ_UNLOCK(rx_iq); + return (TIR_POLLING); + } - /* FALLTHRU */ - case X_RSPD_TYPE_CPL: - (void) t4_handle_cpl_msg(iq, rss, m); - break; + while (t4_get_new_rsp(rx_iq, &ctrl)) { + const uint8_t rsp_type = G_RSPD_TYPE(ctrl.u.type_gen); + const bool overflowed = (ctrl.u.type_gen & F_RSPD_QOVFL) != 0; - case X_RSPD_TYPE_INTR: + if (overflowed) { + rx_iq->tsi_stats.sis_overflow++; + } - /* - * Interrupts should be forwarded only to queues - * that are not forwarding their interrupts. 
- * This means service_iq can recurse but only 1 - * level deep. - */ - ASSERT(budget == 0); - - q = *t4_iqmap_slot(sc, dlen_qid); - if (atomic_cas_uint(&q->state, IQS_IDLE, - IQS_BUSY) == IQS_IDLE) { - if (service_iq(q, q->qsize / 8) == 0) { - (void) atomic_cas_uint( - &q->state, IQS_BUSY, - IQS_IDLE); - } else { - STAILQ_INSERT_TAIL(&iql, q, - link); - } + const struct rss_header *rss = + (const struct rss_header *)rx_iq->tsi_cdesc; + + DTRACE_PROBE3(t4__rx__iq__entry, t4_sge_iq_t *, rx_iq, + struct rsp_ctrl *, &ctrl, struct rss_header *, rss); + + switch (rsp_type) { + case X_RSPD_TYPE_FLBUF: { + const uint32_t dlen_nb = BE_32(ctrl.pldbuflen_qid); + const struct cpl_rx_pkt *cpl = t4_rss_payload(rss); + + if (rss->opcode == CPL_RX_PKT) { + const uint16_t pkt_len = BE_16(cpl->len); + const uint_t new_total = + totals.sit_rx_bytes + pkt_len; + + if (byte_limit != 0 && new_total > byte_limit) { + rc = TIR_BUDGET_MAX; + goto bail; } - break; + } - default: - break; + const bool newbuf = (dlen_nb & F_RSPD_NEWBUF) != 0; + const uint32_t data_len = G_RSPD_LEN(dlen_nb); + mblk_t *mp = t4_fl_get_payload(fl, data_len, newbuf); + if (mp == NULL) { + /* Rearm IQ with longer-than-default timer */ + t4_iq_gts_update(rx_iq, TGC_TIMER5, cidx_incr); + cidx_incr = 0; + rc = TIR_ALLOC_FAIL; + goto bail; } - iq_next(iq); - if (++ndescs == limit) { - t4_iq_gts_incr(iq, ndescs); - ndescs = 0; - - if (fl_bufs_used > 0) { - ASSERT(iq->flags & IQ_HAS_FL); - FL_LOCK(fl); - fl->needed += fl_bufs_used; - (void) refill_fl(sc, fl, fl->cap / 8); - FL_UNLOCK(fl); - fl_bufs_used = 0; + /* + * Add this entry to the totals once we are past the + * possible bail-outs above. 
+ */ + totals.sit_flbuf++; + + if (rss->opcode == CPL_RX_PKT) { + mp->b_rptr += sc->sge.pktshift; + + uint16_t err_vec; + if (sc->params.tp.rx_pkt_encap) { + /* Enabled only in T6 config file */ + err_vec = G_T6_COMPR_RXERR_VEC( + ntohs(cpl->err_vec)); + } else { + err_vec = ntohs(cpl->err_vec); } - if (budget != 0) - return (EINPROGRESS); + const bool csum_ok = cpl->csum_calc && !err_vec; + + if (csum_ok && !cpl->ip_frag) { + mac_hcksum_set(mp, 0, 0, 0, 0xffff, + HCK_FULLCKSUM_OK | HCK_FULLCKSUM | + HCK_IPV4_HDRCKSUM_OK); + rxq->stats.rxcsum++; + } + + const uint16_t pkt_len = BE_16(cpl->len); + rxq->stats.rxpkts++; + rxq->stats.rxbytes += pkt_len; + totals.sit_rx_bytes += pkt_len; + + *mp_tail = mp; + mp_tail = &mp->b_next; + } else { + (void) t4_handle_cpl_msg(rx_iq, rss, mp); } + break; } - if (STAILQ_EMPTY(&iql) != 0) + default: + cxgb_printf(sc->dip, CE_WARN, "unexpected IQ entry " + "type %d on IQ %u of type %d", rsp_type, + rx_iq->tsi_cntxt_id, rx_iq->tsi_iqtype); +#ifdef DEBUG + panic("unexpected IQ entry on rx queue"); +#endif break; + } + + t4_iq_next_entry(rx_iq); + cidx_incr++; + totals.sit_desc++; + rx_iq->tsi_stats.sis_processed++; /* - * Process the head only, and send it to the back of the list if - * it's still not done. + * The desc_budget value is non-zero only when processing in + * interrupt context. In this case we honor the desc_budget. In + * polling mode we are passed a byte-based budget and disregard + * the desc_budget. */ - q = STAILQ_FIRST(&iql); - STAILQ_REMOVE_HEAD(&iql, link); - if (service_iq(q, q->qsize / 8) == 0) - (void) atomic_cas_uint(&q->state, IQS_BUSY, IQS_IDLE); - else - STAILQ_INSERT_TAIL(&iql, q, link); + if (desc_budget != 0 && cidx_incr == desc_budget) { + rc = TIR_BUDGET_MAX; + goto bail; + } } - t4_iq_gts_update(iq, iq->intr_params, ndescs); +bail: + if (tpr != NULL) { + /* + * Do not re-arm interrupts while this IQ is being polled. + * Just update the CIDX as necessary. 
+ */ + if (cidx_incr != 0) { + t4_iq_gts_incr(rx_iq, cidx_incr); + } + } else { + /* + * Just being extra sure that any future code changes keep this + * code path to interrupt processing only. + */ + ASSERT3U(desc_budget, >, 0); + ASSERT3P(tpr, ==, NULL); - if (iq->flags & IQ_HAS_FL) { - FL_LOCK(fl); - fl->needed += fl_bufs_used; - starved = refill_fl(sc, fl, fl->cap / 4); - FL_UNLOCK(fl); - if (starved != 0) - add_fl_to_sfl(sc, fl); + /* + * Make sure to re-arm the interrupt for this rx queue. + * Remember, the actual interrupt is delivered to the event + * queue (rx_iq->tsi_intr_evtq), but the generation of the + * forwarded interrupt event requires arming the interrupt on + * this rx queue. + */ + t4_iq_gts_update(rx_iq, rx_iq->tsi_gts_rearm, cidx_incr); } - return (0); + /* + * Take a snapshot of the ring generation number prior to dropping the + * IQ/RXQ lock, in case we need it to pass packets into the mac RX path. + */ + const uint64_t ring_gen_num = rxq->ring_gen_num; + IQ_UNLOCK(rx_iq); + + /* + * First we deliver the packets up to mac to give the client a chance to + * consume these mblks before the driver attempts to refill them. + */ + if (mp_head != NULL) { + if (tpr != NULL) { + tpr->tpr_mp = mp_head; + } else { + mac_rx_ring(rxq->port->mh, rxq->ring_handle, mp_head, + ring_gen_num); + } + } + + /* + * Next we refill some FL buffers. If the FL is "starving", we enqueue + * it on the starving list for further refilling on a background + * thread. 
+ */ + if (fl != NULL && t4_fl_periodic_refill(fl)) { + t4_sfl_enqueue(sc, fl); + } + DTRACE_PROBE3(t4__rx__iq__processed, t4_sge_iq_t *, rx_iq, + struct sge_iq_totals *, &totals, t4_iq_result_t, rc); + return (rc); } /* Per-packet header in a coalesced tx WR, before the SGL starts (in flits) */ -#define TXPKTS_PKT_HDR ((\ +#define TXPKTS_PKT_HDR_FLITS ((\ sizeof (struct ulp_txpkt) + \ sizeof (struct ulptx_idata) + \ - sizeof (struct cpl_tx_pkt_core)) / 8) + sizeof (struct cpl_tx_pkt_core)) / FLIT_NUM_BYTES) /* Header of a coalesced tx WR, before SGL of first packet (in flits) */ -#define TXPKTS_WR_HDR (\ - sizeof (struct fw_eth_tx_pkts_wr) / 8 + \ - TXPKTS_PKT_HDR) +#define TXPKTS_WR_HDR_FLITS (\ + sizeof (struct fw_eth_tx_pkts_wr) / FLIT_NUM_BYTES + \ + TXPKTS_PKT_HDR_FLITS) /* Header of a tx WR, before SGL of first packet (in flits) */ -#define TXPKT_WR_HDR ((\ +#define TXPKT_WR_HDR_FLITS ((\ sizeof (struct fw_eth_tx_pkt_wr) + \ - sizeof (struct cpl_tx_pkt_core)) / 8) + sizeof (struct cpl_tx_pkt_core)) / FLIT_NUM_BYTES) /* Header of a tx LSO WR, before SGL of first packet (in flits) */ -#define TXPKT_LSO_WR_HDR ((\ +#define TXPKT_LSO_WR_HDR_FLITS ((\ sizeof (struct fw_eth_tx_pkt_wr) + \ sizeof (struct cpl_tx_pkt_lso_core) + \ - sizeof (struct cpl_tx_pkt_core)) / 8) + sizeof (struct cpl_tx_pkt_core)) / FLIT_NUM_BYTES) mblk_t * t4_eth_tx(void *arg, mblk_t *frame) { - struct sge_txq *txq = (struct sge_txq *)arg; + struct sge_txq *txq = arg; struct port_info *pi = txq->port; - struct sge_eq *eq = &txq->eq; - mblk_t *next_frame; - int rc, coalescing; - struct txpkts txpkts; - struct txinfo txinfo; + t4_sge_eq_t *eq = &txq->eq; + mblk_t *next_frame = NULL; + int coalescing = 0; + struct txpkts txpkts = {}; + struct txinfo txinfo = {}; txpkts.npkt = 0; /* indicates there's nothing in txpkts */ - coalescing = 0; TXQ_LOCK(txq); - if (eq->avail < 8) - (void) t4_tx_reclaim_descs(txq, 8, NULL); - for (; frame; frame = next_frame) { + if ((eq->tse_flags & EQ_ENABLED) == 0) { 
+ /* Apply flow control until EQ is enabled. */ + TXQ_UNLOCK(txq); + return (frame); + } + + /* We always strive to send the maximum size WR. */ + if (eq->tse_avail < TX_WR_MAX_CREDITS) { + (void) t4_tx_reclaim_credits(txq, TX_WR_MAX_CREDITS, NULL); + } + for (; frame != NULL; frame = next_frame) { + int rc = 0; - if (eq->avail < 8) + if (eq->tse_avail < TX_WR_MAX_CREDITS) break; next_frame = frame->b_next; @@ -1006,7 +1148,6 @@ t4_eth_tx(void *arg, mblk_t *frame) * state in mac to continue transmissions. */ t4_write_flush_wr(txq); - break; } @@ -1020,9 +1161,7 @@ t4_eth_tx(void *arg, mblk_t *frame) if (coalescing != 0 && add_to_txpkts(txq, &txpkts, frame, &txinfo) == 0) { - /* Successfully absorbed into txpkts */ - write_ulp_cpl_sgl(pi, txq, &txpkts, &txinfo); goto doorbell; } @@ -1038,8 +1177,11 @@ t4_eth_tx(void *arg, mblk_t *frame) /* We're sending out individual frames now */ coalescing = 0; - if (eq->avail < 8) - (void) t4_tx_reclaim_descs(txq, 8, NULL); + if (eq->tse_avail < TX_WR_MAX_CREDITS) { + (void) t4_tx_reclaim_credits(txq, TX_WR_MAX_CREDITS, + NULL); + } + rc = write_txpkt_wr(pi, txq, frame, &txinfo); if (rc != 0) { @@ -1051,7 +1193,7 @@ t4_eth_tx(void *arg, mblk_t *frame) * can't send out the frame. What's worse, we have to * spend even more time freeing up everything in txinfo. 
*/ - txq->qfull++; + txq->stats.qfull++; free_txinfo_resources(txq, &txinfo); frame->b_next = next_frame; @@ -1060,104 +1202,86 @@ t4_eth_tx(void *arg, mblk_t *frame) doorbell: /* Fewer and fewer doorbells as the queue fills up */ - if (eq->pending >= (1 << (fls(eq->qsize - eq->avail) / 2))) { - txq->txbytes += txinfo.len; - txq->txpkts++; + if (eq->tse_pending >= + (1 << (fls(eq->tse_qsize - eq->tse_avail) / 2))) { + txq->stats.txbytes += txinfo.len; + txq->stats.txpkts++; t4_tx_ring_db(txq); } - (void) t4_tx_reclaim_descs(txq, 32, NULL); + (void) t4_tx_reclaim_credits(txq, 32, NULL); } if (txpkts.npkt > 0) { write_txpkts_wr(txq, &txpkts); } - if (eq->pending != 0) { + if (eq->tse_pending != 0) { t4_tx_ring_db(txq); } if (frame != NULL) { - eq->flags |= EQ_CORKED; + eq->tse_flags |= EQ_CORKED; } - (void) t4_tx_reclaim_descs(txq, eq->qsize, NULL); + (void) t4_tx_reclaim_credits(txq, eq->tse_qsize, NULL); TXQ_UNLOCK(txq); return (frame); } -static inline void -init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int8_t pktc_idx, - int qsize, uint8_t esize) -{ - ASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS); - ASSERT(pktc_idx < SGE_NCOUNTERS); /* -ve is ok, means don't use */ - - iq->flags = 0; - iq->adapter = sc; - iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx); - iq->intr_pktc_idx = -1; - if (pktc_idx >= 0) { - iq->intr_params |= TIC_SE_INTR_ARM; - iq->intr_pktc_idx = pktc_idx; - } - iq->qsize = roundup(qsize, 16); /* See FW_IQ_CMD/iqsize */ - iq->esize = max(esize, 16); /* See FW_IQ_CMD/iqesize */ -} - -static inline void -init_fl(struct sge_fl *fl, uint16_t qsize) +static int +t4_alloc_iq(struct port_info *pi, const t4_iq_params_t *tip, t4_sge_iq_t *iq, + struct sge_fl *fl) { + struct adapter *sc = pi->adapter; + int rc; - fl->qsize = qsize; - fl->allocb_fail = 0; -} - -/* - * Allocates the ring for an ingress queue and an optional freelist. If the - * freelist is specified it will be allocated and then associated with the - * ingress queue. 
- * - * Returns errno on failure. Resources allocated up to that point may still be - * allocated. Caller is responsible for cleanup in case this function fails. - * - * If the ingress queue will take interrupts directly (iq->flags & IQ_INTR) then - * the intr_idx specifies the vector, starting from 0. Otherwise it specifies - * the index of the queue to which its interrupts will be forwarded. - */ -static int -alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, - int intr_idx, int cong) -{ - int rc, i; - size_t len; - struct fw_iq_cmd c; - struct adapter *sc = iq->adapter; - uint32_t v = 0; - - len = iq->qsize * iq->esize; - rc = alloc_desc_ring(sc, len, DDI_DMA_READ, &iq->dhdl, &iq->ahdl, - &iq->ba, (caddr_t *)&iq->desc); - if (rc != 0) - return (rc); + ASSERT(tip->tip_tmr_idx >= 0 && tip->tip_tmr_idx < SGE_NTIMERS); + ASSERT(tip->tip_pktc_idx < SGE_NCOUNTERS); + ASSERT(tip->tip_cong_chan == -1 || tip->tip_cong_chan > 0); - bzero(&c, sizeof (c)); - c.op_to_vfn = cpu_to_be32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST | - F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) | - V_FW_IQ_CMD_VFN(0)); + const bool intr_fwd = (tip->tip_intr_evtq != NULL); + const uint_t intr_idx = + intr_fwd ? 
tip->tip_intr_evtq->tsi_cntxt_id : tip->tip_intr_idx; - c.alloc_to_len16 = cpu_to_be32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART | - FW_LEN16(c)); + ASSERT(intr_fwd || intr_idx < sc->intr_queue_cfg.intr_count); - /* Special handling for firmware event queue */ - if (iq == &sc->sge.fwq) - v |= F_FW_IQ_CMD_IQASYNCH; + mutex_init(&iq->tsi_lock, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(DDI_INTR_PRI(sc->intr_pri))); + iq->tsi_flags = 0; + iq->tsi_iqtype = tip->tip_iq_type; + iq->tsi_adapter = sc; + iq->tsi_gts_rearm = V_QINTR_TIMER_IDX(tip->tip_tmr_idx); + iq->tsi_intr_pktc_idx = -1; + if (tip->tip_pktc_idx >= 0) { + iq->tsi_gts_rearm |= TGC_SE_INTR_ARM; + iq->tsi_intr_pktc_idx = tip->tip_pktc_idx; + } - if (iq->flags & IQ_INTR) - ASSERT(intr_idx < sc->intr_count); - else - v |= F_FW_IQ_CMD_IQANDST; - v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx); + /* + * The tsi_qsize holds the number of total entries in the queue, but the + * device requires that this number be a multiple of 16. See the + * documentation for FW_IQ_CMD in the Firmware Interface Book. + */ + iq->tsi_qsize = P2ROUNDUP(tip->tip_qsize, 16); + /* + * The last entry is always reserved for the status page, even if status + * page updates are not being utilized. + */ + iq->tsi_cap = iq->tsi_qsize - 1; + iq->tsi_esize = tip->tip_esize; + iq->tsi_esize_bytes = t4_iq_esize_bytes[iq->tsi_esize]; + iq->tsi_intr_evtq = intr_fwd ? tip->tip_intr_evtq : NULL; + iq->tsi_intr_idx = intr_fwd ? 
INTR_FORWARDED : intr_idx; + + const size_t len = iq->tsi_qsize * iq->tsi_esize_bytes; + rc = alloc_desc_ring(sc, len, DDI_DMA_READ, &iq->tsi_desc_dhdl, + &iq->tsi_desc_ahdl, &iq->tsi_desc_ba, (caddr_t *)&iq->tsi_desc); + if (rc != 0) { + mutex_destroy(&iq->tsi_lock); + return (rc); + } + iq->tsi_flags |= IQ_ALLOC_HOST; /* * If the coalescing counter is not enabled for this IQ, use the 0 @@ -1166,53 +1290,65 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, * The selected index does not matter when the counter is not enabled * through the GTS flags. */ - const uint_t pktc_idx = (iq->intr_pktc_idx < 0) ? 0 : iq->intr_pktc_idx; + const uint_t pktc_idx = (iq->tsi_intr_pktc_idx < 0) ? 0 : + iq->tsi_intr_pktc_idx; + const bool is_fwq = (iq == &sc->sge.fwq); + + struct fw_iq_cmd iq_cmd; + bzero(&iq_cmd, sizeof (iq_cmd)); - c.type_to_iqandstindex = cpu_to_be32(v | + iq_cmd.op_to_vfn = BE_32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST | + F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) | + V_FW_IQ_CMD_VFN(0)); + + iq_cmd.alloc_to_len16 = BE_32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART | + FW_LEN16(struct fw_iq_cmd)); + + iq_cmd.type_to_iqandstindex = BE_32( + /* Special handling for firmware event queue */ + (is_fwq ? F_FW_IQ_CMD_IQASYNCH : 0) | + (intr_fwd ? F_FW_IQ_CMD_IQANDST : 0) | + V_FW_IQ_CMD_IQANDSTINDEX(intr_idx) | V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) | V_FW_IQ_CMD_VIID(pi->viid) | V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT)); - c.iqdroprss_to_iqesize = cpu_to_be16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) | - F_FW_IQ_CMD_IQGTSMODE | - V_FW_IQ_CMD_IQINTCNTTHRESH(pktc_idx) | - V_FW_IQ_CMD_IQESIZE(ilog2(iq->esize) - 4)); - c.iqsize = cpu_to_be16(iq->qsize); - c.iqaddr = cpu_to_be64(iq->ba); - if (cong >= 0) { - const uint32_t iq_type = - cong ? 
FW_IQ_IQTYPE_NIC : FW_IQ_IQTYPE_OFLD; - c.iqns_to_fl0congen = BE_32(F_FW_IQ_CMD_IQFLINTCONGEN | - V_FW_IQ_CMD_IQTYPE(iq_type)); - } + + iq_cmd.iqdroprss_to_iqesize = BE_16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) | + F_FW_IQ_CMD_IQGTSMODE | V_FW_IQ_CMD_IQINTCNTTHRESH(pktc_idx) | + V_FW_IQ_CMD_IQESIZE(iq->tsi_esize)); + + iq_cmd.iqsize = BE_16(iq->tsi_qsize); + iq_cmd.iqaddr = BE_64(iq->tsi_desc_ba); + iq_cmd.iqns_to_fl0congen = tip->tip_cong_chan == -1 ? 0 : + BE_32(F_FW_IQ_CMD_IQFLINTCONGEN); + + /* + * This setting currently only pertains to T4/T5 parts with 2 ports, and + * its only effect is to correct a bug in setting the IQPCIECH related + * to offload queues (Chelsio bug#34516). Therefore, setting it is + * irrelevant for our driver. However, we set it anyways in case a + * future part or firmware revision decides to use this information for + * other purposes relevant to the behavior of our driver. + */ + iq_cmd.iqns_to_fl0congen |= BE_32(V_FW_IQ_CMD_IQTYPE(FW_IQ_IQTYPE_NIC)); if (fl != NULL) { - mutex_init(&fl->lock, NULL, MUTEX_DRIVER, - DDI_INTR_PRI(sc->intr_pri)); - fl->flags |= FL_MTX; - - len = fl->qsize * RX_FL_ESIZE; - rc = alloc_desc_ring(sc, len, DDI_DMA_WRITE, &fl->dhdl, - &fl->ahdl, &fl->ba, (caddr_t *)&fl->desc); - if (rc != 0) - return (rc); + t4_sge_eq_t *eq = &fl->eq; - /* Allocate space for one software descriptor per buffer. 
*/ - fl->cap = (fl->qsize - sc->sge.stat_len / RX_FL_ESIZE) * 8; - fl->sdesc = kmem_zalloc(sizeof (struct fl_sdesc) * fl->cap, - KM_SLEEP); - fl->needed = fl->cap; - fl->lowat = roundup(sc->sge.fl_starve_threshold, 8); - - c.iqns_to_fl0congen |= - cpu_to_be32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) | - F_FW_IQ_CMD_FL0PACKEN | F_FW_IQ_CMD_FL0PADEN); - if (cong >= 0) { - c.iqns_to_fl0congen |= - BE_32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) | - F_FW_IQ_CMD_FL0CONGCIF | - F_FW_IQ_CMD_FL0CONGEN); + iq->tsi_fl = fl; + bzero(&fl->stats, sizeof (fl->stats)); + + fl->bufs_cap = tip->tip_fl_qsize; + eq->tse_flags = 0; + eq->tse_qsize = EQ_FLITS_TO_HC(fl->bufs_cap); + + if ((rc = t4_alloc_eq_base(pi, eq)) != 0) { + t4_free_iq(pi, iq); + return (rc); } + fl->bufs_lowat = P2ROUNDUP(sc->sge.fl_starve_threshold, 8); + /* * In T6, for egress queue type FL there is internal overhead * of 16B for header going into FLM module. Hence the maximum @@ -1226,60 +1362,82 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, X_FETCHBURSTMIN_64B_T6: X_FETCHBURSTMIN_128B; const uint_t fbmax = t4_cver_ge(sc, CHELSIO_T6) ? X_FETCHBURSTMAX_256B : X_FETCHBURSTMAX_512B; - c.fl0dcaen_to_fl0cidxfthresh = cpu_to_be16( + const uint32_t fl_cong = (tip->tip_cong_chan == -1) ? 0 : + (V_FW_IQ_CMD_FL0CNGCHMAP(tip->tip_cong_chan) | + F_FW_IQ_CMD_FL0CONGCIF | + F_FW_IQ_CMD_FL0CONGEN); + + iq_cmd.iqns_to_fl0congen |= BE_32( + V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) | + F_FW_IQ_CMD_FL0PACKEN | + F_FW_IQ_CMD_FL0PADEN | + fl_cong); + /* + * We do not set cidx flushing because we choose to have no cidx + * updates for an FL. Instead we track FL usage implicitly by + * the incoming CPL messages on the Rx IQ. 
+ */ + iq_cmd.fl0dcaen_to_fl0cidxfthresh |= BE_16( V_FW_IQ_CMD_FL0FBMIN(fbmin) | V_FW_IQ_CMD_FL0FBMAX(fbmax)); - c.fl0size = cpu_to_be16(fl->qsize); - c.fl0addr = cpu_to_be64(fl->ba); + iq_cmd.fl0size |= BE_16(eq->tse_qsize_spg); + iq_cmd.fl0addr |= BE_64(eq->tse_ring_ba); + } + if (!intr_fwd) { + iq->tsi_flags |= IQ_INTR; } - rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof (c), &c); + rc = -t4_wr_mbox(sc, sc->mbox, &iq_cmd, sizeof (iq_cmd), &iq_cmd); if (rc != 0) { cxgb_printf(sc->dip, CE_WARN, "failed to create ingress queue: %d", rc); + t4_free_iq(pi, iq); return (rc); } + iq->tsi_cntxt_id = BE_16(iq_cmd.iqid); + iq->tsi_abs_id = BE_16(iq_cmd.physiqid); + iq->tsi_flags |= IQ_ALLOC_DEV; - iq->cdesc = iq->desc; - iq->cidx = 0; - iq->gen = 1; - iq->adapter = sc; - iq->cntxt_id = be16_to_cpu(c.iqid); - iq->abs_id = be16_to_cpu(c.physiqid); - iq->flags |= IQ_ALLOCATED; - mutex_init(&iq->lock, NULL, MUTEX_DRIVER, - DDI_INTR_PRI(DDI_INTR_PRI(sc->intr_pri))); - iq->polling = 0; + iq->tsi_cdesc = iq->tsi_desc; + iq->tsi_cidx = 0; + iq->tsi_gen = F_RSPD_GEN; + iq->tsi_adapter = sc; - *t4_iqmap_slot(sc, iq->cntxt_id) = iq; + *t4_iqmap_slot(sc, iq->tsi_cntxt_id) = iq; if (fl != NULL) { - fl->cntxt_id = be16_to_cpu(c.fl0id); - fl->pidx = fl->cidx = 0; + t4_sge_eq_t *eq = &fl->eq; + + eq->tse_cntxt_id = BE_16(iq_cmd.fl0id); + + CTASSERT(offsetof(struct sge_fl, eq) == 0); + *t4_eqmap_slot(sc, eq->tse_cntxt_id) = (t4_sge_eq_t *)fl; + eq->tse_flags |= EQ_ALLOC_DEV; + eq->tse_pidx = eq->tse_cidx = 0; + t4_alloc_eq_post(pi, eq); fl->copy_threshold = rx_copy_threshold; - *t4_eqmap_slot(sc, fl->cntxt_id) = (struct sge_eq *)fl; + /* Allocate space for one software descriptor per buffer. 
*/ + const size_t sdesc_sz = fl->bufs_cap * sizeof (struct fl_sdesc); + fl->sdesc = kmem_zalloc(sdesc_sz, KM_SLEEP); + eq->tse_flags |= EQ_ALLOC_DESC; FL_LOCK(fl); - (void) refill_fl(sc, fl, fl->lowat); + (void) t4_fl_refill(fl, fl->bufs_lowat); FL_UNLOCK(fl); - - iq->flags |= IQ_HAS_FL; } - if (t4_cver_ge(sc, CHELSIO_T5) && cong >= 0) { - uint32_t param, val; - - param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | + if (t4_cver_ge(sc, CHELSIO_T5) && tip->tip_cong_chan != -1) { + const uint32_t param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) | - V_FW_PARAMS_PARAM_YZ(iq->cntxt_id); - if (cong == 0) - val = 1 << 19; - else { - val = 2 << 19; - for (i = 0; i < 4; i++) { - if (cong & (1 << i)) - val |= 1 << (i << 2); + V_FW_PARAMS_PARAM_YZ(iq->tsi_cntxt_id); + + const uint_t congmap_log = sc->params.arch.cng_ch_bits_log; + uint32_t val = + V_CONMCTXT_CNGTPMODE(X_CONMCTXT_CNGTPMODE_CHANNEL); + for (uint_t i = 0; i < 4; i++) { + if (tip->tip_cong_chan & (1 << i)) { + val |= (1 << (i << congmap_log)); } } @@ -1288,137 +1446,283 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, /* report error but carry on */ cxgb_printf(sc->dip, CE_WARN, "failed to set congestion manager context for " - "ingress queue %d: %d", iq->cntxt_id, rc); + "ingress queue %d: %d", iq->tsi_cntxt_id, rc); } } - /* Enable IQ interrupts */ - iq->state = IQS_IDLE; - t4_iq_gts_update(iq, iq->intr_params, 0); + /* Enable event (and firmware) queues IQs immediately */ + if (iq->tsi_iqtype == TIQT_EVENT) { + iq->tsi_flags |= IQ_ENABLED; + t4_iq_gts_update(iq, iq->tsi_gts_rearm, 0); + } return (0); } -static int -free_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl) +static void +t4_free_iq(struct port_info *pi, t4_sge_iq_t *iq) { - int rc; + struct adapter *sc = iq->tsi_adapter; + struct sge_fl *fl = iq->tsi_fl; + t4_sge_eq_t *eq = fl != NULL ? 
&fl->eq : NULL; - if (iq != NULL) { - struct adapter *sc = iq->adapter; - dev_info_t *dip; - - dip = pi ? pi->dip : sc->dip; - if (iq->flags & IQ_ALLOCATED) { - rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0, - FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id, - fl ? fl->cntxt_id : 0xffff, 0xffff); - if (rc != 0) { - cxgb_printf(dip, CE_WARN, - "failed to free queue %p: %d", iq, rc); - return (rc); - } - mutex_destroy(&iq->lock); - iq->flags &= ~IQ_ALLOCATED; - } + /* + * The onus is placed on the caller to ensure that no further activity + * will occur on this IQ. + */ + iq->tsi_flags &= ~IQ_ENABLED; - if (iq->desc != NULL) { - (void) free_desc_ring(&iq->dhdl, &iq->ahdl); - iq->desc = NULL; - } + if (iq->tsi_flags & IQ_ALLOC_DEV) { + /* + * Device-side resources of freelists are allocated in concert + * with the device-side resources of their associated IQ. + */ + ASSERT(fl == NULL || (eq->tse_flags & EQ_ALLOC_DEV)); - bzero(iq, sizeof (*iq)); + const uint16_t eq_cntxid = fl ? eq->tse_cntxt_id : 0xffff; + int rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0, + FW_IQ_TYPE_FL_INT_CAP, iq->tsi_cntxt_id, eq_cntxid, 0xffff); + if (rc != 0) { + cxgb_printf(sc->dip, CE_WARN, + "failed to free IQ/FL (%x/%x): %d", + iq->tsi_cntxt_id, eq_cntxid, rc); + /* attempt to complete the rest of clean-up */ + } + iq->tsi_flags &= ~IQ_ALLOC_DEV; + if (fl != NULL) { + eq->tse_flags &= ~EQ_ALLOC_DEV; + } } + if (iq->tsi_flags & IQ_ALLOC_HOST) { + (void) free_desc_ring(&iq->tsi_desc_dhdl, &iq->tsi_desc_ahdl); + iq->tsi_desc = NULL; + iq->tsi_cdesc = NULL; + iq->tsi_desc_ba = 0; + mutex_destroy(&iq->tsi_lock); + iq->tsi_flags &= ~IQ_ALLOC_HOST; + } + iq->tsi_flags &= ~IQ_INTR; + ASSERT0(iq->tsi_flags); + + iq->tsi_intr_idx = 0; + iq->tsi_intr_evtq = NULL; + iq->tsi_iqtype = TIQT_UNINIT; if (fl != NULL) { - if (fl->sdesc != NULL) { + if (eq->tse_flags & EQ_ALLOC_DESC) { FL_LOCK(fl); - free_fl_bufs(fl); + t4_fl_free_bufs(fl); FL_UNLOCK(fl); - kmem_free(fl->sdesc, sizeof (struct fl_sdesc) * - fl->cap); + 
kmem_free(fl->sdesc, fl->bufs_cap * + sizeof (struct fl_sdesc)); fl->sdesc = NULL; + + eq->tse_flags &= ~EQ_ALLOC_DESC; } + t4_free_eq(pi, eq); + iq->tsi_fl = NULL; + + ASSERT0(eq->tse_flags); + } +} + +int +t4_alloc_evt_iqs(struct adapter *sc) +{ + const t4_intr_plan_t plan = sc->intr_queue_cfg.intr_plan; + + const t4_iq_params_t fwq_iqp = { + .tip_iq_type = TIQT_EVENT, + .tip_tmr_idx = sc->sge.fwq_tmr_idx, + .tip_pktc_idx = sc->sge.fwq_pktc_idx, + .tip_qsize = FW_IQ_QSIZE, + .tip_esize = FW_IQ_ESIZE, + .tip_cong_chan = -1, + .tip_intr_evtq = NULL, + /* + * The device error-handling interrupt always occupies the 0th + * slot, which the firmware queue will share if no additional + * interrupts are available. Otherwise it uses the next slot + * after that. + */ + .tip_intr_idx = (plan == TIP_SINGLE) ? 0 : 1, + }; + const int rc = t4_alloc_iq(sc->port[0], &fwq_iqp, &sc->sge.fwq, NULL); + if (rc != 0) { + cxgb_printf(sc->dip, CE_WARN, + "failed to create firmware event queue: %d.", rc); + return (rc); + } - if (fl->desc != NULL) { - (void) free_desc_ring(&fl->dhdl, &fl->ahdl); - fl->desc = NULL; + if (plan == TIP_PER_PORT) { + const uint_t ipp = sc->intr_queue_cfg.intr_per_port; + const uint_t port_count = sc->params.nports; + + for (uint_t i = 0; i < port_count; i++) { + struct port_info *port = sc->port[i]; + + for (uint_t j = 0; j < ipp; j++) { + const t4_iq_params_t iqp = { + .tip_iq_type = TIQT_EVENT, + .tip_tmr_idx = sc->sge.fwq_tmr_idx, + .tip_pktc_idx = sc->sge.fwq_pktc_idx, + .tip_qsize = FW_IQ_QSIZE, + .tip_esize = FW_IQ_ESIZE, + .tip_cong_chan = -1, + .tip_intr_evtq = NULL, + .tip_intr_idx = 2 + (i * ipp) + j, + }; + + const int rc = t4_alloc_iq(port, &iqp, + &port->intr_iqs[j], NULL); + if (rc != 0) { + cxgb_printf(sc->dip, CE_WARN, + "failed to create interrupt event " + "queue %u for port %u: %d.", j, i, + rc); + t4_free_evt_iqs(sc); + return (rc); + } + } } + } + + return (0); +} + +void +t4_free_evt_iqs(struct adapter *sc) +{ + const uint_t port_count 
= sc->params.nports; - if (fl->flags & FL_MTX) { - mutex_destroy(&fl->lock); - fl->flags &= ~FL_MTX; + for (uint_t i = 0; i < port_count; i++) { + struct port_info *port = sc->port[i]; + + for (uint_t j = 0; j < sc->intr_queue_cfg.intr_per_port; j++) { + t4_free_iq(port, &port->intr_iqs[j]); } + } + + t4_free_iq(sc->port[0], &sc->sge.fwq); +} - bzero(fl, sizeof (struct sge_fl)); +static int +t4_alloc_rxq(struct port_info *pi, struct sge_rxq *rxq, uint_t q_idx) +{ + struct adapter *sc = pi->adapter; + + rxq->port = pi; + + t4_iq_params_t iqp = { + .tip_iq_type = TIQT_ETH_RX, + .tip_tmr_idx = pi->tmr_idx, + .tip_pktc_idx = pi->pktc_idx, + .tip_qsize = sc->props.qsize_rxq, + .tip_esize = RX_IQ_ESIZE, + .tip_fl_qsize = sc->props.qsize_rxq, + .tip_cong_chan = t4_get_tp_ch_map(sc, pi->tx_chan), + }; + t4_rxq_intr_assign(pi, q_idx, &iqp); + const int rc = t4_alloc_iq(pi, &iqp, &rxq->iq, &rxq->fl); + if (rc != 0) { + return (rc); + } + + rxq->ksp = setup_rxq_kstats(pi, rxq, q_idx); + return (0); +} + +static void +t4_free_rxq(struct port_info *pi, struct sge_rxq *rxq) +{ + if (rxq->ksp != NULL) { + kstat_delete(rxq->ksp); + rxq->ksp = NULL; } - return (0); + t4_free_iq(pi, &rxq->iq); } -int -t4_alloc_fwq(struct adapter *sc) +static int +t4_alloc_eq_base(struct port_info *pi, t4_sge_eq_t *eq) { - int rc, intr_idx; - struct sge_iq *fwq = &sc->sge.fwq; + struct adapter *sc = pi->adapter; + ASSERT0(eq->tse_flags); + mutex_init(&eq->tse_lock, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(sc->intr_pri)); + + /* + * Make sure to account for the status page which sits at the end of the + * hardware ring and may consume one or two credits. + */ + ASSERT3U(eq->tse_qsize, <=, T4_MAX_EQ_SIZE); + eq->tse_qsize_spg = eq->tse_qsize + sc->sge.eq_spg_len; - init_iq(fwq, sc, sc->sge.fwq_tmr_idx, sc->sge.fwq_pktc_idx, - FW_IQ_QSIZE, FW_IQ_ESIZE); - fwq->flags |= IQ_INTR; /* always */ - intr_idx = sc->intr_count > 1 ? 
1 : 0; - rc = alloc_iq_fl(sc->port[0], fwq, NULL, intr_idx, -1); + /* + * We are allocating the "hardware" ring to hold the host credits, make + * sure to use tse_qsize_spg to include the status page credits. + */ + const size_t len = eq->tse_qsize_spg * EQ_HC_SIZE; + int rc = alloc_desc_ring(sc, len, DDI_DMA_WRITE, &eq->tse_ring_dhdl, + &eq->tse_ring_ahdl, &eq->tse_ring_ba, (caddr_t *)&eq->tse_ring); if (rc != 0) { - cxgb_printf(sc->dip, CE_WARN, - "failed to create firmware event queue: %d.", rc); + mutex_destroy(&eq->tse_lock); return (rc); } + eq->tse_flags |= EQ_ALLOC_HOST; + + /* + * We always use one credit less than the technical capacity to avoid + * the situation where pidx == cidx which would indicate to the hardware + * that the queue is empty. + */ + eq->tse_avail = eq->tse_qsize - 1; + eq->tse_pending = 0; + eq->tse_pidx = 0; + eq->tse_cidx = 0; + eq->tse_spg = t4_eq_credit(eq, eq->tse_qsize); return (0); } -int -t4_free_fwq(struct adapter *sc) -{ - return (free_iq_fl(NULL, &sc->sge.fwq, NULL)); -} +#define UDB_DBS (DOORBELL_UDB | DOORBELL_UDBWC | DOORBELL_WCWR) -static int -alloc_rxq(struct port_info *pi, struct sge_rxq *rxq, int intr_idx, int i) +static void +t4_alloc_eq_post(struct port_info *pi, t4_sge_eq_t *eq) { - int rc; - - rxq->port = pi; - rc = alloc_iq_fl(pi, &rxq->iq, &rxq->fl, intr_idx, - t4_get_tp_ch_map(pi->adapter, pi->tx_chan)); - if (rc != 0) - return (rc); + struct adapter *sc = pi->adapter; + const boolean_t udb = (sc->doorbells & UDB_DBS) != 0; + ASSERT(eq->tse_flags & EQ_ALLOC_DEV); - rxq->ksp = setup_rxq_kstats(pi, rxq, i); + eq->tse_doorbells = sc->doorbells; + if (udb) { + uint64_t udb_offset; + uint_t udb_qid; - return (rc); + const int rc = t4_bar2_sge_qregs(sc, eq->tse_cntxt_id, + T4_BAR2_QTYPE_EGRESS, 0, &udb_offset, &udb_qid); + if (rc == 0) { + eq->tse_udb = sc->bar2_ptr + udb_offset; + eq->tse_udb_qid = udb_qid; + } else { + eq->tse_doorbells &= ~UDB_DBS; + eq->tse_udb = NULL; + eq->tse_udb_qid = 0; + } + } } static int 
-free_rxq(struct port_info *pi, struct sge_rxq *rxq) +t4_eq_alloc_eth(struct port_info *pi, t4_sge_eq_t *eq) { + struct adapter *sc = pi->adapter; int rc; - if (rxq->ksp != NULL) { - kstat_delete(rxq->ksp); - rxq->ksp = NULL; + if ((rc = t4_alloc_eq_base(pi, eq)) != 0) { + return (rc); } - rc = free_iq_fl(pi, &rxq->iq, &rxq->fl); - if (rc == 0) - bzero(&rxq->fl, sizeof (*rxq) - offsetof(struct sge_rxq, fl)); - - return (rc); -} - -static int -eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) -{ struct fw_eq_eth_cmd c = { .op_to_vfn = BE_32( V_FW_CMD_OP(FW_EQ_ETH_CMD) | @@ -1435,36 +1739,37 @@ eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) V_FW_EQ_ETH_CMD_VIID(pi->viid)), .fetchszm_to_iqid = BE_32( V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_BOTH) | - V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | + V_FW_EQ_ETH_CMD_PCIECHN(eq->tse_tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO | - V_FW_EQ_ETH_CMD_IQID(eq->iqid)), + V_FW_EQ_ETH_CMD_IQID(eq->tse_iqid)), .dcaen_to_eqsize = BE_32( V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_ETH_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) | - V_FW_EQ_ETH_CMD_EQSIZE(eq->qsize)), - .eqaddr = BE_64(eq->ba), + V_FW_EQ_ETH_CMD_EQSIZE(eq->tse_qsize_spg)), + .eqaddr = BE_64(eq->tse_ring_ba), }; /* - * The EQ is configured to send a notification for every 32 consumed - * entries (X_CIDXFLUSHTHRESH_32). In order to ensure timely - * notification of entry consumption during slow periods when that - * threshold may not be reached with regularity, two mechanisms exist: + * The T4 is configured to send a notification for every 32 consumed + * host credits (X_CIDXFLUSHTHRESH_32). During times of periodic Tx + * traffic that threshold may not be reached with regularity, leaving + * outstanding credits that cannot be reclaimed until more traffic is + * sent. This can result in a situation where the device driver is + * unable to shutdown and detach. 
To alleviate this problem two methods + * may be employed: * - * 1. The DBQ timer can be configured to fire (and send a notification) - * after a period when the EQ has gone idle. This is available on T6 - * and later adapters. + * 1. The DBQ timer can be configured to arm and deliver a notification + * after the EQ has gone idle for a period of time. This is available + * on T6 and later adapters. * - * 2. The CIDXFlushThresholdOverride flag will send a notification - * whenever a consumed entry causes CDIX==PIDX, even if the - * CIDXFlushThreshold has not been reached. + * 2. The CIDXFlushThresholdOverride flag (also documented under + * FCThreshOverride flag in the T6 Programmers Guide) will send a + * notification whenever a consumed credit causes CIDX==PIDX, even if + * the CIDXFlushThreshold has not been reached. * - * The DBQ timer is preferred, as it results in no additional - * notifications when the EQ is kept busy with small transmissions. - * Comparatively, flows of many short packets (like frequent ACKs) can - * cause the CIDXFlushThresholdOverride mechanism to induce a - * notification for every transmitted packet. + * The DBQ timer is preferred, as it results in fewer notifications when + * the EQ is kept busy with frequent single-credit transmissions. 
*/ if (sc->flags & TAF_DBQ_TIMER) { /* Configure the DBQ timer when it is available */ @@ -1476,117 +1781,78 @@ eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) c.dcaen_to_eqsize |= BE_32(F_FW_EQ_ETH_CMD_CIDXFTHRESHO); } - int rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof (c), &c); + rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof (c), &c); if (rc != 0) { cxgb_printf(pi->dip, CE_WARN, "failed to create Ethernet egress queue: %d", rc); return (rc); } - eq->flags |= EQ_ALLOCATED; - - eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(BE_32(c.eqid_pkd)); - - *t4_eqmap_slot(sc, eq->cntxt_id) = eq; - - return (rc); -} - -static int -alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) -{ - int rc; - size_t len; - - mutex_init(&eq->lock, NULL, MUTEX_DRIVER, DDI_INTR_PRI(sc->intr_pri)); - eq->flags |= EQ_MTX; - - len = eq->qsize * EQ_ESIZE; - rc = alloc_desc_ring(sc, len, DDI_DMA_WRITE, &eq->desc_dhdl, - &eq->desc_ahdl, &eq->ba, (caddr_t *)&eq->desc); - if (rc != 0) - return (rc); - - eq->cap = eq->qsize - sc->sge.stat_len / EQ_ESIZE; - eq->spg = (void *)&eq->desc[eq->cap]; - eq->avail = eq->cap - 1; /* one less to avoid cidx = pidx */ - eq->pidx = eq->cidx = 0; - eq->doorbells = sc->doorbells; - - rc = eth_eq_alloc(sc, pi, eq); - if (rc != 0) { - cxgb_printf(sc->dip, CE_WARN, - "failed to allocate egress queue: %d", rc); - } - - if (eq->doorbells & (DOORBELL_UDB | DOORBELL_UDBWC | DOORBELL_WCWR)) { - uint64_t udb_offset; - uint_t udb_qid; + eq->tse_cntxt_id = G_FW_EQ_ETH_CMD_EQID(BE_32(c.eqid_pkd)); + *t4_eqmap_slot(sc, eq->tse_cntxt_id) = eq; + eq->tse_flags |= EQ_ALLOC_DEV; - rc = t4_bar2_sge_qregs(sc, eq->cntxt_id, T4_BAR2_QTYPE_EGRESS, - 0, &udb_offset, &udb_qid); - - if (rc == 0) { - eq->udb = sc->bar2_ptr + udb_offset; - eq->udb_qid = udb_qid; - } else { - eq->doorbells &= - ~(DOORBELL_UDB | DOORBELL_UDBWC | DOORBELL_WCWR); - eq->udb = NULL; - eq->udb_qid = 0; - } - } + t4_alloc_eq_post(pi, eq); - return (rc); + return (0); } -static int 
-free_eq(struct adapter *sc, struct sge_eq *eq) +static void +t4_free_eq(struct port_info *pi, t4_sge_eq_t *eq) { - int rc; + struct adapter *sc = pi->adapter; - if (eq->flags & EQ_ALLOCATED) { - rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); + if (eq->tse_flags & EQ_ALLOC_DEV) { + int rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, + eq->tse_cntxt_id); if (rc != 0) { cxgb_printf(sc->dip, CE_WARN, "failed to free egress queue: %d", rc); - return (rc); + /* + * Continue on with freeing operation, even though the + * device resource will be effectively leaked. + */ } - eq->flags &= ~EQ_ALLOCATED; + eq->tse_flags &= ~EQ_ALLOC_DEV; } - if (eq->desc != NULL) { - (void) free_desc_ring(&eq->desc_dhdl, &eq->desc_ahdl); - eq->desc = NULL; + if (eq->tse_flags & EQ_ALLOC_HOST) { + (void) free_desc_ring(&eq->tse_ring_dhdl, &eq->tse_ring_ahdl); + eq->tse_ring = NULL; + eq->tse_ring_ba = 0; + eq->tse_spg = NULL; + mutex_destroy(&eq->tse_lock); + eq->tse_flags &= ~EQ_ALLOC_HOST; } - if (eq->flags & EQ_MTX) - mutex_destroy(&eq->lock); - bzero(eq, sizeof (*eq)); - return (0); } static int -alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx) +t4_alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx) { - int rc, i; struct adapter *sc = pi->adapter; - struct sge_eq *eq = &txq->eq; + t4_sge_eq_t *eq = &txq->eq; + int rc; - rc = alloc_eq(sc, pi, eq); - if (rc != 0) + if ((rc = t4_eq_alloc_eth(pi, eq)) != 0) { return (rc); + } txq->port = pi; - txq->sdesc = kmem_zalloc(sizeof (struct tx_sdesc) * eq->cap, KM_SLEEP); + txq->sdesc = kmem_zalloc(sizeof (struct tx_sdesc) * eq->tse_qsize, + KM_SLEEP); txq->copy_threshold = tx_copy_threshold; - txq->txb_size = eq->qsize * txq->copy_threshold; + txq->txb_size = eq->tse_qsize * txq->copy_threshold; rc = alloc_tx_copybuffer(sc, txq->txb_size, &txq->txb_dhdl, &txq->txb_ahdl, &txq->txb_ba, &txq->txb_va); - if (rc == 0) + if (rc != 0) { + txq->txb_size = 0; + txq->txb_avail = 0; + return (ENOMEM); + } else { 
txq->txb_avail = txq->txb_size; - else - txq->txb_avail = txq->txb_size = 0; + eq->tse_flags |= EQ_ALLOC_DESC; + } /* * TODO: is this too low? Worst case would need around 4 times qsize @@ -1594,10 +1860,10 @@ alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx) * the SGL coming from a distinct DMA handle). Increase tx_dhdl_total * if you see too many dma_hdl_failed. */ - txq->tx_dhdl_total = eq->qsize * 2; + txq->tx_dhdl_total = eq->tse_qsize * 2; txq->tx_dhdl = kmem_zalloc(sizeof (ddi_dma_handle_t) * txq->tx_dhdl_total, KM_SLEEP); - for (i = 0; i < txq->tx_dhdl_total; i++) { + for (uint_t i = 0; i < txq->tx_dhdl_total; i++) { rc = ddi_dma_alloc_handle(sc->dip, &sc->sge.dma_attr_tx, DDI_DMA_SLEEP, 0, &txq->tx_dhdl[i]); if (rc != DDI_SUCCESS) { @@ -1611,15 +1877,13 @@ alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx) txq->ksp = setup_txq_kstats(pi, txq, idx); - return (rc); + return (0); } -static int -free_txq(struct port_info *pi, struct sge_txq *txq) +static void +t4_free_txq(struct port_info *pi, struct sge_txq *txq) { - int i; - struct adapter *sc = pi->adapter; - struct sge_eq *eq = &txq->eq; + t4_sge_eq_t *eq = &txq->eq; if (txq->ksp != NULL) { kstat_delete(txq->ksp); @@ -1632,14 +1896,13 @@ free_txq(struct port_info *pi, struct sge_txq *txq) } if (txq->sdesc != NULL) { - struct tx_sdesc *sd; ddi_dma_handle_t hdl; TXQ_LOCK(txq); - while (eq->cidx != eq->pidx) { - sd = &txq->sdesc[eq->cidx]; + while (eq->tse_cidx != eq->tse_pidx) { + struct tx_sdesc *sd = &txq->sdesc[eq->tse_cidx]; - for (i = sd->hdls_used; i; i--) { + for (uint_t i = sd->hdls_used; i != 0; i--) { hdl = txq->tx_dhdl[txq->tx_dhdl_cidx]; (void) ddi_dma_unbind_handle(hdl); if (++txq->tx_dhdl_cidx == txq->tx_dhdl_total) @@ -1650,22 +1913,23 @@ free_txq(struct port_info *pi, struct sge_txq *txq) freemsgchain(sd->mp_head); sd->mp_head = sd->mp_tail = NULL; - eq->cidx += sd->desc_used; - if (eq->cidx >= eq->cap) - eq->cidx -= eq->cap; + eq->tse_cidx += sd->credits_used; + if 
(eq->tse_cidx >= eq->tse_qsize) + eq->tse_cidx -= eq->tse_qsize; - txq->txb_avail += txq->txb_used; + txq->txb_avail += sd->txb_used; } ASSERT(txq->tx_dhdl_cidx == txq->tx_dhdl_pidx); ASSERT(txq->txb_avail == txq->txb_size); TXQ_UNLOCK(txq); - kmem_free(txq->sdesc, sizeof (struct tx_sdesc) * eq->cap); + kmem_free(txq->sdesc, sizeof (struct tx_sdesc) * eq->tse_qsize); txq->sdesc = NULL; + eq->tse_flags &= ~EQ_ALLOC_DESC; } if (txq->tx_dhdl != NULL) { - for (i = 0; i < txq->tx_dhdl_total; i++) { + for (uint_t i = 0; i < txq->tx_dhdl_total; i++) { if (txq->tx_dhdl[i] != NULL) ddi_dma_free_handle(&txq->tx_dhdl[i]); } @@ -1674,10 +1938,9 @@ free_txq(struct port_info *pi, struct sge_txq *txq) txq->tx_dhdl = NULL; } - (void) free_eq(sc, &txq->eq); + t4_free_eq(pi, &txq->eq); bzero(txq, sizeof (*txq)); - return (0); } /* @@ -1799,27 +2062,80 @@ alloc_tx_copybuffer(struct adapter *sc, size_t len, acc_attr, dma_attr, dma_hdl, acc_hdl, pba, pva)); } +/* + * Fetch next valid (if any) response from adapter in IQ. Returns `true` if + * rsp_ctrl data read into `ctrl` has generation bit state matching IQ + * expectation for a new entry. + * + * This does not advance cidx, which is left to a subsequent call to + * t4_iq_next_entry(). 
+ */ static inline bool -is_new_response(const struct sge_iq *iq, struct rsp_ctrl **ctrl) +t4_get_new_rsp(const t4_sge_iq_t *iq, struct rsp_ctrl *ctrl) { - (void) ddi_dma_sync(iq->dhdl, (uintptr_t)iq->cdesc - - (uintptr_t)iq->desc, iq->esize, DDI_DMA_SYNC_FORKERNEL); + (void) ddi_dma_sync(iq->tsi_desc_dhdl, 0, 0, DDI_DMA_SYNC_FORKERNEL); - *ctrl = (void *)((uintptr_t)iq->cdesc + - (iq->esize - sizeof (struct rsp_ctrl))); - - return ((((*ctrl)->u.type_gen >> S_RSPD_GEN) == iq->gen)); + *ctrl = *(struct rsp_ctrl *) + ((caddr_t)iq->tsi_cdesc + (iq->tsi_esize_bytes - + sizeof (struct rsp_ctrl))); + return ((ctrl->u.type_gen & F_RSPD_GEN) == iq->tsi_gen); } +/* + * Advance IQ consumer index, wrapping (and toggling generation bit) when the + * end of the ring is reached. + */ static inline void -iq_next(struct sge_iq *iq) +t4_iq_next_entry(t4_sge_iq_t *iq) +{ + iq->tsi_cdesc = (void *) ((caddr_t)iq->tsi_cdesc + iq->tsi_esize_bytes); + if (++iq->tsi_cidx == iq->tsi_cap) { + iq->tsi_cidx = 0; + iq->tsi_gen ^= F_RSPD_GEN; + iq->tsi_cdesc = iq->tsi_desc; + } +} + +static inline bool +t4_fl_running_low(const struct sge_fl *fl) +{ + return (fl->bufs_avail <= fl->bufs_lowat); +} + +static inline bool +t4_fl_not_running_low(const struct sge_fl *fl) +{ + return (fl->bufs_avail >= (2 * fl->bufs_lowat)); +} + +static inline uint_t +t4_fl_advance_cidx(struct sge_fl *fl) { - iq->cdesc = (void *) ((uintptr_t)iq->cdesc + iq->esize); - if (++iq->cidx == iq->qsize - 1) { - iq->cidx = 0; - iq->gen ^= 1; - iq->cdesc = iq->desc; + t4_sge_eq_t *eq = &fl->eq; + + FL_LOCK_ASSERT_OWNED(fl); + ASSERT3U(fl->cidx_sdesc, <, FL_BUF_PTR_PER_HC); + ASSERT3U(eq->tse_cidx, <, eq->tse_qsize); + + fl->cidx_sdesc++; + if (fl->cidx_sdesc == FL_BUF_PTR_PER_HC) { + fl->cidx_sdesc = 0; + eq->tse_cidx++; + if (eq->tse_cidx == eq->tse_qsize) { + eq->tse_cidx = 0; + } + return (1); } + return (0); +} + +static inline struct fl_sdesc * +t4_fl_sdesc(struct sge_fl *fl, uint_t eq_idx, uint_t sdesc_idx) +{ + 
ASSERT(sdesc_idx < FL_BUF_PTR_PER_HC); + const uint_t idx = (eq_idx * FL_BUF_PTR_PER_HC) + sdesc_idx; + + return (&fl->sdesc[idx]); } /* @@ -1828,19 +2144,24 @@ iq_next(struct sge_iq *iq) * Returns non-zero to indicate that it should be added to the list of starving * freelists. */ -static int -refill_fl(struct adapter *sc, struct sge_fl *fl, int nbufs) +static bool +t4_fl_refill(struct sge_fl *fl, uint_t nbufs) { - uint64_t *d = &fl->desc[fl->pidx]; - struct fl_sdesc *sd = &fl->sdesc[fl->pidx]; + struct adapter *sc = t4_fl_to_iq(fl)->tsi_adapter; + t4_sge_eq_t *eq = &fl->eq; FL_LOCK_ASSERT_OWNED(fl); - ASSERT(nbufs >= 0); - if (nbufs > fl->needed) - nbufs = fl->needed; + /* + * We refill up to nbufs, but maybe less if there are not that many + * outstanding. + */ + nbufs = MIN(nbufs, fl->bufs_cap - fl->bufs_avail); + while (nbufs != 0 && eq->tse_avail != 0) { + struct fl_desc *fld = t4_eq_credit(eq, eq->tse_pidx); + struct fl_sdesc *sd = t4_fl_sdesc(fl, eq->tse_pidx, + fl->pidx_sdesc); - while (nbufs--) { if (sd->rxb != NULL) { if (sd->rxb->ref_cnt == 1) { /* @@ -1860,9 +2181,9 @@ refill_fl(struct adapter *sc, struct sge_fl *fl, int nbufs) * Either way the bus address in the descriptor * ring is already valid. */ - ASSERT(*d == cpu_to_be64(sd->rxb->ba)); - d++; - goto recycled; + ASSERT3U(fld->dptr[fl->pidx_sdesc], ==, + BE_64(sd->rxb->ba)); + fl->stats.rxb_recycle++; } else { /* * Buffer still in use and we need a @@ -1870,89 +2191,113 @@ refill_fl(struct adapter *sc, struct sge_fl *fl, int nbufs) * on the existing buffer. 
*/ rxbuf_free(sd->rxb); + sd->rxb = NULL; } } - sd->rxb = rxbuf_alloc(sc->sge.rxbuf_cache, KM_NOSLEEP, 1); - if (sd->rxb == NULL) - break; - *d++ = cpu_to_be64(sd->rxb->ba); - -recycled: fl->pending++; - sd++; - fl->needed--; - if (++fl->pidx == fl->cap) { - fl->pidx = 0; - sd = fl->sdesc; - d = fl->desc; + if (sd->rxb == NULL) { + sd->rxb = rxbuf_alloc(sc->sge.rxbuf_cache, KM_NOSLEEP); + if (sd->rxb == NULL) { + fl->stats.rxb_alloc_fail++; + break; + } + fl->stats.rxb_alloc++; + } + fld->dptr[fl->pidx_sdesc] = BE_64(sd->rxb->ba); + + nbufs--; + fl->bufs_avail++; + fl->pidx_sdesc++; + if (fl->pidx_sdesc == FL_BUF_PTR_PER_HC) { + /* + * The host credit is filled. It is now ready to be + * posted to the device. + */ + fl->pidx_sdesc = 0; + eq->tse_pending++; + eq->tse_avail--; + eq->tse_pidx++; + if (eq->tse_pidx == eq->tse_qsize) { + eq->tse_pidx = 0; + } } } - if (fl->pending >= 8) - ring_fl_db(sc, fl); + if (eq->tse_pending != 0) { + t4_fl_ring_db(fl); + } - return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING)); + return (t4_fl_running_low(fl)); } -#ifndef TAILQ_FOREACH_SAFE -#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ - for ((var) = TAILQ_FIRST((head)); \ - (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ - (var) = (tvar)) -#endif +static clock_t t4_sfl_period_us = 100000; + +static void +t4_sfl_reschedule(struct adapter *sc) +{ + ASSERT(MUTEX_HELD(&sc->sfl_lock)); + ASSERT(!list_is_empty(&sc->sfl_list)); + + sc->sfl_timer = timeout(t4_sfl_process, sc, + drv_usectohz(t4_sfl_period_us)); +} /* * Attempt to refill all starving freelists. 
*/ static void -refill_sfl(void *arg) +t4_sfl_process(void *arg) { struct adapter *sc = arg; - struct sge_fl *fl, *fl_temp; mutex_enter(&sc->sfl_lock); - TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) { + struct sge_fl *fl = list_head(&sc->sfl_list); + while (fl != NULL) { + struct sge_fl *next = list_next(&sc->sfl_list, fl); + FL_LOCK(fl); - (void) refill_fl(sc, fl, 64); - if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) { - TAILQ_REMOVE(&sc->sfl, fl, link); - fl->flags &= ~FL_STARVING; + (void) t4_fl_refill(fl, 64); + if (t4_fl_not_running_low(fl) || fl->sfl_flags & SFL_DOOMED) { + list_remove(&sc->sfl_list, fl); + fl->sfl_flags &= ~SFL_STARVING; } FL_UNLOCK(fl); + fl = next; } - if (!TAILQ_EMPTY(&sc->sfl) != 0) - sc->sfl_timer = timeout(refill_sfl, sc, drv_usectohz(100000)); + if (!list_is_empty(&sc->sfl_list)) { + t4_sfl_reschedule(sc); + } mutex_exit(&sc->sfl_lock); } static void -add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl) +t4_sfl_enqueue(struct adapter *sc, struct sge_fl *fl) { mutex_enter(&sc->sfl_lock); FL_LOCK(fl); - if ((fl->flags & FL_DOOMED) == 0) { - if (TAILQ_EMPTY(&sc->sfl) != 0) { - sc->sfl_timer = timeout(refill_sfl, sc, - drv_usectohz(100000)); + if ((fl->sfl_flags & (SFL_DOOMED | SFL_STARVING)) == 0) { + const bool was_empty = list_is_empty(&sc->sfl_list); + + fl->sfl_flags |= SFL_STARVING; + list_insert_tail(&sc->sfl_list, fl); + if (was_empty) { + t4_sfl_reschedule(sc); } - fl->flags |= FL_STARVING; - TAILQ_INSERT_TAIL(&sc->sfl, fl, link); } FL_UNLOCK(fl); mutex_exit(&sc->sfl_lock); } static void -free_fl_bufs(struct sge_fl *fl) +t4_fl_free_bufs(struct sge_fl *fl) { - struct fl_sdesc *sd; - unsigned int i; + t4_sge_eq_t *eq = &fl->eq; - FL_LOCK_ASSERT_OWNED(fl); + EQ_LOCK_ASSERT_OWNED(eq); - for (i = 0; i < fl->cap; i++) { - sd = &fl->sdesc[i]; + for (uint_t i = 0; i < eq->tse_qsize * FL_BUF_PTR_PER_HC; i++) { + struct fl_sdesc *sd = &fl->sdesc[i]; if (sd->rxb != NULL) { rxbuf_free(sd->rxb); @@ -1962,94 +2307,114 @@ 
free_fl_bufs(struct sge_fl *fl) } /* - * Note that fl->cidx and fl->offset are left unchanged in case of failure. + * Attempt to create an mblk representing the payload stored at the current + * offset (fl->offset) in the current FL buffer (fl->cidx_sdesc). If the length + * of the payload is less than fl->copy_threshold, then allocable a new + * mblk/dblk to hold the contents and copy it over. Otherwise, attempt to + * desballoc() the payload. If there is a failure to allocate, then restore the + * eq->tse_cidx and fl->offset to their original value that they had upon + * entering this function. */ static mblk_t * -get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf, - int *fl_bufs_used) +t4_fl_get_payload(struct sge_fl *fl, uint32_t len, bool newbuf) { - struct mblk_pair frame = {0}; - struct rxbuf *rxb; - mblk_t *m = NULL; - uint_t nbuf = 0, len, copy, n; - uint32_t cidx, offset, rcidx, roffset; + struct adapter *sc = t4_fl_to_iq(fl)->tsi_adapter; + t4_sge_eq_t *eq = &fl->eq; + mblk_t *mp = NULL; + mblk_t *head = NULL, **tailp = &head; + uint_t bufs_consumed = 0; + FL_LOCK(fl); /* * The SGE won't pack a new frame into the current buffer if the entire * payload doesn't fit in the remaining space. Move on to the next buf * in that case. */ - rcidx = fl->cidx; - roffset = fl->offset; - if (fl->offset > 0 && len_newbuf & F_RSPD_NEWBUF) { + const uint16_t rcidx = eq->tse_cidx; + const uint_t rcidx_sdesc = fl->cidx_sdesc; + const uint32_t roffset = fl->offset; + uint_t credits_avail = 0; + + if (fl->offset > 0 && newbuf) { + /* + * The device has moved onto the next buffer. Reset our offset + * into the current buffer and advanced the driver's cidx, which + * may have freed up an EQ host credit to be refilled by the + * driver. 
+ */ fl->offset = 0; - if (++fl->cidx == fl->cap) - fl->cidx = 0; - nbuf++; + credits_avail += t4_fl_advance_cidx(fl); + bufs_consumed++; } - cidx = fl->cidx; - offset = fl->offset; - len = G_RSPD_LEN(len_newbuf); /* pktshift + payload length */ - copy = (len <= fl->copy_threshold); - if (copy != 0) { - frame.head = m = allocb(len, BPRI_HI); - if (m == NULL) { - fl->allocb_fail++; + const bool do_copy = (len <= fl->copy_threshold); + if (do_copy) { + mp = allocb(len, 0); + if (mp == NULL) { + fl->stats.copy_fail++; DTRACE_PROBE1(t4__fl_alloc_fail, struct sge_fl *, fl); - fl->cidx = rcidx; - fl->offset = roffset; - return (NULL); + goto restore; } + *tailp = mp; + tailp = &mp->b_cont; } - while (len) { - rxb = fl->sdesc[cidx].rxb; - n = min(len, rxb->buf_size - offset); + uint_t offset = fl->offset; + while (len != 0) { + struct rxbuf *rxb = + t4_fl_sdesc(fl, eq->tse_cidx, fl->cidx_sdesc)->rxb; + const uint_t copy_len = MIN(len, rxb->buf_size - offset); - (void) ddi_dma_sync(rxb->dhdl, offset, n, - DDI_DMA_SYNC_FORKERNEL); + (void) ddi_dma_sync(rxb->dhdl, 0, 0, DDI_DMA_SYNC_FORKERNEL); - if (copy != 0) - bcopy(rxb->va + offset, m->b_wptr, n); - else { - m = desballoc((unsigned char *)rxb->va + offset, n, - BPRI_HI, &rxb->freefunc); - if (m == NULL) { - fl->allocb_fail++; + if (do_copy) { + bcopy(rxb->va + offset, mp->b_wptr, copy_len); + fl->stats.copy++; + } else { + mp = desballoc((unsigned char *)rxb->va + offset, + copy_len, 0, &rxb->freefunc); + if (mp == NULL) { + fl->stats.wrap_fail++; DTRACE_PROBE1(t4__fl_alloc_fail, struct sge_fl *, fl); - if (frame.head) - freemsgchain(frame.head); - fl->cidx = rcidx; - fl->offset = roffset; - return (NULL); + goto restore; } atomic_inc_uint(&rxb->ref_cnt); - if (frame.head != NULL) - frame.tail->b_cont = m; - else - frame.head = m; - frame.tail = m; + *tailp = mp; + tailp = &mp->b_cont; + fl->stats.wrap++; } - m->b_wptr += n; - len -= n; - offset += roundup(n, sc->sge.fl_align); - ASSERT(offset <= rxb->buf_size); + 
mp->b_wptr += copy_len; + len -= copy_len; + offset += roundup(copy_len, sc->sge.fl_align); + + ASSERT3U(offset, <=, rxb->buf_size); if (offset == rxb->buf_size) { offset = 0; - if (++cidx == fl->cap) - cidx = 0; - nbuf++; + credits_avail += t4_fl_advance_cidx(fl); + bufs_consumed++; } } - - fl->cidx = cidx; fl->offset = offset; - (*fl_bufs_used) += nbuf; + ASSERT3U(credits_avail, <=, eq->tse_qsize); + eq->tse_avail += credits_avail; + /* We can't consume more than are available. */ + ASSERT3U(bufs_consumed, <=, fl->bufs_avail); + fl->bufs_avail -= bufs_consumed; + + FL_UNLOCK(fl); + + ASSERT(head != NULL); + return (head); + +restore: + eq->tse_cidx = rcidx; + fl->cidx_sdesc = rcidx_sdesc; + fl->offset = roffset; + FL_UNLOCK(fl); + freemsgchain(head); - ASSERT(frame.head != NULL); - return (frame.head); + return (NULL); } /* @@ -2058,7 +2423,7 @@ get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf, * of immediate data. */ #define IMM_LEN ( \ - 2 * EQ_ESIZE \ + 2 * EQ_HC_SIZE \ - sizeof (struct fw_eth_tx_pkt_wr) \ - sizeof (struct cpl_tx_pkt_core)) @@ -2100,7 +2465,8 @@ get_frame_txinfo(struct sge_txq *txq, mblk_t **fp, struct txinfo *txinfo, bzero(&txinfo->meoi, sizeof (txinfo->meoi)); } -start: txinfo->nsegs = 0; +start: + txinfo->nsegs = 0; txinfo->hdls_used = 0; txinfo->txb_used = 0; txinfo->len = 0; @@ -2115,10 +2481,10 @@ start: txinfo->nsegs = 0; m = *fp; if (n >= TX_SGL_SEGS || ((flags & HW_LSO) && MBLKL(m) < 50)) { - txq->pullup_early++; + txq->stats.pullup_early++; m = msgpullup(*fp, -1); if (m == NULL) { - txq->pullup_failed++; + txq->stats.pullup_failed++; return (E2BIG); /* (*fp) left as it was */ } freemsg(*fp); @@ -2150,7 +2516,7 @@ start: txinfo->nsegs = 0; if (rc == E2BIG || (txinfo->nsegs == TX_SGL_SEGS && m->b_cont)) { - txq->pullup_late++; + txq->stats.pullup_late++; m = msgpullup(*fp, -1); if (m != NULL) { free_txinfo_resources(txq, txinfo); @@ -2160,7 +2526,7 @@ start: txinfo->nsegs = 0; goto start; } - 
txq->pullup_failed++; + txq->stats.pullup_failed++; rc = E2BIG; } @@ -2170,9 +2536,8 @@ start: txinfo->nsegs = 0; } } - ASSERT(txinfo->nsegs > 0 && txinfo->nsegs <= TX_SGL_SEGS); - done: + ASSERT(txinfo->nsegs > 0 && txinfo->nsegs <= TX_SGL_SEGS); /* * Store the # of flits required to hold this frame's SGL in nflits. An @@ -2222,7 +2587,7 @@ copy_into_txb(struct sge_txq *txq, mblk_t *m, int len, struct txinfo *txinfo) TXQ_LOCK_ASSERT_OWNED(txq); /* will manipulate txb */ if (!fits_in_txb(txq, len, &waste)) { - txq->txb_full++; + txq->stats.txb_full++; return (ENOMEM); } @@ -2293,7 +2658,7 @@ add_mblk(struct sge_txq *txq, struct txinfo *txinfo, mblk_t *m, int len) TXQ_LOCK_ASSERT_OWNED(txq); /* will manipulate dhdls */ if (txq->tx_dhdl_avail == 0) { - txq->dma_hdl_failed++; + txq->stats.dma_hdl_failed++; return (ENOMEM); } @@ -2302,7 +2667,7 @@ add_mblk(struct sge_txq *txq, struct txinfo *txinfo, mblk_t *m, int len) DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, NULL, &cookie, &ccount); if (rc != DDI_DMA_MAPPED) { - txq->dma_map_failed++; + txq->stats.dma_map_failed++; ASSERT(rc != DDI_DMA_INUSE && rc != DDI_DMA_PARTIAL_MAP); @@ -2372,7 +2737,7 @@ static int add_to_txpkts(struct sge_txq *txq, struct txpkts *txpkts, mblk_t *m, struct txinfo *txinfo) { - struct sge_eq *eq = &txq->eq; + t4_sge_eq_t *eq = &txq->eq; int can_coalesce; struct tx_sdesc *txsd; uint8_t flits; @@ -2381,10 +2746,10 @@ add_to_txpkts(struct sge_txq *txq, struct txpkts *txpkts, mblk_t *m, ASSERT(m->b_next == NULL); if (txpkts->npkt > 0) { - flits = TXPKTS_PKT_HDR + txinfo->nflits; + flits = TXPKTS_PKT_HDR_FLITS + txinfo->nflits; can_coalesce = (txinfo->flags & HW_LSO) == 0 && - txpkts->nflits + flits <= TX_WR_FLITS && - txpkts->nflits + flits <= eq->avail * 8 && + txpkts->nflits + flits <= TX_WR_MAX_FLITS && + txpkts->nflits + flits <= EQ_HC_TO_FLITS(eq->tse_avail) && txpkts->plen + txinfo->len < 65536; if (can_coalesce != 0) { @@ -2394,7 +2759,7 @@ add_to_txpkts(struct sge_txq *txq, struct 
txpkts *txpkts, mblk_t *m, txpkts->nflits += flits; txpkts->plen += txinfo->len; - txsd = &txq->sdesc[eq->pidx]; + txsd = &txq->sdesc[eq->tse_pidx]; txsd->txb_used += txinfo->txb_used; txsd->hdls_used += txinfo->hdls_used; @@ -2424,9 +2789,15 @@ add_to_txpkts(struct sge_txq *txq, struct txpkts *txpkts, mblk_t *m, ASSERT(txpkts->npkt == 0); ASSERT(txinfo->len < 65536); - flits = TXPKTS_WR_HDR + txinfo->nflits; + flits = TXPKTS_WR_HDR_FLITS + txinfo->nflits; + + /* + * We can coalesce if this is non-LSO and the number of flits required + * is both less than or equal to the maximum flits allowed for a single + * WR and less than or equal to the number of flits currently available. + */ can_coalesce = (txinfo->flags & HW_LSO) == 0 && - flits <= eq->avail * 8 && flits <= TX_WR_FLITS; + flits <= EQ_HC_TO_FLITS(eq->tse_avail) && flits <= TX_WR_MAX_FLITS; if (can_coalesce == 0) return (EINVAL); @@ -2434,13 +2805,14 @@ add_to_txpkts(struct sge_txq *txq, struct txpkts *txpkts, mblk_t *m, /* * Start a fresh coalesced tx WR with m as the first frame in it. 
*/ + t4_eq_host_credit_t *hc = t4_eq_credit(eq, eq->tse_pidx); txpkts->tail = m; txpkts->npkt = 1; txpkts->nflits = flits; - txpkts->flitp = &eq->desc[eq->pidx].flit[2]; + txpkts->flitp = &hc->flit[2]; txpkts->plen = txinfo->len; - txsd = &txq->sdesc[eq->pidx]; + txsd = &txq->sdesc[eq->tse_pidx]; txsd->mp_head = txsd->mp_tail = m; txsd->txb_used = txinfo->txb_used; txsd->hdls_used = txinfo->hdls_used; @@ -2449,63 +2821,72 @@ add_to_txpkts(struct sge_txq *txq, struct txpkts *txpkts, mblk_t *m, } static inline void -t4_tx_incr_pending(struct sge_txq *txq, uint_t ndesc) +t4_tx_incr_pending(struct sge_txq *txq, uint16_t ncredits) { - struct sge_eq *eq = &txq->eq; + t4_sge_eq_t *eq = &txq->eq; TXQ_LOCK_ASSERT_OWNED(txq); - ASSERT3U(ndesc, !=, 0); - ASSERT3U(eq->avail, >=, ndesc); - - eq->pending += ndesc; - eq->avail -= ndesc; - eq->pidx += ndesc; - if (eq->pidx >= eq->cap) { - eq->pidx -= eq->cap; + ASSERT3U(ncredits, !=, 0); + ASSERT3U(eq->tse_avail, >=, ncredits); + + eq->tse_pending += ncredits; + eq->tse_avail -= ncredits; + eq->tse_pidx += ncredits; + if (eq->tse_pidx >= eq->tse_qsize) { + eq->tse_pidx -= eq->tse_qsize; } + + ASSERT3U(eq->tse_pidx, <, eq->tse_qsize); + ASSERT3U(eq->tse_pending, <=, eq->tse_qsize - 1); } /* - * Note that write_txpkts_wr can never run out of hardware descriptors (but - * write_txpkt_wr can). add_to_txpkts ensures that a frame is accepted for - * coalescing only if sufficient hardware descriptors are available. + * Note that write_txpkts_wr() can never run out of host credits (but + * write_txpkt_wr() can). add_to_txpkts() ensures that a frame is accepted for + * coalescing only if sufficient host credits are available. 
*/ static void write_txpkts_wr(struct sge_txq *txq, struct txpkts *txpkts) { - struct sge_eq *eq = &txq->eq; - struct fw_eth_tx_pkts_wr *wr; - struct tx_sdesc *txsd; - uint32_t ctrl; - uint16_t ndesc; + t4_sge_eq_t *eq = &txq->eq; TXQ_LOCK_ASSERT_OWNED(txq); /* pidx, avail */ - ndesc = howmany(txpkts->nflits, 8); + struct fw_eth_tx_pkts_wr *wr = t4_eq_credit(eq, eq->tse_pidx); + const uint16_t ncredits = EQ_FLITS_TO_HC(txpkts->nflits); + ASSERT3U(ncredits, <=, eq->tse_avail); - wr = (void *)&eq->desc[eq->pidx]; - wr->op_pkd = cpu_to_be32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR) | - V_FW_WR_IMMDLEN(0)); /* immdlen does not matter in this WR */ - ctrl = V_FW_WR_LEN16(howmany(txpkts->nflits, 2)); - if (eq->avail == ndesc) - ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ; - wr->equiq_to_len16 = cpu_to_be32(ctrl); - wr->plen = cpu_to_be16(txpkts->plen); + /* The immdlen value does not matter for this WR. */ + wr->op_pkd = BE_32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR) | V_FW_WR_IMMDLEN(0)); + + /* + * If all remaining credits are consumed by this WR, then request an EQ + * status update to both the EQ status page and the associated ingress + * queue entry. + * + * See §29.10 FW_ETH_TX_PKTS_WR of the T4 Firmware Interface + * Specification. + */ + const uint32_t update_bits = (eq->tse_avail == ncredits) ? 
+ (F_FW_WR_EQUEQ | F_FW_WR_EQUIQ) : 0; + wr->equiq_to_len16 = BE_32(V_FW_WR_LEN16(howmany(txpkts->nflits, 2)) | + update_bits); + wr->r3 = 0; + wr->plen = BE_16(txpkts->plen); wr->npkt = txpkts->npkt; - wr->r3 = wr->type = 0; + wr->type = 0; /* Everything else already written */ + struct tx_sdesc *txsd = &txq->sdesc[eq->tse_pidx]; + txsd->credits_used = ncredits; - txsd = &txq->sdesc[eq->pidx]; - txsd->desc_used = ndesc; - - txq->txb_used += txsd->txb_used / TXB_CHUNK; - txq->hdl_used += txsd->hdls_used; + txq->stats.txb_used += txsd->txb_used / TXB_CHUNK; + txq->stats.hdl_used += txsd->hdls_used; - t4_tx_incr_pending(txq, ndesc); + t4_tx_incr_pending(txq, ncredits); - txq->txpkts_pkts += txpkts->npkt; - txq->txpkts_wrs++; + txq->stats.txpkts_pkts += txpkts->npkt; + txq->stats.txpkts_wrs++; txpkts->npkt = 0; /* emptied */ } @@ -2621,12 +3002,11 @@ static int write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, mblk_t *m, struct txinfo *txinfo) { - struct sge_eq *eq = &txq->eq; - struct fw_eth_tx_pkt_wr *wr; + t4_sge_eq_t *eq = &txq->eq; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; /* used in many unrelated places */ uint64_t ctrl1; - int nflits, ndesc; + uint16_t nflits = 0; struct tx_sdesc *txsd; caddr_t dst; const mac_ether_offload_info_t *meoi = &txinfo->meoi; @@ -2638,28 +3018,38 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, mblk_t *m, */ ctrl = sizeof (struct cpl_tx_pkt_core); if (txinfo->flags & HW_LSO) { - nflits = TXPKT_LSO_WR_HDR; + nflits = TXPKT_LSO_WR_HDR_FLITS; ctrl += sizeof (struct cpl_tx_pkt_lso_core); } else { - nflits = TXPKT_WR_HDR; + nflits = TXPKT_WR_HDR_FLITS; } if (txinfo->nsegs > 0) nflits += txinfo->nflits; else { - nflits += howmany(txinfo->len, 8); + nflits += howmany(txinfo->len, FLIT_NUM_BYTES); ctrl += txinfo->len; } - ndesc = howmany(nflits, 8); - if (ndesc > eq->avail) + + ASSERT3U(nflits, >, 0); + + const uint16_t ncredits = EQ_FLITS_TO_HC(nflits); + if (ncredits > eq->tse_avail) return (ENOMEM); /* Firmware work 
request header */ - wr = (void *)&eq->desc[eq->pidx]; + struct fw_eth_tx_pkt_wr *wr = t4_eq_credit(eq, eq->tse_pidx); wr->op_immdlen = cpu_to_be32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_WR_IMMDLEN(ctrl)); ctrl = V_FW_WR_LEN16(howmany(nflits, 2)); - if (eq->avail == ndesc) + + /* + * If all remaining credits are consumed by this WR, then request an EQ + * status update to both the EQ status page and the associated ingress + * queue entry. + */ + if (ncredits == eq->tse_avail) ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ; + wr->equiq_to_len16 = cpu_to_be32(ctrl); wr->r3 = 0; @@ -2703,7 +3093,7 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, mblk_t *m, cpl = (void *)(lso + 1); - txq->tso_wrs++; + txq->stats.tso_wrs++; } else { cpl = (void *)(wr + 1); } @@ -2712,14 +3102,14 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, mblk_t *m, switch (csum_to_ctrl(txinfo, CHELSIO_CHIP_VERSION(pi->adapter->params.chip), &ctrl1)) { case COS_SUCCESS: - txq->txcsum++; + txq->stats.txcsum++; break; case COS_FAIL: /* * Packet will be going out with checksums which are probably * wrong but there is little we can do now. 
*/ - txq->csum_failed++; + txq->stats.csum_failed++; break; default: break; @@ -2733,21 +3123,21 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, mblk_t *m, cpl->ctrl1 = cpu_to_be64(ctrl1); /* Software descriptor */ - txsd = &txq->sdesc[eq->pidx]; + txsd = &txq->sdesc[eq->tse_pidx]; txsd->mp_head = txsd->mp_tail = m; txsd->txb_used = txinfo->txb_used; txsd->hdls_used = txinfo->hdls_used; - txsd->desc_used = ndesc; + txsd->credits_used = ncredits; - txq->txb_used += txinfo->txb_used / TXB_CHUNK; - txq->hdl_used += txinfo->hdls_used; + txq->stats.txb_used += txinfo->txb_used / TXB_CHUNK; + txq->stats.hdl_used += txinfo->hdls_used; - t4_tx_incr_pending(txq, ndesc); + t4_tx_incr_pending(txq, ncredits); /* SGL */ dst = (void *)(cpl + 1); if (txinfo->nsegs > 0) { - txq->sgl_wrs++; + txq->stats.sgl_wrs++; copy_to_txd(eq, (void *)&txinfo->sgl, &dst, txinfo->nflits * 8); /* Need to zero-pad to a 16 byte boundary if not on one */ @@ -2755,7 +3145,7 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, mblk_t *m, *(uint64_t *)dst = 0; } else { - txq->imm_wrs++; + txq->stats.imm_wrs++; #ifdef DEBUG ctrl = txinfo->len; #endif @@ -2768,17 +3158,17 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, mblk_t *m, ASSERT(ctrl == 0); } - txq->txpkt_wrs++; + txq->stats.txpkt_wrs++; return (0); } static void t4_write_flush_wr(struct sge_txq *txq) { - struct sge_eq *eq = &txq->eq; + t4_sge_eq_t *eq = &txq->eq; EQ_LOCK_ASSERT_OWNED(eq); - ASSERT(eq->avail > 0); + ASSERT3U(eq->tse_avail, >, 0); const struct fw_eq_flush_wr wr = { .opcode = FW_EQ_FLUSH_WR, @@ -2786,20 +3176,31 @@ t4_write_flush_wr(struct sge_txq *txq) V_FW_WR_LEN16(sizeof (struct fw_eq_flush_wr) / 16) | F_FW_WR_EQUEQ | F_FW_WR_EQUIQ), }; - *(struct fw_eq_flush_wr *)&eq->desc[eq->pidx] = wr; + *(struct fw_eq_flush_wr *)t4_eq_credit(eq, eq->tse_pidx) = wr; const struct tx_sdesc txsd = { .mp_head = NULL, .mp_tail = NULL, .txb_used = 0, .hdls_used = 0, - .desc_used = 1, + .credits_used = 1, }; - 
txq->sdesc[eq->pidx] = txsd; + txq->sdesc[eq->tse_pidx] = txsd; t4_tx_incr_pending(txq, 1); } +/* + * Increment the flit pointer by the given number of bytes. + */ +static inline void * +t4_incr_flit(void *flitp, size_t num_bytes) +{ + /* A flit should always start on an 8-byte boundary. */ + ASSERT0(((uintptr_t)flitp + num_bytes) & 0x7); + return ((void *)((caddr_t)(flitp) + (num_bytes))); +} + static inline void write_ulp_cpl_sgl(struct port_info *pi, struct sge_txq *txq, struct txpkts *txpkts, struct txinfo *txinfo) @@ -2807,27 +3208,25 @@ write_ulp_cpl_sgl(struct port_info *pi, struct sge_txq *txq, struct ulp_txpkt *ulpmc; struct ulptx_idata *ulpsc; struct cpl_tx_pkt_core *cpl; - uintptr_t flitp, start, end; + void *flitp = txpkts->flitp; uint64_t ctrl; caddr_t dst; + const uintptr_t end = (uintptr_t)txq->eq.tse_spg; - ASSERT(txpkts->npkt > 0); - - start = (uintptr_t)txq->eq.desc; - end = (uintptr_t)txq->eq.spg; + ASSERT3U(txpkts->npkt, >, 0); /* Checksum offload */ switch (csum_to_ctrl(txinfo, CHELSIO_CHIP_VERSION(pi->adapter->params.chip), &ctrl)) { case COS_SUCCESS: - txq->txcsum++; + txq->stats.txcsum++; break; case COS_FAIL: /* * Packet will be going out with checksums which are probably * wrong but there is little we can do now. */ - txq->csum_failed++; + txq->stats.csum_failed++; break; default: break; @@ -2840,113 +3239,131 @@ write_ulp_cpl_sgl(struct port_info *pi, struct sge_txq *txq, * bytes each), and that it can not wrap around in the middle of the * cpl_tx_pkt_core either. 
*/ - flitp = (uintptr_t)txpkts->flitp; - ASSERT((flitp & 0xf) == 0); + ASSERT0((uintptr_t)flitp & 0xf); + ASSERT3U((uintptr_t)flitp + sizeof (*ulpmc), <=, end); /* ULP master command */ - ulpmc = (void *)flitp; + ulpmc = flitp; ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); ulpmc->len = htonl(howmany(sizeof (*ulpmc) + sizeof (*ulpsc) + - sizeof (*cpl) + 8 * txinfo->nflits, 16)); + sizeof (*cpl) + FLITS_TO_BYTES(txinfo->nflits), 16)); + + flitp = t4_incr_flit(flitp, sizeof (*ulpmc)); + + /* We cannot wrap-around between the ULPTX master and subcommand. */ + ASSERT3U((uintptr_t)flitp, <, end); + ASSERT3U((uintptr_t)flitp + sizeof (*ulpsc), <=, end); /* ULP subcommand */ - ulpsc = (void *)(ulpmc + 1); + ulpsc = flitp; ulpsc->cmd_more = cpu_to_be32(V_ULPTX_CMD((u32)ULP_TX_SC_IMM) | F_ULP_TX_SC_MORE); ulpsc->len = cpu_to_be32(sizeof (struct cpl_tx_pkt_core)); - flitp += sizeof (*ulpmc) + sizeof (*ulpsc); - if (flitp == end) - flitp = start; + flitp = t4_incr_flit(flitp, sizeof (*ulpsc)); + + /* If we have reached the end, go back to the start of the ring. */ + if ((uintptr_t)flitp == end) + flitp = txq->eq.tse_ring; /* CPL_TX_PKT_XT */ - cpl = (void *)flitp; + cpl = flitp; cpl->ctrl0 = cpu_to_be32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(pi->adapter->pf)); cpl->pack = 0; cpl->len = cpu_to_be16(txinfo->len); cpl->ctrl1 = cpu_to_be64(ctrl); - flitp += sizeof (*cpl); - if (flitp == end) - flitp = start; + flitp = t4_incr_flit(flitp, sizeof (*cpl)); + + /* The CPL cannot wrap-around the end. */ + ASSERT3U((uintptr_t)flitp, <=, end); + + if ((uintptr_t)flitp == end) + flitp = txq->eq.tse_ring; /* SGL for this frame */ dst = (caddr_t)flitp; - copy_to_txd(&txq->eq, (void *)&txinfo->sgl, &dst, txinfo->nflits * 8); - flitp = (uintptr_t)dst; + copy_to_txd(&txq->eq, (void *)&txinfo->sgl, &dst, + FLITS_TO_BYTES(txinfo->nflits)); + flitp = (void *)dst; /* Zero pad and advance to a 16 byte boundary if not already at one. 
*/ - if (flitp & 0xf) { - - /* no matter what, flitp should be on an 8 byte boundary */ - ASSERT((flitp & 0x7) == 0); + if (((uintptr_t)flitp & 0xf) != 0) { + /* A flit should always be on an 8 byte boundary. */ + ASSERT(((uintptr_t)flitp & 0x7) == 0); *(uint64_t *)flitp = 0; - flitp += sizeof (uint64_t); + flitp = t4_incr_flit(flitp, FLIT_NUM_BYTES); txpkts->nflits++; } - if (flitp == end) - flitp = start; + ASSERT0((uintptr_t)flitp & 0xf); + + /* + * The SGL can wrap-around, but lets make sure we stayed within the + * ring. + */ + ASSERT3U((uintptr_t)flitp, <=, end); + + if ((uintptr_t)flitp == end) + flitp = txq->eq.tse_ring; - txpkts->flitp = (void *)flitp; + txpkts->flitp = flitp; } static inline void -copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) +copy_to_txd(t4_sge_eq_t *eq, caddr_t from, caddr_t *to, size_t len) { - if ((uintptr_t)(*to) + len <= (uintptr_t)eq->spg) { + /* + * Technically the maximum WR size is lower, but this assert is just to + * make sure nothing funky is going on with len. We subtract one from + * the qsize because you can never totally fill the queue. + */ + ASSERT3U(len, <=, FLITS_TO_BYTES(EQ_HC_TO_FLITS(eq->tse_qsize - 1))); + + if ((uintptr_t)(*to) + len <= (uintptr_t)eq->tse_spg) { bcopy(from, *to, len); (*to) += len; } else { - int portion = (uintptr_t)eq->spg - (uintptr_t)(*to); + /* + * The number of bytes left before the end of the ring (which is + * the status page). 
+ */ + size_t portion = (uintptr_t)eq->tse_spg - (uintptr_t)(*to); + ASSERT3U(portion, <, len); bcopy(from, *to, portion); from += portion; portion = len - portion; /* remaining */ - bcopy(from, (void *)eq->desc, portion); - (*to) = (caddr_t)eq->desc + portion; + bcopy(from, eq->tse_ring, portion); + (*to) = (caddr_t)eq->tse_ring + portion; } } static void t4_tx_ring_db(struct sge_txq *txq) { - struct sge_eq *eq = &txq->eq; + t4_sge_eq_t *eq = &txq->eq; struct adapter *sc = txq->port->adapter; int val, db_mode; - t4_doorbells_t db = eq->doorbells; + t4_doorbells_t db = eq->tse_doorbells; EQ_LOCK_ASSERT_OWNED(eq); - if (eq->pending > 1) + /* + * A Write-Combining Work Request implicitly uses a single credit and + * only a single credit. If we have produced more than one credit, then + * fallback to the Write-Combining UDB, then plain UDB, and finally KDB. + */ + if (eq->tse_pending > 1) db &= ~DOORBELL_WCWR; - if (eq->pending > eq->pidx) { - int offset = eq->cap - (eq->pending - eq->pidx); - - /* pidx has wrapped around since last doorbell */ - - (void) ddi_dma_sync(eq->desc_dhdl, - offset * sizeof (struct tx_desc), 0, - DDI_DMA_SYNC_FORDEV); - (void) ddi_dma_sync(eq->desc_dhdl, - 0, eq->pidx * sizeof (struct tx_desc), - DDI_DMA_SYNC_FORDEV); - } else if (eq->pending > 0) { - (void) ddi_dma_sync(eq->desc_dhdl, - (eq->pidx - eq->pending) * sizeof (struct tx_desc), - eq->pending * sizeof (struct tx_desc), - DDI_DMA_SYNC_FORDEV); - } + (void) ddi_dma_sync(eq->tse_ring_dhdl, 0, 0, DDI_DMA_SYNC_FORDEV); membar_producer(); - if (t4_cver_eq(sc, CHELSIO_T4)) - val = V_PIDX(eq->pending); - else - val = V_PIDX_T5(eq->pending); + val = V_PIDX(eq->tse_pending); db_mode = (1 << (ffs(db) - 1)); switch (db_mode) { @@ -2957,17 +3374,20 @@ t4_tx_ring_db(struct sge_txq *txq) * (udb_qid is always 0). Only queues with * doorbell segments can do WCWR. 
*/ - ASSERT(eq->udb_qid == 0 && eq->pending == 1); + ASSERT(eq->tse_udb_qid == 0 && eq->tse_pending == 1); - const uint_t desc_idx = - eq->pidx != 0 ? eq->pidx - 1 : eq->cap - 1; - uint64_t *src = (uint64_t *)&eq->desc[desc_idx]; + const uint16_t credit_idx = eq->tse_pidx != 0 ? + eq->tse_pidx - 1 : eq->tse_qsize - 1; + uint64_t *src = t4_eq_credit(eq, credit_idx); volatile uint64_t *dst = - (uint64_t *)(eq->udb + UDBS_WR_OFFSET); + (uint64_t *)(eq->tse_udb + UDBS_WR_OFFSET); - /* Copy the 8 flits of the TX descriptor to the DB */ + /* + * Copy the 8 flits of the host credit to the UDB WCWR + * space (the second 64 bytes of the 128 byte segment). + */ const uint_t flit_count = - sizeof (struct tx_desc) / sizeof (uint64_t); + sizeof (t4_eq_host_credit_t) / sizeof (uint64_t); for (uint_t i = 0; i < flit_count; i++) { /* * Perform the copy directly through the BAR @@ -2986,50 +3406,59 @@ t4_tx_ring_db(struct sge_txq *txq) case DOORBELL_UDB: case DOORBELL_UDBWC: ddi_put32(sc->bar2_hdl, - (uint32_t *)(eq->udb + UDBS_DB_OFFSET), - LE_32(V_QID(eq->udb_qid) | val)); + (uint32_t *)(eq->tse_udb + UDBS_DB_OFFSET), + LE_32(V_QID(eq->tse_udb_qid) | val)); membar_producer(); break; case DOORBELL_KDB: t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), - V_QID(eq->cntxt_id) | val); + V_QID(eq->tse_cntxt_id) | val); break; } - eq->pending = 0; + eq->tse_pending = 0; } /* - * Reclaim consumed descriptors from egress queue. This will be capped at an - * upper bound of `howmany`. The corresponding mblks will be freed inline, - * unless a non-NULL `defer_freemp` is provided, in which case the to-be-freed - * mblk chain will be provided to the caller. + * Attempt to reclaim consumed host credits from the given Tx EQ. The number of + * credits to reclaim is specified by 'howmany', but that value is clamped down + * to the number of credits available for reclaim if it is too large. 
The mblks + * associated with the reclaimed credits are freed inline unless a non-NULL + * 'defer_freemp' is provided; in that case an mblk chain is provided to the + * caller who is now responsible for freeing. * - * Returns the number of descriptors which underwent reclamation. + * Returns the number of reclaimed host credits. + * + * When debugging/analyzing this code it is important to remember that host + * credits != mblks. */ -static uint_t -t4_tx_reclaim_descs(struct sge_txq *txq, uint_t howmany, mblk_t **defer_freemp) +static uint16_t +t4_tx_reclaim_credits(struct sge_txq *txq, uint16_t howmany, + mblk_t **defer_freemp) { - struct sge_eq *eq = &txq->eq; + t4_sge_eq_t *eq = &txq->eq; EQ_LOCK_ASSERT_OWNED(eq); - const uint_t cur_cidx = BE_16(eq->spg->cidx); - const uint_t reclaim_avail = (cur_cidx >= eq->cidx) ? - (cur_cidx - eq->cidx) : (cur_cidx + eq->cap - eq->cidx); + const uint16_t cur_cidx = BE_16(eq->tse_spg->cidx); + const uint16_t reclaim_avail = (cur_cidx >= eq->tse_cidx) ? + (cur_cidx - eq->tse_cidx) : + (cur_cidx + eq->tse_qsize - eq->tse_cidx); if (reclaim_avail == 0) { return (0); } - uint_t txb_freed = 0, hdl_freed = 0, reclaimed = 0; + uint_t txb_freed = 0, hdl_freed = 0; + uint16_t reclaimed = 0; + do { - struct tx_sdesc *txsd = &txq->sdesc[eq->cidx]; - const uint_t ndesc = txsd->desc_used; + struct tx_sdesc *txsd = &txq->sdesc[eq->tse_cidx]; + const uint16_t ncredits = txsd->credits_used; /* Firmware doesn't return "partial" credits. 
*/ - ASSERT3U(reclaimed + ndesc, <=, reclaim_avail); + ASSERT3U(reclaimed + ncredits, <=, reclaim_avail); if (txsd->mp_head != NULL) { /* @@ -3060,24 +3489,24 @@ t4_tx_reclaim_descs(struct sge_txq *txq, uint_t howmany, mblk_t **defer_freemp) */ ASSERT0(txsd->txb_used); ASSERT0(txsd->hdls_used); - ASSERT3U(ndesc, ==, 1); + ASSERT3U(ncredits, ==, 1); } txb_freed += txsd->txb_used; hdl_freed += txsd->hdls_used; - reclaimed += ndesc; + reclaimed += ncredits; - eq->cidx += ndesc; - if (eq->cidx >= eq->cap) { - eq->cidx -= eq->cap; + eq->tse_cidx += ncredits; + if (eq->tse_cidx >= eq->tse_qsize) { + eq->tse_cidx -= eq->tse_qsize; } } while (reclaimed < reclaim_avail && reclaimed < howmany); - eq->avail += reclaimed; + eq->tse_avail += reclaimed; txq->txb_avail += txb_freed; txq->tx_dhdl_avail += hdl_freed; - ASSERT3U(eq->avail, <, eq->cap); + ASSERT3U(eq->tse_avail, <, eq->tse_qsize); ASSERT3U(txq->tx_dhdl_avail, <=, txq->tx_dhdl_total); for (; hdl_freed; hdl_freed--) { @@ -3090,11 +3519,11 @@ t4_tx_reclaim_descs(struct sge_txq *txq, uint_t howmany, mblk_t **defer_freemp) } static int -t4_handle_cpl_msg(struct sge_iq *iq, const struct rss_header *rss, mblk_t *mp) +t4_handle_cpl_msg(t4_sge_iq_t *iq, const struct rss_header *rss, mblk_t *mp) { const uint8_t opcode = rss->opcode; - DTRACE_PROBE4(t4__cpl_msg, struct sge_iq *, iq, uint8_t, opcode, + DTRACE_PROBE4(t4__cpl_msg, t4_sge_iq_t *, iq, uint8_t, opcode, const struct rss_header *, rss, mblk_t *, mp); switch (opcode) { @@ -3107,9 +3536,16 @@ t4_handle_cpl_msg(struct sge_iq *iq, const struct rss_header *rss, mblk_t *mp) t4_sge_egr_update(iq, rss); return (0); case CPL_RX_PKT: - return (t4_eth_rx(iq, rss, mp)); + /* + * Packet RX is expected to be handled in t4_process_rx_iq(). + * CPL messages of such a type should not make it here. 
+ */ + cxgb_printf(iq->tsi_adapter->dip, CE_WARN, + "unexpected unhandled CPL_RX_PKT msg"); + freemsg(mp); + return (0); default: - cxgb_printf(iq->adapter->dip, CE_WARN, + cxgb_printf(iq->tsi_adapter->dip, CE_WARN, "unhandled CPL opcode 0x%02x", opcode); if (mp != NULL) { freemsg(mp); @@ -3119,14 +3555,14 @@ t4_handle_cpl_msg(struct sge_iq *iq, const struct rss_header *rss, mblk_t *mp) } static int -t4_handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss) +t4_handle_fw_msg(t4_sge_iq_t *iq, const struct rss_header *rss) { const struct cpl_fw6_msg *cpl = (const void *)(rss + 1); const uint8_t msg_type = cpl->type; const struct rss_header *rss2; - struct adapter *sc = iq->adapter; + struct adapter *sc = iq->tsi_adapter; - DTRACE_PROBE3(t4__fw_msg, struct sge_iq *, iq, uint8_t, msg_type, + DTRACE_PROBE3(t4__fw_msg, t4_sge_iq_t *, iq, uint8_t, msg_type, const struct rss_header *, rss); switch (msg_type) { @@ -3142,115 +3578,34 @@ t4_handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss) } } -static int -t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, mblk_t *m) -{ - bool csum_ok; - uint16_t err_vec; - struct sge_rxq *rxq = (void *)iq; - struct mblk_pair chain = {0}; - struct adapter *sc = iq->adapter; - const struct cpl_rx_pkt *cpl = t4_rss_payload(rss); - - m->b_rptr += sc->sge.pktshift; - - /* Compressed error vector is enabled for T6 only */ - if (sc->params.tp.rx_pkt_encap) - /* It is enabled only in T6 config file */ - err_vec = G_T6_COMPR_RXERR_VEC(ntohs(cpl->err_vec)); - else - err_vec = ntohs(cpl->err_vec); - - csum_ok = cpl->csum_calc && !err_vec; - /* TODO: what about cpl->ip_frag? 
*/ - if (csum_ok && !cpl->ip_frag) { - mac_hcksum_set(m, 0, 0, 0, 0xffff, - HCK_FULLCKSUM_OK | HCK_FULLCKSUM | - HCK_IPV4_HDRCKSUM_OK); - rxq->rxcsum++; - } - - /* Add to the chain that we'll send up */ - if (chain.head != NULL) - chain.tail->b_next = m; - else - chain.head = m; - chain.tail = m; - - t4_mac_rx(rxq->port, rxq, chain.head); - - rxq->rxpkts++; - rxq->rxbytes += be16_to_cpu(cpl->len); - return (0); -} - -#define FL_HW_IDX(idx) ((idx) >> 3) - -static inline void -ring_fl_db(struct adapter *sc, struct sge_fl *fl) +static void +t4_fl_ring_db(struct sge_fl *fl) { - int desc_start, desc_last, ndesc; - uint32_t v = sc->params.arch.sge_fl_db; - - ndesc = FL_HW_IDX(fl->pending); - - /* Hold back one credit if pidx = cidx */ - if (FL_HW_IDX(fl->pidx) == FL_HW_IDX(fl->cidx)) - ndesc--; - - /* - * There are chances of ndesc modified above (to avoid pidx = cidx). - * If there is nothing to post, return. - */ - if (ndesc <= 0) - return; - - desc_last = FL_HW_IDX(fl->pidx); + struct adapter *sc = t4_fl_to_iq(fl)->tsi_adapter; + t4_sge_eq_t *eq = &fl->eq; - if (fl->pidx < fl->pending) { - /* There was a wrap */ - desc_start = FL_HW_IDX(fl->pidx + fl->cap - fl->pending); - - /* From desc_start to the end of list */ - (void) ddi_dma_sync(fl->dhdl, desc_start * RX_FL_ESIZE, 0, - DDI_DMA_SYNC_FORDEV); - - /* From start of list to the desc_last */ - if (desc_last != 0) - (void) ddi_dma_sync(fl->dhdl, 0, desc_last * - RX_FL_ESIZE, DDI_DMA_SYNC_FORDEV); - } else { - /* There was no wrap, sync from start_desc to last_desc */ - desc_start = FL_HW_IDX(fl->pidx - fl->pending); - (void) ddi_dma_sync(fl->dhdl, desc_start * RX_FL_ESIZE, - ndesc * RX_FL_ESIZE, DDI_DMA_SYNC_FORDEV); - } + EQ_LOCK_ASSERT_OWNED(eq); - if (t4_cver_eq(sc, CHELSIO_T4)) - v |= V_PIDX(ndesc); - else - v |= V_PIDX_T5(ndesc); - v |= V_QID(fl->cntxt_id) | V_PIDX(ndesc); + (void) ddi_dma_sync(eq->tse_ring_dhdl, 0, 0, DDI_DMA_SYNC_FORDEV); membar_producer(); - t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), v); 
+ t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), + sc->params.arch.sge_fl_db | + V_QID(eq->tse_cntxt_id) | + V_PIDX(eq->tse_pending)); - /* - * Update pending count: - * Deduct the number of descriptors posted - */ - fl->pending -= ndesc * 8; + eq->tse_pending = 0; } static void -t4_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss) +t4_sge_egr_update(t4_sge_iq_t *iq, const struct rss_header *rss) { - struct adapter *sc = iq->adapter; + struct adapter *sc = iq->tsi_adapter; const struct cpl_sge_egr_update *cpl = t4_rss_payload(rss); const uint_t qid = G_EGR_QID(BE_32(cpl->opcode_qid)); struct sge_txq *txq = (struct sge_txq *)(*t4_eqmap_slot(sc, qid)); - struct sge_eq *eq = &txq->eq; + t4_sge_eq_t *eq = &txq->eq; /* * Get a "live" snapshot of the flags and PIDX state from the TXQ. @@ -3260,8 +3615,8 @@ t4_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss) * reclaim. */ membar_consumer(); - const uint16_t live_pidx = BE_16(eq->pidx); - const t4_eq_flags_t live_flags = eq->flags; + const uint16_t live_pidx = BE_16(eq->tse_pidx); + const t4_eq_flags_t live_flags = eq->tse_flags; if ((live_flags & EQ_CORKED) == 0 && (cpl->pidx != cpl->cidx || live_pidx != cpl->cidx)) { @@ -3272,7 +3627,7 @@ t4_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss) * allocate descriptors (or memory) while attempting to place * a packet in the TXQ. * - * 2. There are additional transmit descriptors in the EQ which + * 2. There are outstanding transmit descriptors in the EQ which * will trigger a subsequent SGE_EGR_UPDATE notification. 
* * When those conditions are met, it is safe to skip performing @@ -3288,10 +3643,10 @@ t4_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss) bool do_mac_update = false; TXQ_LOCK(txq); - (void) t4_tx_reclaim_descs(txq, eq->qsize, &freemp); - if (eq->flags & EQ_CORKED && eq->avail != 0) { + (void) t4_tx_reclaim_credits(txq, eq->tse_qsize, &freemp); + if (eq->tse_flags & EQ_CORKED && eq->tse_avail != 0) { do_mac_update = true; - eq->flags &= ~EQ_CORKED; + eq->tse_flags &= ~EQ_CORKED; } TXQ_UNLOCK(txq); @@ -3304,7 +3659,7 @@ t4_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss) #define KS_UINIT(x) kstat_named_init(&kstatp->x, #x, KSTAT_DATA_ULONG) #define KS_CINIT(x) kstat_named_init(&kstatp->x, #x, KSTAT_DATA_CHAR) #define KS_U_SET(x, y) kstatp->x.value.ul = (y) -#define KS_U_FROM(x, y) kstatp->x.value.ul = (y)->x +#define KS_U_FROM(x, y) kstatp->x.value.ul = (y)->stats.x #define KS_C_SET(x, ...) \ (void) snprintf(kstatp->x.value.c, 16, __VA_ARGS__) @@ -3313,10 +3668,10 @@ t4_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss) */ struct cxgbe_port_config_kstats { kstat_named_t idx; - kstat_named_t nrxq; - kstat_named_t ntxq; - kstat_named_t first_rxq; - kstat_named_t first_txq; + kstat_named_t rxq_count; + kstat_named_t txq_count; + kstat_named_t rxq_start; + kstat_named_t txq_start; kstat_named_t controller; kstat_named_t factory_mac_address; }; @@ -3360,18 +3715,18 @@ setup_port_config_kstats(struct port_info *pi) kstatp = (struct cxgbe_port_config_kstats *)ksp->ks_data; KS_UINIT(idx); - KS_UINIT(nrxq); - KS_UINIT(ntxq); - KS_UINIT(first_rxq); - KS_UINIT(first_txq); + KS_UINIT(rxq_count); + KS_UINIT(txq_count); + KS_UINIT(rxq_start); + KS_UINIT(txq_start); KS_CINIT(controller); KS_CINIT(factory_mac_address); KS_U_SET(idx, pi->port_id); - KS_U_SET(nrxq, pi->nrxq); - KS_U_SET(ntxq, pi->ntxq); - KS_U_SET(first_rxq, pi->first_rxq); - KS_U_SET(first_txq, pi->first_txq); + KS_U_SET(rxq_count, pi->rxq_count); + KS_U_SET(txq_count, 
pi->txq_count); + KS_U_SET(rxq_start, pi->rxq_start); + KS_U_SET(txq_start, pi->txq_start); KS_C_SET(controller, "%s%d", ddi_driver_name(pdip), ddi_get_instance(pdip)); KS_C_SET(factory_mac_address, "%02X%02X%02X%02X%02X%02X", @@ -3492,11 +3847,10 @@ struct rxq_kstats { kstat_named_t rxcsum; kstat_named_t rxpkts; kstat_named_t rxbytes; - kstat_named_t nomem; }; static kstat_t * -setup_rxq_kstats(struct port_info *pi, struct sge_rxq *rxq, int idx) +setup_rxq_kstats(struct port_info *pi, struct sge_rxq *rxq, uint_t q_idx) { struct kstat *ksp; struct rxq_kstats *kstatp; @@ -3504,14 +3858,14 @@ setup_rxq_kstats(struct port_info *pi, struct sge_rxq *rxq, int idx) char str[16]; ndata = sizeof (struct rxq_kstats) / sizeof (kstat_named_t); - (void) snprintf(str, sizeof (str), "rxq%u", idx); + (void) snprintf(str, sizeof (str), "rxq%u", q_idx); ksp = kstat_create(T4_PORT_NAME, ddi_get_instance(pi->dip), str, "rxq", KSTAT_TYPE_NAMED, ndata, 0); if (ksp == NULL) { cxgb_printf(pi->dip, CE_WARN, - "%s: failed to initialize rxq kstats for queue %d.", - __func__, idx); + "%s: failed to initialize rxq kstats for queue %u.", + __func__, q_idx); return (NULL); } @@ -3520,7 +3874,6 @@ setup_rxq_kstats(struct port_info *pi, struct sge_rxq *rxq, int idx) KS_UINIT(rxcsum); KS_UINIT(rxpkts); KS_UINIT(rxbytes); - KS_UINIT(nomem); ksp->ks_update = update_rxq_kstats; ksp->ks_private = (void *)rxq; @@ -3541,7 +3894,6 @@ update_rxq_kstats(kstat_t *ksp, int rw) KS_U_FROM(rxcsum, rxq); KS_U_FROM(rxpkts, rxq); KS_U_FROM(rxbytes, rxq); - KS_U_FROM(nomem, rxq); return (0); } @@ -3661,20 +4013,14 @@ rxbuf_cache_create(struct rxbuf_cache_params *p) rxbuf_ctor, rxbuf_dtor, NULL, p, NULL, 0); } -/* - * If ref_cnt is more than 1 then those many calls to rxbuf_free will - * have to be made before the rxb is released back to the kmem_cache. 
- */ static struct rxbuf * -rxbuf_alloc(kmem_cache_t *cache, int kmflags, uint_t ref_cnt) +rxbuf_alloc(kmem_cache_t *cache, int kmflags) { struct rxbuf *rxb; - ASSERT(ref_cnt > 0); - rxb = kmem_cache_alloc(cache, kmflags); if (rxb != NULL) { - rxb->ref_cnt = ref_cnt; + rxb->ref_cnt = 1; rxb->cache = cache; } diff --git a/usr/src/uts/intel/cxgbe/t4nex/Makefile b/usr/src/uts/intel/cxgbe/t4nex/Makefile index 5353461f36..eda61150f1 100644 --- a/usr/src/uts/intel/cxgbe/t4nex/Makefile +++ b/usr/src/uts/intel/cxgbe/t4nex/Makefile @@ -13,7 +13,7 @@ # Copyright (c) 2013 by Chelsio Communications, Inc. All rights reserved. # # Copyright (c) 2018, Joyent, Inc. -# Copyright 2023 Oxide Computer Company +# Copyright 2025 Oxide Computer Company # # This makefile drives the production of the Chelsio Terminator 4 10G Ethernet @@ -53,8 +53,11 @@ CFLAGS += -I$(UTSBASE)/common/io/cxgbe -I$(UTSBASE)/common/io/cxgbe/common \ # LDFLAGS += -N misc/mac -N drv/ip -# needs work -SMOFF += all_func_returns,snprintf_overflow +# "common" code is still not smatch clean +T4NEX_SMOFF_OBJS = t4_hw.o cudbg_lib.o cudbg_wtp.o cudbg_flash_utils.o \ + fastlz_api.o +T4NEX_SMOFF_PATHS = $(T4NEX_SMOFF_OBJS:%=$(OBJS_DIR)/%) +$(T4NEX_SMOFF_PATHS) := SMOFF += all_func_returns # # Default build targets. -- 2.51.2