/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION &
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


#pragma once

#include "GemmOptions.h"

namespace gemm { 


namespace tensorrt_llm
{
namespace kernels
{
// clang-format off

#define TLLM_GEN_COMMIT "c603ed2"
#define TLLM_GEN_EXPORT_VERSION "6.0"

static constexpr size_t tllmGenGemmListLen = 38;

#ifndef EXCLUDE_SM_100
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128_et128x128_m128x128x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin = nullptr;
inline unsigned char* Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_sm100a_cubin = nullptr;
#endif // EXCLUDE_SM_100

#ifndef EXCLUDE_SM_100
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128_et128x128_m128x128x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len = 0;
unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_sm100a_cubin_len = 0;
#endif // EXCLUDE_SM_100


static const gemm::GemmConfig tllmGenGemmList[] = {
#ifndef EXCLUDE_SM_100
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len, 97280, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a", 320, "c9f2153aefc6382d46f61d112c2308c02f7ce8bdeff4f3440b56bb698e141f8f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128_et128x128_m128x128x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128_et128x128_m128x128x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin_len, 152576, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128_et128x128_m128x128x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a", 320, "e31139cc03c9aa35e707fa086bc60721b2f84b9fa4fff6854277c3a9ff4c6e2b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len, 97280, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a", 320, "ce86289df97773dd49b191562ae4b704c704584af156a40b7ec3a34e57d97d9a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin_len, 152576, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a", 320, "4d0ea347686529deef591286cc45dc8e2e311f5faf87d91b73b7b051599e53d0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len, 115712, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a", 320, "83c73a14ed2c708fea9b00a27d86762aa3da00665c74142c1f4e36f03188b2bc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len, 152576, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a", 320, "050576e06304735082241d4608469f57ccee1dd06819d7e854f38302ac848688", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_sm100a_cubin_len, 189440, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_sm100a", 352, "90e5f629657836f17edfba0b1861bd5fa860f065e76e9d6a495f1fa903dac430", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin_len, 189440, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a", 320, "284f451d5ec08aeddbe5eeecfa0c581d6515882bc860fb2dd29053be7cd331cf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len, 115712, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a", 320, "f30d1c281ed04518498e5427ec6ff5d69eb46dc79032be2315eea860434f5ce5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len, 152576, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a", 320, "21e928126d92915655040e7f4fb6af36e9076888363c37d454b0114325f06b5c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_sm100a_cubin_len, 189440, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_sm100a", 352, "ec0e742813a3e238be2cf57340c6a2790a8a171750f867a6c5df0e2c457035e4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin_len, 189440, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_et128x128_m128x128x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a", 320, "87d15a521a86d8e2a8214157173bea9b9a8ff5ee71eb72bbc68028ecda612e86", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len, 95232, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a", 448, "996853f6a3b5d771a3033c8af13c74bfa3afaf4614e07932ffa3a154f8ae3267", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len, 136192, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a", 448, "d711ab0bf58451b4d5b2e94aadae1c1461fc17130dcd26e66781631c598a1289", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin_len, 178176, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a", 448, "886850fd2cdb5f4dff0d785c3f7485891e2fb53a7739370536b4bf19b298d45c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len, 95232, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a", 448, "0dcfb84ef7aa2bb31fde24a9e59260fdf4fab5e06d774e91a11d1e9ffab197cd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len, 136192, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a", 448, "f17c3d199b0decd880e0c7e1e56b326360402f62ceb1a80b7416a61bc5f16afd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin_len, 178176, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a", 448, "ce2ff492645d812d7bb16fdfebbb41e6434be58e04d0b6f965074c284b991479", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len, 62464, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a", 448, "9bea13f4246638c12b015e874d6e7c7019ef48a68a0b4e4e02a69f9123546d4e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin_len, 154624, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a", 448, "6d26de55476af86415b219090261e3bedb562c1ecb731f5a38ee6a43243e1213", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len, 62464, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a", 448, "f8eabe003d5659e3283c0238971333109fef55c52bb5474830e3c4087fd2219d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin_len, 154624, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a", 448, "6dce54c47c910eae481784447c3870562c6cff85f063ae6c1ec7402bfa0e166e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len, 107520, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a", 448, "e9ed6f53615603543b2026dab39fe8f2e8ec9ab16a3550f33efd89cb46be56d8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin_len, 190464, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a", 448, "a8de9f0905b381c707249062123d52c0ef3abaf9d27727e759c9d2dadd944f89", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len, 107520, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a", 448, "5b06860749eb0901bcb9dbb8101315717d99143cb3d7a357ddaaf85e26b9cfec", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a_cubin_len, 190464, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_sm100a", 448, "00429e49af761a8df4689a5c22db5569e8cbabe9ecc4f5588c3a870aeb6a4ea0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len, 49152, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a", 448, "7f0dc0f109b1a00363a39ee1da9ad6bbca5d876da42b013547856999cc26b513", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len, 49152, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a", 448, "5357cc6fada90f8f2886a31b3e9744072df8e17234e71ac5422c2f04002bfe4c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len, 88064, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a", 448, "4bc9840695052b64b2e64b9ec719aee8c7ce59cf7b3f9a8f267047249a580b52", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len, 126976, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a", 448, "ee0bb0ffad7ca20ef901e269fdf76f88223733261302ab07d8f14c84d3e2fa43", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin_len, 166912, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a", 448, "1e279e2e83663b4686c2ef69aa0db96965dff53d3d57edaec7f8b00ed7cbc0cd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a_cubin_len, 88064, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s2_TN_transOut_schedS_sm100a", 448, "51d875d7620a29c1915c82da710dfa436f2a20e55a25676a4eb7db120909b0b0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a_cubin_len, 126976, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_sm100a", 448, "8ab9fd675dd35773afb286d43fef278cc9dd67eae7f42b5ded32e27f53fccbd6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a_cubin_len, 166912, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_sm100a", 448, "488df1b9cf4911ecbbbd15d9e5619728f4ecc2b695415b6753a0054a2fa82f44", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 121856, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_sm100a", 384, "ec338f39ce74011f230100d3c74cc5a11b298cdcb7330a06e42250bcb0693641", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_sm100a_cubin_len, 138240, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_sm100a", 416, "f3fd905fb287c39d083acf7cf208c8ade64e8dae54e1a133e25913bd59130e5c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 138240, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_sm100a", 384, "5e5546c91a51d769841533ae5339845737922cb7837213fe69050adb059baa3d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
 }, gemm::SmVersion::Sm100a },
{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_sm100a_cubin_len, 113664, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_sm100a", 416, "1f5da4a1effb9237f8542a909a6c6ebfb6586d78727f24a99a76cfa71b4a1bd8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
 }, gemm::SmVersion::Sm100a },
#endif // EXCLUDE_SM_100
};
// clang-format on
} // namespace kernels
} // namespace tensorrt_llm
}
