tilelang.autotuner.param module#

The auto-tune parameters.

class tilelang.autotuner.param.AutotuneResult(latency: float, config: dict, ref_latency: float, libcode: str, func: Callable, kernel: Callable)#

Bases: object

Results from auto-tuning process.

latency#

Best achieved execution latency.

Type:

float

config#

Configuration that produced the best result.

Type:

dict

ref_latency#

Reference implementation latency.

Type:

float

libcode#

Generated library code.

Type:

str

func#

Optimized function.

Type:

Callable

kernel#

Compiled kernel function.

Type:

Callable

config: dict#
func: Callable#
kernel: Callable#
latency: float#
libcode: str#
classmethod load_from_disk(path: Path, compile_args: CompileArgs) AutotuneResult#
ref_latency: float#
save_to_disk(path: Path)#
class tilelang.autotuner.param.CompileArgs(out_idx: Union[List[int], int] = -1, execution_backend: Literal['dlpack', 'ctypes', 'cython'] = 'cython', target: Literal['auto', 'cuda', 'hip'] = 'auto', target_host: Optional[Union[str, Target]] = None, verbose: bool = False, pass_configs: Optional[Dict[str, Any]] = None)#

Bases: object

Compile arguments for the auto-tuner. Detailed description can be found in tilelang.jit.compile.

out_idx#

List of output tensor indices (default: -1).

Type:

Union[List[int], int]

execution_backend#

Execution backend to use for kernel execution (default: “cython”).

Type:

Literal[‘dlpack’, ‘ctypes’, ‘cython’]

target#

Compilation target, either as a string or a TVM Target object (default: “auto”).

Type:

Literal[‘auto’, ‘cuda’, ‘hip’]

target_host#

Target host for cross-compilation (default: None).

Type:

Union[str, tvm.target.target.Target]

verbose#

Whether to enable verbose output (default: False).

Type:

bool

pass_configs#

Additional keyword arguments to pass to the Compiler PassContext.

Type:

Optional[Dict[str, Any]]

Available options

“tir.disable_vectorize”: bool, default: False

“tl.disable_tma_lower”: bool, default: False

“tl.disable_warp_specialized”: bool, default: False

“tl.config_index_bitwidth”: int, default: None

“tl.disable_dynamic_tail_split”: bool, default: False

“tl.dynamic_vectorize_size_bits”: int, default: 128

“tl.disable_safe_memory_legalize”: bool, default: False

compile_program(program: PrimFunc)#
execution_backend: Literal['dlpack', 'ctypes', 'cython'] = 'cython'#
out_idx: Union[List[int], int] = -1#
pass_configs: Optional[Dict[str, Any]] = None#
target: Literal['auto', 'cuda', 'hip'] = 'auto'#
target_host: Optional[Union[str, Target]] = None#
verbose: bool = False#
class tilelang.autotuner.param.ProfileArgs(warmup: int = 25, rep: int = 100, timeout: int = 30, supply_type: TensorSupplyType = TensorSupplyType.Auto, ref_prog: Optional[Callable] = None, supply_prog: Optional[Callable] = None, rtol: float = 0.01, atol: float = 0.01, max_mismatched_ratio: float = 0.01, skip_check: bool = False, manual_check_prog: Optional[Callable] = None, cache_input_tensors: bool = True)#

Bases: object

Profile arguments for the auto-tuner.

warmup#

Number of warmup iterations.

Type:

int

rep#

Number of repetitions for timing.

Type:

int

timeout#

Maximum time per configuration.

Type:

int

supply_type#

Type of tensor supply mechanism.

Type:

tilelang.utils.tensor.TensorSupplyType

ref_prog#

Reference program for correctness validation.

Type:

Callable

supply_prog#

Supply program for input tensors.

Type:

Callable

rtol#

Relative tolerance for correctness validation against the reference program (default: 1e-2).

Type:

float

atol#

Absolute tolerance for correctness validation against the reference program (default: 1e-2).

Type:

float

max_mismatched_ratio#

Maximum allowed ratio of mismatched elements during validation (default: 0.01).

Type:

float

skip_check#

Whether to skip the correctness check (default: False).

Type:

bool

manual_check_prog#

Optional user-supplied program for manual correctness checking (default: None).

Type:

Callable

cache_input_tensors#

Whether to cache and reuse input tensors across trials (default: True).

Type:

bool

atol: float = 0.01#
cache_input_tensors: bool = True#
manual_check_prog: Optional[Callable] = None#
max_mismatched_ratio: float = 0.01#
ref_prog: Optional[Callable] = None#
rep: int = 100#
rtol: float = 0.01#
skip_check: bool = False#
supply_prog: Optional[Callable] = None#
supply_type: TensorSupplyType = TensorSupplyType.Auto#
timeout: int = 30#
warmup: int = 25#