viva_tensor

High-performance tensor operations for Gleam on the BEAM.

This module is the stable entry point for the package. It exposes the tensor type, common constructors, shape operations, linear algebra, element-wise math, reductions, native acceleration helpers, and layout inspection.

Lower-level implementation, backend, neural-network, quantization, sparse, telemetry, and benchmark modules are intentionally excluded from the public documentation until their contracts are stable. The related viva_tensor/layout, viva_tensor/axis, and viva_tensor/named modules are public when callers need explicit layout metadata or named dimensions.

import gleam/result
import viva_tensor as t

let a = t.zeros([2, 3])
let b = t.ones([2, 3])
use c <- result.try(t.add(a, b))
Ok(c)

Types

AcceleratedTensor

</>

Result storage selected by the RTX-first planner.

pub type AcceleratedTensor =
  @internal AcceleratedTensor

AccelerationBackend

</>

Backend selected by the RTX-first planner.

pub type AccelerationBackend =
  @internal AccelerationBackend

AccumulatorFormat

</>

Accumulator format for quantized kernels.

pub type AccumulatorFormat =
  @internal AccumulatorFormat

Activation

</>

Activation tag for the FFN sublayer (ReluAct or GeluAct).

pub type Activation =
  @internal Activation

AdaptiveAvgPool1dConfig

</>

1D adaptive average-pool configuration.

pub type AdaptiveAvgPool1dConfig =
  @internal AdaptiveAvgPool1dConfig

AdaptiveAvgPool2dConfig

</>

2D adaptive average-pool configuration.

pub type AdaptiveAvgPool2dConfig =
  @internal AdaptiveAvgPool2dConfig

Average

</>

Averaging strategy for multi-class precision / recall / F1.

pub type Average =
  @internal Average

AvgPool1dConfig

</>

1D average-pool configuration.

pub type AvgPool1dConfig =
  @internal AvgPool1dConfig

BackendCapability

</>

Runtime capability for one backend.

pub type BackendCapability {
  BackendCapability(
    backend: TensorBackend,
    available: Bool,
    device: BackendDevice,
    dtypes: List(BackendDtype),
    operations: List(BackendOperation),
    reason: String,
  )
}

Constructors

BackendCapability(
  backend: TensorBackend,
  available: Bool,
  device: BackendDevice,
  dtypes: List(BackendDtype),
  operations: List(BackendOperation),
  reason: String,
)

BackendDevice

</>

Backend device class used by runtime capability discovery.

pub type BackendDevice {
  BackendBeamCpu
  BackendNativeCpu
  BackendCuda
}

Constructors

```
BackendBeamCpu
```
```
BackendNativeCpu
```
```
BackendCuda
```

BackendDtype

</>

Element type supported by a runtime backend.

pub type BackendDtype {
  BackendFloat64
  BackendFloat32
  BackendFloat16
  BackendInt8
  BackendSparseFloat16
}

Constructors

```
BackendFloat64
```
```
BackendFloat32
```
```
BackendFloat16
```
```
BackendInt8
```
```
BackendSparseFloat16
```

BackendOperation

</>

Operation family supported by a backend capability record.

pub type BackendOperation {
  BackendElementwise
  BackendBroadcast
  BackendReduction
  BackendSoftmax
  BackendMatmul
}

Constructors

```
BackendElementwise
```
```
BackendBroadcast
```
```
BackendReduction
```
```
BackendSoftmax
```
```
BackendMatmul
```

BackendRejection

</>

Backend rejection in a tensor backend plan: the rejected backend and the reason.

pub type BackendRejection {
  BackendRejection(backend: TensorBackend, reason: String)
}

Constructors

BackendRejection(backend: TensorBackend, reason: String)

Batch

</>

Stacked batch produced by a DataLoader.

pub type Batch =
  @internal Batch

BatchNorm1d

</>

1D batch normalization layer.

pub type BatchNorm1d =
  @internal BatchNorm1d

BatchNorm2d

</>

2D batch normalization layer. Normalizes over [B, H, W] per channel C.

pub type BatchNorm2d =
  @internal BatchNorm2d

BertBlock

</>

Single BERT encoder block (LayerNorm + GELU FFN + bidirectional MHA).

pub type BertBlock =
  @internal BertBlock

BertEmbedding

</>

BERT input embedding (word + position + token_type + LayerNorm).

pub type BertEmbedding =
  @internal BertEmbedding

BertModel

</>

Full BERT model (HF: bert-base-uncased, bert-base-cased, bert-large-uncased).

pub type BertModel =
  @internal BertModel

BpeTokenizer

</>

Byte-pair encoding tokenizer driven by a pre-trained merge table.

pub type BpeTokenizer =
  @internal BpeTokenizer

CharTokenizer

</>

Character-level tokenizer over Unicode graphemes.

pub type CharTokenizer =
  @internal CharTokenizer

ColorJitterConfig

</>

Configuration for color_jitter_forward. See viva_tensor/vision/augmentations for the full docs.

pub type ColorJitterConfig =
  @internal ColorJitterConfig

Conv1dConfig

</>

Configuration for a 1D convolution layer.

pub type Conv1dConfig =
  @internal Conv1dConfig

Conv2dConfig

</>

Configuration for two-dimensional convolution operations.

pub type Conv2dConfig =
  @internal Conv2dConfig

Conv3dConfig

</>

Configuration for a 3D convolution layer.

pub type Conv3dConfig =
  @internal Conv3dConfig

ConvTranspose2dConfig

</>

Configuration for a 2D transposed convolution.

pub type ConvTranspose2dConfig =
  @internal ConvTranspose2dConfig

DataLoader

</>

Iterator-style data loader.

pub type DataLoader =
  @internal DataLoader

Dataset

</>

In-memory dataset of Samples.

pub type Dataset =
  @internal Dataset

DecoderBlock

</>

Transformer decoder block (pre-norm style, causal self-attn + cross-attn + FFN).

pub type DecoderBlock =
  @internal DecoderBlock

Dropout

</>

Inverted-dropout layer (drop probability p).

pub type Dropout =
  @internal Dropout

Embedding

</>

Learnable embedding table: integer ids -> dense vectors.

pub type Embedding =
  @internal Embedding

EncoderBlock

</>

Transformer encoder block (pre-norm style, self-attention + FFN).

pub type EncoderBlock =
  @internal EncoderBlock

FeedForward

</>

Position-wise feed-forward sublayer used by EncoderBlock / DecoderBlock.

pub type FeedForward =
  @internal FeedForward

GenerateOpts

</>

Text generation options for generate.

pub type GenerateOpts {
  GenerateOpts(
    max_new_tokens: Int,
    temperature: Float,
    top_k: GenerateTopK,
    top_p: Float,
    seed: Int,
    stop_on_eos: Bool,
  )
}

Constructors

GenerateOpts(
  max_new_tokens: Int,
  temperature: Float,
  top_k: GenerateTopK,
  top_p: Float,
  seed: Int,
  stop_on_eos: Bool,
)

GenerateTopK

</>

Top-k sampling setting for GenerateOpts.

pub type GenerateTopK {
  TopKInfinity
  TopK(Int)
}

Constructors

```
TopKInfinity
```
```
TopK(Int)
```

Generation

</>

Text generation result.

pub type Generation {
  Generation(
    tokens: List(Int),
    text: String,
    ms_per_token: Float,
    total_tokens: Int,
  )
}

Constructors

Generation(
  tokens: List(Int),
  text: String,
  ms_per_token: Float,
  total_tokens: Int,
)

GptBlock

</>

Single GPT-2/3 block (pre-norm LayerNorm + GELU FFN + causal MHA).

pub type GptBlock =
  @internal GptBlock

GptModel

</>

Full GPT model (HF: openai-community/gpt2, openai-community/gpt2-medium).

pub type GptModel =
  @internal GptModel

GpuWorkspace

</>

Workspace for persistent GPU buffers.

pub type GpuWorkspace =
  @internal GpuWorkspace

GradAggregation

</>

Strategy used to combine per-worker gradients in a synchronous step.

pub type GradAggregation =
  @internal GradAggregation

GradPair

</>

A gradient paired with the name of the parameter it belongs to.

pub type GradPair =
  @internal GradPair

GroupNorm

</>

Group normalization layer.

pub type GroupNorm =
  @internal GroupNorm

GruCell

</>

GRU cell parameters (reset/update/new gates stacked in weight rows).

pub type GruCell =
  @internal GruCell

HadamardPreprocess

</>

Reversible Hadamard preprocessing result for low-bit quantization.

pub type HadamardPreprocess =
  @internal HadamardPreprocess

HardwareFeature

</>

Hardware feature used by accelerator profile discovery.

pub type HardwareFeature =
  @internal HardwareFeature

HardwareGeneration

</>

Hardware generation used by accelerator profile discovery.

pub type HardwareGeneration =
  @internal HardwareGeneration

HardwareProfile

</>

Hardware target profile for current and future accelerator dispatch.

pub type HardwareProfile =
  @internal HardwareProfile

HfLoadError

</>

Loader-local error type. See viva_tensor/io/hf_loader.HfLoadError.

pub type HfLoadError =
  @internal HfLoadError

LayerNorm

</>

Layer normalization layer (normalizes along the last dimension).

pub type LayerNorm =
  @internal LayerNorm

LearnedPositionalEncoding

</>

Learnable positional encoding table.

pub type LearnedPositionalEncoding =
  @internal LearnedPositionalEncoding

LinearLayer

</>

Persisted linear layer parameters.

pub type LinearLayer =
  @internal LinearLayer

LlamaBlock

</>

Single block of a Llama-2 / Llama-3 decoder stack (RMSNorm + RoPE + causal MHA + SwiGLU FFN).

pub type LlamaBlock =
  @internal LlamaBlock

LlamaModel

</>

Full Llama model (HF: meta-llama/Llama-2-7b-hf, meta-llama/Meta-Llama-3-8B).

pub type LlamaModel =
  @internal LlamaModel

LstmCell

</>

LSTM cell parameters (input/forget/cell/output gates stacked).

pub type LstmCell =
  @internal LstmCell

MaxPool1dConfig

</>

1D max-pool configuration.

pub type MaxPool1dConfig =
  @internal MaxPool1dConfig

MaxUnpool2dConfig

</>

Config for max_unpool_2d_forward — inverse of max_pool_2d_with_indices.

pub type MaxUnpool2dConfig =
  @internal MaxUnpool2dConfig

ModelHandle

</>

Opaque handle for a loaded Llama-family HF model.

pub type ModelHandle

MoeBlock

</>

Sparse mixture-of-experts FFN block. See viva_tensor/nn/moe.

pub type MoeBlock =
  @internal MoeBlock

MultiHeadAttention

</>

Multi-Head Attention layer. See viva_tensor/nn/attention for details.

pub type MultiHeadAttention =
  @internal MultiHeadAttention

NativeTensorRef

</>

Opaque reference to a tensor stored in native NIF memory.

pub type NativeTensorRef =
  @internal NativeTensorRef

NoiseSchedule

</>

Noise schedule specification (LinearSchedule or CosineSchedule).

pub type NoiseSchedule =
  @internal NoiseSchedule

OnnxAttribute

</>

Re-export of viva_tensor/io/onnx.OnnxAttribute.

pub type OnnxAttribute =
  @internal OnnxAttribute

OnnxError

</>

Re-export of viva_tensor/io/onnx.OnnxError.

pub type OnnxError =
  @internal OnnxError

OnnxGraph

</>

Re-export of viva_tensor/io/onnx.OnnxGraph for the public facade.

pub type OnnxGraph =
  @internal OnnxGraph

OnnxNode

</>

Re-export of viva_tensor/io/onnx.OnnxNode.

pub type OnnxNode =
  @internal OnnxNode

Optimizer

</>

Optimizer record carrying hyperparameters and per-parameter state.

pub type Optimizer =
  @internal Optimizer

OptimizerKind

</>

Optimizer family tag.

pub type OptimizerKind =
  @internal OptimizerKind

PackedWeightFp8

</>

FP8 E4M3 prepacked weight handle.

pub type PackedWeightFp8 =
  @internal PackedWeightFp8

PackedWeightInt4Sparse

</>

INT4 2:4 structured-sparse prepacked weight handle.

pub type PackedWeightInt4Sparse =
  @internal PackedWeightInt4Sparse

PackedWeightInt8Sparse

</>

INT8 2:4 structured-sparse prepacked weight handle.

pub type PackedWeightInt8Sparse =
  @internal PackedWeightInt8Sparse

Param

</>

A named parameter tensor passed to step.

pub type Param =
  @internal Param

ParamState

</>

Per-parameter optimizer state (momentum/variance/etc.).

pub type ParamState =
  @internal ParamState

PrintOptions

</>

Options controlling tensor pretty-printing (precision, threshold, edgeitems, linewidth, scientific notation, etc.).

pub type PrintOptions =
  @internal PrintOptions

QuantFormat

</>

Quantized storage format metadata.

pub type QuantFormat =
  @internal QuantFormat

QuantLayout

</>

Quantized tensor layout metadata.

pub type QuantLayout =
  @internal QuantLayout

Reduction

</>

Reduction strategy for loss functions. See viva_tensor/nn/losses.

pub type Reduction =
  @internal Reduction

ResizeMode

</>

Resampling mode for vision_resize.

- ResizeNearest: nearest-neighbour, blocky and cheap.
- ResizeBilinear: linear interpolation along both spatial axes (align_corners=False).

pub type ResizeMode =
  @internal ResizeMode

RmsNorm

</>

Root-mean-square normalization layer.

pub type RmsNorm =
  @internal RmsNorm

RnnCell

</>

Vanilla Elman RNN cell parameters.

pub type RnnCell =
  @internal RnnCell

RoiAlignConfig

</>

Config for roi_align.

pub type RoiAlignConfig =
  @internal RoiAlignConfig

Router

</>

Routing network used by MoeBlock. See viva_tensor/nn/moe.

pub type Router =
  @internal Router

RuntimeOp

</>

Runtime operation planned against a concrete tensor spec.

pub type RuntimeOp {
  RuntimeElementwise
  RuntimeBroadcast
  RuntimeReduction
  RuntimeSoftmax
  RuntimeMatmul(m: Int, n: Int, k: Int)
  RuntimeLinear(batch: Int, in_features: Int, out_features: Int)
}

Constructors

```
RuntimeElementwise
```
```
RuntimeBroadcast
```
```
RuntimeReduction
```
```
RuntimeSoftmax
```
```
RuntimeMatmul(m: Int, n: Int, k: Int)
```

```
RuntimeLinear(batch: Int, in_features: Int, out_features: Int)
```

RuntimePlan

</>

Runtime plan cacheable by shape, dtype, device, layout, and op.

pub type RuntimePlan {
  RuntimePlan(
    spec: TensorSpec,
    operation: RuntimeOp,
    selected: TensorBackend,
    fallbacks: List(TensorBackend),
    rejected: List(RuntimeRejection),
    reason: String,
    cache_key: String,
  )
}

Constructors

RuntimePlan(
  spec: TensorSpec,
  operation: RuntimeOp,
  selected: TensorBackend,
  fallbacks: List(TensorBackend),
  rejected: List(RuntimeRejection),
  reason: String,
  cache_key: String,
)

RuntimeRejection

</>

Backend rejection in a runtime plan.

pub type RuntimeRejection {
  RuntimeRejection(backend: TensorBackend, reason: String)
}

Constructors

RuntimeRejection(backend: TensorBackend, reason: String)

Sample

</>

A labeled training example: an input tensor paired with a target tensor.

pub type Sample =
  @internal Sample

SamplerConfig

</>

Sampler configuration (schedule + DDIM eta).

pub type SamplerConfig =
  @internal SamplerConfig

SamplingConfig

</>

Sampling hyperparameters (temperature, top-k, top-p).

pub type SamplingConfig =
  @internal SamplingConfig

ScaleGranularity

</>

Quantization scale sharing granularity.

pub type ScaleGranularity =
  @internal ScaleGranularity

Scheduler

</>

Scheduler state record. See viva_tensor/nn/scheduler for the formulas behind each variant.

pub type Scheduler =
  @internal Scheduler

SchedulerKind

</>

Tag identifying which schedule a Scheduler is implementing.

pub type SchedulerKind =
  @internal SchedulerKind

SchedulerState

</>

Precomputed schedule lookup tables (betas, alphas, alpha_bars).

pub type SchedulerState =
  @internal SchedulerState

SciMode

</>

Scientific-notation mode used by PrintOptions.

pub type SciMode =
  @internal SciMode

SentencePieceMode

</>

Which underlying segmentation a SentencePieceTokenizer uses.

pub type SentencePieceMode =
  @internal SentencePieceMode

SentencePieceTokenizer

</>

SentencePiece-compatible wrapper around either a UnigramTokenizer or a BpeTokenizer, preserving the ▁-prefix convention.

pub type SentencePieceTokenizer =
  @internal SentencePieceTokenizer

SignMode

</>

Sign mode used by PrintOptions.

pub type SignMode =
  @internal SignMode

SpeculativeConfig

</>

Configuration for speculative decoding (draft tokens, sampling, cap).

pub type SpeculativeConfig =
  @internal SpeculativeConfig

T5Block

</>

Single T5 block (RMSNorm + GeGLU FFN; encoder=non-causal, decoder=causal+cross-attn).

pub type T5Block =
  @internal T5Block

T5Model

</>

Full T5 model (HF: google-t5/t5-base, google/flan-t5-base).

pub type T5Model =
  @internal T5Model

Tensor

</>

A tensor value backed by dense, strided, or native storage.

pub type Tensor =
  @internal Tensor

TensorBackend

</>

Stable backend names used by capability discovery and operation planning.

pub type TensorBackend {
  BackendPureGleam
  BackendZigSimd
  BackendMkl
  BackendCudaFp32
  BackendCudaFp16
  BackendCudaInt8
  BackendCudaSparse
}

Constructors

```
BackendPureGleam
```
```
BackendZigSimd
```
```
BackendMkl
```
```
BackendCudaFp32
```
```
BackendCudaFp16
```
```
BackendCudaInt8
```
```
BackendCudaSparse
```

TensorBackendPlan

</>

Backend plan for a tensor operation: the selected backend, fallbacks, and rejected backends.

pub type TensorBackendPlan {
  TensorBackendPlan(
    operation: TensorOperation,
    selected: TensorBackend,
    fallbacks: List(TensorBackend),
    rejected: List(BackendRejection),
    reason: String,
  )
}

Constructors

TensorBackendPlan(
  operation: TensorOperation,
  selected: TensorBackend,
  fallbacks: List(TensorBackend),
  rejected: List(BackendRejection),
  reason: String,
)

TensorCapabilities

</>

Runtime acceleration capabilities detected for this VM.

pub type TensorCapabilities {
  TensorCapabilities(
    nif_loaded: Bool,
    zig_loaded: Bool,
    backend_info: String,
    tflops_backends: List(TflopsBackend),
    backend_capabilities: List(BackendCapability),
  )
}

Constructors

TensorCapabilities(
  nif_loaded: Bool,
  zig_loaded: Bool,
  backend_info: String,
  tflops_backends: List(TflopsBackend),
  backend_capabilities: List(BackendCapability),
)

TensorDevice

</>

Tensor payload location.

pub type TensorDevice {
  BeamCpu
  NativeCpu
  CudaDevice(Int)
}

Constructors

```
BeamCpu
```
```
NativeCpu
```
```
CudaDevice(Int)
```

TensorDtype

</>

Tensor element type.

pub type TensorDtype {
  Float64
  Float32
  Float16
  BFloat16
  Float8E4M3
  Int8
  Int4
  SparseFloat16
}

Constructors

```
Float64
```
```
Float32
```
```
Float16
```
```
BFloat16
```
```
Float8E4M3
```
```
Int8
```
```
Int4
```
```
SparseFloat16
```

TensorError

</>

Error returned by fallible tensor constructors and operations.

pub type TensorError =
  @internal TensorError

TensorLayout

</>

Canonical tensor layout metadata.

pub type TensorLayout =
  layout.TensorLayout

TensorMemoryLayout

</>

Logical memory layout used by runtime planning.

pub type TensorMemoryLayout {
  RowMajor
  ColumnMajor
  StridedLayout
  PackedFp8Layout
  PackedSparse24Layout
}

Constructors

```
RowMajor
```
```
ColumnMajor
```
```
StridedLayout
```
```
PackedFp8Layout
```
```
PackedSparse24Layout
```

TensorOperation

</>

Operation family used by the public backend planner.

pub type TensorOperation {
  OperationElementwise
  OperationBroadcast
  OperationReduction
  OperationSoftmax
  OperationMatmul(m: Int, n: Int, k: Int)
}

Constructors

```
OperationElementwise
```
```
OperationBroadcast
```
```
OperationReduction
```
```
OperationSoftmax
```
```
OperationMatmul(m: Int, n: Int, k: Int)
```

TensorSpec

</>

Shape, dtype, device, storage, and layout metadata for compiled execution.

pub type TensorSpec {
  TensorSpec(
    shape: List(Int),
    dtype: TensorDtype,
    device: TensorDevice,
    storage: TensorStorage,
    memory_layout: TensorMemoryLayout,
    rank: Int,
    size: Int,
  )
}

Constructors

TensorSpec(
  shape: List(Int),
  dtype: TensorDtype,
  device: TensorDevice,
  storage: TensorStorage,
  memory_layout: TensorMemoryLayout,
  rank: Int,
  size: Int,
)

TensorStorage

</>

Tensor payload representation.

pub type TensorStorage {
  DenseStorage
  StridedStorage
  NativeStorage
}

Constructors

```
DenseStorage
```
```
StridedStorage
```
```
NativeStorage
```

TflopsBackend

</>

Backend used when measuring matrix-multiplication throughput.

pub type TflopsBackend =
  @internal Backend

TflopsResult

</>

Result returned by TFLOPS measurement helpers.

pub type TflopsResult =
  @internal TflopsResult

TrainConfig

</>

Configuration for train_synchronous.

pub type TrainConfig =
  @internal TrainConfig

TrainResult

</>

Result of a train_synchronous run.

pub type TrainResult =
  @internal TrainResult

Transformer

</>

Full Transformer model — a stack of EncoderBlocks followed by a stack of DecoderBlocks.

pub type Transformer =
  @internal Transformer

TransformerConfig

</>

Structural config for from_safetensors_file. See viva_tensor/io/hf_loader.TransformerConfig.

pub type TransformerConfig =
  @internal TransformerConfig

UnigramTokenizer

</>

SentencePiece/T5-style Unigram LM tokenizer. Inference uses Viterbi dynamic programming over the lattice of in-vocab subwords.

pub type UnigramTokenizer =
  @internal UnigramTokenizer

UpsampleConfig

</>

Upsample configuration (mode + integer scale factor).

pub type UpsampleConfig =
  @internal UpsampleConfig

UpsampleMode

</>

Upsample mode selector.

pub type UpsampleMode =
  @internal UpsampleMode

WhitespaceTokenizer

</>

Whitespace tokenizer (encoding against a pre-trained vocabulary).

pub type WhitespaceTokenizer =
  @internal WhitespaceTokenizer

WordPieceTokenizer

</>

BERT-style WordPiece tokenizer (encoding-only).

pub type WordPieceTokenizer =
  @internal WordPieceTokenizer

Worker

</>

Handle for a spawned distributed worker process.

pub type Worker =
  @internal Worker

Values

abs

</>

pub fn abs(t: Tensor) -> Tensor

Absolute value for every element.

accelerated_backend

</>

pub fn accelerated_backend(
  t: AcceleratedTensor,
) -> AccelerationBackend

Inspect which backend was selected by matmul_auto.

accelerated_shape

</>

pub fn accelerated_shape(t: AcceleratedTensor) -> List(Int)

Shape of an accelerated tensor without forcing a download.

accelerated_sync

</>

pub fn accelerated_sync() -> Result(Nil, TensorError)

Wait for queued CUDA work to complete.

accelerated_to_string

</>

pub fn accelerated_to_string(t: AcceleratedTensor) -> String

Render an accelerated tensor (CudaFp16/CudaFp32/Cpu) as a pretty string. Large CUDA tensors above the threshold render as header-only to avoid surprise device-to-host (D2H) copies.

accelerated_to_string_with

</>

pub fn accelerated_to_string_with(
  t: AcceleratedTensor,
  opts: PrintOptions,
) -> String

Render an accelerated tensor with caller-supplied print options.

accelerated_to_tensor

</>

pub fn accelerated_to_tensor(
  t: AcceleratedTensor,
) -> Result(Tensor, TensorError)

Download an accelerated tensor back to a regular CPU tensor.

accuracy

</>

pub fn accuracy(
  predictions: Tensor,
  targets: Tensor,
) -> Result(Float, TensorError)

Classification accuracy: (1/N) * sum_i [pred_i == target_i].

adam

</>

pub fn adam(lr: Float) -> Optimizer

Adam (Kingma & Ba, 2015). See viva_tensor/nn/optim.

adamw

</>

pub fn adamw(lr: Float, weight_decay: Float) -> Optimizer

AdamW (Loshchilov & Hutter, 2019). See viva_tensor/nn/optim.

adaptive_avg_pool_1d_forward

</>

pub fn adaptive_avg_pool_1d_forward(
  config: AdaptiveAvgPool1dConfig,
  input: Tensor,
) -> Result(Tensor, TensorError)

1D adaptive average pooling. Input [batch, channels, length], output [batch, channels, output_size].

adaptive_avg_pool_2d_forward

</>

pub fn adaptive_avg_pool_2d_forward(
  config: AdaptiveAvgPool2dConfig,
  input: Tensor,
) -> Result(Tensor, TensorError)

2D adaptive average pooling. Input [batch, channels, H, W], output [batch, channels, output_h, output_w].

add

</>

pub fn add(a: Tensor, b: Tensor) -> Result(Tensor, TensorError)

Add two tensors element-wise.

add_broadcast

</>

pub fn add_broadcast(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Add two tensors with broadcasting.

add_into

</>

pub fn add_into(
  out: Tensor,
  a: Tensor,
  b: Tensor,
) -> Result(Nil, TensorError)

Write out = a + b into a preallocated native tensor.

add_scalar

</>

pub fn add_scalar(t: Tensor, scalar: Float) -> Tensor

Add a scalar to every element.

all

</>

pub fn all(t: Tensor) -> Bool

Are all mask values non-zero?

all_axis

</>

pub fn all_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Are all values in each axis slice non-zero?

all_axis_keepdims

</>

pub fn all_axis_keepdims(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Are all values in each axis slice non-zero? Preserves the reduced dimension.

all_close

</>

pub fn all_close(
  a: Tensor,
  b: Tensor,
  rtol: Float,
  atol: Float,
) -> Result(Bool, TensorError)

Compare two tensors element-wise and return whether all pairs are close.

any

</>

pub fn any(t: Tensor) -> Bool

Does the mask contain any non-zero value?

any_axis

</>

pub fn any_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Does each axis slice contain any non-zero value?

any_axis_keepdims

</>

pub fn any_axis_keepdims(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Does each axis slice contain any non-zero value? Preserves the reduced dimension.

apply_temperature

</>

pub fn apply_temperature(
  logits: Tensor,
  temperature: Float,
) -> Tensor

Divide every logit by temperature (no-op for 1.0, defensive for 0.0).

apply_to_optimizer

</>

pub fn apply_to_optimizer(
  s: Scheduler,
  opt: Optimizer,
) -> #(Scheduler, Optimizer)

Apply the scheduler’s next learning rate to an optimizer. Advances the scheduler and returns the updated (scheduler, optimizer) pair.

argmax

</>

pub fn argmax(t: Tensor) -> Int

Index of the maximum value.

argmin

</>

pub fn argmin(t: Tensor) -> Int

Index of the minimum value.

average_grads

</>

pub const average_grads: GradAggregation

Average per-worker gradients before applying. See viva_tensor/distributed/trainer.GradAggregation.

avg_pool2d

</>

pub fn avg_pool2d(
  input: Tensor,
  pool_h: Int,
  pool_w: Int,
  stride_h: Int,
  stride_w: Int,
) -> Result(Tensor, TensorError)

2D average pooling.

avg_pool_1d_forward

</>

pub fn avg_pool_1d_forward(
  config: AvgPool1dConfig,
  input: Tensor,
) -> Result(Tensor, TensorError)

1D average pooling. Input [batch, channels, length], output [batch, channels, (length + 2*padding - kernel_size) / stride + 1].

backend_capabilities

</>

pub fn backend_capabilities() -> List(BackendCapability)

Inspect stable backend capability records.

batch_norm_1d_forward

</>

pub fn batch_norm_1d_forward(
  layer: BatchNorm1d,
  input: Tensor,
  training: Bool,
) -> Result(#(BatchNorm1d, Tensor), TensorError)

Forward pass for BatchNorm1d.

In training mode, updates running stats via EMA and returns the new layer alongside the normalized output. In eval mode, uses running stats and returns the layer unchanged.

batch_norm_1d_init

</>

pub fn batch_norm_1d_init(num_features: Int) -> BatchNorm1d

Initialize a BatchNorm1d with default momentum = 0.1, eps = 1.0e-5.

batch_norm_2d_forward

</>

pub fn batch_norm_2d_forward(
  layer: BatchNorm2d,
  input: Tensor,
  training: Bool,
) -> Result(#(BatchNorm2d, Tensor), TensorError)

Forward pass for BatchNorm2d. Input [B, C, H, W], output same shape. In training mode updates running stats via EMA; in eval mode uses them directly. Returns the (possibly updated) layer plus the output.

batch_norm_2d_init

</>

pub fn batch_norm_2d_init(num_features: Int) -> BatchNorm2d

Initialize a BatchNorm2d with scale = ones([C]), bias = zeros([C]), running_mean = zeros([C]), running_var = ones([C]), momentum = 0.1, eps = 1.0e-5. C = num_features.

batched_matmul

</>

pub fn batched_matmul(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Batched 2-D matmul [Ba, M, K] @ [Bb, K, N] -> [max(Ba, Bb), M, N] with broadcasting when either batch dim is 1.

bce_loss

</>

pub fn bce_loss(
  prediction: Tensor,
  target: Tensor,
  reduction: Reduction,
) -> Result(Tensor, TensorError)

Binary Cross-Entropy. See viva_tensor/nn/losses.bce_loss.

bce_loss_backward

</>

pub fn bce_loss_backward(
  grad_out: Tensor,
  prediction: Tensor,
  target: Tensor,
  reduction: Reduction,
) -> Result(Tensor, TensorError)

Backward for bce_loss. Returns gradient w.r.t. prediction only. See viva_tensor/nn/backward.bce_loss_backward.

bert_block_forward

</>

pub fn bert_block_forward(
  block: BertBlock,
  input: Tensor,
) -> Result(Tensor, TensorError)

Run a BertBlock on [seq_len, embed_dim] input.

bert_block_init

</>

pub fn bert_block_init(
  embed_dim: Int,
  num_heads: Int,
  ffn_hidden_dim: Int,
) -> Result(BertBlock, TensorError)

Build a zero-weight BertBlock.

bert_embedding_forward

</>

pub fn bert_embedding_forward(
  layer: BertEmbedding,
  token_ids: Tensor,
  token_type_ids: Tensor,
) -> Result(Tensor, TensorError)

Run the BERT embedding layer.

bert_embedding_init

</>

pub fn bert_embedding_init(
  vocab_size: Int,
  embed_dim: Int,
  max_position: Int,
  num_token_types: Int,
) -> BertEmbedding

Build a zero-weight BertEmbedding.

bert_model_forward

</>

pub fn bert_model_forward(
  model: BertModel,
  token_ids: Tensor,
  token_type_ids: Tensor,
) -> Result(Tensor, TensorError)

End-to-end BERT forward: token_ids, token_type_ids -> hidden states.

bert_model_init

</>

pub fn bert_model_init(
  num_layers: Int,
  vocab_size: Int,
  embed_dim: Int,
  num_heads: Int,
  ffn_hidden_dim: Int,
  max_position: Int,
) -> Result(BertModel, TensorError)

Build a zero-weight BertModel with num_layers blocks.

bpe_decode

</>

pub fn bpe_decode(
  tokenizer: BpeTokenizer,
  ids: List(Int),
) -> String

Decode ids with a BpeTokenizer.

Example

import viva_tensor as t
let tok = t.bpe_tokenizer_from_vocab_and_merges(
  ["?", "l", "o", "lo"],
  [#("l", "o")],
  "?",
)
let _ = t.bpe_decode(tok, [3])

bpe_encode

</>

pub fn bpe_encode(
  tokenizer: BpeTokenizer,
  text: String,
) -> List(Int)

Encode text with a BpeTokenizer.

Example

import viva_tensor as t
let tok = t.bpe_tokenizer_from_vocab_and_merges(
  ["?", "l", "o", "lo"],
  [#("l", "o")],
  "?",
)
let _ = t.bpe_encode(tok, "lo")

bpe_tokenizer_from_vocab_and_merges

</>

pub fn bpe_tokenizer_from_vocab_and_merges(
  vocab: List(String),
  merges: List(#(String, String)),
  unk_token: String,
) -> BpeTokenizer

Build a BpeTokenizer from a vocab and pre-trained merges.

Example

import viva_tensor as t
let _ = t.bpe_tokenizer_from_vocab_and_merges(
  ["?", "l", "o", "lo"],
  [#("l", "o")],
  "?",
)

broadcast_pair

</>

pub fn broadcast_pair(
  a: Tensor,
  b: Tensor,
) -> Result(#(Tensor, Tensor), TensorError)

Broadcast two tensors to their common shape.

broadcast_shape

</>

pub fn broadcast_shape(
  a: List(Int),
  b: List(Int),
) -> Result(List(Int), TensorError)

Compute the common shape for two broadcastable shapes.

broadcast_shapes

</>

pub fn broadcast_shapes(
  shapes: List(List(Int)),
) -> Result(List(Int), TensorError)

Compute the common shape for any number of broadcastable shapes.

broadcast_to

</>

pub fn broadcast_to(
  t: Tensor,
  target_shape: List(Int),
) -> Result(Tensor, TensorError)

Broadcast tensor to a target shape.

build_schedule

</>

pub fn build_schedule(schedule: NoiseSchedule) -> SchedulerState

Precompute the schedule tables for either a linear or cosine NoiseSchedule.

cache_key

</>

pub fn cache_key(plan: RuntimePlan) -> String

Return the stable cache key for a runtime plan.

can_broadcast

</>

pub fn can_broadcast(a: List(Int), b: List(Int)) -> Bool

Can these shapes broadcast together?

capabilities

</>

pub fn capabilities() -> TensorCapabilities

Inspect native runtime acceleration availability.

causal_mask

</>

pub fn causal_mask(seq_len: Int) -> Tensor

Lower-triangular [seq_len, seq_len] mask of 1.0s used by causal SDPA.

ceil

</>

pub fn ceil(t: Tensor) -> Tensor

Ceiling every element.

char_decode

</>

pub fn char_decode(
  tokenizer: CharTokenizer,
  ids: List(Int),
) -> String

Decode ids with a CharTokenizer.

Example

import viva_tensor as t
let tok = t.char_tokenizer_from_alphabet(["?", "a", "b"], "?")
let _ = t.char_decode(tok, [1, 2])

char_encode

</>

pub fn char_encode(
  tokenizer: CharTokenizer,
  text: String,
) -> List(Int)

Encode text with a CharTokenizer.

Example

import viva_tensor as t
let tok = t.char_tokenizer_from_alphabet(["?", "a", "b"], "?")
let _ = t.char_encode(tok, "ab")

char_tokenizer_from_alphabet

</>

pub fn char_tokenizer_from_alphabet(
  alphabet: List(String),
  unk_token: String,
) -> CharTokenizer

Build a CharTokenizer from an alphabet.

Example

import viva_tensor as t
let _ = t.char_tokenizer_from_alphabet(["?", "a", "b"], "?")

cholesky

</>

pub fn cholesky(a: Tensor) -> Result(Tensor, TensorError)

Cholesky decomposition for symmetric positive-definite matrices. Returns lower-triangular L with A = L @ L^T.

clamp

</>

pub fn clamp(t: Tensor, min_val: Float, max_val: Float) -> Tensor

Clamp every element to the range [min_val, max_val].

clip

</>

pub fn clip(t: Tensor, min_val: Float, max_val: Float) -> Tensor

Alias for clamp.

clip_by_norm

</>

pub fn clip_by_norm(t: Tensor, max_norm: Float) -> Tensor

Clip tensor L2 norm to at most max_norm.

color_jitter_forward

</>

pub fn color_jitter_forward(
  config: ColorJitterConfig,
  image: Tensor,
) -> Result(Tensor, TensorError)

Apply randomized brightness/contrast/saturation/hue to [C, H, W] or [B, C, H, W] RGB images.

color_jitter_init

</>

pub fn color_jitter_init(
  brightness: Float,
  contrast: Float,
  saturation: Float,
  hue: Float,
) -> ColorJitterConfig

Build a ColorJitterConfig (brightness/contrast/saturation/hue strengths).

compute_load_balance_loss

</>

pub fn compute_load_balance_loss(
  router_probs: Tensor,
  expert_assignments: Tensor,
  num_experts: Int,
) -> Result(Tensor, TensorError)

Switch Transformer load-balancing auxiliary loss.

confusion_matrix

</>

pub fn confusion_matrix(
  predictions: Tensor,
  targets: Tensor,
  num_classes: Int,
) -> Result(Tensor, TensorError)

Confusion matrix [num_classes, num_classes] where cm[true, pred] counts samples.

conv1d_forward

</>

pub fn conv1d_forward(
  config: Conv1dConfig,
  input: Tensor,
) -> Result(Tensor, TensorError)

1D convolution forward pass. Output length = (L_in + 2*padding - kernel) / stride + 1.

conv1d_init

</>

pub fn conv1d_init(
  in_channels in_channels: Int,
  out_channels out_channels: Int,
  kernel_size kernel_size: Int,
  stride stride: Int,
  padding padding: Int,
) -> Conv1dConfig

Initialize a Conv1d layer with zero weights and bias.

conv2d

</>

pub fn conv2d(
  input: Tensor,
  kernel: Tensor,
  config: Conv2dConfig,
) -> Result(Tensor, TensorError)

2D Convolution

conv2d_config

</>

pub fn conv2d_config() -> Conv2dConfig

Default conv2d config (3x3 kernel, stride 1, no padding)

conv2d_same

</>

pub fn conv2d_same(kernel_h: Int, kernel_w: Int) -> Conv2dConfig

Conv2d config with “same” padding

conv3d_forward

</>

pub fn conv3d_forward(
  config: Conv3dConfig,
  input: Tensor,
) -> Result(Tensor, TensorError)

3D convolution forward pass. Output dim = (In + 2*pad - kernel) / stride + 1 per spatial axis.

conv3d_init

</>

pub fn conv3d_init(
  in_channels in_channels: Int,
  out_channels out_channels: Int,
  kernel_size kernel_size: #(Int, Int, Int),
  stride stride: #(Int, Int, Int),
  padding padding: #(Int, Int, Int),
) -> Conv3dConfig

Initialize a Conv3d layer with zero weights and bias.

conv_transpose_2d_forward

</>

pub fn conv_transpose_2d_forward(
  config: ConvTranspose2dConfig,
  input: Tensor,
) -> Result(Tensor, TensorError)

2D transposed convolution (deconv) forward pass. Output dim = (In - 1) * stride - 2*padding + (kernel - 1) + output_padding + 1.

conv_transpose_2d_init

</>

pub fn conv_transpose_2d_init(
  in_channels in_channels: Int,
  out_channels out_channels: Int,
  kernel_size kernel_size: #(Int, Int),
  stride stride: #(Int, Int),
  padding padding: #(Int, Int),
  output_padding output_padding: #(Int, Int),
) -> ConvTranspose2dConfig

Initialize a ConvTranspose2d layer with zero weights and bias.

cosine_annealing_lr

</>

pub fn cosine_annealing_lr(
  base_lr: Float,
  t_max: Int,
  eta_min: Float,
) -> Scheduler

CosineAnnealingLR: half-cosine from base_lr down to eta_min over t_max steps.

cosine_similarity

</>

pub fn cosine_similarity(a: Tensor, b: Tensor) -> Float

Cosine similarity between two same-shaped tensors, flattened as vectors.

count_nonzero

</>

pub fn count_nonzero(t: Tensor) -> Int

Count non-zero values in a tensor.

count_nonzero_axis

</>

pub fn count_nonzero_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Count non-zero values along one axis.

count_nonzero_axis_keepdims

</>

pub fn count_nonzero_axis_keepdims(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Count non-zero values along one axis, preserving the reduced dimension.

cross_entropy_loss

</>

pub fn cross_entropy_loss(
  logits: Tensor,
  targets: Tensor,
  reduction: Reduction,
) -> Result(Tensor, TensorError)

Softmax cross-entropy with integer-valued class targets. See viva_tensor/nn/losses.cross_entropy_loss.

cross_entropy_loss_backward

</>

pub fn cross_entropy_loss_backward(
  grad_out: Tensor,
  logits: Tensor,
  targets: Tensor,
  reduction: Reduction,
) -> Result(Tensor, TensorError)

Backward for cross_entropy_loss. Returns gradient w.r.t. logits. See viva_tensor/nn/backward.cross_entropy_loss_backward.

cumprod

</>

pub fn cumprod(t: Tensor) -> Tensor

Cumulative product over the flattened tensor, preserving the original shape.

cumprod_axis

</>

pub fn cumprod_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Cumulative product along one axis, preserving the original shape.

cumsum

</>

pub fn cumsum(t: Tensor) -> Tensor

Cumulative sum over the flattened tensor, preserving the original shape.

cumsum_axis

</>

pub fn cumsum_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Cumulative sum along one axis, preserving the original shape.

cutmix

</>

pub fn cutmix(
  images: Tensor,
  labels: Tensor,
  num_classes: Int,
  alpha: Float,
) -> Result(#(Tensor, Tensor), TensorError)

CutMix on a batch — paste a random rectangle from a partner image; the label mixing reflects the actual pasted area.

data_loader_batches

</>

pub fn data_loader_batches(
  loader: DataLoader,
) -> Result(List(Batch), TensorError)

Iterate the loader once, returning all batches.

Example

import viva_tensor as t
let ds =
  t.dataset_from_samples([
    t.Sample(input: t.from_list([1.0]), target: t.from_list([0.0])),
  ])
let loader = t.data_loader_new(ds, 1, False, False)
let assert Ok(_) = t.data_loader_batches(loader)

data_loader_len

</>

pub fn data_loader_len(loader: DataLoader) -> Int

Total number of batches a single iteration will yield.

Example

import viva_tensor as t
let loader = t.data_loader_new(t.dataset_from_samples([]), 4, False, False)
let _ = t.data_loader_len(loader)

data_loader_new

</>

pub fn data_loader_new(
  dataset: Dataset,
  batch_size: Int,
  shuffle: Bool,
  drop_last: Bool,
) -> DataLoader

Create a new data loader.

Example

import viva_tensor as t
let _ = t.data_loader_new(t.dataset_from_samples([]), 32, True, False)

dataset_from_lists

</>

pub fn dataset_from_lists(
  inputs: List(Tensor),
  targets: List(Tensor),
) -> Result(Dataset, TensorError)

Build a dataset from parallel input and target tensor lists.

Example

import viva_tensor as t
let assert Ok(_ds) =
  t.dataset_from_lists([t.from_list([1.0])], [t.from_list([0.0])])

dataset_from_samples

</>

pub fn dataset_from_samples(samples: List(Sample)) -> Dataset

Build an in-memory dataset from a list of labeled samples.

Example

import viva_tensor as t
let x = t.from_list([1.0])
let y = t.from_list([0.0])
let _ds = t.dataset_from_samples([t.Sample(input: x, target: y)])

dataset_get

</>

pub fn dataset_get(
  d: Dataset,
  index: Int,
) -> Result(Sample, TensorError)

Fetch the sample at index (zero-indexed; negative indices wrap).

Example

import viva_tensor as t
let ds =
  t.dataset_from_samples([
    t.Sample(input: t.from_list([1.0]), target: t.from_list([0.0])),
  ])
let assert Ok(_) = t.dataset_get(ds, -1)

dataset_len

</>

pub fn dataset_len(d: Dataset) -> Int

Number of samples in the dataset.

Example

import viva_tensor as t
let _ = t.dataset_len(t.dataset_from_samples([]))

ddim_step

</>

pub fn ddim_step(
  state: SchedulerState,
  x_t: Tensor,
  model_pred: Tensor,
  t: Int,
  eta: Float,
) -> Result(Tensor, TensorError)

One DDIM reverse step at index t. eta=0 is deterministic.

ddpm_step

</>

pub fn ddpm_step(
  state: SchedulerState,
  x_t: Tensor,
  model_pred: Tensor,
  t: Int,
) -> Result(Tensor, TensorError)

One DDPM reverse step at index t.

decoder_block_forward

</>

pub fn decoder_block_forward(
  block: DecoderBlock,
  input: Tensor,
  encoder_output: Tensor,
) -> Result(Tensor, TensorError)

Decoder block forward pass. Input is [tgt_seq_len, embed_dim], encoder_output is [src_seq_len, embed_dim].

decoder_block_init

</>

pub fn decoder_block_init(
  embed_dim: Int,
  num_heads: Int,
  ffn_hidden_dim: Int,
  activation: Activation,
) -> Result(DecoderBlock, TensorError)

Build a zero-weight pre-norm decoder block (causal self-attn + cross-attn + FFN).

Forward (per block):

r1     = input + MHA_self(layer_norm1(input), is_causal=True)
r2     = r1    + MHA_cross(layer_norm2(r1), memory, memory)
output = r2    + FFN(layer_norm3(r2))

default_generate_opts

</>

pub fn default_generate_opts() -> GenerateOpts

Default deterministic argmax generation options.

default_print_options

</>

pub fn default_print_options() -> PrintOptions

Default print options. Matches a sensible NumPy/PyTorch baseline: precision=4, threshold=1000, edgeitems=3, linewidth=80.

det

</>

pub fn det(a: Tensor) -> Result(Float, TensorError)

Determinant via LU decomposition. Returns 0.0 for singular matrices.

detect_backends

</>

pub fn detect_backends() -> List(TflopsBackend)

Detect available compute backends

device

</>

pub fn device(t: Tensor) -> TensorDevice

Inspect where a tensor payload lives.

device_name

</>

pub fn device_name(device: TensorDevice) -> String

Stable device label used by runtime cache keys.

diag

</>

pub fn diag(t: Tensor) -> Tensor

Create a square diagonal matrix from a 1D tensor.

distribute_grads

</>

pub fn distribute_grads(
  per_worker_grads: List(List(GradPair)),
  aggregation: GradAggregation,
) -> Result(List(GradPair), TensorError)

Aggregate per-worker gradient lists synchronously. See viva_tensor/distributed/trainer.distribute_grads.

div

</>

pub fn div(a: Tensor, b: Tensor) -> Result(Tensor, TensorError)

Element-wise division

div_broadcast

</>

pub fn div_broadcast(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Divide with broadcasting

dot

</>

pub fn dot(a: Tensor, b: Tensor) -> Result(Float, TensorError)

Dot product (vectors only)

dot_similarity

</>

pub fn dot_similarity(a: Tensor, b: Tensor) -> Float

Dot similarity between two same-shaped tensors, flattened as vectors.

dropout_forward

</>

pub fn dropout_forward(
  layer: Dropout,
  input: Tensor,
  training: Bool,
) -> Tensor

Forward pass for inverted dropout. Output shape: same as input. Passthrough when training = False or p == 0.0. When p == 1.0 every element is zeroed.

dropout_init

</>

pub fn dropout_init(p: Float) -> Dropout

Initialize a Dropout layer with drop probability p. Output shape: same as input.

dtype

</>

pub fn dtype(t: Tensor) -> TensorDtype

Inspect the tensor element type.

dtype_name

</>

pub fn dtype_name(dtype: TensorDtype) -> String

Stable dtype label used by runtime cache keys.

eig

</>

pub fn eig(a: Tensor) -> Result(#(Tensor, Tensor), TensorError)

Eigendecomposition stub (not implemented in v1).

einsum

</>

pub fn einsum(
  equation: String,
  operands: List(Tensor),
) -> Result(Tensor, TensorError)

Einstein summation. See viva_tensor/tensor.einsum for the full spec.

elu

</>

pub fn elu(t: Tensor, alpha: Float) -> Tensor

Exponential Linear Unit: x if x > 0 else alpha * (exp(x) - 1).

elu_backward

</>

pub fn elu_backward(
  grad_out: Tensor,
  input: Tensor,
  alpha: Float,
) -> Result(Tensor, TensorError)

Backward for elu. See viva_tensor/nn/backward.elu_backward.

embedding_forward

</>

pub fn embedding_forward(
  layer: Embedding,
  indices: Tensor,
) -> Result(Tensor, TensorError)

Forward pass: gather rows of weight by integer indices.

embedding_init

</>

pub fn embedding_init(
  num_embeddings: Int,
  embedding_dim: Int,
) -> Embedding

Initialize an embedding table with zero weights.

embedding_init_uniform

</>

pub fn embedding_init_uniform(
  num_embeddings: Int,
  embedding_dim: Int,
) -> Embedding

Initialize an embedding table with uniform random weights in [-1/sqrt(embedding_dim), 1/sqrt(embedding_dim)].

encoder_block_forward

</>

pub fn encoder_block_forward(
  block: EncoderBlock,
  input: Tensor,
  is_causal: Bool,
) -> Result(Tensor, TensorError)

Encoder block forward pass on [seq_len, embed_dim] input.

encoder_block_init

</>

pub fn encoder_block_init(
  embed_dim: Int,
  num_heads: Int,
  ffn_hidden_dim: Int,
  activation: Activation,
) -> Result(EncoderBlock, TensorError)

Build a zero-weight pre-norm encoder block.

Forward (per block):

r1     = input + MHA(layer_norm(input), is_causal)
output = r1    + FFN(layer_norm(r1))

equal

</>

pub fn equal(a: Tensor, b: Tensor) -> Result(Tensor, TensorError)

Element-wise equality mask with NumPy-style broadcasting.

euclidean_distance

</>

pub fn euclidean_distance(a: Tensor, b: Tensor) -> Float

Euclidean distance between two same-shaped tensors, flattened as vectors.

exp

</>

pub fn exp(t: Tensor) -> Tensor

Exponential for every element.

expert_distribution

</>

pub fn expert_distribution(
  router_probs: Tensor,
  expert_assignments: Tensor,
  num_experts: Int,
) -> Result(#(Tensor, Tensor), TensorError)

Per-expert importance (sum of router probs) and load (count of top-k assignments). See viva_tensor/nn/moe.expert_distribution.

exponential_lr

</>

pub fn exponential_lr(base_lr: Float, gamma: Float) -> Scheduler

ExponentialLR: lr = base_lr * gamma^step — smooth exponential decay.

eye

</>

pub fn eye(n: Int) -> Tensor

Create a square identity matrix.

f1

</>

pub fn f1(
  predictions: Tensor,
  targets: Tensor,
  num_classes: Int,
  average: Average,
) -> Result(Float, TensorError)

F1-score aggregated by the chosen Average.

feed_forward_forward

</>

pub fn feed_forward_forward(
  ff: FeedForward,
  input: Tensor,
) -> Result(Tensor, TensorError)

Run the FFN forward pass on [seq_len, embed_dim]-shaped input.

feed_forward_init

</>

pub fn feed_forward_init(
  embed_dim: Int,
  hidden_dim: Int,
  activation: Activation,
) -> FeedForward

Build a zero-weight FeedForward sublayer.

Forward: activation(input @ w1 + b1) @ w2 + b2.

fill

</>

pub fn fill(shape: List(Int), value: Float) -> Tensor

Create tensor filled with value

flatten

</>

pub fn flatten(t: Tensor) -> Tensor

Flatten to 1D

floor

</>

pub fn floor(t: Tensor) -> Tensor

Floor every element.

from_list

</>

pub fn from_list(data: List(Float)) -> Tensor

Create tensor from list (1D)

from_list2d

</>

pub fn from_list2d(
  rows: List(List(Float)),
) -> Result(Tensor, TensorError)

Create 2D tensor from list of lists

from_native_ref

</>

pub fn from_native_ref(
  ref: NativeTensorRef,
  shape: List(Int),
) -> Tensor

Wrap an existing native NIF tensor resource.

from_safetensors_file

</>

pub fn from_safetensors_file(
  path: String,
  config: TransformerConfig,
) -> Result(Transformer, HfLoadError)

Read a .safetensors file then project it into a Transformer using the dimensions in config.

full_like

</>

pub fn full_like(t: Tensor, value: Float) -> Tensor

Create a tensor with the same shape as another tensor, filled with a value.

gather

</>

pub fn gather(
  t: Tensor,
  indices: Tensor,
) -> Result(Tensor, TensorError)

Convenience wrapper around take for 1D integer-valued index tensors.

gelu

</>

pub fn gelu(t: Tensor) -> Tensor

Gaussian Error Linear Unit: 0.5 * x * (1 + erf(x / sqrt(2))). Uses the exact erf-based formulation.

gelu_backward

</>

pub fn gelu_backward(
  grad_out: Tensor,
  input: Tensor,
) -> Result(Tensor, TensorError)

Backward for exact gelu. See viva_tensor/nn/backward.gelu_backward.

generate

</>

pub fn generate(
  handle: ModelHandle,
  prompt: String,
  opts: GenerateOpts,
) -> Result(Generation, String)

Generate text from a loaded model handle.

temperature == 0.0 uses the fused argmax decode-step path. Non-zero sampling is intentionally left for the next sampling-focused API pass.

global_avg_pool2d

</>

pub fn global_avg_pool2d(
  input: Tensor,
) -> Result(Tensor, TensorError)

Global average pooling

gpt_block_forward

</>

pub fn gpt_block_forward(
  block: GptBlock,
  input: Tensor,
) -> Result(Tensor, TensorError)

Run a GptBlock on [seq_len, embed_dim] input.

gpt_block_init

</>

pub fn gpt_block_init(
  embed_dim: Int,
  num_heads: Int,
  ffn_hidden_dim: Int,
) -> Result(GptBlock, TensorError)

Build a zero-weight GptBlock.

gpt_model_forward

</>

pub fn gpt_model_forward(
  model: GptModel,
  token_ids: Tensor,
) -> Result(Tensor, TensorError)

End-to-end GPT forward: 1D token_ids -> logits [seq, vocab].

gpt_model_init

</>

pub fn gpt_model_init(
  num_layers: Int,
  vocab_size: Int,
  embed_dim: Int,
  num_heads: Int,
  ffn_hidden_dim: Int,
  max_position: Int,
) -> Result(GptModel, TensorError)

Build a zero-weight GptModel.

gpu_workspace

</>

pub fn gpu_workspace() -> Result(GpuWorkspace, TensorError)

Create an RTX 4090 FP16 workspace.

greater

</>

pub fn greater(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Element-wise greater-than mask with NumPy-style broadcasting.

greater_equal

</>

pub fn greater_equal(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Element-wise greater-than-or-equal mask with NumPy-style broadcasting.

greedy_generate

</>

pub fn greedy_generate(
  initial_tokens: List(Int),
  max_new_tokens: Int,
  model_fn: fn(List(Int)) -> Result(Tensor, TensorError),
  stop_token: option.Option(Int),
) -> Result(List(Int), TensorError)

Autoregressive greedy generation with optional stop token.

greedy_sample

</>

pub fn greedy_sample(logits: Tensor) -> Result(Int, TensorError)

Greedy argmax over a 1-D logits tensor.

group_norm_forward

</>

pub fn group_norm_forward(
  layer: GroupNorm,
  input: Tensor,
) -> Result(Tensor, TensorError)

Forward pass for GroupNorm — supports [batch, channels] and [batch, channels, spatial] inputs.

group_norm_init

</>

pub fn group_norm_init(
  num_groups: Int,
  num_channels: Int,
) -> GroupNorm

Initialize a GroupNorm with num_groups groups over num_channels channels.

gru_cell_init

</>

pub fn gru_cell_init(
  input_size: Int,
  hidden_size: Int,
) -> GruCell

Build a GRU cell with Xavier-initialized stacked weights and zero biases.

gru_cell_step

</>

pub fn gru_cell_step(
  cell: GruCell,
  input: Tensor,
  hidden: Tensor,
) -> Result(Tensor, TensorError)

One GRU time step (PyTorch nn.GRUCell convention).

gru_sequence

</>

pub fn gru_sequence(
  cell: GruCell,
  inputs: List(Tensor),
  initial_hidden: Tensor,
) -> Result(#(List(Tensor), Tensor), TensorError)

Run a GRU cell over a list of time steps.

hardswish

</>

pub fn hardswish(t: Tensor) -> Tensor

HardSwish: x * relu6(x + 3) / 6.

hardtanh

</>

pub fn hardtanh(
  t: Tensor,
  min_val: Float,
  max_val: Float,
) -> Tensor

HardTanh: clamp(x, min_val, max_val).

hardware_profiles

</>

pub fn hardware_profiles() -> List(HardwareProfile)

Inspect hardware target profiles, including unavailable future targets.

he_init

</>

pub fn he_init(fan_in: Int, fan_out: Int) -> Tensor

He initialization (for ReLU networks)

huber_loss

</>

pub fn huber_loss(
  prediction: Tensor,
  target: Tensor,
  delta: Float,
  reduction: Reduction,
) -> Result(Tensor, TensorError)

Huber loss (smooth L1). See viva_tensor/nn/losses.huber_loss.

identity

</>

pub fn identity(n: Int) -> Tensor

Alias for eye.

ids_to_tensor

</>

pub fn ids_to_tensor(ids: List(Int)) -> Tensor

Convert a list of ids into a [seq_len] tensor of integer-valued floats.

Example

import viva_tensor as t
let _ = t.ids_to_tensor([1, 2, 3])

init_constant

</>

pub fn init_constant(shape: List(Int), value: Float) -> Tensor

init.constant — constant-filled tensor. Equivalent to fill.

init_identity

</>

pub fn init_identity(n: Int) -> Tensor

init.identity — [n, n] identity matrix. Same as identity/eye, exposed here for API symmetry.

init_ones

</>

pub fn init_ones(shape: List(Int)) -> Tensor

init.ones — all-ones tensor. Use case: LayerNorm scale parameters.

init_zeros

</>

pub fn init_zeros(shape: List(Int)) -> Tensor

init.zeros — all-zeros tensor. Same as zeros, exposed here for API symmetry with the rest of init_*.

inspect

</>

pub fn inspect(t: Tensor) -> String

Alias for to_string — matches the NumPy/PyTorch inspect / __repr__ convention.

int2_progressive_layout

</>

pub fn int2_progressive_layout(
  shape: List(Int),
  block_size: Int,
) -> Result(QuantLayout, TensorError)

Describe an experimental progressive INT2 layout.

int3_progressive_layout

</>

pub fn int3_progressive_layout(
  shape: List(Int),
  block_size: Int,
) -> Result(QuantLayout, TensorError)

Describe an experimental progressive INT3 layout.

inv

</>

pub fn inv(a: Tensor) -> Result(Tensor, TensorError)

Matrix inverse via solve(a, identity). Errors when a is singular.

iou_per_class

</>

pub fn iou_per_class(
  predictions: Tensor,
  targets: Tensor,
  num_classes: Int,
) -> Result(List(Float), TensorError)

Per-class intersection-over-union.

is_close

</>

pub fn is_close(
  a: Float,
  b: Float,
  rtol: Float,
  atol: Float,
) -> Bool

Compare two scalars with relative and absolute tolerances.

is_contiguous

</>

pub fn is_contiguous(t: Tensor) -> Bool

Check whether the tensor's layout is contiguous.

is_native

</>

pub fn is_native(t: Tensor) -> Bool

Check whether a tensor is backed by native NIF memory.

kaiming_normal

</>

pub fn kaiming_normal(
  fan_in: Int,
  fan_out: Int,
  gain: Float,
) -> Tensor

He normal init: N(0, std^2) with std = gain * sqrt(1 / fan_in).

kaiming_uniform

</>

pub fn kaiming_uniform(
  fan_in: Int,
  fan_out: Int,
  gain: Float,
) -> Tensor

He uniform init: U(-bound, bound) with bound = gain * sqrt(3 / fan_in).

l1_loss

</>

pub fn l1_loss(
  prediction: Tensor,
  target: Tensor,
  reduction: Reduction,
) -> Result(Tensor, TensorError)

L1 / Mean Absolute Error. See viva_tensor/nn/losses.l1_loss.

l1_loss_backward

</>

pub fn l1_loss_backward(
  grad_out: Tensor,
  prediction: Tensor,
  target: Tensor,
  reduction: Reduction,
) -> Result(Tensor, TensorError)

Backward for l1_loss. Returns gradient w.r.t. prediction only. See viva_tensor/nn/backward.l1_loss_backward.

layer_norm_backward

</>

pub fn layer_norm_backward(
  grad_out: Tensor,
  input: Tensor,
  scale: Tensor,
  mean: Tensor,
  variance: Tensor,
  eps: Float,
) -> Result(#(Tensor, Tensor, Tensor), TensorError)

Backward for layer_norm over the last dimension. Requires the mean and variance saved from the forward pass. See viva_tensor/nn/backward.layer_norm_backward.

layer_norm_forward

</>

pub fn layer_norm_forward(
  layer: LayerNorm,
  input: Tensor,
) -> Result(Tensor, TensorError)

Forward pass for LayerNorm — normalizes along the last dimension.

layer_norm_init

</>

pub fn layer_norm_init(num_features: Int) -> LayerNorm

Initialize a LayerNorm with default eps = 1.0e-5.

layer_norm_init_with_eps

</>

pub fn layer_norm_init_with_eps(
  num_features: Int,
  eps: Float,
) -> LayerNorm

Initialize a LayerNorm with custom eps.

layout

</>

pub fn layout(t: Tensor) -> layout.TensorLayout

Inspect storage, device, dtype, shape, strides, offset, size, and rank.

leaky_relu

</>

pub fn leaky_relu(t: Tensor, negative_slope: Float) -> Tensor

Leaky ReLU: x if x > 0 else negative_slope * x.

leaky_relu_backward

</>

pub fn leaky_relu_backward(
  grad_out: Tensor,
  input: Tensor,
  negative_slope: Float,
) -> Result(Tensor, TensorError)

Backward for leaky_relu. See viva_tensor/nn/backward.leaky_relu_backward.

leaky_relu_gain

</>

pub fn leaky_relu_gain(negative_slope: Float) -> Float

sqrt(2 / (1 + slope^2)) — gain for layers followed by Leaky ReLU.

learned_positional_forward

</>

pub fn learned_positional_forward(
  layer: LearnedPositionalEncoding,
  len: Int,
) -> Result(Tensor, TensorError)

Look up positions 0..len-1 from a learned positional encoding.

learned_positional_init

</>

pub fn learned_positional_init(
  max_len: Int,
  embedding_dim: Int,
) -> LearnedPositionalEncoding

Initialize a learned positional encoding table.

less

</>

pub fn less(a: Tensor, b: Tensor) -> Result(Tensor, TensorError)

Element-wise less-than mask with NumPy-style broadcasting.

less_equal

</>

pub fn less_equal(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Element-wise less-than-or-equal mask with NumPy-style broadcasting.

linear_backward

</>

pub fn linear_backward(
  grad_out: Tensor,
  input: Tensor,
  weight: Tensor,
) -> Result(#(Tensor, Tensor), TensorError)

Backward for a linear layer output = input @ weight. Returns #(grad_input, grad_weight). See viva_tensor/nn/backward.linear_backward.

linear_fp8

</>

pub fn linear_fp8(
  input: Tensor,
  weight: PackedWeightFp8,
  bias: option.Option(Tensor),
) -> Result(Tensor, TensorError)

FP8 linear: input @ weight (+ optional bias).

linear_gain

</>

pub fn linear_gain() -> Float

1.0 — gain for layers followed by a linear activation.

linear_gelu_accelerated_into

</>

pub fn linear_gelu_accelerated_into(
  out: AcceleratedTensor,
  a: AcceleratedTensor,
  b: AcceleratedTensor,
  bias: AcceleratedTensor,
) -> Result(Nil, TensorError)

Write out = gelu(a @ b + bias) using the FP16 Tensor Core fused epilogue.

linear_gelu_forward_into

</>

pub fn linear_gelu_forward_into(
  out: AcceleratedTensor,
  input: AcceleratedTensor,
  layer: LinearLayer,
) -> Result(Nil, TensorError)

Run out = gelu(input @ layer.weight + layer.bias).

linear_gelu_fp8

</>

pub fn linear_gelu_fp8(
  input: Tensor,
  weight: PackedWeightFp8,
  bias: option.Option(Tensor),
) -> Result(Tensor, TensorError)

FP8 linear fused with bias + GELU activation (cuBLASLt epilogue=36).

linear_int4_sparse

</>

pub fn linear_int4_sparse(
  input: Tensor,
  weight: PackedWeightInt4Sparse,
  bias: option.Option(Tensor),
) -> Result(Tensor, TensorError)

INT4 2:4 sparse linear: input @ weight (+ optional bias).

linear_int8_sparse

</>

pub fn linear_int8_sparse(
  input: Tensor,
  weight: PackedWeightInt8Sparse,
  bias: option.Option(Tensor),
) -> Result(Tensor, TensorError)

INT8 2:4 sparse linear with auto-shape backend dispatch.

linear_layer

</>

pub fn linear_layer(
  workspace: GpuWorkspace,
  weight: Tensor,
  bias: Tensor,
) -> Result(LinearLayer, TensorError)

Create a persisted linear layer in workspace memory.

linear_layer_backend

</>

pub fn linear_layer_backend(
  layer: LinearLayer,
) -> AccelerationBackend

Linear layer backend.

linear_layer_fp16

</>

pub fn linear_layer_fp16(
  weight: Tensor,
  bias: Tensor,
) -> Result(LinearLayer, TensorError)

Create a persisted FP16 linear layer on the RTX.

linear_layer_input_features

</>

pub fn linear_layer_input_features(layer: LinearLayer) -> Int

Linear layer input feature count.

linear_layer_output_features

</>

pub fn linear_layer_output_features(layer: LinearLayer) -> Int

Linear layer output feature count.

linear_output

</>

pub fn linear_output(
  workspace: GpuWorkspace,
  layer: LinearLayer,
  batch_size: Int,
) -> Result(AcceleratedTensor, TensorError)

Allocate a reusable output buffer for a persisted linear layer.

linear_relu

</>

pub fn linear_relu(
  a: Tensor,
  b: Tensor,
  bias: Tensor,
) -> Result(Tensor, TensorError)

Fused linear layer with ReLU: max(0, a @ b + bias).

linear_relu_accelerated_into

</>

pub fn linear_relu_accelerated_into(
  out: AcceleratedTensor,
  a: AcceleratedTensor,
  b: AcceleratedTensor,
  bias: AcceleratedTensor,
) -> Result(Nil, TensorError)

Write out = relu(a @ b + bias) using the FP16 Tensor Core fused epilogue.

linear_relu_forward_into

</>

pub fn linear_relu_forward_into(
  out: AcceleratedTensor,
  input: AcceleratedTensor,
  layer: LinearLayer,
) -> Result(Nil, TensorError)

Run out = relu(input @ layer.weight + layer.bias).

linear_relu_into

</>

pub fn linear_relu_into(
  out: Tensor,
  a: Tensor,
  b: Tensor,
  bias: Tensor,
) -> Result(Nil, TensorError)

Write out = max(0, a @ b + bias) into a preallocated native tensor.

linear_swiglu_fp8

</>

pub fn linear_swiglu_fp8(
  input: Tensor,
  gate_weight: PackedWeightFp8,
  up_weight: PackedWeightFp8,
  bias: option.Option(Tensor),
) -> Result(Tensor, TensorError)

FP8 SwiGLU block: silu(input @ gate) * (input @ up) (+ optional bias).

linear_warmup

</>

pub fn linear_warmup(
  base_lr: Float,
  warmup_steps: Int,
) -> Scheduler

LinearWarmup: linear ramp from 0 to base_lr over warmup_steps, then constant base_lr.

linspace

</>

pub fn linspace(start: Float, stop: Float, steps: Int) -> Tensor

Create a 1D tensor with evenly spaced values over a closed interval.

llama_block_forward

</>

pub fn llama_block_forward(
  block: LlamaBlock,
  input: Tensor,
) -> Result(Tensor, TensorError)

Run a LlamaBlock on [seq_len, embed_dim] input.

llama_block_init

</>

pub fn llama_block_init(
  embed_dim: Int,
  num_heads: Int,
  ffn_hidden_dim: Int,
) -> Result(LlamaBlock, TensorError)

Build a zero-weight LlamaBlock.

llama_model_forward

</>

pub fn llama_model_forward(
  model: LlamaModel,
  token_ids: Tensor,
) -> Result(Tensor, TensorError)

End-to-end Llama forward: 1D token_ids -> logits [seq, vocab].

llama_model_init

</>

pub fn llama_model_init(
  num_layers: Int,
  vocab_size: Int,
  embed_dim: Int,
  num_heads: Int,
  ffn_hidden_dim: Int,
) -> Result(LlamaModel, TensorError)

Build a zero-weight LlamaModel with num_layers blocks.

load_embedding

</>

pub fn load_embedding(
  weights: dict.Dict(String, Tensor),
  prefix: String,
  vocab_size: Int,
  embedding_dim: Int,
) -> Result(Embedding, HfLoadError)

Load an Embedding from prefix <> ".weight" ([vocab_size, embedding_dim]).

load_encoder_block

</>

pub fn load_encoder_block(
  weights: dict.Dict(String, Tensor),
  prefix: String,
  num_heads: Int,
  embed_dim: Int,
  hidden_dim: Int,
  activation: Activation,
) -> Result(EncoderBlock, HfLoadError)

Load a single EncoderBlock (MHA + 2× LayerNorm + FFN) under prefix (e.g. "encoder.layers.0").

load_feed_forward

</>

pub fn load_feed_forward(
  weights: dict.Dict(String, Tensor),
  prefix: String,
  embed_dim: Int,
  hidden_dim: Int,
  activation: Activation,
) -> Result(FeedForward, HfLoadError)

Load a FeedForward from linear1/linear2 (weight + bias each) under the supplied prefix.

load_layer_norm

</>

pub fn load_layer_norm(
  weights: dict.Dict(String, Tensor),
  prefix: String,
  num_features: Int,
) -> Result(LayerNorm, HfLoadError)

Load a LayerNorm from prefix <> ".weight" (scale) and prefix <> ".bias", both [num_features].

load_model

</>

pub fn load_model(path: String) -> Result(ModelHandle, String)

Load a Llama-family HuggingFace SafeTensors model into an opaque handle.

This caches the tokenizer, embedding table, blocked FP8 layer weights, final RMSNorm, lm_head, and RoPE frequencies for repeated generation.

load_multi_head_attention

</>

pub fn load_multi_head_attention(
  weights: dict.Dict(String, Tensor),
  prefix: String,
  num_heads: Int,
  embed_dim: Int,
) -> Result(MultiHeadAttention, HfLoadError)

Load a MultiHeadAttention from q_proj/k_proj/v_proj/out_proj (weight + bias each) under the supplied prefix.

load_safetensors_dict

</>

pub fn load_safetensors_dict(
  path: String,
) -> Result(dict.Dict(String, Tensor), HfLoadError)

Read a .safetensors file into a Dict(String, Tensor), mapping I/O failures into HfLoadError.IoError.

load_transformer

</>

pub fn load_transformer(
  weights: dict.Dict(String, Tensor),
  num_enc_layers: Int,
  num_dec_layers: Int,
  embed_dim: Int,
  num_heads: Int,
  hidden_dim: Int,
  activation: Activation,
) -> Result(Transformer, HfLoadError)

Load a full Transformer (encoder stack + decoder stack) under the conventional encoder.layers.{i} / decoder.layers.{i} prefixes.

log

</>

pub fn log(t: Tensor) -> Tensor

Natural logarithm for every element.

log_softmax

</>

pub fn log_softmax(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Log-softmax along axis: x - max - log(sum(exp(x - max))).

logical_and

</>

pub fn logical_and(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Logical AND over numeric masks with broadcasting.

logical_not

</>

pub fn logical_not(t: Tensor) -> Tensor

Logical NOT over a numeric mask.

logical_or

</>

pub fn logical_or(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Logical OR over numeric masks with broadcasting.

logical_xor

</>

pub fn logical_xor(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Logical XOR over numeric masks with broadcasting.

logspace

</>

pub fn logspace(
  start: Float,
  stop: Float,
  steps: Int,
  base: Float,
) -> Tensor

Create a 1D tensor with logarithmically spaced values.

lstm_cell_init

</>

pub fn lstm_cell_init(
  input_size: Int,
  hidden_size: Int,
) -> LstmCell

Build an LSTM cell with Xavier-initialized stacked weights and zero biases.

lstm_cell_step

</>

pub fn lstm_cell_step(
  cell: LstmCell,
  input: Tensor,
  hidden: Tensor,
  cell_state: Tensor,
) -> Result(#(Tensor, Tensor), TensorError)

One LSTM time step. Returns #(new_hidden, new_cell_state).

lstm_sequence

</>

pub fn lstm_sequence(
  cell: LstmCell,
  inputs: List(Tensor),
  initial_hidden: Tensor,
  initial_cell: Tensor,
) -> Result(#(List(Tensor), Tensor, Tensor), TensorError)

Run an LSTM cell over a list of time steps. Returns #(all_hidden_states, final_hidden, final_cell_state).

lu

</>

pub fn lu(
  a: Tensor,
) -> Result(#(Tensor, Tensor, List(Int)), TensorError)

LU decomposition with partial pivoting. Returns #(L, U, perm).

manhattan_distance

</>

pub fn manhattan_distance(a: Tensor, b: Tensor) -> Float

Manhattan distance between two same-shaped tensors, flattened as vectors.

map

</>

pub fn map(t: Tensor, f: fn(Float) -> Float) -> Tensor

Apply a function to each element.

map2

</>

pub fn map2(
  a: Tensor,
  b: Tensor,
  f: fn(Float, Float) -> Float,
) -> Result(Tensor, TensorError)

Apply a binary function element-wise over tensors with the same shape.

mask_select

</>

pub fn mask_select(
  t: Tensor,
  mask: Tensor,
) -> Result(Tensor, TensorError)

Select elements of t where the same-shaped mask tensor is non-zero.

masked_select

</>

pub fn masked_select(t: Tensor, mask: Tensor) -> Tensor

Select flattened values where a broadcasted mask is non-zero.

matmul

</>

pub fn matmul(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Matrix-matrix multiplication

matmul_accelerated

</>

pub fn matmul_accelerated(
  a: AcceleratedTensor,
  b: AcceleratedTensor,
) -> Result(AcceleratedTensor, TensorError)

Matrix multiplication between persistent accelerated tensors.

matmul_accelerated_into

</>

pub fn matmul_accelerated_into(
  out: AcceleratedTensor,
  a: AcceleratedTensor,
  b: AcceleratedTensor,
) -> Result(Nil, TensorError)

Write out = a @ b into a persistent accelerated output buffer.

matmul_auto

</>

pub fn matmul_auto(
  a: Tensor,
  b: Tensor,
) -> Result(AcceleratedTensor, TensorError)

Matrix multiplication with priority: RTX 4090 first, then MKL/native CPU.

matmul_backward

</>

pub fn matmul_backward(
  grad_out: Tensor,
  a: Tensor,
  b: Tensor,
) -> Result(#(Tensor, Tensor), TensorError)

Backward for matmul. Returns #(grad_a, grad_b). Same math as linear_backward, exposed for the user-facing matmul. See viva_tensor/nn/backward.matmul_backward.

matmul_gelu_accelerated_into

</>

pub fn matmul_gelu_accelerated_into(
  out: AcceleratedTensor,
  a: AcceleratedTensor,
  b: AcceleratedTensor,
) -> Result(Nil, TensorError)

Write out = gelu(a @ b) using the FP16 Tensor Core fused epilogue.

matmul_into

</>

pub fn matmul_into(
  out: Tensor,
  a: Tensor,
  b: Tensor,
) -> Result(Nil, TensorError)

Write out = a @ b into a preallocated native tensor.

matmul_planned

</>

pub fn matmul_planned(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

matmul_relu_accelerated_into

</>

pub fn matmul_relu_accelerated_into(
  out: AcceleratedTensor,
  a: AcceleratedTensor,
  b: AcceleratedTensor,
) -> Result(Nil, TensorError)

Write out = relu(a @ b) using the FP16 Tensor Core fused epilogue.

matmul_vec

</>

pub fn matmul_vec(
  mat: Tensor,
  vec: Tensor,
) -> Result(Tensor, TensorError)

Matrix-vector multiplication

matrix

</>

pub fn matrix(
  rows: Int,
  cols: Int,
  data: List(Float),
) -> Result(Tensor, TensorError)

Create matrix (2D tensor)

max

</>

pub fn max(t: Tensor) -> Float

Maximum value

max_axis

</>

pub fn max_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Maximum along one axis.

max_axis_keepdims

</>

pub fn max_axis_keepdims(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Maximum along one axis, preserving the reduced dimension as size 1.

max_pool2d

</>

pub fn max_pool2d(
  input: Tensor,
  pool_h: Int,
  pool_w: Int,
  stride_h: Int,
  stride_w: Int,
) -> Result(Tensor, TensorError)

Max pooling 2D

max_pool_1d_forward

</>

pub fn max_pool_1d_forward(
  config: MaxPool1dConfig,
  input: Tensor,
) -> Result(Tensor, TensorError)

1D max pooling. Input [batch, channels, length], output [batch, channels, (length + 2*padding - kernel_size) / stride + 1].

max_pool_2d_with_indices

</>

pub fn max_pool_2d_with_indices(
  input: Tensor,
  kernel_size: Int,
  stride: Int,
  padding: Int,
) -> Result(#(Tensor, Tensor), TensorError)

Run a 2D max-pool returning both pooled values and the flat argmax index per output cell. Input [N, C, H, W]; outputs are both [N, C, H_out, W_out]. Indices are stored as Float (truncated by the unpool consumer); fully-padded windows get -1.0.

max_unpool_2d_forward

</>

pub fn max_unpool_2d_forward(
  config: MaxUnpool2dConfig,
  input: Tensor,
  indices: Tensor,
  output_size: #(Int, Int),
) -> Result(Tensor, TensorError)

Inverse of max_pool_2d_with_indices. Scatters pooled values back at the stored indices, zeros elsewhere. Input [N, C, H_out, W_out], indices [N, C, H_out, W_out], output [N, C, H_in, W_in] where (H_in, W_in) comes from output_size.

maximum

</>

pub fn maximum(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Element-wise maximum with NumPy-style broadcasting.

mean

</>

pub fn mean(t: Tensor) -> Float

Mean of all elements

mean_absolute_error

</>

pub fn mean_absolute_error(
  predictions: Tensor,
  targets: Tensor,
) -> Result(Float, TensorError)

Mean Absolute Error: (1/N) * sum_i |pred_i - target_i|.

mean_absolute_percentage_error

</>

pub fn mean_absolute_percentage_error(
  predictions: Tensor,
  targets: Tensor,
) -> Result(Float, TensorError)

Mean Absolute Percentage Error: (100/N) * sum_i |pred_i - target_i| / |target_i|.

mean_axis

</>

pub fn mean_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Mean along one axis.

mean_axis_keepdims

</>

pub fn mean_axis_keepdims(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Mean along one axis, preserving the reduced dimension as size 1.

mean_iou

</>

pub fn mean_iou(
  predictions: Tensor,
  targets: Tensor,
  num_classes: Int,
) -> Result(Float, TensorError)

Mean of per-class IoU.

mean_squared_error

</>

pub fn mean_squared_error(
  predictions: Tensor,
  targets: Tensor,
) -> Result(Float, TensorError)

Mean Squared Error: (1/N) * sum_i (pred_i - target_i)^2.

measure_tflops

</>

pub fn measure_tflops(
  backend: TflopsBackend,
  m: Int,
  n: Int,
  k: Int,
) -> TflopsResult

Measure TFLOPS for a single matmul operation

measure_tflops_averaged

</>

pub fn measure_tflops_averaged(
  backend: TflopsBackend,
  m: Int,
  n: Int,
  k: Int,
  iterations: Int,
) -> TflopsResult

Measure averaged TFLOPS (warmup + iterations)

median

</>

pub fn median(t: Tensor) -> Float

Median value.

min

</>

pub fn min(t: Tensor) -> Float

Minimum value

min_axis

</>

pub fn min_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Minimum along one axis.

min_axis_keepdims

</>

pub fn min_axis_keepdims(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Minimum along one axis, preserving the reduced dimension as size 1.

minimum

</>

pub fn minimum(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Element-wise minimum with NumPy-style broadcasting.

minmax_scale

</>

pub fn minmax_scale(
  t: Tensor,
  feature_min: Float,
  feature_max: Float,
) -> Tensor

Scale all values into a target interval.

mish

</>

pub fn mish(t: Tensor) -> Tensor

Mish: x * tanh(softplus(x)).

mixup

</>

pub fn mixup(
  images: Tensor,
  labels: Tensor,
  num_classes: Int,
  alpha: Float,
) -> Result(#(Tensor, Tensor), TensorError)

MixUp on a batch — convex combination of images + soft labels with mixing ratio drawn from Beta(alpha, alpha).

moe_block_forward

</>

pub fn moe_block_forward(
  block: MoeBlock,
  tokens: Tensor,
) -> Result(#(Tensor, Tensor), TensorError)

Run the MoE block forward pass on [tokens, embed_dim] input.

moe_block_init

</>

pub fn moe_block_init(
  embed_dim: Int,
  hidden_dim: Int,
  num_experts: Int,
  top_k: Int,
) -> Result(MoeBlock, TensorError)

Build a MoeBlock with zero-weight experts and a fresh router.

mse_loss

</>

pub fn mse_loss(
  prediction: Tensor,
  target: Tensor,
  reduction: Reduction,
) -> Result(Tensor, TensorError)

Mean Squared Error. See viva_tensor/nn/losses.mse_loss.

mse_loss_backward

</>

pub fn mse_loss_backward(
  grad_out: Tensor,
  prediction: Tensor,
  target: Tensor,
  reduction: Reduction,
) -> Result(Tensor, TensorError)

Backward for mse_loss. Returns gradient w.r.t. prediction only. See viva_tensor/nn/backward.mse_loss_backward.

mul

</>

pub fn mul(a: Tensor, b: Tensor) -> Result(Tensor, TensorError)

Element-wise multiplication

mul_broadcast

</>

pub fn mul_broadcast(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Multiply with broadcasting

mul_into

</>

pub fn mul_into(
  out: Tensor,
  a: Tensor,
  b: Tensor,
) -> Result(Nil, TensorError)

Write out = a * b into a preallocated native tensor.

multi_head_attention_forward

</>

pub fn multi_head_attention_forward(
  mha: MultiHeadAttention,
  q: Tensor,
  k: Tensor,
  v: Tensor,
  is_causal: Bool,
) -> Result(Tensor, TensorError)

Multi-Head Attention forward pass.

multi_head_attention_init

</>

pub fn multi_head_attention_init(
  num_heads: Int,
  embed_dim: Int,
  use_bias: Bool,
) -> Result(MultiHeadAttention, TensorError)

Initialize a Multi-Head Attention module with zero weights.

native_fill

</>

pub fn native_fill(
  shape: List(Int),
  value: Float,
) -> Result(Tensor, TensorError)

Create a native-backed tensor filled with a value.

native_from_list

</>

pub fn native_from_list(
  data: List(Float),
  shape: List(Int),
) -> Result(Tensor, TensorError)

Create a native-backed tensor from row-major list data.

native_ones

</>

pub fn native_ones(
  shape: List(Int),
) -> Result(Tensor, TensorError)

Create a native-backed tensor of ones.

native_ref

</>

pub fn native_ref(t: Tensor) -> Result(NativeTensorRef, Nil)

Extract the native NIF tensor resource when present.

native_zeros

</>

pub fn native_zeros(
  shape: List(Int),
) -> Result(Tensor, TensorError)

Create a native-backed tensor of zeros.

negate

</>

pub fn negate(t: Tensor) -> Tensor

Negate every element.

nms

</>

pub fn nms(
  boxes: Tensor,
  scores: Tensor,
  iou_threshold: Float,
) -> Result(List(Int), TensorError)

Greedy Non-Maximum Suppression. boxes [N, 4] (rows [x1, y1, x2, y2]), scores [N]. Returns the indices of kept boxes, sorted by descending score.

nonzero

</>

pub fn nonzero(t: Tensor) -> Result(List(List(Int)), TensorError)

Return multi-dimensional indices of non-zero elements of t (NumPy nonzero).

nonzero_flat

</>

pub fn nonzero_flat(t: Tensor) -> Tensor

Return flattened indices for non-zero values, represented as floats (legacy).

norm

</>

pub fn norm(t: Tensor) -> Float

L2 norm (Euclidean length)

normal

</>

pub fn normal(
  shape: List(Int),
  mean: Float,
  std: Float,
) -> Tensor

Sample each element from N(mean, std^2) via the Box-Muller transform. See viva_tensor/nn/init.normal.

normalize

</>

pub fn normalize(t: Tensor) -> Tensor

Normalize to unit length

not_equal

</>

pub fn not_equal(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Element-wise inequality mask with NumPy-style broadcasting.

nvfp4_block_scaled_layout

</>

pub fn nvfp4_block_scaled_layout(shape: List(Int)) -> QuantLayout

Describe a Rubin-ready NVFP4 block-scaled layout using 16-value micro-blocks.

one_cycle_lr

</>

pub fn one_cycle_lr(
  base_lr: Float,
  max_lr: Float,
  total_steps: Int,
  pct_start: Float,
) -> Scheduler

OneCycleLR: linear warmup base_lr -> max_lr for the first pct_start * total_steps steps, then cosine anneal max_lr -> base_lr.

ones

</>

pub fn ones(shape: List(Int)) -> Tensor

Create tensor of ones

ones_like

</>

pub fn ones_like(t: Tensor) -> Tensor

Create a tensor with the same shape as another tensor, filled with ones.

onnx_parse_graph

</>

pub fn onnx_parse_graph(
  json_str: String,
) -> Result(OnnxGraph, OnnxError)

Parse a JSON-encoded ONNX graph.

Supported op set (v1): Add, Sub, Mul, MatMul, Gemm, Relu, Sigmoid, Tanh, Gelu, Softmax, Transpose, Reshape, Constant, LayerNormalization. See viva_tensor/io/onnx.parse_graph.

onnx_run_graph

</>

pub fn onnx_run_graph(
  graph: OnnxGraph,
  feeds: dict.Dict(String, Tensor),
) -> Result(dict.Dict(String, Tensor), OnnxError)

Execute a parsed ONNX graph against a dict of named input tensors.

Returns the full execution table — pick the named graph outputs from it. See viva_tensor/io/onnx.run_graph for the v1 supported op set.

onnx_supported_ops

</>

pub fn onnx_supported_ops() -> List(String)

Return the list of ONNX op_types supported by onnx_run_graph in v1.

orthogonal

</>

pub fn orthogonal(
  rows: Int,
  cols: Int,
  gain: Float,
) -> Result(Tensor, TensorError)

Orthogonal init via QR. Returns [rows, cols] with orthonormal columns (or rows, when rows < cols). See viva_tensor/nn/init.orthogonal.

outer

</>

pub fn outer(a: Tensor, b: Tensor) -> Result(Tensor, TensorError)

Outer product

pad2d

</>

pub fn pad2d(
  t: Tensor,
  pad_h: Int,
  pad_w: Int,
) -> Result(Tensor, TensorError)

Pad 2D tensor with zeros

pad4d

</>

pub fn pad4d(
  t: Tensor,
  pad_h: Int,
  pad_w: Int,
) -> Result(Tensor, TensorError)

Pad 4D tensor with zeros

pad_or_truncate

</>

pub fn pad_or_truncate(
  ids: List(Int),
  max_length: Int,
  pad_id: Int,
) -> List(Int)

Pad or truncate a list of ids to max_length using pad_id.

Example

import viva_tensor as t
let _ = t.pad_or_truncate([1, 2], 4, 0)

percentile

</>

pub fn percentile(t: Tensor, percentile: Int) -> Float

Percentile using linear interpolation between closest ranks.

plan_backend

</>

pub fn plan_backend(
  operation: TensorOperation,
) -> TensorBackendPlan

Plan which backend should handle an operation on this VM.

plan_runtime

</>

pub fn plan_runtime(
  spec: TensorSpec,
  operation: RuntimeOp,
) -> RuntimePlan

Plan a runtime operation from dtype/device/layout metadata.

precision

</>

pub fn precision(
  predictions: Tensor,
  targets: Tensor,
  num_classes: Int,
  average: Average,
) -> Result(Float, TensorError)

Precision aggregated by the chosen Average.

prepack_fp8_weight

</>

pub fn prepack_fp8_weight(
  weight: Tensor,
) -> Result(PackedWeightFp8, TensorError)

Quantize + lay out a dense weight for the CUTLASS FP8 GEMM path.

prepack_int4_sparse_24_weight

</>

pub fn prepack_int4_sparse_24_weight(
  weight: Tensor,
) -> Result(PackedWeightInt4Sparse, TensorError)

Quantize + 2:4-prune a weight into the INT4 sparse layout (the highest-TFLOPS inference path on Ada SM89).

prepack_int8_sparse_24_weight

</>

pub fn prepack_int8_sparse_24_weight(
  weight: Tensor,
) -> Result(PackedWeightInt8Sparse, TensorError)

Quantize + 2:4-prune a weight into the INT8 sparse layout.

product

</>

pub fn product(t: Tensor) -> Float

Product of all elements.

qr

</>

pub fn qr(a: Tensor) -> Result(#(Tensor, Tensor), TensorError)

QR decomposition via classical Gram-Schmidt. Returns #(Q, R).

quant_layout_compression_ratio_against

</>

pub fn quant_layout_compression_ratio_against(
  layout: QuantLayout,
  baseline_bits_per_value: Int,
) -> Float

Estimate compression ratio versus a baseline element width.

quant_layout_is_rubin_native_candidate

</>

pub fn quant_layout_is_rubin_native_candidate(
  layout: QuantLayout,
) -> Bool

Check whether a layout matches Rubin-style native micro-block assumptions.

quant_layout_memory_bytes

</>

pub fn quant_layout_memory_bytes(layout: QuantLayout) -> Int

Estimate payload bytes for a quantized layout.

r_squared

</>

pub fn r_squared(
  predictions: Tensor,
  targets: Tensor,
) -> Result(Float, TensorError)

Coefficient of determination: 1 - SS_res / SS_tot.

random_normal

</>

pub fn random_normal(
  shape: List(Int),
  mean: Float,
  std: Float,
) -> Tensor

Tensor with normal random values

random_uniform

</>

pub fn random_uniform(shape: List(Int)) -> Tensor

Random uniform [0, 1)

rank

</>

pub fn rank(t: Tensor) -> Int

Get rank (number of dimensions)

recall

</>

pub fn recall(
  predictions: Tensor,
  targets: Tensor,
  num_classes: Int,
  average: Average,
) -> Result(Float, TensorError)

Recall aggregated by the chosen Average.

reciprocal

</>

pub fn reciprocal(t: Tensor) -> Tensor

Reciprocal for every element.

reduction_mean

</>

pub const reduction_mean: Reduction

ReductionMean returns the arithmetic mean as a 1-element tensor.

reduction_none

</>

pub const reduction_none: Reduction

ReductionNone keeps the per-element loss tensor.

reduction_sum

</>

pub const reduction_sum: Reduction

ReductionSum returns the sum as a 1-element tensor.

relu

</>

pub fn relu(t: Tensor) -> Tensor

Rectified Linear Unit: max(0, x).

relu_backward

</>

pub fn relu_backward(
  grad_out: Tensor,
  input: Tensor,
) -> Result(Tensor, TensorError)

Backward for relu. See viva_tensor/nn/backward.relu_backward.

relu_gain

</>

pub fn relu_gain() -> Float

sqrt(2) — gain for layers followed by ReLU.

reshape

</>

pub fn reshape(
  t: Tensor,
  new_shape: List(Int),
) -> Result(Tensor, TensorError)

Reshape (total size must match)

resize_bilinear

</>

pub const resize_bilinear: ResizeMode

ResizeMode.Bilinear re-export for ergonomic call sites.

resize_nearest

</>

pub const resize_nearest: ResizeMode

ResizeMode.Nearest re-export for ergonomic call sites.

rms_norm_backward

</>

pub fn rms_norm_backward(
  grad_out: Tensor,
  input: Tensor,
  scale: Tensor,
  rms: Tensor,
  eps: Float,
) -> Result(#(Tensor, Tensor), TensorError)

Backward for rms_norm over the last dimension. Requires the rms saved from the forward pass. eps is retained for signature symmetry. See viva_tensor/nn/backward.rms_norm_backward.

rms_norm_forward

</>

pub fn rms_norm_forward(
  layer: RmsNorm,
  input: Tensor,
) -> Result(Tensor, TensorError)

Forward pass for RmsNorm — RMS normalize along the last dimension.

rms_norm_init

</>

pub fn rms_norm_init(num_features: Int) -> RmsNorm

Initialize an RmsNorm with default eps = 1.0e-6.

rms_norm_init_with_eps

</>

pub fn rms_norm_init_with_eps(
  num_features: Int,
  eps: Float,
) -> RmsNorm

Initialize an RmsNorm with custom eps.

rmsprop

</>

pub fn rmsprop(lr: Float, alpha: Float, eps: Float) -> Optimizer

RMSprop. See viva_tensor/nn/optim.

rnn_cell_init

</>

pub fn rnn_cell_init(
  input_size: Int,
  hidden_size: Int,
) -> RnnCell

Build an Elman RNN cell with Xavier-initialized weights and zero biases.

rnn_cell_step

</>

pub fn rnn_cell_step(
  cell: RnnCell,
  input: Tensor,
  hidden: Tensor,
) -> Result(Tensor, TensorError)

One Elman RNN time step: h' = tanh(W_ih @ x + b_ih + W_hh @ h + b_hh).

rnn_sequence

</>

pub fn rnn_sequence(
  cell: RnnCell,
  inputs: List(Tensor),
  initial_hidden: Tensor,
) -> Result(#(List(Tensor), Tensor), TensorError)

Run an Elman RNN cell over a list of time steps.

roi_align

</>

pub fn roi_align(
  config: RoiAlignConfig,
  features: Tensor,
  rois: Tensor,
) -> Result(Tensor, TensorError)

Bilinear ROIAlign. features [N, C, H, W], rois [K, 5] with rows [batch_index, x1, y1, x2, y2]. Output [K, C, output_h, output_w].

root_mean_squared_error

</>

pub fn root_mean_squared_error(
  predictions: Tensor,
  targets: Tensor,
) -> Result(Float, TensorError)

Root Mean Squared Error: sqrt(MSE).

rope

</>

pub fn rope(
  input: Tensor,
  base: Float,
) -> Result(Tensor, TensorError)

Apply Rotary Positional Embedding (RoPE) to a [seq_len, dim] tensor.

round

</>

pub fn round(t: Tensor) -> Tensor

Round every element to the nearest integer value.

router_init

</>

pub fn router_init(
  embed_dim: Int,
  num_experts: Int,
  top_k: Int,
) -> Router

Build a Router with a zero-filled gate [embed_dim, num_experts].

router_route

</>

pub fn router_route(
  router: Router,
  tokens: Tensor,
) -> Result(#(Tensor, Tensor, Tensor), TensorError)

Route [tokens, embed_dim] through the gate, returning #(expert_ids, expert_weights, aux_loss).

runtime_cache_key

</>

pub fn runtime_cache_key(plan: RuntimePlan) -> String

Return the stable cache key for a runtime plan.

safetensors_read

</>

pub fn safetensors_read(
  path: String,
) -> Result(dict.Dict(String, Tensor), TensorError)

Read a SafeTensors file into a Dict(String, Tensor).

Supports F32 and F64 payloads. See viva_tensor/io/safetensors.read.

Example

import gleam/dict
import viva_tensor as t

let assert Ok(weights) = t.safetensors_read("./model.safetensors")
let _ = dict.get(weights, "encoder.weight")

safetensors_write

</>

pub fn safetensors_write(
  path: String,
  tensors: dict.Dict(String, Tensor),
) -> Result(Nil, TensorError)

Write a Dict(String, Tensor) to disk in SafeTensors format (F64 payload).

Example

import gleam/dict
import viva_tensor as t

let weights = dict.from_list([#("w", t.ones([2, 2]))])
let assert Ok(Nil) = t.safetensors_write("./out.safetensors", weights)

sample

</>

pub fn sample(
  config: SamplerConfig,
  state: SchedulerState,
  shape: List(Int),
  model_fn: fn(Tensor, Int) -> Result(Tensor, TensorError),
) -> Result(Tensor, TensorError)

Full reverse sampling loop. Calls model_fn at each step.

sample_token

</>

pub fn sample_token(
  logits: Tensor,
  config: SamplingConfig,
) -> Result(Int, TensorError)

Temperature + top-k + top-p sample from a 1-D logits tensor.

Named sample_token to avoid colliding with viva_tensor.sample from the diffusion samplers re-export.

scale

</>

pub fn scale(t: Tensor, s: Float) -> Tensor

Scale by constant

scale_into

</>

pub fn scale_into(
  out: Tensor,
  a: Tensor,
  scalar: Float,
) -> Result(Nil, TensorError)

Write out = a * scalar into a preallocated native tensor.

scaled_dot_product_attention

</>

pub fn scaled_dot_product_attention(
  q: Tensor,
  k: Tensor,
  v: Tensor,
  mask: option.Option(Tensor),
  is_causal: Bool,
) -> Result(Tensor, TensorError)

Scaled dot-product attention: softmax((Q @ K^T) / sqrt(d_k)) @ V. See the viva_tensor/nn/attention docs.

scheduler_lr

</>

pub fn scheduler_lr(s: Scheduler) -> Float

Compute the learning rate at the scheduler’s current step without advancing.

scheduler_step

</>

pub fn scheduler_step(s: Scheduler) -> #(Scheduler, Float)

Advance the scheduler by one step and return the new learning rate.

selu

</>

pub fn selu(t: Tensor) -> Tensor

Scaled ELU with the canonical SELU constants from Klambauer et al. (2017).

sentence_piece_bpe

</>

pub fn sentence_piece_bpe(
  bpe: BpeTokenizer,
) -> SentencePieceTokenizer

Wrap a BpeTokenizer as a SentencePieceTokenizer. Encoding uses the greedy merge loop on input renormalized to the ▁-prefix convention.

Example

import viva_tensor as t
let inner = t.bpe_tokenizer_from_vocab_and_merges(
  ["?", "▁", "h", "i"], [], "?",
)
let _ = t.sentence_piece_bpe(inner)

sentence_piece_decode

</>

pub fn sentence_piece_decode(
  tokenizer: SentencePieceTokenizer,
  ids: List(Int),
) -> String

Decode ids with a SentencePieceTokenizer. Reuses the inner Unigram or BPE decoder and maps ▁ back to an ASCII space.

Example

import viva_tensor as t
let inner = t.unigram_tokenizer_from_pieces(
  [#("<unk>", -100.0), #("<s>", -100.0), #("</s>", -100.0),
   #("▁hi", -1.0)],
  "<unk>", "<s>", "</s>",
)
let _ = t.sentence_piece_decode(t.sentence_piece_unigram(inner), [1, 3, 2])

sentence_piece_encode

</>

pub fn sentence_piece_encode(
  tokenizer: SentencePieceTokenizer,
  text: String,
) -> List(Int)

Encode text with a SentencePieceTokenizer. Dispatches on the wrapper’s mode (Viterbi for SpUnigram, greedy merges for SpBpe).

Example

import viva_tensor as t
let inner = t.unigram_tokenizer_from_pieces(
  [#("<unk>", -100.0), #("<s>", -100.0), #("</s>", -100.0),
   #("▁hi", -1.0)],
  "<unk>", "<s>", "</s>",
)
let _ = t.sentence_piece_encode(t.sentence_piece_unigram(inner), "hi")

sentence_piece_unigram

</>

pub fn sentence_piece_unigram(
  unigram: UnigramTokenizer,
) -> SentencePieceTokenizer

Wrap a UnigramTokenizer as a SentencePieceTokenizer. Encoding goes through Viterbi.

Example

import viva_tensor as t
let inner = t.unigram_tokenizer_from_pieces(
  [#("<unk>", -100.0), #("<s>", -100.0), #("</s>", -100.0),
   #("▁hi", -1.0)],
  "<unk>", "<s>", "</s>",
)
let _ = t.sentence_piece_unigram(inner)

sgd

</>

pub fn sgd(lr: Float) -> Optimizer

Vanilla stochastic gradient descent. See viva_tensor/nn/optim.

sgd_momentum

</>

pub fn sgd_momentum(lr: Float, momentum: Float) -> Optimizer

SGD with momentum. See viva_tensor/nn/optim.

shape

</>

pub fn shape(t: Tensor) -> List(Int)

Shape as list of dimensions

sigmoid

</>

pub fn sigmoid(t: Tensor) -> Tensor

Sigmoid activation: 1 / (1 + exp(-x)). Numerically stable for large negative inputs via exp(x) / (1 + exp(x)).

sigmoid_backward

</>

pub fn sigmoid_backward(
  grad_out: Tensor,
  output: Tensor,
) -> Result(Tensor, TensorError)

Backward for sigmoid. Takes the sigmoid output, not the original input. See viva_tensor/nn/backward.sigmoid_backward.

sigmoid_gain

</>

pub fn sigmoid_gain() -> Float

1.0 — gain for layers followed by sigmoid.

sign

</>

pub fn sign(t: Tensor) -> Tensor

Return -1, 0, or 1 for each element.

sinusoidal_encoding

</>

pub fn sinusoidal_encoding(
  max_len: Int,
  embedding_dim: Int,
) -> Result(Tensor, TensorError)

Sinusoidal positional encoding (“Attention Is All You Need”).

size

</>

pub fn size(t: Tensor) -> Int

Get total size

softmax

</>

pub fn softmax(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Softmax along axis: exp(x - max) / sum(exp(x - max)).

softmax_axis

</>

pub fn softmax_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Softmax along one axis, preserving shape and normalizing each slice.

softmax_backward

</>

pub fn softmax_backward(
  grad_out: Tensor,
  output: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Backward for softmax along axis. Takes the softmax output of the forward pass. See viva_tensor/nn/backward.softmax_backward.

softplus

</>

pub fn softplus(t: Tensor) -> Tensor

Softplus: log(1 + exp(x)), numerically stable.

solve

</>

pub fn solve(a: Tensor, b: Tensor) -> Result(Tensor, TensorError)

Solve A x = b for a square A using Gaussian elimination with partial pivoting. b may be 1D or 2D.

spec_from_parts

</>

pub fn spec_from_parts(
  shape shape: List(Int),
  dtype dtype: TensorDtype,
  device device: TensorDevice,
  storage storage: TensorStorage,
  memory_layout memory_layout: TensorMemoryLayout,
) -> TensorSpec

Build a runtime spec from explicit metadata.

spec_key

</>

pub fn spec_key(spec: TensorSpec) -> String

Stable tensor spec cache key.

speculative_decode

</>

pub fn speculative_decode(
  config: SpeculativeConfig,
  initial_tokens: List(Int),
  draft_fn: fn(List(Int)) -> Result(Tensor, TensorError),
  verify_fn: fn(List(Int)) -> Result(Tensor, TensorError),
) -> Result(List(Int), TensorError)

Speculative decoding (Chen 2023 / Leviathan 2023).

sqrt

</>

pub fn sqrt(t: Tensor) -> Tensor

Square root of every element.

square

</>

pub fn square(t: Tensor) -> Tensor

Square every element.

squeeze

</>

pub fn squeeze(t: Tensor) -> Tensor

Remove dimensions of size 1

standardize

</>

pub fn standardize(t: Tensor) -> Tensor

Alias for zscore.

std

</>

pub fn std(t: Tensor) -> Float

Standard deviation

step

</>

pub fn step(
  opt: Optimizer,
  params: List(Param),
  grads: List(GradPair),
) -> Result(#(Optimizer, List(Param)), TensorError)

Apply one optimizer step. See viva_tensor/nn/optim.step.

step_lr

</>

pub fn step_lr(
  base_lr: Float,
  step_size: Int,
  gamma: Float,
) -> Scheduler

StepLR: lr = base_lr * gamma^floor(step / step_size) — staircase decay.

sub

</>

pub fn sub(a: Tensor, b: Tensor) -> Result(Tensor, TensorError)

Element-wise subtraction

sub_broadcast

</>

pub fn sub_broadcast(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Subtract with broadcasting

sub_into

</>

pub fn sub_into(
  out: Tensor,
  a: Tensor,
  b: Tensor,
) -> Result(Nil, TensorError)

Write out = a - b into a preallocated native tensor.

sum

</>

pub fn sum(t: Tensor) -> Float

Sum of all elements.

sum_axis

</>

pub fn sum_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Sum along one axis.

sum_axis_keepdims

</>

pub fn sum_axis_keepdims(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Sum along one axis, preserving the reduced dimension as size 1.

sum_grads

</>

pub const sum_grads: GradAggregation

Sum per-worker gradients before applying.

svd

</>

pub fn svd(
  a: Tensor,
) -> Result(#(Tensor, Tensor, Tensor), TensorError)

SVD stub (not implemented in v1).

swish

</>

pub fn swish(t: Tensor) -> Tensor

Swish / SiLU: x * sigmoid(x).

synchronous_train_step

</>

pub fn synchronous_train_step(
  opt: Optimizer,
  params: List(Param),
  per_worker_grads: List(List(GradPair)),
  aggregation: GradAggregation,
) -> Result(#(Optimizer, List(Param)), TensorError)

Apply one synchronous data-parallel optimizer step. See viva_tensor/distributed/trainer.synchronous_train_step.

t5_decoder_block_forward

</>

pub fn t5_decoder_block_forward(
  block: T5Block,
  input: Tensor,
  memory: Tensor,
) -> Result(Tensor, TensorError)

Run a T5 decoder block on [seq_len, embed_dim] input attending to memory (encoder output).

t5_decoder_block_init

</>

pub fn t5_decoder_block_init(
  embed_dim: Int,
  num_heads: Int,
  ffn_hidden_dim: Int,
) -> Result(T5Block, TensorError)

Build a zero-weight T5 decoder block (causal self-attn + cross-attn).

t5_encoder_block_forward

</>

pub fn t5_encoder_block_forward(
  block: T5Block,
  input: Tensor,
) -> Result(Tensor, TensorError)

Run a T5 encoder block on [seq_len, embed_dim] input.

t5_encoder_block_init

</>

pub fn t5_encoder_block_init(
  embed_dim: Int,
  num_heads: Int,
  ffn_hidden_dim: Int,
) -> Result(T5Block, TensorError)

Build a zero-weight T5 encoder block.

t5_model_forward

</>

pub fn t5_model_forward(
  model: T5Model,
  src_token_ids: Tensor,
  tgt_token_ids: Tensor,
) -> Result(Tensor, TensorError)

End-to-end T5 forward: src_token_ids + tgt_token_ids -> logits.

t5_model_init

</>

pub fn t5_model_init(
  num_encoder_layers: Int,
  num_decoder_layers: Int,
  vocab_size: Int,
  embed_dim: Int,
  num_heads: Int,
  ffn_hidden_dim: Int,
) -> Result(T5Model, TensorError)

Build a zero-weight T5 model with num_encoder_layers + num_decoder_layers.

take

</>

pub fn take(
  t: Tensor,
  indices: List(Int),
  axis: Int,
) -> Result(Tensor, TensorError)

Gather slices along axis at each of the given indices (NumPy-style take).

take_flat

</>

pub fn take_flat(t: Tensor, indices: List(Int)) -> Tensor

Take flattened elements by explicit indices (legacy: ignores tensor shape).

tanh

</>

pub fn tanh(t: Tensor) -> Tensor

Hyperbolic tangent activation. Output range (-1, 1).

tanh_backward

</>

pub fn tanh_backward(
  grad_out: Tensor,
  output: Tensor,
) -> Result(Tensor, TensorError)

Backward for tanh. Takes the tanh output, not the original input. See viva_tensor/nn/backward.tanh_backward.

tanh_gain

</>

pub fn tanh_gain() -> Float

5/3 — gain for layers followed by tanh.

tensor_spec

</>

pub fn tensor_spec(t: Tensor) -> TensorSpec

Build a runtime spec from an existing tensor.

tensor_to_ids

</>

pub fn tensor_to_ids(tensor: Tensor) -> List(Int)

Convert a [seq_len] integer-valued tensor back to a List(Int).

Example

import viva_tensor as t
let _ = t.tensor_to_ids(t.ids_to_tensor([1, 2, 3]))

to_accelerated

</>

pub fn to_accelerated(
  t: Tensor,
) -> Result(AcceleratedTensor, TensorError)

Move a tensor to the best persistent backend: RTX 4090 first, then MKL/CPU.

to_contiguous

</>

pub fn to_contiguous(t: Tensor) -> Tensor

Convert to a contiguous tensor.

to_list

</>

pub fn to_list(t: Tensor) -> List(Float)

Convert the tensor's elements to a List(Float).

to_rtx4090_fp16

</>

pub fn to_rtx4090_fp16(
  t: Tensor,
) -> Result(AcceleratedTensor, TensorError)

Upload a tensor to persistent RTX 4090 FP16 memory.

to_rtx4090_fp32

</>

pub fn to_rtx4090_fp32(
  t: Tensor,
) -> Result(AcceleratedTensor, TensorError)

Upload a tensor to persistent RTX 4090 FP32 memory.

to_strided

</>

pub fn to_strided(t: Tensor) -> Tensor

Convert to a strided representation for O(1) element access.

to_string

</>

pub fn to_string(t: Tensor) -> String

Render a tensor as a pretty multi-line string with column alignment and elision for large tensors.

to_string_with

</>

pub fn to_string_with(t: Tensor, opts: PrintOptions) -> String

Render a tensor with caller-supplied print options.

top_k_accuracy

</>

pub fn top_k_accuracy(
  logits: Tensor,
  targets: Tensor,
  k: Int,
) -> Result(Float, TensorError)

Top-K accuracy on 2D logits with 1D class-index targets.

top_k_filter

</>

pub fn top_k_filter(
  logits: Tensor,
  k: Int,
) -> Result(Tensor, TensorError)

Mask logits outside the top-k (set to a large negative sentinel).

top_p_filter

</>

pub fn top_p_filter(
  logits: Tensor,
  p: Float,
) -> Result(Tensor, TensorError)

Nucleus (top-p) filter — mask the smallest-probability tail until the cumulative probability exceeds p.

traced_add

</>

pub fn traced_add(
  tape: @internal Tape,
  a: @internal Variable,
  b: @internal Variable,
) -> Result(@internal Traced(@internal Variable), TensorError)

Traced add (saves INPUT shapes for broadcast reduction).

traced_gelu

</>

pub fn traced_gelu(
  tape: @internal Tape,
  x: @internal Variable,
) -> Result(@internal Traced(@internal Variable), TensorError)

Traced GELU (exact) forward + backward (saves INPUT in closure).

traced_l1_loss

</>

pub fn traced_l1_loss(
  tape: @internal Tape,
  pred: @internal Variable,
  target: @internal Tensor,
) -> Result(@internal Traced(@internal Variable), TensorError)

Traced L1 loss (target is constant; saves pred-target diff in closure).

traced_layer_norm

</>

pub fn traced_layer_norm(
  tape: @internal Tape,
  x: @internal Variable,
  scale: @internal Variable,
  bias: @internal Variable,
  eps: Float,
) -> Result(@internal Traced(@internal Variable), TensorError)

Traced LayerNorm over last axis (saves FORWARD STATS: x_hat + rstds).

traced_linear

</>

pub fn traced_linear(
  tape: @internal Tape,
  x: @internal Variable,
  w: @internal Variable,
) -> Result(@internal Traced(@internal Variable), TensorError)

Traced linear y = x @ w (saves INPUTS in closure).

traced_matmul

</>

pub fn traced_matmul(
  tape: @internal Tape,
  a: @internal Variable,
  b: @internal Variable,
) -> Result(@internal Traced(@internal Variable), TensorError)

Traced matmul (saves both INPUT operands in closure).

traced_mse_loss

</>

pub fn traced_mse_loss(
  tape: @internal Tape,
  pred: @internal Variable,
  target: @internal Tensor,
) -> Result(@internal Traced(@internal Variable), TensorError)

Traced MSE loss (target is constant; saves pred-target diff in closure).

traced_mul

</>

pub fn traced_mul(
  tape: @internal Tape,
  a: @internal Variable,
  b: @internal Variable,
) -> Result(@internal Traced(@internal Variable), TensorError)

Traced elementwise mul (saves INPUT operands).

traced_relu

</>

pub fn traced_relu(
  tape: @internal Tape,
  x: @internal Variable,
) -> Result(@internal Traced(@internal Variable), TensorError)

Traced ReLU forward + backward (saves INPUT in closure).

traced_scale

</>

pub fn traced_scale(
  tape: @internal Tape,
  x: @internal Variable,
  scalar: Float,
) -> Result(@internal Traced(@internal Variable), TensorError)

Traced scale-by-constant (saves only the float scalar).

traced_sigmoid

</>

pub fn traced_sigmoid(
  tape: @internal Tape,
  x: @internal Variable,
) -> Result(@internal Traced(@internal Variable), TensorError)

Traced sigmoid forward + backward (saves OUTPUT in closure).

traced_softmax

</>

pub fn traced_softmax(
  tape: @internal Tape,
  x: @internal Variable,
  axis: Int,
) -> Result(@internal Traced(@internal Variable), TensorError)

Traced softmax along axis (saves OUTPUT in closure).

traced_sub

</>

pub fn traced_sub(
  tape: @internal Tape,
  a: @internal Variable,
  b: @internal Variable,
) -> Result(@internal Traced(@internal Variable), TensorError)

Traced sub (saves INPUT shapes for broadcast reduction).

traced_tanh

</>

pub fn traced_tanh(
  tape: @internal Tape,
  x: @internal Variable,
) -> Result(@internal Traced(@internal Variable), TensorError)

Traced tanh forward + backward (saves OUTPUT in closure).

train_synchronous

</>

pub fn train_synchronous(
  config: TrainConfig,
  initial_params: List(Param),
  initial_optimizer: Optimizer,
  data_loader: DataLoader,
  compute_grads: fn(Batch, List(Param)) -> Result(
    List(GradPair),
    TensorError,
  ),
  num_steps: Int,
) -> Result(TrainResult, TensorError)

Run synchronous data-parallel SGD. See viva_tensor/distributed/trainer.train_synchronous.

transformer_decode

</>

pub fn transformer_decode(
  model: Transformer,
  tgt: Tensor,
  memory: Tensor,
) -> Result(Tensor, TensorError)

Run tgt through every decoder block, attending to memory per layer.

transformer_encode

</>

pub fn transformer_encode(
  model: Transformer,
  src: Tensor,
) -> Result(Tensor, TensorError)

Run src through every encoder block in order.

transformer_forward

</>

pub fn transformer_forward(
  model: Transformer,
  src: Tensor,
  tgt: Tensor,
) -> Result(Tensor, TensorError)

End-to-end forward: transformer_decode(model, tgt, transformer_encode(model, src)).

transformer_init

</>

pub fn transformer_init(
  num_encoder_layers: Int,
  num_decoder_layers: Int,
  embed_dim: Int,
  num_heads: Int,
  ffn_hidden_dim: Int,
  activation: Activation,
) -> Result(Transformer, TensorError)

Build a full encoder+decoder Transformer stack.

transpose

</>

pub fn transpose(t: Tensor) -> Result(Tensor, TensorError)

Matrix transpose

transpose_strided

</>

pub fn transpose_strided(
  t: Tensor,
) -> Result(Tensor, TensorError)

Zero-copy transpose

truncated_normal

</>

pub fn truncated_normal(
  shape: List(Int),
  mean: Float,
  std: Float,
  a: Float,
  b: Float,
) -> Tensor

Sample each element from N(mean, std^2) truncated to [a, b]. See viva_tensor/nn/init.truncated_normal.

try_abs

</>

pub fn try_abs(t: Tensor) -> Result(Tensor, TensorError)

Absolute value for every element, preserving materialization failures.

try_add_scalar

</>

pub fn try_add_scalar(
  t: Tensor,
  scalar: Float,
) -> Result(Tensor, TensorError)

Add a scalar to every element, preserving materialization failures.

try_all

</>

pub fn try_all(t: Tensor) -> Result(Bool, TensorError)

Check whether all mask values are non-zero, preserving materialization failures.

try_all_axis

</>

pub fn try_all_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Are all values in each axis slice non-zero?

try_all_axis_keepdims

</>

pub fn try_all_axis_keepdims(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Check whether all values in each axis slice are non-zero, preserving the reduced dimension.

try_any

</>

pub fn try_any(t: Tensor) -> Result(Bool, TensorError)

Check whether the mask contains any non-zero value, preserving materialization failures.

try_any_axis

</>

pub fn try_any_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Does each axis slice contain any non-zero value?

try_any_axis_keepdims

</>

pub fn try_any_axis_keepdims(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Check whether each axis slice contains any non-zero value, preserving the reduced dimension.

try_argmax

</>

pub fn try_argmax(t: Tensor) -> Result(Int, TensorError)

Index of maximum value, preserving materialization and empty-tensor errors.

try_argmin

</>

pub fn try_argmin(t: Tensor) -> Result(Int, TensorError)

Index of minimum value, preserving materialization and empty-tensor errors.

try_ceil

</>

pub fn try_ceil(t: Tensor) -> Result(Tensor, TensorError)

Ceiling every element, preserving materialization failures.

try_clamp

</>

pub fn try_clamp(
  t: Tensor,
  min_val: Float,
  max_val: Float,
) -> Result(Tensor, TensorError)

Clamp values, preserving materialization failures.

try_clip

</>

pub fn try_clip(
  t: Tensor,
  min_val: Float,
  max_val: Float,
) -> Result(Tensor, TensorError)

Alias for try_clamp.

try_clip_by_norm

</>

pub fn try_clip_by_norm(
  t: Tensor,
  max_norm: Float,
) -> Result(Tensor, TensorError)

Clip tensor L2 norm to at most max_norm, preserving errors.

try_cosine_similarity

</>

pub fn try_cosine_similarity(
  a: Tensor,
  b: Tensor,
) -> Result(Float, TensorError)

Cosine similarity between two same-shaped tensors, preserving errors.

try_count_nonzero

</>

pub fn try_count_nonzero(t: Tensor) -> Result(Int, TensorError)

Count non-zero values in a tensor, preserving materialization failures.

try_count_nonzero_axis

</>

pub fn try_count_nonzero_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Count non-zero values along one axis.

try_count_nonzero_axis_keepdims

</>

pub fn try_count_nonzero_axis_keepdims(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Count non-zero values along one axis, preserving the reduced dimension.

try_cumprod

</>

pub fn try_cumprod(t: Tensor) -> Result(Tensor, TensorError)

Cumulative product over the flattened tensor, preserving materialization failures.

try_cumprod_axis

</>

pub fn try_cumprod_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Cumulative product along one axis, preserving materialization failures.

try_cumsum

</>

pub fn try_cumsum(t: Tensor) -> Result(Tensor, TensorError)

Cumulative sum over the flattened tensor, preserving materialization failures.

try_cumsum_axis

</>

pub fn try_cumsum_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Cumulative sum along one axis, preserving materialization failures.

try_diag

</>

pub fn try_diag(t: Tensor) -> Result(Tensor, TensorError)

Create a square diagonal matrix from a 1D tensor.

try_dot_similarity

</>

pub fn try_dot_similarity(
  a: Tensor,
  b: Tensor,
) -> Result(Float, TensorError)

Dot similarity between two same-shaped tensors, preserving errors.

try_equal

</>

pub fn try_equal(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Alias for equal.

try_euclidean_distance

</>

pub fn try_euclidean_distance(
  a: Tensor,
  b: Tensor,
) -> Result(Float, TensorError)

Euclidean distance between two same-shaped tensors, preserving errors.

try_exp

</>

pub fn try_exp(t: Tensor) -> Result(Tensor, TensorError)

Exponential for every element, preserving materialization failures.

try_eye

</>

pub fn try_eye(n: Int) -> Result(Tensor, TensorError)

Create a square identity matrix.

try_flatten

</>

pub fn try_flatten(t: Tensor) -> Result(Tensor, TensorError)

Flatten to 1D, preserving materialization failures.

try_floor

</>

pub fn try_floor(t: Tensor) -> Result(Tensor, TensorError)

Floor every element, preserving materialization failures.

try_greater

</>

pub fn try_greater(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Alias for greater.

try_greater_equal

</>

pub fn try_greater_equal(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Alias for greater_equal.

try_hadamard_preprocess

</>

pub fn try_hadamard_preprocess(
  input: Tensor,
  seed: Int,
) -> Result(HadamardPreprocess, TensorError)

Apply randomized normalized Hadamard preprocessing to a vector tensor.

try_inverse_hadamard_preprocess

</>

pub fn try_inverse_hadamard_preprocess(
  preprocessed: HadamardPreprocess,
) -> Result(Tensor, TensorError)

Invert a previously applied Hadamard preprocessing plan.

try_less

</>

pub fn try_less(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Alias for less.

try_less_equal

</>

pub fn try_less_equal(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Alias for less_equal.

try_linspace

</>

pub fn try_linspace(
  start: Float,
  stop: Float,
  steps: Int,
) -> Result(Tensor, TensorError)

Create a 1D tensor with evenly spaced values over a closed interval.

try_log

</>

pub fn try_log(t: Tensor) -> Result(Tensor, TensorError)

Natural logarithm for every element, rejecting non-positive values.

try_logical_and

</>

pub fn try_logical_and(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Alias for logical_and.

try_logical_not

</>

pub fn try_logical_not(t: Tensor) -> Result(Tensor, TensorError)

Logical NOT over a numeric mask, preserving materialization failures.

try_logical_or

</>

pub fn try_logical_or(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Alias for logical_or.

try_logical_xor

</>

pub fn try_logical_xor(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Alias for logical_xor.

try_logspace

</>

pub fn try_logspace(
  start: Float,
  stop: Float,
  steps: Int,
  base: Float,
) -> Result(Tensor, TensorError)

Create a 1D tensor with logarithmically spaced values.

try_manhattan_distance

</>

pub fn try_manhattan_distance(
  a: Tensor,
  b: Tensor,
) -> Result(Float, TensorError)

Manhattan distance between two same-shaped tensors, preserving errors.

try_map

</>

pub fn try_map(
  t: Tensor,
  f: fn(Float) -> Float,
) -> Result(Tensor, TensorError)

Apply function to each element, preserving materialization failures.

try_masked_select

</>

pub fn try_masked_select(
  t: Tensor,
  mask: Tensor,
) -> Result(Tensor, TensorError)

Select flattened values where a broadcasted mask is non-zero, preserving errors.

try_max

</>

pub fn try_max(t: Tensor) -> Result(Float, TensorError)

Maximum value, preserving materialization and empty-tensor errors.

try_max_axis

</>

pub fn try_max_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Maximum along one axis, preserving materialization failures.

try_max_axis_keepdims

</>

pub fn try_max_axis_keepdims(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Maximum along one axis with keepdims, preserving materialization failures.

try_maximum

</>

pub fn try_maximum(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Alias for maximum.

try_mean

</>

pub fn try_mean(t: Tensor) -> Result(Float, TensorError)

Mean of all elements, preserving materialization and empty-tensor errors.

try_mean_axis

</>

pub fn try_mean_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Mean along one axis, preserving materialization failures.

try_mean_axis_keepdims

</>

pub fn try_mean_axis_keepdims(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Mean along one axis with keepdims, preserving materialization failures.

try_median

</>

pub fn try_median(t: Tensor) -> Result(Float, TensorError)

Median value, preserving materialization and empty-tensor errors.

try_min

</>

pub fn try_min(t: Tensor) -> Result(Float, TensorError)

Minimum value, preserving materialization and empty-tensor errors.

try_min_axis

</>

pub fn try_min_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Minimum along one axis, preserving materialization failures.

try_min_axis_keepdims

</>

pub fn try_min_axis_keepdims(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Minimum along one axis with keepdims, preserving materialization failures.

try_minimum

</>

pub fn try_minimum(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Alias for minimum.

try_minmax_scale

</>

pub fn try_minmax_scale(
  t: Tensor,
  feature_min: Float,
  feature_max: Float,
) -> Result(Tensor, TensorError)

Scale all values into a target interval, preserving errors.

try_negate

</>

pub fn try_negate(t: Tensor) -> Result(Tensor, TensorError)

Negate every element, preserving materialization failures.

try_nonzero

</>

pub fn try_nonzero(t: Tensor) -> Result(Tensor, TensorError)

Return flattened indices for non-zero values, preserving materialization failures.

try_nonzero_flat

</>

pub fn try_nonzero_flat(t: Tensor) -> Result(Tensor, TensorError)

Return flattened indices for non-zero values, represented as floats (legacy).

try_norm

</>

pub fn try_norm(t: Tensor) -> Result(Float, TensorError)

L2 norm, preserving materialization failures.

try_normalize

</>

pub fn try_normalize(t: Tensor) -> Result(Tensor, TensorError)

Normalize to unit length, preserving materialization failures.

try_normalized_walsh_hadamard

</>

pub fn try_normalized_walsh_hadamard(
  values: List(Float),
) -> Result(List(Float), TensorError)

Apply a normalized Walsh-Hadamard transform to power-of-two vector data.

try_not_equal

</>

pub fn try_not_equal(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Alias for not_equal.

try_percentile

</>

pub fn try_percentile(
  t: Tensor,
  percentile: Int,
) -> Result(Float, TensorError)

Percentile using linear interpolation between closest ranks.

try_product

</>

pub fn try_product(t: Tensor) -> Result(Float, TensorError)

Product of all elements, preserving materialization failures.

try_reciprocal

</>

pub fn try_reciprocal(t: Tensor) -> Result(Tensor, TensorError)

Reciprocal for every element, rejecting zeros.

try_round

</>

pub fn try_round(t: Tensor) -> Result(Tensor, TensorError)

Round every element to the nearest integer value, preserving failures.

try_scale

</>

pub fn try_scale(
  t: Tensor,
  s: Float,
) -> Result(Tensor, TensorError)

Scale by constant, preserving materialization failures.

try_sign

</>

pub fn try_sign(t: Tensor) -> Result(Tensor, TensorError)

Return -1, 0, or 1 for each element, preserving failures.

try_softmax_axis

</>

pub fn try_softmax_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Softmax along one axis, preserving materialization failures.

try_sqrt

</>

pub fn try_sqrt(t: Tensor) -> Result(Tensor, TensorError)

Square root every element, rejecting negative values.

try_square

</>

pub fn try_square(t: Tensor) -> Result(Tensor, TensorError)

Square every element, preserving materialization failures.

try_standardize

</>

pub fn try_standardize(t: Tensor) -> Result(Tensor, TensorError)

Alias for try_zscore.

try_std

</>

pub fn try_std(t: Tensor) -> Result(Float, TensorError)

Standard deviation, preserving materialization and empty-tensor errors.

try_sum

</>

pub fn try_sum(t: Tensor) -> Result(Float, TensorError)

Sum everything, preserving materialization failures.

try_sum_axis

</>

pub fn try_sum_axis(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Sum along one axis, preserving materialization failures.

try_sum_axis_keepdims

</>

pub fn try_sum_axis_keepdims(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Sum along one axis with keepdims, preserving materialization failures.

try_take

</>

pub fn try_take(
  t: Tensor,
  indices: List(Int),
) -> Result(Tensor, TensorError)

Take flattened elements by explicit indices, preserving index errors.

try_take_flat

</>

pub fn try_take_flat(
  t: Tensor,
  indices: List(Int),
) -> Result(Tensor, TensorError)

Take flattened elements by explicit indices, preserving index errors.

try_to_contiguous

</>

pub fn try_to_contiguous(
  t: Tensor,
) -> Result(Tensor, TensorError)

Convert to contiguous tensor, preserving materialization failures.

try_to_list

</>

pub fn try_to_list(t: Tensor) -> Result(List(Float), TensorError)

Convert to list, preserving native materialization failures.

try_to_strided

</>

pub fn try_to_strided(t: Tensor) -> Result(Tensor, TensorError)

Convert to strided representation, preserving materialization failures.

try_unsqueeze

</>

pub fn try_unsqueeze(
  t: Tensor,
  axis: Int,
) -> Result(Tensor, TensorError)

Add dimension of size 1, preserving invalid-axis errors.

try_variance

</>

pub fn try_variance(t: Tensor) -> Result(Float, TensorError)

Variance, preserving materialization and empty-tensor errors.

try_where

</>

pub fn try_where(
  condition: Tensor,
  when_true: Tensor,
  when_false: Tensor,
) -> Result(Tensor, TensorError)

Alias for where.

try_zscore

</>

pub fn try_zscore(t: Tensor) -> Result(Tensor, TensorError)

Z-score standardization over all elements, preserving errors.

uniform

</>

pub fn uniform(
  shape: List(Int),
  low: Float,
  high: Float,
) -> Tensor

Sample each element uniformly from [low, high). See viva_tensor/nn/init.uniform.

unigram_decode

</>

pub fn unigram_decode(
  tokenizer: UnigramTokenizer,
  ids: List(Int),
) -> String

Decode unigram ids back to a string. Strips bos_id/eos_id, undoes the ▁-prefix convention, removes a single leading space.

Example

import viva_tensor as t
let tok = t.unigram_tokenizer_from_pieces(
  [#("<unk>", -100.0), #("<s>", -100.0), #("</s>", -100.0),
   #("▁hello", -1.0)],
  "<unk>", "<s>", "</s>",
)
let _ = t.unigram_decode(tok, [1, 3, 2])

unigram_encode

</>

pub fn unigram_encode(
  tokenizer: UnigramTokenizer,
  text: String,
) -> List(Int)

Encode text into ids using Viterbi dynamic programming. Output is [bos_id, ..pieces, eos_id]. Pieces falling outside the vocab become unk_id.

Example

import viva_tensor as t
let tok = t.unigram_tokenizer_from_pieces(
  [#("<unk>", -100.0), #("<s>", -100.0), #("</s>", -100.0),
   #("▁hello", -1.0)],
  "<unk>", "<s>", "</s>",
)
let _ = t.unigram_encode(tok, "hello")

unigram_tokenizer_from_pieces

</>

pub fn unigram_tokenizer_from_pieces(
  pieces: List(#(String, Float)),
  unk_token: String,
  bos_token: String,
  eos_token: String,
) -> UnigramTokenizer

Build a UnigramTokenizer from a list of (token, log_prob) pairs. Encoding uses Viterbi (max-sum of log-probs), not greedy.

Example

import viva_tensor as t
let _ = t.unigram_tokenizer_from_pieces(
  [#("<unk>", -100.0), #("<s>", -100.0), #("</s>", -100.0),
   #("▁hello", -1.0)],
  "<unk>", "<s>", "</s>",
)

unsqueeze

</>

pub fn unsqueeze(t: Tensor, axis: Int) -> Tensor

Add a dimension of size 1 at the given axis.

upsample_forward

</>

pub fn upsample_forward(
  config: UpsampleConfig,
  input: Tensor,
) -> Result(Tensor, TensorError)

2D upsampling (nearest or bilinear). Input [batch, channels, H, W], output [batch, channels, H * scale_factor, W * scale_factor].

variance

</>

pub fn variance(t: Tensor) -> Float

Variance of all elements.

vector

</>

pub fn vector(data: List(Float)) -> Tensor

Create a vector (1D tensor) from a list of floats.

vision_adjust_brightness

</>

pub fn vision_adjust_brightness(
  image: Tensor,
  factor: Float,
) -> Result(Tensor, TensorError)

Multiply pixel values by factor, clamped to [0, 1].

vision_adjust_contrast

</>

pub fn vision_adjust_contrast(
  image: Tensor,
  factor: Float,
) -> Result(Tensor, TensorError)

Linearly interpolate each pixel toward its channel mean and clamp to [0, 1].

vision_center_crop

</>

pub fn vision_center_crop(
  image: Tensor,
  target_h: Int,
  target_w: Int,
) -> Result(Tensor, TensorError)

Crop the centre target_h x target_w region of a CHW/NCHW image.

vision_compose

</>

pub fn vision_compose(
  transforms: List(fn(Tensor) -> Result(Tensor, TensorError)),
  image: Tensor,
) -> Result(Tensor, TensorError)

Apply a list of transforms in order, threading the result through each step. Bails on the first Error.

vision_horizontal_flip

</>

pub fn vision_horizontal_flip(
  image: Tensor,
) -> Result(Tensor, TensorError)

Mirror the image along the width axis.

vision_normalize

</>

pub fn vision_normalize(
  image: Tensor,
  mean: List(Float),
  std: List(Float),
) -> Result(Tensor, TensorError)

Per-channel (x - mean[c]) / std[c] normalization.

vision_random_crop

</>

pub fn vision_random_crop(
  image: Tensor,
  target_h: Int,
  target_w: Int,
) -> Result(Tensor, TensorError)

Crop a target_h x target_w window at a random top-left corner. Non-deterministic.

vision_random_horizontal_flip

</>

pub fn vision_random_horizontal_flip(
  image: Tensor,
  p: Float,
) -> Result(Tensor, TensorError)

Flip horizontally with probability p. Non-deterministic.

vision_resize

</>

pub fn vision_resize(
  image: Tensor,
  new_h: Int,
  new_w: Int,
  mode: ResizeMode,
) -> Result(Tensor, TensorError)

Resize a CHW ([C, H, W]) or NCHW ([B, C, H, W]) image to [..., C, new_h, new_w] using the requested resampling mode.

vision_to_byte_image

</>

pub fn vision_to_byte_image(
  image: Tensor,
) -> Result(List(Int), TensorError)

CHW tensor in [0, 1] → HWC byte image ([0..255]).

vision_to_grayscale

</>

pub fn vision_to_grayscale(
  image: Tensor,
  num_output_channels: Int,
) -> Result(Tensor, TensorError)

Convert a 3-channel image to grayscale (ITU-R 601 luma).

vision_to_tensor

</>

pub fn vision_to_tensor(
  byte_image: List(Int),
  height: Int,
  width: Int,
  channels: Int,
) -> Result(Tensor, TensorError)

HWC byte image ([0..255]) → CHW tensor in [0, 1].

vision_vertical_flip

</>

pub fn vision_vertical_flip(
  image: Tensor,
) -> Result(Tensor, TensorError)

Mirror the image along the height axis.

where

</>

pub fn where(
  condition: Tensor,
  when_true: Tensor,
  when_false: Tensor,
) -> Result(Tensor, TensorError)

Select values from two tensors using a non-zero condition mask.

whitespace_decode

</>

pub fn whitespace_decode(
  tokenizer: WhitespaceTokenizer,
  ids: List(Int),
) -> String

Decode ids with a WhitespaceTokenizer.

Example

import viva_tensor as t
let tok = t.whitespace_tokenizer_from_vocab(
  ["[PAD]", "[UNK]", "hello"],
  "[UNK]",
  "[PAD]",
)
let _ = t.whitespace_decode(tok, [2])

whitespace_encode

</>

pub fn whitespace_encode(
  tokenizer: WhitespaceTokenizer,
  text: String,
) -> List(Int)

Encode text with a WhitespaceTokenizer.

Example

import viva_tensor as t
let tok = t.whitespace_tokenizer_from_vocab(
  ["[PAD]", "[UNK]", "hello"],
  "[UNK]",
  "[PAD]",
)
let _ = t.whitespace_encode(tok, "hello")

whitespace_tokenizer_from_vocab

</>

pub fn whitespace_tokenizer_from_vocab(
  vocab: List(String),
  unk_token: String,
  pad_token: String,
) -> WhitespaceTokenizer

Build a WhitespaceTokenizer from an ordered vocabulary list.

Example

import viva_tensor as t
let _ = t.whitespace_tokenizer_from_vocab(
  ["[PAD]", "[UNK]", "hello"],
  "[UNK]",
  "[PAD]",
)

word_piece_decode

</>

pub fn word_piece_decode(
  tokenizer: WordPieceTokenizer,
  ids: List(Int),
) -> String

Decode ids with a WordPieceTokenizer.

Example

import viva_tensor as t
let tok = t.word_piece_tokenizer_from_vocab(
  ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "hello"],
  "[UNK]",
  "[CLS]",
  "[SEP]",
  "[PAD]",
)
let _ = t.word_piece_decode(tok, [2, 4, 3])

word_piece_encode

</>

pub fn word_piece_encode(
  tokenizer: WordPieceTokenizer,
  text: String,
) -> List(Int)

Encode text with a WordPieceTokenizer.

Example

import viva_tensor as t
let tok = t.word_piece_tokenizer_from_vocab(
  ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "hello"],
  "[UNK]",
  "[CLS]",
  "[SEP]",
  "[PAD]",
)
let _ = t.word_piece_encode(tok, "hello")

word_piece_tokenizer_from_vocab

</>

pub fn word_piece_tokenizer_from_vocab(
  vocab: List(String),
  unk_token: String,
  cls_token: String,
  sep_token: String,
  pad_token: String,
) -> WordPieceTokenizer

Build a WordPieceTokenizer from an ordered vocabulary list.

Example

import viva_tensor as t
let _ = t.word_piece_tokenizer_from_vocab(
  ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "hello"],
  "[UNK]",
  "[CLS]",
  "[SEP]",
  "[PAD]",
)

workspace_backend

</>

pub fn workspace_backend(
  workspace: GpuWorkspace,
) -> AccelerationBackend

Return the acceleration backend associated with a workspace.

workspace_from_tensor

</>

pub fn workspace_from_tensor(
  workspace: GpuWorkspace,
  tensor: Tensor,
) -> Result(AcceleratedTensor, TensorError)

Move a tensor into workspace memory.

workspace_zeros

</>

pub fn workspace_zeros(
  workspace: GpuWorkspace,
  shape: List(Int),
) -> Result(AcceleratedTensor, TensorError)

Allocate a reusable zero-filled output buffer in workspace memory.

xavier_init

</>

pub fn xavier_init(fan_in: Int, fan_out: Int) -> Tensor

Xavier initialization for neural network weights

xavier_normal

</>

pub fn xavier_normal(fan_in: Int, fan_out: Int) -> Tensor

Glorot normal init: N(0, std^2) with std = sqrt(2 / (fan_in + fan_out)).

xavier_uniform

</>

pub fn xavier_uniform(fan_in: Int, fan_out: Int) -> Tensor

Glorot uniform init: U(-a, a) with a = sqrt(6 / (fan_in + fan_out)).

zero_grad

</>

pub fn zero_grad(grads: List(GradPair)) -> List(GradPair)

Zero every gradient tensor, preserving shapes. See viva_tensor/nn/optim.zero_grad.

zeros

</>

pub fn zeros(shape: List(Int)) -> Tensor

Create a tensor filled with zeros.

zeros_like

</>

pub fn zeros_like(t: Tensor) -> Tensor

Create a tensor with the same shape as another tensor, filled with zeros.

zscore

</>

pub fn zscore(t: Tensor) -> Tensor

Z-score standardization over all elements, preserving shape.