This guide walks you through using Trittask, from installing the Python driver to deploying on FPGA hardware.
No hardware required. Start with simulation mode to explore the API.
from trittask import Trittask
import numpy as np

# Bring up the accelerator in simulation mode — no FPGA needed
accel = Trittask(simulation=True)

# Build a random ternary weight matrix; entries are drawn from {-1, 0, +1}
weights = np.random.choice([-1, 0, 1], size=(768, 768)).astype(np.int8)
accel.load_weights(weights)

# Run a single forward pass on a random input vector
sample = np.random.randn(768).astype(np.float32)
result = accel.forward(sample)
print(f"Output shape: {result.shape}")
# Install the Python driver
cd drivers/pynq
pip install .

# For development (with testing tools)
pip install -e .[dev]

# On PYNQ board (with hardware support)
pip install .[pynq]
# macOS
brew install icarus-verilog verilator

# Ubuntu/Debian
sudo apt install iverilog verilator

# Verify installation
iverilog -V
verilator --version
from trittask import Trittask
import numpy as np

# Initialize (simulation mode for development)
accel = Trittask(simulation=True)
# Or with real hardware
# accel = Trittask('trittask.bit')

# Load model weights: each entry in the .npz archive is one layer
weights = np.load('model_weights.npz')
for layer_name, layer_weights in weights.items():
    accel.load_weights(layer_weights, layer=layer_name)

# Run inference on a single input
input_data = np.random.randn(1, 768).astype(np.float32)
output = accel.forward(input_data)

# Process multiple inputs: iterate the batch rows directly instead of
# indexing, and build the stacked result with a comprehension
batch = np.random.randn(32, 768).astype(np.float32)
outputs = np.stack([accel.forward(sample) for sample in batch])
# Compile a PyTorch model to Trittask format
python3 scripts/compile_pytorch.py \
    path/to/model.pt --output build/weights
from trittask.models import quantize_to_ternary

# Start from the original full-precision (FP32) weights on disk
full_precision = np.load('original_weights.npy')

# Project onto the ternary set {-1, 0, +1}; a scale factor is returned
# alongside so magnitudes can be recovered at inference time
tern, tern_scale = quantize_to_ternary(full_precision)

# Persist in the .npz layout Trittask expects
np.savez('trittask_weights.npz',
         weights=tern,
         scale=tern_scale)
from trittask.models import translate_model, print_isa_summary

# Translate any PyTorch model
translated = translate_model(pytorch_model)

# Or specify the architecture type
translated = translate_model(resnet18, model_type='cnn')
translated = translate_model(bert, model_type='transformer')
translated = translate_model(lstm, model_type='lstm')

# View required ISA operations
print_isa_summary(translated)
from trittask import Trittask
from trittask.lora import LoRAConfig, LoRATrainer

# Bring up the accelerator and install the frozen base weights
accel = Trittask('trittask.bit')
accel.load_weights(base_weights)

# LoRA hyperparameters
config = LoRAConfig(
    rank=16,             # Bottleneck dimension
    alpha=16.0,          # Scaling factor
    learning_rate=1e-4,  # Adam learning rate
    use_adam=True        # Use Adam optimizer
)

trainer = LoRATrainer(accel, config)

# Training loop: one optimizer step per (x, y) pair
for epoch in range(10):
    epoch_loss = 0
    for x, y in dataloader:
        epoch_loss += trainer.step(x, y)
    print(f"Epoch {epoch}: Loss = {epoch_loss / len(dataloader):.4f}")

# Persist the learned adapter weights
trainer.save_checkpoint('lora_adapted.npz')
Train and serve inference simultaneously. Zero downtime adaptation.
# Enable LoRA with double buffering: training updates land in a shadow
# copy of the adapter weights, so inference keeps serving from the active
# copy and is never interrupted while training runs.
accel.enable_lora(rank=16, double_buffer=True)
# Standard builds
make vivado_zcu102   # ZCU102 (max performance)
make vivado_pynq     # PYNQ-Z2 (best efficiency)
make vivado_ebaz     # EBAZ4205 (lowest cost)

# Maximum utilization builds (~90% FPGA fill)
make vivado_zcu102 DEFINES="+define+TARGET_ZCU102_MAX"
make vivado_pynq DEFINES="+define+TARGET_PYNQ_Z2_MAX"
from trittask import Trittask

# Program the FPGA from the bitstream
accel = Trittask('trittask.bit')

# Sanity-check the link to the accelerator
status = accel.status()
print(f"Accelerator: {status}")

# Load weights, then run a forward pass
accel.load_weights(weights)
output = accel.forward(input_data)
Choose between balanced configurations (headroom for timing closure) or MAX configurations (maximum throughput).
Default configuration with headroom for timing closure. Recommended for development and production deployments.
128 TRUs with 4-way systolic read (PYNQ-Z2). Guaranteed 100MHz timing. 4x read throughput with 32-cycle max latency.
MAX configs trade power for throughput. Use balanced for battery-powered or fanless deployments.
# ZCU102 MAX (~100 TOPS, 512+ TRUs)
make vivado_zcu102 DEFINES="+define+TARGET_ZCU102_MAX"

# PYNQ-Z2 MAX (~25 TOPS, 128 TRUs, 4-way systolic)
make vivado_pynq DEFINES="+define+TARGET_PYNQ_Z2_MAX"

# Configurable core count for PYNQ-Z2:
vivado -mode batch -source scripts/vivado_pynq.tcl -tclargs 128 200  # 128 TRUs, 200 DSPs
vivado -mode batch -source scripts/vivado_pynq.tcl -tclargs 80       # 80 TRUs (faster timing)

# EBAZ4205 MAX (~5 TOPS, 48 TRUs)
make vivado_ebaz DEFINES="+define+TARGET_EBAZ4205_MAX"
Bit-accurate Python simulations that match RTL behavior exactly:
# Verify ternary MAC (BitNet b1.58)
python3 scripts/digital_twin/ternary_mac.py

# Verify SANTA stochastic attention
python3 scripts/digital_twin/santa.py

# Verify ECO L1-distance (EcoTransformer)
python3 scripts/digital_twin/eco.py

# Verify PWA softmax (4-segment exp)
python3 scripts/digital_twin/softmax.py

# Full MFE integration test
python3 scripts/digital_twin/mfe.py
# Compile MFE testbench
iverilog -g2012 -o build/tb_mfe \
    tb/tb_multiplier_free_engine.v \
    rtl/multiplier_free_engine.v \
    rtl/ternary_mac.v rtl/santa_unit.v \
    rtl/eco_transformer.v rtl/pwa_activation.v

# Run simulation
vvp build/tb_mfe

# View waveforms (optional)
gtkwave build/tb_multiplier_free_engine.vcd
# Compile cached MFE with double-buffering
iverilog -g2012 -o build/tb_cached_mfe \
    tb/tb_cached_mfe.sv \
    rtl/cached_mfe_top.sv \
    rtl/cache/weight_double_buffer.sv \
    rtl/cache/weight_cache_simple.sv \
    rtl/multiplier_free_engine.v \
    rtl/ternary_mac.v rtl/santa_unit.v

# Run simulation
vvp build/tb_cached_mfe

# Expected: Stall cycles: 1 (99.8% efficiency)
# Verify Python matches RTL exactly
python3 scripts/debug_cosim.py

# Expected output:
# "SUCCESS: Python and Verilog match perfectly!"
# Assemble a test program
python3 scripts/overlay/assembler.py \
    tb/overlay/test_programs/test_matmul.asm \
    -o build/test_matmul.bin

# Run overlay testbench
iverilog -g2012 -o build/tb_overlay \
    tb/overlay/tb_temporal_overlay_pe.sv \
    rtl/overlay/*.sv
vvp build/tb_overlay
This is normal on development machines — without the PYNQ package the driver simply runs in simulation mode. To enable hardware support on your FPGA board, install PYNQ: pip install pynq
Ensure the weight array is 2D and matches the expected dimensions, (dim_out, dim_in) — e.g., (768, 768).
Check the current mode with accel.simulation: it returns True in simulation mode, or the hardware status when a real FPGA is attached.
Configure Trittask for your target FPGA with compile-time feature flags. Smaller FPGAs can disable unused features to fit.
# Minimal build for tiny FPGAs (<10K LUTs)
# Tang Nano 9K, iCE40, etc.
make synth PRESET=PRESET_MINIMAL

# Inference-only for mid-range FPGAs
# ECP5, PYNQ-Z2 (no training)
make synth PRESET=PRESET_INFERENCE_ONLY

# Full training support
# ZCU102, ZCU104
make synth PRESET=PRESET_TRAINING_FOCUS
# Enable/disable features individually
+define+FEATURE_LORA      # LoRA training
+define+FEATURE_SANTA     # Stochastic attention
+define+FEATURE_ECO       # L1-distance attention
+define+FEATURE_PWA       # PWA activation
+define+FEATURE_SOFTMAX   # PWA softmax
+define+FEATURE_CACHE     # Weight cache

# Example: inference-only with softmax
make vivado_pynq DEFINES="+define+FEATURE_SOFTMAX"
See docs/ARCHITECTURE.md for technical details on the hardware design.
Check drivers/pynq/notebooks/ for Jupyter tutorials and example workflows.
See docs/ARCHITECTURE.md for technical details
Key terms for understanding Trittask's ternary AI acceleration technology.
All algorithms are verified against their paper specifications:
# Run algorithm verification suite
python3 tests/test_algorithms_pynq.py

# Or run on PYNQ hardware
jupyter notebook drivers/pynq/notebooks/04_algorithm_verification.ipynb