| This example is taken from the AES package of the standard Go library. It makes use of Go Assembly to leverage Intel’s hardware support for AES, calling the AES-NI CPU instructions that can perform a “round” of encryption or decryption of the AES algorithm. | |
| package aes
 | |
| 
 | func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
 | 
| #include "textflag.h"
 | |
| As you can see, the total length of the arguments and return values has been omitted: the last argument of
             | TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
	// Load the four arguments of encryptBlockAsm(nr, xk, dst, src) from the
	// caller's frame (FP) into registers; each one occupies an 8-byte slot.
	MOVQ nr+0(FP), CX // CX = nr, the value later compared against 12 to pick the entry point
	MOVQ xk+8(FP), AX // AX = xk, cursor into the round keys (advanced as keys are consumed)
	MOVQ dst+16(FP), DX // DX = dst, where the final 16-byte block is stored
	MOVQ src+24(FP), BX // BX = src, where the 16-byte input block is read from
 | 
| 
 | 	MOVUPS 0(AX), X1
	MOVUPS 0(BX), X0 // X0 = 16 bytes read (unaligned) from BX; X0 carries the state from here on
	ADDQ $16, AX // next round key
	PXOR X1, X0 // X0 ^= X1: mix the key material held in X1 into the state before the AESENC rounds
 | 
| AES accepts keys of various lengths: 128-bit, 192-bit and 256-bit. Each of these three versions uses a
            different number of rounds. We compare the number of rounds (nr, held in CX) against 12 to pick the matching entry point.  | 	SUBQ $12, CX
	// Both jumps test the flags left by the SUBQ $12, CX just before them.
	JE Lenc196 // CX was exactly 12: skip the first pair of rounds and enter at Lenc196
	JB Lenc128 // CX was below 12 (unsigned): skip two pairs of rounds and enter at Lenc128
Lenc256: // CX was above 12: fall through and execute every round that follows
 | 
| A round always works the same way. The relevant 128-bit round key is loaded in a 128-bit SSE register
            ( | 	MOVUPS 0(AX), X1
	AESENC X1, X0 // one AES encryption round on the state X0 using the round key in X1
	MOVUPS 16(AX), X1 // X1 = the following 16-byte round key
	AESENC X1, X0 // second of the two rounds executed only on this path
	ADDQ $32, AX // next round keys
 | 
| AES-256 has two more rounds compared to AES-192, so when the previous two operations are done we can fall through to the AES-192 branch of the code. | Lenc196:
	// Two rounds that are skipped only when execution enters at Lenc128.
	MOVUPS 0(AX), X1 // X1 = round key at the current key cursor
	AESENC X1, X0 // one AES encryption round on the state X0
	MOVUPS 16(AX), X1 // X1 = the following 16-byte round key
	AESENC X1, X0 // second round of this pair
	ADDQ $32, AX // advance the key cursor past the two 16-byte keys just used
 | 
| The same remark applies between AES-192 and AES-128: AES-192 has two more rounds than AES-128, so once they are done we fall through to the AES-128 branch. | Lenc128:
	// Rounds shared by every path: nine AESENC rounds followed by one
	// AESENCLAST. The round keys are read at consecutive 16-byte offsets
	// (0..144) from AX, so AX is not advanced inside this run.
	MOVUPS 0(AX), X1
	AESENC X1, X0
	MOVUPS 16(AX), X1
	AESENC X1, X0
	MOVUPS 32(AX), X1
	AESENC X1, X0
	MOVUPS 48(AX), X1
	AESENC X1, X0
	MOVUPS 64(AX), X1
	AESENC X1, X0
	MOVUPS 80(AX), X1
	AESENC X1, X0
	MOVUPS 96(AX), X1
	AESENC X1, X0
	MOVUPS 112(AX), X1
	AESENC X1, X0
	MOVUPS 128(AX), X1
	AESENC X1, X0
	MOVUPS 144(AX), X1 // key for the closing round
	AESENCLAST X1, X0 // final round uses AESENCLAST instead of AESENC
	MOVUPS X0, 0(DX) // store the 16-byte result (unaligned) at the address in DX
	RET
 | 
Next example: Sqrt.