75 lines
1.5 KiB
ArmAsm
75 lines
1.5 KiB
ArmAsm
|
// Code generated by command: go run swap64_asm.go -pkg bswap -out ../bswap/swap64_amd64.s -stubs ../bswap/swap64_amd64.go. DO NOT EDIT.
|
|||
|
|
|||
|
//go:build !purego
|
|||
|
|
|||
|
#include "textflag.h"
|
|||
|
|
|||
|
// func swap64(b []byte)
// Requires: AVX, AVX2
//
// swap64 reverses the byte order of every 8-byte word of b, in place.
//
// Register roles:
//   AX  cursor: address of the next word to process
//   DX  end:    b_base + (len(b) rounded down to a multiple of 8)
//   CX  scratch: cursor value after the current chunk / word being swapped
//   Y0  VPSHUFB control mask (vector path only)
//
// Work is done in three tiers: 128 bytes per iteration with VPSHUFB,
// 32 bytes per iteration with BSWAPQ, then one 8-byte word at a time.
//
// NOTE(review): the file header marks this file as generated; mirror any
// change made here in the generator, otherwise it is lost on regeneration.
TEXT ·swap64(SB), NOSPLIT, $0-24
	MOVQ b_base+0(FP), AX
	MOVQ b_len+8(FP), CX

	// Only whole 8-byte words are processed. Rounding the length down keeps
	// the word-at-a-time loop from reading and writing past the end of b when
	// len(b) is not a multiple of 8; for valid lengths this is a no-op.
	ANDQ $-8, CX
	MOVQ AX, DX
	ADDQ CX, DX

	// Bit 8 of the cpu.X86 feature word gates the vector path; CF is clear
	// when the bit is not set. Presumably this is the AVX2 flag - confirm
	// against package cpu.
	BTL  $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC  x86_loop
	VMOVDQU shuffle_mask<>+0(SB), Y0

avx2_loop:
	// Continue while a full 128-byte chunk fits: AX+128 <= DX.
	// JA (not JAE) so that a chunk ending exactly at DX still takes this path.
	MOVQ AX, CX
	ADDQ $0x80, CX
	CMPQ CX, DX
	JA   avx2_done
	VMOVDQU (AX), Y1
	VMOVDQU 32(AX), Y2
	VMOVDQU 64(AX), Y3
	VMOVDQU 96(AX), Y4
	VPSHUFB Y0, Y1, Y1
	VPSHUFB Y0, Y2, Y2
	VPSHUFB Y0, Y3, Y3
	VPSHUFB Y0, Y4, Y4
	VMOVDQU Y1, (AX)
	VMOVDQU Y2, 32(AX)
	VMOVDQU Y3, 64(AX)
	VMOVDQU Y4, 96(AX)
	MOVQ CX, AX
	JMP  avx2_loop

avx2_done:
	// Clear the upper YMM halves before any non-VEX SSE code can run, to
	// avoid the AVX/SSE transition penalty after we return.
	VZEROUPPER

x86_loop:
	// Continue while a full 32-byte chunk fits: AX+32 <= DX.
	MOVQ AX, CX
	ADDQ $0x20, CX
	CMPQ CX, DX
	JA   slow_loop
	MOVQ (AX), BX
	MOVQ 8(AX), SI
	MOVQ 16(AX), DI
	MOVQ 24(AX), R8
	BSWAPQ BX
	BSWAPQ SI
	BSWAPQ DI
	BSWAPQ R8
	MOVQ BX, (AX)
	MOVQ SI, 8(AX)
	MOVQ DI, 16(AX)
	MOVQ R8, 24(AX)
	MOVQ CX, AX
	JMP  x86_loop

slow_loop:
	// One word per iteration until the cursor reaches the end pointer.
	CMPQ AX, DX
	JAE  done
	MOVQ (AX), CX
	BSWAPQ CX
	MOVQ CX, (AX)
	ADDQ $0x08, AX
	JMP  slow_loop

done:
	RET
|
|||
|
|
|||
|
// shuffle_mask<> is the 32-byte VPSHUFB control vector that swap64 loads
// into Y0. Each quadword is stored little-endian, so the bytes in memory
// read 07 06 05 04 03 02 01 00 | 0f 0e 0d 0c 0b 0a 09 08, i.e. "reverse the
// bytes of each 8-byte word". VPSHUFB selects within each 128-bit lane
// independently, so the same 16-byte pattern is repeated for the high lane.

// High 128-bit lane.
DATA shuffle_mask<>+0x18(SB)/8, $0x08090A0B0C0D0E0F
DATA shuffle_mask<>+0x10(SB)/8, $0x0001020304050607

// Low 128-bit lane.
DATA shuffle_mask<>+0x08(SB)/8, $0x08090A0B0C0D0E0F
DATA shuffle_mask<>+0x00(SB)/8, $0x0001020304050607

// File-local, read-only, and free of Go pointers.
GLOBL shuffle_mask<>(SB), NOPTR|RODATA, $32
|