75 lines
1.5 KiB
ArmAsm
75 lines
1.5 KiB
ArmAsm
// Code generated by command: go run swap64_asm.go -pkg bswap -out ../bswap/swap64_amd64.s -stubs ../bswap/swap64_amd64.go. DO NOT EDIT.
|
||
|
||
//go:build !purego
|
||
|
||
#include "textflag.h"
|
||
|
||
// func swap64(b []byte)
// Requires: AVX, AVX2
//
// swap64 reverses the byte order of every 8-byte word of b, in place.
//
// Register roles:
//   AX  cursor: address of the next unprocessed byte
//   DX  end:    b_base + b_len
//   CX  cursor advanced by the step size of the current tier
//   Y0  VPSHUFB control loaded from shuffle_mask (vector tier only)
//
// The work is done in three tiers, each running while a whole chunk still
// fits in [AX, DX): 128 bytes per iteration with VPSHUFB, then 32 bytes per
// iteration with BSWAPQ, then 8 bytes per iteration. A trailing fragment
// shorter than 8 bytes is left untouched rather than accessed out of bounds.
// NOTE(review): callers are presumably expected to pass len(b)%8 == 0 -
// confirm against the Go wrapper.
//
// NOTE(review): the file header marks this file as generated; mirror any
// change here in the generator named there.
TEXT ·swap64(SB), NOSPLIT, $0-24
	MOVQ b_base+0(FP), AX
	MOVQ b_len+8(FP), CX
	MOVQ AX, DX
	ADDQ CX, DX

	// BTL copies bit 8 of the cpu.X86 feature word into CF (presumably the
	// AVX2 flag, given the "Requires" note). CF clear: skip the vector tier.
	BTL     $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC     x86_loop
	VMOVDQU shuffle_mask<>+0(SB), Y0

avx2_loop:
	// Continue only while AX+128 <= DX. JA (not JAE) keeps a final chunk that
	// ends exactly at DX in this tier instead of handing it to the scalar code.
	LEAQ    128(AX), CX
	CMPQ    CX, DX
	JA      avx2_done
	VMOVDQU (AX), Y1
	VMOVDQU 32(AX), Y2
	VMOVDQU 64(AX), Y3
	VMOVDQU 96(AX), Y4
	VPSHUFB Y0, Y1, Y1
	VPSHUFB Y0, Y2, Y2
	VPSHUFB Y0, Y3, Y3
	VPSHUFB Y0, Y4, Y4
	VMOVDQU Y1, (AX)
	VMOVDQU Y2, 32(AX)
	VMOVDQU Y3, 64(AX)
	VMOVDQU Y4, 96(AX)
	MOVQ    CX, AX
	JMP     avx2_loop

avx2_done:
	// Clear the upper YMM halves before returning to non-VEX code, so the
	// caller does not pay AVX/SSE transition penalties. Only reached when the
	// feature check above passed, so the instruction is always legal here.
	VZEROUPPER

x86_loop:
	// Four independent 8-byte swaps per iteration while AX+32 <= DX.
	LEAQ   32(AX), CX
	CMPQ   CX, DX
	JA     slow_loop
	MOVQ   (AX), BX
	MOVQ   8(AX), SI
	MOVQ   16(AX), DI
	MOVQ   24(AX), R8
	BSWAPQ BX
	BSWAPQ SI
	BSWAPQ DI
	BSWAPQ R8
	MOVQ   BX, (AX)
	MOVQ   SI, 8(AX)
	MOVQ   DI, 16(AX)
	MOVQ   R8, 24(AX)
	MOVQ   CX, AX
	JMP    x86_loop

slow_loop:
	// One word at a time while a full 8 bytes remain (AX+8 <= DX). Checking
	// the end of the word, not just its start, keeps the 8-byte load and
	// store inside the slice even if the length is not a multiple of 8.
	LEAQ   8(AX), CX
	CMPQ   CX, DX
	JA     done
	MOVQ   (AX), BX
	BSWAPQ BX
	MOVQ   BX, (AX)
	MOVQ   CX, AX
	JMP    slow_loop

done:
	RET
|
||
|
||
// shuffle_mask is the 32-byte control vector that swap64 loads into Y0 for
// VPSHUFB.
//
// Each 64-bit immediate is stored little-endian, so $0x0001020304050607 lays
// down the bytes 07 06 05 04 03 02 01 00: destination byte i of a word takes
// source byte 7-i, i.e. the word is byte-reversed. The second immediate does
// the same for bytes 8..15 of the lane.
//
// VPSHUFB on YMM registers shuffles within each 128-bit lane independently,
// so the same 16-byte pattern is repeated for the upper lane.

// Lower 128-bit lane.
DATA shuffle_mask<>+0(SB)/8, $0x0001020304050607
DATA shuffle_mask<>+8(SB)/8, $0x08090a0b0c0d0e0f

// Upper 128-bit lane (same pattern).
DATA shuffle_mask<>+16(SB)/8, $0x0001020304050607
DATA shuffle_mask<>+24(SB)/8, $0x08090a0b0c0d0e0f

// File-local (<>), read-only, pointer-free, 32 bytes total.
GLOBL shuffle_mask<>(SB), RODATA|NOPTR, $32
|