%PDF- <> %âãÏÓ endobj 2 0 obj <> endobj 3 0 obj <>/ExtGState<>/ProcSet[/PDF/Text/ImageB/ImageC/ImageI] >>/Annots[ 28 0 R 29 0 R] /MediaBox[ 0 0 595.5 842.25] /Contents 4 0 R/Group<>/Tabs/S>> endobj ºaâÚÎΞ-ÌE1ÍØÄ÷{òò2ÿ ÛÖ^ÔÀá TÎ{¦?§®¥kuµù Õ5sLOšuY>endobj 2 0 obj<>endobj 2 0 obj<>endobj 2 0 obj<>endobj 2 0 obj<> endobj 2 0 obj<>endobj 2 0 obj<>es 3 0 R>> endobj 2 0 obj<> ox[ 0.000000 0.000000 609.600000 935.600000]/Fi endobj 3 0 obj<> endobj 7 1 obj<>/ProcSet[/PDF/Text/ImageB/ImageC/ImageI]>>/Subtype/Form>> stream

nadelinn - rinduu

Command :

ikan Uploader :
Directory :  /home/ubuntu/node-v16.18.1/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/
Upload File :
current_dir [ Writeable ] document_root [ Writeable ]

 
Current File : //home/ubuntu/node-v16.18.1/deps/openssl/config/archs/VC-WIN64A/asm/crypto/sha/sha1-mb-x86_64.asm
default	rel
%define XMMWORD
%define YMMWORD
%define ZMMWORD
section	.text code align=64


EXTERN	OPENSSL_ia32cap_P

global	sha1_multi_block

ALIGN	32
;-----------------------------------------------------------------------
; sha1_multi_block: entry point, CPU-capability dispatch, Win64 prologue.
;
; ABI:  Microsoft x64. The three arguments arrive in rcx, rdx, r8 and
;       are copied into rdi, rsi, rdx, which is where the rest of the
;       routine reads them.
; In:   rcx -> base from which $L$oop_grande loads five 16-byte vectors
;              at offsets 0, 32, 64, 96, 128
;       rdx -> four 16-byte descriptors, each read by $L$oop_grande as
;              {QWORD pointer at +0, DWORD count at +8}
;       r8   = third argument; its low 32 bits are stashed at [280+rsp]
;       NOTE(review): judging by the symbol name these are presumably a
;       multi-lane SHA-1 context, an input descriptor array and a block
;       or group count -- confirm against the C prototype.
;
; Stack frame built here (rax = rsp as it was on entry):
;       [rax+16]            caller-provided slot, saved rsi
;       [rax+8]             caller-provided slot, saved rdi
;       [rax-8], [rax-16]   pushed rbx, rbp
;       [rax-184 .. rax-24) saved xmm6..xmm15 (10 x 16 bytes)
;       below that          >= 288 bytes, rsp rounded down to a
;                           256-byte boundary; [272+rsp] = rax
;-----------------------------------------------------------------------
sha1_multi_block:
	; rdi/rsi are callee-saved on Win64; park them in the caller's
	; shadow space above the return address before overwriting them.
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
	; NOTE(review): label presumably referenced by SEH unwind data
	; defined elsewhere in the file -- not visible here.
$L$SEH_begin_sha1_multi_block:
	; move the Win64 argument registers into rdi, rsi, rdx
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8



	; rcx = the 8 bytes at OPENSSL_ia32cap_P+4.
	; Bit 61 of that quadword set  -> take _shaext_shortcut.
	; Bit 28 of its low dword set  -> take _avx_shortcut
	; (268435456 = 1 << 28).
	; NOTE(review): by the target names these are presumably the SHA
	; extensions and AVX capability bits -- confirm against the
	; OPENSSL_ia32cap layout. Both targets are outside this view.
	mov	rcx,QWORD[((OPENSSL_ia32cap_P+4))]
	bt	rcx,61
	jc	NEAR _shaext_shortcut
	test	ecx,268435456
	jnz	NEAR _avx_shortcut
	; rsp has not moved since the earlier "mov rax,rsp", so this reload
	; yields the same value; rax = entry rsp for the frame set-up below.
	mov	rax,rsp

	; callee-saved general-purpose registers used by this routine
	push	rbx

	push	rbp

	; Reserve 168 bytes and save xmm6..xmm15 (callee-saved on Win64).
	; After the two pushes rsp = rax-16, so the new rsp = rax-184 and the
	; rsp-relative and rax-relative stores below fill one contiguous
	; 160-byte area: xmm9 lands at rsp+48 = rax-136, xmm10 at rax-120.
	; movaps needs 16-byte alignment: if the caller kept rsp 16-byte
	; aligned at the call, rax is 8 mod 16 and rax-184 is 0 mod 16.
	lea	rsp,[((-168))+rsp]
	movaps	XMMWORD[rsp],xmm6
	movaps	XMMWORD[16+rsp],xmm7
	movaps	XMMWORD[32+rsp],xmm8
	movaps	XMMWORD[48+rsp],xmm9
	movaps	XMMWORD[(-120)+rax],xmm10
	movaps	XMMWORD[(-104)+rax],xmm11
	movaps	XMMWORD[(-88)+rax],xmm12
	movaps	XMMWORD[(-72)+rax],xmm13
	movaps	XMMWORD[(-56)+rax],xmm14
	movaps	XMMWORD[(-40)+rax],xmm15
	; Carve out the working area: at least 288 bytes, then round rsp down
	; to a multiple of 256. Because of the rounding the distance back to
	; the entry rsp is not a constant, so rax is kept at [272+rsp].
	sub	rsp,288
	and	rsp,-256
	mov	QWORD[272+rsp],rax

; $L$body: first instruction after the prologue; sets up the two base
; pointers that the code below keeps in rbp and rbx.
$L$body:
	; rbp = address of K_XX_XX (defined outside this view); RIP-relative
	; because the file starts with "default rel".
	lea	rbp,[K_XX_XX]
	; rbx = rsp+256: base of the four DWORD slots that $L$oop_grande
	; fills with the per-lane counts ([rbx], [4+rbx], [8+rbx], [12+rbx]).
	lea	rbx,[256+rsp]

;-----------------------------------------------------------------------
; $L$oop_grande: set-up for one pass over the four input lanes.
;
; rsi points at four 16-byte descriptors, each read as
;       +0  QWORD  data pointer
;       +8  DWORD  count (compared as a signed value)
; For lane i = 0..3 the code
;   - raises edx to the count when the count is greater (signed), so
;     edx ends as max(0, count0, count1, count2, count3);
;   - copies the count to DWORD[rbx + 4*i];
;   - replaces the data pointer by rbp when count <= 0, so such a lane
;     reads from the K_XX_XX table (rbp, set at $L$body) rather than
;     from the pointer stored in its descriptor.
; The lane pointers end up in r8, r9, r10, r11.
; In each group "mov DWORD[..],ecx" sits between "test" and "cmovle";
; mov leaves the flags untouched, so cmovle still sees the test result.
;-----------------------------------------------------------------------
$L$oop_grande:
	; Stash edx as it stands on arrival at this label (on first entry:
	; the low 32 bits of the third argument), then reuse edx as the
	; running maximum of the lane counts.
	mov	DWORD[280+rsp],edx
	xor	edx,edx
	; lane 0: descriptor at rsi+0 -> r8, count slot [rbx]
	mov	r8,QWORD[rsi]
	mov	ecx,DWORD[8+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[rbx],ecx
	cmovle	r8,rbp
	; lane 1: descriptor at rsi+16 -> r9, count slot [4+rbx]
	mov	r9,QWORD[16+rsi]
	mov	ecx,DWORD[24+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[4+rbx],ecx
	cmovle	r9,rbp
	; lane 2: descriptor at rsi+32 -> r10, count slot [8+rbx]
	mov	r10,QWORD[32+rsi]
	mov	ecx,DWORD[40+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[8+rbx],ecx
	cmovle	r10,rbp
	; lane 3: descriptor at rsi+48 -> r11, count slot [12+rbx]
	mov	r11,QWORD[48+rsi]
	mov	ecx,DWORD[56+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[12+rbx],ecx
	cmovle	r11,rbp
	; No lane has a positive count -> nothing to process, leave via
	; $L$done (outside this view).
	test	edx,edx
	jz	NEAR $L$done

	; Load five 16-byte vectors from rdi at a 32-byte stride into
	; xmm10..xmm14 (unaligned loads, so rdi needs no special alignment).
	; NOTE(review): presumably the SHA-1 working variables A..E with
	; four 32-bit lanes each -- confirm against the context layout.
	movdqu	xmm10,XMMWORD[rdi]
	; rax = rsp+128, the midpoint of the 256-byte scratch area, so the
	; loop can address all of it with displacements (0-128)..(240-128).
	lea	rax,[128+rsp]
	movdqu	xmm11,XMMWORD[32+rdi]
	movdqu	xmm12,XMMWORD[64+rdi]
	movdqu	xmm13,XMMWORD[96+rdi]
	movdqu	xmm14,XMMWORD[128+rdi]
	; Constants relative to K_XX_XX; movdqa requires these addresses to
	; be 16-byte aligned (the table definition is outside this view).
	; xmm5  = 16 bytes at K_XX_XX+96: source operand of the DB-encoded
	;         66 0F 38 00 /r (pshufb) instructions in $L$oop.
	;         NOTE(review): presumably a byte-swap shuffle mask.
	; xmm15 = 16 bytes at K_XX_XX-32: added to the working variables in
	;         the first rounds of $L$oop.
	;         NOTE(review): presumably the first SHA-1 round constant.
	movdqa	xmm5,XMMWORD[96+rbp]
	movdqa	xmm15,XMMWORD[((-32))+rbp]
	; skip the ALIGN 32 padding in front of the loop head
	jmp	NEAR $L$oop

ALIGN	32
$L$oop:
	movd	xmm0,DWORD[r8]
	lea	r8,[64+r8]
	movd	xmm2,DWORD[r9]
	lea	r9,[64+r9]
	movd	xmm3,DWORD[r10]
	lea	r10,[64+r10]
	movd	xmm4,DWORD[r11]
	lea	r11,[64+r11]
	punpckldq	xmm0,xmm3
	movd	xmm1,DWORD[((-60))+r8]
	punpckldq	xmm2,xmm4
	movd	xmm9,DWORD[((-60))+r9]
	punpckldq	xmm0,xmm2
	movd	xmm8,DWORD[((-60))+r10]
DB	102,15,56,0,197
	movd	xmm7,DWORD[((-60))+r11]
	punpckldq	xmm1,xmm8
	movdqa	xmm8,xmm10
	paddd	xmm14,xmm15
	punpckldq	xmm9,xmm7
	movdqa	xmm7,xmm11
	movdqa	xmm6,xmm11
	pslld	xmm8,5
	pandn	xmm7,xmm13
	pand	xmm6,xmm12
	punpckldq	xmm1,xmm9
	movdqa	xmm9,xmm10

	movdqa	XMMWORD[(0-128)+rax],xmm0
	paddd	xmm14,xmm0
	movd	xmm2,DWORD[((-56))+r8]
	psrld	xmm9,27
	pxor	xmm6,xmm7
	movdqa	xmm7,xmm11

	por	xmm8,xmm9
	movd	xmm9,DWORD[((-56))+r9]
	pslld	xmm7,30
	paddd	xmm14,xmm6

	psrld	xmm11,2
	paddd	xmm14,xmm8
DB	102,15,56,0,205
	movd	xmm8,DWORD[((-56))+r10]
	por	xmm11,xmm7
	movd	xmm7,DWORD[((-56))+r11]
	punpckldq	xmm2,xmm8
	movdqa	xmm8,xmm14
	paddd	xmm13,xmm15
	punpckldq	xmm9,xmm7
	movdqa	xmm7,xmm10
	movdqa	xmm6,xmm10
	pslld	xmm8,5
	pandn	xmm7,xmm12
	pand	xmm6,xmm11
	punpckldq	xmm2,xmm9
	movdqa	xmm9,xmm14

	movdqa	XMMWORD[(16-128)+rax],xmm1
	paddd	xmm13,xmm1
	movd	xmm3,DWORD[((-52))+r8]
	psrld	xmm9,27
	pxor	xmm6,xmm7
	movdqa	xmm7,xmm10

	por	xmm8,xmm9
	movd	xmm9,DWORD[((-52))+r9]
	pslld	xmm7,30
	paddd	xmm13,xmm6

	psrld	xmm10,2
	paddd	xmm13,xmm8
DB	102,15,56,0,213
	movd	xmm8,DWORD[((-52))+r10]
	por	xmm10,xmm7
	movd	xmm7,DWORD[((-52))+r11]
	punpckldq	xmm3,xmm8
	movdqa	xmm8,xmm13
	paddd	xmm12,xmm15
	punpckldq	xmm9,xmm7
	movdqa	xmm7,xmm14
	movdqa	xmm6,xmm14
	pslld	xmm8,5
	pandn	xmm7,xmm11
	pand	xmm6,xmm10
	punpckldq	xmm3,xmm9
	movdqa	xmm9,xmm13

	movdqa	XMMWORD[(32-128)+rax],xmm2
	paddd	xmm12,xmm2
	movd	xmm4,DWORD[((-48))+r8]
	psrld	xmm9,27
	pxor	xmm6,xmm7
	movdqa	xmm7,xmm14

	por	xmm8,xmm9
	movd	xmm9,DWORD[((-48))+r9]
	pslld	xmm7,30
	paddd	xmm12,xmm6

	psrld	xmm14,2
	paddd	xmm12,xmm8
DB	102,15,56,0,221
	movd	xmm8,DWORD[((-48))+r10]
	por	xmm14,xmm7
	movd	xmm7,DWORD[((-48))+r11]
	punpckldq	xmm4,xmm8
	movdqa	xmm8,xmm12
	paddd	xmm11,xmm15
	punpckldq	xmm9,xmm7
	movdqa	xmm7,xmm13
	movdqa	xmm6,xmm13
	pslld	xmm8,5
	pandn	xmm7,xmm10
	pand	xmm6,xmm14
	punpckldq	xmm4,xmm9
	movdqa	xmm9,xmm12

	movdqa	XMMWORD[(48-128)+rax],xmm3
	paddd	xmm11,xmm3
	movd	xmm0,DWORD[((-44))+r8]
	psrld	xmm9,27
	pxor	xmm6,xmm7
	movdqa	xmm7,xmm13

	por	xmm8,xmm9
	movd	xmm9,DWORD[((-44))+r9]
	pslld	xmm7,30
	paddd	xmm11,xmm6

	psrld	xmm13,2
	paddd	xmm11,xmm8
DB	102,15,56,0,229
	movd	xmm8,DWORD[((-44))+r10]
	por	xmm13,xmm7
	movd	xmm7,DWORD[((-44))+r11]
	punpckldq	xmm0,xmm8
	movdqa	xmm8,xmm11
	paddd	xmm10,xmm15
	punpckldq	xmm9,xmm7
	movdqa	xmm7,xmm12
	movdqa	xmm6,xmm12
	pslld	xmm8,5
	pandn	xmm7,xmm14
	pand	xmm6,xmm13
	punpckldq	xmm0,xmm9
	movdqa	xmm9,xmm11

	movdqa	XMMWORD[(64-128)+rax],xmm4
	paddd	xmm10,xmm4
	movd	xmm1,DWORD[((-40))+r8]
	psrld	xmm9,27
	pxor	xmm6,xmm7
	movdqa	xmm7,xmm12

	por	xmm8,xmm9
	movd	xmm9,DWORD[((-40))+r9]
	pslld	xmm7,30
	paddd	xmm10,xmm6

	psrld	xmm12,2
	paddd	xmm10,xmm8
DB	102,15,56,0,197
	movd	xmm8,DWORD[((-40))+r10]
	por	xmm12,xmm7
	movd	xmm7,DWORD[((-40))+r11]
	punpckldq	xmm1,xmm8
	movdqa	xmm8,xmm10
	paddd	xmm14,xmm15
	punpckldq	xmm9,xmm7
	movdqa	xmm7,xmm11
	movdqa	xmm6,xmm11
	pslld	xmm8,5
	pandn	xmm7,xmm13
	pand	xmm6,xmm12
	punpckldq	xmm1,xmm9
	movdqa	xmm9,xmm10

	movdqa	XMMWORD[(80-128)+rax],xmm0
	paddd	xmm14,xmm0
	movd	xmm2,DWORD[((-36))+r8]
	psrld	xmm9,27
	pxor	xmm6,xmm7
	movdqa	xmm7,xmm11

	por	xmm8,xmm9
	movd	xmm9,DWORD[((-36))+r9]
	pslld	xmm7,30
	paddd	xmm14,xmm6

	psrld	xmm11,2
	paddd	xmm14,xmm8
DB	102,15,56,0,205
	movd	xmm8,DWORD[((-36))+r10]
	por	xmm11,xmm7
	movd	xmm7,DWORD[((-36))+r11]
	punpckldq	xmm2,xmm8
	movdqa	xmm8,xmm14
	paddd	xmm13,xmm15
	punpckldq	xmm9,xmm7
	movdqa	xmm7,xmm10
	movdqa	xmm6,xmm10
	pslld	xmm8,5
	pandn	xmm7,xmm12
	pand	xmm6,xmm11
	punpckldq	xmm2,xmm9
	movdqa	xmm9,xmm14

	movdqa	XMMWORD[(96-128)+rax],xmm1
	paddd	xmm13,xmm1
	movd	xmm3,DWORD[((-32))+r8]
	psrld	xmm9,27
	pxor	xmm6,xmm7
	movdqa	xmm7,xmm10

	por	xmm8,xmm9
	movd	xmm9,DWORD[((-32))+r9]
	pslld	xmm7,30
	paddd	xmm13,xmm6

	psrld	xmm10,2
	paddd	xmm13,xmm8
DB	102,15,56,0,213
	movd	xmm8,DWORD[((-32))+r10]
	por	xmm10,xmm7
	movd	xmm7,DWORD[((-32))+r11]
	punpckldq	xmm3,xmm8
	movdqa	xmm8,xmm13
	paddd	xmm12,xmm15
	punpckldq	xmm9,xmm7
	movdqa	xmm7,xmm14
	movdqa	xmm6,xmm14
	pslld	xmm8,5
	pandn	xmm7,xmm11
	pand	xmm6,xmm10
	punpckldq	xmm3,xmm9
	movdqa	xmm9,xmm13

	movdqa	XMMWORD[(112-128)+rax],xmm2
	paddd	xmm12,xmm2
	movd	xmm4,DWORD[((-28))+r8]
	psrld	xmm9,27
	pxor	xmm6,xmm7
	movdqa	xmm7,xmm14

	por	xmm8,xmm9
	movd	xmm9,DWORD[((-28))+r9]
	pslld	xmm7,30
	paddd	xmm12,xmm6

	psrld	xmm14,2
	paddd	xmm12,xmm8
DB	102,15,56,0,221
	movd	xmm8,DWORD[((-28))+r10]
	por	xmm14,xmm7
	movd	xmm7,DWORD[((-28))+r11]
	punpckldq	xmm4,xmm8
	movdqa	xmm8,xmm12
	paddd	xmm11,xmm15
	punpckldq	xmm9,xmm7
	movdqa	xmm7,xmm13
	movdqa	xmm6,xmm13
	pslld	xmm8,5
	pandn	xmm7,xmm10
	pand	xmm6,xmm14
	punpckldq	xmm4,xmm9
	movdqa	xmm9,xmm12

	movdqa	XMMWORD[(128-128)+rax],xmm3
	paddd	xmm11,xmm3
	movd	xmm0,DWORD[((-24))+r8]
	psrld	xmm9,27
	pxor	xmm6,xmm7
	movdqa	xmm7,xmm13

	por	xmm8,xmm9
	movd	xmm9,DWORD[((-24))+r9]
	pslld	xmm7,30
	paddd	xmm11,xmm6

	psrld	xmm13,2
	paddd	xmm11,xmm8
DB	102,15,56,0,229
	movd	xmm8,DWORD[((-24))+r10]
	por	xmm13,xmm7
	movd	xmm7,DWORD[((-24))+r11]
	punpckldq	xmm0,xmm8
	movdqa	xmm8,xmm11
	paddd	xmm10,xmm15
	punpckldq	xmm9,xmm7
	movdqa	xmm7,xmm12
	movdqa	xmm6,xmm12
	pslld	xmm8,5
	pandn	xmm7,xmm14
	pand	xmm6,xmm13
	punpckldq	xmm0,xmm9
	movdqa	xmm9,xmm11

	movdqa	XMMWORD[(144-128)+rax],xmm4
	paddd	xmm10,xmm4
	movd	xmm1,DWORD[((-20))+r8]
	psrld	xmm9,27
	pxor	xmm6,xmm7
	movdqa	xmm7,xmm12

	por	xmm8,xmm9
	movd	xmm9,DWORD[((-20))+r9]
	pslld	xmm7,30
	paddd	xmm10,xmm6

	psrld	xmm12,2
	paddd	xmm10,xmm8
DB	102,15,56,0,197
	movd	xmm8,DWORD[((-20))+r10]
	por	xmm12,xmm7
	movd	xmm7,DWORD[((-20))+r11]
	punpckldq	xmm1,xmm8
	movdqa	xmm8,xmm10
	paddd	xmm14,xmm15
	punpckldq	xmm9,xmm7
	movdqa	xmm7,xmm11
	movdqa	xmm6,xmm11
	pslld	xmm8,5
	pandn	xmm7,xmm13
	pand	xmm6,xmm12
	punpckldq	xmm1,xmm9
	movdqa	xmm9,xmm10

	movdqa	XMMWORD[(160-128)+rax],xmm0
	paddd	xmm14,xmm0
	movd	xmm2,DWORD[((-16))+r8]
	psrld	xmm9,27
	pxor	xmm6,xmm7
	movdqa	xmm7,xmm11

	por	xmm8,xmm9
	movd	xmm9,DWORD[((-16))+r9]
	pslld	xmm7,30
	paddd	xmm14,xmm6

	psrld	xmm11,2
	paddd	xmm14,xmm8
DB	102,15,56,0,205
	movd	xmm8,DWORD[((-16))+r10]
	por	xmm11,xmm7
	movd	xmm7,DWORD[((-16))+r11]
	punpckldq	xmm2,xmm8
	movdqa	xmm8,xmm14
	paddd	xmm13,xmm15
	punpckldq	xmm9,xmm7
	movdqa	xmm7,xmm10
	movdqa	xmm6,xmm10
	pslld	xmm8,5
	pandn	xmm7,xmm12
	pand	xmm6,xmm11
	punpckldq	xmm2,xmm9
	movdqa	xmm9,xmm14

	movdqa	XMMWORD[(176-128)+rax],xmm1
	paddd	xmm13,xmm1
	movd	xmm3,DWORD[((-12))+r8]
	psrld	xmm9,27
	pxor	xmm6,xmm7
	movdqa	xmm7,xmm10

	por	xmm8,xmm9
	movd	xmm9,DWORD[((-12))+r9]
	pslld	xmm7,30
	paddd	xmm13,xmm6

	psrld	xmm10,2
	paddd	xmm13,xmm8
DB	102,15,56,0,213
	movd	xmm8,DWORD[((-12))+r10]
	por	xmm10,xmm7
	movd	xmm7,DWORD[((-12))+r11]
	punpckldq	xmm3,xmm8
	movdqa	xmm8,xmm13
	paddd	xmm12,xmm15
	punpckldq	xmm9,xmm7
	movdqa	xmm7,xmm14
	movdqa	xmm6,xmm14
	pslld	xmm8,5
	pandn	xmm7,xmm11
	pand	xmm6,xmm10
	punpckldq	xmm3,xmm9
	movdqa	xmm9,xmm13

	movdqa	XMMWORD[(192-128)+rax],xmm2
	paddd	xmm12,xmm2
	movd	xmm4,DWORD[((-8))+r8]
	psrld	xmm9,27
	pxor	xmm6,xmm7
	movdqa	xmm7,xmm14

	por	xmm8,xmm9
	movd	xmm9,DWORD[((-8))+r9]
	pslld	xmm7,30
	paddd	xmm12,xmm6

	psrld	xmm14,2
	paddd	xmm12,xmm8
DB	102,15,56,0,221
	movd	xmm8,DWORD[((-8))+r10]
	por	xmm14,xmm7
	movd	xmm7,DWORD[((-8))+r11]
	punpckldq	xmm4,xmm8
	movdqa	xmm8,xmm12
	paddd	xmm11,xmm15
	punpckldq	xmm9,xmm7
	movdqa	xmm7,xmm13
	movdqa	xmm6,xmm13
	pslld	xmm8,5
	pandn	xmm7,xmm10
	pand	xmm6,xmm14
	punpckldq	xmm4,xmm9
	movdqa	xmm9,xmm12

	movdqa	XMMWORD[(208-128)+rax],xmm3
	paddd	xmm11,xmm3
	movd	xmm0,DWORD[((-4))+r8]
	psrld	xmm9,27
	pxor	xmm6,xmm7
	movdqa	xmm7,xmm13

	por	xmm8,xmm9
	movd	xmm9,DWORD[((-4))+r9]
	pslld	xmm7,30
	paddd	xmm11,xmm6

	psrld	xmm13,2
	paddd	xmm11,xmm8
DB	102,15,56,0,229
	movd	xmm8,DWORD[((-4))+r10]
	por	xmm13,xmm7
	movdqa	xmm1,XMMWORD[((0-128))+rax]
	movd	xmm7,DWORD[((-4))+r11]
	punpckldq	xmm0,xmm8
	movdqa	xmm8,xmm11
	paddd	xmm10,xmm15
	punpckldq	xmm9,xmm7
	movdqa	xmm7,xmm12
	movdqa	xmm6,xmm12
	pslld	xmm8,5
	prefetcht0	[63+r8]
	pandn	xmm7,xmm14
	pand	xmm6,xmm13
	punpckldq	xmm0,xmm9
	movdqa	xmm9,xmm11

	movdqa	XMMWORD[(224-128)+rax],xmm4
	paddd	xmm10,xmm4
	psrld	xmm9,27
	pxor	xmm6,xmm7
	movdqa	xmm7,xmm12
	prefetcht0	[63+r9]

	por	xmm8,xmm9
	pslld	xmm7,30
	paddd	xmm10,xmm6
	prefetcht0	[63+r10]

	psrld	xmm12,2
	paddd	xmm10,xmm8
DB	102,15,56,0,197
	prefetcht0	[63+r11]
	por	xmm12,xmm7
	movdqa	xmm2,XMMWORD[((16-128))+rax]
	pxor	xmm1,xmm3
	movdqa	xmm3,XMMWORD[((32-128))+rax]

	movdqa	xmm8,xmm10
	pxor	xmm1,XMMWORD[((128-128))+rax]
	paddd	xmm14,xmm15
	movdqa	xmm7,xmm11
	pslld	xmm8,5
	pxor	xmm1,xmm3
	movdqa	xmm6,xmm11
	pandn	xmm7,xmm13
	movdqa	xmm5,xmm1
	pand	xmm6,xmm12
	movdqa	xmm9,xmm10
	psrld	xmm5,31
	paddd	xmm1,xmm1

	movdqa	XMMWORD[(240-128)+rax],xmm0
	paddd	xmm14,xmm0
	psrld	xmm9,27
	pxor	xmm6,xmm7

	movdqa	xmm7,xmm11
	por	xmm8,xmm9
	pslld	xmm7,30
	paddd	xmm14,xmm6

	psrld	xmm11,2
	paddd	xmm14,xmm8
	por	xmm1,xmm5
	por	xmm11,xmm7
	pxor	xmm2,xmm4
	movdqa	xmm4,XMMWORD[((48-128))+rax]

	movdqa	xmm8,xmm14
	pxor	xmm2,XMMWORD[((144-128))+rax]
	paddd	xmm13,xmm15
	movdqa	xmm7,xmm10
	pslld	xmm8,5
	pxor	xmm2,xmm4
	movdqa	xmm6,xmm10
	pandn	xmm7,xmm12
	movdqa	xmm5,xmm2
	pand	xmm6,xmm11
	movdqa	xmm9,xmm14
	psrld	xmm5,31
	paddd	xmm2,xmm2

	movdqa	XMMWORD[(0-128)+rax],xmm1
	paddd	xmm13,xmm1
	psrld	xmm9,27
	pxor	xmm6,xmm7

	movdqa	xmm7,xmm10
	por	xmm8,xmm9
	pslld	xmm7,30
	paddd	xmm13,xmm6

	psrld	xmm10,2
	paddd	xmm13,xmm8
	por	xmm2,xmm5
	por	xmm10,xmm7
	pxor	xmm3,xmm0
	movdqa	xmm0,XMMWORD[((64-128))+rax]

	movdqa	xmm8,xmm13
	pxor	xmm3,XMMWORD[((160-128))+rax]
	paddd	xmm12,xmm15
	movdqa	xmm7,xmm14
	pslld	xmm8,5
	pxor	xmm3,xmm0
	movdqa	xmm6,xmm14
	pandn	xmm7,xmm11
	movdqa	xmm5,xmm3
	pand	xmm6,xmm10
	movdqa	xmm9,xmm13
	psrld	xmm5,31
	paddd	xmm3,xmm3

	movdqa	XMMWORD[(16-128)+rax],xmm2
	paddd	xmm12,xmm2
	psrld	xmm9,27
	pxor	xmm6,xmm7

	movdqa	xmm7,xmm14
	por	xmm8,xmm9
	pslld	xmm7,30
	paddd	xmm12,xmm6

	psrld	xmm14,2
	paddd	xmm12,xmm8
	por	xmm3,xmm5
	por	xmm14,xmm7
	pxor	xmm4,xmm1
	movdqa	xmm1,XMMWORD[((80-128))+rax]

	movdqa	xmm8,xmm12
	pxor	xmm4,XMMWORD[((176-128))+rax]
	paddd	xmm11,xmm15
	movdqa	xmm7,xmm13
	pslld	xmm8,5
	pxor	xmm4,xmm1
	movdqa	xmm6,xmm13
	pandn	xmm7,xmm10
	movdqa	xmm5,xmm4
	pand	xmm6,xmm14
	movdqa	xmm9,xmm12
	psrld	xmm5,31
	paddd	xmm4,xmm4

	movdqa	XMMWORD[(32-128)+rax],xmm3
	paddd	xmm11,xmm3
	psrld	xmm9,27
	pxor	xmm6,xmm7

	movdqa	xmm7,xmm13
	por	xmm8,xmm9
	pslld	xmm7,30
	paddd	xmm11,xmm6

	psrld	xmm13,2
	paddd	xmm11,xmm8
	por	xmm4,xmm5
	por	xmm13,xmm7
	pxor	xmm0,xmm2
	movdqa	xmm2,XMMWORD[((96-128))+rax]

	movdqa	xmm8,xmm11
	pxor	xmm0,XMMWORD[((192-128))+rax]
	paddd	xmm10,xmm15
	movdqa	xmm7,xmm12
	pslld	xmm8,5
	pxor	xmm0,xmm2
	movdqa	xmm6,xmm12
	pandn	xmm7,xmm14
	movdqa	xmm5,xmm0
	pand	xmm6,xmm13
	movdqa	xmm9,xmm11
	psrld	xmm5,31
	paddd	xmm0,xmm0

	movdqa	XMMWORD[(48-128)+rax],xmm4
	paddd	xmm10,xmm4
	psrld	xmm9,27
	pxor	xmm6,xmm7

	movdqa	xmm7,xmm12
	por	xmm8,xmm9
	pslld	xmm7,30
	paddd	xmm10,xmm6

	psrld	xmm12,2
	paddd	xmm10,xmm8
	por	xmm0,xmm5
	por	xmm12,xmm7
	movdqa	xmm15,XMMWORD[rbp]
	pxor	xmm1,xmm3
	movdqa	xmm3,XMMWORD[((112-128))+rax]

	movdqa	xmm8,xmm10
	movdqa	xmm6,xmm13
	pxor	xmm1,XMMWORD[((208-128))+rax]
	paddd	xmm14,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm11

	movdqa	xmm9,xmm10
	movdqa	XMMWORD[(64-128)+rax],xmm0
	paddd	xmm14,xmm0
	pxor	xmm1,xmm3
	psrld	xmm9,27
	pxor	xmm6,xmm12
	movdqa	xmm7,xmm11

	pslld	xmm7,30
	movdqa	xmm5,xmm1
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm14,xmm6
	paddd	xmm1,xmm1

	psrld	xmm11,2
	paddd	xmm14,xmm8
	por	xmm1,xmm5
	por	xmm11,xmm7
	pxor	xmm2,xmm4
	movdqa	xmm4,XMMWORD[((128-128))+rax]

	movdqa	xmm8,xmm14
	movdqa	xmm6,xmm12
	pxor	xmm2,XMMWORD[((224-128))+rax]
	paddd	xmm13,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm10

	movdqa	xmm9,xmm14
	movdqa	XMMWORD[(80-128)+rax],xmm1
	paddd	xmm13,xmm1
	pxor	xmm2,xmm4
	psrld	xmm9,27
	pxor	xmm6,xmm11
	movdqa	xmm7,xmm10

	pslld	xmm7,30
	movdqa	xmm5,xmm2
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm13,xmm6
	paddd	xmm2,xmm2

	psrld	xmm10,2
	paddd	xmm13,xmm8
	por	xmm2,xmm5
	por	xmm10,xmm7
	pxor	xmm3,xmm0
	movdqa	xmm0,XMMWORD[((144-128))+rax]

	movdqa	xmm8,xmm13
	movdqa	xmm6,xmm11
	pxor	xmm3,XMMWORD[((240-128))+rax]
	paddd	xmm12,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm14

	movdqa	xmm9,xmm13
	movdqa	XMMWORD[(96-128)+rax],xmm2
	paddd	xmm12,xmm2
	pxor	xmm3,xmm0
	psrld	xmm9,27
	pxor	xmm6,xmm10
	movdqa	xmm7,xmm14

	pslld	xmm7,30
	movdqa	xmm5,xmm3
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm12,xmm6
	paddd	xmm3,xmm3

	psrld	xmm14,2
	paddd	xmm12,xmm8
	por	xmm3,xmm5
	por	xmm14,xmm7
	pxor	xmm4,xmm1
	movdqa	xmm1,XMMWORD[((160-128))+rax]

	movdqa	xmm8,xmm12
	movdqa	xmm6,xmm10
	pxor	xmm4,XMMWORD[((0-128))+rax]
	paddd	xmm11,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm13

	movdqa	xmm9,xmm12
	movdqa	XMMWORD[(112-128)+rax],xmm3
	paddd	xmm11,xmm3
	pxor	xmm4,xmm1
	psrld	xmm9,27
	pxor	xmm6,xmm14
	movdqa	xmm7,xmm13

	pslld	xmm7,30
	movdqa	xmm5,xmm4
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm11,xmm6
	paddd	xmm4,xmm4

	psrld	xmm13,2
	paddd	xmm11,xmm8
	por	xmm4,xmm5
	por	xmm13,xmm7
	pxor	xmm0,xmm2
	movdqa	xmm2,XMMWORD[((176-128))+rax]

	movdqa	xmm8,xmm11
	movdqa	xmm6,xmm14
	pxor	xmm0,XMMWORD[((16-128))+rax]
	paddd	xmm10,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm12

	movdqa	xmm9,xmm11
	movdqa	XMMWORD[(128-128)+rax],xmm4
	paddd	xmm10,xmm4
	pxor	xmm0,xmm2
	psrld	xmm9,27
	pxor	xmm6,xmm13
	movdqa	xmm7,xmm12

	pslld	xmm7,30
	movdqa	xmm5,xmm0
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm10,xmm6
	paddd	xmm0,xmm0

	psrld	xmm12,2
	paddd	xmm10,xmm8
	por	xmm0,xmm5
	por	xmm12,xmm7
	pxor	xmm1,xmm3
	movdqa	xmm3,XMMWORD[((192-128))+rax]

	movdqa	xmm8,xmm10
	movdqa	xmm6,xmm13
	pxor	xmm1,XMMWORD[((32-128))+rax]
	paddd	xmm14,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm11

	movdqa	xmm9,xmm10
	movdqa	XMMWORD[(144-128)+rax],xmm0
	paddd	xmm14,xmm0
	pxor	xmm1,xmm3
	psrld	xmm9,27
	pxor	xmm6,xmm12
	movdqa	xmm7,xmm11

	pslld	xmm7,30
	movdqa	xmm5,xmm1
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm14,xmm6
	paddd	xmm1,xmm1

	psrld	xmm11,2
	paddd	xmm14,xmm8
	por	xmm1,xmm5
	por	xmm11,xmm7
	pxor	xmm2,xmm4
	movdqa	xmm4,XMMWORD[((208-128))+rax]

	movdqa	xmm8,xmm14
	movdqa	xmm6,xmm12
	pxor	xmm2,XMMWORD[((48-128))+rax]
	paddd	xmm13,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm10

	movdqa	xmm9,xmm14
	movdqa	XMMWORD[(160-128)+rax],xmm1
	paddd	xmm13,xmm1
	pxor	xmm2,xmm4
	psrld	xmm9,27
	pxor	xmm6,xmm11
	movdqa	xmm7,xmm10

	pslld	xmm7,30
	movdqa	xmm5,xmm2
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm13,xmm6
	paddd	xmm2,xmm2

	psrld	xmm10,2
	paddd	xmm13,xmm8
	por	xmm2,xmm5
	por	xmm10,xmm7
	pxor	xmm3,xmm0
	movdqa	xmm0,XMMWORD[((224-128))+rax]

	movdqa	xmm8,xmm13
	movdqa	xmm6,xmm11
	pxor	xmm3,XMMWORD[((64-128))+rax]
	paddd	xmm12,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm14

	movdqa	xmm9,xmm13
	movdqa	XMMWORD[(176-128)+rax],xmm2
	paddd	xmm12,xmm2
	pxor	xmm3,xmm0
	psrld	xmm9,27
	pxor	xmm6,xmm10
	movdqa	xmm7,xmm14

	pslld	xmm7,30
	movdqa	xmm5,xmm3
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm12,xmm6
	paddd	xmm3,xmm3

	psrld	xmm14,2
	paddd	xmm12,xmm8
	por	xmm3,xmm5
	por	xmm14,xmm7
	pxor	xmm4,xmm1
	movdqa	xmm1,XMMWORD[((240-128))+rax]

	movdqa	xmm8,xmm12
	movdqa	xmm6,xmm10
	pxor	xmm4,XMMWORD[((80-128))+rax]
	paddd	xmm11,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm13

	movdqa	xmm9,xmm12
	movdqa	XMMWORD[(192-128)+rax],xmm3
	paddd	xmm11,xmm3
	pxor	xmm4,xmm1
	psrld	xmm9,27
	pxor	xmm6,xmm14
	movdqa	xmm7,xmm13

	pslld	xmm7,30
	movdqa	xmm5,xmm4
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm11,xmm6
	paddd	xmm4,xmm4

	psrld	xmm13,2
	paddd	xmm11,xmm8
	por	xmm4,xmm5
	por	xmm13,xmm7
	pxor	xmm0,xmm2
	movdqa	xmm2,XMMWORD[((0-128))+rax]

	movdqa	xmm8,xmm11
	movdqa	xmm6,xmm14
	pxor	xmm0,XMMWORD[((96-128))+rax]
	paddd	xmm10,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm12

	movdqa	xmm9,xmm11
	movdqa	XMMWORD[(208-128)+rax],xmm4
	paddd	xmm10,xmm4
	pxor	xmm0,xmm2
	psrld	xmm9,27
	pxor	xmm6,xmm13
	movdqa	xmm7,xmm12

	pslld	xmm7,30
	movdqa	xmm5,xmm0
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm10,xmm6
	paddd	xmm0,xmm0

	psrld	xmm12,2
	paddd	xmm10,xmm8
	por	xmm0,xmm5
	por	xmm12,xmm7
	pxor	xmm1,xmm3
	movdqa	xmm3,XMMWORD[((16-128))+rax]

	movdqa	xmm8,xmm10
	movdqa	xmm6,xmm13
	pxor	xmm1,XMMWORD[((112-128))+rax]
	paddd	xmm14,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm11

	movdqa	xmm9,xmm10
	movdqa	XMMWORD[(224-128)+rax],xmm0
	paddd	xmm14,xmm0
	pxor	xmm1,xmm3
	psrld	xmm9,27
	pxor	xmm6,xmm12
	movdqa	xmm7,xmm11

	pslld	xmm7,30
	movdqa	xmm5,xmm1
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm14,xmm6
	paddd	xmm1,xmm1

	psrld	xmm11,2
	paddd	xmm14,xmm8
	por	xmm1,xmm5
	por	xmm11,xmm7
	pxor	xmm2,xmm4
	movdqa	xmm4,XMMWORD[((32-128))+rax]

	movdqa	xmm8,xmm14
	movdqa	xmm6,xmm12
	pxor	xmm2,XMMWORD[((128-128))+rax]
	paddd	xmm13,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm10

	movdqa	xmm9,xmm14
	movdqa	XMMWORD[(240-128)+rax],xmm1
	paddd	xmm13,xmm1
	pxor	xmm2,xmm4
	psrld	xmm9,27
	pxor	xmm6,xmm11
	movdqa	xmm7,xmm10

	pslld	xmm7,30
	movdqa	xmm5,xmm2
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm13,xmm6
	paddd	xmm2,xmm2

	psrld	xmm10,2
	paddd	xmm13,xmm8
	por	xmm2,xmm5
	por	xmm10,xmm7
	pxor	xmm3,xmm0
	movdqa	xmm0,XMMWORD[((48-128))+rax]

	movdqa	xmm8,xmm13
	movdqa	xmm6,xmm11
	pxor	xmm3,XMMWORD[((144-128))+rax]
	paddd	xmm12,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm14

	movdqa	xmm9,xmm13
	movdqa	XMMWORD[(0-128)+rax],xmm2
	paddd	xmm12,xmm2
	pxor	xmm3,xmm0
	psrld	xmm9,27
	pxor	xmm6,xmm10
	movdqa	xmm7,xmm14

	pslld	xmm7,30
	movdqa	xmm5,xmm3
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm12,xmm6
	paddd	xmm3,xmm3

	psrld	xmm14,2
	paddd	xmm12,xmm8
	por	xmm3,xmm5
	por	xmm14,xmm7
	pxor	xmm4,xmm1
	movdqa	xmm1,XMMWORD[((64-128))+rax]

	movdqa	xmm8,xmm12
	movdqa	xmm6,xmm10
	pxor	xmm4,XMMWORD[((160-128))+rax]
	paddd	xmm11,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm13

	movdqa	xmm9,xmm12
	movdqa	XMMWORD[(16-128)+rax],xmm3
	paddd	xmm11,xmm3
	pxor	xmm4,xmm1
	psrld	xmm9,27
	pxor	xmm6,xmm14
	movdqa	xmm7,xmm13

	pslld	xmm7,30
	movdqa	xmm5,xmm4
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm11,xmm6
	paddd	xmm4,xmm4

	psrld	xmm13,2
	paddd	xmm11,xmm8
	por	xmm4,xmm5
	por	xmm13,xmm7
	pxor	xmm0,xmm2
	movdqa	xmm2,XMMWORD[((80-128))+rax]

	movdqa	xmm8,xmm11
	movdqa	xmm6,xmm14
	pxor	xmm0,XMMWORD[((176-128))+rax]
	paddd	xmm10,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm12

	movdqa	xmm9,xmm11
	movdqa	XMMWORD[(32-128)+rax],xmm4
	paddd	xmm10,xmm4
	pxor	xmm0,xmm2
	psrld	xmm9,27
	pxor	xmm6,xmm13
	movdqa	xmm7,xmm12

	pslld	xmm7,30
	movdqa	xmm5,xmm0
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm10,xmm6
	paddd	xmm0,xmm0

	psrld	xmm12,2
	paddd	xmm10,xmm8
	por	xmm0,xmm5
	por	xmm12,xmm7
	pxor	xmm1,xmm3
	movdqa	xmm3,XMMWORD[((96-128))+rax]

	movdqa	xmm8,xmm10
	movdqa	xmm6,xmm13
	pxor	xmm1,XMMWORD[((192-128))+rax]
	paddd	xmm14,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm11

	movdqa	xmm9,xmm10
	movdqa	XMMWORD[(48-128)+rax],xmm0
	paddd	xmm14,xmm0
	pxor	xmm1,xmm3
	psrld	xmm9,27
	pxor	xmm6,xmm12
	movdqa	xmm7,xmm11

	pslld	xmm7,30
	movdqa	xmm5,xmm1
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm14,xmm6
	paddd	xmm1,xmm1

	psrld	xmm11,2
	paddd	xmm14,xmm8
	por	xmm1,xmm5
	por	xmm11,xmm7
	pxor	xmm2,xmm4
	movdqa	xmm4,XMMWORD[((112-128))+rax]

	movdqa	xmm8,xmm14
	movdqa	xmm6,xmm12
	pxor	xmm2,XMMWORD[((208-128))+rax]
	paddd	xmm13,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm10

	movdqa	xmm9,xmm14
	movdqa	XMMWORD[(64-128)+rax],xmm1
	paddd	xmm13,xmm1
	pxor	xmm2,xmm4
	psrld	xmm9,27
	pxor	xmm6,xmm11
	movdqa	xmm7,xmm10

	pslld	xmm7,30
	movdqa	xmm5,xmm2
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm13,xmm6
	paddd	xmm2,xmm2

	psrld	xmm10,2
	paddd	xmm13,xmm8
	por	xmm2,xmm5
	por	xmm10,xmm7
	pxor	xmm3,xmm0
	movdqa	xmm0,XMMWORD[((128-128))+rax]

	movdqa	xmm8,xmm13
	movdqa	xmm6,xmm11
	pxor	xmm3,XMMWORD[((224-128))+rax]
	paddd	xmm12,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm14

	movdqa	xmm9,xmm13
	movdqa	XMMWORD[(80-128)+rax],xmm2
	paddd	xmm12,xmm2
	pxor	xmm3,xmm0
	psrld	xmm9,27
	pxor	xmm6,xmm10
	movdqa	xmm7,xmm14

	pslld	xmm7,30
	movdqa	xmm5,xmm3
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm12,xmm6
	paddd	xmm3,xmm3

	psrld	xmm14,2
	paddd	xmm12,xmm8
	por	xmm3,xmm5
	por	xmm14,xmm7
	pxor	xmm4,xmm1
	movdqa	xmm1,XMMWORD[((144-128))+rax]

	movdqa	xmm8,xmm12
	movdqa	xmm6,xmm10
	pxor	xmm4,XMMWORD[((240-128))+rax]
	paddd	xmm11,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm13

	movdqa	xmm9,xmm12
	movdqa	XMMWORD[(96-128)+rax],xmm3
	paddd	xmm11,xmm3
	pxor	xmm4,xmm1
	psrld	xmm9,27
	pxor	xmm6,xmm14
	movdqa	xmm7,xmm13

	pslld	xmm7,30
	movdqa	xmm5,xmm4
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm11,xmm6
	paddd	xmm4,xmm4

	psrld	xmm13,2
	paddd	xmm11,xmm8
	por	xmm4,xmm5
	por	xmm13,xmm7
	pxor	xmm0,xmm2
	movdqa	xmm2,XMMWORD[((160-128))+rax]

	movdqa	xmm8,xmm11
	movdqa	xmm6,xmm14
	pxor	xmm0,XMMWORD[((0-128))+rax]
	paddd	xmm10,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm12

	movdqa	xmm9,xmm11
	movdqa	XMMWORD[(112-128)+rax],xmm4
	paddd	xmm10,xmm4
	pxor	xmm0,xmm2
	psrld	xmm9,27
	pxor	xmm6,xmm13
	movdqa	xmm7,xmm12

	pslld	xmm7,30
	movdqa	xmm5,xmm0
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm10,xmm6
	paddd	xmm0,xmm0

	psrld	xmm12,2
	paddd	xmm10,xmm8
	por	xmm0,xmm5
	por	xmm12,xmm7
	movdqa	xmm15,XMMWORD[32+rbp]
	pxor	xmm1,xmm3
	movdqa	xmm3,XMMWORD[((176-128))+rax]

	movdqa	xmm8,xmm10
	movdqa	xmm7,xmm13
	pxor	xmm1,XMMWORD[((16-128))+rax]
	pxor	xmm1,xmm3
	paddd	xmm14,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm10
	pand	xmm7,xmm12

	movdqa	xmm6,xmm13
	movdqa	xmm5,xmm1
	psrld	xmm9,27
	paddd	xmm14,xmm7
	pxor	xmm6,xmm12

	movdqa	XMMWORD[(128-128)+rax],xmm0
	paddd	xmm14,xmm0
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm11
	movdqa	xmm7,xmm11

	pslld	xmm7,30
	paddd	xmm1,xmm1
	paddd	xmm14,xmm6

	psrld	xmm11,2
	paddd	xmm14,xmm8
	por	xmm1,xmm5
	por	xmm11,xmm7
	pxor	xmm2,xmm4
	movdqa	xmm4,XMMWORD[((192-128))+rax]

	movdqa	xmm8,xmm14
	movdqa	xmm7,xmm12
	pxor	xmm2,XMMWORD[((32-128))+rax]
	pxor	xmm2,xmm4
	paddd	xmm13,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm14
	pand	xmm7,xmm11

	movdqa	xmm6,xmm12
	movdqa	xmm5,xmm2
	psrld	xmm9,27
	paddd	xmm13,xmm7
	pxor	xmm6,xmm11

	movdqa	XMMWORD[(144-128)+rax],xmm1
	paddd	xmm13,xmm1
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm10
	movdqa	xmm7,xmm10

	pslld	xmm7,30
	paddd	xmm2,xmm2
	paddd	xmm13,xmm6

	psrld	xmm10,2
	paddd	xmm13,xmm8
	por	xmm2,xmm5
	por	xmm10,xmm7
	pxor	xmm3,xmm0
	movdqa	xmm0,XMMWORD[((208-128))+rax]

	movdqa	xmm8,xmm13
	movdqa	xmm7,xmm11
	pxor	xmm3,XMMWORD[((48-128))+rax]
	pxor	xmm3,xmm0
	paddd	xmm12,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm13
	pand	xmm7,xmm10

	movdqa	xmm6,xmm11
	movdqa	xmm5,xmm3
	psrld	xmm9,27
	paddd	xmm12,xmm7
	pxor	xmm6,xmm10

	movdqa	XMMWORD[(160-128)+rax],xmm2
	paddd	xmm12,xmm2
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm14
	movdqa	xmm7,xmm14

	pslld	xmm7,30
	paddd	xmm3,xmm3
	paddd	xmm12,xmm6

	psrld	xmm14,2
	paddd	xmm12,xmm8
	por	xmm3,xmm5
	por	xmm14,xmm7
	pxor	xmm4,xmm1
	movdqa	xmm1,XMMWORD[((224-128))+rax]

	movdqa	xmm8,xmm12
	movdqa	xmm7,xmm10
	pxor	xmm4,XMMWORD[((64-128))+rax]
	pxor	xmm4,xmm1
	paddd	xmm11,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm12
	pand	xmm7,xmm14

	movdqa	xmm6,xmm10
	movdqa	xmm5,xmm4
	psrld	xmm9,27
	paddd	xmm11,xmm7
	pxor	xmm6,xmm14

	movdqa	XMMWORD[(176-128)+rax],xmm3
	paddd	xmm11,xmm3
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm13
	movdqa	xmm7,xmm13

	pslld	xmm7,30
	paddd	xmm4,xmm4
	paddd	xmm11,xmm6

	psrld	xmm13,2
	paddd	xmm11,xmm8
	por	xmm4,xmm5
	por	xmm13,xmm7
	pxor	xmm0,xmm2
	movdqa	xmm2,XMMWORD[((240-128))+rax]

	movdqa	xmm8,xmm11
	movdqa	xmm7,xmm14
	pxor	xmm0,XMMWORD[((80-128))+rax]
	pxor	xmm0,xmm2
	paddd	xmm10,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm11
	pand	xmm7,xmm13

	movdqa	xmm6,xmm14
	movdqa	xmm5,xmm0
	psrld	xmm9,27
	paddd	xmm10,xmm7
	pxor	xmm6,xmm13

	movdqa	XMMWORD[(192-128)+rax],xmm4
	paddd	xmm10,xmm4
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm12
	movdqa	xmm7,xmm12

	pslld	xmm7,30
	paddd	xmm0,xmm0
	paddd	xmm10,xmm6

	psrld	xmm12,2
	paddd	xmm10,xmm8
	por	xmm0,xmm5
	por	xmm12,xmm7
	pxor	xmm1,xmm3
	movdqa	xmm3,XMMWORD[((0-128))+rax]

	movdqa	xmm8,xmm10
	movdqa	xmm7,xmm13
	pxor	xmm1,XMMWORD[((96-128))+rax]
	pxor	xmm1,xmm3
	paddd	xmm14,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm10
	pand	xmm7,xmm12

	movdqa	xmm6,xmm13
	movdqa	xmm5,xmm1
	psrld	xmm9,27
	paddd	xmm14,xmm7
	pxor	xmm6,xmm12

	movdqa	XMMWORD[(208-128)+rax],xmm0
	paddd	xmm14,xmm0
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm11
	movdqa	xmm7,xmm11

	pslld	xmm7,30
	paddd	xmm1,xmm1
	paddd	xmm14,xmm6

	psrld	xmm11,2
	paddd	xmm14,xmm8
	por	xmm1,xmm5
	por	xmm11,xmm7
	pxor	xmm2,xmm4
	movdqa	xmm4,XMMWORD[((16-128))+rax]

	movdqa	xmm8,xmm14
	movdqa	xmm7,xmm12
	pxor	xmm2,XMMWORD[((112-128))+rax]
	pxor	xmm2,xmm4
	paddd	xmm13,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm14
	pand	xmm7,xmm11

	movdqa	xmm6,xmm12
	movdqa	xmm5,xmm2
	psrld	xmm9,27
	paddd	xmm13,xmm7
	pxor	xmm6,xmm11

	movdqa	XMMWORD[(224-128)+rax],xmm1
	paddd	xmm13,xmm1
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm10
	movdqa	xmm7,xmm10

	pslld	xmm7,30
	paddd	xmm2,xmm2
	paddd	xmm13,xmm6

	psrld	xmm10,2
	paddd	xmm13,xmm8
	por	xmm2,xmm5
	por	xmm10,xmm7
	pxor	xmm3,xmm0
	movdqa	xmm0,XMMWORD[((32-128))+rax]

	movdqa	xmm8,xmm13
	movdqa	xmm7,xmm11
	pxor	xmm3,XMMWORD[((128-128))+rax]
	pxor	xmm3,xmm0
	paddd	xmm12,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm13
	pand	xmm7,xmm10

	movdqa	xmm6,xmm11
	movdqa	xmm5,xmm3
	psrld	xmm9,27
	paddd	xmm12,xmm7
	pxor	xmm6,xmm10

	movdqa	XMMWORD[(240-128)+rax],xmm2
	paddd	xmm12,xmm2
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm14
	movdqa	xmm7,xmm14

	pslld	xmm7,30
	paddd	xmm3,xmm3
	paddd	xmm12,xmm6

	psrld	xmm14,2
	paddd	xmm12,xmm8
	por	xmm3,xmm5
	por	xmm14,xmm7
	pxor	xmm4,xmm1
	movdqa	xmm1,XMMWORD[((48-128))+rax]

	movdqa	xmm8,xmm12
	movdqa	xmm7,xmm10
	pxor	xmm4,XMMWORD[((144-128))+rax]
	pxor	xmm4,xmm1
	paddd	xmm11,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm12
	pand	xmm7,xmm14

	movdqa	xmm6,xmm10
	movdqa	xmm5,xmm4
	psrld	xmm9,27
	paddd	xmm11,xmm7
	pxor	xmm6,xmm14

	movdqa	XMMWORD[(0-128)+rax],xmm3
	paddd	xmm11,xmm3
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm13
	movdqa	xmm7,xmm13

	pslld	xmm7,30
	paddd	xmm4,xmm4
	paddd	xmm11,xmm6

	psrld	xmm13,2
	paddd	xmm11,xmm8
	por	xmm4,xmm5
	por	xmm13,xmm7
	pxor	xmm0,xmm2
	movdqa	xmm2,XMMWORD[((64-128))+rax]

	movdqa	xmm8,xmm11
	movdqa	xmm7,xmm14
	pxor	xmm0,XMMWORD[((160-128))+rax]
	pxor	xmm0,xmm2
	paddd	xmm10,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm11
	pand	xmm7,xmm13

	movdqa	xmm6,xmm14
	movdqa	xmm5,xmm0
	psrld	xmm9,27
	paddd	xmm10,xmm7
	pxor	xmm6,xmm13

	movdqa	XMMWORD[(16-128)+rax],xmm4
	paddd	xmm10,xmm4
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm12
	movdqa	xmm7,xmm12

	pslld	xmm7,30
	paddd	xmm0,xmm0
	paddd	xmm10,xmm6

	psrld	xmm12,2
	paddd	xmm10,xmm8
	por	xmm0,xmm5
	por	xmm12,xmm7
	pxor	xmm1,xmm3
	movdqa	xmm3,XMMWORD[((80-128))+rax]

	movdqa	xmm8,xmm10
	movdqa	xmm7,xmm13
	pxor	xmm1,XMMWORD[((176-128))+rax]
	pxor	xmm1,xmm3
	paddd	xmm14,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm10
	pand	xmm7,xmm12

	movdqa	xmm6,xmm13
	movdqa	xmm5,xmm1
	psrld	xmm9,27
	paddd	xmm14,xmm7
	pxor	xmm6,xmm12

	movdqa	XMMWORD[(32-128)+rax],xmm0
	paddd	xmm14,xmm0
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm11
	movdqa	xmm7,xmm11

	pslld	xmm7,30
	paddd	xmm1,xmm1
	paddd	xmm14,xmm6

	psrld	xmm11,2
	paddd	xmm14,xmm8
	por	xmm1,xmm5
	por	xmm11,xmm7
	pxor	xmm2,xmm4
	movdqa	xmm4,XMMWORD[((96-128))+rax]

	movdqa	xmm8,xmm14
	movdqa	xmm7,xmm12
	pxor	xmm2,XMMWORD[((192-128))+rax]
	pxor	xmm2,xmm4
	paddd	xmm13,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm14
	pand	xmm7,xmm11

	movdqa	xmm6,xmm12
	movdqa	xmm5,xmm2
	psrld	xmm9,27
	paddd	xmm13,xmm7
	pxor	xmm6,xmm11

	movdqa	XMMWORD[(48-128)+rax],xmm1
	paddd	xmm13,xmm1
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm10
	movdqa	xmm7,xmm10

	pslld	xmm7,30
	paddd	xmm2,xmm2
	paddd	xmm13,xmm6

	psrld	xmm10,2
	paddd	xmm13,xmm8
	por	xmm2,xmm5
	por	xmm10,xmm7
	pxor	xmm3,xmm0
	movdqa	xmm0,XMMWORD[((112-128))+rax]

	movdqa	xmm8,xmm13
	movdqa	xmm7,xmm11
	pxor	xmm3,XMMWORD[((208-128))+rax]
	pxor	xmm3,xmm0
	paddd	xmm12,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm13
	pand	xmm7,xmm10

	movdqa	xmm6,xmm11
	movdqa	xmm5,xmm3
	psrld	xmm9,27
	paddd	xmm12,xmm7
	pxor	xmm6,xmm10

	movdqa	XMMWORD[(64-128)+rax],xmm2
	paddd	xmm12,xmm2
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm14
	movdqa	xmm7,xmm14

	pslld	xmm7,30
	paddd	xmm3,xmm3
	paddd	xmm12,xmm6

	psrld	xmm14,2
	paddd	xmm12,xmm8
	por	xmm3,xmm5
	por	xmm14,xmm7
	pxor	xmm4,xmm1
	movdqa	xmm1,XMMWORD[((128-128))+rax]

	movdqa	xmm8,xmm12
	movdqa	xmm7,xmm10
	pxor	xmm4,XMMWORD[((224-128))+rax]
	pxor	xmm4,xmm1
	paddd	xmm11,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm12
	pand	xmm7,xmm14

	movdqa	xmm6,xmm10
	movdqa	xmm5,xmm4
	psrld	xmm9,27
	paddd	xmm11,xmm7
	pxor	xmm6,xmm14

	movdqa	XMMWORD[(80-128)+rax],xmm3
	paddd	xmm11,xmm3
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm13
	movdqa	xmm7,xmm13

	pslld	xmm7,30
	paddd	xmm4,xmm4
	paddd	xmm11,xmm6

	psrld	xmm13,2
	paddd	xmm11,xmm8
	por	xmm4,xmm5
	por	xmm13,xmm7
	pxor	xmm0,xmm2
	movdqa	xmm2,XMMWORD[((144-128))+rax]

	movdqa	xmm8,xmm11
	movdqa	xmm7,xmm14
	pxor	xmm0,XMMWORD[((240-128))+rax]
	pxor	xmm0,xmm2
	paddd	xmm10,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm11
	pand	xmm7,xmm13

	movdqa	xmm6,xmm14
	movdqa	xmm5,xmm0
	psrld	xmm9,27
	paddd	xmm10,xmm7
	pxor	xmm6,xmm13

	movdqa	XMMWORD[(96-128)+rax],xmm4
	paddd	xmm10,xmm4
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm12
	movdqa	xmm7,xmm12

	pslld	xmm7,30
	paddd	xmm0,xmm0
	paddd	xmm10,xmm6

	psrld	xmm12,2
	paddd	xmm10,xmm8
	por	xmm0,xmm5
	por	xmm12,xmm7
	pxor	xmm1,xmm3
	movdqa	xmm3,XMMWORD[((160-128))+rax]

	movdqa	xmm8,xmm10
	movdqa	xmm7,xmm13
	pxor	xmm1,XMMWORD[((0-128))+rax]
	pxor	xmm1,xmm3
	paddd	xmm14,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm10
	pand	xmm7,xmm12

	movdqa	xmm6,xmm13
	movdqa	xmm5,xmm1
	psrld	xmm9,27
	paddd	xmm14,xmm7
	pxor	xmm6,xmm12

	movdqa	XMMWORD[(112-128)+rax],xmm0
	paddd	xmm14,xmm0
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm11
	movdqa	xmm7,xmm11

	pslld	xmm7,30
	paddd	xmm1,xmm1
	paddd	xmm14,xmm6

	psrld	xmm11,2
	paddd	xmm14,xmm8
	por	xmm1,xmm5
	por	xmm11,xmm7
	pxor	xmm2,xmm4
	movdqa	xmm4,XMMWORD[((176-128))+rax]

	movdqa	xmm8,xmm14
	movdqa	xmm7,xmm12
	pxor	xmm2,XMMWORD[((16-128))+rax]
	pxor	xmm2,xmm4
	paddd	xmm13,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm14
	pand	xmm7,xmm11

	movdqa	xmm6,xmm12
	movdqa	xmm5,xmm2
	psrld	xmm9,27
	paddd	xmm13,xmm7
	pxor	xmm6,xmm11

	movdqa	XMMWORD[(128-128)+rax],xmm1
	paddd	xmm13,xmm1
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm10
	movdqa	xmm7,xmm10

	pslld	xmm7,30
	paddd	xmm2,xmm2
	paddd	xmm13,xmm6

	psrld	xmm10,2
	paddd	xmm13,xmm8
	por	xmm2,xmm5
	por	xmm10,xmm7
	pxor	xmm3,xmm0
	movdqa	xmm0,XMMWORD[((192-128))+rax]

	movdqa	xmm8,xmm13
	movdqa	xmm7,xmm11
	pxor	xmm3,XMMWORD[((32-128))+rax]
	pxor	xmm3,xmm0
	paddd	xmm12,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm13
	pand	xmm7,xmm10

	movdqa	xmm6,xmm11
	movdqa	xmm5,xmm3
	psrld	xmm9,27
	paddd	xmm12,xmm7
	pxor	xmm6,xmm10

	movdqa	XMMWORD[(144-128)+rax],xmm2
	paddd	xmm12,xmm2
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm14
	movdqa	xmm7,xmm14

	pslld	xmm7,30
	paddd	xmm3,xmm3
	paddd	xmm12,xmm6

	psrld	xmm14,2
	paddd	xmm12,xmm8
	por	xmm3,xmm5
	por	xmm14,xmm7
	pxor	xmm4,xmm1
	movdqa	xmm1,XMMWORD[((208-128))+rax]

	movdqa	xmm8,xmm12
	movdqa	xmm7,xmm10
	pxor	xmm4,XMMWORD[((48-128))+rax]
	pxor	xmm4,xmm1
	paddd	xmm11,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm12
	pand	xmm7,xmm14

	movdqa	xmm6,xmm10
	movdqa	xmm5,xmm4
	psrld	xmm9,27
	paddd	xmm11,xmm7
	pxor	xmm6,xmm14

	movdqa	XMMWORD[(160-128)+rax],xmm3
	paddd	xmm11,xmm3
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm13
	movdqa	xmm7,xmm13

	pslld	xmm7,30
	paddd	xmm4,xmm4
	paddd	xmm11,xmm6

	psrld	xmm13,2
	paddd	xmm11,xmm8
	por	xmm4,xmm5
	por	xmm13,xmm7
	pxor	xmm0,xmm2
	movdqa	xmm2,XMMWORD[((224-128))+rax]

	movdqa	xmm8,xmm11
	movdqa	xmm7,xmm14
	pxor	xmm0,XMMWORD[((64-128))+rax]
	pxor	xmm0,xmm2
	paddd	xmm10,xmm15
	pslld	xmm8,5
	movdqa	xmm9,xmm11
	pand	xmm7,xmm13

	movdqa	xmm6,xmm14
	movdqa	xmm5,xmm0
	psrld	xmm9,27
	paddd	xmm10,xmm7
	pxor	xmm6,xmm13

	movdqa	XMMWORD[(176-128)+rax],xmm4
	paddd	xmm10,xmm4
	por	xmm8,xmm9
	psrld	xmm5,31
	pand	xmm6,xmm12
	movdqa	xmm7,xmm12

	pslld	xmm7,30
	paddd	xmm0,xmm0
	paddd	xmm10,xmm6

	psrld	xmm12,2
	paddd	xmm10,xmm8
	por	xmm0,xmm5
	por	xmm12,xmm7
	movdqa	xmm15,XMMWORD[64+rbp]
	pxor	xmm1,xmm3
	movdqa	xmm3,XMMWORD[((240-128))+rax]

	movdqa	xmm8,xmm10
	movdqa	xmm6,xmm13
	pxor	xmm1,XMMWORD[((80-128))+rax]
	paddd	xmm14,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm11

	movdqa	xmm9,xmm10
	movdqa	XMMWORD[(192-128)+rax],xmm0
	paddd	xmm14,xmm0
	pxor	xmm1,xmm3
	psrld	xmm9,27
	pxor	xmm6,xmm12
	movdqa	xmm7,xmm11

	pslld	xmm7,30
	movdqa	xmm5,xmm1
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm14,xmm6
	paddd	xmm1,xmm1

	psrld	xmm11,2
	paddd	xmm14,xmm8
	por	xmm1,xmm5
	por	xmm11,xmm7
	pxor	xmm2,xmm4
	movdqa	xmm4,XMMWORD[((0-128))+rax]

	movdqa	xmm8,xmm14
	movdqa	xmm6,xmm12
	pxor	xmm2,XMMWORD[((96-128))+rax]
	paddd	xmm13,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm10

	movdqa	xmm9,xmm14
	movdqa	XMMWORD[(208-128)+rax],xmm1
	paddd	xmm13,xmm1
	pxor	xmm2,xmm4
	psrld	xmm9,27
	pxor	xmm6,xmm11
	movdqa	xmm7,xmm10

	pslld	xmm7,30
	movdqa	xmm5,xmm2
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm13,xmm6
	paddd	xmm2,xmm2

	psrld	xmm10,2
	paddd	xmm13,xmm8
	por	xmm2,xmm5
	por	xmm10,xmm7
	pxor	xmm3,xmm0
	movdqa	xmm0,XMMWORD[((16-128))+rax]

	movdqa	xmm8,xmm13
	movdqa	xmm6,xmm11
	pxor	xmm3,XMMWORD[((112-128))+rax]
	paddd	xmm12,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm14

	movdqa	xmm9,xmm13
	movdqa	XMMWORD[(224-128)+rax],xmm2
	paddd	xmm12,xmm2
	pxor	xmm3,xmm0
	psrld	xmm9,27
	pxor	xmm6,xmm10
	movdqa	xmm7,xmm14

	pslld	xmm7,30
	movdqa	xmm5,xmm3
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm12,xmm6
	paddd	xmm3,xmm3

	psrld	xmm14,2
	paddd	xmm12,xmm8
	por	xmm3,xmm5
	por	xmm14,xmm7
	pxor	xmm4,xmm1
	movdqa	xmm1,XMMWORD[((32-128))+rax]

	movdqa	xmm8,xmm12
	movdqa	xmm6,xmm10
	pxor	xmm4,XMMWORD[((128-128))+rax]
	paddd	xmm11,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm13

	movdqa	xmm9,xmm12
	movdqa	XMMWORD[(240-128)+rax],xmm3
	paddd	xmm11,xmm3
	pxor	xmm4,xmm1
	psrld	xmm9,27
	pxor	xmm6,xmm14
	movdqa	xmm7,xmm13

	pslld	xmm7,30
	movdqa	xmm5,xmm4
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm11,xmm6
	paddd	xmm4,xmm4

	psrld	xmm13,2
	paddd	xmm11,xmm8
	por	xmm4,xmm5
	por	xmm13,xmm7
	pxor	xmm0,xmm2
	movdqa	xmm2,XMMWORD[((48-128))+rax]

	movdqa	xmm8,xmm11
	movdqa	xmm6,xmm14
	pxor	xmm0,XMMWORD[((144-128))+rax]
	paddd	xmm10,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm12

	movdqa	xmm9,xmm11
	movdqa	XMMWORD[(0-128)+rax],xmm4
	paddd	xmm10,xmm4
	pxor	xmm0,xmm2
	psrld	xmm9,27
	pxor	xmm6,xmm13
	movdqa	xmm7,xmm12

	pslld	xmm7,30
	movdqa	xmm5,xmm0
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm10,xmm6
	paddd	xmm0,xmm0

	psrld	xmm12,2
	paddd	xmm10,xmm8
	por	xmm0,xmm5
	por	xmm12,xmm7
	pxor	xmm1,xmm3
	movdqa	xmm3,XMMWORD[((64-128))+rax]

	movdqa	xmm8,xmm10
	movdqa	xmm6,xmm13
	pxor	xmm1,XMMWORD[((160-128))+rax]
	paddd	xmm14,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm11

	movdqa	xmm9,xmm10
	movdqa	XMMWORD[(16-128)+rax],xmm0
	paddd	xmm14,xmm0
	pxor	xmm1,xmm3
	psrld	xmm9,27
	pxor	xmm6,xmm12
	movdqa	xmm7,xmm11

	pslld	xmm7,30
	movdqa	xmm5,xmm1
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm14,xmm6
	paddd	xmm1,xmm1

	psrld	xmm11,2
	paddd	xmm14,xmm8
	por	xmm1,xmm5
	por	xmm11,xmm7
	pxor	xmm2,xmm4
	movdqa	xmm4,XMMWORD[((80-128))+rax]

	movdqa	xmm8,xmm14
	movdqa	xmm6,xmm12
	pxor	xmm2,XMMWORD[((176-128))+rax]
	paddd	xmm13,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm10

	movdqa	xmm9,xmm14
	movdqa	XMMWORD[(32-128)+rax],xmm1
	paddd	xmm13,xmm1
	pxor	xmm2,xmm4
	psrld	xmm9,27
	pxor	xmm6,xmm11
	movdqa	xmm7,xmm10

	pslld	xmm7,30
	movdqa	xmm5,xmm2
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm13,xmm6
	paddd	xmm2,xmm2

	psrld	xmm10,2
	paddd	xmm13,xmm8
	por	xmm2,xmm5
	por	xmm10,xmm7
	pxor	xmm3,xmm0
	movdqa	xmm0,XMMWORD[((96-128))+rax]

	movdqa	xmm8,xmm13
	movdqa	xmm6,xmm11
	pxor	xmm3,XMMWORD[((192-128))+rax]
	paddd	xmm12,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm14

	movdqa	xmm9,xmm13
	movdqa	XMMWORD[(48-128)+rax],xmm2
	paddd	xmm12,xmm2
	pxor	xmm3,xmm0
	psrld	xmm9,27
	pxor	xmm6,xmm10
	movdqa	xmm7,xmm14

	pslld	xmm7,30
	movdqa	xmm5,xmm3
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm12,xmm6
	paddd	xmm3,xmm3

	psrld	xmm14,2
	paddd	xmm12,xmm8
	por	xmm3,xmm5
	por	xmm14,xmm7
	pxor	xmm4,xmm1
	movdqa	xmm1,XMMWORD[((112-128))+rax]

	movdqa	xmm8,xmm12
	movdqa	xmm6,xmm10
	pxor	xmm4,XMMWORD[((208-128))+rax]
	paddd	xmm11,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm13

	movdqa	xmm9,xmm12
	movdqa	XMMWORD[(64-128)+rax],xmm3
	paddd	xmm11,xmm3
	pxor	xmm4,xmm1
	psrld	xmm9,27
	pxor	xmm6,xmm14
	movdqa	xmm7,xmm13

	pslld	xmm7,30
	movdqa	xmm5,xmm4
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm11,xmm6
	paddd	xmm4,xmm4

	psrld	xmm13,2
	paddd	xmm11,xmm8
	por	xmm4,xmm5
	por	xmm13,xmm7
	pxor	xmm0,xmm2
	movdqa	xmm2,XMMWORD[((128-128))+rax]

	movdqa	xmm8,xmm11
	movdqa	xmm6,xmm14
	pxor	xmm0,XMMWORD[((224-128))+rax]
	paddd	xmm10,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm12

	movdqa	xmm9,xmm11
	movdqa	XMMWORD[(80-128)+rax],xmm4
	paddd	xmm10,xmm4
	pxor	xmm0,xmm2
	psrld	xmm9,27
	pxor	xmm6,xmm13
	movdqa	xmm7,xmm12

	pslld	xmm7,30
	movdqa	xmm5,xmm0
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm10,xmm6
	paddd	xmm0,xmm0

	psrld	xmm12,2
	paddd	xmm10,xmm8
	por	xmm0,xmm5
	por	xmm12,xmm7
	pxor	xmm1,xmm3
	movdqa	xmm3,XMMWORD[((144-128))+rax]

	movdqa	xmm8,xmm10
	movdqa	xmm6,xmm13
	pxor	xmm1,XMMWORD[((240-128))+rax]
	paddd	xmm14,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm11

	movdqa	xmm9,xmm10
	movdqa	XMMWORD[(96-128)+rax],xmm0
	paddd	xmm14,xmm0
	pxor	xmm1,xmm3
	psrld	xmm9,27
	pxor	xmm6,xmm12
	movdqa	xmm7,xmm11

	pslld	xmm7,30
	movdqa	xmm5,xmm1
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm14,xmm6
	paddd	xmm1,xmm1

	psrld	xmm11,2
	paddd	xmm14,xmm8
	por	xmm1,xmm5
	por	xmm11,xmm7
	pxor	xmm2,xmm4
	movdqa	xmm4,XMMWORD[((160-128))+rax]

	movdqa	xmm8,xmm14
	movdqa	xmm6,xmm12
	pxor	xmm2,XMMWORD[((0-128))+rax]
	paddd	xmm13,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm10

	movdqa	xmm9,xmm14
	movdqa	XMMWORD[(112-128)+rax],xmm1
	paddd	xmm13,xmm1
	pxor	xmm2,xmm4
	psrld	xmm9,27
	pxor	xmm6,xmm11
	movdqa	xmm7,xmm10

	pslld	xmm7,30
	movdqa	xmm5,xmm2
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm13,xmm6
	paddd	xmm2,xmm2

	psrld	xmm10,2
	paddd	xmm13,xmm8
	por	xmm2,xmm5
	por	xmm10,xmm7
	pxor	xmm3,xmm0
	movdqa	xmm0,XMMWORD[((176-128))+rax]

	movdqa	xmm8,xmm13
	movdqa	xmm6,xmm11
	pxor	xmm3,XMMWORD[((16-128))+rax]
	paddd	xmm12,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm14

	movdqa	xmm9,xmm13
	paddd	xmm12,xmm2
	pxor	xmm3,xmm0
	psrld	xmm9,27
	pxor	xmm6,xmm10
	movdqa	xmm7,xmm14

	pslld	xmm7,30
	movdqa	xmm5,xmm3
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm12,xmm6
	paddd	xmm3,xmm3

	psrld	xmm14,2
	paddd	xmm12,xmm8
	por	xmm3,xmm5
	por	xmm14,xmm7
	pxor	xmm4,xmm1
	movdqa	xmm1,XMMWORD[((192-128))+rax]

	movdqa	xmm8,xmm12
	movdqa	xmm6,xmm10
	pxor	xmm4,XMMWORD[((32-128))+rax]
	paddd	xmm11,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm13

	movdqa	xmm9,xmm12
	paddd	xmm11,xmm3
	pxor	xmm4,xmm1
	psrld	xmm9,27
	pxor	xmm6,xmm14
	movdqa	xmm7,xmm13

	pslld	xmm7,30
	movdqa	xmm5,xmm4
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm11,xmm6
	paddd	xmm4,xmm4

	psrld	xmm13,2
	paddd	xmm11,xmm8
	por	xmm4,xmm5
	por	xmm13,xmm7
	pxor	xmm0,xmm2
	movdqa	xmm2,XMMWORD[((208-128))+rax]

	movdqa	xmm8,xmm11
	movdqa	xmm6,xmm14
	pxor	xmm0,XMMWORD[((48-128))+rax]
	paddd	xmm10,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm12

	movdqa	xmm9,xmm11
	paddd	xmm10,xmm4
	pxor	xmm0,xmm2
	psrld	xmm9,27
	pxor	xmm6,xmm13
	movdqa	xmm7,xmm12

	pslld	xmm7,30
	movdqa	xmm5,xmm0
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm10,xmm6
	paddd	xmm0,xmm0

	psrld	xmm12,2
	paddd	xmm10,xmm8
	por	xmm0,xmm5
	por	xmm12,xmm7
	pxor	xmm1,xmm3
	movdqa	xmm3,XMMWORD[((224-128))+rax]

	movdqa	xmm8,xmm10
	movdqa	xmm6,xmm13
	pxor	xmm1,XMMWORD[((64-128))+rax]
	paddd	xmm14,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm11

	movdqa	xmm9,xmm10
	paddd	xmm14,xmm0
	pxor	xmm1,xmm3
	psrld	xmm9,27
	pxor	xmm6,xmm12
	movdqa	xmm7,xmm11

	pslld	xmm7,30
	movdqa	xmm5,xmm1
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm14,xmm6
	paddd	xmm1,xmm1

	psrld	xmm11,2
	paddd	xmm14,xmm8
	por	xmm1,xmm5
	por	xmm11,xmm7
	pxor	xmm2,xmm4
	movdqa	xmm4,XMMWORD[((240-128))+rax]

	movdqa	xmm8,xmm14
	movdqa	xmm6,xmm12
	pxor	xmm2,XMMWORD[((80-128))+rax]
	paddd	xmm13,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm10

	movdqa	xmm9,xmm14
	paddd	xmm13,xmm1
	pxor	xmm2,xmm4
	psrld	xmm9,27
	pxor	xmm6,xmm11
	movdqa	xmm7,xmm10

	pslld	xmm7,30
	movdqa	xmm5,xmm2
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm13,xmm6
	paddd	xmm2,xmm2

	psrld	xmm10,2
	paddd	xmm13,xmm8
	por	xmm2,xmm5
	por	xmm10,xmm7
	pxor	xmm3,xmm0
	movdqa	xmm0,XMMWORD[((0-128))+rax]

	movdqa	xmm8,xmm13
	movdqa	xmm6,xmm11
	pxor	xmm3,XMMWORD[((96-128))+rax]
	paddd	xmm12,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm14

	movdqa	xmm9,xmm13
	paddd	xmm12,xmm2
	pxor	xmm3,xmm0
	psrld	xmm9,27
	pxor	xmm6,xmm10
	movdqa	xmm7,xmm14

	pslld	xmm7,30
	movdqa	xmm5,xmm3
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm12,xmm6
	paddd	xmm3,xmm3

	psrld	xmm14,2
	paddd	xmm12,xmm8
	por	xmm3,xmm5
	por	xmm14,xmm7
	pxor	xmm4,xmm1
	movdqa	xmm1,XMMWORD[((16-128))+rax]

	movdqa	xmm8,xmm12
	movdqa	xmm6,xmm10
	pxor	xmm4,XMMWORD[((112-128))+rax]
	paddd	xmm11,xmm15
	pslld	xmm8,5
	pxor	xmm6,xmm13

	movdqa	xmm9,xmm12
	paddd	xmm11,xmm3
	pxor	xmm4,xmm1
	psrld	xmm9,27
	pxor	xmm6,xmm14
	movdqa	xmm7,xmm13

	pslld	xmm7,30
	movdqa	xmm5,xmm4
	por	xmm8,xmm9
	psrld	xmm5,31
	paddd	xmm11,xmm6
	paddd	xmm4,xmm4

	psrld	xmm13,2
	paddd	xmm11,xmm8
	por	xmm4,xmm5
	por	xmm13,xmm7
	movdqa	xmm8,xmm11
	paddd	xmm10,xmm15
	movdqa	xmm6,xmm14
	pslld	xmm8,5
	pxor	xmm6,xmm12

	movdqa	xmm9,xmm11
	paddd	xmm10,xmm4
	psrld	xmm9,27
	movdqa	xmm7,xmm12
	pxor	xmm6,xmm13

	pslld	xmm7,30
	por	xmm8,xmm9
	paddd	xmm10,xmm6

	psrld	xmm12,2
	paddd	xmm10,xmm8
	por	xmm12,xmm7
	movdqa	xmm0,XMMWORD[rbx]
	mov	ecx,1
	cmp	ecx,DWORD[rbx]
	pxor	xmm8,xmm8
	cmovge	r8,rbp
	cmp	ecx,DWORD[4+rbx]
	movdqa	xmm1,xmm0
	cmovge	r9,rbp
	cmp	ecx,DWORD[8+rbx]
	pcmpgtd	xmm1,xmm8
	cmovge	r10,rbp
	cmp	ecx,DWORD[12+rbx]
	paddd	xmm0,xmm1
	cmovge	r11,rbp

	movdqu	xmm6,XMMWORD[rdi]
	pand	xmm10,xmm1
	movdqu	xmm7,XMMWORD[32+rdi]
	pand	xmm11,xmm1
	paddd	xmm10,xmm6
	movdqu	xmm8,XMMWORD[64+rdi]
	pand	xmm12,xmm1
	paddd	xmm11,xmm7
	movdqu	xmm9,XMMWORD[96+rdi]
	pand	xmm13,xmm1
	paddd	xmm12,xmm8
	movdqu	xmm5,XMMWORD[128+rdi]
	pand	xmm14,xmm1
	movdqu	XMMWORD[rdi],xmm10
	paddd	xmm13,xmm9
	movdqu	XMMWORD[32+rdi],xmm11
	paddd	xmm14,xmm5
	movdqu	XMMWORD[64+rdi],xmm12
	movdqu	XMMWORD[96+rdi],xmm13
	movdqu	XMMWORD[128+rdi],xmm14

	movdqa	XMMWORD[rbx],xmm0
	movdqa	xmm5,XMMWORD[96+rbp]
	movdqa	xmm15,XMMWORD[((-32))+rbp]
	dec	edx
	jnz	NEAR $L$oop

	mov	edx,DWORD[280+rsp]
	lea	rdi,[16+rdi]
	lea	rsi,[64+rsi]
	dec	edx
	jnz	NEAR $L$oop_grande

$L$done:
	mov	rax,QWORD[272+rsp]

	movaps	xmm6,XMMWORD[((-184))+rax]
	movaps	xmm7,XMMWORD[((-168))+rax]
	movaps	xmm8,XMMWORD[((-152))+rax]
	movaps	xmm9,XMMWORD[((-136))+rax]
	movaps	xmm10,XMMWORD[((-120))+rax]
	movaps	xmm11,XMMWORD[((-104))+rax]
	movaps	xmm12,XMMWORD[((-88))+rax]
	movaps	xmm13,XMMWORD[((-72))+rax]
	movaps	xmm14,XMMWORD[((-56))+rax]
	movaps	xmm15,XMMWORD[((-40))+rax]
	mov	rbp,QWORD[((-16))+rax]

	mov	rbx,QWORD[((-8))+rax]

	lea	rsp,[rax]

$L$epilogue:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret

$L$SEH_end_sha1_multi_block:

ALIGN	32
;-----------------------------------------------------------------------
; sha1_multi_block_shaext
;
; Two-lane SHA-1 block routine built on the x86 SHA extensions.  Two
; independent input streams are processed side by side: lane 0 lives in
; xmm0-xmm2/xmm4-xmm7, lane 1 in xmm8-xmm14, and xmm3 holds the pshufb
; mask loaded from K_XX_XX+128.
;
; The SHA and SSSE3 instructions are emitted as raw bytes.  Legend:
;   DB 15,58,204,modrm,imm   sha1rnds4 (0F 3A CC /r ib), 4 rounds each
;   DB 15,56,200,modrm       sha1nexte (0F 38 C8 /r)
;   DB 15,56,201,modrm       sha1msg1  (0F 38 C9 /r)
;   DB 15,56,202,modrm       sha1msg2  (0F 38 CA /r)
;   DB 102,15,56,0,modrm     pshufb    (66 0F 38 00 /r)
;   A leading 68 or 69 (44h/45h) is a REX prefix that selects xmm8-xmm15.
;
; ABI:  Microsoft x64.  rcx, rdx, r8 are copied to rdi, rsi, rdx.
; In:   rdi = state block; five 8-byte fields spaced 32 bytes apart are
;             read and written per lane pair (presumably the SHA-1 words
;             A..E, one dword per lane -- confirm against the C caller)
;       rsi = array of 16-byte records: qword data pointer at +0,
;             dword count of 64-byte blocks at +8
;       edx = doubled on entry, then used as the number of record pairs
; Saves/restores: rbx, rbp, xmm6-xmm15 on the stack; rdi, rsi in the
;       8+rsp / 16+rsp slots above the return address.
; Frame (after 256-byte alignment of rsp):
;       [64..127+rsp]  per-lane state saved at the top of each block
;       [256+rsp]      two dword block counters (addressed through rbx)
;       [272+rsp]      caller's rsp
;       [280+rsp]      outer-loop counter
;-----------------------------------------------------------------------
sha1_multi_block_shaext:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_sha1_multi_block_shaext:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8



; Also entered by a jump from sha1_multi_block once its own Win64
; prologue has already stored rdi/rsi and remapped the arguments.
_shaext_shortcut:
	mov	rax,rsp			; rax = rsp on entry; not written again on this path

	push	rbx

	push	rbp

; Save xmm6-xmm15 in the 160 bytes at rax-184 .. rax-25.
	lea	rsp,[((-168))+rsp]
	movaps	XMMWORD[rsp],xmm6
	movaps	XMMWORD[16+rsp],xmm7
	movaps	XMMWORD[32+rsp],xmm8
	movaps	XMMWORD[48+rsp],xmm9
	movaps	XMMWORD[(-120)+rax],xmm10
	movaps	XMMWORD[(-104)+rax],xmm11
	movaps	XMMWORD[(-88)+rax],xmm12
	movaps	XMMWORD[(-72)+rax],xmm13
	movaps	XMMWORD[(-56)+rax],xmm14
	movaps	XMMWORD[(-40)+rax],xmm15
	sub	rsp,288
	shl	edx,1			; outer loop runs 2*edx times, two lanes per pass
	and	rsp,-256		; align the working frame down to 256 bytes
	lea	rdi,[64+rdi]		; bias rdi so the state fields sit at -64..+64
	mov	QWORD[272+rsp],rax	; keep the entry rsp in the frame
$L$body_shaext:
	lea	rbx,[256+rsp]		; rbx -> the two per-lane block counters
	movdqa	xmm3,XMMWORD[((K_XX_XX+128))]	; pshufb mask (presumably byte-order swap -- table not visible here)

; Outer loop: one pass handles the next two input records.
$L$oop_grande_shaext:
	mov	DWORD[280+rsp],edx	; spill outer counter; edx becomes max(block counts)
	xor	edx,edx
; Lane 0 record: r8 = data pointer, [rbx] = block count.
	mov	r8,QWORD[rsi]
	mov	ecx,DWORD[8+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[rbx],ecx
	cmovle	r8,rsp			; count <= 0: read from the stack frame instead
; Lane 1 record: r9 = data pointer, [4+rbx] = block count.
	mov	r9,QWORD[16+rsi]
	mov	ecx,DWORD[24+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[4+rbx],ecx
	cmovle	r9,rsp			; count <= 0: read from the stack frame instead
	test	edx,edx
	jz	NEAR $L$done_shaext	; neither lane has a positive count: leave

; Load the five state fields; each movq carries lane 0 in dword 0 and
; lane 1 in dword 1.
	movq	xmm0,QWORD[((0-64))+rdi]
	movq	xmm4,QWORD[((32-64))+rdi]
	movq	xmm5,QWORD[((64-64))+rdi]
	movq	xmm6,QWORD[((96-64))+rdi]
	movq	xmm7,QWORD[((128-64))+rdi]

; Transpose so that each lane's first four words share one register.
	punpckldq	xmm0,xmm4
	punpckldq	xmm5,xmm6

	movdqa	xmm8,xmm0
	punpcklqdq	xmm0,xmm5	; xmm0 = lane 0, words 0..3
	punpckhqdq	xmm8,xmm5	; xmm8 = lane 1, words 0..3

	pshufd	xmm1,xmm7,63		; xmm1 = lane 0 fifth word in the top dword, rest zero
	pshufd	xmm9,xmm7,127		; xmm9 = lane 1 fifth word in the top dword, rest zero
	pshufd	xmm0,xmm0,27		; reverse dword order
	pshufd	xmm8,xmm8,27
	jmp	NEAR $L$oop_shaext

ALIGN	32
; Inner loop: one 64-byte block per lane per iteration, edx iterations.
$L$oop_shaext:
; Fetch 64 bytes for each lane and shuffle them through the xmm3 mask.
	movdqu	xmm4,XMMWORD[r8]
	movdqu	xmm11,XMMWORD[r9]
	movdqu	xmm5,XMMWORD[16+r8]
	movdqu	xmm12,XMMWORD[16+r9]
	movdqu	xmm6,XMMWORD[32+r8]
DB	102,15,56,0,227			; pshufb xmm4,xmm3
	movdqu	xmm13,XMMWORD[32+r9]
DB	102,68,15,56,0,219		; pshufb xmm11,xmm3
	movdqu	xmm7,XMMWORD[48+r8]
	lea	r8,[64+r8]
DB	102,15,56,0,235			; pshufb xmm5,xmm3
	movdqu	xmm14,XMMWORD[48+r9]
	lea	r9,[64+r9]
DB	102,68,15,56,0,227		; pshufb xmm12,xmm3

; Save the incoming state of both lanes; it is added back after round 79.
	movdqa	XMMWORD[80+rsp],xmm1
	paddd	xmm1,xmm4
	movdqa	XMMWORD[112+rsp],xmm9
	paddd	xmm9,xmm11
	movdqa	XMMWORD[64+rsp],xmm0
	movdqa	xmm2,xmm0
	movdqa	XMMWORD[96+rsp],xmm8
	movdqa	xmm10,xmm8
; rounds 0-3 (sha1rnds4 imm 0), lane 0 then lane 1
DB	15,58,204,193,0			; sha1rnds4 xmm0,xmm1,0
DB	15,56,200,213			; sha1nexte xmm2,xmm5
DB	69,15,58,204,193,0		; sha1rnds4 xmm8,xmm9,0
DB	69,15,56,200,212		; sha1nexte xmm10,xmm12
DB	102,15,56,0,243			; pshufb xmm6,xmm3
	prefetcht0	[127+r8]
DB	15,56,201,229			; sha1msg1 xmm4,xmm5
DB	102,68,15,56,0,235		; pshufb xmm13,xmm3
	prefetcht0	[127+r9]
DB	69,15,56,201,220		; sha1msg1 xmm11,xmm12

DB	102,15,56,0,251			; pshufb xmm7,xmm3
	movdqa	xmm1,xmm0
DB	102,68,15,56,0,243		; pshufb xmm14,xmm3
	movdqa	xmm9,xmm8
; rounds 4-7
DB	15,58,204,194,0
DB	15,56,200,206
DB	69,15,58,204,194,0
DB	69,15,56,200,205
	pxor	xmm4,xmm6
DB	15,56,201,238
	pxor	xmm11,xmm13
DB	69,15,56,201,229
; rounds 8-11
	movdqa	xmm2,xmm0
	movdqa	xmm10,xmm8
DB	15,58,204,193,0
DB	15,56,200,215
DB	69,15,58,204,193,0
DB	69,15,56,200,214
DB	15,56,202,231
DB	69,15,56,202,222
	pxor	xmm5,xmm7
DB	15,56,201,247
	pxor	xmm12,xmm14
DB	69,15,56,201,238
; rounds 12-15
	movdqa	xmm1,xmm0
	movdqa	xmm9,xmm8
DB	15,58,204,194,0
DB	15,56,200,204
DB	69,15,58,204,194,0
DB	69,15,56,200,203
DB	15,56,202,236
DB	69,15,56,202,227
	pxor	xmm6,xmm4
DB	15,56,201,252
	pxor	xmm13,xmm11
DB	69,15,56,201,243
; rounds 16-19
	movdqa	xmm2,xmm0
	movdqa	xmm10,xmm8
DB	15,58,204,193,0
DB	15,56,200,213
DB	69,15,58,204,193,0
DB	69,15,56,200,212
DB	15,56,202,245
DB	69,15,56,202,236
	pxor	xmm7,xmm5
DB	15,56,201,229
	pxor	xmm14,xmm12
DB	69,15,56,201,220
; rounds 20-23 (sha1rnds4 imm 1 from here)
	movdqa	xmm1,xmm0
	movdqa	xmm9,xmm8
DB	15,58,204,194,1
DB	15,56,200,206
DB	69,15,58,204,194,1
DB	69,15,56,200,205
DB	15,56,202,254
DB	69,15,56,202,245
	pxor	xmm4,xmm6
DB	15,56,201,238
	pxor	xmm11,xmm13
DB	69,15,56,201,229
; rounds 24-27
	movdqa	xmm2,xmm0
	movdqa	xmm10,xmm8
DB	15,58,204,193,1
DB	15,56,200,215
DB	69,15,58,204,193,1
DB	69,15,56,200,214
DB	15,56,202,231
DB	69,15,56,202,222
	pxor	xmm5,xmm7
DB	15,56,201,247
	pxor	xmm12,xmm14
DB	69,15,56,201,238
; rounds 28-31
	movdqa	xmm1,xmm0
	movdqa	xmm9,xmm8
DB	15,58,204,194,1
DB	15,56,200,204
DB	69,15,58,204,194,1
DB	69,15,56,200,203
DB	15,56,202,236
DB	69,15,56,202,227
	pxor	xmm6,xmm4
DB	15,56,201,252
	pxor	xmm13,xmm11
DB	69,15,56,201,243
; rounds 32-35
	movdqa	xmm2,xmm0
	movdqa	xmm10,xmm8
DB	15,58,204,193,1
DB	15,56,200,213
DB	69,15,58,204,193,1
DB	69,15,56,200,212
DB	15,56,202,245
DB	69,15,56,202,236
	pxor	xmm7,xmm5
DB	15,56,201,229
	pxor	xmm14,xmm12
DB	69,15,56,201,220
; rounds 36-39
	movdqa	xmm1,xmm0
	movdqa	xmm9,xmm8
DB	15,58,204,194,1
DB	15,56,200,206
DB	69,15,58,204,194,1
DB	69,15,56,200,205
DB	15,56,202,254
DB	69,15,56,202,245
	pxor	xmm4,xmm6
DB	15,56,201,238
	pxor	xmm11,xmm13
DB	69,15,56,201,229
; rounds 40-43 (sha1rnds4 imm 2 from here)
	movdqa	xmm2,xmm0
	movdqa	xmm10,xmm8
DB	15,58,204,193,2
DB	15,56,200,215
DB	69,15,58,204,193,2
DB	69,15,56,200,214
DB	15,56,202,231
DB	69,15,56,202,222
	pxor	xmm5,xmm7
DB	15,56,201,247
	pxor	xmm12,xmm14
DB	69,15,56,201,238
; rounds 44-47
	movdqa	xmm1,xmm0
	movdqa	xmm9,xmm8
DB	15,58,204,194,2
DB	15,56,200,204
DB	69,15,58,204,194,2
DB	69,15,56,200,203
DB	15,56,202,236
DB	69,15,56,202,227
	pxor	xmm6,xmm4
DB	15,56,201,252
	pxor	xmm13,xmm11
DB	69,15,56,201,243
; rounds 48-51
	movdqa	xmm2,xmm0
	movdqa	xmm10,xmm8
DB	15,58,204,193,2
DB	15,56,200,213
DB	69,15,58,204,193,2
DB	69,15,56,200,212
DB	15,56,202,245
DB	69,15,56,202,236
	pxor	xmm7,xmm5
DB	15,56,201,229
	pxor	xmm14,xmm12
DB	69,15,56,201,220
; rounds 52-55
	movdqa	xmm1,xmm0
	movdqa	xmm9,xmm8
DB	15,58,204,194,2
DB	15,56,200,206
DB	69,15,58,204,194,2
DB	69,15,56,200,205
DB	15,56,202,254
DB	69,15,56,202,245
	pxor	xmm4,xmm6
DB	15,56,201,238
	pxor	xmm11,xmm13
DB	69,15,56,201,229
; rounds 56-59
	movdqa	xmm2,xmm0
	movdqa	xmm10,xmm8
DB	15,58,204,193,2
DB	15,56,200,215
DB	69,15,58,204,193,2
DB	69,15,56,200,214
DB	15,56,202,231
DB	69,15,56,202,222
	pxor	xmm5,xmm7
DB	15,56,201,247
	pxor	xmm12,xmm14
DB	69,15,56,201,238
; rounds 60-63 (sha1rnds4 imm 3 from here)
	movdqa	xmm1,xmm0
	movdqa	xmm9,xmm8
DB	15,58,204,194,3
DB	15,56,200,204
DB	69,15,58,204,194,3
DB	69,15,56,200,203
DB	15,56,202,236
DB	69,15,56,202,227
	pxor	xmm6,xmm4
DB	15,56,201,252
	pxor	xmm13,xmm11
DB	69,15,56,201,243
; rounds 64-67
	movdqa	xmm2,xmm0
	movdqa	xmm10,xmm8
DB	15,58,204,193,3
DB	15,56,200,213
DB	69,15,58,204,193,3
DB	69,15,56,200,212
DB	15,56,202,245
DB	69,15,56,202,236
	pxor	xmm7,xmm5
	pxor	xmm14,xmm12

; Loop bookkeeping is interleaved with the last rounds.  xmm4 is cleared
; here and serves as the zero operand for the compares below.
	mov	ecx,1
	pxor	xmm4,xmm4
	cmp	ecx,DWORD[rbx]
	cmovge	r8,rsp			; lane 0 count <= 1: next loads come from the stack frame

; rounds 68-71
	movdqa	xmm1,xmm0
	movdqa	xmm9,xmm8
DB	15,58,204,194,3
DB	15,56,200,206
DB	69,15,58,204,194,3
DB	69,15,56,200,205
DB	15,56,202,254
DB	69,15,56,202,245

	cmp	ecx,DWORD[4+rbx]
	cmovge	r9,rsp			; lane 1 count <= 1: next loads come from the stack frame
	movq	xmm6,QWORD[rbx]		; xmm6 = both block counters

; rounds 72-75
	movdqa	xmm2,xmm0
	movdqa	xmm10,xmm8
DB	15,58,204,193,3
DB	15,56,200,215
DB	69,15,58,204,193,3
DB	69,15,56,200,214

; Build all-ones masks for lanes whose counter is still positive.
	pshufd	xmm11,xmm6,0x00		; broadcast lane 0 counter
	pshufd	xmm12,xmm6,0x55		; broadcast lane 1 counter
	movdqa	xmm7,xmm6
	pcmpgtd	xmm11,xmm4
	pcmpgtd	xmm12,xmm4

; rounds 76-79; the sha1nexte source here is xmm4, which is zero
	movdqa	xmm1,xmm0
	movdqa	xmm9,xmm8
DB	15,58,204,194,3
DB	15,56,200,204
DB	69,15,58,204,194,3
DB	68,15,56,200,204

; Zero the freshly computed state of finished lanes, so that adding the
; saved state below leaves those lanes unchanged.
	pcmpgtd	xmm7,xmm4		; -1 in each counter slot that is > 0
	pand	xmm0,xmm11
	pand	xmm1,xmm11
	pand	xmm8,xmm12
	pand	xmm9,xmm12
	paddd	xmm6,xmm7		; decrement the counters that are still positive

	paddd	xmm0,XMMWORD[64+rsp]
	paddd	xmm1,XMMWORD[80+rsp]
	paddd	xmm8,XMMWORD[96+rsp]
	paddd	xmm9,XMMWORD[112+rsp]

	movq	QWORD[rbx],xmm6		; write the updated counters back
	dec	edx
	jnz	NEAR $L$oop_shaext

	mov	edx,DWORD[280+rsp]	; reload the outer-loop counter

; Undo the dword reversal and re-interleave the two lanes so that each
; 8-byte state field again holds lane 0 followed by lane 1.
	pshufd	xmm0,xmm0,27
	pshufd	xmm8,xmm8,27

	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm8
	punpckhdq	xmm6,xmm8
	punpckhdq	xmm1,xmm9
	movq	QWORD[(0-64)+rdi],xmm0
	psrldq	xmm0,8
	movq	QWORD[(64-64)+rdi],xmm6
	psrldq	xmm6,8
	movq	QWORD[(32-64)+rdi],xmm0
	psrldq	xmm1,8
	movq	QWORD[(96-64)+rdi],xmm6
	movq	QWORD[(128-64)+rdi],xmm1

	lea	rdi,[8+rdi]		; step to the next pair of lanes in the state block
	lea	rsi,[32+rsi]		; step past the two records just consumed
	dec	edx
	jnz	NEAR $L$oop_grande_shaext

; rax still holds the entry rsp, so the save area is addressed through it
; without reloading the copy kept at 272+rsp.
$L$done_shaext:

	movaps	xmm6,XMMWORD[((-184))+rax]
	movaps	xmm7,XMMWORD[((-168))+rax]
	movaps	xmm8,XMMWORD[((-152))+rax]
	movaps	xmm9,XMMWORD[((-136))+rax]
	movaps	xmm10,XMMWORD[((-120))+rax]
	movaps	xmm11,XMMWORD[((-104))+rax]
	movaps	xmm12,XMMWORD[((-88))+rax]
	movaps	xmm13,XMMWORD[((-72))+rax]
	movaps	xmm14,XMMWORD[((-56))+rax]
	movaps	xmm15,XMMWORD[((-40))+rax]
	mov	rbp,QWORD[((-16))+rax]

	mov	rbx,QWORD[((-8))+rax]

	lea	rsp,[rax]		; drop the whole frame in one step

$L$epilogue_shaext:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret

$L$SEH_end_sha1_multi_block_shaext:

ALIGN	32
sha1_multi_block_avx:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_sha1_multi_block_avx:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8



_avx_shortcut:
	shr	rcx,32
	cmp	edx,2
	jb	NEAR $L$avx
	test	ecx,32
	jnz	NEAR _avx2_shortcut
	jmp	NEAR $L$avx
ALIGN	32
$L$avx:
	mov	rax,rsp

	push	rbx

	push	rbp

	lea	rsp,[((-168))+rsp]
	movaps	XMMWORD[rsp],xmm6
	movaps	XMMWORD[16+rsp],xmm7
	movaps	XMMWORD[32+rsp],xmm8
	movaps	XMMWORD[48+rsp],xmm9
	movaps	XMMWORD[(-120)+rax],xmm10
	movaps	XMMWORD[(-104)+rax],xmm11
	movaps	XMMWORD[(-88)+rax],xmm12
	movaps	XMMWORD[(-72)+rax],xmm13
	movaps	XMMWORD[(-56)+rax],xmm14
	movaps	XMMWORD[(-40)+rax],xmm15
	sub	rsp,288
	and	rsp,-256
	mov	QWORD[272+rsp],rax

$L$body_avx:
	lea	rbp,[K_XX_XX]
	lea	rbx,[256+rsp]

	vzeroupper
$L$oop_grande_avx:
	mov	DWORD[280+rsp],edx
	xor	edx,edx
	mov	r8,QWORD[rsi]
	mov	ecx,DWORD[8+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[rbx],ecx
	cmovle	r8,rbp
	mov	r9,QWORD[16+rsi]
	mov	ecx,DWORD[24+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[4+rbx],ecx
	cmovle	r9,rbp
	mov	r10,QWORD[32+rsi]
	mov	ecx,DWORD[40+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[8+rbx],ecx
	cmovle	r10,rbp
	mov	r11,QWORD[48+rsi]
	mov	ecx,DWORD[56+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[12+rbx],ecx
	cmovle	r11,rbp
	test	edx,edx
	jz	NEAR $L$done_avx

	vmovdqu	xmm10,XMMWORD[rdi]
	lea	rax,[128+rsp]
	vmovdqu	xmm11,XMMWORD[32+rdi]
	vmovdqu	xmm12,XMMWORD[64+rdi]
	vmovdqu	xmm13,XMMWORD[96+rdi]
	vmovdqu	xmm14,XMMWORD[128+rdi]
	vmovdqu	xmm5,XMMWORD[96+rbp]
	jmp	NEAR $L$oop_avx

ALIGN	32
$L$oop_avx:
	vmovdqa	xmm15,XMMWORD[((-32))+rbp]
	vmovd	xmm0,DWORD[r8]
	lea	r8,[64+r8]
	vmovd	xmm2,DWORD[r9]
	lea	r9,[64+r9]
	vpinsrd	xmm0,xmm0,DWORD[r10],1
	lea	r10,[64+r10]
	vpinsrd	xmm2,xmm2,DWORD[r11],1
	lea	r11,[64+r11]
	vmovd	xmm1,DWORD[((-60))+r8]
	vpunpckldq	xmm0,xmm0,xmm2
	vmovd	xmm9,DWORD[((-60))+r9]
	vpshufb	xmm0,xmm0,xmm5
	vpinsrd	xmm1,xmm1,DWORD[((-60))+r10],1
	vpinsrd	xmm9,xmm9,DWORD[((-60))+r11],1
	vpaddd	xmm14,xmm14,xmm15
	vpslld	xmm8,xmm10,5
	vpandn	xmm7,xmm11,xmm13
	vpand	xmm6,xmm11,xmm12

	vmovdqa	XMMWORD[(0-128)+rax],xmm0
	vpaddd	xmm14,xmm14,xmm0
	vpunpckldq	xmm1,xmm1,xmm9
	vpsrld	xmm9,xmm10,27
	vpxor	xmm6,xmm6,xmm7
	vmovd	xmm2,DWORD[((-56))+r8]

	vpslld	xmm7,xmm11,30
	vpor	xmm8,xmm8,xmm9
	vmovd	xmm9,DWORD[((-56))+r9]
	vpaddd	xmm14,xmm14,xmm6

	vpsrld	xmm11,xmm11,2
	vpaddd	xmm14,xmm14,xmm8
	vpshufb	xmm1,xmm1,xmm5
	vpor	xmm11,xmm11,xmm7
	vpinsrd	xmm2,xmm2,DWORD[((-56))+r10],1
	vpinsrd	xmm9,xmm9,DWORD[((-56))+r11],1
	vpaddd	xmm13,xmm13,xmm15
	vpslld	xmm8,xmm14,5
	vpandn	xmm7,xmm10,xmm12
	vpand	xmm6,xmm10,xmm11

	vmovdqa	XMMWORD[(16-128)+rax],xmm1
	vpaddd	xmm13,xmm13,xmm1
	vpunpckldq	xmm2,xmm2,xmm9
	vpsrld	xmm9,xmm14,27
	vpxor	xmm6,xmm6,xmm7
	vmovd	xmm3,DWORD[((-52))+r8]

	vpslld	xmm7,xmm10,30
	vpor	xmm8,xmm8,xmm9
	vmovd	xmm9,DWORD[((-52))+r9]
	vpaddd	xmm13,xmm13,xmm6

	vpsrld	xmm10,xmm10,2
	vpaddd	xmm13,xmm13,xmm8
	vpshufb	xmm2,xmm2,xmm5
	vpor	xmm10,xmm10,xmm7
	vpinsrd	xmm3,xmm3,DWORD[((-52))+r10],1
	vpinsrd	xmm9,xmm9,DWORD[((-52))+r11],1
	vpaddd	xmm12,xmm12,xmm15
	vpslld	xmm8,xmm13,5
	vpandn	xmm7,xmm14,xmm11
	vpand	xmm6,xmm14,xmm10

	vmovdqa	XMMWORD[(32-128)+rax],xmm2
	vpaddd	xmm12,xmm12,xmm2
	vpunpckldq	xmm3,xmm3,xmm9
	vpsrld	xmm9,xmm13,27
	vpxor	xmm6,xmm6,xmm7
	vmovd	xmm4,DWORD[((-48))+r8]

	vpslld	xmm7,xmm14,30
	vpor	xmm8,xmm8,xmm9
	vmovd	xmm9,DWORD[((-48))+r9]
	vpaddd	xmm12,xmm12,xmm6

	vpsrld	xmm14,xmm14,2
	vpaddd	xmm12,xmm12,xmm8
	vpshufb	xmm3,xmm3,xmm5
	vpor	xmm14,xmm14,xmm7
	vpinsrd	xmm4,xmm4,DWORD[((-48))+r10],1
	vpinsrd	xmm9,xmm9,DWORD[((-48))+r11],1
	vpaddd	xmm11,xmm11,xmm15
	vpslld	xmm8,xmm12,5
	vpandn	xmm7,xmm13,xmm10
	vpand	xmm6,xmm13,xmm14

	vmovdqa	XMMWORD[(48-128)+rax],xmm3
	vpaddd	xmm11,xmm11,xmm3
	vpunpckldq	xmm4,xmm4,xmm9
	vpsrld	xmm9,xmm12,27
	vpxor	xmm6,xmm6,xmm7
	vmovd	xmm0,DWORD[((-44))+r8]

	vpslld	xmm7,xmm13,30
	vpor	xmm8,xmm8,xmm9
	vmovd	xmm9,DWORD[((-44))+r9]
	vpaddd	xmm11,xmm11,xmm6

	vpsrld	xmm13,xmm13,2
	vpaddd	xmm11,xmm11,xmm8
	vpshufb	xmm4,xmm4,xmm5
	vpor	xmm13,xmm13,xmm7
	vpinsrd	xmm0,xmm0,DWORD[((-44))+r10],1
	vpinsrd	xmm9,xmm9,DWORD[((-44))+r11],1
	vpaddd	xmm10,xmm10,xmm15
	vpslld	xmm8,xmm11,5
	vpandn	xmm7,xmm12,xmm14
	vpand	xmm6,xmm12,xmm13

	vmovdqa	XMMWORD[(64-128)+rax],xmm4
	vpaddd	xmm10,xmm10,xmm4
	vpunpckldq	xmm0,xmm0,xmm9
	vpsrld	xmm9,xmm11,27
	vpxor	xmm6,xmm6,xmm7
	vmovd	xmm1,DWORD[((-40))+r8]

	vpslld	xmm7,xmm12,30
	vpor	xmm8,xmm8,xmm9
	vmovd	xmm9,DWORD[((-40))+r9]
	vpaddd	xmm10,xmm10,xmm6

	vpsrld	xmm12,xmm12,2
	vpaddd	xmm10,xmm10,xmm8
	vpshufb	xmm0,xmm0,xmm5
	vpor	xmm12,xmm12,xmm7
	vpinsrd	xmm1,xmm1,DWORD[((-40))+r10],1
	vpinsrd	xmm9,xmm9,DWORD[((-40))+r11],1
	vpaddd	xmm14,xmm14,xmm15
	vpslld	xmm8,xmm10,5
	vpandn	xmm7,xmm11,xmm13
	vpand	xmm6,xmm11,xmm12

	vmovdqa	XMMWORD[(80-128)+rax],xmm0
	vpaddd	xmm14,xmm14,xmm0
	vpunpckldq	xmm1,xmm1,xmm9
	vpsrld	xmm9,xmm10,27
	vpxor	xmm6,xmm6,xmm7
	vmovd	xmm2,DWORD[((-36))+r8]

	vpslld	xmm7,xmm11,30
	vpor	xmm8,xmm8,xmm9
	vmovd	xmm9,DWORD[((-36))+r9]
	vpaddd	xmm14,xmm14,xmm6

	vpsrld	xmm11,xmm11,2
	vpaddd	xmm14,xmm14,xmm8
	vpshufb	xmm1,xmm1,xmm5
	vpor	xmm11,xmm11,xmm7
	vpinsrd	xmm2,xmm2,DWORD[((-36))+r10],1
	vpinsrd	xmm9,xmm9,DWORD[((-36))+r11],1
	vpaddd	xmm13,xmm13,xmm15
	vpslld	xmm8,xmm14,5
	vpandn	xmm7,xmm10,xmm12
	vpand	xmm6,xmm10,xmm11

	vmovdqa	XMMWORD[(96-128)+rax],xmm1
	vpaddd	xmm13,xmm13,xmm1
	vpunpckldq	xmm2,xmm2,xmm9
	vpsrld	xmm9,xmm14,27
	vpxor	xmm6,xmm6,xmm7
	vmovd	xmm3,DWORD[((-32))+r8]

	vpslld	xmm7,xmm10,30
	vpor	xmm8,xmm8,xmm9
	vmovd	xmm9,DWORD[((-32))+r9]
	vpaddd	xmm13,xmm13,xmm6

	vpsrld	xmm10,xmm10,2
	vpaddd	xmm13,xmm13,xmm8
	vpshufb	xmm2,xmm2,xmm5
	vpor	xmm10,xmm10,xmm7
	vpinsrd	xmm3,xmm3,DWORD[((-32))+r10],1
	vpinsrd	xmm9,xmm9,DWORD[((-32))+r11],1
	vpaddd	xmm12,xmm12,xmm15
	vpslld	xmm8,xmm13,5
	vpandn	xmm7,xmm14,xmm11
	vpand	xmm6,xmm14,xmm10

	vmovdqa	XMMWORD[(112-128)+rax],xmm2
	vpaddd	xmm12,xmm12,xmm2
	vpunpckldq	xmm3,xmm3,xmm9
	vpsrld	xmm9,xmm13,27
	vpxor	xmm6,xmm6,xmm7
	vmovd	xmm4,DWORD[((-28))+r8]

	vpslld	xmm7,xmm14,30
	vpor	xmm8,xmm8,xmm9
	vmovd	xmm9,DWORD[((-28))+r9]
	vpaddd	xmm12,xmm12,xmm6

	vpsrld	xmm14,xmm14,2
	vpaddd	xmm12,xmm12,xmm8
	vpshufb	xmm3,xmm3,xmm5
	vpor	xmm14,xmm14,xmm7
	vpinsrd	xmm4,xmm4,DWORD[((-28))+r10],1
	vpinsrd	xmm9,xmm9,DWORD[((-28))+r11],1
	vpaddd	xmm11,xmm11,xmm15
	vpslld	xmm8,xmm12,5
	vpandn	xmm7,xmm13,xmm10
	vpand	xmm6,xmm13,xmm14

	vmovdqa	XMMWORD[(128-128)+rax],xmm3
	vpaddd	xmm11,xmm11,xmm3
	vpunpckldq	xmm4,xmm4,xmm9
	vpsrld	xmm9,xmm12,27
	vpxor	xmm6,xmm6,xmm7
	vmovd	xmm0,DWORD[((-24))+r8]

	vpslld	xmm7,xmm13,30
	vpor	xmm8,xmm8,xmm9
	vmovd	xmm9,DWORD[((-24))+r9]
	vpaddd	xmm11,xmm11,xmm6

	vpsrld	xmm13,xmm13,2
	vpaddd	xmm11,xmm11,xmm8
	vpshufb	xmm4,xmm4,xmm5
	vpor	xmm13,xmm13,xmm7
	vpinsrd	xmm0,xmm0,DWORD[((-24))+r10],1
	vpinsrd	xmm9,xmm9,DWORD[((-24))+r11],1
	vpaddd	xmm10,xmm10,xmm15
	vpslld	xmm8,xmm11,5
	vpandn	xmm7,xmm12,xmm14
	vpand	xmm6,xmm12,xmm13

	vmovdqa	XMMWORD[(144-128)+rax],xmm4
	vpaddd	xmm10,xmm10,xmm4
	vpunpckldq	xmm0,xmm0,xmm9
	vpsrld	xmm9,xmm11,27
	vpxor	xmm6,xmm6,xmm7
	vmovd	xmm1,DWORD[((-20))+r8]

	vpslld	xmm7,xmm12,30
	vpor	xmm8,xmm8,xmm9
	vmovd	xmm9,DWORD[((-20))+r9]
	vpaddd	xmm10,xmm10,xmm6

	vpsrld	xmm12,xmm12,2
	vpaddd	xmm10,xmm10,xmm8
	vpshufb	xmm0,xmm0,xmm5
	vpor	xmm12,xmm12,xmm7
	vpinsrd	xmm1,xmm1,DWORD[((-20))+r10],1
	vpinsrd	xmm9,xmm9,DWORD[((-20))+r11],1
	vpaddd	xmm14,xmm14,xmm15
	vpslld	xmm8,xmm10,5
	vpandn	xmm7,xmm11,xmm13
	vpand	xmm6,xmm11,xmm12

	vmovdqa	XMMWORD[(160-128)+rax],xmm0
	vpaddd	xmm14,xmm14,xmm0
	vpunpckldq	xmm1,xmm1,xmm9
	vpsrld	xmm9,xmm10,27
	vpxor	xmm6,xmm6,xmm7
	vmovd	xmm2,DWORD[((-16))+r8]

	vpslld	xmm7,xmm11,30
	vpor	xmm8,xmm8,xmm9
	vmovd	xmm9,DWORD[((-16))+r9]
	vpaddd	xmm14,xmm14,xmm6

	vpsrld	xmm11,xmm11,2
	vpaddd	xmm14,xmm14,xmm8
	vpshufb	xmm1,xmm1,xmm5
	vpor	xmm11,xmm11,xmm7
	vpinsrd	xmm2,xmm2,DWORD[((-16))+r10],1
	vpinsrd	xmm9,xmm9,DWORD[((-16))+r11],1
	vpaddd	xmm13,xmm13,xmm15
	vpslld	xmm8,xmm14,5
	vpandn	xmm7,xmm10,xmm12
	vpand	xmm6,xmm10,xmm11

	vmovdqa	XMMWORD[(176-128)+rax],xmm1
	vpaddd	xmm13,xmm13,xmm1
	vpunpckldq	xmm2,xmm2,xmm9
	vpsrld	xmm9,xmm14,27
	vpxor	xmm6,xmm6,xmm7
	vmovd	xmm3,DWORD[((-12))+r8]

	vpslld	xmm7,xmm10,30
	vpor	xmm8,xmm8,xmm9
	vmovd	xmm9,DWORD[((-12))+r9]
	vpaddd	xmm13,xmm13,xmm6

	vpsrld	xmm10,xmm10,2
	vpaddd	xmm13,xmm13,xmm8
	vpshufb	xmm2,xmm2,xmm5
	vpor	xmm10,xmm10,xmm7
	vpinsrd	xmm3,xmm3,DWORD[((-12))+r10],1
	vpinsrd	xmm9,xmm9,DWORD[((-12))+r11],1
	vpaddd	xmm12,xmm12,xmm15
	vpslld	xmm8,xmm13,5
	vpandn	xmm7,xmm14,xmm11
	vpand	xmm6,xmm14,xmm10

	vmovdqa	XMMWORD[(192-128)+rax],xmm2
	vpaddd	xmm12,xmm12,xmm2
	vpunpckldq	xmm3,xmm3,xmm9
	vpsrld	xmm9,xmm13,27
	vpxor	xmm6,xmm6,xmm7
	vmovd	xmm4,DWORD[((-8))+r8]

	vpslld	xmm7,xmm14,30
	vpor	xmm8,xmm8,xmm9
	vmovd	xmm9,DWORD[((-8))+r9]
	vpaddd	xmm12,xmm12,xmm6

	vpsrld	xmm14,xmm14,2
	vpaddd	xmm12,xmm12,xmm8
	vpshufb	xmm3,xmm3,xmm5
	vpor	xmm14,xmm14,xmm7
	vpinsrd	xmm4,xmm4,DWORD[((-8))+r10],1
	vpinsrd	xmm9,xmm9,DWORD[((-8))+r11],1
	vpaddd	xmm11,xmm11,xmm15
	vpslld	xmm8,xmm12,5
	vpandn	xmm7,xmm13,xmm10
	vpand	xmm6,xmm13,xmm14

	vmovdqa	XMMWORD[(208-128)+rax],xmm3
	vpaddd	xmm11,xmm11,xmm3
	vpunpckldq	xmm4,xmm4,xmm9
	vpsrld	xmm9,xmm12,27
	vpxor	xmm6,xmm6,xmm7
	vmovd	xmm0,DWORD[((-4))+r8]

	vpslld	xmm7,xmm13,30
	vpor	xmm8,xmm8,xmm9
	vmovd	xmm9,DWORD[((-4))+r9]
	vpaddd	xmm11,xmm11,xmm6

	vpsrld	xmm13,xmm13,2
	vpaddd	xmm11,xmm11,xmm8
	vpshufb	xmm4,xmm4,xmm5
	vpor	xmm13,xmm13,xmm7
	vmovdqa	xmm1,XMMWORD[((0-128))+rax]
	vpinsrd	xmm0,xmm0,DWORD[((-4))+r10],1
	vpinsrd	xmm9,xmm9,DWORD[((-4))+r11],1
	vpaddd	xmm10,xmm10,xmm15
	prefetcht0	[63+r8]
	vpslld	xmm8,xmm11,5
	vpandn	xmm7,xmm12,xmm14
	vpand	xmm6,xmm12,xmm13

	vmovdqa	XMMWORD[(224-128)+rax],xmm4
	vpaddd	xmm10,xmm10,xmm4
	vpunpckldq	xmm0,xmm0,xmm9
	vpsrld	xmm9,xmm11,27
	prefetcht0	[63+r9]
	vpxor	xmm6,xmm6,xmm7

	vpslld	xmm7,xmm12,30
	vpor	xmm8,xmm8,xmm9
	prefetcht0	[63+r10]
	vpaddd	xmm10,xmm10,xmm6

	vpsrld	xmm12,xmm12,2
	vpaddd	xmm10,xmm10,xmm8
	prefetcht0	[63+r11]
	vpshufb	xmm0,xmm0,xmm5
	vpor	xmm12,xmm12,xmm7
	vmovdqa	xmm2,XMMWORD[((16-128))+rax]
	vpxor	xmm1,xmm1,xmm3
	vmovdqa	xmm3,XMMWORD[((32-128))+rax]

	vpaddd	xmm14,xmm14,xmm15
	vpslld	xmm8,xmm10,5
	vpandn	xmm7,xmm11,xmm13

	vpand	xmm6,xmm11,xmm12

	vmovdqa	XMMWORD[(240-128)+rax],xmm0
	vpaddd	xmm14,xmm14,xmm0
	vpxor	xmm1,xmm1,XMMWORD[((128-128))+rax]
	vpsrld	xmm9,xmm10,27
	vpxor	xmm6,xmm6,xmm7
	vpxor	xmm1,xmm1,xmm3


	vpslld	xmm7,xmm11,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm14,xmm14,xmm6

	vpsrld	xmm5,xmm1,31
	vpaddd	xmm1,xmm1,xmm1

	vpsrld	xmm11,xmm11,2

	vpaddd	xmm14,xmm14,xmm8
	vpor	xmm1,xmm1,xmm5
	vpor	xmm11,xmm11,xmm7
	vpxor	xmm2,xmm2,xmm4
	vmovdqa	xmm4,XMMWORD[((48-128))+rax]

	vpaddd	xmm13,xmm13,xmm15
	vpslld	xmm8,xmm14,5
	vpandn	xmm7,xmm10,xmm12

	vpand	xmm6,xmm10,xmm11

	vmovdqa	XMMWORD[(0-128)+rax],xmm1
	vpaddd	xmm13,xmm13,xmm1
	vpxor	xmm2,xmm2,XMMWORD[((144-128))+rax]
	vpsrld	xmm9,xmm14,27
	vpxor	xmm6,xmm6,xmm7
	vpxor	xmm2,xmm2,xmm4


	vpslld	xmm7,xmm10,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm13,xmm13,xmm6

	vpsrld	xmm5,xmm2,31
	vpaddd	xmm2,xmm2,xmm2

	vpsrld	xmm10,xmm10,2

	vpaddd	xmm13,xmm13,xmm8
	vpor	xmm2,xmm2,xmm5
	vpor	xmm10,xmm10,xmm7
	vpxor	xmm3,xmm3,xmm0
	vmovdqa	xmm0,XMMWORD[((64-128))+rax]

	vpaddd	xmm12,xmm12,xmm15
	vpslld	xmm8,xmm13,5
	vpandn	xmm7,xmm14,xmm11

	vpand	xmm6,xmm14,xmm10

	vmovdqa	XMMWORD[(16-128)+rax],xmm2
	vpaddd	xmm12,xmm12,xmm2
	vpxor	xmm3,xmm3,XMMWORD[((160-128))+rax]
	vpsrld	xmm9,xmm13,27
	vpxor	xmm6,xmm6,xmm7
	vpxor	xmm3,xmm3,xmm0


	vpslld	xmm7,xmm14,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm12,xmm12,xmm6

	vpsrld	xmm5,xmm3,31
	vpaddd	xmm3,xmm3,xmm3

	vpsrld	xmm14,xmm14,2

	vpaddd	xmm12,xmm12,xmm8
	vpor	xmm3,xmm3,xmm5
	vpor	xmm14,xmm14,xmm7
	vpxor	xmm4,xmm4,xmm1
	vmovdqa	xmm1,XMMWORD[((80-128))+rax]

	vpaddd	xmm11,xmm11,xmm15
	vpslld	xmm8,xmm12,5
	vpandn	xmm7,xmm13,xmm10

	vpand	xmm6,xmm13,xmm14

	vmovdqa	XMMWORD[(32-128)+rax],xmm3
	vpaddd	xmm11,xmm11,xmm3
	vpxor	xmm4,xmm4,XMMWORD[((176-128))+rax]
	vpsrld	xmm9,xmm12,27
	vpxor	xmm6,xmm6,xmm7
	vpxor	xmm4,xmm4,xmm1


	vpslld	xmm7,xmm13,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm11,xmm11,xmm6

	vpsrld	xmm5,xmm4,31
	vpaddd	xmm4,xmm4,xmm4

	vpsrld	xmm13,xmm13,2

	vpaddd	xmm11,xmm11,xmm8
	vpor	xmm4,xmm4,xmm5
	vpor	xmm13,xmm13,xmm7
	vpxor	xmm0,xmm0,xmm2
	vmovdqa	xmm2,XMMWORD[((96-128))+rax]

	vpaddd	xmm10,xmm10,xmm15
	vpslld	xmm8,xmm11,5
	vpandn	xmm7,xmm12,xmm14

	vpand	xmm6,xmm12,xmm13

	vmovdqa	XMMWORD[(48-128)+rax],xmm4
	vpaddd	xmm10,xmm10,xmm4
	vpxor	xmm0,xmm0,XMMWORD[((192-128))+rax]
	vpsrld	xmm9,xmm11,27
	vpxor	xmm6,xmm6,xmm7
	vpxor	xmm0,xmm0,xmm2


	vpslld	xmm7,xmm12,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm10,xmm10,xmm6

	vpsrld	xmm5,xmm0,31
	vpaddd	xmm0,xmm0,xmm0

	vpsrld	xmm12,xmm12,2

	vpaddd	xmm10,xmm10,xmm8
	vpor	xmm0,xmm0,xmm5
	vpor	xmm12,xmm12,xmm7
	vmovdqa	xmm15,XMMWORD[rbp]
	vpxor	xmm1,xmm1,xmm3
	vmovdqa	xmm3,XMMWORD[((112-128))+rax]

	vpslld	xmm8,xmm10,5
	vpaddd	xmm14,xmm14,xmm15
	vpxor	xmm6,xmm13,xmm11
	vmovdqa	XMMWORD[(64-128)+rax],xmm0
	vpaddd	xmm14,xmm14,xmm0
	vpxor	xmm1,xmm1,XMMWORD[((208-128))+rax]
	vpsrld	xmm9,xmm10,27
	vpxor	xmm6,xmm6,xmm12
	vpxor	xmm1,xmm1,xmm3

	vpslld	xmm7,xmm11,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm14,xmm14,xmm6
	vpsrld	xmm5,xmm1,31
	vpaddd	xmm1,xmm1,xmm1

	vpsrld	xmm11,xmm11,2
	vpaddd	xmm14,xmm14,xmm8
	vpor	xmm1,xmm1,xmm5
	vpor	xmm11,xmm11,xmm7
	vpxor	xmm2,xmm2,xmm4
	vmovdqa	xmm4,XMMWORD[((128-128))+rax]

	vpslld	xmm8,xmm14,5
	vpaddd	xmm13,xmm13,xmm15
	vpxor	xmm6,xmm12,xmm10
	vmovdqa	XMMWORD[(80-128)+rax],xmm1
	vpaddd	xmm13,xmm13,xmm1
	vpxor	xmm2,xmm2,XMMWORD[((224-128))+rax]
	vpsrld	xmm9,xmm14,27
	vpxor	xmm6,xmm6,xmm11
	vpxor	xmm2,xmm2,xmm4

	vpslld	xmm7,xmm10,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm13,xmm13,xmm6
	vpsrld	xmm5,xmm2,31
	vpaddd	xmm2,xmm2,xmm2

	vpsrld	xmm10,xmm10,2
	vpaddd	xmm13,xmm13,xmm8
	vpor	xmm2,xmm2,xmm5
	vpor	xmm10,xmm10,xmm7
	vpxor	xmm3,xmm3,xmm0
	vmovdqa	xmm0,XMMWORD[((144-128))+rax]

	vpslld	xmm8,xmm13,5
	vpaddd	xmm12,xmm12,xmm15
	vpxor	xmm6,xmm11,xmm14
	vmovdqa	XMMWORD[(96-128)+rax],xmm2
	vpaddd	xmm12,xmm12,xmm2
	vpxor	xmm3,xmm3,XMMWORD[((240-128))+rax]
	vpsrld	xmm9,xmm13,27
	vpxor	xmm6,xmm6,xmm10
	vpxor	xmm3,xmm3,xmm0

	vpslld	xmm7,xmm14,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm12,xmm12,xmm6
	vpsrld	xmm5,xmm3,31
	vpaddd	xmm3,xmm3,xmm3

	vpsrld	xmm14,xmm14,2
	vpaddd	xmm12,xmm12,xmm8
	vpor	xmm3,xmm3,xmm5
	vpor	xmm14,xmm14,xmm7
	vpxor	xmm4,xmm4,xmm1
	vmovdqa	xmm1,XMMWORD[((160-128))+rax]

	vpslld	xmm8,xmm12,5
	vpaddd	xmm11,xmm11,xmm15
	vpxor	xmm6,xmm10,xmm13
	vmovdqa	XMMWORD[(112-128)+rax],xmm3
	vpaddd	xmm11,xmm11,xmm3
	vpxor	xmm4,xmm4,XMMWORD[((0-128))+rax]
	vpsrld	xmm9,xmm12,27
	vpxor	xmm6,xmm6,xmm14
	vpxor	xmm4,xmm4,xmm1

	vpslld	xmm7,xmm13,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm11,xmm11,xmm6
	vpsrld	xmm5,xmm4,31
	vpaddd	xmm4,xmm4,xmm4

	vpsrld	xmm13,xmm13,2
	vpaddd	xmm11,xmm11,xmm8
	vpor	xmm4,xmm4,xmm5
	vpor	xmm13,xmm13,xmm7
	vpxor	xmm0,xmm0,xmm2
	vmovdqa	xmm2,XMMWORD[((176-128))+rax]

	vpslld	xmm8,xmm11,5
	vpaddd	xmm10,xmm10,xmm15
	vpxor	xmm6,xmm14,xmm12
	vmovdqa	XMMWORD[(128-128)+rax],xmm4
	vpaddd	xmm10,xmm10,xmm4
	vpxor	xmm0,xmm0,XMMWORD[((16-128))+rax]
	vpsrld	xmm9,xmm11,27
	vpxor	xmm6,xmm6,xmm13
	vpxor	xmm0,xmm0,xmm2

	vpslld	xmm7,xmm12,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm10,xmm10,xmm6
	vpsrld	xmm5,xmm0,31
	vpaddd	xmm0,xmm0,xmm0

	vpsrld	xmm12,xmm12,2
	vpaddd	xmm10,xmm10,xmm8
	vpor	xmm0,xmm0,xmm5
	vpor	xmm12,xmm12,xmm7
	vpxor	xmm1,xmm1,xmm3
	vmovdqa	xmm3,XMMWORD[((192-128))+rax]

	vpslld	xmm8,xmm10,5
	vpaddd	xmm14,xmm14,xmm15
	vpxor	xmm6,xmm13,xmm11
	vmovdqa	XMMWORD[(144-128)+rax],xmm0
	vpaddd	xmm14,xmm14,xmm0
	vpxor	xmm1,xmm1,XMMWORD[((32-128))+rax]
	vpsrld	xmm9,xmm10,27
	vpxor	xmm6,xmm6,xmm12
	vpxor	xmm1,xmm1,xmm3

	vpslld	xmm7,xmm11,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm14,xmm14,xmm6
	vpsrld	xmm5,xmm1,31
	vpaddd	xmm1,xmm1,xmm1

	vpsrld	xmm11,xmm11,2
	vpaddd	xmm14,xmm14,xmm8
	vpor	xmm1,xmm1,xmm5
	vpor	xmm11,xmm11,xmm7
	vpxor	xmm2,xmm2,xmm4
	vmovdqa	xmm4,XMMWORD[((208-128))+rax]

	vpslld	xmm8,xmm14,5
	vpaddd	xmm13,xmm13,xmm15
	vpxor	xmm6,xmm12,xmm10
	vmovdqa	XMMWORD[(160-128)+rax],xmm1
	vpaddd	xmm13,xmm13,xmm1
	vpxor	xmm2,xmm2,XMMWORD[((48-128))+rax]
	vpsrld	xmm9,xmm14,27
	vpxor	xmm6,xmm6,xmm11
	vpxor	xmm2,xmm2,xmm4

	vpslld	xmm7,xmm10,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm13,xmm13,xmm6
	vpsrld	xmm5,xmm2,31
	vpaddd	xmm2,xmm2,xmm2

	vpsrld	xmm10,xmm10,2
	vpaddd	xmm13,xmm13,xmm8
	vpor	xmm2,xmm2,xmm5
	vpor	xmm10,xmm10,xmm7
	vpxor	xmm3,xmm3,xmm0
	vmovdqa	xmm0,XMMWORD[((224-128))+rax]

	vpslld	xmm8,xmm13,5
	vpaddd	xmm12,xmm12,xmm15
	vpxor	xmm6,xmm11,xmm14
	vmovdqa	XMMWORD[(176-128)+rax],xmm2
	vpaddd	xmm12,xmm12,xmm2
	vpxor	xmm3,xmm3,XMMWORD[((64-128))+rax]
	vpsrld	xmm9,xmm13,27
	vpxor	xmm6,xmm6,xmm10
	vpxor	xmm3,xmm3,xmm0

	vpslld	xmm7,xmm14,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm12,xmm12,xmm6
	vpsrld	xmm5,xmm3,31
	vpaddd	xmm3,xmm3,xmm3

	vpsrld	xmm14,xmm14,2
	vpaddd	xmm12,xmm12,xmm8
	vpor	xmm3,xmm3,xmm5
	vpor	xmm14,xmm14,xmm7
	vpxor	xmm4,xmm4,xmm1
	vmovdqa	xmm1,XMMWORD[((240-128))+rax]

	vpslld	xmm8,xmm12,5
	vpaddd	xmm11,xmm11,xmm15
	vpxor	xmm6,xmm10,xmm13
	vmovdqa	XMMWORD[(192-128)+rax],xmm3
	vpaddd	xmm11,xmm11,xmm3
	vpxor	xmm4,xmm4,XMMWORD[((80-128))+rax]
	vpsrld	xmm9,xmm12,27
	vpxor	xmm6,xmm6,xmm14
	vpxor	xmm4,xmm4,xmm1

	vpslld	xmm7,xmm13,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm11,xmm11,xmm6
	vpsrld	xmm5,xmm4,31
	vpaddd	xmm4,xmm4,xmm4

	vpsrld	xmm13,xmm13,2
	vpaddd	xmm11,xmm11,xmm8
	vpor	xmm4,xmm4,xmm5
	vpor	xmm13,xmm13,xmm7
	vpxor	xmm0,xmm0,xmm2
	vmovdqa	xmm2,XMMWORD[((0-128))+rax]

	vpslld	xmm8,xmm11,5
	vpaddd	xmm10,xmm10,xmm15
	vpxor	xmm6,xmm14,xmm12
	vmovdqa	XMMWORD[(208-128)+rax],xmm4
	vpaddd	xmm10,xmm10,xmm4
	vpxor	xmm0,xmm0,XMMWORD[((96-128))+rax]
	vpsrld	xmm9,xmm11,27
	vpxor	xmm6,xmm6,xmm13
	vpxor	xmm0,xmm0,xmm2

	vpslld	xmm7,xmm12,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm10,xmm10,xmm6
	vpsrld	xmm5,xmm0,31
	vpaddd	xmm0,xmm0,xmm0

	vpsrld	xmm12,xmm12,2
	vpaddd	xmm10,xmm10,xmm8
	vpor	xmm0,xmm0,xmm5
	vpor	xmm12,xmm12,xmm7
	vpxor	xmm1,xmm1,xmm3
	vmovdqa	xmm3,XMMWORD[((16-128))+rax]

	vpslld	xmm8,xmm10,5
	vpaddd	xmm14,xmm14,xmm15
	vpxor	xmm6,xmm13,xmm11
	vmovdqa	XMMWORD[(224-128)+rax],xmm0
	vpaddd	xmm14,xmm14,xmm0
	vpxor	xmm1,xmm1,XMMWORD[((112-128))+rax]
	vpsrld	xmm9,xmm10,27
	vpxor	xmm6,xmm6,xmm12
	vpxor	xmm1,xmm1,xmm3

	vpslld	xmm7,xmm11,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm14,xmm14,xmm6
	vpsrld	xmm5,xmm1,31
	vpaddd	xmm1,xmm1,xmm1

	vpsrld	xmm11,xmm11,2
	vpaddd	xmm14,xmm14,xmm8
	vpor	xmm1,xmm1,xmm5
	vpor	xmm11,xmm11,xmm7
	vpxor	xmm2,xmm2,xmm4
	vmovdqa	xmm4,XMMWORD[((32-128))+rax]

	vpslld	xmm8,xmm14,5
	vpaddd	xmm13,xmm13,xmm15
	vpxor	xmm6,xmm12,xmm10
	vmovdqa	XMMWORD[(240-128)+rax],xmm1
	vpaddd	xmm13,xmm13,xmm1
	vpxor	xmm2,xmm2,XMMWORD[((128-128))+rax]
	vpsrld	xmm9,xmm14,27
	vpxor	xmm6,xmm6,xmm11
	vpxor	xmm2,xmm2,xmm4

	vpslld	xmm7,xmm10,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm13,xmm13,xmm6
	vpsrld	xmm5,xmm2,31
	vpaddd	xmm2,xmm2,xmm2

	vpsrld	xmm10,xmm10,2
	vpaddd	xmm13,xmm13,xmm8
	vpor	xmm2,xmm2,xmm5
	vpor	xmm10,xmm10,xmm7
	vpxor	xmm3,xmm3,xmm0
	vmovdqa	xmm0,XMMWORD[((48-128))+rax]

	vpslld	xmm8,xmm13,5
	vpaddd	xmm12,xmm12,xmm15
	vpxor	xmm6,xmm11,xmm14
	vmovdqa	XMMWORD[(0-128)+rax],xmm2
	vpaddd	xmm12,xmm12,xmm2
	vpxor	xmm3,xmm3,XMMWORD[((144-128))+rax]
	vpsrld	xmm9,xmm13,27
	vpxor	xmm6,xmm6,xmm10
	vpxor	xmm3,xmm3,xmm0

	vpslld	xmm7,xmm14,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm12,xmm12,xmm6
	vpsrld	xmm5,xmm3,31
	vpaddd	xmm3,xmm3,xmm3

	vpsrld	xmm14,xmm14,2
	vpaddd	xmm12,xmm12,xmm8
	vpor	xmm3,xmm3,xmm5
	vpor	xmm14,xmm14,xmm7
	vpxor	xmm4,xmm4,xmm1
	vmovdqa	xmm1,XMMWORD[((64-128))+rax]

	vpslld	xmm8,xmm12,5
	vpaddd	xmm11,xmm11,xmm15
	vpxor	xmm6,xmm10,xmm13
	vmovdqa	XMMWORD[(16-128)+rax],xmm3
	vpaddd	xmm11,xmm11,xmm3
	vpxor	xmm4,xmm4,XMMWORD[((160-128))+rax]
	vpsrld	xmm9,xmm12,27
	vpxor	xmm6,xmm6,xmm14
	vpxor	xmm4,xmm4,xmm1

	vpslld	xmm7,xmm13,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm11,xmm11,xmm6
	vpsrld	xmm5,xmm4,31
	vpaddd	xmm4,xmm4,xmm4

	vpsrld	xmm13,xmm13,2
	vpaddd	xmm11,xmm11,xmm8
	vpor	xmm4,xmm4,xmm5
	vpor	xmm13,xmm13,xmm7
	vpxor	xmm0,xmm0,xmm2
	vmovdqa	xmm2,XMMWORD[((80-128))+rax]

	vpslld	xmm8,xmm11,5
	vpaddd	xmm10,xmm10,xmm15
	vpxor	xmm6,xmm14,xmm12
	vmovdqa	XMMWORD[(32-128)+rax],xmm4
	vpaddd	xmm10,xmm10,xmm4
	vpxor	xmm0,xmm0,XMMWORD[((176-128))+rax]
	vpsrld	xmm9,xmm11,27
	vpxor	xmm6,xmm6,xmm13
	vpxor	xmm0,xmm0,xmm2

	vpslld	xmm7,xmm12,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm10,xmm10,xmm6
	vpsrld	xmm5,xmm0,31
	vpaddd	xmm0,xmm0,xmm0

	vpsrld	xmm12,xmm12,2
	vpaddd	xmm10,xmm10,xmm8
	vpor	xmm0,xmm0,xmm5
	vpor	xmm12,xmm12,xmm7
	vpxor	xmm1,xmm1,xmm3
	vmovdqa	xmm3,XMMWORD[((96-128))+rax]

	vpslld	xmm8,xmm10,5
	vpaddd	xmm14,xmm14,xmm15
	vpxor	xmm6,xmm13,xmm11
	vmovdqa	XMMWORD[(48-128)+rax],xmm0
	vpaddd	xmm14,xmm14,xmm0
	vpxor	xmm1,xmm1,XMMWORD[((192-128))+rax]
	vpsrld	xmm9,xmm10,27
	vpxor	xmm6,xmm6,xmm12
	vpxor	xmm1,xmm1,xmm3

	vpslld	xmm7,xmm11,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm14,xmm14,xmm6
	vpsrld	xmm5,xmm1,31
	vpaddd	xmm1,xmm1,xmm1

	vpsrld	xmm11,xmm11,2
	vpaddd	xmm14,xmm14,xmm8
	vpor	xmm1,xmm1,xmm5
	vpor	xmm11,xmm11,xmm7
	vpxor	xmm2,xmm2,xmm4
	vmovdqa	xmm4,XMMWORD[((112-128))+rax]

	vpslld	xmm8,xmm14,5
	vpaddd	xmm13,xmm13,xmm15
	vpxor	xmm6,xmm12,xmm10
	vmovdqa	XMMWORD[(64-128)+rax],xmm1
	vpaddd	xmm13,xmm13,xmm1
	vpxor	xmm2,xmm2,XMMWORD[((208-128))+rax]
	vpsrld	xmm9,xmm14,27
	vpxor	xmm6,xmm6,xmm11
	vpxor	xmm2,xmm2,xmm4

	vpslld	xmm7,xmm10,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm13,xmm13,xmm6
	vpsrld	xmm5,xmm2,31
	vpaddd	xmm2,xmm2,xmm2

	vpsrld	xmm10,xmm10,2
	vpaddd	xmm13,xmm13,xmm8
	vpor	xmm2,xmm2,xmm5
	vpor	xmm10,xmm10,xmm7
	vpxor	xmm3,xmm3,xmm0
	vmovdqa	xmm0,XMMWORD[((128-128))+rax]

	vpslld	xmm8,xmm13,5
	vpaddd	xmm12,xmm12,xmm15
	vpxor	xmm6,xmm11,xmm14
	vmovdqa	XMMWORD[(80-128)+rax],xmm2
	vpaddd	xmm12,xmm12,xmm2
	vpxor	xmm3,xmm3,XMMWORD[((224-128))+rax]
	vpsrld	xmm9,xmm13,27
	vpxor	xmm6,xmm6,xmm10
	vpxor	xmm3,xmm3,xmm0

	vpslld	xmm7,xmm14,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm12,xmm12,xmm6
	vpsrld	xmm5,xmm3,31
	vpaddd	xmm3,xmm3,xmm3

	vpsrld	xmm14,xmm14,2
	vpaddd	xmm12,xmm12,xmm8
	vpor	xmm3,xmm3,xmm5
	vpor	xmm14,xmm14,xmm7
	vpxor	xmm4,xmm4,xmm1
	vmovdqa	xmm1,XMMWORD[((144-128))+rax]

	vpslld	xmm8,xmm12,5
	vpaddd	xmm11,xmm11,xmm15
	vpxor	xmm6,xmm10,xmm13
	vmovdqa	XMMWORD[(96-128)+rax],xmm3
	vpaddd	xmm11,xmm11,xmm3
	vpxor	xmm4,xmm4,XMMWORD[((240-128))+rax]
	vpsrld	xmm9,xmm12,27
	vpxor	xmm6,xmm6,xmm14
	vpxor	xmm4,xmm4,xmm1

	vpslld	xmm7,xmm13,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm11,xmm11,xmm6
	vpsrld	xmm5,xmm4,31
	vpaddd	xmm4,xmm4,xmm4

	vpsrld	xmm13,xmm13,2
	vpaddd	xmm11,xmm11,xmm8
	vpor	xmm4,xmm4,xmm5
	vpor	xmm13,xmm13,xmm7
	vpxor	xmm0,xmm0,xmm2
	vmovdqa	xmm2,XMMWORD[((160-128))+rax]

	vpslld	xmm8,xmm11,5
	vpaddd	xmm10,xmm10,xmm15
	vpxor	xmm6,xmm14,xmm12
	vmovdqa	XMMWORD[(112-128)+rax],xmm4
	vpaddd	xmm10,xmm10,xmm4
	vpxor	xmm0,xmm0,XMMWORD[((0-128))+rax]
	vpsrld	xmm9,xmm11,27
	vpxor	xmm6,xmm6,xmm13
	vpxor	xmm0,xmm0,xmm2

	vpslld	xmm7,xmm12,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm10,xmm10,xmm6
	vpsrld	xmm5,xmm0,31
	vpaddd	xmm0,xmm0,xmm0

	vpsrld	xmm12,xmm12,2
	vpaddd	xmm10,xmm10,xmm8
	vpor	xmm0,xmm0,xmm5
	vpor	xmm12,xmm12,xmm7
	vmovdqa	xmm15,XMMWORD[32+rbp]
	vpxor	xmm1,xmm1,xmm3
	vmovdqa	xmm3,XMMWORD[((176-128))+rax]

	vpaddd	xmm14,xmm14,xmm15
	vpslld	xmm8,xmm10,5
	vpand	xmm7,xmm13,xmm12
	vpxor	xmm1,xmm1,XMMWORD[((16-128))+rax]

	vpaddd	xmm14,xmm14,xmm7
	vpsrld	xmm9,xmm10,27
	vpxor	xmm6,xmm13,xmm12
	vpxor	xmm1,xmm1,xmm3

	vmovdqu	XMMWORD[(128-128)+rax],xmm0
	vpaddd	xmm14,xmm14,xmm0
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm1,31
	vpand	xmm6,xmm6,xmm11
	vpaddd	xmm1,xmm1,xmm1

	vpslld	xmm7,xmm11,30
	vpaddd	xmm14,xmm14,xmm6

	vpsrld	xmm11,xmm11,2
	vpaddd	xmm14,xmm14,xmm8
	vpor	xmm1,xmm1,xmm5
	vpor	xmm11,xmm11,xmm7
	vpxor	xmm2,xmm2,xmm4
	vmovdqa	xmm4,XMMWORD[((192-128))+rax]

	vpaddd	xmm13,xmm13,xmm15
	vpslld	xmm8,xmm14,5
	vpand	xmm7,xmm12,xmm11
	vpxor	xmm2,xmm2,XMMWORD[((32-128))+rax]

	vpaddd	xmm13,xmm13,xmm7
	vpsrld	xmm9,xmm14,27
	vpxor	xmm6,xmm12,xmm11
	vpxor	xmm2,xmm2,xmm4

	vmovdqu	XMMWORD[(144-128)+rax],xmm1
	vpaddd	xmm13,xmm13,xmm1
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm2,31
	vpand	xmm6,xmm6,xmm10
	vpaddd	xmm2,xmm2,xmm2

	vpslld	xmm7,xmm10,30
	vpaddd	xmm13,xmm13,xmm6

	vpsrld	xmm10,xmm10,2
	vpaddd	xmm13,xmm13,xmm8
	vpor	xmm2,xmm2,xmm5
	vpor	xmm10,xmm10,xmm7
	vpxor	xmm3,xmm3,xmm0
	vmovdqa	xmm0,XMMWORD[((208-128))+rax]

	vpaddd	xmm12,xmm12,xmm15
	vpslld	xmm8,xmm13,5
	vpand	xmm7,xmm11,xmm10
	vpxor	xmm3,xmm3,XMMWORD[((48-128))+rax]

	vpaddd	xmm12,xmm12,xmm7
	vpsrld	xmm9,xmm13,27
	vpxor	xmm6,xmm11,xmm10
	vpxor	xmm3,xmm3,xmm0

	vmovdqu	XMMWORD[(160-128)+rax],xmm2
	vpaddd	xmm12,xmm12,xmm2
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm3,31
	vpand	xmm6,xmm6,xmm14
	vpaddd	xmm3,xmm3,xmm3

	vpslld	xmm7,xmm14,30
	vpaddd	xmm12,xmm12,xmm6

	vpsrld	xmm14,xmm14,2
	vpaddd	xmm12,xmm12,xmm8
	vpor	xmm3,xmm3,xmm5
	vpor	xmm14,xmm14,xmm7
	vpxor	xmm4,xmm4,xmm1
	vmovdqa	xmm1,XMMWORD[((224-128))+rax]

	vpaddd	xmm11,xmm11,xmm15
	vpslld	xmm8,xmm12,5
	vpand	xmm7,xmm10,xmm14
	vpxor	xmm4,xmm4,XMMWORD[((64-128))+rax]

	vpaddd	xmm11,xmm11,xmm7
	vpsrld	xmm9,xmm12,27
	vpxor	xmm6,xmm10,xmm14
	vpxor	xmm4,xmm4,xmm1

	vmovdqu	XMMWORD[(176-128)+rax],xmm3
	vpaddd	xmm11,xmm11,xmm3
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm4,31
	vpand	xmm6,xmm6,xmm13
	vpaddd	xmm4,xmm4,xmm4

	vpslld	xmm7,xmm13,30
	vpaddd	xmm11,xmm11,xmm6

	vpsrld	xmm13,xmm13,2
	vpaddd	xmm11,xmm11,xmm8
	vpor	xmm4,xmm4,xmm5
	vpor	xmm13,xmm13,xmm7
	vpxor	xmm0,xmm0,xmm2
	vmovdqa	xmm2,XMMWORD[((240-128))+rax]

	vpaddd	xmm10,xmm10,xmm15
	vpslld	xmm8,xmm11,5
	vpand	xmm7,xmm14,xmm13
	vpxor	xmm0,xmm0,XMMWORD[((80-128))+rax]

	vpaddd	xmm10,xmm10,xmm7
	vpsrld	xmm9,xmm11,27
	vpxor	xmm6,xmm14,xmm13
	vpxor	xmm0,xmm0,xmm2

	vmovdqu	XMMWORD[(192-128)+rax],xmm4
	vpaddd	xmm10,xmm10,xmm4
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm0,31
	vpand	xmm6,xmm6,xmm12
	vpaddd	xmm0,xmm0,xmm0

	vpslld	xmm7,xmm12,30
	vpaddd	xmm10,xmm10,xmm6

	vpsrld	xmm12,xmm12,2
	vpaddd	xmm10,xmm10,xmm8
	vpor	xmm0,xmm0,xmm5
	vpor	xmm12,xmm12,xmm7
	vpxor	xmm1,xmm1,xmm3
	vmovdqa	xmm3,XMMWORD[((0-128))+rax]

	vpaddd	xmm14,xmm14,xmm15
	vpslld	xmm8,xmm10,5
	vpand	xmm7,xmm13,xmm12
	vpxor	xmm1,xmm1,XMMWORD[((96-128))+rax]

	vpaddd	xmm14,xmm14,xmm7
	vpsrld	xmm9,xmm10,27
	vpxor	xmm6,xmm13,xmm12
	vpxor	xmm1,xmm1,xmm3

	vmovdqu	XMMWORD[(208-128)+rax],xmm0
	vpaddd	xmm14,xmm14,xmm0
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm1,31
	vpand	xmm6,xmm6,xmm11
	vpaddd	xmm1,xmm1,xmm1

	vpslld	xmm7,xmm11,30
	vpaddd	xmm14,xmm14,xmm6

	vpsrld	xmm11,xmm11,2
	vpaddd	xmm14,xmm14,xmm8
	vpor	xmm1,xmm1,xmm5
	vpor	xmm11,xmm11,xmm7
	vpxor	xmm2,xmm2,xmm4
	vmovdqa	xmm4,XMMWORD[((16-128))+rax]

	vpaddd	xmm13,xmm13,xmm15
	vpslld	xmm8,xmm14,5
	vpand	xmm7,xmm12,xmm11
	vpxor	xmm2,xmm2,XMMWORD[((112-128))+rax]

	vpaddd	xmm13,xmm13,xmm7
	vpsrld	xmm9,xmm14,27
	vpxor	xmm6,xmm12,xmm11
	vpxor	xmm2,xmm2,xmm4

	vmovdqu	XMMWORD[(224-128)+rax],xmm1
	vpaddd	xmm13,xmm13,xmm1
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm2,31
	vpand	xmm6,xmm6,xmm10
	vpaddd	xmm2,xmm2,xmm2

	vpslld	xmm7,xmm10,30
	vpaddd	xmm13,xmm13,xmm6

	vpsrld	xmm10,xmm10,2
	vpaddd	xmm13,xmm13,xmm8
	vpor	xmm2,xmm2,xmm5
	vpor	xmm10,xmm10,xmm7
	vpxor	xmm3,xmm3,xmm0
	vmovdqa	xmm0,XMMWORD[((32-128))+rax]

	vpaddd	xmm12,xmm12,xmm15
	vpslld	xmm8,xmm13,5
	vpand	xmm7,xmm11,xmm10
	vpxor	xmm3,xmm3,XMMWORD[((128-128))+rax]

	vpaddd	xmm12,xmm12,xmm7
	vpsrld	xmm9,xmm13,27
	vpxor	xmm6,xmm11,xmm10
	vpxor	xmm3,xmm3,xmm0

	vmovdqu	XMMWORD[(240-128)+rax],xmm2
	vpaddd	xmm12,xmm12,xmm2
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm3,31
	vpand	xmm6,xmm6,xmm14
	vpaddd	xmm3,xmm3,xmm3

	vpslld	xmm7,xmm14,30
	vpaddd	xmm12,xmm12,xmm6

	vpsrld	xmm14,xmm14,2
	vpaddd	xmm12,xmm12,xmm8
	vpor	xmm3,xmm3,xmm5
	vpor	xmm14,xmm14,xmm7
	vpxor	xmm4,xmm4,xmm1
	vmovdqa	xmm1,XMMWORD[((48-128))+rax]

	vpaddd	xmm11,xmm11,xmm15
	vpslld	xmm8,xmm12,5
	vpand	xmm7,xmm10,xmm14
	vpxor	xmm4,xmm4,XMMWORD[((144-128))+rax]

	vpaddd	xmm11,xmm11,xmm7
	vpsrld	xmm9,xmm12,27
	vpxor	xmm6,xmm10,xmm14
	vpxor	xmm4,xmm4,xmm1

	vmovdqu	XMMWORD[(0-128)+rax],xmm3
	vpaddd	xmm11,xmm11,xmm3
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm4,31
	vpand	xmm6,xmm6,xmm13
	vpaddd	xmm4,xmm4,xmm4

	vpslld	xmm7,xmm13,30
	vpaddd	xmm11,xmm11,xmm6

	vpsrld	xmm13,xmm13,2
	vpaddd	xmm11,xmm11,xmm8
	vpor	xmm4,xmm4,xmm5
	vpor	xmm13,xmm13,xmm7
	vpxor	xmm0,xmm0,xmm2
	vmovdqa	xmm2,XMMWORD[((64-128))+rax]

	vpaddd	xmm10,xmm10,xmm15
	vpslld	xmm8,xmm11,5
	vpand	xmm7,xmm14,xmm13
	vpxor	xmm0,xmm0,XMMWORD[((160-128))+rax]

	vpaddd	xmm10,xmm10,xmm7
	vpsrld	xmm9,xmm11,27
	vpxor	xmm6,xmm14,xmm13
	vpxor	xmm0,xmm0,xmm2

	vmovdqu	XMMWORD[(16-128)+rax],xmm4
	vpaddd	xmm10,xmm10,xmm4
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm0,31
	vpand	xmm6,xmm6,xmm12
	vpaddd	xmm0,xmm0,xmm0

	vpslld	xmm7,xmm12,30
	vpaddd	xmm10,xmm10,xmm6

	vpsrld	xmm12,xmm12,2
	vpaddd	xmm10,xmm10,xmm8
	vpor	xmm0,xmm0,xmm5
	vpor	xmm12,xmm12,xmm7
	vpxor	xmm1,xmm1,xmm3
	vmovdqa	xmm3,XMMWORD[((80-128))+rax]

	vpaddd	xmm14,xmm14,xmm15
	vpslld	xmm8,xmm10,5
	vpand	xmm7,xmm13,xmm12
	vpxor	xmm1,xmm1,XMMWORD[((176-128))+rax]

	vpaddd	xmm14,xmm14,xmm7
	vpsrld	xmm9,xmm10,27
	vpxor	xmm6,xmm13,xmm12
	vpxor	xmm1,xmm1,xmm3

	vmovdqu	XMMWORD[(32-128)+rax],xmm0
	vpaddd	xmm14,xmm14,xmm0
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm1,31
	vpand	xmm6,xmm6,xmm11
	vpaddd	xmm1,xmm1,xmm1

	vpslld	xmm7,xmm11,30
	vpaddd	xmm14,xmm14,xmm6

	vpsrld	xmm11,xmm11,2
	vpaddd	xmm14,xmm14,xmm8
	vpor	xmm1,xmm1,xmm5
	vpor	xmm11,xmm11,xmm7
	vpxor	xmm2,xmm2,xmm4
	vmovdqa	xmm4,XMMWORD[((96-128))+rax]

	vpaddd	xmm13,xmm13,xmm15
	vpslld	xmm8,xmm14,5
	vpand	xmm7,xmm12,xmm11
	vpxor	xmm2,xmm2,XMMWORD[((192-128))+rax]

	vpaddd	xmm13,xmm13,xmm7
	vpsrld	xmm9,xmm14,27
	vpxor	xmm6,xmm12,xmm11
	vpxor	xmm2,xmm2,xmm4

	vmovdqu	XMMWORD[(48-128)+rax],xmm1
	vpaddd	xmm13,xmm13,xmm1
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm2,31
	vpand	xmm6,xmm6,xmm10
	vpaddd	xmm2,xmm2,xmm2

	vpslld	xmm7,xmm10,30
	vpaddd	xmm13,xmm13,xmm6

	vpsrld	xmm10,xmm10,2
	vpaddd	xmm13,xmm13,xmm8
	vpor	xmm2,xmm2,xmm5
	vpor	xmm10,xmm10,xmm7
	vpxor	xmm3,xmm3,xmm0
	vmovdqa	xmm0,XMMWORD[((112-128))+rax]

	vpaddd	xmm12,xmm12,xmm15
	vpslld	xmm8,xmm13,5
	vpand	xmm7,xmm11,xmm10
	vpxor	xmm3,xmm3,XMMWORD[((208-128))+rax]

	vpaddd	xmm12,xmm12,xmm7
	vpsrld	xmm9,xmm13,27
	vpxor	xmm6,xmm11,xmm10
	vpxor	xmm3,xmm3,xmm0

	vmovdqu	XMMWORD[(64-128)+rax],xmm2
	vpaddd	xmm12,xmm12,xmm2
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm3,31
	vpand	xmm6,xmm6,xmm14
	vpaddd	xmm3,xmm3,xmm3

	vpslld	xmm7,xmm14,30
	vpaddd	xmm12,xmm12,xmm6

	vpsrld	xmm14,xmm14,2
	vpaddd	xmm12,xmm12,xmm8
	vpor	xmm3,xmm3,xmm5
	vpor	xmm14,xmm14,xmm7
	vpxor	xmm4,xmm4,xmm1
	vmovdqa	xmm1,XMMWORD[((128-128))+rax]

	vpaddd	xmm11,xmm11,xmm15
	vpslld	xmm8,xmm12,5
	vpand	xmm7,xmm10,xmm14
	vpxor	xmm4,xmm4,XMMWORD[((224-128))+rax]

	vpaddd	xmm11,xmm11,xmm7
	vpsrld	xmm9,xmm12,27
	vpxor	xmm6,xmm10,xmm14
	vpxor	xmm4,xmm4,xmm1

	vmovdqu	XMMWORD[(80-128)+rax],xmm3
	vpaddd	xmm11,xmm11,xmm3
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm4,31
	vpand	xmm6,xmm6,xmm13
	vpaddd	xmm4,xmm4,xmm4

	vpslld	xmm7,xmm13,30
	vpaddd	xmm11,xmm11,xmm6

	vpsrld	xmm13,xmm13,2
	vpaddd	xmm11,xmm11,xmm8
	vpor	xmm4,xmm4,xmm5
	vpor	xmm13,xmm13,xmm7
	vpxor	xmm0,xmm0,xmm2
	vmovdqa	xmm2,XMMWORD[((144-128))+rax]

	vpaddd	xmm10,xmm10,xmm15
	vpslld	xmm8,xmm11,5
	vpand	xmm7,xmm14,xmm13
	vpxor	xmm0,xmm0,XMMWORD[((240-128))+rax]

	vpaddd	xmm10,xmm10,xmm7
	vpsrld	xmm9,xmm11,27
	vpxor	xmm6,xmm14,xmm13
	vpxor	xmm0,xmm0,xmm2

	vmovdqu	XMMWORD[(96-128)+rax],xmm4
	vpaddd	xmm10,xmm10,xmm4
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm0,31
	vpand	xmm6,xmm6,xmm12
	vpaddd	xmm0,xmm0,xmm0

	vpslld	xmm7,xmm12,30
	vpaddd	xmm10,xmm10,xmm6

	vpsrld	xmm12,xmm12,2
	vpaddd	xmm10,xmm10,xmm8
	vpor	xmm0,xmm0,xmm5
	vpor	xmm12,xmm12,xmm7
	vpxor	xmm1,xmm1,xmm3
	vmovdqa	xmm3,XMMWORD[((160-128))+rax]

	vpaddd	xmm14,xmm14,xmm15
	vpslld	xmm8,xmm10,5
	vpand	xmm7,xmm13,xmm12
	vpxor	xmm1,xmm1,XMMWORD[((0-128))+rax]

	vpaddd	xmm14,xmm14,xmm7
	vpsrld	xmm9,xmm10,27
	vpxor	xmm6,xmm13,xmm12
	vpxor	xmm1,xmm1,xmm3

	vmovdqu	XMMWORD[(112-128)+rax],xmm0
	vpaddd	xmm14,xmm14,xmm0
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm1,31
	vpand	xmm6,xmm6,xmm11
	vpaddd	xmm1,xmm1,xmm1

	vpslld	xmm7,xmm11,30
	vpaddd	xmm14,xmm14,xmm6

	vpsrld	xmm11,xmm11,2
	vpaddd	xmm14,xmm14,xmm8
	vpor	xmm1,xmm1,xmm5
	vpor	xmm11,xmm11,xmm7
	vpxor	xmm2,xmm2,xmm4
	vmovdqa	xmm4,XMMWORD[((176-128))+rax]

	vpaddd	xmm13,xmm13,xmm15
	vpslld	xmm8,xmm14,5
	vpand	xmm7,xmm12,xmm11
	vpxor	xmm2,xmm2,XMMWORD[((16-128))+rax]

	vpaddd	xmm13,xmm13,xmm7
	vpsrld	xmm9,xmm14,27
	vpxor	xmm6,xmm12,xmm11
	vpxor	xmm2,xmm2,xmm4

	vmovdqu	XMMWORD[(128-128)+rax],xmm1
	vpaddd	xmm13,xmm13,xmm1
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm2,31
	vpand	xmm6,xmm6,xmm10
	vpaddd	xmm2,xmm2,xmm2

	vpslld	xmm7,xmm10,30
	vpaddd	xmm13,xmm13,xmm6

	vpsrld	xmm10,xmm10,2
	vpaddd	xmm13,xmm13,xmm8
	vpor	xmm2,xmm2,xmm5
	vpor	xmm10,xmm10,xmm7
	vpxor	xmm3,xmm3,xmm0
	vmovdqa	xmm0,XMMWORD[((192-128))+rax]

	vpaddd	xmm12,xmm12,xmm15
	vpslld	xmm8,xmm13,5
	vpand	xmm7,xmm11,xmm10
	vpxor	xmm3,xmm3,XMMWORD[((32-128))+rax]

	vpaddd	xmm12,xmm12,xmm7
	vpsrld	xmm9,xmm13,27
	vpxor	xmm6,xmm11,xmm10
	vpxor	xmm3,xmm3,xmm0

	vmovdqu	XMMWORD[(144-128)+rax],xmm2
	vpaddd	xmm12,xmm12,xmm2
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm3,31
	vpand	xmm6,xmm6,xmm14
	vpaddd	xmm3,xmm3,xmm3

	vpslld	xmm7,xmm14,30
	vpaddd	xmm12,xmm12,xmm6

	vpsrld	xmm14,xmm14,2
	vpaddd	xmm12,xmm12,xmm8
	vpor	xmm3,xmm3,xmm5
	vpor	xmm14,xmm14,xmm7
	vpxor	xmm4,xmm4,xmm1
	vmovdqa	xmm1,XMMWORD[((208-128))+rax]

	vpaddd	xmm11,xmm11,xmm15
	vpslld	xmm8,xmm12,5
	vpand	xmm7,xmm10,xmm14
	vpxor	xmm4,xmm4,XMMWORD[((48-128))+rax]

	vpaddd	xmm11,xmm11,xmm7
	vpsrld	xmm9,xmm12,27
	vpxor	xmm6,xmm10,xmm14
	vpxor	xmm4,xmm4,xmm1

	vmovdqu	XMMWORD[(160-128)+rax],xmm3
	vpaddd	xmm11,xmm11,xmm3
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm4,31
	vpand	xmm6,xmm6,xmm13
	vpaddd	xmm4,xmm4,xmm4

	vpslld	xmm7,xmm13,30
	vpaddd	xmm11,xmm11,xmm6

	vpsrld	xmm13,xmm13,2
	vpaddd	xmm11,xmm11,xmm8
	vpor	xmm4,xmm4,xmm5
	vpor	xmm13,xmm13,xmm7
	vpxor	xmm0,xmm0,xmm2
	vmovdqa	xmm2,XMMWORD[((224-128))+rax]

	vpaddd	xmm10,xmm10,xmm15
	vpslld	xmm8,xmm11,5
	vpand	xmm7,xmm14,xmm13
	vpxor	xmm0,xmm0,XMMWORD[((64-128))+rax]

	vpaddd	xmm10,xmm10,xmm7
	vpsrld	xmm9,xmm11,27
	vpxor	xmm6,xmm14,xmm13
	vpxor	xmm0,xmm0,xmm2

	vmovdqu	XMMWORD[(176-128)+rax],xmm4
	vpaddd	xmm10,xmm10,xmm4
	vpor	xmm8,xmm8,xmm9
	vpsrld	xmm5,xmm0,31
	vpand	xmm6,xmm6,xmm12
	vpaddd	xmm0,xmm0,xmm0

	vpslld	xmm7,xmm12,30
	vpaddd	xmm10,xmm10,xmm6

	vpsrld	xmm12,xmm12,2
	vpaddd	xmm10,xmm10,xmm8
	vpor	xmm0,xmm0,xmm5
	vpor	xmm12,xmm12,xmm7
	vmovdqa	xmm15,XMMWORD[64+rbp]
	vpxor	xmm1,xmm1,xmm3
	vmovdqa	xmm3,XMMWORD[((240-128))+rax]

	vpslld	xmm8,xmm10,5
	vpaddd	xmm14,xmm14,xmm15
	vpxor	xmm6,xmm13,xmm11
	vmovdqa	XMMWORD[(192-128)+rax],xmm0
	vpaddd	xmm14,xmm14,xmm0
	vpxor	xmm1,xmm1,XMMWORD[((80-128))+rax]
	vpsrld	xmm9,xmm10,27
	vpxor	xmm6,xmm6,xmm12
	vpxor	xmm1,xmm1,xmm3

	vpslld	xmm7,xmm11,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm14,xmm14,xmm6
	vpsrld	xmm5,xmm1,31
	vpaddd	xmm1,xmm1,xmm1

	vpsrld	xmm11,xmm11,2
	vpaddd	xmm14,xmm14,xmm8
	vpor	xmm1,xmm1,xmm5
	vpor	xmm11,xmm11,xmm7
	vpxor	xmm2,xmm2,xmm4
	vmovdqa	xmm4,XMMWORD[((0-128))+rax]

	vpslld	xmm8,xmm14,5
	vpaddd	xmm13,xmm13,xmm15
	vpxor	xmm6,xmm12,xmm10
	vmovdqa	XMMWORD[(208-128)+rax],xmm1
	vpaddd	xmm13,xmm13,xmm1
	vpxor	xmm2,xmm2,XMMWORD[((96-128))+rax]
	vpsrld	xmm9,xmm14,27
	vpxor	xmm6,xmm6,xmm11
	vpxor	xmm2,xmm2,xmm4

	vpslld	xmm7,xmm10,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm13,xmm13,xmm6
	vpsrld	xmm5,xmm2,31
	vpaddd	xmm2,xmm2,xmm2

	vpsrld	xmm10,xmm10,2
	vpaddd	xmm13,xmm13,xmm8
	vpor	xmm2,xmm2,xmm5
	vpor	xmm10,xmm10,xmm7
	vpxor	xmm3,xmm3,xmm0
	vmovdqa	xmm0,XMMWORD[((16-128))+rax]

	vpslld	xmm8,xmm13,5
	vpaddd	xmm12,xmm12,xmm15
	vpxor	xmm6,xmm11,xmm14
	vmovdqa	XMMWORD[(224-128)+rax],xmm2
	vpaddd	xmm12,xmm12,xmm2
	vpxor	xmm3,xmm3,XMMWORD[((112-128))+rax]
	vpsrld	xmm9,xmm13,27
	vpxor	xmm6,xmm6,xmm10
	vpxor	xmm3,xmm3,xmm0

	vpslld	xmm7,xmm14,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm12,xmm12,xmm6
	vpsrld	xmm5,xmm3,31
	vpaddd	xmm3,xmm3,xmm3

	vpsrld	xmm14,xmm14,2
	vpaddd	xmm12,xmm12,xmm8
	vpor	xmm3,xmm3,xmm5
	vpor	xmm14,xmm14,xmm7
	vpxor	xmm4,xmm4,xmm1
	vmovdqa	xmm1,XMMWORD[((32-128))+rax]

	vpslld	xmm8,xmm12,5
	vpaddd	xmm11,xmm11,xmm15
	vpxor	xmm6,xmm10,xmm13
	vmovdqa	XMMWORD[(240-128)+rax],xmm3
	vpaddd	xmm11,xmm11,xmm3
	vpxor	xmm4,xmm4,XMMWORD[((128-128))+rax]
	vpsrld	xmm9,xmm12,27
	vpxor	xmm6,xmm6,xmm14
	vpxor	xmm4,xmm4,xmm1

	vpslld	xmm7,xmm13,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm11,xmm11,xmm6
	vpsrld	xmm5,xmm4,31
	vpaddd	xmm4,xmm4,xmm4

	vpsrld	xmm13,xmm13,2
	vpaddd	xmm11,xmm11,xmm8
	vpor	xmm4,xmm4,xmm5
	vpor	xmm13,xmm13,xmm7
	vpxor	xmm0,xmm0,xmm2
	vmovdqa	xmm2,XMMWORD[((48-128))+rax]

	vpslld	xmm8,xmm11,5
	vpaddd	xmm10,xmm10,xmm15
	vpxor	xmm6,xmm14,xmm12
	vmovdqa	XMMWORD[(0-128)+rax],xmm4
	vpaddd	xmm10,xmm10,xmm4
	vpxor	xmm0,xmm0,XMMWORD[((144-128))+rax]
	vpsrld	xmm9,xmm11,27
	vpxor	xmm6,xmm6,xmm13
	vpxor	xmm0,xmm0,xmm2

	vpslld	xmm7,xmm12,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm10,xmm10,xmm6
	vpsrld	xmm5,xmm0,31
	vpaddd	xmm0,xmm0,xmm0

	vpsrld	xmm12,xmm12,2
	vpaddd	xmm10,xmm10,xmm8
	vpor	xmm0,xmm0,xmm5
	vpor	xmm12,xmm12,xmm7
	vpxor	xmm1,xmm1,xmm3
	vmovdqa	xmm3,XMMWORD[((64-128))+rax]

	vpslld	xmm8,xmm10,5
	vpaddd	xmm14,xmm14,xmm15
	vpxor	xmm6,xmm13,xmm11
	vmovdqa	XMMWORD[(16-128)+rax],xmm0
	vpaddd	xmm14,xmm14,xmm0
	vpxor	xmm1,xmm1,XMMWORD[((160-128))+rax]
	vpsrld	xmm9,xmm10,27
	vpxor	xmm6,xmm6,xmm12
	vpxor	xmm1,xmm1,xmm3

	vpslld	xmm7,xmm11,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm14,xmm14,xmm6
	vpsrld	xmm5,xmm1,31
	vpaddd	xmm1,xmm1,xmm1

	vpsrld	xmm11,xmm11,2
	vpaddd	xmm14,xmm14,xmm8
	vpor	xmm1,xmm1,xmm5
	vpor	xmm11,xmm11,xmm7
	vpxor	xmm2,xmm2,xmm4
	vmovdqa	xmm4,XMMWORD[((80-128))+rax]

	vpslld	xmm8,xmm14,5
	vpaddd	xmm13,xmm13,xmm15
	vpxor	xmm6,xmm12,xmm10
	vmovdqa	XMMWORD[(32-128)+rax],xmm1
	vpaddd	xmm13,xmm13,xmm1
	vpxor	xmm2,xmm2,XMMWORD[((176-128))+rax]
	vpsrld	xmm9,xmm14,27
	vpxor	xmm6,xmm6,xmm11
	vpxor	xmm2,xmm2,xmm4

	vpslld	xmm7,xmm10,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm13,xmm13,xmm6
	vpsrld	xmm5,xmm2,31
	vpaddd	xmm2,xmm2,xmm2

	vpsrld	xmm10,xmm10,2
	vpaddd	xmm13,xmm13,xmm8
	vpor	xmm2,xmm2,xmm5
	vpor	xmm10,xmm10,xmm7
	vpxor	xmm3,xmm3,xmm0
	vmovdqa	xmm0,XMMWORD[((96-128))+rax]

	vpslld	xmm8,xmm13,5
	vpaddd	xmm12,xmm12,xmm15
	vpxor	xmm6,xmm11,xmm14
	vmovdqa	XMMWORD[(48-128)+rax],xmm2
	vpaddd	xmm12,xmm12,xmm2
	vpxor	xmm3,xmm3,XMMWORD[((192-128))+rax]
	vpsrld	xmm9,xmm13,27
	vpxor	xmm6,xmm6,xmm10
	vpxor	xmm3,xmm3,xmm0

	vpslld	xmm7,xmm14,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm12,xmm12,xmm6
	vpsrld	xmm5,xmm3,31
	vpaddd	xmm3,xmm3,xmm3

	vpsrld	xmm14,xmm14,2
	vpaddd	xmm12,xmm12,xmm8
	vpor	xmm3,xmm3,xmm5
	vpor	xmm14,xmm14,xmm7
	vpxor	xmm4,xmm4,xmm1
	vmovdqa	xmm1,XMMWORD[((112-128))+rax]

	vpslld	xmm8,xmm12,5
	vpaddd	xmm11,xmm11,xmm15
	vpxor	xmm6,xmm10,xmm13
	vmovdqa	XMMWORD[(64-128)+rax],xmm3
	vpaddd	xmm11,xmm11,xmm3
	vpxor	xmm4,xmm4,XMMWORD[((208-128))+rax]
	vpsrld	xmm9,xmm12,27
	vpxor	xmm6,xmm6,xmm14
	vpxor	xmm4,xmm4,xmm1

	vpslld	xmm7,xmm13,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm11,xmm11,xmm6
	vpsrld	xmm5,xmm4,31
	vpaddd	xmm4,xmm4,xmm4

	vpsrld	xmm13,xmm13,2
	vpaddd	xmm11,xmm11,xmm8
	vpor	xmm4,xmm4,xmm5
	vpor	xmm13,xmm13,xmm7
	vpxor	xmm0,xmm0,xmm2
	vmovdqa	xmm2,XMMWORD[((128-128))+rax]

	vpslld	xmm8,xmm11,5
	vpaddd	xmm10,xmm10,xmm15
	vpxor	xmm6,xmm14,xmm12
	vmovdqa	XMMWORD[(80-128)+rax],xmm4
	vpaddd	xmm10,xmm10,xmm4
	vpxor	xmm0,xmm0,XMMWORD[((224-128))+rax]
	vpsrld	xmm9,xmm11,27
	vpxor	xmm6,xmm6,xmm13
	vpxor	xmm0,xmm0,xmm2

	vpslld	xmm7,xmm12,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm10,xmm10,xmm6
	vpsrld	xmm5,xmm0,31
	vpaddd	xmm0,xmm0,xmm0

	vpsrld	xmm12,xmm12,2
	vpaddd	xmm10,xmm10,xmm8
	vpor	xmm0,xmm0,xmm5
	vpor	xmm12,xmm12,xmm7
	vpxor	xmm1,xmm1,xmm3
	vmovdqa	xmm3,XMMWORD[((144-128))+rax]

	vpslld	xmm8,xmm10,5
	vpaddd	xmm14,xmm14,xmm15
	vpxor	xmm6,xmm13,xmm11
	vmovdqa	XMMWORD[(96-128)+rax],xmm0
	vpaddd	xmm14,xmm14,xmm0
	vpxor	xmm1,xmm1,XMMWORD[((240-128))+rax]
	vpsrld	xmm9,xmm10,27
	vpxor	xmm6,xmm6,xmm12
	vpxor	xmm1,xmm1,xmm3

	vpslld	xmm7,xmm11,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm14,xmm14,xmm6
	vpsrld	xmm5,xmm1,31
	vpaddd	xmm1,xmm1,xmm1

	vpsrld	xmm11,xmm11,2
	vpaddd	xmm14,xmm14,xmm8
	vpor	xmm1,xmm1,xmm5
	vpor	xmm11,xmm11,xmm7
	vpxor	xmm2,xmm2,xmm4
	vmovdqa	xmm4,XMMWORD[((160-128))+rax]

	vpslld	xmm8,xmm14,5
	vpaddd	xmm13,xmm13,xmm15
	vpxor	xmm6,xmm12,xmm10
	vmovdqa	XMMWORD[(112-128)+rax],xmm1
	vpaddd	xmm13,xmm13,xmm1
	vpxor	xmm2,xmm2,XMMWORD[((0-128))+rax]
	vpsrld	xmm9,xmm14,27
	vpxor	xmm6,xmm6,xmm11
	vpxor	xmm2,xmm2,xmm4

	vpslld	xmm7,xmm10,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm13,xmm13,xmm6
	vpsrld	xmm5,xmm2,31
	vpaddd	xmm2,xmm2,xmm2

	vpsrld	xmm10,xmm10,2
	vpaddd	xmm13,xmm13,xmm8
	vpor	xmm2,xmm2,xmm5
	vpor	xmm10,xmm10,xmm7
	vpxor	xmm3,xmm3,xmm0
	vmovdqa	xmm0,XMMWORD[((176-128))+rax]

	vpslld	xmm8,xmm13,5
	vpaddd	xmm12,xmm12,xmm15
	vpxor	xmm6,xmm11,xmm14
	vpaddd	xmm12,xmm12,xmm2
	vpxor	xmm3,xmm3,XMMWORD[((16-128))+rax]
	vpsrld	xmm9,xmm13,27
	vpxor	xmm6,xmm6,xmm10
	vpxor	xmm3,xmm3,xmm0

	vpslld	xmm7,xmm14,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm12,xmm12,xmm6
	vpsrld	xmm5,xmm3,31
	vpaddd	xmm3,xmm3,xmm3

	vpsrld	xmm14,xmm14,2
	vpaddd	xmm12,xmm12,xmm8
	vpor	xmm3,xmm3,xmm5
	vpor	xmm14,xmm14,xmm7
	vpxor	xmm4,xmm4,xmm1
	vmovdqa	xmm1,XMMWORD[((192-128))+rax]

	vpslld	xmm8,xmm12,5
	vpaddd	xmm11,xmm11,xmm15
	vpxor	xmm6,xmm10,xmm13
	vpaddd	xmm11,xmm11,xmm3
	vpxor	xmm4,xmm4,XMMWORD[((32-128))+rax]
	vpsrld	xmm9,xmm12,27
	vpxor	xmm6,xmm6,xmm14
	vpxor	xmm4,xmm4,xmm1

	vpslld	xmm7,xmm13,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm11,xmm11,xmm6
	vpsrld	xmm5,xmm4,31
	vpaddd	xmm4,xmm4,xmm4

	vpsrld	xmm13,xmm13,2
	vpaddd	xmm11,xmm11,xmm8
	vpor	xmm4,xmm4,xmm5
	vpor	xmm13,xmm13,xmm7
	vpxor	xmm0,xmm0,xmm2
	vmovdqa	xmm2,XMMWORD[((208-128))+rax]

	vpslld	xmm8,xmm11,5
	vpaddd	xmm10,xmm10,xmm15
	vpxor	xmm6,xmm14,xmm12
	vpaddd	xmm10,xmm10,xmm4
	vpxor	xmm0,xmm0,XMMWORD[((48-128))+rax]
	vpsrld	xmm9,xmm11,27
	vpxor	xmm6,xmm6,xmm13
	vpxor	xmm0,xmm0,xmm2

	vpslld	xmm7,xmm12,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm10,xmm10,xmm6
	vpsrld	xmm5,xmm0,31
	vpaddd	xmm0,xmm0,xmm0

	vpsrld	xmm12,xmm12,2
	vpaddd	xmm10,xmm10,xmm8
	vpor	xmm0,xmm0,xmm5
	vpor	xmm12,xmm12,xmm7
	vpxor	xmm1,xmm1,xmm3
	vmovdqa	xmm3,XMMWORD[((224-128))+rax]

	vpslld	xmm8,xmm10,5
	vpaddd	xmm14,xmm14,xmm15
	vpxor	xmm6,xmm13,xmm11
	vpaddd	xmm14,xmm14,xmm0
	vpxor	xmm1,xmm1,XMMWORD[((64-128))+rax]
	vpsrld	xmm9,xmm10,27
	vpxor	xmm6,xmm6,xmm12
	vpxor	xmm1,xmm1,xmm3

	vpslld	xmm7,xmm11,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm14,xmm14,xmm6
	vpsrld	xmm5,xmm1,31
	vpaddd	xmm1,xmm1,xmm1

	vpsrld	xmm11,xmm11,2
	vpaddd	xmm14,xmm14,xmm8
	vpor	xmm1,xmm1,xmm5
	vpor	xmm11,xmm11,xmm7
	vpxor	xmm2,xmm2,xmm4
	vmovdqa	xmm4,XMMWORD[((240-128))+rax]

	vpslld	xmm8,xmm14,5
	vpaddd	xmm13,xmm13,xmm15
	vpxor	xmm6,xmm12,xmm10
	vpaddd	xmm13,xmm13,xmm1
	vpxor	xmm2,xmm2,XMMWORD[((80-128))+rax]
	vpsrld	xmm9,xmm14,27
	vpxor	xmm6,xmm6,xmm11
	vpxor	xmm2,xmm2,xmm4

	vpslld	xmm7,xmm10,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm13,xmm13,xmm6
	vpsrld	xmm5,xmm2,31
	vpaddd	xmm2,xmm2,xmm2

	vpsrld	xmm10,xmm10,2
	vpaddd	xmm13,xmm13,xmm8
	vpor	xmm2,xmm2,xmm5
	vpor	xmm10,xmm10,xmm7
	vpxor	xmm3,xmm3,xmm0
	vmovdqa	xmm0,XMMWORD[((0-128))+rax]

	vpslld	xmm8,xmm13,5
	vpaddd	xmm12,xmm12,xmm15
	vpxor	xmm6,xmm11,xmm14
	vpaddd	xmm12,xmm12,xmm2
	vpxor	xmm3,xmm3,XMMWORD[((96-128))+rax]
	vpsrld	xmm9,xmm13,27
	vpxor	xmm6,xmm6,xmm10
	vpxor	xmm3,xmm3,xmm0

	vpslld	xmm7,xmm14,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm12,xmm12,xmm6
	vpsrld	xmm5,xmm3,31
	vpaddd	xmm3,xmm3,xmm3

	vpsrld	xmm14,xmm14,2
	vpaddd	xmm12,xmm12,xmm8
	vpor	xmm3,xmm3,xmm5
	vpor	xmm14,xmm14,xmm7
	vpxor	xmm4,xmm4,xmm1
	vmovdqa	xmm1,XMMWORD[((16-128))+rax]

	vpslld	xmm8,xmm12,5
	vpaddd	xmm11,xmm11,xmm15
	vpxor	xmm6,xmm10,xmm13
	vpaddd	xmm11,xmm11,xmm3
	vpxor	xmm4,xmm4,XMMWORD[((112-128))+rax]
	vpsrld	xmm9,xmm12,27
	vpxor	xmm6,xmm6,xmm14
	vpxor	xmm4,xmm4,xmm1

	vpslld	xmm7,xmm13,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm11,xmm11,xmm6
	vpsrld	xmm5,xmm4,31
	vpaddd	xmm4,xmm4,xmm4

	vpsrld	xmm13,xmm13,2
	vpaddd	xmm11,xmm11,xmm8
	vpor	xmm4,xmm4,xmm5
	vpor	xmm13,xmm13,xmm7
	vpslld	xmm8,xmm11,5
	vpaddd	xmm10,xmm10,xmm15
	vpxor	xmm6,xmm14,xmm12

	vpsrld	xmm9,xmm11,27
	vpaddd	xmm10,xmm10,xmm4
	vpxor	xmm6,xmm6,xmm13

	vpslld	xmm7,xmm12,30
	vpor	xmm8,xmm8,xmm9
	vpaddd	xmm10,xmm10,xmm6

	vpsrld	xmm12,xmm12,2
	vpaddd	xmm10,xmm10,xmm8
	vpor	xmm12,xmm12,xmm7
	mov	ecx,1
	cmp	ecx,DWORD[rbx]
	cmovge	r8,rbp
	cmp	ecx,DWORD[4+rbx]
	cmovge	r9,rbp
	cmp	ecx,DWORD[8+rbx]
	cmovge	r10,rbp
	cmp	ecx,DWORD[12+rbx]
	cmovge	r11,rbp
	vmovdqu	xmm6,XMMWORD[rbx]
	vpxor	xmm8,xmm8,xmm8
	vmovdqa	xmm7,xmm6
	vpcmpgtd	xmm7,xmm7,xmm8
	vpaddd	xmm6,xmm6,xmm7

	vpand	xmm10,xmm10,xmm7
	vpand	xmm11,xmm11,xmm7
	vpaddd	xmm10,xmm10,XMMWORD[rdi]
	vpand	xmm12,xmm12,xmm7
	vpaddd	xmm11,xmm11,XMMWORD[32+rdi]
	vpand	xmm13,xmm13,xmm7
	vpaddd	xmm12,xmm12,XMMWORD[64+rdi]
	vpand	xmm14,xmm14,xmm7
	vpaddd	xmm13,xmm13,XMMWORD[96+rdi]
	vpaddd	xmm14,xmm14,XMMWORD[128+rdi]
	vmovdqu	XMMWORD[rdi],xmm10
	vmovdqu	XMMWORD[32+rdi],xmm11
	vmovdqu	XMMWORD[64+rdi],xmm12
	vmovdqu	XMMWORD[96+rdi],xmm13
	vmovdqu	XMMWORD[128+rdi],xmm14

	vmovdqu	XMMWORD[rbx],xmm6
	vmovdqu	xmm5,XMMWORD[96+rbp]
	dec	edx
	jnz	NEAR $L$oop_avx

	mov	edx,DWORD[280+rsp]
	lea	rdi,[16+rdi]
	lea	rsi,[64+rsi]
	dec	edx
	jnz	NEAR $L$oop_grande_avx

; ---------------------------------------------------------------------
; Exit path of the AVX code: restore the non-volatile registers, drop
; the stack frame and return to the caller.
;
; rax is reloaded from [272+rsp] and every restore below is addressed
; relative to it:
;   rax-184 .. rax-40 : ten 16-byte slots holding xmm6..xmm15
;   rax-16            : rbp
;   rax-8             : rbx
; NOTE(review): the matching stores live in the AVX prologue, which is
; outside this excerpt -- presumably [272+rsp] holds the rsp value
; captured before the pushes; verify the offsets against that prologue.
; ---------------------------------------------------------------------
$L$done_avx:
	mov	rax,QWORD[272+rsp]

; Zero the upper halves of the ymm registers before the legacy-SSE
; movaps restores below and before control returns to the caller.
	vzeroupper
; xmm6-xmm15 are non-volatile in the Microsoft x64 ABI.
	movaps	xmm6,XMMWORD[((-184))+rax]
	movaps	xmm7,XMMWORD[((-168))+rax]
	movaps	xmm8,XMMWORD[((-152))+rax]
	movaps	xmm9,XMMWORD[((-136))+rax]
	movaps	xmm10,XMMWORD[((-120))+rax]
	movaps	xmm11,XMMWORD[((-104))+rax]
	movaps	xmm12,XMMWORD[((-88))+rax]
	movaps	xmm13,XMMWORD[((-72))+rax]
	movaps	xmm14,XMMWORD[((-56))+rax]
	movaps	xmm15,XMMWORD[((-40))+rax]
	mov	rbp,QWORD[((-16))+rax]

	mov	rbx,QWORD[((-8))+rax]

; rsp = rax: discards the aligned scratch area and the save area in
; one step (lea leaves the flags untouched).
	lea	rsp,[rax]

; With rsp restored, reload rdi and rsi (non-volatile in the Microsoft
; x64 ABI) from offsets 8 and 16 above it, then return.
$L$epilogue_avx:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
; Hand-encoded two-byte return: F3h (REP prefix) followed by C3h (RET).
	DB	0F3h,0C3h		;repret

; End-of-function marker label.
; NOTE(review): presumably referenced by the SEH unwind tables elsewhere
; in the file -- not visible in this excerpt.
$L$SEH_end_sha1_multi_block_avx:

ALIGN	32
; ---------------------------------------------------------------------
; sha1_multi_block_avx2 -- Win64 entry point and stack-frame setup for
; the AVX2 (ymm) code path.
;
; In (Microsoft x64 ABI): rcx, rdx, r8 = arguments 1..3.  They are
; copied into rdi, rsi, rdx, which is where the body expects them:
;   rdi : base of five 32-byte vectors, read at offsets 0/32/64/96/128
;   rsi : array of 16-byte records {qword pointer, dword count}
;   edx : outer iteration count (halved right after this prologue)
;
; Frame built here, relative to rax = rsp at _avx2_shortcut:
;   rax-8   .. rax-48  : rbx, rbp, r12, r13, r14, r15 (pushed in order)
;   rax-216 .. rax-136 : xmm6..xmm11  (stored rsp-relative, 0..80)
;   rax-120 .. rax-72  : xmm12..xmm15 (stored rax-relative)
;   below that         : scratch area, rsp rounded down to a multiple
;                        of 256; [544+rsp] holds the saved rax
; ---------------------------------------------------------------------
sha1_multi_block_avx2:
; Spill rdi/rsi (non-volatile on Win64) into the two slots directly
; above the return address before they are overwritten below.
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
; NOTE(review): presumably referenced by the SEH unwind tables elsewhere
; in the file -- not visible in this excerpt.
$L$SEH_begin_sha1_multi_block_avx2:
; Move the Win64 argument registers into rdi/rsi/rdx.
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8



; Secondary entry label.
; NOTE(review): presumably the target of a dispatch jump from a sibling
; entry point that has already performed the spill and register shuffle
; above -- the jump site is not in this excerpt.  That would explain why
; rax is captured again here although rsp has not changed since the
; first "mov rax,rsp" on the fall-through path.
_avx2_shortcut:
	mov	rax,rsp

; Save the non-volatile general-purpose registers this path uses.
	push	rbx

	push	rbp

	push	r12

	push	r13

	push	r14

	push	r15

; Reserve 168 bytes and save xmm6-xmm15 (non-volatile on Win64).  After
; six pushes rsp = rax-48, so the new rsp is rax-216; the rsp-relative
; and rax-relative stores therefore form one contiguous 160-byte block.
	lea	rsp,[((-168))+rsp]
	movaps	XMMWORD[rsp],xmm6
	movaps	XMMWORD[16+rsp],xmm7
	movaps	XMMWORD[32+rsp],xmm8
	movaps	XMMWORD[48+rsp],xmm9
	movaps	XMMWORD[64+rsp],xmm10
	movaps	XMMWORD[80+rsp],xmm11
	movaps	XMMWORD[(-120)+rax],xmm12
	movaps	XMMWORD[(-104)+rax],xmm13
	movaps	XMMWORD[(-88)+rax],xmm14
	movaps	XMMWORD[(-72)+rax],xmm15
; Carve out the scratch area and round rsp down to a 256-byte boundary.
; The rounding makes the final rsp unpredictable from here on, so the
; original stack pointer (rax) is kept in the frame at [544+rsp].
	sub	rsp,576
	and	rsp,-256
	mov	QWORD[544+rsp],rax

; ---------------------------------------------------------------------
; One-time body setup followed by the per-pass setup of the outer loop.
;
; Registers established here for $L$oop_avx2:
;   rbp                        : address of the K_XX_XX table
;   r12,r13,r14,r15,r8..r11    : input pointers for record slots 0..7
;   ymm0..ymm4                 : five vectors loaded from rdi
;   ymm9                       : 32 bytes from K_XX_XX+96 (used as the
;                                vpshufb control in the loop)
;   rax = rsp+128, rbx = rsp+384 : base pointers into the scratch area
;   edx                        : largest count among the eight slots
; Stack slots written here:
;   [512+rsp] .. [540+rsp]     : the eight dword counts
;   [552+rsp]                  : outer-loop counter
; ---------------------------------------------------------------------
$L$body_avx2:
	lea	rbp,[K_XX_XX]
; Halve the third argument.
; NOTE(review): presumably the caller passes the count in units of four
; records while each pass here consumes eight -- confirm with callers.
	shr	edx,1

; Zero the upper halves of all ymm registers before the AVX2 code runs.
	vzeroupper
$L$oop_grande_avx2:
; Park the outer counter on the stack so edx can be reused as the
; running maximum of the per-slot counts; rbx -> count array.
	mov	DWORD[552+rsp],edx
	xor	edx,edx
	lea	rbx,[512+rsp]
; Slot 0 (record at rsi+0): pointer -> r12, count -> ecx.
;   cmp/cmovg : edx = max(edx, count), signed compare.
;   test/cmovle : if count <= 0, point the lane at K_XX_XX instead of
;   the caller's pointer.  The intervening mov does not modify flags,
;   so cmovle still acts on the result of test.
;   NOTE(review): presumably this gives an idle lane a valid address to
;   read from -- confirm.
	mov	r12,QWORD[rsi]
	mov	ecx,DWORD[8+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[rbx],ecx
	cmovle	r12,rbp
; Slot 1 (record at rsi+16) -> r13, count stored at [4+rbx].
	mov	r13,QWORD[16+rsi]
	mov	ecx,DWORD[24+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[4+rbx],ecx
	cmovle	r13,rbp
; Slot 2 (record at rsi+32) -> r14, count stored at [8+rbx].
	mov	r14,QWORD[32+rsi]
	mov	ecx,DWORD[40+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[8+rbx],ecx
	cmovle	r14,rbp
; Slot 3 (record at rsi+48) -> r15, count stored at [12+rbx].
	mov	r15,QWORD[48+rsi]
	mov	ecx,DWORD[56+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[12+rbx],ecx
	cmovle	r15,rbp
; Slot 4 (record at rsi+64) -> r8, count stored at [16+rbx].
	mov	r8,QWORD[64+rsi]
	mov	ecx,DWORD[72+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[16+rbx],ecx
	cmovle	r8,rbp
; Slot 5 (record at rsi+80) -> r9, count stored at [20+rbx].
	mov	r9,QWORD[80+rsi]
	mov	ecx,DWORD[88+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[20+rbx],ecx
	cmovle	r9,rbp
; Slot 6 (record at rsi+96) -> r10, count stored at [24+rbx].
	mov	r10,QWORD[96+rsi]
	mov	ecx,DWORD[104+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[24+rbx],ecx
	cmovle	r10,rbp
; Slot 7 (record at rsi+112) -> r11, count stored at [28+rbx].
	mov	r11,QWORD[112+rsi]
	mov	ecx,DWORD[120+rsi]
	cmp	ecx,edx
	cmovg	edx,ecx
	test	ecx,ecx
	mov	DWORD[28+rbx],ecx
	cmovle	r11,rbp
; NOTE(review): edx (the maximum count) is not tested for zero in this
; block before the inner loop is entered -- confirm that callers always
; supply at least one positive count.
;
; Load the five 32-byte vectors from rdi into ymm0..ymm4, interleaved
; with setting up the scratch base pointers.  rax and rbx are offset so
; that the loop can address the scratch area as (N-128)+rax and
; (N-256-128)+rbx.
; NOTE(review): the +128 bias presumably keeps those displacements
; within the signed 8-bit range -- confirm.
; rbx no longer points at the count array after this.
	vmovdqu	ymm0,YMMWORD[rdi]
	lea	rax,[128+rsp]
	vmovdqu	ymm1,YMMWORD[32+rdi]
	lea	rbx,[((256+128))+rsp]
	vmovdqu	ymm2,YMMWORD[64+rdi]
	vmovdqu	ymm3,YMMWORD[96+rdi]
	vmovdqu	ymm4,YMMWORD[128+rdi]
	vmovdqu	ymm9,YMMWORD[96+rbp]
; Jump over the alignment padding that precedes $L$oop_avx2.
	jmp	NEAR $L$oop_avx2

ALIGN	32
$L$oop_avx2:
	vmovdqa	ymm15,YMMWORD[((-32))+rbp]
	vmovd	xmm10,DWORD[r12]
	lea	r12,[64+r12]
	vmovd	xmm12,DWORD[r8]
	lea	r8,[64+r8]
	vmovd	xmm7,DWORD[r13]
	lea	r13,[64+r13]
	vmovd	xmm6,DWORD[r9]
	lea	r9,[64+r9]
	vpinsrd	xmm10,xmm10,DWORD[r14],1
	lea	r14,[64+r14]
	vpinsrd	xmm12,xmm12,DWORD[r10],1
	lea	r10,[64+r10]
	vpinsrd	xmm7,xmm7,DWORD[r15],1
	lea	r15,[64+r15]
	vpunpckldq	ymm10,ymm10,ymm7
	vpinsrd	xmm6,xmm6,DWORD[r11],1
	lea	r11,[64+r11]
	vpunpckldq	ymm12,ymm12,ymm6
	vmovd	xmm11,DWORD[((-60))+r12]
	vinserti128	ymm10,ymm10,xmm12,1
	vmovd	xmm8,DWORD[((-60))+r8]
	vpshufb	ymm10,ymm10,ymm9
	vmovd	xmm7,DWORD[((-60))+r13]
	vmovd	xmm6,DWORD[((-60))+r9]
	vpinsrd	xmm11,xmm11,DWORD[((-60))+r14],1
	vpinsrd	xmm8,xmm8,DWORD[((-60))+r10],1
	vpinsrd	xmm7,xmm7,DWORD[((-60))+r15],1
	vpunpckldq	ymm11,ymm11,ymm7
	vpinsrd	xmm6,xmm6,DWORD[((-60))+r11],1
	vpunpckldq	ymm8,ymm8,ymm6
	vpaddd	ymm4,ymm4,ymm15
	vpslld	ymm7,ymm0,5
	vpandn	ymm6,ymm1,ymm3
	vpand	ymm5,ymm1,ymm2

	vmovdqa	YMMWORD[(0-128)+rax],ymm10
	vpaddd	ymm4,ymm4,ymm10
	vinserti128	ymm11,ymm11,xmm8,1
	vpsrld	ymm8,ymm0,27
	vpxor	ymm5,ymm5,ymm6
	vmovd	xmm12,DWORD[((-56))+r12]

	vpslld	ymm6,ymm1,30
	vpor	ymm7,ymm7,ymm8
	vmovd	xmm8,DWORD[((-56))+r8]
	vpaddd	ymm4,ymm4,ymm5

	vpsrld	ymm1,ymm1,2
	vpaddd	ymm4,ymm4,ymm7
	vpshufb	ymm11,ymm11,ymm9
	vpor	ymm1,ymm1,ymm6
	vmovd	xmm7,DWORD[((-56))+r13]
	vmovd	xmm6,DWORD[((-56))+r9]
	vpinsrd	xmm12,xmm12,DWORD[((-56))+r14],1
	vpinsrd	xmm8,xmm8,DWORD[((-56))+r10],1
	vpinsrd	xmm7,xmm7,DWORD[((-56))+r15],1
	vpunpckldq	ymm12,ymm12,ymm7
	vpinsrd	xmm6,xmm6,DWORD[((-56))+r11],1
	vpunpckldq	ymm8,ymm8,ymm6
	vpaddd	ymm3,ymm3,ymm15
	vpslld	ymm7,ymm4,5
	vpandn	ymm6,ymm0,ymm2
	vpand	ymm5,ymm0,ymm1

	vmovdqa	YMMWORD[(32-128)+rax],ymm11
	vpaddd	ymm3,ymm3,ymm11
	vinserti128	ymm12,ymm12,xmm8,1
	vpsrld	ymm8,ymm4,27
	vpxor	ymm5,ymm5,ymm6
	vmovd	xmm13,DWORD[((-52))+r12]

	vpslld	ymm6,ymm0,30
	vpor	ymm7,ymm7,ymm8
	vmovd	xmm8,DWORD[((-52))+r8]
	vpaddd	ymm3,ymm3,ymm5

	vpsrld	ymm0,ymm0,2
	vpaddd	ymm3,ymm3,ymm7
	vpshufb	ymm12,ymm12,ymm9
	vpor	ymm0,ymm0,ymm6
	vmovd	xmm7,DWORD[((-52))+r13]
	vmovd	xmm6,DWORD[((-52))+r9]
	vpinsrd	xmm13,xmm13,DWORD[((-52))+r14],1
	vpinsrd	xmm8,xmm8,DWORD[((-52))+r10],1
	vpinsrd	xmm7,xmm7,DWORD[((-52))+r15],1
	vpunpckldq	ymm13,ymm13,ymm7
	vpinsrd	xmm6,xmm6,DWORD[((-52))+r11],1
	vpunpckldq	ymm8,ymm8,ymm6
	vpaddd	ymm2,ymm2,ymm15
	vpslld	ymm7,ymm3,5
	vpandn	ymm6,ymm4,ymm1
	vpand	ymm5,ymm4,ymm0

	vmovdqa	YMMWORD[(64-128)+rax],ymm12
	vpaddd	ymm2,ymm2,ymm12
	vinserti128	ymm13,ymm13,xmm8,1
	vpsrld	ymm8,ymm3,27
	vpxor	ymm5,ymm5,ymm6
	vmovd	xmm14,DWORD[((-48))+r12]

	vpslld	ymm6,ymm4,30
	vpor	ymm7,ymm7,ymm8
	vmovd	xmm8,DWORD[((-48))+r8]
	vpaddd	ymm2,ymm2,ymm5

	vpsrld	ymm4,ymm4,2
	vpaddd	ymm2,ymm2,ymm7
	vpshufb	ymm13,ymm13,ymm9
	vpor	ymm4,ymm4,ymm6
	vmovd	xmm7,DWORD[((-48))+r13]
	vmovd	xmm6,DWORD[((-48))+r9]
	vpinsrd	xmm14,xmm14,DWORD[((-48))+r14],1
	vpinsrd	xmm8,xmm8,DWORD[((-48))+r10],1
	vpinsrd	xmm7,xmm7,DWORD[((-48))+r15],1
	vpunpckldq	ymm14,ymm14,ymm7
	vpinsrd	xmm6,xmm6,DWORD[((-48))+r11],1
	vpunpckldq	ymm8,ymm8,ymm6
	vpaddd	ymm1,ymm1,ymm15
	vpslld	ymm7,ymm2,5
	vpandn	ymm6,ymm3,ymm0
	vpand	ymm5,ymm3,ymm4

	vmovdqa	YMMWORD[(96-128)+rax],ymm13
	vpaddd	ymm1,ymm1,ymm13
	vinserti128	ymm14,ymm14,xmm8,1
	vpsrld	ymm8,ymm2,27
	vpxor	ymm5,ymm5,ymm6
	vmovd	xmm10,DWORD[((-44))+r12]

	vpslld	ymm6,ymm3,30
	vpor	ymm7,ymm7,ymm8
	vmovd	xmm8,DWORD[((-44))+r8]
	vpaddd	ymm1,ymm1,ymm5

	vpsrld	ymm3,ymm3,2
	vpaddd	ymm1,ymm1,ymm7
	vpshufb	ymm14,ymm14,ymm9
	vpor	ymm3,ymm3,ymm6
	vmovd	xmm7,DWORD[((-44))+r13]
	vmovd	xmm6,DWORD[((-44))+r9]
	vpinsrd	xmm10,xmm10,DWORD[((-44))+r14],1
	vpinsrd	xmm8,xmm8,DWORD[((-44))+r10],1
	vpinsrd	xmm7,xmm7,DWORD[((-44))+r15],1
	vpunpckldq	ymm10,ymm10,ymm7
	vpinsrd	xmm6,xmm6,DWORD[((-44))+r11],1
	vpunpckldq	ymm8,ymm8,ymm6
	vpaddd	ymm0,ymm0,ymm15
	vpslld	ymm7,ymm1,5
	vpandn	ymm6,ymm2,ymm4
	vpand	ymm5,ymm2,ymm3

	vmovdqa	YMMWORD[(128-128)+rax],ymm14
	vpaddd	ymm0,ymm0,ymm14
	vinserti128	ymm10,ymm10,xmm8,1
	vpsrld	ymm8,ymm1,27
	vpxor	ymm5,ymm5,ymm6
	vmovd	xmm11,DWORD[((-40))+r12]

	vpslld	ymm6,ymm2,30
	vpor	ymm7,ymm7,ymm8
	vmovd	xmm8,DWORD[((-40))+r8]
	vpaddd	ymm0,ymm0,ymm5

	vpsrld	ymm2,ymm2,2
	vpaddd	ymm0,ymm0,ymm7
	vpshufb	ymm10,ymm10,ymm9
	vpor	ymm2,ymm2,ymm6
	vmovd	xmm7,DWORD[((-40))+r13]
	vmovd	xmm6,DWORD[((-40))+r9]
	vpinsrd	xmm11,xmm11,DWORD[((-40))+r14],1
	vpinsrd	xmm8,xmm8,DWORD[((-40))+r10],1
	vpinsrd	xmm7,xmm7,DWORD[((-40))+r15],1
	vpunpckldq	ymm11,ymm11,ymm7
	vpinsrd	xmm6,xmm6,DWORD[((-40))+r11],1
	vpunpckldq	ymm8,ymm8,ymm6
	vpaddd	ymm4,ymm4,ymm15
	vpslld	ymm7,ymm0,5
	vpandn	ymm6,ymm1,ymm3
	vpand	ymm5,ymm1,ymm2

	vmovdqa	YMMWORD[(160-128)+rax],ymm10
	vpaddd	ymm4,ymm4,ymm10
	vinserti128	ymm11,ymm11,xmm8,1
	vpsrld	ymm8,ymm0,27
	vpxor	ymm5,ymm5,ymm6
	vmovd	xmm12,DWORD[((-36))+r12]

	vpslld	ymm6,ymm1,30
	vpor	ymm7,ymm7,ymm8
	vmovd	xmm8,DWORD[((-36))+r8]
	vpaddd	ymm4,ymm4,ymm5

	vpsrld	ymm1,ymm1,2
	vpaddd	ymm4,ymm4,ymm7
	vpshufb	ymm11,ymm11,ymm9
	vpor	ymm1,ymm1,ymm6
	vmovd	xmm7,DWORD[((-36))+r13]
	vmovd	xmm6,DWORD[((-36))+r9]
	vpinsrd	xmm12,xmm12,DWORD[((-36))+r14],1
	vpinsrd	xmm8,xmm8,DWORD[((-36))+r10],1
	vpinsrd	xmm7,xmm7,DWORD[((-36))+r15],1
	vpunpckldq	ymm12,ymm12,ymm7
	vpinsrd	xmm6,xmm6,DWORD[((-36))+r11],1
	vpunpckldq	ymm8,ymm8,ymm6
	vpaddd	ymm3,ymm3,ymm15
	vpslld	ymm7,ymm4,5
	vpandn	ymm6,ymm0,ymm2
	vpand	ymm5,ymm0,ymm1

	vmovdqa	YMMWORD[(192-128)+rax],ymm11
	vpaddd	ymm3,ymm3,ymm11
	vinserti128	ymm12,ymm12,xmm8,1
	vpsrld	ymm8,ymm4,27
	vpxor	ymm5,ymm5,ymm6
	vmovd	xmm13,DWORD[((-32))+r12]

	vpslld	ymm6,ymm0,30
	vpor	ymm7,ymm7,ymm8
	vmovd	xmm8,DWORD[((-32))+r8]
	vpaddd	ymm3,ymm3,ymm5

	vpsrld	ymm0,ymm0,2
	vpaddd	ymm3,ymm3,ymm7
	vpshufb	ymm12,ymm12,ymm9
	vpor	ymm0,ymm0,ymm6
	vmovd	xmm7,DWORD[((-32))+r13]
	vmovd	xmm6,DWORD[((-32))+r9]
	vpinsrd	xmm13,xmm13,DWORD[((-32))+r14],1
	vpinsrd	xmm8,xmm8,DWORD[((-32))+r10],1
	vpinsrd	xmm7,xmm7,DWORD[((-32))+r15],1
	vpunpckldq	ymm13,ymm13,ymm7
	vpinsrd	xmm6,xmm6,DWORD[((-32))+r11],1
	vpunpckldq	ymm8,ymm8,ymm6
	vpaddd	ymm2,ymm2,ymm15
	vpslld	ymm7,ymm3,5
	vpandn	ymm6,ymm4,ymm1
	vpand	ymm5,ymm4,ymm0

	vmovdqa	YMMWORD[(224-128)+rax],ymm12
	vpaddd	ymm2,ymm2,ymm12
	vinserti128	ymm13,ymm13,xmm8,1
	vpsrld	ymm8,ymm3,27
	vpxor	ymm5,ymm5,ymm6
	vmovd	xmm14,DWORD[((-28))+r12]

	vpslld	ymm6,ymm4,30
	vpor	ymm7,ymm7,ymm8
	vmovd	xmm8,DWORD[((-28))+r8]
	vpaddd	ymm2,ymm2,ymm5

	vpsrld	ymm4,ymm4,2
	vpaddd	ymm2,ymm2,ymm7
	vpshufb	ymm13,ymm13,ymm9
	vpor	ymm4,ymm4,ymm6
	vmovd	xmm7,DWORD[((-28))+r13]
	vmovd	xmm6,DWORD[((-28))+r9]
	vpinsrd	xmm14,xmm14,DWORD[((-28))+r14],1
	vpinsrd	xmm8,xmm8,DWORD[((-28))+r10],1
	vpinsrd	xmm7,xmm7,DWORD[((-28))+r15],1
	vpunpckldq	ymm14,ymm14,ymm7
	vpinsrd	xmm6,xmm6,DWORD[((-28))+r11],1
	vpunpckldq	ymm8,ymm8,ymm6
	vpaddd	ymm1,ymm1,ymm15
	vpslld	ymm7,ymm2,5
	vpandn	ymm6,ymm3,ymm0
	vpand	ymm5,ymm3,ymm4

	vmovdqa	YMMWORD[(256-256-128)+rbx],ymm13
	vpaddd	ymm1,ymm1,ymm13
	vinserti128	ymm14,ymm14,xmm8,1
	vpsrld	ymm8,ymm2,27
	vpxor	ymm5,ymm5,ymm6
	vmovd	xmm10,DWORD[((-24))+r12]

	vpslld	ymm6,ymm3,30
	vpor	ymm7,ymm7,ymm8
	vmovd	xmm8,DWORD[((-24))+r8]
	vpaddd	ymm1,ymm1,ymm5

	vpsrld	ymm3,ymm3,2
	vpaddd	ymm1,ymm1,ymm7
	vpshufb	ymm14,ymm14,ymm9
	vpor	ymm3,ymm3,ymm6
	vmovd	xmm7,DWORD[((-24))+r13]
	vmovd	xmm6,DWORD[((-24))+r9]
	vpinsrd	xmm10,xmm10,DWORD[((-24))+r14],1
	vpinsrd	xmm8,xmm8,DWORD[((-24))+r10],1
	vpinsrd	xmm7,xmm7,DWORD[((-24))+r15],1
	vpunpckldq	ymm10,ymm10,ymm7
	vpinsrd	xmm6,xmm6,DWORD[((-24))+r11],1
	vpunpckldq	ymm8,ymm8,ymm6
	vpaddd	ymm0,ymm0,ymm15
	vpslld	ymm7,ymm1,5
	vpandn	ymm6,ymm2,ymm4
	vpand	ymm5,ymm2,ymm3

	vmovdqa	YMMWORD[(288-256-128)+rbx],ymm14
	vpaddd	ymm0,ymm0,ymm14
	vinserti128	ymm10,ymm10,xmm8,1
	vpsrld	ymm8,ymm1,27
	vpxor	ymm5,ymm5,ymm6
	vmovd	xmm11,DWORD[((-20))+r12]

	vpslld	ymm6,ymm2,30
	vpor	ymm7,ymm7,ymm8
	vmovd	xmm8,DWORD[((-20))+r8]
	vpaddd	ymm0,ymm0,ymm5

	vpsrld	ymm2,ymm2,2
	vpaddd	ymm0,ymm0,ymm7
	vpshufb	ymm10,ymm10,ymm9
	vpor	ymm2,ymm2,ymm6
	vmovd	xmm7,DWORD[((-20))+r13]
	vmovd	xmm6,DWORD[((-20))+r9]
	vpinsrd	xmm11,xmm11,DWORD[((-20))+r14],1
	vpinsrd	xmm8,xmm8,DWORD[((-20))+r10],1
	vpinsrd	xmm7,xmm7,DWORD[((-20))+r15],1
	vpunpckldq	ymm11,ymm11,ymm7
	vpinsrd	xmm6,xmm6,DWORD[((-20))+r11],1
	vpunpckldq	ymm8,ymm8,ymm6
	vpaddd	ymm4,ymm4,ymm15
	vpslld	ymm7,ymm0,5
	vpandn	ymm6,ymm1,ymm3
	vpand	ymm5,ymm1,ymm2

	vmovdqa	YMMWORD[(320-256-128)+rbx],ymm10
	vpaddd	ymm4,ymm4,ymm10
	vinserti128	ymm11,ymm11,xmm8,1
	vpsrld	ymm8,ymm0,27
	vpxor	ymm5,ymm5,ymm6
	vmovd	xmm12,DWORD[((-16))+r12]

	vpslld	ymm6,ymm1,30
	vpor	ymm7,ymm7,ymm8
	vmovd	xmm8,DWORD[((-16))+r8]
	vpaddd	ymm4,ymm4,ymm5

	vpsrld	ymm1,ymm1,2
	vpaddd	ymm4,ymm4,ymm7
	vpshufb	ymm11,ymm11,ymm9
	vpor	ymm1,ymm1,ymm6
	vmovd	xmm7,DWORD[((-16))+r13]
	vmovd	xmm6,DWORD[((-16))+r9]
	vpinsrd	xmm12,xmm12,DWORD[((-16))+r14],1
	vpinsrd	xmm8,xmm8,DWORD[((-16))+r10],1
	vpinsrd	xmm7,xmm7,DWORD[((-16))+r15],1
	vpunpckldq	ymm12,ymm12,ymm7
	vpinsrd	xmm6,xmm6,DWORD[((-16))+r11],1
	vpunpckldq	ymm8,ymm8,ymm6
	vpaddd	ymm3,ymm3,ymm15
	vpslld	ymm7,ymm4,5
	vpandn	ymm6,ymm0,ymm2
	vpand	ymm5,ymm0,ymm1

	vmovdqa	YMMWORD[(352-256-128)+rbx],ymm11
	vpaddd	ymm3,ymm3,ymm11
	vinserti128	ymm12,ymm12,xmm8,1
	vpsrld	ymm8,ymm4,27
	vpxor	ymm5,ymm5,ymm6
	vmovd	xmm13,DWORD[((-12))+r12]

	vpslld	ymm6,ymm0,30
	vpor	ymm7,ymm7,ymm8
	vmovd	xmm8,DWORD[((-12))+r8]
	vpaddd	ymm3,ymm3,ymm5

	vpsrld	ymm0,ymm0,2
	vpaddd	ymm3,ymm3,ymm7
	vpshufb	ymm12,ymm12,ymm9
	vpor	ymm0,ymm0,ymm6
	vmovd	xmm7,DWORD[((-12))+r13]
	vmovd	xmm6,DWORD[((-12))+r9]
	vpinsrd	xmm13,xmm13,DWORD[((-12))+r14],1
	vpinsrd	xmm8,xmm8,DWORD[((-12))+r10],1
	vpinsrd	xmm7,xmm7,DWORD[((-12))+r15],1
	vpunpckldq	ymm13,ymm13,ymm7
	vpinsrd	xmm6,xmm6,DWORD[((-12))+r11],1
	vpunpckldq	ymm8,ymm8,ymm6
	vpaddd	ymm2,ymm2,ymm15
	vpslld	ymm7,ymm3,5
	vpandn	ymm6,ymm4,ymm1
	vpand	ymm5,ymm4,ymm0

	vmovdqa	YMMWORD[(384-256-128)+rbx],ymm12
	vpaddd	ymm2,ymm2,ymm12
	vinserti128	ymm13,ymm13,xmm8,1
	vpsrld	ymm8,ymm3,27
	vpxor	ymm5,ymm5,ymm6
	vmovd	xmm14,DWORD[((-8))+r12]

	vpslld	ymm6,ymm4,30
	vpor	ymm7,ymm7,ymm8
	vmovd	xmm8,DWORD[((-8))+r8]
	vpaddd	ymm2,ymm2,ymm5

	vpsrld	ymm4,ymm4,2
	vpaddd	ymm2,ymm2,ymm7
	vpshufb	ymm13,ymm13,ymm9
	vpor	ymm4,ymm4,ymm6
	vmovd	xmm7,DWORD[((-8))+r13]
	vmovd	xmm6,DWORD[((-8))+r9]
	vpinsrd	xmm14,xmm14,DWORD[((-8))+r14],1
	vpinsrd	xmm8,xmm8,DWORD[((-8))+r10],1
	vpinsrd	xmm7,xmm7,DWORD[((-8))+r15],1
	vpunpckldq	ymm14,ymm14,ymm7
	vpinsrd	xmm6,xmm6,DWORD[((-8))+r11],1
	vpunpckldq	ymm8,ymm8,ymm6
	vpaddd	ymm1,ymm1,ymm15
	vpslld	ymm7,ymm2,5
	vpandn	ymm6,ymm3,ymm0
	vpand	ymm5,ymm3,ymm4

	vmovdqa	YMMWORD[(416-256-128)+rbx],ymm13
	vpaddd	ymm1,ymm1,ymm13
	vinserti128	ymm14,ymm14,xmm8,1
	vpsrld	ymm8,ymm2,27
	vpxor	ymm5,ymm5,ymm6
	vmovd	xmm10,DWORD[((-4))+r12]

	vpslld	ymm6,ymm3,30
	vpor	ymm7,ymm7,ymm8
	vmovd	xmm8,DWORD[((-4))+r8]
	vpaddd	ymm1,ymm1,ymm5

	vpsrld	ymm3,ymm3,2
	vpaddd	ymm1,ymm1,ymm7
	vpshufb	ymm14,ymm14,ymm9
	vpor	ymm3,ymm3,ymm6
	vmovdqa	ymm11,YMMWORD[((0-128))+rax]
	vmovd	xmm7,DWORD[((-4))+r13]
	vmovd	xmm6,DWORD[((-4))+r9]
	vpinsrd	xmm10,xmm10,DWORD[((-4))+r14],1
	vpinsrd	xmm8,xmm8,DWORD[((-4))+r10],1
	vpinsrd	xmm7,xmm7,DWORD[((-4))+r15],1
	vpunpckldq	ymm10,ymm10,ymm7
	vpinsrd	xmm6,xmm6,DWORD[((-4))+r11],1
	vpunpckldq	ymm8,ymm8,ymm6
	vpaddd	ymm0,ymm0,ymm15
	prefetcht0	[63+r12]
	vpslld	ymm7,ymm1,5
	vpandn	ymm6,ymm2,ymm4
	vpand	ymm5,ymm2,ymm3

	vmovdqa	YMMWORD[(448-256-128)+rbx],ymm14
	vpaddd	ymm0,ymm0,ymm14
	vinserti128	ymm10,ymm10,xmm8,1
	vpsrld	ymm8,ymm1,27
	prefetcht0	[63+r13]
	vpxor	ymm5,ymm5,ymm6

	vpslld	ymm6,ymm2,30
	vpor	ymm7,ymm7,ymm8
	prefetcht0	[63+r14]
	vpaddd	ymm0,ymm0,ymm5

	vpsrld	ymm2,ymm2,2
	vpaddd	ymm0,ymm0,ymm7
	prefetcht0	[63+r15]
	vpshufb	ymm10,ymm10,ymm9
	vpor	ymm2,ymm2,ymm6
	vmovdqa	ymm12,YMMWORD[((32-128))+rax]
	vpxor	ymm11,ymm11,ymm13
	vmovdqa	ymm13,YMMWORD[((64-128))+rax]

	vpaddd	ymm4,ymm4,ymm15
	vpslld	ymm7,ymm0,5
	vpandn	ymm6,ymm1,ymm3
	prefetcht0	[63+r8]
	vpand	ymm5,ymm1,ymm2

	vmovdqa	YMMWORD[(480-256-128)+rbx],ymm10
	vpaddd	ymm4,ymm4,ymm10
	vpxor	ymm11,ymm11,YMMWORD[((256-256-128))+rbx]
	vpsrld	ymm8,ymm0,27
	vpxor	ymm5,ymm5,ymm6
	vpxor	ymm11,ymm11,ymm13
	prefetcht0	[63+r9]

	vpslld	ymm6,ymm1,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm4,ymm4,ymm5
	prefetcht0	[63+r10]
	vpsrld	ymm9,ymm11,31
	vpaddd	ymm11,ymm11,ymm11

	vpsrld	ymm1,ymm1,2
	prefetcht0	[63+r11]
	vpaddd	ymm4,ymm4,ymm7
	vpor	ymm11,ymm11,ymm9
	vpor	ymm1,ymm1,ymm6
	vpxor	ymm12,ymm12,ymm14
	vmovdqa	ymm14,YMMWORD[((96-128))+rax]

	vpaddd	ymm3,ymm3,ymm15
	vpslld	ymm7,ymm4,5
	vpandn	ymm6,ymm0,ymm2

	vpand	ymm5,ymm0,ymm1

	vmovdqa	YMMWORD[(0-128)+rax],ymm11
	vpaddd	ymm3,ymm3,ymm11
	vpxor	ymm12,ymm12,YMMWORD[((288-256-128))+rbx]
	vpsrld	ymm8,ymm4,27
	vpxor	ymm5,ymm5,ymm6
	vpxor	ymm12,ymm12,ymm14


	vpslld	ymm6,ymm0,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm3,ymm3,ymm5

	vpsrld	ymm9,ymm12,31
	vpaddd	ymm12,ymm12,ymm12

	vpsrld	ymm0,ymm0,2

	vpaddd	ymm3,ymm3,ymm7
	vpor	ymm12,ymm12,ymm9
	vpor	ymm0,ymm0,ymm6
	vpxor	ymm13,ymm13,ymm10
	vmovdqa	ymm10,YMMWORD[((128-128))+rax]

	vpaddd	ymm2,ymm2,ymm15
	vpslld	ymm7,ymm3,5
	vpandn	ymm6,ymm4,ymm1

	vpand	ymm5,ymm4,ymm0

	vmovdqa	YMMWORD[(32-128)+rax],ymm12
	vpaddd	ymm2,ymm2,ymm12
	vpxor	ymm13,ymm13,YMMWORD[((320-256-128))+rbx]
	vpsrld	ymm8,ymm3,27
	vpxor	ymm5,ymm5,ymm6
	vpxor	ymm13,ymm13,ymm10


	vpslld	ymm6,ymm4,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm2,ymm2,ymm5

	vpsrld	ymm9,ymm13,31
	vpaddd	ymm13,ymm13,ymm13

	vpsrld	ymm4,ymm4,2

	vpaddd	ymm2,ymm2,ymm7
	vpor	ymm13,ymm13,ymm9
	vpor	ymm4,ymm4,ymm6
	vpxor	ymm14,ymm14,ymm11
	vmovdqa	ymm11,YMMWORD[((160-128))+rax]

	vpaddd	ymm1,ymm1,ymm15
	vpslld	ymm7,ymm2,5
	vpandn	ymm6,ymm3,ymm0

	vpand	ymm5,ymm3,ymm4

	vmovdqa	YMMWORD[(64-128)+rax],ymm13
	vpaddd	ymm1,ymm1,ymm13
	vpxor	ymm14,ymm14,YMMWORD[((352-256-128))+rbx]
	vpsrld	ymm8,ymm2,27
	vpxor	ymm5,ymm5,ymm6
	vpxor	ymm14,ymm14,ymm11


	vpslld	ymm6,ymm3,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm1,ymm1,ymm5

	vpsrld	ymm9,ymm14,31
	vpaddd	ymm14,ymm14,ymm14

	vpsrld	ymm3,ymm3,2

	vpaddd	ymm1,ymm1,ymm7
	vpor	ymm14,ymm14,ymm9
	vpor	ymm3,ymm3,ymm6
	vpxor	ymm10,ymm10,ymm12
	vmovdqa	ymm12,YMMWORD[((192-128))+rax]

	vpaddd	ymm0,ymm0,ymm15
	vpslld	ymm7,ymm1,5
	vpandn	ymm6,ymm2,ymm4

	vpand	ymm5,ymm2,ymm3

	vmovdqa	YMMWORD[(96-128)+rax],ymm14
	vpaddd	ymm0,ymm0,ymm14
	vpxor	ymm10,ymm10,YMMWORD[((384-256-128))+rbx]
	vpsrld	ymm8,ymm1,27
	vpxor	ymm5,ymm5,ymm6
	vpxor	ymm10,ymm10,ymm12


	vpslld	ymm6,ymm2,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm0,ymm0,ymm5

	vpsrld	ymm9,ymm10,31
	vpaddd	ymm10,ymm10,ymm10

	vpsrld	ymm2,ymm2,2

	vpaddd	ymm0,ymm0,ymm7
	vpor	ymm10,ymm10,ymm9
	vpor	ymm2,ymm2,ymm6
	vmovdqa	ymm15,YMMWORD[rbp]
	vpxor	ymm11,ymm11,ymm13
	vmovdqa	ymm13,YMMWORD[((224-128))+rax]

	vpslld	ymm7,ymm0,5
	vpaddd	ymm4,ymm4,ymm15
	vpxor	ymm5,ymm3,ymm1
	vmovdqa	YMMWORD[(128-128)+rax],ymm10
	vpaddd	ymm4,ymm4,ymm10
	vpxor	ymm11,ymm11,YMMWORD[((416-256-128))+rbx]
	vpsrld	ymm8,ymm0,27
	vpxor	ymm5,ymm5,ymm2
	vpxor	ymm11,ymm11,ymm13

	vpslld	ymm6,ymm1,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm4,ymm4,ymm5
	vpsrld	ymm9,ymm11,31
	vpaddd	ymm11,ymm11,ymm11

	vpsrld	ymm1,ymm1,2
	vpaddd	ymm4,ymm4,ymm7
	vpor	ymm11,ymm11,ymm9
	vpor	ymm1,ymm1,ymm6
	vpxor	ymm12,ymm12,ymm14
	vmovdqa	ymm14,YMMWORD[((256-256-128))+rbx]

	vpslld	ymm7,ymm4,5
	vpaddd	ymm3,ymm3,ymm15
	vpxor	ymm5,ymm2,ymm0
	vmovdqa	YMMWORD[(160-128)+rax],ymm11
	vpaddd	ymm3,ymm3,ymm11
	vpxor	ymm12,ymm12,YMMWORD[((448-256-128))+rbx]
	vpsrld	ymm8,ymm4,27
	vpxor	ymm5,ymm5,ymm1
	vpxor	ymm12,ymm12,ymm14

	vpslld	ymm6,ymm0,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm3,ymm3,ymm5
	vpsrld	ymm9,ymm12,31
	vpaddd	ymm12,ymm12,ymm12

	vpsrld	ymm0,ymm0,2
	vpaddd	ymm3,ymm3,ymm7
	vpor	ymm12,ymm12,ymm9
	vpor	ymm0,ymm0,ymm6
	vpxor	ymm13,ymm13,ymm10
	vmovdqa	ymm10,YMMWORD[((288-256-128))+rbx]

	vpslld	ymm7,ymm3,5
	vpaddd	ymm2,ymm2,ymm15
	vpxor	ymm5,ymm1,ymm4
	vmovdqa	YMMWORD[(192-128)+rax],ymm12
	vpaddd	ymm2,ymm2,ymm12
	vpxor	ymm13,ymm13,YMMWORD[((480-256-128))+rbx]
	vpsrld	ymm8,ymm3,27
	vpxor	ymm5,ymm5,ymm0
	vpxor	ymm13,ymm13,ymm10

	vpslld	ymm6,ymm4,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm2,ymm2,ymm5
	vpsrld	ymm9,ymm13,31
	vpaddd	ymm13,ymm13,ymm13

	vpsrld	ymm4,ymm4,2
	vpaddd	ymm2,ymm2,ymm7
	vpor	ymm13,ymm13,ymm9
	vpor	ymm4,ymm4,ymm6
	vpxor	ymm14,ymm14,ymm11
	vmovdqa	ymm11,YMMWORD[((320-256-128))+rbx]

	vpslld	ymm7,ymm2,5
	vpaddd	ymm1,ymm1,ymm15
	vpxor	ymm5,ymm0,ymm3
	vmovdqa	YMMWORD[(224-128)+rax],ymm13
	vpaddd	ymm1,ymm1,ymm13
	vpxor	ymm14,ymm14,YMMWORD[((0-128))+rax]
	vpsrld	ymm8,ymm2,27
	vpxor	ymm5,ymm5,ymm4
	vpxor	ymm14,ymm14,ymm11

	vpslld	ymm6,ymm3,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm1,ymm1,ymm5
	vpsrld	ymm9,ymm14,31
	vpaddd	ymm14,ymm14,ymm14

	vpsrld	ymm3,ymm3,2
	vpaddd	ymm1,ymm1,ymm7
	vpor	ymm14,ymm14,ymm9
	vpor	ymm3,ymm3,ymm6
	vpxor	ymm10,ymm10,ymm12
	vmovdqa	ymm12,YMMWORD[((352-256-128))+rbx]

	vpslld	ymm7,ymm1,5
	vpaddd	ymm0,ymm0,ymm15
	vpxor	ymm5,ymm4,ymm2
	vmovdqa	YMMWORD[(256-256-128)+rbx],ymm14
	vpaddd	ymm0,ymm0,ymm14
	vpxor	ymm10,ymm10,YMMWORD[((32-128))+rax]
	vpsrld	ymm8,ymm1,27
	vpxor	ymm5,ymm5,ymm3
	vpxor	ymm10,ymm10,ymm12

	vpslld	ymm6,ymm2,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm0,ymm0,ymm5
	vpsrld	ymm9,ymm10,31
	vpaddd	ymm10,ymm10,ymm10

	vpsrld	ymm2,ymm2,2
	vpaddd	ymm0,ymm0,ymm7
	vpor	ymm10,ymm10,ymm9
	vpor	ymm2,ymm2,ymm6
	vpxor	ymm11,ymm11,ymm13
	vmovdqa	ymm13,YMMWORD[((384-256-128))+rbx]

	vpslld	ymm7,ymm0,5
	vpaddd	ymm4,ymm4,ymm15
	vpxor	ymm5,ymm3,ymm1
	vmovdqa	YMMWORD[(288-256-128)+rbx],ymm10
	vpaddd	ymm4,ymm4,ymm10
	vpxor	ymm11,ymm11,YMMWORD[((64-128))+rax]
	vpsrld	ymm8,ymm0,27
	vpxor	ymm5,ymm5,ymm2
	vpxor	ymm11,ymm11,ymm13

	vpslld	ymm6,ymm1,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm4,ymm4,ymm5
	vpsrld	ymm9,ymm11,31
	vpaddd	ymm11,ymm11,ymm11

	vpsrld	ymm1,ymm1,2
	vpaddd	ymm4,ymm4,ymm7
	vpor	ymm11,ymm11,ymm9
	vpor	ymm1,ymm1,ymm6
	vpxor	ymm12,ymm12,ymm14
	vmovdqa	ymm14,YMMWORD[((416-256-128))+rbx]

	vpslld	ymm7,ymm4,5
	vpaddd	ymm3,ymm3,ymm15
	vpxor	ymm5,ymm2,ymm0
	vmovdqa	YMMWORD[(320-256-128)+rbx],ymm11
	vpaddd	ymm3,ymm3,ymm11
	vpxor	ymm12,ymm12,YMMWORD[((96-128))+rax]
	vpsrld	ymm8,ymm4,27
	vpxor	ymm5,ymm5,ymm1
	vpxor	ymm12,ymm12,ymm14

	vpslld	ymm6,ymm0,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm3,ymm3,ymm5
	vpsrld	ymm9,ymm12,31
	vpaddd	ymm12,ymm12,ymm12

	vpsrld	ymm0,ymm0,2
	vpaddd	ymm3,ymm3,ymm7
	vpor	ymm12,ymm12,ymm9
	vpor	ymm0,ymm0,ymm6
	vpxor	ymm13,ymm13,ymm10
	vmovdqa	ymm10,YMMWORD[((448-256-128))+rbx]

	vpslld	ymm7,ymm3,5
	vpaddd	ymm2,ymm2,ymm15
	vpxor	ymm5,ymm1,ymm4
	vmovdqa	YMMWORD[(352-256-128)+rbx],ymm12
	vpaddd	ymm2,ymm2,ymm12
	vpxor	ymm13,ymm13,YMMWORD[((128-128))+rax]
	vpsrld	ymm8,ymm3,27
	vpxor	ymm5,ymm5,ymm0
	vpxor	ymm13,ymm13,ymm10

	vpslld	ymm6,ymm4,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm2,ymm2,ymm5
	vpsrld	ymm9,ymm13,31
	vpaddd	ymm13,ymm13,ymm13

	vpsrld	ymm4,ymm4,2
	vpaddd	ymm2,ymm2,ymm7
	vpor	ymm13,ymm13,ymm9
	vpor	ymm4,ymm4,ymm6
	vpxor	ymm14,ymm14,ymm11
	vmovdqa	ymm11,YMMWORD[((480-256-128))+rbx]

	vpslld	ymm7,ymm2,5
	vpaddd	ymm1,ymm1,ymm15
	vpxor	ymm5,ymm0,ymm3
	vmovdqa	YMMWORD[(384-256-128)+rbx],ymm13
	vpaddd	ymm1,ymm1,ymm13
	vpxor	ymm14,ymm14,YMMWORD[((160-128))+rax]
	vpsrld	ymm8,ymm2,27
	vpxor	ymm5,ymm5,ymm4
	vpxor	ymm14,ymm14,ymm11

	vpslld	ymm6,ymm3,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm1,ymm1,ymm5
	vpsrld	ymm9,ymm14,31
	vpaddd	ymm14,ymm14,ymm14

	vpsrld	ymm3,ymm3,2
	vpaddd	ymm1,ymm1,ymm7
	vpor	ymm14,ymm14,ymm9
	vpor	ymm3,ymm3,ymm6
	vpxor	ymm10,ymm10,ymm12
	vmovdqa	ymm12,YMMWORD[((0-128))+rax]

	vpslld	ymm7,ymm1,5
	vpaddd	ymm0,ymm0,ymm15
	vpxor	ymm5,ymm4,ymm2
	vmovdqa	YMMWORD[(416-256-128)+rbx],ymm14
	vpaddd	ymm0,ymm0,ymm14
	vpxor	ymm10,ymm10,YMMWORD[((192-128))+rax]
	vpsrld	ymm8,ymm1,27
	vpxor	ymm5,ymm5,ymm3
	vpxor	ymm10,ymm10,ymm12

	vpslld	ymm6,ymm2,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm0,ymm0,ymm5
	vpsrld	ymm9,ymm10,31
	vpaddd	ymm10,ymm10,ymm10

	vpsrld	ymm2,ymm2,2
	vpaddd	ymm0,ymm0,ymm7
	vpor	ymm10,ymm10,ymm9
	vpor	ymm2,ymm2,ymm6
	vpxor	ymm11,ymm11,ymm13
	vmovdqa	ymm13,YMMWORD[((32-128))+rax]

	vpslld	ymm7,ymm0,5
	vpaddd	ymm4,ymm4,ymm15
	vpxor	ymm5,ymm3,ymm1
	vmovdqa	YMMWORD[(448-256-128)+rbx],ymm10
	vpaddd	ymm4,ymm4,ymm10
	vpxor	ymm11,ymm11,YMMWORD[((224-128))+rax]
	vpsrld	ymm8,ymm0,27
	vpxor	ymm5,ymm5,ymm2
	vpxor	ymm11,ymm11,ymm13

	vpslld	ymm6,ymm1,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm4,ymm4,ymm5
	vpsrld	ymm9,ymm11,31
	vpaddd	ymm11,ymm11,ymm11

	vpsrld	ymm1,ymm1,2
	vpaddd	ymm4,ymm4,ymm7
	vpor	ymm11,ymm11,ymm9
	vpor	ymm1,ymm1,ymm6
	vpxor	ymm12,ymm12,ymm14
	vmovdqa	ymm14,YMMWORD[((64-128))+rax]

	vpslld	ymm7,ymm4,5
	vpaddd	ymm3,ymm3,ymm15
	vpxor	ymm5,ymm2,ymm0
	vmovdqa	YMMWORD[(480-256-128)+rbx],ymm11
	vpaddd	ymm3,ymm3,ymm11
	vpxor	ymm12,ymm12,YMMWORD[((256-256-128))+rbx]
	vpsrld	ymm8,ymm4,27
	vpxor	ymm5,ymm5,ymm1
	vpxor	ymm12,ymm12,ymm14

	vpslld	ymm6,ymm0,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm3,ymm3,ymm5
	vpsrld	ymm9,ymm12,31
	vpaddd	ymm12,ymm12,ymm12

	vpsrld	ymm0,ymm0,2
	vpaddd	ymm3,ymm3,ymm7
	vpor	ymm12,ymm12,ymm9
	vpor	ymm0,ymm0,ymm6
	vpxor	ymm13,ymm13,ymm10
	vmovdqa	ymm10,YMMWORD[((96-128))+rax]

	vpslld	ymm7,ymm3,5
	vpaddd	ymm2,ymm2,ymm15
	vpxor	ymm5,ymm1,ymm4
	vmovdqa	YMMWORD[(0-128)+rax],ymm12
	vpaddd	ymm2,ymm2,ymm12
	vpxor	ymm13,ymm13,YMMWORD[((288-256-128))+rbx]
	vpsrld	ymm8,ymm3,27
	vpxor	ymm5,ymm5,ymm0
	vpxor	ymm13,ymm13,ymm10

	vpslld	ymm6,ymm4,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm2,ymm2,ymm5
	vpsrld	ymm9,ymm13,31
	vpaddd	ymm13,ymm13,ymm13

	vpsrld	ymm4,ymm4,2
	vpaddd	ymm2,ymm2,ymm7
	vpor	ymm13,ymm13,ymm9
	vpor	ymm4,ymm4,ymm6
	vpxor	ymm14,ymm14,ymm11
	vmovdqa	ymm11,YMMWORD[((128-128))+rax]

	vpslld	ymm7,ymm2,5
	vpaddd	ymm1,ymm1,ymm15
	vpxor	ymm5,ymm0,ymm3
	vmovdqa	YMMWORD[(32-128)+rax],ymm13
	vpaddd	ymm1,ymm1,ymm13
	vpxor	ymm14,ymm14,YMMWORD[((320-256-128))+rbx]
	vpsrld	ymm8,ymm2,27
	vpxor	ymm5,ymm5,ymm4
	vpxor	ymm14,ymm14,ymm11

	vpslld	ymm6,ymm3,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm1,ymm1,ymm5
	vpsrld	ymm9,ymm14,31
	vpaddd	ymm14,ymm14,ymm14

	vpsrld	ymm3,ymm3,2
	vpaddd	ymm1,ymm1,ymm7
	vpor	ymm14,ymm14,ymm9
	vpor	ymm3,ymm3,ymm6
	vpxor	ymm10,ymm10,ymm12
	vmovdqa	ymm12,YMMWORD[((160-128))+rax]

	vpslld	ymm7,ymm1,5
	vpaddd	ymm0,ymm0,ymm15
	vpxor	ymm5,ymm4,ymm2
	vmovdqa	YMMWORD[(64-128)+rax],ymm14
	vpaddd	ymm0,ymm0,ymm14
	vpxor	ymm10,ymm10,YMMWORD[((352-256-128))+rbx]
	vpsrld	ymm8,ymm1,27
	vpxor	ymm5,ymm5,ymm3
	vpxor	ymm10,ymm10,ymm12

	vpslld	ymm6,ymm2,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm0,ymm0,ymm5
	vpsrld	ymm9,ymm10,31
	vpaddd	ymm10,ymm10,ymm10

	vpsrld	ymm2,ymm2,2
	vpaddd	ymm0,ymm0,ymm7
	vpor	ymm10,ymm10,ymm9
	vpor	ymm2,ymm2,ymm6
	vpxor	ymm11,ymm11,ymm13
	vmovdqa	ymm13,YMMWORD[((192-128))+rax]

	vpslld	ymm7,ymm0,5
	vpaddd	ymm4,ymm4,ymm15
	vpxor	ymm5,ymm3,ymm1
	vmovdqa	YMMWORD[(96-128)+rax],ymm10
	vpaddd	ymm4,ymm4,ymm10
	vpxor	ymm11,ymm11,YMMWORD[((384-256-128))+rbx]
	vpsrld	ymm8,ymm0,27
	vpxor	ymm5,ymm5,ymm2
	vpxor	ymm11,ymm11,ymm13

	vpslld	ymm6,ymm1,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm4,ymm4,ymm5
	vpsrld	ymm9,ymm11,31
	vpaddd	ymm11,ymm11,ymm11

	vpsrld	ymm1,ymm1,2
	vpaddd	ymm4,ymm4,ymm7
	vpor	ymm11,ymm11,ymm9
	vpor	ymm1,ymm1,ymm6
	vpxor	ymm12,ymm12,ymm14
	vmovdqa	ymm14,YMMWORD[((224-128))+rax]

	vpslld	ymm7,ymm4,5
	vpaddd	ymm3,ymm3,ymm15
	vpxor	ymm5,ymm2,ymm0
	vmovdqa	YMMWORD[(128-128)+rax],ymm11
	vpaddd	ymm3,ymm3,ymm11
	vpxor	ymm12,ymm12,YMMWORD[((416-256-128))+rbx]
	vpsrld	ymm8,ymm4,27
	vpxor	ymm5,ymm5,ymm1
	vpxor	ymm12,ymm12,ymm14

	vpslld	ymm6,ymm0,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm3,ymm3,ymm5
	vpsrld	ymm9,ymm12,31
	vpaddd	ymm12,ymm12,ymm12

	vpsrld	ymm0,ymm0,2
	vpaddd	ymm3,ymm3,ymm7
	vpor	ymm12,ymm12,ymm9
	vpor	ymm0,ymm0,ymm6
	vpxor	ymm13,ymm13,ymm10
	vmovdqa	ymm10,YMMWORD[((256-256-128))+rbx]

	vpslld	ymm7,ymm3,5
	vpaddd	ymm2,ymm2,ymm15
	vpxor	ymm5,ymm1,ymm4
	vmovdqa	YMMWORD[(160-128)+rax],ymm12
	vpaddd	ymm2,ymm2,ymm12
	vpxor	ymm13,ymm13,YMMWORD[((448-256-128))+rbx]
	vpsrld	ymm8,ymm3,27
	vpxor	ymm5,ymm5,ymm0
	vpxor	ymm13,ymm13,ymm10

	vpslld	ymm6,ymm4,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm2,ymm2,ymm5
	vpsrld	ymm9,ymm13,31
	vpaddd	ymm13,ymm13,ymm13

	vpsrld	ymm4,ymm4,2
	vpaddd	ymm2,ymm2,ymm7
	vpor	ymm13,ymm13,ymm9
	vpor	ymm4,ymm4,ymm6
	vpxor	ymm14,ymm14,ymm11
	vmovdqa	ymm11,YMMWORD[((288-256-128))+rbx]

	vpslld	ymm7,ymm2,5
	vpaddd	ymm1,ymm1,ymm15
	vpxor	ymm5,ymm0,ymm3
	vmovdqa	YMMWORD[(192-128)+rax],ymm13
	vpaddd	ymm1,ymm1,ymm13
	vpxor	ymm14,ymm14,YMMWORD[((480-256-128))+rbx]
	vpsrld	ymm8,ymm2,27
	vpxor	ymm5,ymm5,ymm4
	vpxor	ymm14,ymm14,ymm11

	vpslld	ymm6,ymm3,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm1,ymm1,ymm5
	vpsrld	ymm9,ymm14,31
	vpaddd	ymm14,ymm14,ymm14

	vpsrld	ymm3,ymm3,2
	vpaddd	ymm1,ymm1,ymm7
	vpor	ymm14,ymm14,ymm9
	vpor	ymm3,ymm3,ymm6
	vpxor	ymm10,ymm10,ymm12
	vmovdqa	ymm12,YMMWORD[((320-256-128))+rbx]

	vpslld	ymm7,ymm1,5
	vpaddd	ymm0,ymm0,ymm15
	vpxor	ymm5,ymm4,ymm2
	vmovdqa	YMMWORD[(224-128)+rax],ymm14
	vpaddd	ymm0,ymm0,ymm14
	vpxor	ymm10,ymm10,YMMWORD[((0-128))+rax]
	vpsrld	ymm8,ymm1,27
	vpxor	ymm5,ymm5,ymm3
	vpxor	ymm10,ymm10,ymm12

	vpslld	ymm6,ymm2,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm0,ymm0,ymm5
	vpsrld	ymm9,ymm10,31
	vpaddd	ymm10,ymm10,ymm10

	vpsrld	ymm2,ymm2,2
	vpaddd	ymm0,ymm0,ymm7
	vpor	ymm10,ymm10,ymm9
	vpor	ymm2,ymm2,ymm6
	vmovdqa	ymm15,YMMWORD[32+rbp]
	vpxor	ymm11,ymm11,ymm13
	vmovdqa	ymm13,YMMWORD[((352-256-128))+rbx]

	vpaddd	ymm4,ymm4,ymm15
	vpslld	ymm7,ymm0,5
	vpand	ymm6,ymm3,ymm2
	vpxor	ymm11,ymm11,YMMWORD[((32-128))+rax]

	vpaddd	ymm4,ymm4,ymm6
	vpsrld	ymm8,ymm0,27
	vpxor	ymm5,ymm3,ymm2
	vpxor	ymm11,ymm11,ymm13

	vmovdqu	YMMWORD[(256-256-128)+rbx],ymm10
	vpaddd	ymm4,ymm4,ymm10
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm11,31
	vpand	ymm5,ymm5,ymm1
	vpaddd	ymm11,ymm11,ymm11

	vpslld	ymm6,ymm1,30
	vpaddd	ymm4,ymm4,ymm5

	vpsrld	ymm1,ymm1,2
	vpaddd	ymm4,ymm4,ymm7
	vpor	ymm11,ymm11,ymm9
	vpor	ymm1,ymm1,ymm6
	vpxor	ymm12,ymm12,ymm14
	vmovdqa	ymm14,YMMWORD[((384-256-128))+rbx]

	vpaddd	ymm3,ymm3,ymm15
	vpslld	ymm7,ymm4,5
	vpand	ymm6,ymm2,ymm1
	vpxor	ymm12,ymm12,YMMWORD[((64-128))+rax]

	vpaddd	ymm3,ymm3,ymm6
	vpsrld	ymm8,ymm4,27
	vpxor	ymm5,ymm2,ymm1
	vpxor	ymm12,ymm12,ymm14

	vmovdqu	YMMWORD[(288-256-128)+rbx],ymm11
	vpaddd	ymm3,ymm3,ymm11
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm12,31
	vpand	ymm5,ymm5,ymm0
	vpaddd	ymm12,ymm12,ymm12

	vpslld	ymm6,ymm0,30
	vpaddd	ymm3,ymm3,ymm5

	vpsrld	ymm0,ymm0,2
	vpaddd	ymm3,ymm3,ymm7
	vpor	ymm12,ymm12,ymm9
	vpor	ymm0,ymm0,ymm6
	vpxor	ymm13,ymm13,ymm10
	vmovdqa	ymm10,YMMWORD[((416-256-128))+rbx]

	vpaddd	ymm2,ymm2,ymm15
	vpslld	ymm7,ymm3,5
	vpand	ymm6,ymm1,ymm0
	vpxor	ymm13,ymm13,YMMWORD[((96-128))+rax]

	vpaddd	ymm2,ymm2,ymm6
	vpsrld	ymm8,ymm3,27
	vpxor	ymm5,ymm1,ymm0
	vpxor	ymm13,ymm13,ymm10

	vmovdqu	YMMWORD[(320-256-128)+rbx],ymm12
	vpaddd	ymm2,ymm2,ymm12
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm13,31
	vpand	ymm5,ymm5,ymm4
	vpaddd	ymm13,ymm13,ymm13

	vpslld	ymm6,ymm4,30
	vpaddd	ymm2,ymm2,ymm5

	vpsrld	ymm4,ymm4,2
	vpaddd	ymm2,ymm2,ymm7
	vpor	ymm13,ymm13,ymm9
	vpor	ymm4,ymm4,ymm6
	vpxor	ymm14,ymm14,ymm11
	vmovdqa	ymm11,YMMWORD[((448-256-128))+rbx]

	vpaddd	ymm1,ymm1,ymm15
	vpslld	ymm7,ymm2,5
	vpand	ymm6,ymm0,ymm4
	vpxor	ymm14,ymm14,YMMWORD[((128-128))+rax]

	vpaddd	ymm1,ymm1,ymm6
	vpsrld	ymm8,ymm2,27
	vpxor	ymm5,ymm0,ymm4
	vpxor	ymm14,ymm14,ymm11

	vmovdqu	YMMWORD[(352-256-128)+rbx],ymm13
	vpaddd	ymm1,ymm1,ymm13
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm14,31
	vpand	ymm5,ymm5,ymm3
	vpaddd	ymm14,ymm14,ymm14

	vpslld	ymm6,ymm3,30
	vpaddd	ymm1,ymm1,ymm5

	vpsrld	ymm3,ymm3,2
	vpaddd	ymm1,ymm1,ymm7
	vpor	ymm14,ymm14,ymm9
	vpor	ymm3,ymm3,ymm6
	vpxor	ymm10,ymm10,ymm12
	vmovdqa	ymm12,YMMWORD[((480-256-128))+rbx]

	vpaddd	ymm0,ymm0,ymm15
	vpslld	ymm7,ymm1,5
	vpand	ymm6,ymm4,ymm3
	vpxor	ymm10,ymm10,YMMWORD[((160-128))+rax]

	vpaddd	ymm0,ymm0,ymm6
	vpsrld	ymm8,ymm1,27
	vpxor	ymm5,ymm4,ymm3
	vpxor	ymm10,ymm10,ymm12

	vmovdqu	YMMWORD[(384-256-128)+rbx],ymm14
	vpaddd	ymm0,ymm0,ymm14
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm10,31
	vpand	ymm5,ymm5,ymm2
	vpaddd	ymm10,ymm10,ymm10

	vpslld	ymm6,ymm2,30
	vpaddd	ymm0,ymm0,ymm5

	vpsrld	ymm2,ymm2,2
	vpaddd	ymm0,ymm0,ymm7
	vpor	ymm10,ymm10,ymm9
	vpor	ymm2,ymm2,ymm6
	vpxor	ymm11,ymm11,ymm13
	vmovdqa	ymm13,YMMWORD[((0-128))+rax]

	vpaddd	ymm4,ymm4,ymm15
	vpslld	ymm7,ymm0,5
	vpand	ymm6,ymm3,ymm2
	vpxor	ymm11,ymm11,YMMWORD[((192-128))+rax]

	vpaddd	ymm4,ymm4,ymm6
	vpsrld	ymm8,ymm0,27
	vpxor	ymm5,ymm3,ymm2
	vpxor	ymm11,ymm11,ymm13

	vmovdqu	YMMWORD[(416-256-128)+rbx],ymm10
	vpaddd	ymm4,ymm4,ymm10
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm11,31
	vpand	ymm5,ymm5,ymm1
	vpaddd	ymm11,ymm11,ymm11

	vpslld	ymm6,ymm1,30
	vpaddd	ymm4,ymm4,ymm5

	vpsrld	ymm1,ymm1,2
	vpaddd	ymm4,ymm4,ymm7
	vpor	ymm11,ymm11,ymm9
	vpor	ymm1,ymm1,ymm6
	vpxor	ymm12,ymm12,ymm14
	vmovdqa	ymm14,YMMWORD[((32-128))+rax]

	vpaddd	ymm3,ymm3,ymm15
	vpslld	ymm7,ymm4,5
	vpand	ymm6,ymm2,ymm1
	vpxor	ymm12,ymm12,YMMWORD[((224-128))+rax]

	vpaddd	ymm3,ymm3,ymm6
	vpsrld	ymm8,ymm4,27
	vpxor	ymm5,ymm2,ymm1
	vpxor	ymm12,ymm12,ymm14

	vmovdqu	YMMWORD[(448-256-128)+rbx],ymm11
	vpaddd	ymm3,ymm3,ymm11
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm12,31
	vpand	ymm5,ymm5,ymm0
	vpaddd	ymm12,ymm12,ymm12

	vpslld	ymm6,ymm0,30
	vpaddd	ymm3,ymm3,ymm5

	vpsrld	ymm0,ymm0,2
	vpaddd	ymm3,ymm3,ymm7
	vpor	ymm12,ymm12,ymm9
	vpor	ymm0,ymm0,ymm6
	vpxor	ymm13,ymm13,ymm10
	vmovdqa	ymm10,YMMWORD[((64-128))+rax]

	vpaddd	ymm2,ymm2,ymm15
	vpslld	ymm7,ymm3,5
	vpand	ymm6,ymm1,ymm0
	vpxor	ymm13,ymm13,YMMWORD[((256-256-128))+rbx]

	vpaddd	ymm2,ymm2,ymm6
	vpsrld	ymm8,ymm3,27
	vpxor	ymm5,ymm1,ymm0
	vpxor	ymm13,ymm13,ymm10

	vmovdqu	YMMWORD[(480-256-128)+rbx],ymm12
	vpaddd	ymm2,ymm2,ymm12
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm13,31
	vpand	ymm5,ymm5,ymm4
	vpaddd	ymm13,ymm13,ymm13

	vpslld	ymm6,ymm4,30
	vpaddd	ymm2,ymm2,ymm5

	vpsrld	ymm4,ymm4,2
	vpaddd	ymm2,ymm2,ymm7
	vpor	ymm13,ymm13,ymm9
	vpor	ymm4,ymm4,ymm6
	vpxor	ymm14,ymm14,ymm11
	vmovdqa	ymm11,YMMWORD[((96-128))+rax]

	vpaddd	ymm1,ymm1,ymm15
	vpslld	ymm7,ymm2,5
	vpand	ymm6,ymm0,ymm4
	vpxor	ymm14,ymm14,YMMWORD[((288-256-128))+rbx]

	vpaddd	ymm1,ymm1,ymm6
	vpsrld	ymm8,ymm2,27
	vpxor	ymm5,ymm0,ymm4
	vpxor	ymm14,ymm14,ymm11

	vmovdqu	YMMWORD[(0-128)+rax],ymm13
	vpaddd	ymm1,ymm1,ymm13
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm14,31
	vpand	ymm5,ymm5,ymm3
	vpaddd	ymm14,ymm14,ymm14

	vpslld	ymm6,ymm3,30
	vpaddd	ymm1,ymm1,ymm5

	vpsrld	ymm3,ymm3,2
	vpaddd	ymm1,ymm1,ymm7
	vpor	ymm14,ymm14,ymm9
	vpor	ymm3,ymm3,ymm6
	vpxor	ymm10,ymm10,ymm12
	vmovdqa	ymm12,YMMWORD[((128-128))+rax]

	vpaddd	ymm0,ymm0,ymm15
	vpslld	ymm7,ymm1,5
	vpand	ymm6,ymm4,ymm3
	vpxor	ymm10,ymm10,YMMWORD[((320-256-128))+rbx]

	vpaddd	ymm0,ymm0,ymm6
	vpsrld	ymm8,ymm1,27
	vpxor	ymm5,ymm4,ymm3
	vpxor	ymm10,ymm10,ymm12

	vmovdqu	YMMWORD[(32-128)+rax],ymm14
	vpaddd	ymm0,ymm0,ymm14
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm10,31
	vpand	ymm5,ymm5,ymm2
	vpaddd	ymm10,ymm10,ymm10

	vpslld	ymm6,ymm2,30
	vpaddd	ymm0,ymm0,ymm5

	vpsrld	ymm2,ymm2,2
	vpaddd	ymm0,ymm0,ymm7
	vpor	ymm10,ymm10,ymm9
	vpor	ymm2,ymm2,ymm6
	vpxor	ymm11,ymm11,ymm13
	vmovdqa	ymm13,YMMWORD[((160-128))+rax]

	vpaddd	ymm4,ymm4,ymm15
	vpslld	ymm7,ymm0,5
	vpand	ymm6,ymm3,ymm2
	vpxor	ymm11,ymm11,YMMWORD[((352-256-128))+rbx]

	vpaddd	ymm4,ymm4,ymm6
	vpsrld	ymm8,ymm0,27
	vpxor	ymm5,ymm3,ymm2
	vpxor	ymm11,ymm11,ymm13

	vmovdqu	YMMWORD[(64-128)+rax],ymm10
	vpaddd	ymm4,ymm4,ymm10
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm11,31
	vpand	ymm5,ymm5,ymm1
	vpaddd	ymm11,ymm11,ymm11

	vpslld	ymm6,ymm1,30
	vpaddd	ymm4,ymm4,ymm5

	vpsrld	ymm1,ymm1,2
	vpaddd	ymm4,ymm4,ymm7
	vpor	ymm11,ymm11,ymm9
	vpor	ymm1,ymm1,ymm6
	vpxor	ymm12,ymm12,ymm14
	vmovdqa	ymm14,YMMWORD[((192-128))+rax]

	vpaddd	ymm3,ymm3,ymm15
	vpslld	ymm7,ymm4,5
	vpand	ymm6,ymm2,ymm1
	vpxor	ymm12,ymm12,YMMWORD[((384-256-128))+rbx]

	vpaddd	ymm3,ymm3,ymm6
	vpsrld	ymm8,ymm4,27
	vpxor	ymm5,ymm2,ymm1
	vpxor	ymm12,ymm12,ymm14

	vmovdqu	YMMWORD[(96-128)+rax],ymm11
	vpaddd	ymm3,ymm3,ymm11
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm12,31
	vpand	ymm5,ymm5,ymm0
	vpaddd	ymm12,ymm12,ymm12

	vpslld	ymm6,ymm0,30
	vpaddd	ymm3,ymm3,ymm5

	vpsrld	ymm0,ymm0,2
	vpaddd	ymm3,ymm3,ymm7
	vpor	ymm12,ymm12,ymm9
	vpor	ymm0,ymm0,ymm6
	vpxor	ymm13,ymm13,ymm10
	vmovdqa	ymm10,YMMWORD[((224-128))+rax]

	vpaddd	ymm2,ymm2,ymm15
	vpslld	ymm7,ymm3,5
	vpand	ymm6,ymm1,ymm0
	vpxor	ymm13,ymm13,YMMWORD[((416-256-128))+rbx]

	vpaddd	ymm2,ymm2,ymm6
	vpsrld	ymm8,ymm3,27
	vpxor	ymm5,ymm1,ymm0
	vpxor	ymm13,ymm13,ymm10

	vmovdqu	YMMWORD[(128-128)+rax],ymm12
	vpaddd	ymm2,ymm2,ymm12
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm13,31
	vpand	ymm5,ymm5,ymm4
	vpaddd	ymm13,ymm13,ymm13

	vpslld	ymm6,ymm4,30
	vpaddd	ymm2,ymm2,ymm5

	vpsrld	ymm4,ymm4,2
	vpaddd	ymm2,ymm2,ymm7
	vpor	ymm13,ymm13,ymm9
	vpor	ymm4,ymm4,ymm6
	vpxor	ymm14,ymm14,ymm11
	vmovdqa	ymm11,YMMWORD[((256-256-128))+rbx]

	vpaddd	ymm1,ymm1,ymm15
	vpslld	ymm7,ymm2,5
	vpand	ymm6,ymm0,ymm4
	vpxor	ymm14,ymm14,YMMWORD[((448-256-128))+rbx]

	vpaddd	ymm1,ymm1,ymm6
	vpsrld	ymm8,ymm2,27
	vpxor	ymm5,ymm0,ymm4
	vpxor	ymm14,ymm14,ymm11

	vmovdqu	YMMWORD[(160-128)+rax],ymm13
	vpaddd	ymm1,ymm1,ymm13
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm14,31
	vpand	ymm5,ymm5,ymm3
	vpaddd	ymm14,ymm14,ymm14

	vpslld	ymm6,ymm3,30
	vpaddd	ymm1,ymm1,ymm5

	vpsrld	ymm3,ymm3,2
	vpaddd	ymm1,ymm1,ymm7
	vpor	ymm14,ymm14,ymm9
	vpor	ymm3,ymm3,ymm6
	vpxor	ymm10,ymm10,ymm12
	vmovdqa	ymm12,YMMWORD[((288-256-128))+rbx]

	vpaddd	ymm0,ymm0,ymm15
	vpslld	ymm7,ymm1,5
	vpand	ymm6,ymm4,ymm3
	vpxor	ymm10,ymm10,YMMWORD[((480-256-128))+rbx]

	vpaddd	ymm0,ymm0,ymm6
	vpsrld	ymm8,ymm1,27
	vpxor	ymm5,ymm4,ymm3
	vpxor	ymm10,ymm10,ymm12

	vmovdqu	YMMWORD[(192-128)+rax],ymm14
	vpaddd	ymm0,ymm0,ymm14
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm10,31
	vpand	ymm5,ymm5,ymm2
	vpaddd	ymm10,ymm10,ymm10

	vpslld	ymm6,ymm2,30
	vpaddd	ymm0,ymm0,ymm5

	vpsrld	ymm2,ymm2,2
	vpaddd	ymm0,ymm0,ymm7
	vpor	ymm10,ymm10,ymm9
	vpor	ymm2,ymm2,ymm6
	vpxor	ymm11,ymm11,ymm13
	vmovdqa	ymm13,YMMWORD[((320-256-128))+rbx]

	vpaddd	ymm4,ymm4,ymm15
	vpslld	ymm7,ymm0,5
	vpand	ymm6,ymm3,ymm2
	vpxor	ymm11,ymm11,YMMWORD[((0-128))+rax]

	vpaddd	ymm4,ymm4,ymm6
	vpsrld	ymm8,ymm0,27
	vpxor	ymm5,ymm3,ymm2
	vpxor	ymm11,ymm11,ymm13

	vmovdqu	YMMWORD[(224-128)+rax],ymm10
	vpaddd	ymm4,ymm4,ymm10
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm11,31
	vpand	ymm5,ymm5,ymm1
	vpaddd	ymm11,ymm11,ymm11

	vpslld	ymm6,ymm1,30
	vpaddd	ymm4,ymm4,ymm5

	vpsrld	ymm1,ymm1,2
	vpaddd	ymm4,ymm4,ymm7
	vpor	ymm11,ymm11,ymm9
	vpor	ymm1,ymm1,ymm6
	vpxor	ymm12,ymm12,ymm14
	vmovdqa	ymm14,YMMWORD[((352-256-128))+rbx]

	vpaddd	ymm3,ymm3,ymm15
	vpslld	ymm7,ymm4,5
	vpand	ymm6,ymm2,ymm1
	vpxor	ymm12,ymm12,YMMWORD[((32-128))+rax]

	vpaddd	ymm3,ymm3,ymm6
	vpsrld	ymm8,ymm4,27
	vpxor	ymm5,ymm2,ymm1
	vpxor	ymm12,ymm12,ymm14

	vmovdqu	YMMWORD[(256-256-128)+rbx],ymm11
	vpaddd	ymm3,ymm3,ymm11
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm12,31
	vpand	ymm5,ymm5,ymm0
	vpaddd	ymm12,ymm12,ymm12

	vpslld	ymm6,ymm0,30
	vpaddd	ymm3,ymm3,ymm5

	vpsrld	ymm0,ymm0,2
	vpaddd	ymm3,ymm3,ymm7
	vpor	ymm12,ymm12,ymm9
	vpor	ymm0,ymm0,ymm6
	vpxor	ymm13,ymm13,ymm10
	vmovdqa	ymm10,YMMWORD[((384-256-128))+rbx]

	vpaddd	ymm2,ymm2,ymm15
	vpslld	ymm7,ymm3,5
	vpand	ymm6,ymm1,ymm0
	vpxor	ymm13,ymm13,YMMWORD[((64-128))+rax]

	vpaddd	ymm2,ymm2,ymm6
	vpsrld	ymm8,ymm3,27
	vpxor	ymm5,ymm1,ymm0
	vpxor	ymm13,ymm13,ymm10

	vmovdqu	YMMWORD[(288-256-128)+rbx],ymm12
	vpaddd	ymm2,ymm2,ymm12
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm13,31
	vpand	ymm5,ymm5,ymm4
	vpaddd	ymm13,ymm13,ymm13

	vpslld	ymm6,ymm4,30
	vpaddd	ymm2,ymm2,ymm5

	vpsrld	ymm4,ymm4,2
	vpaddd	ymm2,ymm2,ymm7
	vpor	ymm13,ymm13,ymm9
	vpor	ymm4,ymm4,ymm6
	vpxor	ymm14,ymm14,ymm11
	vmovdqa	ymm11,YMMWORD[((416-256-128))+rbx]

	vpaddd	ymm1,ymm1,ymm15
	vpslld	ymm7,ymm2,5
	vpand	ymm6,ymm0,ymm4
	vpxor	ymm14,ymm14,YMMWORD[((96-128))+rax]

	vpaddd	ymm1,ymm1,ymm6
	vpsrld	ymm8,ymm2,27
	vpxor	ymm5,ymm0,ymm4
	vpxor	ymm14,ymm14,ymm11

	vmovdqu	YMMWORD[(320-256-128)+rbx],ymm13
	vpaddd	ymm1,ymm1,ymm13
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm14,31
	vpand	ymm5,ymm5,ymm3
	vpaddd	ymm14,ymm14,ymm14

	vpslld	ymm6,ymm3,30
	vpaddd	ymm1,ymm1,ymm5

	vpsrld	ymm3,ymm3,2
	vpaddd	ymm1,ymm1,ymm7
	vpor	ymm14,ymm14,ymm9
	vpor	ymm3,ymm3,ymm6
	vpxor	ymm10,ymm10,ymm12
	vmovdqa	ymm12,YMMWORD[((448-256-128))+rbx]

	vpaddd	ymm0,ymm0,ymm15
	vpslld	ymm7,ymm1,5
	vpand	ymm6,ymm4,ymm3
	vpxor	ymm10,ymm10,YMMWORD[((128-128))+rax]

	vpaddd	ymm0,ymm0,ymm6
	vpsrld	ymm8,ymm1,27
	vpxor	ymm5,ymm4,ymm3
	vpxor	ymm10,ymm10,ymm12

	vmovdqu	YMMWORD[(352-256-128)+rbx],ymm14
	vpaddd	ymm0,ymm0,ymm14
	vpor	ymm7,ymm7,ymm8
	vpsrld	ymm9,ymm10,31
	vpand	ymm5,ymm5,ymm2
	vpaddd	ymm10,ymm10,ymm10

	vpslld	ymm6,ymm2,30
	vpaddd	ymm0,ymm0,ymm5

	vpsrld	ymm2,ymm2,2
	vpaddd	ymm0,ymm0,ymm7
	vpor	ymm10,ymm10,ymm9
	vpor	ymm2,ymm2,ymm6
	vmovdqa	ymm15,YMMWORD[64+rbp]
	vpxor	ymm11,ymm11,ymm13
	vmovdqa	ymm13,YMMWORD[((480-256-128))+rbx]

	vpslld	ymm7,ymm0,5
	vpaddd	ymm4,ymm4,ymm15
	vpxor	ymm5,ymm3,ymm1
	vmovdqa	YMMWORD[(384-256-128)+rbx],ymm10
	vpaddd	ymm4,ymm4,ymm10
	vpxor	ymm11,ymm11,YMMWORD[((160-128))+rax]
	vpsrld	ymm8,ymm0,27
	vpxor	ymm5,ymm5,ymm2
	vpxor	ymm11,ymm11,ymm13

	vpslld	ymm6,ymm1,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm4,ymm4,ymm5
	vpsrld	ymm9,ymm11,31
	vpaddd	ymm11,ymm11,ymm11

	vpsrld	ymm1,ymm1,2
	vpaddd	ymm4,ymm4,ymm7
	vpor	ymm11,ymm11,ymm9
	vpor	ymm1,ymm1,ymm6
	vpxor	ymm12,ymm12,ymm14
	vmovdqa	ymm14,YMMWORD[((0-128))+rax]

	vpslld	ymm7,ymm4,5
	vpaddd	ymm3,ymm3,ymm15
	vpxor	ymm5,ymm2,ymm0
	vmovdqa	YMMWORD[(416-256-128)+rbx],ymm11
	vpaddd	ymm3,ymm3,ymm11
	vpxor	ymm12,ymm12,YMMWORD[((192-128))+rax]
	vpsrld	ymm8,ymm4,27
	vpxor	ymm5,ymm5,ymm1
	vpxor	ymm12,ymm12,ymm14

	vpslld	ymm6,ymm0,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm3,ymm3,ymm5
	vpsrld	ymm9,ymm12,31
	vpaddd	ymm12,ymm12,ymm12

	vpsrld	ymm0,ymm0,2
	vpaddd	ymm3,ymm3,ymm7
	vpor	ymm12,ymm12,ymm9
	vpor	ymm0,ymm0,ymm6
	vpxor	ymm13,ymm13,ymm10
	vmovdqa	ymm10,YMMWORD[((32-128))+rax]

	vpslld	ymm7,ymm3,5
	vpaddd	ymm2,ymm2,ymm15
	vpxor	ymm5,ymm1,ymm4
	vmovdqa	YMMWORD[(448-256-128)+rbx],ymm12
	vpaddd	ymm2,ymm2,ymm12
	vpxor	ymm13,ymm13,YMMWORD[((224-128))+rax]
	vpsrld	ymm8,ymm3,27
	vpxor	ymm5,ymm5,ymm0
	vpxor	ymm13,ymm13,ymm10

	vpslld	ymm6,ymm4,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm2,ymm2,ymm5
	vpsrld	ymm9,ymm13,31
	vpaddd	ymm13,ymm13,ymm13

	vpsrld	ymm4,ymm4,2
	vpaddd	ymm2,ymm2,ymm7
	vpor	ymm13,ymm13,ymm9
	vpor	ymm4,ymm4,ymm6
	vpxor	ymm14,ymm14,ymm11
	vmovdqa	ymm11,YMMWORD[((64-128))+rax]

	vpslld	ymm7,ymm2,5
	vpaddd	ymm1,ymm1,ymm15
	vpxor	ymm5,ymm0,ymm3
	vmovdqa	YMMWORD[(480-256-128)+rbx],ymm13
	vpaddd	ymm1,ymm1,ymm13
	vpxor	ymm14,ymm14,YMMWORD[((256-256-128))+rbx]
	vpsrld	ymm8,ymm2,27
	vpxor	ymm5,ymm5,ymm4
	vpxor	ymm14,ymm14,ymm11

	vpslld	ymm6,ymm3,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm1,ymm1,ymm5
	vpsrld	ymm9,ymm14,31
	vpaddd	ymm14,ymm14,ymm14

	vpsrld	ymm3,ymm3,2
	vpaddd	ymm1,ymm1,ymm7
	vpor	ymm14,ymm14,ymm9
	vpor	ymm3,ymm3,ymm6
	vpxor	ymm10,ymm10,ymm12
	vmovdqa	ymm12,YMMWORD[((96-128))+rax]

	vpslld	ymm7,ymm1,5
	vpaddd	ymm0,ymm0,ymm15
	vpxor	ymm5,ymm4,ymm2
	vmovdqa	YMMWORD[(0-128)+rax],ymm14
	vpaddd	ymm0,ymm0,ymm14
	vpxor	ymm10,ymm10,YMMWORD[((288-256-128))+rbx]
	vpsrld	ymm8,ymm1,27
	vpxor	ymm5,ymm5,ymm3
	vpxor	ymm10,ymm10,ymm12

	vpslld	ymm6,ymm2,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm0,ymm0,ymm5
	vpsrld	ymm9,ymm10,31
	vpaddd	ymm10,ymm10,ymm10

	vpsrld	ymm2,ymm2,2
	vpaddd	ymm0,ymm0,ymm7
	vpor	ymm10,ymm10,ymm9
	vpor	ymm2,ymm2,ymm6
	vpxor	ymm11,ymm11,ymm13
	vmovdqa	ymm13,YMMWORD[((128-128))+rax]

	vpslld	ymm7,ymm0,5
	vpaddd	ymm4,ymm4,ymm15
	vpxor	ymm5,ymm3,ymm1
	vmovdqa	YMMWORD[(32-128)+rax],ymm10
	vpaddd	ymm4,ymm4,ymm10
	vpxor	ymm11,ymm11,YMMWORD[((320-256-128))+rbx]
	vpsrld	ymm8,ymm0,27
	vpxor	ymm5,ymm5,ymm2
	vpxor	ymm11,ymm11,ymm13

	vpslld	ymm6,ymm1,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm4,ymm4,ymm5
	vpsrld	ymm9,ymm11,31
	vpaddd	ymm11,ymm11,ymm11

	vpsrld	ymm1,ymm1,2
	vpaddd	ymm4,ymm4,ymm7
	vpor	ymm11,ymm11,ymm9
	vpor	ymm1,ymm1,ymm6
	vpxor	ymm12,ymm12,ymm14
	vmovdqa	ymm14,YMMWORD[((160-128))+rax]

	vpslld	ymm7,ymm4,5
	vpaddd	ymm3,ymm3,ymm15
	vpxor	ymm5,ymm2,ymm0
	vmovdqa	YMMWORD[(64-128)+rax],ymm11
	vpaddd	ymm3,ymm3,ymm11
	vpxor	ymm12,ymm12,YMMWORD[((352-256-128))+rbx]
	vpsrld	ymm8,ymm4,27
	vpxor	ymm5,ymm5,ymm1
	vpxor	ymm12,ymm12,ymm14

	vpslld	ymm6,ymm0,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm3,ymm3,ymm5
	vpsrld	ymm9,ymm12,31
	vpaddd	ymm12,ymm12,ymm12

	vpsrld	ymm0,ymm0,2
	vpaddd	ymm3,ymm3,ymm7
	vpor	ymm12,ymm12,ymm9
	vpor	ymm0,ymm0,ymm6
	vpxor	ymm13,ymm13,ymm10
	vmovdqa	ymm10,YMMWORD[((192-128))+rax]

	vpslld	ymm7,ymm3,5
	vpaddd	ymm2,ymm2,ymm15
	vpxor	ymm5,ymm1,ymm4
	vmovdqa	YMMWORD[(96-128)+rax],ymm12
	vpaddd	ymm2,ymm2,ymm12
	vpxor	ymm13,ymm13,YMMWORD[((384-256-128))+rbx]
	vpsrld	ymm8,ymm3,27
	vpxor	ymm5,ymm5,ymm0
	vpxor	ymm13,ymm13,ymm10

	vpslld	ymm6,ymm4,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm2,ymm2,ymm5
	vpsrld	ymm9,ymm13,31
	vpaddd	ymm13,ymm13,ymm13

	vpsrld	ymm4,ymm4,2
	vpaddd	ymm2,ymm2,ymm7
	vpor	ymm13,ymm13,ymm9
	vpor	ymm4,ymm4,ymm6
	vpxor	ymm14,ymm14,ymm11
	vmovdqa	ymm11,YMMWORD[((224-128))+rax]

	vpslld	ymm7,ymm2,5
	vpaddd	ymm1,ymm1,ymm15
	vpxor	ymm5,ymm0,ymm3
	vmovdqa	YMMWORD[(128-128)+rax],ymm13
	vpaddd	ymm1,ymm1,ymm13
	vpxor	ymm14,ymm14,YMMWORD[((416-256-128))+rbx]
	vpsrld	ymm8,ymm2,27
	vpxor	ymm5,ymm5,ymm4
	vpxor	ymm14,ymm14,ymm11

	vpslld	ymm6,ymm3,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm1,ymm1,ymm5
	vpsrld	ymm9,ymm14,31
	vpaddd	ymm14,ymm14,ymm14

	vpsrld	ymm3,ymm3,2
	vpaddd	ymm1,ymm1,ymm7
	vpor	ymm14,ymm14,ymm9
	vpor	ymm3,ymm3,ymm6
	vpxor	ymm10,ymm10,ymm12
	vmovdqa	ymm12,YMMWORD[((256-256-128))+rbx]

	vpslld	ymm7,ymm1,5
	vpaddd	ymm0,ymm0,ymm15
	vpxor	ymm5,ymm4,ymm2
	vmovdqa	YMMWORD[(160-128)+rax],ymm14
	vpaddd	ymm0,ymm0,ymm14
	vpxor	ymm10,ymm10,YMMWORD[((448-256-128))+rbx]
	vpsrld	ymm8,ymm1,27
	vpxor	ymm5,ymm5,ymm3
	vpxor	ymm10,ymm10,ymm12

	vpslld	ymm6,ymm2,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm0,ymm0,ymm5
	vpsrld	ymm9,ymm10,31
	vpaddd	ymm10,ymm10,ymm10

	vpsrld	ymm2,ymm2,2
	vpaddd	ymm0,ymm0,ymm7
	vpor	ymm10,ymm10,ymm9
	vpor	ymm2,ymm2,ymm6
	vpxor	ymm11,ymm11,ymm13
	vmovdqa	ymm13,YMMWORD[((288-256-128))+rbx]

	vpslld	ymm7,ymm0,5
	vpaddd	ymm4,ymm4,ymm15
	vpxor	ymm5,ymm3,ymm1
	vmovdqa	YMMWORD[(192-128)+rax],ymm10
	vpaddd	ymm4,ymm4,ymm10
	vpxor	ymm11,ymm11,YMMWORD[((480-256-128))+rbx]
	vpsrld	ymm8,ymm0,27
	vpxor	ymm5,ymm5,ymm2
	vpxor	ymm11,ymm11,ymm13

	vpslld	ymm6,ymm1,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm4,ymm4,ymm5
	vpsrld	ymm9,ymm11,31
	vpaddd	ymm11,ymm11,ymm11

	vpsrld	ymm1,ymm1,2
	vpaddd	ymm4,ymm4,ymm7
	vpor	ymm11,ymm11,ymm9
	vpor	ymm1,ymm1,ymm6
	vpxor	ymm12,ymm12,ymm14
	vmovdqa	ymm14,YMMWORD[((320-256-128))+rbx]

	vpslld	ymm7,ymm4,5
	vpaddd	ymm3,ymm3,ymm15
	vpxor	ymm5,ymm2,ymm0
	vmovdqa	YMMWORD[(224-128)+rax],ymm11
	vpaddd	ymm3,ymm3,ymm11
	vpxor	ymm12,ymm12,YMMWORD[((0-128))+rax]
	vpsrld	ymm8,ymm4,27
	vpxor	ymm5,ymm5,ymm1
	vpxor	ymm12,ymm12,ymm14

	vpslld	ymm6,ymm0,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm3,ymm3,ymm5
	vpsrld	ymm9,ymm12,31
	vpaddd	ymm12,ymm12,ymm12

	vpsrld	ymm0,ymm0,2
	vpaddd	ymm3,ymm3,ymm7
	vpor	ymm12,ymm12,ymm9
	vpor	ymm0,ymm0,ymm6
	vpxor	ymm13,ymm13,ymm10
	vmovdqa	ymm10,YMMWORD[((352-256-128))+rbx]

	vpslld	ymm7,ymm3,5
	vpaddd	ymm2,ymm2,ymm15
	vpxor	ymm5,ymm1,ymm4
	vpaddd	ymm2,ymm2,ymm12
	vpxor	ymm13,ymm13,YMMWORD[((32-128))+rax]
	vpsrld	ymm8,ymm3,27
	vpxor	ymm5,ymm5,ymm0
	vpxor	ymm13,ymm13,ymm10

	vpslld	ymm6,ymm4,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm2,ymm2,ymm5
	vpsrld	ymm9,ymm13,31
	vpaddd	ymm13,ymm13,ymm13

	vpsrld	ymm4,ymm4,2
	vpaddd	ymm2,ymm2,ymm7
	vpor	ymm13,ymm13,ymm9
	vpor	ymm4,ymm4,ymm6
	vpxor	ymm14,ymm14,ymm11
	vmovdqa	ymm11,YMMWORD[((384-256-128))+rbx]

	vpslld	ymm7,ymm2,5
	vpaddd	ymm1,ymm1,ymm15
	vpxor	ymm5,ymm0,ymm3
	vpaddd	ymm1,ymm1,ymm13
	vpxor	ymm14,ymm14,YMMWORD[((64-128))+rax]
	vpsrld	ymm8,ymm2,27
	vpxor	ymm5,ymm5,ymm4
	vpxor	ymm14,ymm14,ymm11

	vpslld	ymm6,ymm3,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm1,ymm1,ymm5
	vpsrld	ymm9,ymm14,31
	vpaddd	ymm14,ymm14,ymm14

	vpsrld	ymm3,ymm3,2
	vpaddd	ymm1,ymm1,ymm7
	vpor	ymm14,ymm14,ymm9
	vpor	ymm3,ymm3,ymm6
	vpxor	ymm10,ymm10,ymm12
	vmovdqa	ymm12,YMMWORD[((416-256-128))+rbx]

	vpslld	ymm7,ymm1,5
	vpaddd	ymm0,ymm0,ymm15
	vpxor	ymm5,ymm4,ymm2
	vpaddd	ymm0,ymm0,ymm14
	vpxor	ymm10,ymm10,YMMWORD[((96-128))+rax]
	vpsrld	ymm8,ymm1,27
	vpxor	ymm5,ymm5,ymm3
	vpxor	ymm10,ymm10,ymm12

	vpslld	ymm6,ymm2,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm0,ymm0,ymm5
	vpsrld	ymm9,ymm10,31
	vpaddd	ymm10,ymm10,ymm10

	vpsrld	ymm2,ymm2,2
	vpaddd	ymm0,ymm0,ymm7
	vpor	ymm10,ymm10,ymm9
	vpor	ymm2,ymm2,ymm6
	vpxor	ymm11,ymm11,ymm13
	vmovdqa	ymm13,YMMWORD[((448-256-128))+rbx]

	vpslld	ymm7,ymm0,5
	vpaddd	ymm4,ymm4,ymm15
	vpxor	ymm5,ymm3,ymm1
	vpaddd	ymm4,ymm4,ymm10
	vpxor	ymm11,ymm11,YMMWORD[((128-128))+rax]
	vpsrld	ymm8,ymm0,27
	vpxor	ymm5,ymm5,ymm2
	vpxor	ymm11,ymm11,ymm13

	vpslld	ymm6,ymm1,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm4,ymm4,ymm5
	vpsrld	ymm9,ymm11,31
	vpaddd	ymm11,ymm11,ymm11

	vpsrld	ymm1,ymm1,2
	vpaddd	ymm4,ymm4,ymm7
	vpor	ymm11,ymm11,ymm9
	vpor	ymm1,ymm1,ymm6
	vpxor	ymm12,ymm12,ymm14
	vmovdqa	ymm14,YMMWORD[((480-256-128))+rbx]

	vpslld	ymm7,ymm4,5
	vpaddd	ymm3,ymm3,ymm15
	vpxor	ymm5,ymm2,ymm0
	vpaddd	ymm3,ymm3,ymm11
	vpxor	ymm12,ymm12,YMMWORD[((160-128))+rax]
	vpsrld	ymm8,ymm4,27
	vpxor	ymm5,ymm5,ymm1
	vpxor	ymm12,ymm12,ymm14

	vpslld	ymm6,ymm0,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm3,ymm3,ymm5
	vpsrld	ymm9,ymm12,31
	vpaddd	ymm12,ymm12,ymm12

	vpsrld	ymm0,ymm0,2
	vpaddd	ymm3,ymm3,ymm7
	vpor	ymm12,ymm12,ymm9
	vpor	ymm0,ymm0,ymm6
	vpxor	ymm13,ymm13,ymm10
	vmovdqa	ymm10,YMMWORD[((0-128))+rax]

	vpslld	ymm7,ymm3,5
	vpaddd	ymm2,ymm2,ymm15
	vpxor	ymm5,ymm1,ymm4
	vpaddd	ymm2,ymm2,ymm12
	vpxor	ymm13,ymm13,YMMWORD[((192-128))+rax]
	vpsrld	ymm8,ymm3,27
	vpxor	ymm5,ymm5,ymm0
	vpxor	ymm13,ymm13,ymm10

	vpslld	ymm6,ymm4,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm2,ymm2,ymm5
	vpsrld	ymm9,ymm13,31
	vpaddd	ymm13,ymm13,ymm13

	vpsrld	ymm4,ymm4,2
	vpaddd	ymm2,ymm2,ymm7
	vpor	ymm13,ymm13,ymm9
	vpor	ymm4,ymm4,ymm6
	vpxor	ymm14,ymm14,ymm11
	vmovdqa	ymm11,YMMWORD[((32-128))+rax]

	vpslld	ymm7,ymm2,5
	vpaddd	ymm1,ymm1,ymm15
	vpxor	ymm5,ymm0,ymm3
	vpaddd	ymm1,ymm1,ymm13
	vpxor	ymm14,ymm14,YMMWORD[((224-128))+rax]
	vpsrld	ymm8,ymm2,27
	vpxor	ymm5,ymm5,ymm4
	vpxor	ymm14,ymm14,ymm11

	vpslld	ymm6,ymm3,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm1,ymm1,ymm5
	vpsrld	ymm9,ymm14,31
	vpaddd	ymm14,ymm14,ymm14

	vpsrld	ymm3,ymm3,2
	vpaddd	ymm1,ymm1,ymm7
	vpor	ymm14,ymm14,ymm9
	vpor	ymm3,ymm3,ymm6
	vpslld	ymm7,ymm1,5
	vpaddd	ymm0,ymm0,ymm15
	vpxor	ymm5,ymm4,ymm2

	vpsrld	ymm8,ymm1,27
	vpaddd	ymm0,ymm0,ymm14
	vpxor	ymm5,ymm5,ymm3

	vpslld	ymm6,ymm2,30
	vpor	ymm7,ymm7,ymm8
	vpaddd	ymm0,ymm0,ymm5

	vpsrld	ymm2,ymm2,2
	vpaddd	ymm0,ymm0,ymm7
	vpor	ymm2,ymm2,ymm6
	mov	ecx,1
	lea	rbx,[512+rsp]
	cmp	ecx,DWORD[rbx]
	cmovge	r12,rbp
	cmp	ecx,DWORD[4+rbx]
	cmovge	r13,rbp
	cmp	ecx,DWORD[8+rbx]
	cmovge	r14,rbp
	cmp	ecx,DWORD[12+rbx]
	cmovge	r15,rbp
	cmp	ecx,DWORD[16+rbx]
	cmovge	r8,rbp
	cmp	ecx,DWORD[20+rbx]
	cmovge	r9,rbp
	cmp	ecx,DWORD[24+rbx]
	cmovge	r10,rbp
	cmp	ecx,DWORD[28+rbx]
	cmovge	r11,rbp
	vmovdqu	ymm5,YMMWORD[rbx]
	vpxor	ymm7,ymm7,ymm7
	vmovdqa	ymm6,ymm5
	vpcmpgtd	ymm6,ymm6,ymm7
	vpaddd	ymm5,ymm5,ymm6

	vpand	ymm0,ymm0,ymm6
	vpand	ymm1,ymm1,ymm6
	vpaddd	ymm0,ymm0,YMMWORD[rdi]
	vpand	ymm2,ymm2,ymm6
	vpaddd	ymm1,ymm1,YMMWORD[32+rdi]
	vpand	ymm3,ymm3,ymm6
	vpaddd	ymm2,ymm2,YMMWORD[64+rdi]
	vpand	ymm4,ymm4,ymm6
	vpaddd	ymm3,ymm3,YMMWORD[96+rdi]
	vpaddd	ymm4,ymm4,YMMWORD[128+rdi]
	vmovdqu	YMMWORD[rdi],ymm0
	vmovdqu	YMMWORD[32+rdi],ymm1
	vmovdqu	YMMWORD[64+rdi],ymm2
	vmovdqu	YMMWORD[96+rdi],ymm3
	vmovdqu	YMMWORD[128+rdi],ymm4

	vmovdqu	YMMWORD[rbx],ymm5
	lea	rbx,[((256+128))+rsp]
	vmovdqu	ymm9,YMMWORD[96+rbp]
	dec	edx
	jnz	NEAR $L$oop_avx2







$L$done_avx2:
	mov	rax,QWORD[544+rsp]

	vzeroupper
	movaps	xmm6,XMMWORD[((-216))+rax]
	movaps	xmm7,XMMWORD[((-200))+rax]
	movaps	xmm8,XMMWORD[((-184))+rax]
	movaps	xmm9,XMMWORD[((-168))+rax]
	movaps	xmm10,XMMWORD[((-152))+rax]
	movaps	xmm11,XMMWORD[((-136))+rax]
	movaps	xmm12,XMMWORD[((-120))+rax]
	movaps	xmm13,XMMWORD[((-104))+rax]
	movaps	xmm14,XMMWORD[((-88))+rax]
	movaps	xmm15,XMMWORD[((-72))+rax]
	mov	r15,QWORD[((-48))+rax]

	mov	r14,QWORD[((-40))+rax]

	mov	r13,QWORD[((-32))+rax]

	mov	r12,QWORD[((-24))+rax]

	mov	rbp,QWORD[((-16))+rax]

	mov	rbx,QWORD[((-8))+rax]

	lea	rsp,[rax]

$L$epilogue_avx2:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret

$L$SEH_end_sha1_multi_block_avx2:

; Constant pool for the SHA-1 multi-block code.  The K_XX_XX label sits 32
; bytes into the pool, so the first row lives at K_XX_XX-32 and the others
; at K_XX_XX+0, +32, +64, +96 and +128.  Every 32-bit constant is repeated
; eight times so that one 256-bit load fills all lanes of a ymm register.
ALIGN	256
	; K_XX_XX-32: 0x5a827999, the SHA-1 round constant for rounds 0..19 (FIPS 180-4)
	DD	0x5a827999,0x5a827999,0x5a827999,0x5a827999
	DD	0x5a827999,0x5a827999,0x5a827999,0x5a827999
K_XX_XX:
	; K_XX_XX+0: 0x6ed9eba1, the SHA-1 round constant for rounds 20..39
	DD	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
	DD	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
	; K_XX_XX+32: 0x8f1bbcdc, the SHA-1 round constant for rounds 40..59
	DD	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
	DD	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
	; K_XX_XX+64: 0xca62c1d6, the SHA-1 round constant for rounds 60..79
	; (the AVX2 code loads this row into ymm15 via YMMWORD[64+rbp])
	DD	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
	DD	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
	; K_XX_XX+96: stored little-endian these dwords give the byte sequence
	; 03 02 01 00 07 06 05 04 ...; the AVX2 loop reloads this row into ymm9
	; via YMMWORD[96+rbp].
	; NOTE(review): looks like a byte-shuffle mask that swaps the bytes of each
	; 32-bit word (big-endian message load) - confirm against the round code.
	DD	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	DD	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
; K_XX_XX+128: 16 bytes counting down from 15 to 0.
; NOTE(review): presumably a full 128-bit byte-reversal shuffle mask used by
; another code path (not visible in this part of the file) - confirm.
DB	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
; NUL-terminated ASCII identification string:
; "SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro@openssl.org>"
DB	83,72,65,49,32,109,117,108,116,105,45,98,108,111,99,107
DB	32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120
DB	56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77
DB	83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110
DB	115,115,108,46,111,114,103,62,0
EXTERN	__imp_RtlVirtualUnwind

;-----------------------------------------------------------------------
; se_handler - Win64 language-specific exception handler for the
; sha1_multi_block variants whose .xdata entry names it.
;
; ABI:  Microsoft x64 language-specific handler
; In:   r8 = CONTEXT record of the interrupted frame
;       r9 = DISPATCHER_CONTEXT; its HandlerData (offset 56) points at two
;            image-relative labels from .xdata: [0] = end of prologue
;            ("body"), [1] = start of epilogue
;       (rcx and rdx are not read before being overwritten)
; Out:  eax = 1 (ExceptionContinueSearch)
;
; If the faulting Rip lies between the two labels, the function's frame is
; fully set up: the handler fetches the caller's rsp from the frame slot at
; [272+rsp], recovers rbx/rbp and xmm6-xmm15 from below it and writes them
; into the CONTEXT.  In every case it then restores Rsp/Rsi/Rdi in the
; CONTEXT, copies the CONTEXT to disp->ContextRecord and calls
; RtlVirtualUnwind to step out to the caller.
;
; CONTEXT / DISPATCHER_CONTEXT field offsets follow the Win64 layout.
; $L$in_prologue is also a jump target for avx2_handler.
;-----------------------------------------------------------------------
ALIGN	16
se_handler:
	push	rsi
	push	rdi
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	pushfq
	sub	rsp,64	; outgoing-argument area for the RtlVirtualUnwind call

	mov	rax,QWORD[120+r8]	; context->Rax (the prologue copies rsp into rax)
	mov	rbx,QWORD[248+r8]	; context->Rip

	mov	rsi,QWORD[8+r9]	; disp->ImageBase
	mov	r11,QWORD[56+r9]	; disp->HandlerData

	mov	r10d,DWORD[r11]	; HandlerData[0]: image-relative "body" label
	lea	r10,[r10*1+rsi]	; absolute address of the end of the prologue
	cmp	rbx,r10
	jb	NEAR $L$in_prologue	; Rip before body: frame not built yet

	mov	rax,QWORD[152+r8]	; context->Rsp

	mov	r10d,DWORD[4+r11]	; HandlerData[1]: image-relative epilogue label
	lea	r10,[r10*1+rsi]	; absolute address of the epilogue
	cmp	rbx,r10
	jae	NEAR $L$in_prologue	; Rip at/after epilogue: rsp already restored

	mov	rax,QWORD[272+rax]	; original rsp saved by the function at [272+rsp]

	mov	rbx,QWORD[((-8))+rax]	; rbx pushed by the function
	mov	rbp,QWORD[((-16))+rax]	; rbp pushed by the function
	mov	QWORD[144+r8],rbx	; context->Rbx
	mov	QWORD[160+r8],rbp	; context->Rbp

	lea	rsi,[((-24-160))+rax]	; xmm6-xmm15 save area (10*16 bytes)
	lea	rdi,[512+r8]	; &context->Xmm6
	mov	ecx,20	; 20 qwords = 10 xmm registers
	DD	0xa548f3fc	; bytes FC F3 48 A5 = cld; rep movsq

$L$in_prologue:
	; rax = rsp as it was on entry to the interrupted function
	mov	rdi,QWORD[8+rax]	; rdi stored by the WIN64 prologue at [8+rsp]
	mov	rsi,QWORD[16+rax]	; rsi stored by the WIN64 prologue at [16+rsp]
	mov	QWORD[152+r8],rax	; context->Rsp
	mov	QWORD[168+r8],rsi	; context->Rsi
	mov	QWORD[176+r8],rdi	; context->Rdi

	mov	rdi,QWORD[40+r9]	; disp->ContextRecord (copy destination)
	mov	rsi,r8	; patched CONTEXT (copy source)
	mov	ecx,154	; 154 qwords = 1232 bytes = sizeof(CONTEXT)
	DD	0xa548f3fc	; bytes FC F3 48 A5 = cld; rep movsq

	mov	rsi,r9	; rsi = disp; r8/r9 are reused for arguments below
	xor	rcx,rcx	; arg1 HandlerType = 0 (UNW_FLAG_NHANDLER)
	mov	rdx,QWORD[8+rsi]	; arg2 = disp->ImageBase
	mov	r8,QWORD[rsi]	; arg3 = disp->ControlPc
	mov	r9,QWORD[16+rsi]	; arg4 = disp->FunctionEntry
	mov	r10,QWORD[40+rsi]	; disp->ContextRecord
	lea	r11,[56+rsi]	; &disp->HandlerData
	lea	r12,[24+rsi]	; &disp->EstablisherFrame
	mov	QWORD[32+rsp],r10	; arg5 (stack args start above 32-byte shadow space)
	mov	QWORD[40+rsp],r11	; arg6
	mov	QWORD[48+rsp],r12	; arg7
	mov	QWORD[56+rsp],rcx	; arg8 = NULL
	call	QWORD[__imp_RtlVirtualUnwind]

	mov	eax,1	; ExceptionContinueSearch
	add	rsp,64
	popfq
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	rbp
	pop	rbx
	pop	rdi
	pop	rsi
	DB	0F3h,0C3h		;repret


;-----------------------------------------------------------------------
; avx2_handler - Win64 language-specific exception handler for the AVX2
; variant of sha1_multi_block (named by $L$SEH_info_sha1_multi_block_avx2).
;
; ABI:  Microsoft x64 language-specific handler
; In:   r8 = CONTEXT record of the interrupted frame
;       r9 = DISPATCHER_CONTEXT; its HandlerData (offset 56) points at two
;            image-relative labels from .xdata: [0] = end of prologue
;            ("body"), [1] = start of epilogue
; Out:  eax = 1, produced by the shared tail at $L$in_prologue, which lives
;       in se_handler and also pops everything pushed here
;
; If the faulting Rip lies between the two labels, the AVX2 frame is fully
; set up: the caller's rsp is read from the frame slot at [544+rsp] (the
; same slot the function epilogue reads), rbx/rbp/r12-r15 and xmm6-xmm15
; are recovered from below it and written into the CONTEXT.  Control then
; joins $L$in_prologue to restore Rsp/Rsi/Rdi and call RtlVirtualUnwind.
;
; CONTEXT / DISPATCHER_CONTEXT field offsets follow the Win64 layout.
;-----------------------------------------------------------------------
ALIGN	16
avx2_handler:
	push	rsi
	push	rdi
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	pushfq
	sub	rsp,64	; outgoing-argument area used by the shared tail

	mov	rax,QWORD[120+r8]	; context->Rax (the prologue copies rsp into rax)
	mov	rbx,QWORD[248+r8]	; context->Rip

	mov	rsi,QWORD[8+r9]	; disp->ImageBase
	mov	r11,QWORD[56+r9]	; disp->HandlerData

	mov	r10d,DWORD[r11]	; HandlerData[0]: image-relative "body" label
	lea	r10,[r10*1+rsi]	; absolute address of the end of the prologue
	cmp	rbx,r10
	jb	NEAR $L$in_prologue	; Rip before body: frame not built yet

	mov	rax,QWORD[152+r8]	; context->Rsp

	mov	r10d,DWORD[4+r11]	; HandlerData[1]: image-relative epilogue label
	lea	r10,[r10*1+rsi]	; absolute address of the epilogue
	cmp	rbx,r10
	jae	NEAR $L$in_prologue	; Rip at/after epilogue: rsp already restored

	; The original rsp lives in the interrupted function's stack frame at
	; [544+rsp], so it must be addressed through rax (= context->Rsp), not
	; through r8: offset 544 of the CONTEXT record is inside the XMM save
	; area that starts at 512 and does not hold a stack pointer.
	mov	rax,QWORD[544+rax]	; original rsp saved by the function

	mov	rbx,QWORD[((-8))+rax]	; callee-saved GPRs pushed by the function
	mov	rbp,QWORD[((-16))+rax]
	mov	r12,QWORD[((-24))+rax]
	mov	r13,QWORD[((-32))+rax]
	mov	r14,QWORD[((-40))+rax]
	mov	r15,QWORD[((-48))+rax]
	mov	QWORD[144+r8],rbx	; context->Rbx
	mov	QWORD[160+r8],rbp	; context->Rbp
	mov	QWORD[216+r8],r12	; context->R12
	mov	QWORD[224+r8],r13	; context->R13
	mov	QWORD[232+r8],r14	; context->R14
	mov	QWORD[240+r8],r15	; context->R15

	lea	rsi,[((-56-160))+rax]	; xmm6-xmm15 save area (10*16 bytes)
	lea	rdi,[512+r8]	; &context->Xmm6
	mov	ecx,20	; 20 qwords = 10 xmm registers
	DD	0xa548f3fc	; bytes FC F3 48 A5 = cld; rep movsq

	jmp	NEAR $L$in_prologue	; shared tail in se_handler

; .pdata: Win64 function table.  Each entry is three image-relative 32-bit
; addresses: function begin, function end, and the function's unwind-info
; record in .xdata.  There is one entry per sha1_multi_block code path.
section	.pdata rdata align=4
ALIGN	4
	; baseline sha1_multi_block path
	DD	$L$SEH_begin_sha1_multi_block wrt ..imagebase
	DD	$L$SEH_end_sha1_multi_block wrt ..imagebase
	DD	$L$SEH_info_sha1_multi_block wrt ..imagebase
	; shaext path
	DD	$L$SEH_begin_sha1_multi_block_shaext wrt ..imagebase
	DD	$L$SEH_end_sha1_multi_block_shaext wrt ..imagebase
	DD	$L$SEH_info_sha1_multi_block_shaext wrt ..imagebase
	; avx path
	DD	$L$SEH_begin_sha1_multi_block_avx wrt ..imagebase
	DD	$L$SEH_end_sha1_multi_block_avx wrt ..imagebase
	DD	$L$SEH_info_sha1_multi_block_avx wrt ..imagebase
	; avx2 path
	DD	$L$SEH_begin_sha1_multi_block_avx2 wrt ..imagebase
	DD	$L$SEH_end_sha1_multi_block_avx2 wrt ..imagebase
	DD	$L$SEH_info_sha1_multi_block_avx2 wrt ..imagebase
; .xdata: unwind-info records referenced from .pdata.  Every record has the
; same shape:
;   DB 9,0,0,0  - UNWIND_INFO header: byte 0 = version 1 with flag bit
;                 UNW_FLAG_EHANDLER (1 | 1<<3 = 9); prologue size, unwind-code
;                 count and frame register are all 0, so unwinding is done
;                 entirely by the handler
;   DD handler  - image-relative address of the language-specific handler
;   DD body,epilogue - handler data: the two image-relative labels the
;                 handler reads as HandlerData[0] and HandlerData[1]
section	.xdata rdata align=8
ALIGN	8
$L$SEH_info_sha1_multi_block:
DB	9,0,0,0
	DD	se_handler wrt ..imagebase
	DD	$L$body wrt ..imagebase,$L$epilogue wrt ..imagebase
$L$SEH_info_sha1_multi_block_shaext:
DB	9,0,0,0
	DD	se_handler wrt ..imagebase
	DD	$L$body_shaext wrt ..imagebase,$L$epilogue_shaext wrt ..imagebase
$L$SEH_info_sha1_multi_block_avx:
DB	9,0,0,0
	DD	se_handler wrt ..imagebase
	DD	$L$body_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
; The AVX2 path has a larger frame (six pushed GPRs, saved rsp at [544+rsp]),
; so it uses its own handler.
$L$SEH_info_sha1_multi_block_avx2:
DB	9,0,0,0
	DD	avx2_handler wrt ..imagebase
	DD	$L$body_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase

Kontol Shell Bypass