[Info-vax] x86-64 data alignment / faulting
Arne Vajhøj
arne at vajhoej.dk
Mon Mar 7 21:41:09 EST 2022
On 2/28/2022 1:55 PM, Simon Clubley wrote:
> On 2022-02-28, Simon Clubley <clubley at remove_me.eisner.decus.org-Earth.UFP> wrote:
>> On 2022-02-27, Arne Vajhøj <arne at vajhoej.dk> wrote:
>>>
>>> I modified the program to test with different data sizes, verified
>>> that the code was indeed working on unaligned addresses and
>>> tried both sequential and random access to array.
>>>
>>> I simply can't get a big difference between aligned and unaligned access.
>>>
>>
>> Have you looked at the generated code to verify that unaligned access
>> is really occurring ?
>>
>> Another possibility, depending on how smart the compiler is, is that
>> it could always do aligned access lookups and then only extract the
>> data it needs out of each lookup.
>>
>
> Also, the optimisation settings and level might make a difference.
> Try varying them to see if the behaviour changes, especially when
> you switch between optimising for time versus space (and yes, include
> -O0 in the list of things you try. :-)).
No alignment effect with default, -O0 or -O3.
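The unaligned addresses come from calling TEST with the array starting
a few bytes into a byte buffer. This is only a sketch of the idea, not
the exact driver (N and REP are example values, and gfortran may want
-fallow-argument-mismatch if TEST is compiled in the same source file,
because BUF(IX) is passed where a REAL*8 array is expected):

      PROGRAM DRIVER
C     BUF and X share storage via EQUIVALENCE, so BUF(IX) is the
C     address of X(1) plus IX-1 bytes.  TEST then stores REAL*8
C     values starting at that (possibly unaligned) address.
      INTEGER*4 N,REP
      PARAMETER (N=1000000,REP=100)
      INTEGER*1 BUF(8*(N+1))
      REAL*8 X(N+1)
      EQUIVALENCE (BUF,X)
      INTEGER*4 IX
      DO 10 IX=1,8
         CALL TEST(IX,BUF(IX),N,REP)
   10 CONTINUE
      END
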
> It's for reasons like this that I suggested at the start of this
> to look at the generated code, just in case the compiler is doing
> something you are not expecting.
I am not good enough at x86-64 assembler to tell whether it does
anything unexpected. The test subroutine:

      SUBROUTINE TEST(IX,X,N,REP)
      INTEGER*4 IX,N,REP
      REAL*8 X(*)
      INTEGER*4 I,T1,T2,DUMMY
      CALL SYSTEM_CLOCK(T1,DUMMY,DUMMY)
      DO 200 J=1,REP
         DO 100 I=1,N
            X(I)=I
  100    CONTINUE
  200 CONTINUE
      CALL SYSTEM_CLOCK(T2,DUMMY,DUMMY)
      WRITE(6,300) IX-1,T2-T1
      RETURN
  300 FORMAT(1X,'OFFSET ',I2,' : ',I6,' ms')
      END
becomes:
.seh_proc test_
test_:
pushq %r13
.seh_pushreg %r13
pushq %r12
.seh_pushreg %r12
pushq %rbp
.seh_pushreg %rbp
pushq %rdi
.seh_pushreg %rdi
pushq %rsi
.seh_pushreg %rsi
pushq %rbx
.seh_pushreg %rbx
subq $600, %rsp
.seh_stackalloc 600
.seh_endprologue
movq %r9, %rbp
movq %rcx, %rbx
movq %rdx, %rsi
movq %r8, %r12
leaq 40(%rsp), %rdx
leaq 36(%rsp), %rcx
leaq 44(%rsp), %r8
call _gfortran_system_clock_4
movl 0(%rbp), %r11d
movl 36(%rsp), %edi
testl %r11d, %r11d
jle .L2
movl (%r12), %r8d
testl %r8d, %r8d
jle .L2
movl %r8d, %eax
movl %r8d, %ebp
addl $1, %r11d
movl $1, %ecx
shrl $2, %eax
andl $-4, %ebp
movdqa .LC1(%rip), %xmm3
leal -1(%r8), %r12d
subl $1, %eax
leal 1(%rbp), %r13d
salq $5, %rax
leaq 32(%rsi,%rax), %rdx
.p2align 4,,10
.p2align 3
.L6:
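# body executed once per repetition of the outer (REP) loop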
cmpl $2, %r12d
jbe .L7
movdqa .LC0(%rip), %xmm1
movq %rsi, %rax
.p2align 4,,10
.p2align 3
.L4:
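# vectorized inner loop: four REAL*8 values (32 bytes) per iteration,
# stored with movups, the unaligned packed-store form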
movdqa %xmm1, %xmm0
addq $32, %rax
paddd %xmm3, %xmm1
cvtdq2pd %xmm0, %xmm2
pshufd $238, %xmm0, %xmm0
movups %xmm2, -32(%rax)
cvtdq2pd %xmm0, %xmm0
movups %xmm0, -16(%rax)
cmpq %rdx, %rax
jne .L4
movl %r13d, %eax
cmpl %ebp, %r8d
je .L5
.L3:
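# scalar tail: the last N mod 4 elements (or all of them when N < 4)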
pxor %xmm0, %xmm0
movslq %eax, %r9
leal 1(%rax), %r10d
cvtsi2sdl %eax, %xmm0
leaq (%rsi,%r9,8), %r9
movsd %xmm0, -8(%r9)
cmpl %r10d, %r8d
jl .L5
pxor %xmm0, %xmm0
addl $2, %eax
cvtsi2sdl %r10d, %xmm0
movsd %xmm0, (%r9)
cmpl %eax, %r8d
jl .L5
pxor %xmm0, %xmm0
cvtsi2sdl %eax, %xmm0
movsd %xmm0, 8(%r9)
.L5:
addl $1, %ecx
cmpl %r11d, %ecx
jne .L6
.L2:
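# second SYSTEM_CLOCK call, then the formatted WRITE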
leaq 52(%rsp), %rdx
leaq 56(%rsp), %r8
leaq 48(%rsp), %rcx
leaq 64(%rsp), %r12
call _gfortran_system_clock_4
leaq .LC2(%rip), %rax
movq %r12, %rcx
movl 48(%rsp), %esi
movq %rax, 72(%rsp)
leaq .LC3(%rip), %rax
leaq 60(%rsp), %r13
movq %rax, 144(%rsp)
movq .LC4(%rip), %rax
subl %edi, %esi
movl $12, 80(%rsp)
movq %rax, 64(%rsp)
movq $32, 152(%rsp)
call _gfortran_st_write
movl (%rbx), %eax
movq %r13, %rdx
movq %r12, %rcx
movl $4, %r8d
subl $1, %eax
movl %eax, 60(%rsp)
call _gfortran_transfer_integer_write
movq %r13, %rdx
movq %r12, %rcx
movl %esi, 60(%rsp)
movl $4, %r8d
call _gfortran_transfer_integer_write
movq %r12, %rcx
call _gfortran_st_write_done
nop
addq $600, %rsp
popq %rbx
popq %rsi
popq %rdi
popq %rbp
popq %r12
popq %r13
ret
.L7:
movl $1, %eax
jmp .L3
.seh_endproc
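
FWIW, the only stores into X that I can spot are the two movups
instructions in the vectorized loop and the movsd stores in the scalar
tail, and as far as I know neither form requires the address to be
aligned. So the generated code does not seem to assume alignment at
all - but I may be misreading it.
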
Arne