[Info-vax] x86-64 data alignment / faulting

Mon Mar 7 22:21:56 EST 2022

On Monday, March 7, 2022 at 9:41:17 PM UTC-5, Arne Vajhøj wrote:
> On 2/28/2022 1:55 PM, Simon Clubley wrote: 
> > On 2022-02-28, Simon Clubley <clubley at remove_me.eisner.decus.org-Earth.UFP> wrote: 
> >> On 2022-02-27, Arne Vajhøj <ar... at vajhoej.dk> wrote: 
> >>> 
> >>> I modified the program to test with different data sizes, verified 
> > >>> that the code was indeed working on unaligned addresses and 
> > >>> tried both sequential and random access to the array. 
> >>> 
> >>> I simply can't get a big difference between aligned and unaligned access. 
> >>> 
> >> 
> >> Have you looked at the generated code to verify that unaligned access 
> >> is really occurring? 
> >> 
> >> Another possibility, depending on how smart the compiler is, is that 
> >> it could always do aligned access lookups and then only extract the 
> >> data it needs out of each lookup. 
> >> 
> > 
> > Also, the optimisation settings and level might make a difference. 
> > Try varying them to see if the behaviour changes, especially when 
> > you switch between optimising for time versus space (and yes, include 
> > -O0 in the list of things you try. :-)).
> No alignment effect with any of default, -O0 or -O3.
> > It's for reasons like this that I suggested at the start of this thread 
> > to look at the generated code, just in case the compiler is doing 
> > something you are not expecting.
> I am not good enough at x86-64 assembler to see if it does 
> anything unexpected. 
> 
> SUBROUTINE TEST(IX,X,N,REP) 
> INTEGER*4 IX,N,REP 
> REAL*8 X(*) 
> INTEGER*4 I,T1,T2,DUMMY
> CALL SYSTEM_CLOCK(T1,DUMMY,DUMMY) 
> DO 200 J=1,REP
> DO 100 I=1,N
> X(I)=I 
> 100 CONTINUE 
> 200 CONTINUE
> CALL SYSTEM_CLOCK(T2,DUMMY,DUMMY) 
> WRITE(6,300) IX-1,T2-T1 
> RETURN 
> 300 FORMAT(1X,'OFFSET ',I2,' : ',I6,' ms') 
> END 
> 
> becomes: 
> 
> .seh_proc test_ 
> test_: 
> pushq %r13 
> .seh_pushreg %r13 
> pushq %r12 
> .seh_pushreg %r12 
> pushq %rbp 
> .seh_pushreg %rbp 
> pushq %rdi 
> .seh_pushreg %rdi 
> pushq %rsi 
> .seh_pushreg %rsi 
> pushq %rbx 
> .seh_pushreg %rbx 
> subq $600, %rsp 
> .seh_stackalloc 600 
> .seh_endprologue 
> movq %r9, %rbp 
> movq %rcx, %rbx 
> movq %rdx, %rsi 
> movq %r8, %r12 
> leaq 40(%rsp), %rdx 
> leaq 36(%rsp), %rcx 
> leaq 44(%rsp), %r8 
> call _gfortran_system_clock_4 
> movl 0(%rbp), %r11d 
> movl 36(%rsp), %edi 
> testl %r11d, %r11d 
> jle .L2 
> movl (%r12), %r8d 
> testl %r8d, %r8d 
> jle .L2 
> movl %r8d, %eax 
> movl %r8d, %ebp 
> addl $1, %r11d 
> movl $1, %ecx 
> shrl $2, %eax 
> andl $-4, %ebp 
> movdqa .LC1(%rip), %xmm3 
> leal -1(%r8), %r12d 
> subl $1, %eax 
> leal 1(%rbp), %r13d 
> salq $5, %rax 
> leaq 32(%rsi,%rax), %rdx 
> .p2align 4,,10 
> .p2align 3 
> .L6: 
> cmpl $2, %r12d 
> jbe .L7 
> movdqa .LC0(%rip), %xmm1 
> movq %rsi, %rax 
> .p2align 4,,10 
> .p2align 3 
> .L4: 
> movdqa %xmm1, %xmm0 
> addq $32, %rax 
> paddd %xmm3, %xmm1 
> cvtdq2pd %xmm0, %xmm2 
> pshufd $238, %xmm0, %xmm0 
> movups %xmm2, -32(%rax) 
> cvtdq2pd %xmm0, %xmm0 
> movups %xmm0, -16(%rax) 
> cmpq %rdx, %rax 
> jne .L4 
> movl %r13d, %eax 
> cmpl %ebp, %r8d 
> je .L5 
> .L3: 
> pxor %xmm0, %xmm0 
> movslq %eax, %r9 
> leal 1(%rax), %r10d 
> cvtsi2sdl %eax, %xmm0 
> leaq (%rsi,%r9,8), %r9 
> movsd %xmm0, -8(%r9) 
> cmpl %r10d, %r8d 
> jl .L5 
> pxor %xmm0, %xmm0 
> addl $2, %eax 
> cvtsi2sdl %r10d, %xmm0 
> movsd %xmm0, (%r9) 
> cmpl %eax, %r8d 
> jl .L5 
> pxor %xmm0, %xmm0 
> cvtsi2sdl %eax, %xmm0 
> movsd %xmm0, 8(%r9) 
> .L5: 
> addl $1, %ecx 
> cmpl %r11d, %ecx 
> jne .L6 
> .L2: 
> leaq 52(%rsp), %rdx 
> leaq 56(%rsp), %r8 
> leaq 48(%rsp), %rcx 
> leaq 64(%rsp), %r12 
> call _gfortran_system_clock_4 
> leaq .LC2(%rip), %rax 
> movq %r12, %rcx 
> movl 48(%rsp), %esi 
> movq %rax, 72(%rsp) 
> leaq .LC3(%rip), %rax 
> leaq 60(%rsp), %r13 
> movq %rax, 144(%rsp) 
> movq .LC4(%rip), %rax 
> subl %edi, %esi 
> movl $12, 80(%rsp) 
> movq %rax, 64(%rsp) 
> movq $32, 152(%rsp) 
> call _gfortran_st_write 
> movl (%rbx), %eax 
> movq %r13, %rdx 
> movq %r12, %rcx 
> movl $4, %r8d 
> subl $1, %eax 
> movl %eax, 60(%rsp) 
> call _gfortran_transfer_integer_write 
> movq %r13, %rdx 
> movq %r12, %rcx 
> movl %esi, 60(%rsp) 
> movl $4, %r8d 
> call _gfortran_transfer_integer_write 
> movq %r12, %rcx 
> call _gfortran_st_write_done 
> nop 
> addq $600, %rsp 
> popq %rbx 
> popq %rsi 
> popq %rdi 
> popq %rbp 
> popq %r12 
> popq %r13 
> ret 
> .L7: 
> movl $1, %eax 
> jmp .L3 
> .seh_endproc 
> 
> Arne
Looks to me like the memory references are longwords and the required portion is obtained via shifting.  You won't see alignment issues here.

Dan