[Info-vax] x86-64 data alignment / faulting
abrsvc
dansabrservices at yahoo.com
Mon Mar 7 22:21:56 EST 2022
On Monday, March 7, 2022 at 9:41:17 PM UTC-5, Arne Vajhøj wrote:
> On 2/28/2022 1:55 PM, Simon Clubley wrote:
> > On 2022-02-28, Simon Clubley <clubley at remove_me.eisner.decus.org-Earth.UFP> wrote:
> >> On 2022-02-27, Arne Vajhøj <ar... at vajhoej.dk> wrote:
> >>>
> >>> I modified the program to test with different data sizes, verified
> >>> that the code was indeed working on unaligned addresses and
> >>> tried both sequential and random access to array.
> >>>
> >>> I simply can't get a big difference between aligned and unaligned access.
> >>>
> >>
> >> Have you looked at the generated code to verify that unaligned access
> >> is really occurring?
> >>
> >> Another possibility, depending on how smart the compiler is, is that
> >> it could always do aligned access lookups and then only extract the
> >> data it needs out of each lookup.
> >>
> >
> > Also, the optimisation settings and level might make a difference.
> > Try varying them to see if the behaviour changes, especially when
> > you switch between optimising for time versus space (and yes, include
> > -O0 in the list of things you try. :-)).
> No alignment effect with default, -O0 or -O3.
> > It's for reasons like this that I suggested at the start of this
> > to look at the generated code, just in case the compiler is doing
> > something you are not expecting.
> I am not good enough at x86-64 assembler to see if it does
> anything unexpected.
>
> SUBROUTINE TEST(IX,X,N,REP)
> INTEGER*4 IX,N,REP
> REAL*8 X(*)
> INTEGER*4 I,T1,T2,DUMMY
> CALL SYSTEM_CLOCK(T1,DUMMY,DUMMY)
> DO 200 J=1,REP
> DO 100 I=1,N
> X(I)=I
> 100 CONTINUE
> 200 CONTINUE
> CALL SYSTEM_CLOCK(T2,DUMMY,DUMMY)
> WRITE(6,300) IX-1,T2-T1
> RETURN
> 300 FORMAT(1X,'OFFSET ',I2,' : ',I6,' ms')
> END
>
> becomes:
>
> .seh_proc test_
> test_:
> pushq %r13
> .seh_pushreg %r13
> pushq %r12
> .seh_pushreg %r12
> pushq %rbp
> .seh_pushreg %rbp
> pushq %rdi
> .seh_pushreg %rdi
> pushq %rsi
> .seh_pushreg %rsi
> pushq %rbx
> .seh_pushreg %rbx
> subq $600, %rsp
> .seh_stackalloc 600
> .seh_endprologue
> movq %r9, %rbp
> movq %rcx, %rbx
> movq %rdx, %rsi
> movq %r8, %r12
> leaq 40(%rsp), %rdx
> leaq 36(%rsp), %rcx
> leaq 44(%rsp), %r8
> call _gfortran_system_clock_4
> movl 0(%rbp), %r11d
> movl 36(%rsp), %edi
> testl %r11d, %r11d
> jle .L2
> movl (%r12), %r8d
> testl %r8d, %r8d
> jle .L2
> movl %r8d, %eax
> movl %r8d, %ebp
> addl $1, %r11d
> movl $1, %ecx
> shrl $2, %eax
> andl $-4, %ebp
> movdqa .LC1(%rip), %xmm3
> leal -1(%r8), %r12d
> subl $1, %eax
> leal 1(%rbp), %r13d
> salq $5, %rax
> leaq 32(%rsi,%rax), %rdx
> .p2align 4,,10
> .p2align 3
> .L6:
> cmpl $2, %r12d
> jbe .L7
> movdqa .LC0(%rip), %xmm1
> movq %rsi, %rax
> .p2align 4,,10
> .p2align 3
> .L4:
> movdqa %xmm1, %xmm0
> addq $32, %rax
> paddd %xmm3, %xmm1
> cvtdq2pd %xmm0, %xmm2
> pshufd $238, %xmm0, %xmm0
> movups %xmm2, -32(%rax)
> cvtdq2pd %xmm0, %xmm0
> movups %xmm0, -16(%rax)
> cmpq %rdx, %rax
> jne .L4
> movl %r13d, %eax
> cmpl %ebp, %r8d
> je .L5
> .L3:
> pxor %xmm0, %xmm0
> movslq %eax, %r9
> leal 1(%rax), %r10d
> cvtsi2sdl %eax, %xmm0
> leaq (%rsi,%r9,8), %r9
> movsd %xmm0, -8(%r9)
> cmpl %r10d, %r8d
> jl .L5
> pxor %xmm0, %xmm0
> addl $2, %eax
> cvtsi2sdl %r10d, %xmm0
> movsd %xmm0, (%r9)
> cmpl %eax, %r8d
> jl .L5
> pxor %xmm0, %xmm0
> cvtsi2sdl %eax, %xmm0
> movsd %xmm0, 8(%r9)
> .L5:
> addl $1, %ecx
> cmpl %r11d, %ecx
> jne .L6
> .L2:
> leaq 52(%rsp), %rdx
> leaq 56(%rsp), %r8
> leaq 48(%rsp), %rcx
> leaq 64(%rsp), %r12
> call _gfortran_system_clock_4
> leaq .LC2(%rip), %rax
> movq %r12, %rcx
> movl 48(%rsp), %esi
> movq %rax, 72(%rsp)
> leaq .LC3(%rip), %rax
> leaq 60(%rsp), %r13
> movq %rax, 144(%rsp)
> movq .LC4(%rip), %rax
> subl %edi, %esi
> movl $12, 80(%rsp)
> movq %rax, 64(%rsp)
> movq $32, 152(%rsp)
> call _gfortran_st_write
> movl (%rbx), %eax
> movq %r13, %rdx
> movq %r12, %rcx
> movl $4, %r8d
> subl $1, %eax
> movl %eax, 60(%rsp)
> call _gfortran_transfer_integer_write
> movq %r13, %rdx
> movq %r12, %rcx
> movl %esi, 60(%rsp)
> movl $4, %r8d
> call _gfortran_transfer_integer_write
> movq %r12, %rcx
> call _gfortran_st_write_done
> nop
> addq $600, %rsp
> popq %rbx
> popq %rsi
> popq %rdi
> popq %rbp
> popq %r12
> popq %r13
> ret
> .L7:
> movl $1, %eax
> jmp .L3
> .seh_endproc
>
> Arne
Looks to me like the memory references are longwords and the required portion is obtained via shifting. You won't see alignment issues here.
Dan
More information about the Info-vax mailing list