[Openmcl-cvs-notifications] r15081 - /trunk/source/level-0/ARM/arm-misc.lisp
gb at clozure.com
gb at clozure.com
Sat Nov 19 16:30:43 CST 2011
Author: gb
Date: Sat Nov 19 16:30:43 2011
New Revision: 15081
Log:
Hopefully faster copying between pointers and ivectors/ivectors and
ivectors. If both source and destination are aligned on 32-bit
boundaries, we can use FLDM/FSTM instructions to load/store several
words at a time.
Modified:
trunk/source/level-0/ARM/arm-misc.lisp
Modified: trunk/source/level-0/ARM/arm-misc.lisp
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D
--- trunk/source/level-0/ARM/arm-misc.lisp (original)
+++ trunk/source/level-0/ARM/arm-misc.lisp Sat Nov 19 16:30:43 2011
@@ -19,6 +19,7 @@
=
(in-package "CCL")
=
+ =
;;; Copy N bytes from pointer src, starting at byte offset src-offset,
;;; to ivector dest, starting at offset dest-offset.
;;; It's fine to leave this in lap.
@@ -26,11 +27,25 @@
;;; a byte at a time.
;;; Does no arg checking of any kind. Really.
=
-(defarmlapfunction %copy-ptr-to-ivector ((src (* 1 arm::node-size) )
- (src-byte-offset 0) =
- (dest arg_x)
- (dest-byte-offset arg_y)
- (nbytes arg_z))
+
+(defun %copy-ptr-to-ivector (src src-byte-offset dest dest-byte-offset nby=
tes)
+ (declare (fixnum src-byte-offset dest-byte-offset nbytes)
+ (optimize (speed 3) (safety 0)))
+ (let* ((ptr-align (logand 7 (%ptr-to-int src))))
+ (declare (type (mod 8) ptr-align))
+ (if (and (>=3D nbytes 32)
+ (=3D 0 (logand nbytes 3))
+ (=3D 0 (logand dest-byte-offset 3))
+ (=3D 0 (logand (the fixnum (+ ptr-align src-byte-offset)) 3)))
+ (%copy-ptr-to-ivector-32bit src src-byte-offset dest dest-byte-offse=
t nbytes)
+ (%copy-ptr-to-ivector-8bit src src-byte-offset dest dest-byte-offset=
nbytes))
+ dest))
+ =
+(defarmlapfunction %copy-ptr-to-ivector-8bit ((src (* 1 arm::node-size) )
+ (src-byte-offset 0) =
+ (dest arg_x)
+ (dest-byte-offset arg_y)
+ (nbytes arg_z))
(let ((src-reg imm0)
(src-byteptr temp2)
(src-node-reg temp0)
@@ -56,11 +71,91 @@
(add vsp vsp '2)
(bx lr)))
=
-(defarmlapfunction %copy-ivector-to-ptr ((src (* 1 arm::node-size))
- (src-byte-offset 0)
- (dest arg_x)
- (dest-byte-offset arg_y)
- (nbytes arg_z))
+;;; Everything's aligned OK and NBYTES is a multiple of 4.
+(defarmlapfunction %copy-ptr-to-ivector-32bit ((src (* 1 arm::node-size) )
+ (src-byte-offset 0) =
+ (dest arg_x)
+ (dest-byte-offset arg_y)
+ (nbytes arg_z))
+ (add imm1 vsp (:$ (* 2 arm::node-size)))
+ (build-lisp-frame imm0 imm1)
+ (add lr dest (:$ arm::misc-data-offset))
+ (add lr lr (:asr dest-byte-offset (:$ arm::fixnumshift)))
+ (ldr temp0 (:@ vsp (:$ src)))
+ (ldr imm1 (:@ vsp (:$ src-byte-offset)))
+ (macptr-ptr imm0 temp0)
+ (add imm0 imm0 (:asr imm1 (:$ arm::fixnumshift)))
+ (b @test32)
+ @loop32
+ (fldmias s0 (:! imm0) 8)
+ (fstmias s0 (:! lr) 8)
+ (sub nbytes nbytes '32)
+ @test32
+ (cmp nbytes '32)
+ (bge @loop32)
+ (add pc pc (:asr nbytes (:$ arm::fixnumshift)))
+ (nop)
+ (b @0)
+ (b @4)
+ (b @8)
+ (b @12)
+ (b @16)
+ (b @20)
+ (b @24)
+ (b @28)
+ (nop)
+ @0
+ (mov arg_z dest)
+ (restore-lisp-frame imm0)
+ (bx lr)
+ @4
+ (flds s0 (:@ imm0 (:$ 0)))
+ (fsts s0 (:@ lr (:$ 0)))
+ (b @0)
+ @8
+ (fldmias s0 imm0 2)
+ (fstmias s0 lr 2)
+ (b @0)
+ @12
+ (fldmias s0 imm0 3)
+ (fstmias s0 lr 3)
+ (b @0)
+ @16
+ (fldmias s0 imm0 4)
+ (fstmias s0 lr 4)
+ (b @0)
+ @20
+ (fldmias s0 imm0 5)
+ (fstmias s0 lr 5)
+ (b @0)
+ @24
+ (fldmias s0 imm0 6)
+ (fstmias s0 lr 6)
+ (b @0)
+ @28
+ (fldmias s0 imm0 7)
+ (fstmias s0 lr 7)
+ (b @0))
+ =
+
+(defun %copy-ivector-to-ptr (src src-byte-offset dest dest-byte-offset nby=
tes)
+ (declare (fixnum src-byte-offset dest-byte-offset nbytes)
+ (optimize (speed 3) (safety 0)))
+ (let* ((ptr-align (logand (the (unsigned-byte 32)(%ptr-to-int dest)) 7)))
+ (declare (type (mod 8) ptr-align))
+ (if (or (< nbytes 32)
+ (not (=3D 0 (logand nbytes 3)))
+ (not (=3D 0 (logand src-byte-offset 3)))
+ (not (=3D 0 (logand (the fixnum (+ ptr-align dest-byte-offset)=
) 3))))
+ (%copy-ivector-to-ptr-8bit src src-byte-offset dest dest-byte-offset=
nbytes)
+ (%copy-ivector-to-ptr-32bit src src-byte-offset dest dest-byte-offse=
t nbytes))
+ dest))
+
+(defarmlapfunction %copy-ivector-to-ptr-8bit ((src (* 1 arm::node-size))
+ (src-byte-offset 0)
+ (dest arg_x)
+ (dest-byte-offset arg_y)
+ (nbytes arg_z))
(ldr temp0 (:@ vsp (:$ src)))
(cmp nbytes (:$ 0))
(ldr imm0 (:@ vsp (:$ src-byte-offset)))
@@ -80,78 +175,360 @@
(add vsp vsp '2)
(bx lr))
=
-(defarmlapfunction %copy-ivector-to-ivector ((src 4) =
- (src-byte-offset 0) =
- (dest arg_x)
- (dest-byte-offset arg_y)
- (nbytes arg_z))
+;;; Everything's aligned OK and NBYTES is a multiple of 4.
+(defarmlapfunction %copy-ivector-to-ptr-32bit ((src (* 1 arm::node-size) )
+ (src-byte-offset 0) =
+ (dest arg_x)
+ (dest-byte-offset arg_y)
+ (nbytes arg_z))
+ (add imm1 vsp (:$ (* 2 arm::node-size)))
+ (build-lisp-frame imm0 imm1)
+ (ldr temp0 (:@ vsp (:$ src)))
+ (ldr imm1 (:@ vsp (:$ src-byte-offset)))
+ (add lr temp0 (:$ arm::misc-data-offset))
+ (add lr lr (:asr imm1 (:$ arm::fixnumshift)))
+ (macptr-ptr imm0 dest)
+ (add imm0 imm0 (:asr dest-byte-offset (:$ arm::fixnumshift)))
+ (b @test32)
+ @loop32
+ (fldmias s0 (:! lr) 8)
+ (fstmias s0 (:! imm0) 8)
+ (sub nbytes nbytes '32)
+ @test32
+ (cmp nbytes '32)
+ (bge @loop32)
+ (add pc pc (:asr nbytes (:$ arm::fixnumshift)))
+ (nop)
+ (b @0)
+ (b @4)
+ (b @8)
+ (b @12)
+ (b @16)
+ (b @20)
+ (b @24)
+ (b @28)
+ (nop)
+ @0
+ (mov arg_z dest)
+ (restore-lisp-frame imm0)
+ (bx lr)
+ @4
+ (flds s0 (:@ lr (:$ 0)))
+ (fsts s0 (:@ imm0 (:$ 0)))
+ (b @0)
+ @8
+ (fldmias s0 lr 2)
+ (fstmias s0 imm0 2)
+ (b @0)
+ @12
+ (fldmias s0 lr 3)
+ (fstmias s0 imm0 3)
+ (b @0)
+ @16
+ (fldmias s0 lr 4)
+ (fstmias s0 imm0 4)
+ (b @0)
+ @20
+ (fldmias s0 lr 5)
+ (fstmias s0 imm0 5)
+ (b @0)
+ @24
+ (fldmias s0 lr 6)
+ (fstmias s0 imm0 6)
+ (b @0)
+ @28
+ (fldmias s0 lr 7)
+ (fstmias s0 imm0 7)
+ (b @0))
+
+
+(defun %copy-ivector-to-ivector (src src-byte-offset dest dest-byte-offset=
nbytes)
+ (declare (fixnum src-byte-offset dest-byte-offset nbytes))
+ (if (or (not (eq src dest))
+ (< dest-byte-offset src-byte-offset)
+ (>=3D dest-byte-offset (the fixnum (+ src-byte-offset nbytes))))
+ (%copy-ivector-to-ivector-postincrement src src-byte-offset dest dest-=
byte-offset nbytes)
+ (if (and (eq src dest)
+ (eql src-byte-offset dest-byte-offset))
+ dest
+ (%copy-ivector-to-ivector-predecrement src
+ (the fixnum (+ src-byte-offse=
t nbytes))
+ dest
+ (the fixnum (+ dest-byte-offs=
et nbytes))
+ nbytes)))
+ dest)
+
+(defun %copy-ivector-to-ivector-postincrement (src src-byte-offset dest de=
st-byte-offset nbytes)
+ (declare (fixnum src-byte-offset dest-byte-offset nbytes))
+ =
+ (cond ((or (< nbytes 8)
+ (not (=3D (logand src-byte-offset 3)
+ (logand dest-byte-offset 3))))
+ (%copy-ivector-to-ivector-postincrement-8bit src src-byte-offset =
dest dest-byte-offset nbytes))
+ (t
+ (let* ((prefix-size (- 4 (logand src-byte-offset 3))))
+ (declare (fixnum prefix-size))
+ (unless (=3D 4 prefix-size)
+ (%copy-ivector-to-ivector-postincrement-8bit src src-byte-off=
set dest dest-byte-offset prefix-size)
+ (incf src-byte-offset prefix-size)
+ (incf dest-byte-offset prefix-size)
+ (decf nbytes prefix-size)))
+ (let* ((tail-size (logand nbytes 3))
+ (fullword-size (- nbytes tail-size)))
+ (declare (fixnum tail-size fullword-size))
+ (unless (zerop fullword-size)
+ (%copy-ivector-to-ivector-postincrement-32bit src src-byte-of=
fset dest dest-byte-offset fullword-size))
+ (unless (zerop tail-size)
+ (%copy-ivector-to-ivector-postincrement-8bit src (the fixnum =
(+ src-byte-offset fullword-size)) dest (the fixnum (+ dest-byte-offset ful=
lword-size)) tail-size))))))
+
+(defun %copy-ivector-to-ivector-predecrement (src src-byte-offset dest des=
t-byte-offset nbytes)
+ (declare (fixnum src-byte-offset dest-byte-offset nbytes))
+ (cond ((or (< nbytes 8)
+ (not (=3D (logand src-byte-offset 3)
+ (logand dest-byte-offset 3))))
+ (%copy-ivector-to-ivector-predecrement-8bit src src-byte-offset d=
est dest-byte-offset nbytes))
+ (t
+ (let* ((suffix-size (logand src-byte-offset 3)))
+ (declare (fixnum suffix-size))
+ (unless (zerop suffix-size)
+ (%copy-ivector-to-ivector-predecrement-8bit src src-byte-offset =
dest dest-byte-offset suffix-size)
+ (decf src-byte-offset suffix-size)
+ (decf dest-byte-offset suffix-size)
+ (decf nbytes suffix-size)))
+ (let* ((head-size (logand nbytes 3))
+ (fullword-size (- nbytes head-size)))
+ (declare (fixnum head-size fullword-size))
+ (unless (zerop fullword-size)
+ (%copy-ivector-to-ivector-predecrement-32bit src src-byte-offset=
dest dest-byte-offset fullword-size))
+ (unless (zerop head-size)
+ (%copy-ivector-to-ivector-predecrement-8bit src (the fixnum (- s=
rc-byte-offset fullword-size)) dest (the fixnum (- dest-byte-offset fullwor=
d-size)) head-size))))
+))
+
+(defarmlapfunction %copy-ivector-to-ivector-postincrement-8bit ((src 4) =
+ (src-byte-=
offset 0) =
+ (dest arg_=
x)
+ (dest-byte=
-offset arg_y)
+ (nbytes ar=
g_z))
(let ((rsrc temp0)
(scaled-src-idx imm1)
(scaled-dest-idx imm2)
- (val imm0)
- (nwords dest-byte-offset))
+ (val imm0))
(cmp nbytes (:$ 0))
(vpop1 scaled-src-idx)
(mov scaled-src-idx (:lsr scaled-src-idx (:$ arm::fixnumshift)))
- (mov val scaled-src-idx)
(add scaled-src-idx scaled-src-idx (:$ arm::misc-data-offset))
+ (mov scaled-dest-idx (:lsr dest-byte-offset (:$ arm::fixnumshift)))
+ (add scaled-dest-idx scaled-dest-idx (:$ arm::misc-data-offset)) =
(vpop1 rsrc)
- (beq @done)
- (cmp rsrc dest)
- (mov scaled-dest-idx (:lsr dest-byte-offset (:$ arm::fixnumshift)))
- (orr val val scaled-dest-idx)
- (add scaled-dest-idx scaled-dest-idx (:$ arm::misc-data-offset))
- (beq @SisD)
- @fwd
- (tst val (:$ 3))
- (bne @loop)
- ;; src and dest offsets are word-aligned. Copy words.
- (b @wtest)
- @words ; source and dest different - word=
s =
- (sub nbytes nbytes '4) =
- (ldr val (:@ rsrc scaled-src-idx))
- (add scaled-src-idx scaled-src-idx '1)
- (str val (:@ dest scaled-dest-idx))
- (add scaled-dest-idx scaled-dest-idx '1)
- @wtest
- (cmp nbytes '4)
- (bge @words)
- (cmp nbytes '0)
(b @test)
@loop
(subs nbytes nbytes '1)
- (ldrb val (:@ temp0 scaled-src-idx))
+ (ldrb val (:@ rsrc scaled-src-idx))
(add scaled-src-idx scaled-src-idx (:$ 1))
(strb val (:@ dest scaled-dest-idx))
(add scaled-dest-idx scaled-dest-idx (:$ 1))
@test
- (bne @loop)
- @done
+ (bne @loop)
(mov arg_z dest)
- (bx lr)
-
- @SisD
- (cmp scaled-src-idx scaled-dest-idx) ; cmp src and dest
- (beq @done)
- (bgt @fwd)
-
- =
- ;; Copy backwards when src & dest are the same and we're sliding down
- @bwd
- (add scaled-src-idx scaled-src-idx (:lsr nbytes (:$ arm::fixnumshift)))
- (add scaled-dest-idx scaled-dest-idx (:lsr nbytes (:$ arm::fixnumshift=
)))
- @loop2
+ (bx lr)))
+
+(defarmlapfunction %copy-ivector-to-ivector-postincrement-32bit ((src 4) =
+ (src-byte=
-offset 0) =
+ (dest arg=
_x)
+ (dest-byt=
e-offset arg_y)
+ (nbytes a=
rg_z))
+ (let ((rsrc temp0)
+ (scaled-src-idx imm1)
+ (scaled-dest-idx imm2)
+ (val imm0))
+ (cmp nbytes '32)
+ (vpop1 scaled-src-idx)
+ (mov scaled-src-idx (:lsr scaled-src-idx (:$ arm::fixnumshift)))
+ (add scaled-src-idx scaled-src-idx (:$ arm::misc-data-offset))
+ (mov scaled-dest-idx (:lsr dest-byte-offset (:$ arm::fixnumshift)))
+ (add scaled-dest-idx scaled-dest-idx (:$ arm::misc-data-offset)) =
+ (vpop1 rsrc)
+ (build-lisp-frame imm0) =
+ (b @test)
+ @loop
+ (sub nbytes nbytes '32)
+ (cmp nbytes '32)
+ (add lr rsrc scaled-src-idx)
+ (fldmias s0 lr 8)
+ (add scaled-src-idx scaled-src-idx (:$ 32))
+ (add lr dest scaled-dest-idx)
+ (fstmias s0 lr 8)
+ (add scaled-dest-idx scaled-dest-idx (:$ 32))
+ @test
+ (bge @loop)
+ (add pc pc (:asr nbytes (:$ arm::fixnumshift)))
+ (nop)
+ (b @0)
+ (b @4)
+ (b @8)
+ (b @12)
+ (b @16)
+ (b @20)
+ (b @24)
+ (b @28)
+ (nop)
+ @4
+ (ldr val (:@ rsrc scaled-src-idx))
+ (str val (:@ dest scaled-dest-idx))
+ (b @0)
+ @8
+ (add lr rsrc scaled-src-idx)
+ (fldmias s0 lr 2)
+ (add lr dest scaled-dest-idx)
+ (fstmias s0 lr 2)
+ (b @0)
+ @12
+ (add lr rsrc scaled-src-idx)
+ (fldmias s0 lr 3)
+ (add lr dest scaled-dest-idx)
+ (fstmias s0 lr 3)
+ (b @0)
+ @16
+ (add lr rsrc scaled-src-idx)
+ (fldmias s0 lr 4)
+ (add lr dest scaled-dest-idx)
+ (fstmias s0 lr 4)
+ (b @0)
+ @20
+ (add lr rsrc scaled-src-idx)
+ (fldmias s0 lr 5)
+ (add lr dest scaled-dest-idx)
+ (fstmias s0 lr 5)
+ (b @0)
+ @24
+ (add lr rsrc scaled-src-idx)
+ (fldmias s0 lr 6)
+ (add lr dest scaled-dest-idx)
+ (fstmias s0 lr 6)
+ (b @0)
+ @28
+ (add lr rsrc scaled-src-idx)
+ (fldmias s0 lr 7)
+ (add lr dest scaled-dest-idx)
+ (fstmias s0 lr 7)
+ @0
+ (mov arg_z dest)
+ (restore-lisp-frame imm0)
+ (bx lr)))
+
+(defarmlapfunction %copy-ivector-to-ivector-predecrement-8bit ((src 4) =
+ (src-byte-o=
ffset 0) =
+ (dest arg_x)
+ (dest-byte-=
offset arg_y)
+ (nbytes arg=
_z))
+ (let ((rsrc temp0)
+ (scaled-src-idx imm1)
+ (scaled-dest-idx imm2)
+ (val imm0))
+ (cmp nbytes (:$ 0))
+ (vpop1 scaled-src-idx)
+ (mov scaled-src-idx (:lsr scaled-src-idx (:$ arm::fixnumshift)))
+ (add scaled-src-idx scaled-src-idx (:$ arm::misc-data-offset))
+ (mov scaled-dest-idx (:lsr dest-byte-offset (:$ arm::fixnumshift)))
+ (add scaled-dest-idx scaled-dest-idx (:$ arm::misc-data-offset)) =
+ (vpop1 rsrc)
+ (b @test)
+ @loop
(sub scaled-src-idx scaled-src-idx (:$ 1))
(sub scaled-dest-idx scaled-dest-idx (:$ 1))
(subs nbytes nbytes '1)
(ldrb val (:@ rsrc scaled-src-idx))
(strb val (:@ dest scaled-dest-idx))
- @test2
- (bne @loop2)
- (b @done)))
-
-
- =
+ @test
+ (bne @loop)
+ (mov arg_z dest)
+ (bx lr)))
+
+(defarmlapfunction %copy-ivector-to-ivector-predecrement-32bit ((src 4) =
+ (src-byte-=
offset 0) =
+ (dest arg_=
x)
+ (dest-byte=
-offset arg_y)
+ (nbytes ar=
g_z))
+ (let ((rsrc temp0)
+ (scaled-src-idx imm1)
+ (scaled-dest-idx imm2)
+ (val imm0))
+ (cmp nbytes (:$ 32))
+ (vpop1 scaled-src-idx)
+ (mov scaled-src-idx (:lsr scaled-src-idx (:$ arm::fixnumshift)))
+ (add scaled-src-idx scaled-src-idx (:$ arm::misc-data-offset))
+ (mov scaled-dest-idx (:lsr dest-byte-offset (:$ arm::fixnumshift)))
+ (add scaled-dest-idx scaled-dest-idx (:$ arm::misc-data-offset)) =
+ (vpop1 rsrc)
+ (build-lisp-frame imm0) =
+ (b @test)
+ @loop
+ (sub scaled-src-idx scaled-src-idx (:$ 32))
+ (sub scaled-dest-idx scaled-dest-idx (:$ 32))
+ (sub nbytes nbytes '32)
+ (cmp nbytes '32)
+ (add lr rsrc scaled-src-idx)
+ (fldmias s0 lr 8)
+ (add lr dest scaled-dest-idx)
+ (fstmias s0 lr 8)
+ @test
+ (bge @loop)
+ (sub scaled-src-idx scaled-src-idx (:asr nbytes (:$ arm::fixnumshift)))
+ (sub scaled-dest-idx scaled-dest-idx (:asr nbytes (:$ arm::fixnumshift=
)))
+ (add pc pc (:asr nbytes (:$ arm::fixnumshift)))
+ (nop)
+ (b @0)
+ (b @4)
+ (b @8)
+ (b @12)
+ (b @16)
+ (b @20)
+ (b @24)
+ (b @28)
+ (nop)
+ @4
+ (ldr val (:@ rsrc scaled-src-idx))
+ (str val (:@ dest scaled-dest-idx))
+ (b @0)
+ @8
+ (add lr rsrc scaled-src-idx)
+ (fldmias s0 lr 2)
+ (add lr dest scaled-dest-idx)
+ (fstmias s0 lr 2)
+ (b @0)
+ @12
+ (add lr rsrc scaled-src-idx)
+ (fldmias s0 lr 3)
+ (add lr dest scaled-dest-idx)
+ (fstmias s0 lr 3)
+ (b @0)
+ @16
+ (add lr rsrc scaled-src-idx)
+ (fldmias s0 lr 4)
+ (add lr dest scaled-dest-idx)
+ (fstmias s0 lr 4)
+ (b @0)
+ @20
+ (add lr rsrc scaled-src-idx)
+ (fldmias s0 lr 5)
+ (add lr dest scaled-dest-idx)
+ (fstmias s0 lr 5)
+ (b @0)
+ @24
+ (add lr rsrc scaled-src-idx)
+ (fldmias s0 lr 6)
+ (add lr dest scaled-dest-idx)
+ (fstmias s0 lr 6)
+ (b @0)
+ @28
+ (add lr rsrc scaled-src-idx)
+ (fldmias s0 lr 7)
+ (add lr dest scaled-dest-idx)
+ (fstmias s0 lr 7)
+ @0
+ (mov arg_z dest)
+ (restore-lisp-frame imm0)
+ (bx lr)))
=
(defarmlapfunction %copy-gvector-to-gvector ((src (* 1 arm::node-size))
(src-element 0)
More information about the Openmcl-cvs-notifications
mailing list