居然没有人反汇编来吐槽一下?
用 objdump 反汇编 libgeekbench-jni.so,以 Mandelbrot 测试项目的 worker 函数(符号 _ZN10Mandelbrot6workerEi,即 Mandelbrot::worker(int))为例:
1. x86(32位,AT&T 语法):
- 000ccb2a <_ZN10Mandelbrot6workerEi>:
- ccb2a: 55 push %ebp
- ccb2b: 89 e5 mov %esp,%ebp
- ccb2d: 57 push %edi
- ccb2e: 56 push %esi
- ccb2f: 53 push %ebx
- ccb30: 83 ec 3c sub $0x3c,%esp
- ccb33: 8b 75 08 mov 0x8(%ebp),%esi
- ccb36: e8 86 91 f6 ff call 35cc1 <pthread_join@plt+0x109>
- ccb3b: 81 c3 01 34 17 00 add $0x173401,%ebx
- ccb41: c7 45 e0 00 00 00 00 movl $0x0,-0x20(%ebp)
- ccb48: c7 45 e4 00 00 00 00 movl $0x0,-0x1c(%ebp)
- ccb4f: 8b 46 38 mov 0x38(%esi),%eax
- ccb52: 8b 4e 3c mov 0x3c(%esi),%ecx
- ccb55: f2 0f 10 bb 4c af f8 movsd -0x750b4(%ebx),%xmm7
- ccb5c: ff
- ccb5d: f2 0f 10 b3 e4 e6 f8 movsd -0x7191c(%ebx),%xmm6
- ccb64: ff
- ccb65: f2 0f 2a c0 cvtsi2sd %eax,%xmm0
- ccb69: 89 45 d8 mov %eax,-0x28(%ebp)
- ccb6c: f2 0f 5e f8 divsd %xmm0,%xmm7
- ccb70: 89 4d dc mov %ecx,-0x24(%ebp)
- ccb73: f2 0f 2a c1 cvtsi2sd %ecx,%xmm0
- ccb77: 31 c0 xor %eax,%eax
- ccb79: f2 0f 5e f0 divsd %xmm0,%xmm6
- ccb7d: e9 aa 00 00 00 jmp ccc2c <_ZN10Mandelbrot6workerEi+0x102>
- ccb82: 8d b3 ec e6 f8 ff lea -0x71914(%ebx),%esi
- ccb88: f2 0f 2a c8 cvtsi2sd %eax,%xmm1
- ccb8c: f2 0f 59 cf mulsd %xmm7,%xmm1
- ccb90: f2 0f 5c 0e subsd (%esi),%xmm1
- ccb94: 8b 75 c4 mov -0x3c(%ebp),%esi
- ccb97: 8d bb c4 aa f8 ff lea -0x7553c(%ebx),%edi
- ccb9d: f2 0f 2a da cvtsi2sd %edx,%xmm3
- ccba1: 0f 28 c1 movaps %xmm1,%xmm0
- ccba4: f2 0f 59 de mulsd %xmm6,%xmm3
- ccba8: 31 c9 xor %ecx,%ecx
- ccbaa: f2 0f 58 1f addsd (%edi),%xmm3
- ccbae: 0f 28 d3 movaps %xmm3,%xmm2
- ccbb1: 0f 28 e2 movaps %xmm2,%xmm4
- ccbb4: f2 0f 59 c0 mulsd %xmm0,%xmm0
- ccbb8: 41 inc %ecx
- ccbb9: f2 0f 59 e2 mulsd %xmm2,%xmm4
- ccbbd: f2 0f 5c c4 subsd %xmm4,%xmm0
- ccbc1: f2 0f 58 c1 addsd %xmm1,%xmm0
- ccbc5: 0f 28 e0 movaps %xmm0,%xmm4
- ccbc8: f2 0f 58 e0 addsd %xmm0,%xmm4
- ccbcc: f2 0f 59 d4 mulsd %xmm4,%xmm2
- ccbd0: 0f 28 e0 movaps %xmm0,%xmm4
- ccbd3: f2 0f 58 d3 addsd %xmm3,%xmm2
- ccbd7: 0f 28 ea movaps %xmm2,%xmm5
- ccbda: f2 0f 5c e1 subsd %xmm1,%xmm4
- ccbde: f2 0f 5c eb subsd %xmm3,%xmm5
- ccbe2: f2 0f 59 e4 mulsd %xmm4,%xmm4
- ccbe6: f2 0f 59 ed mulsd %xmm5,%xmm5
- ccbea: 8d bb f4 e6 f8 ff lea -0x7190c(%ebx),%edi
- ccbf0: f2 0f 58 e5 addsd %xmm5,%xmm4
- ...
复制代码
2. ARMv7(Thumb-2 编码):
- 000a7f88 <_ZN10Mandelbrot6workerEi>:
- a7f88: e92d 47f0 stmdb sp!, {r4, r5, r6, r7, r8, r9, sl, lr}
- a7f8c: ed2d 8b06 vpush {d8-d10}
- a7f90: 6c05 ldr r5, [r0, #64] ; 0x40
- a7f92: ee02 5a10 vmov s4, r5
- a7f96: eeb8 7bc2 vcvt.f64.s32 d7, s4
- a7f9a: ed9f 6b3b vldr d6, [pc, #236] ; a8088 <_ZN10Mandelbrot6workerEi+0x100>
- a7f9e: f8d0 8044 ldr.w r8, [r0, #68] ; 0x44
- a7fa2: ee02 8a90 vmov s5, r8
- a7fa6: 4604 mov r4, r0
- a7fa8: 2600 movs r6, #0
- a7faa: 2700 movs r7, #0
- a7fac: 2300 movs r3, #0
- a7fae: ed9f 8b38 vldr d8, [pc, #224] ; a8090 <_ZN10Mandelbrot6workerEi+0x108>
- a7fb2: ed9f 0b39 vldr d0, [pc, #228] ; a8098 <_ZN10Mandelbrot6workerEi+0x110>
- a7fb6: ed9f 1b3a vldr d1, [pc, #232] ; a80a0 <_ZN10Mandelbrot6workerEi+0x118>
- a7fba: ee86 ab07 vdiv.f64 d10, d6, d7
- a7fbe: ed9f 6b3a vldr d6, [pc, #232] ; a80a8 <_ZN10Mandelbrot6workerEi+0x120>
- a7fc2: eeb8 7be2 vcvt.f64.s32 d7, s5
- a7fc6: ee86 9b07 vdiv.f64 d9, d6, d7
- a7fca: e03f b.n a804c <_ZN10Mandelbrot6workerEi+0xc4>
- a7fcc: ee06 3a90 vmov s13, r3
- a7fd0: eeb8 5be6 vcvt.f64.s32 d5, s13
- a7fd4: 2100 movs r1, #0
- a7fd6: eeb0 7b48 vmov.f64 d7, d8
- a7fda: ee15 7b0a vnmls.f64 d7, d5, d10
- a7fde: eeb0 5b47 vmov.f64 d5, d7
- a7fe2: ee07 2a90 vmov s15, r2
- a7fe6: eeb8 4be7 vcvt.f64.s32 d4, s15
- a7fea: eeb0 2b40 vmov.f64 d2, d0
- a7fee: ee04 2b09 vmla.f64 d2, d4, d9
- a7ff2: eeb0 6b45 vmov.f64 d6, d5
- a7ff6: eeb0 4b42 vmov.f64 d4, d2
- a7ffa: eeb0 7b42 vmov.f64 d7, d2
- a7ffe: ee27 3b07 vmul.f64 d3, d7, d7
- a8002: 3101 adds r1, #1
- a8004: ee16 3b06 vnmls.f64 d3, d6, d6
- a8008: ee33 6b05 vadd.f64 d6, d3, d5
- a800c: ee36 3b06 vadd.f64 d3, d6, d6
- a8010: eeb0 2b44 vmov.f64 d2, d4
- a8014: ee03 2b07 vmla.f64 d2, d3, d7
- a8018: ee32 3b44 vsub.f64 d3, d2, d4
- a801c: ee23 3b03 vmul.f64 d3, d3, d3
- a8020: eeb0 7b42 vmov.f64 d7, d2
- a8024: ee36 2b45 vsub.f64 d2, d6, d5
- a8028: ee02 3b02 vmla.f64 d3, d2, d2
- a802c: eeb4 3bc1 vcmpe.f64 d3, d1
- a8030: eef1 fa10 vmrs APSR_nzcv, fpscr
- a8034: da01 bge.n a803a <_ZN10Mandelbrot6workerEi+0xb2>
- a8036: 29ff cmp r1, #255 ; 0xff
- a8038: d1e1 bne.n a7ffe <_ZN10Mandelbrot6workerEi+0x76>
- a803a: fb00 f101 mul.w r1, r0, r1
- a803e: 1876 adds r6, r6, r1
- a8040: f147 0700 adc.w r7, r7, #0
- a8044: 3201 adds r2, #1
- a8046: 4542 cmp r2, r8
- ...
复制代码
很明显:
1、这个项目测的是fp64(双精度浮点),但不代表其它项目也是fp64
2、x86上,用的是标量的SSE2双精度指令(movsd/mulsd/addsd/divsd等),没有矢量化
3、ARMv7上,用的是VFP标量指令(vmul.f64/vmla.f64/vdiv.f64等),没有动用NEON(ARMv7的NEON本身不支持fp64运算,所以这里也无法用NEON)。