也许吧。Intel 的 SIMD 内置函数刚出来的时候,有谁会联想到万亿次运算?
用 SSE 实现 sin 就涉及到算法的选择,还要考虑相关 SSE 指令的延迟和吞吐量,处理得不好就会慢一点,这也不算奇怪。除了少数几个函数以外,大多数操作都是 SSE 比 x87 快一些。
我把 sin_sse2.o 反汇编了给各位看看,P大认为这很简单吗?
sin_sse2.o: file format elf32-i386
Disassembly of section .text:
00000000 <__libm_sse2_sin>:
0: 55 push %ebp
1: 8b ec mov %esp,%ebp
3: 83 ec 68 sub $0x68,%esp
6: 66 0f c5 c0 03 pextrw $0x3,%xmm0,%eax
b: 25 ff 7f 00 00 and $0x7fff,%eax
10: 2d 30 30 00 00 sub $0x3030,%eax
15: 3d c5 10 00 00 cmp $0x10c5,%eax
1a: 0f 87 37 01 00 00 ja 157 <__libm_sse2_sin+0x157>
20: f2 0f 10 0d 70 08 00 movsd 0x870,%xmm1
27: 00
28: f2 0f 59 c8 mulsd %xmm0,%xmm1
2c: f2 0f 10 15 80 08 00 movsd 0x880,%xmm2
33: 00
34: f2 0f 2d d1 cvtsd2si %xmm1,%edx
38: f2 0f 58 ca addsd %xmm2,%xmm1
3c: f2 0f 10 1d 50 08 00 movsd 0x850,%xmm3
43: 00
44: f2 0f 5c ca subsd %xmm2,%xmm1
48: 66 0f 28 15 40 08 00 movapd 0x840,%xmm2
4f: 00
50: f2 0f 59 d9 mulsd %xmm1,%xmm3
54: 66 0f 14 c9 unpcklpd %xmm1,%xmm1
58: 81 c2 00 76 1c 00 add $0x1c7600,%edx
5e: 66 0f 28 e0 movapd %xmm0,%xmm4
62: 83 e2 3f and $0x3f,%edx
65: 66 0f 28 2d 30 08 00 movapd 0x830,%xmm5
6c: 00
6d: 8d 05 00 00 00 00 lea 0x0,%eax
73: c1 e2 05 shl $0x5,%edx
76: 03 c2 add %edx,%eax
78: 66 0f 59 d1 mulpd %xmm1,%xmm2
7c: f2 0f 5c c3 subsd %xmm3,%xmm0
80: f2 0f 59 0d 60 08 00 mulsd 0x860,%xmm1
87: 00
88: f2 0f 5c e3 subsd %xmm3,%xmm4
8c: f2 0f 10 78 08 movsd 0x8(%eax),%xmm7
91: 66 0f 14 c0 unpcklpd %xmm0,%xmm0
95: 66 0f 28 dc movapd %xmm4,%xmm3
99: f2 0f 5c e2 subsd %xmm2,%xmm4
9d: 66 0f 59 e8 mulpd %xmm0,%xmm5
a1: 66 0f 5c c2 subpd %xmm2,%xmm0
a5: 66 0f 28 35 10 08 00 movapd 0x810,%xmm6
ac: 00
ad: f2 0f 59 fc mulsd %xmm4,%xmm7
b1: f2 0f 5c dc subsd %xmm4,%xmm3
b5: 66 0f 59 e8 mulpd %xmm0,%xmm5
b9: 66 0f 59 c0 mulpd %xmm0,%xmm0
bd: f2 0f 5c da subsd %xmm2,%xmm3
c1: 66 0f 28 10 movapd (%eax),%xmm2
c5: f2 0f 5c cb subsd %xmm3,%xmm1
c9: f2 0f 10 58 18 movsd 0x18(%eax),%xmm3
ce: f2 0f 58 d3 addsd %xmm3,%xmm2
d2: f2 0f 5c fa subsd %xmm2,%xmm7
d6: f2 0f 59 d4 mulsd %xmm4,%xmm2
da: 66 0f 59 f0 mulpd %xmm0,%xmm6
de: f2 0f 59 dc mulsd %xmm4,%xmm3
e2: 66 0f 59 d0 mulpd %xmm0,%xmm2
e6: 66 0f 59 c0 mulpd %xmm0,%xmm0
ea: 66 0f 58 2d 20 08 00 addpd 0x820,%xmm5
f1: 00
f2: f2 0f 59 20 mulsd (%eax),%xmm4
f6: 66 0f 58 35 00 08 00 addpd 0x800,%xmm6
fd: 00
fe: 66 0f 59 e8 mulpd %xmm0,%xmm5
102: 66 0f 28 c3 movapd %xmm3,%xmm0
106: f2 0f 58 58 08 addsd 0x8(%eax),%xmm3
10b: 66 0f 59 cf mulpd %xmm7,%xmm1
10f: 66 0f 28 fc movapd %xmm4,%xmm7
113: f2 0f 58 e3 addsd %xmm3,%xmm4
117: 66 0f 58 f5 addpd %xmm5,%xmm6
11b: f2 0f 10 68 08 movsd 0x8(%eax),%xmm5
120: f2 0f 5c eb subsd %xmm3,%xmm5
124: f2 0f 5c dc subsd %xmm4,%xmm3
128: f2 0f 58 48 10 addsd 0x10(%eax),%xmm1
12d: 66 0f 59 f2 mulpd %xmm2,%xmm6
131: f2 0f 58 e8 addsd %xmm0,%xmm5
135: f2 0f 58 df addsd %xmm7,%xmm3
139: f2 0f 58 cd addsd %xmm5,%xmm1
13d: f2 0f 58 cb addsd %xmm3,%xmm1
141: f2 0f 58 ce addsd %xmm6,%xmm1
145: 66 0f 15 f6 unpckhpd %xmm6,%xmm6
149: f2 0f 58 ce addsd %xmm6,%xmm1
14d: f2 0f 58 e1 addsd %xmm1,%xmm4
151: 66 0f 28 c4 movapd %xmm4,%xmm0
155: eb 72 jmp 1c9 <__libm_sse2_sin+0x1c9>
157: 7f 2e jg 187 <__libm_sse2_sin+0x187>
159: c1 e8 04 shr $0x4,%eax
15c: 3d fd fc ff 0f cmp $0xffffcfd,%eax
161: 75 0a jne 16d <__libm_sse2_sin+0x16d>
163: f2 0f 59 05 b0 08 00 mulsd 0x8b0,%xmm0
16a: 00
16b: eb 5c jmp 1c9 <__libm_sse2_sin+0x1c9>
16d: f2 0f 10 1d 90 08 00 movsd 0x890,%xmm3
174: 00
175: f2 0f 59 d8 mulsd %xmm0,%xmm3
179: f2 0f 5c d8 subsd %xmm0,%xmm3
17d: f2 0f 59 1d a0 08 00 mulsd 0x8a0,%xmm3
184: 00
185: eb 42 jmp 1c9 <__libm_sse2_sin+0x1c9>
187: 66 0f c5 c0 03 pextrw $0x3,%xmm0,%eax
18c: 25 f0 7f 00 00 and $0x7ff0,%eax
191: 3d f0 7f 00 00 cmp $0x7ff0,%eax
196: 74 29 je 1c1 <__libm_sse2_sin+0x1c1>
198: 83 ec 20 sub $0x20,%esp
19b: f2 0f 11 04 24 movsd %xmm0,(%esp)
1a0: 8d 44 24 20 lea 0x20(%esp),%eax
1a4: 89 44 24 08 mov %eax,0x8(%esp)
1a8: b8 02 00 00 00 mov $0x2,%eax
1ad: 89 44 24 0c mov %eax,0xc(%esp)
1b1: e8 fc ff ff ff call 1b2 <__libm_sse2_sin+0x1b2>
1b6: 83 c4 20 add $0x20,%esp
1b9: f3 0f 7e 44 24 08 movq 0x8(%esp),%xmm0
1bf: eb 08 jmp 1c9 <__libm_sse2_sin+0x1c9>
1c1: f2 0f 59 05 c0 08 00 mulsd 0x8c0,%xmm0
1c8: 00
1c9: 8b e5 mov %ebp,%esp
1cb: 5d pop %ebp
1cc: c3 ret
1cd: 90 nop
1ce: 90 nop
1cf: 90 nop