哦。
今天我随便编了个程序(名为tx1.f95),主要计算量就是双精度浮点运算。用gfortran编译,当然是在linux下面。使用gcc的 -mfpmath=387 与 -mfpmath=sse 选项,分别选择387和sse进行浮点计算。
具体是:
1)使用387
gfortran -fopenmp -mtune=pentium4 -mfpmath=387 tx1.f95
2) 使用sse
gfortran -fopenmp -mtune=pentium4 -msse2 -mfpmath=sse tx1.f95
跑的机器是台双Xeon 2.4G,支持到sse2,即不支持sse3。结果运行时间几乎相同:一个是117秒,一个是118秒。我去掉-mtune=pentium4,效果也一样。
请各位解释一下:对于双精度浮点运算,老P4要两个周期才能执行一条sse指令,是不是因此其速度其实就等于387?
手上没有扣肉(Core 2)机器,不知道单周期sse的性能到底怎么样,能提高接近一倍速度吗?
还有,巴塞罗那的sse性能有网站出测试结果没有?
程序:
=============================================
!> Floating-point micro-benchmark used to compare -mfpmath=387 with -mfpmath=sse.
!>
!> Multiplies tan(i)*tan(2*i) for i = 1..N inside an OpenMP parallel loop
!> (product reduction), prints a progress line every 1% of the iterations,
!> and reports both CPU time (etime) and wall-clock time (system_clock).
program testsse
  use omp_lib, only: omp_get_thread_num
  implicit none

  ! Working precision: IEEE double precision, which is what this benchmark is
  ! meant to measure.  The earlier version used kind=10 (x87 80-bit extended
  ! precision); GCC always evaluates that type on the x87 unit, so
  ! -mfpmath=sse could not change the timing.  Set wp to 10 to reproduce the
  ! extended-precision run on compilers that support it.
  integer, parameter :: wp = selected_real_kind(15, 307)
  integer, parameter :: i8 = selected_int_kind(18)

  integer(kind=i8), parameter :: N = 800000000_i8
  ! Number of iterations between two progress lines (1% of the work).
  integer(kind=i8), parameter :: progress_step = N/100

  integer(kind=i8) :: i
  real(kind=wp) :: fu, tmp, x

  ! Holds the two results of the etime intrinsic: the total and its parts.
  type etime_type
    real :: time
    real :: tarray(2)
  end type etime_type
  type (etime_type) :: time1, time2

  integer :: itime1, itime2, clock_rate, clock_max, ticks

  ! omp_get_thread_num comes from omp_lib; it must not be re-declared here.
  intrinsic mod, etime, system_clock

  call etime(time1%tarray, time1%time)
  print *,"An openmp test program :"
  print *,"============================"
  print *,"etime :",time1
  print *
  call system_clock(itime1)
  print *,"system_clock :",itime1
  print *
  print *,"begin :"

  fu = 1.0_wp
  !$omp parallel do default(none) schedule(static) private(tmp,x) reduction(*:fu)
  do i = 1_i8, N
    ! Progress report: percentage done and the thread that reached it.
    if (mod(i, progress_step) == 0) print *, i/progress_step, omp_get_thread_num()
    x = real(i, wp)
    tmp = tan(x)*tan(x*2)
    fu = fu*tmp
  enddo
  !$omp end parallel do

  print *,"result=",fu
  print *
  call etime(time2%tarray, time2%time)
  ! Ask for the tick rate and counter maximum instead of assuming 1000 ticks/s.
  call system_clock(itime2, clock_rate, clock_max)

  ! NOTE: etime reports CPU time of the process, so with several threads the
  ! difference can exceed the wall-clock time printed below.
  print *,"etime :",time2
  print *,"etime : it lasts",time2%time-time1%time,"second"
  print *
  print *,"system_clock :",itime2

  ticks = itime2 - itime1
  if (ticks < 0) then
    ! The counter wrapped around once; its period is clock_max + 1 ticks.
    ! Add in two steps so the intermediate value cannot overflow.
    ticks = ticks + clock_max
    ticks = ticks + 1
  endif
  if (clock_rate > 0) then
    print *,"system_clock : it lasts",real(ticks, wp)/real(clock_rate, wp),"second"
  else
    ! system_clock returns a zero rate when no clock is available.
    print *,"system_clock : no usable clock"
  endif
end program testsse
[ 本帖最后由 紫色 于 2007-9-15 14:21 编辑 ]