Home » Code » How much execution performance gets left on the table with vanilla code?

# How much execution performance gets left on the table with vanilla code?

We know that competitive code for vanilla Black Scholes computes valuation in the tens of nanoseconds on contemporary commodity microprocessors.  What does code that performs like that look like? Well this is what the vectorized Black Scholes code looks like:

`int vbs(long blk, double *MTM, double *T, double *S, double *K, double *R, double *SIGMA) {`
`int i;`
`vmlSetMode(VML_EP);`
`vdDiv(BLOCK, S, K, V1); // s/k`
`vmlSetMode(VML_EP);`
`vdLn(BLOCK, V1, V2);   // log(S/K)`
`vmlSetMode(VML_EP);`
`vdInvSqrt(BLOCK, T, V3);  // 1/sqrt(t)`
`#pragma ivdef`
`for(i=0; i < BLOCK; i++) V4[i] = (V2[i] + (R[i] + SIGMA[i] * SIGMA[i] * 0.5) * T[i]) * V3[i];`
`vmlSetMode(VML_EP);`
`vdDiv(BLOCK, V4, SIGMA, V5);  // d1`
`vmlSetMode(VML_EP);`
`//    vN(BLOCK, V5, V7);   // N(d1)`
`vdCdfNorm(BLOCK, V5, V7);   // N(d1)`
`#pragma ivdef`
`for (i=0; i < BLOCK; i++)  V6[i] = V5[i] - SIGMA[i] * T[i]; // d2`
`#pragma ivdef`
`for(i=0; i < BLOCK; i++) V9[i] = - R[i] *T[i];`
`vmlSetMode(VML_EP);`
`vdExp(BLOCK, V9, V10);`
`vmlSetMode(VML_EP);`
`vdCdfNorm(BLOCK, V6, V8);   // N(d2)`
`//    vN(BLOCK, V6, V8);   // N(d2)`
`#pragma ivdef`
`for(i=0; i < BLOCK; i++) MTM[i] =  S[i] * V7[i] - K[i] * V10[i] * V8[i];`
`return(0);`
`}`

This is what the CdfNorm() code (actually it’s the equivalent vN() that I rewrote because I wasn’t totally bought into the optimality of the CdfNorm() intrinsic) looks like:

```int vN(long blk, double *X, double *Y) {

long i;```
`vmlSetMode(VML_EP);`
`vdAbs(BLOCK, X, V1);`
`#pragma ivdef`
`for(i=0; i < BLOCK; i++){`
`V2[i] = 1.0 + p * V1[i];`
`V4[i] = V1[i] * V1[i] * h;`
`}`
`vmlSetMode(VML_EP);`
`vdInv(BLOCK, V2, V3);`
`vmlSetMode(VML_EP);`
`vdExp(BLOCK, V4, V5);`
`#pragma ivdef`
`for(i=0; i < BLOCK; i++)    Y[i] = 1.0 - c * V5[i] * V3[i] * ( V3[i] *( V3[i] * ( V3[i] * ( V3[i] * b5 + b4 ) + b3 ) + b2 ) + b1 );`
`return 0;`
`}`

Lots of pragmas in odd looking sort of code. Remember Dr. Johnson’s code for Black Scholes? It looked like this (after I instrumented it a bit):

`#include <iostream>`

#include <string.h>

#include <cmath>

#include <cstdlib>

#include <sys/time.h>

#include <time.h>

// Johnson’s code

using namespace std;

// calculate the value of the cumulative normal distribution using Simpson’s method

// Standard normal CDF evaluated by composite Simpson integration of the
// density over [0, x], shifted by 0.5.  Arguments beyond +/-10 are clamped
// to the exact limiting values.
double normalDistribution(double x)
{
    if (x < -10.) return 0.;   // far left tail
    if (x > 10.)  return 1.;   // far right tail

    const int nSteps = 1000;            // Simpson subintervals
    double lo = 0., hi = x;             // integration range (hi may be < lo)
    double u, acc = 0.;
    double step = (hi - lo) / nSteps;

    // endpoint and first interior terms...
    acc = acc + exp(-lo * lo / 2.) + 4. * exp(-(lo + step) * (lo + step) / 2.);
    // ...and the far endpoint
    acc = acc + exp(-hi * hi / 2.);

    // interior pairs: weight 2 on even nodes, 4 on odd nodes
    for (int k = 1; k < nSteps / 2; k++)
    {
        u = lo + 2 * k * step;
        acc = acc + 2. * exp(-u * u / 2.);
        u = u + step;
        acc = acc + 4. * exp(-u * u / 2.);
    }

    // Simpson weight step/3, normalized by sqrt(2*pi) = sqrt(8*atan(1))
    return 0.5 + step * acc / 3. / sqrt(8. * atan(1.));
}

// return the value of a call option using the black scholes formula

// European call value by the Black-Scholes formula, using the Simpson-rule
// CDF in normalDistribution().
// S spot, t current time, X strike, r risk-free rate, sigma volatility, T maturity.
double callOptionPrice(double S,double t,double X,double r,double sigma,double T)
{
if(S<1.e-14)return 0.; // check if asset worthless
if(sigma<1.e-14) // check if sigma zero: value is discounted intrinsic
{
if(S<X*exp(-r*(T-t)))return 0.;
else return S-X*exp(-r*(T-t));
}
if(fabs(T-t)<1.e-14) // check if we are at maturity: value is payoff
{
if(S<X)return 0.;
else return S-X;
}
// calculate option price
double d1=(log(S/X) + (r+sigma*sigma/2.)*(T-t))/(sigma*sqrt(T-t));
double d2=(log(S/X) + (r-sigma*sigma/2.)*(T-t))/(sigma*sqrt(T-t));
// NOTE(review): the published text had a U+2013 en-dash where the minus sign
// belongs below (smart-punctuation mangling); restored to '-' so this compiles.
return normalDistribution(d1)*S - normalDistribution(d2)*X*exp(-r*(T-t));
}

// Johnson’s code no Simpsons Rule

// Standard normal CDF via the classic five-term rational approximation
// (Abramowitz & Stegun 26.2.17 form); the en-dashes in the published text
// have been restored to minus signs so this compiles.
double N(double x) {

double t, temp;

const double b1 =  0.319381530;

const double b2 = -0.356563782;

const double b3 =  1.781477937;

const double b4 = -1.821255978;

const double b5 =  1.330274429;

const double p  =  0.2316419;

const double c  =  0.39894228; // ~ 1/sqrt(2*pi)

if(x >= 0.0) {

t = 1.0 / ( 1.0 + p * x );

temp = 1.0 - c * exp( -x*x*0.5) * t * ( t *( t * ( t * ( t * b5 + b4 ) + b3 ) + b2 ) + b1 );

return(temp);

}

else { // left tail: N(x) = 1 - N(-x), folded into the polynomial directly

t = 1.0 / ( 1.0 - p * x );

temp = c * exp( -x * x * 0.5) * t *( t *( t * ( t * ( t * b5 + b4 ) + b3 ) + b2 ) + b1 );

return(temp);

}

}

// return the value of a call option using the black scholes formula

// European call value by the Black-Scholes formula, using the fast rational
// approximation N() instead of the Simpson-rule CDF.
// S spot, t current time, X strike, r risk-free rate, sigma volatility, T maturity.
double callOptionPriceNoSimpsons(double S,double t,double X,double r,double sigma,double T)
{
if(S<1.e-14)return 0.; // check if asset worthless
if(sigma<1.e-14) // check if sigma zero: value is discounted intrinsic
{
if(S<X*exp(-r*(T-t)))return 0.;
else return S-X*exp(-r*(T-t));
}
if(fabs(T-t)<1.e-14) // check if we are at maturity: value is payoff
{
if(S<X)return 0.;
else return S-X;
}
// calculate option price
double d1=(log(S/X) + (r+sigma*sigma/2.)*(T-t))/(sigma*sqrt(T-t));
double d2=(log(S/X) + (r-sigma*sigma/2.)*(T-t))/(sigma*sqrt(T-t));
// NOTE(review): U+2013 en-dash in the published text restored to '-' so this compiles.
return N(d1)*S - N(d2)*X*exp(-r*(T-t));
}

int main(int argc, const char * argv[])

{

double sumprice=0.0;

long iss;

double s=1.0, t=0.0, x=1.0, r=0.05, sigma=0.2, T=1.0;

time_t start, term, diff;

struct timeval t1, t2;

double elapsedtime, rate;

long seed=12321;

if(argc == 1) {

// just run once and return

cout << “Call Option Price = ” << callOptionPrice(1,0,1,0.05,0.2,1) << endl;

}

else if(argc >= 2 && !strcmp(argv[1],”-r”)) {

// run multiple iterations of Johnson’s code unaltered but compiled -O3 and return the execution time

iss = atol(argv[2]);

srand48(seed);

start = time(NULL);

gettimeofday(&t1, NULL);

for(long i=0; i < iss; i++) {

sumprice += callOptionPrice(s,t,x,r+0.01*drand48(),sigma,T);

}

gettimeofday(&t2, NULL);

term = time(NULL);

diff = term – start;

elapsedtime = (t2.tv_sec – t1.tv_sec) * 1000.0; // sec to ms

elapsedtime += (t2.tv_usec – t1.tv_usec)/1000.0; // us to ms

rate = (double)iss/ elapsedtime;

cout << iss << ” Iterations, sumprice = “<< sumprice << endl;

cout << “iterations/ms = ” << rate << endl;

cout << ” elapsed time = ” << elapsedtime << endl;

}

else if(argc >= 2 && !strcmp(argv[1],”-s”)) {

// run multiple iterations of Johnson’s code but lose the Simpson’s Rule CNF  and return the execution time

// cout << “multiple runs ”  << argv[2] << endl;

iss = atol(argv[2]);

srand48(seed);

start = time(NULL);

gettimeofday(&t1, NULL);

for(long i=0; i < iss; i++) {

sumprice += callOptionPriceNoSimpsons(s,t,x,r+0.01*drand48(),sigma,T);

}

gettimeofday(&t2, NULL);

term = time(NULL);

diff = term – start;

elapsedtime = (t2.tv_sec – t1.tv_sec) * 1000.0; // sec to ms

elapsedtime += (t2.tv_usec – t1.tv_usec)/1000.0; // us to ms

rate = (double)iss/ elapsedtime;

cout << iss << ” Iterations, sumprice = “<< sumprice << endl;

cout << “iterations/ms = ” << rate << endl;

cout << ” elapsed time = ” << elapsedtime << endl;

}

else {

cout << “Usage: bs -r <iterations> Johnson’s code\n\t bs -s <iterations> Johnson’s Code no Simpson CDF” << endl;

}

return 0;

}

The code is real straightforward, more or less the same format you would read in Numerical Recipes. If I have to learn what Black Scholes does computationally then I probably want to read code like this. Given that we know you can compute something approximating 100 million Black Scholes evaluations every second, how much does this code leave on the table?

So we compiled the code: g++ -m64 –O3 –funroll-loops –o bs main.cpp

and we ran it:

`% ./bs –r 100`

and it does 31 iterations per millisecond — that’s 31K per second. So optimal performance is like 100 million evaluations per second and I’m getting 31K, hmm. Note we could work harder on the optimization flags in GCC for this naive code, but I don’t think it is going to change much. Maybe that Simpson Rule for the CdfNorm() function is overkill. Let’s fix that. Look at the Johnson’s code with no Simpson Rule. It produces the same answer but it’s a little faster. We wanted to know how much, so we ran it:

`%./bs –s 1000`

and it does much better — about 5890 iterations per millisecond, or about 6 million per second.  But here’s the point: even with no Simpson’s Rule the code is still at least an order of magnitude off of competitive performance, and a factor of about 6 off of competitive for the machine I am running the test on.  This is not unusual, and it’s in line with Hager’s plot of Mflops versus vector length.