Proudly made in Namek by serge-sans-paille
/me
$ whoami
sguelton
Plotting for the Mass!
A language is not slow, but it can make it hard for compilers to make it run fast
Scientific code written in Python looks like FORTRAN
function rosen(x, n)
real(8) :: x(n),t0(n-1), t1(n-1)
real(8) rosen
t0 = 100 * (x(2:n) - (x(1:n-1)** 2))** 2
t1 = (1 - x(1:n-1)** 2)
rosen = sum(t0 + t1)
end function
import numpy as np
def rosen(x):
t0 = 100 * (x[1:] - x[:-1] ** 2) ** 2
t1 = (1 - x[:-1]) ** 2
return np.sum(t0 + t1)
for Python/Numpy codes
Turn annotated code into C extension
cdef inline int mandel(double real, double imag, int max_iterations=20):
cdef double z_real = 0., z_imag = 0.
cdef int i
for i in range(0, max_iterations):
z_real, z_imag = ( z_real*z_real - z_imag*z_imag + real,
2*z_real*z_imag + imag )
if (z_real*z_real + z_imag*z_imag) >= 4:
return i
return -1
def mandel(real, imag, max_iterations=20):
z_real, z_imag = 0., 0.
for i in range(0, max_iterations):
z_real, z_imag = ( z_real*z_real - z_imag*z_imag + real,
2*z_real*z_imag + imag )
if (z_real*z_real + z_imag*z_imag) >= 4:
return i
return -1
from numba import autojit
@autojit
def mandel(real, imag, max_iterations=20):
z_real, z_imag = 0., 0.
for i in range(0, max_iterations):
z_real, z_imag = ( z_real*z_real - z_imag*z_imag + real,
2*z_real*z_imag + imag )
if (z_real*z_real + z_imag*z_imag) >= 4:
return i
return -1
from parakeet import jit
@jit
def mandel(real, imag, max_iterations=20):
z_real, z_imag = 0., 0.
for i in range(0, max_iterations):
z_real, z_imag = ( z_real*z_real - z_imag*z_imag + real,
2*z_real*z_imag + imag )
if (z_real*z_real + z_imag*z_imag) >= 4:
return i
return -1
#pythran export mandel(float, float, int)
def mandel(real, imag, max_iterations=20):
z_real, z_imag = 0., 0.
for i in range(0, max_iterations):
z_real, z_imag = ( z_real*z_real - z_imag*z_imag + real,
2*z_real*z_imag + imag )
if (z_real*z_real + z_imag*z_imag) >= 4:
return i
return -1
Language | Implementation | Speedup |
---|---|---|
Python | CPython 2.7.6 | 1.0 |
Python | CPython 2.7.6 with -O | 1.0 |
Cython | Cython 0.20.1post0 | 1.6 |
Annotated Cython | Cython 0.20.1post0 | 95.8 |
Python3 | CPython 3.4.0 | 0.9 |
Python | Jython 2.5.3 | 3.4 |
Python | PyPy 2.3.1 | 82.8 |
Python | Parakeet 0.23.2 | 68.8 |
Python | Pythran 0.5.0 | 92.7 |
C | GCC 4.8.2 with -O2 | 103.3 |
Python/Numpy ==
high level programming
reliab = zeros((N,N))
for i in range(N):
for j in range(N):
if random() < 0.6:
reliab[i,j] = 1
else:
reliab[i,j] = 0
vs.
reliab = numpy.int32(numpy.random.rand(N,N) < 0.6)
def cronbach(itemscores):
itemvars = itemscores.var(axis=1, ddof=1)
tscores = itemscores.sum(axis=0)
nitems = len(itemscores)
return nitems / (nitems-1) * (1 - itemvars.sum() / tscores.var(ddof=1))
def l2norm(x):
return np.sqrt(np.sum(np.abs(x) ** 2, axis=1))
np.abs(x)**2
np.sum(..., axis=1)
np.sum(..., axis=1)
np.abs(x) ** 2
#pythran export l2norm(float32[][])
def l2norm(x):
return np.sqrt(np.sum(np.abs(x)**2, axis=1))
$> pythran l2norm.py -march=native -fopenmp
Inlining + Loop fusion + scalarization + use-def elimination
Forward Substitution + Expression Templates
A generic approach to loop unrolling
for i in range(3):
foo(i)
(Forward Substitution +) Constant Folding
for i in [0, 1, 2]:
foo(i)
Loop Full Unrolling
i = 0
foo(i)
i = 1
foo(i)
i = 2
foo(i)
Use-def elimination...
Use SSE or AVX vector registers of modern CPUs
Pythran's runtime is partially vectorized thanks to Boost.SIMD
bench | seq (µs) | SSE | AVX |
---|---|---|---|
allpairs_distances | 1711 | 916 | 673 |
arc_distance | 1020 | 613 | 414 |
l2norm | 913 | 577 | 494 |
rosen | 1234 | 1096 | 997 |
slowparts | 1154 | 1176 | 848 |
vibr_energy | 1518 | 769 | 570 |
def hyantes(xmin, ymin, xmax, ymax, step, range_, range_x, range_y, t):
X,Y = t.shape
pt = np.zeros((X,Y))
#omp parallel for
for i in range(X):
for j in range(Y):
for k in t:
tmp = 6368.* np.arccos( np.cos(xmin+step*i)*np.cos( k[0] ) * np.cos((ymin+step*j)-k[1])+ np.sin(xmin+step*i)*np.sin(k[0]))
if tmp < range_:
pt[i,j]+=k[2] / (1+tmp)
return pt
a = np.random.rand(n, m)
b = np.random.rand(3, n - 1)
c = a[1:,:-1] + 3 * b[i]
a = np.random.rand(n, m)
b = np.random.rand(n, m)
c = np.sum(a ** 2 + b ** 2)
Python | parakeet | pythran | -march=native | |
---|---|---|---|---|
allpairs_distances | 38129 | 1784 | 1713 | 667 |
allpairs_distances_loops | 49750 | 1794 | 1681 | 1654 |
conv | 1916161 | 2040 | 1879 | 1880 |
growcut | 1967622 | 2203 | 5976 | 5841 |
harris | 5595 | 5246 | 2949 | 2863 |
hasting | 8 | 57 | 1 | 1 |
hyantes | 255760 | 1882 | 1881 | 1820 |
julia | 2761543 | 2938 | 2578 | 2504 |
l2norm | 5483 | 12648 | 919 | 494 |
multiple_sum | 3026 | 8394 | 1245 | 1029 |
pairwise | 3742614 | 3708 | 3563 | 3507 |
rosen | 14168 | 1625 | 1238 | 992 |
$ pip install pythran
$ apt-get install pythran