The process of brute force calculation by Coulomb force with multiprocessing and numpy. TEST3

Each result returned by the distributed processing is not made into a PN-sized matrix, but is placed as is. This lightens the processing.

import random
import math
import time
import gc
import numpy as np

from multiprocessing import Pool

np.set_printoptions(threshold=np.inf)
random.seed(1)

XX = 0
YY = 1
ZZ = 2

core = 6

#total particle num
PN = 10000
#---------------------------------------------------
#
# parameter : divide_N
#
# The number of particles per side of a small square 
# divided for the splitting process when creating a 
# large PN^2 region with all particles PN.
#
# The smaller the setting, the more it will be divided
# into many smaller jobs.
#---------------------------------------------------
divide_N = 1000
xyzV = [[0 for i in range(3)] for j in range(PN)]
xyzP = np.zeros((PN,3))
xyzF = np.zeros((PN,3))


def wrapper(args):
  return calc_XY_start_end(*args)

def find_pair():
  global core
  global PN
  global xyzF
  global xyzP

  worklist = []
  thread   = 0

  remainder = PN %  divide_N
  quotient  = PN // divide_N

  if remainder != 0:                 #ex PN=11, divide_N=4, remainder=3, quotient=2
      quotient = quotient + 1        #   2 -> 3
  extra = divide_N - remainder       #   1

  X_st = 0
  X_ed = 0
  Y_st = 0
  Y_ed = 0

  Thead_num = 0
  sum_f = np.zeros((PN,PN,3))

  for Xn in range(0,quotient):
    for Yn in range(0,Xn+1):

      worklist.append([Xn, Yn, quotient, remainder, divide_N, extra])
      Thead_num = Thead_num + 1
      #----------------------------------------
      #local_force = calc_XY_start_end(Xn, Yn, quotient, remainder, divide_N, extra)
      #----------------------------------------


  #start thread. results is callback in array.
  p = Pool(core)
  callback = p.map(wrapper, worklist)
  p.close()


  t1 = 0
  for Th in range(0,Thead_num):

    xn_l = callback[Th][0]
    yn_l = callback[Th][1]
    xxst = callback[Th][2]
    xxed = callback[Th][3]
    yyst = callback[Th][4]
    yyed = callback[Th][5]
    local_force = callback[Th][6]

    time_sta = time.time()

    sb = local_force.shape
    ten   = -1 * local_force.transpose(1,0,2)
    tensb = ten.shape

    sum_f[yyst:yyst+sb[0], xxst:xxst+sb[1]] += local_force

    if xn_l != yn_l:
      sum_f[xxst:xxst+tensb[0], yyst:yyst+tensb[1]] += ten

    time_end = time.time()
    t1 =t1 + (time_end  - time_sta)


  print ("=====t1=",t1)
  print(np.sum(sum_f))
  #mmm = sum_f.sum(axis=1) 
  #xyzF = xyzF + mmm 


def calc_XY_start_end(Xn, Yn, quotient, remainder, divide_N, extra):
  global PN
  global xyzP

  if Xn != quotient-1 and remainder != 0 or remainder == 0:
    X_st = Xn*divide_N
    X_ed = (Xn+1)*divide_N-1
  else:
    X_st = Xn*divide_N - (divide_N - remainder)
    X_ed = (Xn+1)*divide_N  - (divide_N - remainder) -1
  if Yn != quotient-1 and remainder != 0 or remainder == 0:
    Y_st = Yn*divide_N
    Y_ed = (Yn+1)*divide_N-1
  else:
    Y_st = Yn*divide_N - (divide_N - remainder)
    Y_ed = (Yn+1)*divide_N  -(divide_N - remainder) -1

  Xa = np.arange(X_st,X_ed+1)
  Ya = np.arange(Y_st,Y_ed+1)
  mx1,my1 = np.meshgrid(Xa,Ya)
  local_xyz  = xyzP[mx1]-xyzP[my1]
  distance_a = np.linalg.norm(local_xyz, axis=2)

  tmp_d = distance_a[:,:,np.newaxis]
  tmp_f = local_xyz/tmp_d
  tmp_f = np.nan_to_num(tmp_f,0)

  #extra clear
  if remainder != 0:
    if Xn + 1 == quotient:
      for dd in range(0,extra):
        tmp_f[:,dd] = 0
    if Yn + 1 == quotient:
      for dd in range(0,extra):
        tmp_f[[dd]] = 0

  return Xn,Yn,X_st,X_ed,Y_st,Y_ed,tmp_f



def init_lattice():
  global xyzF
  global xyzP

  pnum = 0
  while pnum < PN:
    xyzP[pnum][XX] = random.uniform(-1,1)
    xyzP[pnum][YY] = random.uniform(-1,1)
    xyzP[pnum][ZZ] = random.uniform(-1,1)
    xyzF[pnum][XX] = random.uniform(-1,1)
    xyzF[pnum][YY] = random.uniform(-1,1)
    xyzF[pnum][ZZ] = random.uniform(-1,1)
    pnum += 1


def results_sum():
  global xyzF
  pnum = 0
  total_F = 0
#  while pnum < PN:
#    total_F = total_F + xyzF[pnum][XX]
#    total_F = total_F + xyzF[pnum][YY]
#    total_F = total_F + xyzF[pnum][ZZ]
#    pnum += 1
#
#  print (total_F)


if __name__ == "__main__":

  init_lattice()
  find_pair()
  results_sum()

The process of brute force calculation by Coulomb force with multiprocessing and numpy. TEST2

The split process reduced the matrix size as much as possible, but it still eats up too much memory.
Multiprocessing consumes a lot of memory.

import random
import math
import time
import gc
import numpy as np

from multiprocessing import Pool

np.set_printoptions(threshold=np.inf)
random.seed(1)

XX = 0
YY = 1
ZZ = 2

#total particle num
PN = 1200
#---------------------------------------------------
#
# parameter : divide_N
#
# The number of particles per side of a small square 
# divided for the splitting process when creating a 
# large PN^2 region with all particles PN.
#
# The smaller the setting, the more it will be divided
# into many smaller jobs.
#---------------------------------------------------
divide_N = 50
core = 2
xyzV = [[0 for i in range(3)] for j in range(PN)]
xyzP = np.zeros((PN,3))
xyzF = np.zeros((PN,3))

def wrapper(args):
  return calc_XY_start_end(*args)

def find_pair():
  global PN
  global xyzF
  global xyzP

  worklist = []
  thread   = 0

  remainder = PN %  divide_N
  quotient  = PN // divide_N

  if remainder != 0:                 #ex PN=11, divide_N=4, remainder=3, quotient=2
      quotient = quotient + 1        #   2 -> 3
  extra = divide_N - remainder       #   1

  X_st = 0
  X_ed = 0
  Y_st = 0
  Y_ed = 0

  Thead_num = 0
  sum_f = np.zeros((PN,PN,3))

  for Xn in range(0,quotient):
    for Yn in range(0,Xn+1):

      worklist.append([Xn, Yn, quotient, remainder, divide_N, extra])
      Thead_num = Thead_num + 1
      #----------------------------------------
      #local_force = calc_XY_start_end(Xn, Yn, quotient, remainder, divide_N, extra)
      #----------------------------------------

  p = Pool(core)

  #start thread. results is callback in array.
  callback = p.map(wrapper, worklist)
  p.close()


  for Th in range(0,Thead_num):

    xn_l = callback[Th][0]
    yn_l = callback[Th][1]
    xxst = callback[Th][2]
    xxed = callback[Th][3]
    yyst = callback[Th][4]
    yyed = callback[Th][5]
    local_force = callback[Th][6]

    if xxst != 0:
      r2 = np.zeros((xxst,divide_N,3))
      local_force        = np.insert(local_force     ,0,r2,axis=1)
    if xxed+1 != PN:
      r2 = np.zeros((PN - xxed - 1,divide_N,3))
      local_force      = np.insert(local_force     ,xxed + 1,r2,axis=1)

    if yyst != 0:
      r2 = np.zeros((yyst,PN,3))
      local_force      = np.insert(local_force     ,0,r2,axis=0)
    if yyed+1 != PN:
      r2 = np.zeros((PN - yyed - 1,PN,3))
      local_force      = np.insert(local_force     ,yyed + 1,r2,axis=0)

    del r2

    if xn_l != yn_l:
      sum_f = sum_f + local_force
      sum_f = sum_f - local_force.transpose(1,0,2)
    else:
      sum_f = sum_f + local_force

    del local_force
    gc.collect()

  #print (sum_f)

  print("=====================================")
  mmm = sum_f.sum(axis=1)
  xyzF = xyzF + mmm


def calc_XY_start_end(Xn, Yn, quotient, remainder, divide_N, extra):
  global PN
  global xyzP

  if Xn != quotient-1 and remainder != 0 or remainder == 0:
    X_st = Xn*divide_N
    X_ed = (Xn+1)*divide_N-1
  else:
    X_st = Xn*divide_N - (divide_N - remainder)
    X_ed = (Xn+1)*divide_N  - (divide_N - remainder) -1
  if Yn != quotient-1 and remainder != 0 or remainder == 0:
    Y_st = Yn*divide_N
    Y_ed = (Yn+1)*divide_N-1
  else:
    Y_st = Yn*divide_N - (divide_N - remainder)
    Y_ed = (Yn+1)*divide_N  -(divide_N - remainder) -1

  Xa = np.arange(X_st,X_ed+1)
  Ya = np.arange(Y_st,Y_ed+1)
  mx1,my1 = np.meshgrid(Xa,Ya)
  local_xyz  = xyzP[mx1]-xyzP[my1]
  distance_a = np.linalg.norm(local_xyz, axis=2)

  tmp_d = distance_a[:,:,np.newaxis]
  tmp_f = local_xyz/tmp_d
  tmp_f = np.nan_to_num(tmp_f,0)

  #extra clear
  if remainder != 0:
    if Xn + 1 == quotient:
      for dd in range(0,extra):
        tmp_f[:,dd] = 0
    if Yn + 1 == quotient:
      for dd in range(0,extra):
        tmp_f[[dd]] = 0

  return Xn,Yn,X_st,X_ed,Y_st,Y_ed,tmp_f



def init_lattice():
  global xyzF
  global xyzP

  pnum = 0
  while pnum < PN:
    xyzP[pnum][XX] = random.uniform(-1,1)
    xyzP[pnum][YY] = random.uniform(-1,1)
    xyzP[pnum][ZZ] = random.uniform(-1,1)
    xyzF[pnum][XX] = random.uniform(-1,1)
    xyzF[pnum][YY] = random.uniform(-1,1)
    xyzF[pnum][ZZ] = random.uniform(-1,1)
    pnum += 1


def results_sum():
  global xyzF
  pnum = 0
  total_F = 0
  while pnum < PN:
    total_F = total_F + xyzF[pnum][XX]
    total_F = total_F + xyzF[pnum][YY]
    total_F = total_F + xyzF[pnum][ZZ]
    pnum += 1

  print (total_F)


if __name__ == "__main__":

  init_lattice()
  find_pair()
  results_sum()

The process of brute force calculation by Coulomb force with multiprocessing and numpy. TEST1

The small distributed matrices also eat too much memory because they are only PNxPN in size.
Too slow.

###########################
# multiprocessing test ####
###########################
import random
import math
import time
import scipy.special as scm
from multiprocessing import Pool

random.seed(1)

PX = 0;PY = 1;PZ = 2;
VX = 3;VY = 4;VZ = 5;
FX = 6;FY = 7;FZ = 8;

#number of particles in a line
line_num = 15

#total particle num
PN = line_num * line_num * line_num

#ready to 9 parameters for particle 
#(PX, PY, PZ, VX, VY, VZ, FX, FY, FZ)
xyz = [[0 for i in range(9)] for j in range(PN)]

#Number of combinations of coulomb force calculation
combinum = int(scm.comb(PN, 2))

#thread number(local thread num)
core = 4

def find_pair_sub(prep,pend,thread):
  global xyz

  #local results array
  xyzF = [[0 for i in range(3)] for j in range(PN)]
  fx = 0; fy = 1; fz = 2

  for i in range(prep,pend):
    for j in range(i + 1, PN):
      dx = xyz[i][PX] - xyz[j][PX]
      dy = xyz[i][PY] - xyz[j][PY]
      dz = xyz[i][PZ] - xyz[j][PZ]
      r  = math.sqrt(dx*dx + dy*dy + dz*dz)

      xyzF[i][fx] = xyzF[i][fx] + dx/(r*r*r)
      xyzF[i][fy] = xyzF[i][fy] + dy/(r*r*r)
      xyzF[i][fz] = xyzF[i][fz] + dz/(r*r*r)
      xyzF[j][fx] = xyzF[j][fx] - dx/(r*r*r)
      xyzF[j][fy] = xyzF[j][fy] - dy/(r*r*r)
      xyzF[j][fz] = xyzF[j][fz] - dz/(r*r*r)

  return xyzF

def wrapper(args):
  return find_pair_sub(*args)


def find_pair():
  global PN
  global combinum

  pw = combinum // core
  pl = combinum % core

  localt = 0
  thread = 0
  pre = 0
  #each thread work list
  worklist = []
  ppp = pw

  for i in range(PN) :
    if core == 1:
      worklist.append([pre,PN,thread])
      break

    localt = localt + (PN - i - 1)
    if localt >= ppp:
      worklist.append([pre,i,thread])
      ppp += pw
      thread += 1
      pre = i

  if i != pre:
    prep = worklist[thread-1][0]
    worklist[thread-1] = [prep,PN,thread-1]

  #make thread core num
  p = Pool(core)

  #start thread. results is callback in array.
  callback = p.map(wrapper, worklist)
  p.close()

  #summation each thread results
  for j in range(core):
    for i in range(PN):
      xyz[i][FX] += callback[j][i][0]
      xyz[i][FY] += callback[j][i][1]
      xyz[i][FZ] += callback[j][i][2]

def init_lattice():
  global xyz

  pnum = 0
  while pnum < PN:
    xyz[pnum][PX] = random.uniform(-1,1)
    xyz[pnum][PY] = random.uniform(-1,1)
    xyz[pnum][PZ] = random.uniform(-1,1)
    xyz[pnum][FX] = random.uniform(-1,1)
    xyz[pnum][FY] = random.uniform(-1,1)
    xyz[pnum][FZ] = random.uniform(-1,1)
    pnum += 1

if __name__ == "__main__":
  init_lattice()
  find_pair()

The process of brute force calculation by Coulomb force

The process of brute force calculation by Coulomb force is calculated as a matrix using numpy. In the calculation, each element is calculated in a small matrix and summed up at the end to enable parallel processing later.

import random
import math
import time
import numpy as np

np.set_printoptions(threshold=np.inf)
random.seed(1)

XX = 0
YY = 1
ZZ = 2

#total particle num
PN = 100
#---------------------------------------------------
#
# parameter : divide_N
#
# The number of particles per side of a small square 
# divided for the splitting process when creating a 
# large PN^2 region with all particles PN.
#
# The smaller the setting, the more it will be divided
# into many smaller jobs.
#---------------------------------------------------
divide_N = 3

xyzV = [[0 for i in range(3)] for j in range(PN)]
xyzP = np.zeros((PN,3))
xyzF = np.zeros((PN,3))

def find_pair():
  global PN
  global xyzF
  global xyzP

  remainder = PN %  divide_N
  quotient  = PN // divide_N

  if remainder != 0:                 #ex PN=11, divide_N=4, remainder=3, quotient=2
      quotient = quotient + 1        #   2 -> 3
  extra = divide_N - remainder       #   1

  X_st = 0
  X_ed = 0
  Y_st = 0
  Y_ed = 0

  sum_f = np.zeros((PN,PN,3))

  for Xn in range(0,quotient):
    for Yn in range(0,Xn+1):

      #----------------------------------------
      local_force = calc_XY_start_end(Xn, Yn, quotient, remainder, divide_N, extra)
      #----------------------------------------

      if Xn != Yn:
        sum_f = sum_f + local_force
        sum_f = sum_f - local_force.transpose(1,0,2)
      else:
        sum_f = sum_f + local_force

  print (sum_f)

  print("=====================================")
  mmm = sum_f.sum(axis=1)
  xyzF = xyzF + mmm


def calc_XY_start_end(Xn, Yn, quotient, remainder, divide_N, extra):
  global PN
  global xyzP

  if Xn != quotient-1 and remainder != 0 or remainder == 0:
    X_st = Xn*divide_N
    X_ed = (Xn+1)*divide_N-1
  else:
    X_st = Xn*divide_N - (divide_N - remainder)
    X_ed = (Xn+1)*divide_N  - (divide_N - remainder) -1
  if Yn != quotient-1 and remainder != 0 or remainder == 0:
    Y_st = Yn*divide_N
    Y_ed = (Yn+1)*divide_N-1
  else:
    Y_st = Yn*divide_N - (divide_N - remainder)
    Y_ed = (Yn+1)*divide_N  -(divide_N - remainder) -1

  Xa = np.arange(X_st,X_ed+1)
  Ya = np.arange(Y_st,Y_ed+1)
  mx1,my1 = np.meshgrid(Xa,Ya)
  local_xyz  = xyzP[mx1]-xyzP[my1]
  distance_a = np.linalg.norm(local_xyz, axis=2)

  tmp_d = distance_a[:,:,np.newaxis]
  tmp_f = local_xyz/tmp_d
  tmp_f = np.nan_to_num(tmp_f,0)

  #extra clear
  if remainder != 0:
    if Xn + 1 == quotient:
      for dd in range(0,extra):
        tmp_f[:,dd] = 0
    if Yn + 1 == quotient:
      for dd in range(0,extra):
        tmp_f[[dd]] = 0

  if X_st != 0:
    r2 = np.zeros((X_st,divide_N,3))
    tmp_f        = np.insert(tmp_f     ,0,r2,axis=1)
  if X_ed+1 != PN:
    r2 = np.zeros((PN - X_ed - 1,divide_N,3))
    tmp_f      = np.insert(tmp_f     ,X_ed + 1,r2,axis=1)

  if Y_st != 0:
    r2 = np.zeros((Y_st,PN,3))
    tmp_f      = np.insert(tmp_f     ,0,r2,axis=0)
  if Y_ed+1 != PN:
    r2 = np.zeros((PN - Y_ed - 1,PN,3))
    tmp_f      = np.insert(tmp_f     ,Y_ed + 1,r2,axis=0)

  #print (tmp_f)
  #print("===================================================")
  return tmp_f



def init_lattice():
  global xyzF
  global xyzP

  pnum = 0
  while pnum < PN:
    xyzP[pnum][XX] = random.uniform(-1,1)
    xyzP[pnum][YY] = random.uniform(-1,1)
    xyzP[pnum][ZZ] = random.uniform(-1,1)
    xyzF[pnum][XX] = random.uniform(-1,1)
    xyzF[pnum][YY] = random.uniform(-1,1)
    xyzF[pnum][ZZ] = random.uniform(-1,1)
    pnum += 1


def results_sum():
  global xyzF
  pnum = 0
  total_F = 0
  while pnum < PN:
    total_F = total_F + xyzF[pnum][XX]
    total_F = total_F + xyzF[pnum][YY]
    total_F = total_F + xyzF[pnum][ZZ]
    pnum += 1

  print (total_F)


if __name__ == "__main__":

  init_lattice()
  find_pair()
  results_sum()

Calculating long-range particle interactions with numpy (handmade mesh grid)

Too slow

import random
import math
import time
import numpy as np

np.set_printoptions(threshold=np.inf)
random.seed(1)

XX = 0
YY = 1
ZZ = 2

#total particle num
PN = 300

xyzV = [[0 for i in range(3)] for j in range(PN)]
xyzP = np.zeros((PN,3))
xyzF = np.zeros((PN,3))


def find_pair():
  global PN
  global xyzF
  global xyzP

  x1 = np.empty(0)
  y1 = np.empty(0)

  for mm in range(1,PN):
    for nn in range(mm,PN):
      x1 = np.append(x1,[nn],axis=0)
      y1 = np.append(y1,[mm-1],axis=0)

  x1 = x1.astype(np.int32)
  x1 = x1.reshape(x1.shape[0],1)
  y1 = y1.astype(np.int32)
  y1 = y1.reshape(y1.shape[0],1)

  dd = np.linalg.norm(xyzP[x1]-xyzP[y1], axis=2)

  distances = np.zeros((PN,PN))
  ly = 0
  lx = ly + 1
  for kk in range(0,dd.shape[0]):
    distances[lx][ly] = dd[kk]
    distances[ly][lx] = dd[kk]
    lx = lx + 1
    if lx == PN:
      ly = ly + 1
      lx = ly + 1

  tmp_index = np.arange(xyzP.shape[0])
  xx, yy = np.meshgrid(tmp_index, tmp_index)

  dxyz = xyzP[xx] - xyzP[yy]

  distances = distances[:,:,np.newaxis]
  tmp_force = dxyz/distances
  tmp_force = np.nan_to_num(tmp_force,0)

  mmm = tmp_force.sum(axis=1)
  xyzF = xyzF + mmm

def init_lattice():
  global xyzF
  global xyzP

  pnum = 0
  while pnum < PN:
    xyzP[pnum][XX] = random.uniform(-1,1)
    xyzP[pnum][YY] = random.uniform(-1,1)
    xyzP[pnum][ZZ] = random.uniform(-1,1)
    xyzF[pnum][XX] = random.uniform(-1,1)
    xyzF[pnum][YY] = random.uniform(-1,1)
    xyzF[pnum][ZZ] = random.uniform(-1,1)
    pnum += 1

def results_sum():
  #global xyzF
  pnum = 0
  total_F = 0
  while pnum < PN:
    total_F = total_F + xyzF[pnum][XX]
    total_F = total_F + xyzF[pnum][YY]
    total_F = total_F + xyzF[pnum][ZZ]
    pnum += 1

  print (total_F)


if __name__ == "__main__":

  init_lattice()
  find_pair()
  results_sum()

Calculating long-range particle interactions with numpy

import random
import math
import time
import numpy as np

#import scipy.misc as scm
from multiprocessing import Pool

np.set_printoptions(threshold=np.inf)
random.seed(1)

XX = 0
YY = 1
ZZ = 2

#total particle num
PN = 1000

xyzV = [[0 for i in range(3)] for j in range(PN)]
xyzP = np.zeros((PN,3))
xyzF = np.zeros((PN,3))

def find_pair():
  global PN
  global xyzF
  global xyzP

  tmp_index = np.arange(xyzP.shape[0])

  xx, yy = np.meshgrid(tmp_index, tmp_index)
  distances = np.linalg.norm(xyzP[xx]-xyzP[yy], axis=2)
  dxyz = xyzP[xx] - xyzP[yy]

  distances = distances[:,:,np.newaxis]
  tmp_force = dxyz/distances
  tmp_force = np.nan_to_num(tmp_force,0)

  mmm = tmp_force.sum(axis=1)
  xyzF = xyzF + mmm

def init_lattice():
  global xyzF
  global xyzP

  pnum = 0
  while pnum < PN:
    xyzP[pnum][XX] = random.uniform(-1,1)
    xyzP[pnum][YY] = random.uniform(-1,1)
    xyzP[pnum][ZZ] = random.uniform(-1,1)
    xyzF[pnum][XX] = random.uniform(-1,1)
    xyzF[pnum][YY] = random.uniform(-1,1)
    xyzF[pnum][ZZ] = random.uniform(-1,1)
    pnum += 1


def results_sum():
  #global xyzF
  pnum = 0
  total_F = 0
  while pnum < PN:
    total_F = total_F + xyzF[pnum][XX]
    total_F = total_F + xyzF[pnum][YY]
    total_F = total_F + xyzF[pnum][ZZ]
    pnum += 1

  print (total_F)


if __name__ == "__main__":

  init_lattice()
  find_pair()
  results_sum()

乱数を使って円周率もどきを求める2

先日試した乱数を使った円周率を求める方法を
numpyを使った方法に置き換えた。

import math
import time
import numpy as np


def calc_pi ( NN ):
  arr1 = np.random.rand(NN)
  arr2 = np.random.rand(NN)
  arr3 = np.sqrt(arr1 * arr1 + arr2 * arr2)
  arr3[arr3 > 1] = 0   #1より大きい要素は0に
  arr3[arr3 > 0] = 1   #0より大きい要素は1に
  in_n = np.sum(arr3)  #円内に入ったやつの合計

  return ( in_n / NN ) * 4


if __name__ == "__main__":

  for j in range (1,9):
    NN = 10 ** j
    start_time = time.perf_counter()
    pi = calc_pi(NN)
    execution_time = time.perf_counter() - start_time
    delt = math.fabs(math.pi - pi)
    print (str(NN).rjust(10),"  ",f'{pi:11.010f}',"  ",f'{delt:7.06f}',"  ",execution_time)

実行結果

   試行回数        結果      誤差     処理時間[sec]
        10    2.8000000000    0.341593    8.769400028540986e-05
       100    3.1200000000    0.021593    4.6955000016168924e-05
      1000    3.1960000000    0.054407    5.6955000218295027e-05
     10000    3.1420000000    0.000407    0.0003660469997157634
    100000    3.1376400000    0.003953    0.0036325510000096983
   1000000    3.1415240000    0.000069    0.03182136600025842
  10000000    3.1416492000    0.000057    0.2625446400002147
 100000000    3.1416174400    0.000025    2.535640128000068

numpy使わずにwhileで計算していたまえの処理に較べて15倍位早い結果。全て配列に収めてから処理するとさすがに早い。ただしメモリもバカ喰い。

画像に対するフーリエ変換の物理的意味を考える

対象 説明
信号 縦軸を振幅、横軸を時間とした波形情報を周波数の分布に変換する
画像 縦軸を1ピクセルあたりの画素強度、横軸を座標として、ピクセル間の強度値変化を周波数分布に変換する

画像に対するフーリエ変換のピクセルあたりの画素強度の例として、1×7サイズのグレースケール画像を考える。例えば下記のようなもの(ここでは視認性のため、拡大した画像で表示)。

2dfft_row_column

画素強度値は明るいほど値が大きい。つまりグレースケールの255階調で考えれば、白い箇所は255、黒い箇所は0の値となる。先ほどの画像にピクセル毎に強度値を重ねて表示すると下記のような値を持つ。

2dfft_row_column

上記画像の強度値を、縦軸を1ピクセルあたりの画素強度、横軸を座標として表しなおしたグラフが下記。画像に対するフーリエ変換はこのグラフに対して行っていることになる
pixel peak

ただし通常画像はもっと大きなサイズであり、1ピクセル高ということはない。そのため、実際には各ピクセル行に対してフーリエ変換を行う。各行へのフーリエ変換の実施後はデータを転置し、転置後のデータに対して再度各行に対してフーリエ変換を実施する。その後再度転置を行いデータの向きを元に戻す作業を行う。これで画像に対する2次元フーリエ変換となる。
2dfft_row_column

ここまでの内容をpythonコードにしたものが下記。

import cv2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

image = cv2.imread("mt_fuji.jpg", 0)
h, w = image.shape

ff = np.zeros(image.shape, dtype=complex)
for i in range(h):
  ff[i] = np.fft.fft(image[i])
ff = ff.T
for i in range(w):
  ff[i] = np.fft.fft(ff[i])
ff = ff.T

fimg = np.fft.fftshift(ff)
mag = 20*np.log(np.abs(fimg))

dst = np.fft.ifft2(ff)
dst = dst.real
dst = np.uint8(dst)

plt.subplot(221)
plt.title("original")
plt.imshow(image, cmap = 'gray')
plt.subplot(222)
plt.title("power spectrum")
plt.imshow(mag, cmap = 'gray')
plt.subplot(223)
plt.title("fft and invers fft result")
plt.imshow(dst, cmap = 'gray')
plt.show()

2dfft_row_column

numpyとopenCVを使った画像のフーリエ変換と逆変換

openCVを使い画像読み込み、fftで周波数データに変換。その後逆変換で元の画像に戻すテスト。

入力に使用する画像は↓。サイズは360×240。
input data to fft

まず画像入力、グレースケールで取り込む。

###########
# code 1
###########
import cv2
from PIL import Image

image = cv2.imread("mt_fuji.jpg", cv2.IMREAD_GRAYSCALE)
cv2.imshow("original",image)
cv2.waitKey(0)
cv2.destroyWindow()

実行すると元の画像がグレースケールで表示される。
imreadは引数2つ、1つ目は読み込みファイル、2つ目は読み込みオプションで以下3つ(1,0,-1でも指定可)。

  cv2.IMREAD_COLOR (or 1)
  cv2.IMREAD_GRAYSCALE (or 0)
  cv2.IMREAD_UNCHANGED (or -1)

続けてFFT実施

###########
# code 2
###########
import cv2
import numpy as np
from PIL import Image

image = cv2.imread("mt_fuji.jpg", cv2.IMREAD_GRAYSCALE)
#cv2.imshow("original",image)
#cv2.waitKey(0)
#cv2.destroyWindow()

fimage = np.fft.fft2(image)
print fimage
print fimage.shape

np.fft.fft2()は2次元FFT。
実行結果が下記。入力が360×240画像だったので、360×240のnumpy arrayで返ってくる。

[[ 1.20523050e+07      +0.        j -3.67019062e+05 +764482.77644721j
   8.97803335e+04 +528353.446206  j ... -8.56635235e+04 -151789.8195801 j
   8.97803335e+04 -528353.446206  j -3.67019062e+05 -764482.77644721j]
 [-5.06881547e+05-1884930.25115523j -4.50139049e+04 -190712.10861819j
  -1.76090294e+05 -185634.30113876j ...  8.13464047e+04  +33421.45807695j
   1.54174557e+05 +151124.32767313j  2.22512389e+05 +112510.91249437j]
 [-2.70716797e+05 +598066.58169905j  2.74010196e+05  +76140.89477773j
   3.81093895e+04 -142669.2043982 j ...  7.45069646e+04  -13227.88422994j
   2.39614427e+03 -107962.49955725j -1.21014694e+05  +16070.44342708j]
 ...
 [-4.93325081e+05+1019497.68318583j  1.10749640e+05 -185659.62071232j
  -2.02180759e+04 -244150.48215119j ... -7.68268395e+03   +7667.31543078j
   1.36815674e+05   -2094.526859  j  4.18478921e+04  -76427.43943856j]
 [-2.70716797e+05 -598066.58169905j -1.21014694e+05  -16070.44342708j
   2.39614427e+03 +107962.49955725j ... -5.75415408e+02  +50912.11759434j
   3.81093895e+04 +142669.2043982 j  2.74010196e+05  -76140.89477773j]
 [-5.06881547e+05+1884930.25115523j  2.22512389e+05 -112510.91249437j
   1.54174557e+05 -151124.32767313j ... -4.22761652e+04  +70475.447572  j
  -1.76090294e+05 +185634.30113877j -4.50139049e+04 +190712.10861819j]]
(240, 360)

変換結果をパワースペクトルで確認してみる。確認に使用したコードが下記。

###########
# code 3
###########
import cv2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

image = cv2.imread("mt_fuji.jpg", cv2.IMREAD_GRAYSCALE)
#cv2.imshow("original",image)
#cv2.waitKey(0)
#cv2.destroyWindow()

fimage = np.fft.fft2(image)
#print fimage
#print fimage.shape

# Replace quadrant
# 1st <-> 3rd, 2nd <-> 4th
fimg =  np.fft.fftshift(fimage)
# Power spectrum calculation
mag = 20*np.log(np.abs(fimg))
plt.subplot(121)
plt.imshow(image,cmap = 'gray')
plt.subplot(122)
plt.imshow(mag,cmap = 'gray')
plt.show()

実行結果下記
input data to fft

最後に逆変換で元の画像に戻すコード。注意として変換後の結果には複素数成分が含まれているので、実部を取り出す処理が必要。またグレースケールで表示するために0-255階調への値調整が必要になる。

###########
# code 4
###########
import cv2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

image = cv2.imread("mt_fuji.jpg", cv2.IMREAD_GRAYSCALE)
#cv2.imshow("original",image)
#cv2.waitKey(0)
#cv2.destroyWindow()

fimage = np.fft.fft2(image)
#print fimage
#print fimage.shape

ifimage = np.fft.ifft2(fimage)
# Extract real part
ifimage = ifimage.real
# Convert to 255 tones for gray scale
ifimage = np.uint8(ifimage)

cv2.imshow("fft and ifft",ifimage)
cv2.waitKey(0)
cv2.destroyWindow()

実行結果が下記グレースケールでもとの画像に戻ってきた。
results fft and inv fft

Read images using openCV, convert to frequency data with fft. And then back to the original image with reverse transformation.
Code 1 is reading image by gray scale.
Code 2 is 2D fft by numpy. Frequency distribution is returned.
Code 3 is checking Power spectrum.
Code 4 is invers Fourie by numpy.

pythonのmultiprocessingを使いクーロン力の並列計算をするテスト

前回クーロン力計算部分の高速化を検討した。結果としてはシンプルに2重ループを使った計算。今回はこのクーロン力計算部分をmultiprocessingを使ったマルチプロセスによる並列化で高速化を試してみる。

作成したコードが下記。結果は2.09[sec]。使用coreは4スレッド仕様なので、4プロセスでの処理を行った。前回の処理時間が約8秒であったのを考えると、理想どおり4分の1になる結果が得られた。


###########################
# multiprocessing test ####
###########################
import random
import math
import time
import scipy.special as scm
from multiprocessing import Pool

random.seed(1)

PX = 0;PY = 1;PZ = 2;
VX = 3;VY = 4;VZ = 5;
FX = 6;FY = 7;FZ = 8;

#number of particles in a line
line_num = 15

#total particle num
PN = line_num * line_num * line_num

#ready to 9 parameters for particle 
#(PX, PY, PZ, VX, VY, VZ, FX, FY, FZ)
xyz = [[0 for i in range(9)] for j in range(PN)]

#Number of combinations of coulomb force calculation
combinum = int(scm.comb(PN, 2))

#thread number(local thread num)
core = 4

def find_pair_sub(prep,pend,thread):
  global xyz

  #local results array
  xyzF = [[0 for i in range(3)] for j in range(PN)]
  fx = 0; fy = 1; fz = 2

  for i in range(prep,pend):
    for j in range(i + 1, PN):
      dx = xyz[i][PX] - xyz[j][PX]
      dy = xyz[i][PY] - xyz[j][PY]
      dz = xyz[i][PZ] - xyz[j][PZ]
      r  = math.sqrt(dx*dx + dy*dy + dz*dz)

      xyzF[i][fx] = xyzF[i][fx] + dx/(r*r*r)
      xyzF[i][fy] = xyzF[i][fy] + dy/(r*r*r)
      xyzF[i][fz] = xyzF[i][fz] + dz/(r*r*r)
      xyzF[j][fx] = xyzF[j][fx] - dx/(r*r*r)
      xyzF[j][fy] = xyzF[j][fy] - dy/(r*r*r)
      xyzF[j][fz] = xyzF[j][fz] - dz/(r*r*r)

  return xyzF

def wrapper(args):
  return find_pair_sub(*args)


def find_pair():
  global PN
  global combinum

  pw = combinum // core
  pl = combinum % core

  localt = 0
  thread = 0
  pre = 0
  #each thread work list
  worklist = []
  ppp = pw

  for i in range(PN) :
    if core == 1:
      worklist.append([pre,PN,thread])
      break

    localt = localt + (PN - i - 1)
    if localt >= ppp:
      worklist.append([pre,i,thread])
      ppp += pw
      thread += 1
      pre = i

  if i != pre:
    prep = worklist[thread-1][0]
    worklist[thread-1] = [prep,PN,thread-1]

  #make thread core num
  p = Pool(core)

  #start thread. results is callback in array.
  callback = p.map(wrapper, worklist)
  p.close()

  #summation each thread results
  for j in range(core):
    for i in range(PN):
      xyz[i][FX] += callback[j][i][0]
      xyz[i][FY] += callback[j][i][1]
      xyz[i][FZ] += callback[j][i][2]

def init_lattice():
  global xyz

  pnum = 0
  while pnum < PN:
    xyz[pnum][PX] = random.uniform(-1,1)
    xyz[pnum][PY] = random.uniform(-1,1)
    xyz[pnum][PZ] = random.uniform(-1,1)
    xyz[pnum][FX] = random.uniform(-1,1)
    xyz[pnum][FY] = random.uniform(-1,1)
    xyz[pnum][FZ] = random.uniform(-1,1)
    pnum += 1

if __name__ == "__main__":
  init_lattice()
  find_pair()

■結果 [Results summary]

code type 時間[sec]
multiprocessing (4core) 2.09 <-(new)
itertools使用 (no1) 8.18
range記述 (no2) 7.93
xrange記述 (no3) 7.89
ループ内周でnumpy使用 (no4) 78.46

We examined the speedup of the previous Coulomb force calculation part. As a result, more fast results calculation was using a simple double loop. In this time, I will try speeding up by multiprocessing parallel processing of this Coulomb force calculation part.

The code you created is below. The result is 2.09 [sec]. Since core used is 4 thread specification, processing in 4 processes was done. Considering that the last processing time was about 8 seconds, the result was 1/4 as ideal.