thread化 – Python in the box

以前行った、multiprocessingでのクーロン力並列計算に引き続き、threadingを使ったクーロン力の並列計算をテストする。計算条件はmultiprocessingの時と同じくcore数を4、計算粒子数を15^3個とした。

結果、計算時間は14.36[sec]。multiprocessingでの計算は2.09[sec]であったため、大分時間がかかっている。それどころか、シングルスレッドで計算していた時よりも時間がかかっている。CPythonではGIL(グローバルインタプリタロック)により、同時にPythonバイトコードを実行できるスレッドは1つだけである。そのためCPUバウンドな計算はthread化しても並列には実行されず、ロックの取り合いの分だけ遅くなっているのだと考えられる。試しにthread数を1にして実行してみると処理時間は6.92[sec]。thread化しない方が高速であった。

■結果　[Results summary]

code type	時間[sec]
threading (4threads)	14.36 <-(new!)
threading (1thread)	6.92 <-(new!)
multiprocessing	2.09
itertools使用 (no1)	8.18
range記述 (no2)	7.93
xrange記述 (no3)	7.89
ループ内周でnumpy使用 (no4)	78.46

■使用したコードは下記(use 4 threads)

###########################
# threading test ##########
###########################
import random
import math
import itertools
import time
import scipy.misc as scm
import numpy as np
import threading
from Queue import Queue

# Fixed seed so every run draws the same random particle values.
random.seed(1)

# Column indices into one particle row of `xyz`.
PX, PY, PZ = 0, 1, 2  # position columns (read by the force calculation)
VX, VY, VZ = 3, 4, 5  # not used in this script; presumably velocity
FX, FY, FZ = 6, 7, 8  # columns the force terms are accumulated into

#number of particles in a line
line_num = 15

#total particle num
PN = line_num * line_num * line_num

#ready to 9 parameters for particle
#(PX, PY, PZ, VX, VY, VZ, FX, FY, FZ)
# Each row is its own list, so particles never share storage.
xyz = [[0] * 9 for _ in range(PN)]

#Number of combinations of coulomb force calculation: C(PN, 2).
# Integer arithmetic is exact and does not depend on scipy.misc.comb,
# which current SciPy releases no longer provide.
combinum = PN * (PN - 1) // 2

#thread number(local thread num)
# The write-up benchmarks this listing with 4 threads.
core = 4

def find_pair_sub(prep,pend,thread,q):
  """Accumulate the pairwise force terms for outer rows prep..pend-1.

  For every i in [prep, pend) and every j in (i, PN), the displacement
  between particles i and j (read from the module-level ``xyz`` table) is
  divided by the cube of their distance; the term is added to particle i
  and subtracted from particle j.

  prep, pend -- half-open range of outer-loop particle indices
  thread     -- chunk id passed in by the caller; not used here
  q          -- queue that receives the PN x 3 table of partial sums
  """
  # local results array: one [x, y, z] accumulator per particle
  partial = [[0, 0, 0] for _ in range(PN)]

  for i in xrange(prep, pend):
    pos_i = xyz[i]
    acc_i = partial[i]
    for j in xrange(i + 1, PN):
      pos_j = xyz[j]
      acc_j = partial[j]

      dx = pos_i[PX] - pos_j[PX]
      dy = pos_i[PY] - pos_j[PY]
      dz = pos_i[PZ] - pos_j[PZ]
      r = math.sqrt(dx*dx + dy*dy + dz*dz)
      r3 = r*r*r

      # Equal and opposite contributions for the two particles.
      gx = dx/r3
      gy = dy/r3
      gz = dz/r3
      acc_i[0] += gx
      acc_i[1] += gy
      acc_i[2] += gz
      acc_j[0] -= gx
      acc_j[1] -= gy
      acc_j[2] -= gz

  q.put(partial)

def find_pair():
  """Compute all pairwise force terms with worker threads and add them to xyz.

  The outer-loop rows 0..PN-1 are cut into consecutive chunks that each hold
  roughly combinum // core pairs. One thread runs find_pair_sub per chunk,
  and the partial tables the workers put on a shared queue are summed into
  the FX, FY, FZ columns of the module-level ``xyz`` table.

  Raises RuntimeError if a worker ended without delivering its partial table.
  """
  q = Queue()

  # Work list entries are [first_row, end_row, chunk_id], end_row exclusive.
  worklist = []
  if core == 1:
    worklist.append([0, PN, 0])
  else:
    pw = combinum // core  # target number of pairs per chunk
    ppp = pw               # cumulative pair count that closes the current chunk
    localt = 0             # pairs contributed by rows 0..i so far
    pre = 0                # first row of the chunk being built
    for i in range(PN):
      localt += PN - i - 1  # row i pairs with every later particle
      if localt >= ppp:
        worklist.append([pre, i, len(worklist)])
        ppp += pw
        pre = i
    # Rows after the last cut belong to no chunk yet; extend the final
    # chunk so that it runs to the end of the table.
    if worklist and pre != PN - 1:
      worklist[-1][1] = PN

  # Start one thread per chunk actually produced. That is `core` chunks in
  # the usual case, but for small PN the cut can yield a different count,
  # and indexing the list by range(core) would then fail or drop rows.
  # Keep our own handles: threading.enumerate() only lists threads that are
  # still alive (and any unrelated ones), so it cannot be relied on here.
  threads = []
  for prep, pend, chunk_id in worklist:
    worker = threading.Thread(target=find_pair_sub, args=(prep, pend, chunk_id, q))
    worker.start()
    threads.append(worker)

  for worker in threads:
    worker.join()

  # Every producer has finished, so qsize() is exact and get() cannot block.
  results = [q.get() for _ in range(q.qsize())]
  if len(results) != len(threads):
    raise RuntimeError("%d of %d worker threads returned no result"
                       % (len(threads) - len(results), len(threads)))

  # Fold each worker's partial sums into the force columns.
  for partial in results:
    for i in range(PN):
      row = xyz[i]
      part = partial[i]
      row[FX] += part[0]
      row[FY] += part[1]
      row[FZ] += part[2]

def init_lattice():
  """Fill the particle table with random starting values.

  Each of the PN rows of the module-level ``xyz`` table receives six draws
  from random.uniform(-1, 1), stored in order into the PX, PY, PZ, FX, FY
  and FZ columns. The VX, VY, VZ columns are left untouched.
  """
  # NOTE(review): the force columns, not the velocity columns, are seeded
  # with random values here -- confirm this is intended.
  seeded_columns = (PX, PY, PZ, FX, FY, FZ)
  for pnum in range(PN):
    row = xyz[pnum]
    for column in seeded_columns:
      row[column] = random.uniform(-1,1)

# Script entry point: fill the particle table with random values, then run
# the threaded pairwise force calculation over it.
if __name__ == "__main__":
  init_lattice()
  find_pair()

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

###########################

# threading test ##########

###########################

import random

import math

import itertools

import time

import scipy.misc as scm

import numpy as np

import threading

from Queue import Queue

# Fixed seed so every run draws the same random particle values.
random.seed(1)

# Column indices into one particle row of `xyz`.
PX, PY, PZ = 0, 1, 2  # position columns (read by the force calculation)
VX, VY, VZ = 3, 4, 5  # not used in this script; presumably velocity
FX, FY, FZ = 6, 7, 8  # columns the force terms are accumulated into

#number of particles in a line
line_num = 15

#total particle num
PN = line_num * line_num * line_num

#ready to 9 parameters for particle
#(PX, PY, PZ, VX, VY, VZ, FX, FY, FZ)
# Each row is its own list, so particles never share storage.
xyz = [[0] * 9 for _ in range(PN)]

#Number of combinations of coulomb force calculation: C(PN, 2).
# Integer arithmetic is exact and does not depend on scipy.misc.comb,
# which current SciPy releases no longer provide.
combinum = PN * (PN - 1) // 2

#thread number(local thread num)
# The write-up benchmarks this listing with 4 threads.
core = 4

def find_pair_sub(prep,pend,thread,q):
  """Accumulate the pairwise force terms for outer rows prep..pend-1.

  For every i in [prep, pend) and every j in (i, PN), the displacement
  between particles i and j (read from the module-level ``xyz`` table) is
  divided by the cube of their distance; the term is added to particle i
  and subtracted from particle j.

  prep, pend -- half-open range of outer-loop particle indices
  thread     -- chunk id passed in by the caller; not used here
  q          -- queue that receives the PN x 3 table of partial sums
  """
  # local results array: one [x, y, z] accumulator per particle
  partial = [[0, 0, 0] for _ in range(PN)]

  for i in xrange(prep, pend):
    pos_i = xyz[i]
    acc_i = partial[i]
    for j in xrange(i + 1, PN):
      pos_j = xyz[j]
      acc_j = partial[j]

      dx = pos_i[PX] - pos_j[PX]
      dy = pos_i[PY] - pos_j[PY]
      dz = pos_i[PZ] - pos_j[PZ]
      r = math.sqrt(dx*dx + dy*dy + dz*dz)
      r3 = r*r*r

      # Equal and opposite contributions for the two particles.
      gx = dx/r3
      gy = dy/r3
      gz = dz/r3
      acc_i[0] += gx
      acc_i[1] += gy
      acc_i[2] += gz
      acc_j[0] -= gx
      acc_j[1] -= gy
      acc_j[2] -= gz

  q.put(partial)

def find_pair():
  """Compute all pairwise force terms with worker threads and add them to xyz.

  The outer-loop rows 0..PN-1 are cut into consecutive chunks that each hold
  roughly combinum // core pairs. One thread runs find_pair_sub per chunk,
  and the partial tables the workers put on a shared queue are summed into
  the FX, FY, FZ columns of the module-level ``xyz`` table.

  Raises RuntimeError if a worker ended without delivering its partial table.
  """
  q = Queue()

  # Work list entries are [first_row, end_row, chunk_id], end_row exclusive.
  worklist = []
  if core == 1:
    worklist.append([0, PN, 0])
  else:
    pw = combinum // core  # target number of pairs per chunk
    ppp = pw               # cumulative pair count that closes the current chunk
    localt = 0             # pairs contributed by rows 0..i so far
    pre = 0                # first row of the chunk being built
    for i in range(PN):
      localt += PN - i - 1  # row i pairs with every later particle
      if localt >= ppp:
        worklist.append([pre, i, len(worklist)])
        ppp += pw
        pre = i
    # Rows after the last cut belong to no chunk yet; extend the final
    # chunk so that it runs to the end of the table.
    if worklist and pre != PN - 1:
      worklist[-1][1] = PN

  # Start one thread per chunk actually produced. That is `core` chunks in
  # the usual case, but for small PN the cut can yield a different count,
  # and indexing the list by range(core) would then fail or drop rows.
  # Keep our own handles: threading.enumerate() only lists threads that are
  # still alive (and any unrelated ones), so it cannot be relied on here.
  threads = []
  for prep, pend, chunk_id in worklist:
    worker = threading.Thread(target=find_pair_sub, args=(prep, pend, chunk_id, q))
    worker.start()
    threads.append(worker)

  for worker in threads:
    worker.join()

  # Every producer has finished, so qsize() is exact and get() cannot block.
  results = [q.get() for _ in range(q.qsize())]
  if len(results) != len(threads):
    raise RuntimeError("%d of %d worker threads returned no result"
                       % (len(threads) - len(results), len(threads)))

  # Fold each worker's partial sums into the force columns.
  for partial in results:
    for i in range(PN):
      row = xyz[i]
      part = partial[i]
      row[FX] += part[0]
      row[FY] += part[1]
      row[FZ] += part[2]

def init_lattice():
  """Fill the particle table with random starting values.

  Each of the PN rows of the module-level ``xyz`` table receives six draws
  from random.uniform(-1, 1), stored in order into the PX, PY, PZ, FX, FY
  and FZ columns. The VX, VY, VZ columns are left untouched.
  """
  # NOTE(review): the force columns, not the velocity columns, are seeded
  # with random values here -- confirm this is intended.
  seeded_columns = (PX, PY, PZ, FX, FY, FZ)
  for pnum in range(PN):
    row = xyz[pnum]
    for column in seeded_columns:
      row[column] = random.uniform(-1,1)

# Script entry point: fill the particle table with random values, then run
# the threaded pairwise force calculation over it.
if __name__ == "__main__":
  init_lattice()
  find_pair()

Following the earlier parallel Coulomb-force calculation with multiprocessing, this post tests the same calculation parallelized with threading. The conditions are the same as in the multiprocessing run: the number of cores was set to 4 and the number of particles to 15^3.

As a result, the calculation took 14.36 [sec]. The multiprocessing version took 2.09 [sec], so threading is far slower; it is even slower than the single-threaded versions. The likely cause is CPython's global interpreter lock (GIL): only one thread can execute Python bytecode at a time, so these CPU-bound threads cannot run in parallel and merely add lock contention. Running the same code with a single thread took 6.92 [sec]; it was faster not to use threads at all.

スレッドを発行して並列化するテスト。スレッドの開始はstart()、終了待ちはjoin()で行う。すべてのスレッドの終了をチェックするためには、現在発行中の全スレッドをリストアップするenumerate()を使う。ただしenumerate()が返すのは呼び出した時点で生存しているスレッドだけなので、その時点までに終了したスレッドはリストに含まれない点に注意が必要である。またこのリストの中にはメインスレッドも含まれているので、メインスレッドは抜く必要がある。メインスレッドはcurrentThread()であらわせるので、これをremoveで取り除く。取り除いた結果に対して、join()で終了待ちし、全スレッドが終了したら終了メッセージを出す。各スレッドからの返り値はQueueを使って貯めることができる。貯めたいqueueを引数に渡してやることで、スレッド終了後に全スレッドの結果を読み出すことができる。

import threading, time
import random
from Queue import Queue

def ppp(a, q):
    c = random.uniform(0,5)
    print "thread = ",a," rand = ",c
    time.sleep(c)
    q.put(c)
    print "end thread = ",a

if __name__ == "__main__":
  q = Queue()
  results = []
  for i in range(8):
    thread = threading.Thread(target=ppp, args=(i, q))  # Initialize
    thread.start() # Start


  thread_list = threading.enumerate()
  main_thread = threading.currentThread()
  thread_list.remove(main_thread)
  for thread in thread_list:
    thread.join()
    results.append(q.get())

  print results
  print "end"

import threading, time

import random

from Queue import Queue

def ppp(a, q):

c = random.uniform(0,5)

print "thread = ",a," rand = ",c

time.sleep(c)

q.put(c)

print "end thread = ",a

if __name__ == "__main__":

q = Queue()

results = []

for i in range(8):

thread = threading.Thread(target=ppp, args=(i, q)) # Initialize

thread.start() # Start

thread_list = threading.enumerate()

main_thread = threading.currentThread()

thread_list.remove(main_thread)

for thread in thread_list:

thread.join()

results.append(q.get())

print results

print "end"

■実行結果
thread = 0 rand = 1.97370804812
thread = 1 rand = 3.45525477664
thread = 2 rand = 0.651252139145
thread = 3 rand = 3.96551177159
thread = 4 rand = 3.57845628232
thread = 5 rand = 1.81758678574
thread = 6 rand = 2.31490107118
thread = 7 rand = 1.21471678276
end thread = 2
end thread = 7
end thread = 5
end thread = 0
end thread = 6
end thread = 1
end thread = 4
end thread = 3
[0.6512521391445797, 1.2147167827645151, 1.817586785739616, 1.9737080481200742, 2.3149010711817626, 3.4552547766386787, 3.5784562823201704, 3.9655117715879653]
end

A test that starts several threads and runs them in parallel. A thread is started with start() and waited for with join(). To check that all threads have finished, use enumerate(), which lists all threads that are currently running. Note that enumerate() returns only the threads that are still alive at the moment it is called, so a thread that has already finished is not in the list. The list also includes the main thread, so the main thread has to be taken out: it can be obtained with currentThread() and removed with remove(). Call join() on each remaining thread to wait for it, and print a completion message once all threads have finished. The return value of each thread can be collected with a Queue: pass the queue to each thread as an argument, and the results of all threads can be read from it after the threads end.

カテゴリー: thread化

pythonのthreadingを使いクーロン力の並列計算をするテスト

マルチスレッドのテスト