CUDA × OpenGL:格子ボルツマン法(LBM)によるリアルタイム流体シミュレーターの構築

GPUの演算能力を、物理シミュレーションとグラフィックスの融合に活用してみました。 本プロジェクトでは、流体解析の手法である「格子ボルツマン法(LBM: Lattice Boltzmann Method)」をCUDAで実装し、OpenGLを用いてその動態をリアルタイムに可視化するシミュレーターを作成しました。

中心には複雑な干渉を生む「星型」の障害物を配置。マウスドラッグによる滑らかな波紋の生成や、障害物による波の回折・干渉を、512×512のグリッド上で高フレームレートにシミュレートします。物理演算と描画の両方をGPU内で完結させることで、CPUでは到底不可能なリアルタイム性を実現しています。


技術詳細解説

本プログラムは、大きく分けて**「物理演算(LBM)」「パルス注入(入力)」「可視化」**の3つのフェーズで構成されています。

1. 格子ボルツマン法 (D2Q9モデル)

流体を微視的な粒子の集まりとして捉え、その速度分布関数の変化を計算します。

  • D2Q9モデル: 各格子点は9つの速度方向(静止、上下左右、斜め)を持ちます。

  • ストリーミング(移動): 隣接する格子から粒子が移動してくるステップです。本コードでは、境界条件(星型の壁)に衝突した粒子を跳ね返す「Bounce-back法」を実装しています。

  • 衝突(緩和): 粒子同士の衝突により、局所的な平衡状態へ近づくステップです。定数 omega(緩和時間)が流体の粘性を決定します。

2. 滑らかなマウス入力(線形補間パルス)

マウスドラッグの軌跡を滑らかに表現するため、単なる点ではなく「線分」としてエネルギーを注入しています。

  • 線分最短距離計算: 前フレームと現フレームの座標を結ぶ線分から、各格子点への最短距離を計算します。

  • ガウス分布による注入: 距離に応じた重み付け(expf)を行い、密度を加算することで、航跡のような自然な波紋を生成します。

3. GPUダイレクト可視化

演算結果をGPU内で直接ピクセルデータへ変換しています(なお現在の実装では、生成した画像バッファを毎フレーム一度ホストへコピーしてからOpenGLテクスチャへ転送しています。CUDA-OpenGL相互運用(PBO)を使えばこのコピーも省略できます)。

  • 物理量のカラーマッピング: 格子点ごとの流速(ux, uy)と密度(rho)の変化を色相に変換。tanhf 関数を用いることで、激しい変化もサチュレーションを起こさず美しく描写します。

  • テクスチャ共有: CUDAで計算した画像バッファをOpenGLのテクスチャとしてバインドし、1枚の板ポリゴンに貼り付けて表示しています。

    コンパイルコマンド
    nvcc lbm_cuda_opengl.cu -o lbm_sim -arch=sm_61 -lGLEW -lGLU -lGL -lglut -lm
    
    #include <GL/glew.h>
    #include <GL/freeglut.h>
    #include <cuda_runtime.h>
    #include <vector>
    #include <math.h>
    #include <stdio.h>
    
    // --- Simulation configuration and shared state ---
    const int WIDTH = 512, HEIGHT = 512, Q = 9;  // grid size and D2Q9 direction count
    int win_w = WIDTH, win_h = HEIGHT;           // current window size (updated on reshape)
    const float omega = 0.7f;                    // BGK relaxation rate; sets the viscosity
    
    // Mouse drag state, in grid coordinates (y flipped from window coords).
    bool is_dragging = false;
    int prev_mx = -1, prev_my = -1, cur_mx = -1, cur_my = -1;
    
    // Device buffers: two distribution fields (ping-pong), velocity, density,
    // obstacle mask, and the rendered RGBA image.
    float *d_f1, *d_f2, *d_ux, *d_uy, *d_rho;
    int *d_boundary;
    uchar4 *d_img;
    std::vector<uchar4> h_output_buffer(WIDTH * HEIGHT);  // host staging buffer for the texture upload
    GLuint tex;
    
    // D2Q9 lattice: direction vectors and equilibrium weights
    // (index 0 = rest, 1-4 = axis-aligned, 5-8 = diagonals).
    __device__ const int d_cx[9] = {0, 1, 0, -1, 0, 1, -1, -1, 1};
    __device__ const int d_cy[9] = {0, 0, 1, 0, -1, 1, 1, -1, -1};
    __device__ const float d_w[9] = {4/9.f, 1/9.f, 1/9.f, 1/9.f, 1/9.f, 1/36.f, 1/36.f, 1/36.f, 1/36.f};
    const float h_w[9] = {4/9.f, 1/9.f, 1/9.f, 1/9.f, 1/9.f, 1/36.f, 1/36.f, 1/36.f, 1/36.f};  // host copy of d_w for reset()
    
    // --- Pulse injection: add density along the drag segment (x1,y1)-(x2,y2) ---
    // One thread per grid cell: each cell projects itself onto the segment,
    // takes a Gaussian weight of its distance, and adds `mag` worth of density
    // split across the 9 directions by the equilibrium weights d_w.
    __global__ void pulse_line_kernel(float *f, int x1, int y1, int x2, int y2, float mag) {
        int x = blockIdx.x * blockDim.x + threadIdx.x, y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x >= WIDTH || y >= HEIGHT) return;
        float dx = (float)(x2 - x1), dy = (float)(y2 - y1);
        // Clamped projection parameter onto the segment; 1e-8f guards against
        // division by zero when the segment degenerates to a single point.
        float t = fmaxf(0.0f, fminf(1.0f, ((x - x1) * dx + (y - y1) * dy) / (dx * dx + dy * dy + 1e-8f)));
        float px = x1 + t * dx, py = y1 + t * dy;
        float dist2 = (x - px) * (x - px) + (y - py) * (y - py);
        float weight = expf(-dist2 / 200.0f);  // Gaussian falloff from the segment
        if (weight > 0.01f) {  // skip cells that would receive a negligible bump
            for (int q = 0; q < Q; q++) f[q * WIDTH * HEIGHT + y * WIDTH + x] += d_w[q] * weight * mag;
        }
    }
    
    // --- One LBM step: pull-streaming + bounce-back + BGK collision ---
    // Reads fin, writes fout (ping-pong buffers; the caller swaps them).
    // Field layout is planar: f[q * WIDTH * HEIGHT + y * WIDTH + x].
    // Streaming indices wrap periodically; the solid outer-wall cells in
    // `bound` are what actually confine the fluid.
    __global__ void lbm_kernel(float *fin, float *fout, float *rho, float *ux, float *uy, const int *bound) {
        int x = blockIdx.x * blockDim.x + threadIdx.x, y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x >= WIDTH || y >= HEIGHT) return;
        int idx = y * WIDTH + x;
        if (bound[idx]) return;  // solid cells carry no fluid state
    
        float f[9], r = 0, vx = 0, vy = 0;
        int opp[9] = {0, 3, 4, 1, 2, 7, 8, 5, 6};  // opposite of each direction q, for bounce-back
        for (int q = 0; q < Q; q++) {
            // Pull streaming: take the population that moved here from the
            // upstream cell; if that cell is solid, reflect this cell's own
            // opposite population instead (bounce-back boundary condition).
            int sx = (x - d_cx[q] + WIDTH) % WIDTH, sy = (y - d_cy[q] + HEIGHT) % HEIGHT;
            f[q] = bound[sy * WIDTH + sx] ? fin[opp[q] * WIDTH * HEIGHT + idx] : fin[q * WIDTH * HEIGHT + sy * WIDTH + sx];
            r += f[q]; vx += d_cx[q] * f[q]; vy += d_cy[q] * f[q];
        }
        // Macroscopic density and velocity (1e-8f avoids division by zero).
        rho[idx] = r; ux[idx] = vx / (r + 1e-8f); uy[idx] = vy / (r + 1e-8f);
        // BGK collision: relax each population toward its second-order
        // equilibrium; omega controls the relaxation rate (viscosity).
        for (int q = 0; q < Q; q++) {
            float cu = 3.f * (d_cx[q] * ux[idx] + d_cy[q] * uy[idx]);
            float feq = d_w[q] * r * (1.f + cu + 0.5f * cu * cu - 1.5f * (ux[idx] * ux[idx] + uy[idx] * uy[idx]));
            fout[q * WIDTH * HEIGHT + idx] = f[q] + omega * (feq - f[q]);
        }
    }
    
    // --- Visualization: map (ux, uy, rho) to one RGBA pixel per cell ---
    // Obstacle cells are drawn gray. Fluid cells get an intensity from speed
    // plus density deviation, squashed with tanhf so strong disturbances
    // saturate smoothly instead of clipping.
    __global__ void visual_kernel(uchar4 *out, const float *ux, const float *uy, const float *rho, const int *bound) {
        int x = blockIdx.x * blockDim.x + threadIdx.x, y = blockIdx.y * blockDim.y + threadIdx.y, idx = y * WIDTH + x;
        if (x >= WIDTH || y >= HEIGHT) return;
        if (bound[idx]) {
            out[idx] = make_uchar4(100, 100, 100, 255); // obstacle: gray
            return;
        }
        float i = tanhf(sqrtf(ux[idx]*ux[idx] + uy[idx]*uy[idx]) * 30.f + fabsf(rho[idx] - 1.f) * 6.f);
        out[idx] = make_uchar4(i * 255, i * 130, 210 * (1 - i) + 40, 255);  // deep blue at rest, warm when excited
    }
    
    // Reinitialize the distribution field d_f1 to uniform rest equilibrium
    // (density 1, zero velocity): f_q = w_q at every cell.
    void reset() {
        std::vector<float> h_f(Q * WIDTH * HEIGHT);
        for (int i = 0; i < WIDTH * HEIGHT; i++) for (int q = 0; q < Q; q++) h_f[q * WIDTH * HEIGHT + i] = h_w[q];
        cudaMemcpy(d_f1, h_f.data(), Q * WIDTH * HEIGHT * sizeof(float), cudaMemcpyHostToDevice);
    }
    
    // Per-frame callback: inject the current drag segment (if any), advance
    // the LBM one step, render to d_img, upload to the GL texture, and draw
    // a full-screen quad.
    // NOTE(review): the image is staged through host memory every frame
    // (cudaMemcpy + glTexSubImage2D); CUDA-GL interop via a mapped PBO would
    // avoid this round trip and match the "GPU direct" claim in the write-up.
    void disp() {
        dim3 b(16, 16), g((WIDTH + 15) / 16, (HEIGHT + 15) / 16);
        if (is_dragging && cur_mx != -1) {
            pulse_line_kernel<<<g, b>>>(d_f1, prev_mx, prev_my, cur_mx, cur_my, 0.08f);
            prev_mx = cur_mx; prev_my = cur_my;  // next segment starts where this one ended
        }
        lbm_kernel<<<g, b>>>(d_f1, d_f2, d_rho, d_ux, d_uy, d_boundary);
        std::swap(d_f1, d_f2);  // ping-pong: d_f2 becomes the current field
        visual_kernel<<<g, b>>>(d_img, d_ux, d_uy, d_rho, d_boundary);
        cudaMemcpy(h_output_buffer.data(), d_img, WIDTH * HEIGHT * sizeof(uchar4), cudaMemcpyDeviceToHost);
        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, WIDTH, HEIGHT, GL_RGBA, GL_UNSIGNED_BYTE, h_output_buffer.data());
        glClear(GL_COLOR_BUFFER_BIT);
        glBegin(GL_QUADS); glTexCoord2f(0,0); glVertex2f(-1,-1); glTexCoord2f(1,0); glVertex2f(1,-1);
        glTexCoord2f(1,1); glVertex2f(1,1); glTexCoord2f(0,1); glVertex2f(-1,1); glEnd();
        glutSwapBuffers(); glutPostRedisplay();
    }
    
    // Mouse button callback: on left-press, start a drag and fire an initial
    // point pulse (segment with zero length); on release, stop the drag.
    // Window coordinates are scaled to grid coordinates and y is flipped
    // (GLUT's origin is top-left, the grid's is bottom-left).
    void mouse(int b, int s, int x, int y) {
        if (b == GLUT_LEFT_BUTTON) {
            if (s == GLUT_DOWN) {
                is_dragging = true;
                cur_mx = prev_mx = x * WIDTH / win_w;
                cur_my = prev_my = (win_h - y) * HEIGHT / win_h;
                pulse_line_kernel<<<dim3((WIDTH+15)/16,(HEIGHT+15)/16), dim3(16,16)>>>(d_f1, cur_mx, cur_my, cur_mx, cur_my, 0.2f);
            } else is_dragging = false;
        }
    }
    
    // Drag-motion callback: record the current grid position; disp() injects
    // the segment from the previous position on the next frame.
    void motion(int x, int y) {
        if (is_dragging) { cur_mx = x * WIDTH / win_w; cur_my = (win_h - y) * HEIGHT / win_h; }
    }
    
    // Point-in-star test (host side, used once to build the obstacle mask).
    // Returns true when (x, y) lies inside a 5-pointed star centered at
    // (cx, cy). The polar angle is folded into 5 sectors per revolution;
    // within a sector the admissible radius shrinks linearly away from each
    // tip. Note the "* 4.0f" makes the spikes much sharper than a plain
    // r -> inner_r interpolation and lets the boundary radius go negative
    // between tips, leaving those angular sectors fully open.
    bool is_star(float x, float y, float cx, float cy, float r, float inner_r) {
        const float rel_x = x - cx;
        const float rel_y = y - cy;
        const float radial = sqrtf(rel_x * rel_x + rel_y * rel_y);
        const float theta = atan2f(rel_y, rel_x);
        // Five periods per full turn, phase-shifted so a tip points along -y/+y axis.
        const float sector = (theta + M_PI/2.0f) * 5.0f / (2.0f * M_PI);
        const float frac = sector - floorf(sector);
        // Triangle wave: 0 at a tip, 0.5 midway between two tips.
        float tooth;
        if (frac < 0.5f) tooth = frac; else tooth = 1.0f - frac;
        const float boundary = r * (1.0f - tooth * (r - inner_r) / r * 4.0f);
        return radial < boundary;
    }
    
    int main(int argc, char **argv) {
        // GLUT/GLEW setup: double-buffered RGBA window matching the grid size.
        glutInit(&argc, argv); glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
        glutInitWindowSize(WIDTH, HEIGHT); glutCreateWindow("LBM: Star Obstacle");
        glewInit();
        // Device allocations. NOTE(review): the literal 4 stands in for
        // sizeof(float) / sizeof(int) / sizeof(uchar4); sizeof(...) would be
        // safer, and the cudaMalloc return codes are not checked.
        cudaMalloc(&d_f1, Q*WIDTH*HEIGHT*4); cudaMalloc(&d_f2, Q*WIDTH*HEIGHT*4);
        cudaMalloc(&d_ux, WIDTH*HEIGHT*4); cudaMalloc(&d_uy, WIDTH*HEIGHT*4);
        cudaMalloc(&d_rho, WIDTH*HEIGHT*4); cudaMalloc(&d_boundary, WIDTH*HEIGHT*4);
        cudaMalloc(&d_img, WIDTH*HEIGHT*4);
    
        // Build the obstacle mask on the host: outer walls plus the star.
        std::vector<int> h_b(WIDTH * HEIGHT, 0);
        for (int i = 0; i < WIDTH * HEIGHT; i++) {
            int x = i % WIDTH, y = i / WIDTH;
            // outer wall: one-cell border around the whole domain
            if (x == 0 || x == WIDTH - 1 || y == 0 || y == HEIGHT - 1) h_b[i] = 1;
            // star obstacle (centered at 256,256; outer radius 65, inner 25)
            if (is_star((float)x, (float)y, 256.0f, 256.0f, 65.0f, 25.0f)) h_b[i] = 1;
        }
        cudaMemcpy(d_boundary, h_b.data(), WIDTH * HEIGHT * 4, cudaMemcpyHostToDevice);
        reset();
    
        // GL texture that receives the rendered frame every display pass.
        glGenTextures(1, &tex); glBindTexture(GL_TEXTURE_2D, tex);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, WIDTH, HEIGHT, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0);
        glEnable(GL_TEXTURE_2D);
    
        // Callbacks: reshape keeps the grid-to-window mapping current;
        // 'r' / 'R' / space resets the field to rest equilibrium.
        glutDisplayFunc(disp); glutReshapeFunc([](int w, int h) { glViewport(0,0,w,h); win_w=w; win_h=h; });
        glutMouseFunc(mouse); glutMotionFunc(motion);
        glutKeyboardFunc([](unsigned char k, int x, int y) { if (k == 'r' || k == ' ' || k == 'R') reset(); });
        glutMainLoop(); return 0;
    }
    

    This project is a real-time fluid dynamics simulator that implements the Lattice Boltzmann Method (LBM) by leveraging the parallel processing power of the GPU. By tightly integrating CUDA for physical computations and OpenGL for rendering, it achieves exceptionally smooth visualization of fluid behavior on a high-resolution grid.

    Technical Highlights

    • Star-Shaped Obstacle: A gray, star-shaped obstacle is positioned at the center of the domain. Utilizing a mathematical Signed Distance Function (SDF) approach based on polar coordinates, the simulator accurately handles wave diffraction and interference against complex geometries.

    • Linearly Interpolated Drag Input: To ensure seamless wave generation regardless of mouse movement speed, a custom kernel interpolates between previous and current mouse coordinates, injecting energy along the continuous path.

    • GPU-Side Visualization: Physical quantities are converted directly into pixel colors within GPU memory. In the current implementation the finished image buffer is staged through a host buffer once per frame before being uploaded as an OpenGL texture; switching to CUDA-OpenGL interop (a mapped pixel buffer object) would remove even that copy and make the pipeline truly zero-copy.

    How It Works

    1. Collision & Streaming: The particle distribution functions at each grid point move to neighboring nodes and collide to approach a local equilibrium state.

    2. Boundary Conditions: Particles hitting the star-shaped wall undergo a “bounce-back” process, creating intricate wave patterns around the object.

    3. Visualization: Changes in local velocity and density are mapped to a vibrant color palette using a tanh activation function to prevent color saturation.

CUDA C サンプルプログラム

Gemini先生の教えメモ

このプログラムは、2つの配列(ベクトル)の各要素をGPUの数千ものスレッドで同時に足し合わせるもの。

 

// Kernel executed on the GPU: element-wise C = A + B.
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) {
    int i = blockDim.x * blockIdx.x + threadIdx.x; // global thread index

    if (i < numElements) {  // guard: the grid may overshoot the array length
        C[i] = A[i] + B[i];
    }
}

 

1. __global__ 修飾子

これは**「カーネル (Kernel)」**であることを宣言するキーワードです。

  • 意味: 「CPUから呼び出され、GPU上で実行される関数」であることを示します。

  • 役割: コンパイラ(nvcc)はこのキーワードを見て、「この関数はx86(CPU)用ではなく、NVIDIAのアーキテクチャ(PTXコード)用にビルドせよ」と判断します。

  • 制約: 戻り値は必ず void である必要があります。


2. カーネルの引数 (const float *A など)

引数はすべて「GPUのメモリ(ビデオメモリ/VRAM)」を指すポインタです。

  • CPU側のメモリを直接渡すことはできず、事前に cudaMalloc で確保し、cudaMemcpy でデータを転送しておいたアドレスを渡します。

  • numElements は、配列のサイズを伝えて、範囲外アクセスを防ぐために必要です。


3. スレッドIDの計算(最重要ポイント)

C++

int i = blockDim.x * blockIdx.x + threadIdx.x;

ここが並列計算の魔法の種明かしです。GPUは、この関数を数万個のスレッドで同時に実行します。各スレッドは「自分は誰か?」を知るために、組み込み変数(構造体)を参照します。

変数名 意味 例(今回のケース)
blockIdx.x 現在のスレッドが所属しているブロックの番号 0 〜 195
blockDim.x 1つのブロックの中に含まれるスレッドの数 256
threadIdx.x ブロック内でのスレッドの通し番号 0 〜 255

4. if (i < numElements) によるガード

なぜこれが必要なのでしょうか?

GPUは「ブロック(今回なら256単位)」でスレッドを起動します。

  • 計算したい要素数:50,000

  • 起動したスレッド数:196ブロック × 256 = 50,176スレッド

  • 余り:176スレッド

もしこの if 文がないと、50,001番目以降の存在しないメモリ領域にアクセスしようとして、プログラムがクラッシュ(セグメンテーション違反)してしまいます。これを防ぐために「自分の番号 i が配列サイズ内にあるときだけ仕事をする」という条件分岐を入れています。


5. 計算処理:C[i] = A[i] + B[i];

ここまで来て、ようやく実際の計算です。

  • CPUの場合: for(int i=0; i<N; i++) と書いて、1つずつ順番に足していきます。

  • GPUの場合: 50,000人の作業員(スレッド)が、一斉に自分の担当番号 i の箱を開け、中身を足して、結果を C[i] に書き込みます。

論理的には**「一瞬」**で全要素の足し算が終わります(実際にはメモリ帯域などの物理的制約がありますが、CPUのループより遥かに高速です)。


まとめ:GTX 1060の中での動き

あなたがプログラムを実行したとき、GTX 1060の中では以下のことが起きています。

  1. GPU内部のスケジューラが、196個のブロックを各コア(SM)に割り当てる。

  2. 各コアで256個ずつのスレッドが爆速で立ち上がる。

  3. 各スレッドが計算式を使って「俺は3456番目の担当だ!」と計算する。

  4. 一斉にメモリを読み書きする。

この「スレッドIDの計算」さえマスターすれば、どんなに複雑な並列計算も作れるようになります。

 

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Kernel executed on the GPU: element-wise C = A + B.
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) {
    int i = blockDim.x * blockIdx.x + threadIdx.x; // global thread index

    if (i < numElements) {  // guard: the grid may overshoot the array length
        C[i] = A[i] + B[i];
    }
}

// Abort with a readable message on any CUDA API / launch failure; unchecked
// CUDA errors otherwise surface much later and are hard to trace back.
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host-side driver: allocates host/device buffers, copies inputs to the GPU,
// launches vectorAdd, copies the result back, prints a sample, frees all.
int main() {
    int numElements = 50000;
    size_t size = numElements * sizeof(float);

    // Host (CPU) allocations
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    if (!h_A || !h_B || !h_C) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    // Input initialization: every element of C should become 3.0
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
    }

    // Device (GPU) allocations
    float *d_A, *d_B, *d_C;
    checkCuda(cudaMalloc((void **)&d_A, size), "cudaMalloc d_A");
    checkCuda(cudaMalloc((void **)&d_B, size), "cudaMalloc d_B");
    checkCuda(cudaMalloc((void **)&d_C, size), "cudaMalloc d_C");

    // Copy inputs host -> device
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B");

    // Launch configuration: 256 threads per block, ceil-div for the grid
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;

    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    checkCuda(cudaGetLastError(), "kernel launch");  // catches bad launch configs

    // Copy result device -> host (a blocking memcpy also waits for the kernel)
    checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "copy C");

    // Print the first few results as a sanity check
    for (int i = 0; i < 10; ++i) {
        printf("Element %d: %.1f + %.1f = %.1f\n", i, h_A[i], h_B[i], h_C[i]);
    }

    // Cleanup
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B); free(h_C);

    return 0;
}

The process of brute force calculation by Coulomb force with multiprocessing and numpy. TEST3

Each result returned by the distributed workers is no longer expanded into a full PN×PN matrix; instead it is accumulated in place into the corresponding sub-region of the result array, which makes the processing much lighter.

import random
import math
import time
import gc
import numpy as np

from multiprocessing import Pool

np.set_printoptions(threshold=np.inf)
random.seed(1)  # fixed seed so runs are reproducible

# Axis indices into the (PN, 3) coordinate arrays.
XX = 0
YY = 1
ZZ = 2

# Number of worker processes for the multiprocessing pool.
core = 6

#total particle num
PN = 10000
#---------------------------------------------------
#
# parameter : divide_N
#
# The number of particles per side of a small square 
# divided for the splitting process when creating a 
# large PN^2 region with all particles PN.
#
# The smaller the setting, the more it will be divided
# into many smaller jobs.
#---------------------------------------------------
divide_N = 1000
xyzV = [[0 for i in range(3)] for j in range(PN)]  # velocities (unused in this test)
xyzP = np.zeros((PN,3))  # particle positions
xyzF = np.zeros((PN,3))  # accumulated forces


def wrapper(args):
  # Pool.map delivers one work item as a single list; expand it into the
  # positional arguments that calc_XY_start_end expects.
  (Xn, Yn, quotient, remainder, divide_N, extra) = args
  return calc_XY_start_end(Xn, Yn, quotient, remainder, divide_N, extra)

def find_pair():
  """Distribute the O(PN^2) pairwise force evaluation over a process pool.

  The PN x PN interaction matrix is tiled into divide_N-sized blocks; only
  the lower triangle (Yn <= Xn) is computed, and each returned tile is added
  directly into its sub-region of sum_f (the TEST3 change: tiles are no
  longer padded out to full PN x PN before accumulation). The mirrored
  upper-triangle contribution is the negated transpose of the tile.
  NOTE(review): sum_f is a dense (PN, PN, 3) float64 array -- with
  PN = 10000 that is on the order of terabytes and cannot fit in memory;
  presumably PN is meant to be reduced for real runs (earlier tests use
  1200 or fewer) -- verify.
  """
  global core
  global PN
  global xyzF
  global xyzP

  worklist = []
  thread   = 0

  remainder = PN %  divide_N
  quotient  = PN // divide_N

  if remainder != 0:                 #ex PN=11, divide_N=4, remainder=3, quotient=2
      quotient = quotient + 1        #   2 -> 3
  extra = divide_N - remainder       #   1

  X_st = 0
  X_ed = 0
  Y_st = 0
  Y_ed = 0

  Thead_num = 0
  sum_f = np.zeros((PN,PN,3))

  # Enumerate lower-triangle tiles only; each list entry is one pool job.
  for Xn in range(0,quotient):
    for Yn in range(0,Xn+1):

      worklist.append([Xn, Yn, quotient, remainder, divide_N, extra])
      Thead_num = Thead_num + 1
      #----------------------------------------
      #local_force = calc_XY_start_end(Xn, Yn, quotient, remainder, divide_N, extra)
      #----------------------------------------


  #start thread. results is callback in array.
  p = Pool(core)
  callback = p.map(wrapper, worklist)
  p.close()


  t1 = 0
  for Th in range(0,Thead_num):

    # Unpack one tile result: tile indices, index ranges, and the force tile.
    xn_l = callback[Th][0]
    yn_l = callback[Th][1]
    xxst = callback[Th][2]
    xxed = callback[Th][3]
    yyst = callback[Th][4]
    yyed = callback[Th][5]
    local_force = callback[Th][6]

    time_sta = time.time()

    sb = local_force.shape
    # Action-reaction: the mirrored tile is the negated transpose.
    ten   = -1 * local_force.transpose(1,0,2)
    tensb = ten.shape

    # Accumulate the tile straight into its block of sum_f (no padding).
    sum_f[yyst:yyst+sb[0], xxst:xxst+sb[1]] += local_force

    if xn_l != yn_l:
      sum_f[xxst:xxst+tensb[0], yyst:yyst+tensb[1]] += ten

    time_end = time.time()
    t1 =t1 + (time_end  - time_sta)


  print ("=====t1=",t1)
  print(np.sum(sum_f))
  #mmm = sum_f.sum(axis=1) 
  #xyzF = xyzF + mmm 


def calc_XY_start_end(Xn, Yn, quotient, remainder, divide_N, extra):
  """Compute the unit direction vectors for one (Xn, Yn) tile of pairs.

  Returns (Xn, Yn, X_st, X_ed, Y_st, Y_ed, tile) where tile[i, j] is the
  normalized displacement xyzP[x] - xyzP[y] over the tile's index ranges.
  When PN is not a multiple of divide_N the last tile is shifted back so it
  overlaps the previous one, and the duplicated leading rows/columns are
  zeroed out ("extra clear") so nothing is double counted.
  NOTE(review): tile entries are unit vectors, not 1/r^2 Coulomb magnitudes
  (TEST1 used dx/r^3); confirm which is intended.
  NOTE(review): in np.nan_to_num(tmp_f, 0) the second positional argument
  is `copy`, not the fill value -- NaNs still become 0.0 by default, but
  the call operates in place (copy=False).
  """
  global PN
  global xyzP

  # Precedence: (Xn != quotient-1 and remainder != 0) or remainder == 0.
  if Xn != quotient-1 and remainder != 0 or remainder == 0:
    X_st = Xn*divide_N
    X_ed = (Xn+1)*divide_N-1
  else:
    X_st = Xn*divide_N - (divide_N - remainder)
    X_ed = (Xn+1)*divide_N  - (divide_N - remainder) -1
  if Yn != quotient-1 and remainder != 0 or remainder == 0:
    Y_st = Yn*divide_N
    Y_ed = (Yn+1)*divide_N-1
  else:
    Y_st = Yn*divide_N - (divide_N - remainder)
    Y_ed = (Yn+1)*divide_N  -(divide_N - remainder) -1

  Xa = np.arange(X_st,X_ed+1)
  Ya = np.arange(Y_st,Y_ed+1)
  mx1,my1 = np.meshgrid(Xa,Ya)
  local_xyz  = xyzP[mx1]-xyzP[my1]
  distance_a = np.linalg.norm(local_xyz, axis=2)

  # Normalize; the diagonal (self-pair) yields 0/0 -> NaN, zeroed below.
  tmp_d = distance_a[:,:,np.newaxis]
  tmp_f = local_xyz/tmp_d
  tmp_f = np.nan_to_num(tmp_f,0)

  #extra clear
  if remainder != 0:
    if Xn + 1 == quotient:
      for dd in range(0,extra):
        tmp_f[:,dd] = 0
    if Yn + 1 == quotient:
      for dd in range(0,extra):
        tmp_f[[dd]] = 0

  return Xn,Yn,X_st,X_ed,Y_st,Y_ed,tmp_f



def init_lattice():
  """Fill positions and forces with uniform random values in [-1, 1]."""
  global xyzF
  global xyzP

  for pnum in range(PN):
    # Draw order matters for reproducibility with the seeded RNG:
    # position x/y/z first, then force x/y/z, per particle.
    xyzP[pnum][XX] = random.uniform(-1,1)
    xyzP[pnum][YY] = random.uniform(-1,1)
    xyzP[pnum][ZZ] = random.uniform(-1,1)
    xyzF[pnum][XX] = random.uniform(-1,1)
    xyzF[pnum][YY] = random.uniform(-1,1)
    xyzF[pnum][ZZ] = random.uniform(-1,1)


def results_sum():
  """Diagnostic total of all force components.

  The summation loop is commented out in TEST3 (find_pair no longer writes
  xyzF), so this function is currently a no-op.
  """
  global xyzF
  pnum = 0
  total_F = 0
#  while pnum < PN:
#    total_F = total_F + xyzF[pnum][XX]
#    total_F = total_F + xyzF[pnum][YY]
#    total_F = total_F + xyzF[pnum][ZZ]
#    pnum += 1
#
#  print (total_F)


if __name__ == "__main__":

  init_lattice()   # seed particle data
  find_pair()      # distributed O(PN^2) force evaluation
  results_sum()    # currently a no-op in TEST3

The process of brute force calculation by Coulomb force with multiprocessing and numpy. TEST2

The split process reduced the matrix size as much as possible, but it still eats up too much memory.
Multiprocessing consumes a lot of memory.

import random
import math
import time
import gc
import numpy as np

from multiprocessing import Pool

np.set_printoptions(threshold=np.inf)
random.seed(1)  # fixed seed so runs are reproducible

# Axis indices into the (PN, 3) coordinate arrays.
XX = 0
YY = 1
ZZ = 2

#total particle num
PN = 1200
#---------------------------------------------------
#
# parameter : divide_N
#
# The number of particles per side of a small square 
# divided for the splitting process when creating a 
# large PN^2 region with all particles PN.
#
# The smaller the setting, the more it will be divided
# into many smaller jobs.
#---------------------------------------------------
divide_N = 50
core = 2  # worker process count for the pool
xyzV = [[0 for i in range(3)] for j in range(PN)]  # velocities (unused in this test)
xyzP = np.zeros((PN,3))  # particle positions
xyzF = np.zeros((PN,3))  # accumulated forces

def wrapper(args):
  # Adapter for Pool.map: splat the packed job list into the worker's
  # positional parameters.
  return calc_XY_start_end(args[0], args[1], args[2], args[3], args[4], args[5])

def find_pair():
  """Distribute the O(PN^2) pairwise force evaluation over a process pool.

  TEST2 variant: each returned divide_N-sized tile is padded back up to the
  full (PN, PN, 3) shape with np.insert before being added to sum_f.
  NOTE(review): this padding is what makes TEST2 memory-hungry (see the
  note above the listing); TEST3 removes it by accumulating tiles in place.
  """
  global PN
  global xyzF
  global xyzP

  worklist = []
  thread   = 0

  remainder = PN %  divide_N
  quotient  = PN // divide_N

  if remainder != 0:                 #ex PN=11, divide_N=4, remainder=3, quotient=2
      quotient = quotient + 1        #   2 -> 3
  extra = divide_N - remainder       #   1

  X_st = 0
  X_ed = 0
  Y_st = 0
  Y_ed = 0

  Thead_num = 0
  sum_f = np.zeros((PN,PN,3))

  # Enumerate lower-triangle tiles only; each list entry is one pool job.
  for Xn in range(0,quotient):
    for Yn in range(0,Xn+1):

      worklist.append([Xn, Yn, quotient, remainder, divide_N, extra])
      Thead_num = Thead_num + 1
      #----------------------------------------
      #local_force = calc_XY_start_end(Xn, Yn, quotient, remainder, divide_N, extra)
      #----------------------------------------

  p = Pool(core)

  #start thread. results is callback in array.
  callback = p.map(wrapper, worklist)
  p.close()


  for Th in range(0,Thead_num):

    # Unpack one tile result: tile indices, index ranges, and the force tile.
    xn_l = callback[Th][0]
    yn_l = callback[Th][1]
    xxst = callback[Th][2]
    xxed = callback[Th][3]
    yyst = callback[Th][4]
    yyed = callback[Th][5]
    local_force = callback[Th][6]

    # Pad the tile with zero rows/columns until it is a full PN x PN field.
    if xxst != 0:
      r2 = np.zeros((xxst,divide_N,3))
      local_force        = np.insert(local_force     ,0,r2,axis=1)
    if xxed+1 != PN:
      r2 = np.zeros((PN - xxed - 1,divide_N,3))
      local_force      = np.insert(local_force     ,xxed + 1,r2,axis=1)

    if yyst != 0:
      r2 = np.zeros((yyst,PN,3))
      local_force      = np.insert(local_force     ,0,r2,axis=0)
    if yyed+1 != PN:
      r2 = np.zeros((PN - yyed - 1,PN,3))
      local_force      = np.insert(local_force     ,yyed + 1,r2,axis=0)

    del r2

    # Off-diagonal tiles also contribute their negated transpose
    # (action-reaction); diagonal tiles are added once.
    if xn_l != yn_l:
      sum_f = sum_f + local_force
      sum_f = sum_f - local_force.transpose(1,0,2)
    else:
      sum_f = sum_f + local_force

    del local_force
    gc.collect()

  #print (sum_f)

  print("=====================================")
  # Collapse the pair matrix into a per-particle force and accumulate.
  mmm = sum_f.sum(axis=1)
  xyzF = xyzF + mmm


def calc_XY_start_end(Xn, Yn, quotient, remainder, divide_N, extra):
  """Compute the unit direction vectors for one (Xn, Yn) tile of pairs.

  Returns (Xn, Yn, X_st, X_ed, Y_st, Y_ed, tile). When PN is not a multiple
  of divide_N the last tile is shifted back to overlap the previous one, and
  the duplicated leading rows/columns are zeroed ("extra clear") so nothing
  is double counted.
  NOTE(review): in np.nan_to_num(tmp_f, 0) the second positional argument
  is `copy`, not the fill value -- NaNs still become 0.0 by default, but
  the call operates in place (copy=False).
  """
  global PN
  global xyzP

  # Precedence: (Xn != quotient-1 and remainder != 0) or remainder == 0.
  if Xn != quotient-1 and remainder != 0 or remainder == 0:
    X_st = Xn*divide_N
    X_ed = (Xn+1)*divide_N-1
  else:
    X_st = Xn*divide_N - (divide_N - remainder)
    X_ed = (Xn+1)*divide_N  - (divide_N - remainder) -1
  if Yn != quotient-1 and remainder != 0 or remainder == 0:
    Y_st = Yn*divide_N
    Y_ed = (Yn+1)*divide_N-1
  else:
    Y_st = Yn*divide_N - (divide_N - remainder)
    Y_ed = (Yn+1)*divide_N  -(divide_N - remainder) -1

  Xa = np.arange(X_st,X_ed+1)
  Ya = np.arange(Y_st,Y_ed+1)
  mx1,my1 = np.meshgrid(Xa,Ya)
  local_xyz  = xyzP[mx1]-xyzP[my1]
  distance_a = np.linalg.norm(local_xyz, axis=2)

  # Normalize; the diagonal (self-pair) yields 0/0 -> NaN, zeroed below.
  tmp_d = distance_a[:,:,np.newaxis]
  tmp_f = local_xyz/tmp_d
  tmp_f = np.nan_to_num(tmp_f,0)

  #extra clear
  if remainder != 0:
    if Xn + 1 == quotient:
      for dd in range(0,extra):
        tmp_f[:,dd] = 0
    if Yn + 1 == quotient:
      for dd in range(0,extra):
        tmp_f[[dd]] = 0

  return Xn,Yn,X_st,X_ed,Y_st,Y_ed,tmp_f



def init_lattice():
  """Fill positions and forces with uniform random values in [-1, 1]."""
  global xyzF
  global xyzP

  for pnum in range(PN):
    # Same draw order as before, to keep seeded runs reproducible:
    # position x/y/z first, then force x/y/z, per particle.
    xyzP[pnum][XX] = random.uniform(-1,1)
    xyzP[pnum][YY] = random.uniform(-1,1)
    xyzP[pnum][ZZ] = random.uniform(-1,1)
    xyzF[pnum][XX] = random.uniform(-1,1)
    xyzF[pnum][YY] = random.uniform(-1,1)
    xyzF[pnum][ZZ] = random.uniform(-1,1)


def results_sum():
  """Print the grand total of every force component (quick checksum)."""
  global xyzF
  total_F = 0
  for pnum in range(PN):
    # Accumulate in the same order as before (x, then y, then z) so the
    # floating-point total is identical.
    for axis in (XX, YY, ZZ):
      total_F = total_F + xyzF[pnum][axis]

  print (total_F)


if __name__ == "__main__":

  init_lattice()   # seed particle data
  find_pair()      # distributed O(PN^2) force evaluation
  results_sum()    # print checksum of accumulated forces

The process of brute force calculation by Coulomb force with multiprocessing and numpy. TEST1

The small distributed matrices also eat too much memory because they are only PNxPN in size.
Too slow.

###########################
# multiprocessing test ####
###########################
import random
import math
import time
import scipy.special as scm
from multiprocessing import Pool

random.seed(1)  # fixed seed so runs are reproducible

# Column indices into the 9-wide per-particle record:
# position, velocity, force (x/y/z each).
PX = 0;PY = 1;PZ = 2;
VX = 3;VY = 4;VZ = 5;
FX = 6;FY = 7;FZ = 8;

#number of particles in a line
line_num = 15

#total particle num
PN = line_num * line_num * line_num

#ready to 9 parameters for particle 
#(PX, PY, PZ, VX, VY, VZ, FX, FY, FZ)
xyz = [[0 for i in range(9)] for j in range(PN)]

#Number of combinations of coulomb force calculation
combinum = int(scm.comb(PN, 2))

#thread number(local thread num)
core = 4

def find_pair_sub(prep,pend,thread):
  """Worker: pairwise 1/r^2 forces for rows prep..pend-1 against all j > i.

  Returns a full PN x 3 local force array so the parent can sum results
  from every worker; each pair updates both i (+) and j (-) per
  action-reaction.
  NOTE(review): r == 0 (coincident particles) would raise ZeroDivisionError;
  with random positions this is effectively never hit -- verify if input
  can contain duplicates.
  """
  global xyz

  #local results array
  xyzF = [[0 for i in range(3)] for j in range(PN)]
  fx = 0; fy = 1; fz = 2

  for i in range(prep,pend):
    for j in range(i + 1, PN):
      dx = xyz[i][PX] - xyz[j][PX]
      dy = xyz[i][PY] - xyz[j][PY]
      dz = xyz[i][PZ] - xyz[j][PZ]
      r  = math.sqrt(dx*dx + dy*dy + dz*dz)

      # dx/r^3 = (dx/r) * 1/r^2: unit direction times inverse-square magnitude.
      xyzF[i][fx] = xyzF[i][fx] + dx/(r*r*r)
      xyzF[i][fy] = xyzF[i][fy] + dy/(r*r*r)
      xyzF[i][fz] = xyzF[i][fz] + dz/(r*r*r)
      xyzF[j][fx] = xyzF[j][fx] - dx/(r*r*r)
      xyzF[j][fy] = xyzF[j][fy] - dy/(r*r*r)
      xyzF[j][fz] = xyzF[j][fz] - dz/(r*r*r)

  return xyzF

def wrapper(args):
  # Unpack one Pool.map work item into find_pair_sub's arguments.
  start, end, tid = args
  return find_pair_sub(start, end, tid)


def find_pair():
  """Split the triangular pair loop across `core` workers and sum results.

  Row i contributes (PN - i - 1) pairs, so rows are grouped until each
  worker holds roughly combinum/core pairs; the final group is extended to
  cover any leftover rows. Each worker returns a full PN x 3 force array
  which is accumulated into the global xyz records.
  """
  global PN
  global combinum

  pw = combinum // core
  pl = combinum % core

  localt = 0
  thread = 0
  pre = 0
  #each thread work list
  worklist = []
  ppp = pw

  for i in range(PN) :
    if core == 1:
      worklist.append([pre,PN,thread])
      break

    # Running pair count; close a work chunk whenever the quota is reached.
    localt = localt + (PN - i - 1)
    if localt >= ppp:
      worklist.append([pre,i,thread])
      ppp += pw
      thread += 1
      pre = i

  # Extend the last chunk to the end if leftover rows remain.
  if i != pre:
    prep = worklist[thread-1][0]
    worklist[thread-1] = [prep,PN,thread-1]

  #make thread core num
  p = Pool(core)

  #start thread. results is callback in array.
  callback = p.map(wrapper, worklist)
  p.close()

  #summation each thread results
  for j in range(core):
    for i in range(PN):
      xyz[i][FX] += callback[j][i][0]
      xyz[i][FY] += callback[j][i][1]
      xyz[i][FZ] += callback[j][i][2]

def init_lattice():
  """Randomize particle positions and initial forces in [-1, 1]."""
  global xyz

  for pnum in range(PN):
    # Same draw order as before (P then F, x/y/z each) so the seeded RNG
    # produces identical data.
    xyz[pnum][PX] = random.uniform(-1,1)
    xyz[pnum][PY] = random.uniform(-1,1)
    xyz[pnum][PZ] = random.uniform(-1,1)
    xyz[pnum][FX] = random.uniform(-1,1)
    xyz[pnum][FY] = random.uniform(-1,1)
    xyz[pnum][FZ] = random.uniform(-1,1)

if __name__ == "__main__":
  init_lattice()   # seed particle data
  find_pair()      # multiprocess pairwise force evaluation

The process of brute force calculation by Coulomb force

The process of brute force calculation by Coulomb force is calculated as a matrix using numpy. In the calculation, each element is calculated in a small matrix and summed up at the end to enable parallel processing later.

import random
import math
import time
import numpy as np

np.set_printoptions(threshold=np.inf)
random.seed(1)  # fixed seed so runs are reproducible

# Axis indices into the (PN, 3) coordinate arrays.
XX = 0
YY = 1
ZZ = 2

#total particle num
PN = 100
#---------------------------------------------------
#
# parameter : divide_N
#
# The number of particles per side of a small square 
# divided for the splitting process when creating a 
# large PN^2 region with all particles PN.
#
# The smaller the setting, the more it will be divided
# into many smaller jobs.
#---------------------------------------------------
divide_N = 3

xyzV = [[0 for i in range(3)] for j in range(PN)]  # velocities (unused in this test)
xyzP = np.zeros((PN,3))  # particle positions
xyzF = np.zeros((PN,3))  # accumulated forces

def find_pair():
  """Serial tiled version: evaluate pairwise forces tile by tile.

  Only lower-triangle tiles (Yn <= Xn) are computed; off-diagonal tiles
  contribute their negated transpose as well (action-reaction). Each tile
  comes back already padded to the full (PN, PN, 3) shape, so tiles can be
  summed directly. Written this way to make the later multiprocessing
  split (TEST1/2/3) straightforward.
  """
  global PN
  global xyzF
  global xyzP

  remainder = PN %  divide_N
  quotient  = PN // divide_N

  if remainder != 0:                 #ex PN=11, divide_N=4, remainder=3, quotient=2
      quotient = quotient + 1        #   2 -> 3
  extra = divide_N - remainder       #   1

  X_st = 0
  X_ed = 0
  Y_st = 0
  Y_ed = 0

  sum_f = np.zeros((PN,PN,3))

  for Xn in range(0,quotient):
    for Yn in range(0,Xn+1):

      #----------------------------------------
      local_force = calc_XY_start_end(Xn, Yn, quotient, remainder, divide_N, extra)
      #----------------------------------------

      if Xn != Yn:
        sum_f = sum_f + local_force
        sum_f = sum_f - local_force.transpose(1,0,2)
      else:
        sum_f = sum_f + local_force

  print (sum_f)

  print("=====================================")
  # Collapse the pair matrix into a per-particle force and accumulate.
  mmm = sum_f.sum(axis=1)
  xyzF = xyzF + mmm


def calc_XY_start_end(Xn, Yn, quotient, remainder, divide_N, extra):
  """Compute one (Xn, Yn) tile of unit direction vectors, padded to PN x PN.

  The ragged last tile is shifted back to overlap the previous one and its
  duplicated rows/columns are zeroed ("extra clear"). The tile is then
  surrounded with zero rows/columns via np.insert so the caller can add it
  straight into a full (PN, PN, 3) accumulator.
  NOTE(review): in np.nan_to_num(tmp_f, 0) the second positional argument
  is `copy`, not the fill value -- NaNs still become 0.0 by default, but
  the call operates in place (copy=False).
  """
  global PN
  global xyzP

  # Precedence: (Xn != quotient-1 and remainder != 0) or remainder == 0.
  if Xn != quotient-1 and remainder != 0 or remainder == 0:
    X_st = Xn*divide_N
    X_ed = (Xn+1)*divide_N-1
  else:
    X_st = Xn*divide_N - (divide_N - remainder)
    X_ed = (Xn+1)*divide_N  - (divide_N - remainder) -1
  if Yn != quotient-1 and remainder != 0 or remainder == 0:
    Y_st = Yn*divide_N
    Y_ed = (Yn+1)*divide_N-1
  else:
    Y_st = Yn*divide_N - (divide_N - remainder)
    Y_ed = (Yn+1)*divide_N  -(divide_N - remainder) -1

  Xa = np.arange(X_st,X_ed+1)
  Ya = np.arange(Y_st,Y_ed+1)
  mx1,my1 = np.meshgrid(Xa,Ya)
  local_xyz  = xyzP[mx1]-xyzP[my1]
  distance_a = np.linalg.norm(local_xyz, axis=2)

  # Normalize; the diagonal (self-pair) yields 0/0 -> NaN, zeroed below.
  tmp_d = distance_a[:,:,np.newaxis]
  tmp_f = local_xyz/tmp_d
  tmp_f = np.nan_to_num(tmp_f,0)

  #extra clear
  if remainder != 0:
    if Xn + 1 == quotient:
      for dd in range(0,extra):
        tmp_f[:,dd] = 0
    if Yn + 1 == quotient:
      for dd in range(0,extra):
        tmp_f[[dd]] = 0

  # Pad the tile with zero rows/columns up to the full PN x PN field.
  if X_st != 0:
    r2 = np.zeros((X_st,divide_N,3))
    tmp_f        = np.insert(tmp_f     ,0,r2,axis=1)
  if X_ed+1 != PN:
    r2 = np.zeros((PN - X_ed - 1,divide_N,3))
    tmp_f      = np.insert(tmp_f     ,X_ed + 1,r2,axis=1)

  if Y_st != 0:
    r2 = np.zeros((Y_st,PN,3))
    tmp_f      = np.insert(tmp_f     ,0,r2,axis=0)
  if Y_ed+1 != PN:
    r2 = np.zeros((PN - Y_ed - 1,PN,3))
    tmp_f      = np.insert(tmp_f     ,Y_ed + 1,r2,axis=0)

  #print (tmp_f)
  #print("===================================================")
  return tmp_f



def init_lattice():
  """Fill positions and forces with uniform random values in [-1, 1]."""
  global xyzF
  global xyzP

  for pnum in range(PN):
    # Keep the original draw order (P then F, x/y/z) for seeded
    # reproducibility.
    xyzP[pnum][XX] = random.uniform(-1,1)
    xyzP[pnum][YY] = random.uniform(-1,1)
    xyzP[pnum][ZZ] = random.uniform(-1,1)
    xyzF[pnum][XX] = random.uniform(-1,1)
    xyzF[pnum][YY] = random.uniform(-1,1)
    xyzF[pnum][ZZ] = random.uniform(-1,1)


def results_sum():
  """Print the grand total of every force component (quick checksum)."""
  global xyzF
  total_F = 0
  for pnum in range(PN):
    # Same accumulation order as before (x, y, z per particle) so the
    # floating-point total matches exactly.
    for axis in (XX, YY, ZZ):
      total_F = total_F + xyzF[pnum][axis]

  print (total_F)


if __name__ == "__main__":

  init_lattice()   # seed particle data
  find_pair()      # tiled pairwise force evaluation
  results_sum()    # print checksum of accumulated forces

Calculating long-range particle interactions with numpy (handmade mesh grid)

Too slow

import random
import math
import time
import numpy as np

np.set_printoptions(threshold=np.inf)
random.seed(1)  # fixed seed so runs are reproducible

# Axis indices into the (PN, 3) coordinate arrays.
XX = 0
YY = 1
ZZ = 2

#total particle num
PN = 300

xyzV = [[0 for i in range(3)] for j in range(PN)]  # velocities (unused in this test)
xyzP = np.zeros((PN,3))  # particle positions
xyzF = np.zeros((PN,3))  # accumulated forces


def find_pair():
  """Hand-built mesh-grid variant: distances from unique pairs, forces dense.

  Unique pair indices are built one element at a time with np.append -- an
  O(PN^2) loop with reallocation on every append, which is why the author
  notes this version is "too slow". The pair distances are then scattered
  into a symmetric PN x PN matrix and used to normalize the dense
  displacement field.
  """
  global PN
  global xyzF
  global xyzP

  x1 = np.empty(0)
  y1 = np.empty(0)

  # Collect the index pairs (nn, mm-1) for the strict upper triangle.
  for mm in range(1,PN):
    for nn in range(mm,PN):
      x1 = np.append(x1,[nn],axis=0)
      y1 = np.append(y1,[mm-1],axis=0)

  x1 = x1.astype(np.int32)
  x1 = x1.reshape(x1.shape[0],1)
  y1 = y1.astype(np.int32)
  y1 = y1.reshape(y1.shape[0],1)

  dd = np.linalg.norm(xyzP[x1]-xyzP[y1], axis=2)

  # Scatter the flat pair distances back into a symmetric PN x PN matrix,
  # walking the upper triangle row by row.
  distances = np.zeros((PN,PN))
  ly = 0
  lx = ly + 1
  for kk in range(0,dd.shape[0]):
    distances[lx][ly] = dd[kk]
    distances[ly][lx] = dd[kk]
    lx = lx + 1
    if lx == PN:
      ly = ly + 1
      lx = ly + 1

  tmp_index = np.arange(xyzP.shape[0])
  xx, yy = np.meshgrid(tmp_index, tmp_index)

  dxyz = xyzP[xx] - xyzP[yy]

  # Normalize to unit vectors; the zero diagonal yields NaN, zeroed below.
  distances = distances[:,:,np.newaxis]
  tmp_force = dxyz/distances
  tmp_force = np.nan_to_num(tmp_force,0)

  mmm = tmp_force.sum(axis=1)
  xyzF = xyzF + mmm

def init_lattice():
  """Fill positions and forces with uniform random values in [-1, 1]."""
  global xyzF
  global xyzP

  for pnum in range(PN):
    # Keep the original draw order (P then F, x/y/z) for seeded
    # reproducibility.
    xyzP[pnum][XX] = random.uniform(-1,1)
    xyzP[pnum][YY] = random.uniform(-1,1)
    xyzP[pnum][ZZ] = random.uniform(-1,1)
    xyzF[pnum][XX] = random.uniform(-1,1)
    xyzF[pnum][YY] = random.uniform(-1,1)
    xyzF[pnum][ZZ] = random.uniform(-1,1)

def results_sum():
  """Print the grand total of every force component (quick checksum)."""
  total_F = 0
  for pnum in range(PN):
    # Same accumulation order as before (x, y, z per particle) so the
    # floating-point total matches exactly; xyzF is only read here.
    for axis in (XX, YY, ZZ):
      total_F = total_F + xyzF[pnum][axis]

  print (total_F)


if __name__ == "__main__":

  init_lattice()   # seed particle data
  find_pair()      # pairwise force evaluation (slow variant)
  results_sum()    # print checksum of accumulated forces

Calculating long-range particle interactions with numpy

import random
import math
import time
import numpy as np

#import scipy.misc as scm
from multiprocessing import Pool

np.set_printoptions(threshold=np.inf)
random.seed(1)  # fixed seed so runs are reproducible

# Axis indices into the (PN, 3) coordinate arrays.
XX = 0
YY = 1
ZZ = 2

#total particle num
PN = 1000

xyzV = [[0 for i in range(3)] for j in range(PN)]  # velocities (unused in this test)
xyzP = np.zeros((PN,3))  # particle positions
xyzF = np.zeros((PN,3))  # accumulated forces

def find_pair():
  """Fully vectorized variant: dense PN x PN pair field in one shot.

  Builds the complete displacement field with a meshgrid, normalizes it,
  and sums along axis 1 to get per-particle force accumulations.
  NOTE(review): xyzP[xx]-xyzP[yy] is materialized twice (once inside the
  norm, once for dxyz); reusing one array would halve peak memory.
  NOTE(review): entries are unit vectors only (no 1/r^2 factor) -- TEST1
  used dx/r^3; confirm which is intended.
  """
  global PN
  global xyzF
  global xyzP

  tmp_index = np.arange(xyzP.shape[0])

  xx, yy = np.meshgrid(tmp_index, tmp_index)
  distances = np.linalg.norm(xyzP[xx]-xyzP[yy], axis=2)
  dxyz = xyzP[xx] - xyzP[yy]

  # Normalize; the zero diagonal yields 0/0 -> NaN, zeroed below.
  distances = distances[:,:,np.newaxis]
  tmp_force = dxyz/distances
  tmp_force = np.nan_to_num(tmp_force,0)

  mmm = tmp_force.sum(axis=1)
  xyzF = xyzF + mmm

def init_lattice():
  """Fill positions and forces with uniform random values in [-1, 1]."""
  global xyzF
  global xyzP

  for pnum in range(PN):
    # Keep the original draw order (P then F, x/y/z) for seeded
    # reproducibility.
    xyzP[pnum][XX] = random.uniform(-1,1)
    xyzP[pnum][YY] = random.uniform(-1,1)
    xyzP[pnum][ZZ] = random.uniform(-1,1)
    xyzF[pnum][XX] = random.uniform(-1,1)
    xyzF[pnum][YY] = random.uniform(-1,1)
    xyzF[pnum][ZZ] = random.uniform(-1,1)


def results_sum():
  """Print the grand total of every force component (quick checksum)."""
  total_F = 0
  for pnum in range(PN):
    # Same accumulation order as before (x, y, z per particle) so the
    # floating-point total matches exactly; xyzF is only read here.
    for axis in (XX, YY, ZZ):
      total_F = total_F + xyzF[pnum][axis]

  print (total_F)


if __name__ == "__main__":

  init_lattice()   # seed particle data
  find_pair()      # fully vectorized pairwise evaluation
  results_sum()    # print checksum of accumulated forces

乱数を使って円周率もどきを求める2

先日試した乱数を使った円周率を求める方法を
numpyを使った方法に置き換えた。

import math
import time
import numpy as np


def calc_pi ( NN ):
  arr1 = np.random.rand(NN)
  arr2 = np.random.rand(NN)
  arr3 = np.sqrt(arr1 * arr1 + arr2 * arr2)
  arr3[arr3 > 1] = 0   #1より大きい要素は0に
  arr3[arr3 > 0] = 1   #0より大きい要素は1に
  in_n = np.sum(arr3)  #円内に入ったやつの合計

  return ( in_n / NN ) * 4


if __name__ == "__main__":

  # Sweep trial counts 10^1 .. 10^8, timing each estimate and printing
  # the result, its absolute error vs math.pi, and the elapsed seconds.
  for j in range (1,9):
    NN = 10 ** j
    start_time = time.perf_counter()
    pi = calc_pi(NN)
    execution_time = time.perf_counter() - start_time
    delt = math.fabs(math.pi - pi)
    print (str(NN).rjust(10),"  ",f'{pi:11.010f}',"  ",f'{delt:7.06f}',"  ",execution_time)

実行結果

   試行回数        結果      誤差     処理時間[sec]
        10    2.8000000000    0.341593    8.769400028540986e-05
       100    3.1200000000    0.021593    4.6955000016168924e-05
      1000    3.1960000000    0.054407    5.6955000218295027e-05
     10000    3.1420000000    0.000407    0.0003660469997157634
    100000    3.1376400000    0.003953    0.0036325510000096983
   1000000    3.1415240000    0.000069    0.03182136600025842
  10000000    3.1416492000    0.000057    0.2625446400002147
 100000000    3.1416174400    0.000025    2.535640128000068

numpyを使わずにwhileで計算していた前の処理に比べて15倍くらい速い結果。全て配列に収めてから処理するとさすがに速い。ただしメモリもバカ喰い。