Post

Pi

Pi

计算 $\pi$

前情提要:

我们已经在 gmp 中展示了计算 $\pi$ 的方法, 接下来我们进一步对程序进行加速

Exploit CPU

\[\pi = \sum_{k=0}^{\infty}\left[\frac{1}{16^k}(\frac{4}{8k+1}-\frac{2}{8k+4}-\frac{1}{8k+5}-\frac{1}{8k+6})\right]\]

根据上面的公式, 最简单的想法就是使用多线程, 每个线程计算各自的分量, 最后合并到一起。伪代码如下, 详细源代码见源代码清单

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
#pragma omp parallel // spawn threads
{
    local_ans = 0; // private
    
    #pragma omp for // distribute iterations
    for (k = 0; k < PRECISION; ++k) {
        tmp = pow(16, PRECISION - k);
        a = tmp * 4 / (8 * k + 1)
        b = tmp * 2 / (8 * k + 4)
        c = tmp / (8 * k + 5)
        d = tmp / (8 * k + 6)
        local_ans += a - b - c - d;
    }

    #pragma omp critical
    ans += local_ans;
}

使用 24 个核, 2.4GHz 的 Intel CPU 在 linux 上的运行结果如下。

precisionreal timeuser timesystem time
1000080m0.601s0m7.524s0m0.043s
2000080m2.384s0m28.138s0m0.049s
4000080m9.263s1m50.827s0m0.080s
8000080m36.470s7m22.519s0m0.333s

Exploit GPU

难点在于 GPU 不支持可变精度的整型运算

,,,,,coming soon,,,,,

公式一览

\[\int_{0}^{1}\frac{1}{1+x^2} = arctan(x)\bigg|_0^1 = \frac{\pi}{4}\]

参考

源代码清单

pi_speedup1

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <gmp.h>
#include <omp.h>

#define PREC 100008

int main() {
  mpz_t ans, base;
  unsigned long int k;
  char *res_str;

  mpz_init(ans);
  mpz_init(base);
  mpz_init_set_ui(base, 16);

  // spawn threads
  #pragma omp parallel
  {
    mpz_t a, b, c, d, denom;
    mpz_t v, tmp;
    mpz_t local_ans;

    mpz_init(a);
    mpz_init(b);
    mpz_init(c);
    mpz_init(d);
    mpz_init(denom);
    mpz_init(tmp);
    mpz_init(local_ans);
    mpz_set_ui(local_ans, 0);

    // distribute iterations
    #pragma omp for
    for (k = 0; k < PREC; ++k) {
      mpz_init_set_ui(v, PREC - k);
      mpz_pow_ui(tmp, base, mpz_get_ui(v));

      // Calculate a = tmp * 4 // (8 * k + 1)
      mpz_mul_ui(a, tmp, 4);
      mpz_set_ui(denom, 8 * k + 1);
      mpz_fdiv_q(a, a, denom); // Floor division

      // Calculate b = tmp * 2 // (8 * k + 4)
      mpz_mul_ui(b, tmp, 2);
      mpz_set_ui(denom, 8 * k + 4);
      mpz_fdiv_q(b, b, denom);

      // Calculate c = tmp // (8 * k + 5)
      mpz_set_ui(denom, 8 * k + 5);
      mpz_fdiv_q(c, tmp, denom);

      // Calculate d = tmp // (8 * k + 6)
      mpz_set_ui(denom, 8 * k + 6);
      mpz_fdiv_q(d, tmp, denom);

      // Update local_ans: local_ans += a - b - c - d
      mpz_sub(a, a, b);
      mpz_sub(a, a, c);
      mpz_sub(a, a, d);
      mpz_add(local_ans, local_ans, a);
    }

    #pragma omp critical
    mpz_add(ans, ans, local_ans);

    mpz_clear(local_ans);
    mpz_clear(a);
    mpz_clear(b);
    mpz_clear(c);
    mpz_clear(d);
    mpz_clear(denom);
    mpz_clear(tmp);
  }

  res_str = mpz_get_str(NULL, 16, ans);
  printf("%.*s\n", (int)strlen(res_str) - 8, res_str);
  free(res_str);

  mpz_clear(ans);
  mpz_clear(base);

  return 0;
}
This post is licensed under CC BY 4.0 by the author.