Pi
Pi
计算 $\pi$
前情提要:
我们已经在 gmp 中展示了计算 $\pi$ 的方法, 接下来我们进一步对程序进行加速
Exploit CPU
\[\pi = \sum_{k=0}^{\infty}\left[\frac{1}{16^k}(\frac{4}{8k+1}-\frac{2}{8k+4}-\frac{1}{8k+5}-\frac{1}{8k+6})\right]\]根据上面的公式, 最简单的想法就是使用多线程, 每个线程计算各自的分量, 最后合并到一起。伪代码如下, 详细源代码见源代码清单
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
#pragma omp parallel // spawn threads
{
local_ans = 0; // private
#pragma omp for // distribute iterations
for (k = 0; k < PRECISION; ++k) {
tmp = pow(16, PRECISION - k);
a = tmp * 4 / (8 * k + 1)
b = tmp * 2 / (8 * k + 4)
c = tmp / (8 * k + 5)
d = tmp / (8 * k + 6)
local_ans += a - b - c - d;
}
#pragma omp critical
ans += local_ans;
}
使用 24 个核, 2.4GHz 的 Intel CPU 在 linux 上的运行结果如下。
precision | real time | user time | system time |
---|---|---|---|
100008 | 0m0.601s | 0m7.524s | 0m0.043s |
200008 | 0m2.384s | 0m28.138s | 0m0.049s |
400008 | 0m9.263s | 1m50.827s | 0m0.080s |
800008 | 0m36.470s | 7m22.519s | 0m0.333s |
Exploit GPU
难点在于 GPU 不支持可变精度的整型运算
,,,,,coming soon,,,,,
公式一览
\[\int_{0}^{1}\frac{1}{1+x^2} = arctan(x)\bigg|_0^1 = \frac{\pi}{4}\]参考
- Stephan O’Brien’s tutorial about parallel-cpp
- Computing Digits of π with CUDA
- David H. Bailey, Peter B. Borwein and Simon Plouffe. “On the Rapid Computation of Various Polylogarithmic Constants”
- David H. Bailey, Jonathan M. Borwein, Peter B. Borwein and Simon Plouffe. “The Quest for Pi”
源代码清单
pi_speedup1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <gmp.h>
#include <omp.h>
#define PREC 100008
int main() {
mpz_t ans, base;
unsigned long int k;
char *res_str;
mpz_init(ans);
mpz_init(base);
mpz_init_set_ui(base, 16);
// spawn threads
#pragma omp parallel
{
mpz_t a, b, c, d, denom;
mpz_t v, tmp;
mpz_t local_ans;
mpz_init(a);
mpz_init(b);
mpz_init(c);
mpz_init(d);
mpz_init(denom);
mpz_init(tmp);
mpz_init(local_ans);
mpz_set_ui(local_ans, 0);
// distribute iterations
#pragma omp for
for (k = 0; k < PREC; ++k) {
mpz_init_set_ui(v, PREC - k);
mpz_pow_ui(tmp, base, mpz_get_ui(v));
// Calculate a = tmp * 4 // (8 * k + 1)
mpz_mul_ui(a, tmp, 4);
mpz_set_ui(denom, 8 * k + 1);
mpz_fdiv_q(a, a, denom); // Floor division
// Calculate b = tmp * 2 // (8 * k + 4)
mpz_mul_ui(b, tmp, 2);
mpz_set_ui(denom, 8 * k + 4);
mpz_fdiv_q(b, b, denom);
// Calculate c = tmp // (8 * k + 5)
mpz_set_ui(denom, 8 * k + 5);
mpz_fdiv_q(c, tmp, denom);
// Calculate d = tmp // (8 * k + 6)
mpz_set_ui(denom, 8 * k + 6);
mpz_fdiv_q(d, tmp, denom);
// Update local_ans: local_ans += a - b - c - d
mpz_sub(a, a, b);
mpz_sub(a, a, c);
mpz_sub(a, a, d);
mpz_add(local_ans, local_ans, a);
}
#pragma omp critical
mpz_add(ans, ans, local_ans);
mpz_clear(local_ans);
mpz_clear(a);
mpz_clear(b);
mpz_clear(c);
mpz_clear(d);
mpz_clear(denom);
mpz_clear(tmp);
}
res_str = mpz_get_str(NULL, 16, ans);
printf("%.*s\n", (int)strlen(res_str) - 8, res_str);
free(res_str);
mpz_clear(ans);
mpz_clear(base);
return 0;
}
This post is licensed under CC BY 4.0 by the author.