/*
2  * Solve a distributed lasso problem, i.e.,
3  *
4  *   minimize  (1/2)||Ax - b||_2^2 + lambda*||x||_1.
5  *
6  * The implementation uses MPI for distributed communication
7  * and the GNU Scientific Library (GSL) for math.
8  */
9
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <math.h>
13 #include "mmio.h"
14 #include <mpi.h>
15 #include <gsl/gsl_vector.h>
16 #include <gsl/gsl_matrix.h>
17 #include <gsl/gsl_blas.h>
18 #include <gsl/gsl_linalg.h>
19
20 void soft_threshold(gsl_vector *v, double k);
21 double objective(gsl_matrix *A, gsl_vector *b, double lambda, gsl_vector *z);
22
23 int main(int argc, char **argv) {
24 	const int MAX_ITER  = 50;
25 	const double RELTOL = 1e-2;
26 	const double ABSTOL = 1e-4;
27
28 	/*
29 	 * Some bookkeeping variables for MPI. The 'rank' of a process is its numeric id
30 	 * in the process pool. For example, if we run a program via `mpirun -np 4 foo', then
31 	 * the process ranks are 0 through 3. Here, N and size are the total number of processes
32 	 * running (in this example, 4).
33 	 */
34
35 	int rank;
36 	int size;
37
38 	MPI_Init(&argc, &argv);               // Initialize the MPI execution environment
39 	MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Determine current running process
40 	MPI_Comm_size(MPI_COMM_WORLD, &size); // Total number of processes
41 	double N = (double) size;             // Number of subsystems/slaves for ADMM
42
43 	/* Read in local data */
44
45 	int skinny;           // A flag indicating whether the matrix A is fat or skinny
46 	FILE *f;
47 	int m, n;
48 	int row, col;
49 	double entry;
50
51 	/*
52 	 * Subsystem n will look for files called An.dat and bn.dat
53 	 * in the current directory; these are its local data and do not need to be
54 	 * visible to any other processes. Note that
55 	 * m and n here refer to the dimensions of the *local* coefficient matrix.
56 	 */
57
59 	char s[20];
60 	sprintf(s, "data/A%d.dat", rank + 1);
61 	printf("[%d] reading %s\n", rank, s);
62
63 	f = fopen(s, "r");
64 	if (f == NULL) {
65 		printf("[%d] ERROR: %s does not exist, exiting.\n", rank, s);
66 		exit(EXIT_FAILURE);
67 	}
69 	gsl_matrix *A = gsl_matrix_calloc(m, n);
70 	for (int i = 0; i < m*n; i++) {
71 		row = i % m;
72 		col = floor(i/m);
73 		fscanf(f, "%lf", &entry);
74 		gsl_matrix_set(A, row, col, entry);
75 	}
76 	fclose(f);
77
79 	sprintf(s, "data/b%d.dat", rank + 1);
80 	printf("[%d] reading %s\n", rank, s);
81
82 	f = fopen(s, "r");
83 	if (f == NULL) {
84 		printf("[%d] ERROR: %s does not exist, exiting.\n", rank, s);
85 		exit(EXIT_FAILURE);
86 	}
88 	gsl_vector *b = gsl_vector_calloc(m);
89 	for (int i = 0; i < m; i++) {
90 		fscanf(f, "%lf", &entry);
91 		gsl_vector_set(b, i, entry);
92 	}
93 	fclose(f);
94
95 	m = A->size1;
96 	n = A->size2;
97 	skinny = (m >= n);
98
99 	/*
100 	 * These are all variables related to ADMM itself. There are many
101 	 * more variables than in the Matlab implementation because we also
102 	 * require vectors and matrices to store various intermediate results.
103 	 * The naming scheme follows the Matlab version of this solver.
104 	 */
105
106 	double rho = 1.0;
107
108 	gsl_vector *x      = gsl_vector_calloc(n);
109 	gsl_vector *u      = gsl_vector_calloc(n);
110 	gsl_vector *z      = gsl_vector_calloc(n);
111 	gsl_vector *y      = gsl_vector_calloc(n);
112 	gsl_vector *r      = gsl_vector_calloc(n);
113 	gsl_vector *zprev  = gsl_vector_calloc(n);
114 	gsl_vector *zdiff  = gsl_vector_calloc(n);
115
116 	gsl_vector *q      = gsl_vector_calloc(n);
117 	gsl_vector *w      = gsl_vector_calloc(n);
118 	gsl_vector *Aq     = gsl_vector_calloc(m);
119 	gsl_vector *p      = gsl_vector_calloc(m);
120
121 	gsl_vector *Atb    = gsl_vector_calloc(n);
122
123 	double send[3]; // an array used to aggregate 3 scalars at once
124 	double recv[3]; // used to receive the results of these aggregations
125
126 	double nxstack  = 0;
127 	double nystack  = 0;
128 	double prires   = 0;
129 	double dualres  = 0;
130 	double eps_pri  = 0;
131 	double eps_dual = 0;
132
133 	/* Precompute and cache factorizations */
134
135 	gsl_blas_dgemv(CblasTrans, 1, A, b, 0, Atb); // Atb = A^T b
136
137 	/*
138 	 * The lasso regularization parameter here is just hardcoded
139 	 * to 0.5 for simplicity. Using the lambda_max heuristic would require
140 	 * network communication, since it requires looking at the *global* A^T b.
141 	 */
142
143 	double lambda = 0.5;
144 	if (rank == 0) {
145 		printf("using lambda: %.4f\n", lambda);
146 	}
147
148 	gsl_matrix *L;
149
150 	/* Use the matrix inversion lemma for efficiency; see section 4.2 of the paper */
151 	if (skinny) {
152 		/* L = chol(AtA + rho*I) */
153 		L = gsl_matrix_calloc(n,n);
154
155 		gsl_matrix *AtA = gsl_matrix_calloc(n,n);
156 		gsl_blas_dsyrk(CblasLower, CblasTrans, 1, A, 0, AtA);
157
158 		gsl_matrix *rhoI = gsl_matrix_calloc(n,n);
159 		gsl_matrix_set_identity(rhoI);
160 		gsl_matrix_scale(rhoI, rho);
161
162 		gsl_matrix_memcpy(L, AtA);
164 		gsl_linalg_cholesky_decomp(L);
165
166 		gsl_matrix_free(AtA);
167 		gsl_matrix_free(rhoI);
168 	} else {
169 		/* L = chol(I + 1/rho*AAt) */
170 		L = gsl_matrix_calloc(m,m);
171
172 		gsl_matrix *AAt = gsl_matrix_calloc(m,m);
173 		gsl_blas_dsyrk(CblasLower, CblasNoTrans, 1, A, 0, AAt);
174 		gsl_matrix_scale(AAt, 1/rho);
175
176 		gsl_matrix *eye = gsl_matrix_calloc(m,m);
177 		gsl_matrix_set_identity(eye);
178
179 		gsl_matrix_memcpy(L, AAt);
181 		gsl_linalg_cholesky_decomp(L);
182
183 		gsl_matrix_free(AAt);
184 		gsl_matrix_free(eye);
185 	}
186
187 	/* Main ADMM solver loop */
188
189 	int iter = 0;
190 	if (rank == 0) {
191 		printf("%3s %10s %10s %10s %10s %10s\n", "#", "r norm", "eps_pri", "s norm", "eps_dual", "objective");
192 	}
193
194 	while (iter < MAX_ITER) {
195
196 		/* u-update: u = u + x - z */
197 		gsl_vector_sub(x, z);
199
200 		/* x-update: x = (A^T A + rho I) \ (A^T b + rho z - y) */
201 		gsl_vector_memcpy(q, z);
202 		gsl_vector_sub(q, u);
203 		gsl_vector_scale(q, rho);
204 		gsl_vector_add(q, Atb);   // q = A^T b + rho*(z - u)
205
206 		if (skinny) {
207 			/* x = U \ (L \ q) */
208 			gsl_linalg_cholesky_solve(L, q, x);
209 		} else {
210 			/* x = q/rho - 1/rho^2 * A^T * (U \ (L \ (A*q))) */
211 			gsl_blas_dgemv(CblasNoTrans, 1, A, q, 0, Aq);
212 			gsl_linalg_cholesky_solve(L, Aq, p);
213 			gsl_blas_dgemv(CblasTrans, 1, A, p, 0, x); /* now x = A^T * (U \ (L \ (A*q)) */
214 			gsl_vector_scale(x, -1/(rho*rho));
215 			gsl_vector_scale(q, 1/rho);
217 		}
218
219 		/*
220 		 * Message-passing: compute the global sum over all processors of the
221 		 * contents of w and t. Also, update z.
222 		 */
223
224 		gsl_vector_memcpy(w, x);
225 		gsl_vector_add(w, u);   // w = x + u
226
227 		gsl_blas_ddot(r, r, &send[0]);
228 		gsl_blas_ddot(x, x, &send[1]);
229 		gsl_blas_ddot(u, u, &send[2]);
230 		send[2] /= pow(rho, 2);
231
232 		gsl_vector_memcpy(zprev, z);
233
234 		// could be reduced to a single Allreduce call by concatenating send to w
235 		MPI_Allreduce(w->data, z->data,  n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
236 		MPI_Allreduce(send,    recv,     3, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
237
238 		prires  = sqrt(recv[0]);  /* sqrt(sum ||r_i||_2^2) */
239 		nxstack = sqrt(recv[1]);  /* sqrt(sum ||x_i||_2^2) */
240 		nystack = sqrt(recv[2]);  /* sqrt(sum ||y_i||_2^2) */
241
242 		gsl_vector_scale(z, 1/N);
243 		soft_threshold(z, lambda/(N*rho));
244
245 		/* Termination checks */
246
247 		/* dual residual */
248 		gsl_vector_memcpy(zdiff, z);
249 		gsl_vector_sub(zdiff, zprev);
250 		dualres = sqrt(N) * rho * gsl_blas_dnrm2(zdiff); /* ||s^k||_2^2 = N rho^2 ||z - zprev||_2^2 */
251
252 		/* compute primal and dual feasibility tolerances */
253 		eps_pri  = sqrt(n*N)*ABSTOL + RELTOL * fmax(nxstack, sqrt(N)*gsl_blas_dnrm2(z));
254 		eps_dual = sqrt(n*N)*ABSTOL + RELTOL * nystack;
255
256 		if (rank == 0) {
257 			printf("%3d %10.4f %10.4f %10.4f %10.4f %10.4f\n", iter,
258 					prires, eps_pri, dualres, eps_dual, objective(A, b, lambda, z));
259 		}
260
261 		if (prires <= eps_pri && dualres <= eps_dual) {
262 			break;
263 		}
264
265 		/* Compute residual: r = x - z */
266 		gsl_vector_memcpy(r, x);
267 		gsl_vector_sub(r, z);
268
269 		iter++;
270 	}
271
272 	/* Have the master write out the results to disk */
273 	if (rank == 0) {
274 		f = fopen("data/solution.dat", "w");
275 		gsl_vector_fprintf(f, z, "%lf");
276 		fclose(f);
277 	}
278
279 	MPI_Finalize(); /* Shut down the MPI execution environment */
280
281 	/* Clear memory */
282 	gsl_matrix_free(A);
283 	gsl_matrix_free(L);
284 	gsl_vector_free(b);
285 	gsl_vector_free(x);
286 	gsl_vector_free(u);
287 	gsl_vector_free(z);
288 	gsl_vector_free(y);
289 	gsl_vector_free(r);
290 	gsl_vector_free(w);
291 	gsl_vector_free(zprev);
292 	gsl_vector_free(zdiff);
293 	gsl_vector_free(q);
294 	gsl_vector_free(Aq);
295 	gsl_vector_free(Atb);
296 	gsl_vector_free(p);
297
298 	return EXIT_SUCCESS;
299 }
300
301 double objective(gsl_matrix *A, gsl_vector *b, double lambda, gsl_vector *z) {
302 	double obj = 0;
303 	gsl_vector *Azb = gsl_vector_calloc(A->size1);
304 	gsl_blas_dgemv(CblasNoTrans, 1, A, z, 0, Azb);
305 	gsl_vector_sub(Azb, b);
306 	double Azb_nrm2;
307 	gsl_blas_ddot(Azb, Azb, &Azb_nrm2);
308 	obj = 0.5 * Azb_nrm2 + lambda * gsl_blas_dasum(z);
309 	gsl_vector_free(Azb);
310 	return obj;
311 }
312
313 void soft_threshold(gsl_vector *v, double k) {
314 	double vi;
315 	for (int i = 0; i < v->size; i++) {
316 		vi = gsl_vector_get(v, i);
317 		if (vi > k)       { gsl_vector_set(v, i, vi - k); }
318 		else if (vi < -k) { gsl_vector_set(v, i, vi + k); }
319 		else              { gsl_vector_set(v, i, 0); }
320 	}
321 }
