%===============================================================================================================================
% Implements the Sum of Functions Optimizer (SFO), as described in the paper:
% "Fast large-scale optimization by unifying stochastic gradient and quasi-Newton methods".
%===============================================================================================================================

classdef sfo < handle
    properties
        % parameters used to implement the algorithm
        display = 0;
        f_df;
        args;
        max_history = 10;
        max_gradient_noise = 1;
        hess_max_dev = 1e8;
        hessian_init = 1e5;
        N;
        subfunction_references;
        theta_original;   % theta, in its original format
        theta;            % theta, flattened into a 1d array
        theta_prior_step; % theta from the previous learning step -- initialized to theta
        M;                % number of data dimensions
        step_scale = 1;   % the update steps will be rescaled by this
        eps = 1e-12;      % 'very small' for various tasks
        minimum_step_length = 1e-8; % shortest step length allowed
        max_step_length_ratio = 10; % longest allowed update step length relative to the average length of prior update steps
        % the min & max dimensionality for the subspace
        K_min;
        K_max;
        K_current = 1;    % current dimensionality of the subspace
        P;                % obj.P holds the subspace
        % store the minimum & maximum eigenvalue from each approximate Hessian
        min_eig_sub;
        max_eig_sub;
        % store the total time spent in optimization, & the amount of time spent in the objective function
        time_pass = 0.;
        time_func = 0.;
        iter_since_active_growth = 0; % how many steps since the active set size was increased
        % which subfunctions are active
        init_subf = 2;
        active;
        total_distance = 0.; % the total path length traveled during optimization
        eval_count;       % number of function evaluations for each subfunction
        theta_proj;       % theta projected into the current working subspace
        last_theta;       % holds the last position for all the objective functions
        last_df;          % holds the last gradient for all the objective functions
        hist_deltatheta;  % the history of theta changes for each subfunction
        hist_deltadf;     % the history of gradient changes for each subfunction
        hist_f;           % the history of function values for each subfunction
        hist_f_flat = []; % a flat history of all returned subfunction values for debugging/diagnostics
        b;                % the approximate Hessian for each subfunction is stored as dot(b(:,:,index), b(:,:,index).')
        full_H = 0;       % the full Hessian (sum over all the subfunctions)
        varargin_stored = {}; % parameters that are passed through to f_df
        f_predicted_total_improvement = 0; % the predicted improvement in the total objective from the current update step
    end

    methods
        %% Initializes the optimizer class:
        function obj = sfo(f_df, theta, subfunction_references, varargin)
            % Parameters:
            % f_df - Returns the function value and gradient for a single subfunction call:
            %     [f, dfdtheta] = f_df(theta, subfunction_references{idx}, varargin{:})
            %     where idx is the index of a single subfunction.
            % theta - The initial parameters to be used for optimization. theta can be either a vector,
            %     a matrix, or a cell array with a vector or matrix in every cell.
            % subfunction_references - A cell array containing an identifying element for each subfunction.
            %     The elements in this list could be, e.g., matrices containing minibatches, or indices
            %     identifying the subfunction.
            % varargin - Any additional parameters will be passed through to f_df each time it is called.
            % Returns:
            % obj - sfo class instance.
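            %
            % A minimal sketch of a compatible f_df (hypothetical, not part of the optimizer):
            % each subfunction is the least-squares error on one minibatch, where the reference
            % is assumed to be a struct with fields A (design matrix) and y (targets):
            %
            %     function [f, dfdtheta] = ls_f_df(theta, ref)
            %         r = ref.A*theta - ref.y;    % residual on this minibatch
            %         f = 0.5*sum(r.^2);          % subfunction value
            %         dfdtheta = ref.A.'*r;       % gradient with respect to theta
            %     end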
            obj.N = length(subfunction_references);
            obj.theta_original = theta;
            obj.theta = obj.theta_original_to_flat(obj.theta_original);
            obj.theta_prior_step = obj.theta;
            obj.M = length(obj.theta);
            obj.f_df = f_df;
            obj.varargin_stored = varargin;
            obj.subfunction_references = subfunction_references;

            subspace_dimensionality = 2.*obj.N+2;  % the +2 is to include the current position & gradient
            subspace_dimensionality = min([subspace_dimensionality, obj.M]);  % the subspace can't be larger than the full space

            % the min & max dimensionality for the subspace
            obj.K_min = subspace_dimensionality;
            obj.K_max = ceil(obj.K_min.*1.5);
            obj.K_max = min([obj.K_max, obj.M]);

            % obj.P holds the subspace
            obj.P = zeros(obj.M,obj.K_max);

            % store the minimum & maximum eigenvalue from each approximate Hessian
            obj.min_eig_sub = zeros(obj.N,1);
            obj.max_eig_sub = zeros(obj.N,1);

            % which subfunctions are active
            obj.active = false(obj.N,1);
            obj.init_subf = min(obj.N, obj.init_subf);
            inds = randperm(obj.N, obj.init_subf);
            obj.active(inds) = true;
            obj.min_eig_sub(inds) = obj.hessian_init;
            obj.max_eig_sub(inds) = obj.hessian_init;

            % number of function evaluations for each subfunction
            obj.eval_count = zeros(obj.N,1);

            % set the first column of the subspace to be the initial theta
            rr = sqrt(sum(obj.theta.^2));
            if rr > 0
                obj.P(:,1) = obj.theta/rr;
            else
                % initial theta is 0 -- initialize randomly
                obj.P(:,1) = randn(obj.M,1);
                obj.P(:,1) = obj.P(:,1) / sqrt(sum(obj.P(:,1).^2));
            end

            if obj.M == obj.K_max
                % the subspace spans the full space, so P is set to the identity matrix
                obj.P = eye(obj.M);
                obj.K_current = obj.M+1;
            end

            % theta projected into the current working subspace
            obj.theta_proj = obj.P' * obj.theta;
            % holds the last position & the last gradient for all the objective functions
            obj.last_theta = obj.theta_proj * ones(1,obj.N);
            obj.last_df = zeros(obj.K_max,obj.N);
            % the history of theta changes for each subfunction
            obj.hist_deltatheta = zeros(obj.K_max,obj.max_history,obj.N);
            % the history of gradient changes for each subfunction
            obj.hist_deltadf = zeros(obj.K_max,obj.max_history,obj.N);
            % the history of function values for each subfunction
            obj.hist_f = ones(obj.N, obj.max_history).*nan;
            % the approximate Hessian for each subfunction is stored
            % as dot(obj.b(:,:,index), obj.b(:,:,index).')
            obj.b = zeros(obj.K_max,2.*obj.max_history,obj.N);
        end

        %% Optimize the objective function.
        function theta = optimize(obj, num_passes)
            % Parameters:
            % num_passes - The number of effective passes through subfunction_references to perform.
            % Returns:
            % theta - The estimated parameter vector after num_passes of optimization.
            num_steps = ceil(num_passes.*obj.N);  % number of learning steps
            for i = 1:num_steps
                obj.optimization_step();  % perform a single optimization step
            end
            theta = obj.theta_flat_to_original(obj.theta);
        end
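        % A hypothetical end-to-end usage sketch (the quadratic subfunctions, sizes, and variable
        % names below are illustrative assumptions, not part of this class):
        %
        %     M = 20; N = 10;                        % parameter dimension, number of minibatches
        %     refs = cell(1, N);
        %     for n = 1:N
        %         refs{n} = struct('A', randn(50, M), 'y', randn(50, 1));  % one minibatch per subfunction
        %     end
        %     f_df = @(theta, ref) deal(0.5*sum((ref.A*theta - ref.y).^2), ref.A.'*(ref.A*theta - ref.y));
        %     optimizer = sfo(f_df, randn(M, 1), refs);
        %     theta_opt = optimizer.optimize(20);    % roughly 20 effective passes through the data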
        %% The following functions are used to realize a single optimization step:
        %
        % The subspace is expanded at each optimization step t to include the most recent gradient direction as an additional
        % column, by finding the component of the gradient vector that lies outside the existing subspace and appending it to
        % the current subspace. The following equations are used:
        %
        % $$ q_{orth}=f_{j}^{'}(x^{t})-P^{t-1}(P^{t-1})^{T} f_{j}^{'}(x^{t}) $$   Equation (7)
        %
        % $$ P^{t}=\left[P^{t-1} \quad \frac{q_{orth}}{\left\|q_{orth}\right\|}\right] $$   Equation (8)
        %
        function update_subspace(obj, x_in)
            % Update the low dimensional subspace by adding a new gradient direction x_in to be incorporated into the subspace.
            if obj.K_current >= obj.M
                return;  % no need to update the subspace if it spans the full space
            end
            if sum(~isfinite(x_in)) > 0
                return;  % bad vector!
            end
            x_in_length = sqrt(sum(x_in.^2));
            if x_in_length < obj.eps
                return;  % nothing to do with a new vector of (nearly) zero length
            end
            xnew = x_in/x_in_length;  % make x unit length

            % Find the component of x pointing out of the existing subspace.
            % This is done multiple times for numerical stability.
            % $$ q_{orth}=f_{j}^{'}(x^{t})-P^{t-1}(P^{t-1})^{T} f_{j}^{'}(x^{t}) $$   Equation (7)
            for i = 1:3
                xnew = xnew - obj.P * (obj.P.'*xnew);
                ss = sqrt(sum(xnew.^2));
                if ss < obj.eps
                    % barely points out of the existing subspace
                    return;  % no need to add a new direction to the subspace
                end
                xnew = xnew / ss;  % make it unit length
                if ss > 0.1
                    break;  % largely orthogonal, so numerical stability is good
                end
            end

            % update the subspace with a new column containing the new direction
            % $$ P^{t}=\left[P^{t-1} \quad \frac{q_{orth}}{\left\|q_{orth}\right\|}\right] $$   Equation (8)
            obj.P(:,obj.K_current+1) = xnew;
            obj.K_current = obj.K_current + 1;

            % Restricting the size of the subspace: in case the subspace dimensionality K^{t} exceeds K_{max}, it is collapsed
            % to a new subspace that includes only the most recent gradient and position measurements from each subfunction,
            % through a QR decomposition.
            if obj.K_current >= obj.K_max
                % collapse the subspace when it exceeds its maximum allowed size
                % xl is passed explicitly to make sure it's used even if it is not in the history yet
                xl = (obj.P.') * (x_in);
                obj.collapse_subspace(xl);  % use the function collapse_subspace below
            end
        end
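        % A standalone sketch of the projection in Equations (7)-(8) (toy sizes and hypothetical
        % variable names; repeating the projection, as update_subspace does above, reduces the
        % round-off error left by a single pass):
        %
        %     P = orth(randn(100, 10));     % an orthonormal subspace basis
        %     g = randn(100, 1);            % a new gradient direction
        %     q = g - P*(P.'*g);            % component of g outside the subspace, Equation (7)
        %     q = q - P*(P.'*q);            % repeat once more for numerical stability
        %     P = [P, q/norm(q)];           % append the unit-length direction, Equation (8)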
        %
        % Collapse the subspace to its smallest dimensionality. xl is a new direction that may not be in the history yet,
        % hence it has to be included explicitly. In case the subspace dimensionality K^{t} exceeds K_{max}, it is collapsed
        % to a new subspace that includes only the most recent gradient and position measurements from each subfunction,
        % through a QR decomposition:
        %
        % $$ P^{'}=orth\left(\left[f_{1}^{'}(x^{\tau_{1}^{t}}) \ldots f_{N}^{'}(x^{\tau_{N}^{t}}) \quad x^{\tau_{1}^{t}} \ldots x^{\tau_{N}^{t}}\right]\right) $$   Equation (9)
        %
        function collapse_subspace(obj, xl)
            % Collapse the subspace to its smallest dimensionality. xl is a new direction that may not
            % be in the history yet, hence it is included explicitly.

            Pl = zeros(obj.K_max,obj.K_max);  % the projection matrix from the old to the new subspace

            % yy will hold all the directions to pack into the subspace; it is initialized with random noise
            % so that it spans K_min dimensions even if not all the subfunctions are active yet
            yy = randn(obj.K_max,obj.K_min);
            % the most recent position & gradient for all active subfunctions, plus the current position & gradient
            % that are not saved in the history yet
            yz = [obj.last_df(:,obj.active), obj.last_theta(:,obj.active), xl, (obj.P.') * (obj.theta)];
            yy(:,1:size(yz, 2)) = yz;
            [Pl(:,1:obj.K_min), ~] = qr(yy, 0);

            obj.P = (obj.P) * (Pl);  % update the subspace
            obj.apply_subspace_transformation(Pl.', Pl);  % apply Pl (the projection matrix from the old to the new basis) to all the history terms
            obj.K_current = obj.K_min;  % update the stored subspace size
            obj.reorthogonalize_subspace();  % re-orthogonalize the subspace if small errors have accumulated
        end

        %
        % Re-orthogonalizes the subspace if small errors have accumulated, by performing an orthogonal-triangular (QR)
        % decomposition of the non-orthogonal subspace.
        %
        function reorthogonalize_subspace(obj)
            % check whether the subspace has become non-orthogonal
            subspace_eigs = eig(obj.P.'*obj.P);
            if max(subspace_eigs) <= 1 + obj.eps
                return
            end
            % perform a QR decomposition of the non-orthogonal subspace
            [Porth,~] = qr(obj.P(:,1:obj.K_current),0);  % orthogonal-triangular decomposition
            Pl = zeros(obj.K_max, obj.K_max);
            Pl(:,1:obj.K_current) = obj.P.' * Porth;
            obj.P(:,1:obj.K_current) = Porth;  % update the subspace
            obj.apply_subspace_transformation(Pl.',Pl);  % apply Pl (the projection matrix from the old to the new basis) to all history terms
        end
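        % A minimal sketch of the collapse in Equation (9) (hypothetical sizes and names): the
        % retained directions are packed into columns and an orthonormal basis is recovered with
        % an economy-size QR decomposition, as collapse_subspace does above:
        %
        %     K_max = 12; Na = 3;
        %     last_df_active = randn(K_max, Na); last_theta_active = randn(K_max, Na);
        %     xl = randn(K_max, 1); theta_proj = randn(K_max, 1);
        %     Y = [last_df_active, last_theta_active, xl, theta_proj];  % directions to keep
        %     [Q, ~] = qr(Y, 0);    % orthonormal basis for span(Y), Equation (9)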
        %
        % Apply a change-of-subspace transformation when the subspace is collapsed, to project into the new lower
        % dimensional subspace, where T_left and T_right are the covariant and contravariant subspace projection
        % matrices, respectively.
        %
        function apply_subspace_transformation(obj,T_left,T_right)
            % Apply a change-of-subspace transformation. This function is called when the subspace is collapsed,
            % to project into the new lower dimensional subspace.
            % T_left - The covariant subspace projection matrix.
            % T_right - The contravariant subspace projection matrix.
            % (Note that for an orthogonal subspace T_left = T_right.)

            [tt, ss] = size(T_left);

            % project history terms into the new subspace
            obj.last_df = (T_right.') * (obj.last_df);
            obj.last_theta = (T_left) * (obj.last_theta);
            obj.hist_deltadf = obj.reshape_wrapper(T_right.' * obj.reshape_wrapper(obj.hist_deltadf, [ss,-1]), [tt,-1,obj.N]);
            obj.hist_deltatheta = obj.reshape_wrapper(T_left * obj.reshape_wrapper(obj.hist_deltatheta, [ss,-1]), [tt,-1,obj.N]);
            % project the stored Hessian for each subfunction into the new subspace
            obj.b = obj.reshape_wrapper(T_right.' * obj.reshape_wrapper(obj.b, [ss,-1]), [tt, 2.*obj.max_history,obj.N]);

            %% RECOMPUTE full_H & theta_proj when the subspace is collapsed, to avoid slow accumulation of numerical errors
            obj.theta_proj = (obj.P.') * (obj.theta);  % theta projected into the current working subspace
            obj.full_H = real(obj.reshape_wrapper(obj.b,[ss,-1]) * obj.reshape_wrapper(obj.b,[ss,-1]).');  % full approximate Hessian
        end

        %
        % Only one subfunction $g_{j}^t(x)$, of index $j$, is chosen for update per step, according to Equation (13):
        % the one whose last evaluation is farthest from the current location (and hence least accurate):
        %
        % $$ j= argmax_{i} \, [x^{t}-x^{\tau_{i}}]^{T} Q^{t} [x^{t}-x^{\tau_{i}}] $$   Equation (13)
        %
        % where ${\tau_{i}}$ is the time at which subfunction $i$ was last evaluated, and Q is set either to the
        % approximate Hessian for that subfunction (a more accurate measure of distance) or to the full Hessian
        % (which avoids relying on a possibly bad Hessian estimate for a single subfunction).
        %
        function indx = get_target_index(obj)
            % Choose which subfunction to update in the current learning iteration.

            % If an active subfunction has fewer than two observations, evaluate it again:
            % two evaluations per subfunction are needed to estimate its Hessian.
            gd = find((obj.eval_count < 2) & obj.active);
            if ~isempty(gd)
                indx = gd(randperm(length(gd), 1));
                return
            end

            % choose the subfunction whose last evaluation is farthest away, measured either by the total
            % (weighted) Hessian or by the subfunction's own Hessian
            if randn() < 0
                max_dist = -1;
                indx = -1;
                for i = 1:obj.N
                    dtheta = obj.theta_proj - obj.last_theta(:,i);
                    bdtheta = obj.b(:,:,i).' * dtheta;
                    dist = sum(bdtheta.^2) + sum(dtheta.^2)*obj.min_eig_sub(i);
                    if (dist > max_dist) && obj.active(i)
                        max_dist = dist;
                        indx = i;
                    end
                end
            else
                dtheta = bsxfun(@plus, obj.theta_proj, -obj.last_theta);  % difference between the current theta & each subfunction's most recent evaluation
                full_H_combined = obj.get_full_H_with_diagonal();  % full Hessian
                distance = sum(dtheta.*(full_H_combined * dtheta), 1);  % squared distance
                [~, dist_ord] = sort(-distance);  % sort the distances from largest to smallest
                dist_ord = dist_ord(obj.active(dist_ord));  % keep indices of active subfunctions
                indx = dist_ord(1);  % choose the active subfunction that is farthest from the current location
            end
        end

        %
        % The chosen subfunction of index indx is updated using a second order power series (Equation (6)) around
        % the vector $x^t$ chosen in step (i):
        %
        % $$ g_{i}^{t}(x)=f_{i}(x^{t})+(x-x^{t})^{T}f_{i}^{'}(x^{t})+\frac{1}{2}(x-x^{t})^{T}H_{i}^{t}(x-x^{t}) $$   Equation (6)
        %
        % where the quadratic term $H_{i}^{t}$ is the Hessian of this subfunction, approximated using the BFGS
        % algorithm based on the subfunction's history of gradient evaluations (details in step 4).
        %
        % The following function gets an estimate of subfunction indx at theta_proj using the second order power series:
        function f_pred = get_predicted_subf(obj, indx, theta_proj)
            % Get the predicted value of subfunction indx at theta_proj, which lies inside the subspace.
            dtheta = theta_proj - obj.last_theta(:,indx);  % (x-x^{t})
            bdtheta = obj.b(:,:,indx).' * dtheta;
            Hdtheta = real(obj.b(:,:,indx) * bdtheta);
            Hdtheta = Hdtheta + dtheta.*obj.min_eig_sub(indx);  % the diagonal contribution
            f_pred = obj.hist_f(indx,1) + obj.last_df(:,indx).' * dtheta + 0.5.*(dtheta.') * (Hdtheta);
        end
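        % A toy illustration of evaluating the quadratic model of Equation (6) with the same
        % low-rank plus diagonal Hessian representation (H = b*b.' + min_eig*I) that this class
        % stores per subfunction (all names below are hypothetical):
        %
        %     K = 8; b_i = randn(K, 4); min_eig = 0.1;
        %     f_last = 1.3; df_last = randn(K, 1);
        %     dtheta  = randn(K, 1);                          % x - x^t in the subspace
        %     Hdtheta = b_i*(b_i.'*dtheta) + min_eig*dtheta;  % H*(x - x^t) without forming H
        %     g_pred  = f_last + df_last.'*dtheta + 0.5*dtheta.'*Hdtheta;  % Equation (6)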
        %
        % Online Hessian approximation: this section contains the calculations needed to maintain an independent Hessian
        % approximation for each subfunction, computed through the BFGS algorithm on that subfunction's history of gradient
        % evaluations and positions.
        %
        % For each subfunction ${g_{j}^t}(x)$, two matrices $\Delta f^{'}$ and $\Delta x$ are constructed to hold the change in
        % subfunction gradient and the corresponding change in position x between successive evaluations. This history is
        % updated using the following function, which updates the history of position differences & gradient differences
        % for subfunction indx.
        %
        function update_history(obj, indx, theta_proj, f, df_proj)
            % Update the history of position differences & gradient differences for subfunction indx.
            if obj.eval_count(indx) > 1
                % if there exists at least one earlier measurement from this subfunction,
                % then use it to compute position & gradient differences
                ddf = df_proj - obj.last_df(:,indx);  % difference in gradient
                ddt = theta_proj - obj.last_theta(:,indx);  % difference in position
                lddf = sqrt(sum(ddf.^2));  % length of the gradient change vector (l_2 norm)
                lddt = sqrt(sum(ddt.^2));  % length of the position change vector (l_2 norm)
                corr_ddf_ddt = ddf.' * ddt / (lddt*lddf);

                % update the corresponding history, unless the changes in theta & gradient and their inner product
                % are all vanishingly small
                if lddt > obj.eps || lddf > obj.eps || abs(corr_ddf_ddt) > obj.eps
                    obj.hist_deltatheta(:,2:end,indx) = obj.hist_deltatheta(:,1:end-1,indx);  % shift the history by one timestep
                    obj.hist_deltatheta(:,1,indx) = ddt;  % store the difference in theta since the subfunction was last evaluated
                    obj.hist_deltadf(:,2:end,indx) = obj.hist_deltadf(:,1:end-1,indx);  % shift the history by one timestep
                    obj.hist_deltadf(:,1,indx) = ddf;  % store the change in gradient since the subfunction was last evaluated
                end
            end

            obj.last_theta(:,indx) = theta_proj;
            obj.last_df(:,indx) = df_proj;
            obj.hist_f(indx,2:end) = obj.hist_f(indx,1:end-1);
            obj.hist_f(indx,1) = f;
        end

        %
        % The Hessian approximation for a single subfunction indx is then updated using the BFGS algorithm, as detailed in
        % the function update_hessian(obj,indx), which iterates through the columns of $\Delta f^{'}$ and $\Delta x$ from
        % oldest to most recent. At each column of index s the approximate Hessian should satisfy:
        %
        % * The secant equation
        %
        % $$ \Delta f_{s}^{'}=B_s \Delta x_s $$
        %
        % * The smallest weighted Frobenius norm distance between the current and prior Hessian estimates (the standard
        %   BFGS update equation)
        %
        % $$ B_{s}= B_{s-1}+ \frac{\Delta f_{s}^{'} \Delta f_{s}^{'^{T}}}{\Delta f_{s}^{'^{T}} \Delta x_{s}}- \frac{B_{s-1}\Delta x_{s} \Delta x_{s}^{T} B_{s-1}}{\Delta x_{s}^{T} B_{s-1}\Delta x_{s}} $$   Equation (10)
        %
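        % For reference, a sketch of the standard BFGS update of Equation (10) written directly on
        % a dense matrix B (toy data, hypothetical names; update_hessian below applies the same
        % update through the factored representation b_p instead):
        %
        %     K = 6; B = eye(K);          % prior Hessian estimate B_{s-1}
        %     s = randn(K, 1);            % \Delta x_s
        %     y = B*s + 0.1*randn(K, 1);  % \Delta f'_s (a toy "observed" gradient change)
        %     B = B + (y*y.')/(y.'*s) - (B*(s*s.')*B)/(s.'*B*s);  % Equation (10)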
        function update_hessian(obj, indx)
            %% BFGS initialization:
            gd = find(sum(obj.hist_deltatheta(:,:,indx).^2, 1)>0);  % test whether there is any history for this subfunction
            num_gd = length(gd);

            %% CASE (I): insufficient history to run BFGS
            if num_gd == 0
                % if there is no history, initialize using the median eigenvalue of the full Hessian of the other subfunctions
                obj.b(:,:,indx) = 0.;
                H = obj.get_full_H_with_diagonal();
                [U, ~] = obj.eigh_wrapper(H);
                obj.min_eig_sub(indx) = median(U)/sum(obj.active);  % median eigenvalue of the average Hessian of the other active subfunctions
                obj.max_eig_sub(indx) = obj.min_eig_sub(indx);
                if obj.eval_count(indx) > 2
                    if sum(obj.eval_count) < 5
                        % Subfunction was evaluated multiple times, but has no stored history.
                        % SFO should probably have been initialized with a smaller hessian_init value, so scale it down here.
                        obj.min_eig_sub(indx) = obj.min_eig_sub(indx) / 10.;
                    end
                end
                return;
            end

            %% CASE (II): enough history to run BFGS
            % work in the subspace defined by this subfunction's history (P_hist)
            [P_hist, ~] = qr([obj.hist_deltatheta(:,gd,indx),obj.hist_deltadf(:,gd,indx)], 0);
            deltatheta_P = P_hist.' * obj.hist_deltatheta(:,gd,indx);
            deltadf_P = P_hist.' * obj.hist_deltadf(:,gd,indx);

            %% approximate the smallest eigenvalue \beta by solving the squared secant equation for the full history;
            %% \beta will be used as the diagonal initialization for BFGS (i.e. B_0 = \beta I)
            % calculate the Hessian using pinv & the squared secant equation:
            %     \Delta f_{s}^{'}=B_s \Delta x_s   (secant equation, df = H dx)
            %     (df^T df = dx^T H^T H dx = dx^T H^2 dx)
            %     Q=\left[(\Delta x)^{+^{T}} (\Delta f^{'})^{T} \Delta f^{'} (\Delta x)^{+}\right]^{1/2}   Equation (12)
            pdelthet = pinv(deltatheta_P);
            dd = (deltadf_P) * (pdelthet);
            H2 = (dd.') * (dd);
            [H2w, ~] = obj.eigh_wrapper(H2);
            H2w = sqrt(abs(H2w));
            H2w = sort(H2w, 'descend');  % only the top (~num_gd) eigenvalues are expected to be well defined
            H2w = H2w(1:num_gd);

            % Enforcing positive definiteness of the estimated Hessian: each estimated $H_{i}^{t}$ is constrained to be
            % positive definite by explicit eigendecomposition, setting any too-small eigenvalues to the median positive
            % eigenvalue.
            if min(H2w) == 0 || sum(~isfinite(H2w)) > 0
                % a (near-)zero or non-finite eigenvalue indicates a poor estimate of curvature: this history is unusable,
                % due to degenerate deltadf (zero case) or deltatheta (non-finite case)
                H2w(:) = max(obj.min_eig_sub(obj.active));  % initialize using the other subfunctions
            end
            obj.min_eig_sub(indx) = min(H2w);
            obj.max_eig_sub(indx) = max(H2w);

            % constrain the Hessian initialization
            if obj.min_eig_sub(indx) < obj.max_eig_sub(indx)/obj.hess_max_dev
                % eigenvalues smaller than (\gamma \lambda_{max}) are constrained using the allowed ratio
                obj.min_eig_sub(indx) = obj.max_eig_sub(indx)/obj.hess_max_dev;
            end

            %% Recalculate the Hessian
            num_hist = size(deltatheta_P, 2);  % number of history terms (columns)
            % the new Hessian will be (b_p)*(b_p.') + eye().*obj.min_eig_sub(indx)
            b_p = zeros(size(P_hist,2), num_hist*2);
            % step through the history, from oldest to most recent
            for hist_i = num_hist:-1:1
                s = deltatheta_P(:,hist_i);
                y = deltadf_P(:,hist_i);
                % for numerical stability
                rscl = sqrt(sum(s.^2));
                s = s/rscl;
                y = y/rscl;
                % the BFGS step proper
                Hs = s.*obj.min_eig_sub(indx) + b_p*((b_p.')*(s));
                term1 = y/sqrt(sum(y.*s));
                sHs = sum(s.*Hs);
                term2 = sqrt(complex(-1.)).*Hs/sqrt(sHs);
                if sum(~isfinite(term1)) > 0 || sum(~isfinite(term2)) > 0
                    obj.min_eig_sub(indx) = max(H2w);  % invalid BFGS history term
                    continue;
                end
                b_p(:,2*(hist_i-1)+2) = term1;
                b_p(:,2*(hist_i-1)+1) = term2;
            end

            H = real((b_p) * (b_p.')) + eye(size(b_p, 1))*obj.min_eig_sub(indx);
            % constrain the Hessian to be positive definite
            [U, V] = obj.eigh_wrapper(H);
            if max(U) <= 0.
                % if there aren't any positive eigenvalues after BFGS,
                % set them all to the same conservative diagonal value
                U(:) = obj.max_eig_sub(indx);
            end
            % set any too-small eigenvalues to the median positive eigenvalue
            U_median = median(U(U>0));
            U(U<(max(abs(U))/obj.hess_max_dev)) = U_median;
            % the Hessian after it has been forced to be positive definite
            H_posdef = bsxfun(@times, V, U') * V.';
            % break it apart into the matrix b & a diagonal term again
            B_pos = H_posdef - eye(size(b_p, 1))*obj.min_eig_sub(indx);
            [U, V] = obj.eigh_wrapper(B_pos);
            b_p = bsxfun(@times, V, sqrt(obj.reshape_wrapper(U, [1,-1])));
            obj.b(:,:,indx) = 0.;
            obj.b(:,1:size(b_p,2),indx) = (P_hist)*(b_p);
        end
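        % A small sketch of the positive-definiteness repair used in update_hessian above
        % (hypothetical names): eigendecompose a symmetric matrix, replace the too-small or
        % negative eigenvalues, and rebuild it. (The method above additionally handles the case
        % where no eigenvalue is positive before taking the median.)
        %
        %     H = randn(5); H = 0.5*(H + H.');             % a symmetric, possibly indefinite matrix
        %     [V, D] = eig(H); d = diag(D);
        %     d(d < max(abs(d))/1e8) = median(d(d > 0));   % floor tiny/negative eigenvalues
        %     H_posdef = V*diag(d)*V.';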
        %
        % The full approximate Hessian is obtained using the following function:
        function full_H_combined = get_full_H_with_diagonal(obj)
            % Get the full approximate Hessian, including the diagonal terms that are not stored in obj.full_H.
            full_H_combined = obj.full_H + eye(obj.K_max).*sum(obj.min_eig_sub(obj.active));
        end

        %
        % Detecting bad updates: bad proposed parameter updates are identified using the function
        % *handle_step_failure(obj, f, df_proj, indx)*. Since only one subfunction is evaluated per update step, a line
        % search on the full objective is impossible; updates are therefore instead labelled bad when the value of a
        % subfunction has increased since its previous evaluation, and also exceeds its approximating function by more
        % than the corresponding reduction in the summed approximating function
        % (i.e. $f_j(x^t) - g_j^{t-1}(x^t) > G^{t-1}(x^{t-1}) - G^{t-1}(x^{t})$).
        %
        % When a bad update is detected, $x^t$ is reset to its previous value $x^{t-1}$, the BFGS history matrices are
        % also updated to include the change in gradient in the failed update direction, and the step length is updated
        % as follows:
        %
        % $$ \eta =\left\{\begin{matrix}\frac{1}{N}+\frac{N-1}{N}\,\eta^{t} & \text{successful update}\\ \frac{1}{2}\,\eta^{t} & \text{failed update}\end{matrix}\right. $$   Equation (16)
        %
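        % Equation (16) as a standalone sketch (hypothetical scalar variables): on success the step
        % scale decays back toward 1 over roughly N steps, on failure it is halved:
        %
        %     N = 10; step_scale = 1; step_failure = false;
        %     if ~step_failure
        %         step_scale = 1/N + step_scale*(1 - 1/N);   % decay back toward 1
        %     else
        %         step_scale = step_scale/2;                 % halve the step length
        %     end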
        function [step_failure, f, df_proj] = handle_step_failure(obj, f, df_proj, indx)
            % Check whether an update step failed, and update the current position accordingly.
            step_failure = false;
            % check to see whether the step should be considered a failure
            if ~isfinite(f) || sum(~isfinite(df_proj))>0
                step_failure = true;  % if the function or its gradient is not finite, then it's a failure
            elseif obj.eval_count(indx) == 1
                % a new subfunction with only one evaluation
                if max(obj.eval_count) > 1
                    if f > mean(obj.hist_f(obj.eval_count>1,1)) + 3*std(obj.hist_f(obj.eval_count>1,1))
                        step_failure = true;  % much larger than expected, so treat it as a failure
                    end
                end
            elseif f > obj.hist_f(indx,1)
                % the subfunction has increased in value compared to its history
                % (in this case, check whether it's larger than its predicted value by enough to trigger a failure)
                f_pred = obj.get_predicted_subf(indx, obj.theta_proj);  % the predicted value of this subfunction
                % does the subfunction exceed its predicted value by more than the predicted improvement in the other subfunctions?
                predicted_improvement_others = obj.f_predicted_total_improvement - (obj.hist_f(indx,1) - f_pred);
                if f - f_pred > predicted_improvement_others
                    step_failure = true;  % then the step is a failure
                end
            end

            % update the step length for both cases (Equation (16))
            if ~step_failure
                obj.step_scale = 1./obj.N + obj.step_scale .* (1. - 1./obj.N);  % decay the step_scale back towards 1
            else
                obj.step_scale = obj.step_scale / 2.;  % shorten the step length

                % the subspace may be updated during the function calls, so store this in the full space
                df = (obj.P) * (df_proj);

                [f_lastpos, df_lastpos_proj] = obj.f_df_wrapper(obj.theta_prior_step, indx);
                df_lastpos = (obj.P) * (df_lastpos_proj);

                % if the function value exploded, then back it off until it's within a reasonable order of magnitude
                % before adding anything to the history
                f_pred = obj.get_predicted_subf(indx, obj.theta_proj);
                if isfinite(obj.hist_f(indx,1))
                    predicted_f_diff = abs(f_pred - obj.hist_f(indx,1));
                else
                    predicted_f_diff = abs(f - f_lastpos);
                end
                if ~isfinite(predicted_f_diff) || predicted_f_diff < obj.eps
                    predicted_f_diff = obj.eps;
                end

                for i_ls = 1:10
                    if f - f_lastpos < 10*predicted_f_diff
                        % the failed update is within an order of magnitude of the target update value
                        break;  % no backoff required
                    end
                    % make the step length a factor of 100 shorter
                    obj.theta = 0.99.*obj.theta_prior_step + 0.01.*obj.theta;
                    obj.theta_proj = (obj.P.') * (obj.theta);
                    % recompute f & df at this new location
                    [f, df_proj] = obj.f_df_wrapper(obj.theta, indx);
                    df = (obj.P) * (df_proj);
                end

                % move into the subspace
                df_proj = (obj.P.') * (df);
                df_lastpos_proj = (obj.P.') * (df_lastpos);

                if f < f_lastpos
                    % the step failed, but the last position was even worse (the objective at the new position is better)
                    theta_lastpos_proj = (obj.P.') * (obj.theta_prior_step);
                    obj.update_history(indx, theta_lastpos_proj, f_lastpos, df_lastpos_proj);  % add the newly evaluated point to the history
                else
                    % the step failed and the proposed position is worse than the last position:
                    % add the change in theta & the change in gradient to the history for this subfunction
                    % before failing over to the last position
                    if isfinite(f) && sum(~isfinite(df_proj))==0
                        obj.update_history(indx, obj.theta_proj, f, df_proj);
                    end
                    f = f_lastpos;
                    df_proj = df_lastpos_proj;
                    obj.theta = obj.theta_prior_step;
                    obj.theta_proj = (obj.P.') * (obj.theta);
                end
            end
        end
        %
        % Growing the number of active subfunctions: the algorithm begins with only a small number of active subfunctions
        % (two in this implementation) and expands the active set as learning progresses, by one subfunction every time the
        % average gradient shrinks to within a factor $\alpha$ of the standard error in the average gradient, i.e. the
        % active subset is incremented whenever
        %
        % $$ (\bar{f}^{'t})^{T} (H^{t})^{-1}\bar{f}^{'t} < \alpha \frac{\sum_{i}\left(f_{i}^{'}(x^{\tau_{i}^{t}})-\bar{f}^{'t}\right)^{T} (H^{t})^{-1}\left(f_{i}^{'}(x^{\tau_{i}^{t}})-\bar{f}^{'t}\right)}{(N^{t}-1)N^{t}} $$   Equation (14)
        %
        % where $N^{t}$ is the active subset size at time t, $H^{t}$ is the full Hessian, and $\bar{f}^{'t}$ is the average
        % gradient, given by:
        %
        % $$ \bar{f}^{'t}= \frac{1}{N^t} \sum_{i} f_{i}^{'} (x^{\tau_{i}^{t}}) $$   Equation (15)
        %
        % The active subset is also increased when a bad update is detected, or when a full pass through the active batch
        % occurs without a batch size increase.
        %
        function expand_active_subfunctions(obj, full_H_inv, step_failure)
            % expand the set of active subfunctions as appropriate

            % power in the average gradient direction
            df_avg = mean(obj.last_df(:,obj.active), 2);  % average gradient (Equation (15))
            p_df_avg = sum(df_avg .* (full_H_inv * df_avg));  % left hand side of Equation (14)
            % power of the standard error
            ldfs = obj.last_df(:,obj.active) - df_avg*ones(1,sum(obj.active));
            num_active = sum(obj.active);  % size of the active subset
            p_df_sum = sum(sum(ldfs .* (full_H_inv * ldfs))) / num_active / (num_active - 1);

            % increase the size of the active set if the standard error in the estimated gradient is of the same order
            % of magnitude as the gradient itself
            increase_desirable = (p_df_sum >= p_df_avg.*obj.max_gradient_noise);
            % increase the size of the active set on a step failure
            increase_desirable = increase_desirable || step_failure;
            % increase the size of the active set if a full pass has been done without updating it
            increase_desirable = increase_desirable || (obj.iter_since_active_growth > num_active);
            % check that all active subfunctions have enough evaluations for a Hessian approximation
            % before adding new subfunctions
            eligible_for_increase = (min(obj.eval_count(obj.active)) >= 2);
            % one more iteration has passed since the active set was last expanded
            obj.iter_since_active_growth = obj.iter_since_active_growth + 1;

            if increase_desirable && eligible_for_increase && sum(~obj.active) > 0
                % choose an inactive subfunction to activate, at random
                new_gd = find(~obj.active);
                new_gd = new_gd(randperm(length(new_gd), 1));
                if ~isempty(new_gd)
                    obj.iter_since_active_growth = 0;
                    obj.active(new_gd) = true;
                end
            end
        end
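        % Equations (14)-(15) as a standalone sketch (hypothetical names): the active set is grown
        % when the squared length of the mean gradient, measured through the inverse Hessian, falls
        % below the scaled standard error:
        %
        %     Na = 4; K = 6; alpha = 1;
        %     last_df_active = randn(K, Na); H_inv = eye(K);
        %     df_avg   = mean(last_df_active, 2);                           % Equation (15)
        %     p_mean   = df_avg.' * (H_inv * df_avg);
        %     resid    = bsxfun(@minus, last_df_active, df_avg);
        %     p_stderr = sum(sum(resid .* (H_inv*resid))) / (Na*(Na - 1));
        %     grow     = p_mean < alpha * p_stderr;                         % Equation (14)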
        %
        % Perform a single optimization step. Choose an index using the get_target_index() function.
        % Calculate the gradient, the inverse of the Hessian, and finally the Newton step.
        % The Newton step length is adjusted in order to avoid numerical errors caused by very small steps, or bouncing
        % around the optimum caused by overly large steps. The active subfunction set is expanded at the end.
        function optimization_step(obj)
            % Perform a single optimization step.
            time_pass_start = tic();

            indx = obj.get_target_index();  %% choose an index to update

            [f, df_proj] = obj.f_df_wrapper(obj.theta, indx);  % evaluate the subfunction value & gradient at the new position

            % check for a failed update step, & adjust f, df, & obj.theta as appropriate if one occurs
            [step_failure, f, df_proj] = obj.handle_step_failure(f, df_proj, indx);

            % add the change in theta & the change in gradient to the history for this subfunction
            obj.update_history(indx, obj.theta_proj, f, df_proj);

            % increment the total distance traveled using the last update
            obj.total_distance = obj.total_distance + sqrt(sum((obj.theta - obj.theta_prior_step).^2));

            % the current contribution from this subfunction to the total Hessian approximation
            H_pre_update = real(obj.b(:,:,indx) * obj.b(:,:,indx).');
            obj.update_hessian(indx);  %% update this subfunction's Hessian estimate
            % the new contribution from this subfunction to the total approximate Hessian
            H_new = real(obj.b(:,:,indx) * obj.b(:,:,indx).');
            obj.full_H = obj.full_H + H_new - H_pre_update;  % update the total Hessian using this subfunction's updated contribution

            % calculate the total gradient, total Hessian, & total function value at the current location
            full_df = 0.;
            for i = 1:obj.N
                dtheta = obj.theta_proj - obj.last_theta(:,i);
                bdtheta = obj.b(:,:,i).' * dtheta;
                Hdtheta = real(obj.b(:,:,i) * bdtheta);
                Hdtheta = Hdtheta + dtheta.*obj.min_eig_sub(i);  % the diagonal contribution
                full_df = full_df + Hdtheta + obj.last_df(:,i);
            end
            full_H_combined = obj.get_full_H_with_diagonal();
            full_H_inv = inv(full_H_combined);

            % calculate an update step
            dtheta_proj = -(full_H_inv) * (full_df) .* obj.step_scale;  % the Newton step described in Equation (5)

            dtheta_proj_length = sqrt(sum(dtheta_proj(:).^2));
            if dtheta_proj_length < obj.minimum_step_length
                % force the update to have the minimum step length, to avoid numerical errors
                dtheta_proj = dtheta_proj*obj.minimum_step_length/dtheta_proj_length;
                dtheta_proj_length = obj.minimum_step_length;
            end
            if sum(obj.eval_count) > obj.N && dtheta_proj_length > obj.eps
                % only allow a step to be up to a factor of max_step_length_ratio longer than the average step length
                avg_length = obj.total_distance / sum(obj.eval_count);
                length_ratio = dtheta_proj_length / avg_length;
                ratio_scale = obj.max_step_length_ratio;
                if length_ratio > ratio_scale
                    % truncate the step length from dtheta_proj_length to ratio_scale*avg_length
                    dtheta_proj_length = dtheta_proj_length/(length_ratio/ratio_scale);
                    dtheta_proj = dtheta_proj/(length_ratio/ratio_scale);
                end
            end

            dtheta = (obj.P) * (dtheta_proj);  % the update to theta, in the full dimensional space
            obj.theta_prior_step = obj.theta;  % back up the prior position, in case this is a failed step
            obj.theta = obj.theta + dtheta;    % update theta to the new location
            obj.theta_proj = obj.theta_proj + dtheta_proj;

            % the predicted improvement from this update step
            obj.f_predicted_total_improvement = 0.5 .* dtheta_proj.' * (full_H_combined * dtheta_proj);

            obj.expand_active_subfunctions(full_H_inv, step_failure);  %% expand the set of active subfunctions as appropriate

            % record how much time was taken by this learning step
            time_diff = toc(time_pass_start);
            obj.time_pass = obj.time_pass + time_diff;
        end
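        % A condensed sketch of the update direction of Equation (5) (hypothetical names, and using
        % the backslash operator rather than an explicit inverse for the sketch; optimization_step
        % above additionally enforces minimum and maximum step lengths):
        %
        %     K = 6; full_df = randn(K, 1); full_H = 2*eye(K); step_scale = 0.5;
        %     dtheta_proj = -step_scale * (full_H \ full_df);   % Newton step in the subspace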
        %%
        function theta_flat = theta_original_to_flat(obj, theta_original)
            % Convert from the original parameter format into a 1d array.
            % The original format can be an array, or a cell array full of arrays.
            if iscell(theta_original)
                theta_length = 0;
                for theta_array = theta_original(:)'  % iterate over cells
                    theta_length = theta_length + numel(theta_array{1});
                end
                theta_flat = zeros(theta_length,1);
                i_start = 1;
                for theta_array = theta_original(:)'  % iterate over cells
                    i_end = i_start + numel(theta_array{1})-1;
                    theta_flat(i_start:i_end) = theta_array{1}(:);
                    i_start = i_end+1;
                end
            else
                theta_flat = theta_original(:);
            end
        end

        function theta_new = theta_flat_to_original(obj, theta_flat)
            % Convert from a 1d array back into the original parameter format.
            if iscell(obj.theta_original)
                theta_new = cell(size(obj.theta_original));
                i_start = 1;
                jj = 1;
                for theta_array = obj.theta_original(:)'  % iterate over cells
                    i_end = i_start + numel(theta_array{1})-1;
                    theta_new{jj} = obj.reshape_wrapper(theta_flat(i_start:i_end), size(theta_array{1}));
                    i_start = i_end + 1;
                    jj = jj + 1;
                end
            else
                theta_new = obj.reshape_wrapper(theta_flat, size(obj.theta_original));
            end
        end

        function [f, df_proj, df_full] = f_df_wrapper(obj, theta_in, idx)
            % A wrapper around the subfunction objective f_df that handles the transformation
            % into & out of the flattened parameterization used internally by SFO.
            theta_local = obj.theta_flat_to_original(theta_in);
            % evaluate
            t = tic();
            [f, df_full] = obj.f_df(theta_local, obj.subfunction_references{idx}, obj.varargin_stored{:});
            time_diff = toc(t);
            obj.time_func = obj.time_func + time_diff;  % time spent in function evaluation
            df_full = obj.theta_original_to_flat(df_full);
            % update the subspace with the new gradient direction
            obj.update_subspace(df_full);
            % gradient projected into the current subspace
            df_proj = (obj.P.') * (df_full);
            % keep a record of function evaluations
            obj.hist_f_flat = [obj.hist_f_flat, f];
            obj.eval_count(idx) = obj.eval_count(idx) + 1;
        end
    end

    methods(Static)
        function A = reshape_wrapper(A, shape)
            % A wrapper for reshape which duplicates the numpy behavior, setting
            % any -1 dimensions to the appropriate length.
            total_dims = numel(A);
            total_assigned = prod(shape(shape>0));
            shape(shape==-1) = total_dims/total_assigned;
            A = reshape(A, shape);
        end

        function [U, V] = eigh_wrapper(A)
            % A wrapper which duplicates the output order and format of the numpy
            % eigh routine (and enforces A to be symmetric).
            [V,U] = eig(0.5 * (A + A'));
            U = diag(U);
        end
    end
end
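% A hypothetical example of optimizing parameters stored as a cell array (e.g. the weights and
% biases of a linear model); the flattening helpers above make this transparent to the user:
%
%     theta_init = {randn(10, 5), zeros(10, 1)};       % e.g. {W, b}
%     % f_df receives theta in the same cell format and must return the gradient in that same
%     % format, e.g. [f, dfdtheta] = linear_f_df(theta, ref) with dfdtheta = {dW, db},
%     % where linear_f_df is a user-supplied function and refs is as in the earlier sketch.
%     optimizer = sfo(@linear_f_df, theta_init, refs);
%     theta_fit = optimizer.optimize(10);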