%===============================================================================================================================
% Implements the Sum of Functions Optimizer (SFO), as described in the paper:
% "Fast large-scale optimization by unifying stochastic gradient and quasi-Newton methods".
%===============================================================================================================================

classdef sfo < handle
    properties
        % parameters used to implement the algorithm
        display = 0;
        f_df;
        args;
        max_history = 10;
        max_gradient_noise = 1;
        hess_max_dev = 1e8;
        hessian_init = 1e5;
        N;
        subfunction_references;
        theta_original;   % theta, in its original format
        theta;            % theta, flattened into a 1d array
        theta_prior_step; % theta from the previous learning step -- initialized to theta
        M;                % number of data dimensions
        step_scale = 1;   % the update steps will be rescaled by this
        eps = 1e-12;      % 'very small' for various tasks
        minimum_step_length = 1e-8; % shortest step length allowed
        max_step_length_ratio = 10; % longest allowed update step length relative to the average length of prior update steps
        % the min & max dimensionality for the subspace
        K_min;
        K_max;
        K_current = 1;    % current dimensionality of the subspace
        P;                % obj.P holds the subspace
        % store the minimum & maximum eigenvalue from each approximate Hessian
        min_eig_sub;
        max_eig_sub;
        % store the total time spent in optimization, & the amount of time spent in the objective function
        time_pass = 0.;
        time_func = 0.;
        iter_since_active_growth = 0; % how many steps since the active set size was increased
        % which subfunctions are active
        init_subf = 2;
        active;
        total_distance = 0.; % the total path length traveled during optimization
        eval_count;       % number of function evaluations for each subfunction
        theta_proj;       % theta projected into the current working subspace
        last_theta;       % holds the last position for all the objective functions
        last_df;          % holds the last gradient for all the objective functions
        hist_deltatheta;  % the history of theta changes for each subfunction
        hist_deltadf;     % the history of gradient changes for each subfunction
        hist_f;           % the history of function values for each subfunction
        hist_f_flat = []; % a flat history of all returned subfunction values for debugging/diagnostics
        b;                % the approximate Hessian for each subfunction is stored as dot(b(:,:,index), b(:,:,index).')
        full_H = 0;       % the full Hessian (sum over all the subfunctions)
        varargin_stored = {}; % parameters that are passed through to f_df
        f_predicted_total_improvement = 0; % the predicted improvement in the total objective from the current update step
    end

    methods
        %% Initializes the optimizer class:
        function obj = sfo(f_df, theta, subfunction_references, varargin)
            % Parameters:
            % f_df - Returns the function value and gradient for a single subfunction call:
            %     [f, dfdtheta] = f_df(theta, subfunction_references{idx}, varargin{:})
            %     where idx is the index of a single subfunction.
            % theta - The initial parameters to be used for optimization. theta can be either a vector,
            %     a matrix, or a cell array with a vector or matrix in every cell.
            % subfunction_references - A cell array containing an identifying element for each subfunction.
            %     The elements in this list could be, e.g., matrices containing minibatches, or indices
            %     identifying the subfunction.
            % varargin - Any additional parameters will be passed through to f_df each time it is called.
            % Returns:
            % obj - sfo class instance.
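            %
            % A minimal sketch of a compatible f_df (hypothetical, not part of the optimizer):
            % each subfunction is the least-squares error on one minibatch, where the reference
            % is assumed to be a struct with fields A (design matrix) and y (targets):
            %
            %     function [f, dfdtheta] = ls_f_df(theta, ref)
            %         r = ref.A*theta - ref.y;    % residual on this minibatch
            %         f = 0.5*sum(r.^2);          % subfunction value
            %         dfdtheta = ref.A.'*r;       % gradient with respect to theta
            %     end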
            obj.N = length(subfunction_references);
            obj.theta_original = theta;
            obj.theta = obj.theta_original_to_flat(obj.theta_original);
            obj.theta_prior_step = obj.theta;
            obj.M = length(obj.theta);
            obj.f_df = f_df;
            obj.varargin_stored = varargin;
            obj.subfunction_references = subfunction_references;

            subspace_dimensionality = 2.*obj.N+2;  % the +2 is to include the current position & gradient
            subspace_dimensionality = min([subspace_dimensionality, obj.M]);  % the subspace can't be larger than the full space

            % the min & max dimensionality for the subspace
            obj.K_min = subspace_dimensionality;
            obj.K_max = ceil(obj.K_min.*1.5);
            obj.K_max = min([obj.K_max, obj.M]);

            % obj.P holds the subspace
            obj.P = zeros(obj.M,obj.K_max);

            % store the minimum & maximum eigenvalue from each approximate Hessian
            obj.min_eig_sub = zeros(obj.N,1);
            obj.max_eig_sub = zeros(obj.N,1);

            % which subfunctions are active
            obj.active = false(obj.N,1);
            obj.init_subf = min(obj.N, obj.init_subf);
            inds = randperm(obj.N, obj.init_subf);
            obj.active(inds) = true;
            obj.min_eig_sub(inds) = obj.hessian_init;
            obj.max_eig_sub(inds) = obj.hessian_init;

            % number of function evaluations for each subfunction
            obj.eval_count = zeros(obj.N,1);

            % set the first column of the subspace to be the initial theta
            rr = sqrt(sum(obj.theta.^2));
            if rr > 0
                obj.P(:,1) = obj.theta/rr;
            else
                % initial theta is 0 -- initialize randomly
                obj.P(:,1) = randn(obj.M,1);
                obj.P(:,1) = obj.P(:,1) / sqrt(sum(obj.P(:,1).^2));
            end

            if obj.M == obj.K_max
                % the subspace spans the full space, so P is set to the identity matrix
                obj.P = eye(obj.M);
                obj.K_current = obj.M+1;
            end

            % theta projected into the current working subspace
            obj.theta_proj = obj.P' * obj.theta;
            % holds the last position & the last gradient for all the objective functions
            obj.last_theta = obj.theta_proj * ones(1,obj.N);
            obj.last_df = zeros(obj.K_max,obj.N);
            % the history of theta changes for each subfunction
            obj.hist_deltatheta = zeros(obj.K_max,obj.max_history,obj.N);
            % the history of gradient changes for each subfunction
            obj.hist_deltadf = zeros(obj.K_max,obj.max_history,obj.N);
            % the history of function values for each subfunction
            obj.hist_f = ones(obj.N, obj.max_history).*nan;
            % the approximate Hessian for each subfunction is stored
            % as dot(obj.b(:,:,index), obj.b(:,:,index).')
            obj.b = zeros(obj.K_max,2.*obj.max_history,obj.N);
        end

        %% Optimize the objective function.
        function theta = optimize(obj, num_passes)
            % Parameters:
            % num_passes - The number of effective passes through subfunction_references to perform.
            % Returns:
            % theta - The estimated parameter vector after num_passes of optimization.
            num_steps = ceil(num_passes.*obj.N);  % number of learning steps
            for i = 1:num_steps
                obj.optimization_step();  % perform a single optimization step
            end
            theta = obj.theta_flat_to_original(obj.theta);
        end
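        % A hypothetical end-to-end usage sketch (the quadratic subfunctions, sizes, and variable
        % names below are illustrative assumptions, not part of this class):
        %
        %     M = 20; N = 10;                        % parameter dimension, number of minibatches
        %     refs = cell(1, N);
        %     for n = 1:N
        %         refs{n} = struct('A', randn(50, M), 'y', randn(50, 1));  % one minibatch per subfunction
        %     end
        %     f_df = @(theta, ref) deal(0.5*sum((ref.A*theta - ref.y).^2), ref.A.'*(ref.A*theta - ref.y));
        %     optimizer = sfo(f_df, randn(M, 1), refs);
        %     theta_opt = optimizer.optimize(20);    % roughly 20 effective passes through the data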
        %% The following functions are used to realize a single optimization step:
        %
        % The subspace is expanded at each optimization step t to include the most recent gradient direction as an additional
        % column, by finding the component of the gradient vector that lies outside the existing subspace and appending it to
        % the current subspace. The following equations are used:
        %
        % $$ q_{orth}=f_{j}^{'}(x^{t})-P^{t-1}(P^{t-1})^{T} f_{j}^{'}(x^{t}) $$   Equation (7)
        %
        % $$ P^{t}=\left[P^{t-1} \quad \frac{q_{orth}}{\left\|q_{orth}\right\|}\right] $$   Equation (8)
        %
        function update_subspace(obj, x_in)
            % Update the low dimensional subspace by adding a new gradient direction x_in to be incorporated into the subspace.
            if obj.K_current >= obj.M
                return;  % no need to update the subspace if it spans the full space
            end
            if sum(~isfinite(x_in)) > 0
                return;  % bad vector!
            end
            x_in_length = sqrt(sum(x_in.^2));
            if x_in_length < obj.eps
                return;  % nothing to do with a new vector of (nearly) zero length
            end
            xnew = x_in/x_in_length;  % make x unit length

            % Find the component of x pointing out of the existing subspace.
            % This is done multiple times for numerical stability.
            % $$ q_{orth}=f_{j}^{'}(x^{t})-P^{t-1}(P^{t-1})^{T} f_{j}^{'}(x^{t}) $$   Equation (7)
            for i = 1:3
                xnew = xnew - obj.P * (obj.P.'*xnew);
                ss = sqrt(sum(xnew.^2));
                if ss < obj.eps
                    % barely points out of the existing subspace
                    return;  % no need to add a new direction to the subspace
                end
                xnew = xnew / ss;  % make it unit length
                if ss > 0.1
                    break;  % largely orthogonal, so numerical stability is good
                end
            end

            % update the subspace with a new column containing the new direction
            % $$ P^{t}=\left[P^{t-1} \quad \frac{q_{orth}}{\left\|q_{orth}\right\|}\right] $$   Equation (8)
            obj.P(:,obj.K_current+1) = xnew;
            obj.K_current = obj.K_current + 1;

            % Restricting the size of the subspace: in case the subspace dimensionality K^{t} exceeds K_{max}, it is collapsed
            % to a new subspace that includes only the most recent gradient and position measurements from each subfunction,
            % through a QR decomposition.
            if obj.K_current >= obj.K_max
                % collapse the subspace when it exceeds its maximum allowed size
                % xl is passed explicitly to make sure it's used even if it is not in the history yet
                xl = (obj.P.') * (x_in);
                obj.collapse_subspace(xl);  % use the function collapse_subspace below
            end
        end
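        % A standalone sketch of the projection in Equations (7)-(8) (toy sizes and hypothetical
        % variable names; repeating the projection, as update_subspace does above, reduces the
        % round-off error left by a single pass):
        %
        %     P = orth(randn(100, 10));     % an orthonormal subspace basis
        %     g = randn(100, 1);            % a new gradient direction
        %     q = g - P*(P.'*g);            % component of g outside the subspace, Equation (7)
        %     q = q - P*(P.'*q);            % repeat once more for numerical stability
        %     P = [P, q/norm(q)];           % append the unit-length direction, Equation (8)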
        %
        % Collapse the subspace to its smallest dimensionality. xl is a new direction that may not be in the history yet,
        % hence it has to be included explicitly. In case the subspace dimensionality K^{t} exceeds K_{max}, it is collapsed
        % to a new subspace that includes only the most recent gradient and position measurements from each subfunction,
        % through a QR decomposition:
        %
        % $$ P^{'}=orth\left(\left[f_{1}^{'}(x^{\tau_{1}^{t}}) \ldots f_{N}^{'}(x^{\tau_{N}^{t}}) \quad x^{\tau_{1}^{t}} \ldots x^{\tau_{N}^{t}}\right]\right) $$   Equation (9)
        %
        function collapse_subspace(obj, xl)
            % Collapse the subspace to its smallest dimensionality. xl is a new direction that may not
            % be in the history yet, hence it is included explicitly.

            Pl = zeros(obj.K_max,obj.K_max);  % the projection matrix from the old to the new subspace

            % yy will hold all the directions to pack into the subspace; it is initialized with random noise
            % so that it spans K_min dimensions even if not all the subfunctions are active yet
            yy = randn(obj.K_max,obj.K_min);
            % the most recent position & gradient for all active subfunctions, plus the current position & gradient
            % that are not saved in the history yet
            yz = [obj.last_df(:,obj.active), obj.last_theta(:,obj.active), xl, (obj.P.') * (obj.theta)];
            yy(:,1:size(yz, 2)) = yz;
            [Pl(:,1:obj.K_min), ~] = qr(yy, 0);

            obj.P = (obj.P) * (Pl);  % update the subspace
            obj.apply_subspace_transformation(Pl.', Pl);  % apply Pl (the projection matrix from the old to the new basis) to all the history terms
            obj.K_current = obj.K_min;  % update the stored subspace size
            obj.reorthogonalize_subspace();  % re-orthogonalize the subspace if small errors have accumulated
        end

        %
        % Re-orthogonalizes the subspace if small errors have accumulated, by performing an orthogonal-triangular (QR)
        % decomposition of the non-orthogonal subspace.
        %
        function reorthogonalize_subspace(obj)
            % check whether the subspace has become non-orthogonal
            subspace_eigs = eig(obj.P.'*obj.P);
            if max(subspace_eigs) <= 1 + obj.eps
                return
            end
            % perform a QR decomposition of the non-orthogonal subspace
            [Porth,~] = qr(obj.P(:,1:obj.K_current),0);  % orthogonal-triangular decomposition
            Pl = zeros(obj.K_max, obj.K_max);
            Pl(:,1:obj.K_current) = obj.P.' * Porth;
            obj.P(:,1:obj.K_current) = Porth;  % update the subspace
            obj.apply_subspace_transformation(Pl.',Pl);  % apply Pl (the projection matrix from the old to the new basis) to all history terms
        end
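        % A minimal sketch of the collapse in Equation (9) (hypothetical sizes and names): the
        % retained directions are packed into columns and an orthonormal basis is recovered with
        % an economy-size QR decomposition, as collapse_subspace does above:
        %
        %     K_max = 12; Na = 3;
        %     last_df_active = randn(K_max, Na); last_theta_active = randn(K_max, Na);
        %     xl = randn(K_max, 1); theta_proj = randn(K_max, 1);
        %     Y = [last_df_active, last_theta_active, xl, theta_proj];  % directions to keep
        %     [Q, ~] = qr(Y, 0);    % orthonormal basis for span(Y), Equation (9)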
        %
        % Apply a change-of-subspace transformation when the subspace is collapsed, to project into the new lower
        % dimensional subspace, where T_left and T_right are the covariant and contravariant subspace projection
        % matrices, respectively.
        %
        function apply_subspace_transformation(obj,T_left,T_right)
            % Apply a change-of-subspace transformation. This function is called when the subspace is collapsed,
            % to project into the new lower dimensional subspace.
            % T_left - The covariant subspace projection matrix.
            % T_right - The contravariant subspace projection matrix.
            % (Note that for an orthogonal subspace T_left = T_right.)

            [tt, ss] = size(T_left);

            % project history terms into the new subspace
            obj.last_df = (T_right.') * (obj.last_df);
            obj.last_theta = (T_left) * (obj.last_theta);
            obj.hist_deltadf = obj.reshape_wrapper(T_right.' * obj.reshape_wrapper(obj.hist_deltadf, [ss,-1]), [tt,-1,obj.N]);
            obj.hist_deltatheta = obj.reshape_wrapper(T_left * obj.reshape_wrapper(obj.hist_deltatheta, [ss,-1]), [tt,-1,obj.N]);
            % project the stored Hessian for each subfunction into the new subspace
            obj.b = obj.reshape_wrapper(T_right.' * obj.reshape_wrapper(obj.b, [ss,-1]), [tt, 2.*obj.max_history,obj.N]);

            %% RECOMPUTE full_H & theta_proj when the subspace is collapsed, to avoid slow accumulation of numerical errors
            obj.theta_proj = (obj.P.') * (obj.theta);  % theta projected into the current working subspace
            obj.full_H = real(obj.reshape_wrapper(obj.b,[ss,-1]) * obj.reshape_wrapper(obj.b,[ss,-1]).');  % full approximate Hessian
        end

        %
        % Only one subfunction $g_{j}^t(x)$, of index $j$, is chosen for update per step, according to Equation (13):
        % the one whose last evaluation is farthest from the current location (and hence least accurate):
        %
        % $$ j= argmax_{i} \, [x^{t}-x^{\tau_{i}}]^{T} Q^{t} [x^{t}-x^{\tau_{i}}] $$   Equation (13)
        %
        % where ${\tau_{i}}$ is the time at which subfunction $i$ was last evaluated, and Q is set either to the
        % approximate Hessian for that subfunction (a more accurate measure of distance) or to the full Hessian
        % (which avoids relying on a possibly bad Hessian estimate for a single subfunction).
        %
        function indx = get_target_index(obj)
            % Choose which subfunction to update in the current learning iteration.

            % If an active subfunction has fewer than two observations, evaluate it again:
            % two evaluations per subfunction are needed to estimate its Hessian.
            gd = find((obj.eval_count < 2) & obj.active);
            if ~isempty(gd)
                indx = gd(randperm(length(gd), 1));
                return
            end

            % choose the subfunction whose last evaluation is farthest away, measured either by the total
            % (weighted) Hessian or by the subfunction's own Hessian
            if randn() < 0
                max_dist = -1;
                indx = -1;
                for i = 1:obj.N
                    dtheta = obj.theta_proj - obj.last_theta(:,i);
                    bdtheta = obj.b(:,:,i).' * dtheta;
                    dist = sum(bdtheta.^2) + sum(dtheta.^2)*obj.min_eig_sub(i);
                    if (dist > max_dist) && obj.active(i)
                        max_dist = dist;
                        indx = i;
                    end
                end
            else
                dtheta = bsxfun(@plus, obj.theta_proj, -obj.last_theta);  % difference between the current theta & each subfunction's most recent evaluation
                full_H_combined = obj.get_full_H_with_diagonal();  % full Hessian
                distance = sum(dtheta.*(full_H_combined * dtheta), 1);  % squared distance
                [~, dist_ord] = sort(-distance);  % sort the distances from largest to smallest
                dist_ord = dist_ord(obj.active(dist_ord));  % keep indices of active subfunctions
                indx = dist_ord(1);  % choose the active subfunction that is farthest from the current location
            end
        end

        %
        % The chosen subfunction of index indx is updated using a second order power series (Equation (6)) around
        % the vector $x^t$ chosen in step (i):
        %
        % $$ g_{i}^{t}(x)=f_{i}(x^{t})+(x-x^{t})^{T}f_{i}^{'}(x^{t})+\frac{1}{2}(x-x^{t})^{T}H_{i}^{t}(x-x^{t}) $$   Equation (6)
        %
        % where the quadratic term $H_{i}^{t}$ is the Hessian of this subfunction, approximated using the BFGS
        % algorithm based on the subfunction's history of gradient evaluations (details in step 4).
        %
        % The following function gets an estimate of subfunction indx at theta_proj using the second order power series:
        function f_pred = get_predicted_subf(obj, indx, theta_proj)
            % Get the predicted value of subfunction indx at theta_proj, which lies inside the subspace.
            dtheta = theta_proj - obj.last_theta(:,indx);  % (x-x^{t})
            bdtheta = obj.b(:,:,indx).' * dtheta;
            Hdtheta = real(obj.b(:,:,indx) * bdtheta);
            Hdtheta = Hdtheta + dtheta.*obj.min_eig_sub(indx);  % the diagonal contribution
            f_pred = obj.hist_f(indx,1) + obj.last_df(:,indx).' * dtheta + 0.5.*(dtheta.') * (Hdtheta);
        end
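        % A toy illustration of evaluating the quadratic model of Equation (6) with the same
        % low-rank plus diagonal Hessian representation (H = b*b.' + min_eig*I) that this class
        % stores per subfunction (all names below are hypothetical):
        %
        %     K = 8; b_i = randn(K, 4); min_eig = 0.1;
        %     f_last = 1.3; df_last = randn(K, 1);
        %     dtheta  = randn(K, 1);                          % x - x^t in the subspace
        %     Hdtheta = b_i*(b_i.'*dtheta) + min_eig*dtheta;  % H*(x - x^t) without forming H
        %     g_pred  = f_last + df_last.'*dtheta + 0.5*dtheta.'*Hdtheta;  % Equation (6)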
        %
        % Online Hessian approximation: this section contains the calculations needed to maintain an independent Hessian
        % approximation for each subfunction, computed through the BFGS algorithm on that subfunction's history of gradient
        % evaluations and positions.
        %
        % For each subfunction ${g_{j}^t}(x)$, two matrices $\Delta f^{'}$ and $\Delta x$ are constructed to hold the change in
        % subfunction gradient and the corresponding change in position x between successive evaluations. This history is
        % updated using the following function, which updates the history of position differences & gradient differences
        % for subfunction indx.
        %
        function update_history(obj, indx, theta_proj, f, df_proj)
            % Update the history of position differences & gradient differences for subfunction indx.
            if obj.eval_count(indx) > 1
                % if there exists at least one earlier measurement from this subfunction,
                % then use it to compute position & gradient differences
                ddf = df_proj - obj.last_df(:,indx);  % difference in gradient
                ddt = theta_proj - obj.last_theta(:,indx);  % difference in position
                lddf = sqrt(sum(ddf.^2));  % length of the gradient change vector (l_2 norm)
                lddt = sqrt(sum(ddt.^2));  % length of the position change vector (l_2 norm)
                corr_ddf_ddt = ddf.' * ddt / (lddt*lddf);

                % update the corresponding history, unless the changes in theta & gradient and their inner product
                % are all vanishingly small
                if lddt > obj.eps || lddf > obj.eps || abs(corr_ddf_ddt) > obj.eps
                    obj.hist_deltatheta(:,2:end,indx) = obj.hist_deltatheta(:,1:end-1,indx);  % shift the history by one timestep
                    obj.hist_deltatheta(:,1,indx) = ddt;  % store the difference in theta since the subfunction was last evaluated
                    obj.hist_deltadf(:,2:end,indx) = obj.hist_deltadf(:,1:end-1,indx);  % shift the history by one timestep
                    obj.hist_deltadf(:,1,indx) = ddf;  % store the change in gradient since the subfunction was last evaluated
                end
            end

            obj.last_theta(:,indx) = theta_proj;
            obj.last_df(:,indx) = df_proj;
            obj.hist_f(indx,2:end) = obj.hist_f(indx,1:end-1);
            obj.hist_f(indx,1) = f;
        end

        %
        % The Hessian approximation for a single subfunction indx is then updated using the BFGS algorithm, as detailed in
        % the function update_hessian(obj,indx), which iterates through the columns of $\Delta f^{'}$ and $\Delta x$ from
        % oldest to most recent. At each column of index s the approximate Hessian should satisfy:
        %
        % * The secant equation
        %
        % $$ \Delta f_{s}^{'}=B_s \Delta x_s $$
        %
        % * The smallest weighted Frobenius norm distance between the current and prior Hessian estimates (the standard
        %   BFGS update equation)
        %
        % $$ B_{s}= B_{s-1}+ \frac{\Delta f_{s}^{'} \Delta f_{s}^{'^{T}}}{\Delta f_{s}^{'^{T}} \Delta x_{s}}- \frac{B_{s-1}\Delta x_{s} \Delta x_{s}^{T} B_{s-1}}{\Delta x_{s}^{T} B_{s-1}\Delta x_{s}} $$   Equation (10)
        %
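        % For reference, a sketch of the standard BFGS update of Equation (10) written directly on
        % a dense matrix B (toy data, hypothetical names; update_hessian below applies the same
        % update through the factored representation b_p instead):
        %
        %     K = 6; B = eye(K);          % prior Hessian estimate B_{s-1}
        %     s = randn(K, 1);            % \Delta x_s
        %     y = B*s + 0.1*randn(K, 1);  % \Delta f'_s (a toy "observed" gradient change)
        %     B = B + (y*y.')/(y.'*s) - (B*(s*s.')*B)/(s.'*B*s);  % Equation (10)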
        function update_hessian(obj, indx)
            %% BFGS initialization:
            gd = find(sum(obj.hist_deltatheta(:,:,indx).^2, 1)>0);  % test whether there is any history for this subfunction
            num_gd = length(gd);

            %% CASE (I): insufficient history to run BFGS
            if num_gd == 0
                % if there is no history, initialize using the median eigenvalue of the full Hessian of the other subfunctions
                obj.b(:,:,indx) = 0.;
                H = obj.get_full_H_with_diagonal();
                [U, ~] = obj.eigh_wrapper(H);
                obj.min_eig_sub(indx) = median(U)/sum(obj.active);  % median eigenvalue of the average Hessian of the other active subfunctions
                obj.max_eig_sub(indx) = obj.min_eig_sub(indx);
                if obj.eval_count(indx) > 2
                    if sum(obj.eval_count) < 5
                        % Subfunction was evaluated multiple times, but has no stored history.
                        % SFO should probably have been initialized with a smaller hessian_init value, so scale it down here.
                        obj.min_eig_sub(indx) = obj.min_eig_sub(indx) / 10.;
                    end
                end
                return;
            end

            %% CASE (II): enough history to run BFGS
            % work in the subspace defined by this subfunction's history (P_hist)
            [P_hist, ~] = qr([obj.hist_deltatheta(:,gd,indx),obj.hist_deltadf(:,gd,indx)], 0);
            deltatheta_P = P_hist.' * obj.hist_deltatheta(:,gd,indx);
            deltadf_P = P_hist.' * obj.hist_deltadf(:,gd,indx);

            %% approximate the smallest eigenvalue \beta by solving the squared secant equation for the full history;
            %% \beta will be used as the diagonal initialization for BFGS (i.e. B_0 = \beta I)
            % calculate the Hessian using pinv & the squared secant equation:
            %     \Delta f_{s}^{'}=B_s \Delta x_s   (secant equation, df = H dx)
            %     (df^T df = dx^T H^T H dx = dx^T H^2 dx)
            %     Q=\left[(\Delta x)^{+^{T}} (\Delta f^{'})^{T} \Delta f^{'} (\Delta x)^{+}\right]^{1/2}   Equation (12)
            pdelthet = pinv(deltatheta_P);
            dd = (deltadf_P) * (pdelthet);
            H2 = (dd.') * (dd);
            [H2w, ~] = obj.eigh_wrapper(H2);
            H2w = sqrt(abs(H2w));
            H2w = sort(H2w, 'descend');  % only the top (~num_gd) eigenvalues are expected to be well defined
            H2w = H2w(1:num_gd);

            % Enforcing positive definiteness of the estimated Hessian: each estimated $H_{i}^{t}$ is constrained to be
            % positive definite by explicit eigendecomposition, setting any too-small eigenvalues to the median positive
            % eigenvalue.
            if min(H2w) == 0 || sum(~isfinite(H2w)) > 0
                % a (near-)zero or non-finite eigenvalue indicates a poor estimate of curvature: this history is unusable,
                % due to degenerate deltadf (zero case) or deltatheta (non-finite case)
                H2w(:) = max(obj.min_eig_sub(obj.active));  % initialize using the other subfunctions
            end
            obj.min_eig_sub(indx) = min(H2w);
            obj.max_eig_sub(indx) = max(H2w);

            % constrain the Hessian initialization
            if obj.min_eig_sub(indx) < obj.max_eig_sub(indx)/obj.hess_max_dev
                % eigenvalues smaller than (\gamma \lambda_{max}) are constrained using the allowed ratio
                obj.min_eig_sub(indx) = obj.max_eig_sub(indx)/obj.hess_max_dev;
            end

            %% Recalculate the Hessian
            num_hist = size(deltatheta_P, 2);  % number of history terms (columns)
            % the new Hessian will be (b_p)*(b_p.') + eye().*obj.min_eig_sub(indx)
            b_p = zeros(size(P_hist,2), num_hist*2);
            % step through the history, from oldest to most recent
            for hist_i = num_hist:-1:1
                s = deltatheta_P(:,hist_i);
                y = deltadf_P(:,hist_i);
                % for numerical stability
                rscl = sqrt(sum(s.^2));
                s = s/rscl;
                y = y/rscl;
                % the BFGS step proper
                Hs = s.*obj.min_eig_sub(indx) + b_p*((b_p.')*(s));
                term1 = y/sqrt(sum(y.*s));
                sHs = sum(s.*Hs);
                term2 = sqrt(complex(-1.)).*Hs/sqrt(sHs);
                if sum(~isfinite(term1)) > 0 || sum(~isfinite(term2)) > 0
                    obj.min_eig_sub(indx) = max(H2w);  % invalid BFGS history term
                    continue;
                end
                b_p(:,2*(hist_i-1)+2) = term1;
                b_p(:,2*(hist_i-1)+1) = term2;
            end

            H = real((b_p) * (b_p.')) + eye(size(b_p, 1))*obj.min_eig_sub(indx);
            % constrain the Hessian to be positive definite
            [U, V] = obj.eigh_wrapper(H);
            if max(U) <= 0.
                % if there aren't any positive eigenvalues after BFGS,
                % set them all to the same conservative diagonal value
                U(:) = obj.max_eig_sub(indx);
            end
            % set any too-small eigenvalues to the median positive eigenvalue
            U_median = median(U(U>0));
            U(U<(max(abs(U))/obj.hess_max_dev)) = U_median;
            % the Hessian after it has been forced to be positive definite
            H_posdef = bsxfun(@times, V, U') * V.';
            % break it apart into the matrix b & a diagonal term again
            B_pos = H_posdef - eye(size(b_p, 1))*obj.min_eig_sub(indx);
            [U, V] = obj.eigh_wrapper(B_pos);
            b_p = bsxfun(@times, V, sqrt(obj.reshape_wrapper(U, [1,-1])));
            obj.b(:,:,indx) = 0.;
            obj.b(:,1:size(b_p,2),indx) = (P_hist)*(b_p);
        end
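        % A small sketch of the positive-definiteness repair used in update_hessian above
        % (hypothetical names): eigendecompose a symmetric matrix, replace the too-small or
        % negative eigenvalues, and rebuild it. (The method above additionally handles the case
        % where no eigenvalue is positive before taking the median.)
        %
        %     H = randn(5); H = 0.5*(H + H.');             % a symmetric, possibly indefinite matrix
        %     [V, D] = eig(H); d = diag(D);
        %     d(d < max(abs(d))/1e8) = median(d(d > 0));   % floor tiny/negative eigenvalues
        %     H_posdef = V*diag(d)*V.';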
        %
        % The full approximate Hessian is obtained using the following function:
        function full_H_combined = get_full_H_with_diagonal(obj)
            % Get the full approximate Hessian, including the diagonal terms that are not stored in obj.full_H.
            full_H_combined = obj.full_H + eye(obj.K_max).*sum(obj.min_eig_sub(obj.active));
        end

        %
        % Detecting bad updates: bad proposed parameter updates are identified using the function
        % *handle_step_failure(obj, f, df_proj, indx)*. Since only one subfunction is evaluated per update step, a line
        % search on the full objective is impossible; updates are therefore instead labelled bad when the value of a
        % subfunction has increased since its previous evaluation, and also exceeds its approximating function by more
        % than the corresponding reduction in the summed approximating function
        % (i.e. $f_j(x^t) - g_j^{t-1}(x^t) > G^{t-1}(x^{t-1}) - G^{t-1}(x^{t})$).
        %
        % When a bad update is detected, $x^t$ is reset to its previous value $x^{t-1}$, the BFGS history matrices are
        % also updated to include the change in gradient in the failed update direction, and the step length is updated
        % as follows:
        %
        % $$ \eta =\left\{\begin{matrix}\frac{1}{N}+\frac{N-1}{N}\,\eta^{t} & \text{successful update}\\ \frac{1}{2}\,\eta^{t} & \text{failed update}\end{matrix}\right. $$   Equation (16)
        %
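        % Equation (16) as a standalone sketch (hypothetical scalar variables): on success the step
        % scale decays back toward 1 over roughly N steps, on failure it is halved:
        %
        %     N = 10; step_scale = 1; step_failure = false;
        %     if ~step_failure
        %         step_scale = 1/N + step_scale*(1 - 1/N);   % decay back toward 1
        %     else
        %         step_scale = step_scale/2;                 % halve the step length
        %     end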
        function [step_failure, f, df_proj] = handle_step_failure(obj, f, df_proj, indx)
            % Check whether an update step failed, and update the current position accordingly.
            step_failure = false;
            % check to see whether the step should be considered a failure
            if ~isfinite(f) || sum(~isfinite(df_proj))>0
                step_failure = true;  % if the function or its gradient is not finite, then it's a failure
            elseif obj.eval_count(indx) == 1
                % a new subfunction with only one evaluation
                if max(obj.eval_count) > 1
                    if f > mean(obj.hist_f(obj.eval_count>1,1)) + 3*std(obj.hist_f(obj.eval_count>1,1))
                        step_failure = true;  % much larger than expected, so treat it as a failure
                    end
                end
            elseif f > obj.hist_f(indx,1)
                % the subfunction has increased in value compared to its history
                % (in this case, check whether it's larger than its predicted value by enough to trigger a failure)
                f_pred = obj.get_predicted_subf(indx, obj.theta_proj);  % the predicted value of this subfunction
                % does the subfunction exceed its predicted value by more than the predicted improvement in the other subfunctions?
                predicted_improvement_others = obj.f_predicted_total_improvement - (obj.hist_f(indx,1) - f_pred);
                if f - f_pred > predicted_improvement_others
                    step_failure = true;  % then the step is a failure
                end
            end

            % update the step length for both cases (Equation (16))
            if ~step_failure
                obj.step_scale = 1./obj.N + obj.step_scale .* (1. - 1./obj.N);  % decay the step_scale back towards 1
            else
                obj.step_scale = obj.step_scale / 2.;  % shorten the step length

                % the subspace may be updated during the function calls, so store this in the full space
                df = (obj.P) * (df_proj);

                [f_lastpos, df_lastpos_proj] = obj.f_df_wrapper(obj.theta_prior_step, indx);
                df_lastpos = (obj.P) * (df_lastpos_proj);

                % if the function value exploded, then back it off until it's within a reasonable order of magnitude
                % before adding anything to the history
                f_pred = obj.get_predicted_subf(indx, obj.theta_proj);
                if isfinite(obj.hist_f(indx,1))
                    predicted_f_diff = abs(f_pred - obj.hist_f(indx,1));
                else
                    predicted_f_diff = abs(f - f_lastpos);
                end
                if ~isfinite(predicted_f_diff) || predicted_f_diff < obj.eps
                    predicted_f_diff = obj.eps;
                end

                for i_ls = 1:10
                    if f - f_lastpos < 10*predicted_f_diff
                        % the failed update is within an order of magnitude of the target update value
                        break;  % no backoff required
                    end
                    % make the step length a factor of 100 shorter
                    obj.theta = 0.99.*obj.theta_prior_step + 0.01.*obj.theta;
                    obj.theta_proj = (obj.P.') * (obj.theta);
                    % recompute f & df at this new location
                    [f, df_proj] = obj.f_df_wrapper(obj.theta, indx);
                    df = (obj.P) * (df_proj);
                end

                % move into the subspace
                df_proj = (obj.P.') * (df);
                df_lastpos_proj = (obj.P.') * (df_lastpos);

                if f < f_lastpos
                    % the step failed, but the last position was even worse (the objective at the new position is better)
                    theta_lastpos_proj = (obj.P.') * (obj.theta_prior_step);
                    obj.update_history(indx, theta_lastpos_proj, f_lastpos, df_lastpos_proj);  % add the newly evaluated point to the history
                else
                    % the step failed and the proposed position is worse than the last position:
                    % add the change in theta & the change in gradient to the history for this subfunction
                    % before failing over to the last position
                    if isfinite(f) && sum(~isfinite(df_proj))==0
                        obj.update_history(indx, obj.theta_proj, f, df_proj);
                    end
                    f = f_lastpos;
                    df_proj = df_lastpos_proj;
                    obj.theta = obj.theta_prior_step;
                    obj.theta_proj = (obj.P.') * (obj.theta);
                end
            end
        end
        %
        % Growing the number of active subfunctions: the algorithm begins with only a small number of active subfunctions
        % (two in this implementation) and expands the active set as learning progresses, by one subfunction every time the
        % average gradient shrinks to within a factor $\alpha$ of the standard error in the average gradient, i.e. the
        % active subset is incremented whenever
        %
        % $$ (\bar{f}^{'t})^{T} (H^{t})^{-1}\bar{f}^{'t} < \alpha \frac{\sum_{i}\left(f_{i}^{'}(x^{\tau_{i}^{t}})-\bar{f}^{'t}\right)^{T} (H^{t})^{-1}\left(f_{i}^{'}(x^{\tau_{i}^{t}})-\bar{f}^{'t}\right)}{(N^{t}-1)N^{t}} $$   Equation (14)
        %
        % where $N^{t}$ is the active subset size at time t, $H^{t}$ is the full Hessian, and $\bar{f}^{'t}$ is the average
        % gradient, given by:
        %
        % $$ \bar{f}^{'t}= \frac{1}{N^t} \sum_{i} f_{i}^{'} (x^{\tau_{i}^{t}}) $$   Equation (15)
        %
        % The active subset is also increased when a bad update is detected, or when a full pass through the active batch
        % occurs without a batch size increase.
        %
        function expand_active_subfunctions(obj, full_H_inv, step_failure)
            % expand the set of active subfunctions as appropriate

            % power in the average gradient direction
            df_avg = mean(obj.last_df(:,obj.active), 2);  % average gradient (Equation (15))
            p_df_avg = sum(df_avg .* (full_H_inv * df_avg));  % left hand side of Equation (14)
            % power of the standard error
            ldfs = obj.last_df(:,obj.active) - df_avg*ones(1,sum(obj.active));
            num_active = sum(obj.active);  % size of the active subset
            p_df_sum = sum(sum(ldfs .* (full_H_inv * ldfs))) / num_active / (num_active - 1);

            % increase the size of the active set if the standard error in the estimated gradient is of the same order
            % of magnitude as the gradient itself
            increase_desirable = (p_df_sum >= p_df_avg.*obj.max_gradient_noise);
            % increase the size of the active set on a step failure
            increase_desirable = increase_desirable || step_failure;
            % increase the size of the active set if a full pass has been done without updating it
            increase_desirable = increase_desirable || (obj.iter_since_active_growth > num_active);
            % check that all active subfunctions have enough evaluations for a Hessian approximation
            % before adding new subfunctions
            eligible_for_increase = (min(obj.eval_count(obj.active)) >= 2);
            % one more iteration has passed since the active set was last expanded
            obj.iter_since_active_growth = obj.iter_since_active_growth + 1;

            if increase_desirable && eligible_for_increase && sum(~obj.active) > 0
                % choose an inactive subfunction to activate, at random
                new_gd = find(~obj.active);
                new_gd = new_gd(randperm(length(new_gd), 1));
                if ~isempty(new_gd)
                    obj.iter_since_active_growth = 0;
                    obj.active(new_gd) = true;
                end
            end
        end
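        % Equations (14)-(15) as a standalone sketch (hypothetical names): the active set is grown
        % when the squared length of the mean gradient, measured through the inverse Hessian, falls
        % below the scaled standard error:
        %
        %     Na = 4; K = 6; alpha = 1;
        %     last_df_active = randn(K, Na); H_inv = eye(K);
        %     df_avg   = mean(last_df_active, 2);                           % Equation (15)
        %     p_mean   = df_avg.' * (H_inv * df_avg);
        %     resid    = bsxfun(@minus, last_df_active, df_avg);
        %     p_stderr = sum(sum(resid .* (H_inv*resid))) / (Na*(Na - 1));
        %     grow     = p_mean < alpha * p_stderr;                         % Equation (14)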
        %
        % Perform a single optimization step. Choose an index using the get_target_index() function.
        % Calculate the gradient, the inverse of the Hessian, and finally the Newton step.
        % The Newton step length is adjusted in order to avoid numerical errors caused by very small steps, or bouncing
        % around the optimum caused by overly large steps. The active subfunction set is expanded at the end.
        function optimization_step(obj)
            % Perform a single optimization step.
            time_pass_start = tic();

            indx = obj.get_target_index();  %% choose an index to update

            [f, df_proj] = obj.f_df_wrapper(obj.theta, indx);  % evaluate the subfunction value & gradient at the new position

            % check for a failed update step, & adjust f, df, & obj.theta as appropriate if one occurs
            [step_failure, f, df_proj] = obj.handle_step_failure(f, df_proj, indx);

            % add the change in theta & the change in gradient to the history for this subfunction
            obj.update_history(indx, obj.theta_proj, f, df_proj);

            % increment the total distance traveled using the last update
            obj.total_distance = obj.total_distance + sqrt(sum((obj.theta - obj.theta_prior_step).^2));

            % the current contribution from this subfunction to the total Hessian approximation
            H_pre_update = real(obj.b(:,:,indx) * obj.b(:,:,indx).');
            obj.update_hessian(indx);  %% update this subfunction's Hessian estimate
            % the new contribution from this subfunction to the total approximate Hessian
            H_new = real(obj.b(:,:,indx) * obj.b(:,:,indx).');
            obj.full_H = obj.full_H + H_new - H_pre_update;  % update the total Hessian using this subfunction's updated contribution

            % calculate the total gradient, total Hessian, & total function value at the current location
            full_df = 0.;
            for i = 1:obj.N
                dtheta = obj.theta_proj - obj.last_theta(:,i);
                bdtheta = obj.b(:,:,i).' * dtheta;
                Hdtheta = real(obj.b(:,:,i) * bdtheta);
                Hdtheta = Hdtheta + dtheta.*obj.min_eig_sub(i);  % the diagonal contribution
                full_df = full_df + Hdtheta + obj.last_df(:,i);
            end
            full_H_combined = obj.get_full_H_with_diagonal();
            full_H_inv = inv(full_H_combined);

            % calculate an update step
            dtheta_proj = -(full_H_inv) * (full_df) .* obj.step_scale;  % the Newton step described in Equation (5)

            dtheta_proj_length = sqrt(sum(dtheta_proj(:).^2));
            if dtheta_proj_length < obj.minimum_step_length
                % force the update to have the minimum step length, to avoid numerical errors
                dtheta_proj = dtheta_proj*obj.minimum_step_length/dtheta_proj_length;
                dtheta_proj_length = obj.minimum_step_length;
            end
            if sum(obj.eval_count) > obj.N && dtheta_proj_length > obj.eps
                % only allow a step to be up to a factor of max_step_length_ratio longer than the average step length
                avg_length = obj.total_distance / sum(obj.eval_count);
                length_ratio = dtheta_proj_length / avg_length;
                ratio_scale = obj.max_step_length_ratio;
                if length_ratio > ratio_scale
                    % truncate the step length from dtheta_proj_length to ratio_scale*avg_length
                    dtheta_proj_length = dtheta_proj_length/(length_ratio/ratio_scale);
                    dtheta_proj = dtheta_proj/(length_ratio/ratio_scale);
                end
            end

            dtheta = (obj.P) * (dtheta_proj);  % the update to theta, in the full dimensional space
            obj.theta_prior_step = obj.theta;  % back up the prior position, in case this is a failed step
            obj.theta = obj.theta + dtheta;    % update theta to the new location
            obj.theta_proj = obj.theta_proj + dtheta_proj;

            % the predicted improvement from this update step
            obj.f_predicted_total_improvement = 0.5 .* dtheta_proj.' * (full_H_combined * dtheta_proj);

            obj.expand_active_subfunctions(full_H_inv, step_failure);  %% expand the set of active subfunctions as appropriate

            % record how much time was taken by this learning step
            time_diff = toc(time_pass_start);
            obj.time_pass = obj.time_pass + time_diff;
        end
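        % A condensed sketch of the update direction of Equation (5) (hypothetical names, and using
        % the backslash operator rather than an explicit inverse for the sketch; optimization_step
        % above additionally enforces minimum and maximum step lengths):
        %
        %     K = 6; full_df = randn(K, 1); full_H = 2*eye(K); step_scale = 0.5;
        %     dtheta_proj = -step_scale * (full_H \ full_df);   % Newton step in the subspace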
        %%
        function theta_flat = theta_original_to_flat(obj, theta_original)
            % Convert from the original parameter format into a 1d array.
            % The original format can be an array, or a cell array full of arrays.
            if iscell(theta_original)
                theta_length = 0;
                for theta_array = theta_original(:)'  % iterate over cells
                    theta_length = theta_length + numel(theta_array{1});
                end
                theta_flat = zeros(theta_length,1);
                i_start = 1;
                for theta_array = theta_original(:)'  % iterate over cells
                    i_end = i_start + numel(theta_array{1})-1;
                    theta_flat(i_start:i_end) = theta_array{1}(:);
                    i_start = i_end+1;
                end
            else
                theta_flat = theta_original(:);
            end
        end

        function theta_new = theta_flat_to_original(obj, theta_flat)
            % Convert from a 1d array back into the original parameter format.
            if iscell(obj.theta_original)
                theta_new = cell(size(obj.theta_original));
                i_start = 1;
                jj = 1;
                for theta_array = obj.theta_original(:)'  % iterate over cells
                    i_end = i_start + numel(theta_array{1})-1;
                    theta_new{jj} = obj.reshape_wrapper(theta_flat(i_start:i_end), size(theta_array{1}));
                    i_start = i_end + 1;
                    jj = jj + 1;
                end
            else
                theta_new = obj.reshape_wrapper(theta_flat, size(obj.theta_original));
            end
        end

        function [f, df_proj, df_full] = f_df_wrapper(obj, theta_in, idx)
            % A wrapper around the subfunction objective f_df that handles the transformation
            % into & out of the flattened parameterization used internally by SFO.
            theta_local = obj.theta_flat_to_original(theta_in);
            % evaluate
            t = tic();
            [f, df_full] = obj.f_df(theta_local, obj.subfunction_references{idx}, obj.varargin_stored{:});
            time_diff = toc(t);
            obj.time_func = obj.time_func + time_diff;  % time spent in function evaluation
            df_full = obj.theta_original_to_flat(df_full);
            % update the subspace with the new gradient direction
            obj.update_subspace(df_full);
            % gradient projected into the current subspace
            df_proj = (obj.P.') * (df_full);
            % keep a record of function evaluations
            obj.hist_f_flat = [obj.hist_f_flat, f];
            obj.eval_count(idx) = obj.eval_count(idx) + 1;
        end
    end

    methods(Static)
        function A = reshape_wrapper(A, shape)
            % A wrapper for reshape which duplicates the numpy behavior, setting
            % any -1 dimensions to the appropriate length.
            total_dims = numel(A);
            total_assigned = prod(shape(shape>0));
            shape(shape==-1) = total_dims/total_assigned;
            A = reshape(A, shape);
        end

        function [U, V] = eigh_wrapper(A)
            % A wrapper which duplicates the output order and format of the numpy
            % eigh routine (and enforces A to be symmetric).
            [V,U] = eig(0.5 * (A + A'));
            U = diag(U);
        end
    end
end
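% A hypothetical example of optimizing parameters stored as a cell array (e.g. the weights and
% biases of a linear model); the flattening helpers above make this transparent to the user:
%
%     theta_init = {randn(10, 5), zeros(10, 1)};       % e.g. {W, b}
%     % f_df receives theta in the same cell format and must return the gradient in that same
%     % format, e.g. [f, dfdtheta] = linear_f_df(theta, ref) with dfdtheta = {dW, db},
%     % where linear_f_df is a user-supplied function and refs is as in the earlier sketch.
%     optimizer = sfo(@linear_f_df, theta_init, refs);
%     theta_fit = optimizer.optimize(10);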