Source code for copt.gradient_descent

import warnings
import numpy as np
from scipy import optimize, linalg, sparse
from datetime import datetime
from numba import njit

# .. local imports ..
from .utils import DummyLoss
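
DummyLoss is imported from copt's utils module and is not shown on this page. From the way it is used below (the penalty is evaluated as g(x) and its proximal operator is called as g.prox(x, step_size)), it appears to act as a zero penalty whose prox is the identity. A minimal stand-in under that assumption might look like the following sketch; the class name _ZeroPenalty is hypothetical and this is not copt's actual implementation:

class _ZeroPenalty:
    """Hypothetical stand-in for DummyLoss: zero penalty, identity prox."""

    def __call__(self, x):
        # the penalty contributes nothing to the objective
        return 0.

    def prox(self, x, step_size):
        # the proximal operator of the zero function is the identity
        return x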


def minimize_PGD(f, g=None, x0=None, tol=1e-12, max_iter=100, verbose=0,
                 callback=None, backtracking: bool=True, step_size=None,
                 max_iter_backtracking=100, backtracking_factor=0.6,
                 trace=False) -> optimize.OptimizeResult:
    """Proximal gradient descent.

    Solves problems of the form

        minimize_x f(x) + g(x)

    where we have access to the gradient of f and to the proximal
    operator of g.

    Arguments:
        f : loss function (smooth).
        g : penalty term (proximal).
        x0 : array-like, optional
            Initial guess.
        backtracking : boolean
            Whether to perform backtracking (i.e. line-search) or not.
        max_iter : int
            Maximum number of iterations.
        verbose : int
            Verbosity level, from 0 (no output) to 2 (output on each iteration).
        step_size : float
            Starting value for the line-search procedure.
        callback : callable
            Callback function (optional).

    Returns:
        res : The optimization result represented as a
            ``scipy.optimize.OptimizeResult`` object. Important attributes are:
            ``x`` the solution array, ``success`` a Boolean flag indicating if
            the optimizer exited successfully and ``message`` which describes
            the cause of the termination. See `scipy.optimize.OptimizeResult`
            for a description of other attributes.

    References:
        Beck, Amir, and Marc Teboulle. "Gradient-based algorithms with
        applications to signal recovery." Convex optimization in signal
        processing and communications (2009)
    """
    if x0 is None:
        xk = np.zeros(f.n_features)
    else:
        xk = np.array(x0, copy=True)
    if not max_iter_backtracking > 0:
        raise ValueError('Line search iterations need to be greater than 0')

    if g is None:
        g = DummyLoss()

    if step_size is None:
        # .. sample to estimate the Lipschitz constant of the gradient ..
        step_size_n_sample = 5
        L = []
        for _ in range(step_size_n_sample):
            x_tmp = np.random.randn(f.n_features)
            x_tmp /= linalg.norm(x_tmp)
            L.append(linalg.norm(f.gradient(xk) - f.gradient(x_tmp)))
        # .. give it a generous upper bound ..
        step_size = 1. / np.mean(L)

    success = False
    trace_func = []
    trace_time = []
    trace_x = []
    start_time = datetime.now()

    it = 1
    # .. a while loop instead of a for loop ..
    # .. allows for infinite or floating point max_iter ..
    if trace:
        trace_x.append(xk.copy())
        trace_func.append(f(xk) + g(xk))
        trace_time.append((datetime.now() - start_time).total_seconds())

    while it <= max_iter:
        # .. compute gradient and step size ..
        current_step_size = step_size
        grad_fk = f.gradient(xk)
        x_next = g.prox(xk - current_step_size * grad_fk, current_step_size)
        incr = x_next - xk
        if backtracking:
            fk = f(xk)
            f_next = f(x_next)
            for _ in range(max_iter_backtracking):
                if f_next <= fk + grad_fk.dot(incr) + incr.dot(incr) / (2.0 * current_step_size):
                    # .. step size found ..
                    break
                else:
                    # .. backtracking, reduce step size ..
                    current_step_size *= backtracking_factor
                    x_next = g.prox(xk - current_step_size * grad_fk, current_step_size)
                    incr = x_next - xk
                    f_next = f(x_next)
            else:
                warnings.warn("Maximum number of line-search iterations reached")
        certificate = np.linalg.norm((xk - x_next) / step_size)
        xk[:] = x_next

        if trace:
            trace_x.append(xk.copy())
            trace_func.append(f(xk) + g(xk))
            trace_time.append((datetime.now() - start_time).total_seconds())

        if verbose > 0:
            print("Iteration %s, step size: %s" % (it, step_size))

        if certificate < tol:
            if verbose:
                print("Achieved relative tolerance at iteration %s" % it)
            success = True
            break

        if callback is not None:
            callback(xk)
        it += 1

    if it >= max_iter:
        warnings.warn(
            "proximal_gradient did not reach the desired tolerance level",
            RuntimeWarning)

    return optimize.OptimizeResult(
        x=xk, success=success,
        certificate=certificate,
        nit=it,
        trace_x=np.array(trace_x),
        trace_func=np.array(trace_func),
        trace_time=trace_time)
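
As a usage illustration (not part of the original module), the following sketch applies minimize_PGD to an l1-regularized least-squares problem. The SquaredLoss and L1Penalty classes are hypothetical stand-ins that only implement the interface the solver relies on (``__call__``, ``gradient`` and ``n_features`` for the smooth term; ``__call__`` and ``prox`` for the penalty); copt's own utils module provides its real loss and penalty objects, which may differ.

import numpy as np


class SquaredLoss:
    """Hypothetical smooth term f(w) = 0.5 * ||A w - b||^2 / n_samples."""

    def __init__(self, A, b):
        self.A, self.b = A, b
        self.n_features = A.shape[1]

    def __call__(self, w):
        return 0.5 * np.linalg.norm(self.A.dot(w) - self.b) ** 2 / self.A.shape[0]

    def gradient(self, w):
        return self.A.T.dot(self.A.dot(w) - self.b) / self.A.shape[0]


class L1Penalty:
    """Hypothetical proximal term g(w) = reg * ||w||_1."""

    def __init__(self, reg):
        self.reg = reg

    def __call__(self, w):
        return self.reg * np.abs(w).sum()

    def prox(self, w, step_size):
        # soft-thresholding: the proximal operator of reg * ||.||_1
        return np.sign(w) * np.maximum(np.abs(w) - self.reg * step_size, 0.)


rng = np.random.RandomState(0)
A = rng.randn(50, 20)
b = rng.randn(50)
res = minimize_PGD(SquaredLoss(A, b), L1Penalty(0.1), tol=1e-8)
print(res.x)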
def minimize_APGD(f, g=None, x0=None, tol=1e-12, max_iter=100, verbose=0,
                  callback=None, backtracking: bool=True, step_size=None,
                  max_iter_backtracking=100, backtracking_factor=0.6,
                  trace=False) -> optimize.OptimizeResult:
    """Accelerated proximal gradient descent.

    Solves problems of the form

        minimize_x f(x) + g(x)

    where we have access to the gradient of f and to the proximal
    operator of g.

    Arguments:
        f : loss function, differentiable.
        g : penalty term, proximable.
        x0 : array-like
            Initial guess.
        backtracking : boolean
            Whether to perform backtracking (i.e. line-search) or not.
        max_iter : int
            Maximum number of iterations.
        verbose : int
            Verbosity level, from 0 (no output) to 2 (output on each iteration).
        step_size : float
            Starting value for the line-search procedure.
        callback : callable
            Callback function (optional).

    Returns:
        res : The optimization result represented as a
            ``scipy.optimize.OptimizeResult`` object. Important attributes are:
            ``x`` the solution array, ``success`` a Boolean flag indicating if
            the optimizer exited successfully and ``message`` which describes
            the cause of the termination. See `scipy.optimize.OptimizeResult`
            for a description of other attributes.

    References:
        Amir Beck and Marc Teboulle. "Gradient-based algorithms with
        applications to signal recovery." Convex optimization in signal
        processing and communications (2009)
    """
    if x0 is None:
        xk = np.zeros(f.n_features)
    else:
        xk = np.array(x0, copy=True)
    if not max_iter_backtracking > 0:
        raise ValueError('Line search iterations need to be greater than 0')

    if g is None:
        g = DummyLoss()

    if step_size is None:
        # .. sample to estimate the Lipschitz constant of the gradient ..
        step_size_n_sample = 5
        L = []
        for _ in range(step_size_n_sample):
            x_tmp = np.random.randn(f.n_features)
            x_tmp /= linalg.norm(x_tmp)
            L.append(linalg.norm(f.gradient(xk) - f.gradient(x_tmp)))
        # .. give it a generous upper bound ..
        step_size = 1. / np.mean(L)

    success = False
    trace_func = []
    trace_time = []
    trace_x = []
    trace_certificate = []
    start_time = datetime.now()

    it = 1
    tk = 1
    # .. a while loop instead of a for loop ..
    # .. allows for infinite or floating point max_iter ..
    yk = xk.copy()
    xk_prev = xk.copy()
    while it <= max_iter:
        # .. compute gradient and step size ..
        current_step_size = step_size
        grad_fk = f.gradient(yk)
        xk = g.prox(yk - current_step_size * grad_fk, current_step_size)
        if backtracking:
            for _ in range(max_iter_backtracking):
                incr = xk - yk
                if f(xk) <= f(yk) + grad_fk.dot(incr) + incr.dot(incr) / (2.0 * current_step_size):
                    # .. step size found ..
                    break
                else:
                    # .. backtracking, reduce step size ..
                    current_step_size *= backtracking_factor
                    xk = g.prox(yk - current_step_size * grad_fk, current_step_size)
            else:
                warnings.warn("Maximum number of line-search iterations reached")
        t_next = (1 + np.sqrt(1 + 4 * tk * tk)) / 2
        yk = xk + ((tk - 1.) / t_next) * (xk - xk_prev)
        certificate = np.linalg.norm((xk - xk_prev) / step_size)
        tk = t_next
        xk_prev = xk.copy()

        if trace:
            trace_certificate.append(certificate)
            trace_x.append(xk.copy())
            trace_func.append(f(yk) + g(yk))
            trace_time.append((datetime.now() - start_time).total_seconds())

        if verbose > 0:
            print("Iteration %s, certificate: %s, step size: %s"
                  % (it, certificate, step_size))

        if certificate < tol:
            if verbose:
                print("Achieved relative tolerance at iteration %s" % it)
            success = True
            break

        if callback is not None:
            callback(xk)
        it += 1

    if it >= max_iter:
        warnings.warn(
            "proximal_gradient did not reach the desired tolerance level",
            RuntimeWarning)

    return optimize.OptimizeResult(
        x=yk, success=success,
        certificate=certificate,
        # jac=incr / step_size,  # prox-grad mapping
        trace_certificate=trace_certificate,
        nit=it,
        trace_x=np.array(trace_x),
        trace_func=np.array(trace_func),
        trace_time=trace_time)
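
The following sketch (again, not part of the original module) shows minimize_APGD on an l1-regularized quadratic. The Quadratic and L1Penalty classes are hypothetical stand-ins exposing only the attributes the solver uses; with trace=True the result also carries the per-iteration certificates.

import numpy as np


class Quadratic:
    """Hypothetical smooth term f(w) = 0.5 * w' H w - b' w, H positive definite."""

    def __init__(self, H, b):
        self.H, self.b = H, b
        self.n_features = H.shape[0]

    def __call__(self, w):
        return 0.5 * w.dot(self.H.dot(w)) - self.b.dot(w)

    def gradient(self, w):
        return self.H.dot(w) - self.b


class L1Penalty:
    """Hypothetical proximal term g(w) = reg * ||w||_1 (soft-thresholding prox)."""

    def __init__(self, reg):
        self.reg = reg

    def __call__(self, w):
        return self.reg * np.abs(w).sum()

    def prox(self, w, step_size):
        return np.sign(w) * np.maximum(np.abs(w) - self.reg * step_size, 0.)


rng = np.random.RandomState(0)
M = rng.randn(10, 10)
H = M.dot(M.T) + np.eye(10)  # positive definite Hessian
b = rng.randn(10)
res = minimize_APGD(Quadratic(H, b), L1Penalty(0.5), max_iter=500, trace=True)
print(res.x, res.certificate)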
def minimize_DavisYin(fun, fun_deriv, g_prox, h_prox, y0, alpha=1.0, beta=1.0,
                      tol=1e-6, max_iter=1000, g_prox_args=(), h_prox_args=(),
                      verbose=0, callback=None, backtracking=True,
                      step_size=None, max_iter_backtracking=100,
                      backtracking_factor=0.4):
    """Davis-Yin three operator splitting method.

    This algorithm can solve problems of the form

        minimize_x f(x) + alpha * g(x) + beta * h(x)

    where f is a smooth function and g and h are (possibly non-smooth)
    functions for which the proximal operator is known.

    Parameters
    ----------
    fun : callable
        fun(x) returns the value of f at x.
    fun_deriv : callable or None
        fun_deriv(x) returns the gradient of f.
    g_prox : callable or None
        g_prox(alpha, x, *args) returns the proximal operator of g at x
        with parameter alpha. Extra arguments can be passed by g_prox_args.
    h_prox : callable or None
        h_prox(beta, x, *args) returns the proximal operator of h at x
        with parameter beta. Extra arguments can be passed by h_prox_args.
    y0 : array-like
        Initial guess.
    backtracking : boolean
        Whether to perform backtracking (i.e. line-search) to estimate
        the step size.
    max_iter : int
        Maximum number of iterations.
    verbose : int
        Verbosity level, from 0 (no output) to 2 (output on each iteration).
    step_size : float
        Starting value for the line-search procedure.
    callback : callable
        Callback function (optional).

    Returns
    -------
    res : OptimizeResult
        The optimization result represented as a
        ``scipy.optimize.OptimizeResult`` object. Important attributes are:
        ``x`` the solution array, ``success`` a Boolean flag indicating if
        the optimizer exited successfully and ``message`` which describes
        the cause of the termination. See `scipy.optimize.OptimizeResult`
        for a description of other attributes.

    References
    ----------
    Davis, Damek, and Wotao Yin. "A three-operator splitting scheme and its
    optimization applications." arXiv preprint arXiv:1504.01032 (2015)
    https://arxiv.org/abs/1504.01032

    Pedregosa, Fabian. "On the convergence rate of the three operator
    splitting scheme." arXiv preprint arXiv:1610.07830 (2016)
    https://arxiv.org/abs/1610.07830
    """
    y = np.array(y0, copy=True)
    success = False
    if not max_iter_backtracking > 0:
        raise ValueError('Line search iterations need to be greater than 0')

    if g_prox is None:
        def g_prox(step_size, x, *args):
            return x
    if h_prox is None:
        def h_prox(step_size, x, *args):
            return x

    if step_size is None:
        # .. sample to estimate the Lipschitz constant of the gradient ..
        x0 = np.zeros(y.size)
        step_size_n_sample = 5
        L = []
        for _ in range(step_size_n_sample):
            x_tmp = np.random.randn(x0.size)
            x_tmp /= linalg.norm(x_tmp)
            L.append(linalg.norm(fun_deriv(x0) - fun_deriv(x_tmp)))
        # .. give it a generous upper bound ..
        step_size = 10. / np.mean(L)

    it = 1
    # .. a while loop instead of a for loop ..
    # .. allows for infinite or floating point max_iter ..
    current_step_size = step_size
    while it <= max_iter:
        x = g_prox(current_step_size * alpha, y, *g_prox_args)
        grad_fk = fun_deriv(x)
        z = h_prox(
            current_step_size * beta,
            2 * x - y - current_step_size * grad_fk,
            *h_prox_args)
        incr = z - x
        norm_incr = linalg.norm(incr / current_step_size)
        if backtracking:
            for _ in range(max_iter_backtracking):
                expected_descent = fun(x) + grad_fk.dot(incr) \
                    + 0.5 * current_step_size * (norm_incr ** 2)
                if fun(z) <= expected_descent * (1 + x.size * np.finfo(np.float64).eps):
                    # .. step size found ..
                    break
                else:
                    # .. backtracking, reduce step size ..
                    current_step_size *= backtracking_factor
                    y = x + backtracking_factor * (y - x)
                    grad_fk = fun_deriv(x)
                    z = h_prox(
                        current_step_size * beta,
                        2 * x - y - current_step_size * grad_fk,
                        *h_prox_args)
                    incr = z - x
                    norm_incr = linalg.norm(incr / current_step_size)
            else:
                warnings.warn("Maximum number of line-search iterations reached")

        y += incr

        if verbose > 0:
            print("Iteration %s, prox-grad norm: %s, step size: %s" % (
                it, norm_incr / current_step_size, current_step_size))

        if norm_incr < tol * current_step_size:
            success = True
            if verbose:
                print("Achieved relative tolerance at iteration %s" % it)
            break

        if callback is not None:
            callback(x)

        if it >= max_iter:
            warnings.warn(
                "three_split did not reach the desired tolerance level",
                RuntimeWarning)
        it += 1

    x_sol = g_prox(current_step_size * alpha, y, *g_prox_args)
    return optimize.OptimizeResult(
        x=x_sol, success=success,
        jac=incr / current_step_size,  # prox-grad mapping
        nit=it)
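
A final sketch (not part of the original module) shows minimize_DavisYin on a least-squares problem with an l1 penalty and a non-negativity constraint. Unlike the two solvers above, this function takes plain callables; l1_prox and nonneg_prox are hypothetical helpers written for the (step_size, x) argument order the solver uses when calling g_prox and h_prox.

import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(30, 15)
b = rng.randn(30)


def fun(x):
    # smooth term f(x) = 0.5 * ||A x - b||^2
    return 0.5 * np.linalg.norm(A.dot(x) - b) ** 2


def fun_deriv(x):
    return A.T.dot(A.dot(x) - b)


def l1_prox(step_size, x):
    # proximal operator of the l1 norm (soft-thresholding); the effective
    # threshold is step_size * alpha because of how the solver scales it
    return np.sign(x) * np.maximum(np.abs(x) - step_size, 0.)


def nonneg_prox(step_size, x):
    # proximal operator of the indicator of {x >= 0}: Euclidean projection
    return np.maximum(x, 0.)


res = minimize_DavisYin(fun, fun_deriv, l1_prox, nonneg_prox,
                        y0=np.zeros(15), alpha=0.1, beta=1.0, tol=1e-8)
print(res.x)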