f(x)=\dfrac{1}{2}x^TAx-b^Tx\longrightarrow \min_{x\in\mathbb{R}^n}
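Since the matrices A considered below are symmetric positive definite, setting the gradient to zero shows that minimizing f is equivalent to solving a linear system:

\nabla f(x) = Ax - b = 0 \quad\Longleftrightarrow\quad Ax = b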

Add the code for the iterations of the conjugate gradient method to the function conjugate_gradient() below (a reference sketch is given after the listing):

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(228)

# Parameters
n = 100  # Dimension of x
mu = 10
L = 100


def generate_problem(n=100, mu=mu, L=L, problem_type="clustered"):
    np.random.seed(228)
    if problem_type == "clustered":
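        # Four equal clusters of eigenvalues spanning [mu, L]; assumes n is divisible by 4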
        A = np.diagflat([mu*np.ones(n//4), (mu+ (L-mu)/3) * np.ones(n//4), (mu+ 2*(L-mu)/3)*np.ones(n//4), L*np.ones(n//4)])
        U = np.random.rand(n, n)
        Q, _ = np.linalg.qr(U)
        A = Q.dot(A).dot(Q.T)
        A = (A + A.T) * 0.5
        x_opt = np.random.rand(n)
        b = A@x_opt
        x_0 = 5*np.random.randn(n)

    elif problem_type == "random":
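        # Random PSD matrix rescaled so that the spectrum lies in [mu, L + mu]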
        A = np.random.randn(n, n)
        factual_L = max(np.linalg.eigvalsh(A.T@A))
        A = A.T.dot(A)/factual_L*L + mu*np.eye(n)
        x_opt = np.random.rand(n)
        b = A@x_opt
        x_0 = 3*np.random.randn(n)
    
    elif problem_type == "uniform spectrum":
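        # Evenly spaced eigenvalues from mu to L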
        A = np.diag(np.linspace(mu, L, n, endpoint=True))
        x_opt = np.random.rand(n)
        b = A@x_opt
        x_0 = 3*np.random.randn(n)

    elif problem_type == "Hilbert":
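        # Hilbert matrix: symmetric positive definite but extremely ill-conditioned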
        A = np.array([[1.0 / (i+j - 1) for i in range(1, n+1)] for j in range(1, n+1)])
        b = np.ones(n)
        x_0 = 3*np.random.randn(n)
        x_opt = np.linalg.lstsq(A, b, rcond=None)[0]

    elif problem_type == "worst_cg":
        # Parameter t controls the condition number
        t = 0.6  # Small t leads to worse conditioning
        # Create tridiagonal matrix W
        main_diag = np.full(n, 1 + t)
        main_diag[0] = t
        off_diag = np.sqrt(t) * np.ones(n-1)
        A = np.diag(main_diag) + np.diag(off_diag, k=1) + np.diag(off_diag, k=-1)
        
        # Create b vector [1, 0, ..., 0]
        b = np.zeros(n)
        b[0] = 1
        
        # Since this is a specific problem, we compute x_opt explicitly
        x_opt = np.linalg.solve(A, b)
        x_0 = np.zeros(n)  # Start from zero vector

    return A, b, x_0, x_opt

# Optimization methods
def gradient_descent(f, grad_f, x_0, step_size, iterations, x_opt):
    x = x_0.copy()
    f_opt = f(x_opt)
    values, gradients = [], []
    values.append(abs(f(x) - f_opt))
    gradients.append(np.linalg.norm(grad_f(x)))
    for _ in range(iterations):
        x -= step_size * grad_f(x)
        values.append(abs(f(x) - f_opt))
        gradients.append(np.linalg.norm(grad_f(x)))
    return values, gradients

def steepest_descent(A, f, grad_f, x_0, iterations, x_opt):
    x = x_0.copy()
    f_opt = f(x_opt)
    values, gradients = [], []
    values.append(abs(f(x) - f_opt))
    gradients.append(np.linalg.norm(grad_f(x)))
    for _ in range(iterations):
        grad = grad_f(x)
        step_size = (grad @ grad) / (grad @ (A @ grad))  # exact line search along -grad
        x -= step_size * grad
        values.append(abs(f(x) - f_opt))
        gradients.append(np.linalg.norm(grad_f(x)))  # gradient at the updated point
    return values, gradients

def conjugate_gradient(A, b, x_0, iterations, x_opt):
    x = x_0.copy()
    f = lambda x: 0.5 * x.T @ A @ x - b.T @ x
    f_opt = f(x_opt)

    r = b - np.dot(A, x)
    d = r.copy()
    values, gradients = [abs(f(x) - f_opt)], [np.linalg.norm(r)]
    for _ in range(iterations):
        ## YOUR CODE HERE ##
        pass
    return values, gradients

def run_experiment(params):
    A, b, x_0, x_opt = generate_problem(n=params["n"], mu=params["mu"], L=params["L"], problem_type=params["problem_type"])
    eigs = np.linalg.eigvalsh(A)
    mu, L = min(eigs), max(eigs)  # actual extreme eigenvalues of the generated A

    f = lambda x: 0.5 * x.T @ A @ x - b.T @ x
    grad_f = lambda x: A@x - b

    if mu <= 1e-2:
        alpha = 1/L  # Safe step size when the problem is barely strongly convex
    else:
        alpha = 2/(mu+L)  # Optimal constant step size for gradient descent on quadratics

    results = {
        "methods": {
            "Gradient Descent": gradient_descent(f, grad_f, x_0, alpha, params["iterations"], x_opt),
            "Steepest Descent": steepest_descent(A, f, grad_f, x_0, params["iterations"], x_opt),
            "Conjugate Gradients": conjugate_gradient(A, b, x_0, params["iterations"], x_opt),
        },
        "problem":{
            "eigs": eigs,
            "params": params
        }
    }
    return results


def plot_results(results):
    linestyles = {
        "Gradient Descent": "r-",
        "Steepest Descent": "b-.",
        "Conjugate Gradients": "g--"
    }
    plt.figure(figsize=(10, 3.5))
    mu = results["problem"]["params"]["mu"]
    L = results["problem"]["params"]["L"]
    n = results["problem"]["params"]["n"]
    problem_type = results["problem"]["params"]["problem_type"]
    
    if mu > 1e-2:
        plt.suptitle(f"Strongly convex quadratics. n={n}, {problem_type} matrix. ")
    else:
        plt.suptitle(f"Convex quadratics. n={n}, {problem_type} matrix. ")

    plt.subplot(1, 3, 1)
    eigs = results["problem"]["eigs"]
    plt.scatter(np.arange(len(eigs)), eigs)
    plt.xlabel('Dimension')
    plt.ylabel('Eigenvalues of A')
    plt.grid(linestyle=":")
    plt.title("Eigenvalues")
    if results["problem"]["params"]["problem_type"] == "Hilbert":
        plt.yscale("log")

    plt.subplot(1, 3, 2)
    for method, result_  in results["methods"].items():
        plt.semilogy(result_[0], linestyles[method])
    plt.xlabel('Iteration')
    plt.ylabel(r'$|f(x) -f^*|$')
    plt.grid(linestyle=":")
    plt.title("Function gap")

    plt.subplot(1, 3, 3)
    for method, result_ in results["methods"].items():
        plt.semilogy(result_[1], linestyles[method], label=method)
    plt.ylabel(r'$\|\nabla f(x)\|_2$')
    plt.grid(linestyle=":")
    plt.title("Norm of Gradient")

    # Place the legend below the plots
    plt.figlegend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.00))
    # Adjust layout to make space for the legend below
    plt.tight_layout(rect=[0, 0.05, 1, 1])
    # plt.savefig(f"cg_{problem_type}_{mu}_{L}_{n}.pdf")
    plt.show()
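
For reference, here is a minimal sketch of the classical conjugate gradient recurrences with the same interface as the skeleton above. The function name conjugate_gradient_reference is ours, for illustration; plugging the loop body into conjugate_gradient() completes the exercise.

def conjugate_gradient_reference(A, b, x_0, iterations, x_opt):
    x = x_0.copy()
    f = lambda x: 0.5 * x.T @ A @ x - b.T @ x
    f_opt = f(x_opt)
    r = b - A @ x   # residual; for this f it equals -grad f(x)
    d = r.copy()    # initial search direction
    values, gradients = [abs(f(x) - f_opt)], [np.linalg.norm(r)]
    for _ in range(iterations):
        rr = r @ r
        if rr < 1e-30:                 # residual vanished: CG has converged
            break
        Ad = A @ d
        alpha = rr / (d @ Ad)          # exact line search along d
        x = x + alpha * d
        r = r - alpha * Ad             # update residual without recomputing A @ x
        beta = (r @ r) / rr            # standard beta for the quadratic case
        d = r + beta * d               # next direction, A-conjugate to the previous ones
        values.append(abs(f(x) - f_opt))
        gradients.append(np.linalg.norm(r))
    return values, gradients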

After implementing the conjugate gradient method, let’s look at several experimental results of solving the quadratic problem for different matrices A:

# Experiment: random matrix, mu = 1e-3 (effectively non-strongly convex)
params = {
    "n": 60,
    "mu": 1e-3,
    "L": 100,
    "iterations": 100,
    "problem_type": "random",
}

results = run_experiment(params)
plot_results(results)

# Experiment: random matrix, strongly convex (mu = 10)
params = {
    "n": 60,
    "mu": 10,
    "L": 100,
    "iterations": 100,
    "problem_type": "random",
}

results = run_experiment(params)
plot_results(results)

# Experiment: clustered spectrum, mu = 10, L = 1000
params = {
    "n": 60,
    "mu": 10,
    "L": 1000,
    "iterations": 500,
    "problem_type": "clustered",
}

results = run_experiment(params)
plot_results(results)

# Experiment: clustered spectrum in higher dimension (n = 600)
params = {
    "n": 600,
    "mu": 10,
    "L": 1000,
    "iterations": 500,
    "problem_type": "clustered",
}

results = run_experiment(params)
plot_results(results)

# Experiment: Hilbert matrix (mu and L are ignored by this problem type)
params = {
    "n": 60,
    "mu": 1,
    "L": 10,
    "iterations": 100,
    "problem_type": "Hilbert",
}

results = run_experiment(params)
plot_results(results)
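
The generator also defines a worst_cg problem type (the tridiagonal worst-case construction above) that none of these runs exercise. A run could look like this; note that mu and L must still be passed but are ignored by that branch of generate_problem():

# Experiment: worst-case tridiagonal problem for CG
params = {
    "n": 60,
    "mu": 1,
    "L": 10,
    "iterations": 100,
    "problem_type": "worst_cg",
}

results = run_experiment(params)
plot_results(results)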