import numpy as np
import matplotlib.pyplot as plt
Comparison for quadratics
np.random.seed(228)

# Parameters
n = 100  # Dimension of x
mu = 10
L = 100
def format_num(x):
"""
Formats a number into scientific notation with two decimal places
for the significand and removes the extra 0 in the exponent.
Example: 1.14e-01 becomes 1.14e-1.
"""
= f"{x:.2e}"
s = s.replace("e+0", "e+").replace("e-0", "e-")
s return
def generate_problem(n=100, mu=mu, L=L, problem_type="clustered"):
    np.random.seed(228)
    if problem_type == "clustered":
        A = np.diagflat([mu * np.ones(n // 4), (mu + (L - mu) / 3) * np.ones(n // 4),
                         (mu + 2 * (L - mu) / 3) * np.ones(n // 4), L * np.ones(n // 4)])
        U = np.random.rand(n, n)
        Q, _ = np.linalg.qr(U)
        A = Q.dot(A).dot(Q.T)
        A = (A + A.T) * 0.5
        x_opt = np.random.rand(n)
        b = A @ x_opt
        x_0 = 5 * np.random.randn(n)

    elif problem_type == "random":
        A = np.random.randn(n, n)
        factual_L = max(np.linalg.eigvalsh(A.T @ A))
        A = A.T.dot(A) / factual_L * L + mu * np.eye(n)
        x_opt = np.random.rand(n)
        b = A @ x_opt
        x_0 = 3 * np.random.randn(n)

    elif problem_type == "uniform spectrum":
        A = np.diag(np.linspace(mu, L, n, endpoint=True))
        x_opt = np.random.rand(n)
        b = A @ x_opt
        x_0 = 3 * np.random.randn(n)

    elif problem_type == "Hilbert":
        A = np.array([[1.0 / (i + j - 1) for i in range(1, n+1)] for j in range(1, n+1)])
        b = np.ones(n)
        x_0 = 3 * np.random.randn(n)
        x_opt = np.linalg.lstsq(A, b, rcond=None)[0]

    elif problem_type == "worst_cg":
        t = 0.6  # Small t leads to worse conditioning
        main_diag = np.ones(n)
        main_diag[0] = t
        main_diag[1:] = 1 + t
        off_diag = np.sqrt(t) * np.ones(n-1)
        A = np.diag(main_diag) + np.diag(off_diag, k=1) + np.diag(off_diag, k=-1)
        b = np.zeros(n)
        b[0] = 1
        x_opt = np.linalg.solve(A, b)
        x_0 = np.zeros(n)

    return A, b, x_0, x_opt
# Optimization methods now return (function_gap, domain_gap, grad_norm, alpha, beta)
def gradient_descent(f, grad_f, x_0, step_size, iterations, x_opt):
    x = x_0.copy()
    f_opt = f(x_opt)
    function_gap = [abs(f(x) - f_opt)]
    domain_gap = [np.linalg.norm(x - x_opt)]
    grad_norms = [np.linalg.norm(grad_f(x))]
    for _ in range(iterations):
        x = x - step_size * grad_f(x)
        function_gap.append(abs(f(x) - f_opt))
        domain_gap.append(np.linalg.norm(x - x_opt))
        grad_norms.append(np.linalg.norm(grad_f(x)))
    return function_gap, domain_gap, grad_norms, step_size, 0
def heavy_ball(f, grad_f, x_0, iterations, x_opt, mu, L):
    ### YOUR CODE HERE
    return function_gap, domain_gap, grad_norms, alpha, beta
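
# A hedged reference sketch for the exercise above (one common choice, not necessarily
# the intended solution): Polyak heavy ball with the classical constant parameters for a
# mu-strongly convex, L-smooth quadratic, alpha = 4 / (sqrt(L) + sqrt(mu))**2 and
# beta = ((sqrt(L) - sqrt(mu)) / (sqrt(L) + sqrt(mu)))**2. The name heavy_ball_sketch is
# illustrative; to try it in run_experiment below one could bind heavy_ball = heavy_ball_sketch.
# It tracks the same metrics as gradient_descent.
def heavy_ball_sketch(f, grad_f, x_0, iterations, x_opt, mu, L):
    alpha = 4 / (np.sqrt(L) + np.sqrt(mu))**2
    beta = ((np.sqrt(L) - np.sqrt(mu)) / (np.sqrt(L) + np.sqrt(mu)))**2
    x, x_prev = x_0.copy(), x_0.copy()
    f_opt = f(x_opt)
    function_gap = [abs(f(x) - f_opt)]
    domain_gap = [np.linalg.norm(x - x_opt)]
    grad_norms = [np.linalg.norm(grad_f(x))]
    for _ in range(iterations):
        # x_{k+1} = x_k - alpha * grad f(x_k) + beta * (x_k - x_{k-1})
        x_next = x - alpha * grad_f(x) + beta * (x - x_prev)
        x_prev, x = x, x_next
        function_gap.append(abs(f(x) - f_opt))
        domain_gap.append(np.linalg.norm(x - x_opt))
        grad_norms.append(np.linalg.norm(grad_f(x)))
    return function_gap, domain_gap, grad_norms, alpha, beta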
def nesterov_accelerated_gradient(f, grad_f, x_0, iterations, x_opt, mu, L):
    ### YOUR CODE HERE
    return function_gap, domain_gap, grad_norms, alpha, beta
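
# A hedged reference sketch for the exercise above (not necessarily the intended solution):
# Nesterov's method with the constant-momentum parameters alpha = 1 / L and
# beta = (sqrt(L) - sqrt(mu)) / (sqrt(L) + sqrt(mu)), which assume mu > 0; for the mu ~ 0
# experiments a k-dependent momentum such as beta_k = k / (k + 3) is the usual alternative.
# The name nag_sketch is illustrative; one could bind nesterov_accelerated_gradient = nag_sketch.
def nag_sketch(f, grad_f, x_0, iterations, x_opt, mu, L):
    alpha = 1 / L
    beta = (np.sqrt(L) - np.sqrt(mu)) / (np.sqrt(L) + np.sqrt(mu))
    x = x_0.copy()
    y = x_0.copy()  # extrapolated (look-ahead) point
    f_opt = f(x_opt)
    function_gap = [abs(f(x) - f_opt)]
    domain_gap = [np.linalg.norm(x - x_opt)]
    grad_norms = [np.linalg.norm(grad_f(x))]
    for _ in range(iterations):
        x_next = y - alpha * grad_f(y)      # gradient step at the look-ahead point
        y = x_next + beta * (x_next - x)    # momentum extrapolation
        x = x_next
        function_gap.append(abs(f(x) - f_opt))
        domain_gap.append(np.linalg.norm(x - x_opt))
        grad_norms.append(np.linalg.norm(grad_f(x)))
    return function_gap, domain_gap, grad_norms, alpha, beta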
def run_experiment(params):
    A, b, x_0, x_opt = generate_problem(n=params["n"], mu=params["mu"], L=params["L"],
                                        problem_type=params["problem_type"])
    eigs = np.linalg.eigvalsh(A)
    # Update mu and L based on the spectrum
    mu_val, L_val = min(eigs), max(eigs)

    f = lambda x: 0.5 * x.T @ A @ x - b.T @ x
    grad_f = lambda x: A @ x - b

    if mu_val <= 1e-2:
        alpha = 1 / L_val
    else:
        alpha = 2 / (mu_val + L_val)
    beta = (np.sqrt(L_val) - np.sqrt(mu_val)) / (np.sqrt(L_val) + np.sqrt(mu_val))

    results = {
        "methods": {
            "Gradient Descent": gradient_descent(f, grad_f, x_0, alpha, params["iterations"], x_opt),
            "Heavy Ball": heavy_ball(f, grad_f, x_0, params["iterations"], x_opt, mu_val, L_val),
            "NAG": nesterov_accelerated_gradient(f, grad_f, x_0, params["iterations"], x_opt, mu_val, L_val),
        },
        "problem": {
            "eigs": eigs,
            "params": params
        }
    }
    return results
def plot_results(results):
    # Define linestyles for methods
    linestyles = {
        "Gradient Descent": "r-",
        "Heavy Ball": "g--",
        "NAG": "m:",
    }
    params = results["problem"]["params"]
    n = params["n"]
    problem_type = params["problem_type"]
    mu_val = params["mu"]
    L_val = params["L"]

    # Create a figure with 3 subplots for function gap, domain gap, and gradient norm
    plt.figure(figsize=(10, 3))
    plt.suptitle(f"{'Strongly convex' if mu_val > 1e-2 else 'Convex'} quadratics: n={n}, {problem_type} matrix, μ={mu_val}, L={L_val}")

    # Plot 1: Function gap
    plt.subplot(1, 3, 1)
    for method, result in results["methods"].items():
        # result = (function_gap, domain_gap, grad_norms, alpha, beta)
        plt.semilogy(result[0], linestyles[method])
    plt.xlabel('Iteration')
    plt.ylabel(r'$|f(x) - f^*|$')
    plt.grid(linestyle=":")
    plt.title("Function Gap")

    # Plot 2: Domain gap (||x - x*||)
    plt.subplot(1, 3, 2)
    for method, result in results["methods"].items():
        plt.semilogy(result[1], linestyles[method])
    plt.xlabel('Iteration')
    plt.ylabel(r'$\|x - x^*\|_2$')
    plt.grid(linestyle=":")
    plt.title("Domain Gap")

    # Plot 3: Norm of Gradient
    plt.subplot(1, 3, 3)
    for method, result in results["methods"].items():
        if method == "NAG":
            if mu_val > 1e-2:
                label = f"{method}. α={result[3]:.2e}, β={result[4]:.2e}"
            else:
                label = f"{method}"
        elif method == "Heavy Ball":
            if mu_val > 1e-2:
                label = f"{method}. α={result[3]:.2e}, β={result[4]:.2e}"
            else:
                label = f"{method}"
        elif method == "Gradient Descent":
            label = f"{method}. α={result[3]:.2e}"
        plt.semilogy(result[2], linestyles[method], label=label)
    plt.xlabel('Iteration')
    plt.ylabel(r'$\|\nabla f(x)\|_2$')
    plt.grid(linestyle=":")
    plt.title("Gradient Norm")

    # Place the legend below the plots
    plt.figlegend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.05))

    plt.tight_layout()
    plt.savefig(f"agd_{problem_type}_{mu_val}_{L_val}_{n}.pdf", bbox_inches='tight')
    plt.show()
# Experiment parameters
params = {
    "n": 60,
    "mu": 0,
    "L": 10,
    "iterations": 800,
    "problem_type": "random",  # Change to "clustered", "uniform spectrum", or "Hilbert" as needed
}
results = run_experiment(params)
plot_results(results)
# Experiment parameters
params = {
    "n": 60,
    "mu": 1,
    "L": 10,
    "iterations": 100,
    "problem_type": "random",  # Change to "clustered", "uniform spectrum", or "Hilbert" as needed
}
results = run_experiment(params)
plot_results(results)
# Experiment parameters
params = {
    "n": 60,
    "mu": 1,
    "L": 1000,
    "iterations": 1000,
    "problem_type": "random",  # Change to "clustered", "uniform spectrum", or "Hilbert" as needed
}
results = run_experiment(params)
plot_results(results)
# Experiment parameters
params = {
    "n": 1000,
    "mu": 1,
    "L": 10,
    "iterations": 100,
    "problem_type": "random",  # Change to "clustered", "uniform spectrum", or "Hilbert" as needed
}
results = run_experiment(params)
plot_results(results)
# Experiment parameters
params = {
    "n": 1000,
    "mu": 1,
    "L": 1000,
    "iterations": 1000,
    "problem_type": "random",  # Change to "clustered", "uniform spectrum", or "Hilbert" as needed
}
results = run_experiment(params)
plot_results(results)
Logreg
import numpy as np
import matplotlib.pyplot as plt
import cvxpy as cp
import jax
from jax import numpy as jnp, grad
from scipy.optimize import minimize_scalar
import jax.numpy as jnp
from jax import grad, jit, hessian
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import time
from ucimlrepo import fetch_ucirepo
from optax.losses import safe_softmax_cross_entropy as cros_entr
import sklearn.datasets as skldata
# Set a random seed for reproducibility
np.random.seed(228)
jax.random.PRNGKey(228)
@jit
def logistic_loss(w, X, y, mu=1):
    m, n = X.shape
    return jnp.sum(jnp.logaddexp(0, -y * (X @ w))) / m + mu / 2 * jnp.sum(w**2)
def generate_problem(m=1000, n=300, mu=1):
    X, y = skldata.make_classification(n_classes=2, n_features=n, n_samples=m, n_informative=n//2, random_state=0)
    X = jnp.array(X)
    y = jnp.array(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    return X_train, y_train, X_test, y_test
def compute_optimal(X, y, mu):
    w = cp.Variable(X.shape[1])
    objective = cp.Minimize(cp.sum(cp.logistic(cp.multiply(-y, X @ w))) / len(y) + mu / 2 * cp.norm(w, 2)**2)
    problem = cp.Problem(objective)
    problem.solve()
    return w.value, problem.value
@jit
def compute_accuracy(w, X, y):
    # Compute predicted probabilities using the logistic (sigmoid) function
    preds_probs = jax.nn.sigmoid(X @ w)
    # Convert probabilities to class predictions: 0 if p < 0.5, else 1
    preds = jnp.where(preds_probs < 0.5, 0, 1)
    # Calculate accuracy as the average of correct predictions
    accuracy = jnp.mean(preds == y)
    return accuracy
# @jit
def compute_metrics(trajectory, x_star, f_star, times, X_train, y_train, X_test, y_test, mu):
    f = lambda w: logistic_loss(w, X_train, y_train, mu)
    metrics = {
        "f_gap": [jnp.abs(f(x) - f_star) for x in trajectory],
        "x_gap": [jnp.linalg.norm(x - x_star) for x in trajectory],
        "time": times,
        "train_acc": [compute_accuracy(x, X_train, y_train) for x in trajectory],
        "test_acc": [compute_accuracy(x, X_test, y_test) for x in trajectory],
    }
    return metrics
def gradient_descent(w_0, X, y, learning_rate=0.01, num_iters=100, mu=0):
    trajectory = [w_0]
    times = [0]
    w = w_0
    f = lambda w: logistic_loss(w, X, y, mu)
    iter_start = time.time()
    for i in range(num_iters):
        grad_val = grad(f)(w)
        if learning_rate == "linesearch":
            # Simple line search implementation
            phi = lambda alpha: f(w - alpha * grad_val)
            result = minimize_scalar(fun=phi,
                                     bounds=(1e-3, 2e2)
                                     )
            step_size = result.x
        else:
            step_size = learning_rate
        w -= step_size * grad_val
        iter_time = time.time()

        trajectory.append(w)
        times.append(iter_time - iter_start)
    return trajectory, times
def heavy_ball_method(w_0, X, y, learning_rate=0.01, momentum=0.9, num_iters=100, mu=0):
"""
Polyak Heavy-Ball Method implementation:
Given:
x^+ = x - α ∇f(x) (Gradient step)
d_k = β_k (x_k - x_{k-1}) (Momentum term)
The update is:
x_{k+1} = x_k^+ + d_k = x_k - α ∇f(x_k) + d_k
Parameters:
w_0 : initial point
X, y : data used in the objective function (e.g. logistic loss)
learning_rate: α, step size for the gradient step
momentum : β, momentum coefficient (can be constant or adjusted per iteration)
num_iters : number of iterations to run
mu : strong convexity parameter (not used in this explicit update)
Returns:
trajectory : list of iterates x_k (after the gradient update)
times : list of times (elapsed) at each iteration
"""
    ### YOUR CODE HERE
    return trajectory, times
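
# A hedged sketch of the update described in the docstring above,
# x_{k+1} = x_k - α ∇f(x_k) + β (x_k - x_{k-1}), with the same bookkeeping as
# gradient_descent. The name heavy_ball_sketch is illustrative; one could bind
# heavy_ball_method = heavy_ball_sketch to run it through run_experiments below.
def heavy_ball_sketch(w_0, X, y, learning_rate=0.01, momentum=0.9, num_iters=100, mu=0):
    trajectory = [w_0]
    times = [0]
    w, w_prev = w_0, w_0
    f = lambda w: logistic_loss(w, X, y, mu)
    iter_start = time.time()
    for _ in range(num_iters):
        grad_val = grad(f)(w)
        # Gradient step plus momentum term d_k = β (x_k - x_{k-1})
        w_next = w - learning_rate * grad_val + momentum * (w - w_prev)
        w_prev, w = w, w_next
        trajectory.append(w)
        times.append(time.time() - iter_start)
    return trajectory, times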
def nesterov_accelerated_gradient(w_0, X, y, learning_rate=0.01, momentum=0.9, num_iters=100, mu=0):
"""
Nesterov Accelerated Gradient (NAG) implementation using explicit momentum:
Given:
x^+ = x - α ∇f(x) (Gradient step)
d_k = β_k (x_k - x_{k-1}) (Momentum term)
The update is:
x_{k+1} = (x_k + d_k)^+ = (x_k + d_k) - α ∇f(x_k + d_k)
Parameters:
w_0 : initial point
X, y : data used in the objective function (e.g. logistic loss)
learning_rate: α, step size for the gradient step
momentum : β, momentum coefficient (can be constant or adjusted per iteration)
num_iters : number of iterations to run
mu : strong convexity parameter (not used in this explicit update)
Returns:
trajectory : list of iterates x_k (after the gradient update)
times : list of times (elapsed) at each iteration
"""
    ### YOUR CODE HERE
    return trajectory, times
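
# A hedged sketch of the update described in the docstring above,
# x_{k+1} = (x_k + d_k) - α ∇f(x_k + d_k) with d_k = β (x_k - x_{k-1}).
# The name nesterov_sketch is illustrative; one could bind
# nesterov_accelerated_gradient = nesterov_sketch to run it through run_experiments below.
def nesterov_sketch(w_0, X, y, learning_rate=0.01, momentum=0.9, num_iters=100, mu=0):
    trajectory = [w_0]
    times = [0]
    w, w_prev = w_0, w_0
    f = lambda w: logistic_loss(w, X, y, mu)
    iter_start = time.time()
    for _ in range(num_iters):
        lookahead = w + momentum * (w - w_prev)  # x_k + d_k
        w_next = lookahead - learning_rate * grad(f)(lookahead)
        w_prev, w = w, w_next
        trajectory.append(w)
        times.append(time.time() - iter_start)
    return trajectory, times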
def run_experiments(params):
= params["mu"]
mu = params["m"], params["n"]
m, n = params["methods"]
methods = {}
results
= generate_problem(m, n, mu)
X_train, y_train, X_test, y_test = X_train.shape[1] # Number of features
n_features "n_features"] = n_features
params[
= jax.random.normal(jax.random.PRNGKey(0), (n_features, ))
x_0 = compute_optimal(X_train, y_train, mu)
x_star, f_star
for method in methods:
if method["method"] == "GD":
= method["learning_rate"]
learning_rate = method["iterations"]
iterations = gradient_descent(x_0, X_train, y_train, learning_rate, iterations, mu)
trajectory, times = method["method"] + " " + str(learning_rate)
label = compute_metrics(trajectory, x_star, f_star, times, X_train, y_train, X_test, y_test, mu)
results[label] elif method["method"] == "Heavy Ball":
= method.get("learning_rate", 0.01)
learning_rate = method.get("momentum", 0.9)
momentum = method["iterations"]
iterations = heavy_ball_method(x_0, X_train, y_train, learning_rate, momentum, iterations, mu)
trajectory, times # label = method["method"] + " " + str(learning_rate)
= f"{method['method']}. α={learning_rate:.2e}. β={momentum:.2e}"
label = compute_metrics(trajectory, x_star, f_star, times, X_train, y_train, X_test, y_test, mu)
results[label]
elif method["method"] == "NAG":
= method.get("learning_rate", 0.01)
learning_rate = method.get("momentum", 0.9)
momentum = method["iterations"]
iterations = nesterov_accelerated_gradient(x_0, X_train, y_train, learning_rate, momentum, iterations, mu)
trajectory, times = method["method"] + " " + str(learning_rate)
label = f"{method['method']}. α={learning_rate:.2e}. β={momentum:.2e}"
label = compute_metrics(trajectory, x_star, f_star, times, X_train, y_train, X_test, y_test, mu)
results[label]
return results, params
def plot_results(results, params):
    plt.figure(figsize=(11, 2.6))
    mu = params["mu"]

    if mu > 1e-2:
        plt.suptitle(f"Strongly convex binary logistic regression. mu={mu}.")
    else:
        plt.suptitle(f"Convex binary logistic regression. mu={mu}.")

    plt.subplot(1, 4, 1)
    for method, metrics in results.items():
        plt.plot(metrics['f_gap'])
    plt.xlabel('Iteration')
    plt.ylabel(r'$|f(x) - f^*|$')
    plt.yscale('log')
    plt.grid(linestyle=":")

    plt.subplot(1, 4, 2)
    for method, metrics in results.items():
        plt.plot(metrics['x_gap'], label=method)
    plt.xlabel('Iteration')
    plt.ylabel(r'$\|x_k - x^*\|$')
    plt.yscale('log')
    plt.grid(linestyle=":")

    plt.subplot(1, 4, 3)
    for method, metrics in results.items():
        plt.plot(metrics["train_acc"])
    plt.xlabel('Iteration')
    plt.ylabel('Train accuracy')
    # plt.yscale('log')
    plt.grid(linestyle=":")

    plt.subplot(1, 4, 4)
    for method, metrics in results.items():
        plt.plot(metrics["test_acc"])
    plt.xlabel('Iteration')
    plt.ylabel('Test accuracy')
    # plt.yscale('log')
    plt.grid(linestyle=":")

    # Place the legend below the plots
    plt.figlegend(loc='lower center', ncol=5, bbox_to_anchor=(0.5, -0.00))
    # Adjust layout to make space for the legend below
    filename = ""
    for method, metrics in results.items():
        filename += method
    filename += f"_{mu}.pdf"
    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig(filename)
    plt.show()
params = {
    "mu": 0,
    "m": 1000,
    "n": 100,
    "methods": [
        {"method": "GD",
         "learning_rate": 9e4,
         "iterations": 400,
         },
        {"method": "Heavy Ball",
         "learning_rate": 9e-1,
         "iterations": 400,
         "momentum": 0.99,
         },
        {"method": "NAG",
         "learning_rate": 9e-1,
         "iterations": 400,
         "momentum": 0.99,
         },
    ]
}
results, params = run_experiments(params)
plot_results(results, params)
params = {
    "mu": 1,
    "m": 1000,
    "n": 100,
    "methods": [
        {"method": "GD",
         "learning_rate": 5e-2,
         "iterations": 300,
         },
        {"method": "Heavy Ball",
         "learning_rate": 5e-2,
         "iterations": 300,
         "momentum": 0.9,
         },
        {"method": "NAG",
         "learning_rate": 5e-2,
         "iterations": 300,
         "momentum": 0.9,
         },
    ]
}
results, params = run_experiments(params)
plot_results(results, params)