A problem of the following form is solved:

$$
L(w) = \frac{\lambda}{2} \lVert w \rVert^2 + \frac{1}{N} \sum_{i=1}^{N} \max \left\{ 0,\ 1 - y_i w^T x_i \right\}.
$$

It can be shown that a subgradient is

$$
g(w) = \lambda w + \frac{1}{N} \sum_{i=1}^{N}
\begin{cases}
-y_i x_i, & \text{if } y_i w^T x_i < 1, \\
0, & \text{otherwise}
\end{cases}
$$

(at the kink $y_i w^T x_i = 1$ the zero branch is a valid choice of subgradient). The function $\ell_i(w) = \max\left\{ 0,\ 1 - y_i w^T x_i \right\}$ is called the hinge loss.

Let's generate data for classification:

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)
N = 100  # number of objects
X1 = np.random.randn(N//2, 2) + np.array([2, 2])
X2 = np.random.randn(N//2, 2) + np.array([-2, -2])
X = np.vstack([X1, X2])                          # the objects-by-features matrix
y = np.hstack([np.ones(N//2), -np.ones(N//2)])   # class labels: +1, -1

# Shuffle the objects so the two classes are mixed
perm = np.random.permutation(N)
X = X[perm]
y = y[perm]
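A quick look at the generated sample (a small check added here for illustration; it assumes the code above ran as written):

print(X.shape, y.shape)                 # expected: (100, 2) (100,)
print((y == 1).sum(), (y == -1).sum())  # 50 objects in each class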
Objective (hinge loss):
def hinge_svm_objective(w, X, y, lambd):
    """
    Returns the value of the functional:
    L(w) = (lambda/2)*||w||^2 + (1/N)*sum( max(0, 1 - y_i * w^T x_i) ).
    """
    margins = 1 - y * (X.dot(w))
    hinge_losses = np.maximum(0, margins)
    return (lambd/2)*np.sum(w**2) + np.mean(hinge_losses)
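As a quick sanity check of this implementation (a minimal sketch using the `X`, `y` generated above): at `w = 0` every margin equals 1, so the mean hinge loss is 1 and the regularization term vanishes, hence the objective must be exactly 1.

# At w = 0 each term max(0, 1 - y_i * 0) equals 1 and ||w||^2 = 0, so L(0) = 1.0.
w0 = np.zeros(X.shape[1])
print(hinge_svm_objective(w0, X, y, lambd=0.01))  # expected: 1.0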
The subgradient of the functional:
def hinge_svm_subgradient(w, X, y, lambd):
    """
    Returns the subgradient g(w):
    g(w) = lambda*w + (1/N)*sum( -y_i*x_i, if y_i*w^T x_i < 1 ).
    """
    margins = 1 - y * (X.dot(w))
    active_mask = (margins > 0).astype(float)  # objects where the hinge is active (margin violated)

    # Average -y_i*x_i over all objects; inactive ones contribute zero
    grad_hinge = -(y * active_mask)[:, np.newaxis] * X
    grad_hinge = grad_hinge.mean(axis=0)

    g = lambd*w + grad_hinge
    return g
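One way to convince yourself the formula is correct is to compare it with a central finite-difference approximation of the objective (a minimal sketch, not part of the original code; at a randomly drawn w no margin hits the kink exactly, so the subgradient coincides with the ordinary gradient):

# Compare the analytic subgradient with central finite differences of L(w).
rng = np.random.default_rng(0)
w_test = rng.normal(size=X.shape[1])
g_analytic = hinge_svm_subgradient(w_test, X, y, lambd=0.01)

eps = 1e-6
g_numeric = np.zeros_like(w_test)
for j in range(w_test.size):
    e = np.zeros_like(w_test)
    e[j] = eps
    g_numeric[j] = (hinge_svm_objective(w_test + e, X, y, 0.01)
                    - hinge_svm_objective(w_test - e, X, y, 0.01)) / (2 * eps)

print(np.max(np.abs(g_analytic - g_numeric)))  # expected: close to 0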
Subgradient descent implementation:
def subgradient_descent(
    X, y, lambd=0.01,
    max_iter=150,
    step_rule='constant',
    c=1.0,
    f_star=0.0
):
    """
    Subgradient descent for SVM with hinge loss.

    Parameters:
    X, y        : the training sample
    lambd       : the regularization coefficient
    max_iter    : the number of iterations
    step_rule   : step-size strategy:
        - 'constant'      -> alpha_k = c
        - '1_over_k'      -> alpha_k = c/k
        - '1_over_sqrt_k' -> alpha_k = c/sqrt(k)
        - 'polyak'        -> alpha_k = (f(w_k) - f_star)/||g_k||^2
    c           : constant for the step (used by all strategies except 'polyak')
    f_star      : estimate of the minimum value (for 'polyak')

    Returns:
    w_history    : the list of vectors w at each iteration
    loss_history : the list of values of the functional L(w) at each iteration
    """
    w = np.zeros(X.shape[1])
    w_history = [w.copy()]
    loss_history = [hinge_svm_objective(w, X, y, lambd)]

    for k in range(1, max_iter + 1):
        g = hinge_svm_subgradient(w, X, y, lambd)
        loss_current = hinge_svm_objective(w, X, y, lambd)

        # Choose the step size
        if step_rule == 'constant':
            alpha = c
        elif step_rule == '1_over_k':
            alpha = c / k
        elif step_rule == '1_over_sqrt_k':
            alpha = c / np.sqrt(k)
        elif step_rule == 'polyak':
            # Polyak's step: (f(w) - f_star) / ||g||^2, cut off at zero from below
            denom = np.dot(g, g)
            if denom < 1e-15:
                alpha = 0.0
            else:
                alpha = (loss_current - f_star) / denom
                alpha = max(alpha, 0.0)  # keep the step non-negative
        else:
            raise ValueError("Unknown step-size strategy")

        # Update w
        w = w - alpha * g

        w_history.append(w.copy())
        loss_history.append(hinge_svm_objective(w, X, y, lambd))

    return w_history, loss_history
Running an experiment for different step rules:
max_iter = 50
lambd = 0.01

# For the Polyak step size we take f^* = 0 (a simplification for demonstration)
f_star_demo = 0.0

strategies = {
    'constant': 1.5,        # constant step size
    '1_over_k': 5.0,        # c for the 1/k schedule
    '1_over_sqrt_k': 5.0,   # c for the 1/sqrt(k) schedule
    'polyak': None          # the Polyak step needs no constant
}

results = {}
for rule, c_val in strategies.items():
    w_hist, loss_hist = subgradient_descent(
        X, y, lambd=lambd, max_iter=max_iter,
        step_rule=rule,
        c=c_val if c_val is not None else 1.0,
        f_star=f_star_demo
    )
    results[rule] = (w_hist, loss_hist)
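Before plotting, it can be handy to print the final objective value reached by each rule (a small illustrative addition on top of the experiment above):

for rule in strategies:
    print(f"{rule:15s} final L(w) = {results[rule][1][-1]:.4f}")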
Visualization:
plt.figure()
for rule in strategies.keys():
    loss_hist = results[rule][1]
    plt.plot(loss_hist, label=f"Stepsize rule: {rule}")
plt.xlabel("Iteration number")
plt.ylabel("L(w)")
plt.title("Comparison of subgradient descent (SVM, hinge loss)\nwith different steps")
plt.legend()
plt.grid(True)
plt.show()
plt.figure()
plt.scatter(X[y == 1, 0], X[y == 1, 1], label="Class +1")
plt.scatter(X[y == -1, 0], X[y == -1, 1], label="Class -1")

x_vals = np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 200)
for rule in strategies.keys():
    w_final = results[rule][0][-1]
    if abs(w_final[1]) < 1e-15:
        # Degenerate case: the line w^T x = 0 is vertical (x_1 = 0)
        x_const = 0.0
        plt.plot([x_const, x_const], [X[:, 1].min() - 1, X[:, 1].max() + 1],
                 label=f"Dividing line ({rule})")
    else:
        y_vals = -(w_final[0] / w_final[1]) * x_vals
        plt.plot(x_vals, y_vals, label=f"Dividing line ({rule})")

plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.title(f"Dividing lines after {max_iter} iterations")
plt.legend()
plt.show()
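To complement the picture, one can also check how well each final weight vector separates the training sample (an illustrative addition; the predicted class is taken as the sign of w^T x, since the model has no bias term):

for rule in strategies:
    w_final = results[rule][0][-1]
    accuracy = np.mean(np.sign(X.dot(w_final)) == y)
    print(f"{rule:15s} training accuracy = {accuracy:.2f}")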