# Régression Ridge et erreur de généralisation 

In [3]:
from ipywidgets import interact, interactive, fixed, interact_manual, interactive_output
import ipywidgets as widgets
import numpy as np
import matplotlib.pyplot as plt
import random as rand
from sklearn import datasets, linear_model
import copy as cp

import numpy.random as randn
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

In [4]:
def generate_data(sigma,alpha,n,d):
    """Draw a synthetic sample from a noisy linear model.

    The model is ``Y = X @ beta[:-1] + beta[-1] + eps``, where the last
    entry of ``beta`` plays the role of the intercept.

    Parameters
    ----------
    sigma : float
        Standard deviation of the Gaussian noise ``eps``.
    alpha : float
        Scale parameter: the coefficients are drawn with standard
        deviation ``1/alpha`` and the inputs uniformly on ``[0, 1/alpha]``.
    n : int
        Number of samples.
    d : int
        Number of input features.

    Returns
    -------
    beta : ndarray of shape (d + 1,)
        True coefficients followed by the intercept.
    X : ndarray of shape (n, d)
        Inputs.
    Y : ndarray of shape (n,)
        Noisy outputs.
    eps : ndarray of shape (n,)
        Noise that was added to the outputs.
    """
    scale = 1.0 / alpha

    # Draw order (noise, coefficients, inputs) is kept so that a seeded
    # generator yields the same sample as a row-by-row draw would.
    eps = randn.normal(0.0, sigma, n)
    beta = randn.normal(0.0, scale, d + 1)
    X = randn.uniform(0.0, scale, (n, d))  # uniform on [0, 1/alpha], not [0, 1]

    Y = X @ beta[:-1] + beta[-1] + eps
    return beta, X, Y, eps

In [6]:

# Calcule les erreurs pour la régression Ridge
def errorR(lam,beta_hat,alpha,X,Y,n,m):
    """Fit a linear model on ``(X, Y)`` and measure its errors.

    A ``Ridge`` model with penalty ``lam`` is fitted when ``lam > 0``,
    otherwise a plain ``LinearRegression``.

    Parameters
    ----------
    lam : float
        Ridge penalty; a non-positive value selects ordinary least squares.
    beta_hat : ndarray of shape (m + 1,)
        Reference coefficients followed by the reference intercept, used to
        build the noise-free test outputs.
    alpha : float
        Unused; kept so existing callers keep working.
    X : ndarray of shape (n, m)
        Training inputs.
    Y : ndarray of shape (n,)
        Training outputs.
    n : int
        Number of training samples. Unused: the number of rows of ``X`` is
        what determines the training predictions.
    m : int
        Number of input features.

    Returns
    -------
    Error : float
        Mean squared gap between the reference model and the fitted model
        on 100 freshly drawn test inputs.
    ErrorQuad : float
        Mean squared error of the fitted model on the training sample.
    normp : float
        Mean of the squared fitted parameters (coefficients and intercept),
        i.e. their squared norm divided by ``m + 1``.
    """
    nnew = 100  # size of the fresh test sample

    if lam > 0:
        # The tolerance was written `10-10` (== 0); 1e-10 is the intended value.
        reg = Ridge(alpha=lam, tol=1e-10, solver='svd').fit(X, Y)
    else:
        reg = LinearRegression().fit(X, Y)

    coef = reg.coef_
    intercept = reg.intercept_

    # NOTE(review): test inputs are drawn on [0, 1] and `alpha` is not used
    # here -- confirm whether they should follow the training inputs' range.
    Xnew = randn.uniform(0.0, 1.0, (nnew, m))
    Ynewref = Xnew @ beta_hat[:-1] + beta_hat[-1]
    Ynewcom = Xnew @ coef + intercept

    # Fitted model evaluated on the training inputs.
    Ycom = X @ coef + intercept

    Error = np.mean((Ynewref - Ynewcom)**2)
    ErrorQuad = np.mean((Y - Ycom)**2)

    # The division result used to be discarded, so the norm was never
    # normalised by the number of parameters.
    normp = (intercept**2 + np.sum(coef[:m]**2)) / (m + 1)
    return Error, ErrorQuad, normp

def running_mean(x, N):
    """Return the centred moving average of ``x`` over a window of ``N`` points.

    For an even ``N`` the window takes one more sample on the right than on
    the left. Near the ends of ``x`` the window is clipped to the available
    samples, so fewer than ``N`` values are averaged there.
    """
    length = len(x)
    smoothed = np.zeros_like(x, dtype=np.float64)

    half = (N - 1) // 2
    # Number of positions to the right of the centre, exclusive upper bound.
    reach = half + 2 if N % 2 == 0 else half + 1

    for idx in range(length):
        lo = max(0, idx - half)
        hi = min(length, idx + reach)
        smoothed[idx] = np.mean(x[lo:hi])
    return smoothed

In [44]:
def plot_overfitting_ridge(sigma,alpha,dim):
    """Plot ridge-regression errors and weight norm against the penalty.

    One training sample of 100 points is generated, then a model is fitted
    for each of 100 penalties evenly spaced in [0.01, 10]. Three panels are
    shown: generalisation error, training error (both with a 10-point
    running mean overlaid) and the weight norm.
    """
    n_train = 100
    penalties = np.linspace(0.01, 10, 100)
    beta, X, Y, eps = generate_data(sigma, alpha, n_train, dim)

    # One (Error, ErrorQuad, normp) triple per penalty value.
    results = [
        errorR(penalty, beta, alpha, X, Y, n_train, dim)
        for penalty in penalties
    ]
    gen_errors = [res[0] for res in results]
    train_errors = [res[1] for res in results]
    weight_norms = [res[2] for res in results]

    raw_style = dict(alpha=0.8, color='#0571b0')
    smooth_style = dict(color='#ca0020', linewidth=3, alpha=0.8)

    fig, axes = plt.subplots(1, 3, figsize=(18, 3.5))
    gen_ax, train_ax, norm_ax = axes[0], axes[1], axes[2]

    gen_ax.plot(penalties, gen_errors, 'o-', **raw_style)
    gen_ax.plot(penalties, running_mean(gen_errors, 10), **smooth_style)
    gen_ax.title.set_text('Erreur généralisation')

    train_ax.plot(penalties, train_errors, 'o-', **raw_style)
    train_ax.plot(penalties, running_mean(train_errors, 10), **smooth_style)
    train_ax.title.set_text('Erreur d entrainement')

    norm_ax.plot(penalties, weight_norms, 'o-', **raw_style)
    norm_ax.title.set_text('Norme des poids')

    plt.show()

In [45]:
# Sliders for the three arguments of plot_overfitting_ridge.
sigma = widgets.FloatSlider(
    description="sigma", value=0.01, min=0.01, max=0.05, step=0.01,
)
alpha = widgets.IntSlider(
    description="alpha", value=5, min=1, max=50, step=5,
)
dim = widgets.IntSlider(
    description="dim", value=50, min=20, max=400, step=5,
)

# Sliders laid out in a single horizontal box.
ui = widgets.HBox([sigma, alpha, dim])

# Output area bound to the plotting function, fed by the slider values.
out = widgets.interactive_output(
    plot_overfitting_ridge,
    dict(sigma=sigma, alpha=alpha, dim=dim),
)

In [46]:
# Show the slider box followed by the plot output area.
# `display` is not imported in this file; presumably the notebook (IPython) built-in.
display(ui,out)

HBox(children=(FloatSlider(value=0.01, description='sigma', max=0.05, min=0.01, step=0.01), IntSlider(value=5,…

Output()