불균형 데이터

데이터 생성

from sklearn.datasets import make_classification
# Generate a synthetic two-class, two-feature data set with a 2:98 class ratio.
X, y = make_classification(
    n_samples=1000,           # number of samples
    n_features=2,             # number of features (dimensions)
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,              # number of classes
    n_clusters_per_class=1,   # number of clusters per class
    weights=[0.02, 0.98],     # class proportions
    class_sep=1.0,            # separation between the classes
    flip_y=0.0,               # misclassification (label noise) rate
    random_state=0,
)

import matplotlib.pyplot as plt
# Scatter the two features against each other, colored by class label.
plt.scatter(X[:, 0], X[:, 1], c=y, alpha=0.5)

output

<matplotlib.collections.PathCollection at 0x1fac77a6730>

<Figure size 640x480 with 1 Axes>

선형판별분석

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Fit LDA on the original (imbalanced) data and score it on that same data.
lda = LinearDiscriminantAnalysis().fit(X, y)
lda.score(X, y)

output

0.992

시각화

import numpy as np
# Plot window: the observed range of each feature, padded by 1 on both sides.
x_min = X[:, 0].min() - 1  # horizontal axis
x_max = X[:, 0].max() + 1
y_min = X[:, 1].min() - 1  # vertical axis
y_max = X[:, 1].max() + 1
# Build a 200 x 200 grid of points covering the plot window.
xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, 200),
    np.linspace(y_min, y_max, 200),
)
# Predict a class for every grid point, then restore the grid's 2-D shape.
Z = lda.predict(np.column_stack((xx.ravel(), yy.ravel()))).reshape(xx.shape)
# Scatter the data points, colored by class label.
plt.scatter(X[:, 0], X[:, 1], c=y, alpha=0.5, edgecolor='k')
# Shade the predicted regions behind the points to show the decision boundary.
plt.contourf(
    xx, yy, Z,
    levels=np.linspace(Z.min(), Z.max(), 3),
    alpha=0.3,
    zorder=-1,
)

output

<matplotlib.contour.QuadContourSet at 0x1fac9dba8b0>

<Figure size 640x480 with 1 Axes>

오버샘플링

랜덤 오버샘플링

from imblearn.over_sampling import RandomOverSampler
# Random oversampling of (X, y), seeded so the result is reproducible.
sampler = RandomOverSampler(random_state=0)
# X_resampled / y_resampled hold the resampled features and labels.
X_resampled, y_resampled = sampler.fit_resample(X, y)

X.shape # original data: class counts are 980 vs. 20

output

(1000, 2)

X_resampled.shape # after oversampling: class counts are 980 vs. 980

output

(1960, 2)

# Fit LDA on the resampled data, then score it on the original X, y.
lda = LinearDiscriminantAnalysis().fit(X_resampled, y_resampled)
lda.score(X, y)

output

0.89

import numpy as np
# Plot window: the observed range of each original feature, padded by 1.
x_min = X[:, 0].min() - 1  # horizontal axis
x_max = X[:, 0].max() + 1
y_min = X[:, 1].min() - 1  # vertical axis
y_max = X[:, 1].max() + 1
# Build a 200 x 200 grid of points covering the plot window.
xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, 200),
    np.linspace(y_min, y_max, 200),
)
# Predict a class for every grid point, then restore the grid's 2-D shape.
Z = lda.predict(np.column_stack((xx.ravel(), yy.ravel()))).reshape(xx.shape)
# Scatter the resampled data points, colored by class label.
plt.scatter(
    X_resampled[:, 0], X_resampled[:, 1],
    c=y_resampled, alpha=0.5, edgecolor='k',
)
# Shade the predicted regions behind the points to show the decision boundary.
plt.contourf(
    xx, yy, Z,
    levels=np.linspace(Z.min(), Z.max(), 3),
    alpha=0.3,
    zorder=-1,
)

output

<matplotlib.contour.QuadContourSet at 0x1facb514eb0>

<Figure size 640x480 with 1 Axes>

SMOTE

# 107p
from imblearn.over_sampling import SMOTE
# Oversample (X, y) with SMOTE, seeded so the result is reproducible.
sampler = SMOTE(random_state=0)
X_resampled, y_resampled = sampler.fit_resample(X, y)

# Linear discriminant analysis: fit on the resampled data, score on the
# original X, y.
lda = LinearDiscriminantAnalysis().fit(X_resampled, y_resampled)
lda.score(X, y)

output

0.891

리샘플된 데이터로 점수를 매기면 더 잘 나옴 -> 모델을 바로 그 리샘플된 데이터로 학습시켰기 때문.

궁극적인 목적은 원본 데이터 X, y를 더 잘 분류하는 것이고, 그중에서도 특히 소수 클래스(minority)를 더 잘 분류하려는 것.

# Score the model on the resampled data rather than on the original X, y.
lda.score(X_resampled, y_resampled)

output

0.9306122448979591

import numpy as np
# Plot window: the observed range of each original feature, padded by 1.
x_min = X[:, 0].min() - 1  # horizontal axis
x_max = X[:, 0].max() + 1
y_min = X[:, 1].min() - 1  # vertical axis
y_max = X[:, 1].max() + 1
# Build a 200 x 200 grid of points covering the plot window.
xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, 200),
    np.linspace(y_min, y_max, 200),
)
# Predict a class for every grid point, then restore the grid's 2-D shape.
Z = lda.predict(np.column_stack((xx.ravel(), yy.ravel()))).reshape(xx.shape)
# Scatter the resampled data points (swapped in from the p. 105 code).
plt.scatter(
    X_resampled[:, 0], X_resampled[:, 1],
    c=y_resampled, alpha=0.5, edgecolor='k',
)
# Shade the predicted regions behind the points to show the decision boundary.
plt.contourf(
    xx, yy, Z,
    levels=np.linspace(Z.min(), Z.max(), 3),
    alpha=0.3,
    zorder=-1,
)

output

<matplotlib.contour.QuadContourSet at 0x1facb3f1b50>

<Figure size 640x480 with 1 Axes>

ADASYN

from imblearn.over_sampling import ADASYN
# Oversample (X, y) with ADASYN, seeded so the result is reproducible.
sampler = ADASYN(random_state=0)
X_resampled, y_resampled = sampler.fit_resample(X, y)

# Linear discriminant analysis: fit on the resampled data, score on the
# original X, y.
lda = LinearDiscriminantAnalysis().fit(X_resampled, y_resampled)
lda.score(X, y)

output

0.842

import numpy as np
# Plot window: the observed range of each original feature, padded by 1.
x_min = X[:, 0].min() - 1  # horizontal axis
x_max = X[:, 0].max() + 1
y_min = X[:, 1].min() - 1  # vertical axis
y_max = X[:, 1].max() + 1
# Build a 200 x 200 grid of points covering the plot window.
xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, 200),
    np.linspace(y_min, y_max, 200),
)
# Predict a class for every grid point, then restore the grid's 2-D shape.
Z = lda.predict(np.column_stack((xx.ravel(), yy.ravel()))).reshape(xx.shape)
# Scatter the resampled data points (swapped in from the p. 105 code).
plt.scatter(
    X_resampled[:, 0], X_resampled[:, 1],
    c=y_resampled, alpha=0.5, edgecolor='k',
)
# Shade the predicted regions behind the points to show the decision boundary.
plt.contourf(
    xx, yy, Z,
    levels=np.linspace(Z.min(), Z.max(), 3),
    alpha=0.3,
    zorder=-1,
)

output

<matplotlib.contour.QuadContourSet at 0x1facb691250>

<Figure size 640x480 with 1 Axes>

언더샘플링

랜덤 언더샘플링

from imblearn.under_sampling import RandomUnderSampler
# Random undersampling of (X, y), seeded so the result is reproducible.
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Linear discriminant analysis: fit on the resampled data, score on the
# original X, y.
lda = LinearDiscriminantAnalysis().fit(X_resampled, y_resampled)
lda.score(X, y)

output

0.845

import numpy as np
# Plot window: the observed range of each original feature, padded by 1.
x_min = X[:, 0].min() - 1  # horizontal axis
x_max = X[:, 0].max() + 1
y_min = X[:, 1].min() - 1  # vertical axis
y_max = X[:, 1].max() + 1
# Build a 200 x 200 grid of points covering the plot window.
xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, 200),
    np.linspace(y_min, y_max, 200),
)
# Predict a class for every grid point, then restore the grid's 2-D shape.
Z = lda.predict(np.column_stack((xx.ravel(), yy.ravel()))).reshape(xx.shape)
# Scatter the resampled data points, colored by class label.
plt.scatter(
    X_resampled[:, 0], X_resampled[:, 1],
    c=y_resampled, alpha=0.5, edgecolor='k',
)
# Shade the predicted regions behind the points to show the decision boundary.
plt.contourf(
    xx, yy, Z,
    levels=np.linspace(Z.min(), Z.max(), 3),
    alpha=0.3,
    zorder=-1,
)

output

<matplotlib.contour.QuadContourSet at 0x1fac9f64ac0>

<Figure size 640x480 with 1 Axes>

Near Miss

from imblearn.under_sampling import NearMiss
# Undersample (X, y) with NearMiss using its default settings.
sampler = NearMiss()
X_resampled, y_resampled = sampler.fit_resample(X, y)

# Linear discriminant analysis: fit on the resampled data, score on the
# original X, y.
lda = LinearDiscriminantAnalysis().fit(X_resampled, y_resampled)
lda.score(X, y)

output

0.989

import numpy as np
# Plot window: the observed range of each original feature, padded by 1.
x_min = X[:, 0].min() - 1  # horizontal axis
x_max = X[:, 0].max() + 1
y_min = X[:, 1].min() - 1  # vertical axis
y_max = X[:, 1].max() + 1
# Build a 200 x 200 grid of points covering the plot window.
xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, 200),
    np.linspace(y_min, y_max, 200),
)
# Predict a class for every grid point, then restore the grid's 2-D shape.
Z = lda.predict(np.column_stack((xx.ravel(), yy.ravel()))).reshape(xx.shape)
# Scatter the resampled data points, colored by class label.
plt.scatter(
    X_resampled[:, 0], X_resampled[:, 1],
    c=y_resampled, alpha=0.5, edgecolor='k',
)
# Shade the predicted regions behind the points to show the decision boundary.
plt.contourf(
    xx, yy, Z,
    levels=np.linspace(Z.min(), Z.max(), 3),
    alpha=0.3,
    zorder=-1,
)

output

<matplotlib.contour.QuadContourSet at 0x1facb485d30>

<Figure size 640x480 with 1 Axes>

TomekLinks

from imblearn.under_sampling import TomekLinks
# Undersample (X, y) with TomekLinks using its default settings.
sampler = TomekLinks()
X_resampled, y_resampled = sampler.fit_resample(X, y)

# Linear discriminant analysis: fit on the resampled data, score on the
# original X, y.
lda = LinearDiscriminantAnalysis().fit(X_resampled, y_resampled)
lda.score(X, y)

output

0.993

import numpy as np
# Plot window: the observed range of each original feature, padded by 1.
x_min = X[:, 0].min() - 1  # horizontal axis
x_max = X[:, 0].max() + 1
y_min = X[:, 1].min() - 1  # vertical axis
y_max = X[:, 1].max() + 1
# Build a 200 x 200 grid of points covering the plot window.
xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, 200),
    np.linspace(y_min, y_max, 200),
)
# Predict a class for every grid point, then restore the grid's 2-D shape.
Z = lda.predict(np.column_stack((xx.ravel(), yy.ravel()))).reshape(xx.shape)
# Scatter the resampled data points, colored by class label.
plt.scatter(
    X_resampled[:, 0], X_resampled[:, 1],
    c=y_resampled, alpha=0.5, edgecolor='k',
)
# Shade the predicted regions behind the points to show the decision boundary.
plt.contourf(
    xx, yy, Z,
    levels=np.linspace(Z.min(), Z.max(), 3),
    alpha=0.3,
    zorder=-1,
)

output

<matplotlib.contour.QuadContourSet at 0x1facb622460>

<Figure size 640x480 with 1 Axes>

퀴즈

사용자 정보 입력

퀴즈를 시작하기 전에 이름과 소속을 입력해주세요.

이름

별명

소속

선형판별분석​

오버샘플링​

랜덤 오버샘플링​

SMOTE​

ADASYN​

언더샘플링​

랜덤 언더샘플링​

Near Miss​

TomekLinks​

퀴즈​

Q&A​

선형판별분석

오버샘플링

랜덤 오버샘플링

SMOTE

ADASYN

언더샘플링

랜덤 언더샘플링

Near Miss

TomekLinks

퀴즈

Q&A