Model Performance Comparison and Selection Strategy
Example Data
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
X, y = make_classification(
    n_samples=1000,           # number of samples
    n_features=2,             # number of features (dimensions)
    n_informative=2, n_redundant=0, n_repeated=0,
    n_classes=2,              # number of classes
    n_clusters_per_class=1,   # clusters per class
    weights=[0.8, 0.2],       # class proportions
    class_sep=0.5,            # separation between the classes
    flip_y=0.0,               # fraction of flipped (noisy) labels
    random_state=0,
)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
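Since weights=[0.8, 0.2] makes the positive class a minority, it is worth confirming how many samples of each class land in each split before fitting anything. A minimal sketch using numpy.bincount (not part of the original code):
import numpy as np
# per-class counts in the training and test splits
print(np.bincount(y_train), np.bincount(y_test))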
Linear Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
model = LinearDiscriminantAnalysis()
model.fit(x_train, y_train)
output
LinearDiscriminantAnalysis()
Confusion Matrix
from sklearn.metrics import confusion_matrix
import numpy as np
# predict the probability of the positive class
y_prob = model.predict_proba(x_test)[:, 1]
threshold = 0.5
y_pred = np.where(y_prob > threshold, 1, 0)
# build the confusion matrix
confusion_matrix(y_test, y_pred)
output
array([[252,   5],
       [ 33,  40]], dtype=int64)
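For binary labels, scikit-learn lays the matrix out with actual classes as rows and predicted classes as columns, so the four cells can be unpacked into named counts. A minimal sketch reusing y_test and y_pred from the cell above:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# given the matrix above: tn=252, fp=5, fn=33, tp=40
print(tn, fp, fn, tp)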
Accuracy
from sklearn.metrics import accuracy_score, precision_score, recall_score
accuracy_score(y_test, y_pred)
output
0.8848484848484849
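As a quick sanity check, accuracy is the share of correct predictions among all test samples, i.e. (TN + TP) / total, using the counts from the confusion matrix above:
# (TN + TP) / total
(252 + 40) / (252 + 5 + 33 + 40)   # ≈ 0.8848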
Precision
precision_score(y_test, y_pred)
output
0.8888888888888888
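Precision only looks at the samples predicted positive: TP / (TP + FP), again from the confusion matrix above:
# TP / (TP + FP)
40 / (40 + 5)   # ≈ 0.8889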
Recall
recall_score(y_test, y_pred)
output
0.547945205479452
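Recall only looks at the samples that are actually positive: TP / (TP + FN). The threshold experiments below trade this value off against precision:
# TP / (TP + FN)
40 / (40 + 33)   # ≈ 0.5479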
Lowering the Threshold
threshold = 0.4
y_pred = np.where(y_prob > threshold, 1, 0)
confusion_matrix(y_test, y_pred)
output
array([[247,  10],
       [ 27,  46]], dtype=int64)
precision_score(y_test, y_pred)
output
0.8214285714285714
recall_score(y_test, y_pred)
output
0.6301369863013698
Raising the Threshold
threshold = 0.6
y_pred = np.where(y_prob > threshold, 1, 0)
confusion_matrix(y_test, y_pred)
output
array([[253,   4],
       [ 39,  34]], dtype=int64)
precision_score(y_test, y_pred)
output
0.8947368421052632
recall_score(y_test, y_pred)
output
0.4657534246575342
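Rather than trying thresholds one at a time, the full precision-recall trade-off can be swept in a single call. A minimal sketch reusing y_test and y_prob from above with sklearn.metrics.precision_recall_curve (an addition, not part of the original code):
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob)
# thresholds[i] corresponds to precisions[i] and recalls[i];
# print every 20th candidate operating point
for t, p, r in list(zip(thresholds, precisions, recalls))[::20]:
    print(f"threshold={t:.2f}  precision={p:.3f}  recall={r:.3f}")
The threshold to pick is the one whose precision-recall balance matches the relative cost of false positives versus false negatives for the problem at hand.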