회귀분석

import pandas as pd
from statsmodels.formula.api import ols

# Load the car data set from the Excel workbook
df = pd.read_excel("car.xlsx")

# Fit an ordinary-least-squares model with price as the response and
# mileage as the only predictor
m = ols(formula="price ~ mileage", data=df).fit()

# Display the fitted model's summary table
m.summary()

output

c:\Users\eupho\anaconda3\lib\site-packages\pandas\core\arrays\masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (

<class 'statsmodels.iolib.summary.Summary'>
"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.457
Model:                            OLS   Adj. R-squared:                  0.455
Method:                 Least Squares   F-statistic:                     229.1
Date:                Thu, 08 Aug 2024   Prob (F-statistic):           5.81e-38
Time:                        16:58:38   Log-Likelihood:                -1895.7
No. Observations:                 274   AIC:                             3795.
Df Residuals:                     272   BIC:                             3803.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   1258.7668     30.599     41.137      0.000    1198.526    1319.008
mileage       -0.0052      0.000    -15.136      0.000      -0.006      -0.005
==============================================================================
Omnibus:                        0.258   Durbin-Watson:                   1.101
Prob(Omnibus):                  0.879   Jarque-Bera (JB):                0.108
Skew:                           0.032   Prob(JB):                        0.947
Kurtosis:                       3.074   Cond. No.                     1.83e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.83e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
"""

예측

# Build a small frame holding the mileage values to score
new_df = pd.DataFrame({"mileage": [10_000, 20_000]})

# Pass the new rows to the fitted model to get predicted prices
m.predict(exog=new_df)

output

0    1206.483684
1    1154.200600
dtype: float64

다중회귀분석

# Multiple regression: price explained by both mileage and year
ols(formula="price ~ mileage + year", data=df).fit().summary()

output

<class 'statsmodels.iolib.summary.Summary'>
"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.749
Model:                            OLS   Adj. R-squared:                  0.747
Method:                 Least Squares   F-statistic:                     403.5
Date:                Thu, 08 Aug 2024   Prob (F-statistic):           5.62e-82
Time:                        16:58:39   Log-Likelihood:                -1790.2
No. Observations:                 274   AIC:                             3586.
Df Residuals:                     271   BIC:                             3597.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -1.688e+05   9597.865    -17.592      0.000   -1.88e+05    -1.5e+05
mileage       -0.0023      0.000     -8.143      0.000      -0.003      -0.002
year          84.3822      4.761     17.724      0.000      75.009      93.755
==============================================================================
Omnibus:                       11.272   Durbin-Watson:                   1.598
Prob(Omnibus):                  0.004   Jarque-Bera (JB):               11.786
Skew:                           0.435   Prob(JB):                      0.00276
Kurtosis:                       3.523   Cond. No.                     8.41e+07
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 8.41e+07. This might indicate that there are
strong multicollinearity or other numerical problems.
"""

표준화

# Same two predictors, each wrapped in the formula-level scale() transform
ols(formula="price ~ scale(mileage) + scale(year)", data=df).fit().summary()

output

<class 'statsmodels.iolib.summary.Summary'>
"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.749
Model:                            OLS   Adj. R-squared:                  0.747
Method:                 Least Squares   F-statistic:                     403.5
Date:                Thu, 08 Aug 2024   Prob (F-statistic):           5.62e-82
Time:                        16:58:39   Log-Likelihood:                -1790.2
No. Observations:                 274   AIC:                             3586.
Df Residuals:                     271   BIC:                             3597.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept        853.6606     10.112     84.419      0.000     833.752     873.569
scale(mileage)  -100.2044     12.306     -8.143      0.000    -124.431     -75.978
scale(year)      218.1006     12.306     17.724      0.000     193.874     242.327
==============================================================================
Omnibus:                       11.272   Durbin-Watson:                   1.598
Prob(Omnibus):                  0.004   Jarque-Bera (JB):               11.786
Skew:                           0.435   Prob(JB):                      0.00276
Kurtosis:                       3.523   Cond. No.                         1.91
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""

더미코딩

# Regress price on the categorical model column
ols(formula="price ~ model", data=df).fit().summary()

output

<class 'statsmodels.iolib.summary.Summary'>
"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     3.039
Date:                Thu, 08 Aug 2024   Prob (F-statistic):             0.0824
Time:                        16:58:39   Log-Likelihood:                -1977.9
No. Observations:                 274   AIC:                             3960.
Df Residuals:                     272   BIC:                             3967.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
===============================================================================
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept     833.4146     23.144     36.009      0.000     787.850     878.980
model[T.K3]    80.3970     46.121      1.743      0.082     -10.402     171.196
==============================================================================
Omnibus:                       13.893   Durbin-Watson:                   0.528
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               15.007
Skew:                           0.573   Prob(JB):                     0.000551
Kurtosis:                       3.002   Cond. No.                         2.48
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""

범주가 3개일 경우

# Load the depression data set from the Excel workbook
dep = pd.read_excel("depression.xlsx")

# Regress the outcome y on the treatment column TRT
ols(formula="y ~ TRT", data=dep).fit().summary()

output

<class 'statsmodels.iolib.summary.Summary'>
"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.172
Model:                            OLS   Adj. R-squared:                  0.122
Method:                 Least Squares   F-statistic:                     3.424
Date:                Thu, 08 Aug 2024   Prob (F-statistic):             0.0445
Time:                        16:58:39   Log-Likelihood:                -137.86
No. Observations:                  36   AIC:                             281.7
Df Residuals:                      33   BIC:                             286.5
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     62.3333      3.359     18.557      0.000      55.500      69.167
TRT[T.B]     -10.4167      4.750     -2.193      0.035     -20.081      -0.752
TRT[T.C]     -11.0833      4.750     -2.333      0.026     -20.748      -1.419
==============================================================================
Omnibus:                        0.553   Durbin-Watson:                   1.488
Prob(Omnibus):                  0.758   Jarque-Bera (JB):                0.544
Skew:                          -0.267   Prob(JB):                        0.762
Kurtosis:                       2.721   Cond. No.                         3.73
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""

선형 회귀분석과 분산 분석

import pingouin as pg

# ANOVA of y with TRT as the between-subjects factor
pg.anova(data=dep, dv="y", between="TRT")

output

  Source  ddof1  ddof2         F     p-unc       np2
0    TRT      2     33  3.424087  0.044539  0.171857

c:\Users\eupho\anaconda3\lib\site-packages\outdated\utils.py:14: OutdatedPackageWarning: The package pingouin is out of date. Your version is 0.5.3, the latest is 0.5.4.
Set the environment variable OUTDATED_IGNORE=1 to disable these warnings.
  return warn(

기준 범주 바꾸기

# Same model as price ~ model, but with "K3" set as the reference level
# through Treatment coding
ols(formula='price ~ C(model, Treatment("K3"))', data=df).fit().summary()

output

<class 'statsmodels.iolib.summary.Summary'>
"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     3.039
Date:                Thu, 08 Aug 2024   Prob (F-statistic):             0.0824
Time:                        16:58:40   Log-Likelihood:                -1977.9
No. Observations:                 274   AIC:                             3960.
Df Residuals:                     272   BIC:                             3967.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
=======================================================================================================
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
Intercept                             913.8116     39.893     22.906      0.000     835.273     992.350
C(model, Treatment("K3"))[T.Avante]   -80.3970     46.121     -1.743      0.082    -171.196      10.402
==============================================================================
Omnibus:                       13.893   Durbin-Watson:                   0.528
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               15.007
Skew:                           0.573   Prob(JB):                     0.000551
Kurtosis:                       3.002   Cond. No.                         3.76
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""

무작위 데이터의 예시

무작위로 만든 데이터라도 표본 크기에 비해 독립변수가 많으면 R제곱이 높게 나온다. 아래 예시는 관측치 10개에 독립변수 9개와 절편을 넣어 추정할 모수가 10개이므로, 잔차 자유도가 0이 되어 R제곱이 1.000으로 나온다.

import numpy as np
# Random 10 x 21 table: the first column is named y and the remaining
# twenty are named x0 .. x19
d = pd.DataFrame(
    np.random.random(size=(10, 21)),
    columns=["y", *(f"x{i}" for i in range(20))],
)
# Regress y on the predictors x0 .. x8; the right-hand side of the formula
# is joined from the column names instead of being typed out by hand
ols("y ~ " + " + ".join(f"x{i}" for i in range(9)), d).fit().summary()

output

<class 'statsmodels.iolib.summary.Summary'>
"""
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Thu, 08 Aug 2024   Prob (F-statistic):                nan
Time:                        16:59:03   Log-Likelihood:                 318.98
No. Observations:                  10   AIC:                            -618.0
Df Residuals:                       0   BIC:                            -614.9
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.9427        inf          0        nan         nan         nan
x0            -0.4604        inf         -0        nan         nan         nan
x1             1.2115        inf          0        nan         nan         nan
x2            -0.2894        inf         -0        nan         nan         nan
x3            -0.5839        inf         -0        nan         nan         nan
x4            -1.0369        inf         -0        nan         nan         nan
x5            -1.5566        inf         -0        nan         nan         nan
x6             1.7873        inf          0        nan         nan         nan
x7            -1.2252        inf         -0        nan         nan         nan
x8             0.4618        inf          0        nan         nan         nan
==============================================================================
Omnibus:                        1.046   Durbin-Watson:                   0.071
Prob(Omnibus):                  0.593   Jarque-Bera (JB):                0.826
Skew:                           0.543   Prob(JB):                        0.662
Kurtosis:                       2.104   Cond. No.                         69.6
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""

교차 검증

from sklearn.model_selection import train_test_split

# Split the original data: 20% of the rows go to the test set, and the
# fixed random_state seed makes the same split come out on every run
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Fit the model on the training rows only
m = ols(formula="price ~ year", data=train_df).fit()

# Predict for the test rows
y_pred = m.predict(exog=test_df)

# Mean squared error between the actual test-set prices and the predictions
from sklearn.metrics import mean_squared_error

mean_squared_error(y_true=test_df["price"], y_pred=y_pred)

output

34805.44825035994

예측

다중회귀분석

표준화

더미코딩

선형 회귀분석과 분산 분석

기준 범주 바꾸기

무작위 데이터의 예시

교차 검증

예측

다중회귀분석

표준화

더미코딩

선형 회귀분석과 분산 분석

기준 범주 바꾸기

무작위 데이터의 예시

교차 검증