IMPORT THE ESSENTIAL LIBRARIES
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
DEFINE THE ESSENTIAL FUNCTIONS
def drop_columns(data_frame, column_name):
    # Return a copy of the data frame with the given column(s) removed
    return data_frame.drop(columns=column_name)
def best_correlated_features(data_frame, output_variable, pos_threshold=0.25, neg_threshold=-0.25):
    # Return the columns whose correlation with the output variable is at least
    # pos_threshold or at most neg_threshold. Note that the output column itself
    # always qualifies, since its self-correlation is 1.0.
    correlations = data_frame.corr()[output_variable]
    best_columns = []
    for name, value in correlations.items():
        if value >= pos_threshold or value <= neg_threshold:
            best_columns.append(name)
    return best_columns
def display_correlation(data_frame):
    # Plot an annotated heatmap of the pairwise correlation matrix
    plt.figure(figsize=(10, 7))
    sns.heatmap(data_frame.corr(), annot=True, cmap='CMRmap_r')
    plt.show()
def dataset_info(data_frame):
    # Summarize each feature's null count and dtype, then print the shape
    null_counts = data_frame.isnull().sum()
    info = pd.DataFrame({
        'features': null_counts.index,
        'null_values': null_counts.values,
        'data_type': data_frame.dtypes.values,
    })
    print(info)
    print('-' * 80)
    print(f'Shape of the data frame {data_frame.shape}')
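Of these helpers, drop_columns is never actually called below; for reference, a typical (hypothetical) call would look like:
drop_columns(df, 'SkinThickness')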
IMPORT THE DATASET
df = pd.read_csv('diabetes.csv')
dataset_info(df)
features null_values data_type
0 Pregnancies 0 int64
1 Glucose 0 int64
2 BloodPressure 0 int64
3 SkinThickness 0 int64
4 Insulin 0 int64
5 BMI 0 float64
6 DiabetesPedigreeFunction 0 float64
7 Age 0 int64
8 Outcome 0 int64
--------------------------------------------------------------------------
Shape of the data frame (768, 9)
print(df.head())
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \
0 6 148 72 35 0 33.6
1 1 85 66 29 0 26.6
2 8 183 64 0 0 23.3
3 1 89 66 23 94 28.1
4 0 137 40 35 168 43.1
DiabetesPedigreeFunction Age Outcome
0 0.627 50 1
1 0.351 31 0
2 0.672 32 1
3 0.167 21 0
4 2.288 33 1
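dataset_info reports no nulls, but the head() output already hints that several columns use 0 as a placeholder for a missing measurement (an Insulin or SkinThickness of 0 is not physiological). A quick count of those zeros, as a sketch:
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
print((df[zero_cols] == 0).sum())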
df.Outcome.value_counts()
0 500
1 268
Name: Outcome, dtype: int64
best_correlated_columns = best_correlated_features(df, 'Outcome', 0.21, -0.20)
print(best_correlated_columns)
['Pregnancies', 'Glucose', 'BMI', 'Age', 'Outcome']
X = df.iloc[:, :-1]
y = df.loc[:, ['Outcome']]
from imblearn.over_sampling import SMOTE
# Oversample the minority class so both outcomes are equally represented
sm = SMOTE(random_state=2)
X, y = sm.fit_resample(X, y)
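A quick sanity check on the class balance (recent imblearn versions return pandas objects, so value_counts applies directly; with 500 majority rows, both classes should now sit at 500):
print(y.value_counts())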
aug_df = pd.concat([pd.DataFrame(X), y], axis=1)
display_correlation(aug_df)
best_correlated_columns = best_correlated_features(aug_df, 'Outcome', 0.21, -0.20)
print(best_correlated_columns)
['Glucose', 'BMI', 'Age', 'Outcome']
# Drop the target from the feature list: its self-correlation is 1.0, so it always
# appears in best_correlated_columns, and leaving it in X would leak the label
feature_columns = [c for c in best_correlated_columns if c != 'Outcome']
X = aug_df.loc[:, feature_columns]
y = aug_df.loc[:, ['Outcome']]
# Scale the features to the [0, 1] range
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
# The labels are already 0/1, so min-max scaling them is a no-op apart from casting to float
y = MinMaxScaler().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
pd.DataFrame(y_train).value_counts()
0.0 400
1.0 400
dtype: int64
params = {
    'n_estimators': [100, 200, 500, 750, 1000],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'gamma': [i/10.0 for i in range(0, 5)],
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)],
    'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05, 0.1, 1],
    'learning_rate': [0.01, 0.02, 0.05, 0.1]
}
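This grid spans 5·4·3·5·4·4·7·4 = 134,400 combinations, far too many to search exhaustively with GridSearchCV, which is why a randomized search over a handful of sampled candidates is used instead. A quick size check:
print(np.prod([len(v) for v in params.values()]))  # 134400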
# 'silent' and 'nthread' are deprecated in recent xgboost releases;
# verbosity and n_jobs are their replacements
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    verbosity=0, n_jobs=1)
folds = 5
param_comb = 5
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
# Pass the splitter itself, not a one-shot split() generator,
# so every candidate is scored on the same folds
random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb,
                                   scoring='roc_auc', n_jobs=4, cv=skf,
                                   verbose=3, random_state=1001)
random_search.fit(X_train, y_train.ravel())
Fitting 5 folds for each of 5 candidates, totalling 25 fits
RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7f9e54a51820>,
estimator=XGBClassifier(learning_rate=0.02, n_estimators=600,
nthread=1, silent=True),
n_iter=5, n_jobs=4,
param_distributions={'colsample_bytree': [0.6, 0.7, 0.8,
0.9],
'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
'learning_rate': [0.01, 0.02, 0.05,
0.1],
'max_depth': [3, 5, 7, 9],
'min_child_weight': [1, 3, 5],
'n_estimators': [100, 200, 500, 750,
1000],
'reg_alpha': [0, 0.001, 0.005, 0.01,
0.05, 0.1, 1],
'subsample': [0.6, 0.7, 0.8, 0.9]},
random_state=1001, scoring='roc_auc', verbose=3)
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)
All results:
{'mean_fit_time': array([0.85436773, 0.12571445, 0.29614768, 0.40871468, 0.56435437]),
 'std_fit_time': array([0.05036582, 0.03244128, 0.06253441, 0.10844844, 0.06243001]),
 'mean_score_time': array([0.01597557, 0.01393166, 0.01356397, 0.01151047, 0.00780344]),
 'std_score_time': array([0.00593373, 0.00377552, 0.00399773, 0.00429262, 0.0047163 ]),
 'param_subsample': masked_array(data=[0.7, 0.7, 0.8, 0.8, 0.9], mask=[False, False, False, False, False], fill_value='?', dtype=object),
 'param_reg_alpha': masked_array(data=[1, 1, 0.001, 0.1, 1], mask=[False, False, False, False, False], fill_value='?', dtype=object),
 'param_n_estimators': masked_array(data=[750, 100, 200, 500, 750], mask=[False, False, False, False, False], fill_value='?', dtype=object),
 'param_min_child_weight': masked_array(data=[1, 1, 5, 5, 3], mask=[False, False, False, False, False], fill_value='?', dtype=object),
 'param_max_depth': masked_array(data=[3, 3, 5, 7, 7], mask=[False, False, False, False, False], fill_value='?', dtype=object),
 'param_learning_rate': masked_array(data=[0.01, 0.1, 0.01, 0.05, 0.05], mask=[False, False, False, False, False], fill_value='?', dtype=object),
 'param_gamma': masked_array(data=[0.2, 0.2, 0.2, 0.4, 0.4], mask=[False, False, False, False, False], fill_value='?', dtype=object),
 'param_colsample_bytree': masked_array(data=[0.8, 0.7, 0.9, 0.7, 0.8], mask=[False, False, False, False, False], fill_value='?', dtype=object),
 'params': [{'subsample': 0.7, 'reg_alpha': 1, 'n_estimators': 750, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.8},
            {'subsample': 0.7, 'reg_alpha': 1, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.2, 'colsample_bytree': 0.7},
            {'subsample': 0.8, 'reg_alpha': 0.001, 'n_estimators': 200, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.9},
            {'subsample': 0.8, 'reg_alpha': 0.1, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0.4, 'colsample_bytree': 0.7},
            {'subsample': 0.9, 'reg_alpha': 1, 'n_estimators': 750, 'min_child_weight': 3, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0.4, 'colsample_bytree': 0.8}],
 'split0_test_score': array([1., 1., 1., 1., 1.]),
 'split1_test_score': array([1., 1., 1., 1., 1.]),
 'split2_test_score': array([1., 1., 1., 1., 1.]),
 'split3_test_score': array([1., 1., 1., 1., 1.]),
 'split4_test_score': array([1., 1., 1., 1., 1.]),
 'mean_test_score': array([1., 1., 1., 1., 1.]),
 'std_test_score': array([0.00000000e+00, 4.96506831e-17, 4.96506831e-17, 4.96506831e-17, 0.00000000e+00]),
 'rank_test_score': array([1, 1, 1, 1, 1], dtype=int32)}

Best estimator:
XGBClassifier(colsample_bytree=0.8, gamma=0.2, learning_rate=0.01, n_estimators=750,
              nthread=1, reg_alpha=1, silent=True, subsample=0.7)

Best normalized gini score for 5-fold search with 5 parameter combinations:
1.0

Best hyperparameters:
{'subsample': 0.7, 'reg_alpha': 1, 'n_estimators': 750, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.8}
print(random_search.best_score_)
1.0
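The normalized Gini is simply 2·AUC − 1, and the held-out split gives a more honest read than the cross-validated training score. A minimal sketch using the roc_auc_score import from the top of the section (predict_proba is the standard scikit-learn API for class probabilities):
y_proba = random_search.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(np.ravel(y_test), y_proba)
print(f'test AUC = {test_auc:.3f}, normalized Gini = {2 * test_auc - 1:.3f}')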
y_pred = random_search.predict(X_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       100
         1.0       1.00      1.00      1.00       100

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200
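The confusion_matrix import from the top of the section can round out the evaluation; a minimal sketch:
cm = confusion_matrix(np.ravel(y_test), y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='CMRmap_r')
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()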
sns.pairplot(df.loc[:, best_correlated_columns], hue="Outcome")
plt.show()
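Scores of exactly 1.0 on every fold usually signal information leakage rather than a genuinely perfect model: in this walkthrough both SMOTE and the MinMaxScaler are fitted on the full dataset before the train/test split, so synthetic test rows are interpolated from training neighbours and the scaler has seen the test statistics. A leakage-free ordering, sketched below starting again from the original df and the same column names, splits first and resamples only the training portion:
feature_cols = [c for c in best_correlated_columns if c != 'Outcome']
X_raw, y_raw = df[feature_cols], df['Outcome']
X_tr, X_te, y_tr, y_te = train_test_split(X_raw, y_raw, test_size=0.2,
                                          random_state=42, stratify=y_raw)
X_tr, y_tr = SMOTE(random_state=2).fit_resample(X_tr, y_tr)  # oversample the training split only
scaler = MinMaxScaler().fit(X_tr)                            # fit scaling statistics on the training split only
X_tr, X_te = scaler.transform(X_tr), scaler.transform(X_te)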