import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv("cleandata.csv")
df.head()
| | Unnamed: 0 | SeniorCitizen | MonthlyCharges | TotalCharges | Churn | gender_Male | Partner_Yes | Dependents_Yes | PhoneService_Yes | MultipleLines_No phone service | ... | PaperlessBilling_Yes | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | tenure_group_12 - 23 | tenure_group_24 - 35 | tenure_group_36 - 47 | tenure_group_48 - 59 | tenure_group_60 - 71 | tenure_group_72 - 72 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 29.85 | 29.85 | 0 | 0 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 0 | 56.95 | 1889.50 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2 | 2 | 0 | 53.85 | 108.15 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 3 | 0 | 42.30 | 1840.75 | 0 | 1 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 4 | 0 | 70.70 | 151.65 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 37 columns
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()
| | SeniorCitizen | MonthlyCharges | TotalCharges | Churn | gender_Male | Partner_Yes | Dependents_Yes | PhoneService_Yes | MultipleLines_No phone service | MultipleLines_Yes | ... | PaperlessBilling_Yes | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | tenure_group_12 - 23 | tenure_group_24 - 35 | tenure_group_36 - 47 | tenure_group_48 - 59 | tenure_group_60 - 71 | tenure_group_72 - 72 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 29.85 | 29.85 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 56.95 | 1889.50 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2 | 0 | 53.85 | 108.15 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 42.30 | 1840.75 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 0 | 70.70 | 151.65 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 36 columns
x = df.drop('Churn', axis=1)
y = df['Churn']
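Churn datasets are usually imbalanced, which matters for the resampling step later on. A quick sanity check on the class distribution (assuming y from the cell above is in scope):

# Fraction of churners vs. non-churners in the target
print(y.value_counts(normalize=True))
# Passing stratify=y to train_test_split below would keep these
# proportions consistent across the train and test sets.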
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
model = DecisionTreeClassifier(criterion="gini", random_state=100, max_depth=6, min_samples_leaf=8)
model.fit(x_train, y_train)
DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)
predict = model.predict(x_test)
predict
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Print accuracy score
accuracy = accuracy_score(y_test, predict)
print("Accuracy:", accuracy)
# Print classification report
print("Classification Report:\n", classification_report(y_test, predict))
# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, predict))
Accuracy: 0.7742546142924751
Classification Report:
precision recall f1-score support
0 0.83 0.87 0.85 1564
1 0.57 0.51 0.54 549
accuracy 0.77 2113
macro avg 0.70 0.69 0.69 2113
weighted avg 0.77 0.77 0.77 2113
Confusion Matrix:
[[1358 206]
[ 271 278]]
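The raw confusion matrix can be easier to read as a plot. A minimal sketch using scikit-learn's ConfusionMatrixDisplay (assuming matplotlib is available and y_test and predict from above are in scope):

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the same confusion matrix as a labelled heatmap
ConfusionMatrixDisplay.from_predictions(y_test, predict)
plt.show()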
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
model_rf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=6, min_samples_leaf=8)
model_rf.fit(x_train, y_train)
RandomForestClassifier(max_depth=6, min_samples_leaf=8)
model_gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
model_gbm.fit(x_train, y_train)
GradientBoostingClassifier()
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(x_train, y_train)
KNeighborsClassifier()
model_svm = SVC(kernel='rbf', C=1.0, probability=True)
model_svm.fit(x_train, y_train)
SVC(probability=True)
models = [model_rf, model_gbm, model_knn, model_svm]
model_names = ['Random Forest', 'GBM', 'KNN', 'SVM']
for model, name in zip(models, model_names):
    y_pred = model.predict(x_test)
    accuracy = model.score(x_test, y_test)
    print(f"Classifier: {name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(metrics.classification_report(y_test, y_pred))
    print("------------")
Classifier: Random Forest
Accuracy: 0.79
precision recall f1-score support
0 0.81 0.93 0.87 1564
1 0.68 0.40 0.50 549
accuracy 0.79 2113
macro avg 0.75 0.66 0.68 2113
weighted avg 0.78 0.79 0.77 2113
------------
Classifier: GBM
Accuracy: 0.80
precision recall f1-score support
0 0.84 0.91 0.87 1564
1 0.66 0.49 0.56 549
accuracy 0.80 2113
macro avg 0.75 0.70 0.72 2113
weighted avg 0.79 0.80 0.79 2113
------------
Classifier: KNN
Accuracy: 0.77
precision recall f1-score support
0 0.82 0.89 0.85 1564
1 0.58 0.44 0.50 549
accuracy 0.77 2113
macro avg 0.70 0.66 0.68 2113
weighted avg 0.76 0.77 0.76 2113
------------
Classifier: SVM
Accuracy: 0.74
precision recall f1-score support
0 0.74 1.00 0.85 1564
1 0.00 0.00 0.00 549
accuracy 0.74 2113
macro avg 0.37 0.50 0.43 2113
weighted avg 0.55 0.74 0.63 2113
------------
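The SVM predicts only the majority class here (recall of 0.00 for churners), which is what typically happens when an RBF kernel sees unscaled features; note that StandardScaler was imported at the top but never applied. A hedged sketch of how scaling could be wired in with a Pipeline — this is not what was run above, just one way to address it:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Scale features before the kernel; distance-based models
# such as SVM and KNN are sensitive to feature magnitudes
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='rbf', C=1.0, probability=True)),
])
svm_pipeline.fit(x_train, y_train)
print(svm_pipeline.score(x_test, y_test))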
sm = SMOTEENN()
x_resampled, y_resampled = sm.fit_resample(x_train, y_train)
xr_train, xr_test, yr_train, yr_test = train_test_split(x_resampled, y_resampled, test_size=0.3)
model_smote = DecisionTreeClassifier(criterion="gini", random_state=100, max_depth=6, min_samples_leaf=8)
model_smote.fit(xr_train, yr_train)
yr_pred_smote = model_smote.predict(xr_test)
model_score_r = model_smote.score(xr_test, yr_test)
print(round(model_score_r, 2))
print(metrics.classification_report(yr_test, yr_pred_smote))
print(metrics.confusion_matrix(yr_test, yr_pred_smote))
0.9
precision recall f1-score support
0 0.86 0.95 0.90 585
1 0.95 0.85 0.90 632
accuracy 0.90 1217
macro avg 0.90 0.90 0.90 1217
weighted avg 0.90 0.90 0.90 1217
[[554 31]
[ 93 539]]
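One caveat: xr_test is carved out of the resampled data, so it contains synthetic minority samples and the scores above may be optimistic. A sketch of a stricter check, scoring the resampled-trained tree on the original, untouched test split from earlier:

# Evaluate on the original test set, which SMOTEENN never saw
print(model_smote.score(x_test, y_test))
print(metrics.classification_report(y_test, model_smote.predict(x_test)))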
As we saw above, Random Forest performed better than the baseline decision tree even without SMOTEENN; let's see how it performs after SMOTEENN resampling.
model_rf_smote = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=6, min_samples_leaf=8)
model_rf_smote.fit(xr_train, yr_train)
yr_pred_smote = model_rf_smote.predict(xr_test)
model_score_r = model_rf_smote.score(xr_test, yr_test)
print(round(model_score_r, 2))
print(metrics.classification_report(yr_test, yr_pred_smote))
print(metrics.confusion_matrix(yr_test, yr_pred_smote))
0.94
precision recall f1-score support
0 0.96 0.92 0.94 585
1 0.93 0.96 0.94 632
accuracy 0.94 1217
macro avg 0.94 0.94 0.94 1217
weighted avg 0.94 0.94 0.94 1217
[[537 48]
[ 25 607]]
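A random forest also gives impurity-based feature importances for free, which can hint at which columns drive the churn prediction. A minimal sketch (assuming model_rf_smote and x from above are in scope):

# Top 10 features by impurity-based importance
importances = pd.Series(model_rf_smote.feature_importances_, index=x.columns)
print(importances.sort_values(ascending=False).head(10))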
model_gbm_smote = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5)
model_gbm_smote.fit(xr_train, yr_train)
yr_pred_smote = model_gbm_smote.predict(xr_test)
model_score_r = model_gbm_smote.score(xr_test, yr_test)
print(round(model_score_r, 2))
print(metrics.classification_report(yr_test, yr_pred_smote))
print(metrics.confusion_matrix(yr_test, yr_pred_smote))
0.96
precision recall f1-score support
0 0.96 0.95 0.96 585
1 0.96 0.96 0.96 632
accuracy 0.96 1217
macro avg 0.96 0.96 0.96 1217
weighted avg 0.96 0.96 0.96 1217
[[558 27]
[ 25 607]]
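Gradient boosting keeps adding trees whether or not they still help; scikit-learn's GradientBoostingClassifier supports early stopping via n_iter_no_change and validation_fraction. A hedged sketch, not tuned for this data:

# Stop adding trees once the internal validation score has not
# improved for 10 consecutive iterations
model_gbm_es = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=5,
    n_iter_no_change=10,
    validation_fraction=0.1,
)
model_gbm_es.fit(xr_train, yr_train)
print(model_gbm_es.n_estimators_, "trees actually fit")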
model_knn_smote = KNeighborsClassifier(n_neighbors=5)
model_knn_smote.fit(xr_train, yr_train)
yr_pred_smote = model_knn_smote.predict(xr_test)
model_score_r = model_knn_smote.score(xr_test, yr_test)
print(round(model_score_r, 2))
print(metrics.classification_report(yr_test, yr_pred_smote))
print(metrics.confusion_matrix(yr_test, yr_pred_smote))
0.95
precision recall f1-score support
0 0.94 0.95 0.95 585
1 0.96 0.94 0.95 632
accuracy 0.95 1217
macro avg 0.95 0.95 0.95 1217
weighted avg 0.95 0.95 0.95 1217
[[558 27]
[ 36 596]]
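n_neighbors=5 is just the default; a small grid search can check whether another k works better on the resampled data. A minimal sketch using GridSearchCV (the 5-fold CV and the k grid are assumptions, not what was run above):

from sklearn.model_selection import GridSearchCV

# Search odd k values to avoid ties in binary majority voting
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid.fit(xr_train, yr_train)
print(grid.best_params_, grid.best_score_)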
import pickle
knnmodel = 'knnchurnmodel.sav'
with open(knnmodel, 'wb') as f:
    pickle.dump(model_knn_smote, f)
rfmodel = 'rfchurnmodel.sav'
with open(rfmodel, 'wb') as f:
    pickle.dump(model_rf_smote, f)
gbm_model = 'gbmchurnmodel.sav'
with open(gbm_model, 'wb') as f:
    pickle.dump(model_gbm_smote, f)
with open(gbm_model, 'rb') as f:
    load_model = pickle.load(f)
gbm_model_score = load_model.score(xr_test, yr_test)
gbm_model_score
0.9572719802793755
with open(rfmodel, 'rb') as f:
    load_model = pickle.load(f)
rf_model_score = load_model.score(xr_test, yr_test)
rf_model_score
0.9400164338537387
with open(knnmodel, 'rb') as f:
    load_model = pickle.load(f)
knn_model_score = load_model.score(xr_test, yr_test)
knn_model_score
0.9482333607230896
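For scikit-learn estimators, joblib is a common alternative to pickle and is typically more efficient for models that carry large NumPy arrays. An equivalent sketch (the .joblib filename is just an example, not one of the files saved above):

import joblib

# Same idea as pickle.dump/pickle.load, with better handling
# of large NumPy arrays inside the estimator
joblib.dump(model_knn_smote, 'knnchurnmodel.joblib')
restored = joblib.load('knnchurnmodel.joblib')
print(restored.score(xr_test, yr_test))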
I have saved three models: "rfchurnmodel.sav" (Random Forest), "gbmchurnmodel.sav" (Gradient Boosting), and "knnchurnmodel.sav" (K-Nearest Neighbours).
Now I will use knnchurnmodel.sav as my final model and build an API so the model can be accessed from the UI.
With this in place, users can make use of the model's predictions directly through the user interface.
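As a rough illustration of that API step, a minimal Flask endpoint could load knnchurnmodel.sav and serve predictions. Everything here (the route name, the JSON shape) is a hypothetical sketch, not the final implementation:

import pickle
import pandas as pd
from flask import Flask, request, jsonify

app = Flask(__name__)

# Load the saved KNN model once at startup
with open('knnchurnmodel.sav', 'rb') as f:
    model = pickle.load(f)

@app.route('/predict', methods=['POST'])  # hypothetical route
def predict():
    # Expect a JSON object whose keys match the training columns
    features = pd.DataFrame([request.get_json()])
    churn = int(model.predict(features)[0])
    return jsonify({'churn': churn})

if __name__ == '__main__':
    app.run()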