import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")


df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')


df.shape

(7043, 21)


df.head()


df.columns.values

array(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges',
       'TotalCharges', 'Churn'], dtype=object)


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   object 
 20  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB


df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


df.describe()


cda = df.copy()


cda.TotalCharges = pd.to_numeric(cda.TotalCharges, errors = 'coerce')


cda.describe()


cda.isnull().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64


cda.loc[cda['TotalCharges'].isnull()== True]


cda['MonthlyCharges'].corr(cda['TotalCharges'])

0.6510648032262024


cda[cda['MonthlyCharges'].between(19, 25)].median()

SeniorCitizen       0.000
tenure             22.000
MonthlyCharges     20.050
TotalCharges      460.225
dtype: float64


cda[cda['MonthlyCharges'].between(50, 80)].median()

SeniorCitizen        0.00
tenure              22.00
MonthlyCharges      69.20
TotalCharges      1395.05
dtype: float64


cda['TotalCharges'] = np.where((cda['TotalCharges'].isnull()) & (cda['MonthlyCharges'].between(19, 26)), 460.225, cda['TotalCharges'])


# cda.loc[cda['TotalCharges'].isnull()== True]


cda['TotalCharges'] = np.where((cda['TotalCharges'].isnull()) & (cda['MonthlyCharges'].between(52, 81)), 1395.05, cda['TotalCharges'])


cda.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


cda.describe()


cda.duplicated().sum()

0


cda['tenure'].min()

0


cda['tenure'].max()

72


max_tenure = 72
bins = range(0, max_tenure + 13, 12)  # Increment the upper limit of the range by 12
labels = ["{0} - {1}".format(i, min(i + 11, max_tenure)) for i in bins[:-1]]
cda['tenure_group'] = pd.cut(cda['tenure'], bins=bins, labels=labels, right=False)


cda['tenure_group'].value_counts()

0 - 11     2069
60 - 71    1121
12 - 23    1047
24 - 35     876
48 - 59     820
36 - 47     748
72 - 72     362
Name: tenure_group, dtype: int64


cda.drop(['customerID'], axis=1, inplace=True)


cda.head()


cda['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64


cda['Churn'].value_counts(normalize=True)

No     0.73463
Yes    0.26537
Name: Churn, dtype: float64


def plot_show(data, x_column, hue_column, palette=None, figsize=(6, 4)):
    plt.figure(figsize=figsize)
    sns.countplot(x=x_column, hue=hue_column, data=data, palette={'No': '#1984c5', 'Yes': '#a7d5ed'})

    x_label = x_column
    y_label = 'Count'
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    # Add annotations for 'No' and 'Yes' counts in 'Churn' column
    ax = plt.gca()  # Get the current axes
    for p in ax.patches:
        if p.get_height() > 0:  # Only add text for non-zero counts
            ax.annotate(f'{p.get_height()}', (p.get_x() + p.get_width() / 2, p.get_height()), ha='center', va='bottom')

    plt.show()


plot_show(cda, 'gender', 'Churn')


def calculate_count_and_percentage(data, column_name):
    # Calculate count and percentage for each category in the specified column
    count_percentage= data[column_name].value_counts(dropna=False).reset_index()
    count_percentage.columns = [column_name, 'Count']
    
    # Check if the column contains numeric values (integer or float)
    if pd.api.types.is_numeric_dtype(data[column_name]):
        count_percentage['Percentage'] = (count_percentage['Count'] / count_percentage['Count'].sum()) * 100
    else:
        # For non-numeric values, calculate percentage using total non-null values
        total_non_null = data[column_name].count()
        count_percentage['Percentage'] = (count_percentage['Count'] / total_non_null) * 100

    # Sort the DataFrame by the column values
    count_percentage.sort_values(by=column_name, inplace=True)

    return count_percentage


calculate_count_and_percentage(cda, 'gender')


def calculate_churn_ratio(cda, column_name, target_column='Churn'):
    # Grouping the data by the specified column and 'Churn' columns and calculating the counts
    churn_counts = cda.groupby([column_name, target_column]).size().unstack()

    # Calculating the total counts for each group in the specified column (Yes and No)
    total_counts = cda[column_name].value_counts()

    # Calculate the churn ratio (percentage) for 'Yes' and 'No' in the specified column
    churn_ratio = (churn_counts.T / total_counts) * 100

    # Transpose churn_ratio for better visual
    churn_ratio = churn_ratio.T

    return churn_ratio


calculate_churn_ratio(cda, 'gender')


plot_show(cda, 'SeniorCitizen', 'Churn')


calculate_count_and_percentage(cda, 'SeniorCitizen')


calculate_churn_ratio(cda, 'SeniorCitizen')


plot_show(cda, 'InternetService', 'Churn')


calculate_count_and_percentage(cda, 'InternetService')


calculate_churn_ratio(cda, 'InternetService')


plot_show(cda, 'tenure_group', 'Churn', figsize=(10, 5))


calculate_count_and_percentage(cda, 'tenure_group')


calculate_churn_ratio(cda, 'tenure_group')


plot_show(cda, 'PaymentMethod', 'Churn', figsize=(10, 5))


calculate_count_and_percentage(cda, 'PaymentMethod')


calculate_churn_ratio(cda, 'PaymentMethod')


plot_show(cda, 'Dependents', 'Churn')


calculate_count_and_percentage(cda, 'Dependents')


calculate_churn_ratio(cda, 'Dependents')


plot_show(cda, 'Partner', 'Churn')


calculate_count_and_percentage(cda, 'Partner')


calculate_churn_ratio(cda, 'Partner')


plot_show(cda, 'OnlineBackup', 'Churn')


calculate_count_and_percentage(cda, 'OnlineBackup')


calculate_churn_ratio(cda, 'OnlineBackup')


plot_show(cda, 'Contract', 'Churn')


calculate_count_and_percentage(cda, 'Contract')


calculate_churn_ratio(cda, 'Contract')


plot_show(cda, 'PhoneService', 'Churn')


calculate_count_and_percentage(cda, 'PhoneService')


calculate_churn_ratio(cda, 'PhoneService')


plot_show(cda, 'MultipleLines', 'Churn')


calculate_count_and_percentage(cda, 'MultipleLines')


calculate_churn_ratio(cda, 'MultipleLines')


plot_show(cda, 'OnlineSecurity', 'Churn')


calculate_count_and_percentage(cda, 'OnlineSecurity')


calculate_churn_ratio(cda, 'OnlineSecurity')


plot_show(cda, 'TechSupport', 'Churn')


calculate_count_and_percentage(cda, 'TechSupport')


calculate_churn_ratio(cda, 'TechSupport')


plot_show(cda, 'DeviceProtection', 'Churn')


calculate_count_and_percentage(cda, 'DeviceProtection')


calculate_churn_ratio(cda, 'DeviceProtection')


plot_show(cda, 'StreamingTV', 'Churn')


calculate_count_and_percentage(cda, 'StreamingTV')


calculate_churn_ratio(cda, 'StreamingTV')


plot_show(cda, 'StreamingMovies', 'Churn')


calculate_count_and_percentage(cda, 'StreamingMovies')


calculate_churn_ratio(cda, 'StreamingMovies')


plot_show(cda, 'PaperlessBilling', 'Churn')


calculate_count_and_percentage(cda, 'PaperlessBilling')


calculate_churn_ratio(cda, 'PaperlessBilling')


plt.figure(figsize=(10, 5))
plt.scatter(cda['MonthlyCharges'], cda['TotalCharges'], c= '#6F61C0', alpha=0.2)
plt.title('Relationship between Monthly Charges and Total Charges')
plt.xlabel('Monthly Charges')
plt.ylabel('Total Charges')
plt.show()


cmc = sns.FacetGrid(cda, hue="Churn", height=4, aspect=2)
cmc.map(sns.kdeplot, "MonthlyCharges", shade=True)
cmc.add_legend(title="Churn")
cmc.set_xlabels("Monthly Charges")
cmc.fig.suptitle("Churn by Monthly charges ", fontsize=16)
plt.show()


ctc = sns.FacetGrid(cda, hue="Churn", height=4, aspect=2)
ctc.map(sns.kdeplot, "TotalCharges", shade=True)
ctc.add_legend(title="Churn")
ctc.set_xlabels("Total Charges")
ctc.fig.suptitle("Churn by Total charges ", fontsize=16)
plt.show()


cda['Churn'] = cda['Churn'].map({'Yes':'1', 'No':'0'})
cda['Churn'] = pd.to_numeric(cda['Churn'], errors='coerce')


cda.head()


target0=cda.loc[cda["Churn"]==0]
target1=cda.loc[cda["Churn"]==1]


def plot(cda, col, hue=None):

    sns.set_style('whitegrid')
    sns.set_context('talk')
    plt.rcParams["axes.labelsize"] = 15
    plt.rcParams['axes.titlesize'] = 15
    plt.rcParams['axes.titlepad'] = 20

    plt.figure(figsize=(10, 4), )
    ax = sns.countplot(data=cda, x=col, hue=hue, palette='Blues', order=cda[col].value_counts().index)
    plt.xticks(rotation=45)
    plt.yscale('log')
    ax.yaxis.grid(False)
    plt.show()


plot(target1, col='Partner', hue='gender')


plot(target1, col='Dependents', hue='gender')


plot(target1, col='TechSupport', hue='gender')


plot(target1, col='Partner', hue='InternetService')


plot(target1, col='PaymentMethod', hue='Dependents')


plot(target1, col='Contract', hue='InternetService')


plot(target1, col='TechSupport', hue='PaymentMethod')


plot(target1, col='PaperlessBilling', hue='Contract')


plot(target1, col='InternetService', hue='TechSupport')


plot(target1, col='PaperlessBilling', hue='InternetService')


plot(target1, col='PhoneService', hue='gender')


plot(target1, col='InternetService', hue='gender')


plot(target1, col='TechSupport', hue='gender')


plot(target1, col='Contract', hue='tenure_group')


plot(target1, col='tenure_group', hue='gender')


cda.drop(['tenure'],  axis=1, inplace=True)


categorical_columns = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'tenure_group']


dummy_data = pd.get_dummies(cda, columns=categorical_columns, drop_first=True)


dummy_data.head()


plt.figure(figsize=(12,5))
dummy_data.corr()['Churn'].sort_values(ascending=False).plot(kind='bar', grid=False)

<Axes: >


dummy_data.to_csv('cleandata.csv')

	SeniorCitizen	tenure	MonthlyCharges
count	7043.000000	7043.000000	7043.000000
mean	0.162147	32.371149	64.761692
std	0.368612	24.559481	30.090047
min	0.000000	0.000000	18.250000
25%	0.000000	9.000000	35.500000
50%	0.000000	29.000000	70.350000
75%	0.000000	55.000000	89.850000
max	1.000000	72.000000	118.750000

	SeniorCitizen	tenure	MonthlyCharges	TotalCharges
count	7043.000000	7043.000000	7043.000000	7032.000000
mean	0.162147	32.371149	64.761692	2283.300441
std	0.368612	24.559481	30.090047	2266.771362
min	0.000000	0.000000	18.250000	18.800000
25%	0.000000	9.000000	35.500000	401.450000
50%	0.000000	29.000000	70.350000	1397.475000
75%	0.000000	55.000000	89.850000	3794.737500
max	1.000000	72.000000	118.750000	8684.800000

	customerID	gender	Partner	Dependents	PhoneService	MultipleLines	InternetService	OnlineSecurity	...	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
488	4472-LVYGI	Female	Yes	Yes	No	No phone service	DSL	Yes	...	Yes	Yes	Yes	No	Two year	Yes	Bank transfer (automatic)	52.55	NaN	No
753	3115-CZMZD	Male	No	Yes	Yes	No	No	No internet service	...	No internet service	No internet service	No internet service	No internet service	Two year	No	Mailed check	20.25	NaN	No
936	5709-LVOEQ	Female	Yes	Yes	Yes	No	DSL	Yes	...	Yes	No	Yes	Yes	Two year	No	Mailed check	80.85	NaN	No
1082	4367-NUYAO	Male	Yes	Yes	Yes	Yes	No	No internet service	...	No internet service	No internet service	No internet service	No internet service	Two year	No	Mailed check	25.75	NaN	No
1340	1371-DWPAZ	Female	Yes	Yes	No	No phone service	DSL	Yes	...	Yes	Yes	Yes	No	Two year	No	Credit card (automatic)	56.05	NaN	No
3331	7644-OMVMY	Male	Yes	Yes	Yes	No	No	No internet service	...	No internet service	No internet service	No internet service	No internet service	Two year	No	Mailed check	19.85	NaN	No
3826	3213-VVOLG	Male	Yes	Yes	Yes	Yes	No	No internet service	...	No internet service	No internet service	No internet service	No internet service	Two year	No	Mailed check	25.35	NaN	No
4380	2520-SGTTA	Female	Yes	Yes	Yes	No	No	No internet service	...	No internet service	No internet service	No internet service	No internet service	Two year	No	Mailed check	20.00	NaN	No
5218	2923-ARZLG	Male	Yes	Yes	Yes	No	No	No internet service	...	No internet service	No internet service	No internet service	No internet service	One year	Yes	Mailed check	19.70	NaN	No
6670	4075-WKNIU	Female	Yes	Yes	Yes	Yes	DSL	No	...	Yes	Yes	Yes	No	Two year	No	Mailed check	73.35	NaN	No
6754	2775-SEFEE	Male	No	Yes	Yes	Yes	DSL	Yes	...	No	Yes	No	No	Two year	Yes	Bank transfer (automatic)	61.90	NaN	No

	SeniorCitizen	tenure	MonthlyCharges	TotalCharges
count	7043.000000	7043.000000	7043.000000	7043.000000
mean	0.162147	32.371149	64.761692	2281.116754
std	0.368612	24.559481	30.090047	2265.747871
min	0.000000	0.000000	18.250000	18.800000
25%	0.000000	9.000000	35.500000	402.225000
50%	0.000000	29.000000	70.350000	1395.050000
75%	0.000000	55.000000	89.850000	3786.600000
max	1.000000	72.000000	118.750000	8684.800000

	gender	Count	Percentage
1	Female	3488	49.52435
0	Male	3555	50.47565

Import the necessary libraries¶

Import dataset¶

Look into the dataset¶

Copy original data to clean the data¶

Data Cleaning¶

Convert datatype of TotalCharges¶

Checking missing values¶

Handling missing value¶

Look into duplicate values¶

Grouping tenure in bins of 12 months¶

Remove columns that does not not required for processing¶

Churn ratio¶

Univariate Analysis¶

> 1. What is the distribution of churn among different genders? ¶

Convert Churn value to numeric form: 'Yes' to 1 and 'No' to 0¶

Convert all the categorical variables into dummy variables, using get_dummies()¶

Bivariate Analysis¶

Delete tenure column as it is not required further processing¶

convert data to csv¶

	customerID	gender	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	...	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
0	7590-VHVEG	Female	Yes	No	1	No	No phone service	DSL	No	...	No	No	No	No	Month-to-month	Yes	Electronic check	29.85	29.85	No
1	5575-GNVDE	Male	No	No	34	Yes	No	DSL	Yes	...	Yes	No	No	No	One year	No	Mailed check	56.95	1889.5	No
2	3668-QPYBK	Male	No	No	2	Yes	No	DSL	Yes	...	No	No	No	No	Month-to-month	Yes	Mailed check	53.85	108.15	Yes
3	7795-CFOCW	Male	No	No	45	No	No phone service	DSL	Yes	...	Yes	Yes	No	No	One year	No	Bank transfer (automatic)	42.30	1840.75	No
4	9237-HQITU	Female	No	No	2	Yes	No	Fiber optic	No	...	No	No	No	No	Month-to-month	Yes	Electronic check	70.70	151.65	Yes

Churn	No	Yes
DSL	81.040892	18.959108
Fiber optic	58.107235	41.892765
No	92.595020	7.404980

	tenure_group	Count	Percentage
0	0 - 11	2069	29.376686
2	12 - 23	1047	14.865824
3	24 - 35	876	12.437882
5	36 - 47	748	10.620474
4	48 - 59	820	11.642766
1	60 - 71	1121	15.916513
6	72 - 72	362	5.139855

Churn	No	Yes
0 - 11	51.715805	48.284195
12 - 23	70.487106	29.512894
24 - 35	77.968037	22.031963
36 - 47	80.481283	19.518717
48 - 59	85.000000	15.000000
60 - 71	91.703836	8.296164
72 - 72	98.342541	1.657459

	PaymentMethod	Count	Percentage
2	Bank transfer (automatic)	1544	21.922476
3	Credit card (automatic)	1522	21.610109
0	Electronic check	2365	33.579441
1	Mailed check	1612	22.887974

Churn	No	Yes
Bank transfer (automatic)	83.290155	16.709845
Credit card (automatic)	84.756899	15.243101
Electronic check	54.714588	45.285412
Mailed check	80.893300	19.106700

	OnlineBackup	Count	Percentage
0	No	3088	43.844952
2	No internet service	1526	21.666903
1	Yes	2429	34.488144

Churn	No	Yes
No	60.071244	39.928756
No internet service	92.595020	7.404980
Yes	78.468506	21.531494

	Contract	Count	Percentage
0	Month-to-month	3875	55.019168
2	One year	1473	20.914383
1	Two year	1695	24.066449

Churn	No	Yes
Month-to-month	57.290323	42.709677
One year	88.730482	11.269518
Two year	97.168142	2.831858

Churn	No	Yes
Female	73.079128	26.920872
Male	73.839662	26.160338

	SeniorCitizen	Count	Percentage
0	0	5901	83.785319
1	1	1142	16.214681

Churn	No	Yes
SeniorCitizen
0	76.393832	23.606168
1	58.318739	41.681261

	Dependents	Count	Percentage
0	No	4933	70.041176
1	Yes	2110	29.958824

	MultipleLines	Count	Percentage
0	No	3390	48.132898
2	No phone service	682	9.683374
1	Yes	2971	42.183729

Churn	No	Yes
No	74.955752	25.044248
No phone service	75.073314	24.926686
Yes	71.390104	28.609896

	OnlineSecurity	Count	Percentage
0	No	3498	49.666335
2	No internet service	1526	21.666903
1	Yes	2019	28.666761

Churn	No	Yes
No	58.233276	41.766724
No internet service	92.595020	7.404980
Yes	85.388806	14.611194

	TechSupport	Count	Percentage
0	No	3473	49.311373
2	No internet service	1526	21.666903
1	Yes	2044	29.021724

Churn	No	Yes
No	58.364526	41.635474
No internet service	92.595020	7.404980
Yes	84.833659	15.166341

	DeviceProtection	Count	Percentage
0	No	3095	43.944342
2	No internet service	1526	21.666903
1	Yes	2422	34.388755

Churn	No	Yes
No	60.872375	39.127625
No internet service	92.595020	7.404980
Yes	77.497936	22.502064

	StreamingTV	Count	Percentage
0	No	2810	39.897771
2	No internet service	1526	21.666903
1	Yes	2707	38.435326

Churn	No	Yes
No	66.476868	33.523132
No internet service	92.595020	7.404980
Yes	69.929812	30.070188

	StreamingMovies	Count	Percentage
0	No	2785	39.542808
2	No internet service	1526	21.666903
1	Yes	2732	38.790288

Churn	No	Yes
No	66.319569	33.680431
No internet service	92.595020	7.404980
Yes	70.058565	29.941435