Churn Prediction: Model Comparison and Optimization¶
Introduction¶
The Telco customer churn data contains information about a fictional telco company that provides home phone and Internet services to 7043 customers in California, USA. It indicates which customers have left, stayed, or signed up for the service. Several important demographic attributes are included for each customer, as well as a Satisfaction Score, Churn Score, and Customer Lifetime Value (CLTV) index.
The analysis of the dataset will be laid out in the following steps:
- Understanding the problem
- Data Structure
- Data Cleaning
- Data Transformation
- Exploratory Data Analysis
- Prediction Model
- Business Actions Recommendations
1. Understanding the problem¶
Customer churn records provide the status of customers in relation to a particular business and its products, along with customer information that can help us understand customer profiles and form hypotheses about the reasons behind their choices. The aim of churn analysis is to quantify these attributes and behaviours in a way that allows the company to adjust its current or planned products.
Background and definitions¶
A telecommunications company provides services such as telephony and data communications access, including internet access, to both residential and business clients. There are many private telecom companies in the USA, where this dataset is sourced from.
Customer churn (or customer attrition) refers to the loss of customers or subscribers for any reason. Businesses measure and track churn as a percentage of lost customers compared to total number of customers over a given time period. This metric is usually tracked monthly and reported at the end of the month.
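A common way to express this (a standard formulation, not taken from this dataset's documentation) is:
$$\text{Churn Rate} = \frac{\text{customers lost during the period}}{\text{customers at the start of the period}} \times 100\%$$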
The problem can be phrased as "Increasing client retention through understanding the reasons behind churn". The context of the problem is its business and population landscape, meaning the needs of the clients and the solutions offered by the specific company and its competitors.
In practice, customer churn prediction uses statistical analysis and machine learning techniques to predict the probability of a customer leaving the company (churn). As part of it, we aim at determining how client and product features interact and what their influence is on churn outcomes.
Why is Churn important?¶
With rising customer acquisition costs and high market competition, avoiding the loss of customers is vital to a business's profitability. Without predictable sources of revenue, it’s very difficult to sustain a business over the long term. For a company to expand its clientele, its growth rate (measured by the number of new customers) must exceed its churn rate (measured by the number of leaving customers).
Customer acquisition cost is the total costs of sales and marketing efforts that are needed to acquire a customer. It is one of the most defining factors in whether a company has a viable business model that can yield profits by keeping acquisition costs low as it scales. In general, it’s less costly to improve existing customer satisfaction to retain them, than acquiring new ones.
Preventing Churn¶
For most types of churn, there are strategies one can put in place to identify at-risk customers and prevent them from churning by addressing their concerns. By identifying which customers are at risk of leaving, companies can take action such as optimizing inventory levels more effectively to avoid customer dissatisfaction, run new campaigns to engage them, or provide special offers and pricing.
To do so, companies need to understand the underlying issues that contribute to churn, predict how churn will impact the business, and find ways to combat it. Customer churn analysis – the process of analyzing data to identify trends, patterns, and correlations – is important to determine what causes users and customers to keep using a service or depart.
Approach¶
As this is a sample dataset, there are no guidelines or information from company stakeholders regarding the problem to be solved. However, by going through the description of the dataset we can make some guesses about the possible answers stakeholders would expect:
- Demographic characteristics of customers leaving, such as age, children etc. This could establish the audience that ads and promotions should focus on.
- Geographical differentiation, to establish possible competition affecting churn.
- Does increasing the sense of belonging by being referred to or referring someone else affect retention scores?
- Type of service the customers were using - could it be that specific products are less developed or competitive?
- Fees charged and churn rates - should the company look into ways of reducing costs?
2. Data Structure¶
Let's start by loading the libraries we will be using:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import chi2_contingency
import scipy.stats as stats
!pip install drawarrow
# for scaling numerical features
from sklearn.preprocessing import StandardScaler
# for hot-encoding
from sklearn.preprocessing import OneHotEncoder
# prediction model validation
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
Next, let's look at the dataset itself:
df_raw = pd.read_csv("/kaggle/input/telco-customer-churn-11-1-3/telco.csv")
df_raw.head()
| | Customer ID | Gender | Age | Under 30 | Senior Citizen | Married | Dependents | Number of Dependents | Country | State | ... | Total Extra Data Charges | Total Long Distance Charges | Total Revenue | Satisfaction Score | Customer Status | Churn Label | Churn Score | CLTV | Churn Category | Churn Reason |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8779-QRDMV | Male | 78 | No | Yes | No | No | 0 | United States | California | ... | 20 | 0.00 | 59.65 | 3 | Churned | Yes | 91 | 5433 | Competitor | Competitor offered more data |
| 1 | 7495-OOKFY | Female | 74 | No | Yes | Yes | Yes | 1 | United States | California | ... | 0 | 390.80 | 1024.10 | 3 | Churned | Yes | 69 | 5302 | Competitor | Competitor made better offer |
| 2 | 1658-BYGOY | Male | 71 | No | Yes | No | Yes | 3 | United States | California | ... | 0 | 203.94 | 1910.88 | 2 | Churned | Yes | 81 | 3179 | Competitor | Competitor made better offer |
| 3 | 4598-XLKNJ | Female | 78 | No | Yes | Yes | Yes | 1 | United States | California | ... | 0 | 494.00 | 2995.07 | 2 | Churned | Yes | 88 | 5337 | Dissatisfaction | Limited range of services |
| 4 | 4846-WHAFZ | Female | 80 | No | Yes | Yes | Yes | 1 | United States | California | ... | 0 | 234.21 | 3102.36 | 2 | Churned | Yes | 67 | 2793 | Price | Extra data charges |
5 rows × 50 columns
df_raw.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 50 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer ID 7043 non-null object 1 Gender 7043 non-null object 2 Age 7043 non-null int64 3 Under 30 7043 non-null object 4 Senior Citizen 7043 non-null object 5 Married 7043 non-null object 6 Dependents 7043 non-null object 7 Number of Dependents 7043 non-null int64 8 Country 7043 non-null object 9 State 7043 non-null object 10 City 7043 non-null object 11 Zip Code 7043 non-null int64 12 Latitude 7043 non-null float64 13 Longitude 7043 non-null float64 14 Population 7043 non-null int64 15 Quarter 7043 non-null object 16 Referred a Friend 7043 non-null object 17 Number of Referrals 7043 non-null int64 18 Tenure in Months 7043 non-null int64 19 Offer 3166 non-null object 20 Phone Service 7043 non-null object 21 Avg Monthly Long Distance Charges 7043 non-null float64 22 Multiple Lines 7043 non-null object 23 Internet Service 7043 non-null object 24 Internet Type 5517 non-null object 25 Avg Monthly GB Download 7043 non-null int64 26 Online Security 7043 non-null object 27 Online Backup 7043 non-null object 28 Device Protection Plan 7043 non-null object 29 Premium Tech Support 7043 non-null object 30 Streaming TV 7043 non-null object 31 Streaming Movies 7043 non-null object 32 Streaming Music 7043 non-null object 33 Unlimited Data 7043 non-null object 34 Contract 7043 non-null object 35 Paperless Billing 7043 non-null object 36 Payment Method 7043 non-null object 37 Monthly Charge 7043 non-null float64 38 Total Charges 7043 non-null float64 39 Total Refunds 7043 non-null float64 40 Total Extra Data Charges 7043 non-null int64 41 Total Long Distance Charges 7043 non-null float64 42 Total Revenue 7043 non-null float64 43 Satisfaction Score 7043 non-null int64 44 Customer Status 7043 non-null object 45 Churn Label 7043 non-null object 46 Churn Score 7043 non-null int64 47 CLTV 7043 non-null int64 48 Churn Category 1869 non-null object 49 Churn Reason 1869 non-null object dtypes: float64(8), int64(11), object(31) memory usage: 2.7+ MB
df_raw.describe()
| | Age | Number of Dependents | Zip Code | Latitude | Longitude | Population | Number of Referrals | Tenure in Months | Avg Monthly Long Distance Charges | Avg Monthly GB Download | Monthly Charge | Total Charges | Total Refunds | Total Extra Data Charges | Total Long Distance Charges | Total Revenue | Satisfaction Score | Churn Score | CLTV |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 |
| mean | 46.509726 | 0.468692 | 93486.070567 | 36.197455 | -119.756684 | 22139.603294 | 1.951867 | 32.386767 | 22.958954 | 20.515405 | 64.761692 | 2280.381264 | 1.962182 | 6.860713 | 749.099262 | 3034.379056 | 3.244924 | 58.505040 | 4400.295755 |
| std | 16.750352 | 0.962802 | 1856.767505 | 2.468929 | 2.154425 | 21152.392837 | 3.001199 | 24.542061 | 15.448113 | 20.418940 | 30.090047 | 2266.220462 | 7.902614 | 25.104978 | 846.660055 | 2865.204542 | 1.201657 | 21.170031 | 1183.057152 |
| min | 19.000000 | 0.000000 | 90001.000000 | 32.555828 | -124.301372 | 11.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 18.250000 | 18.800000 | 0.000000 | 0.000000 | 0.000000 | 21.360000 | 1.000000 | 5.000000 | 2003.000000 |
| 25% | 32.000000 | 0.000000 | 92101.000000 | 33.990646 | -121.788090 | 2344.000000 | 0.000000 | 9.000000 | 9.210000 | 3.000000 | 35.500000 | 400.150000 | 0.000000 | 0.000000 | 70.545000 | 605.610000 | 3.000000 | 40.000000 | 3469.000000 |
| 50% | 46.000000 | 0.000000 | 93518.000000 | 36.205465 | -119.595293 | 17554.000000 | 0.000000 | 29.000000 | 22.890000 | 17.000000 | 70.350000 | 1394.550000 | 0.000000 | 0.000000 | 401.440000 | 2108.640000 | 3.000000 | 61.000000 | 4527.000000 |
| 75% | 60.000000 | 0.000000 | 95329.000000 | 38.161321 | -117.969795 | 36125.000000 | 3.000000 | 55.000000 | 36.395000 | 27.000000 | 89.850000 | 3786.600000 | 0.000000 | 0.000000 | 1191.100000 | 4801.145000 | 4.000000 | 75.500000 | 5380.500000 |
| max | 80.000000 | 9.000000 | 96150.000000 | 41.962127 | -114.192901 | 105285.000000 | 11.000000 | 72.000000 | 49.990000 | 85.000000 | 118.750000 | 8684.800000 | 49.790000 | 150.000000 | 3564.720000 | 11979.340000 | 5.000000 | 96.000000 | 6500.000000 |
3. Data Cleaning¶
3.1. Checking for Missing Values¶
# Calculate missing values
missing_values = df_raw.isnull().sum()
# Create a DataFrame to check ratios per feature
missing_df = pd.DataFrame({
'Feature': missing_values.index,
'Missing Values': missing_values.values,
'% Missing': (missing_values.values / len(df_raw)) * 100
})
# Filter to show only columns with missing values
missing_df = missing_df[missing_df['Missing Values'] > 0]
# Display the missing values report
print(missing_df)
Feature Missing Values % Missing 19 Offer 3877 55.047565 24 Internet Type 1526 21.666903 48 Churn Category 5174 73.463013 49 Churn Reason 5174 73.463013
fig, ax = plt.subplots(figsize=(25, 5))
# drawing the plot
sns.heatmap(df_raw.isnull(), cbar=False)
plt.show()
The missing values will be treated as follows:
- Churn Category and Churn Reason: they are missing when Customer Status ('Churned', 'Stayed', 'Joined') is anything other than 'Churned', which makes sense, so this is not a data collection error (the quick check below confirms it). I will not impute these columns, because adding e.g. a 'Has_Churn_Info' flag would create perfect multicollinearity with Customer Status, since Churn Category and Churn Reason are only present when Customer Status = 'Churned'. This could introduce redundant information in distance-based models (such as KNN) and would likely distort our results. I will therefore keep these columns as they are.
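A quick cross-tabulation (a small sanity-check cell) confirms this pattern:
# 'Churn Category' should be missing exactly for customers whose status is not 'Churned'
print(pd.crosstab(df_raw['Customer Status'], df_raw['Churn Category'].isnull()))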
print(plt.rcParams["font.sans-serif"])
print(plt.rcParams["font.monospace"])
['Liberation Sans'] ['DejaVu Sans Mono', 'Bitstream Vera Sans Mono', 'Computer Modern Typewriter', 'Andale Mono', 'Nimbus Mono L', 'Courier New', 'Courier', 'Fixed', 'Terminal', 'monospace']
# Set Seaborn theme
sns.set_theme(
style="whitegrid",
palette="pastel",
rc={
'axes.grid': False,  # disable grid lines
'figure.figsize': (4,3),
'axes.facecolor': '#f5f5f5',
'figure.facecolor': '#f5f5f5'
},
font_scale=0.85 # Adjust overall scaling
)
# Set font
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = ["Liberation Sans"]
ax = sns.histplot(data=df_raw, x="Customer Status")
# Remove the grid and border
ax.grid(False)
sns.despine()
# Set title & axis labels
ax.set_title("Customer Status Distribution",
fontsize=11.5,
fontweight='bold')
ax.set_ylabel("Count", fontsize=10)
# Add labels to bars
for container in ax.containers:
ax.bar_label(container, fmt='%d')
plt.show()
Therefore, a total of 5174 clients are non-churners and 1869 are churners.
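For reference, this corresponds to an overall churn rate of roughly 26.5%, which we can confirm directly:
# Churn rate as a percentage of all customers
print((df_raw['Churn Label'].value_counts(normalize=True) * 100).round(1))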
df_raw['Churn Category'].unique()
array(['Competitor', 'Dissatisfaction', 'Price', 'Other', 'Attitude', nan],
dtype=object)
df_raw['Churn Reason'].unique()
array(['Competitor offered more data', 'Competitor made better offer',
'Limited range of services', 'Extra data charges',
'Competitor had better devices', "Don't know",
'Service dissatisfaction',
'Lack of affordable download/upload speed',
'Product dissatisfaction', 'Long distance charges',
'Poor expertise of online support', 'Attitude of support person',
'Network reliability', 'Competitor offered higher download speeds',
'Moved', 'Price too high', 'Attitude of service provider',
'Poor expertise of phone support', 'Deceased', nan,
'Lack of self-service on Website'], dtype=object)
- Internet Type: The missing values most likely correspond to 'Internet Service' = 'No'. Let's check:
ax = sns.histplot(data=df_raw, x="Internet Service")
# Remove the grid and border
sns.despine()
ax.set_title("Internet service offered",
fontsize=11.5,
fontweight='bold')
for i in ax.containers:
ax.bar_label(i,)
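The plot above shows how many customers have internet service at all; to confirm that the missing 'Internet Type' entries correspond to customers without internet service, we can check directly:
# Every row with a missing 'Internet Type' should have 'Internet Service' == 'No'
print(df_raw.loc[df_raw['Internet Type'].isnull(), 'Internet Service'].value_counts())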
So, for the same reasons given earlier for the missing data in the Churn Reason and Churn Category columns, we won't be imputing. The 'Internet Type' column will be used in the analysis with entries filtered for 'Internet Service' = 'Yes'. However, as missing values (NaN) can cause errors in later transformations, we will assign them a 'No Internet' value:
df_raw['Internet Type'] = df_raw['Internet Type'].fillna('No Internet')
The 'Offer' column refers to the last marketing offer that the customer accepted, if applicable. Since it is plausible that 55% of the clients might not have accepted a marketing offer, we will fill in the missing data as "No Offer" and will treat it as such.
df_raw['Offer'].unique()
array([nan, 'Offer E', 'Offer D', 'Offer C', 'Offer B', 'Offer A'],
dtype=object)
df_raw['Offer'] = df_raw['Offer'].fillna('No Offer')
df_raw['Internet Type'] = df_raw['Internet Type'].fillna('No Internet')
df_raw['Churn Category'] = df_raw['Churn Category'].fillna('not applicable')
df_raw['Churn Reason'] = df_raw['Churn Reason'].fillna('not applicable')
Let's check the dataset after the missing-value corrections:
fig, ax = plt.subplots(figsize=(25, 5))
# drawing the plot
sns.heatmap(df_raw.isnull(), cbar=False)
plt.show()
3.2. Checking for Duplicates¶
Checking for duplicate values is straightforward:
num_duplicates = df_raw.duplicated().sum()
print(f"Total duplicate rows: {num_duplicates}")
Total duplicate rows: 0
3.3. Checking for Inconsistencies¶
# Identify categorical columns
cat_cols = df_raw.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical Columns: {cat_cols}")
Categorical Columns: ['Customer ID', 'Gender', 'Under 30', 'Senior Citizen', 'Married', 'Dependents', 'Country', 'State', 'City', 'Quarter', 'Referred a Friend', 'Offer', 'Phone Service', 'Multiple Lines', 'Internet Service', 'Internet Type', 'Online Security', 'Online Backup', 'Device Protection Plan', 'Premium Tech Support', 'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing', 'Payment Method', 'Customer Status', 'Churn Label', 'Churn Category', 'Churn Reason']
# Print unique values for each categorical column
for col in cat_cols:
print(f"Unique values in {col}: {df_raw[col].unique()}\n")
Unique values in Customer ID: ['8779-QRDMV' '7495-OOKFY' '1658-BYGOY' ... '2234-XADUH' '4801-JZAZL' '3186-AJIEK'] Unique values in Gender: ['Male' 'Female'] Unique values in Under 30: ['No' 'Yes'] Unique values in Senior Citizen: ['Yes' 'No'] Unique values in Married: ['No' 'Yes'] Unique values in Dependents: ['No' 'Yes'] Unique values in Country: ['United States'] Unique values in State: ['California'] Unique values in City: ['Los Angeles' 'Inglewood' 'Whittier' ... 'Topaz' 'Jacumba' 'Holtville'] Unique values in Quarter: ['Q3'] Unique values in Referred a Friend: ['No' 'Yes'] Unique values in Offer: ['No Offer' 'Offer E' 'Offer D' 'Offer C' 'Offer B' 'Offer A'] Unique values in Phone Service: ['No' 'Yes'] Unique values in Multiple Lines: ['No' 'Yes'] Unique values in Internet Service: ['Yes' 'No'] Unique values in Internet Type: ['DSL' 'Fiber Optic' 'Cable' 'No Internet'] Unique values in Online Security: ['No' 'Yes'] Unique values in Online Backup: ['No' 'Yes'] Unique values in Device Protection Plan: ['Yes' 'No'] Unique values in Premium Tech Support: ['No' 'Yes'] Unique values in Streaming TV: ['No' 'Yes'] Unique values in Streaming Movies: ['Yes' 'No'] Unique values in Streaming Music: ['No' 'Yes'] Unique values in Unlimited Data: ['No' 'Yes'] Unique values in Contract: ['Month-to-Month' 'One Year' 'Two Year'] Unique values in Paperless Billing: ['Yes' 'No'] Unique values in Payment Method: ['Bank Withdrawal' 'Credit Card' 'Mailed Check'] Unique values in Customer Status: ['Churned' 'Stayed' 'Joined'] Unique values in Churn Label: ['Yes' 'No'] Unique values in Churn Category: ['Competitor' 'Dissatisfaction' 'Price' 'Other' 'Attitude' 'not applicable'] Unique values in Churn Reason: ['Competitor offered more data' 'Competitor made better offer' 'Limited range of services' 'Extra data charges' 'Competitor had better devices' "Don't know" 'Service dissatisfaction' 'Lack of affordable download/upload speed' 'Product dissatisfaction' 'Long distance charges' 'Poor expertise of online support' 'Attitude of support person' 'Network reliability' 'Competitor offered higher download speeds' 'Moved' 'Price too high' 'Attitude of service provider' 'Poor expertise of phone support' 'Deceased' 'not applicable' 'Lack of self-service on Website']
No inconsistencies found! However, columns 'Country', 'State' and 'Quarter' have only one unique value and therefore provide no useful information, so we can drop them:
df_raw = df_raw.drop(columns=['Country', 'State', 'Quarter'])
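Single-value columns can also be detected programmatically rather than by inspection; running this sketch after the drop above should now return an empty list:
# Columns with a single unique value carry no information for modeling
constant_cols = [col for col in df_raw.columns if df_raw[col].nunique(dropna=False) <= 1]
print(constant_cols)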
3.4. Fixing Data Types¶
Running info() on the dataset earlier showed that the following columns should have their types fixed:
| Column | Current Type | Correct Type |
|---|---|---|
| Under 30 | object | bool (True/False) |
| Senior Citizen | object | bool (True/False) |
| Married | object | bool (True/False) |
| Dependents | object | bool (True/False) |
| Zip Code | int64 | string |
| Referred a Friend | object | bool (True/False) |
| Contract | object | category |
| Payment Method | object | category |
# Convert Zip Code to string
df_raw['Zip Code'] = df_raw['Zip Code'].astype('string')
# Convert boolean-like categorical columns to boolean
bool_columns = ['Under 30', 'Senior Citizen', 'Married', 'Dependents', 'Referred a Friend']
for col in bool_columns:
df_raw[col] = df_raw[col].map({'Yes': True, 'No': False})
# Convert categorical columns to category type
category_columns = ['Contract', 'Payment Method']
for col in category_columns:
df_raw[col] = df_raw[col].astype('category')
# Confirm changes
print(df_raw.dtypes)
Customer ID object Gender object Age int64 Under 30 bool Senior Citizen bool Married bool Dependents bool Number of Dependents int64 City object Zip Code string[python] Latitude float64 Longitude float64 Population int64 Referred a Friend bool Number of Referrals int64 Tenure in Months int64 Offer object Phone Service object Avg Monthly Long Distance Charges float64 Multiple Lines object Internet Service object Internet Type object Avg Monthly GB Download int64 Online Security object Online Backup object Device Protection Plan object Premium Tech Support object Streaming TV object Streaming Movies object Streaming Music object Unlimited Data object Contract category Paperless Billing object Payment Method category Monthly Charge float64 Total Charges float64 Total Refunds float64 Total Extra Data Charges int64 Total Long Distance Charges float64 Total Revenue float64 Satisfaction Score int64 Customer Status object Churn Label object Churn Score int64 CLTV int64 Churn Category object Churn Reason object dtype: object
4. Data Transformation¶
- What needs to be figured out in order to address the problem?
- What measurements would help provide an understanding of the problem?
I'll start by looking at the relationship between the features with more detail:
4.1. Numerical features correlations¶
Let's start by looking at the correlation between numerical features:
plt.figure(figsize=(12,6))
df_numerical = df_raw.select_dtypes(include=['int64', 'float64'])
sns.heatmap(df_numerical.corr(), annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.show()
Observations on numerical features correlation¶
High Positive Correlations (Strong Relationships)
- Total Charges, Monthly Charges and Total Revenue are highly correlated. This is expected, since total revenue is the sum of charges.
- Total Long Distance Charges & Total Revenue have a strong correlation.
- Total Extra Data Charges & Total Charges are also highly correlated.
- Features like Total Revenue, Total Charges, and Monthly Charges might be redundant in a model due to multicollinearity. We should consider dropping one of them or using dimensionality reduction (PCA or VIF analysis).
Churn Score Shows Some Correlations
- Churn Score has a negative correlation with Satisfaction Score, meaning lower satisfaction tends to be associated with higher churn, which makes sense.
Weak Correlations (No Strong Relationship)
Age, Number of Dependents, Latitude/Longitude, and Population show little correlation with most features. We'll need to check how these features interact with categorical data (e.g., contract type, service plan), to decide on keeping them or not.
Variance Inflation Factor (VIF) Analysis¶
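As a reminder, the VIF of feature $i$ is derived from the $R_i^2$ obtained by regressing that feature on all the other numerical features:
$$VIF_i = \frac{1}{1 - R_i^2}$$
A VIF of 1 indicates no multicollinearity, while values above roughly 5-10 are commonly treated as problematic.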
# Compute correlation matrix
corr_matrix = df_numerical.corr().abs()
# Find features with correlation higher than 0.95
high_corr_vars = np.where((corr_matrix > 0.95) & (corr_matrix < 1))
# Get feature pairs
high_corr_pairs = [(df_numerical.columns[i], df_numerical.columns[j])
for i, j in zip(*high_corr_vars) if i != j]
# Remove duplicates (since correlation matrix is symmetric)
high_corr_pairs = list(set(tuple(sorted(pair)) for pair in high_corr_pairs))
# Print highly correlated features
print("Highly correlated features (R > 0.95):", high_corr_pairs)
Highly correlated features (R > 0.95): [('Total Charges', 'Total Revenue')]
Let's drop "Total Charges":
df_numerical = df_numerical.drop(columns=['Total Charges'])
# Remove any columns that are constant (to avoid division by zero)
df_numerical = df_numerical.loc[:, df_numerical.nunique() > 1]
# Compute VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = df_numerical.columns
vif_data["VIF"] = [variance_inflation_factor(df_numerical.values, i) for i in range(df_numerical.shape[1])]
# Sort by highest VIF
vif_data = vif_data.sort_values(by="VIF", ascending=False)
print(vif_data)
Feature VIF 3 Longitude 622.151977 2 Latitude 472.159198 13 Total Revenue 33.539593 9 Monthly Charge 22.978875 6 Tenure in Months 20.784133 16 CLTV 17.636865 12 Total Long Distance Charges 12.979178 15 Churn Score 11.945573 14 Satisfaction Score 11.862379 0 Age 11.713265 7 Avg Monthly Long Distance Charges 9.153916 8 Avg Monthly GB Download 3.266723 4 Population 2.673368 5 Number of Referrals 1.734954 1 Number of Dependents 1.446732 11 Total Extra Data Charges 1.102625 10 Total Refunds 1.073243
Comments on the Variance Inflation Factor (VIF) results:¶
- Longitude (622.15) and Latitude (472.15) are highly redundant and likely provide no useful variance. They are geographic coordinates, which might not be relevant unless used for regional analysis.
- Total Revenue (33.54) & Monthly Charges (22.97) are highly correlated with each other, as evident in the Correlation Heatmap, too. One of them can be dropped.
- Customer lifetime value is calculated by multiplying the average revenue per client by gross margin and dividing that number by their churn rate. As it is calculated from other financial metrics, it makes sense for its VIF to be high.
- A high VIF score for Churn Score & Satisfaction Score also makes sense, as lower satisfaction often leads to higher churn. We can drop one of the two.
Dropping high VIF numerical features¶
# Define high VIF features to drop
high_vif_features = [
'Longitude', 'Latitude', 'Total Revenue',
'Tenure in Months', 'CLTV', 'Total Long Distance Charges',
'Churn Score'
]
# Drop high VIF columns
df_numerical_reduced = df_numerical.drop(columns=high_vif_features)
# Recompute VIF after dropping
vif_data_reduced = pd.DataFrame()
vif_data_reduced["Feature"] = df_numerical_reduced.columns
vif_data_reduced["VIF"] = [variance_inflation_factor(df_numerical_reduced.values, i) for i in range(df_numerical_reduced.shape[1])]
print(vif_data_reduced.sort_values(by="VIF", ascending=False))
Feature VIF 6 Monthly Charge 8.053790 0 Age 7.456286 9 Satisfaction Score 5.709441 4 Avg Monthly Long Distance Charges 3.114525 5 Avg Monthly GB Download 2.910361 2 Population 2.001026 3 Number of Referrals 1.597317 1 Number of Dependents 1.431258 8 Total Extra Data Charges 1.095145 7 Total Refunds 1.066228
Age and Monthly Charge remain above the VIF = 5 threshold we consider high; however, we will keep them in the dataset as they might provide some interesting insights later on.
4.2. Categorical Features Correlation¶
For categorical features' correlation analysis, we will use Cramér’s V, a measure of association between two nominal variables. Cramér's V varies from 0 (corresponding to no association between the variables) to 1 (complete association) and can reach 1 only when each variable is completely determined by the other:
- 0 to 0.1: Little to no association
- 0.1 to 0.3: Weak association
- 0.3 to 0.5: Moderate association
- 0.5 to 1.0: Strong association - it reflects significant interdependence
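For an $r \times k$ contingency table with $n$ observations and chi-squared statistic $\chi^2$, Cramér's V is computed as:
$$V = \sqrt{\frac{\chi^2}{n\,(\min(r, k) - 1)}}$$
which is exactly what the helper function below implements.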
Let's start by selecting the categorical columns of the dataset:
# Select categorical columns (object, category, and boolean)
cat_cols = df_raw.select_dtypes(include=['object', 'category', 'bool', 'string[python]']).columns.drop(['Customer ID'])
# Function to compute Cramér's V
def cramers_v(x, y):
confusion_matrix = pd.crosstab(x, y)
chi2 = stats.chi2_contingency(confusion_matrix)[0]
n = confusion_matrix.sum().sum()
r, k = confusion_matrix.shape
return np.sqrt(chi2 / (n * (min(r, k) - 1)))
# Compute Cramér's V for all categorical pairs
cramers_results = pd.DataFrame(index=cat_cols, columns=cat_cols)
for col1 in cat_cols:
for col2 in cat_cols:
if col1 != col2:
cramers_results.loc[col1, col2] = cramers_v(df_raw[col1], df_raw[col2])
else:
cramers_results.loc[col1, col2] = 1.0 # Correlation with itself
# Convert to float for better sorting
cramers_results = cramers_results.astype(float)
plt.figure(figsize=(12, 8))
sns.heatmap(cramers_results, cmap="coolwarm", annot=False, linewidths=0.5)
plt.title("Cramér’s V Heatmap of Categorical Features")
plt.show()
One can easily notice that the City and Zip Code features stand out by their lack of association with the other features. We will examine their relationship with the Churn Label later on, but for the time being let's record the number of unique values in each:
print("The City column has",df_raw['City'].nunique(),"unique values")
print("The Zip Code column has",df_raw['Zip Code'].nunique(),"unique values")
The City column has 1106 unique values The Zip Code column has 1626 unique values
As this is a large matrix, making it difficult to extract insights, let's focus on feature pairs with Cramér’s V > 0.5, which indicates a strong association:
# Set correlation threshold
threshold = 0.5
# Extract upper triangle of correlation matrix (excluding self-correlations)
cramers_filtered = cramers_results.where(np.triu(np.ones(cramers_results.shape), k=1).astype(bool))
# Convert matrix to a long format DataFrame
cramers_long = cramers_filtered.stack().reset_index()
cramers_long.columns = ['Feature 1', 'Feature 2', 'Cramér’s V']
# Filter only strong correlations
strong_correlations = cramers_long[cramers_long["Cramér’s V"] > threshold].sort_values(by="Cramér’s V", ascending=False)
print(strong_correlations)
Feature 1 Feature 2 Cramér’s V 242 Internet Service Internet Type 1.000000 372 Customer Status Churn Label 1.000000 125 City Zip Code 1.000000 376 Churn Label Churn Reason 1.000000 375 Churn Label Churn Category 1.000000 377 Churn Category Churn Reason 0.993715 81 Married Referred a Friend 0.949701 333 Streaming Movies Streaming Music 0.848223 265 Internet Type Unlimited Data 0.755751 250 Internet Service Unlimited Data 0.755366 373 Customer Status Churn Category 0.707107 374 Customer Status Churn Reason 0.707107 165 Zip Code Churn Label 0.534890 323 Streaming TV Streaming Movies 0.532794 164 Zip Code Customer Status 0.507938 166 Zip Code Churn Category 0.503812 167 Zip Code Churn Reason 0.500979
At this point, a distinction should be made between the features that will be used in the predictive model and those used only in the explanatory analysis. The reasons are:
- Keeping features that are effectively determined by the outcome will lead to data leakage. Put simply, when scoring new customers, columns such as Customer Status, Churn Reason, Churn Category and Churn Label will not be available.
- However, these can be used separately for analysis after churn prediction is completed, to explain customer churn patterns.
Out of the features with perfect correlation, I will keep those that provide more detail:
- Internet Type vs Internet Service: keep 'Internet Type', as it is more detailed, and drop 'Internet Service'
- Customer Status vs Churn Label: both will be removed (data leakage)
- City vs Zip Code: we'll keep both in until we examine their relationship to churn later on
- Churn Label vs Churn Reason: both will be removed (data leakage)
- Churn Label vs Churn Category: both will be removed (data leakage)
- Churn Category vs Churn Reason: both will be removed (data leakage)
- Married vs Referred a Friend: Referrals don't depend on marital status, so I'll keep both for the time being.
I will keep the remaining features with high correlation and will re-evaluate them during modeling.
df_categorical = df_raw[cat_cols].drop(['Internet Service', 'Churn Label', 'Customer Status', 'Churn Reason', 'Churn Category'], axis=1)
4.3. Categorical vs Numerical Features Correlation¶
To test whether a categorical feature significantly impacts a numerical feature, we will use an ANOVA (Analysis of Variance) test. The goal is to determine whether the means of a numerical variable differ significantly across the different categories of a categorical variable.
ANOVA tests whether the means of the numerical variable are the same across all groups defined by the categorical variable, e.g. whether the mean 'Avg Monthly GB Download' is the same for 'male' and 'female' customers (this is the null hypothesis, H₀). If the test yields a low p-value (i.e. < 0.05), we reject H₀ and conclude that at least one group has a significantly different mean.
The outcome of this variance test can be:
- If ANOVA suggests a strong relationship, we might choose to encode the categorical variable using methods like one-hot encoding instead of treating it as arbitrary categories.
- If a categorical feature significantly impacts multiple numerical features, it may be a key predictor even if its direct relationship with the outcome ("churn") is weak.
- If a categorical feature does not significantly impact any numerical features, it might be dropped or treated differently.
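Before running the full loop, here is a minimal single-pair illustration of the test, using the example from the H₀ discussion above (the choice of pair is purely illustrative):
# One-way ANOVA for a single categorical/numerical pair (illustrative sketch)
groups = [group['Avg Monthly GB Download'].values for _, group in df_raw.groupby('Gender')]
f_stat, p_value = stats.f_oneway(*groups)
print(f"F-statistic: {f_stat:.3f}, p-value: {p_value:.4f}")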
# Select categorical and numerical columns
cat_cols = df_categorical.columns
num_cols = df_numerical_reduced.columns
# Dictionary to store ANOVA results
anova_results = {}
# Loop through categorical and numerical feature pairs
for cat in cat_cols:
for num in num_cols:
try:
# Group numerical data by categorical values
groups = [df_raw[num][df_raw[cat] == category] for category in df_raw[cat].unique()]
# Perform ANOVA
f_stat, p_value = stats.f_oneway(*groups)
# Store results
anova_results[(cat, num)] = {'F-Statistic': f_stat, 'P-Value': p_value}
except Exception:
continue # Skip if error occurs
/usr/local/lib/python3.10/dist-packages/scipy/stats/_axis_nan_policy.py:531: ConstantInputWarning: Each of the input arrays is constant; the F statistic is not defined or infinite res = hypotest_fun_out(*samples, **kwds)
# Convert results to DataFrame
anova_df = pd.DataFrame.from_dict(anova_results, orient='index')
anova_df = anova_df.sort_values(by="P-Value")
# Filter for significant results (p < 0.05)
anova_significant = anova_df[anova_df["P-Value"] < 0.05]
# Count how many numerical features each categorical feature influences
feature_impact = anova_significant.reset_index().groupby("level_0").size().reset_index()
feature_impact.columns = ["Categorical Feature", "Number of Influenced Numerical Features"]
# Display summarized insights
print("\n **Top Categorical Features Impacting Numerical Features** ")
print(feature_impact.sort_values(by="Number of Influenced Numerical Features", ascending=False))
**Top Categorical Features Impacting Numerical Features**
Categorical Feature Number of Influenced Numerical Features
6 Multiple Lines 9
7 Offer 9
11 Payment Method 8
15 Senior Citizen 8
13 Premium Tech Support 8
9 Online Security 8
4 Internet Type 8
20 Unlimited Data 7
18 Streaming TV 7
10 Paperless Billing 7
1 Contract 7
19 Under 30 7
3 Device Protection Plan 7
16 Streaming Movies 7
17 Streaming Music 7
5 Married 6
14 Referred a Friend 6
2 Dependents 6
0 City 5
8 Online Backup 5
12 Phone Service 4
21 Zip Code 4
Now, we need a criterion to determine which categorical variables to keep, transform, or drop: I'll examine whether to drop features that are in the lower 30% of the list:
- Married vs Referred a Friend showed a high association when running Cramér's V (0.95), and both are at the lower end of the ANOVA list. I'll drop 'Referred a Friend'.
- Once again I'll keep 'Zip Code' and 'City', as I want one feature that provides regional insights and also want to examine how they affect churn later on.
- I'll also drop 'Dependents', 'Online Backup' and 'Phone Service'.
df_categorical_reduced = df_categorical.drop(['Referred a Friend', 'Dependents', 'Online Backup', 'Phone Service'], axis=1)
# Check correlation among the top 10 categorical features
top_cat_features = ["Multiple Lines", "Offer", "Payment Method", "Premium Tech Support", "Internet Type",
"Online Security", "Senior Citizen", "Unlimited Data", "Under 30", "Streaming TV"]
# Compute Cramér’s V among top features
def cramers_v(x, y):
confusion_matrix = pd.crosstab(x, y)
chi2 = stats.chi2_contingency(confusion_matrix)[0]
n = confusion_matrix.sum().sum()
r, k = confusion_matrix.shape
return np.sqrt(chi2 / (n * (min(r, k) - 1)))
cramers_results = pd.DataFrame(index=top_cat_features, columns=top_cat_features)
for col1 in top_cat_features:
for col2 in top_cat_features:
if col1 != col2:
cramers_results.loc[col1, col2] = cramers_v(df_raw[col1], df_raw[col2])
else:
cramers_results.loc[col1, col2] = 1.0 # Correlation with itself
# Convert to float for sorting
cramers_results = cramers_results.astype(float)
# Display highly correlated categorical feature pairs
high_correlation = cramers_results.where(np.triu(np.ones(cramers_results.shape), k=1).astype(bool))
high_correlation = high_correlation.stack().reset_index()
high_correlation.columns = ['Feature 1', 'Feature 2', 'Cramér’s V']
high_correlation = high_correlation[high_correlation["Cramér’s V"] > 0.7] # Threshold for strong correlation
print("\n **Highly Correlated Categorical Features (Cramér’s V > 0.7)**")
print(high_correlation)
**Highly Correlated Categorical Features (Cramér’s V > 0.7)**
Feature 1 Feature 2 Cramér’s V
32 Internet Type Unlimited Data 0.755751
I'll drop 'Unlimited Data' as 'Internet Type' offers more detailed information.
df_categorical_reduced = df_categorical_reduced.drop(['Unlimited Data'], axis=1)
ANOVA has identified how categorical features influence numerical ones. The next step would be to transform the retained categorical features, e.g. using one-hot encoding. However, as this will increase dimensionality (i.e. it will add many more columns to the dataset), I will first examine the correlation between the categorical features and the churn label, in order to prioritize features that are actually relevant for churn prediction.
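To see why dimensionality is a concern, here is a quick sketch (using pandas' get_dummies purely as a preview, not as the final encoding) of how many columns a naive one-hot encoding of the currently retained categorical features would produce:
# Preview only: count the columns a naive one-hot encoding would create (City/Zip Code still included here)
cat_to_encode = df_categorical_reduced.select_dtypes(include=['object', 'category', 'string']).columns.tolist()
encoded_preview = pd.get_dummies(df_categorical_reduced, columns=cat_to_encode)
print(f"Columns before encoding: {df_categorical_reduced.shape[1]}")
print(f"Columns after naive one-hot encoding: {encoded_preview.shape[1]}")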
4.4. Correlation Between Categorical Features & Churn Status¶
If a feature influences multiple numerical features but has weak correlation with churn, it may add noise instead of useful information. The criterion used will be:
- Cramér’s V > 0.5 → Strong association
- Cramér’s V > 0.3 → Moderate association
- Cramér’s V < 0.1 → Weak association
If a feature influences many numerical features (from ANOVA) but has weak correlation with churn (Cramér’s V < 0.1), it is likely not useful for churn prediction.
If a feature has high Cramér’s V but was not highly ranked in ANOVA, it may still be important and should be kept for further analysis.
# Function to calculate Cramér's V
def cramers_v(x, y):
confusion_matrix = pd.crosstab(x, y)
chi2 = chi2_contingency(confusion_matrix)[0]
n = confusion_matrix.sum().sum()
r, k = confusion_matrix.shape
return np.sqrt(chi2 / (n * (min(r, k) - 1)))
# Compute Cramér’s V for each categorical feature vs. Churn Label
cramers_v_results = {}
for cat in df_categorical_reduced.columns:
cramers_v_results[cat] = cramers_v(df_categorical_reduced[cat], df_raw["Churn Label"])
# Convert to DataFrame and sort by importance
cramers_v_df = pd.DataFrame.from_dict(cramers_v_results, orient='index', columns=['Cramers V'])
cramers_v_df.sort_values(by="Cramers V", ascending=False, inplace=True)
print(cramers_v_df)
Cramers V Zip Code 0.534890 Contract 0.453001 City 0.443082 Internet Type 0.304687 Offer 0.261530 Payment Method 0.219014 Paperless Billing 0.191498 Online Security 0.170871 Premium Tech Support 0.164319 Senior Citizen 0.150453 Married 0.150126 Device Protection Plan 0.065821 Streaming TV 0.062897 Streaming Movies 0.061052 Under 30 0.054201 Streaming Music 0.045251 Multiple Lines 0.039777 Gender 0.008291
Deciding how to handle Cities and Zip Codes¶
As was calculated in paragraph 4.2, both 'City' and 'Zip Code' columns have > 1000 unique values. Taking into account that:
- One-hot encoding is impractical because there are too many unique values, leading to high dimensionality.
- Zip Codes are nominal categorical values (i.e. they are not ordinal, having no meaningful order). If left in the dataset as raw numbers (e.g., 90022, 90063, 90065), the analysis / predictions model may treat them as continuous, which is wrong.
I will proceed with keeping City in the dataset (1106 unique values; Zip Code has over 1600) and will replace it with its occurrence count (how many times each city appears in the dataset). This might provide the following insights:
- Cities with more customers (higher occurrence) may have lower churn due to stronger local brand presence (advertising etc.).
- High-frequency cities might provide better customer service.
- Cities with low customer counts but high churn might suggest unsuccessful market entry.
The above list also suggests that 'Device Protection Plan', 'Streaming TV', 'Streaming Movies', 'Under 30', 'Streaming Music', 'Multiple Lines' and 'Gender' have weak correlation with churn and should be removed; 'Zip Code' is dropped as well, since we are keeping 'City' as the regional feature:
df_categorical_reduced = df_categorical_reduced.drop(['Device Protection Plan', 'Zip Code', 'Streaming TV', 'Streaming Movies', 'Under 30', 'Streaming Music', 'Multiple Lines', 'Gender'], axis=1)
And let's proceed with the City occurrence count:
cities_column = df_categorical_reduced["City"]
city_counts = df_categorical_reduced["City"].value_counts()
city_counts
City
Los Angeles 293
San Diego 285
San Jose 112
Sacramento 108
San Francisco 104
...
Jacumba 2
South Lake Tahoe 2
Johannesburg 2
Eldridge 2
Holtville 2
Name: count, Length: 1106, dtype: int64
df_categorical_reduced["City_Frequency"] = df_categorical_reduced["City"].map(city_counts)
df_categorical_reduced.drop(columns=["City"], inplace=True)
df_categorical_reduced['Churn_Label_Encoded'] = df_raw['Churn Label'].map({'Yes': 1, 'No': 0})
df_categorical_reduced.head()
| | Senior Citizen | Married | Offer | Internet Type | Online Security | Premium Tech Support | Contract | Paperless Billing | Payment Method | City_Frequency | Churn_Label_Encoded |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | True | False | No Offer | DSL | No | No | Month-to-Month | Yes | Bank Withdrawal | 293 | 1 |
| 1 | True | True | Offer E | Fiber Optic | No | No | Month-to-Month | Yes | Credit Card | 293 | 1 |
| 2 | True | False | Offer D | Fiber Optic | No | No | Month-to-Month | Yes | Bank Withdrawal | 293 | 1 |
| 3 | True | True | Offer C | Fiber Optic | No | No | Month-to-Month | Yes | Bank Withdrawal | 25 | 1 |
| 4 | True | True | Offer C | Fiber Optic | No | No | Month-to-Month | Yes | Bank Withdrawal | 30 | 1 |
4.5. Correlation Between Numerical Features & Churn Status¶
Now that we've reduced the categorical features, the next step should be to examine the correlation of numerical features to the churn. The criteria to be used are:
- High correlation (|r| > 0.3): Keep these features.
- Moderate correlation (0.1 ≤ |r| < 0.3): May be useful, we will check for feature importance later.
- Low correlation (|r| < 0.1): We'll consider dropping these features.
Once this stage has been completed, I'll proceed with:
- Encoding of categorical features, where necessary
- Normalization or standardization of numerical variables, where necessary (a brief sketch follows below)
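As a preview of the standardization step, here is a minimal sketch using the StandardScaler imported earlier (the column selection is illustrative, and in practice the scaler would be fitted on the training split only, at the modeling stage, to avoid leakage):
# Sketch: standardize two continuous columns so they have mean 0 and standard deviation 1
continuous_cols = ['Age', 'Monthly Charge']  # illustrative subset of continuous features
scaler = StandardScaler()
scaled_preview = pd.DataFrame(scaler.fit_transform(df_numerical_reduced[continuous_cols]), columns=continuous_cols, index=df_numerical_reduced.index)
print(scaled_preview.describe().loc[['mean', 'std']].round(2))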
# Encode 'Churn Status' as numerical (e.g., Churned = 1, Stayed = 0)
df_numerical_reduced['Churn_Label_Encoded'] = df_categorical_reduced['Churn_Label_Encoded']
# Compute correlation matrix
corr_matrix = df_numerical_reduced.drop(['Churn_Label_Encoded'], axis=1).corrwith(df_numerical_reduced["Churn_Label_Encoded"]).to_frame(name="Correlation")
# Sort by absolute value
corr_matrix = corr_matrix.abs().sort_values(by="Correlation", ascending=False)
# Plot heatmap
plt.figure(figsize=(8, 5))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation of Numerical Features with Churn Label")
plt.show()
Based on the results of the correlation:
- 'Satisfaction Score' is highly correlated with the 'Churn Label'. It essentially mirrors the churn outcome itself (as the cross-tabulation after this list illustrates), so - instead of using it as a predictor - what we really need to investigate is what leads to a high satisfaction score and, consequently, low churn. I'll therefore drop it.
- 'Avg Monthly Long Distance Charges' and 'Total Extra Data Charges' have no meaningful correlation and should be dropped.
- 'Population', 'Avg Monthly GB Download' and 'Total Refunds' have low correlation; they could potentially be useful in predictions, but as we are already faced with a large set of features - especially when taking the categorical features into account - we'll drop them.
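To back up the first point, a quick cross-tabulation of 'Satisfaction Score' against the churn label (computed on df_raw, before these columns are removed from the modeling features):
# Share of churners within each satisfaction score level
print(pd.crosstab(df_raw['Satisfaction Score'], df_raw['Churn Label'], normalize='index').round(2))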
df_numerical_reduced = df_numerical_reduced.drop(['Satisfaction Score', 'Avg Monthly Long Distance Charges', 'Total Extra Data Charges', 'Population', 'Avg Monthly GB Download', 'Total Refunds'], axis=1)
df_numerical_reduced.head()
| | Age | Number of Dependents | Number of Referrals | Monthly Charge | Churn_Label_Encoded |
|---|---|---|---|---|---|
| 0 | 78 | 0 | 0 | 39.65 | 1 |
| 1 | 74 | 1 | 1 | 80.65 | 1 |
| 2 | 71 | 3 | 0 | 95.45 | 1 |
| 3 | 78 | 1 | 1 | 98.50 | 1 |
| 4 | 80 | 1 | 1 | 76.50 | 1 |
A rule of thumb is having at least 10x more samples than features to avoid overfitting. In our case, we have 7043 entries with 15 features, meaning it should not be a problem.
5. Exploratory Data Analysis¶
- Examining the distribution of each of the columns - flagging outliers where needed. This will include a bivariate analysis of the dependent variable with all of the independent variables.
- Include interaction plots for as many likely scenarios as possible.
The aim of the EDA is to provide this info as visuals - it is much easier to connect with the data in a chart than in a table.
5.1. Distribution of Categorical variables vs. Churn¶
To better understand churn proportions within each category, we will normalize counts into percentages:
# Function to plot stacked bar charts
def plot_stacked_bar(df, cat_cols, target):
fig, axes = plt.subplots(nrows=len(cat_cols), figsize=(6, 3.5 * len(cat_cols)))
for i, col in enumerate(cat_cols):
crosstab = pd.crosstab(df[col], df[target], normalize="index") * 100
crosstab.plot(kind="bar", stacked=True, ax=axes[i])
axes[i].set_title(f"Churn Proportion by {col}")
axes[i].set_ylabel("Percentage")
axes[i].set_xlabel(col, rotation = 0)
axes[i].tick_params(axis='x', rotation=0)
axes[i].legend(title="Churn", loc="upper right")
plt.tight_layout()
plt.show()
# Define target variable
target = "Churn_Label_Encoded"
categorical_cols = df_categorical_reduced.columns
excluded_cols = ["Zip Code", "Churn_Label_Encoded"]
categorical_cols_filtered = [col for col in categorical_cols if col not in excluded_cols]
# Run the function
plot_stacked_bar(df_categorical_reduced, categorical_cols_filtered, target)
5.2. Key Observations from Categorical Variables vs. Churn¶
Contract Type
- Customers on Month-to-Month contracts have the highest churn rate.
$\Rightarrow$ One-Year and Two-Year contracts show significantly lower churn, suggesting that longer commitments reduce customer attrition.
Online Security and Tech Support
- Customers without Online Security or Premium Tech Support churn more frequently.
$\Rightarrow$ This indicates that providing security and tech support services might help with customer retention.
Payment Method
- Bank Withdrawal and Mailed Checks have higher churn rates than Credit Cards.
$\Rightarrow$ This suggests that automatic payments via credit card might encourage customers to stay longer.
Internet Type
- Fiber Optic users show the highest churn rate compared to Cable and DSL.
$\Rightarrow$ No Internet customers have the lowest churn, suggesting that services other than internet are deemed more satisfactory.
Offer Effectiveness
- Offer E shows the highest churn rate, indicating it may not be effective in retaining customers.
$\Rightarrow$ Other offers have a more balanced churn distribution.
Senior Citizens
- Older customers (Senior Citizens = True) have a noticeably higher churn percentage.
$\Rightarrow$ This could indicate different service expectations or price sensitivity among senior customers.
5.3. Distribution of Numerical variables vs. Churn¶
For the numerical features, we will:
- Check the distributions of each numerical variable (Histograms, KDE plots).
- Detect outliers using Box Plots.
- Analyze Churn vs. Numerical Variables:
- Box Plots to compare distributions of numerical features across churned and non-churned customers.
- Violin Plots to observe density distributions.
- Scatter Plots for correlations between key numerical features and churn.
# Define numerical columns
numerical_cols = df_numerical_reduced.columns.tolist()
numerical_cols.remove("Churn_Label_Encoded") # Exclude target variable
# Plot distribution for numerical variables
fig, axes = plt.subplots(nrows=len(numerical_cols), figsize=(6, 3.5 * len(numerical_cols)))
for i, col in enumerate(numerical_cols):
sns.histplot(df_numerical_reduced[col], bins=30, kde=True, ax=axes[i])
axes[i].set_title(f"Distribution of {col}")
axes[i].set_xlabel(col)
axes[i].set_ylabel("Frequency")
plt.tight_layout()
plt.show()
Comments
- Age has a roughly uniform distribution, not a normal one.
- Monthly Charge has a broad distribution with multiple peaks, so not a clear normal distribution.
- Number of Referrals and Number of Dependents are right-skewed, and the order of the values matters (so they are treated as either continuous or ordinal).
- (Satisfaction Score, examined earlier before being dropped from the feature set, was not normally distributed either, and there too the order of the values matters.)
And now let's use jittered strip plots to compare each feature across churn classes and spot outliers:
# Define numerical columns (excluding churn)
numerical_cols = df_numerical_reduced.columns.tolist()
numerical_cols.remove("Churn_Label_Encoded")
fig, axes = plt.subplots(nrows=len(numerical_cols), figsize=(6, 3.5 * len(numerical_cols)))
for i, col in enumerate(numerical_cols):
sns.stripplot(x=df_numerical_reduced["Churn_Label_Encoded"], y=df_numerical_reduced[col], jitter=True, alpha=0.5, ax=axes[i])
axes[i].set_title(f"{col} vs. Churn")
axes[i].set_xlabel("Churn")
axes[i].set_ylabel(col)
plt.tight_layout()
plt.show()
Now that we've explored individual numerical features vs. churn, the next step is to analyze interactions between numerical features to uncover patterns that could affect customer churn.
# Define key numerical feature pairs for scatter plot
feature_pairs = [
("Monthly Charge", "Number of Dependents"),
("Number of Referrals", "Age")
]
# Create scatter plots
fig, axes = plt.subplots(nrows=len(feature_pairs), figsize=(12, 5 * len(feature_pairs)))
for i, (x_feature, y_feature) in enumerate(feature_pairs):
sns.scatterplot(x=df_numerical_reduced[x_feature],
y=df_numerical_reduced[y_feature],
hue=df_numerical_reduced["Churn_Label_Encoded"],
alpha=0.5, ax=axes[i])
axes[i].set_title(f"{y_feature} vs. {x_feature}")
axes[i].set_xlabel(x_feature)
axes[i].set_ylabel(y_feature)
plt.tight_layout()
plt.show()
# Selecting important numerical features for pairplot
selected_features = ["Monthly Charge", "Age", "Churn_Label_Encoded"]
# Pairplot
sns.pairplot(df_numerical_reduced[selected_features], hue="Churn_Label_Encoded", diag_kind="kde")
plt.show()
5.4. Key Observations from Numerical Feature Interactions¶
- Customers with dependents and referrals tend to have lower churn: I'll create a "Family_Status" feature to capture this more clearly.
- Customers with low Monthly Charges (<$30) show a distinct churn pattern and appear more loyal. This could indicate that they are on a promotional plan or lower-tier service. Low-paying customers may still churn when discounts end or if they are less engaged, so I'll create a new feature to distinguish them, too.
5.5. New Features and Evaluation¶
Let's start by creating the new features:
# New family status feature
df_numerical_reduced["Family_Status"] = ((df_numerical_reduced["Number of Dependents"] > 0) &
(df_numerical_reduced["Number of Referrals"] > 0)).astype(int)
# New low cost plan feature
df_numerical_reduced["Discounted_Plan"] = (df_numerical_reduced["Monthly Charge"] < 30).astype(int)
What we now want to see is:
- How these newly created features are distributed
- How they relate to churn
# Plot new features distribution
new_features_cols = ['Family_Status', 'Discounted_Plan']
fig, axes = plt.subplots(nrows = len(new_features_cols), figsize=(6,3.5 * len(new_features_cols)))
for i, col in enumerate(new_features_cols):
sns.countplot(df_numerical_reduced, x=col, ax=axes[i], palette="hls")
axes[i].set_title(f'{col} Distribution')
plt.tight_layout()
plt.show()
From the countplots, we see that most clients (~80-85%):
- Don't have both dependents and referrals
- Aren't on low monthly charge packages
# Plot new features related to churn
fig, axes = plt.subplots(nrows = len(new_features_cols), figsize=(6,3.5 * len(new_features_cols)))
for i, col in enumerate(new_features_cols):
sns.barplot(x=df_numerical_reduced[col], y=df_numerical_reduced["Churn_Label_Encoded"], ax=axes[i], palette="hls")
axes[i].set_title(f'{col} Churn Rate')
plt.tight_layout()
plt.show()
These barplots show the average churn rate for each group. On the y-axis, the values represent the mean of the Churn_Label_Encoded variable, which is 1 for churned customers and 0 for retained customers. Since it’s an average of a binary variable, this essentially represents the percentage of customers who churned in each category.
Family_Status
- Customers without dependents or referrals (Family_Status = 0) have a significantly higher churn rate (~30%).
- Customers with dependents and referrals (Family_Status = 1) churn much less (~5%).
Discounted_Plan
- Customers who are not on a discounted plan (Discounted_Plan = 0) churn at a rate of ~30%.
- Customers on a discounted plan (Discounted_Plan = 1) have a lower churn rate (~10%).
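To confirm these percentages numerically, we can compute the mean of the binary churn label per group directly (a quick check using the flags created above):
# Mean of the 0/1 churn label per group = churn rate of that group
print(df_numerical_reduced.groupby("Family_Status")["Churn_Label_Encoded"].mean())
print(df_numerical_reduced.groupby("Discounted_Plan")["Churn_Label_Encoded"].mean())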
# Correlation heatmap
corr_matrix = df_numerical_reduced.corr()
plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.show()
5.6. Model Choice for Analysis and Prediction¶
Based on the final set of features (both numerical and categorical) and their properties, we should choose a model that:
- Handles both numerical and categorical features well.
- Does not require heavy feature scaling (as numerical values differ in scale).
- Is robust to non-linear relationships and captures interactions effectively.
Choosing the right model for the analysis depends on factors such as dataset size, feature types, and model interpretability. We have a medium-sized dataset (~7,000 entries) with a combination of numerical and categorical features. The features can be summarized as follows:
| Feature Type | Feature Name | Data Type |
|---|---|---|
| Numerical | Age | Continuous |
| Numerical | Monthly Charge | Continuous |
| Numerical | Number of Dependents | Ordinal, Discrete |
| Numerical | Number of Referrals | Ordinal, Discrete |
| Categorical | Senior Citizen | Binary |
| Categorical | Married | Binary |
| Categorical | Online Security | Binary |
| Categorical | Premium Tech Support | Binary |
| Categorical | Paperless Billing | Binary |
| Categorical | Offer | Multi-Class |
| Categorical | Internet Type | Multi-Class |
| Categorical | Contract | Multi-Class |
| Categorical | Payment Method | Multi-Class |
| Categorical | City | Numerical (from Categorical) |
| Target | Churn_Label_Encoded | Binary |
Some common model options are:
- Logistic Regression: simple to interpret and easy to deploy, but a) it assumes a linear relationship between the features and the log-odds of the outcome, b) it requires standardization of numerical features, and c) it needs the categorical features to be encoded.
- K-Nearest Neighbors (KNN): a distance-based model, so it also requires standardization, and it tends to struggle with high-dimensional data such as many one-hot encoded categorical features.
- Neural Networks require larger datasets to work well.
- Random Forest and Extreme Gradient Boosting appear to be two sensible models for this case, as they work well with both categorical and numerical data without scaling. I'll opt for Random Forest.
5.6.1. Encode Categorical Variables¶
Since I will be using Random Forest, I'll apply One-Hot Encoding to the categorical features with fewer than 10 unique values:
categorical_cols
Index(['Senior Citizen', 'Married', 'Offer', 'Internet Type',
'Online Security', 'Premium Tech Support', 'Contract',
'Paperless Billing', 'Payment Method', 'City_Frequency',
'Churn_Label_Encoded'],
dtype='object')
# Select categorical columns (excluding City and Churn)
low_cardinality_cols = categorical_cols[:-2]
low_cardinality_cols
Index(['Senior Citizen', 'Married', 'Offer', 'Internet Type',
'Online Security', 'Premium Tech Support', 'Contract',
'Paperless Billing', 'Payment Method'],
dtype='object')
# Apply One-Hot Encoding
encoder = OneHotEncoder(drop="first", sparse_output=False)
X_categorical = encoder.fit_transform(df_categorical_reduced[low_cardinality_cols])
X_categorical = pd.DataFrame(X_categorical, columns=encoder.get_feature_names_out(low_cardinality_cols))
# Merge with dataset
df_categorical_reduced = df_categorical_reduced.drop(columns=low_cardinality_cols).join(X_categorical)
I don't need to standardize the 'City_Frequency' column, because Decision Trees split on thresholds rather than relying on scaled distances. Now let's merge numerical and categorical features:
# Drop target variable before merging
df_final = df_numerical_reduced.drop(columns=["Churn_Label_Encoded"]).join(df_categorical_reduced).drop(columns=["Churn_Label_Encoded"])
df_final.head()
| Age | Number of Dependents | Number of Referrals | Monthly Charge | Family_Status | Discounted_Plan | City_Frequency | Senior Citizen_True | Married_True | Offer_Offer A | ... | Internet Type_DSL | Internet Type_Fiber Optic | Internet Type_No Internet | Online Security_Yes | Premium Tech Support_Yes | Contract_One Year | Contract_Two Year | Paperless Billing_Yes | Payment Method_Credit Card | Payment Method_Mailed Check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 78 | 0 | 0 | 39.65 | 0 | 0 | 293 | 1.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 1 | 74 | 1 | 1 | 80.65 | 1 | 0 | 293 | 1.0 | 1.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 |
| 2 | 71 | 3 | 0 | 95.45 | 0 | 0 | 293 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 3 | 78 | 1 | 1 | 98.50 | 1 | 0 | 25 | 1.0 | 1.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 4 | 80 | 1 | 1 | 76.50 | 1 | 0 | 30 | 1.0 | 1.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
5 rows × 24 columns
print(df_final.isnull().sum())
Age                            0
Number of Dependents           0
Number of Referrals            0
Monthly Charge                 0
Family_Status                  0
Discounted_Plan                0
City_Frequency                 0
Senior Citizen_True            0
Married_True                   0
Offer_Offer A                  0
Offer_Offer B                  0
Offer_Offer C                  0
Offer_Offer D                  0
Offer_Offer E                  0
Internet Type_DSL              0
Internet Type_Fiber Optic      0
Internet Type_No Internet      0
Online Security_Yes            0
Premium Tech Support_Yes       0
Contract_One Year              0
Contract_Two Year              0
Paperless Billing_Yes          0
Payment Method_Credit Card     0
Payment Method_Mailed Check    0
dtype: int64
Let's make sure that all features are numeric, as Decision Trees can’t handle object types:
print(df_final.dtypes)
Age                              int64
Number of Dependents             int64
Number of Referrals              int64
Monthly Charge                 float64
Family_Status                    int64
Discounted_Plan                  int64
City_Frequency                   int64
Senior Citizen_True            float64
Married_True                   float64
Offer_Offer A                  float64
Offer_Offer B                  float64
Offer_Offer C                  float64
Offer_Offer D                  float64
Offer_Offer E                  float64
Internet Type_DSL              float64
Internet Type_Fiber Optic      float64
Internet Type_No Internet      float64
Online Security_Yes            float64
Premium Tech Support_Yes       float64
Contract_One Year              float64
Contract_Two Year              float64
Paperless Billing_Yes          float64
Payment Method_Credit Card     float64
Payment Method_Mailed Check    float64
dtype: object
Numerical features don't require any transformation (such as standardization), since Decision Trees split on thresholds and are not distance-based models.
# Define X and y
# Features (X)
X = df_final
# Define target variable (y)
y = df_numerical_reduced["Churn_Label_Encoded"]
# Using stratify=y ensures that the proportion of churn (1s) and non-churn (0s) remains the same in both the training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Training size:", X_train.shape)
print("Testing size:", X_test.shape)
Training size: (5634, 24)
Testing size: (1409, 24)
6.2. Decision Tree model training and evaluation¶
We'll start with a simple Decision Tree classifier using default hyperparameters:
# Initialize Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
# Train the model
dt_model.fit(X_train, y_train)
# Make predictions
y_pred = dt_model.predict(X_test)
# Evaluate performance
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Decision Tree Accuracy: 0.7792760823278921
Classification Report:
precision recall f1-score support
0 0.84 0.86 0.85 1035
1 0.59 0.55 0.57 374
accuracy 0.78 1409
macro avg 0.72 0.71 0.71 1409
weighted avg 0.77 0.78 0.78 1409
Comments
- The model identifies customers who stay reasonably well (precision 0.84), but it struggles to detect actual churn cases (precision 0.59).
- The lower recall for churn (0.55) means many churners are misclassified as non-churners. This is a business risk: predicting that a churning customer will stay means we lose the chance to take action (e.g., offer a discount).
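For completeness, we can also print the confusion matrix of this baseline tree, in the same way as for the models below (a minimal sketch reusing the predictions above and the confusion_matrix function used in the next section):
# Rows = actual class (0 = stayed, 1 = churned), columns = predicted class
print(confusion_matrix(y_test, y_pred))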
Let's also examine feature importance:
# Get feature importance
feature_importance = pd.Series(dt_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
# Plot feature importance
plt.figure(figsize=(10, 5))
feature_importance[:10].plot(kind="barh")
plt.xlabel("Contribution to the model’s Decision-Making")
plt.title("Top 10 Feature Importances - Decision Tree")
plt.gca().invert_yaxis()
# Add labels to bars
for index, value in enumerate(feature_importance[:10]):
plt.text(value + 0.0019, index, f"{value:.2f}", va='center')
plt.show()
Comments
- Monthly Charge is the most important factor: Customers paying more may churn more.
- Contract Type matters: Two-year contracts reduce churn compared to month-to-month
- City Frequency has some influence: Certain locations seem to have different churn rates.
6.3. Random Forest model training and evaluation¶
Since Decision Trees can be prone to overfitting, let's try Random Forest, an ensemble of trees that is generally more robust:
# Train a Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
# Predictions
y_pred_rf = rf_model.predict(X_test)
# Evaluate
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
Random Forest Accuracy: 0.8339247693399574
Classification Report:
precision recall f1-score support
0 0.86 0.92 0.89 1035
1 0.73 0.59 0.65 374
accuracy 0.83 1409
macro avg 0.80 0.76 0.77 1409
weighted avg 0.83 0.83 0.83 1409
Confusion Matrix:
[[954 81]
[153 221]]
The results show that the model reaches 83% accuracy. However, the confusion matrix (following scikit-learn's convention, where rows are actual classes and columns are predicted classes) shows 221 clients who are predicted to churn and actually do, but another 153 churners whom we fail to identify, i.e. a recall of 221/(221+153) = 0.59. Recall ($R$) is defined as the number of true positives ($T_p$) over the number of true positives plus the number of false negatives ($F_n$):
$$R = \frac{T_p}{T_p + F_n}$$
In the churn setting, failing to identify clients who are going to churn is more costly than predicting churn for clients who would have stayed (81 in this case), because, as explained in Understanding the problem, it's less costly to improve existing customer satisfaction and retain them than to acquire new customers.
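As a quick numerical check, recall can also be computed directly with scikit-learn (a minimal sketch on the Random Forest predictions above):
from sklearn.metrics import recall_score
# Recall for the churn class: Tp / (Tp + Fn) = 221 / (221 + 153) ≈ 0.59
print("Churn recall:", recall_score(y_test, y_pred_rf))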
6.3.1. Threshold tuning¶
For this reason, we will run the Random Forest model by choosing a 0.4 predicted probability cut-off for the classifier, which is lower than the 0.5 used by default, in an attempt to capture more churners:
# Get the predicted probabilities of churn for each test customer
y_proba = rf_model.predict_proba(X_test)[:, 1]
# Set a custom threshold
threshold = 0.4
# Classify as churn every client whose churn probability is at or above the threshold
y_pred_custom = (y_proba >= threshold).astype(int)
# Evaluate
print(f"Random Forest Accuracy (threshold={threshold}):", accuracy_score(y_test, y_pred_custom))
print("\nClassification Report:\n", classification_report(y_test, y_pred_custom))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_custom))
Random Forest Accuracy (threshold=0.4): 0.8254080908445706
Classification Report:
precision recall f1-score support
0 0.89 0.87 0.88 1035
1 0.66 0.70 0.68 374
accuracy 0.83 1409
macro avg 0.78 0.78 0.78 1409
weighted avg 0.83 0.83 0.83 1409
Confusion Matrix:
[[903 132]
[114 260]]
Recall for churners has now increased to 70% (260/(114+260)) — this means we’re catching over 2/3 of customers who would churn, which is considered good for retention strategies. While we are making some false positive predictions (132 compared to 81 earlier), that’s often acceptable if it means preventing real churners from leaving.
Scikit-learn provides a function, precision_recall_curve, that computes precision-recall pairs for different probability thresholds in a binary classification task:
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
plt.plot(thresholds, precision[:-1], label='Precision')
plt.plot(thresholds, recall[:-1], label='Recall')
plt.xlabel("Threshold")
plt.legend()
plt.title("Precision-Recall vs Threshold")
plt.grid()
plt.show()
The curve confirms that the 0.4 cut-off is a sensible choice.
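Rather than reading the cut-off from the plot, we could also pick it programmatically. Below is a minimal sketch (assuming NumPy is available, and reusing the precision, recall and thresholds arrays returned by precision_recall_curve) that finds the highest threshold still achieving a target churn recall of 70%:
import numpy as np
# recall[:-1] aligns with thresholds (which are in increasing order);
# pick the highest threshold whose recall is still at or above the target
target_recall = 0.70
eligible = np.where(recall[:-1] >= target_recall)[0]
best_threshold = thresholds[eligible[-1]]
print(f"Highest threshold with recall >= {target_recall:.0%}: {best_threshold:.2f}")
print(f"Precision at that threshold: {precision[eligible[-1]]:.2f}")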
# Extract feature importance from the model
feature_importance_rf = pd.Series(rf_model.feature_importances_, index=X_train.columns)
# Sort values in descending order
feature_importance_rf = feature_importance_rf.sort_values(ascending=False)
# Plot feature importance
plt.figure(figsize=(10, 5))
feature_importance_rf[:10].plot(kind="barh", color="lightblue")
plt.xlabel("Contribution to Model’s Decision-Making")
plt.title("Top 10 Feature Importances (Random Forest)")
plt.gca().invert_yaxis() # Highest importance at the top
# Add labels to bars
for index, value in enumerate(feature_importance_rf[:10]):
plt.text(value + 0.0019, index, f"{value:.2f}", va='center')
plt.show()
Comments
- Once again, Monthly Charge, Age and City appear to contribute most to whether a client churns.
It is therefore worth further exploring these features:
- Monthly Charge: I'll create segments (e.g., low, mid, high spenders) and analyze churn in each group. Is there a price threshold beyond which churn spikes?
- Age: Are younger customers churning more?
- Cities: Are specific cities driving higher churn?
6.4.1. Monthly Charge¶
# I'll start by re-introducing the Churn Label in order to perform the groupby
df_final["Churn_Label_Encoded"] = df_numerical_reduced["Churn_Label_Encoded"]
# Define charge segments
bins = [0, 40, 80, df_final["Monthly Charge"].max()]
labels = ["Low Spend (<40)", "Mid Spend (40-80)", "High Spend (>80)"]
df_final["Charge_Segment"] = pd.cut(df_final["Monthly Charge"], bins=bins, labels=labels, include_lowest=True)
# Calculate churn rate per segment
charge_churn = df_final.groupby("Charge_Segment", observed=False)["Churn_Label_Encoded"].mean()
# Plot
plt.figure(figsize=(6,3.5))
charge_churn.plot(kind='bar', colormap='summer')
# Add labels on top of bars
for index, value in enumerate(charge_churn):
plt.text(index, value + 0.0055, f"{value:.1%}", ha='center', fontsize=10)
plt.title("Churn Rate by Monthly Charge Segment")
plt.ylabel("Churn Rate")
plt.xlabel("Monthly Charge Segment")
plt.xticks(rotation=0)
plt.show()
Comment
Mid and High spenders are more likely to churn, which could suggest making the packages offered to them more competitive. Plans involving Fiber Optic internet and the Online Security add-on might need particular attention, as they contribute significantly to churn (see the quick check below).
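A quick way to check this is to compare churn rates across the relevant one-hot flags (a minimal sketch, assuming the Internet Type_Fiber Optic and Online Security_Yes columns created during encoding):
# Mean churn per flag value (0.0 = feature absent, 1.0 = feature present)
print(df_final.groupby("Internet Type_Fiber Optic")["Churn_Label_Encoded"].mean())
print(df_final.groupby("Online Security_Yes")["Churn_Label_Encoded"].mean())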
6.4.2. Age¶
We divide customers by age group:
- Young (18-30)
- Middle-Aged (30-55)
- Senior (55+)
Then, we analyze churn rate per age group:
# Define age segments
bins = [18, 30, 55, df_final["Age"].max()]
labels = ["Young (18-30)", "Middle-Aged (30-55)", "Senior (55+)"]
df_final["Age_Segment"] = pd.cut(df_final["Age"], bins=bins, labels=labels, include_lowest=True)
# Calculate churn rate per age segment
age_churn = df_final.groupby("Age_Segment", observed=False)["Churn_Label_Encoded"].mean()
# Plot
plt.figure(figsize=(6,3.5))
age_churn.plot(kind='bar', colormap='summer')
# Add labels on top of bars
for index, value in enumerate(age_churn):
plt.text(index, value + 0.0055, f"{value:.1%}", ha='center', fontsize=10)
plt.title("Churn Rate by Age Group")
plt.ylabel("Churn Rate")
plt.xlabel("Age Segment")
plt.xticks(rotation=0)
plt.show()
Comment
Senior citizens are more likely to churn. What can be done to make the packages offered to them more attractive?
6.4.3. Cities (Location)¶
Do cities with more customers have higher or lower churn?
# Create city frequency categories (quartiles)
# qcut discretizes a variable into equal-sized buckets based on the number of quantiles, in this case 4
df_final["City_Frequency_Category"] = pd.qcut(df_final["City_Frequency"], q=4, labels=["Very Low", "Low", "Medium", "High"])
# Calculate churn rate per category
city_churn = df_final.groupby("City_Frequency_Category", observed=False)["Churn_Label_Encoded"].mean()
# Plot
plt.figure(figsize=(6,3.5))
ax = city_churn.plot(kind='bar', colormap='summer')
# Add labels on top of bars
for index, value in enumerate(city_churn):
plt.text(index, value + 0.01, f"{value:.1%}", ha='center', fontsize=10)
plt.title("Churn Rate by City Frequency in Dataset")
plt.ylabel("Churn Rate")
plt.xlabel("City Frequency Category")
plt.xticks(rotation=0)
plt.show()
Comment
These results show a counter-intuitive pattern: cities with a higher number of customers in the dataset also have higher churn rates. This could be because these clients were attracted by marketing campaigns and received promotional offers which, once expired, are discontinued, prompting the customers to leave. Remember we had stored the city counts in a variable:
# Display the 10 cities with the most customers
city_counts.head(10)
City
Los Angeles      293
San Diego        285
San Jose         112
Sacramento       108
San Francisco    104
Fresno            61
Long Beach        60
Oakland           52
Escondido         51
Stockton          44
Name: count, dtype: int64
This reveals that:
- Los Angeles, San Diego, San Jose, and San Francisco are major urban hubs with high population and competitive telecom markets. This means that customers can easily switch providers if dissatisfied.
- Customers in these cities also tend to be tech-savvy, which means higher expectations for bundles and solutions, customer service, and pricing.
Let's examine whether contract duration or charges play a role in churn ratios:
df_final["City"] = cities_column
# Filter the DataFrame to include only the specified California cities
california_cities = ["Los Angeles", "San Diego", "San Jose", "Sacramento", "San Francisco",
"Fresno", "Long Beach", "Oakland", "Escondido", "Stockton"]
filtered_df = df_final[df_final["City"].isin(california_cities)]
# Group by city, calculate the mean, and sort
sorted_grouped_data = filtered_df.groupby("City")[["Contract_One Year", "Contract_Two Year", "Churn_Label_Encoded"]].mean()
sorted_grouped_data = sorted_grouped_data.sort_values(by='Churn_Label_Encoded', ascending = False)
sorted_grouped_data.head(10)
| Contract_One Year | Contract_Two Year | Churn_Label_Encoded | |
|---|---|---|---|
| City | |||
| San Diego | 0.189474 | 0.108772 | 0.649123 |
| Escondido | 0.235294 | 0.274510 | 0.313725 |
| San Francisco | 0.221154 | 0.288462 | 0.298077 |
| Stockton | 0.136364 | 0.272727 | 0.272727 |
| Los Angeles | 0.215017 | 0.259386 | 0.266212 |
| San Jose | 0.205357 | 0.267857 | 0.258929 |
| Long Beach | 0.166667 | 0.266667 | 0.250000 |
| Oakland | 0.288462 | 0.211538 | 0.250000 |
| Sacramento | 0.277778 | 0.268519 | 0.240741 |
| Fresno | 0.229508 | 0.311475 | 0.213115 |
df_final[df_final["City"].isin(["Los Angeles", "San Diego", "San Jose", "Sacramento", "San Francisco",
"Fresno", "Long Beach", "Oakland", "Escondido", "Stockton"])] \
.groupby("City")["Monthly Charge"].mean()
City
Escondido        67.879412
Fresno           63.633607
Long Beach       65.367500
Los Angeles      62.132594
Oakland          66.400000
Sacramento       62.726389
San Diego        72.021930
San Francisco    64.642308
San Jose         65.589286
Stockton         65.878409
Name: Monthly Charge, dtype: float64
Comments
- San Diego has a very high churn rate (64.9%). It also has the highest average Monthly Charge (72.02 USD), which suggests price sensitivity could be a factor. Since only ~19% of its customers are on a One-Year contract and just ~10% on a Two-Year contract, short-term (Month-to-Month) contracts likely dominate this city (see the quick check after this list). Customers in San Diego might be more willing to switch providers due to high competition or dissatisfaction with pricing.
- Escondido also has a high churn rate (31.3%), and its average Monthly Charge (67.88 USD) is the highest after San Diego. In contrast to San Diego, it has 23.5% and 27.5% of customers on one-year and two-year contracts respectively, so other factors might be driving churn there (e.g., network quality, customer service, competition).
- Cities with comparatively high two-year contract shares (e.g. Fresno, Sacramento, Long Beach, San Jose) sit toward the bottom of the churn ranking, consistent with longer contracts reducing churn in general.
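Since Month-to-Month is the contract category dropped by drop="first" during one-hot encoding, its share per city can be recovered as the remainder. A minimal sketch, reusing sorted_grouped_data from above:
# Month-to-Month share = 1 - (One-Year share + Two-Year share)
sorted_grouped_data["Contract_Month_to_Month"] = (
    1 - sorted_grouped_data["Contract_One Year"] - sorted_grouped_data["Contract_Two Year"]
)
print(sorted_grouped_data[["Contract_Month_to_Month", "Churn_Label_Encoded"]])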
7. Business Action Recommendations¶
Based on our exploratory data analysis, prediction modeling, and churn investigation, the following steps to reduce churn and improve customer retention are proposed:
1. Address High Churn in Key Cities (San Diego & Escondido)¶
Finding:
- San Diego has the highest churn rate (64.9%) and highest average Monthly Charge (72.02 USD).
- Escondido also experiences high churn (31.3%) with high Monthly Charges (67.88 USD).
- These cities also have a lower percentage of long-term contracts (One-Year & Two-Year).
Recommendations:
- Offer discounts for long-term contracts (One-Year & Two-Year) to encourage customer commitment.
- Identify high-churn customers and offer loyalty perks.
- Investigate regional service quality issues, e.g. conduct customer satisfaction surveys to determine if poor service quality is a driver of churn.
- Research local competitors' pricing and service offerings. Consider matching competitor pricing or improving value-added services.
2. Pricing Optimization for Monthly Charge Segments¶
Finding:
- Customers with higher Monthly Charges churn at significantly higher rates.
- "Low-spend" customers (<40 USD) have lower churn, while "high-spend" customers (>80 USD) are more likely to leave.
Recommendations:
- Introduce personalized retention offers for high spenders.
- Offer premium customer support for high-paying customers.
- Are premium plans priced too high compared to competitors? Re-evaluate pricing strategy for premium plans.
- Consider adding additional benefits (higher speeds, priority customer support, free streaming services) to justify the cost.
- Make limited-time offers that increase commitment (e.g., "Sign a 2-year contract now and get 10% off your monthly bill").
3. Increase Long-Term Contract Engagement¶
Finding:
- Churn is significantly lower for customers on long-term contracts (one-year & two-year).
- High-churn cities have a lower percentage of long-term contracts.
Recommendations:
- Expand incentives for long-term contracts.
- Offer stronger discounts for committing to a one-year or two-year contract or other benefits (e.g. priority support, free device upgrades etc.).
- Target month-to-month customers with proactive renewal offers.
4. Addressing High Churn Among Senior Customers¶
Finding:
- Seniors (55+) have the highest churn rate at 32.5%, significantly higher than middle-aged (24.2%) and young customers (22.4%).
Recommendations:
- Dedicated customer support for seniors: Many older customers may prefer phone-based or in-person support rather than digital self-service tools.
- Senior-friendly product bundles: Offer simplified internet + phone service plans with clear pricing and easy access to support.
5. Using Referrals & Family Status to Reduce Churn¶
Finding:
- Customers with referrals are less likely to churn.
- Customers with dependents tend to be more stable, likely due to shared family usage.
Recommendations:
- Offer better incentives for both referrers and new customers (e.g., discounts, service upgrades).
- Implement a tiered referral program where customers earn increasing rewards for multiple successful referrals.
- Employ family discounts or bundle offers that incentivize customers to add dependents to their plan.
- Offer family-friendly support services (e.g., parental controls, joint account management).
6. Monitor Client Satisfaction for Future Churn Prevention¶
Finding:
The model used in the analysis performed well in predicting churn (~83% accuracy using Random Forest). Satisfaction Score was a major predictor (it was removed for business actionability). The model showed that Monthly Charge, Age, and City Frequency significantly impact churn.
Recommendations:
- Deploy churn prediction in real-time customer systems, focusing on the client segments highlighted in this analysis (a small scoring sketch follows this list).
- Use the model on clients with low real-time satisfaction scores to investigate and confirm the findings above, and work on improvements.
- Use retention strategies (e.g., targeted offers, priority support) for flagged customers.
- Integrate churn insights into marketing and sales strategies.
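As an illustration only, a minimal scoring sketch for such a deployment, assuming the trained rf_model, the 0.4 threshold chosen earlier, and a hypothetical new_customers DataFrame with the same feature columns as X:
# Hypothetical example: score incoming customers and flag those above the tuned threshold
churn_probability = rf_model.predict_proba(new_customers)[:, 1]
at_risk = new_customers[churn_probability >= 0.4]
print(f"{len(at_risk)} customers flagged for proactive retention offers")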
# Export clean dataset
df_final.to_csv("telco_churn_clean.csv", index=False)