A/B Test for Vanguard - Evaluating New Features¶

Introduction¶

Project Brief¶

Vanguard is an investment management company providing a variety of financial services. Its clients use a web app to interact with the company's products, and a redesign of that app has been proposed. Featuring a more intuitive and modern User Interface (UI), coupled with timely in-context prompts (cues, messages, hints, or instructions provided to users directly within the context of their current task or action), the redesign is expected to make the online process smoother for clients.

The main question this analysis addresses is whether the new interface is effective: does it enhance the user experience and improve completion rates for client transactions?

To address it, an A/B testing approach has been followed, comparing two versions of the interface: the control version and the experimental version.

  • Control Group: Clients interacted with Vanguard’s traditional online process.
  • Test Group: Clients experienced the new interface.

Both groups navigated through an identical process sequence: an initial page, three subsequent steps, and finally a confirmation page signaling process completion. The client has provided data on the use of the two interfaces over a specific testing period, 15/03/2017 to 20/06/2017, and is awaiting insights that will help it decide how to move forward with the app.

Methodology Outline¶

1. Data Cleaning¶

  • Import and inspect the dataset for missing values, inconsistencies, and duplicates.
  • Handle missing data using imputation or removal where necessary.
  • Standardize date/time formats and validate categorical values.
  • Ensure consistency across variations and process steps.

2. Data Exploration & Preparation¶

  • Generate summary statistics and visualize key distributions.
  • Segment data by variation and process step to identify trends and outliers.
  • Engineer relevant features (e.g., time spent per step, conversion rates).

3. Experimental Metrics & KPIs¶

  • Define primary success metrics (e.g., conversion rate, time to completion).
  • Track secondary KPIs such as step-wise drop-off rates and navigation patterns.
  • Compare user engagement metrics across variations.

4. Hypothesis Testing¶

  • Formulate null and alternative hypotheses for key metrics.
  • Use A/B testing methodology with statistical tests (e.g., t-test, chi-square, two-proportion z-test); a minimal sketch follows this outline.
  • Calculate confidence intervals and p-values to determine significance.
  • Validate results, check for potential biases, and interpret findings for decision-making.
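As a preview of step 4, a two-proportion z-test on completion rates could be run as in the minimal sketch below (the counts are placeholders for illustration only, not results from this experiment):

from statsmodels.stats.proportion import proportions_ztest

# Placeholder counts for illustration only (not actual results from this experiment):
# completed visits and total visits for [test, control]
completions = [2000, 1500]
totals = [3500, 3100]

# Two-sided z-test for the difference between the two completion proportions
z_stat, p_value = proportions_ztest(count=completions, nobs=totals)
print(f"z-statistic: {z_stat:.2f}, p-value: {p_value:.4f}")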

1. Data Cleaning¶

1.2. Importing libraries¶

We'll be using Python to examine the datasets. Let's start by importing the libraries we are going to use:

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import seaborn as sns

# Statistical analysis
from scipy.stats import (
    ttest_ind,
    chi2_contingency,
    norm,
    mannwhitneyu,
    shapiro,
    probplot
)
from statsmodels.stats.proportion import proportions_ztest

# Machine learning tools
from sklearn.impute import SimpleImputer

# Warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

To avoid scientific notation, I'll use pandas.set_option() to display numbers with two decimal places, as it will come in handy later:

In [2]:
# Disable scientific notation
pd.set_option('display.float_format', '{:.2f}'.format)
# Define the palette we will be using
palette = sns.color_palette("colorblind")

There are three primary datasets provided: Client Profiles, Digital Footprints, and Experiment Roster:

  • Client Profiles provide demographic information, including age, investment preferences, and account types.
  • Digital Footprints track user interactions on the website, capturing behavior patterns that inform user experience adjustments.
  • The Experiment Roster outlines the experimental setup, including test and control group allocations, experiment duration, and other relevant attributes.

Let's import the datasets we will be working with:

In [3]:
df_demographics = pd.read_csv('/kaggle/input/vanguards-ab-testing-dataset/df_final_demo.txt')
df_web_1 = pd.read_csv('/kaggle/input/vanguards-ab-testing-dataset/df_final_web_data_pt_1.txt') 
df_web_2 = pd.read_csv('/kaggle/input/vanguards-ab-testing-dataset/df_final_web_data_pt_2.txt')
df_experiment_clients = pd.read_csv('/kaggle/input/vanguards-ab-testing-dataset/df_final_experiment_clients.txt')

1.3. Demographics dataset cleaning¶

We'll start by exploring the df_demographics dataset. Let's have a look at it:

In [4]:
df_demographics.head()
Out[4]:
client_id clnt_tenure_yr clnt_tenure_mnth clnt_age gendr num_accts bal calls_6_mnth logons_6_mnth
0 836976 6.00 73.00 60.50 U 2.00 45105.30 6.00 9.00
1 2304905 7.00 94.00 58.00 U 2.00 110860.30 6.00 9.00
2 1439522 5.00 64.00 32.00 U 2.00 52467.79 6.00 9.00
3 1562045 16.00 198.00 49.00 M 2.00 67454.65 3.00 6.00
4 5126305 12.00 145.00 33.00 F 2.00 103671.75 0.00 3.00

This is the client database. Let's create a dictionary with the key-value pairs for renaming the columns:

In [5]:
new_column_names = {
        "clnt_tenure_yr": "tenure_year",
        "clnt_tenure_mnth": "tenure_month",
        "clnt_age": "age",
        "gendr": "gender",
        "num_accts": "number_of_accounts",
        "bal": "balance",
        "calls_6_mnth": "calls_6_month",
        "logons_6_mnth": "logons_6_month"
    }

...and now let's rename the columns:

In [6]:
# Renaming columns
df_demographics.rename(columns=new_column_names, inplace=True)
In [7]:
# Check on the demographics dataset
df_demographics.info()

# check duplicates rows
df_demographics_duplicates = df_demographics.duplicated().sum()
print("Number of duplicates:", df_demographics_duplicates)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70609 entries, 0 to 70608
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   client_id           70609 non-null  int64  
 1   tenure_year         70595 non-null  float64
 2   tenure_month        70595 non-null  float64
 3   age                 70594 non-null  float64
 4   gender              70595 non-null  object 
 5   number_of_accounts  70595 non-null  float64
 6   balance             70595 non-null  float64
 7   calls_6_month       70595 non-null  float64
 8   logons_6_month      70595 non-null  float64
dtypes: float64(7), int64(1), object(1)
memory usage: 4.8+ MB
Number of duplicates: 0

No duplicates! Checking out the data types, we notice that:

  • client_id should be a string (object), as we won't be performing calculations on it
  • age and balance are fine as floats; the rest should be converted to integers

So let's do that:

In [8]:
df_demographics['client_id'] = df_demographics['client_id'].astype(str)
df_demographics[['number_of_accounts', 'calls_6_month', 'logons_6_month', 'tenure_year', 'tenure_month']] = df_demographics[['number_of_accounts', 'calls_6_month', 'logons_6_month', 'tenure_year', 'tenure_month']].astype('Int64')

Let's define separate lists of column names by type, as they will come in handy later:

In [9]:
df_numerical_columns = ['tenure_year', 'tenure_month', 'age', 'number_of_accounts', 'balance', 'calls_6_month', 'logons_6_month']
df_categorical_columns = ['client_id', 'gender']
In [10]:
df_demographics.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70609 entries, 0 to 70608
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   client_id           70609 non-null  object 
 1   tenure_year         70595 non-null  Int64  
 2   tenure_month        70595 non-null  Int64  
 3   age                 70594 non-null  float64
 4   gender              70595 non-null  object 
 5   number_of_accounts  70595 non-null  Int64  
 6   balance             70595 non-null  float64
 7   calls_6_month       70595 non-null  Int64  
 8   logons_6_month      70595 non-null  Int64  
dtypes: Int64(5), float64(2), object(2)
memory usage: 5.2+ MB
In [11]:
# Count null values in each column
null_counts = df_demographics.isnull().sum() 

print("Number of null values in each column:")
print(null_counts)
Number of null values in each column:
client_id              0
tenure_year           14
tenure_month          14
age                   15
gender                14
number_of_accounts    14
balance               14
calls_6_month         14
logons_6_month        14
dtype: int64

The percentage of missing values is very low (<1%). Therefore, we'll keep the rows with missing values, filling in numerical columns with median values and categorical ones with mode values, using the sklearn library. Note that the imputer will convert integer values to float.

In [12]:
# Impute numerical columns with median
imputer_num = SimpleImputer(strategy='median') 
df_demographics[df_numerical_columns] = imputer_num.fit_transform(df_demographics[df_numerical_columns])

# Impute categorical columns with the most frequent value (mode)
imputer_cat = SimpleImputer(strategy='most_frequent') 
df_demographics[df_categorical_columns] = imputer_cat.fit_transform(df_demographics[df_categorical_columns]) 
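Since SimpleImputer returns floats, the affected columns could optionally be cast back to a nullable integer type afterwards, as in the minimal sketch below (this was not done here, which is why the summary further down still shows float64 dtypes):

# Optional: restore nullable integer dtypes after imputation (SimpleImputer outputs floats)
int_columns = ['tenure_year', 'tenure_month', 'number_of_accounts',
               'calls_6_month', 'logons_6_month']
df_demographics[int_columns] = df_demographics[int_columns].round().astype('Int64')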

We will create a function that will summarize useful metrics for the dataset, such as unique and missing values, as well as types:

In [13]:
# Define a new function
def unique_and_missing_values_dtype(df):

    # Non-null counts and data types
    non_null_counts = df.notnull().sum()
    dtypes = df.dtypes

    # Count of unique values
    unique_count = df.nunique()

    # Percentage of unique values
    unique_percentage = (df.nunique() / len(df)) * 100

    # Count of missing values
    missing_count = df.isnull().sum()

    # Percentage of missing values
    missing_percentage = df.isnull().mean() * 100

    # Combine into a DataFrame
    summary = pd.DataFrame({
        'non-Null_count': non_null_counts,
        'dtype': dtypes,
        'unique_values': unique_count,
        '%_unique': unique_percentage.round(2).astype(str) + '%',
        'missing_values': missing_count,
        '%_missing': missing_percentage.round(2).astype(str) + '%'
    })

    return summary

unique_and_missing_values_dtype(df_demographics)
Out[13]:
non-Null_count dtype unique_values %_unique missing_values %_missing
client_id 70609 object 70609 100.0% 0 0.0%
tenure_year 70609 float64 54 0.08% 0 0.0%
tenure_month 70609 float64 482 0.68% 0 0.0%
age 70609 float64 165 0.23% 0 0.0%
gender 70609 object 4 0.01% 0 0.0%
number_of_accounts 70609 float64 8 0.01% 0 0.0%
balance 70609 float64 70328 99.6% 0 0.0%
calls_6_month 70609 float64 8 0.01% 0 0.0%
logons_6_month 70609 float64 9 0.01% 0 0.0%

It's important to note here that all client ids are unique.
An additional function will provide the description of the numerical columns:

In [14]:
def analyze_numerical(df):
    # Select numerical columns
    numerical_cols = df.select_dtypes(include=['number']).columns

    # Perform descriptive analysis on numerical columns
    numerical_desc = df[numerical_cols].describe()

    # Display the resulting DataFrame
    print("\nNumerical Columns Analysis:")

    return numerical_desc

analyze_numerical(df_demographics)
Numerical Columns Analysis:
Out[14]:
tenure_year tenure_month age number_of_accounts balance calls_6_month logons_6_month
count 70609.00 70609.00 70609.00 70609.00 70609.00 70609.00 70609.00
mean 12.05 150.66 46.44 2.26 147428.56 3.38 5.57
std 6.87 82.08 15.59 0.53 301481.14 2.24 2.35
min 2.00 33.00 13.50 1.00 13789.42 0.00 1.00
25% 6.00 82.00 32.50 2.00 37350.12 1.00 4.00
50% 11.00 136.00 47.00 2.00 63332.90 3.00 5.00
75% 16.00 192.00 59.00 2.00 137521.72 6.00 7.00
max 62.00 749.00 96.00 8.00 16320040.15 7.00 9.00

The average client has been with the company for about 12 years, is about 46 years old, and holds 2-3 accounts with an average balance of roughly $147,429.

1.4. Web dataset cleaning¶

We'll start by concatenating the two files and having a quick look at the result:

In [15]:
# Concatenate the two parts of the web data
df_web_data = pd.concat([df_web_1, df_web_2])

df_web_data.head()
Out[15]:
client_id visitor_id visit_id process_step date_time
0 9988021 580560515_7732621733 781255054_21935453173_531117 step_3 2017-04-17 15:27:07
1 9988021 580560515_7732621733 781255054_21935453173_531117 step_2 2017-04-17 15:26:51
2 9988021 580560515_7732621733 781255054_21935453173_531117 step_3 2017-04-17 15:19:22
3 9988021 580560515_7732621733 781255054_21935453173_531117 step_2 2017-04-17 15:19:13
4 9988021 580560515_7732621733 781255054_21935453173_531117 step_3 2017-04-17 15:18:04

This file contains the website visit registry for the clients in the database. The columns don't need renaming. Let's proceed by checking data types:

In [16]:
df_web_data.info()
<class 'pandas.core.frame.DataFrame'>
Index: 755405 entries, 0 to 412263
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   client_id     755405 non-null  int64 
 1   visitor_id    755405 non-null  object
 2   visit_id      755405 non-null  object
 3   process_step  755405 non-null  object
 4   date_time     755405 non-null  object
dtypes: int64(1), object(4)
memory usage: 34.6+ MB

client_id needs to be converted to a string (as we won't be performing calculations on it), while date_time needs to become a datetime object:

In [17]:
df_web_data['client_id'] = df_web_data['client_id'].astype(str)
df_web_data['date_time'] = df_web_data['date_time'].astype('datetime64[ns]')
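The brief states a testing window of 15/03/2017 to 20/06/2017, so we can sanity-check the recorded date range at this point; a minimal sketch (the window bounds are taken from the project brief and assumed inclusive):

# Sanity-check the recorded activity against the stated testing window
print("Earliest timestamp:", df_web_data['date_time'].min())
print("Latest timestamp:", df_web_data['date_time'].max())

# Share of records falling inside the stated window (bounds assumed inclusive)
in_window = df_web_data['date_time'].between('2017-03-15', '2017-06-20 23:59:59')
print(f"Records inside the window: {in_window.sum()} of {len(df_web_data)}")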

Let's check for null and duplicates:

In [18]:
# check duplicates rows
df_web_concat_duplicates = df_web_data.duplicated().sum()
print("Number of duplicates:", df_web_concat_duplicates)

# Count null values in each column
null_counts = df_web_data.isnull().sum() 

print("Number of null values in each column:")
print(null_counts)
Number of duplicates: 10764
Number of null values in each column:
client_id       0
visitor_id      0
visit_id        0
process_step    0
date_time       0
dtype: int64

That's a lot of duplicates! We'll remove them as they will skew our results:

In [19]:
# Drop duplicates
df_web_data = df_web_data.drop_duplicates()
In [20]:
df_web_data.info()
<class 'pandas.core.frame.DataFrame'>
Index: 744641 entries, 0 to 412263
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   client_id     744641 non-null  object        
 1   visitor_id    744641 non-null  object        
 2   visit_id      744641 non-null  object        
 3   process_step  744641 non-null  object        
 4   date_time     744641 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 34.1+ MB

Let's examine the data for a particular visit, to better understand it:

In [21]:
selected_rows = df_web_data[df_web_data['visit_id']=='960651974_70596002104_312201'].sort_values(by=['date_time'], ascending=True)

selected_rows
Out[21]:
client_id visitor_id visit_id process_step date_time
12 8320017 39393514_33118319366 960651974_70596002104_312201 start 2017-04-05 13:08:06
11 8320017 39393514_33118319366 960651974_70596002104_312201 step_1 2017-04-05 13:08:24
10 8320017 39393514_33118319366 960651974_70596002104_312201 step_2 2017-04-05 13:08:40
9 8320017 39393514_33118319366 960651974_70596002104_312201 step_3 2017-04-05 13:09:43
8 8320017 39393514_33118319366 960651974_70596002104_312201 confirm 2017-04-05 13:10:05

Critical Assumption¶

A question that needs to be addressed is whether the timestamp signifies the start or the end of each individual step. For the purpose of this analysis, I will assume the following:

1. The timestamp signifies the start, i.e. it is the moment when the respective page of the step has been loaded, prior to the client filling it in. This means that we can calculate the duration of start, step_1, step_2 and step_3 by subtracting each step's timestamp from the timestamp of the step that follows, e.g. start_duration = step_1_timestamp - start_timestamp.
2. Given the above, the duration of the confirm step cannot be calculated, as no step follows it to subtract its timestamp from. However, we will assume that reaching the confirm stage equals completion of the process, so while confirm_step_time_duration will equal 0, reaching it marks the specific visit as 'complete'. A way to imagine it is that the 'confirm' step is simply a webpage which lets the user know that 'The process has been completed'.
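Under this assumption, step durations can be derived by ordering each visit's events chronologically and subtracting each timestamp from the one that follows; a minimal sketch (df_durations and duration_seconds are illustrative names, not used elsewhere in this notebook):

# Order events chronologically within each visit
df_durations = df_web_data.sort_values(['visit_id', 'date_time']).copy()

# Timestamp of the next event within the same visit
next_event = df_durations.groupby('visit_id')['date_time'].shift(-1)

# Duration of the current step = timestamp of the following step minus the current one
df_durations['duration_seconds'] = (next_event - df_durations['date_time']).dt.total_seconds()

# The last event of each visit (typically 'confirm') has no successor, so its duration is NaN,
# which matches the assumption that reaching 'confirm' simply marks completion
df_durations[['visit_id', 'process_step', 'duration_seconds']].head()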

1.5. Experiment clients dataset cleaning¶

In [22]:
# Check the dataset
df_experiment_clients.head()
Out[22]:
client_id Variation
0 9988021 Test
1 8320017 Test
2 4033851 Control
3 1982004 Test
4 9294070 Control
In [23]:
df_experiment_clients.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70609 entries, 0 to 70608
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   client_id  70609 non-null  int64 
 1   Variation  50500 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.1+ MB

This dataset shows which clients were exposed to the new interface ('Test' value) and which ones to the old one ('Control' value). Let's convert the 'Variation' column name to lower case, for consistency:

In [24]:
# Format column names to lower case for consistency
df_experiment_clients.columns = df_experiment_clients.columns.map(str.lower)
In [25]:
# Convert client id to a string, similar to before
df_experiment_clients['client_id'] = df_experiment_clients['client_id'].astype(str)

Now let's check for nulls and duplicates:

In [26]:
# check duplicates rows
df_experiment_clients_duplicates = df_experiment_clients.duplicated().sum()
print("Number of duplicates:", df_experiment_clients_duplicates)

# Count null values in each column
null_counts = df_experiment_clients.isnull().sum() 

print("Number of null values in each column")
print(null_counts)
Number of duplicates: 0
Number of null values in each column
client_id        0
variation    20109
dtype: int64

We have 20109 missing values out of a total of 70609 entries! Missing values mean these clients are not assigned to a specific group, which makes them irrelevant for the A/B test analysis. We'll drop them:

In [27]:
# Drop all missing values
df_experiment_clients = df_experiment_clients.dropna()

2. Data Exploration and Preparation¶

In this part, we will explore the datasets in detail.

2.1. Demographics dataframe exploration¶

Let's have a look at our dataset:

In [28]:
df_demographics.head()
Out[28]:
client_id tenure_year tenure_month age gender number_of_accounts balance calls_6_month logons_6_month
0 836976 6.00 73.00 60.50 U 2.00 45105.30 6.00 9.00
1 2304905 7.00 94.00 58.00 U 2.00 110860.30 6.00 9.00
2 1439522 5.00 64.00 32.00 U 2.00 52467.79 6.00 9.00
3 1562045 16.00 198.00 49.00 M 2.00 67454.65 3.00 6.00
4 5126305 12.00 145.00 33.00 F 2.00 103671.75 0.00 3.00

We'll start our visualisations with a facet plot of the dataframe columns:

In [ ]:
# Get all column names except 'client_id'
selected_columns = df_demographics.columns[~df_demographics.columns.isin(['client_id'])]

# Set figure size based on the number of subplots
n_cols = 1  # Number of columns in the grid layout
n_rows = len(selected_columns) # I calculate the number of rows needed

# Create a grid of subplots
fig, axs = plt.subplots(n_rows, n_cols, figsize=(7, 2 * n_rows))

# Loop through each selected column and plot based on their type
# I choose to use enumerate to add a counter to each item and create an enumerate object. 
# Each column name will come with its number (starting from 0).
for i, col in enumerate(selected_columns):
    if df_demographics[col].dtype in ['int64', 'float64']:
        sns.histplot(df_demographics[col].dropna(), ax=axs[i], bins=20, stat="count")
    else:
        # for the gender column
        sns.countplot(x=df_demographics[col].dropna(), ax=axs[i])
    axs[i].set_title(f"Distribution of {col}")

plt.tight_layout()
plt.show()

Comments on the demographics plots¶

1. Distribution of tenure_year: The distribution is skewed to the right (peak on the left), indicating that a majority of individuals have lower tenure years, while a smaller portion has been clients for a longer duration.

2. Distribution of tenure_month: Similar to the tenure_year distribution, it appears to be right-skewed, suggesting that a larger proportion of individuals have been clients for a shorter duration.

3. Distribution of age: The age distribution seems to be roughly bell-shaped, indicating a near-normal distribution. This suggests that the clients have a good mix across different age groups.

4. Distribution of gender: About a third of the clients have not provided their gender (U). Of those that have, males slightly outnumber females.

5. Distribution of number_of_accounts: The distribution is heavily skewed to the right, with most clients holding 2 accounts and a rapid decline in the number of individuals with more accounts.

6. Distribution of balance: The distribution is also skewed to the right, with a large number of individuals having relatively low balances.

7. Distribution of calls_6_month: The distribution is relatively uniform with a peak at 6 calls. This suggests that there is no significant concentration of individuals making a specific number of calls within the 6-month period.

8. Distribution of logons_6_month: The distribution is also relatively uniform with a peak at 9 logins. Similar to the call distribution, there doesn't seem to be a strong concentration of individuals with a particular login frequency.

Further Considerations:

  • We can segment age into groups to explore behavioral patterns across age ranges, for example to check if age correlates with balance, calls_6_month, or logons_6_month.
  • Check if holding multiple accounts impacts engagement or conversion in the A/B test.
  • Verify if the Test and Control groups are balanced across key demographics (e.g., age, gender, balance); a quick check is sketched below.
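To illustrate the last point above, a quick demographic balance check could look like the following sketch (df_check is an illustrative name; the merge on client_id mirrors the one performed later in section 2.4):

# Merge demographics with the group assignments for a quick balance check
df_check = pd.merge(df_demographics, df_experiment_clients, on='client_id')

# Mean of key numerical attributes per variation
print(df_check.groupby('variation')[['age', 'balance', 'tenure_year']].mean())

# Gender split per variation (row-normalised proportions)
print(pd.crosstab(df_check['variation'], df_check['gender'], normalize='index').round(3))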

We will now check for tenure_year outliers. The Vanguard Group was founded in 1975, i.e. 42 years before the dataset was provided, so values reaching 60 years look suspicious:

In [30]:
plt.figure(figsize=(4, 4))
sns.boxplot(df_demographics['tenure_year'], color=palette[2])
plt.title('Distribution of Tenure Years') 

plt.show()

print("Number of clients with over 42 tenure years:", (df_demographics['tenure_year'] > 42).sum(),"out of a total",len(df_demographics))
[Figure: Distribution of Tenure Years (boxplot)]
Number of clients with over 42 tenure years: 67 out of a total 70609

Regarding these outliers, we can check the following:

  • Are they high-value clients with large balances?
  • Do they behave differently in terms of activity (e.g., calls or logins)?
In [31]:
# Separate outliers
outliers = df_demographics[df_demographics['tenure_year'] > 42]

# Non-outliers
non_outliers = df_demographics[df_demographics['tenure_year'] <= 42]
In [32]:
print("Outliers balance statistics:")
print(outliers['balance'].describe())
print("\nNon-outliers balance statistics:")
print(non_outliers['balance'].describe())
Outliers balance statistics:
count        67.00
mean     368171.52
std      457130.32
min       14167.31
25%       70815.78
50%      149881.38
75%      438176.56
max     1874019.73
Name: balance, dtype: float64

Non-outliers balance statistics:
count      70542.00
mean      147218.90
std       301223.11
min        13789.42
25%        37345.83
50%        63312.62
75%       137283.28
max     16320040.15
Name: balance, dtype: float64

Outliers (clients with tenure > 42 years) have, on average, higher balances than non-outliers. Since they represent high-value clients with significantly larger balances, removing them could bias the analysis and miss out on potential insights about long-term, high-value customers.
Let's also compare activity metrics (calls_6_month and logons_6_month) for outliers vs. non-outliers:

In [33]:
fig, axes = plt.subplots(1, 2, figsize=(9, 3))

axes[0].boxplot([outliers['calls_6_month'], non_outliers['calls_6_month']], labels=['Outliers', 'Non-Outliers'])
axes[0].set_title('Calls (6 Months) Comparison')
axes[0].set_ylabel('Calls')

axes[1].boxplot([outliers['logons_6_month'], non_outliers['logons_6_month']], labels=['Outliers', 'Non-Outliers'])
axes[1].set_title('Logins (6 Months) Comparison')
axes[1].set_ylabel('Logins')

plt.tight_layout()
plt.show()
[Figure: Calls and Logins (6 Months) comparison, outliers vs. non-outliers]

Calls and logons between the two groups appear similar, with slightly higher values for the outliers.

2.2. Web Data dataframe¶

I'll merge the experiment client data with the web dataframe, to get a first comparison of activity between the test and control sessions:

In [34]:
# Merge files
df_web_data_exp = pd.merge(df_web_data, df_experiment_clients, on="client_id")

df_web_data_exp.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 317235 entries, 0 to 317234
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   client_id     317235 non-null  object        
 1   visitor_id    317235 non-null  object        
 2   visit_id      317235 non-null  object        
 3   process_step  317235 non-null  object        
 4   date_time     317235 non-null  datetime64[ns]
 5   variation     317235 non-null  object        
dtypes: datetime64[ns](1), object(5)
memory usage: 14.5+ MB
In [35]:
# Define the order of process steps
process_step_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']

# setting the dimensions of the plot
fig, ax = plt.subplots(figsize=(8, 3))
 
# drawing the plot
sns.countplot(x ='process_step', data = df_web_data_exp, palette = [palette[1],palette[0]], order=process_step_order, hue = 'variation')
# Get the containers for the bar plots
containers = ax.containers

# Add white labels below each bar
for c in containers:
    ax.bar_label(c, color='white', fmt='%d', label_type='center', fontsize=9)

plt.title('Process steps registered for the two variations') 
plt.show()
[Figure: Process steps registered for the two variations]

We notice that out of the processes started, fewer than half reach the confirm stage.

Is each individual user logging in more often under the test interface than under the control one?
I will next count the visit_id entries for each client_id:

In [36]:
df_web_data_visits = df_web_data_exp.groupby(['client_id', 'variation']).agg(
    total_visits=('visit_id', 'count')
# I use reset_index() to turn client_id and variation from index columns to normal ones:
).reset_index() 

Let's have a look at total_visits, to ensure it has no missing or anomalous data:

In [37]:
print("Missing values in total_visits:", df_web_data_visits['total_visits'].isnull().sum())
print("\nDescriptive stats for total_visits:\n", df_web_data_visits['total_visits'].describe())
Missing values in total_visits: 0

Descriptive stats for total_visits:
 count   50500.00
mean        6.28
std         4.03
min         1.00
25%         5.00
50%         5.00
75%         7.00
max        72.00
Name: total_visits, dtype: float64

And now let's merge the two dataframes:

In [38]:
df_web_data_exp_visits = pd.merge(
    df_web_data_exp,
    # Drop variation from df_web_data_visits before the merge to avoid redundant columns
    df_web_data_visits.drop('variation', axis=1).rename(columns={'total_visits': 'user_total_visits'}),
    on='client_id',
    how='left'
)
In [39]:
df_web_data_exp_visits.head()
Out[39]:
client_id visitor_id visit_id process_step date_time variation user_total_visits
0 9988021 580560515_7732621733 781255054_21935453173_531117 step_3 2017-04-17 15:27:07 Test 15
1 9988021 580560515_7732621733 781255054_21935453173_531117 step_2 2017-04-17 15:26:51 Test 15
2 9988021 580560515_7732621733 781255054_21935453173_531117 step_3 2017-04-17 15:19:22 Test 15
3 9988021 580560515_7732621733 781255054_21935453173_531117 step_2 2017-04-17 15:19:13 Test 15
4 9988021 580560515_7732621733 781255054_21935453173_531117 step_3 2017-04-17 15:18:04 Test 15

I'll define a function that will calculate percentiles which will help me understand the spread and distribution of data:

  • Q1 (First Quartile): The 25th percentile of the dataset. It is the median of the lower half of the data, meaning that 25% of the data points are below Q1.
  • Q3 (Third Quartile): The 75th percentile of the dataset. It is the median of the upper half of the data, meaning that 75% of the data points are below Q3.
  • Upper Whisker: The largest data point in a boxplot that is not considered an outlier.
In [40]:
# Calculate Summary Stats for Each Variation Group
def calculate_whiskers(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    upper_whisker = Q3 + 1.5 * IQR
    return Q1, Q3, upper_whisker

# Group by variation and calculate Q1, Q3, and upper whisker
group_stats = df_web_data_exp_visits.groupby('variation')['user_total_visits'].apply(calculate_whiskers)

# Convert the result to a DataFrame
group_stats = group_stats.apply(pd.Series)
# Q3 is the value at the 75th percentile of the data
# The upper whisker represents the largest data point that is not an outlier
group_stats.columns = ['Q1', 'Q3', 'upper_whisker']
print(group_stats)

x_order = df_web_data_exp_visits['variation'].unique()
group_stats = group_stats.loc[x_order]
            Q1    Q3  upper_whisker
variation                          
Control   5.00 10.00          17.50
Test      5.00 11.00          20.00

And let's plot the data:

In [41]:
# I'll use a boxplot
plt.figure(figsize=(7, 5))
sns.boxplot(x='variation', y='user_total_visits', data=df_web_data_exp_visits, palette=[palette[1], palette[0]])

plt.title("Boxplot of Total Visits by Variation", fontsize=14)
plt.xlabel("Variation", fontsize=12)
plt.ylabel("User Total Visits", fontsize=12)

# Annotations on the chart
for i, variation in enumerate(group_stats.index):
    upper_whisker = group_stats.loc[variation, 'upper_whisker']
    Q3 = group_stats.loc[variation, 'Q3']
    
    # Annotate Upper Whisker
    plt.text(i, upper_whisker, f"UW: {upper_whisker:.1f}", ha='center', va='bottom', fontsize=10, color='black')
    
    # Annotate Q3 
    plt.text(i, Q3, f"Q3: {Q3:.1f}", ha='center', va='bottom', fontsize=10, color='black')

plt.tight_layout()
plt.show()
[Figure: Boxplot of Total Visits by Variation]

There are outliers beyond the upper whiskers in both groups: clients who have visited the site 40 or even 60 times, compared to an average of 6-7 visits for most users. Could they have been procrastinating, facing technical issues, getting used to the new interface, or even keeping a webpage open that reloads every time they log in? Either way, they might disproportionately influence metrics such as the mean and variance, leading to misleading results.

Instead of capping or removing those clients, I will keep only the most recent visits for each of them. I assume that the earlier visits will have allowed them to get used to the new interface; what we want to establish is the best user experience in the long term. The 'control' interface has presumably been around for some time already, so we could draw wrong conclusions when comparing something clients are accustomed to with something new. These last visits are also more likely to have been serious attempts at completing the process.

I will thus order their visits chronologically, keeping only the last ones, equal in number to the Q3 limit. For example, if an individual has registered 70 visits, we would drop the first 70 - Q3 visits and keep the remaining ones for analysis. This limits the analysis to a consistent, statistically valid number of visits (based on the Q3 threshold), while maintaining a meaningful subset of each client's activity.

In [42]:
# numpy quantile computes the qth quantile of the given data along the specified axis
Q3 = df_web_data_exp_visits['user_total_visits'].quantile(0.75)
# Find clients with total_visits exceeding Q3
exceeding_clients = df_web_data_exp_visits[df_web_data_exp_visits['user_total_visits'] > Q3]['client_id']

And now let's filter visits for each client:

In [43]:
# Filter for visits exceeding Q3
exceeding_visits = df_web_data_exp_visits[df_web_data_exp_visits['client_id'].isin(exceeding_clients)]

# Define a function to keep only the last Q3 visits for each client
def keep_last_q3_visits(group, q3_value):
    # Sort by visit_id (alphabetical = chronological in this case)
    group = group.sort_values(by='visit_id', ascending=True)
    # Keep only the last Q3 visits
    return group.iloc[-int(q3_value):]

# Apply this function to the exceeding visits
filtered_exceeding_visits = exceeding_visits.groupby('client_id').apply(keep_last_q3_visits, q3_value=Q3)

# Combine filtered visits with non-exceeding visits
remaining_visits = df_web_data_exp_visits[~df_web_data_exp_visits['client_id'].isin(exceeding_clients)]
df_web_data_filtered = pd.concat([remaining_visits, filtered_exceeding_visits]).reset_index(drop=True)
In [44]:
# Check total visits for each client after filtering
df_web_data_visits_filtered = df_web_data_filtered.groupby('client_id').agg(
    total_visits=('visit_id', 'count')
)

print(df_web_data_visits_filtered['total_visits'].describe())
count   50500.00
mean        5.77
std         2.53
min         1.00
25%         5.00
50%         5.00
75%         7.00
max        10.00
Name: total_visits, dtype: float64
In [45]:
df_web_data_filtered.head()
Out[45]:
client_id visitor_id visit_id process_step date_time variation user_total_visits
0 8320017 39393514_33118319366 960651974_70596002104_312201 confirm 2017-04-05 13:10:05 Test 5
1 8320017 39393514_33118319366 960651974_70596002104_312201 step_3 2017-04-05 13:09:43 Test 5
2 8320017 39393514_33118319366 960651974_70596002104_312201 step_2 2017-04-05 13:08:40 Test 5
3 8320017 39393514_33118319366 960651974_70596002104_312201 step_1 2017-04-05 13:08:24 Test 5
4 8320017 39393514_33118319366 960651974_70596002104_312201 start 2017-04-05 13:08:06 Test 5

2.3. Experiment Clients dataframe¶

Let's see the distribution of the clients between the Test and Control variations:

In [46]:
fig, ax = plt.subplots(figsize=(4, 3))

# I'll use a countplot
fig = sns.countplot(x='variation', data=df_experiment_clients, palette=[palette[1], palette[0]])

# Add labels 
for container in ax.containers:
    for bar in container:
        # get the height of the bar and the x-position
        yval = bar.get_height()
        xval = bar.get_x() + bar.get_width() / 2
        # Add label below the top of the bar (by adjusting yval)
        ax.text(xval, yval - 3500, f'{yval}', ha='center', va='bottom', fontsize=10, color='white')

ax.set_title('Number of total Experiment Clients', y=1.0, pad=15)

plt.show()
[Figure: Number of total Experiment Clients]

Comments on the Experiment Clients plot: There are more clients in the test variation than in the control one, so in absolute numbers we can expect higher step counts for the test group.

2.4. Relationships between parameters (Bivariate Analysis)¶

I'll merge the demographics and experiment datasets, as the variation is a key attribute in my analysis:

In [47]:
# Merge files
df_demographics = pd.merge(df_demographics, df_experiment_clients, on="client_id")

df_demographics.head()
Out[47]:
client_id tenure_year tenure_month age gender number_of_accounts balance calls_6_month logons_6_month variation
0 836976 6.00 73.00 60.50 U 2.00 45105.30 6.00 9.00 Test
1 2304905 7.00 94.00 58.00 U 2.00 110860.30 6.00 9.00 Control
2 1439522 5.00 64.00 32.00 U 2.00 52467.79 6.00 9.00 Test
3 1562045 16.00 198.00 49.00 M 2.00 67454.65 3.00 6.00 Test
4 5126305 12.00 145.00 33.00 F 2.00 103671.75 0.00 3.00 Control

First we will check the correlation of the parameters inside the demographics dataframe. I will convert categorical variables (gender and variation) to numerical values temporarily for the correlation matrix and visualize it using a heatmap:

In [48]:
# Convert categorical variables to numeric
df_demographics["gender_num"], gender_mapping = pd.factorize(df_demographics["gender"])
df_demographics["variation_num"], variation_mapping = pd.factorize(df_demographics["variation"])

# Print the mappings
print("Gender Mapping:")
for num, category in enumerate(gender_mapping):
    print(f"{category} → {num}")

print("\nVariation Mapping:")
for num, category in enumerate(variation_mapping):
    print(f"{category} → {num}")
Gender Mapping:
U → 0
M → 1
F → 2
X → 3

Variation Mapping:
Test → 0
Control → 1
In [49]:
# Drop non-numeric and redundant columns
df_numeric = df_demographics.drop(columns=["client_id", "gender", "variation"])

# Compute the correlation matrix
correlation_matrix = df_numeric.corr()
In [50]:
# Create the heatmap 
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(round(correlation_matrix, 2), annot=True, ax=ax,
            cmap="coolwarm", vmin=-1, vmax=1,
            cbar_kws={"label": "Correlation Coefficient"}) 

# Add labels and title
plt.title("Correlation Heatmap for Selected Numerical Variables", fontsize=14)
plt.xticks(ha="right", fontsize=10)  
plt.yticks(fontsize=10)
plt.show()
[Figure: Correlation Heatmap for Selected Numerical Variables]

Observations¶

The color intensity represents the strength of the correlation, with red indicating positive correlations, blue indicating negative ones, and lighter shades indicating little to no correlation.

  1. There is a near-perfect positive correlation (0.99) between calls and logons. This suggests that individuals who make more calls within 6 months also register a higher number of logins during the same period. This is expected, as increased activity often leads to both more calls and more logins. The strong correlation also suggests that these variables may be redundant and that including both in a model might lead to multicollinearity.
  2. Gender has a correlation of ~0.49 with tenure_year and tenure_month. This suggests that gender is moderately associated with tenure, meaning certain gender groups may have a longer tenure on average. It also has a weak correlation with age (0.22), but not strong enough to draw conclusions.
  3. Variation has near-zero correlations with all numerical variables. This suggests that the variation assignment was likely random, which is good for A/B testing.
  4. Tenure year and age have a moderate positive correlation (0.31), which is expected.
  5. Number of accounts shows moderate positive correlations with balance (0.27) and calls_6_month (0.24). This suggests that individuals with more accounts tend to have higher balances and are more likely to make calls. This might provide some insights into potential relationships between customer behavior and account characteristics. Understanding the types of accounts and the typical customer behavior can help explain the observed patterns.
  6. Most other correlations are relatively weak (values close to 0), indicating little to no linear relationship between those variables.

3. Experimental Metrics & KPIs¶

In this section we will set the key performance indicators (KPIs) for assessing the effectiveness of Vanguard’s new digital interface, to compare the performance of the Test Group against the Control Group.

1. The completion rate: the proportion of visits that reach the final 'confirm' step.

2. Time Spent on Each Step: The average duration users spend on each step.

3. Error Rates: If there’s a step where users a) go back to a previous step or b) repeat it, it may indicate confusion or a UI error (a minimal sketch of flagging backward navigation follows this list).

4. Drop-off Rate per Step: The proportion of users that exit the process at each step.
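As an illustration of the error-rate KPI, backward navigation within a visit can be flagged as in the sketch below (has_backward_step and the 'backward' column name are introduced only for this example, and the approach assumes df_web_data_filtered as built in section 2.2):

# Flag visits where the user moves to an earlier step than one already reached
step_rank = {step: i for i, step in enumerate(process_step_order)}

def has_backward_step(visit):
    # Order the visit's events in time and compare consecutive step ranks
    ranks = visit.sort_values('date_time')['process_step'].map(step_rank)
    return (ranks.diff() < 0).any()

backward_per_visit = (
    df_web_data_filtered.groupby(['variation', 'visit_id'])
                        .apply(has_backward_step)
                        .rename('backward')
                        .reset_index()
)

# Share of visits with at least one backward step, per variation
print(backward_per_visit.groupby('variation')['backward'].mean())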

We'll start by joining our tables:

In [51]:
# merge all the dataframes on client_id
df_merge_1 = pd.merge(df_web_data_filtered.drop('variation', axis=1), df_experiment_clients, on='client_id', how='inner')
df_whole = pd.merge(df_merge_1, df_demographics.drop('variation', axis=1), on='client_id', how='inner')

# format datetime and add date and month column
# the date value will not include the time
df_whole['date'] = df_whole['date_time'].dt.date
df_whole['date'] = pd.to_datetime(df_whole['date'], errors='coerce')
# extract the month
df_whole['month'] = df_whole['date_time'].dt.strftime('%B')
In [52]:
df_whole.head()
Out[52]:
client_id visitor_id visit_id process_step date_time user_total_visits variation tenure_year tenure_month age gender number_of_accounts balance calls_6_month logons_6_month gender_num variation_num date month
0 8320017 39393514_33118319366 960651974_70596002104_312201 confirm 2017-04-05 13:10:05 5 Test 22.00 274.00 34.50 M 2.00 36001.90 5.00 8.00 1 0 2017-04-05 April
1 8320017 39393514_33118319366 960651974_70596002104_312201 step_3 2017-04-05 13:09:43 5 Test 22.00 274.00 34.50 M 2.00 36001.90 5.00 8.00 1 0 2017-04-05 April
2 8320017 39393514_33118319366 960651974_70596002104_312201 step_2 2017-04-05 13:08:40 5 Test 22.00 274.00 34.50 M 2.00 36001.90 5.00 8.00 1 0 2017-04-05 April
3 8320017 39393514_33118319366 960651974_70596002104_312201 step_1 2017-04-05 13:08:24 5 Test 22.00 274.00 34.50 M 2.00 36001.90 5.00 8.00 1 0 2017-04-05 April
4 8320017 39393514_33118319366 960651974_70596002104_312201 start 2017-04-05 13:08:06 5 Test 22.00 274.00 34.50 M 2.00 36001.90 5.00 8.00 1 0 2017-04-05 April

Drop the columns we will not be using:

  • Tenure month is perfectly correlated (1.0) with tenure year, so we will keep only tenure_year
  • We have extracted a date column; date_time itself is kept so that events within each visit can still be ordered in time
  • Visitor id is redundant, as the client id already provides that information
  • number_of_accounts won't be used in this part of the analysis, so it is dropped as well
  • Before dropping calls_6_month and logons_6_month, we will create a composite variable (total_activity). This ensures we capture the engagement behavior as a single metric.
In [53]:
# Create the composite variable
df_whole['total_activity'] = df_whole['calls_6_month'] + df_whole['logons_6_month']

# Define columns to drop
irrelevant_columns = ['tenure_month', 'number_of_accounts', 'calls_6_month', 
                      'logons_6_month', 'visitor_id']

# Drop irrelevant columns and remove duplicates
df_whole = df_whole.drop(columns=irrelevant_columns).drop_duplicates().reset_index(drop=True)

Let's have a look at the processes initiated for both the test and control variations over time:

In [54]:
# Group by date and variation, count unique visit_ids
daily_visits = df_whole.groupby(['date', 'variation'])['visit_id'].nunique().reset_index(name='unique_visits')

# Create the lineplot with a figure size of 6x5
fig, ax = plt.subplots(figsize=(8, 5))  # Combine figure creation and lineplot

sns.lineplot(x='date', y='unique_visits', hue='variation', data=daily_visits, ax=ax, palette = palette[:2])  # Pass ax to sns.lineplot

# Format x-axis ticks to show only day and month
plt.xlabel('Date')
plt.ylabel('Number of Unique Visits')
plt.xticks(rotation=0)
ax.xaxis.set_major_formatter(DateFormatter('%b %d'))  # '%b' for month abbreviation, '%d' for day

plt.title('Daily Unique Visits by Variation')
plt.show()
[Figure: Daily Unique Visits by Variation]

There appear to be more visits for the test variation over the first days of the examined period. Could that be because users are not used to the new interface and are thus failing and logging back in for a second try?

In [55]:
variation_totals = df_whole.groupby(['variation']).size()
In [56]:
print(f"Test  size: {variation_totals.loc['Test']}, Control size: {variation_totals.loc['Control']}")
Test  size: 161010, Control size: 130314
In [57]:
# setting the dimensions of the plot
fig, ax = plt.subplots(figsize=(8, 3))
 
# drawing the plot
sns.countplot(x ='process_step', data = df_whole, palette = [palette[1],palette[0]], order=process_step_order, hue = 'variation')
# Get the containers for the bar plots
containers = ax.containers

# Add white labels below each bar
for c in containers:
    ax.bar_label(c, color='white', fmt='%d', label_type='center', fontsize=9)

plt.title('Process steps registered for the two variations (filtered)') 
plt.show()
[Figure: Process steps registered for the two variations (filtered)]

The result tells us that there are more data entries for the Test variation than for the Control one. The total step count is higher for the test variation, but so is the number of clients assigned to it, so this seems reasonable.

How about the users' age? Are they equally distributed between the variations?

In [58]:
# I'll create age bins to help me get some insights
bins = [18, 30, 40, 50, 60, 70, 100]
labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70+']
df_whole['age_group'] = pd.cut(df_whole['age'], bins=bins, labels=labels)
In [59]:
# Count the number of clients in each age group per variation. The new column will be named 'count'
age_group_stats = df_whole.groupby(['variation', 'age_group'])['client_id'].count().reset_index(name='count')

# Calculate the total clients per variation
total_clients_per_variation = age_group_stats.groupby('variation')['count'].sum().reset_index(name='total_clients')

# Merge the total clients with age group counts
age_group_stats = age_group_stats.merge(total_clients_per_variation, on='variation')

# Calculate the rate of each age group within each variation
age_group_stats['rate'] = age_group_stats['count'] / age_group_stats['total_clients'] 

print(age_group_stats)
   variation age_group  count  total_clients  rate
0    Control     18-29  22105         129970  0.17
1    Control     30-39  22964         129970  0.18
2    Control     40-49  22568         129970  0.17
3    Control     50-59  29501         129970  0.23
4    Control     60-69  24180         129970  0.19
5    Control       70+   8652         129970  0.07
6       Test     18-29  27053         160639  0.17
7       Test     30-39  28619         160639  0.18
8       Test     40-49  27439         160639  0.17
9       Test     50-59  36783         160639  0.23
10      Test     60-69  30148         160639  0.19
11      Test       70+  10597         160639  0.07
In [60]:
plt.figure(figsize=(9, 4))
sns.barplot(data=age_group_stats, x='age_group', y='rate', hue='variation', palette=palette)

# Format the y-axis to show percentages
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y:.0%}'))

# Add labels
for container in plt.gca().containers:
    for patch in container:
        height = patch.get_height()
        plt.text(patch.get_x() + patch.get_width() / 2, height - 0.02, f'{height:.1%}',
                 ha='center', va='bottom', color='white')

plt.title('Age Group Ratios by Variation')
plt.xlabel('Age Group')
plt.ylabel('Percentage')
plt.legend(title='Variation')

plt.show()
[Figure: Age Group Ratios by Variation]

Age groups are therefore equally distributed between the test and control groups.

3.1. Completion rates¶

We also want to assess whether the confirm stage was reached in each of the unique visits.

We will create a new column called 'confirm_check' in the dataframe. For each visit_id group, the 'confirm_check' column will be assigned "confirm" if the string "confirm" is found within the process_step values of that group, and "no_confirm" otherwise.

In [61]:
# Identify visits with 'confirm' process step
# these are the visits that have 'confirm' registered as one of their steps
confirm_visits = df_whole[df_whole['process_step'] == 'confirm']['visit_id'].unique()
# so assign them a 'confirm' value if True, or no_confirm if it isn't
df_whole['confirm_check'] = df_whole['visit_id'].isin(confirm_visits).map({True: 'confirm', False: 'no_confirm'})

# Calculate total unique visits for both variations
total_visits = df_whole.groupby('variation')['visit_id'].nunique()

# Calculate totals per variation
n_test_total = df_whole.groupby('variation')['visit_id'].nunique()['Test']
n_control_total = df_whole.groupby('variation')['visit_id'].nunique()['Control']

# Calculate confirmed visits for each variation
confirmed_visits = df_whole[df_whole['confirm_check'] == 'confirm'].groupby('variation')['visit_id']

# confirmed visits per variation (reached 'confirm', not yet validated against the full step sequence)
n_test_nonvalidated = confirmed_visits.nunique()['Test']
n_control_nonvalidated = confirmed_visits.nunique()['Control']

# Calculate proportions
p_test_nonvalidated = n_test_nonvalidated / n_test_total
p_control_nonvalidated = n_control_nonvalidated / n_control_total

# Print the results
print(f"Out of the total {n_test_total} test sessions, {n_test_nonvalidated} have reached the 'confirm' step.")
print(f"Out of the total {n_control_total} control sessions, {n_control_nonvalidated} have reached the 'confirm' step.")
Out of the total 35498 test sessions, 20067 have reached the 'confirm' step.
Out of the total 30852 control sessions, 15088 have reached the 'confirm' step.

Important: Reaching the 'confirm' step doesn't necessarily mean that a visit has been through all the previous steps, i.e. it doesn't mean that the process was valid. We don't want to mistakenly register visits as valid when they are not!

Had the sessions that reached the confirm step been through all the required previous steps first?

In [62]:
def validate_visit_steps(df):    
    # Group by visit_id and collect the unique process_step values for each group
    grouped_steps = df.groupby('visit_id')['process_step'].apply(set)

    # Use predefined process_step_order as required steps
    # set() function creates a set object, i.e. a sequence of iterable elements with distinct elements
    required_steps = set(process_step_order)
    
    # Extract confirmed visit_ids for Test and Control groups
    confirmed_visits_test = df.loc[
        (df['confirm_check'] == 'confirm') & (df['variation'] == 'Test'), 'visit_id'
    ].unique()
    
    confirmed_visits_control = df.loc[
        (df['confirm_check'] == 'confirm') & (df['variation'] == 'Control'), 'visit_id'
    ].unique()
    
    # Filter confirmed visit_ids to ensure they exist in grouped_steps
    confirmed_visits_test = [visit_id for visit_id in confirmed_visits_test if visit_id in grouped_steps.index]
    confirmed_visits_control = [visit_id for visit_id in confirmed_visits_control if visit_id in grouped_steps.index]

    # Check if each confirmed visit_id has all the required steps
    valid_visits_test = grouped_steps.loc[confirmed_visits_test].apply(lambda steps: required_steps.issubset(steps))
    valid_visits_control = grouped_steps.loc[confirmed_visits_control].apply(lambda steps: required_steps.issubset(steps))
    
    # Count valid and invalid visits
    n_test_validated, n_test_false = valid_visits_test.sum(), valid_visits_test.size - valid_visits_test.sum()
    n_control_validated, n_control_false = valid_visits_control.sum(), valid_visits_control.size - valid_visits_control.sum()

    # Print the results
    print(f"Valid visit_ids for test dataset (True): {n_test_validated}")
    print(f"Invalid visit_ids for test dataset (False): {n_test_false} ({n_test_false / (n_test_validated + n_test_false):.2%})")
    print(f"Valid visit_ids for control dataset (True): {n_control_validated}")
    print(f"Invalid visit_ids for control dataset (False): {n_control_false} ({n_control_false / (n_control_validated + n_control_false):.2%})")

    return valid_visits_test, valid_visits_control, n_test_validated, n_test_false, n_control_validated, n_control_false

# Capture the returned values 
valid_visits_test, valid_visits_control, n_test_validated, n_test_false, n_control_validated, n_control_false = validate_visit_steps(df_whole)
Valid visit_ids for test dataset (True): 16541
Invalid visit_ids for test dataset (False): 3526 (17.57%)
Valid visit_ids for control dataset (True): 13903
Invalid visit_ids for control dataset (False): 1185 (7.85%)

So it appears that a high proportion (17.6%) of the confirmed visits in the test dataset is not valid: they registered the 'confirm' stage without having taken all the previous required steps. This drops to 7.9% in the control dataset, which is an important difference. It indicates a potential issue in the test version, where users may be able to bypass the required steps more easily.

I will not drop these entries, because I assume that the application allows users to jump straight to the confirm step, so they are not necessarily erroneous records. Instead, I will use this 'invalid submission' rate as a metric to compare the test and control versions.
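Since this 'invalid submission' rate will serve as a comparison metric, one way to test whether the gap between the two groups is statistically meaningful is a two-proportion z-test on the counts returned by validate_visit_steps, using the proportions_ztest imported at the top; a minimal sketch (formal hypothesis testing is covered under step 4 of the methodology):

# Compare the share of invalid 'confirm' visits between the Test and Control groups
invalid_counts = [n_test_false, n_control_false]
confirmed_totals = [n_test_validated + n_test_false, n_control_validated + n_control_false]

z_stat, p_value = proportions_ztest(count=invalid_counts, nobs=confirmed_totals)
print(f"z-statistic: {z_stat:.2f}, p-value: {p_value:.4f}")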

The valid completion rates will be calculated as follows:

$$ \hat{p}_{\text{test}} = \frac{\text{completion\_test}}{\text{total\_visits\_test}} $$

$$ \hat{p}_{\text{control}} = \frac{\text{completion\_control}}{\text{total\_visits\_control}} $$

In [63]:
p_test_validated = valid_visits_test.sum() / n_test_total
p_control_validated = valid_visits_control.sum() / n_control_total

print(f"Valid completion rate for test dataset (True): {p_test_validated:.2%}")
print(f"Valid completion rate for control dataset (True): {p_control_validated:.2%}")
Valid completion rate for test dataset (True): 46.60%
Valid completion rate for control dataset (True): 45.06%

Let's plot the valid completed sessions for each variation:

In [64]:
# Calculating confirmed and non-confirmed percentages
validated_percent = np.array([p_test_validated, p_control_validated]) * 100
non_validated_percent = np.array([1 - p_test_validated, 1 - p_control_validated]) * 100

# Labels 
labels = ['Test', 'Control']

fig, ax = plt.subplots(figsize=(4, 4))
index = np.arange(len(labels))

# Plot the bars with separate colors for Test and Control
bars_confirmed = ax.bar(index, validated_percent, label='Valid',
                        color=palette[2])
bars_non_confirmed = ax.bar(index, non_validated_percent, bottom=validated_percent,
                            label='Non-Valid',
                            color=palette[1], alpha=0.3)

# Add percentage labels to each segment
for i, (vp, nvp) in enumerate(zip(validated_percent, non_validated_percent)):
    ax.text(i, vp / 2, f'{vp:.1f}%', ha='center', va='center', color='white', fontsize=10)
    ax.text(i, vp + nvp / 2, f'{nvp:.1f}%', ha='center', va='center', color='black', fontsize=10)

ax.set_xlabel('Group', fontsize=12)
ax.set_ylabel('Percentage', fontsize=12)
ax.set_title('Valid vs Non-Valid Completed Visits\n(Test vs Control)', fontsize=14)
ax.set_xticks(index)
ax.set_xticklabels(labels)
ax.legend()

plt.tight_layout()
plt.show()
[Figure: Valid vs Non-Valid Completed Visits (Test vs Control)]

Valid completion rates for the two interfaces appear quite close to each other. We will now calculate the completion rate at each step for the two variations ("Test" and "Control"):

In [65]:
# Ensure the dataframe is sorted by visit_id and date_time
df_whole.sort_values(by=['variation', 'visit_id', 'date_time'], inplace=True)

# Create a function to check step completion conditions
def calculate_completion_rate_per_variation(df, process_step_order):
    # Initialize a dictionary to store step completion counts per variation
    completion_rates_per_variation = {}

    # Group by variation
    # variation_group is a sub-DataFrame containing only rows that belong to that group (variation)
    # variation stores the name of the current variation (i.e. Test or Control)
    for variation, variation_group in df.groupby('variation'):
        # Creates a dictionary (step_completion) using a dictionary comprehension
        # Each step will be the key, initially assigned a 0 value
        step_completion = {step: 0 for step in process_step_order}
        # Calculate the total visits per variation
        total_visits = len(variation_group['visit_id'].unique())

        # Iterate over each visit_id in the variation and create a group for each visit
        for visit_id, group in variation_group.groupby('visit_id'):
            # Extract the 'process_step' column from the specific group as a list
            group_steps = group['process_step'].tolist()

            # Check each step's completion
            for step in process_step_order:
                # Check if all preceding steps and current step are in the group
                # Find the index position of step in the list process_step_order
                # Then slice the list from the start up to and including the current step
                # Then iterate over the sliced steps and check if each step is in group_steps
                if all(s in group_steps for s in process_step_order[:process_step_order.index(step) + 1]):
                    # Add one count to that step
                    step_completion[step] += 1

        # Calculate completion rates for this variation
        # Create a dictionary with the results
        step_completion_rate = {step: completion / total_visits for step, completion in step_completion.items()}
        completion_rates_per_variation[variation] = step_completion_rate

    return completion_rates_per_variation

# Calculate completion rates per variation
completion_rates_variation = calculate_completion_rate_per_variation(df_whole, process_step_order)

# Display completion rates per variation
print("Completion Rates by Process Step per Variation:")
for variation, rates in completion_rates_variation.items():
    print(f"Variation: {variation}")
    for step, rate in rates.items():
        print(f"  {step}: {rate:.2%}")
Completion Rates by Process Step per Variation:
Variation: Control
  start: 95.98%
  step_1: 72.68%
  step_2: 61.77%
  step_3: 55.22%
  confirm: 45.01%
Variation: Test
  start: 89.61%
  step_1: 75.38%
  step_2: 64.39%
  step_3: 57.06%
  confirm: 46.57%
In [66]:
completion_rates_variation
Out[66]:
{'Control': {'start': 0.959775703357967,
  'step_1': 0.7268248411772332,
  'step_2': 0.617658498638662,
  'step_3': 0.5521846233631531,
  'confirm': 0.4501490989238947},
 'Test': {'start': 0.8960786523184405,
  'step_1': 0.7538452870584258,
  'step_2': 0.6439236013296524,
  'step_3': 0.5706236971096963,
  'confirm': 0.46566003718519355}}

Question: How can we explain that the start steps don't amount to 100%? I suspect that for some reason, users can log on straight into further steps - perhaps when re-opening the browser after an unfinished session?

Let's calculate that below:

In [67]:
# Get all unique visit IDs
total_visits = df_whole['visit_id'].nunique()

# Get unique visit IDs where 'start' is present
visits_with_start = df_whole[df_whole['process_step'] == 'start']['visit_id'].nunique()

# Calculate visits missing 'start'
missing_start_visits = total_visits - visits_with_start

# Compute the rate
missing_start_rate = missing_start_visits / total_visits

# Display results
print(f"Total visits: {total_visits}")
print(f"Visits missing 'start': {missing_start_visits}")
print(f"Rate of missing 'start' visits: {missing_start_rate:.2%}")  # Display as percentage
Total visits: 66232
Visits missing 'start': 4926
Rate of missing 'start' visits: 7.44%

And plot the completion rates for the individual steps:

In [68]:
# Prepare data for plotting
variations = list(completion_rates_variation.keys())
steps = list(completion_rates_variation[variations[0]].keys())
data = {step: [completion_rates_variation[var][step] for var in variations] for step in steps}

# Convert data into a matrix for plotting
step_indices = np.arange(len(steps))  # Index for each step
bar_width = 0.35  # Width of each bar

# Plot
fig, ax = plt.subplots(figsize=(8, 4))

# Plot bars for each variation
for i, variation in enumerate(variations):
    ax.bar(
        step_indices + i * bar_width,
        [data[step][i] for step in steps],
        bar_width,
        label=variation,
        color=palette[i]
    )

ax.set_xlabel('Process Steps', fontsize=12)
ax.set_ylabel('Completion Rate', fontsize=12)
ax.set_title('Completion Rates by Process Step and Variation', fontsize=14)
ax.set_xticks(step_indices + bar_width / 2)  # Center tick labels
ax.set_xticklabels(steps, fontsize=10)
ax.legend(title='Variation', fontsize=10)

# Add percentage labels above bars
for i, variation in enumerate(variations):
    for j, step in enumerate(steps):
        rate = data[step][i]  # Correctly access the rate
        ax.text(
            step_indices[j] + i * bar_width,
            rate + 0.01,
            f"{rate:.1%}",
            ha='center',
            va='bottom',
            fontsize=9,
        )

plt.tight_layout()
plt.show()
[Figure: Completion Rates by Process Step and Variation]

It's also interesting to examine the completion rates per age group. I'll start by calculating them:

In [69]:
# Ensure the dataframe is sorted by visit_id and date_time
df_whole.sort_values(by=['variation', 'visit_id', 'date_time'], inplace=True)

# Filter valid completions by ensuring all steps are present
valid_completions = []

# Group by variation and visit_id to check steps
for (age_group, variation, visit_id), group in df_whole.groupby(['age_group', 'variation', 'visit_id']):
    group_steps = group['process_step'].tolist()

    # Check if all steps are completed
    if all(s in group_steps for s in process_step_order):
        valid_completions.append((age_group, variation, visit_id))

# Create a DataFrame of valid completions
valid_completions_df = pd.DataFrame(valid_completions, columns=['age_group', 'variation', 'visit_id'])

# Calculate total visits by age_group and variation
total_visits = (
    df_whole.groupby(['age_group', 'variation'])
    .agg(total_visits=('visit_id', 'nunique'))
    .reset_index()
)

# Calculate valid confirmations (visit_ids in valid_completions)
valid_confirmations = (
    valid_completions_df.groupby(['age_group', 'variation'])
    .agg(valid_confirmations=('visit_id', 'nunique'))
    .reset_index()
)

# Merge valid confirmations and total visits
completion_rates_by_age = pd.merge(
    total_visits,
    valid_confirmations,
    on=['age_group', 'variation'],
    how='left'
)

# Fill missing valid confirmations with 0
completion_rates_by_age['valid_confirmations'] = completion_rates_by_age['valid_confirmations'].fillna(0)

# Calculate valid completion rate
completion_rates_by_age['Valid Completion Rate'] = (
    completion_rates_by_age['valid_confirmations'] / completion_rates_by_age['total_visits']
)

And plotting them:

In [70]:
# Prepare data for plotting
variations = completion_rates_by_age['variation'].unique()
age_groups = completion_rates_by_age['age_group'].unique()

# Pivot the data to make it easier to plot
plot_data = completion_rates_by_age.pivot(index='age_group', columns='variation', values='Valid Completion Rate')

# Define positions for bars
bar_width = 0.35
age_indices = np.arange(len(age_groups))

fig, ax = plt.subplots(figsize=(8, 4))

for i, variation in enumerate(variations):
    ax.bar(
        age_indices + i * bar_width,
        plot_data[variation],
        bar_width,
        label=variation,
        color=palette[i]
    )

ax.set_xlabel('Age Group', fontsize=12)
ax.set_ylabel('Valid Completion Rate', fontsize=12)
ax.set_title('Valid Completion Rate by Age Group and Variation', fontsize=14)
ax.set_xticks(age_indices + bar_width / 2)  # Center ticks
ax.set_xticklabels(age_groups, fontsize=10)
ax.legend(title='Variation', fontsize=10)

# Add percentage labels above bars
for i, variation in enumerate(variations):
    for j, rate in enumerate(plot_data[variation]):
        ax.text(
            age_indices[j] + i * bar_width,
            rate + 0.01,
            f"{rate:.1%}",
            ha='center',
            va='bottom',
            fontsize=9,
        )

plt.tight_layout()
plt.show()
[Figure: Valid Completion Rate by Age Group and Variation]

The successful completion rates are higher for the Test variation among the younger age groups (18-29 and 30-39), while the rates are roughly equal between the two variations in the remaining age groups.

3.2. Time Spent on Each Step¶

The next metric we need to calculate is the time taken to complete each step, for each variation. For each unique visit_id there can be more than one entry per step, as visitors may move back and forth between the steps of the process. With this in mind, I will calculate the total time spent on each step for each unique visit. This will then be added to the dataframe under a new time column ('time_to_next_step').

In [71]:
# Sort the DataFrame by 'visit_id' and 'date_time' to ensure chronological order
time_spent_df = df_whole.sort_values(by=['visit_id', 'date_time'])

# Calculate the time difference (duration) to the next row within each visit_id
# .diff(-1) calculates the difference between the current row and the next row; .abs() ensures the absolute value is used.
# The time difference is then assigned to the current process_step.
time_spent_df['time_to_next_step'] = time_spent_df.groupby('visit_id')['date_time'].diff(-1).abs()

# Convert each timedelta object in the series into the total number of seconds it represents
time_spent_df['time_to_next_step'] = time_spent_df['time_to_next_step'].dt.total_seconds()

# Set 'time_to_next_step' to na if the process_step is 'confirm'
time_spent_df.loc[time_spent_df['process_step'] == 'confirm', 'time_to_next_step'] = np.nan

We will now group by 'visit_id' and 'process_step' to sum the total time spent on each step. This gives us the total time spent in that particular step during the visit (even when the step has been repeated).

In [72]:
# Group by 'visit_id' and 'process_step' while preserving NaN values
df_step_times = time_spent_df.groupby(['visit_id', 'process_step'])['time_to_next_step'].sum(min_count=1).reset_index()

# Pivot the table
df_step_times_pivot = df_step_times.pivot(index='visit_id', columns='process_step', values='time_to_next_step').reset_index()

# When the last step of a visit isn't 'confirm', we have a drop-off. These steps should also have NaN assigned to them
df_step_times_pivot = df_step_times_pivot.where(pd.notna(df_step_times_pivot), np.nan)

# Display the result
df_step_times_pivot.head()
Out[72]:
process_step visit_id confirm start step_1 step_2 step_3
0 100012776_37918976071_457913 NaN NaN NaN NaN NaN
1 100019538_17884295066_43909 NaN 136.00 NaN NaN NaN
2 100022086_87870757897_149620 NaN 22.00 45.00 53.00 60.00
3 100030127_47967100085_936361 NaN NaN NaN NaN NaN
4 100037962_47432393712_705583 NaN 124.00 8.00 NaN NaN
Let's calculate the descriptive stats for the steps of each variation, to better understand them:
In [73]:
time_by_variation = df_step_times_pivot.merge(df_whole[['visit_id', 'variation']].drop_duplicates(), on='visit_id', how='left')
test_time = time_by_variation[time_by_variation['variation'] == 'Test']
control_time = time_by_variation[time_by_variation['variation'] == 'Control']

print("Average Mean Time per Step - Test Variation")
print(test_time.describe())
print("Average Mean Time per Step - Control Variation")
print(control_time.describe())
Average Mean Time per Step - Test Variation
       confirm    start   step_1   step_2   step_3
count     0.00 28634.00 24814.00 22177.00 19474.00
mean       NaN    88.81    83.02   110.26   138.65
std        NaN   330.57   223.07   214.56   283.04
min        NaN     1.00     0.00     0.00     0.00
25%        NaN     9.00    18.00    47.00    33.00
50%        NaN    17.00    34.00    74.00    60.00
75%        NaN    68.00    70.00   120.00   131.00
max        NaN 31624.00 15620.00 16036.00  6760.00
Average Mean Time per Step - Control Variation
       confirm    start   step_1   step_2   step_3
count     0.00 24551.00 20288.00 18204.00 15966.00
mean       NaN    85.57    63.41   116.84   158.66
std        NaN   360.83   221.90   212.45   234.41
min        NaN     1.00     1.00     1.00     1.00
25%        NaN    12.00    11.00    51.00    49.00
50%        NaN    25.00    25.00    82.00    87.00
75%        NaN    59.00    56.00   135.00   170.00
max        NaN 41957.00 16950.00 21763.00  6164.00
In [74]:
# Reshape the data to long format
time_long = time_by_variation.melt(
    id_vars=['visit_id', 'variation'], 
    value_vars=['confirm', 'start', 'step_1', 'step_2', 'step_3'], 
    var_name='process_step', 
    value_name='time'
)

# Drop rows with NaN time values 
time_long = time_long.dropna()

# Preview the reshaped data
print(time_long.head())
                           visit_id variation process_step   time
66351   100019538_17884295066_43909      Test        start 136.00
66352  100022086_87870757897_149620      Test        start  22.00
66354  100037962_47432393712_705583   Control        start 124.00
66355  100057941_88477660212_944512   Control        start  30.00
66356   10006594_66157970412_679648   Control        start  10.00
In [75]:
# Group by process_step and variation to calculate mean time
mean_time = time_long.groupby(['process_step', 'variation'])['time'].mean().reset_index()

plt.figure(figsize=(8, 5))
sns.barplot(data=mean_time, x='process_step', y='time', hue='variation', palette=palette, order=process_step_order)

# Add labels to the bars
for p in plt.gca().patches:
    height = p.get_height()
    plt.gca().text(p.get_x() + p.get_width() / 2, height - 8, f'{height:.1f}', 
                   ha='center', va='bottom', fontsize=10, color = 'white')

# Add titles and labels
plt.title('Mean Time per Process Step and Variation', fontsize=14)
plt.xlabel('Process Step', fontsize=12)
plt.ylabel('Mean Time (Seconds)', fontsize=12)
plt.xticks(rotation=0)
plt.legend(title='Variation')

# Show the plot
plt.tight_layout()
plt.show()
[Figure: Mean Time per Process Step and Variation]

Comment: The longer times in step_1 suggest that the Test variation may have introduced a bottleneck at that step, which could affect user experience and conversion rates. Overall, however, the Test group performs better (faster) in steps 2 and 3.

3.3. Error Rates¶

We next want to create the experimental metrics and KPIs for Error rates. I will use the following metrics:

1. Step repeat errors: In the chronological sequence of steps ("process_step_order"), if a step name is repeated (e.g. in start, step_1, step_1, step_2 -> step_1 is repeated), it will count as a "step repeat" error. However, if one visit_id ends at one step (e.g. start) and the next visit starts from the same step (i.e. start), that shouldn't be counted as a "step repeat" error.

2. Step back errors: In the chronological sequence of steps ("process_step_order"), if a step that precedes another step in a valid sequence appears after it (e.g. start, step_1, start), the user has gone back one step, and that counts as a "step back" error. A small worked sketch of both definitions is given below.
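
To make the two definitions concrete, here is a minimal, hypothetical sketch (toy data, not the project dataset) showing how comparing each row with the previous row of the same visit flags the two error types:

# Toy example (illustrative only): one hypothetical visit with a repeated step and a step back
import pandas as pd

toy = pd.DataFrame({
    'visit_id': ['v1'] * 5,
    'process_step': ['start', 'step_1', 'step_1', 'start', 'step_2'],
})
order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']
toy['step_order'] = toy['process_step'].map({s: i for i, s in enumerate(order)})

# Compare each row with the previous row within the same visit
prev_step = toy.groupby('visit_id')['process_step'].shift(1)
prev_order = toy.groupby('visit_id')['step_order'].shift(1)

toy['step_repeat_error'] = (toy['process_step'] == prev_step).astype(int)  # step_1 -> step_1
toy['step_back_error'] = (prev_order > toy['step_order']).astype(int)      # step_1 -> start

print(toy)
# Row 2 (the second 'step_1') is flagged as a step repeat; row 3 ('start' after 'step_1') as a step back.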

I'll start by creating a dataframe where these errors are given in separate columns:

In [76]:
# Define process step order
step_map = {step: i for i, step in enumerate(process_step_order)}

# Map process steps to numeric order
df_whole["process_step_order"] = df_whole["process_step"].map(step_map)

# Sort by visit_id and date_time for correct order of events
df_whole = df_whole.sort_values(["visit_id", "date_time"])

# Identify step repeat errors (same step appearing consecutively within the same visit_id)
df_whole["prev_step"] = df_whole.groupby("visit_id")["process_step"].shift(1)
df_whole["step_repeat_error"] = (
    (df_whole["process_step"] == df_whole["prev_step"]) & 
    (df_whole["visit_id"] == df_whole["visit_id"].shift(1))
).astype(int)

# Identify step back errors (previous step has a higher numerical order)
df_whole["prev_step_order"] = df_whole.groupby("visit_id")["process_step_order"].shift(1)
df_whole["step_back_error"] = (
    (df_whole["prev_step_order"] > df_whole["process_step_order"]) & 
    (df_whole["visit_id"] == df_whole["visit_id"].shift(1))
).astype(int)

# Aggregate errors per visit_id and process_step
error_df = df_whole.groupby(["visit_id", "process_step"]).agg(
    variation=("variation", "first"),  
    step_repeat_errors=("step_repeat_error", "sum"),
    step_back_errors=("step_back_error", "sum")
).reset_index()
In [77]:
# Aggregate total errors per variation
error_summary = error_df.groupby("variation").agg(
    total_step_repeat_errors=("step_repeat_errors", "sum"),
    total_step_back_errors=("step_back_errors", "sum")
).reset_index()

# Display the summary
print(error_summary)
  variation  total_step_repeat_errors  total_step_back_errors
0   Control                     10859                    8326
1      Test                     16117                   13261

Let's first visualize the total errors per type and variation:

In [78]:
# Create a figure and axis
fig, ax = plt.subplots(figsize=(6,4))

# Define custom colors (adjusting based on palette)
bars = error_summary.plot(kind='bar', color=[palette[3], palette[2]], ax=ax)

# Add labels to each bar
for container in bars.containers:
    ax.bar_label(container, fmt='%d', label_type='edge', padding=3, fontsize=10, color='black')

# Labels and title
ax.set_xlabel("Variation")
ax.set_ylabel("Error Count")
ax.set_title("Error Counts by Variation")
ax.set_xticklabels(error_summary['variation'], rotation=0)

ax.legend(title="Error Type")

plt.show()
[Figure: Error Counts by Variation]

The Test variation has over 50% more errors compared to the Control variation!

Let's plot the average error rates per step, for each variation:

In [79]:
# Create a grouped bar plot comparing step repeat errors between the two variations
plt.figure(figsize=(8, 5))

sns.barplot(data=error_df, x='process_step', y='step_repeat_errors', hue='variation', ci=None, palette=[palette[1],palette[0]], order=process_step_order)

plt.title('Errors in Process: Step Repeat', fontsize=14)
plt.xlabel('Process Step', fontsize=12)
plt.ylabel('Count of Errors', fontsize=12)
plt.xticks(rotation=0)
plt.legend(title='Variation')

for bar in plt.gca().patches:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, yval + 0.01, f'{yval:.2f}', ha='center', va='bottom', fontsize=9, color='black')

plt.tight_layout()
plt.show()
[Figure: Errors in Process: Step Repeat]

Comments:

  • The high repeat rate in the 'start' step of the Test variation suggests users may struggle to proceed from the starting step.
In [80]:
# Create a grouped bar plot comparing step back errors between the two variations
plt.figure(figsize=(8, 5))

sns.barplot(data=error_df, x='process_step', y='step_back_errors', hue='variation', ci=None, palette=[palette[1],palette[0]], order=process_step_order)

# Add titles and labels
plt.title('Errors in Process: Step Back', fontsize=14)
plt.xlabel('Process Step', fontsize=12)
plt.ylabel('Count of Errors', fontsize=12)
plt.xticks(rotation=0)
plt.legend(title='Variation')

# Add labels to the bars
for bar in plt.gca().patches:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, yval + 0.01, f'{yval:.2f}', ha='center', va='bottom', fontsize=9, color='black')

plt.tight_layout()
plt.show()
[Figure: Errors in Process: Step Back]

Comments:

  • The Test variation has higher step back errors than the Control variation at the first step, but the errors drop in subsequent steps.

3.4. Drop off rate per step¶

Another metric I am going to use is the "drop-off" rate, defined as a user abandoning the procedure prior to its completion. For this metric, the 'confirm' step should have no counts, as reaching it signifies the completion of the process.

I'll start by defining a function to calculate the rates:

In [81]:
def count_visit_ids_per_process_step(df):
    # Define the process steps of interest
    df['process_step'] = pd.Categorical(df['process_step'], categories=process_step_order, ordered=True)

    # Group by 'variation' and 'process_step', and count unique 'visit_id'
    visit_counts = df.groupby(['variation', 'process_step'])['visit_id'].nunique().reset_index(name='visit_count')

    # Get the start counts for each variation
    start_counts = visit_counts[visit_counts['process_step'] == 'start'][['variation', 'visit_count']]
    start_counts.rename(columns={'visit_count': 'start_count'}, inplace=True)

    # Merge start counts back to visit_counts
    visit_counts = visit_counts.merge(start_counts, on='variation', how='left')

    # Calculate percentage and drop percentage
    visit_counts['percentage'] = (visit_counts['visit_count'] / visit_counts['start_count']) * 100
    visit_counts['drop_percentage'] = visit_counts.groupby('variation')['visit_count'].pct_change() * 100
    visit_counts.loc[visit_counts['process_step'] == 'start', 'drop_percentage'] = 0

    # Calculate total drop percentage for each variation
    total_drop = visit_counts[visit_counts['process_step'] == 'confirm'].copy()
    total_drop['drop_percentage'] = ((total_drop['visit_count'] / total_drop['start_count']) * 100 - 100).round(2)
    total_drop['process_step'] = 'total_drop'
    total_drop['visit_count'] = None
    total_drop['percentage'] = None

    # Append total_drop rows to visit_counts
    final_df = pd.concat([visit_counts, total_drop], ignore_index=True).reset_index(drop=True)

    # Round percentage and drop_percentage columns
    final_df['percentage'] = final_df['percentage'].round(2)
    final_df['drop_percentage'] = final_df['drop_percentage'].round(2)

    return final_df

And now let's have a look at the numbers calculated:

In [82]:
# Calculate abandonment stats
abandonment_stats = count_visit_ids_per_process_step(df_whole)

# Calculate drop-off rates
confirmed_visits = df_whole[df_whole['process_step'] == 'confirm']['visit_id'].unique()
df_whole['dropped_off'] = ~df_whole['visit_id'].isin(confirmed_visits)
dropoffs = df_whole[df_whole['dropped_off']].groupby(['process_step', 'variation'])['visit_id'].nunique().reset_index(name='drop_offs')
total_visits = df_whole.groupby(['process_step', 'variation'])['visit_id'].nunique().reset_index(name='total_visits')
dropoff_rates = pd.merge(dropoffs, total_visits, on=['process_step', 'variation'])
dropoff_rates['drop_off_rate'] = dropoff_rates['drop_offs'] / dropoff_rates['total_visits']

# Display results
print(dropoff_rates)
  process_step variation  drop_offs  total_visits  drop_off_rate
0        start   Control      15583         29611           0.53
1        start      Test      15169         31809           0.48
2       step_1   Control       8570         22712           0.38
3       step_1      Test      10301         27098           0.38
4       step_2   Control       5279         19468           0.27
5       step_2      Test       6502         23342           0.28
6       step_3   Control       3285         17637           0.19
7       step_3      Test       3933         20970           0.19
8      confirm   Control          0         15073           0.00
9      confirm      Test          0         20056           0.00

And plot them!

In [83]:
# Plot the drop-off rates
plt.figure(figsize=(8, 5))
sns.barplot(data=dropoff_rates, x='process_step', y='drop_off_rate', hue='variation', palette=palette)

# Add titles and labels
plt.title('Drop-off Rate per Process Step (Control vs Test)', fontsize=14)
plt.xlabel('Process Step', fontsize=12)
plt.ylabel('Drop-off Rate', fontsize=12)
plt.xticks(rotation=0)
plt.legend(title='Variation')

# Add labels to the bars
for bar in plt.gca().patches:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, yval + 0.006, f'{yval:.2f}', ha='center', va='bottom', fontsize=9, color='black')

plt.tight_layout()
plt.show()
[Figure: Drop-off Rate per Process Step (Control vs Test)]

Comments: The two variations have similar drop-off rates; the Test variation shows a slightly lower drop-off at the start step (0.48 vs 0.53). It also makes sense that the confirm step has no drop-offs, since reaching it signifies the valid end of the process.

4. A/B Hypothesis testing¶

We will now check the following hypotheses:

1. Completion Rate

  • Hypothesis: Users in the test group will have a significantly higher completion rate compared to the control group.
  • Null Hypothesis: There will be no significant difference in completion rates between the test and control groups.
    Definition of Completion: Completing the final step, having first been through all the previous stages, i.e. start, step 1, step 2 and step 3.

2. Time Spent on Each Step

  • Hypothesis: Users in the test group will spend significantly less time on specific steps (or on the overall process) compared to the control group.
  • Null Hypothesis: There will be no significant difference in the time spent on each step (or the overall process) between the test and control groups.

3. Error Rates

  • Hypothesis: Users in the test group will experience significantly fewer errors (e.g., step back, step repeat) compared to the control group.
  • Null Hypothesis: There will be no significant difference in error rates between the test and control groups.
    Definition of Error Types: The types of errors we are measuring are step back, step repeat and invalid completions, i.e. without having gone through the required steps beforehand.

4. Drop-off Rate per Step

  • Hypothesis: Users in the test group will have a significantly lower drop-off rate at specific steps compared to the control group.
  • Null Hypothesis: There will be no significant difference in drop-off rates at each step between the test and control groups.

Key Considerations: We will use statistical tests to determine the statistical significance of the observed differences. We'll start by assessing the nature of the variables in each hypothesis:

1. Completion Rate
Completion rate is defined as completing the final step while having gone through all prior stages. This translates to either 1 (completed) or 0 (not completed) for each user, i.e. it is a binary variable, meaning the individual data points are not normally distributed. However, when calculated as a proportion across a large enough sample, it approximates a normal distribution. To test statistical significance between two groups, a two-proportion z-test or a chi-square test should be used.
Nature: Binary/Boolean

2. Time Spent on Each Step
Time spent on steps is measured in seconds. This is a continuous variable and may follow a normal distribution depending on the data. In that case, statistical tests such as t-tests might be appropriate.
Nature: Continuous

3. Error Rates
The error rates used ("step back" or "step repeat") are binary indicators: either an error occurred (1) or it didn't (0). While aggregate error rates might represent proportions, the individual occurrences are binary and not normally distributed. To test statistical significance between two groups, a two-proportion z-test or a chi-square test should be used.
Nature: Binary/Boolean

4. Drop-off Rate per Step
Drop-off rates are derived from whether a user dropped off at a specific step or continued. For each user, this is a binary outcome: 1 (dropped off) or 0 (did not drop off). However, when aggregated across a large sample, the drop-off rate (proportion of users who dropped off) approximates a normal distribution by the Central Limit Theorem.
Nature: Binary/Boolean

4.1. Completion rates¶

  • Hypothesis: Users in the test group will have a significantly higher completion rate compared to the control group.
  • Null Hypothesis: There will be no significant difference in completion rates between the test and control groups.
    Definition of Completion: Completing the final step, having first been through all the previous stages, i.e. start, step 1, step 2 and step 3.

The total valid visit rates have already been calculated in Section 3.1 as follows:

In [84]:
print(f"Test Completion Rate: {p_test_validated:.2%}")
print(f"Test Completion Rate: {p_control_validated:.2%}")
Test Completion Rate: 46.60%
Test Completion Rate: 45.06%

4.1.1. Confidence Intervals calculation¶

What is the precision and variability of the total completion rates calculated? In other words, what is the range within which these rates are likely to fall?
To assess this, we will use the normal approximation to construct confidence intervals for proportions. For each proportion we will calculate:

\begin{aligned} \text{CI} & = \hat{p} \pm z \cdot \sqrt{\frac{\hat{p} \cdot (1 - \hat{p})}{n}} \\ \text{where:} & \\ \hat{p} & : \text{ Proportion} \\ n & : \text{ Total sample size for that group} \\ z & : \text{ Z-score corresponding to the desired confidence level} \\ \end{aligned}

We will also check the Confidence Intervals for the Differences between the two rates:

$$ \text{Proportion Difference: } \\ \Delta = \hat{p}_{\text{test}} - \hat{p}_{\text{control}} $$

$$ \text{Standard Error of the Difference: } \\ SE_{\Delta} = \sqrt{\frac{\hat{p}_{\text{test}} \cdot (1 - \hat{p}_{\text{test}})}{n_{\text{test}}} + \frac{\hat{p}_{\text{control}} \cdot (1 - \hat{p}_{\text{control}})}{n_{\text{control}}}} $$

$$ \text{Confidence Interval: } \\ \text{CI} = \Delta \pm z \cdot SE_{\Delta} $$

In a two-tailed 95% confidence interval (CI), the central 95% of the sampling distribution is retained, meaning 2.5% lies in the lower tail and 2.5% in the upper tail.

In [85]:
# norm.ppf(0.975) gives the z-score corresponding to the 97.5th percentile of the standard normal distribution 
# Confidence level (95%)
z = norm.ppf(0.975)  # Two-tailed 95% CI 

# Confidence interval for Test group
se_test = (p_test_validated * (1 - p_test_validated) / n_test_total) ** 0.5
ci_test = (p_test_validated - z * se_test, p_test_validated + z * se_test)

# Confidence interval for Control group
se_control = (p_control_validated * (1 - p_control_validated) / n_control_total) ** 0.5
ci_control = (p_control_validated - z * se_control, p_control_validated + z * se_control)

# Difference in proportions
p_diff = p_test_validated - p_control_validated
se_diff = ((p_test_validated * (1 - p_test_validated) / n_test_total) + (p_control_validated * (1 - p_control_validated) / n_control_total)) ** 0.5
ci_diff = (p_diff - z * se_diff, p_diff + z * se_diff)

# Print results
print(f"Test Completion Rate: {p_test_validated:.2%}, Confidence Interval: ({ci_test[0]:.2%}, {ci_test[1]:.2%})")
print(f"Control Completion Rate: {p_control_validated:.2%}, Confidence Interval: ({ci_control[0]:.2%}, {ci_control[1]:.2%})")
print(f"Difference in Completion Rates: {p_diff:.2%}, Confidence Interval: ({ci_diff[0]:.2%}, {ci_diff[1]:.2%})")
Test Completion Rate: 46.60%, Confidence Interval: (46.08%, 47.12%)
Control Completion Rate: 45.06%, Confidence Interval: (44.51%, 45.62%)
Difference in Completion Rates: 1.53%, Confidence Interval: (0.77%, 2.29%)

And now let's plot total completion rates including CI:

In [86]:
# Calculate completion rates as percentages
completion_rates = [p_control_validated * 100, p_test_validated * 100]  # Convert proportions to percentages
confidence_intervals = [
    (ci_control[1] - ci_control[0]) * 100 / 2,  # Half-width of CI in percentage
    (ci_test[1] - ci_test[0]) * 100 / 2
]

# CI bounds
ci_lower = [completion_rates[0] - confidence_intervals[0], completion_rates[1] - confidence_intervals[1]]
ci_upper = [completion_rates[0] + confidence_intervals[0], completion_rates[1] + confidence_intervals[1]]

# Define labels 
labels = ["Control", "Test"]

fig, ax = plt.subplots(figsize=(4.5, 3.5))

bars = ax.bar(labels, completion_rates, yerr=confidence_intervals, capsize=10, color=[palette[0], palette[1]], alpha=0.8)

# Annotate the completion rates and confidence intervals
for i, bar in enumerate(bars):
    # Annotate the completion rate
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 2, 
             f"{completion_rates[i]:.2f}%", ha='center', fontsize=10)
    # Annotate the confidence interval
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() - confidence_intervals[i] - 3,
             f"CI: [{ci_lower[i]:.2f}%, {ci_upper[i]:.2f}%]", ha='center', fontsize=9, color='black')

ax.set_title("Completion Rates with Confidence Intervals", fontsize=14)
ax.set_ylabel("Completion Rate (%)", fontsize=12)

plt.tight_layout()
plt.show()
[Figure: Completion Rates with Confidence Intervals]

Comments

Test Group: The completion rate for the test group is 46.60%, with a 95% confidence interval of (46.08%, 47.12%). This means we are 95% confident that the true completion rate for the test group lies within this range.

Control Group: The completion rate for the control group is 45.06%, with a 95% confidence interval of (44.51%, 45.62%).

Difference in Rates: The difference in completion rates between the test and control groups is 1.53%, with a confidence interval of (0.77%, 2.29%). Since this range does not include zero, the result is statistically significant, further confirming the higher completion rate in the test group.

4.1.2. Overall Completion Rates¶

4.1.2.1. Statistical significance¶

Since we’re comparing two proportions, we can also perform a Chi-Square Test to compare the completion rates between the test and control groups - which is appropriate when comparing multiple proportions simultaneously. The Chi-Square Test evaluates whether two categorical variables are independent or associated. In our context:

  • Categorical Variable 1: The variation (Test vs. Control).
  • Categorical Variable 2: Whether the visit is "completed", including all intermediate steps.

The test will help to determine whether the variation (Test vs. Control) influences the likelihood of completion: a higher chi-square statistic will indicate a greater difference between what was observed and what was expected under the null hypothesis.

In [87]:
# Observed counts
observed = np.array([[n_test_validated, n_test_total - n_test_validated], 
                     [n_control_validated, n_control_total - n_control_validated]])

# Perform Chi-Square test
chi2_stat, p_value, dof, expected = chi2_contingency(observed)

print(f"Chi-Square Statistic: {chi2_stat:.4f}")
print(f"P-value: {p_value:.4f}")

if p_value < 0.05:
    print("Reject the null hypothesis: There is a significant difference in total completion rates.")
else:
    print("Fail to reject the null hypothesis: No significant difference in total completion rates.")
Chi-Square Statistic: 15.5700
P-value: 0.0001
Reject the null hypothesis: There is a significant difference in total completion rates.
  • The resulting χ²=15.57 value quantifies how much the observed counts in the contingency table deviate from the expected counts under the null hypothesis (no difference between Test and Control). Larger values of χ² indicate stronger evidence against the null hypothesis.
  • P-Value (p=0.0001): This is the probability of observing a Chi-Square statistic as extreme as 15.57 (or more) if the null hypothesis is true. Since p < 0.05 we reject the null hypothesis. This means the observed difference between Test and Control is unlikely to be due to random chance and is statistically significant.

Conclusions¶

  • The test group has a significantly higher overall completion rate than the control group. The Chi-Square test confirms that this difference is statistically significant.

4.1.3. Individual steps completion rates analysis¶

We also want to compare the proportion of valid completions (valid_confirm) at each step between the Test and Control groups.

4.1.3.1. Statistical significance¶


While the chi-square test is appropriate for comparing multiple proportions at once (e.g., overall step distributions), for comparing proportions at each step individually we will use a Two-Proportion Z-Test.

  • The Z-statistic is a measure of how far a sample statistic (e.g., conversion rate or proportion at each process step) deviates from the expected value under the null hypothesis (i.e. no difference between the two variations), measured in standard deviations.
  • The p-value helps determine whether this deviation is statistically significant.

The raw data (0 and 1) is not normal, but the distribution of the sample proportions can be approximated as normal if the sample size is sufficiently large, which is our case.

In [88]:
# Initialize a list to store z-test results
z_test_results = []

# Loop through each step to compare proportions between variations
for step in process_step_order:
    # Filter the data to include visits that have completed the current step
    df_step = df_whole[df_whole['process_step'] == step].copy()

    # Assign completion status
    if step == 'confirm':
        df_step['completed'] = 1  # Mark confirm step as always completed
    else:
        next_steps = process_step_order[process_step_order.index(step) + 1:]  # Steps after the current one
        df_step['completed'] = df_step['visit_id'].isin(
            df_whole[df_whole['process_step'].isin(next_steps)]['visit_id']
        ).astype(int) if next_steps else 0  # Check if the visit progresses to next steps

    # Calculate counts of completed steps for Test and Control
    count_control = df_step[(df_step['variation'] == 'Control') & (df_step['completed'] == 1)].shape[0]
    count_test = df_step[(df_step['variation'] == 'Test') & (df_step['completed'] == 1)].shape[0]

    # Calculate total visits for Test and Control
    nobs_control = df_step[df_step['variation'] == 'Control'].shape[0]
    nobs_test = df_step[df_step['variation'] == 'Test'].shape[0]

    # Perform Z-Test if valid data exists
    if (count_test + count_control) > 0 and (nobs_test > 0 and nobs_control > 0):
        stat, pval = proportions_ztest([count_control, count_test], [nobs_control, nobs_test])  
    else:
        stat, pval = float('nan'), float('nan')  # Assign NaN if invalid input

    # Append the results
    z_test_results.append({
        'Process Step': step,
        'Z-Statistic': stat,
        'p-value': pval
    })

# Convert the results to a DataFrame
z_test_df_completion_rates = pd.DataFrame(z_test_results)

# Ensure the correct ordering of steps
z_test_df_completion_rates['Process Step'] = pd.Categorical(z_test_df_completion_rates['Process Step'], categories=process_step_order, ordered=True)
z_test_df_completion_rates = z_test_df_completion_rates.sort_values('Process Step')

# Print the results
print(z_test_df_completion_rates)
  Process Step  Z-Statistic  p-value
0        start       -32.54     0.00
1       step_1        -2.05     0.04
2       step_2         8.48     0.00
3       step_3         1.95     0.05
4      confirm          NaN      NaN
/usr/local/lib/python3.10/dist-packages/statsmodels/stats/weightstats.py:792: RuntimeWarning: invalid value encountered in scalar divide
  zstat = value / std

The comparison for the 'confirm' step shows 'NaN': once a visit reaches that step it is considered complete (see Critical Assumption in section 1.4), so the proportion is exactly 1 in both groups, the standard error is zero, and proportions_ztest cannot compute a Z-statistic (hence the RuntimeWarning above).

P-value interpretation:

  • p-value < 0.05: Indicates a statistically significant difference in the distributions of valid_confirm between the test and control groups.
  • p-value >= 0.05: Suggests no significant difference for that step.

Z-value interpretation:

  • A larger |Z| (absolute value) means more statistical significance.
  • The sign of Z tells us the direction (whether the Test group performs better or worse than Control).
  • We need to check the p-value, as even a high Z-value means nothing if P > 0.05 (not statistically significant).
  • |Z|>1.96: Statistically significant, at 5% level (α = 0.05)
  • |Z|>2.58: Highly statistically significant, at 1% level (α = 0.01)

Comments:

  • A negative Z-statistic means that the proportion of completions in the Test group is lower than the Control group for that specific process step.
  • "start" and "step_2" show the most significant deviations from the null hypothesis, i.e. the two variation completion rates differ significantly on those steps.
  • "step_1" and "step_3" have borderline significance, meaning deviations exist but are weaker.

Let's plot the p-value and Z-statistics results to better understand them:

In [89]:
# Plot p-values
heatmap_data_pvalues = z_test_df_completion_rates.set_index('Process Step')[['p-value']].T 

# Create figure and axes for subplots
fig = plt.figure(figsize=(7, 1))  

# create a heatmap for the P-values
sns.heatmap(
    heatmap_data_pvalues, 
    annot=True, 
    cmap='Blues_r', 
    fmt='.3f',  
    linewidths=0.5,  
    linecolor='gray',  
    # cbar_kws={'label': 'p-value'}, 
)
plt.title('p-Values for Completion Rates by Process Step')
plt.xlabel('')
# plt.ylabel('p-value')
plt.tick_params(axis='x', rotation=0)  # Keep x labels readable

plt.show()
[Figure: p-Values for Completion Rates by Process Step]

Let's also check the Z-Statistic:

In [90]:
# Plot Z-Statistic
heatmap_data_zscores = z_test_df_completion_rates.set_index('Process Step')[['Z-Statistic']].T  # Z-scores

fig = plt.figure(figsize=(7, 1))  

sns.heatmap(
    heatmap_data_zscores, 
    annot=True, 
    # I will use a diverging palette, because larger |Z| (absolute values) mean more statistical significance
    cmap='PRGn',
    center=0,
    fmt='.3f',  
    linewidths=0.5,  
    linecolor='gray',  
    # cbar_kws={'label': 'z-statistic'}, 
)
plt.title('Z-Statistic for Completion Rates by Process Step')
plt.xlabel('')
# plt.ylabel('z-statistic')
plt.tick_params(axis='x', rotation=0)  

plt.show()
[Figure: Z-Statistic for Completion Rates by Process Step]

Therefore, in start and step 2 - where the Z-value is far below -1.96 and far above 1.96, respectively - the observed result is unlikely to have occurred under the null hypothesis.

4.1.3.2. Effect size¶

While p-values tell us whether an effect is statistically significant, they do not indicate how practically meaningful that effect is. Effect size helps determine the magnitude of the difference, providing a clearer understanding of its impact.

In general, small effects can be statistically significant but not practically significant, while medium/large effects are more meaningful in decision-making.

As we are measuring proportions (completion rates), Cohen’s h is the preferred effect size measure:

\begin{aligned} h &= 2 \times \arcsin(\sqrt{p_{test}}) - 2 \times \arcsin(\sqrt{p_{control}}) \\ \text{where:} \\ h &: \text{ Cohen’s h} \\ p_{test} &: \text{ Proportion in the test group} \\ p_{control} &: \text{ Proportion in the control group} \end{aligned}

General Interpretation of Cohen's h (Effect Size):

| Cohen's h | Effect Size Interpretation |
| --- | --- |
| 0.01 - 0.20 | Small effect (minimal difference) |
| 0.20 - 0.50 | Medium effect (moderate difference) |
| 0.50+ | Large effect (strong difference) |
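
As a quick sanity check of the formula, plugging in the start-step completion rates reported above (Test ≈ 0.896, Control ≈ 0.960) gives

$$ h = 2\arcsin\left(\sqrt{0.896}\right) - 2\arcsin\left(\sqrt{0.960}\right) \approx 2\,(1.243 - 1.369) \approx -0.25 $$

which matches the value computed in the cell below.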
In [100]:
# Function to compute Cohen's h
def cohen_h(p1, p2):
    return 2 * (np.arcsin(np.sqrt(p1)) - np.arcsin(np.sqrt(p2)))

# Initialize a list to store Cohen’s h results
cohen_h_results = []

# Loop through each step and compute Cohen's h using precomputed completion rates
for step in [s for s in process_step_order if s != 'confirm']:
    # Get completion rates from the precomputed dictionary
    p_test_step = completion_rates_variation['Test'][step]
    p_control_step = completion_rates_variation['Control'][step]

    # Compute Cohen's h if valid proportions exist
    h_step = cohen_h(p_test_step, p_control_step) if not np.isnan(p_test_step) and not np.isnan(p_control_step) else np.nan

    # Append the results
    cohen_h_results.append({
        'Process Step': step,
        'Cohen\'s h': h_step,
        'Test Completion Rate': p_test_step,
        'Control Completion Rate': p_control_step
    })

# Convert the results to a DataFrame
cohen_h_df_steps = pd.DataFrame(cohen_h_results)

# Ensure the correct ordering of steps
cohen_h_df_steps['Process Step'] = pd.Categorical(cohen_h_df_steps['Process Step'], categories=process_step_order, ordered=True)
cohen_h_df_steps = cohen_h_df_steps.sort_values('Process Step')

print(cohen_h_df_steps)
  Process Step  Cohen's h  Test Completion Rate  Control Completion Rate
0        start      -0.25                  0.90                     0.96
1       step_1       0.06                  0.75                     0.73
2       step_2       0.05                  0.64                     0.62
3       step_3       0.04                  0.57                     0.55

Conclusions:

| Process Step | Completion Rate Result | Significance Interpretation | Effect Size Interpretation |
| --- | --- | --- | --- |
| start | The Control group has a higher completion rate (96%) than the Test group (90%) | Highly significant difference: the Test group is significantly worse at progressing past the start step | Moderate practical effect |
| step_1 | Test has a slightly higher completion rate than Control | The difference is small but statistically significant | Negligible effect size; in practice the improvement may not be meaningful |
| step_2 | Test has a slightly higher completion rate than Control | Significant difference: Test performs significantly better than Control | Very small effect |
| step_3 | Test has a slightly higher completion rate than Control | The difference in completion rates is not statistically significant | Very small effect |

Overall, the test group performs worse in terms of completion rates at the starting step but slightly better in the remaining steps. However, the practical effect of these differences is small.

4.2. Time Spent on Each Step¶

We will now compare the average time spent on each step between the test and control groups:

  • Hypothesis: Users in the test group will spend significantly less time on specific steps (or on the overall process) compared to the control group.
  • Null Hypothesis: There will be no significant difference in the time spent on each step (or the overall process) between the test and control groups.

Prior to testing the significance of the step-level time differences between the two variations, we need to establish whether the data follow a normal distribution. This is required in order to choose an appropriate significance test: the t-test, for example, assumes normality of the data in each group, and if this assumption is violated its results might not be reliable.

Given that our sample size for each step is large (close to 20,000), the Shapiro-Wilk test for normality is not suitable, as it tends to reject normality in large datasets even for minor deviations. We will use Quantile-Quantile (Q-Q) plots to check for normality instead:
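
As a complementary check (a sketch only, not part of the main analysis), the Shapiro-Wilk test could instead be run on a random subsample of each group, where its over-sensitivity to large n is less of an issue. The snippet below assumes the time_long dataframe built above:

# Sketch: Shapiro-Wilk on a random subsample (up to 500 observations) per step and variation
from scipy.stats import shapiro

for (step, variation), grp in time_long.groupby(['process_step', 'variation']):
    sample = grp['time'].sample(min(500, len(grp)), random_state=42)
    if len(sample) >= 3:  # shapiro() needs at least 3 observations
        stat, pval = shapiro(sample)
        print(f"{step} ({variation}): W={stat:.3f}, p={pval:.4f}")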

In [92]:
# Exclude the 'confirm' step 
filtered_steps = [step for step in process_step_order if step != 'confirm']

# Create a figure with 2 rows and 4 columns
fig, axes = plt.subplots(2, 4, figsize=(12, 6))  # 2 rows (Control & Test), 4 columns (one per step)

for col, step in enumerate(filtered_steps):
    for row, variation in enumerate(['Control', 'Test']):
        # Filter data
        step_data = time_long[(time_long['process_step'] == step) & (time_long['variation'] == variation)]['time']
        
        # Q-Q Plot 
        probplot(step_data, dist="norm", plot=axes[row, col])  
        axes[row, col].set_title(f"{step} ({variation})")

plt.tight_layout()
plt.show()
[Figure: Q-Q plots of time per step, Control vs Test]

The points deviate markedly from the straight line, indicating that the data are not normally distributed. Chi-square and two-proportion z-tests are designed for categorical data (e.g., proportions, counts), and the t-test assumes normality. We will therefore use a non-parametric test (the Mann-Whitney U test) to compare the time differences between the test and control groups: it is designed for continuous data and does not require normality.

In [93]:
# List to store Mann-Whitney U test results
mann_whitney_results = []

# Loop through each process step
for step in time_long['process_step'].unique():
    # Filter data for the current step
    step_data = time_long[time_long['process_step'] == step]
    
    # Separate times for test and control groups
    test_times = step_data[step_data['variation'] == 'Test']['time']
    control_times = step_data[step_data['variation'] == 'Control']['time']
    
    # Perform Mann-Whitney U test
    if len(test_times) > 0 and len(control_times) > 0:
        stat, pval = mannwhitneyu(test_times, control_times, alternative='two-sided')  # Two-sided test
    else:
        stat, pval = float('nan'), float('nan')  # Handle cases with insufficient data
    
    # Append results
    mann_whitney_results.append({
        'Process Step': step,
        'U-Statistic': stat,
        'P-Value': pval
    })

# Convert results to DataFrame
mann_whitney_df = pd.DataFrame(mann_whitney_results)

# Display the results
print(mann_whitney_df)
  Process Step  U-Statistic  P-Value
0        start 314088948.50     0.00
1       step_1 297505605.50     0.00
2       step_2 187008680.00     0.00
3       step_3 124730529.00     0.00
  • The calculated p-values are all reported as 0.00, i.e. they are vanishingly small. The Mann-Whitney U test is non-parametric: it ranks the data and compares distributions, and in samples as large as this, even small differences lead to extremely small p-values.
  • The Mann-Whitney U statistic measures how much the rank distributions of the two groups overlap: values close to (n1 × n2)/2 indicate heavily overlapping (similar) distributions, while values far from that midpoint indicate greater separation.

These results tell us that a small but significant difference exists; however, they do not measure how meaningful the difference is.

To measure the effect size, we will use the Rank-Biserial Correlation (RBC):

\begin{aligned} RBC &= 1 - \frac{2U}{n_1 n_2} \\ \text{where:} \\ RBC &: \text{ Rank-Biserial Correlation} \\ U &: \text{ Mann-Whitney U statistic} \\ n_1 &: \text{ Sample size of group 1} \\ n_2 &: \text{ Sample size of group 2} \end{aligned}

Where depending on the calculated RBC, we have:

  • |RBC| < 0.1 → Small effect
  • |RBC| ~ 0.3 → Medium effect
  • |RBC| > 0.5 → Large effect
  • A positive RBC will mean that the Test group has systematically higher values than the Control group. In this case (time spent per process step), this would mean Test users take longer than Control users, at that step.
  • A negative RBC means that the Test group has systematically lower values than the Control group, i.e. users complete that step faster than the Control group.
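
As a quick check of the formula with the step_3 numbers reported above (U = 124,730,529, n_test = 19,474, n_control = 15,966):

$$ RBC = 1 - \frac{2 \times 124{,}730{,}529}{19{,}474 \times 15{,}966} \approx 1 - 0.80 \approx 0.20 $$

which matches the value computed in the cell below.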
In [94]:
# Function to compute Rank-Biserial Correlation
def rank_biserial_correlation(u_stat, n1, n2):
    return 1 - (2 * u_stat) / (n1 * n2)

# List to store effect sizes
effect_sizes = []

# Loop through each process step and use the U-statistics calculated above
for index, row in mann_whitney_df.iterrows():
    step = row['Process Step']
    
    # Retrieve sample sizes
    test_n = time_long[(time_long['process_step'] == step) & (time_long['variation'] == 'Test')].shape[0]
    control_n = time_long[(time_long['process_step'] == step) & (time_long['variation'] == 'Control')].shape[0]
    
    # Compute Rank-Biserial Correlation
    if test_n > 0 and control_n > 0:
        rbc = rank_biserial_correlation(row['U-Statistic'], test_n, control_n)
    else:
        rbc = float('nan')  # Handle cases with insufficient data

    # Append results
    effect_sizes.append({'Process Step': step, 'Rank-Biserial Correlation': rbc})

# Convert to DataFrame
effect_size_df = pd.DataFrame(effect_sizes)

# Merge with Mann-Whitney results
mann_whitney_results_df = mann_whitney_df.merge(effect_size_df, on='Process Step')

# Display results
print(mann_whitney_results_df)
  Process Step  U-Statistic  P-Value  Rank-Biserial Correlation
0        start 314088948.50     0.00                       0.11
1       step_1 297505605.50     0.00                      -0.18
2       step_2 187008680.00     0.00                       0.07
3       step_3 124730529.00     0.00                       0.20
In [95]:
# Data for Rank-Biserial Correlation
rank_biserial_corr = mann_whitney_results_df['Rank-Biserial Correlation']

# Create the bar chart
plt.figure(figsize=(7, 3))
bars = plt.bar(mann_whitney_results_df['Process Step'], mann_whitney_results_df['Rank-Biserial Correlation'], color=[palette[3] if x > 0 else palette[2] for x in rank_biserial_corr])

# Add a horizontal line at y=0
plt.axhline(y=0, color='black', linestyle='--', linewidth=1)

# Labels and title
plt.xlabel("Process Step")
plt.ylabel("Rank-Biserial Correlation")
plt.title("Rank-Biserial Correlation per Process Step")

# Annotate bars with values
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval, f"{yval:.2f}", 
             ha='center', va='bottom' if yval < 0 else 'top', fontsize=10)

# Show the plot
plt.show()
[Figure: Rank-Biserial Correlation per Process Step]

How do we explain that in some steps the mean times suggest one trend, but the Rank-Biserial Correlation (RBC) suggests another (steps 1, 2 and 3)?

  • The Rank-Biserial Correlation is based on the ranking of values, not just means. If the Test variation has a lower median but a few extreme outliers pushing the mean up, it can still show a positive correlation while having a slightly higher mean. This means that the mean time alone is misleading for those specific steps.
| Process Step | Test Mean Time | Control Mean Time | Test Faster or Slower (Mean Time)? | RBC Value | RBC Interpretation (Ranks) | Do Mean & RBC Agree? |
| --- | --- | --- | --- | --- | --- | --- |
| Start | 88.81 sec | 85.57 sec | Test is slower | 0.11 | Test is slower | Yes, they agree |
| Step 1 | 83.02 sec | 63.41 sec | Test is slower | -0.18 | Test is faster | Contradiction |
| Step 2 | 110.26 sec | 116.84 sec | Test is faster | 0.07 | Test is slower | Contradiction |
| Step 3 | 138.65 sec | 158.66 sec | Test is faster | 0.20 | Test is slower | Contradiction |

To find out what the cause is, I will examine the percentiles for each step:

In [96]:
time_long.groupby(['process_step', 'variation'])['time'].describe()
Out[96]:
count mean std min 25% 50% 75% max
process_step variation
start Control 24551.00 85.57 360.83 1.00 12.00 25.00 59.00 41957.00
Test 28634.00 88.81 330.57 1.00 9.00 17.00 68.00 31624.00
step_1 Control 20288.00 63.41 221.90 1.00 11.00 25.00 56.00 16950.00
Test 24814.00 83.02 223.07 0.00 18.00 34.00 70.00 15620.00
step_2 Control 18204.00 116.84 212.45 1.00 51.00 82.00 135.00 21763.00
Test 22177.00 110.26 214.56 0.00 47.00 74.00 120.00 16036.00
step_3 Control 15966.00 158.66 234.41 1.00 49.00 87.00 170.00 6164.00
Test 19474.00 138.65 283.04 0.00 33.00 60.00 131.00 6760.00

Indeed, the higher 75th-percentile values for the Test variation at step 1 indicate that outliers are likely skewing the computed mean, while the opposite is true in steps 2 and 3.

Comments:

| Process Step | Statistical Significance | Effect Size |
| --- | --- | --- |
| Start | A significant portion of users take much longer in the Test variation than in Control, and a statistically significant difference exists. | A small positive Rank-Biserial correlation suggests the effect is small. |
| Step 1 | Most users in Test complete Step 1 faster, but a subset of users take significantly longer. The mean is skewed by a few very slow users in Test. | The Rank-Biserial correlation indicates a small to medium effect. |
| Step 2 | Test is faster by median and mean times, but RBC suggests the opposite. Even though most Test users are faster, some take much longer, and the difference is statistically significant. | A small positive Rank-Biserial correlation indicates users in the Test variation take slightly longer to complete step 2. |
| Step 3 | Median and mean confirm that Test is faster, but RBC suggests Test is slower. This means that while most Test users are faster, some take significantly longer. | A small to medium positive Rank-Biserial correlation indicates users in the Test variation take longer to complete step 3. |

The recommendation following these results would be to examine whether there are UI problems or points of confusion affecting only certain users.

4.3. Error Rates¶

  • Hypothesis: Users in the test group will experience significantly fewer errors (e.g., step back, step repeat) compared to the control group.
  • Null Hypothesis: There will be no significant difference in error rates between the test and control groups.
    Definition of Error Types: The types of errors we are measuring are step back, step repeat and invalid completions, i.e. without having gone through the required steps beforehand.

In this section, we will examine the significance and effect of the differences in "step back" and "step repeat" errors, as those were calculated in Section 3.3. As in the previous case, the steps we will follow are:

  1. Examine the distribution of the data
  2. Calculate the significance of the difference in error rates between the variations
  3. Calculate the effect size of the significant differences

Since our data consists of binary values (e.g., 0 = no step back, 1 = step back), the assumptions of a t-test are violated, because it assumes continuous data - which binary data is not.

Therefore, to check the statistical significance of the difference in error rates, we will use a two-proportion z-test (or a chi-square test for contingency tables).

4.3.1. Step back count errors¶

In [97]:
error_df.head()
Out[97]:
visit_id process_step variation step_repeat_errors step_back_errors
0 100012776_37918976071_457913 confirm Test 1 0
1 100019538_17884295066_43909 start Test 0 0
2 100019538_17884295066_43909 step_3 Test 0 0
3 100022086_87870757897_149620 confirm Test 0 0
4 100022086_87870757897_149620 start Test 0 0
In [98]:
# Initialize an empty list to store results
results = []

# Loop through each process step and perform the two-proportion z-test
for step in process_step_order:
    # Filter data for the current process step
    step_data = df_whole[df_whole['process_step'] == step]
    
    # Split into Test and Control groups for step_back_count
    test_group = step_data[step_data['variation'] == 'Test']['step_back_error']
    control_group = step_data[step_data['variation'] == 'Control']['step_back_error']
    
    # Calculate counts and sample sizes for each group
    count_test = test_group.sum()  # Number of "1"s in the test group
    n_test = len(test_group)      # Total sample size in the test group
    count_control = control_group.sum()  # Number of "1"s in the control group
    n_control = len(control_group)       # Total sample size in the control group
    
    # Perform the two-proportion z-test
    counts = [count_test, count_control]
    nobs = [n_test, n_control]
    stat, p_value = proportions_ztest(counts, nobs, alternative='two-sided')
    
    # Calculate effect size (Cohen's h)
    p_test = count_test / n_test
    p_control = count_control / n_control
    effect_size = 2 * abs(np.arcsin(np.sqrt(p_test)) - np.arcsin(np.sqrt(p_control)))
    
    # Add results as a dictionary to the list
    results.append({
        'Process Step': step,
        'Z-Statistic': stat,
        'P-Value': p_value,
        'Effect Size (Cohen\'s h)': effect_size
    })

# Convert the list of results into a DataFrame
step_back_error_results_df = pd.DataFrame(results)

print(step_back_error_results_df)
  Process Step  Z-Statistic  P-Value  Effect Size (Cohen's h)
0        start        30.76     0.00                     0.21
1       step_1         1.84     0.07                     0.01
2       step_2       -10.89     0.00                     0.10
3       step_3        -7.17     0.00                     0.07
4      confirm          NaN      NaN                     0.00
/usr/local/lib/python3.10/dist-packages/statsmodels/stats/weightstats.py:792: RuntimeWarning: invalid value encountered in scalar divide
  zstat = value / std

We therefore conclude the following (the confirm row is NaN because the z-statistic cannot be computed there: the pooled proportion has zero variance, which happens when no step back errors are recorded at that step in either group, hence the RuntimeWarning above):

| Process Step | Statistical Significance | Effect Size |
| --- | --- | --- |
| Start | Highly significant difference - more step back errors in the Test group. | Small effect. |
| Step 1 | Marginally significant difference - no clear variation impact. | No meaningful effect - similar error rates in both groups. |
| Step 2 | Highly significant difference - fewer step back errors in the Test group (negative z-statistic). | Small effect. |
| Step 3 | Highly significant difference - fewer step back errors in the Test group. | No meaningful effect - statistically significant but negligible in size. |

4.3.2. Step repeat count errors¶

Let's now check the significance of the errors noted in the step repeat errors, between the variations. Since we are dealing with binary data (0 or 1), we will also use a two-proportion z-test:

In [99]:
# Initialize an empty list to store results
results = []

# Loop through each process step and perform the two-proportion z-test
for step in process_step_order:
    # Filter data for the current process step
    step_data = df_whole[df_whole['process_step'] == step]
    
    # Split into Test and Control groups for step_repeat_error
    test_group = step_data[step_data['variation'] == 'Test']['step_repeat_error']
    control_group = step_data[step_data['variation'] == 'Control']['step_repeat_error']
    
    # Calculate counts and sample sizes for each group
    count_test = test_group.sum()  # Number of "1"s in the test group
    n_test = len(test_group)      # Total sample size in the test group
    count_control = control_group.sum()  # Number of "1"s in the control group
    n_control = len(control_group)       # Total sample size in the control group
    
    # Check if we have valid data to run the test
    if n_test > 0 and n_control > 0 and (count_test > 0 or count_control > 0):  
        # Perform the two-proportion z-test
        counts = [count_test, count_control]
        nobs = [n_test, n_control]
        stat, p_value = proportions_ztest(counts, nobs, alternative='two-sided')

        # Calculate effect size (Cohen's h)
        p_test = count_test / n_test
        p_control = count_control / n_control
        effect_size = 2 * abs(np.arcsin(np.sqrt(p_test)) - np.arcsin(np.sqrt(p_control)))
    else:
        # If the test is not valid due to missing values, assign NaN
        stat, p_value, effect_size = np.nan, np.nan, np.nan

    # Add results as a dictionary to the list
    results.append({
        'Process Step': step,
        'Z-Statistic': stat,
        'P-Value': p_value,
        'Effect Size (Cohen\'s h)': effect_size
    })

# Convert the list of results into a DataFrame
step_repeat_error_results_df = pd.DataFrame(results)

print(step_repeat_error_results_df)
  Process Step  Z-Statistic  P-Value  Effect Size (Cohen's h)
0        start         9.46     0.00                     0.06
1       step_1         1.18     0.24                     0.01
2       step_2       -10.41     0.00                     0.09
3       step_3        -1.58     0.11                     0.01
4      confirm        22.51     0.00                     0.24

For the step repeat errors we conclude that:

| Process Step | Statistical Significance | Effect Size |
| --- | --- | --- |
| Start | Highly significant difference - Test users repeat steps more often than Control users. | Small effect. |
| Step 1 | No significant difference. | No meaningful effect - similar step repeat error rates in both groups. |
| Step 2 | Highly significant difference - Control users repeat steps slightly more often. | Small effect - the Test group has slightly fewer step repeat errors. |
| Step 3 | No significant difference. | No meaningful effect - the difference is not statistically significant and practically negligible. |
| Confirm | Highly significant difference - Test users repeat the confirmation step more often. | Small effect. |

4.4. Drop-off rates¶

  • Hypothesis: Users in the test group will have a significantly lower drop-off rate at specific steps compared to the control group.
  • Null Hypothesis: There will be no significant difference in drop-off rates at each step between the test and control groups.

The initial dropped_off values are binary (True/False), which means they do not follow a continuous distribution. A t-test is therefore not suitable, as it assumes the data being compared are continuous. We will use a Chi-Square Test instead, to examine whether there is a significant association between variation (Test vs. Control) and drop-off status (True vs. False) at each process step. The Chi-Square Test is suitable for categorical data like this.
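For each step, the test operates on a 2×2 contingency table of variation (Test/Control) by drop-off status (True/False), comparing the observed cell counts $O_{ij}$ with the counts $E_{ij}$ expected if drop-off were independent of the variation:

\begin{aligned}
\chi^2 &= \sum_{i,j} \frac{(O_{ij} - E_{ij})^2}{E_{ij}}, \qquad E_{ij} = \frac{(\text{row } i \text{ total}) \times (\text{column } j \text{ total})}{n}
\end{aligned}

A large $\chi^2$ (and hence a small p-value) indicates that drop-off behavior differs between the variations more than chance alone would explain.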

In [102]:
# Initialize a dictionary to store results
chi_square_results_drop_off = {'Process Step': [], 'P-Value': []}

# Loop through each process step
for step in dropoff_rates['process_step'].unique():
    # Filter data for the current step
    step_data = df_whole[df_whole['process_step'] == step]
    
    # Create a contingency table
    contingency_table = pd.crosstab(step_data['variation'], step_data['dropped_off'])
    
    # Perform Chi-Square test
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)
    
    # Append results
    chi_square_results_drop_off['Process Step'].append(step)
    chi_square_results_drop_off['P-Value'].append(p_value)

# Convert results to a DataFrame
chi_square_df_drop_off = pd.DataFrame(chi_square_results_drop_off)

print(chi_square_df_drop_off)
  Process Step  P-Value
0        start     0.00
1       step_1     0.01
2       step_2     0.00
3       step_3     0.05
4      confirm     1.00
In [103]:
# Prepare data for heatmap
heatmap_data = chi_square_df_drop_off.set_index('Process Step').T

plt.figure(figsize=(8, 2))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    linewidths=0.5,
    # cbar_kws={'label': 'P-Value'}
)
plt.title('Chi-Square Test P-Values for Drop-Off Comparisons', fontsize=16)
plt.xlabel('Process Step', fontsize=12)
plt.ylabel('P-Value', fontsize=12)
plt.tight_layout()
plt.show()
[Figure: heatmap of Chi-Square test p-values for drop-off comparisons, by process step]

Effect Size for the Drop-Off Rates Analysis: Since we are using a Chi-Square Test to evaluate the relationship between variation (Test vs. Control) and drop-off status (True vs. False), an effect metric we can use is Cramér’s V:

\begin{aligned}
V &= \sqrt{\frac{\chi^2}{n\,(k-1)}} \\
\text{where: } \quad
\chi^2 &: \text{ the Chi-Square statistic} \\
n &: \text{ the total sample size} \\
k &: \text{ the smaller of the number of rows and columns in the contingency table}
\end{aligned}

  • Interpretation:

    • 0.1 → Small effect
    • 0.3 → Medium effect
    • 0.5+ → Large effect
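For example (illustrative numbers only, not taken from this dataset): a 2×2 table with $\chi^2 = 100$ and $n = 10{,}000$ observations gives $V = \sqrt{100 / 10\,000} = 0.1$, i.e. a small effect.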
In [104]:
# Initialize a list to store effect size results
cramers_v_results = []

# Loop through each process step
for step in chi_square_df_drop_off['Process Step']:
    # Filter data for the current step
    step_data = df_whole[df_whole['process_step'] == step]

    # Create a contingency table
    contingency_table = pd.crosstab(step_data['variation'], step_data['dropped_off'])

    # Perform Chi-Square test
    chi2, _, _, _ = chi2_contingency(contingency_table)

    # Compute sample size (n) and Cramér’s V
    n = contingency_table.to_numpy().sum()
    k = min(contingency_table.shape)
    cramers_v = np.sqrt(chi2 / (n * (k - 1))) if k > 1 else 0  # Avoid division by zero

    # Append results
    cramers_v_results.append({'Process Step': step, 'Effect Size (Cramér’s V)': cramers_v})

# Convert results to a DataFrame
cramers_v_df = pd.DataFrame(cramers_v_results)

# Merge with existing Chi-Square results
chi_square_df_drop_off = chi_square_df_drop_off.merge(cramers_v_df, on='Process Step')

print(chi_square_df_drop_off)
  Process Step  P-Value  Effect Size (Cramér’s V)
0        start     0.00                      0.02
1       step_1     0.01                      0.01
2       step_2     0.00                      0.03
3       step_3     0.05                      0.01
4      confirm     1.00                      0.00

For the drop-off rate differences per step between the two variations we can see that:

| Process Step | Statistical Significance | Effect Size Interpretation |
| --- | --- | --- |
| start | Highly significant difference in drop-off rates between the variations. | Small effect. |
| step_1 | Significant difference, but an almost negligible rate difference. | Very small effect - minimal difference in drop-off behavior. |
| step_2 | Highly significant difference. | Small effect. |
| step_3 | Marginally significant difference. | Very small effect - borderline significant and practically negligible. |
| confirm | No significant difference. | No effect. |

5. Conclusions¶

The main question of this A/B test, addressed through the analysis conducted above, is whether the new interface is effective and enhances the user experience and completion rates for clients.

Conclusions

The A/B test results indicate that the Test variation led to a statistically significant increase in completion rates compared to the Control group, with an absolute improvement of roughly 1.5 percentage points (46.60% vs. 45.06%), confirming the primary hypothesis. Completion rates across subsequent steps were similar between the two variations. An analysis of individual steps shows that the Test group experienced a higher drop-off at the start step (Cohen's h = -0.25), suggesting initial friction. In terms of time spent per step, the Test group generally moved faster through step 2 and step 3, but had a higher average time in step 1, indicating some inconsistencies in user experience. Error rates (step backs and step repeats) were largely comparable between variations, except for a slight increase in step back errors at the start step for the Test group, reinforcing potential friction in onboarding.

Recommendations

To further optimize the Test variation, efforts should focus on reducing friction at the start step, where the higher drop-off rates and step back errors were observed. Investigating user behavior at that step (e.g., via session recordings or survey feedback) can surface the underlying UX barriers and may warrant simplifying instructions or adjusting the process flow. Since completion rates improved despite these inefficiencies, refining the onboarding experience while preserving the positive aspects of the Test variation should further enhance completion rates.

In [106]:
# Export data to use in a Tableau dashboard

# 1. Error Rates Summary
error_df.to_csv("error_df.csv", index=False)
step_back_error_results_df.to_csv("step_back_error_results_df.csv", index=False)
step_repeat_error_results_df.to_csv("step_repeat_error_results_df.csv", index=False)

# 2. Drop-off Rates DataFrame
dropoff_rates.to_csv("dropoff_rates.csv", index=False)
chi_square_df_drop_off.to_csv("chi_square_df_drop_off.csv", index=False)

# 3. Completion Rates
# Convert dictionary to DataFrame in long format directly
df_long = (
    pd.DataFrame
    .from_dict(completion_rates_variation, orient='index')  # Convert dictionary to DataFrame
    .reset_index()  # Convert index (Control/Test) to a column
    .rename(columns={'index': 'Variation'})  # Rename index column
    .melt(id_vars=['Variation'], var_name='Process Step', value_name='Completion Rate')  # Convert to long format
)
# Export to CSV
df_long.to_csv("completion_rates_variation_step.csv", index=False)

# Completion rates by age
completion_rates_by_age.to_csv("valid_completion_rate_by_age_group.csv", index=False)
z_test_df_completion_rates.to_csv("z_test_df_completion_rates.csv", index=False)

# 4. Mean time per step
mean_time.to_csv("mean_time.csv", index=False)
# for time spent on each step
mann_whitney_results_df.to_csv("mann_whitney_results_df.csv", index=False)

# 5. Number of Clients per variation
df_experiment_clients.to_csv("df_experiment_clients.csv", index=False) 

# 6. Get daily visits per variation
daily_visits.to_csv("daily_visits.csv", index=False)

# 7. Age by variation
age_group_stats.to_csv("age_group_stats.csv", index=False)

# 8. Daily Unique Visits by Variation
df_whole.to_csv("df_whole.csv", index=False)