1
# Python implementation of A/B test analysis
import scipy.stats as stats

# Sample data from experiment
control_contacts = 5842
control_responses = 837
treatment_contacts = 5813
treatment_responses = 1046

# Calculate response rates
control_rate = control_responses / control_contacts
treatment_rate = treatment_responses / treatment_contacts

# Run statistical test: pooled two-proportion z-test.
# BUGFIX: the original called stats.proportions_ztest, which does not exist
# in scipy.stats (it lives in statsmodels.stats.proportion); compute the
# equivalent pooled-variance z-test explicitly with scipy instead.
_p_pooled = (control_responses + treatment_responses) / (control_contacts + treatment_contacts)
_se = (_p_pooled * (1 - _p_pooled) * (1 / control_contacts + 1 / treatment_contacts)) ** 0.5
z_score = (treatment_rate - control_rate) / _se
# Two-sided p-value from the standard normal survival function
p_value = 2 * stats.norm.sf(abs(z_score))

print(f"Control response rate: {control_rate:.2%}")
print(f"Treatment response rate: {treatment_rate:.2%}")
print(f"p-value: {p_value:.4f}")
2
# Experimental design code
import numpy as np
from scipy import stats
def calculate_sample_size(
    baseline_rate=0.15,          # Current engagement rate (control proportion p1)
    min_detectable_effect=0.05,  # Absolute lift to detect (p2 - p1)
    significance_level=0.05,     # Two-sided alpha
    power=0.80                   # Target power (1 - beta)
):
    """Return the required sample size PER GROUP for a two-proportion test.

    Uses the standard normal-approximation formula
        n = (z_{1-a/2} * sqrt(2*pbar*qbar) + z_{power} * sqrt(p1*q1 + p2*q2))^2
            / (p2 - p1)^2
    BUGFIX: the original merely summed the two z critical values and doubled
    the result (~5), which is not a sample-size calculation at all.
    """
    from math import ceil

    p1 = baseline_rate
    p2 = baseline_rate + min_detectable_effect
    p_bar = (p1 + p2) / 2
    # Critical values for the chosen alpha (two-sided) and power
    z_alpha = stats.norm.ppf(1 - significance_level / 2)
    z_beta = stats.norm.ppf(power)
    numerator = (
        z_alpha * (2 * p_bar * (1 - p_bar)) ** 0.5
        + z_beta * (p1 * (1 - p1) + p2 * (1 - p2)) ** 0.5
    ) ** 2
    # Round up: sample sizes must be whole observations per group
    return ceil(numerator / min_detectable_effect ** 2)
3
# Python code for hypothesis definition
def define_hypotheses():
    """Power analysis for the engagement experiment.

    Hypotheses under test (two-tailed):
        H0: mu_treatment - mu_control = 0
        H1: mu_treatment - mu_control != 0
    where mu is the mean 24-hour engagement rate.

    Returns the minimum sample size per group for the chosen design.
    """
    # Imported locally so the snippet stays self-contained
    import statsmodels.stats.power as smp

    # Design parameters for the power calculation
    minimum_effect = 0.15   # Minimum detectable effect (standardized, Cohen's d)
    alpha_level = 0.05      # Two-sided significance level
    target_power = 0.80     # Desired probability of detecting the effect

    power_solver = smp.TTestIndPower()
    return power_solver.solve_power(
        minimum_effect, power=target_power, alpha=alpha_level
    )
# Minimum sample size per group, from the power analysis above
min_sample_size = define_hypotheses()
print(f"Required sample size per group: {min_sample_size:.0f}")
# Mathematical representation
# Mathematical representation:
#   H₀: μ_treatment - μ_control = 0
#   H₁: μ_treatment - μ_control ≠ 0
# SQL query to calculate our primary metric
-- Primary metric by experiment arm: engaged_within_24h is a 0/1 flag, so
-- AVG() yields the engagement rate directly and SUM() the engaged count.
SELECT
assigned_feature_flag AS test_group,
COUNT(*) AS total_outreach,
SUM(engaged_within_24h) AS engaged_count,
AVG(engaged_within_24h) AS engagement_rate
FROM experiment_data
GROUP BY assigned_feature_flag;
# Python function to verify metric integrity
def validate_engagement_metric(df):
    """
    Validates the engaged_within_24h metric:
    1. Checks for missing values
    2. Verifies binary nature (0 or 1 only)
    3. Confirms temporal accuracy (engagement timestamp - outreach timestamp ≤ 24h)

    Raises AssertionError when a check fails; returns True otherwise.
    BUGFIX: the original ended with `return True4` (a fused section marker,
    a syntax error) and implemented only check 2 of the three it documented.
    """
    metric = df['engaged_within_24h']
    # Check 1: no missing values (NaN would silently pass a bare isin test)
    assert metric.notna().all(), "Metric contains missing values"
    # Check 2: strictly binary
    assert metric.isin([0, 1]).all(), "Metric must be binary"
    # Check 3: temporal accuracy — only possible when both timestamp columns
    # are present. NOTE(review): assumes datetime-typed columns — confirm.
    if {'engagement_timestamp', 'outreach_timestamp'}.issubset(df.columns):
        engaged_rows = df[metric == 1]
        delay = engaged_rows['engagement_timestamp'] - engaged_rows['outreach_timestamp']
        assert (delay.dt.total_seconds() <= 24 * 3600).all(), \
            "Engagement recorded more than 24h after outreach"
    return True
import pandas as pd
import psycopg2
# Database connection
# NOTE(review): credentials are placeholders ("****") — presumably injected
# from a secrets store in the real pipeline; confirm before running.
conn = psycopg2.connect(
dbname="sales_analytics",
user="analyst",
password="****",
host="db.internal"
)
# Extract relevant fields: last 30 days of outreach events with their
# experiment assignment, covariates, and the binary 24h-engagement outcome.
query = """
SELECT
rep_id, account_id,
assigned_feature_flag,
industry, region,
outreach_timestamp,
engaged_within_24h
FROM outreach_data
WHERE outreach_timestamp >= NOW() - INTERVAL '30 days'
"""
df = pd.read_sql(query, conn)
print(f"Extracted {len(df)} records")
# Check treatment/control distribution
# value_counts(normalize=True) gives each arm's share of all records;
# for a 50/50 split both shares should be close to 0.50.
# NOTE(review): assumes assigned_feature_flag takes values 0/1 — confirm,
# otherwise the [1]/[0] lookups below raise KeyError.
assignment_counts = df['assigned_feature_flag'].value_counts(normalize=True)
print(f"Treatment: {assignment_counts[1]:.2%}")
print(f"Control: {assignment_counts[0]:.2%}")
# Chi-square tests for independence
from scipy.stats import chi2_contingency
# Industry balance check: a large p-value means industry composition is not
# detectably different between arms (randomization looks healthy).
industry_contingency = pd.crosstab(
df['industry'],
df['assigned_feature_flag']
)
chi2, p, _, _ = chi2_contingency(industry_contingency)
print(f"Industry balance p-value: {p:.4f}")
# Similar tests for region
# One-hot encoding for categorical variables (drop_first avoids perfect
# collinearity in downstream regressions)
df_processed = pd.get_dummies(
df,
columns=['industry', 'region'],
drop_first=True
)
# Create interaction terms for segment analysis
# NOTE(review): 'enterprise'/'mid_market'/'smb' read like company-size
# segments, yet they are looked up as industry_* dummy columns — confirm
# the segment taxonomy actually lives in the industry column.
segments = ['enterprise', 'mid_market', 'smb']
for segment in segments:
    column_name = f'industry_{segment}'
    if column_name in df_processed.columns:
        # dummy * treatment flag -> 1 only for treated rows in this segment
        df_processed[f'{column_name}_treatment'] = (
            df_processed[column_name] *
            df_processed['assigned_feature_flag']
        )
# Final validation
assert df_processed.isnull().sum().sum() == 0, "Missing values found"
print(f"Final dataset shape: {df_processed.shape}")
5
# Python power analysis code
# BUGFIX: the original fed the raw difference in proportions (0.03) to
# statsmodels' TTestIndPower as if it were a Cohen's d, which misstates the
# required n by an order of magnitude (and its claimed output of 698 is not
# reproducible either way). For a two-proportion design, use the
# normal-approximation sample-size formula directly.
from scipy.stats import norm
# Parameters
p1 = 0.18          # Control group proportion
p2 = 0.18 + 0.03   # Treatment group proportion
alpha = 0.05       # Significance level (two-sided)
power = 0.8        # Target power

def required_sample_size(p_control, p_treatment, alpha=0.05, power=0.8):
    """Per-group n for a two-sided two-proportion z-test (normal approx.)."""
    z_a = norm.ppf(1 - alpha / 2)
    z_b = norm.ppf(power)
    p_bar = (p_control + p_treatment) / 2
    numerator = (
        z_a * (2 * p_bar * (1 - p_bar)) ** 0.5
        + z_b * (p_control * (1 - p_control) + p_treatment * (1 - p_treatment)) ** 0.5
    ) ** 2
    n = numerator / (p_treatment - p_control) ** 2
    return int(n) + (n > int(n))  # ceil without importing math

# Calculate required sample size
n = required_sample_size(p1, p2, alpha=alpha, power=power)
print(f"Required observations per group: {n}")
6
# R code for two-proportion test analysis
treatment_success <- 562
treatment_total <- 2458
control_success <- 462
control_total <- 2542
# Calculate proportions
p1 <- treatment_success/treatment_total # 22.86%
p2 <- control_success/control_total # 18.17%
# Perform the test.
# NOTE(review): prop.test runs a chi-squared test with Yates' continuity
# correction by default — it matches a hand-computed z-test only with
# correct = FALSE; confirm which variant the quoted Z = 4.11 came from.
# With alternative = "greater" the interval reported by prop.test is
# one-sided [lower, 1], not the two-sided interval quoted below.
prop.test(
    x = c(treatment_success, control_success),
    n = c(treatment_total, control_total),
    alternative = "greater"
)
# Statistical output summary (as quoted in the source report)
# Z = 4.11, p-value < 0.0001
# 95% confidence interval: [2.45%, 6.93%]
# Effect size (Cohen's h): 0.12 (small-to-medium effect)
7
# Python implementation of two-proportion z-test
import numpy as np
import scipy.stats as stats

# Treatment group data
treatment_success = 562
treatment_total = 2458
# Control group data
control_success = 462
control_total = 2542

# Observed response proportions for each arm
p1 = treatment_success / treatment_total   # treatment, ~0.2286
p2 = control_success / control_total       # control, ~0.1817

# Pooled proportion across both arms (under H0: equal rates)
p_pooled = (treatment_success + control_success) / (treatment_total + control_total)

# Standard error of the difference under the pooled null
inverse_ns = 1 / treatment_total + 1 / control_total
se = np.sqrt(p_pooled * (1 - p_pooled) * inverse_ns)

# Observed lift and its z-statistic (~4.11)
observed_diff = p1 - p2
z_stat = observed_diff / se

# Two-sided p-value from the normal survival function (< 0.0001)
p_value = 2 * stats.norm.sf(abs(z_stat))

# 95% confidence interval for the difference in proportions
margin_error = 1.96 * se
ci_lower = observed_diff - margin_error   # ~0.0245 (2.45%)
ci_upper = observed_diff + margin_error   # ~0.0693 (6.93%)
8
# Python implementation of segment analysis
def calculate_segment_lift(df, segment_col):
    """Percent engagement lift (treatment vs control) for each segment.

    Parameters:
        df: DataFrame with a 'group' column ('control'/'treatment'), an
            'engagement' column, and the segment column.
        segment_col: name of the column to segment by.

    Returns:
        dict mapping segment -> lift percentage rounded to 2 decimals, or
        None when the control mean is zero or undefined (empty arm), where
        lift cannot be computed.
    """
    results = {}
    for segment in df[segment_col].unique():
        segment_data = df[df[segment_col] == segment]
        control = segment_data[segment_data['group'] == 'control']['engagement'].mean()
        treatment = segment_data[segment_data['group'] == 'treatment']['engagement'].mean()
        # BUGFIX: the original divided by zero (ZeroDivisionError) or by NaN
        # when a segment had no or all-zero control observations.
        if pd.isna(control) or pd.isna(treatment) or control == 0:
            results[segment] = None
            continue
        results[segment] = round((treatment - control) / control * 100, 2)
    return results
9
# Python Analysis Code
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Run regression model with segment interaction (treatment*segment expands
# to main effects plus the interaction term)
# NOTE(review): experiment_data, lift_pct, and p_value are not defined in
# this snippet — presumably created earlier in the pipeline; confirm before
# running this section standalone.
model = ols('engagement ~ treatment*segment', data=experiment_data).fit()
anova_results = sm.stats.anova_lm(model, typ=2)
# Effect sizes by segment
# NOTE(review): this rebinds the name `segments` used as a list earlier in
# the file — harmless here, but confusing; consider renaming.
segments = experiment_data.groupby('segment')['engagement'].agg(
    ['mean', 'count', 'std']).reset_index()
print(f"Overall lift: {lift_pct:.2f}%, p-value: {p_value:.6f}")
10
# Algorithm Overview - Optimal Outreach Timing
def calculate_optimal_time(user_data, region, industry):
    """Choose an outreach window for a user, scaled by segment multipliers."""
    # Start from the user's own historical engagement pattern
    engagement_history = user_data.get_historical_engagement_times()
    # Unknown industries/regions fall back to a neutral 1.0 multiplier
    scale_industry = INDUSTRY_COEFFICIENTS.get(industry, 1.0)
    scale_region = REGION_COEFFICIENTS.get(region, 1.0)
    return optimize_time_window(
        engagement_history, scale_industry, scale_region
    )
# Industry Coefficient Table
# Multiplier applied per industry when scoring outreach windows; the inline
# ratios (observed lift / baseline) are the stated source of each value.
INDUSTRY_COEFFICIENTS = {
    'Healthcare': 1.41,     # 9.02% lift / 6.4% baseline
    'Manufacturing': 1.28,  # 6.14% lift / 4.8% baseline
    'Technology': 1.12,     # 4.1% lift / 3.65% baseline
    'Financial': 1.05       # 3.2% lift / 3.05% baseline
}
# Region Optimization Parameters
# NOTE(review): only the lift is quoted here; the baseline used to derive
# these multipliers is not shown — confirm the derivation matches the
# industry table above.
REGION_COEFFICIENTS = {
    'APAC': 1.22,   # 5.64% lift
    'EMEA': 1.15,   # 4.9% lift
    'NA': 1.08,     # 4.2% lift
    'LATAM': 1.10   # 4.35% lift
}
-- Response Time Analysis SQL
-- Response-time profile per industry/region for treated interactions only
-- (feature_enabled = true), restricted to cells with more than 500
-- interactions so the averages are stable; fastest cells listed first.
SELECT
    industry,
    region,
    AVG(response_time_seconds) as avg_response,
    -- Median is robust to the long tail that can inflate avg_response
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY response_time_seconds) as median_response,
    COUNT(*) as total_interactions
FROM interactions
WHERE feature_enabled = true
AND timestamp >= '2023-04-01'
GROUP BY industry, region
HAVING COUNT(*) > 500
ORDER BY avg_response ASC;
11