import os
import sys
import json
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ab_bayes_test import ABBayesTest

# -----------------------------
# Local application imports
# -----------------------------
# Make the project root (the parent of the current working directory)
# importable so that the `src` package below can be resolved.
# NOTE(review): assumes the notebook runs from a direct child folder of the
# project root (e.g. `notebooks/`) — confirm.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))
# Re-running this cell must not keep appending the same entry to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
from  src.helper_functions import (summarize_dataframe, 
                                   boxplot_outliers_groups,
                                   histogram_groups,
                                   categorical_distribution_groups,
                                   plot_histograms_data_1vsdata_2,
                                   plot_histogram_single,
                                   simulate_profit_posterior,
                                   plot_kde_data1vsdata_2_same_plot,
                                   display_recommendation
                                   )

# Load the raw A/B test export; the path is relative to the current working
# directory.
df_raw = pd.read_csv("../data/cookie_cats.csv")
display(df_raw.head(11))
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only added a stray "None" to the cell output.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90189 entries, 0 to 90188
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   userid          90189 non-null  int64 
 1   version         90189 non-null  object
 2   sum_gamerounds  90189 non-null  int64 
 3   retention_1     90189 non-null  bool  
 4   retention_7     90189 non-null  bool  
dtypes: bool(2), int64(2), object(1)
memory usage: 2.2+ MB

None

# userid is an identifier, not a quantity: keep it as text.
userid_as_text = df_raw['userid'].astype(str)
df_raw['userid'] = userid_as_text

# All checks and feature engineering run on a working copy so that df_raw
# stays as loaded.
df_checks = df_raw.copy()

# Lift pandas' display limits (columns, rows, column width) only while the
# summary is produced.
unlimited_display = []
for option_name in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    unlimited_display.extend([option_name, None])

with pd.option_context(*unlimited_display):
    summarize_dataframe(df_checks)

===== DATA OVERVIEW =====

DataFrame shape: (90189, 5)

===== CATEGORICAL / BOOLEAN STATS =====

Column: userid

Column: version

Column: retention_1

Column: retention_7

DataFrame head (5 rows):

# Arguments shared by the box plot and the histogram of the raw round counts.
# percentile_filter=(0, 95) is optional; presumably it restricts the plotted
# values to that percentile range — verify against the helper.
rounds_plot_args = dict(
    df=df_checks,
    column='sum_gamerounds',
    label_col='version',
    outlier_color='red',
    title='Outlier Analysis for sum_gamerounds by Version',
    percentile_filter=(0, 95),
    show_mean=True,
)

boxplot_outliers_groups(**rounds_plot_args)

# Same view as a histogram, additionally marking the median.
histogram_groups(**rounds_plot_args, show_median=True, bins=50)

# Box-Cox is only defined for strictly positive data. Work on a copy of the
# round counts and shift it by one if any value is zero or negative.
y = df_raw['sum_gamerounds'].copy()
if y.min() <= 0:
    y = y + 1

# Fit the transformation; lambda_ is the power parameter estimated by scipy.
y_boxcox, lambda_ = stats.boxcox(y)

# Store the transformed values next to the original column.
df_checks['sum_gamerounds_boxcox'] = y_boxcox

print(f"Lambda used for Box-Cox: {lambda_:.4f}")

#Output: 
#Lambda used for Box-Cox: -0.0302

Lambda used for Box-Cox: -0.0302

# log(1 + x) keeps zero-round players defined (log1p(0) == 0).
df_checks['sum_gamerounds_log'] = np.log1p(df_checks['sum_gamerounds'])

# Plot both transformed columns with identical settings; (0, 100) keeps the
# full percentile range.
for transformed_col in ('sum_gamerounds_log', 'sum_gamerounds_boxcox'):
    histogram_groups(
        df=df_checks,
        column=transformed_col,
        label_col='version',
        outlier_color='red',
        title=f'Outlier Analysis for {transformed_col} by Version',
        percentile_filter=(0, 100),
        show_mean=True,
        show_median=True,
        bins=50,
    )

# Summarize only the two transformed columns, with pandas' display limits
# lifted for the duration of the call.
transformed_view = df_checks[['sum_gamerounds_log', 'sum_gamerounds_boxcox']]
full_display = (
    'display.max_columns', None,
    'display.max_rows', None,
    'display.max_colwidth', None,
)
with pd.option_context(*full_display):
    summarize_dataframe(transformed_view)

===== DATA OVERVIEW =====

DataFrame shape: (90189, 2)

DataFrame head (5 rows):

# Keep only players who played at least one round. The explicit copy makes
# the filtered frame independent of the unfiltered one, so columns added to
# it later are plain assignments rather than writes to a filtered slice
# (the pattern pandas flags with SettingWithCopyWarning).
df_checks = df_checks.query("sum_gamerounds > 0").copy()

# Report how many rows the filter removed relative to the raw dataset.
# The column counts in the two shapes differ because df_checks also carries
# the engineered columns.
initial_shape =  df_raw.shape
new_shape =  df_checks.shape
drop_pct = 100 * (initial_shape[0] - new_shape[0]) / initial_shape[0] 
print(f"Dimension of the entire dataset: {initial_shape}\n"
      f"Dimension after removing zero sum_gamerounds: {new_shape} \n "
      f"(drop of {drop_pct:.2f}% of rows), as expected!")

Dimension of the entire dataset: (90189, 5)
Dimension after removing zero sum_gamerounds: (86195, 7) 
 (drop of 4.43% of rows), as expected!

# ----------------------------------------------------------------------------------
# Retention distributions within each group (Gate 30 / Gate 40)
# ----------------------------------------------------------------------------------
# Side-by-side bar plots of "retention_1" and "retention_7", split by "version".
# - Gate 30 and Gate 40 are normalized independently (normalize_within_group=True),
#   so each bar is the share of that group's players in a retention category.
# - Use this to compare retention *within* each gate, not the overall retention
#   across all players.
within_group_args = dict(
    categorical_cols=["retention_1", "retention_7"],
    target_col="version",
    top_k=None,
    group_titles=("Gate 30", "Gate 40"),
    global_normalize=False,
    normalize_within_group=True,
)

categorical_distribution_groups(df=df_checks, **within_group_args)

# Split point: the mean of log1p(rounds) over the filtered players.
mean_log = df_checks['sum_gamerounds_log'].mean()

played_above_mean = df_checks['sum_gamerounds_log'] > mean_log
played_at_or_below_mean = df_checks['sum_gamerounds_log'] <= mean_log

# For each retention flag, mark retained players who played above the mean
# and retained players who played at or below it. Non-retained players are
# False in both columns.
for retention_col in ('retention_1', 'retention_7'):
    df_checks[f'{retention_col}_above_mean'] = df_checks[retention_col] & played_above_mean
    df_checks[f'{retention_col}_below_mean'] = df_checks[retention_col] & played_at_or_below_mean

# One figure per retention horizon, each normalized within its own group.
for retention_col in ('retention_1', 'retention_7'):
    categorical_distribution_groups(
        df=df_checks,
        categorical_cols=[f'{retention_col}_below_mean', f'{retention_col}_above_mean'],
        target_col="version",
        top_k=None,
        group_titles=("Gate 30", "Gate 40"),
        global_normalize=False,
        normalize_within_group=True,
        top=0.9,
        hspace=0.8,
        wspace=0.4,
    )

# ----------------------------------------------------------------------------------
# Retention shares relative to ALL players (global normalization).
# With global_normalize=True the bars of a single group no longer sum to 100%:
# each bar is that group/category's share of the whole player base.
# This shows how much each group and retention category weighs in the total,
# rather than the distribution inside each group.
# ----------------------------------------------------------------------------------
global_share_args = dict(
    categorical_cols=["retention_1", "retention_7"],
    target_col="version",
    group_titles=("Gate 30", "Gate 40"),
    global_normalize=True,
    normalize_within_group=False,
)

categorical_distribution_groups(df=df_checks, **global_share_args)

## Bad Users in Both Groups
# ----
# A user should belong to exactly one variant. Count the distinct versions
# per userid and list every user seen under more than one.
# ----
versions_per_user = df_checks.groupby('userid')[['version']].nunique()
versions_per_user[versions_per_user['version'] > 1].reset_index()

# Output: Empty DataFrame -> no user appears in both groups, all good!

## Instantiate A/B tests
# -----------------------
# Settings shared by both retention tests; only the outcome column differs.
retention_test_args = dict(
    df=df_checks,
    group_col='version',
    metric_type='proportion',
    inference_type='conjugate',
    prior_params=None,
    sampling_size=10000,
    control_group='gate_30',
    treatment_group='gate_40',
)

# Retention at day 1
ab_test_retention_1 = ABBayesTest(value_col='retention_1', **retention_test_args)

# Retention at day 7
ab_test_retention_7 = ABBayesTest(value_col='retention_7', **retention_test_args)

## Fit the tests
# -----------------------
ab_test_retention_1.fit()
ab_test_retention_7.fit()

## Posterior summary (95% interval)
# -----------------------
posterior_summary_retention_1 = ab_test_retention_1.results(ci=0.95)
posterior_summary_retention_7 = ab_test_retention_7.results(ci=0.95)

## Posterior distributions
# -----------------------
posterior_samples_retention_1 = ab_test_retention_1.get_distributions()
posterior_samples_retention_7 = ab_test_retention_7.get_distributions()

## Lift summary
# -----------------------
lift_summary_retention_1 = ab_test_retention_1.lift_summary()
lift_summary_retention_7 = ab_test_retention_7.lift_summary()

## Compute lift samples: relative change of treatment (gate_40) vs control (gate_30)
# -----------------------
lift_samples_retention_1 = (
    posterior_samples_retention_1['gate_40'] / posterior_samples_retention_1['gate_30']
) - 1
lift_samples_retention_7 = (
    posterior_samples_retention_7['gate_40'] / posterior_samples_retention_7['gate_30']
) - 1

## Instantiate A/B test for Total Rounds Played
# --------------------------------------------
# The outcome is the log1p-transformed round count, modelled as a mean.
total_rounds_test_args = dict(
    df=df_checks,
    group_col='version',
    value_col='sum_gamerounds_log',
    metric_type='mean',
    inference_type='conjugate',
    prior_params=None,
    sampling_size=10000,
    control_group='gate_30',
    treatment_group='gate_40',
)
ab_test_total_rounds = ABBayesTest(**total_rounds_test_args)

## Fit the test
# --------------------------------------------
ab_test_total_rounds.fit()

## Posterior summary (95% interval)
# --------------------------------------------
posterior_summary_total_rounds = ab_test_total_rounds.results(ci=0.95)

## Posterior distributions
# --------------------------------------------
posterior_samples_total_rounds = ab_test_total_rounds.get_distributions()

## Lift summary
# --------------------------------------------
lift_summary_total_rounds = ab_test_total_rounds.lift_summary()

## Compute lift samples
# --------------------------------------------
# NOTE: the ratio is taken on the log1p scale, so this is the relative change
# in mean log1p(rounds), not in the raw number of rounds.
rounds_treatment_samples = posterior_samples_total_rounds['gate_40']
rounds_control_samples = posterior_samples_total_rounds['gate_30']
lift_samples_total_rounds = rounds_treatment_samples / rounds_control_samples - 1

# Day-1 retention results.
# Bare expression: the table is rendered only when this is the last statement
# of a notebook cell; run as a plain script the line has no visible effect.
pd.DataFrame(posterior_summary_retention_1)

# Posterior samples of both groups: histograms, then KDEs on a shared plot.
plot_histograms_data_1vsdata_2(posterior_samples_retention_1)
plot_kde_data1vsdata_2_same_plot(posterior_samples_retention_1)

# Lift summary as indented JSON.
print(json.dumps(lift_summary_retention_1, indent=4))

{
    "mean_lift": -0.011313637336259043,
    "std_lift": 0.007222386220802068,
    "prob_treatment_superior": 0.0602
}

# Distribution of the day-1 retention lift (gate_40 relative to gate_30).
plot_histogram_single(lift_samples_retention_1)

# The recommendation text is fixed HTML; it is not derived from the results.
recommendation_retention_1 = (
    "<b>Recommendation:</b> Keep Gate 30 as the active version, as the data does not justify switching to Gate 40."
)
display_recommendation(recommendation_retention_1)

# Day-7 retention results.
# Bare expression: the table is rendered only when this is the last statement
# of a notebook cell; run as a plain script the line has no visible effect.
pd.DataFrame(posterior_summary_retention_7)

# Posterior samples of both groups: histograms, then KDEs on a shared plot.
plot_histograms_data_1vsdata_2(posterior_samples_retention_7)
plot_kde_data1vsdata_2_same_plot(posterior_samples_retention_7)

# Lift summary as indented JSON.
print(json.dumps(lift_summary_retention_7, indent=4))

{
    "mean_lift": -0.040779099852556205,
    "std_lift": 0.013229163782679883,
    "prob_treatment_superior": 0.0013
}

# Distribution of the day-7 retention lift (gate_40 relative to gate_30).
plot_histogram_single(lift_samples_retention_7)

# The recommendation text is fixed HTML; it is not derived from the results.
recommendation_retention_7 = (
    "<b>Recommendation:</b> Keep Gate 30 as the active version, as the data does not justify switching to Gate 40."
)
display_recommendation(recommendation_retention_7)

# Total-rounds results; all values are on the log1p(sum_gamerounds) scale.
# Bare expression: the table is rendered only when this is the last statement
# of a notebook cell; run as a plain script the line has no visible effect.
pd.DataFrame(posterior_summary_total_rounds)

# Posterior samples of both groups: histograms, then KDEs on a shared plot.
plot_histograms_data_1vsdata_2(posterior_samples_total_rounds)
plot_kde_data1vsdata_2_same_plot(posterior_samples_total_rounds)

# Lift summary as indented JSON.
print(json.dumps(lift_summary_total_rounds, indent=4))

{
    "mean_lift": -0.004335803906307668,
    "std_lift": 0.0031242245016132168,
    "prob_treatment_superior": 0.0826
}

# Distribution of the lift in mean log1p(rounds) (gate_40 relative to gate_30).
plot_histogram_single(lift_samples_total_rounds)

# The recommendation text is fixed HTML; it is not derived from the results.
recommendation_total_rounds = (
    "<b>Recommendation:</b> Keep Gate 30 as the active version, as the data does not justify switching to Gate 40."
)
display_recommendation(recommendation_total_rounds)

# -----------------------------
# Business Rule: Estimated LTV
# -----------------------------
# Business assumption for the revenue generated by each retained player:
#   $2 from in-app purchases (skips, boosters, etc.)
# + $1 from advertising (ad impressions during waiting or gameplay)
# = $3 in total
ltv_per_player = 3

# Player base used for the projection: every row of the filtered dataset,
# i.e. both versions together.
# NOTE(review): presumably each variant is projected onto this full base —
# confirm in simulate_profit_posterior.
initial_players = df_checks.shape[0]

profit_summary_7d = simulate_profit_posterior(
    posterior_samples_retention_7,
    posterior_summary_retention_7,
    initial_players,
    ltv_per_player=ltv_per_player,
    title="Projected 7-Day Profit Distribution",
)

display(profit_summary_7d)

# ------------------------------------------------------
# A/B Bayesian Test Report Generation
# ------------------------------------------------------
# This script generates both PowerPoint and Beamer (LaTeX) reports
# for the A/B test analysis.
#
# Instructions:
# 1. Navigate to the 'report' folder.
# 2. Execute this script to generate the reports.
# 3. For the Beamer (LaTeX) version, update the author name where needed.
#
# The full pipeline is included below:
# - Data processing and Bayesian modeling
# - Posterior, lift, and summary visualizations
# - PowerPoint slide creation
# - Beamer PDF report generation from existing images

# # =============================
# # Standard library imports
# # =============================
# import os
# import sys
# import warnings
# warnings.filterwarnings("ignore")

# # =============================
# # Third-party imports
# # =============================
# import pandas as pd
# import numpy as np
# import matplotlib
# matplotlib.use('Agg')  # headless backend
# import matplotlib.pyplot as plt
# import seaborn as sns
# from scipy import stats
# import subprocess
# from pdf2image import convert_from_path
# from pptx import Presentation
# from pptx.util import Inches, Pt
# from pptx.enum.text import PP_ALIGN
# from pptx.dml.color import RGBColor
# from ab_bayes_test import ABBayesTest  

# # =============================
# # Local application imports
# # =============================
# current_dir = os.path.dirname(os.path.abspath(__file__))
# project_root = os.path.abspath(os.path.join(current_dir, '..'))
# sys.path.append(project_root)

# from src.helper_functions import (
#     plot_histograms_data_1vsdata_2,
#     plot_histogram_single,
#     plot_kde_data1vsdata_2_same_plot,
#     simulate_profit_posterior
# )
# # -----------------------------
# # PowerPoint Slide Utilities
# # -----------------------------
# def create_graph_slide(prs, title, subtitle, image_path):
#     slide = prs.slides.add_slide(prs.slide_layouts[6])
    
#     # Title
#     textbox = slide.shapes.add_textbox(Inches(0.5), Inches(0.2), Inches(9), Inches(1))
#     tf = textbox.text_frame
#     tf.clear()
    
#     p_title = tf.paragraphs[0]
#     run = p_title.add_run()
#     run.text = title
#     run.font.bold = True
#     run.font.size = Pt(32)
#     run.font.color.rgb = RGBColor(0, 0, 0)
#     p_title.alignment = PP_ALIGN.CENTER

#     # Subtitle
#     if subtitle:
#         p_sub = tf.add_paragraph()
#         run_sub = p_sub.add_run()
#         run_sub.text = subtitle
#         run_sub.font.size = Pt(20)
#         run_sub.font.color.rgb = RGBColor(50, 50, 50)
#         p_sub.alignment = PP_ALIGN.CENTER

#     # Image
#     if image_path:
#         slide.shapes.add_picture(image_path, Inches(0.5), Inches(1.8), width=Inches(9))
    
#     return slide

# def build_presentation(slide_creators, output_file="presentation.pptx"):
#     output_dir = os.path.dirname(output_file)
#     os.makedirs(output_dir, exist_ok=True)
#     prs = Presentation()
#     for func, args in slide_creators:
#         func(prs, **args)
#     prs.save(output_file)
#     print(f"Presentation saved at: {output_file}")

# # -----------------------------
# # LaTeX Slide Utilities
# # -----------------------------
# def create_recommendation_slide_image(value_col, control_group, treatment_group, lift_summary, slides_dir="slides"):
#     os.makedirs(slides_dir, exist_ok=True)
    
#     prob = lift_summary['prob_treatment_superior'] * 100
#     val_col_safe = value_col.replace("_", r"\_")
#     treat_group_safe = treatment_group.replace("_", r"\_")
#     control_group_safe = control_group.replace("_", r"\_")
    
#     path_tex = os.path.join(slides_dir, f"{value_col}_slide_4.tex")
#     path_pdf = os.path.join(slides_dir, f"{value_col}_slide_4.pdf")
#     path_png = os.path.join(slides_dir, f"{value_col}_slide_4.png")

#     latex_content = f"""\\documentclass[16pt]{{article}}
# \\usepackage{{amsmath,amssymb,xcolor}}
# \\usepackage[active,tightpage]{{preview}}
# \\PreviewEnvironment{{center}}
# \\setlength\\parindent{{0pt}}
# \\pagestyle{{empty}}
# \\begin{{document}}
# \\begin{{center}}
# \\textbf{{}}\\\\[1em]
# No statistically significant improvement detected. Maintain current version.\\\\[1em]
# Probability that

# $$
# \\text{{Prob}}(\\text{{Treatment}} > \\text{{Control}}) = {prob:.2f}\\%
# $$

# is far below the 95\\% threshold.
# \\end{{center}}
# \\end{{document}}"""
    
#     with open(path_tex, "w") as f:
#         f.write(latex_content)
    
#     subprocess.run(["pdflatex", "-output-directory", slides_dir, path_tex], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    
#     pages = convert_from_path(path_pdf, dpi=200)
#     pages[0].save(path_png, "PNG")
    
#     return path_png

# def create_conclusion_slide_image(value_col="final", slides_dir="slides"):
#     os.makedirs(slides_dir, exist_ok=True)
    
#     path_tex = os.path.join(slides_dir, f"{value_col}_slide_1.tex")
#     path_pdf = os.path.join(slides_dir, f"{value_col}_slide_1.pdf")
#     path_png = os.path.join(slides_dir, f"{value_col}_slide_1.png")

#     latex_content = rf"""\documentclass[16pt]{{article}}
# \usepackage{{amsmath,amssymb,xcolor}}
# \usepackage[active,tightpage]{{preview}}
# \PreviewEnvironment{{center}}
# \setlength\parindent{{0pt}}
# \pagestyle{{empty}}
# \begin{{document}}
# \begin{{center}}
# \textbf{{Final Recommendation}} \\[1em]
# After reviewing all metrics, including retention on Day 1 and Day 7,
# engagement through total rounds, and overall business performance, 
# the results show no consistent evidence that the treatment version delivers better outcomes. \\[1em]
# The recommendation is to keep the current control version and continue monitoring performance 
# over the next cycles.
# \end{{center}}
# \end{{document}}"""

#     with open(path_tex, "w") as f:
#         f.write(latex_content)
    
#     subprocess.run(["pdflatex", "-output-directory", slides_dir, path_tex], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    
#     pages = convert_from_path(path_pdf, dpi=200)
#     pages[0].save(path_png, "PNG")
    
#     return path_png

# # -----------------------------
# # Data Preparation
# # -----------------------------
# df_raw = pd.read_csv(os.path.join(project_root, "data", "cookie_cats.csv"))
# df_raw['userid'] = df_raw['userid'].astype(str)
# df_checks = df_raw.copy()
# df_checks['sum_gamerounds_log'] = np.log1p(df_checks['sum_gamerounds'])
# df_checks = df_checks.query("sum_gamerounds > 0")

# # -----------------------------
# # Slide Generation
# # -----------------------------
# def generate_all_slide_images(
#     df=df_checks,
#     group_col='version',
#     value_col='retention_1',
#     metric_type='proportion',
#     inference_type='conjugate',
#     prior_params=None,
#     sampling_size=10000,
#     control_group='gate_30',
#     treatment_group='gate_40',
#     alpha: float = 0.05,
#     slides_dir="slides"
# ):
#     paths = []

#     # Fit Bayesian model
#     ab_test = ABBayesTest(
#         df=df,
#         group_col=group_col,
#         value_col=value_col,
#         metric_type=metric_type,
#         inference_type=inference_type,
#         prior_params=prior_params,
#         sampling_size=sampling_size,
#         control_group=control_group,
#         treatment_group=treatment_group
#     )
#     ab_test.fit()

#     # Posterior distributions and lift
#     posterior_samples = ab_test.get_distributions()
#     lift_samples = posterior_samples[treatment_group] / posterior_samples[control_group] - 1

#     posterior_summary = ab_test.results(1 - alpha)
#     lift_summary = ab_test.lift_summary()

#     df_post = pd.DataFrame(posterior_summary).T.reset_index().rename(columns={'index': 'metric'})
#     df_lift = pd.DataFrame([lift_summary]).T.reset_index().rename(columns={'index':'metric', 0:'value'})

#     # Posterior histogram
#     path1 = f"{slides_dir}/{value_col}_slide_0.png"
#     plot_histograms_data_1vsdata_2(posterior_samples, path_to_save=path1)
#     paths.append(path1)

#     # Posterior KDE
#     path2 = f"{slides_dir}/{value_col}_slide_1.png"
#     plot_kde_data1vsdata_2_same_plot(posterior_samples, path_to_save=path2)
#     paths.append(path2)

#     # Lift distribution
#     path3 = f"{slides_dir}/{value_col}_slide_2.png"
#     plot_histogram_single(lift_samples, path_to_save=path3)
#     paths.append(path3)

#     # Summary tables
#     path4 = f"{slides_dir}/{value_col}_slide_3.png"
#     fig, axes = plt.subplots(2, 1, figsize=(6, 4))

#     axes[0].axis('off')
#     table_post = axes[0].table(
#         cellText=df_post.round(4).values,
#         colLabels=df_post.columns,
#         cellLoc='center',
#         loc='center'
#     )
#     table_post.auto_set_font_size(False)
#     table_post.set_fontsize(9)
#     table_post.scale(1.2, 1.2)

#     axes[1].axis('off')
#     table_lift = axes[1].table(
#         cellText=df_lift.round(4).values,
#         colLabels=df_lift.columns,
#         cellLoc='center',
#         loc='center'
#     )
#     table_lift.auto_set_font_size(False)
#     table_lift.set_fontsize(9)
#     table_lift.scale(1.2, 1.2)

#     plt.tight_layout()
#     plt.savefig(path4, bbox_inches='tight')
#     plt.close()
#     paths.append(path4)

#     # Recommendation slide
#     path5 = create_recommendation_slide_image(
#         value_col=value_col,
#         control_group=control_group,
#         treatment_group=treatment_group,
#         lift_summary=lift_summary
#     )
#     paths.append(path5)

#     # Optional 7-day profit projection
#     path6 = f"{slides_dir}/final_slide_0.png"
#     if value_col == 'retention_7':
#         simulate_profit_posterior(
#             posterior_samples=posterior_samples,
#             posterior_summary=posterior_summary,
#             initial_players=len(df_checks),
#             ltv_per_player=3,
#             title="Projected 7-Day Profit Distribution",
#             path_to_save=path6
#         )
#         paths.append(path6)

#     return paths

# # -----------------------------
# # Generate Slides & PowerPoint
# # -----------------------------
# slides_to_create = {}
# variables = ["retention_1", "retention_7", "sum_gamerounds_log"]

# for var in variables:
#     if "retention" in var:
#         day = "1" if var == "retention_1" else "7"
#         title_prefix = f"Retention Day {day}"
#         metric_type = "proportion"
#     else:
#         title_prefix = "Total Rounds"
#         metric_type = "mean"

#     slides = generate_all_slide_images(
#         df=df_checks,
#         value_col=var,
#         metric_type=metric_type,
#         slides_dir='slides'
#     )

#     slides_to_create[var] = [
#         (create_graph_slide, {"title": title_prefix, "subtitle":"Posterior Distribution", "image_path": slides[0]}),
#         (create_graph_slide, {"title": title_prefix, "subtitle":"Posterior Distribution", "image_path": slides[1]}),
#         (create_graph_slide, {"title": title_prefix, "subtitle":"Lift Distribution", "image_path": slides[2]}),
#         (create_graph_slide, {"title": title_prefix, "subtitle":"Summary", "image_path": slides[3]}),
#         (create_graph_slide, {"title": title_prefix, "subtitle":"Recommendations", "image_path": slides[4]})
#     ]

# # Conclusion slide
# create_conclusion_slide_image()
# slides_to_create["final"] = [
#     (create_graph_slide, {"title":"Business Performance", "subtitle":"Posterior Distribution", "image_path":"slides/final_slide_0.png"}),
#     (create_graph_slide, {"title":"Conclusion", "subtitle":"", "image_path":"slides/final_slide_1.png"})
# ]

# # Combine all slides
# slides_to_create_final = (
#     slides_to_create["retention_1"] +
#     slides_to_create["retention_7"] +
#     slides_to_create["sum_gamerounds_log"] +
#     slides_to_create["final"]
# )

# # Generate PowerPoint
# build_presentation(slides_to_create_final, output_file="powerpoint/ab_test_full_presentation.pptx")

# # -----------------------------
# # LaTeX Beamer Report
# # -----------------------------
# import re

# def latex_escape(s: str) -> str:
#     replacements = {
#         '\\': r'\textbackslash{}',
#         '{': r'\{',
#         '}': r'\}',
#         '$': r'\$',
#         '&': r'\&',
#         '#': r'\#',
#         '_': r'\_',
#         '%': r'\%',
#         '~': r'\textasciitilde{}',
#         '^': r'\textasciicircum{}',
#     }
#     pattern = re.compile('|'.join(re.escape(key) for key in replacements.keys()))
#     return pattern.sub(lambda x: replacements[x.group()], s)

# def build_beamer_report_from_existing_images(slides_dict, slides_dir="pdf", output_file="ab_test_report.pdf"):
#     tex_path = os.path.join(slides_dir, "ab_test_report.tex")
#     os.makedirs(slides_dir, exist_ok=True)

#     tex_lines = [
#     r"\documentclass{beamer}",
#     r"\usetheme{Singapore}",
#     r"\usecolortheme{seahorse}",
#     r"\useinnertheme{rectangles}",
#     r"\useoutertheme{miniframes}",
#     r"\usepackage{graphicx,amsmath,amssymb,xcolor,url}",
#     r"\title{AB Test Report}",
#     r"\author{Thiago Guimarães Santos \\ thiago.guimaraes.sto@gmail.com \\ linkedin.com/in/thiagogsdsa}",
#     r"\date{\today}",
#     r"\begin{document}",
#     r"\frame{\titlepage}"
#  ]

#     for var, image_paths in slides_dict.items():
#         slide_titles = [
#             "Posterior Distribution",
#             "Posterior KDE",
#             "Lift Distribution",
#             "Summary",
#             "Recommendation"
#         ]
#         for title, img_path in zip(slide_titles, image_paths):
#             tex_lines.append(rf"\begin{{frame}}{{{latex_escape(var)} - {latex_escape(title)}}}")
#             if img_path.endswith(".tex"):
#                 tex_lines.append(rf"\input{{{img_path}}}")
#             else:
#                 tex_lines.append(rf"\includegraphics[width=\textwidth]{{{latex_escape(img_path)}}}")
#             tex_lines.append(r"\end{frame}")

#     tex_lines.append(r"\end{document}")

#     with open(tex_path, "w") as f:
#         f.write("\n".join(tex_lines))

#     subprocess.run(["pdflatex", "-output-directory", slides_dir, tex_path], check=True)

#     final_pdf_path = os.path.join(slides_dir, output_file)
#     print(f"PDF Beamer report generated at: {final_pdf_path}")
#     return final_pdf_path


# slides_dict = {
#     "retention_1": [
#         "slides/retention_1_slide_0.png",
#         "slides/retention_1_slide_1.png",
#         "slides/retention_1_slide_2.png",
#         "slides/retention_1_slide_3.png",
#         "slides/retention_1_slide_4.png",
#         "slides/final_slide_0.png",
#         "slides/final_slide_1.png"

#     ],
#     "retention_7": [
#         "slides/retention_7_slide_0.png",
#         "slides/retention_7_slide_1.png",
#         "slides/retention_7_slide_2.png",
#         "slides/retention_7_slide_3.png",
#         "slides/retention_7_slide_4.png",
#         "slides/final_slide_0.png",
#         "slides/final_slide_1.png"
#     ],
#     "sum_gamerounds_log": [
#         "slides/sum_gamerounds_log_slide_0.png",
#         "slides/sum_gamerounds_log_slide_1.png",
#         "slides/sum_gamerounds_log_slide_2.png",
#         "slides/sum_gamerounds_log_slide_3.png",
#         "slides/sum_gamerounds_log_slide_4.png",
#         "slides/final_slide_0.png",
#         "slides/final_slide_1.png"
#     ]
# }

# pdf_beamer = build_beamer_report_from_existing_images(slides_dict, slides_dir="pdf_beamer")

#!/bin/bash
# ------------------------------------------------------
# Convert a Jupyter Notebook to an HTML report
# with embedded images, and move it to the project root
# ------------------------------------------------------
# Usage:
#   Execute this script from the folder containing the notebook
#   or adjust NOTEBOOK_PATH accordingly.

# Path to the notebook to convert
# NOTEBOOK="../notebooks/1.0.tgs.cookies_cats_ab_test.ipynb"

# # Output HTML filename
# OUTPUT="index.html"

# # Project root where the HTML will be saved
# PROJECT_ROOT="../"

# # --- Convert the notebook to HTML ---
# jupyter nbconvert \
#     --to html \
#     --execute \
#     --embed-images \
#     "$NOTEBOOK" \
#     --output "$OUTPUT" \
#     --output-dir="$PROJECT_ROOT"

# echo "Notebook converted successfully!"
# echo "HTML report available at: $PROJECT_ROOT/$OUTPUT"

	Value	Count	Percent
userid
9999861	9999861	1	0.0
116	116	1	0.0
337	337	1	0.0
377	377	1	0.0
483	483	1	0.0

	Num NAs	Percent NAs	Num unique	Data Type
sum_gamerounds_log	0	0.0	942	float64
sum_gamerounds_boxcox	0	0.0	942	float64

	attribute	mean	median	std	min	max	range	skew	kurtosis	IQR	lower_bound	upper_bound	num_outliers	prop_outliers_%	pct_2	pct_5	pct_10	pct_25	pct_50	pct_75	pct_90	pct_95	pct_98
0	sum_gamerounds_log	2.88	2.83	1.50	0.0	10.82	10.82	0.10	-0.58	2.16	-1.45	7.19	41	0.05	0.0	0.69	0.69	1.79	2.83	3.95	4.91	5.40	5.9
1	sum_gamerounds_boxcox	2.73	2.72	1.38	0.0	9.23	9.23	0.01	-0.62	1.98	-1.23	6.69	12	0.01	0.0	0.69	0.69	1.74	2.72	3.72	4.56	4.98	5.4

	sum_gamerounds_log	sum_gamerounds_boxcox
0	1.386294	1.357632
1	3.663562	3.467873
2	5.111988	4.736380
3	0.693147	0.685932
4	5.192957	4.805665

Step	Description
Input	- Business problem and hypotheses - A/B test dataset (control vs experiment) - Metrics of interest: retention - Prior assumptions for Bayesian models (weakly informative or historical priors)
Procedure	- Clean and prepare dataset - Validate random assignment of players - Fit Bayesian models for retention, engagement - Retention: Beta-Binomial model - Engagement/revenue: Normal or log-Normal model - Compute posterior distributions and credible intervals - Calculate probability of improvement for each metric - Visualize posteriors, credible intervals, and probability of lift
Result	- Posterior distributions for retention, engagement, and revenue - Probability that moving the gate increases retention or revenue - Credible intervals for metric differences - Recommendations for gate placement based on Bayesian evidence

	userid	version	sum_gamerounds	retention_1	retention_7
0	116	gate_30	3	False	False
1	337	gate_30	38	True	False
2	377	gate_40	165	True	False
3	483	gate_40	1	False	False
4	488	gate_40	179	True	True
5	540	gate_40	187	True	True
6	1066	gate_30	0	False	False
7	1444	gate_40	2	False	False
8	1574	gate_40	108	True	True
9	1587	gate_40	153	True	False
10	1842	gate_40	3	False	True

	Num unique	Data Type
userid	90189	object
version	2	object
sum_gamerounds	942	int64
retention_1	2	bool
retention_7	2	bool

	gate_30	gate_40
mean	0.467495	0.462194
std	0.002403	0.002407
ci_lower	0.462782	0.457474
ci_upper	0.472218	0.466880

	gate_30	gate_40
mean	0.198466	0.190354
std	0.001931	0.001861
ci_lower	0.194701	0.186787
ci_upper	0.202244	0.194024

	gate_30	gate_40
mean	3.019626	3.006518
std	0.006759	0.006699
ci_lower	3.006546	2.993136
ci_upper	3.033027	3.019671

	gate_30	gate_40
mean	51320.211149	49222.758320
std	499.437825	481.106007
ci_lower	50346.805742	48300.335298
ci_upper	52297.197771	50171.824330

Data Science Project¶

BAYESIAN A/B TESTING METHODOLOGY (OSEMN)¶

Step 0: Obtain (Data + Business Understanding)¶

Background¶

What is the company?¶

What does the company do?¶

How does the company make money?¶

Example:¶

What challenge is the company facing?¶

Step 1: Scrub (Data Understanding)¶

Imports¶

Load Data¶

Converting userid to a string¶

Data Description¶

EDA¶

Data Quality Checks¶

Restore DataFrame Copy¶

Box plot and distribution of sum_gamerounds¶

BoxCox Transformation on sum_gamerounds¶

$\log$ Transformation on sum_gamerounds¶

Distribution of sum_gamerounds_log and sum_gamerounds_boxcox¶

Data Quality Checks¶

Remove players with zero game rounds¶

Check dimension¶

Distributions of retention rates for each group¶

Distributions of retention rates above/below mean for each group¶

Retention Rate for Each Group¶

Bad Users in Both Groups¶

Step 2: Explore (Analysis Plan)¶

Objective¶

Step 3: Model (Procedure / IPR)¶

Retention Metrics¶

Total Rounds Played¶

Step 4: Interpret (Evaluation & Reporting)¶

Retention Day 1 Results¶

Posterior summary¶

Posterior distributions¶

Lift summary¶

Lift distributions¶

Recommendations for Retention Day 1¶

Retention Day 7 Results¶

Posterior summary¶

Posterior distributions¶

Lift summary¶

Lift distributions¶

Recommendations for Retention Day 7¶

Total Rounds Results¶

Posterior summary¶

Posterior distributions¶

Lift summary¶

Lift distributions¶

Recommendations¶

Business Performance¶

Conclusion¶

Deliverable¶

PDF¶

HTML¶

Box plot and distribution of `sum_gamerounds`¶

BoxCox Transformation on `sum_gamerounds`¶

$\log$ Transformation on `sum_gamerounds`¶

Distribution of `sum_gamerounds_log` and `sum_gamerounds_boxcox`¶