import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import plotly.express as px
from scipy import stats
warnings.filterwarnings('ignore')

# Load the raw recipes table from disk and preview its first rows.
raw_df = pd.read_csv("data/00-raw/RAW_recipes.csv")
raw_df.head()

# (rows, columns) of the raw table.
raw_df.shape

(231637, 12)

# Column dtypes of the raw table.
raw_df.dtypes

name              object
id                 int64
minutes            int64
contributor_id     int64
submitted         object
tags              object
nutrition         object
n_steps            int64
steps             object
description       object
ingredients       object
n_ingredients      int64
dtype: object

# Keep only the three columns used downstream. The explicit ``.copy()`` makes
# ``processed_df`` an independent frame, so any later column assignment on it
# does not go through a slice of ``raw_df`` (the SettingWithCopyWarning case).
processed_df = raw_df[['tags', 'nutrition', 'ingredients']].copy()
processed_df.head()

# Missing-value count per retained column.
processed_df.isnull().sum()

tags           0
nutrition      0
ingredients    0
dtype: int64

def to_list(str):
    """Split a stringified list such as ``"['a', 'b']"`` into a list of strings.

    Surrounding brackets and every single/double quote character are removed,
    the remainder is split on commas, and each piece is whitespace-stripped.
    Elements are always returned as strings (numbers are not converted).

    Args:
        str: Text representation of a flat list. The parameter name shadows
            the builtin ``str``; it is kept so keyword callers keep working.

    Returns:
        list[str]: The parsed elements, or ``[]`` when the brackets enclose
        nothing (e.g. ``"[]"``).
    """
    inner = str.strip("[]").replace("'", "").replace('"', "").strip()
    if not inner:
        # "".split(",") would yield [""], i.e. a bogus single empty element.
        return []
    return [elem.strip() for elem in inner.split(",")]

# Parse the stringified ``tags`` and ``nutrition`` columns into Python lists.
for list_col in ("tags", "nutrition"):
    processed_df[list_col] = processed_df[list_col].apply(to_list)
processed_df

# One boolean per recipe: True when its tag list contains the exact tag 'desserts'.
is_dessert_list = ['desserts' in tag_sublist for tag_sublist in processed_df['tags']]

processed_df['is_dessert'] = is_dessert_list
processed_df

# Keep dessert rows only, drop the helper columns, and renumber rows from 0.
processed_df = (
    processed_df.loc[processed_df['is_dessert']]
    .drop(columns=['tags', 'is_dessert'])
    .reset_index(drop=True)
)
processed_df

# Case-insensitive substring flags on the ingredients text.
# NOTE(review): this is substring matching, so any ingredient whose name merely
# contains the word (e.g. "honeydew") also sets the flag.
for sweetener in ("honey", "sugar"):
    processed_df[f"contains_{sweetener}"] = processed_df["ingredients"].str.contains(sweetener, case=False)
processed_df

# A recipe is "only honey" when it mentions honey but not sugar, and
# "only sugar" in the mirrored case. Assigning the boolean masks directly keeps
# both columns as real bool dtype: there are no NaN placeholders to fill and no
# reliance on ``fillna`` downcasting an object column back to bool.
only_honey_mask = (processed_df["contains_honey"] == True) & (processed_df["contains_sugar"] == False)
only_sugar_mask = (processed_df["contains_honey"] == False) & (processed_df["contains_sugar"] == True)
processed_df["only_honey"] = only_honey_mask
processed_df["only_sugar"] = only_sugar_mask

# The intermediate contains_* flags are no longer needed.
processed_df = processed_df.drop(columns=["contains_honey", "contains_sugar"])
processed_df

# Keep rows where exactly one of the two flags is set (honey XOR sugar),
# then renumber rows from 0.
exactly_one_sweetener = processed_df["only_honey"] != processed_df["only_sugar"]
processed_df = processed_df[exactly_one_sweetener].reset_index(drop=True)
processed_df

# The ingredient text is not needed once the flags exist.
processed_df = processed_df.drop(columns='ingredients')
processed_df

# Names for the positional entries of each parsed nutrition list.
col_names = [
    "calories",
    "total_fat (%DV)",
    "sugar (%DV)",
    "sodium (%DV)",
    "protein (%DV)",
    "sat_fat (%DV)",
    "carbs (%DV)",
]

# Expand the lists into one column per entry and convert the strings to floats.
nutrition_df = (
    pd.DataFrame(processed_df["nutrition"].tolist(), columns=col_names)
    .apply(pd.to_numeric)
    .astype(float)
)
nutrition_df

# Attach the numeric nutrition columns by row label: both frames carry the same
# default 0..n-1 index at this point, so an index join lines the rows up.
processed_df = processed_df.drop(columns=["nutrition"]).join(nutrition_df, how='outer')
processed_df.index.name = 'dessert_id'
processed_df

# Mean / median / standard deviation of every nutrition column, per sweetener group.
var_cols = ['calories', 'total_fat (%DV)', 'sugar (%DV)', 'sodium (%DV)', 'protein (%DV)', 'sat_fat (%DV)', 'carbs (%DV)']
processed_df.groupby(['only_honey','only_sugar'])[var_cols].agg(['mean','median','std'])

# Human-readable group label derived from the boolean flag (vectorised instead
# of a Python-level call per row).
processed_df['primary_sweetener'] = np.where(processed_df['only_honey'], 'Honey', 'Sugar')

# Create the figure *before* drawing so the requested size applies to the
# boxplot; calling plt.figure() after plotting only opens a second, empty figure.
plt.figure(figsize=(6,4))
sns.boxplot(x='primary_sweetener', y='calories', data=processed_df)
plt.yscale('log')
plt.title('Calories Distribution (log scale): Honey vs Sugar Desserts')
plt.xlabel('Primary Sweetener')
plt.ylabel('Calories')
plt.show()

<Figure size 600x400 with 0 Axes>

# Quartiles of calories, computed over all remaining desserts pooled together.
Q1, Q3 = processed_df['calories'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# NOTE: despite their names these are the lower/upper outlier fences
# (Q1 - 1.5*IQR and Q3 + 1.5*IQR), not quartiles.
lower_quartile = Q1 - 1.5 * IQR
upper_quartile = Q3 + 1.5 * IQR

# Keep rows whose calories lie inside the fences (both ends inclusive).
processed_df = processed_df[processed_df['calories'].between(lower_quartile, upper_quartile)]
processed_df

# Create the figure first so figsize applies to this boxplot; calling
# plt.figure() after plotting would only open a second, empty figure.
plt.figure(figsize=(6,4))
sns.boxplot(x='primary_sweetener', y='calories', data=processed_df)
plt.title('Calories: Honey vs Sugar Desserts (Outliers Removed)')
plt.xlabel('Primary Sweetener')
plt.ylabel('Calories')
plt.show()

<Figure size 600x400 with 0 Axes>

# Column dtypes of the cleaned frame.
processed_df.dtypes

only_honey              bool
only_sugar              bool
calories             float64
total_fat (%DV)      float64
sugar (%DV)          float64
sodium (%DV)         float64
protein (%DV)        float64
sat_fat (%DV)        float64
carbs (%DV)          float64
primary_sweetener     object
dtype: object

# How many columns there are of each dtype.
processed_df.dtypes.value_counts()

float64    7
bool       2
object     1
Name: count, dtype: int64

# Number of recipes in each sweetener group.
processed_df['primary_sweetener'].value_counts()

primary_sweetener
Sugar    30236
Honey      750
Name: count, dtype: int64

# Persist the cleaned frame. No ``index=False`` is passed, so the index
# (named 'dessert_id' earlier) is written out as a CSV column as well.
processed_df.to_csv("data/02-processed/desserts.csv")

# Reload the cleaned desserts table from disk.
desserts = pd.read_csv('data/02-processed/desserts.csv')

# Fail fast when a column the EDA relies on is absent.
required_cols = ['primary_sweetener', 'calories', 'sugar (%DV)', 'total_fat (%DV)']
missing = [name for name in required_cols if name not in desserts.columns]
if missing:
    raise ValueError(f'Missing columns in cleaned data: {missing}')

# EDA subset: rows labelled Honey or Sugar with all three metrics present.
metrics = ['calories', 'sugar (%DV)', 'total_fat (%DV)']
eda_df = (
    desserts.loc[desserts['primary_sweetener'].isin(['Honey', 'Sugar'])]
    .copy()
    .dropna(subset=[*metrics, 'primary_sweetener'])
)

print(f'Rows in EDA subset: {len(eda_df):,}')
display(eda_df.head())

Rows in EDA subset: 30,986

# Group sizes.
print('Group counts:')
print(eda_df['primary_sweetener'].value_counts())

# Full describe() table per sweetener group, rounded to two decimals.
print('\nDescriptive statistics by sweetener group:')
group_summary = eda_df.groupby('primary_sweetener')[metrics].describe()
display(group_summary.round(2))

# Per-group medians of the same metrics.
median_compare = eda_df.groupby('primary_sweetener')[metrics].median()
median_compare = median_compare.rename_axis('primary_sweetener').round(2)

print('\nMedian comparison (Honey vs Sugar):')
display(median_compare)

Group counts:
primary_sweetener
Sugar    30236
Honey      750
Name: count, dtype: int64

Descriptive statistics by sweetener group:

Median comparison (Honey vs Sugar):

# One boxplot panel per metric, Honey on the left and Sugar on the right.
fig, axes = plt.subplots(1, 3, figsize=(16, 4.8))

for panel, metric in zip(axes, metrics):
    sns.boxplot(
        data=eda_df,
        x='primary_sweetener',
        y=metric,
        order=['Honey', 'Sugar'],
        palette='Set2',
        ax=panel,
    )
    panel.set(
        title=f'{metric} by Primary Sweetener',
        xlabel='Primary Sweetener',
        ylabel=metric,
    )

plt.tight_layout()
plt.show()

# Overlaid density histograms (with KDE) per metric; common_norm=False
# normalises each sweetener group on its own.
fig, axes = plt.subplots(1, 3, figsize=(16, 4.8))

for panel, metric in zip(axes, metrics):
    sns.histplot(
        data=eda_df,
        x=metric,
        hue='primary_sweetener',
        hue_order=['Honey', 'Sugar'],
        bins=30,
        stat='density',
        common_norm=False,
        kde=True,
        alpha=0.35,
        ax=panel,
    )
    panel.set(
        title=f'Distribution of {metric}',
        xlabel=metric,
        ylabel='Density',
    )

plt.tight_layout()
plt.show()

# Bar chart of group sizes with the exact count written above each bar.
counts = eda_df['primary_sweetener'].value_counts()

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=counts.index, y=counts.values, palette='Set2', ax=ax)
ax.set(
    title='Number of Recipes by Primary Sweetener',
    xlabel='Primary Sweetener',
    ylabel='Count',
)
for i, v in enumerate(counts.values):
    # Offset the label 100 units above the bar top.
    ax.text(i, v + 100, f'{v:,}', ha='center')
plt.tight_layout()
plt.show()

# Normal QQ plots: one row per sweetener group, one column per metric.
fig, axes = plt.subplots(2, 3, figsize=(16, 8))

for row_axes, group in zip(axes, ['Honey', 'Sugar']):
    group_data = eda_df[eda_df['primary_sweetener'] == group]
    for panel, metric in zip(row_axes, metrics):
        stats.probplot(group_data[metric], dist='norm', plot=panel)
        panel.set_title(f'{group}: {metric}')

plt.suptitle('QQ Plots by Primary Sweetener and Metric', y=1.02)
plt.tight_layout()
plt.show()

# Pairwise correlation of the three metrics, drawn on a fixed [-1, 1] colour scale.
corr = eda_df[metrics].corr()

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    corr,
    ax=ax,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    square=True,
)
ax.set_title('Correlation Between Nutritional Metrics')
plt.tight_layout()
plt.show()

metrics = ['calories', 'sugar (%DV)', 'total_fat (%DV)']
# Metrics tested one-sided (alternative: honey mean > sugar mean); every other
# metric is tested two-sided.
one_tailed_metrics = ('calories', 'total_fat (%DV)')
results = []

for metric in metrics:
    honey = desserts.loc[desserts['primary_sweetener'] == 'Honey', metric].dropna()
    sugar = desserts.loc[desserts['primary_sweetener'] == 'Sugar', metric].dropna()

    # Welch's t-test (no equal-variance assumption); scipy returns the two-sided p.
    t_stat, p_two_tailed = stats.ttest_ind(honey, sugar, equal_var=False)
    if metric not in one_tailed_metrics:
        p_val = p_two_tailed
        test_type = 'two-tailed'
    else:
        # Convert to the right-tailed p-value: half the two-sided p when the
        # statistic points in the hypothesised direction, its complement otherwise.
        p_val = p_two_tailed / 2 if t_stat > 0 else 1 - (p_two_tailed / 2)
        test_type = 'one-tailed'

    # Cohen's d using the pooled standard deviation of both groups.
    n1, n2 = len(honey), len(sugar)
    s1, s2 = honey.var(ddof=1), sugar.var(ddof=1)
    pooled_sd = np.sqrt(((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2))
    cohen_d = (honey.mean() - sugar.mean()) / pooled_sd

    results.append(dict(
        metric=metric,
        test_type=test_type,
        honey_mean=honey.mean(),
        sugar_mean=sugar.mean(),
        t_stat=t_stat,
        p_value=p_val,
        cohen_d=cohen_d,
    ))


welch_results = pd.DataFrame(results).round(4)
welch_results

# Split the cleaned table once by the saved boolean flags instead of
# re-filtering it for every statistic.
only_sugar_rows = desserts[desserts["only_sugar"] == True]
only_honey_rows = desserts[desserts["only_honey"] == True]

# Observed differences are always (honey mean - sugar mean).
sugar_cal_mean = only_sugar_rows["calories"].mean()
honey_cal_mean = only_honey_rows["calories"].mean()
obs_diff_means_cal = honey_cal_mean - sugar_cal_mean
print(f"Observed Difference In Mean Calories Between Honey and Sugar: {obs_diff_means_cal}")

sugar_sugar_mean = only_sugar_rows["sugar (%DV)"].mean()
honey_sugar_mean = only_honey_rows["sugar (%DV)"].mean()
obs_diff_means_sugar = honey_sugar_mean - sugar_sugar_mean
print(f"Observed Difference In Mean Sugar Between Honey and Sugar: {obs_diff_means_sugar}")

sugar_fat_mean = only_sugar_rows["total_fat (%DV)"].mean()
honey_fat_mean = only_honey_rows["total_fat (%DV)"].mean()
obs_diff_means_fat = honey_fat_mean - sugar_fat_mean
print(f"Observed Difference In Mean Fat Between Honey and Sugar: {obs_diff_means_fat}")

Observed Difference In Mean Calories Between Honey and Sugar: -87.85571865767074
Observed Difference In Mean Sugar Between Honey and Sugar: -20.300498390439657
Observed Difference In Mean Fat Between Honey and Sugar: -8.759690523437845

def diff_in_means(y, group):
    """Return the mean of ``y`` inside the group minus the mean outside it.

    Args:
        y: Array-like of numeric observations.
        group: Array-like of membership labels aligned with ``y``. Values are
            coerced to bool, so 0/1 labels work the same as False/True.

    Returns:
        ``y[group].mean() - y[~group].mean()``.
    """
    y = np.asarray(y)
    # Coerce to bool: an integer 0/1 array would otherwise be treated as
    # positional indices by ``y[group]`` and ``~group`` would turn into -1/-2.
    group = np.asarray(group, dtype=bool)
    return y[group].mean() - y[~group].mean()


def permutation_test_diff_means(y, group, repetitions=10000, seed=None):
    """Build a null distribution of the difference in means by shuffling labels.

    Args:
        y: Array-like of numeric observations.
        group: Array-like of membership labels aligned with ``y`` (coerced to bool).
        repetitions: Number of label shuffles to draw.
        seed: Optional seed for a dedicated random generator, making the run
            reproducible. When ``None`` (the default) the shuffles come from
            NumPy's global random state.

    Returns:
        list: One ``diff_in_means`` value per shuffle (``repetitions`` entries).
    """
    y = np.asarray(y)
    group = np.asarray(group, dtype=bool)
    if seed is None:
        permute = np.random.permutation
    else:
        permute = np.random.default_rng(seed).permutation
    return [diff_in_means(y, permute(group)) for _ in range(repetitions)]

# Boolean honey-membership labels, shuffled inside each permutation run.
honey_labels = desserts["only_honey"].values.astype(bool)

perm_diff_calories = np.array(permutation_test_diff_means(desserts["calories"].values, honey_labels))
perm_diff_sugar = np.array(permutation_test_diff_means(desserts["sugar (%DV)"].values, honey_labels))
perm_diff_fat = np.array(permutation_test_diff_means(desserts["total_fat (%DV)"].values, honey_labels))

# Calories and fat: right-tailed p-value (share of shuffles at least as large as
# the observed difference). Sugar: two-sided, comparing absolute differences.
p_val_calories = (perm_diff_calories >= obs_diff_means_cal).mean()
p_val_sugar = (np.abs(perm_diff_sugar) >= np.abs(obs_diff_means_sugar)).mean()
p_val_fat = (perm_diff_fat >= obs_diff_means_fat).mean()

# One panel per metric: the permuted null distribution with the observed
# difference marked in red.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
perm_data = [
    (perm_diff_calories, obs_diff_means_cal, 'Calories', p_val_calories),
    (perm_diff_sugar, obs_diff_means_sugar, 'Sugar (%DV)', p_val_sugar),
    (perm_diff_fat, obs_diff_means_fat, 'Total Fat (%DV)', p_val_fat),
]

for ax, (perm, obs, label, pval) in zip(axes, perm_data):
    ax.hist(perm, bins=30, edgecolor='black', color='steelblue', alpha=0.8)
    ax.axvline(obs, color='red', linewidth=2, label=f'Obs. Difference = {obs:0.2f}')
    if label == "Sugar (%DV)":
        # Only this panel also marks the mirrored value -obs.
        ax.axvline(-obs, color='red', linestyle='--', label=f'Mirror = {-obs:0.2f}')
    p_label = f'= {pval:.2f}'
    ax.set_title(f'{label}\np-value {p_label}')
    ax.set_xlabel('Permuted Difference in Means')
    ax.set_ylabel('Frequency')
    ax.legend()

plt.suptitle('Permutation Test: Null Distributions vs Observed Differences',
             fontsize=13, y=1.02)
plt.tight_layout()
plt.show()

# Reference daily values used to express absolute amounts as a percentage (%DV).
# NOTE(review): presumably the FDA daily values for calories (kcal), sugar (g)
# and fat (g) — confirm against the cited source.
fda_dv_calories = 2000
fda_dv_sugar_g = 50
# NOTE(review): the name looks like a typo for "fat"; it is not referenced
# anywhere else in the visible code.
fda_dv_feat_g = 78
# Multiplier applied to the subtracted (second) quantity in the adjusted scenario.
sweetness_factor = 1.25

# Theoretical difference expressed as %DV, without any sweetness adjustment.
# NOTE(review): 64/48 and 17/13 are presumably per-serving calories and grams of
# sugar for honey vs. sugar — the source of these figures is not shown; confirm.
theoretical_unadj = {'calories (%DV)': ((64-48)/fda_dv_calories)*100, 
                     'sugar (%DV)': ((17-13)/fda_dv_sugar_g)*100,
                     'total_fat (%DV)': 0.0}

# Same comparison after scaling the second quantity by ``sweetness_factor``.
theoretical_adj = {'calories (%DV)': ((64-(48*sweetness_factor))/fda_dv_calories)* 100,
                   'sugar (%DV)': ((17-(13*sweetness_factor))/fda_dv_sugar_g)*100,
                   'total_fat (%DV)': 0.0}

honey_df = desserts[desserts['primary_sweetener'] == 'Honey']
sugar_df = desserts[desserts['primary_sweetener'] == 'Sugar']

# Observed (honey mean - sugar mean) per dataset column.
mean_gap = {
    column: honey_df[column].mean() - sugar_df[column].mean()
    for column in ('calories', 'sugar (%DV)', 'total_fat (%DV)')
}

# Calories are absolute in the dataset, so they are rescaled to %DV here; the
# other two columns are used as-is.
observed_diff = {
    'calories (%DV)': (mean_gap['calories'] / fda_dv_calories) * 100,
    'sugar (%DV)': mean_gap['sugar (%DV)'],
    'total_fat (%DV)': mean_gap['total_fat (%DV)'],
}

# NOTE: this rebinds the notebook-level ``metrics`` name to the %DV keys, which
# differ from the column names the earlier EDA cells stored under that name.
metrics = ['calories (%DV)', 'sugar (%DV)', 'total_fat (%DV)']
labels = ['Calories (% DV)', 'Sugar (% DV)', 'Total Fat (% DV)']

unadj_vals = [theoretical_unadj[key] for key in metrics]
adj_vals = [theoretical_adj[key] for key in metrics]
observed_vals = [observed_diff[key] for key in metrics]

comparison_df = pd.DataFrame({
    'Metric': labels,
    'Theoretical (Unadjusted)': unadj_vals,
    'Theoretical (Sweetness-Adjusted)': adj_vals,
    'Observed (Dataset)': observed_vals,
})
comparison_df

# One panel per metric: unadjusted theory, sweetness-adjusted theory, observed.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
metric_labels = ['Calories (% DV)', 'Sugar (% DV)', 'Total Fat (% DV)']
metric_keys = ['calories (%DV)', 'sugar (%DV)', 'total_fat (%DV)']
colors = ['#f4a261', '#e9c46a', '#2a9d8f']
bar_labels = ['Theoretical\n(Unadjusted)', 'Theoretical\n(Sweetness-Adj.)', 'Observed\n(Dataset)']

for i, (metric, label) in enumerate(zip(metric_keys, metric_labels)):
    ax = axes[i]
    vals = [theoretical_unadj[metric], theoretical_adj[metric], observed_diff[metric]]
    bars = ax.bar(bar_labels, vals, color=colors, edgecolor='black', width=0.5)
    ax.axhline(0, color='black', linewidth=0.8, linestyle='--')
    ax.set_title(label, fontsize=13)
    # Only the left-most panel carries the shared y-axis label.
    ax.set_ylabel('Difference in % DV (Honey - Sugar)' if i == 0 else '')

    # Pad the y-range by 30% beyond the extremes; fall back to -1 / 1 when no
    # value lies on that side of zero.
    lowest, highest = min(vals), max(vals)
    y_bottom = lowest * 1.3 if lowest < 0 else -1
    y_top = highest * 1.3 if highest > 0 else 1
    ax.set_ylim(y_bottom, y_top)

    for bar in bars:
        height = bar.get_height()
        # Put the value label above positive bars and below negative ones.
        label_offset = 4 if height >= 0 else -12
        ax.annotate(f'{height:.2f}%',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, label_offset),
                    textcoords='offset points',
                    ha='center', fontsize=10)
plt.suptitle('Theoretical vs Observed Nutritional Difference (Normalized to % DV)\n', fontsize=13, y=1.02)
plt.tight_layout()
plt.show()

	name	id	minutes	contributor_id	submitted	tags	nutrition	n_steps	steps	description	ingredients	n_ingredients
0	arriba baked winter squash mexican style	137739	55	47892	2005-09-16	['60-minutes-or-less', 'time-to-make', 'course...	[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]	11	['make a choice and proceed with recipe', 'dep...	autumn is my favorite time of year to cook! th...	['winter squash', 'mexican seasoning', 'mixed ...	7
1	a bit different breakfast pizza	31490	30	26278	2002-06-17	['30-minutes-or-less', 'time-to-make', 'course...	[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]	9	['preheat oven to 425 degrees f', 'press dough...	this recipe calls for the crust to be prebaked...	['prepared pizza crust', 'sausage patty', 'egg...	6
2	all in the kitchen chili	112140	130	196586	2005-02-25	['time-to-make', 'course', 'preparation', 'mai...	[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]	6	['brown ground beef in large pot', 'add choppe...	this modified version of 'mom's' chili was a h...	['ground beef', 'yellow onions', 'diced tomato...	13
3	alouette potatoes	59389	45	68585	2003-04-14	['60-minutes-or-less', 'time-to-make', 'course...	[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]	11	['place potatoes in a large pot of lightly sal...	this is a super easy, great tasting, make ahea...	['spreadable cheese with garlic and herbs', 'n...	11
4	amish tomato ketchup for canning	44061	190	41706	2002-10-25	['weeknight', 'time-to-make', 'course', 'main-...	[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]	5	['mix all ingredients& boil for 2 1 / 2 hours ...	my dh's amish mother raised him on this recipe...	['tomato juice', 'apple cider vinegar', 'sugar...	8

	tags	nutrition	ingredients
0	['60-minutes-or-less', 'time-to-make', 'course...	[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]	['winter squash', 'mexican seasoning', 'mixed ...
1	['30-minutes-or-less', 'time-to-make', 'course...	[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]	['prepared pizza crust', 'sausage patty', 'egg...
2	['time-to-make', 'course', 'preparation', 'mai...	[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]	['ground beef', 'yellow onions', 'diced tomato...
3	['60-minutes-or-less', 'time-to-make', 'course...	[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]	['spreadable cheese with garlic and herbs', 'n...
4	['weeknight', 'time-to-make', 'course', 'main-...	[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]	['tomato juice', 'apple cider vinegar', 'sugar...

	tags	nutrition	ingredients
0	[60-minutes-or-less, time-to-make, course, mai...	[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]	['winter squash', 'mexican seasoning', 'mixed ...
1	[30-minutes-or-less, time-to-make, course, mai...	[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]	['prepared pizza crust', 'sausage patty', 'egg...
2	[time-to-make, course, preparation, main-dish,...	[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]	['ground beef', 'yellow onions', 'diced tomato...
3	[60-minutes-or-less, time-to-make, course, mai...	[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]	['spreadable cheese with garlic and herbs', 'n...
4	[weeknight, time-to-make, course, main-ingredi...	[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]	['tomato juice', 'apple cider vinegar', 'sugar...
...	...	...	...
231632	[ham, 60-minutes-or-less, time-to-make, course...	[415.2, 26.0, 34.0, 26.0, 44.0, 21.0, 15.0]	['celery', 'onion', 'green sweet pepper', 'gar...
231633	[15-minutes-or-less, time-to-make, course, pre...	[14.8, 0.0, 2.0, 58.0, 1.0, 0.0, 1.0]	['paprika', 'salt', 'garlic powder', 'onion po...
231634	[60-minutes-or-less, time-to-make, course, mai...	[59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]	['hard-cooked eggs', 'mayonnaise', 'dijon must...
231635	[30-minutes-or-less, time-to-make, course, pre...	[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]	['butter', 'eagle brand condensed milk', 'ligh...
231636	[30-minutes-or-less, time-to-make, course, pre...	[174.9, 14.0, 33.0, 4.0, 4.0, 11.0, 6.0]	['granulated sugar', 'shortening', 'eggs', 'fl...

	tags	nutrition	ingredients	is_dessert
0	[60-minutes-or-less, time-to-make, course, mai...	[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]	['winter squash', 'mexican seasoning', 'mixed ...	False
1	[30-minutes-or-less, time-to-make, course, mai...	[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]	['prepared pizza crust', 'sausage patty', 'egg...	False
2	[time-to-make, course, preparation, main-dish,...	[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]	['ground beef', 'yellow onions', 'diced tomato...	False
3	[60-minutes-or-less, time-to-make, course, mai...	[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]	['spreadable cheese with garlic and herbs', 'n...	False
4	[weeknight, time-to-make, course, main-ingredi...	[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]	['tomato juice', 'apple cider vinegar', 'sugar...	False
...	...	...	...	...
231632	[ham, 60-minutes-or-less, time-to-make, course...	[415.2, 26.0, 34.0, 26.0, 44.0, 21.0, 15.0]	['celery', 'onion', 'green sweet pepper', 'gar...	False
231633	[15-minutes-or-less, time-to-make, course, pre...	[14.8, 0.0, 2.0, 58.0, 1.0, 0.0, 1.0]	['paprika', 'salt', 'garlic powder', 'onion po...	False
231634	[60-minutes-or-less, time-to-make, course, mai...	[59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]	['hard-cooked eggs', 'mayonnaise', 'dijon must...	False
231635	[30-minutes-or-less, time-to-make, course, pre...	[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]	['butter', 'eagle brand condensed milk', 'ligh...	True
231636	[30-minutes-or-less, time-to-make, course, pre...	[174.9, 14.0, 33.0, 4.0, 4.0, 11.0, 6.0]	['granulated sugar', 'shortening', 'eggs', 'fl...	True

	nutrition	ingredients
0	[4270.8, 254.0, 1306.0, 111.0, 127.0, 431.0, 2...	['chocolate sandwich style cookies', 'chocolat...
1	[734.1, 66.0, 199.0, 10.0, 10.0, 117.0, 28.0]	['vanilla wafers', 'butter', 'powdered sugar',...
2	[232.7, 21.0, 77.0, 4.0, 6.0, 38.0, 8.0]	['butterscotch chips', 'chinese noodles', 'sal...
3	[1663.3, 221.0, 168.0, 66.0, 19.0, 158.0, 29.0]	['all-purpose flour', 'granulated sugar', 'bak...
4	[174.4, 13.0, 67.0, 5.0, 4.0, 26.0, 7.0]	['butter', 'sugar', 'vanilla', 'eggs', 'all-pu...
...	...	...
43198	[561.3, 38.0, 122.0, 2.0, 16.0, 76.0, 25.0]	['all-purpose flour', 'unsalted butter', 'egg'...
43199	[535.0, 29.0, 194.0, 18.0, 15.0, 15.0, 28.0]	['margarine', 'all-purpose flour', 'sugar', 'b...
43200	[56.2, 2.0, 4.0, 1.0, 2.0, 3.0, 3.0]	['sugar', 'active dry yeast', 'milk', 'butter'...
43201	[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]	['butter', 'eagle brand condensed milk', 'ligh...
43202	[174.9, 14.0, 33.0, 4.0, 4.0, 11.0, 6.0]	['granulated sugar', 'shortening', 'eggs', 'fl...

🍯 Sweet but Healthy? Evaluating Honey vs. Sugar in Recipes¶

Abstract¶

Research Question¶

Background and Prior Work¶

Hypothesis¶

Data¶

Data overview¶

Food.com Recipes Dataset¶

Description¶

Concerns¶

Imports¶

Data Loading¶

Data Preprocessing¶

Summary Statistics¶

Results¶

Exploratory Data Analysis¶

Section 1: Comparing Honey vs Sugar Desserts on Key Nutrition Metrics¶

Section 2: Group Representation and Class Imbalance¶

Section 3: Outlier and Distribution Assumptions¶

Section 4: Correlation between Nutrition Metrics¶

Hypothesis Testing¶

Section 1: Welch's t-test¶

Section 2: Permutation Test on the Difference of Means¶

Further Analysis: Does the Sweetener Alone Explain the Nutritional Gap?¶

Ethics¶

A. Data Collection¶

B. Data Storage¶

C. Analysis¶

D. Modeling¶

E. Deployment¶

Discussion and Conclusion¶

	calories	total_fat (%DV)	sugar (%DV)	sodium (%DV)	protein (%DV)	sat_fat (%DV)	carbs (%DV)
0	734.1	66.0	199.0	10.0	10.0	117.0	28.0
1	1663.3	221.0	168.0	66.0	19.0	158.0	29.0
2	174.4	13.0	67.0	5.0	4.0	26.0	7.0
3	5467.4	516.0	1196.0	135.0	110.0	615.0	188.0
4	175.2	11.0	15.0	8.0	7.0	21.0	7.0
...	...	...	...	...	...	...	...
34752	561.3	38.0	122.0	2.0	16.0	76.0	25.0
34753	535.0	29.0	194.0	18.0	15.0	15.0	28.0
34754	56.2	2.0	4.0	1.0	2.0	3.0	3.0
34755	188.0	11.0	57.0	11.0	7.0	21.0	9.0
34756	174.9	14.0	33.0	4.0	4.0	11.0	6.0

	only_honey	only_sugar	calories	total_fat (%DV)	sugar (%DV)	sodium (%DV)	protein (%DV)	sat_fat (%DV)	carbs (%DV)
dessert_id
0	False	True	734.1	66.0	199.0	10.0	10.0	117.0	28.0
1	False	True	1663.3	221.0	168.0	66.0	19.0	158.0	29.0
2	False	True	174.4	13.0	67.0	5.0	4.0	26.0	7.0
3	False	True	5467.4	516.0	1196.0	135.0	110.0	615.0	188.0
4	False	True	175.2	11.0	15.0	8.0	7.0	21.0	7.0
...	...	...	...	...	...	...	...	...	...
34752	False	True	561.3	38.0	122.0	2.0	16.0	76.0	25.0
34753	False	True	535.0	29.0	194.0	18.0	15.0	15.0	28.0
34754	False	True	56.2	2.0	4.0	1.0	2.0	3.0	3.0
34755	False	True	188.0	11.0	57.0	11.0	7.0	21.0	9.0
34756	False	True	174.9	14.0	33.0	4.0	4.0	11.0	6.0

		calories			total_fat (%DV)			sugar (%DV)			sodium (%DV)			protein (%DV)			sat_fat (%DV)			carbs (%DV)
		mean	median	std	mean	median	std	mean	median	std	mean	...	std	mean	median	std	mean	median	std	mean	median	std
only_honey	only_sugar
False	True	652.526698	314.9	1250.196051	48.962675	21.0	103.249706	229.333745	104.0	472.586461	15.973831	...	41.856871	16.724302	8.0	34.572365	74.437713	30.0	163.696113	28.259125	13.0	56.071097
True	False	340.492357	196.0	540.636840	22.686624	9.0	44.201373	132.029299	80.0	218.588061	7.439490	...	26.092051	12.364331	7.0	21.829751	28.863694	10.0	57.253023	15.704459	10.0	25.626921

	calories								sugar (%DV)					total_fat (%DV)
	count	mean	std	min	25%	50%	75%	max	count	mean	...	75%	max	count	mean	std	min	25%	50%	75%	max
primary_sweetener
Honey	750.0	241.12	193.68	0.4	109.25	186.0	307.75	1105.1	750.0	94.66	...	130.0	695.0	750.0	15.19	18.77	0.0	3.0	8.0	21.0	137.0
Sugar	30236.0	328.97	224.31	0.3	152.30	278.9	450.32	1122.8	30236.0	114.96	...	155.0	1087.0	30236.0	23.95	21.39	0.0	8.0	18.0	34.0	168.0

	metric	test_type	honey_mean	sugar_mean	t_stat	p_value	cohen_d
0	calories	one-tailed	241.1173	328.9731	-12.2212	1.0	-0.3929
1	sugar (%DV)	two-tailed	94.6573	114.9578	-6.9343	0.0	-0.2106
2	total_fat (%DV)	one-tailed	15.1907	23.9504	-12.5780	1.0	-0.4106

	Metric	Theoretical (Unadjusted)	Theoretical (Sweetness-Adjusted)	Observed (Dataset)
0	Calories (% DV)	0.8	0.2	-4.392786
1	Sugar (% DV)	8.0	1.5	-20.300498
2	Total Fat (% DV)	0.0	0.0	-8.759691