Step 1: Predictive Task
Problem Statement:
Given a user's historical workout data from the Endomondo Fitness Tracking Data, this project aims to predict which activity they are most likely to engage in for their next workout session.
Model Evaluation Strategy:
To evaluate this machine learning model, standard evaluation metrics like accuracy, F1-score (macro-averaged to handle class imbalance), and per-class precision and recall will be used. These metrics, along with a confusion matrix, should inform which activity types the model is accurate at predicting.
Model Validation Approach:
For this model, 70% of the existing data will be used for training, with the validation and testing sets comprising 15% each. Given the temporal nature of this data, splits will be performed chronologically instead of randomly, i.e., the first 70% of workout data will be used for training, the next 15% for validation, and the final 15% for testing. Additionally, features will be engineered only from information available before the workout being predicted, to prevent any sort of look-ahead bias.
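As a minimal sketch of this chronological split (illustrative only; the concrete split on the engineered features appears in Step 3), assuming a DataFrame df of workouts already sorted by timestamp:
# Illustrative 70/15/15 chronological split on a timestamp-sorted DataFrame (hypothetical df).
n = len(df)
train_end, val_end = int(0.70 * n), int(0.85 * n)
train_df = df.iloc[:train_end]          # oldest 70% of workouts
val_df = df.iloc[train_end:val_end]     # next 15%
test_df = df.iloc[val_end:]             # most recent 15%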
Baseline Approaches:
To establish model performance benchmarks, the following baselines will be used as focal points of comparison:
- Frequency Baseline: User's most common activity type predicted.
- Recency Baseline: User's most recent activity type predicted.
- Weighted Frequency-Recency Baseline: User's most common activity in last N workouts predicted.
Classification Models:
The following machine learning models will be used for classification:
- Logistic Regression
- Naïve Bayes Classifier
- Support Vector Machine (SVM)
- Random Forest
The first 3 models serve as preliminary approaches, whereas Random Forest is the approach that I chose to implement for this classification task and eventually fine-tune.
Step 2: Data Preprocessing and Exploratory Data Analysis (EDA)
Dataset Context:
Description
This dataset is a collection of user workout records from the fitness tracking platform Endomondo. The data include multiple sources of sequential sensor data such as heart rate, speed, and GPS, as well as sport type, user gender, and weather conditions (i.e., temperature, humidity). The data were collected for academic purposes, and heuristics were used to clean and filter out abnormal samples, such as values of overly large magnitude and mismatched timestamps.
Citation
Modeling heart rate and activity data for personalized fitness recommendation
Jianmo Ni, Larry Muhlstein, Julian McAuley
WWW, 2019
Data Preprocessing:
Although the dataset has been cleaned beforehand using heuristics, I conduct downstream wrangling to prepare the data for this project; before analyzing anything, I need to retrieve it from its JSON file into a more usable DataFrame format.
First, I store the JSON file on Google Drive and navigate to the required directory.
from google.colab import drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
warnings.filterwarnings("ignore")
drive.mount('/content/drive')
Mounted at /content/drive
Then, I parse the data into a Python list, restricting the number of loaded workouts to 50,000 (RAM restrictions).
data = []
with open("/content/drive/MyDrive/endomondoHR_proper.json") as f:
    for i, line in enumerate(f):
        if i == 50000:
            break
        data.append(eval(line))  # each line is a Python-style dict record
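As a quick sanity check, I can peek at the fields of a single record. The sketch below assumes, as the use of eval suggests, that each line is a plain Python dict literal rather than strict JSON; in that case ast.literal_eval is a safer drop-in for eval.
import ast

# Parse just the first record and inspect its fields (sanity check only).
with open("/content/drive/MyDrive/endomondoHR_proper.json") as f:
    first_record = ast.literal_eval(next(f))

print(sorted(first_record.keys()))             # available fields, e.g. sport, heart_rate, timestamp
print(len(first_record.get("timestamp", [])))  # length of one sensor stream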
Finally, I get the data into a usable DataFrame object filtered to contain only the required columns from this dataset.
def summarize_workout(r):
    """Flatten one raw workout record into a single summary row."""
    w = {
        'user_id': r.get('userId'),
        'gender': r.get('gender'),
        'sport': r.get('sport'),
        'workout_id': r.get('id'),
    }
    ts = r.get('timestamp', [])
    if ts:
        w['start'] = min(ts)
        w['end'] = max(ts)
        w['duration_min'] = (w['end'] - w['start']) / 60
    else:
        w['start'] = None
        w['end'] = None
        w['duration_min'] = None
    hr = r.get('heart_rate', [])
    hr_pos = [x for x in hr if x > 0]  # ignore zero/invalid heart-rate readings
    w['avg_heart_rate'] = np.mean(hr_pos) if hr_pos else None
    return w

df = pd.DataFrame([summarize_workout(d) for d in data])
df['datetime'] = pd.to_datetime(df['start'], unit='s')
df = df.drop(columns=["start", "end"])
df
| | user_id | gender | sport | workout_id | duration_min | avg_heart_rate | datetime |
|---|---|---|---|---|---|---|---|
| 0 | 10921915 | male | bike | 396826535 | 126.483333 | 152.650 | 2014-08-24 16:45:46 |
| 1 | 10921915 | male | bike | 392337038 | 74.000000 | 147.710 | 2014-08-16 20:41:22 |
| 2 | 10921915 | male | bike | 389643739 | 112.483333 | 140.554 | 2014-08-12 15:47:39 |
| 3 | 10921915 | male | bike | 386729739 | 75.316667 | 147.020 | 2014-08-07 17:20:42 |
| 4 | 10921915 | male | bike (transport) | 383186560 | 22.616667 | 167.154 | 2014-08-01 16:10:34 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 49995 | 4399772 | male | mountain bike | 183825234 | 130.083333 | 164.816 | 2013-05-01 15:00:45 |
| 49996 | 4399772 | male | bike | 183288370 | 106.666667 | 111.954 | 2013-04-30 15:05:54 |
| 49997 | 4399772 | male | indoor cycling | 182873807 | 91.200000 | 127.910 | 2013-04-29 17:39:55 |
| 49998 | 4399772 | male | bike | 181847177 | 176.333333 | 120.300 | 2013-04-26 22:01:36 |
| 49999 | 4399772 | male | bike | 181075198 | 192.200000 | 126.388 | 2013-04-24 22:23:48 |
50000 rows × 7 columns
Understanding the number of null values within this dataset to ensure nothing too extreme...
df.isna().sum()
| column | null count |
|---|---|
| user_id | 0 |
| gender | 0 |
| sport | 0 |
| workout_id | 0 |
| duration_min | 0 |
| avg_heart_rate | 88 |
| datetime | 0 |
Exploratory Data Analysis (EDA):
To begin EDA, there are 4 key metrics that I look at given my processed DataFrame object:
- Activity Type: What sport was this activity registered as?
- Activity Duration: How long did this activity last for?
- Avg. Heart Rate: How energy-intensive was this activity?
- Date & Time: When exactly did this activity take place?
1. Activity Type
Understanding the frequency distribution of sports/activity types within the dataset...
df['sport'].value_counts()
| sport | count |
|---|---|
| run | 22297 |
| bike | 20125 |
| mountain bike | 3382 |
| bike (transport) | 2358 |
| indoor cycling | 428 |
| orienteering | 416 |
| walk | 340 |
| skate | 170 |
| cross-country skiing | 118 |
| core stability training | 109 |
| fitness walking | 63 |
| rowing | 53 |
| hiking | 46 |
| kayaking | 26 |
| soccer | 13 |
| weight training | 11 |
| circuit training | 8 |
| treadmill running | 7 |
| roller skiing | 5 |
| downhill skiing | 5 |
| horseback riding | 3 |
| gymnastics | 3 |
| elliptical | 3 |
| tennis | 2 |
| snowboarding | 2 |
| swimming | 2 |
| basketball | 1 |
| snowshoeing | 1 |
| yoga | 1 |
| aerobics | 1 |
| stair climing | 1 |
Visualized as a bar chart...
df['sport'].value_counts().plot(kind='bar', figsize=(6,4))
plt.title("Workout Count by Sport")
plt.ylabel("Count")
plt.show()
In this situation, multiple activity types simply do not have a representative sample. As such, I chose to filter the data to retain only the sports with more than 300 samples across the data I work with.
sport_counts = df['sport'].value_counts()
valid_sports = sport_counts[sport_counts > 300].index
df = df[df['sport'].isin(valid_sports)]
df
| | user_id | gender | sport | workout_id | duration_min | avg_heart_rate | datetime |
|---|---|---|---|---|---|---|---|
| 0 | 10921915 | male | bike | 396826535 | 126.483333 | 152.650 | 2014-08-24 16:45:46 |
| 1 | 10921915 | male | bike | 392337038 | 74.000000 | 147.710 | 2014-08-16 20:41:22 |
| 2 | 10921915 | male | bike | 389643739 | 112.483333 | 140.554 | 2014-08-12 15:47:39 |
| 3 | 10921915 | male | bike | 386729739 | 75.316667 | 147.020 | 2014-08-07 17:20:42 |
| 4 | 10921915 | male | bike (transport) | 383186560 | 22.616667 | 167.154 | 2014-08-01 16:10:34 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 49995 | 4399772 | male | mountain bike | 183825234 | 130.083333 | 164.816 | 2013-05-01 15:00:45 |
| 49996 | 4399772 | male | bike | 183288370 | 106.666667 | 111.954 | 2013-04-30 15:05:54 |
| 49997 | 4399772 | male | indoor cycling | 182873807 | 91.200000 | 127.910 | 2013-04-29 17:39:55 |
| 49998 | 4399772 | male | bike | 181847177 | 176.333333 | 120.300 | 2013-04-26 22:01:36 |
| 49999 | 4399772 | male | bike | 181075198 | 192.200000 | 126.388 | 2013-04-24 22:23:48 |
49346 rows × 7 columns
Visualizing this as a bar chart (representative samples maintained)...
df['sport'].value_counts().plot(kind='bar', figsize=(6,4))
plt.title("Workout Count by Sport")
plt.ylabel("Count")
plt.show()
2. Activity Duration
Understanding the spread of time spent per workout...
df["duration_min"].describe()
| statistic | duration_min |
|---|---|
| count | 4.934600e+04 |
| mean | 2.342148e+02 |
| std | 1.827681e+04 |
| min | 8.350000e+00 |
| 25% | 4.853333e+01 |
| 50% | 6.903333e+01 |
| 75% | 1.138500e+02 |
| max | 2.479691e+06 |
There seems to be an outlier, specifically an activity lasting 2,479,691 minutes, or approximately 5 years!
As such, I filter for activities with a duration of less than 3 days (4,320 minutes) and regenerate the summary statistics.
df = df[df["duration_min"] < 4320]
df["duration_min"].describe()
| statistic | duration_min |
|---|---|
| count | 49342.000000 |
| mean | 87.959696 |
| std | 55.899893 |
| min | 8.350000 |
| 25% | 48.533333 |
| 50% | 69.033333 |
| 75% | 113.845833 |
| max | 299.866667 |
Visualized as a histogram...
df['duration_min'].dropna().hist(bins=40, figsize=(6,4))
plt.title("Distribution of Workout Duration (min)")
plt.show()
3. Avg. Heart Rate
Understanding the distribution of mean heart rate within a workout session...
df["avg_heart_rate"].describe()
| avg_heart_rate | |
|---|---|
| count | 49254.000000 |
| mean | 140.693914 |
| std | 16.155742 |
| min | 40.788000 |
| 25% | 130.612000 |
| 50% | 141.530000 |
| 75% | 151.686000 |
| max | 210.194000 |
Visualized as a histogram...
df['avg_heart_rate'].dropna().hist(bins=40, figsize=(6,4))
plt.title("Distribution of Avg. Heart Rate (bpm)")
plt.show()
4. Date & Time
Understanding the frequency distribution of days of the week...
df['date'] = df['datetime'].dt.date
df['hour'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.day_name()
df['day_of_week'].value_counts()
| day_of_week | count |
|---|---|
| Sunday | 8644 |
| Wednesday | 7301 |
| Tuesday | 7299 |
| Thursday | 7226 |
| Saturday | 7033 |
| Friday | 6595 |
| Monday | 5244 |
Visualized as a bar chart...
df['day_of_week'].value_counts().plot(kind='bar', figsize=(6,4))
plt.title("Workout Count by Day of Week")
plt.ylabel("Count")
plt.show()
Plotting the total number of workouts across time, to understand when app popularity peaked...
df.groupby('date').size().plot(figsize=(10,4))
plt.title("Workouts Over Time")
plt.ylabel("Number of Workouts")
plt.show()
Finally, understanding the time at which people typically tend to work out (24-hour clock)...
df['hour'].hist(bins=24, figsize=(6,4))
plt.title("Workouts by Hour of Day")
plt.xlabel("Hour")
plt.ylabel("Frequency")
plt.show()
Step 3: Model Development
Model Context:
Problem Formulation
I formulated predicting the next workout type a user will perform, based on their historical workout data, as a supervised multiclass classification problem.
Inputs (X)
- prev_sport: Latest activity type
- prev_duration: Most recent activity duration
- prev_avg_heart_rate: Avg. heart rate during the latest workout
- prev_hour: Hour at which latest activity began
- prev_dayofweek: Day of the week of the most recent workout
- days_since_prev: Number of days since the most recent activity
Output (y)
- target_sport: Activity type of the next workout (the prediction target)
To begin the model development process, I first generate an updated DataFrame object with the required inputs and outputs as discussed above.
df = df.sort_values(["user_id", "datetime"]).reset_index(drop=True)[["user_id", "datetime","workout_id","gender","sport","duration_min","avg_heart_rate","date","hour","day_of_week"]]
rows = []
# Build one (previous workout -> next workout) example from each consecutive pair of workouts per user.
for user, user_df in df.groupby("user_id"):
    user_df = user_df.sort_values("datetime").reset_index(drop=True)
    for i in range(1, len(user_df)):
        prev = user_df.loc[i-1]
        curr = user_df.loc[i]
        rows.append({
            "user_id": user,
            "prev_sport": prev["sport"],
            "prev_duration": prev["duration_min"],
            "prev_avg_heart_rate": prev["avg_heart_rate"],
            "prev_hour": prev["datetime"].hour,
            "prev_dayofweek": prev["datetime"].dayofweek,
            "days_since_prev": (curr["datetime"] - prev["datetime"]).total_seconds() / 86400,
            "target_sport": curr["sport"]
        })
df_ml = pd.DataFrame(rows)
df_ml = df_ml.dropna()
df_ml
| | user_id | prev_sport | prev_duration | prev_avg_heart_rate | prev_hour | prev_dayofweek | days_since_prev | target_sport |
|---|---|---|---|---|---|---|---|---|
| 0 | 5844 | mountain bike | 132.900000 | 103.558 | 22 | 4 | 103.849780 | bike |
| 1 | 5844 | bike | 42.750000 | 115.716 | 18 | 3 | 34.096134 | bike |
| 2 | 5844 | bike | 62.916667 | 102.902 | 20 | 2 | 1.981898 | bike |
| 3 | 5844 | bike | 70.783333 | 104.298 | 20 | 4 | 7.812234 | bike |
| 4 | 5844 | bike | 107.266667 | 96.570 | 15 | 5 | 31.363368 | bike |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 49015 | 15279967 | run | 110.033333 | 157.782 | 19 | 4 | 11.298333 | run |
| 49016 | 15279967 | run | 172.000000 | 153.150 | 2 | 2 | 6.892731 | run |
| 49017 | 15279967 | run | 166.066667 | 151.474 | 23 | 1 | 4.783669 | run |
| 49018 | 15279967 | run | 84.566667 | 135.420 | 18 | 6 | 28.941308 | run |
| 49019 | 15279967 | run | 49.466667 | 156.118 | 17 | 0 | 38.009201 | run |
48933 rows × 8 columns
Model Approach
Optimization Objective
My goal is to maximize prediction accuracy of the next workout activity (target_sport), while preventing look-ahead bias and handling class imbalances within the data.
Modeling Choices
Since this is a classification problem, I will first look at the 3 baselines, analyzing their complexity, efficiency, and challenges in implementation:
- Frequency Baseline: User's most common activity type predicted.
- Recency Baseline: User's most recent activity type predicted.
- Weighted Frequency-Recency Baseline: User's most common activity in last N workouts predicted.
1. Frequency Baseline
Predicts a user's most common activity for every instance.
- Complexity: Θ(Users × Activity Types) setup and Θ(1) per prediction.
- Efficiency: No iterative training so quick to compute.
- Advantages: Very simple heuristic, captures user's long-term preference.
- Disadvantages: Ignores recency bias and performs poorly for rare activities.
most_freq = df_ml.groupby('user_id')['prev_sport'].agg(lambda x: x.mode()[0])
df_ml['freq_pred'] = df_ml['user_id'].map(most_freq)
freq_acc = accuracy_score(df_ml['target_sport'], df_ml['freq_pred'])
freq_f1_macro = f1_score(df_ml['target_sport'], df_ml['freq_pred'], average='macro')
freq_f1_weighted = f1_score(df_ml['target_sport'], df_ml['freq_pred'], average='weighted')
print("Frequency Baseline:")
print("Accuracy:", round(freq_acc,4))
print("F1 Macro:", round(freq_f1_macro,4))
print("F1 Weighted:", round(freq_f1_weighted,4))
Frequency Baseline:
Accuracy: 0.7923
F1 Macro: 0.5324
F1 Weighted: 0.7819
Based on these results, the Frequency Baseline captures the user's dominant activity fairly well (moderately high accuracy), but performs relatively poorly on less common activities (lower F1 Macro).
2. Recency Baseline
Predicts a user's most recent activity for every instance.
- Complexity: Θ(Users) setup and Θ(1) per prediction.
- Efficiency: No iterative training and minimal memory so quick to compute.
- Advantages: Very simple heuristic, captures user's recency bias.
- Disadvantages: Ignores long-term preferences and sensitive to rare activities.
df_ml['recency_pred'] = df_ml.groupby('user_id')['prev_sport'].shift(1)
recency_df = df_ml.dropna(subset=['recency_pred'])
recency_acc = accuracy_score(recency_df['target_sport'], recency_df['recency_pred'])
recency_f1_macro = f1_score(recency_df['target_sport'], recency_df['recency_pred'], average='macro')
recency_f1_weighted = f1_score(recency_df['target_sport'], recency_df['recency_pred'], average='weighted')
print("Recency Baseline:")
print("Accuracy:", round(recency_acc,4))
print("F1 Macro:", round(recency_f1_macro,4))
print("F1 Weighted:", round(recency_f1_weighted,4))
Recency Baseline:
Accuracy: 0.7967
F1 Macro: 0.6631
F1 Weighted: 0.7967
Based on these results, the Recency Baseline has a slightly higher accuracy and F1 Macro, showing that recent activities have more predictive power.
3. Weighted Frequency-Recency Baseline
Predicts a user's most frequent activity in the last N workouts.
- Complexity: Θ(Users × N) setup and Θ(N) per prediction.
- Efficiency: Slightly higher computational cost due to maintenance of a sliding window.
- Advantages: Good balance of long-term preferences and short-term trends.
- Disadvantages: Choice of N is crucial, as small N overfits to recent activities while large N ignores short-term trends.
N = 5
def weighted_freq_recent_sports(sports):
    preds = []
    past_activities = []
    for sport in sports:
        if past_activities:
            last_n = past_activities[-N:]
            mode_sport = pd.Series(last_n).mode()[0]  # most common sport in the last N workouts
            preds.append(mode_sport)
        else:
            preds.append(None)  # no history yet for this user
        past_activities.append(sport)
    return preds
df_ml['weighted_pred'] = df_ml.groupby('user_id')['prev_sport'].transform(weighted_freq_recent_sports)
weighted_df = df_ml.dropna(subset=['weighted_pred'])
weighted_acc = accuracy_score(weighted_df['target_sport'], weighted_df['weighted_pred'])
weighted_f1_macro = f1_score(weighted_df['target_sport'], weighted_df['weighted_pred'], average='macro')
weighted_f1_weighted = f1_score(weighted_df['target_sport'], weighted_df['weighted_pred'], average='weighted')
print("Weighted Frequency-Recency Baseline:")
print("Accuracy:", round(weighted_acc,4))
print("F1 Macro:", round(weighted_f1_macro,4))
print("F1 Weighted:", round(weighted_f1_weighted,4))
Weighted Frequency-Recency Baseline:
Accuracy: 0.8127
F1 Macro: 0.6874
F1 Weighted: 0.8112
These results for the Weighted Frequency-Recency Baseline seem pretty promising! It seems to balance long-term preference and recent trends, achieving the highest accuracy (approx. 81%), F1 Macro (approx. 69%), and F1 Weighted (approx. 81%) scores. As such, it's probably a good idea to use this weighted frequency-recency approach as a starting point for the machine learning models.
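Since the choice of N is the main knob of this baseline, a quick sweep over a few window sizes (reusing the helper above, which reads the global N) shows how sensitive the results are to this choice; the values tried below are illustrative.
# Sweep a few window sizes for the weighted frequency-recency baseline (illustrative values).
for n in [3, 5, 10, 20]:
    N = n  # weighted_freq_recent_sports reads the global N
    preds = df_ml.groupby('user_id')['prev_sport'].transform(weighted_freq_recent_sports)
    mask = preds.notna()
    acc = accuracy_score(df_ml.loc[mask, 'target_sport'], preds[mask])
    print(f"N={n}: accuracy={round(acc, 4)}")
N = 5  # restore the default used above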
Feature Engineering
Before training any model, I first conduct generic feature-engineering tasks so that the data can be consumed by each algorithm. The first step is to encode the categorical variables (target_sport, prev_sport) using label encoding.
le_sport = LabelEncoder()
df_ml['target_sport_enc'] = le_sport.fit_transform(df_ml['target_sport'])
df_ml['prev_sport_enc'] = le_sport.transform(df_ml['prev_sport'])
Next, I can leverage the well-performing Weighted Frequency-Recency Baseline as a numeric feature, dropping null rows.
df_ml = df_ml.dropna(subset=['weighted_pred'])
df_ml.loc[:, 'weighted_pred_enc'] = le_sport.transform(df_ml['weighted_pred'])
Then, I cast the numeric columns to floating-point values.
numeric_cols = ['prev_duration', 'prev_avg_heart_rate', 'prev_hour', 'prev_dayofweek', 'days_since_prev']
for col in numeric_cols:
    df_ml[col] = df_ml[col].astype(float)
Subsequently, I add two subtle interaction terms: a flag for whether the most recent sport matches the baseline prediction, and an intensity proxy (duration multiplied by average heart rate) for the most recent activity.
df_ml['prev_eq_weighted'] = (df_ml['prev_sport_enc'] == df_ml['weighted_pred_enc']).astype(int)
df_ml['prev_intensity'] = df_ml['prev_duration'] * df_ml['prev_avg_heart_rate']
Finally, I can drop null values and prepare the final feature DataFrame object.
df_ml = df_ml.dropna(subset=numeric_cols)
feature_cols = ['prev_sport_enc','weighted_pred_enc','prev_duration','prev_avg_heart_rate',
'prev_hour','prev_dayofweek','days_since_prev','prev_eq_weighted','prev_intensity','target_sport_enc']
df_features = df_ml[feature_cols].copy()
df_features
| | prev_sport_enc | weighted_pred_enc | prev_duration | prev_avg_heart_rate | prev_hour | prev_dayofweek | days_since_prev | prev_eq_weighted | prev_intensity | target_sport_enc |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 3 | 42.750000 | 115.716 | 18.0 | 3.0 | 34.096134 | 0 | 4946.859000 | 0 |
| 2 | 0 | 0 | 62.916667 | 102.902 | 20.0 | 2.0 | 1.981898 | 1 | 6474.250833 | 0 |
| 3 | 0 | 0 | 70.783333 | 104.298 | 20.0 | 4.0 | 7.812234 | 1 | 7382.560100 | 0 |
| 4 | 0 | 0 | 107.266667 | 96.570 | 15.0 | 5.0 | 31.363368 | 1 | 10358.742000 | 0 |
| 5 | 0 | 0 | 67.633333 | 101.834 | 0.0 | 2.0 | 2.855729 | 1 | 6887.372867 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 49015 | 5 | 5 | 110.033333 | 157.782 | 19.0 | 4.0 | 11.298333 | 1 | 17361.279400 | 5 |
| 49016 | 5 | 5 | 172.000000 | 153.150 | 2.0 | 2.0 | 6.892731 | 1 | 26341.800000 | 5 |
| 49017 | 5 | 5 | 166.066667 | 151.474 | 23.0 | 1.0 | 4.783669 | 1 | 25154.782267 | 5 |
| 49018 | 5 | 5 | 84.566667 | 135.420 | 18.0 | 6.0 | 28.941308 | 1 | 11452.018000 | 5 |
| 49019 | 5 | 5 | 49.466667 | 156.118 | 17.0 | 0.0 | 38.009201 | 1 | 7722.637067 | 5 |
48616 rows × 10 columns
Now that I have incorporated the relevant baseline and conducted the required feature-engineering tasks, I can isolate the target variable and split the data into training, validation, and testing sets.
- For this project, I have chosen 70% of the loaded data to be training data, 15% to serve as a validation set, and the final 15% as the testing data.
- I also used a chronological split to prevent look-ahead bias and respect the time-series nature of the dataset.
- I chose to standardize numeric features so that no single variable dominates and obscures signal in the other inputs. Only training data was used to fit the scaler, ensuring fair evaluation on the validation/test sets.
feature_cols = ['prev_sport_enc', 'weighted_pred_enc', 'prev_duration', 'prev_avg_heart_rate',
'prev_hour', 'prev_dayofweek', 'days_since_prev', 'prev_eq_weighted', 'prev_intensity']
X = df_features[feature_cols].copy()
y = df_features['target_sport_enc'].copy()
n = len(df_features)
train_end = int(0.7 * n)
val_end = int(0.85 * n)
X_train, y_train = X.iloc[:train_end], y.iloc[:train_end]
X_val, y_val = X.iloc[train_end:val_end], y.iloc[train_end:val_end]
X_test, y_test = X.iloc[val_end:], y.iloc[val_end:]
numeric_cols = ['prev_duration', 'prev_avg_heart_rate', 'prev_hour', 'prev_dayofweek', 'days_since_prev', 'prev_intensity']
scaler = StandardScaler()
X_train.loc[:, numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_val.loc[:, numeric_cols] = scaler.transform(X_val[numeric_cols])
X_test.loc[:, numeric_cols] = scaler.transform(X_test[numeric_cols])
print(f"Train size: {X_train.shape[0]}\nValidation size: {X_val.shape[0]}\nTest size: {X_test.shape[0]}")
Train size: 34031
Validation size: 7292
Test size: 7293
Now that I split the data into training/validating/testing sets, I can attempt to solve the classification problem using the machine learning algorithms discussed above, namely:
- Logistic Regression
- Naïve Bayes Classifier
- Support Vector Machine (SVM)
- Random Forest
1. Logistic Regression
For the Logistic Regression model, I use the multinomial setting, as the target variable includes multiple sport labels. This uses softmax regression, allowing the model to learn all classes jointly instead of one-by-one. I used the limited-memory BFGS (lbfgs) solver, which converges well on datasets with a large number of samples.
- Complexity: Θ(Samples × Features) per iteration.
- Efficiency: Very fast to train and works well with high-dimensional sparse data.
- Advantages: Simple, with interpretable coefficients.
- Disadvantages: Struggles to capture non-linear relationships, particularly with class imbalances.
lr = LogisticRegression(
multi_class='multinomial',
solver='lbfgs',
max_iter=1000,
n_jobs=-1
)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_val)
val_classes = np.unique(y_val)
target_names = le_sport.inverse_transform(val_classes)
print("Logistic Regression Performance (VAL)")
print("Accuracy:", round(accuracy_score(y_val, y_pred_lr), 4))
print("F1 Macro:", round(f1_score(y_val, y_pred_lr, average='macro'), 4))
print("F1 Weighted:", round(f1_score(y_val, y_pred_lr, average='weighted'), 4))
print("\nDetailed Classification Report:")
print(classification_report(
y_val,
y_pred_lr,
labels=val_classes,
target_names=target_names,
zero_division=0
))
Logistic Regression Performance (VAL)
Accuracy: 0.7592
F1 Macro: 0.2509
F1 Weighted: 0.7044
Detailed Classification Report:
precision recall f1-score support
bike 0.78 0.86 0.82 2659
bike (transport) 0.00 0.00 0.00 49
indoor cycling 0.00 0.00 0.00 239
mountain bike 1.00 0.00 0.00 521
orienteering 0.00 0.00 0.00 170
run 0.76 0.91 0.82 3589
walk 0.67 0.06 0.11 65
accuracy 0.76 7292
macro avg 0.46 0.26 0.25 7292
weighted avg 0.73 0.76 0.70 7292
The Logistic Regression model reached an accuracy of about 76%, which is reasonable but still below the baselines. In particular, its low F1 Macro score of 0.2509 indicates that the model performs well only for certain classes and is heavily skewed by the existing class imbalance.
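One lightweight remedy worth noting would be to re-fit the same model with class weighting, which reweights each class inversely to its frequency and typically trades a little accuracy for macro F1. A minimal sketch, assuming the split and encoder defined above (lr_balanced and y_pred_lr_bal are hypothetical names):
# Hypothetical variant: inverse-frequency class weights to lift macro F1 on rare sports.
lr_balanced = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced',
    n_jobs=-1
)
lr_balanced.fit(X_train, y_train)
y_pred_lr_bal = lr_balanced.predict(X_val)
print("Balanced Logistic Regression (VAL)")
print("Accuracy:", round(accuracy_score(y_val, y_pred_lr_bal), 4))
print("F1 Macro:", round(f1_score(y_val, y_pred_lr_bal, average='macro'), 4))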
2. Naïve Bayes Classifier
I then used the baseline Naïve Bayes Classifier, again as a comparison point for the classification problem.
- Complexity: Θ(Samples × Features) setup and Θ(Features) per prediction.
- Efficiency: Very fast to train and uses far less memory than the other classifiers.
- Advantages: Very memory-efficient and works well with categorical data.
- Disadvantages: The Gaussian assumption is a poor fit for the label-encoded features, and there is little scope for hyperparameter tuning.
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_val)
val_classes = np.unique(y_val)
target_names = le_sport.inverse_transform(val_classes)
print("Naïve Bayes Performance (VAL)")
print("Accuracy:", round(accuracy_score(y_val, y_pred_nb), 4))
print("F1 Macro:", round(f1_score(y_val, y_pred_nb, average='macro'), 4))
print("F1 Weighted:", round(f1_score(y_val, y_pred_nb, average='weighted'), 4))
print("\nDetailed Classification Report:")
print(classification_report(
y_val,
y_pred_nb,
labels=val_classes,
target_names=target_names,
zero_division=0
))
Naïve Bayes Performance (VAL)
Accuracy: 0.6529
F1 Macro: 0.2414
F1 Weighted: 0.6458
Detailed Classification Report:
precision recall f1-score support
bike 0.73 0.61 0.66 2659
bike (transport) 0.02 0.41 0.04 49
indoor cycling 0.00 0.00 0.00 239
mountain bike 0.34 0.03 0.05 521
orienteering 1.00 0.01 0.01 170
run 0.76 0.87 0.81 3589
walk 0.23 0.08 0.11 65
accuracy 0.65 7292
macro avg 0.44 0.28 0.24 7292
weighted avg 0.69 0.65 0.65 7292
The results of the Naïve Bayes Classifier are rather underwhelming, only hitting an accuracy of 65%, with significantly worse F1 Macro and F1 Weighted scores too.
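To see where the model collapses, one can compare the distribution of its validation-set predictions against the true labels; a quick diagnostic sketch, assuming the variables defined above (nb_diag is a hypothetical name):
# Compare predicted vs. true label counts on the validation set.
nb_diag = pd.DataFrame({
    "true": pd.Series(le_sport.inverse_transform(y_val)).value_counts(),
    "predicted": pd.Series(le_sport.inverse_transform(y_pred_nb)).value_counts(),
}).fillna(0).astype(int)
print(nb_diag)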
3. Support Vector Machine (SVM)
For the Support Vector Machine Classifier, I used a radial basis function (RBF) kernel, which is better equipped to capture non-linear relationships. I also used a one-vs-one (OvO) strategy for multiclass classification, wherein the model learns a decision boundary for each pair of classes.
- Complexity: Training ranges from roughly Θ(n²) to Θ(n³) in the number of samples.
- Efficiency: Heavy compute required, so slower than the other models considered.
- Advantages: Can capture non-linear relationships with the RBF kernel.
- Disadvantages: Very memory-intensive and less interpretable than other classifiers.
svm = SVC(
kernel='rbf',
decision_function_shape='ovo',
probability=False,
)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_val)
val_classes = np.unique(y_val)
target_names = le_sport.inverse_transform(val_classes)
print("Support Vector Machine (SVM) Performance (VAL)")
print("Accuracy:", round(accuracy_score(y_val, y_pred_svm), 4))
print("F1 Macro:", round(f1_score(y_val, y_pred_svm, average='macro'), 4))
print("F1 Weighted:", round(f1_score(y_val, y_pred_svm, average='weighted'), 4))
print("\nDetailed Classification Report:")
print(classification_report(
y_val,
y_pred_svm,
labels=val_classes,
target_names=target_names,
zero_division=0
))
Support Vector Machine (SVM) Performance (VAL)
Accuracy: 0.7999
F1 Macro: 0.4466
F1 Weighted: 0.7802
Detailed Classification Report:
precision recall f1-score support
bike 0.81 0.86 0.84 2659
bike (transport) 0.12 0.45 0.19 49
indoor cycling 0.00 0.00 0.00 239
mountain bike 0.79 0.61 0.69 521
orienteering 0.00 0.00 0.00 170
run 0.82 0.89 0.85 3589
walk 0.72 0.45 0.55 65
accuracy 0.80 7292
macro avg 0.47 0.46 0.45 7292
weighted avg 0.77 0.80 0.78 7292
My implementation of the Support Vector Machine (SVM) Classifier reached a reasonable accuracy of around 80%, with a substantial improvement in the F1 Weighted score. This also extends to the F1 Macro score, from which I can infer that the model is better at distinguishing minority classes.
At this point, I have a good understanding of classification approaches without finetuning and can use these results and takeaways to optimize my final model, Random Forest.
4. Random Forest
Random Forest uses a large collection of decision trees built on different bootstrap samples of the data, with each tree grown independently and the final prediction obtained by aggregating the outputs of all trees. This reduces variance, improves generalization, and makes the model robust to overfitting, all of which matter given the heavily imbalanced data.
For this model, I trained 1,000 decision trees with a max depth of 20, enforcing minimum split and leaf constraints to prevent overly complex trees. I also set max_features='sqrt', so that each tree considers only a random subset of features at each split, which decorrelates the trees. Finally, I used balanced_subsample weighting to handle class imbalance, and trained with bootstrap aggregation and out-of-bag evaluation enabled.
- Complexity: Roughly Θ(Trees × Samples × log(Samples)) for training, which dominates the prediction cost.
- Efficiency: Significant compute required, but parallelizable, so moderately efficient.
- Advantages: Robust to overfitting, naturally handles class imbalances, and good for capturing complex relationships.
- Disadvantages: Significant hyperparameter tuning and feature scaling required to optimize results.
rf_model = RandomForestClassifier(
n_estimators=1000,
max_depth=20,
min_samples_split=60,
min_samples_leaf=2,
max_features='sqrt',
class_weight='balanced_subsample',
bootstrap=True,
oob_score=True,
n_jobs=-1,
random_state=42,
)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_val)
print("Random Forest Performance (VAL)")
print("Accuracy:", round(accuracy_score(y_val, y_pred), 4))
print("F1 Macro:", round(f1_score(y_val, y_pred, average='macro'), 4))
print("F1 Weighted:", round(f1_score(y_val, y_pred, average='weighted'), 4))
print("\nDetailed Classification Report:")
print(classification_report(
y_val,
y_pred,
labels=val_classes,
target_names=target_names,
zero_division=0
))
Random Forest Performance (VAL)
Accuracy: 0.8206
F1 Macro: 0.6972
F1 Weighted: 0.824
Detailed Classification Report:
precision recall f1-score support
bike 0.85 0.82 0.83 2659
bike (transport) 0.43 0.47 0.45 49
indoor cycling 0.53 0.75 0.62 239
mountain bike 0.66 0.80 0.73 521
orienteering 0.61 0.81 0.69 170
run 0.88 0.84 0.86 3589
walk 0.61 0.80 0.69 65
accuracy 0.82 7292
macro avg 0.65 0.75 0.70 7292
weighted avg 0.83 0.82 0.82 7292
The results of Random Forest are substantially better than the other 3 modeling approaches, with accuracy reaching 82%, higher than the best heuristic baseline of roughly 81%. While this gain may seem modest, the more important takeaway is the dramatic increase in the F1 Macro score (roughly 70%), which means I finally have a non-heuristic multiclass classification model that adapts sufficiently well to the imbalanced classes.
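One way hyperparameters like these can be settled is a small sweep against the validation set; the sketch below shows how such a sweep could be run (the grid values are illustrative, not the exact settings I searched).
# Illustrative validation-set sweep over a few Random Forest settings.
for depth in [10, 20, 30]:
    for n_trees in [200, 500, 1000]:
        rf = RandomForestClassifier(
            n_estimators=n_trees,
            max_depth=depth,
            max_features='sqrt',
            class_weight='balanced_subsample',
            n_jobs=-1,
            random_state=42,
        )
        rf.fit(X_train, y_train)
        val_f1 = f1_score(y_val, rf.predict(X_val), average='macro')
        print(f"depth={depth}, trees={n_trees}: F1 Macro={round(val_f1, 4)}")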
Step 4: Model Evaluation
Evaluating the Model on Testing Data
I can now apply this model to the testing data to see whether its performance holds up, and then conduct more rigorous evaluation to identify strengths and weaknesses.
y_test_pred = rf_model.predict(X_test)
print("\nRandom Forest Performance (TEST)")
print("Accuracy:", round(accuracy_score(y_test, y_test_pred), 4))
print("F1 Macro:", round(f1_score(y_test, y_test_pred, average='macro'), 4))
print("F1 Weighted:", round(f1_score(y_test, y_test_pred, average='weighted'), 4))
print("\nDetailed Classification Report (TEST):")
print(classification_report(
y_test,
y_test_pred,
labels=val_classes,
target_names=target_names,
zero_division=0
))
Random Forest Performance (TEST)
Accuracy: 0.8719
F1 Macro: 0.5933
F1 Weighted: 0.8757
Detailed Classification Report (TEST):
precision recall f1-score support
bike 0.93 0.92 0.92 4336
bike (transport) 0.52 0.70 0.60 70
indoor cycling 0.23 0.75 0.35 8
mountain bike 0.53 0.75 0.62 229
orienteering 0.22 0.32 0.26 22
run 0.87 0.82 0.85 2527
walk 0.49 0.64 0.55 101
accuracy 0.87 7293
macro avg 0.54 0.70 0.59 7293
weighted avg 0.88 0.87 0.88 7293
Fortunately, the model does well on the testing data too, with accuracy and F1 Weighted score exceeding 87%. The dip in the F1 Macro score can be attributed to the disproportionately small support for indoor cycling and orienteering in the testing set, which leaves the model little room to get those classes right.
Evaluation Metrics
For this predictive task, I chose to evaluate model performance using 3 metrics, namely:
- Accuracy, which provides an overall view of correct predictions across all classes.
- F1 Macro Score, which weights performance on every class equally, making it important for an imbalanced dataset such as this one (formalized below).
- F1 Weighted Score, which balances the contribution of each class according to the number of occurrences, accounting for class biases while representing overall predictive power.
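For reference, writing $\mathrm{F1}_c$ for the per-class F1 score and $n_c$ for the support (number of true instances) of class $c$ over $C$ classes:

$$\mathrm{F1}_{\mathrm{macro}} = \frac{1}{C}\sum_{c=1}^{C}\mathrm{F1}_c, \qquad \mathrm{F1}_{\mathrm{weighted}} = \frac{\sum_{c=1}^{C} n_c\,\mathrm{F1}_c}{\sum_{c=1}^{C} n_c}$$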
Arguably, Accuracy and F1 Macro are the 2 most appropriate metrics, as the former provides a standard measure of overall model performance, while the latter exposes how well the model handles the class imbalance in the data. I can better compare and contrast the baselines and machine learning models with a summary DataFrame object.
models = ["Frequency Baseline", "Recency Baseline", "Weighted Frequency-Recency Baseline",
          "Logistic Regression (VAL)", "Naïve Bayes (VAL)", "SVM (VAL)", "Random Forest (VAL)", "Random Forest (TEST)"]
accuracy = [0.7923, 0.7967, 0.8127, 0.7592, 0.6529, 0.7999, 0.8206, 0.8719]
f1_macro = [0.5324, 0.6631, 0.6874, 0.2509, 0.2414, 0.4466, 0.6972, 0.5933]
f1_weighted = [0.7819, 0.7967, 0.8112, 0.7044, 0.6458, 0.7802, 0.8240, 0.8757]
df_summary = pd.DataFrame({"Model": models, "Accuracy": accuracy, "F1 Macro": f1_macro, "F1 Weighted": f1_weighted})
df_summary[["Accuracy", "F1 Macro", "F1 Weighted"]] = df_summary[["Accuracy", "F1 Macro", "F1 Weighted"]].round(4)
df_summary
| | Model | Accuracy | F1 Macro | F1 Weighted |
|---|---|---|---|---|
| 0 | Frequency Baseline | 0.7923 | 0.5324 | 0.7819 |
| 1 | Recency Baseline | 0.7967 | 0.6631 | 0.7967 |
| 2 | Weighted Frequency-Recency Baseline | 0.8127 | 0.6874 | 0.8112 |
| 3 | Logistic Regression (VAL) | 0.7592 | 0.2509 | 0.7044 |
| 4 | Naïve Bayes (VAL) | 0.6529 | 0.2414 | 0.6458 |
| 5 | SVM (VAL) | 0.7999 | 0.4466 | 0.7802 |
| 6 | Random Forest (VAL) | 0.8206 | 0.6972 | 0.8240 |
| 7 | Random Forest (TEST) | 0.8719 | 0.5933 | 0.8757 |
As such, the Random Forest model exceeds the baselines in Accuracy and F1 Weighted score on both the validation and testing sets. It also exceeds the baselines in F1 Macro score on the validation set; the dip on the testing set can be attributed to a few classes with very small, non-representative support.
Although I also generated detailed classification reports where appropriate, they are not the most digestible way to evaluate the model. That said, the best way to visually understand my model is through a confusion matrix.
y_test_orig = le_sport.inverse_transform(y_test)
y_test_pred_orig = le_sport.inverse_transform(y_test_pred)
val_classes_orig = le_sport.inverse_transform(val_classes)
cm = confusion_matrix(y_test_orig, y_test_pred_orig, labels=val_classes_orig)
cm_pct = cm / cm.sum(axis=1, keepdims=True)
plt.figure(figsize=(8,8))
plt.imshow(cm_pct, cmap="Blues")
plt.title("Confusion Matrix (Percent) – Test Set")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.xticks(np.arange(len(val_classes_orig)), val_classes_orig, rotation=90)
plt.yticks(np.arange(len(val_classes_orig)), val_classes_orig)
for i in range(cm_pct.shape[0]):
    for j in range(cm_pct.shape[1]):
        value = cm_pct[i, j] * 100
        text = f"{value:.1f}%" if value > 0 else ""
        plt.text(j, i, text, ha="center", va="center", fontsize=9)
plt.colorbar(label="Percent")
plt.tight_layout()
plt.show()
This puts into perspective how the model gets tripped up by orienteering in the test set (only 22 samples) and poses the question of whether orienteering and running can be better segmented in downstream analysis.
From a practical lens, this slip-up actually captures a salient pattern that the model is picking up on. Orienteering is a group of sports in which participants use a map and compass to navigate from point to point in unfamiliar terrain as quickly as possible. This involves a lot of running, which can explain why the model struggles with this particular classification. Although the disparity is not as significant, similar logic can explain the confusion between running and walking, as pace varies greatly from person to person.
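To round out the evaluation, the Random Forest's impurity-based feature importances give a rough view of which inputs drive these predictions; a quick sketch using the fitted model and feature list from above (impurity-based importances are only indicative):
# Impurity-based feature importances from the fitted Random Forest.
importances = pd.Series(rf_model.feature_importances_, index=feature_cols).sort_values(ascending=False)
print(importances.round(4))
importances.plot(kind='barh', figsize=(6, 4))
plt.title("Random Forest Feature Importances")
plt.xlabel("Importance")
plt.show()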
Step 5: Discussion of Related Work
How has this dataset (or similar datasets) been used before?
The Endomondo Fitness Tracking Data were collected by researchers at the University of California, San Diego for academic use. The paper published alongside the data collection effort focuses on modeling heart rate and activity data to deliver personalized fitness recommendations.
Specifically, this introductory paper focuses on FitRec, an LSTM-based model that estimates a user's heart rate profile and accordingly recommends suitable activities. The key takeaways from this study suggest that the LSTM model is able to learn personalized, contextual, and activity-specific user dynamics simply based on heart rate profiling.
While FitRec was implemented with a specific purpose in mind, the strong results from that study validate the use of fitness data such as Endomondo's for activity recommendation.
How has prior work approached the same (or similar) tasks?
Specific to the Endomondo dataset, previous work has focused on predicting heart rate trajectory to determine the expected response to an activity, in tandem with other factors such as weather, duration, elevation, etc. These predictions have then been used to conduct downstream analyses which typically have to do with predictive modeling or recommender systems.
That said, there have been instances of historical fitness data being used to predict a user's subsequent activity, the same task explored in this project. These typically make use of deep learning architectures, such as Recurrent Neural Networks (RNNs), or Temporal Convolutional Networks (TCNs), which makes sense given the complex underlying patterns.
On the simpler end, N-gram models and Markov chains have also been used, although with less success due to their inability to capture long-term temporal trends. There has also been recent work using BERT-style Transformer models, which remains a relatively unexplored approach to this classification problem.
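As a rough illustration of the simplest of these approaches, a first-order Markov chain over this project's data amounts to a transition table from the previous sport to the next one; the sketch below is evaluated in-sample, without the chronological split, so the number it prints is only indicative.
# Hypothetical first-order Markov baseline: predict the most likely next sport given the previous one.
transitions = pd.crosstab(df_ml['prev_sport'], df_ml['target_sport'])
markov_pred = df_ml['prev_sport'].map(transitions.idxmax(axis=1))
print("Markov-chain accuracy (in-sample):", round(accuracy_score(df_ml['target_sport'], markov_pred), 4))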
How do your results match or differ from what has been reported in related work?
The biggest difference in my results likely arises from the specific Endomondo dataset I used. Given the significant class imbalance and RAM issues encountered, the evaluation metrics were limited to F1 scores and Accuracy. In comparison, most research uses Top-k accuracy or Recall@K to recommend a couple of suggested next activities. Implementing these metrics in this project would likely prove futile due to the limited dataset size.
The models used in this investigation are exploratory and not meant to build on existing work, which typically rests on stronger theoretical frameworks. That said, even if the entire dataset were used, or a different fitness dataset considered, one finding this project shares with prior work is the problematic structure of fitness data. Oftentimes, class imbalances are introduced by people trying a new activity only a handful of times before quitting. This consensus, while agreeable, is unfortunately a reflection of most fitness data, wherein unique user-activity dynamics have to be compromised to generate a more holistic sample.
As such, despite differences in evaluation metrics, the common takeaway is that a user-balanced, activity-biased dataset is preferable to an activity-balanced, user-biased one. In simpler terms, more emphasis should go toward handling class imbalance than toward standardizing or normalizing the specific data collected.