1 Introduction

This case study analyzes user activity data from Bellabeat, a high-tech health product company for women. The goal is to understand user behavior patterns and to identify actionable insights for product improvement. The analysis below covers the daily activity and calorie data (steps, distance, active minutes, and calories burned); sleep data is not analyzed in this document.

2 Data Loading and Preparation

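We load the daily-activity exports from the two data collections, stack them into a single table, and then clean the result: records are aggregated to one row per user per day, dates are parsed, and a weekday column is added.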

# Load datasets
# NOTE(review): read_csv() and the dplyr verbs used below are assumed to be
# attached earlier in the document (e.g. library(tidyverse)) -- confirm.
# The data root is defined once so the location only has to change in one place.
data_dir <- "C:/Users/zsham/OneDrive/google_data_analysis_bellabeat/data"
dailyActivity_merged1 <- read_csv(
  file.path(data_dir, "First_Collection", "dailyActivity_merged.csv")
)
dailyActivity_merged2 <- read_csv(
  file.path(data_dir, "Second_Collection", "dailyActivity_merged.csv")
)

# Combine datasets
dailyActivity_merged <- rbind(dailyActivity_merged1, dailyActivity_merged2)
rm(dailyActivity_merged1, dailyActivity_merged2)

# Parse dates BEFORE grouping so the (Id, ActivityDate) key is a real date and
# not a raw string: "4/1/2016" and "04/01/2016" must count as the same day.
dailyActivity_merged <- dailyActivity_merged %>%
  mutate(ActivityDate = as.Date(ActivityDate, format = "%m/%d/%Y"))

# as.Date() yields NA for values it cannot parse instead of failing, so make
# that visible rather than letting NA dates flow into the grouped steps.
if (anyNA(dailyActivity_merged$ActivityDate)) {
  warning(
    "Some ActivityDate values are missing or not in %m/%d/%Y format (NA).",
    call. = FALSE
  )
}

# Check for duplicate rows by Id and ActivityDate
duplicates <- dailyActivity_merged %>%
  group_by(Id, ActivityDate) %>%
  filter(n() > 1) %>%
  ungroup()

# Aggregate numeric columns by Id and ActivityDate
# NOTE(review): summing treats rows that share a key as complementary partial
# records. If the two collections repeat the same day in full, the totals are
# double-counted -- inspect `duplicates` to confirm which case applies.
dailyActivity_merged <- dailyActivity_merged %>%
  group_by(Id, ActivityDate) %>%
  summarise(
    # An explicit function replaces the deprecated across(..., sum, na.rm = TRUE)
    across(where(is.numeric), function(x) sum(x, na.rm = TRUE)),
    .groups = "drop"
  )

# Create weekday column
# weekdays() returns names in the session locale; indexing fixed English names
# by POSIXlt's wday (0 = Sunday ... 6 = Saturday) keeps the values stable so
# they match the English factor levels used in the weekday boxplot.
weekday_names <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday"
)
dailyActivity_merged <- dailyActivity_merged %>%
  mutate(Weekday = weekday_names[as.POSIXlt(ActivityDate)$wday + 1L])

3 Data Quality Checks

# TRUE when at least one complete row of the table occurs more than once
# (anyDuplicated() gives the index of the first repeat, or 0 if there is none)
any_duplicated <- anyDuplicated(dailyActivity_merged) > 0

# Per-column count of NA entries, returned as a named numeric vector
missing_values <- vapply(
  dailyActivity_merged,
  function(column) sum(is.na(column)),
  numeric(1)
)

4 Feature Engineering

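We create two new features: total active distance (the sum of the very active, moderately active, and lightly active distances) and total active minutes (the sum of the very active, fairly active, and lightly active minutes). No sedentary columns are included in either total.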

# Add two derived columns to the daily table:
#   total_active_dis - sum of the three active-distance columns
#   total_active_min - sum of the three active-minute columns
dailyActivity_merged <- mutate(
  dailyActivity_merged,
  total_active_dis =
    VeryActiveDistance + ModeratelyActiveDistance + LightActiveDistance,
  total_active_min =
    VeryActiveMinutes + FairlyActiveMinutes + LightlyActiveMinutes
)

5 Exploratory Data Analysis (EDA)

5.1 Distribution of Key Variables

# Histogram of TotalSteps, one bar per 1000-wide bin
ggplot(data = dailyActivity_merged, mapping = aes(x = TotalSteps)) +
  geom_histogram(fill = "skyblue", color = "white", binwidth = 1000) +
  ggtitle("Distribution of Total Steps") +
  theme_minimal()

# Histogram of Calories, one bar per 200-wide bin
ggplot(data = dailyActivity_merged, mapping = aes(x = Calories)) +
  geom_histogram(fill = "skyblue", color = "white", binwidth = 200) +
  ggtitle("Distribution of Calories Burned") +
  theme_minimal()

# Histogram of the derived total active distance.
# bins = 30 is the value ggplot2 falls back to when neither `bins` nor
# `binwidth` is supplied; stating it keeps the plot unchanged and removes the
# "`stat_bin()` using `bins = 30`" message printed on every render.
ggplot(dailyActivity_merged, aes(x = total_active_dis)) +
  geom_histogram(bins = 30, color = "white", fill = "orange") +
  theme_minimal() +
  labs(title = "Distribution of Total Active Distance")

# Histogram of the derived total active minutes (same explicit bin count)
ggplot(dailyActivity_merged, aes(x = total_active_min)) +
  geom_histogram(bins = 30, color = "white", fill = "orange") +
  theme_minimal() +
  labs(title = "Distribution of Total Active Minutes")

5.2 Activity by User and Weekday

# One stacked bar per user: total active distance summed within each
# (Id, Weekday) pair, with the weekday shown as the fill colour.
dailyActivity_merged %>%
  group_by(Id, Weekday) %>%
  summarise(total_active_dis = sum(total_active_dis), .groups = "drop") %>%
  # Treat the numeric Id as a category so every user gets a bar of their own
  mutate(Id = factor(Id)) %>%
  ggplot(mapping = aes(x = Id, y = total_active_dis, fill = Weekday)) +
  # geom_col() stacks by default, so no explicit position is needed
  geom_col(width = 0.7) +
  theme_minimal() +
  # User ids are hidden on the axis; only the bar positions remain
  theme(axis.text.x = element_blank()) +
  labs(
    title = "Stacked Total Active Distance by User and Weekday",
    x = "User ID",
    y = "Total Active Distance (km)"
  )

5.3 Time Series of Activity

Mean Active Distance with Standard Error Bars

# Per-date mean of total active distance and its standard error
# (standard deviation divided by the square root of the number of rows
# recorded for that date).
summary_df <- dailyActivity_merged %>%
  group_by(ActivityDate) %>%
  summarise(
    mean_active_dis = mean(total_active_dis),
    se_active_dis = sd(total_active_dis) / sqrt(length(total_active_dis)),
    .groups = "drop"
  )

# Daily means as points, with error bars spanning mean +/- one standard error
ggplot(data = summary_df, mapping = aes(x = ActivityDate, y = mean_active_dis)) +
  geom_point(alpha = 0.7, color = "steelblue") +
  geom_errorbar(
    mapping = aes(
      ymin = mean_active_dis - se_active_dis,
      ymax = mean_active_dis + se_active_dis
    ),
    color = "gray30",
    alpha = 0.8,
    width = 0.2
  ) +
  theme_minimal() +
  ggtitle("Time Series of Mean Active Distance with Standard Error Bars") +
  xlab("Date") +
  ylab("Mean Active Distance (km)")

7 Activity Level Breakdown

# Mean minutes per activity level across all rows, reshaped to long form:
# one row per level with columns ActivityLevel and Minutes.
activity_levels <- dailyActivity_merged %>%
  # Pick the four minute columns and give them their display names up front
  select(
    Sedentary = SedentaryMinutes,
    LightlyActive = LightlyActiveMinutes,
    FairlyActive = FairlyActiveMinutes,
    VeryActive = VeryActiveMinutes
  ) %>%
  summarise(across(everything(), mean)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "ActivityLevel",
    values_to = "Minutes"
  )

# One bar per activity level, coloured by level
ggplot(
  data = activity_levels,
  mapping = aes(x = ActivityLevel, y = Minutes, fill = ActivityLevel)
) +
  geom_col() +
  theme_minimal() +
  labs(y = "Minutes", title = "Average Minutes by Activity Level")

8 Steps by Day of the Week (Boxplot)

# Boxplot of daily steps for each day of the week, ordered Sunday..Saturday.
# The weekday factor is rebuilt from ActivityDate rather than by relabelling
# the existing Weekday strings: those come from weekdays(), which returns
# names in the session locale, so matching them against hard-coded English
# levels turns every value into NA on a non-English system. POSIXlt's wday
# (0 = Sunday ... 6 = Saturday) does not depend on the locale.
dailyActivity_merged %>%
  mutate(
    Weekday = factor(
      as.POSIXlt(ActivityDate)$wday,
      levels = 0:6,
      labels = c(
        "Sunday", "Monday", "Tuesday", "Wednesday",
        "Thursday", "Friday", "Saturday"
      )
    )
  ) %>%
  ggplot(aes(x = Weekday, y = TotalSteps)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Steps by Weekday", x = "Weekday", y = "Total Steps") +
  theme_minimal()

## Cumulative Steps per User Over Time

# Running total of steps for each user, in date order.
# Steps are first summed per (Id, ActivityDate), then accumulated within Id.
cumulative_steps <- dailyActivity_merged %>%
  group_by(Id, ActivityDate) %>%
  summarise(Steps = sum(TotalSteps), .groups = "drop") %>%
  # cumsum() follows row order, so sort by date within user first
  arrange(Id, ActivityDate) %>%
  group_by(Id) %>%
  mutate(CumulativeSteps = cumsum(Steps)) %>%
  # Drop the grouping so later operations on this table are not silently
  # performed per user
  ungroup()

# One line per user; the colour mapping on factor(Id) also defines the lines
ggplot(
  cumulative_steps,
  aes(x = ActivityDate, y = CumulativeSteps, color = factor(Id))
) +
  geom_line() +
  labs(
    title = "Cumulative Steps Over Time by User",
    x = "Date",
    y = "Cumulative Steps"
  ) +
  theme_minimal()

9 Calories Burned vs Steps (Bubble Plot)

# Scatter of Calories against TotalSteps, one point per row of the daily
# table; point size is mapped to VeryActiveMinutes.
ggplot(
  data = dailyActivity_merged,
  mapping = aes(x = TotalSteps, y = Calories, size = VeryActiveMinutes)
) +
  geom_point(color = "tomato", alpha = 0.6) +
  theme_minimal() +
  ggtitle("Calories vs Steps (Size = Very Active Minutes)") +
  xlab("Total Steps") +
  ylab("Calories Burned")

10 Relationship Between Steps and Calories Burned

# Scatter of Calories against TotalSteps with a straight-line (lm) fit and no
# confidence band. `formula = y ~ x` is the formula geom_smooth() would pick on
# its own for method = "lm"; stating it keeps the fit unchanged and removes the
# "`geom_smooth()` using formula" message printed on every render.
ggplot(dailyActivity_merged, aes(x = TotalSteps, y = Calories)) +
  geom_point(alpha = 0.5, color = "tomato") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(title = "Steps vs Calories Burned", x = "Steps", y = "Calories") +
  theme_minimal()

11 Conclusion

This analysis highlights how Bellabeat users’ activity and health metrics vary by day, user, and activity intensity. The visualizations and statistics provide actionable insights to improve user engagement and wellness product design.