Solution Using the tidyverse

First, make sure you have installed the tidyverse and are able to load it. Furthermore, you need to install the palmerpenguins package to access the data sets. Load both packages and check whether you have any NAMESPACE conflicts.

library(tidyverse)
library(palmerpenguins)

My conflicts are stats::filter() and stats::lag(), which should not be an issue.

Have a look at the class and structure of the penguins_raw data set. Also, make yourself familiar with the columns. Since the data set is part of a package, you can also use the corresponding help page (? or F1).

# Inspect the class vector of the raw data set.
class(penguins_raw)
## [1] "tbl_df"     "tbl"        "data.frame"
# Print the structure: one line per column with its type and first values.
# NOTE(review): the column names printed below are already cleaned, whereas the
# `spec` attribute still lists the original names - presumably this output was
# rendered after the renaming step further down; confirm.
str(penguins_raw)
## tibble [344 × 17] (S3: tbl_df/tbl/data.frame)
##  $ studyname        : chr [1:344] "PAL0708" "PAL0708" "PAL0708" "PAL0708" ...
##  $ sample_number    : num [1:344] 1 2 3 4 5 6 7 8 9 10 ...
##  $ species          : chr [1:344] "Adelie Penguin (Pygoscelis adeliae)" "Adelie Penguin (Pygoscelis adeliae)" "Adelie Penguin (Pygoscelis adeliae)" "Adelie Penguin (Pygoscelis adeliae)" ...
##  $ region           : chr [1:344] "Anvers" "Anvers" "Anvers" "Anvers" ...
##  $ island           : chr [1:344] "Torgersen" "Torgersen" "Torgersen" "Torgersen" ...
##  $ stage            : chr [1:344] "Adult, 1 Egg Stage" "Adult, 1 Egg Stage" "Adult, 1 Egg Stage" "Adult, 1 Egg Stage" ...
##  $ individual_id    : chr [1:344] "N1A1" "N1A2" "N2A1" "N2A2" ...
##  $ clutch_completion: chr [1:344] "Yes" "Yes" "Yes" "Yes" ...
##  $ date_egg         : Date[1:344], format: "2007-11-11" "2007-11-11" "2007-11-16" "2007-11-16" ...
##  $ culmen_length    : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
##  $ culmen_depth     : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
##  $ flipper_length   : num [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
##  $ body_mass        : num [1:344] 3750 3800 3250 NA 3450 ...
##  $ sex              : chr [1:344] "MALE" "FEMALE" "FEMALE" NA ...
##  $ delta_15_n       : num [1:344] NA 8.95 8.37 NA 8.77 ...
##  $ delta_13_c       : num [1:344] NA -24.7 -25.3 NA -25.3 ...
##  $ comments         : chr [1:344] "Not enough blood for isotopes." NA NA "Adult not sampled." ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   studyName = col_character(),
##   ..   `Sample Number` = col_double(),
##   ..   Species = col_character(),
##   ..   Region = col_character(),
##   ..   Island = col_character(),
##   ..   Stage = col_character(),
##   ..   `Individual ID` = col_character(),
##   ..   `Clutch Completion` = col_character(),
##   ..   `Date Egg` = col_date(format = ""),
##   ..   `Culmen Length (mm)` = col_double(),
##   ..   `Culmen Depth (mm)` = col_double(),
##   ..   `Flipper Length (mm)` = col_double(),
##   ..   `Body Mass (g)` = col_double(),
##   ..   Sex = col_character(),
##   ..   `Delta 15 N (o/oo)` = col_double(),
##   ..   `Delta 13 C (o/oo)` = col_double(),
##   ..   Comments = col_character()
##   .. )
# List the column names of the data set.
names(penguins_raw)
##  [1] "studyname"         "sample_number"     "species"           "region"            "island"           
##  [6] "stage"             "individual_id"     "clutch_completion" "date_egg"          "culmen_length"    
## [11] "culmen_depth"      "flipper_length"    "body_mass"         "sex"               "delta_15_n"       
## [16] "delta_13_c"        "comments"
# Open the help page that documents the data set.
?penguins_raw

Next, for easier data handling, clean the column names by removing all special characters (e.g., brackets, units, …) and replacing all white spaces with an underscore. Last, make sure all column names are either all lower case or all upper case.

# Build the cleaned names in three steps: strip bracketed suffixes such as
# " (mm)", swap every blank for an underscore, and lower-case the result.
col_names <- penguins_raw |>
  names() |>
  stringr::str_remove_all(pattern = " \\([^()]+\\)") |>
  stringr::str_replace_all(pattern = " ", replacement = "_") |>
  stringr::str_to_lower()

# Overwrite the column names of the data set with the cleaned version.
names(penguins_raw) <- col_names

Now, remove all rows that are missing a stable isotope measurement (either Delta 15 N or Delta 13 C). Save the result in a new tibble.

# Keep only the rows in which both isotope columns hold a value.
penguins_cln <- penguins_raw |>
  tidyr::drop_na(delta_15_n, delta_13_c)

Filter the data set to include only the 50% smallest individuals in terms of body mass. Select the individual id, species, the culmen dimensions, and the sex columns. Save this into a new tibble called penguins_small (or something similar).

# Keep the lighter half of the individuals (body mass at or below the median)
# and reduce the tibble to the id, species, culmen and sex columns.
# `na.rm = TRUE` keeps quantile() from aborting should any body mass be
# missing; rows with a missing body mass are dropped by filter() either way.
penguins_small <- penguins_cln |>
  dplyr::filter(body_mass <= quantile(body_mass, probs = 0.5, na.rm = TRUE)) |>
  dplyr::select(individual_id, species, tidyselect::starts_with("culmen"), sex)

Create a new column (culmen_class) in which each male individual with a culmen length larger than 50 mm is identified by 1, each female individual with a culmen length larger than 45 mm is identified by 2, and all other individuals are identified by 0.

# Encode the culmen class: 1 = male with culmen length above 50 mm,
# 2 = female with culmen length above 45 mm, 0 = every other individual.
penguins_small <- penguins_small |>
  dplyr::mutate(
    culmen_class = dplyr::case_when(
      sex == "MALE" & culmen_length > 50 ~ 1,
      sex == "FEMALE" & culmen_length > 45 ~ 2,
      TRUE ~ 0
    )
  )

Calculate the relative number (%) of individuals within each group and the ratio between the minimum culmen length and depth as well as between the maximum culmen length and depth. Add a sex_new column again (culmen_class 1 = "male", culmen_class 2 = "female", culmen_class 0 = "mixed"). Save the result as penguings_sum.

# Summarise per culmen class: share of individuals in percent and the
# length/depth ratios of the smallest and the largest culmen measurements.
penguings_sum <- penguins_small |>
  dplyr::group_by(culmen_class) |>
  dplyr::summarise(
    n_rel = dplyr::n() / nrow(penguins_small) * 100,
    ratio_min = min(culmen_length) / min(culmen_depth),
    ratio_max = max(culmen_length) / max(culmen_depth)
  ) |>
  # Translate the numeric class back into a label.
  dplyr::mutate(
    sex_new = dplyr::case_when(
      culmen_class == 1 ~ "male",
      culmen_class == 2 ~ "female",
      culmen_class == 0 ~ "mixed"
    )
  )

Now, combine penguins and penguings_sum into one tibble using sex and sex_new as ID columns.

# Left join: every row of `penguins` is kept; rows whose `sex` has no matching
# `sex_new` get NA in the summary columns.
penguins |>
  dplyr::left_join(penguings_sum, by = c("sex" = "sex_new"))
## # A tibble: 344 × 12
##    species island    bill_length_mm bill_depth_mm flipper_len…¹ body_…² sex    year culme…³ n_rel ratio…⁴ ratio…⁵
##    <fct>   <fct>              <dbl>         <dbl>         <int>   <int> <chr> <int>   <dbl> <dbl>   <dbl>   <dbl>
##  1 Adelie  Torgersen           39.1          18.7           181    3750 male   2007       1  9.64    2.77    2.75
##  2 Adelie  Torgersen           39.5          17.4           186    3800 fema…  2007       2 16.3     2.76    2.99
##  3 Adelie  Torgersen           40.3          18             195    3250 fema…  2007       2 16.3     2.76    2.99
##  4 Adelie  Torgersen           NA            NA              NA      NA <NA>   2007      NA NA      NA      NA   
##  5 Adelie  Torgersen           36.7          19.3           193    3450 fema…  2007       2 16.3     2.76    2.99
##  6 Adelie  Torgersen           39.3          20.6           190    3650 male   2007       1  9.64    2.77    2.75
##  7 Adelie  Torgersen           38.9          17.8           181    3625 fema…  2007       2 16.3     2.76    2.99
##  8 Adelie  Torgersen           39.2          19.6           195    4675 male   2007       1  9.64    2.77    2.75
##  9 Adelie  Torgersen           34.1          18.1           193    3475 <NA>   2007      NA NA      NA      NA   
## 10 Adelie  Torgersen           42            20.2           190    4250 <NA>   2007      NA NA      NA      NA   
## # … with 334 more rows, and abbreviated variable names ¹flipper_length_mm, ²body_mass_g, ³culmen_class,
## #   ⁴ratio_min, ⁵ratio_max
# Right join: every row of `penguings_sum` is kept; rows of `penguins` without
# a matching `sex_new` are dropped.
penguins |>
  dplyr::right_join(penguings_sum, by = c("sex" = "sex_new"))
## # A tibble: 334 × 12
##    species island    bill_length_mm bill_depth_mm flipper_len…¹ body_…² sex    year culme…³ n_rel ratio…⁴ ratio…⁵
##    <fct>   <fct>              <dbl>         <dbl>         <int>   <int> <chr> <int>   <dbl> <dbl>   <dbl>   <dbl>
##  1 Adelie  Torgersen           39.1          18.7           181    3750 male   2007       1  9.64    2.77    2.75
##  2 Adelie  Torgersen           39.5          17.4           186    3800 fema…  2007       2 16.3     2.76    2.99
##  3 Adelie  Torgersen           40.3          18             195    3250 fema…  2007       2 16.3     2.76    2.99
##  4 Adelie  Torgersen           36.7          19.3           193    3450 fema…  2007       2 16.3     2.76    2.99
##  5 Adelie  Torgersen           39.3          20.6           190    3650 male   2007       1  9.64    2.77    2.75
##  6 Adelie  Torgersen           38.9          17.8           181    3625 fema…  2007       2 16.3     2.76    2.99
##  7 Adelie  Torgersen           39.2          19.6           195    4675 male   2007       1  9.64    2.77    2.75
##  8 Adelie  Torgersen           41.1          17.6           182    3200 fema…  2007       2 16.3     2.76    2.99
##  9 Adelie  Torgersen           38.6          21.2           191    3800 male   2007       1  9.64    2.77    2.75
## 10 Adelie  Torgersen           34.6          21.1           198    4400 male   2007       1  9.64    2.77    2.75
## # … with 324 more rows, and abbreviated variable names ¹flipper_length_mm, ²body_mass_g, ³culmen_class,
## #   ⁴ratio_min, ⁵ratio_max

Reshape the penguins_small tibble from wide to long in a way that the culmen length and depth columns are tidy. The name of the new column specifying the information should be fun, the new column containing the values should be measurements. Save the results as penguins_small_long tibble.

# Stack the two culmen columns: `fun` records which dimension a row holds and
# `measurements` holds its value, as requested in the task description.
penguins_small_long <- penguins_small |>
  tidyr::pivot_longer(
    cols = c(culmen_length, culmen_depth),
    names_to = "fun",
    values_to = "measurements"
  )

Use the map function to fit a linear model (flipper_length_mm ~ body_mass_g) to the penguins data set, but separated by species. Extract the R squared and p value and save the results in a data.frame that additionally includes the species. (Tip: Have a look at broom::glance; however, there are many ways to achieve this.)

library(broom)

# Fit one linear model per species and stack the one-row model summaries.
penguins |>
  dplyr::group_by(species) |>
  dplyr::group_split() |>
  purrr::map_dfr(\(species_data) {
    fit <- lm(flipper_length_mm ~ body_mass_g, data = species_data)
    # Prepend the species so each summary row can be identified.
    cbind(species = unique(species_data$species), broom::glance(fit))
  })
##     species r.squared adj.r.squared    sigma statistic      p.value df    logLik      AIC      BIC deviance
## 1    Adelie 0.2192128     0.2139726 5.797764  41.83305 1.343265e-09  1 -478.6314 963.2627 972.3146 5008.496
## 2 Chinstrap 0.4115985     0.4026833 5.511975  46.16830 3.748130e-09  1 -211.5436 429.0872 435.7457 2005.203
## 3    Gentoo 0.4937402     0.4895563 4.633213 118.00774 1.330279e-19  1 -362.1110 730.2221 738.6587 2597.467
##   df.residual nobs
## 1         149  151
## 2          66   68
## 3         121  123

Solution Using the tidyverse

Maximilian H.K. Hesselbarth

2022/10/24