First, make sure you downloaded the tidyverse
and are
able to load it. Furthermore, you need to install the
palmerpenguins
package to access the data sets. Load both
packages and check if you have any NAMESPACE
conflicts.
library(tidyverse)
library(palmerpenguins)
My conflicts are stats::filter()
and
stats::lag()
, which should not be an issue.
Have a look at the class and structure of the
penguins_raw
data set. Also, make yourself familiar with
the columns. Since the data set is part of a package, you can also use
the corresponding help page (?
or F1).
# Show which classes the raw data set carries (result echoed on the next line).
class(penguins_raw)
## [1] "tbl_df" "tbl" "data.frame"
# Print a compact per-column overview (type, length, first values).
# NOTE(review): the echoed output below already lists lower-case snake_case
# column names, while its `spec` attribute still shows the original names --
# presumably the output was captured after the renaming step further down
# had been run; confirm.
str(penguins_raw)
## tibble [344 × 17] (S3: tbl_df/tbl/data.frame)
## $ studyname : chr [1:344] "PAL0708" "PAL0708" "PAL0708" "PAL0708" ...
## $ sample_number : num [1:344] 1 2 3 4 5 6 7 8 9 10 ...
## $ species : chr [1:344] "Adelie Penguin (Pygoscelis adeliae)" "Adelie Penguin (Pygoscelis adeliae)" "Adelie Penguin (Pygoscelis adeliae)" "Adelie Penguin (Pygoscelis adeliae)" ...
## $ region : chr [1:344] "Anvers" "Anvers" "Anvers" "Anvers" ...
## $ island : chr [1:344] "Torgersen" "Torgersen" "Torgersen" "Torgersen" ...
## $ stage : chr [1:344] "Adult, 1 Egg Stage" "Adult, 1 Egg Stage" "Adult, 1 Egg Stage" "Adult, 1 Egg Stage" ...
## $ individual_id : chr [1:344] "N1A1" "N1A2" "N2A1" "N2A2" ...
## $ clutch_completion: chr [1:344] "Yes" "Yes" "Yes" "Yes" ...
## $ date_egg : Date[1:344], format: "2007-11-11" "2007-11-11" "2007-11-16" "2007-11-16" ...
## $ culmen_length : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
## $ culmen_depth : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
## $ flipper_length : num [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
## $ body_mass : num [1:344] 3750 3800 3250 NA 3450 ...
## $ sex : chr [1:344] "MALE" "FEMALE" "FEMALE" NA ...
## $ delta_15_n : num [1:344] NA 8.95 8.37 NA 8.77 ...
## $ delta_13_c : num [1:344] NA -24.7 -25.3 NA -25.3 ...
## $ comments : chr [1:344] "Not enough blood for isotopes." NA NA "Adult not sampled." ...
## - attr(*, "spec")=
## .. cols(
## .. studyName = col_character(),
## .. `Sample Number` = col_double(),
## .. Species = col_character(),
## .. Region = col_character(),
## .. Island = col_character(),
## .. Stage = col_character(),
## .. `Individual ID` = col_character(),
## .. `Clutch Completion` = col_character(),
## .. `Date Egg` = col_date(format = ""),
## .. `Culmen Length (mm)` = col_double(),
## .. `Culmen Depth (mm)` = col_double(),
## .. `Flipper Length (mm)` = col_double(),
## .. `Body Mass (g)` = col_double(),
## .. Sex = col_character(),
## .. `Delta 15 N (o/oo)` = col_double(),
## .. `Delta 13 C (o/oo)` = col_double(),
## .. Comments = col_character()
## .. )
# List the column names of the raw data set (result echoed below).
names(penguins_raw)
## [1] "studyname" "sample_number" "species" "region" "island"
## [6] "stage" "individual_id" "clutch_completion" "date_egg" "culmen_length"
## [11] "culmen_depth" "flipper_length" "body_mass" "sex" "delta_15_n"
## [16] "delta_13_c" "comments"
# Open the help page for the data set.
?penguins_raw
Next, for easier data handling, clean the column names by removing all special characters (e.g., brackets, units, …) and replacing all white spaces with an underscore. Last, make sure all column names are either all lower case or all upper case.
# Build the cleaned names: drop bracketed suffixes such as " (mm)", turn the
# remaining spaces into underscores, lower-case everything, then write the
# result back onto the data set.
col_names <- penguins_raw |>
  names() |>
  stringr::str_remove_all(" \\([^()]+\\)") |>
  stringr::str_replace_all(" ", "_") |>
  stringr::str_to_lower()
names(penguins_raw) <- col_names
Now, remove all rows that are missing a stable isotope measurement
(either Delta 15 N or Delta 13 C). Save this into a new
tibble
.
# Keep only the rows in which both isotope columns are present.
penguins_cln <- penguins_raw |>
  tidyr::drop_na(delta_15_n, delta_13_c)
Filter the data set to include only the 50% smallest individuals in
terms of body mass. Select the individual id, species, the culmen
dimensions, and the sex columns. Save this into a new
tibble
called penguins_small
(or something
similar).
# Keep the lighter half of the individuals (body mass at or below the 50 %
# quantile) and only the id, species, culmen and sex columns.
# na.rm = TRUE: quantile() stops with an error when it meets a missing value;
# rows with a missing body mass are dropped by filter() either way.
penguins_small <- penguins_cln |>
  dplyr::filter(
    body_mass <= quantile(body_mass, probs = 0.5, na.rm = TRUE)
  ) |>
  dplyr::select(individual_id, species, tidyselect::starts_with("culmen"), sex)
Create a new column (culmen_class
) in which each male
individual with a culmen length larger than 50 mm is identified by
1
, each female individual with a culmen length larger than
45 mm is identified by 2
, and all other individuals are
identified by 0
.
# Flag long-culmen birds: 1 = male above 50 mm, 2 = female above 45 mm,
# 0 = everything else (including rows where the comparison yields NA).
penguins_small <- penguins_small |>
  dplyr::mutate(
    culmen_class = dplyr::case_when(
      sex == "MALE" & culmen_length > 50 ~ 1,
      sex == "FEMALE" & culmen_length > 45 ~ 2,
      TRUE ~ 0
    )
  )
Calculate the relative number (%) of individuals within each group
and the ratio between the minimum culmen length and depth as well as
between the maximum culmen length and depth. Add a sex_new
column again (culmen_class 1 = "male"
,
culmen_class 2 = "female"
,
culmen_class 0 = "mixed"
). Save the result as
penguings_sum
.
# Per culmen class: share of individuals in percent, the ratio of minimum
# culmen length to minimum depth, the same for the maxima, and a readable
# sex label derived from the class code.
penguings_sum <- penguins_small |>
  dplyr::group_by(culmen_class) |>
  dplyr::summarise(
    n_rel = dplyr::n() / nrow(penguins_small) * 100,
    ratio_min = min(culmen_length) / min(culmen_depth),
    ratio_max = max(culmen_length) / max(culmen_depth)
  ) |>
  dplyr::mutate(
    sex_new = dplyr::case_when(
      culmen_class == 1 ~ "male",
      culmen_class == 2 ~ "female",
      culmen_class == 0 ~ "mixed"
    )
  )
Now, combine penguins
and penguings_sum
to
one tibble
using sex
and sex_new
as ID columns.
# Left join: every row of `penguins` is kept; rows whose sex has no match in
# the summary get NA in the added columns.
penguins |>
  dplyr::left_join(penguings_sum, by = c(sex = "sex_new"))
## # A tibble: 344 × 12
## species island bill_length_mm bill_depth_mm flipper_len…¹ body_…² sex year culme…³ n_rel ratio…⁴ ratio…⁵
## <fct> <fct> <dbl> <dbl> <int> <int> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie Torgersen 39.1 18.7 181 3750 male 2007 1 9.64 2.77 2.75
## 2 Adelie Torgersen 39.5 17.4 186 3800 fema… 2007 2 16.3 2.76 2.99
## 3 Adelie Torgersen 40.3 18 195 3250 fema… 2007 2 16.3 2.76 2.99
## 4 Adelie Torgersen NA NA NA NA <NA> 2007 NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450 fema… 2007 2 16.3 2.76 2.99
## 6 Adelie Torgersen 39.3 20.6 190 3650 male 2007 1 9.64 2.77 2.75
## 7 Adelie Torgersen 38.9 17.8 181 3625 fema… 2007 2 16.3 2.76 2.99
## 8 Adelie Torgersen 39.2 19.6 195 4675 male 2007 1 9.64 2.77 2.75
## 9 Adelie Torgersen 34.1 18.1 193 3475 <NA> 2007 NA NA NA NA
## 10 Adelie Torgersen 42 20.2 190 4250 <NA> 2007 NA NA NA NA
## # … with 334 more rows, and abbreviated variable names ¹flipper_length_mm, ²body_mass_g, ³culmen_class,
## # ⁴ratio_min, ⁵ratio_max
# Right join: every row of `penguings_sum` is kept together with the
# `penguins` rows that match it on sex.
penguins |>
  dplyr::right_join(penguings_sum, by = c(sex = "sex_new"))
## # A tibble: 334 × 12
## species island bill_length_mm bill_depth_mm flipper_len…¹ body_…² sex year culme…³ n_rel ratio…⁴ ratio…⁵
## <fct> <fct> <dbl> <dbl> <int> <int> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie Torgersen 39.1 18.7 181 3750 male 2007 1 9.64 2.77 2.75
## 2 Adelie Torgersen 39.5 17.4 186 3800 fema… 2007 2 16.3 2.76 2.99
## 3 Adelie Torgersen 40.3 18 195 3250 fema… 2007 2 16.3 2.76 2.99
## 4 Adelie Torgersen 36.7 19.3 193 3450 fema… 2007 2 16.3 2.76 2.99
## 5 Adelie Torgersen 39.3 20.6 190 3650 male 2007 1 9.64 2.77 2.75
## 6 Adelie Torgersen 38.9 17.8 181 3625 fema… 2007 2 16.3 2.76 2.99
## 7 Adelie Torgersen 39.2 19.6 195 4675 male 2007 1 9.64 2.77 2.75
## 8 Adelie Torgersen 41.1 17.6 182 3200 fema… 2007 2 16.3 2.76 2.99
## 9 Adelie Torgersen 38.6 21.2 191 3800 male 2007 1 9.64 2.77 2.75
## 10 Adelie Torgersen 34.6 21.1 198 4400 male 2007 1 9.64 2.77 2.75
## # … with 324 more rows, and abbreviated variable names ¹flipper_length_mm, ²body_mass_g, ³culmen_class,
## # ⁴ratio_min, ⁵ratio_max
Reshape the penguins_small
tibble
from wide
to long in a way that the culmen length and depth columns are tidy. The
name of the new column specifying the information should be
fun
, the new column containing the values should be
measurements
. Save the results as
penguins_small_long
tibble
.
# Stack the two culmen columns into long form: `fun` records which dimension
# a row holds and `measurements` holds its value (column names as requested
# in the task description).
penguins_small_long <- penguins_small |>
  tidyr::pivot_longer(
    cols = c(culmen_length, culmen_depth),
    names_to = "fun",
    values_to = "measurements"
  )
Use the map
function to fit a linear model
(flipper_length_mm ~ body_mass_g
) to the penguins data set,
but separated by species. Extract the R squared and p value and save the
results in a data.frame
that additionally includes the
species. (Tip: Have a look at broom::glance
, however, there
are many ways to achieve this).
library(broom)
# Fit flipper_length_mm ~ body_mass_g separately for each species and keep
# only the R squared and the p value of every fit, as the task asks for.
# The combined result is stored in `lm_results` and then printed.
lm_results <- dplyr::group_by(penguins, species) |>
  dplyr::group_split() |>
  purrr::map_dfr(function(i) {
    lm_model <- lm(data = i, flipper_length_mm ~ body_mass_g)
    fit_stats <- broom::glance(lm_model)
    # species is constant within one split, so unique() yields a single label
    data.frame(
      species = unique(i$species),
      r.squared = fit_stats$r.squared,
      p.value = fit_stats$p.value
    )
  })
lm_results
##     species r.squared      p.value
## 1    Adelie 0.2192128 1.343265e-09
## 2 Chinstrap 0.4115985 3.748130e-09
## 3    Gentoo 0.4937402 1.330279e-19