Before we start
Overview
Questions
- What are R and RStudio?
- What is a working directory?
- How should files be set up to import into R?
- How can I look for help with R functions?
Objectives
- Explain what R and RStudio are, what they are used for, and how they relate to each other.
- Describe the purpose of the RStudio Script, Console, Environment, and Plots panes.
- Organize files and directories for a set of analyses as an R Project, and understand the purpose of the working directory.
- Use the built-in RStudio help interface to search for more information on R functions.
- Demonstrate how to provide sufficient information for troubleshooting with the R user community.
What is R? What is RStudio?
The term “R” is used to refer to both the programming language and the software that interprets the scripts written using it.
RStudio is a popular way to write R scripts and interact with the R software. To function correctly, RStudio needs R and therefore both need to be installed on your computer.
Why learn R?
R does not involve lots of pointing and clicking, and that’s a good thing
In R, the results of your analysis rely on a series of written commands, and not on remembering a succession of pointing and clicking. That is a good thing! So, if you want to redo your analysis because you collected more data, you don’t have to remember which button you clicked in which order to obtain your results. With a stored series of commands in an R script, you can simply re-run them, and R will process the new dataset exactly the same way as before.
Working with scripts makes the steps you used in your analysis clear, and the code you write can be inspected by someone else who can give you feedback and spot mistakes.
Working with scripts forces you to have a deeper understanding of what you are doing, and facilitates your learning and comprehension of the methods you use.
R code is great for reproducibility
Reproducibility is when someone else, including your future self, can obtain the same results from the same dataset when using the same analysis.
R integrates with other tools to generate manuscripts from your code. If you collect more data, or fix a mistake in your dataset, the figures and the statistical tests in your manuscript are updated automatically.
R is widely used in academia and in industries such as pharma and biotech. These organisations expect analyses to be reproducible, so knowing R will give you an edge with these requirements.
R is interdisciplinary and extensible
With 10,000+ packages that can be installed to extend its capabilities, R provides a framework that allows you to combine statistical approaches from many scientific disciplines to best suit the analytical framework you need to analyze your data. For instance, R has packages for image analysis, GIS, time series, population genetics, and a lot more.
R works on data of all shapes and sizes
The skills you learn with R scale easily with the size of your dataset. Whether your dataset has hundreds or millions of lines, it won’t make much difference to you.
R is designed for data analysis. It comes with special data structures and data types that make handling of missing data and statistical factors convenient.
R can connect to spreadsheets, databases, and many other data formats, on your computer or on the web.
R produces high-quality graphics
The plotting functionalities in R are endless, and allow you to adjust any aspect of your graph to visualize your data more effectively.
R has a large and welcoming community
Thousands of people use R daily. Many of them are willing to help you through mailing lists and websites such as Stack Overflow, RStudio community, and Slack channels such as the R for Data Science online community (https://www.rfordatasci.com/). In addition, there are numerous online and in person meetups organised globally through organisations such as R Ladies Global (https://rladies.org/).
Knowing your way around RStudio
Let’s start by learning about RStudio, which is an Integrated Development Environment (IDE) for working with R.
The RStudio IDE open-source product is free under the Affero General Public License (AGPL) v3. The RStudio IDE is also available with a commercial license and priority email support from RStudio, PBC.
We will use RStudio IDE to write code, navigate the files on our computer, inspect the variables we are going to create, and visualize the plots we will generate. RStudio can also be used for other things (e.g., version control, developing packages, writing Shiny apps) that we will not cover during the workshop.
RStudio is divided into 4 “panes”:
- The Source for your scripts and documents (top-left, in the default layout)
- Your Environment/History (top-right) which shows all the objects in your working space (Environment) and your command history (History)
- Your Files/Plots/Packages/Help/Viewer (bottom-right)
- The R Console (bottom-left)
The placement of these panes and their content can be customized (see menu, Tools -> Global Options -> Pane Layout). For ease of use, settings such as background color, font color, font size, and zoom level can also be adjusted in this menu (Global Options -> Appearance).
One of the advantages of using RStudio is that all the information you need to write code is available in a single window. Additionally, with many shortcuts, autocompletion, and highlighting for the major file types you use while developing in R, RStudio will make typing easier and less error-prone.
Getting set up
It is good practice to keep a set of related data, analyses, and text self-contained in a single folder, called the working directory. All of the scripts within this folder can then use relative paths to files that indicate where inside the project a file is located (as opposed to absolute paths, which point to where a file is on a specific computer). Working this way allows you to move your project around on your computer and share it with others without worrying about whether or not the underlying scripts will still work.
RStudio provides a helpful set of tools to do this through its “Projects” interface, which not only creates a working directory for you, but also remembers its location (allowing you to quickly navigate to it) and optionally preserves custom settings and open files, making it easier to resume work after a break. Go through the steps for creating an “R Project” for this tutorial below.
- Start RStudio.
- Under the `File` menu, click on `New Project`. Choose `New Directory`, then `New Project`.
- Enter a name for this new folder (or “directory”), and choose a convenient location for it. This will be your working directory for the rest of the day (e.g., `~/data-carpentry`).
- Click on `Create Project`.
- Download the code handout, place it in your working directory and rename it (e.g., `data-carpentry-script.R`).
- (Optional) Set Preferences to ‘Never’ save workspace in RStudio.
A workspace is your current working environment in R, which includes any user-defined objects. By default, all of these objects will be saved, and automatically loaded, when you reopen your project. Saving a workspace to `.RData` can be cumbersome, especially if you are working with larger datasets, and it can lead to hard-to-debug errors caused by objects in memory you forgot you had. Therefore, it is often a good idea to turn this off. To do so, go to Tools –> ‘Global Options’ and select the ‘Never’ option for ‘Save workspace to .RData on exit’.
Organizing your working directory
Using a consistent folder structure across your projects will help keep things organized, and will help you to find/file things in the future. This can be especially helpful when you have multiple projects. In general, you may create directories (folders) for scripts, data, and documents.
- `data_raw/` & `data/`: Use these folders to store raw data and intermediate datasets you may create for the need of a particular analysis. For the sake of transparency and provenance, you should always keep a copy of your raw data accessible and do as much of your data cleanup and preprocessing programmatically (i.e., with scripts, rather than manually) as possible. Separating raw data from processed data is also a good idea. For example, you could have files `data_raw/tree_survey.plot1.txt` and `...plot2.txt` kept separate from a `data/tree.survey.csv` file generated by the `scripts/01.preprocess.tree_survey.R` script.
- `documents/`: This would be a place to keep outlines, drafts, and other text.
- `scripts/`: This would be the location to keep your R scripts for different analyses or plotting, and potentially a separate folder for your functions (more on that later).
- Additional (sub)directories depending on your project needs.
For this workshop, we will need a `data_raw/` folder to store our raw data, a `data/` folder for when we learn how to export data as CSV files, and a `fig/` folder for the figures that we will save.
- Under the `Files` tab on the right of the screen, click on `New Folder` and create a folder named `data_raw` within your newly created working directory (e.g., `~/data-carpentry/`). (Alternatively, type `dir.create("data_raw")` at your R console.) Repeat these operations to create a `data` and a `fig` folder.
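If you prefer the console route, a minimal sketch that creates all three folders with `dir.create()`:

R
# create the three folders used in this workshop
dir.create("data_raw")
dir.create("data")
dir.create("fig")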
We are going to keep the script in the root of our working directory because we are only going to use one file. Later, when you start creating more complex projects, it might make sense to organize scripts in sub-directories.
Your working directory should now contain your script file and the `data_raw/`, `data/`, and `fig/` folders.
The working directory
The working directory is an important concept to understand. It is the place where R looks for the files it reads and where it saves the files it writes. When you write code for your project, it should refer to files relative to the root of your working directory and only need files within this structure.
RStudio assists you in this regard and sets the working directory automatically to the directory where you have placed your project. If you need to check it, you can use `getwd()`. If for some reason your working directory is not what it should be, you can change it in the RStudio interface by navigating in the file browser to where your working directory should be, clicking on the blue gear icon “More”, and selecting “Set As Working Directory”. Alternatively, you can use `setwd("/path/to/working/directory")` to reset your working directory. However, your scripts should not include this line, because it will fail on someone else’s computer.
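For example, to check (and, interactively only, change) the working directory:

R
getwd()                       # print the current working directory
# setwd("~/data-carpentry")   # interactive use only -- avoid this line in scripts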
Interacting with R
The basis of programming is that we write down instructions for the computer to follow, and then we tell the computer to follow those instructions. We write, or code, instructions in R because it is a common language that both the computer and we can understand. We call the instructions commands and we tell the computer to follow the instructions by executing (also called running) those commands.
There are two main ways of interacting with R: by using the console or by using script files (plain text files that contain your code). The console pane (in RStudio, the bottom-left panel) is the place where commands written in the R language can be typed and executed immediately by the computer. It is also where the results will be shown for commands that have been executed. You can type commands directly into the console and press Enter to execute them, but they will be forgotten when you close the session.
Because we want our code and workflow to be reproducible, it is better to type the commands we want in the script editor, and save the script. This way, there is a complete record of what we did, and anyone (including our future selves!) can easily replicate the results on their computer.
RStudio allows you to execute commands directly from the script editor by using the Ctrl + Enter shortcut (on Macs, Cmd + Return will work, too). The command on the current line in the script (indicated by the cursor) or all of the commands in the currently selected text will be sent to the console and executed when you press Ctrl + Enter. You can find other keyboard shortcuts in this RStudio cheatsheet about the RStudio IDE.
At some point in your analysis you may want to check the content of a variable or the structure of an object, without necessarily keeping a record of it in your script. You can type these commands and execute them directly in the console. RStudio provides the Ctrl + 1 and Ctrl + 2 shortcuts, which allow you to jump between the script and the console panes.
If R is ready to accept commands, the R console shows a `>` prompt. If it receives a command (by typing, copy-pasting, or being sent from the script editor using Ctrl + Enter), R will try to execute it and, when ready, will show the results and come back with a new `>` prompt to wait for new commands.
If R is still waiting for you to enter more input because the command isn’t complete yet, the console will show a `+` prompt. It means that you haven’t finished entering a complete command. This is because you have not ‘closed’ a parenthesis or quotation, i.e. you don’t have the same number of left parentheses as right parentheses, or the same number of opening and closing quotation marks. When this happens, and you thought you had finished typing your command, click inside the console window and press Esc; this will cancel the incomplete command and return you to the `>` prompt.
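For example, sending an unclosed call leaves the console waiting (a sketch of the resulting transcript, shown as comments):

R
# > round(3.14159    # the ")" is missing, so R answers with a "+" prompt
# +
# Press Esc to cancel it, or type the missing ")" to complete the call:
round(3.14159)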
Seeking help
Searching function documentation with `?` and `??`

If you need help with a specific function, let’s say `mean()`, you can type `?mean` or press F1 while your cursor is on the function name. If you are looking for a function to do a particular task, but don’t know the function name, you can use the double question mark `??`, for example `??kruskall`. Both commands will open matching help files in RStudio’s help panel in the lower right corner. You can also use the help panel to search help directly.
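Both forms can be run straight from the console:

R
?mean        # help page for a function you know by name
??kruskall   # search installed documentation for a keyword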
Automatic code completion
When you write code in RStudio, you can use its automatic code completion to remind yourself of a function’s name or arguments. Start typing the function name and pay attention to the suggestions that pop up. Use the up and down arrows to select a suggested code completion and Tab to apply it. You can also use code completion to complete functions’ argument names, object names, and file names. It even works if you don’t get the spelling 100% correct.
Package vignettes and cheat sheets
In addition to the documentation for individual functions, many packages have vignettes – instructions for how to use the package to do certain tasks. Vignettes are great for learning by example. Vignettes are accessible via the package help and by using the function `browseVignettes()`.
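For example, assuming the dplyr package (installed later in this lesson as part of the tidyverse) is available:

R
browseVignettes("dplyr")   # list the vignettes shipped with a package
vignette("dplyr")          # open a specific vignette by name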
There is also a Help menu at the top of the RStudio window, that has cheat sheets for popular packages, RStudio keyboard shortcuts, and more.
Finding more functions and packages
RStudio’s help only searches the packages that you have installed on your machine, but there are many more available on CRAN and GitHub. To search across all available R packages, you can use the website rdocumentation.org. Often, a generic Google or internet search “R <task>” will send you to the appropriate package documentation or a forum where someone else has already asked your question. Many packages also have websites with additional help, tutorials, news and more (for example tidyverse.org).
Dealing with error messages
Don’t get discouraged if your code doesn’t run immediately! Error messages are common when programming, and fixing errors is part of any programmer’s daily work. Often, the problem is a small typo in a variable name or a missing parenthesis. Watch for the red x’s next to your code in RStudio. These may provide helpful hints about the source of the problem.
If you can’t fix an error yourself, start by googling it. Some error messages are too generic to diagnose a problem (e.g. “subscript out of bounds”). In that case it might help to include the name of the function or package you’re using in your query.
Asking for help
If your Google search is unsuccessful, you may want to ask other R users for help. There are different places where you can ask for help. During this workshop, don’t hesitate to talk to your neighbor, compare your answers, and ask for help. You might also be interested in organizing regular meetings following the workshop to keep learning from each other. If you have a friend or colleague with more experience than you, they might also be able and willing to help you.
Besides that, there are a few places on the internet that provide help:
- Stack Overflow: Many questions have already been answered, but the challenge is to use the right words in your search to find them. If your question hasn’t been answered before and is well crafted, chances are you will get an answer in less than 5 min. Remember to follow their guidelines on how to ask a good question.
- The R-help mailing list: it is used by a lot of people (including most of the R core team). If your question is valid (read its Posting Guide), you are likely to get an answer very fast, but the tone can be pretty dry and it is not always very welcoming to new users.
- If your question is about a specific package rather than a base R function, see if there is a mailing list for the package. Usually it’s included in the DESCRIPTION file of the package, which can be accessed using `packageDescription("<package-name>")`.
- You can also try to contact the package author directly, by emailing them or opening an issue on the code repository (e.g., on GitHub).
- There are also some topic-specific mailing lists (GIS, phylogenetics, etc…). The complete list is on the R mailing lists website.
The key to receiving help from someone is for them to rapidly grasp your problem. Thus, you should be as precise as possible when describing your problem and help others to pinpoint where the issue might be. Try to…
- Use the correct words to describe your problem. Otherwise, you might get an answer pointing to the misuse of your words rather than answering your question.
- Generalize what you are trying to do, so people outside your field can understand the question.
- Reduce what does not work to a simple reproducible example. For instance, instead of using your real data set, create a small generic one. For more information on how to write a reproducible example, see this article from the reprex package. Learning how to use the reprex package is also very helpful for this.
- Include the output of `sessionInfo()` in your question. It provides information about your platform, the versions of R, and the packages that you are using. As an example, here you can see the versions of R and all the packages that we are using to run the code in this lesson:
R
sessionInfo()
OUTPUT
#> R version 4.4.2 (2024-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 22.04.5 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
#>
#> locale:
#> [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
#> [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
#> [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
#> [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] RSQLite_2.3.7 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
#> [5] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
#> [9] tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0 knitr_1.47
#>
#> loaded via a namespace (and not attached):
#> [1] bit_4.0.5 gtable_0.3.5 compiler_4.4.2 renv_1.0.11
#> [5] highr_0.11 tidyselect_1.2.1 blob_1.2.4 scales_1.3.0
#> [9] fastmap_1.2.0 yaml_2.3.8 R6_2.5.1 generics_0.1.3
#> [13] munsell_0.5.1 DBI_1.2.3 pillar_1.9.0 tzdb_0.4.0
#> [17] rlang_1.1.3 utf8_1.2.4 cachem_1.1.0 stringi_1.8.4
#> [21] xfun_0.44 bit64_4.0.5 memoise_2.0.1 timechange_0.3.0
#> [25] cli_3.6.2 withr_3.0.0 magrittr_2.0.3 grid_4.4.2
#> [29] hms_1.1.3 lifecycle_1.0.4 vctrs_0.6.5 evaluate_0.23
#> [33] glue_1.7.0 fansi_1.0.6 colorspace_2.1-0 tools_4.4.2
#> [37] pkgconfig_2.0.3
How to learn more after the workshop?
The material we cover during this workshop will give you a taste of how you can use R to analyze data for your own research. However, to do advanced operations such as cleaning your dataset, using statistical methods, or creating beautiful graphics you will need to learn more.
The best way to become proficient and efficient at R, as with any other tool, is to use it to address your actual research questions. As a beginner, it can feel daunting to have to write a script from scratch, but given that many people make their code available online, modifying existing code to suit your purpose is a good way to get first hands-on experience using R for your own work, and to become comfortable eventually creating your own scripts.
More resources
More about R
- The Introduction to R can also be dense for people with little programming experience but it is a good place to understand the underpinnings of the R language.
- The R FAQ is dense and technical but it is full of useful information.
- To stay up to date, follow `#rstats` on Twitter. Twitter can also be a way to get questions answered and learn about useful R packages and tips (e.g., @RLangTips).
How to ask good programming questions?
- The rOpenSci community call “How to ask questions so they get answered”, (rOpenSci site and video recording) includes a presentation of the reprex package and of its philosophy.
- blog.Revolutionanalytics.com and this blog post by Jon Skeet have comprehensive advice on how to ask programming questions.
Key Points
- R is a programming language and RStudio is the IDE that assists in using R.
- There are many benefits to learning R, including writing reproducible code, the ability to work with a variety of datasets, and a broad, open-source community of practitioners.
- Files related to analysis should be organized within a single working directory.
- R uses commands containing functions to tell the computer what to do.
- Documentation for each function is available within RStudio, or users can ask for help from one of many online forums, cheatsheets, or email lists.
Introduction to R
Overview
Questions
- How do you create objects in R?
- How do you save R code for later use?
- How do you manipulate data in R?
Objectives
- Define the following terms as they relate to R: object, assign, call, function, arguments, options.
- Create objects and assign values to them in R.
- Learn how to name objects.
- Save a script file for later use.
- Use comments to inform script.
- Solve simple arithmetic operations in R.
- Call functions and use arguments to change their default options.
- Inspect the content of vectors and manipulate their content.
- Subset and extract values from vectors.
- Analyze vectors with missing data.
Creating objects in R
You can get output from R simply by typing math in the console:
R
3 + 5
12 / 7
However, to do useful and interesting things, we need to assign values to objects. To create an object, we need to give it a name followed by the assignment operator `<-`, and the value we want to give it:
R
weight_kg <- 55
`<-` is the assignment operator we will use in this course. It assigns values on the right to objects on the left. So, after executing `x <- 3`, the value of `x` is `3`. For historical reasons, you can also use `=` for assignments, but not in every context. Because of the slight differences in syntax, it is good practice to always use `<-` for assignments.
In RStudio, typing Alt + - (push Alt at the same time as the - key) will write `<-` in a single keystroke on a PC, while typing Option + - (push Option at the same time as the - key) does the same on a Mac.
Objects can be given almost any name such as `x`, `current_temperature`, or `subject_id`. Here are some further guidelines on naming objects:
- You want your object names to be explicit and not too long.
- They cannot start with a number (`2x` is not valid, but `x2` is).
- R is case sensitive, so for example, `weight_kg` is different from `Weight_kg`.
- There are some names that cannot be used because they are the names of fundamental functions in R (e.g., `if`, `else`, `for`; see here for a complete list). In general, even if it’s allowed, it’s best to not use other function names (e.g., `c`, `T`, `mean`, `data`, `df`, `weights`). If in doubt, check the help to see if the name is already in use.
- It’s best to avoid dots (`.`) within names. Many function names in R itself have them and dots also have a special meaning (methods) in R and other programming languages. To avoid confusion, don’t include dots in names.
- It is recommended to use nouns for object names and verbs for function names.
- Be consistent in the styling of your code, such as where you put spaces, how you name objects, etc. Styles can include “lower_snake”, “UPPER_SNAKE”, “lowerCamelCase”, “UpperCamelCase”, etc. Using a consistent coding style makes your code clearer to read for your future self and your collaborators. In R, three popular style guides come from Google, Jean Fan, and the tidyverse. The tidyverse style is very comprehensive and may seem overwhelming at first. You can install the lintr package to automatically check for issues in the styling of your code.
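A few quick illustrations of these rules:

R
x2 <- 3           # valid name
# 2x <- 3         # invalid: names cannot start with a number
weight_kg <- 55   # explicit lower_snake name
Weight_kg <- 60   # a *different* object, because R is case sensitive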
Objects vs. variables
What are known as objects in R are known as variables in many other programming languages. Depending on the context, object and variable can have drastically different meanings. However, in this lesson, the two words are used synonymously. For more information see: https://cran.r-project.org/doc/manuals/r-release/R-lang.html#Objects
When assigning a value to an object, R does not print anything. You can force R to print the value by using parentheses or by typing the object name:
R
weight_kg <- 55 # doesn't print anything
(weight_kg <- 55) # but putting parentheses around the call prints the value of `weight_kg`
weight_kg # and so does typing the name of the object
Now that R has `weight_kg` in memory, we can do arithmetic with it. For instance, we may want to convert this weight into pounds (weight in pounds is 2.2 times the weight in kg):
R
2.2 * weight_kg
We can also change an object’s value by assigning it a new one:
R
weight_kg <- 57.5
2.2 * weight_kg
This means that assigning a value to one object does not change the values of other objects. For example, let’s store the animal’s weight in pounds in a new object, `weight_lb`:
R
weight_lb <- 2.2 * weight_kg
and then change `weight_kg` to 100.
R
weight_kg <- 100
What do you think is the current content of the object `weight_lb`? 126.5 or 220?
Saving your code
Up to now, your code has been in the console. This is useful for quick queries but not so helpful if you want to revisit your work for any reason. A script can be opened by pressing Ctrl + Shift + N. It is wise to save your script file immediately. To do this press Ctrl + S. This will open a dialogue box where you can decide where to save your script file, and what to name it. The `.R` file extension is added automatically and ensures your file will open with RStudio.
Don’t forget to save your work periodically by pressing Ctrl + S.
Comments
The comment character in R is `#`. Anything to the right of a `#` in a script will be ignored by R. It is useful to leave notes and explanations in your scripts. For convenience, RStudio provides a keyboard shortcut to comment or uncomment a paragraph: after selecting the lines you want to comment, press at the same time on your keyboard Ctrl + Shift + C. If you only want to comment out one line, you can put the cursor at any location of that line (i.e. no need to select the whole line), then press Ctrl + Shift + C.
Challenge
What are the values after each statement in the following?
R
mass <- 47.5 # mass?
age <- 122 # age?
mass <- mass * 2.0 # mass?
age <- age - 20 # age?
mass_index <- mass/age # mass_index?
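For reference, tracking each assignment in order gives the following values (shown as comments):

R
mass <- 47.5            # mass is 47.5
age <- 122              # age is 122
mass <- mass * 2.0      # mass is now 95
age <- age - 20         # age is now 102
mass_index <- mass/age  # mass_index is 95/102, about 0.93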
Functions and their arguments
Functions are “canned scripts” that automate more complicated sets of commands, including operations, assignments, etc. Many functions are predefined, or can be made available by importing R packages (more on that later). A function usually takes one or more inputs called arguments. Functions often (but not always) return a value. A typical example would be the function `sqrt()`. The input (the argument) must be a number, and the return value (in fact, the output) is the square root of that number. Executing a function (‘running it’) is called calling the function. An example of a function call is:
R
weight_kg <- sqrt(10)
Here, the value of 10 is given to the `sqrt()` function, the `sqrt()` function calculates the square root, and returns the value, which is then assigned to the object `weight_kg`. This function takes one argument; other functions might take several.
The return ‘value’ of a function need not be numerical (like that of `sqrt()`), and it also does not need to be a single item: it can be a set of things, or even a dataset. We’ll see that when we read data files into R.
Arguments can be anything, not only numbers or filenames, but also other objects. Exactly what each argument means differs per function, and must be looked up in the documentation (see below). Some functions take arguments which may either be specified by the user, or, if left out, take on a default value: these are called options. Options are typically used to alter the way the function operates, such as whether it ignores ‘bad values’, or what symbol to use in a plot. However, if you want something specific, you can specify a value of your choice which will be used instead of the default.
Let’s try a function that can take multiple arguments: `round()`.
R
round(3.14159)
OUTPUT
#> [1] 3
Here, we’ve called `round()` with just one argument, `3.14159`, and it has returned the value `3`. That’s because the default is to round to the nearest whole number. If we want more digits we can see how to do that by getting information about the `round` function. We can use `args(round)` to find what arguments it takes, or look at the help for this function using `?round`.
R
args(round)
OUTPUT
#> function (x, digits = 0, ...)
#> NULL
R
?round
We see that if we want a different number of digits, we can type `digits = 2` or however many we want.
R
round(3.14159, digits = 2)
OUTPUT
#> [1] 3.14
If you provide the arguments in the exact same order as they are defined you don’t have to name them:
R
round(3.14159, 2)
OUTPUT
#> [1] 3.14
And if you do name the arguments, you can switch their order:
R
round(digits = 2, x = 3.14159)
OUTPUT
#> [1] 3.14
It’s good practice to put the non-optional arguments (like the number you’re rounding) first in your function call, and to then specify the names of all optional arguments. If you don’t, someone reading your code might have to look up the definition of a function with unfamiliar arguments to understand what you’re doing.
Vectors and data types
A vector is the most common and basic data type in R, and is pretty much the workhorse of R. A vector is composed of a series of values, which can be either numbers or characters. We can assign a series of values to a vector using the `c()` function. For example, we can create a vector of animal weights and assign it to a new object `weight_g`:
R
weight_g <- c(50, 60, 65, 82)
weight_g
A vector can also contain characters:
R
animals <- c("mouse", "rat", "dog")
animals
The quotes around “mouse”, “rat”, etc. are essential here. Without the quotes, R will assume objects have been created called `mouse`, `rat` and `dog`. As these objects don’t exist in R’s memory, there will be an error message.
There are many functions that allow you to inspect the content of a vector. `length()` tells you how many elements are in a particular vector:
R
length(weight_g)
length(animals)
An important feature of a vector is that all of the elements are the same type of data. The function `class()` indicates what kind of object you are working with:
R
class(weight_g)
class(animals)
The function `str()` provides an overview of the structure of an object and its elements. It is a useful function when working with large and complex objects:
R
str(weight_g)
str(animals)
You can use the `c()` function to add other elements to your vector:
R
weight_g <- c(weight_g, 90) # add to the end of the vector
weight_g <- c(30, weight_g) # add to the beginning of the vector
weight_g
In the first line, we take the original vector `weight_g`, add the value `90` to the end of it, and save the result back into `weight_g`. Then we add the value `30` to the beginning, again saving the result back into `weight_g`.
We can do this over and over again to grow a vector, or assemble a dataset. As we program, this may be useful to add results that we are collecting or calculating.
An atomic vector is the simplest R data type and is a linear vector of a single type. Above, we saw 2 of the 6 main atomic vector types that R uses: `"character"` and `"numeric"` (or `"double"`). These are the basic building blocks that all R objects are built from. The other 4 atomic vector types are:
- `"logical"` for `TRUE` and `FALSE` (the boolean data type)
- `"integer"` for integer numbers (e.g., `2L`, the `L` indicates to R that it’s an integer)
- `"complex"` to represent complex numbers with real and imaginary parts (e.g., `1 + 4i`) and that’s all we’re going to say about them
- `"raw"` for bitstreams that we won’t discuss further
You can check the type of your vector using the `typeof()` function and inputting your vector as the argument.
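For example, with the vectors created above:

R
typeof(weight_g)   # "double"
typeof(animals)    # "character"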
Vectors are one of the many data structures that R uses. Other important ones are lists (`list`), matrices (`matrix`), data frames (`data.frame`), factors (`factor`) and arrays (`array`).
Challenge
- We’ve seen that atomic vectors can be of type character, numeric (or double), integer, and logical. But what happens if we try to mix these types in a single vector?
R implicitly converts them to all be the same type
Challenge (continued)
- What will happen in each of these examples? (hint: use `class()` to check the data type of your objects):

R
num_char <- c(1, 2, 3, "a")
num_logical <- c(1, 2, 3, TRUE)
char_logical <- c("a", "b", "c", TRUE)
tricky <- c(1, 2, 3, "4")
Why do you think it happens?
Vectors can be of only one data type. R tries to convert (coerce) the content of this vector to find a “common denominator” that doesn’t lose any information.
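Checking the results with `class()` makes the coercion visible:

R
class(num_char)      # "character"
class(num_logical)   # "numeric"
class(char_logical)  # "character"
class(tricky)        # "character"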
Challenge (continued)
- How many values in `combined_logical` are `"TRUE"` (as a character) in the following example (reusing the 2 `..._logical`s from above):

R
combined_logical <- c(num_logical, char_logical)
Only one. There is no memory of past data types, and the coercion happens the first time the vector is evaluated. Therefore, the `TRUE` in `num_logical` gets converted into a `1` before it gets converted into `"1"` in `combined_logical`.
Challenge (continued)
- You’ve probably noticed that objects of different types get converted into a single, shared type within a vector. In R, we call converting objects from one class into another class coercion. These conversions happen according to a hierarchy, whereby some types get preferentially coerced into other types. Can you draw a diagram that represents the hierarchy of how these data types are coerced?
logical → numeric → character ← logical
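A quick demonstration of that hierarchy:

R
class(c(TRUE, 1))     # logical with numeric   -> "numeric"
class(c(1, "a"))      # numeric with character -> "character"
class(c(TRUE, "a"))   # logical with character -> "character"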
Subsetting vectors
If we want to extract one or several values from a vector, we must provide one or several indices in square brackets. For instance:
R
animals <- c("mouse", "rat", "dog", "cat")
animals[2]
OUTPUT
#> [1] "rat"
R
animals[c(3, 2)]
OUTPUT
#> [1] "dog" "rat"
We can also repeat the indices to create an object with more elements than the original one:
R
more_animals <- animals[c(1, 2, 3, 2, 1, 4)]
more_animals
OUTPUT
#> [1] "mouse" "rat" "dog" "rat" "mouse" "cat"
R indices start at 1. Programming languages like Fortran, MATLAB, Julia, and R start counting at 1, because that’s what human beings typically do. Languages in the C family (including C++, Java, Perl, and Python) count from 0 because that’s simpler for computers to do.
Conditional subsetting
Another common way of subsetting is by using a logical vector. `TRUE` will select the element with the same index, while `FALSE` will not:
R
weight_g <- c(21, 34, 39, 54, 55)
weight_g[c(TRUE, FALSE, FALSE, TRUE, TRUE)]
OUTPUT
#> [1] 21 54 55
Typically, these logical vectors are not typed by hand, but are the output of other functions or logical tests. For instance, if you wanted to select only the values above 50:
R
weight_g > 50 # will return logicals with TRUE for the indices that meet the condition
OUTPUT
#> [1] FALSE FALSE FALSE TRUE TRUE
R
## so we can use this to select only the values above 50
weight_g[weight_g > 50]
OUTPUT
#> [1] 54 55
You can combine multiple tests using `&` (both conditions are true, AND) or `|` (at least one of the conditions is true, OR):
R
weight_g[weight_g > 30 & weight_g < 50]
OUTPUT
#> [1] 34 39
R
weight_g[weight_g <= 30 | weight_g == 55]
OUTPUT
#> [1] 21 55
R
weight_g[weight_g >= 30 & weight_g == 21]
OUTPUT
#> numeric(0)
Here, `>` stands for “greater than”, `<` for “less than”, `<=` for “less than or equal to”, and `==` for “equal to”. The double equal sign `==` is a test for numerical equality between the left and right hand sides, and should not be confused with the single `=` sign, which performs variable assignment (similar to `<-`).
A common task is to search for certain strings in a vector. One could use the “or” operator `|` to test for equality to multiple values, but this can quickly become tedious. The function `%in%` allows you to test if any of the elements of a search vector are found:
R
animals <- c("mouse", "rat", "dog", "cat", "cat")
# return both rat and cat
animals[animals == "cat" | animals == "rat"]
OUTPUT
#> [1] "rat" "cat" "cat"
R
# return a logical vector that is TRUE for the elements within animals
# that are found in the character vector and FALSE for those that are not
animals %in% c("rat", "cat", "dog", "duck", "goat", "bird", "fish")
OUTPUT
#> [1] FALSE TRUE TRUE TRUE TRUE
R
# use the logical vector created by %in% to return elements from animals
# that are found in the character vector
animals[animals %in% c("rat", "cat", "dog", "duck", "goat", "bird", "fish")]
OUTPUT
#> [1] "rat" "dog" "cat" "cat"
Challenge (optional)
- Can you figure out why `"four" > "five"` returns `TRUE`?
When using “>” or “<” on strings, R compares their alphabetical order. Here “four” comes after “five”, and therefore is “greater than” it.
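You can verify this directly:

R
"four" > "five"           # TRUE: "fo" sorts after "fi" alphabetically
sort(c("four", "five"))   # "five" "four"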
Missing data
As R was designed to analyze datasets, it includes the concept of missing data (which is uncommon in other programming languages). Missing data are represented in vectors as `NA`.
When doing operations on numbers, most functions will return `NA` if the data you are working with include missing values. This feature makes it harder to overlook the cases where you are dealing with missing data. You can add the argument `na.rm = TRUE` to calculate the result as if the missing values were removed (`rm` stands for ReMoved) first.
R
heights <- c(2, 4, 4, NA, 6)
mean(heights)
max(heights)
mean(heights, na.rm = TRUE)
max(heights, na.rm = TRUE)
If your data include missing values, you may want to become familiar with the functions `is.na()`, `na.omit()`, and `complete.cases()`. See below for examples.
R
## Extract those elements which are not missing values.
heights[!is.na(heights)]
## Returns the object with incomplete cases removed.
## The returned object is an atomic vector of type `"numeric"` (or `"double"`).
na.omit(heights)
## Extract those elements which are complete cases.
## The returned object is an atomic vector of type `"numeric"` (or `"double"`).
heights[complete.cases(heights)]
Recall that you can use the `typeof()` function to find the type of your atomic vector.
Challenge
- Using this vector of heights in inches, create a new vector, `heights_no_na`, with the NAs removed.
R
heights <- c(63, 69, 60, 65, NA, 68, 61, 70, 61, 59, 64, 69, 63, 63, NA, 72, 65, 64, 70, 63, 65)
- Use the function `median()` to calculate the median of the `heights` vector.
- Use R to figure out how many people in the set are taller than 67 inches.
R
heights <- c(63, 69, 60, 65, NA, 68, 61, 70, 61, 59, 64, 69, 63, 63, NA, 72, 65, 64, 70, 63, 65)
# 1.
heights_no_na <- heights[!is.na(heights)]
# or
heights_no_na <- na.omit(heights)
# or
heights_no_na <- heights[complete.cases(heights)]
# 2.
median(heights, na.rm = TRUE)
# 3.
heights_above_67 <- heights_no_na[heights_no_na > 67]
length(heights_above_67)
Now that we have learned how to write scripts, and the basics of R’s data structures, we are ready to start working with the Portal dataset we have been using in the other lessons, and learn about data frames.
Key Points
- `<-` is used to assign values on the right to objects on the left.
- Code should be saved within the Source pane in RStudio to help you return to your code later.
- `#` can be used to add comments to your code.
- Functions can automate more complicated sets of commands, and require arguments as inputs.
- Vectors are composed of a series of values and can take many forms.
- Data structures in R include ‘vector’, ‘list’, ‘matrix’, ‘data.frame’, ‘factor’, and ‘array’.
- Vectors can be subset by indexing or through logical vectors.
- Many functions exist to remove missing data from data structures.
Starting with data
Overview
Questions
- What is a data.frame?
- How can I read a complete csv file into R?
- How can I get basic summary information about my dataset?
- How can I extract specific information from a data frame?
- What are factors, and how are they different from other datatypes?
- How can I rename factors?
- How are dates represented in R and how can I change the format?
Objectives
- Load external data from a .csv file into a data frame.
- Install and load packages.
- Describe what a data frame is.
- Summarize the contents of a data frame.
- Use indexing to subset specific portions of data frames.
- Describe what a factor is.
- Convert between strings and factors.
- Reorder and rename factors.
- Change how character strings are handled in a data frame.
- Format dates.
Loading the survey data
We are investigating the animal species diversity and weights found within plots at our study site. The dataset is stored as a comma separated value (CSV) file. Each row holds information for a single animal, and the columns represent:
| Column | Description |
|---|---|
| record_id | Unique id for the observation |
| month | month of observation |
| day | day of observation |
| year | year of observation |
| plot_id | ID of a particular experimental plot of land |
| species_id | 2-letter code |
| sex | sex of animal (“M”, “F”) |
| hindfoot_length | length of the hindfoot in mm |
| weight | weight of the animal in grams |
| genus | genus of animal |
| species | species of animal |
| taxa | e.g. Rodent, Reptile, Bird, Rabbit |
| plot_type | type of plot |
Downloading the data
We created the folder that will store the downloaded data (`data_raw`) in the chapter “Before we start”. If you skipped that part, it may be a good idea to have a look now, to make sure your working directory is set up properly.
We are going to use the R function `download.file()` to download the CSV file that contains the survey data from Figshare, and we will use `read_csv()` to load the content of the CSV file into R.
Inside the `download.file` command, the first entry is a character string with the source URL (“https://ndownloader.figshare.com/files/2292169”). This source URL downloads a CSV file from Figshare. The text after the comma (“data_raw/portal_data_joined.csv”) is the destination of the file on your local machine. You’ll need to have a folder on your machine called “data_raw” where you’ll download the file. So this command downloads a file from Figshare, names it “portal_data_joined.csv”, and adds it to a preexisting folder named “data_raw”.
R
download.file(url = "https://ndownloader.figshare.com/files/2292169",
destfile = "data_raw/portal_data_joined.csv")
Reading the data into R
The file has now been downloaded to the destination you specified, but R has not yet loaded the data from the file into memory. To do this, we can use the `read_csv()` function from the tidyverse package.
Packages in R are basically sets of additional functions that let you do more stuff. The functions we’ve been using so far, like `round()`, `sqrt()`, or `c()`, come built into R. Packages give you access to additional functions beyond base R. A similar function to `read_csv()` from the tidyverse package is `read.csv()` from base R. We don’t have time to cover their differences, but notice that the exact spelling determines which function is used. Before you use a package for the first time you need to install it on your machine, and then you should import it in every subsequent R session when you need it.
To install the tidyverse package, we can type `install.packages("tidyverse")` straight into the console. In fact, it’s better to write this in the console than in our script for any package, as there’s no need to re-install packages every time we run the script. Then, to load the package type:
R
## load the tidyverse packages, incl. dplyr
library(tidyverse)
Now we can use the functions from the tidyverse package. Let’s use `read_csv()` to read the data into a data frame (we will learn more about data frames later):
R
surveys <- read_csv("data_raw/portal_data_joined.csv")
OUTPUT
#> Rows: 34786 Columns: 13
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (6): species_id, sex, genus, species, taxa, plot_type
#> dbl (7): record_id, month, day, year, plot_id, hindfoot_length, weight
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
When you execute `read_csv` on a data file, it looks through the first 1000 rows of each column and guesses its data type. For example, in this dataset, `read_csv()` reads `weight` as `col_double` (a numeric data type), and `species` as `col_character`. You have the option to specify the data type for a column manually by using the `col_types` argument in `read_csv`.
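A minimal sketch of a manual specification (only two columns are listed here; any columns left out are still guessed):

R
surveys <- read_csv("data_raw/portal_data_joined.csv",
                    col_types = cols(species_id = col_character(),
                                     weight = col_double()))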
Note
`read_csv()` assumes that fields are delineated by commas. However, in several countries, the comma is used as a decimal separator and the semicolon (;) is used as a field delineator. If you want to read in this type of file in R, you can use the `read_csv2()` function. It behaves like `read_csv()` but uses different parameters for the decimal and the field separators. There is also `read_tsv()` for tab-separated data files and `read_delim()` for less common formats. Check out the help for `read_csv()` by typing `?read_csv` to learn more.
In addition to the above versions of the csv format, you should develop the habits of looking at and recording some parameters of your csv files. For instance, the character encoding, control characters used for line ending, date format (if the date is not split into three variables), and the presence of unexpected newlines are important characteristics of your data files. Those parameters will ease up the import step of your data in R.
We can see the contents of the first few lines of the data by typing its name: `surveys`. By default, this will show you as many rows and columns of the data as fit on your screen. If you wanted the first 50 rows, you could type `print(surveys, n = 50)`.
We can also extract the first few lines of this data using the function `head()`:
R
head(surveys)
OUTPUT
#> # A tibble: 6 × 13
#> record_id month day year plot_id species_id sex hindfoot_length weight
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#> 1 1 7 16 1977 2 NL M 32 NA
#> 2 72 8 19 1977 2 NL M 31 NA
#> 3 224 9 13 1977 2 NL <NA> NA NA
#> 4 266 10 16 1977 2 NL <NA> NA NA
#> 5 349 11 12 1977 2 NL <NA> NA NA
#> 6 363 11 12 1977 2 NL <NA> NA NA
#> # ℹ 4 more variables: genus <chr>, species <chr>, taxa <chr>, plot_type <chr>
Unlike the `print()` function, `head()` returns the extracted data. You could use it to assign the first 100 rows of `surveys` to an object using `surveys_sample <- head(surveys, 100)`. This can be useful if you want to try out complex computations on a subset of your data before you apply them to the whole data set. There is a similar function that lets you extract the last few lines of the data set. It is called (you might have guessed it) `tail()`.
To open the dataset in RStudio’s Data Viewer, use the `view()` function:
R
view(surveys)
Note
There are two functions for viewing, which are case-sensitive. Using `view()` with a lowercase ‘v’ is part of the tidyverse, whereas using `View()` with an uppercase ‘V’ is loaded through base R in the `utils` package.
What are data frames?
When we loaded the data into R, it got stored as an object of class `tibble`, which is a special kind of data frame (the difference is not important for our purposes, but you can learn more about tibbles here). Data frames are the de facto data structure for most tabular data, and what we use for statistics and plotting. Data frames can be created by hand, but most commonly they are generated by functions like `read_csv()`; in other words, when importing spreadsheets from your hard drive or the web.
A data frame is the representation of data in the format of a table where the columns are vectors that all have the same length. Because columns are vectors, each column must contain a single type of data (e.g., characters, integers, factors). For example, here is a figure depicting a data frame comprising a numeric, a character, and a logical vector.
We can see this also when inspecting the structure of a data frame with the function `str()`:
R
str(surveys)
Inspecting data frames
We already saw how the functions `head()` and `str()` can be useful to check the content and the structure of a data frame. Here is a non-exhaustive list of functions to get a sense of the content/structure of the data. Let’s try them out!
- Size:
  - `dim(surveys)` - returns a vector with the number of rows as the first element, and the number of columns as the second element (the dimensions of the object)
  - `nrow(surveys)` - returns the number of rows
  - `ncol(surveys)` - returns the number of columns
- Content:
  - `head(surveys)` - shows the first 6 rows
  - `tail(surveys)` - shows the last 6 rows
- Names:
  - `names(surveys)` - returns the column names (synonym of `colnames()` for `data.frame` objects)
  - `rownames(surveys)` - returns the row names
- Summary:
  - `str(surveys)` - structure of the object and information about the class, length and content of each column
  - `summary(surveys)` - summary statistics for each column
Note: most of these functions are “generic”; they can be used on other types of objects besides `data.frame`.
Challenge
Based on the output of `str(surveys)`, can you answer the following questions?
- What is the class of the object `surveys`?
- How many rows and how many columns are in this object?
R
str(surveys)
OUTPUT
#> spc_tbl_ [34,786 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
#> $ record_id : num [1:34786] 1 72 224 266 349 363 435 506 588 661 ...
#> $ month : num [1:34786] 7 8 9 10 11 11 12 1 2 3 ...
#> $ day : num [1:34786] 16 19 13 16 12 12 10 8 18 11 ...
#> $ year : num [1:34786] 1977 1977 1977 1977 1977 ...
#> $ plot_id : num [1:34786] 2 2 2 2 2 2 2 2 2 2 ...
#> $ species_id : chr [1:34786] "NL" "NL" "NL" "NL" ...
#> $ sex : chr [1:34786] "M" "M" NA NA ...
#> $ hindfoot_length: num [1:34786] 32 31 NA NA NA NA NA NA NA NA ...
#> $ weight : num [1:34786] NA NA NA NA NA NA NA NA 218 NA ...
#> $ genus : chr [1:34786] "Neotoma" "Neotoma" "Neotoma" "Neotoma" ...
#> $ species : chr [1:34786] "albigula" "albigula" "albigula" "albigula" ...
#> $ taxa : chr [1:34786] "Rodent" "Rodent" "Rodent" "Rodent" ...
#> $ plot_type : chr [1:34786] "Control" "Control" "Control" "Control" ...
#> - attr(*, "spec")=
#> .. cols(
#> .. record_id = col_double(),
#> .. month = col_double(),
#> .. day = col_double(),
#> .. year = col_double(),
#> .. plot_id = col_double(),
#> .. species_id = col_character(),
#> .. sex = col_character(),
#> .. hindfoot_length = col_double(),
#> .. weight = col_double(),
#> .. genus = col_character(),
#> .. species = col_character(),
#> .. taxa = col_character(),
#> .. plot_type = col_character()
#> .. )
#> - attr(*, "problems")=<externalptr>
R
## * class: data frame
## * how many rows: 34786, how many columns: 13
Indexing and subsetting data frames
Our survey data frame has rows and columns (it has 2 dimensions). If we want to extract some specific data from it, we need to specify the “coordinates” we want. Row numbers come first, followed by column numbers. However, note that different ways of specifying these coordinates lead to results with different classes.
R
# We can extract specific values by specifying row and column indices
# in the format:
# data_frame[row_index, column_index]
# For instance, to extract the first row and column from surveys:
surveys[1, 1]
# First row, sixth column:
surveys[1, 6]
# We can also use shortcuts to select a number of rows or columns at once
# To select all columns, leave the column index blank
# For instance, to select all columns for the first row:
surveys[1, ]
# The same shortcut works for rows --
# To select the first column across all rows:
surveys[, 1]
# An even shorter way to select first column across all rows:
surveys[1] # No comma!
# To select multiple rows or columns, use vectors!
# To select the first three rows of the 5th and 6th column
surveys[c(1, 2, 3), c(5, 6)]
# We can use the : operator to create those vectors for us:
surveys[1:3, 5:6]
# This is equivalent to head_surveys <- head(surveys)
head_surveys <- surveys[1:6, ]
# As we've seen, when working with tibbles
# subsetting with single square brackets ("[]") always returns a data frame.
# If you want a vector, use double square brackets ("[[]]")
# For instance, to get the first column as a vector:
surveys[[1]]
# To get the first value in our data frame:
surveys[[1, 1]]
`:` is a special function that creates numeric vectors of integers in increasing or decreasing order; test `1:10` and `10:1`, for instance.
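For example:

R
1:10   # 1 2 3 4 5 6 7 8 9 10
10:1   # 10 9 8 7 6 5 4 3 2 1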
You can also exclude certain indices of a data frame using the “`-`” sign:
R
surveys[, -1] # The whole data frame, except the first column
surveys[-(7:nrow(surveys)), ] # Equivalent to head(surveys)
Data frames can be subset by calling indices (as shown previously), but also by calling their column names directly:
R
# As before, using single brackets returns a data frame:
surveys["species_id"]
surveys[, "species_id"]
# Double brackets returns a vector:
surveys[["species_id"]]
# We can also use the $ operator with column names instead of double brackets
# This returns a vector:
surveys$species_id
In RStudio, you can use the autocompletion feature to get the full and correct names of the columns.
Challenge
1. Create a `data.frame` (`surveys_200`) containing only the data in row 200 of the `surveys` dataset.
2. Notice how `nrow()` gave you the number of rows in a `data.frame`?
   - Use that number to pull out just that last row from the `surveys` dataset.
   - Compare that with what you see as the last row using `tail()` to make sure it’s meeting expectations.
   - Pull out that last row using `nrow()` instead of the row number.
   - Create a new data frame (`surveys_last`) from that last row.
3. Use `nrow()` to extract the row that is in the middle of the data frame. Store the content of this row in an object named `surveys_middle`.
4. Combine `nrow()` with the `-` notation above to reproduce the behavior of `head(surveys)`, keeping just the first through 6th rows of the surveys dataset.
R
## 1.
surveys_200 <- surveys[200, ]
## 2.
# Saving `n_rows` to improve readability and reduce duplication
n_rows <- nrow(surveys)
surveys_last <- surveys[n_rows, ]
## 3.
surveys_middle <- surveys[n_rows / 2, ]
## 4.
surveys_head <- surveys[-(7:n_rows), ]
Factors
When we did `str(surveys)` we saw that several of the columns consist of integers. The columns `genus`, `species`, `sex`, `plot_type`, … however, are of the class `character`. Arguably, these columns contain categorical data, that is, they can only take on a limited number of values.
R has a special class for working with categorical data, called `factor`. Factors are very useful and actually contribute to making R particularly well suited to working with data. So we are going to spend a little time introducing them.
Once created, factors can only contain a pre-defined set of values, known as levels. Factors are stored as integers associated with labels and they can be ordered or unordered. While factors look (and often behave) like character vectors, they are actually treated as integer vectors by R. So you need to be very careful when treating them as strings.
When importing a data frame with `read_csv()`, the columns that contain text are not automatically coerced (= converted) into the `factor` data type, but once we have loaded the data we can do the conversion using the `factor()` function:
R
surveys$sex <- factor(surveys$sex)
We can see that the conversion has worked by using the
summary()
function again. This produces a table with the
counts for each factor level:
R
summary(surveys$sex)
By default, R always sorts levels in alphabetical order. For instance, if you have a factor with 2 levels:
R
sex <- factor(c("male", "female", "female", "male"))
R will assign 1
to the level "female"
and
2
to the level "male"
(because f
comes before m
, even though the first element in this
vector is "male"
). You can see this by using the function
levels()
and you can find the number of levels using
nlevels()
:
R
levels(sex)
nlevels(sex)
Sometimes, the order of the factors does not matter, other times you
might want to specify the order because it is meaningful (e.g., “low”,
“medium”, “high”), it improves your visualization, or it is required by
a particular type of analysis. Here, one way to reorder our levels in
the sex
vector would be:
R
sex # current order
OUTPUT
#> [1] male female female male
#> Levels: female male
R
sex <- factor(sex, levels = c("male", "female"))
sex # after re-ordering
OUTPUT
#> [1] male female female male
#> Levels: male female
In R’s memory, these factors are represented by integers (1, 2, 3),
but are more informative than integers because factors are self
describing: "female"
, "male"
is more
descriptive than 1
, 2
. Which one is “male”?
You wouldn’t be able to tell just from the integer data. Factors, on the
other hand, have this information built in. It is particularly helpful
when there are many levels (like the species names in our example
dataset).
Challenge
Change the columns
taxa
andgenus
in thesurveys
data frame into a factor.Using the functions you learned before, can you find out…
- How many rabbits were observed?
- How many different genera are in the
genus
column?
R
surveys$taxa <- factor(surveys$taxa)
surveys$genus <- factor(surveys$genus)
summary(surveys)
nlevels(surveys$genus)
## * how many genera: There are 26 unique genera in the `genus` column.
## * how many rabbits: There are 75 rabbits in the `taxa` column.
Converting factors
If you need to convert a factor to a character vector, you use
as.character(x)
.
R
as.character(sex)
In some cases, you may have to convert factors where the levels
appear as numbers (such as concentration levels or years) to a numeric
vector. For instance, in one part of your analysis the years might need
to be encoded as factors (e.g., comparing average weights across years)
but in another part of your analysis they may need to be stored as
numeric values (e.g., doing math operations on the years). This
conversion from factor to numeric is a little trickier. The
as.numeric()
function returns the index values of the
factor, not its levels, so it will result in an entirely new (and
unwanted in this case) set of numbers. One method to avoid this is to
convert factors to characters, and then to numbers.
Another method is to use the levels()
function.
Compare:
R
year_fct <- factor(c(1990, 1983, 1977, 1998, 1990))
as.numeric(year_fct) # Wrong! And there is no warning...
as.numeric(as.character(year_fct)) # Works...
as.numeric(levels(year_fct))[year_fct] # The recommended way.
Notice that in the levels()
approach, three important
steps occur:
- We obtain all the factor levels using
levels(year_fct)
- We convert these levels to numeric values using
as.numeric(levels(year_fct))
- We then access these numeric values using the underlying integers of
the vector
year_fct
inside the square brackets
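Unpacking that one-liner with the year_fct vector created above:
R
lev <- levels(year_fct)       # "1977" "1983" "1990" "1998" (a character vector)
num_lev <- as.numeric(lev)    # 1977 1983 1990 1998 (now numeric)
num_lev[year_fct]             # 1990 1983 1977 1998 1990 -- indexing by the
                              # factor uses its underlying integer codes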
Renaming factors
When your data is stored as a factor, you can use the
plot()
function to get a quick glance at the number of
observations represented by each factor level. Let’s look at the number
of males and females captured over the course of the experiment:
R
## bar plot of the number of females and males captured during the experiment:
plot(surveys$sex)
However, as we saw when we used summary(surveys$sex)
,
there are about 1700 individuals for which the sex information hasn’t
been recorded. To show them in the plot, we can turn the missing values
into a factor level with the addNA()
function. We will also
have to give the new factor level a label. We are going to work with a
copy of the sex
column, so we’re not modifying the working
copy of the data frame:
R
sex <- surveys$sex
levels(sex)
OUTPUT
#> [1] "F" "M"
R
sex <- addNA(sex)
levels(sex)
OUTPUT
#> [1] "F" "M" NA
R
head(sex)
OUTPUT
#> [1] M M <NA> <NA> <NA> <NA>
#> Levels: F M <NA>
R
levels(sex)[3] <- "undetermined"
levels(sex)
OUTPUT
#> [1] "F" "M" "undetermined"
R
head(sex)
OUTPUT
#> [1] M M undetermined undetermined undetermined
#> [6] undetermined
#> Levels: F M undetermined
Now we can plot the data again, using plot(sex)
.
Challenge
- Rename “F” and “M” to “female” and “male” respectively.
- Now that we have renamed the factor level to “undetermined”, can you recreate the barplot such that “undetermined” is first (before “female”)?
R
levels(sex)[1:2] <- c("female", "male")
sex <- factor(sex, levels = c("undetermined", "female", "male"))
plot(sex)
Challenge
- We have seen how data frames are created when using
read_csv()
, but they can also be created by hand with thedata.frame()
function. There are a few mistakes in this hand-crafteddata.frame
. Can you spot and fix them? Don’t hesitate to experiment!
R
animal_data <- data.frame(
animal = c(dog, cat, sea cucumber, sea urchin),
feel = c("furry", "squishy", "spiny"),
weight = c(45, 8 1.1, 0.8)
)
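One possible fixed version: the animal names need quotes, a comma is missing in the weight column, and feel needs a fourth value so all columns have the same length (the added "prickly" is a made-up placeholder):
R
animal_data <- data.frame(
  animal = c("dog", "cat", "sea cucumber", "sea urchin"),
  feel = c("furry", "squishy", "spiny", "prickly"),
  weight = c(45, 8, 1.1, 0.8)
)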
- Can you predict the class for each of the columns in the following
example? Check your guesses using
str(country_climate)
:
- Are they what you expected? Why? Why not?
- What would you need to change to ensure that each column had the accurate data type?
R
country_climate <- data.frame(
country = c("Canada", "Panama", "South Africa", "Australia"),
climate = c("cold", "hot", "temperate", "hot/temperate"),
temperature = c(10, 30, 18, "15"),
northern_hemisphere = c(TRUE, TRUE, FALSE, "FALSE"),
has_kangaroo = c(FALSE, FALSE, FALSE, 1)
)
The automatic conversion of data type is sometimes a blessing, sometimes an annoyance. Be aware that it exists, learn the rules, and double-check that data you import into R are of the correct type within your data frame. If not, use it to your advantage to detect mistakes that might have been introduced during data entry (for instance, a letter in a column that should only contain numbers).
Learn more in this RStudio tutorial
Formatting dates
A common issue that new (and experienced!) R users have is converting
date and time information into a variable that is suitable for analyses.
One way to store date information is to store each component of the date
in a separate column. Using str()
, we can confirm that our
data frame does indeed have a separate column for day, month, and year,
and that each of these columns contains integer values.
R
str(surveys)
We are going to use the ymd()
function from the package
lubridate
(which belongs to the
tidyverse
; learn more here).
lubridate
gets installed as part of the
tidyverse
installation. When you load the
tidyverse
(library(tidyverse)
), the core packages (the packages used
in most data analyses) get loaded.
lubridate
however does not belong to the
core tidyverse, so you have to load it explicitly with
library(lubridate).
Start by loading the required package:
R
library(lubridate)
The lubridate
package has many useful
functions for working with dates. These can help you extract dates from
different string representations, convert between timezones, calculate
time differences and more. You can find an overview of them in the lubridate
cheat sheet.
Here we will use the function ymd()
, which takes a
vector representing year, month, and day, and converts it to a
Date
vector. Date
is a class of data
recognized by R as being a date and can be manipulated as such. The
argument that the function requires is flexible, but, as a best
practice, is a character vector formatted as “YYYY-MM-DD”.
Let’s create a date object and inspect the structure:
R
my_date <- ymd("2015-01-01")
str(my_date)
Now let’s paste the year, month, and day separately - we get the same result:
R
# sep indicates the character to use to separate each component
my_date <- ymd(paste("2015", "1", "1", sep = "-"))
str(my_date)
Now we apply this function to the surveys dataset. Create a character
vector from the year
, month
, and
day
columns of surveys
using
paste()
:
R
paste(surveys$year, surveys$month, surveys$day, sep = "-")
This character vector can be used as the argument for
ymd()
:
R
ymd(paste(surveys$year, surveys$month, surveys$day, sep = "-"))
WARNING
#> Warning: 129 failed to parse.
There is a warning telling us that some dates could not be parsed
(understood) by the ymd()
function. For these dates, the
function has returned NA
, which means they are treated as
missing values. We will deal with this problem later, but first we add
the resulting Date
vector to the surveys
data
frame as a new column called date
:
R
surveys$date <- ymd(paste(surveys$year, surveys$month, surveys$day, sep = "-"))
WARNING
#> Warning: 129 failed to parse.
R
str(surveys) # notice the new column, with 'date' as the class
Let’s make sure everything worked correctly. One way to inspect the
new column is to use summary()
:
R
summary(surveys$date)
OUTPUT
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> "1977-07-16" "1984-03-12" "1990-07-22" "1990-12-15" "1997-07-29" "2002-12-31"
#> NA's
#> "129"
Let’s investigate why some dates could not be parsed.
We can use the functions we saw previously to deal with missing data
to identify the rows in our data frame that are failing. If we combine
them with what we learned about subsetting data frames earlier, we can
extract the columns “year”, “month”, “day” from the records that have
NA
in our new column date
. We will also use
head()
so we don’t clutter the output:
R
missing_dates <- surveys[is.na(surveys$date), c("year", "month", "day")]
head(missing_dates)
OUTPUT
#> # A tibble: 6 × 3
#> year month day
#> <dbl> <dbl> <dbl>
#> 1 2000 9 31
#> 2 2000 4 31
#> 3 2000 4 31
#> 4 2000 4 31
#> 5 2000 4 31
#> 6 2000 9 31
Why did these dates fail to parse? If you had to use these data for your analyses, how would you deal with this situation?
The dates failed to parse because the values provided as input to the
ymd()
function do not actually exist. As the output above shows, September and April only have 30 days, not the 31 days
specified in our dataset.
There are several ways you could deal with this situation:
- If you have access to the raw data (e.g., field sheets) or supporting information (e.g., field trip reports/logs), check them and ensure the electronic database matches the information in the original data source.
- If you are able to contact the person responsible for collecting the data, you could refer to them and ask for clarification.
- You could also check the rest of the dataset for clues about the correct value for the erroneous dates.
- If your project has guidelines on how to correct this sort of error, refer to them and apply any recommendations.
- If it is not possible to ascertain the correct value for these observations, you may want to leave them as missing data.
Regardless of the option you choose, it is important that you document the error and the corrections (if any) that you apply to your data.
Key Points
- Use
read_csv()
to read tabular data in R. - A data frame is the representation of data in the format of a table where the columns are vectors that all have the same length.
-
dplyr
provides many methods for inspecting and summarizing data in data frames. - Use factors to represent categorical data in R.
- The
lubridate
package has many useful functions for working with dates.
Content from Manipulating, analyzing and exporting data with tidyverse
Last updated on 2024-11-19 | Edit this page
Overview
Questions
- What are dplyr and tidyr?
- How can I select specific rows and/or columns from a dataframe?
- How can I combine multiple commands into a single command?
- How can I create new columns or remove existing columns from a dataframe?
Objectives
- Describe the purpose of the
dplyr
andtidyr
packages. - Select certain columns in a data frame with the
dplyr
functionselect
. - Extract certain rows in a data frame according to logical (boolean)
conditions with the
dplyr
functionfilter
. - Link the output of one
dplyr
function to the input of another function with the ‘pipe’ operator%>%
. - Add new columns to a data frame that are functions of existing
columns with
mutate
. - Use the split-apply-combine concept for data analysis.
- Use
summarize
,group_by
, andcount
to split a data frame into groups of observations, apply summary statistics for each group, and then combine the results. - Describe the concept of a wide and a long table format and for which purpose those formats are useful.
- Describe what key-value pairs are.
- Reshape a data frame from long to wide format and back with the
pivot_wider
andpivot_longer
commands from thetidyr
package. - Export a data frame to a .csv file.
Data manipulation using dplyr
and
tidyr
Bracket subsetting is handy, but it can be cumbersome and difficult
to read, especially for complicated operations. Enter
dplyr
. dplyr
is a package for helping with tabular data manipulation. It pairs nicely
with tidyr
which enables you to swiftly
convert between different data formats for plotting and analysis.
The tidyverse
package is an
“umbrella-package” that installs tidyr
,
dplyr
, and several other useful packages
for data analysis, such as ggplot2
,
tibble
, etc.
The tidyverse
package tries to address
3 common issues that arise when doing data analysis in R:
- The results from a base R function sometimes depend on the type of data.
- R expressions are used in a non-standard way, which can be confusing for new learners.
- The existence of hidden arguments having default operations that new learners are not aware of.
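As a small illustration of the first point (a sketch with throwaway objects df and tb): base data frames silently drop a single selected column down to a vector, while tibbles consistently return a data frame.
R
df <- data.frame(x = 1:3, y = 4:6)
class(df[, "x"])            # "integer" -- dropped to a vector
tb <- tibble::as_tibble(df)
class(tb[, "x"])            # "tbl_df" "tbl" "data.frame" -- still a tibble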
You should already have installed and loaded the
tidyverse
package. If you haven’t already
done so, you can type install.packages("tidyverse")
straight into the console. Then, type library(tidyverse)
to
load the package.
What are dplyr
and
tidyr
?
The package dplyr
provides helper tools
for the most common data manipulation tasks. It is built to work
directly with data frames, with many common tasks optimized by being
written in a compiled language (C++). An additional feature is the
ability to work directly with data stored in an external database. The
benefits of doing this are that the data can be managed natively in a
relational database, queries can be conducted on that database, and only
the results of the query are returned.
This addresses a common problem with R in that all operations are conducted in-memory and thus the amount of data you can work with is limited by available memory. The database connections essentially remove that limitation in that you can connect to a database of many hundreds of GB, conduct queries on it directly, and pull back into R only what you need for analysis.
The package tidyr
addresses the common
problem of wanting to reshape your data for plotting and usage by
different R functions. For example, sometimes we want data sets where we
have one row per measurement. Other times we want a data frame where
each measurement type has its own column, and rows are instead more
aggregated groups (e.g., a time period, an experimental unit like a plot
or a batch number). Moving back and forth between these formats is
non-trivial, and tidyr
gives you tools for
this and more sophisticated data manipulation.
To learn more about dplyr
and
tidyr
after the workshop, you may want to
check out this handy
data transformation with dplyr
cheatsheet and this one
about tidyr
.
As before, we’ll read in our data using the read_csv()
function from the tidyverse package
readr
.
R
surveys <- read_csv("data_raw/portal_data_joined.csv")
OUTPUT
#> Rows: 34786 Columns: 13
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (6): species_id, sex, genus, species, taxa, plot_type
#> dbl (7): record_id, month, day, year, plot_id, hindfoot_length, weight
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
R
## inspect the data
str(surveys)
R
## preview the data
view(surveys)
Next, we’re going to learn some of the most common
dplyr
functions:
-
select()
: subset columns -
filter()
: subset rows on conditions -
mutate()
: create new columns by using information from other columns -
group_by()
andsummarize()
: create summary statistics on grouped data -
arrange()
: sort results -
count()
: count discrete values
Selecting columns and filtering rows
To select columns of a data frame, use select()
. The
first argument to this function is the data frame
(surveys
), and the subsequent arguments are the columns to
keep.
R
select(surveys, plot_id, species_id, weight)
To select all columns except certain ones, put a “-” in front of the variable to exclude it.
R
select(surveys, -record_id, -species_id)
This will select all the variables in surveys
except
record_id
and species_id
.
To choose rows based on a specific criterion, use
filter()
:
R
filter(surveys, year == 1995)
Pipes
What if you want to select and filter at the same time? There are three ways to do this: use intermediate steps, nested functions, or pipes.
With intermediate steps, you create a temporary data frame and use that as input to the next function, like this:
R
surveys2 <- filter(surveys, weight < 5)
surveys_sml <- select(surveys2, species_id, sex, weight)
This is readable, but can clutter up your workspace with lots of objects that you have to name individually. With multiple steps, that can be hard to keep track of.
You can also nest functions (i.e. one function inside of another), like this:
R
surveys_sml <- select(filter(surveys, weight < 5), species_id, sex, weight)
This is handy, but can be difficult to read if too many functions are nested, as R evaluates the expression from the inside out (in this case, filtering, then selecting).
The last option, pipes, is a more recent addition to R. Pipes
let you take the output of one function and send it directly to the
next, which is useful when you need to do many things to the same
dataset. Pipes in R look like %>%
and are made available
via the magrittr
package, installed
automatically with dplyr
. If you use
RStudio, you can type the pipe with Ctrl + Shift + M if you have a PC or Cmd + Shift + M if you have a Mac.
R
surveys %>%
filter(weight < 5) %>%
select(species_id, sex, weight)
In the above code, we use the pipe to send the surveys
dataset first through filter()
to keep rows where
weight
is less than 5, then through select()
to keep only the species_id
, sex
, and
weight
columns. Since %>%
takes the object
on its left and passes it as the first argument to the function on its
right, we don’t need to explicitly include the data frame as an argument
to the filter()
and select()
functions any
more.
Some may find it helpful to read the pipe like the word “then.” For
instance, in the example above, we took the data frame
surveys
, then we filter
ed for rows
with weight < 5
, then we select
ed
columns species_id
, sex
, and
weight
. The dplyr
functions
by themselves are somewhat simple, but by combining them into linear
workflows with the pipe we can accomplish more complex manipulations of
data frames.
If we want to create a new object with this smaller version of the data, we can assign it a new name:
R
surveys_sml <- surveys %>%
filter(weight < 5) %>%
select(species_id, sex, weight)
surveys_sml
Note that the final data frame is the leftmost part of this expression.
Challenge
Using pipes, subset the surveys
data to include animals
collected before 1995 and retain only the columns year
,
sex
, and weight
.
R
surveys %>%
filter(year < 1995) %>%
select(year, sex, weight)
Mutate
Frequently you’ll want to create new columns based on the values in
existing columns, for example to do unit conversions, or to find the
ratio of values in two columns. For this we’ll use
mutate()
.
To create a new column of weight in kg:
R
surveys %>%
mutate(weight_kg = weight / 1000)
You can also create a second new column based on the first new column
within the same call of mutate()
:
R
surveys %>%
mutate(weight_kg = weight / 1000,
weight_lb = weight_kg * 2.2)
If this runs off your screen and you just want to see the first few
rows, you can use a pipe to view the head()
of the data.
(Pipes work with non-dplyr
functions, too,
as long as the dplyr
or
magrittr
package is loaded).
R
surveys %>%
mutate(weight_kg = weight / 1000) %>%
head()
The first few rows of the output are full of NA
s, so if
we wanted to remove those we could insert a filter()
in the
chain:
R
surveys %>%
filter(!is.na(weight)) %>%
mutate(weight_kg = weight / 1000) %>%
head()
is.na()
is a function that determines whether something
is an NA
. The !
symbol negates the result, so
we’re asking for every row where weight is not an
NA
.
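On a small throwaway vector you can see how the two pieces combine:
R
w <- c(10, NA, 25)  # a made-up vector with one missing value
is.na(w)            # FALSE  TRUE FALSE
!is.na(w)           # TRUE  FALSE  TRUE -- the elements we want to keep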
Challenge
Create a new data frame from the surveys
data that meets
the following criteria: contains only the species_id
column
and a new column called hindfoot_cm
containing the
hindfoot_length
values (currently in mm) converted to
centimeters. In this hindfoot_cm
column, there are no
NA
s and all values are less than 3.
Hint: think about how the commands should be ordered to produce this data frame!
R
surveys_hindfoot_cm <- surveys %>%
filter(!is.na(hindfoot_length)) %>%
mutate(hindfoot_cm = hindfoot_length / 10) %>%
filter(hindfoot_cm < 3) %>%
select(species_id, hindfoot_cm)
Split-apply-combine data analysis and the summarize()
function
Many data analysis tasks can be approached using the
split-apply-combine paradigm: split the data into groups, apply
some analysis to each group, and then combine the results. Key functions
of dplyr
for this workflow are
group_by()
and summarize()
.
The group_by()
and summarize()
functions
group_by()
is often used together with
summarize()
, which collapses each group into a single-row
summary of that group. group_by()
takes as arguments the
column names that contain the categorical variables for
which you want to calculate the summary statistics. So to compute the
mean weight
by sex:
R
surveys %>%
group_by(sex) %>%
summarize(mean_weight = mean(weight, na.rm = TRUE))
You may also have noticed that the output from these calls doesn’t
run off the screen anymore. It’s one of the advantages of
tbl_df
over data frame.
You can also group by multiple columns:
R
surveys %>%
group_by(sex, species_id) %>%
summarize(mean_weight = mean(weight, na.rm = TRUE)) %>%
tail()
OUTPUT
#> `summarise()` has grouped output by 'sex'. You can override using the `.groups`
#> argument.
Here, we used tail()
to look at the last six rows of our
summary. Before, we had used head()
to look at the first
six rows. We can see that the sex
column contains
NA
values because some animals had escaped before their sex
and body weights could be determined. The resulting
mean_weight
column does not contain NA
but
NaN
(which refers to “Not a Number”) because
mean()
was called on a vector of NA
values
while at the same time setting na.rm = TRUE
. To avoid this,
we can remove the missing values for weight before we attempt to
calculate the summary statistics on weight. Because the missing values
are removed first, we can omit na.rm = TRUE
when computing
the mean:
R
surveys %>%
filter(!is.na(weight)) %>%
group_by(sex, species_id) %>%
summarize(mean_weight = mean(weight))
OUTPUT
#> `summarise()` has grouped output by 'sex'. You can override using the `.groups`
#> argument.
Here, again, the output from these calls doesn’t run off the screen
anymore. If you want to display more data, you can use the
print()
function at the end of your chain with the argument
n
specifying the number of rows to display:
R
surveys %>%
filter(!is.na(weight)) %>%
group_by(sex, species_id) %>%
summarize(mean_weight = mean(weight)) %>%
print(n = 15)
OUTPUT
#> `summarise()` has grouped output by 'sex'. You can override using the `.groups`
#> argument.
Once the data are grouped, you can also summarize multiple variables at the same time (and not necessarily on the same variable). For instance, we could add a column indicating the minimum weight for each species for each sex:
R
surveys %>%
filter(!is.na(weight)) %>%
group_by(sex, species_id) %>%
summarize(mean_weight = mean(weight),
min_weight = min(weight))
OUTPUT
#> `summarise()` has grouped output by 'sex'. You can override using the `.groups`
#> argument.
It is sometimes useful to rearrange the result of a query to inspect
the values. For instance, we can sort on min_weight
to put
the lighter species first:
R
surveys %>%
filter(!is.na(weight)) %>%
group_by(sex, species_id) %>%
summarize(mean_weight = mean(weight),
min_weight = min(weight)) %>%
arrange(min_weight)
OUTPUT
#> `summarise()` has grouped output by 'sex'. You can override using the `.groups`
#> argument.
To sort in descending order, we need to add the desc()
function. If we want to sort the results by decreasing order of mean
weight:
R
surveys %>%
filter(!is.na(weight)) %>%
group_by(sex, species_id) %>%
summarize(mean_weight = mean(weight),
min_weight = min(weight)) %>%
arrange(desc(mean_weight))
OUTPUT
#> `summarise()` has grouped output by 'sex'. You can override using the `.groups`
#> argument.
Counting
When working with data, we often want to know the number of
observations found for each factor or combination of factors. For this
task, dplyr
provides count()
.
For example, if we wanted to count the number of rows of data for each
sex, we would do:
R
surveys %>%
count(sex)
The count()
function is shorthand for something we’ve
already seen: grouping by a variable, and summarizing it by counting the
number of observations in that group. In other words,
surveys %>% count(sex)
is equivalent to:
R
surveys %>%
group_by(sex) %>%
summarize(count = n())
For convenience, count()
provides the sort
argument:
R
surveys %>%
count(sex, sort = TRUE)
The previous example shows the use of count()
to count the
number of rows/observations for one factor (i.e.,
sex
). If we wanted to count a combination of
factors, such as sex
and species
, we
would specify the first and the second factor as the arguments of
count()
:
R
surveys %>%
count(sex, species)
With the above code, we can proceed with arrange()
to
sort the table according to a number of criteria so that we have a
better comparison. For instance, we might want to arrange the table
above in (i) an alphabetical order of the levels of the species and (ii)
in descending order of the count:
R
surveys %>%
count(sex, species) %>%
arrange(species, desc(n))
From the table above, we may learn that, for instance, there are 75
observations of the albigula species that are not specified for
its sex (i.e. NA
).
Challenge
- How many animals were caught in each
plot_type
surveyed?
R
surveys %>%
count(plot_type)
Challenge (continued)
- Use
group_by()
andsummarize()
to find the mean, min, and max hindfoot length for each species (usingspecies_id
). Also add the number of observations (hint: see?n
).
R
surveys %>%
filter(!is.na(hindfoot_length)) %>%
group_by(species_id) %>%
summarize(
mean_hindfoot_length = mean(hindfoot_length),
min_hindfoot_length = min(hindfoot_length),
max_hindfoot_length = max(hindfoot_length),
n = n()
)
Challenge (continued)
- What was the heaviest animal measured in each year? Return the
columns
year
,genus
,species_id
, andweight
.
R
surveys %>%
filter(!is.na(weight)) %>%
group_by(year) %>%
filter(weight == max(weight)) %>%
select(year, genus, species_id, weight) %>%
arrange(year)
Reshaping with pivot_longer and pivot_wider
In the spreadsheet lesson, we discussed how to structure our data, leading to the four rules defining a tidy dataset:
- Each variable has its own column
- Each observation has its own row
- Each value must have its own cell
- Each type of observational unit forms a table
Here we examine the fourth rule: Each type of observational unit forms a table.
In surveys
, the rows of surveys
contain the
values of variables associated with each record (the unit), values such
as the weight or sex of each animal associated with each record. What if
instead of comparing records, we wanted to compare the different mean
weight of each genus between plots? (Ignoring plot_type
for
simplicity).
We’d need to create a new table where each row (the unit) is
comprised of values of variables associated with each plot. In practical
terms this means the values in genus
would become the names
of column variables and the cells would contain the values of the mean
weight observed on each plot.
Having created a new table, it is therefore straightforward to explore the relationship between the weight of different genera within, and between, the plots. The key point here is that we are still following a tidy data structure, but we have reshaped the data according to the observations of interest: average genus weight per plot instead of recordings per date.
The opposite transformation would be to transform column names into values of a variable.
We can do both of these transformations with two tidyr
functions, pivot_wider()
and
pivot_longer()
.
These may sound like dramatically different data layouts, but there are some tools that make transitions between these layouts more straightforward than you might think! The gif below shows how these two formats relate to each other, and gives you an idea of how we can use R to shift from one format to the other.
Pivoting from long to wide format
pivot_wider()
takes three principal arguments:
- the data
- the names_from column variable whose values will become new column names.
- the values_from column variable whose values will fill the new column variables.
Further arguments include values_fill
which, if set,
fills in missing values with the value provided.
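To make these arguments concrete before applying them to the surveys data, here is a minimal sketch on a throwaway tibble (df, with made-up values):
R
df <- tibble(plot = c(1, 1, 2),
             genus = c("A", "B", "A"),
             mean_weight = c(10, 20, 12))
df %>%
  pivot_wider(names_from = genus, values_from = mean_weight)
#> # A tibble: 2 × 3
#>    plot     A     B
#>   <dbl> <dbl> <dbl>
#> 1     1    10    20
#> 2     2    12    NA
Note the NA for plot 2, genus B: that combination had no row in df, which is exactly the situation values_fill handles.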
Let’s use pivot_wider()
to transform surveys to find the
mean weight of each genus in each plot over the entire survey period. We
use filter()
, group_by()
and
summarize()
to filter our observations and variables of
interest, and create a new variable for the
mean_weight
.
R
surveys_gw <- surveys %>%
filter(!is.na(weight)) %>%
group_by(plot_id, genus) %>%
summarize(mean_weight = mean(weight))
OUTPUT
#> `summarise()` has grouped output by 'plot_id'. You can override using the
#> `.groups` argument.
R
str(surveys_gw)
This yields surveys_gw
where the observations for each
plot are distributed across multiple rows, 196 observations of 3
variables. Using pivot_wider()
with the names from
genus
and with values from mean_weight
this
becomes 24 observations of 11 variables, one row for each plot.
R
surveys_wide <- surveys_gw %>%
pivot_wider(names_from = genus, values_from = mean_weight)
str(surveys_wide)
We could now plot comparisons between the weight of genera (one is called a genus, multiple are called genera) in different plots, although we may wish to fill in the missing values first.
R
surveys_gw %>%
pivot_wider(names_from = genus, values_from = mean_weight, values_fill = 0) %>%
head()
Pivoting from wide to long format
The opposing situation could occur if we had been provided with data
in the form of surveys_wide
, where the genus names are
column names, but we wish to treat them as values of a genus variable
instead.
In this situation we are reshaping the column names and turning them into a pair of new variables. One variable represents the column names as values, and the other variable contains the values previously associated with the column names.
pivot_longer()
takes four principal arguments:
- the data
- the names_to column variable we wish to create from column names.
- the values_to column variable we wish to create and fill with values.
- the cols argument: the names of the columns we use to make this pivot (or to drop).
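As a minimal sketch (reusing the throwaway wide tibble from the pivot_wider() example above, and excluding plot from the reshape with cols = -plot):
R
wide <- tibble(plot = c(1, 2), A = c(10, 12), B = c(20, NA))
wide %>%
  pivot_longer(names_to = "genus", values_to = "mean_weight", cols = -plot)
#> four rows, one per plot/genus combination, including the NA for plot 2, genus B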
To recreate surveys_gw
from surveys_wide
we
would create a names variable called genus
and value
variable called mean_weight
.
In pivoting longer, we also need to specify what columns to reshape.
If the columns are directly adjacent as they are here, we don’t even
need to list them all out: we can just use the :
operator!
R
surveys_long <- surveys_wide %>%
pivot_longer(names_to = "genus", values_to = "mean_weight", cols = -plot_id)
str(surveys_long)
Note that now the NA
genera are included in the long
format data frame. Pivoting wider and then longer can be a useful way to
balance out a dataset so that every replicate has the same
composition.
We could also have used a specification for what columns to exclude.
In this example, we will use all columns except
plot_id
for the names variable. By using the minus sign in
the cols
argument, we omit plot_id
from being
reshaped.
R
surveys_wide %>%
pivot_longer(names_to = "genus", values_to = "mean_weight", cols = -plot_id) %>%
head()
Challenge
- Reshape the
surveys
data frame withyear
as columns,plot_id
as rows, and the number of genera per plot as the values. You will need to summarize before reshaping, and use the functionn_distinct()
to get the number of unique genera within a particular chunk of data. It’s a powerful function! See?n_distinct
for more.
R
surveys_wide_genera <- surveys %>%
group_by(plot_id, year) %>%
summarize(n_genera = n_distinct(genus)) %>%
pivot_wider(names_from = year, values_from = n_genera)
OUTPUT
#> `summarise()` has grouped output by 'plot_id'. You can override using the
#> `.groups` argument.
R
head(surveys_wide_genera)
Challenge (continued)
- Now take that data frame and
pivot_longer()
it, so each row is a uniqueplot_id
byyear
combination.
R
surveys_wide_genera %>%
pivot_longer(names_to = "year", values_to = "n_genera", cols = -plot_id)
Challenge (continued)
- The
surveys
data set has two measurement columns:hindfoot_length
andweight
. This makes it difficult to do things like look at the relationship between mean values of each measurement per year in different plot types. Let’s walk through a common solution for this type of problem. First, usepivot_longer()
to create a dataset where we have a names column calledmeasurement
and avalue
column that takes on the value of eitherhindfoot_length
orweight
. Hint: You’ll need to specify which columns will be part of the reshape.
R
surveys_long <- surveys %>%
pivot_longer(names_to = "measurement", values_to = "value", cols = c(hindfoot_length, weight))
- With this new data set, calculate the average of each
measurement
in eachyear
for each differentplot_type
. Thenpivot_wider()
them into a data set with a column forhindfoot_length
andweight
. Hint: You only need to specify the names and values columns forpivot_wider()
.
R
surveys_long %>%
group_by(year, measurement, plot_type) %>%
summarize(mean_value = mean(value, na.rm = TRUE)) %>%
pivot_wider(names_from = measurement, values_from = mean_value)
OUTPUT
#> `summarise()` has grouped output by 'year', 'measurement'. You can override
#> using the `.groups` argument.
Exporting data
Now that you have learned how to use
dplyr
to extract information from or
summarize your raw data, you may want to export these new data sets to
share them with your collaborators or to archive them.
Similar to the read_csv()
function used for reading CSV
files into R, there is a write_csv()
function that
generates CSV files from data frames.
Before using write_csv()
, we are going to create a new
folder, data
, in our working directory that will store this
generated dataset. We don’t want to write generated datasets in the same
directory as our raw data. It’s good practice to keep them separate. The
data_raw
folder should only contain the raw, unaltered
data, and should be left alone to make sure we don’t delete or modify
it. In contrast, our script will generate the contents of the
data
directory, so even if the files it contains are
deleted, we can always re-generate them.
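You can create the folder with the RStudio Files pane, or directly from R:
R
# showWarnings = FALSE keeps this quiet if the folder already exists
dir.create("data", showWarnings = FALSE)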
In preparation for our next lesson on plotting, we are going to prepare a cleaned up version of the data set that doesn’t include any missing data.
Let’s start by removing observations of animals for which
weight
and hindfoot_length
are missing, or the
sex
has not been determined:
R
surveys_complete <- surveys %>%
filter(!is.na(weight), # remove missing weight
!is.na(hindfoot_length), # remove missing hindfoot_length
!is.na(sex)) # remove missing sex
Because we are interested in plotting how species abundances have changed through time, we are also going to remove observations for rare species (i.e., that have been observed less than 50 times). We will do this in two steps: first we are going to create a data set that counts how often each species has been observed, and filter out the rare species; then, we will extract only the observations for these more common species:
R
## Extract the most common species_id
species_counts <- surveys_complete %>%
count(species_id) %>%
filter(n >= 50)
## Only keep the most common species
surveys_complete <- surveys_complete %>%
filter(species_id %in% species_counts$species_id)
To make sure that everyone has the same data set, check that
surveys_complete
has 30463 rows and 13 columns by typing
dim(surveys_complete)
.
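The check should look like this:
R
dim(surveys_complete)
#> [1] 30463    13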
Now that our data set is ready, we can save it as a CSV file in our
data
folder.
R
write_csv(surveys_complete, file = "data/surveys_complete.csv")
Key Points
- Use the
dplyr
package to manipulate data frames. - Use
select()
to choose variables from a data frame. - Use
filter()
to choose data based on values. - Use
mutate()
to create new variables. - Use
group_by()
andsummarize()
to work with subsets of data.
Content from Data visualization with ggplot2
Last updated on 2024-11-19 | Edit this page
Overview
Questions
- How do you make plots using R?
- How do you customize and modify plots?
Objectives
- Produce scatter plots, boxplots, and time series plots using ggplot.
- Set universal plot settings.
- Describe what faceting is and apply faceting in ggplot.
- Modify the aesthetics of an existing ggplot plot (including axis labels and color).
- Build complex and customized plots from data in a data frame.
We start by loading the required packages.
ggplot2
is included in the
tidyverse
package.
R
library(tidyverse)
If not still in the workspace, load the data we saved in the previous lesson.
R
surveys_complete <- read_csv("data/surveys_complete.csv")
Plotting with ggplot2
ggplot2
is a plotting package that
provides helpful commands to create complex plots from data in a data
frame. It provides a more programmatic interface for specifying what
variables to plot, how they are displayed, and general visual
properties. Therefore, we only need minimal changes if the underlying
data change or if we decide to change from a bar plot to a scatterplot.
This helps in creating publication quality plots with minimal amounts of
adjustments and tweaking.
ggplot2
refers to the name of the
package itself. When using the package we use the function
ggplot()
to generate the plots, and so
references to using the function will be referred to as
ggplot()
and the package as a whole as
ggplot2.
ggplot2
plots work best with data in
the ‘long’ format, i.e., a column for every variable, and a row for
every observation. Well-structured data will save you lots of time when
making figures with ggplot2.
ggplot graphics are built layer by layer by adding new elements. Adding layers in this fashion allows for extensive flexibility and customization of plots.
To build a ggplot, we will use the following basic template that can be used for different types of plots:
ggplot(data = <DATA>, mapping = aes(<MAPPINGS>)) + <GEOM_FUNCTION>()
- use the
ggplot()
function and bind the plot to a specific data frame using thedata
argument
R
ggplot(data = surveys_complete)
- define an aesthetic mapping (using the aesthetic (
aes
) function), by selecting the variables to be plotted and specifying how to present them in the graph, e.g., as x/y positions or characteristics such as size, shape, color, etc.
R
ggplot(data = surveys_complete, mapping = aes(x = weight, y = hindfoot_length))
-
add ‘geoms’ – graphical representations of the data in the plot (points, lines, bars).
ggplot2
offers many different geoms; we will use some common ones today, including:-
geom_point()
for scatter plots, dot plots, etc. -
geom_boxplot()
for, well, boxplots! -
geom_line()
for trend lines, time series, etc.
-
To add a geom to the plot, use the +
operator. Because we
have two continuous variables, let’s use geom_point()
first:
R
ggplot(data = surveys_complete, aes(x = weight, y = hindfoot_length)) +
geom_point()
The +
in the ggplot2
package is particularly useful because it allows you to modify existing
ggplot
objects. This means you can easily set up plot
“templates” and conveniently explore different types of plots, so the
above plot can also be generated with code like this:
R
# Assign plot to a variable
surveys_plot <- ggplot(data = surveys_complete,
mapping = aes(x = weight, y = hindfoot_length))
# Draw the plot
surveys_plot +
geom_point()
Notes
- Anything you put in the
ggplot()
function can be seen by any geom layers that you add (i.e., these are universal plot settings). This includes the x- and y-axis you set up inaes()
. - You can also specify aesthetics for a given geom independently of
the aesthetics defined globally in the
ggplot()
function. - The
+
sign used to add layers must be placed at the end of each line containing a layer. If, instead, the+
sign is added in the line before the other layer,ggplot2
will not add the new layer and will return an error message. - You may notice that we sometimes reference ‘ggplot2’ and sometimes ‘ggplot’. To clarify, ‘ggplot2’ is the name of the most recent version of the package. However, any time we call the function itself, it’s just called ‘ggplot’.
- The previous version of the
ggplot2
package, called ggplot
, which also contained the ggplot()
function, is now unsupported and has been removed from CRAN in order to reduce accidental installations and further confusion.
R
# This is the correct syntax for adding layers
surveys_plot +
geom_point()
# This will not add the new layer and will return an error message
surveys_plot
+ geom_point()
Challenge (optional)
Scatter plots can be useful exploratory tools for small datasets. For
data sets with large numbers of observations, such as the
surveys_complete
data set, overplotting of points can be a
limitation of scatter plots. One strategy for handling such settings is
to use hexagonal binning of observations. The plot space is tessellated
into hexagons. Each hexagon is assigned a color based on the number of
observations that fall within its boundaries. To use hexagonal binning
with ggplot2
, first install the R package
hexbin
from CRAN:
R
install.packages("hexbin")
library(hexbin)
Then use the geom_hex()
function:
R
surveys_plot +
geom_hex()
- What are the relative strengths and weaknesses of a hexagonal bin plot compared to a scatter plot? Examine the above scatter plot and compare it with the hexagonal bin plot that you created.
Building your plots iteratively
Building plots with ggplot2
is
typically an iterative process. We start by defining the dataset we’ll
use, lay out the axes, and choose a geom:
R
ggplot(data = surveys_complete, aes(x = weight, y = hindfoot_length)) +
geom_point()
Then, we start modifying this plot to extract more information from
it. For instance, we can add transparency (alpha
) to avoid
overplotting:
R
ggplot(data = surveys_complete, aes(x = weight, y = hindfoot_length)) +
geom_point(alpha = 0.1)
We can also add colors for all the points:
R
ggplot(data = surveys_complete, mapping = aes(x = weight, y = hindfoot_length)) +
geom_point(alpha = 0.1, color = "blue")
Or to color each species in the plot differently, you could use a
vector as an input to the argument color.
ggplot2
will provide a different color
corresponding to different values in the vector. Here is an example
where we color with species_id
:
R
ggplot(data = surveys_complete, mapping = aes(x = weight, y = hindfoot_length)) +
geom_point(alpha = 0.1, aes(color = species_id))
Challenge
Use what you just learned to create a scatter plot of
weight
over species_id
with the plot types
showing in different colors. Is this a good way to show this type of
data?
R
ggplot(data = surveys_complete,
mapping = aes(x = species_id, y = weight)) +
geom_point(aes(color = plot_type))
Boxplot
We can use boxplots to visualize the distribution of weight within each species:
R
ggplot(data = surveys_complete, mapping = aes(x = species_id, y = weight)) +
geom_boxplot()
By adding points to the boxplot, we can have a better idea of the
number of measurements and of their distribution. Because the boxplot
will show the outliers by default, these points will be plotted twice –
by geom_boxplot
and geom_jitter
. To avoid this
we must specify that no outliers should be added to the boxplot by
specifying outlier.shape = NA
.
R
ggplot(data = surveys_complete, mapping = aes(x = species_id, y = weight)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(alpha = 0.3, color = "tomato")
Notice how the boxplot layer is behind the jitter layer? What do you need to change in the code to put the boxplot in front of the points such that it’s not hidden?
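One possible answer, since ggplot2 draws layers in the order they are added: list the jitter layer first and the boxplot last.
R
ggplot(data = surveys_complete, mapping = aes(x = species_id, y = weight)) +
  geom_jitter(alpha = 0.3, color = "tomato") +
  geom_boxplot(outlier.shape = NA)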
Challenges
Boxplots are useful summaries, but hide the shape of the distribution. For example, if there is a bimodal distribution, it would not be observed with a boxplot. An alternative to the boxplot is the violin plot (sometimes known as a beanplot), where the shape (of the density of points) is drawn.
- Replace the box plot with a violin plot; see
geom_violin()
.
R
ggplot(data = surveys_complete, mapping = aes(x = species_id, y = weight)) +
geom_jitter(alpha = 0.3, color = "tomato") +
geom_violin()
Challenges (continued)
In many types of data, it is important to consider the scale of the observations. For example, it may be worth changing the scale of the axis to better distribute the observations in the space of the plot. Changing the scale of the axes is done similarly to adding/modifying other components (i.e., by incrementally adding commands). Try making these modifications:
- Represent weight on the log10 scale; see
scale_y_log10()
.
R
ggplot(data = surveys_complete, mapping = aes(x = species_id, y = weight)) +
scale_y_log10() +
geom_jitter(alpha = 0.3, color = "tomato") +
geom_boxplot(outlier.shape = NA)
Challenges (continued)
So far, we’ve looked at the distribution of weight within species. Try making a new plot to explore the distribution of another variable within each species.
- Create boxplot for
hindfoot_length
. Overlay the boxplot layer on a jitter layer to show actual measurements.
R
ggplot(data = surveys_complete, mapping = aes(x = species_id, y = hindfoot_length)) +
geom_jitter(alpha = 0.3, color = "tomato") +
geom_boxplot(outlier.shape = NA)
Challenges (continued)
- Add color to the data points on your boxplot according to the plot
from which the sample was taken (
plot_id
).
Hint: Check the class for plot_id
. Consider changing the
class of plot_id
from integer to factor. Why does this
change how R makes the graph?
Plotting time series data
Let’s calculate the number of counts per year for each genus. First we need to group the data and count records within each group:
R
yearly_counts <- surveys_complete %>%
count(year, genus)
Timelapse data can be visualized as a line plot with years on the x-axis and counts on the y-axis:
R
ggplot(data = yearly_counts, aes(x = year, y = n)) +
geom_line()
Unfortunately, this does not work because we plotted data for all the
genera together. We need to tell ggplot to draw a line for each genus by
modifying the aesthetic function to include
group = genus
:
R
ggplot(data = yearly_counts, aes(x = year, y = n, group = genus)) +
geom_line()
We will be able to distinguish genera in the plot if we add colors
(using color
also automatically groups the data):
R
ggplot(data = yearly_counts, aes(x = year, y = n, color = genus)) +
geom_line()
Integrating the pipe operator with ggplot2
In the previous lesson, we saw how to use the pipe operator
%>%
to use different functions in a sequence and create
a coherent workflow. We can also use the pipe operator to pass the
data
argument to the ggplot()
function. The
hard part is to remember that to build your ggplot, you need to use
+
and not %>%
.
R
yearly_counts %>%
ggplot(mapping = aes(x = year, y = n, color = genus)) +
geom_line()
The pipe operator can also be used to link data manipulation with consequent data visualization.
R
yearly_counts_graph <- surveys_complete %>%
count(year, genus) %>%
ggplot(mapping = aes(x = year, y = n, color = genus)) +
geom_line()
yearly_counts_graph
Faceting
ggplot
has a special technique called faceting
that allows the user to split one plot into multiple plots based on a
factor included in the dataset. We will use it to make a time series
plot for each genus:
R
ggplot(data = yearly_counts, aes(x = year, y = n)) +
geom_line() +
facet_wrap(facets = vars(genus))
Now we would like to split the line in each plot by the sex of each
individual measured. To do that we need to make counts in the data frame
grouped by year
, genus
, and
sex
:
R
yearly_sex_counts <- surveys_complete %>%
count(year, genus, sex)
We can now make the faceted plot by splitting further by sex using
color
(within a single plot):
R
ggplot(data = yearly_sex_counts, mapping = aes(x = year, y = n, color = sex)) +
geom_line() +
facet_wrap(facets = vars(genus))
We can also facet both by sex and genus:
R
ggplot(data = yearly_sex_counts,
mapping = aes(x = year, y = n, color = sex)) +
geom_line() +
facet_grid(rows = vars(sex), cols = vars(genus))
You can also organize the panels only by rows (or only by columns):
R
# One column, facet by rows
ggplot(data = yearly_sex_counts,
mapping = aes(x = year, y = n, color = sex)) +
geom_line() +
facet_grid(rows = vars(genus))
R
# One row, facet by column
ggplot(data = yearly_sex_counts,
mapping = aes(x = year, y = n, color = sex)) +
geom_line() +
facet_grid(cols = vars(genus))
Note: ggplot2
before version 3.0.0 used
formulas to specify how plots are faceted. If you encounter
facet_grid
/wrap(...)
code containing
~
, please read https://ggplot2.tidyverse.org/news/#tidy-evaluation.
ggplot2
themes
Usually plots with white background look more readable when printed.
Every single component of a ggplot
graph can be customized
using the generic theme()
function, as we will see below.
However, there are pre-loaded themes available that change the overall
appearance of the graph without much effort.
For example, we can change our previous graph to have a simpler white
background using the theme_bw()
function:
R
ggplot(data = yearly_sex_counts,
mapping = aes(x = year, y = n, color = sex)) +
geom_line() +
facet_wrap(vars(genus)) +
theme_bw()
In addition to theme_bw()
, which changes the plot
background to white, ggplot2
comes with
several other themes which can be useful to quickly change the look of
your visualization. The complete list of themes is available at https://ggplot2.tidyverse.org/reference/ggtheme.html.
theme_minimal()
and theme_light()
are popular,
and theme_void()
can be useful as a starting point to
create a new hand-crafted theme.
The ggthemes package provides a wide variety of options.
Challenge
Use what you just learned to create a plot that depicts how the average weight of each species changes through the years.
R
yearly_weight <- surveys_complete %>%
group_by(year, species_id) %>%
summarize(avg_weight = mean(weight))
OUTPUT
#> `summarise()` has grouped output by 'year'. You can override using the
#> `.groups` argument.
R
ggplot(data = yearly_weight, mapping = aes(x=year, y=avg_weight)) +
geom_line() +
facet_wrap(vars(species_id)) +
theme_bw()
Customization
Take a look at the ggplot2
cheat sheet, and think of ways you could improve the plot.
Now, let’s change names of axes to something more informative than ‘year’ and ‘n’ and add a title to the figure:
R
ggplot(data = yearly_sex_counts, aes(x = year, y = n, color = sex)) +
geom_line() +
facet_wrap(vars(genus)) +
labs(title = "Observed genera through time",
x = "Year of observation",
y = "Number of individuals") +
theme_bw()
The axes have more informative names, but their readability can be
improved by increasing the font size. This can be done with the generic
theme()
function:
R
ggplot(data = yearly_sex_counts, mapping = aes(x = year, y = n, color = sex)) +
geom_line() +
facet_wrap(vars(genus)) +
labs(title = "Observed genera through time",
x = "Year of observation",
y = "Number of individuals") +
theme_bw() +
theme(text=element_text(size = 16))
Note that it is also possible to change the fonts of your plots. If
you are on Windows, you may have to install the extrafont
package, and follow the instructions included in the README for this
package.
After our manipulations, you may notice that the values on the x-axis
are still not properly readable. Let’s change the orientation of the
labels and adjust them vertically and horizontally so they don’t
overlap. You can use a 90 degree angle, or experiment to find the
appropriate angle for diagonally oriented labels. We can also modify the
facet label text (strip.text
) to italicize the genus
names:
R
ggplot(data = yearly_sex_counts, mapping = aes(x = year, y = n, color = sex)) +
geom_line() +
facet_wrap(vars(genus)) +
labs(title = "Observed genera through time",
x = "Year of observation",
y = "Number of individuals") +
theme_bw() +
theme(axis.text.x = element_text(colour = "grey20", size = 12, angle = 90, hjust = 0.5, vjust = 0.5),
axis.text.y = element_text(colour = "grey20", size = 12),
strip.text = element_text(face = "italic"),
text = element_text(size = 16))
If you like the changes you created better than the default theme, you can save them as an object to be able to easily apply them to other plots you may create:
R
grey_theme <- theme(axis.text.x = element_text(colour="grey20", size = 12,
angle = 90, hjust = 0.5,
vjust = 0.5),
axis.text.y = element_text(colour = "grey20", size = 12),
text=element_text(size = 16))
ggplot(surveys_complete, aes(x = species_id, y = hindfoot_length)) +
geom_boxplot() +
grey_theme
Challenge
With all of this information in hand, please take another five
minutes to either improve one of the plots generated in this exercise or
create a beautiful graph of your own. Use the RStudio ggplot2
cheat sheet for inspiration.
Here are some ideas:
- See if you can change the thickness of the lines.
- Can you find a way to change the name of the legend? What about its labels?
- Try using a different color palette (see https://r-graphics.org/chapter-colors).
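For example, the first two ideas might look like the sketch below (one of many possible solutions; note that linewidth replaces size for line thickness in recent ggplot2 versions):
R
ggplot(data = yearly_sex_counts, aes(x = year, y = n, color = sex)) +
  geom_line(linewidth = 1) +                  # thicker lines
  scale_color_manual(name = "Sex",            # legend title
                     values = c("darkorange", "darkblue"),
                     labels = c("female", "male")) +
  facet_wrap(vars(genus)) +
  theme_bw()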
Arranging plots
Faceting is a great tool for splitting one plot into multiple plots,
but sometimes you may want to produce a single figure that contains
multiple plots using different variables or even different data frames.
The patchwork
package allows us to combine
separate ggplots into a single figure while keeping everything aligned
properly. Like most R packages, we can install patchwork
from CRAN, the R package repository:
R
install.packages("patchwork")
After you have loaded the patchwork
package you can use
+
to place plots next to each other, /
to
arrange them vertically, and plot_layout()
to determine how
much space each plot uses:
R
library(patchwork)
plot_weight <- ggplot(data = surveys_complete, aes(x = species_id, y = weight)) +
geom_boxplot() +
labs(x = "Species", y = expression(log[10](Weight))) +
scale_y_log10()
plot_count <- ggplot(data = yearly_counts, aes(x = year, y = n, color = genus)) +
geom_line() +
labs(x = "Year", y = "Abundance")
plot_weight / plot_count + plot_layout(heights = c(3, 2))
You can also use parentheses ()
to create more complex
layouts. There are many useful examples on the patchwork website.
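For instance, reusing plot_weight as a stand-in third panel to keep this runnable, parentheses group two plots side by side above a third:
R
(plot_weight | plot_count) / plot_weight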
Exporting plots
After creating your plot, you can save it to a file in your favorite
format. The Export tab in the Plot pane in RStudio will
save your plots at low resolution, which will not be accepted by many
journals and will not scale well for posters. (As an aside, the ggplot2
extensions website provides a list of packages that extend the
capabilities of ggplot2
, including
additional themes.)
Instead, use the ggsave()
function, which allows you to
easily change the dimension and resolution of your plot by adjusting the
appropriate arguments (width
, height
and
dpi
):
R
my_plot <- ggplot(data = yearly_sex_counts,
aes(x = year, y = n, color = sex)) +
geom_line() +
facet_wrap(vars(genus)) +
labs(title = "Observed genera through time",
x = "Year of observation",
y = "Number of individuals") +
theme_bw() +
theme(axis.text.x = element_text(colour = "grey20", size = 12, angle = 90,
hjust = 0.5, vjust = 0.5),
axis.text.y = element_text(colour = "grey20", size = 12),
text = element_text(size = 16))
ggsave("name_of_file.png", my_plot, width = 15, height = 10)
## This also works for plots combined with patchwork
plot_combined <- plot_weight / plot_count + plot_layout(heights = c(3, 2))
ggsave("plot_combined.png", plot_combined, width = 10, dpi = 300)
Note: The parameters width and height also determine the font size in the saved plot.
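For example (a hedged sketch; the file names and sizes are arbitrary), saving the same plot at a larger size makes the text look smaller relative to the plot, because fonts are sized in absolute points:
R
ggsave("my_plot_large.png", my_plot, width = 30, height = 20) # text appears smaller
ggsave("my_plot_small.png", my_plot, width = 7.5, height = 5) # text appears larger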
Key Points
- Start simple and build your plots iteratively.
- The ggplot() function initiates a plot, and geom_ functions add representations of your data.
- Use aes() when mapping a variable from the data to a part of the plot.
- Use facet_ to partition a plot into multiple plots based on a factor included in the dataset.
- Use premade theme_ functions to broadly change appearance, and the theme() function to fine-tune.
- The patchwork library can combine separate plots into a single figure.
- Use ggsave() to save plots in your favorite format and dimensions.
Content from SQL databases and R
Last updated on 2024-11-19 | Edit this page
Overview
Questions
- How can I connect and get data from a database in R?
- How can I retrieve data from multiple database tables?
- How can I create an SQLite database?
Objectives
- Access a database from R.
- Run SQL queries in R using RSQLite and dplyr.
- Describe the lazy behavior of dplyr on data stored in a database outside of R.
- Prototype queries and retrieve all final results.
- Create complex queries across one or multiple database tables.
- Create an SQLite database from existing .csv files.
Introduction
So far, we have dealt with small datasets that easily fit into your computer’s memory. But what about datasets that are too large for your computer to handle as a whole? In this case, storing the data outside of R and organizing it in a database is helpful. Connecting to the database allows you to retrieve only the chunks needed for the current analysis.
Even better, many large datasets are already available in public or private databases. You can query them without having to download the data first.
R can connect to almost any existing database type. Most common database types have R packages that allow you to connect to them (e.g., RSQLite, RMySQL, etc.). Furthermore, the dplyr package you used in the previous chapter, in conjunction with dbplyr, supports connecting to the widely-used open source databases SQLite, MySQL and PostgreSQL, as well as Google's BigQuery, and it can also be extended to other database types (a vignette in the dplyr package explains how to do it). RStudio has created a website that provides documentation and best practices to work on database interfaces.
Interfacing with databases using dplyr focuses on retrieving and analyzing datasets by generating SELECT SQL statements, but it doesn't modify the database itself. dplyr does not offer functions to UPDATE or DELETE entries. If you need these functionalities, you will need to use additional R packages (e.g., RSQLite). Here we will demonstrate how to interact with a database using dplyr, using both dplyr's verb syntax and SQL syntax.
The portal_mammals database
We will continue to explore the surveys data you are already familiar with from previous lessons. First, we are going to install the dbplyr and RSQLite packages:
R
install.packages(c("dbplyr", "RSQLite"))
The SQLite database is contained in a single file, portal_mammals.sqlite, that you generated during the SQL lesson. If you don't have it, you can download it from Figshare into the data_raw subdirectory using:
R
dir.create("data_raw", showWarnings = FALSE)
download.file(url = "https://ndownloader.figshare.com/files/2292171",
destfile = "data_raw/portal_mammals.sqlite", mode = "wb")
Connecting to databases
We can point R to this database using:
R
library(dplyr)
library(dbplyr)
OUTPUT
#>
#> Attaching package: 'dbplyr'
OUTPUT
#> The following objects are masked from 'package:dplyr':
#>
#> ident, sql
R
mammals <- DBI::dbConnect(RSQLite::SQLite(), "data_raw/portal_mammals.sqlite")
This command uses two packages that help dbplyr and dplyr talk to the SQLite database: DBI and RSQLite. DBI is not something that you'll use directly as a user. It allows R to send commands to databases irrespective of the database management system used. The RSQLite package allows R to interface with SQLite databases.
This command does not load the data into the R session (as the read_csv() function did). Instead, it merely instructs R to connect to the SQLite database contained in the portal_mammals.sqlite file.
Using a similar approach, you could connect to many other database management systems that are supported by R including MySQL, PostgreSQL, BigQuery, etc.
Let's take a closer look at the mammals database we just connected to:
R
src_dbi(mammals)
OUTPUT
#> src: sqlite 3.46.0 [/home/runner/work/R-ecology-lesson-previous/R-ecology-lesson-previous/site/built/data_raw/portal_mammals.sqlite]
#> tbls: plots, species, surveys
Just like a spreadsheet with multiple worksheets, a SQLite database can contain multiple tables. In this case, three of them are listed in the tbls row in the output above:
- plots
- species
- surveys
Now that we know we can connect to the database, let’s explore how to get the data from its tables into R.
Querying the database with the SQL syntax
To connect to tables within a database, you can use the tbl() function from dplyr. This function can be used to send SQL queries to the database. To demonstrate this functionality, let's select the columns "year", "species_id", and "plot_id" from the surveys table:
R
tbl(mammals, sql("SELECT year, species_id, plot_id FROM surveys"))
With this approach you can use any of the SQL queries we have seen in the database lesson.
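For instance (a hedged sketch; the particular WHERE clause is just an illustration), a filter can be embedded directly in the SQL:
R
tbl(mammals, sql("SELECT year, species_id, plot_id FROM surveys WHERE year > 1990"))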
Querying the database with the dplyr syntax
One of the strengths of dplyr is that the same operation can be done using dplyr's verbs instead of writing SQL. First, we select the table on which to do the operations by creating the surveys object, and then we use the standard dplyr syntax as if it were a data frame:
R
surveys <- tbl(mammals, "surveys")
surveys %>%
select(year, species_id, plot_id)
In this case, the surveys object behaves like a data frame. Several functions that can be used with data frames can also be used on tables from a database. For instance, the head() function can be used to check the first 10 rows of the table:
R
head(surveys, n = 10)
OUTPUT
#> # Source: SQL [10 x 9]
#> # Database: sqlite 3.46.0 [/home/runner/work/R-ecology-lesson-previous/R-ecology-lesson-previous/site/built/data_raw/portal_mammals.sqlite]
#> record_id month day year plot_id species_id sex hindfoot_length weight
#> <int> <int> <int> <int> <int> <chr> <chr> <int> <int>
#> 1 1 7 16 1977 2 NL M 32 NA
#> 2 2 7 16 1977 3 NL M 33 NA
#> 3 3 7 16 1977 2 DM F 37 NA
#> 4 4 7 16 1977 7 DM M 36 NA
#> 5 5 7 16 1977 3 DM M 35 NA
#> 6 6 7 16 1977 1 PF M 14 NA
#> 7 7 7 16 1977 2 PE F NA NA
#> 8 8 7 16 1977 1 DM M 37 NA
#> 9 9 7 16 1977 1 DM F 34 NA
#> 10 10 7 16 1977 6 PF F 20 NA
This output of the head() command looks just like a regular data.frame: the table has 9 columns and the head() command shows us the first 10 rows. Note that the columns plot_type, taxa, genus, and species are missing. These are now located in the tables plots and species, which we will join together in a moment.
However, some functions don't work quite as expected. For instance, let's check how many rows there are in total using nrow():
R
nrow(surveys)
OUTPUT
#> [1] NA
That's strange - R doesn't know how many rows the surveys table contains, so it returns NA instead. You may also notice that some query outputs show ?? in their dimensions (e.g., SQL [?? x 3]), indicating that the number of rows isn't known.
The reason for this behavior highlights a key difference between using dplyr on datasets in memory (e.g. loaded into your R session via read_csv()) and those provided by a database. To understand it, we take a closer look at how dplyr communicates with our SQLite database.
SQL translation
Relational databases typically use a special-purpose language, Structured Query Language (SQL), to manage and query data.
For example, the following SQL query returns the first 10 rows from the surveys table:
Behind the scenes, dplyr:
- translates your R code into SQL
- submits it to the database
- translates the database's response into an R data frame
To lift the curtain, we can use dplyr's show_query() function to show which SQL commands are actually sent to the database:
R
show_query(head(surveys, n = 10))
The output shows the actual SQL query sent to the database; it matches our manually constructed SELECT statement above.
Instead of having to formulate the SQL query ourselves - and having to mentally switch back and forth between R and SQL syntax - we can delegate this translation to dplyr. (You don't even need to know SQL to interact with a database via dplyr!)
dplyr, in turn, doesn't do the real work of subsetting the table, either. Instead, it merely sends the query to the database, waits for its response and returns it to us.
That way, R never gets to see the full surveys table - and that's why it could not tell us how many rows it contains. On the bright side, this allows us to work with large datasets - even ones too large to fit into our computer's memory.
dplyr can translate many different query types into SQL, allowing us to, e.g., select() specific columns, filter() rows, or join tables. To see this in action, let's compose a few queries with dplyr.
Simple database queries
First, let's only request rows of the surveys table in which weight is less than 5, and keep only the species_id, sex, and weight columns.
R
surveys %>%
filter(weight < 5) %>%
select(species_id, sex, weight)
OUTPUT
#> # Source: SQL [?? x 3]
#> # Database: sqlite 3.46.0 [/home/runner/work/R-ecology-lesson-previous/R-ecology-lesson-previous/site/built/data_raw/portal_mammals.sqlite]
#> species_id sex weight
#> <chr> <chr> <int>
#> 1 PF M 4
#> 2 PF F 4
#> 3 PF <NA> 4
#> 4 PF F 4
#> 5 PF F 4
#> 6 RM M 4
#> 7 RM F 4
#> 8 RM M 4
#> 9 RM M 4
#> 10 RM M 4
#> # ℹ more rows
Executing this command will return a table with 10 rows and the requested species_id, sex and weight columns. Great!
... but wait, why are there only 10 rows?
The last line, # ℹ more rows, indicates that there are more results that fit our filtering criterion. So why was R lazy, retrieving only 10 of them?
Laziness
Hadley Wickham, the author of dplyr, explains:
When working with databases, dplyr tries to be as lazy as possible:
- It never pulls data into R unless you explicitly ask for it.
- It delays doing any work until the last possible moment - it collects together everything you want to do and then sends it to the database in one step.
When you construct a dplyr query, you can connect multiple verbs into a single pipeline. For example, we combined the filter() and select() verbs using the %>% pipe.
If we wanted to, we could add on even more steps, e.g. remove the sex column in an additional select() call:
R
data_subset <- surveys %>%
filter(weight < 5) %>%
select(species_id, sex, weight)
data_subset %>%
select(-sex)
OUTPUT
#> # Source: SQL [?? x 2]
#> # Database: sqlite 3.46.0 [/home/runner/work/R-ecology-lesson-previous/R-ecology-lesson-previous/site/built/data_raw/portal_mammals.sqlite]
#> species_id weight
#> <chr> <int>
#> 1 PF 4
#> 2 PF 4
#> 3 PF 4
#> 4 PF 4
#> 5 PF 4
#> 6 RM 4
#> 7 RM 4
#> 8 RM 4
#> 9 RM 4
#> 10 RM 4
#> # ℹ more rows
Just like the first select(species_id, sex, weight) call, the select(-sex) command is not executed by R. It is sent to the database instead. Only the final result is retrieved and displayed to you.
Of course, we could always add on more steps; e.g., we could filter by species_id or minimum weight. That's why R doesn't retrieve the full set of results - instead it only retrieves the first 10 results from the database by default. (After all, you might want to add an additional step and get the database to do more work...)
To instruct R to stop being lazy, e.g. to retrieve all of the query results from the database, we add the collect() command to our pipe. It indicates that our database query is finished: time to get the final results and load them into the R session.
R
data_subset <- surveys %>%
filter(weight < 5) %>%
select(species_id, sex, weight) %>%
collect()
Now we have all 17 rows that match our query in a data.frame and can continue to work with them exclusively in R, without communicating with the database.
Complex database queries
dplyr enables database queries across one or multiple database tables, using the same single- and multiple-table verbs you encountered previously. This means you can use the same commands regardless of whether you interact with a remote database or a local dataset! This is a really useful feature if you work with large datasets: you can first prototype your code on a small subset that fits into memory, and when your code is ready, you can change the input dataset to your full database without having to change the syntax.
On the other hand, being able to use SQL queries directly can be useful if your collaborators have already put together complex queries to prepare the dataset that you need for your analysis.
To illustrate how to use dplyr with these complex queries, we are going to join the plots and surveys tables. The plots table in the database contains information about the different plots surveyed by the researchers. To access it, we point the tbl() command to it:
R
plots <- tbl(mammals, "plots")
plots
OUTPUT
#> # Source: table<`plots`> [?? x 2]
#> # Database: sqlite 3.46.0 [/home/runner/work/R-ecology-lesson-previous/R-ecology-lesson-previous/site/built/data_raw/portal_mammals.sqlite]
#> plot_id plot_type
#> <int> <chr>
#> 1 1 Spectab exclosure
#> 2 2 Control
#> 3 3 Long-term Krat Exclosure
#> 4 4 Control
#> 5 5 Rodent Exclosure
#> 6 6 Short-term Krat Exclosure
#> 7 7 Rodent Exclosure
#> 8 8 Control
#> 9 9 Spectab exclosure
#> 10 10 Rodent Exclosure
#> # ℹ more rows
The plot_id column also features in the surveys table:
R
surveys
OUTPUT
#> # Source: table<`surveys`> [?? x 9]
#> # Database: sqlite 3.46.0 [/home/runner/work/R-ecology-lesson-previous/R-ecology-lesson-previous/site/built/data_raw/portal_mammals.sqlite]
#> record_id month day year plot_id species_id sex hindfoot_length weight
#> <int> <int> <int> <int> <int> <chr> <chr> <int> <int>
#> 1 1 7 16 1977 2 NL M 32 NA
#> 2 2 7 16 1977 3 NL M 33 NA
#> 3 3 7 16 1977 2 DM F 37 NA
#> 4 4 7 16 1977 7 DM M 36 NA
#> 5 5 7 16 1977 3 DM M 35 NA
#> 6 6 7 16 1977 1 PF M 14 NA
#> 7 7 7 16 1977 2 PE F NA NA
#> 8 8 7 16 1977 1 DM M 37 NA
#> 9 9 7 16 1977 1 DM F 34 NA
#> 10 10 7 16 1977 6 PF F 20 NA
#> # ℹ more rows
Because plot_id is listed in both tables, we can use it to look up matching records, and join the two tables.
If we have two tables named x and y with a common column called "ID", we can join them using 'join' functions, two of which are described and illustrated below.
inner_join(): returns all rows from x where there are matching values in y, and all columns from x and y.
left_join(): returns all rows from x, and all columns from x and y. Rows in x with no match in y will have NA values in the new columns.
In both forms of join, if there are multiple matches between x and y, all combinations of the matches are returned. For the full list of 'join' functions, check out the tidyverse join page. A small sketch with toy tables follows.
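A minimal illustration (the tibbles here are invented for this example, not part of the portal database):
R
library(dplyr)
x <- tibble(ID = c(1, 2, 3), value_x = c("a", "b", "c"))
y <- tibble(ID = c(2, 3, 4), value_y = c("B", "C", "D"))

inner_join(x, y, by = "ID") # 2 rows: only IDs 2 and 3 appear in both tables
left_join(x, y, by = "ID")  # 3 rows: all of x; value_y is NA for ID 1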
In our example, the two tables we want to join are ‘plots’ and ‘surveys’.
For example, to extract all surveys for the first plot, which has plot_id 1, we can do:
R
plots %>%
filter(plot_id == 1) %>%
inner_join(surveys) %>%
collect()
OUTPUT
#> Joining with `by = join_by(plot_id)`
OUTPUT
#> # A tibble: 1,995 × 10
#> plot_id plot_type record_id month day year species_id sex
#> <int> <chr> <int> <int> <int> <int> <chr> <chr>
#> 1 1 Spectab exclosure 6 7 16 1977 PF M
#> 2 1 Spectab exclosure 8 7 16 1977 DM M
#> 3 1 Spectab exclosure 9 7 16 1977 DM F
#> 4 1 Spectab exclosure 78 8 19 1977 PF M
#> 5 1 Spectab exclosure 80 8 19 1977 DS M
#> 6 1 Spectab exclosure 218 9 13 1977 PF M
#> 7 1 Spectab exclosure 222 9 13 1977 DS M
#> 8 1 Spectab exclosure 239 9 13 1977 DS M
#> 9 1 Spectab exclosure 263 10 16 1977 DM M
#> 10 1 Spectab exclosure 270 10 16 1977 DM F
#> # ℹ 1,985 more rows
#> # ℹ 2 more variables: hindfoot_length <int>, weight <int>
Important Note: Without the collect() statement, only the first 10 matching rows are returned. By adding collect(), the full set of 1,995 rows is retrieved.
Challenge
Write a query that returns the number of rodents observed in each plot in each year.
Hint: Connect to the species table and write a query that joins the species and survey tables together to exclude all non-rodents. The query should return counts of rodents by year.
Optional: Write a query in SQL that will produce the same result. You can join multiple tables together using the following syntax, where the foreign key refers to your unique id (e.g., species_id):
R
## with dplyr syntax
species <- tbl(mammals, "species")
left_join(surveys, species) %>%
filter(taxa == "Rodent") %>%
group_by(taxa, year, plot_id) %>%
tally() %>%
collect()
OUTPUT
#> Joining with `by = join_by(species_id)`
R
## with SQL syntax
query <- paste("
SELECT a.year, a.plot_id, b.taxa, count(*) as count
FROM surveys a
JOIN species b
ON a.species_id = b.species_id
AND b.taxa = 'Rodent'
GROUP BY b.taxa, a.year, a.plot_id",
sep = "")
tbl(mammals, sql(query))
Challenge
Write a query that returns the total number of rodents in each genus caught in the different plot types.
Hint: Write a query that joins the species, plot, and survey tables together. The query should return counts of genus by plot type.
R
species <- tbl(mammals, "species")
genus_counts <- left_join(surveys, plots) %>%
left_join(species) %>%
filter(taxa == "Rodent") %>%
group_by(plot_type, genus) %>%
tally() %>%
collect()
This is useful if we are interested in estimating the number of individuals belonging to each genus found in each plot type. But what if we were interested in the number of genera found in each plot type? Using tally() gives the number of individuals; instead, we need to use n_distinct() to count the number of unique values found in a column.
R
species <- tbl(mammals, "species")
unique_genera <- left_join(surveys, plots) %>%
left_join(species) %>%
group_by(plot_type) %>%
summarize(
n_genera = n_distinct(genus)
) %>%
collect()
OUTPUT
#> Joining with `by = join_by(plot_id)`
#> Joining with `by = join_by(species_id)`
n_distinct(), like the other dplyr functions we have used in this lesson, works not only on database connections but also on regular data frames.
Creating a new SQLite database
So far, we have used a previously prepared SQLite database. But we can also use R to create a new database, e.g. from existing csv files. Let's recreate the mammals database that we've been working with, in R. First, let's download and read in the csv files. We'll load the tidyverse to gain access to the read_csv() function.
R
download.file("https://ndownloader.figshare.com/files/3299483",
"data_raw/species.csv")
download.file("https://ndownloader.figshare.com/files/10717177",
"data_raw/surveys.csv")
download.file("https://ndownloader.figshare.com/files/3299474",
"data_raw/plots.csv")
library(tidyverse)
species <- read_csv("data_raw/species.csv")
OUTPUT
#> Rows: 54 Columns: 4
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (4): species_id, genus, species, taxa
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
R
surveys <- read_csv("data_raw/surveys.csv")
OUTPUT
#> Rows: 35549 Columns: 9
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (2): species_id, sex
#> dbl (7): record_id, month, day, year, plot_id, hindfoot_length, weight
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
R
plots <- read_csv("data_raw/plots.csv")
OUTPUT
#> Rows: 24 Columns: 2
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (1): plot_type
#> dbl (1): plot_id
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
You can also create a new SQLite database with dplyr by adding an argument to the command used to open an existing .sqlite file. The create = TRUE argument instructs R to create a new, empty database instead.
Caution: When create = TRUE is added, any existing database at the same location is overwritten without warning.
R
my_db_file <- "data/portal-database-output.sqlite"
my_db <- src_sqlite(my_db_file, create = TRUE)
WARNING
#> Warning: `src_sqlite()` was deprecated in dplyr 1.0.0.
#> ℹ Please use `tbl()` directly with a database connection
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
Our new database is currently empty; it doesn't contain any tables:
R
my_db
OUTPUT
#> src: sqlite 3.46.0 [/home/runner/work/R-ecology-lesson-previous/R-ecology-lesson-previous/site/built/data/portal-database-output.sqlite]
#> tbls:
To add tables, we copy the existing data.frames into the database one by one:
R
copy_to(my_db, surveys)
copy_to(my_db, plots)
my_db
If you check the location of our database, you'll see that data is automatically being written to disk. R and dplyr not only provide ways to query existing databases, they also provide functionality to create your own databases from flat files!
Challenge
Add the remaining species table to the my_db database and run some of your queries from earlier in the lesson to verify that you have faithfully recreated the mammals database. One possible approach is sketched below.
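A minimal sketch, assuming the species data frame read from species.csv is still in your session:
R
copy_to(my_db, species)
my_db # tbls should now list: plots, species, surveys

## e.g., re-run an earlier query against the recreated database
tbl(my_db, "surveys") %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight) %>%
  collect()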
Note: In this example, we first loaded all of the data into the R session by reading the three csv files. Because all the data has to flow through R, this is not suitable for very large datasets.
Note: Finally, to close the connection to the mammals database you may use DBI::dbDisconnect(mammals); this discards all pending work and frees resources, e.g. memory.
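For example, at the end of your script:
R
## close the connection when you are done querying the database
DBI::dbDisconnect(mammals)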
Key Points
- tbl() connects to a database and can send SQL queries.
- Use dplyr syntax to extract information from SQL tables.
- dplyr laziness only pulls the needed information, speeding up data retrieval.
- Use src_sqlite() to create a new empty SQLite database and copy_to() to add data to it.