Title: | Data Table Back-End for 'dplyr' |
---|---|
Description: | Provides a data.table backend for 'dplyr'. The goal of 'dtplyr' is to allow you to write 'dplyr' code that is automatically translated to the equivalent, but usually much faster, data.table code. |
Authors: | Hadley Wickham [cre, aut], Maximilian Girlich [aut], Mark Fairbanks [aut], Ryan Dickerson [aut], Posit Software, PBC [cph, fnd] |
Maintainer: | Hadley Wickham <hadley@posit.co> |
License: | MIT + file LICENSE |
Version: | 1.3.1.9000 |
Built: | 2024-12-04 05:12:39 UTC |
Source: | https://github.com/tidyverse/dtplyr |
This is a method for the dplyr arrange() generic. It is translated to an order() call in the i argument of [.data.table.
## S3 method for class 'dtplyr_step' arrange(.data, ..., .by_group = FALSE)
## S3 method for class 'dtplyr_step' arrange(.data, ..., .by_group = FALSE)
.data |
A |
... |
< |
.by_group |
If |
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) dt %>% arrange(vs, cyl) dt %>% arrange(desc(vs), cyl) dt %>% arrange(across(mpg:disp))
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) dt %>% arrange(vs, cyl) dt %>% arrange(desc(vs), cyl) dt %>% arrange(across(mpg:disp))
collect()
returns a tibble, grouped if needed.
compute()
generates an intermediate assignment in the translation.
as.data.table()
returns a data.table.
as.data.frame()
returns a data frame.
as_tibble()
returns a tibble.
## S3 method for class 'dtplyr_step' collect(x, ...) ## S3 method for class 'dtplyr_step' compute(x, name = unique_name(), ...) ## S3 method for class 'dtplyr_step' as.data.table(x, keep.rownames = FALSE, ...) ## S3 method for class 'dtplyr_step' as.data.frame(x, ...) ## S3 method for class 'dtplyr_step' as_tibble(x, ..., .name_repair = "check_unique")
## S3 method for class 'dtplyr_step' collect(x, ...) ## S3 method for class 'dtplyr_step' compute(x, name = unique_name(), ...) ## S3 method for class 'dtplyr_step' as.data.table(x, keep.rownames = FALSE, ...) ## S3 method for class 'dtplyr_step' as.data.frame(x, ...) ## S3 method for class 'dtplyr_step' as_tibble(x, ..., .name_repair = "check_unique")
x |
A lazy_dt |
... |
Arguments used by other methods. |
name |
Name of intermediate data.table. |
keep.rownames |
Ignored as dplyr never preserves rownames. |
.name_repair |
Treatment of problematic column names |
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) # Generate translation avg_mpg <- dt %>% filter(am == 1) %>% group_by(cyl) %>% summarise(mpg = mean(mpg)) # Show translation and temporarily compute result avg_mpg # compute and return tibble avg_mpg_tb <- as_tibble(avg_mpg) avg_mpg_tb # compute and return data.table avg_mpg_dt <- data.table::as.data.table(avg_mpg) avg_mpg_dt # modify translation to use intermediate assignment compute(avg_mpg)
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) # Generate translation avg_mpg <- dt %>% filter(am == 1) %>% group_by(cyl) %>% summarise(mpg = mean(mpg)) # Show translation and temporarily compute result avg_mpg # compute and return tibble avg_mpg_tb <- as_tibble(avg_mpg) avg_mpg_tb # compute and return data.table avg_mpg_dt <- data.table::as.data.table(avg_mpg) avg_mpg_dt # modify translation to use intermediate assignment compute(avg_mpg)
This is a method for the tidyr complete()
generic. This is a wrapper
around dtplyr
translations for expand()
, full_join()
, and replace_na()
that's useful for completing missing combinations of data.
## S3 method for class 'dtplyr_step' complete(data, ..., fill = list())
## S3 method for class 'dtplyr_step' complete(data, ..., fill = list())
data |
A |
... |
<
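When used with factors, expand() and complete() use the full set of levels, not just those that appear in the data. When used with continuous variables, you may need to fill in values
that do not appear in the data: to do so use expressions like year = 2010:2020 or year = full_seq(year, 1).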
|
fill |
A named list that for each variable supplies a single value to
use instead of |
library(tidyr) tbl <- tibble(x = 1:2, y = 1:2, z = 3:4) dt <- lazy_dt(tbl) dt %>% complete(x, y) dt %>% complete(x, y, fill = list(z = 10L))
library(tidyr) tbl <- tibble(x = 1:2, y = 1:2, z = 3:4) dt <- lazy_dt(tbl) dt %>% complete(x, y) dt %>% complete(x, y, fill = list(z = 10L))
This is a method for the dplyr count()
generic. It is translated using
.N
in the j
argument, and supplying groups to keyby
as appropriate.
## S3 method for class 'dtplyr_step' count(x, ..., wt = NULL, sort = FALSE, name = NULL)
## S3 method for class 'dtplyr_step' count(x, ..., wt = NULL, sort = FALSE, name = NULL)
x |
|
... |
< |
wt |
<
|
sort |
If |
name |
The name of the new column in the output. If omitted, it will default to |
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(dplyr::starwars) dt %>% count(species) dt %>% count(species, sort = TRUE) dt %>% count(species, wt = mass, sort = TRUE)
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(dplyr::starwars) dt %>% count(species) dt %>% count(species, sort = TRUE) dt %>% count(species, wt = mass, sort = TRUE)
This is a method for the dplyr distinct()
generic. It is translated to
data.table::unique.data.table()
.
## S3 method for class 'dtplyr_step' distinct(.data, ..., .keep_all = FALSE)
## S3 method for class 'dtplyr_step' distinct(.data, ..., .keep_all = FALSE)
.data |
|
... |
< |
.keep_all |
If |
library(dplyr, warn.conflicts = FALSE) df <- lazy_dt(data.frame( x = sample(10, 100, replace = TRUE), y = sample(10, 100, replace = TRUE) )) df %>% distinct(x) df %>% distinct(x, y) df %>% distinct(x, .keep_all = TRUE)
library(dplyr, warn.conflicts = FALSE) df <- lazy_dt(data.frame( x = sample(10, 100, replace = TRUE), y = sample(10, 100, replace = TRUE) )) df %>% distinct(x) df %>% distinct(x, y) df %>% distinct(x, .keep_all = TRUE)
This is a method for the tidyr drop_na() generic. It is translated to data.table::na.omit().
## S3 method for class 'dtplyr_step' drop_na(data, ...)
## S3 method for class 'dtplyr_step' drop_na(data, ...)
data |
A |
... |
< |
library(dplyr) library(tidyr) dt <- lazy_dt(tibble(x = c(1, 2, NA), y = c("a", NA, "b"))) dt %>% drop_na() dt %>% drop_na(x) vars <- "y" dt %>% drop_na(x, any_of(vars))
library(dplyr) library(tidyr) dt <- lazy_dt(tibble(x = c(1, 2, NA), y = c("a", NA, "b"))) dt %>% drop_na() dt %>% drop_na(x) vars <- "y" dt %>% drop_na(x, any_of(vars))
This is a method for the tidyr expand()
generic. It is translated to
data.table::CJ()
.
## S3 method for class 'dtplyr_step' expand(data, ..., .name_repair = "check_unique")
## S3 method for class 'dtplyr_step' expand(data, ..., .name_repair = "check_unique")
data |
A |
... |
Specification of columns to expand. Columns can be atomic vectors or lists.
Unlike the data.frame method, this method does not use the full set of levels, just those that appear in the data. When used with continuous variables, you may need to fill in values
that do not appear in the data: to do so use expressions like year = 2010:2020 or year = full_seq(year, 1).
|
.name_repair |
Treatment of problematic column names:
This argument is passed on as |
library(tidyr) fruits <- lazy_dt(tibble( type = c("apple", "orange", "apple", "orange", "orange", "orange"), year = c(2010, 2010, 2012, 2010, 2010, 2012), size = factor( c("XS", "S", "M", "S", "S", "M"), levels = c("XS", "S", "M", "L") ), weights = rnorm(6, as.numeric(size) + 2) )) # All possible combinations --------------------------------------- # Note that only present levels of the factor variable `size` are retained. fruits %>% expand(type) fruits %>% expand(type, size) # This is different from the data frame behaviour: fruits %>% dplyr::collect() %>% expand(type, size) # Other uses ------------------------------------------------------- fruits %>% expand(type, size, 2010:2012) # Use `anti_join()` to determine which observations are missing all <- fruits %>% expand(type, size, year) all all %>% dplyr::anti_join(fruits) # Use with `right_join()` to fill in missing rows fruits %>% dplyr::right_join(all)
library(tidyr) fruits <- lazy_dt(tibble( type = c("apple", "orange", "apple", "orange", "orange", "orange"), year = c(2010, 2010, 2012, 2010, 2010, 2012), size = factor( c("XS", "S", "M", "S", "S", "M"), levels = c("XS", "S", "M", "L") ), weights = rnorm(6, as.numeric(size) + 2) )) # All possible combinations --------------------------------------- # Note that only present levels of the factor variable `size` are retained. fruits %>% expand(type) fruits %>% expand(type, size) # This is different from the data frame behaviour: fruits %>% dplyr::collect() %>% expand(type, size) # Other uses ------------------------------------------------------- fruits %>% expand(type, size, 2010:2012) # Use `anti_join()` to determine which observations are missing all <- fruits %>% expand(type, size, year) all all %>% dplyr::anti_join(fruits) # Use with `right_join()` to fill in missing rows fruits %>% dplyr::right_join(all)
This is a method for the tidyr fill()
generic. It is translated to
data.table::nafill()
. Note that data.table::nafill()
currently only
works for integer and double columns.
## S3 method for class 'dtplyr_step' fill(data, ..., .direction = c("down", "up", "downup", "updown"))
## S3 method for class 'dtplyr_step' fill(data, ..., .direction = c("down", "up", "downup", "updown"))
data |
A data frame. |
... |
< |
.direction |
Direction in which to fill missing values. Currently either "down" (the default), "up", "downup" (i.e. first down and then up) or "updown" (first up and then down). |
library(tidyr) # Value (year) is recorded only when it changes sales <- lazy_dt(tibble::tribble( ~quarter, ~year, ~sales, "Q1", 2000, 66013, "Q2", NA, 69182, "Q3", NA, 53175, "Q4", NA, 21001, "Q1", 2001, 46036, "Q2", NA, 58842, "Q3", NA, 44568, "Q4", NA, 50197, "Q1", 2002, 39113, "Q2", NA, 41668, "Q3", NA, 30144, "Q4", NA, 52897, "Q1", 2004, 32129, "Q2", NA, 67686, "Q3", NA, 31768, "Q4", NA, 49094 )) # `fill()` defaults to replacing missing data from top to bottom sales %>% fill(year) # Value (n_squirrels) is missing above and below within a group squirrels <- lazy_dt(tibble::tribble( ~group, ~name, ~role, ~n_squirrels, 1, "Sam", "Observer", NA, 1, "Mara", "Scorekeeper", 8, 1, "Jesse", "Observer", NA, 1, "Tom", "Observer", NA, 2, "Mike", "Observer", NA, 2, "Rachael", "Observer", NA, 2, "Sydekea", "Scorekeeper", 14, 2, "Gabriela", "Observer", NA, 3, "Derrick", "Observer", NA, 3, "Kara", "Scorekeeper", 9, 3, "Emily", "Observer", NA, 3, "Danielle", "Observer", NA )) # The values are inconsistently missing by position within the group # Use .direction = "downup" to fill missing values in both directions squirrels %>% dplyr::group_by(group) %>% fill(n_squirrels, .direction = "downup") %>% dplyr::ungroup() # Using `.direction = "updown"` accomplishes the same goal in this example
library(tidyr) # Value (year) is recorded only when it changes sales <- lazy_dt(tibble::tribble( ~quarter, ~year, ~sales, "Q1", 2000, 66013, "Q2", NA, 69182, "Q3", NA, 53175, "Q4", NA, 21001, "Q1", 2001, 46036, "Q2", NA, 58842, "Q3", NA, 44568, "Q4", NA, 50197, "Q1", 2002, 39113, "Q2", NA, 41668, "Q3", NA, 30144, "Q4", NA, 52897, "Q1", 2004, 32129, "Q2", NA, 67686, "Q3", NA, 31768, "Q4", NA, 49094 )) # `fill()` defaults to replacing missing data from top to bottom sales %>% fill(year) # Value (n_squirrels) is missing above and below within a group squirrels <- lazy_dt(tibble::tribble( ~group, ~name, ~role, ~n_squirrels, 1, "Sam", "Observer", NA, 1, "Mara", "Scorekeeper", 8, 1, "Jesse", "Observer", NA, 1, "Tom", "Observer", NA, 2, "Mike", "Observer", NA, 2, "Rachael", "Observer", NA, 2, "Sydekea", "Scorekeeper", 14, 2, "Gabriela", "Observer", NA, 3, "Derrick", "Observer", NA, 3, "Kara", "Scorekeeper", 9, 3, "Emily", "Observer", NA, 3, "Danielle", "Observer", NA )) # The values are inconsistently missing by position within the group # Use .direction = "downup" to fill missing values in both directions squirrels %>% dplyr::group_by(group) %>% fill(n_squirrels, .direction = "downup") %>% dplyr::ungroup() # Using `.direction = "updown"` accomplishes the same goal in this example
This is a method for the dplyr filter() generic. It is translated to the i argument of [.data.table.
## S3 method for class 'dtplyr_step' filter(.data, ..., .by = NULL, .preserve = FALSE)
## S3 method for class 'dtplyr_step' filter(.data, ..., .by = NULL, .preserve = FALSE)
.data |
A |
... |
< |
.by |
< |
.preserve |
Ignored |
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) dt %>% filter(cyl == 4) dt %>% filter(vs, am) dt %>% group_by(cyl) %>% filter(mpg > mean(mpg))
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) dt %>% filter(cyl == 4) dt %>% filter(vs, am) dt %>% group_by(cyl) %>% filter(mpg > mean(mpg))
These are methods for dplyr's group_by()
and ungroup()
generics.
Grouping is translated to either the keyby or the by argument of [.data.table, depending on the value of the arrange argument.
## S3 method for class 'dtplyr_step' group_by(.data, ..., .add = FALSE, arrange = TRUE) ## S3 method for class 'dtplyr_step' ungroup(x, ...)
## S3 method for class 'dtplyr_step' group_by(.data, ..., .add = FALSE, arrange = TRUE) ## S3 method for class 'dtplyr_step' ungroup(x, ...)
.data |
|
... |
In |
.add , add
|
When This argument was previously called |
arrange |
If |
x |
A |
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) # group_by() is usually translated to `keyby` so that the groups # are ordered in the output dt %>% group_by(cyl) %>% summarise(mpg = mean(mpg)) # use `arrange = FALSE` to instead use `by` so the original order # of groups is preserved dt %>% group_by(cyl, arrange = FALSE) %>% summarise(mpg = mean(mpg))
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) # group_by() is usually translated to `keyby` so that the groups # are ordered in the output dt %>% group_by(cyl) %>% summarise(mpg = mean(mpg)) # use `arrange = FALSE` to instead use `by` so the original order # of groups is preserved dt %>% group_by(cyl, arrange = FALSE) %>% summarise(mpg = mean(mpg))
These are methods for the dplyr group_map()
and group_modify()
generics.
They are both translated to [.data.table
.
## S3 method for class 'dtplyr_step' group_modify(.data, .f, ..., keep = FALSE) ## S3 method for class 'dtplyr_step' group_map(.data, .f, ..., keep = FALSE)
## S3 method for class 'dtplyr_step' group_modify(.data, .f, ..., keep = FALSE) ## S3 method for class 'dtplyr_step' group_map(.data, .f, ..., keep = FALSE)
.data |
|
.f |
The name of a two argument function. The first argument is passed
|
... |
Additional arguments passed to |
keep |
Not supported for lazy_dt. |
group_map()
applies .f
to each group, returning a list.
group_modify()
replaces each group with the results of .f
, returning a
modified lazy_dt()
.
library(dplyr) dt <- lazy_dt(mtcars) dt %>% group_by(cyl) %>% group_modify(head, n = 2L) dt %>% group_by(cyl) %>% group_map(head, n = 2L)
library(dplyr) dt <- lazy_dt(mtcars) dt %>% group_by(cyl) %>% group_modify(head, n = 2L) dt %>% group_by(cyl) %>% group_map(head, n = 2L)
These are methods for the base generics head()
and tail()
. They
are not translated.
## S3 method for class 'dtplyr_step' head(x, n = 6L, ...) ## S3 method for class 'dtplyr_step' tail(x, n = 6L, ...)
## S3 method for class 'dtplyr_step' head(x, n = 6L, ...) ## S3 method for class 'dtplyr_step' tail(x, n = 6L, ...)
x |
|
n |
Number of rows to select. Can use a negative number to instead drop rows from the other end. |
... |
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(data.frame(x = 1:10)) # first three rows head(dt, 3) # last three rows tail(dt, 3) # drop first three rows tail(dt, -3)
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(data.frame(x = 1:10)) # first three rows head(dt, 3) # last three rows tail(dt, 3) # drop first three rows tail(dt, -3)
These are methods for the dplyr generics intersect()
, union()
,
union_all()
, and setdiff()
. They are translated to
data.table::fintersect()
, data.table::funion()
, and
data.table::fsetdiff()
.
## S3 method for class 'dtplyr_step' intersect(x, y, ...) ## S3 method for class 'dtplyr_step' union(x, y, ...) ## S3 method for class 'dtplyr_step' union_all(x, y, ...) ## S3 method for class 'dtplyr_step' setdiff(x, y, ...)
## S3 method for class 'dtplyr_step' intersect(x, y, ...) ## S3 method for class 'dtplyr_step' union(x, y, ...) ## S3 method for class 'dtplyr_step' union_all(x, y, ...) ## S3 method for class 'dtplyr_step' setdiff(x, y, ...)
x , y
|
A pair of |
... |
Ignored |
dt1 <- lazy_dt(data.frame(x = 1:4)) dt2 <- lazy_dt(data.frame(x = c(2, 4, 6))) intersect(dt1, dt2) union(dt1, dt2) setdiff(dt1, dt2)
dt1 <- lazy_dt(data.frame(x = 1:4)) dt2 <- lazy_dt(data.frame(x = c(2, 4, 6))) intersect(dt1, dt2) union(dt1, dt2) setdiff(dt1, dt2)
A lazy data.table captures the intent of dplyr verbs, only actually
performing computation when requested (with collect()
, pull()
,
as.data.frame()
, data.table::as.data.table()
, or tibble::as_tibble()
).
This allows dtplyr to convert dplyr verbs into as few data.table expressions
as possible, which leads to a high performance translation.
See vignette("translation")
for the details of the translation.
lazy_dt(x, name = NULL, immutable = TRUE, key_by = NULL)
lazy_dt(x, name = NULL, immutable = TRUE, key_by = NULL)
x |
A data table (or something that can be coerced to a data table). |
name |
Optionally, supply a name to be used in generated expressions. For expert use only. |
immutable |
If |
key_by |
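Set keys for data frame, using select() semantics (e.g. key_by = c(key1, key2)). This uses data.table::setkey() to sort the table and build an index. This will considerably improve performance for subsets, summaries, and joins that use the keys. See vignette("datatable-keys-fast-subset") for more details. |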
library(dplyr, warn.conflicts = FALSE) mtcars2 <- lazy_dt(mtcars) mtcars2 mtcars2 %>% select(mpg:cyl) mtcars2 %>% select(x = mpg, y = cyl) mtcars2 %>% filter(cyl == 4) %>% select(mpg) mtcars2 %>% select(mpg, cyl) %>% filter(cyl == 4) mtcars2 %>% mutate(cyl2 = cyl * 2, cyl4 = cyl2 * 2) mtcars2 %>% transmute(cyl2 = cyl * 2, vs2 = vs * 2) mtcars2 %>% filter(cyl == 8) %>% mutate(cyl2 = cyl * 2) # Learn more about translation in vignette("translation") by_cyl <- mtcars2 %>% group_by(cyl) by_cyl %>% summarise(mpg = mean(mpg)) by_cyl %>% mutate(mpg = mean(mpg)) by_cyl %>% filter(mpg < mean(mpg)) %>% summarise(hp = mean(hp))
library(dplyr, warn.conflicts = FALSE) mtcars2 <- lazy_dt(mtcars) mtcars2 mtcars2 %>% select(mpg:cyl) mtcars2 %>% select(x = mpg, y = cyl) mtcars2 %>% filter(cyl == 4) %>% select(mpg) mtcars2 %>% select(mpg, cyl) %>% filter(cyl == 4) mtcars2 %>% mutate(cyl2 = cyl * 2, cyl4 = cyl2 * 2) mtcars2 %>% transmute(cyl2 = cyl * 2, vs2 = vs * 2) mtcars2 %>% filter(cyl == 8) %>% mutate(cyl2 = cyl * 2) # Learn more about translation in vignette("translation") by_cyl <- mtcars2 %>% group_by(cyl) by_cyl %>% summarise(mpg = mean(mpg)) by_cyl %>% mutate(mpg = mean(mpg)) by_cyl %>% filter(mpg < mean(mpg)) %>% summarise(hp = mean(hp))
These are methods for the dplyr generics left_join()
, right_join()
,
inner_join()
, full_join()
, anti_join()
, and semi_join()
. Left, right,
inner, and anti join are translated to the [.data.table
equivalent,
full joins to data.table::merge.data.table()
.
Left, right, and full joins are in some cases followed by calls to
data.table::setcolorder()
and data.table::setnames()
to ensure that column
order and names match dplyr conventions.
Semi-joins don't have a direct data.table equivalent.
## S3 method for class 'dtplyr_step' left_join(x, y, ..., by = NULL, copy = FALSE, suffix = c(".x", ".y"))
## S3 method for class 'dtplyr_step' left_join(x, y, ..., by = NULL, copy = FALSE, suffix = c(".x", ".y"))
x , y
|
A pair of |
... |
Other parameters passed onto methods. |
by |
A join specification created with If To join on different variables between To join by multiple variables, use a
For simple equality joins, you can alternatively specify a character vector
of variable names to join by. For example, To perform a cross-join, generating all combinations of |
copy |
If |
suffix |
If there are non-joined duplicate variables in |
library(dplyr, warn.conflicts = FALSE) band_dt <- lazy_dt(dplyr::band_members) instrument_dt <- lazy_dt(dplyr::band_instruments) band_dt %>% left_join(instrument_dt) band_dt %>% right_join(instrument_dt) band_dt %>% inner_join(instrument_dt) band_dt %>% full_join(instrument_dt) band_dt %>% semi_join(instrument_dt) band_dt %>% anti_join(instrument_dt)
library(dplyr, warn.conflicts = FALSE) band_dt <- lazy_dt(dplyr::band_members) instrument_dt <- lazy_dt(dplyr::band_instruments) band_dt %>% left_join(instrument_dt) band_dt %>% right_join(instrument_dt) band_dt %>% inner_join(instrument_dt) band_dt %>% full_join(instrument_dt) band_dt %>% semi_join(instrument_dt) band_dt %>% anti_join(instrument_dt)
This is a method for the dplyr mutate()
generic. It is translated to
the j
argument of [.data.table
, using :=
to modify "in place". If
.before
or .after
is provided, the new columns are relocated with a call
to data.table::setcolorder()
.
## S3 method for class 'dtplyr_step' mutate( .data, ..., .by = NULL, .keep = c("all", "used", "unused", "none"), .before = NULL, .after = NULL )
## S3 method for class 'dtplyr_step' mutate( .data, ..., .by = NULL, .keep = c("all", "used", "unused", "none"), .before = NULL, .after = NULL )
.data |
A |
... |
< The value can be:
|
.by |
< |
.keep |
Control which columns from
Note: With dtplyr |
.before , .after
|
< |
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(data.frame(x = 1:5, y = 5:1)) dt %>% mutate(a = (x + y) / 2, b = sqrt(x^2 + y^2)) # It uses a more sophisticated translation when newly created variables # are used in the same expression dt %>% mutate(x1 = x + 1, x2 = x1 + 1)
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(data.frame(x = 1:5, y = 5:1)) dt %>% mutate(a = (x + y) / 2, b = sqrt(x^2 + y^2)) # It uses a more sophisticated translation when newly created variables # are used in the same expression dt %>% mutate(x1 = x + 1, x2 = x1 + 1)
This is a method for the tidyr nest()
generic. It is translated
using the non-nested variables in the by
argument and .SD
in the j
argument.
## S3 method for class 'dtplyr_step' nest(.data, ..., .names_sep = NULL, .key = deprecated())
## S3 method for class 'dtplyr_step' nest(.data, ..., .names_sep = NULL, .key = deprecated())
.data |
A data frame. |
... |
< |
.names_sep |
If |
.key |
Not supported. |
data |
A |
if (require("tidyr", quietly = TRUE)) { dt <- lazy_dt(tibble(x = c(1, 2, 1), y = c("a", "a", "b"))) dt %>% nest(data = y) dt %>% dplyr::group_by(x) %>% nest() }
if (require("tidyr", quietly = TRUE)) { dt <- lazy_dt(tibble(x = c(1, 2, 1), y = c("a", "a", "b"))) dt %>% nest(data = y) dt %>% dplyr::group_by(x) %>% nest() }
This is a method for the tidyr pivot_longer() generic. It is translated to data.table::melt().
## S3 method for class 'dtplyr_step' pivot_longer( data, cols, names_to = "name", names_prefix = NULL, names_sep = NULL, names_pattern = NULL, names_ptypes = NULL, names_transform = NULL, names_repair = "check_unique", values_to = "value", values_drop_na = FALSE, values_ptypes = NULL, values_transform = NULL, ... )
## S3 method for class 'dtplyr_step' pivot_longer( data, cols, names_to = "name", names_prefix = NULL, names_sep = NULL, names_pattern = NULL, names_ptypes = NULL, names_transform = NULL, names_repair = "check_unique", values_to = "value", values_drop_na = FALSE, values_ptypes = NULL, values_transform = NULL, ... )
data |
A |
cols |
< |
names_to |
A character vector specifying the new column or columns to
create from the information stored in the column names of
|
names_prefix |
A regular expression used to remove matching text from the start of each variable name. |
names_sep , names_pattern
|
If
If these arguments do not give you enough control, use
|
names_ptypes , names_transform , values_ptypes , values_transform
|
Not currently supported by dtplyr. |
names_repair |
What happens if the output has invalid column names?
The default, |
values_to |
A string specifying the name of the column to create
from the data stored in cell values. If |
values_drop_na |
If |
... |
Additional arguments passed on to methods. |
library(tidyr) # Simplest case where column names are character data relig_income_dt <- lazy_dt(relig_income) relig_income_dt %>% pivot_longer(!religion, names_to = "income", values_to = "count") # Slightly more complex case where columns have common prefix, # and missing missings are structural so should be dropped. billboard_dt <- lazy_dt(billboard) billboard %>% pivot_longer( cols = starts_with("wk"), names_to = "week", names_prefix = "wk", values_to = "rank", values_drop_na = TRUE ) # Multiple variables stored in column names lazy_dt(who) %>% pivot_longer( cols = new_sp_m014:newrel_f65, names_to = c("diagnosis", "gender", "age"), names_pattern = "new_?(.*)_(.)(.*)", values_to = "count" ) # Multiple observations per row anscombe_dt <- lazy_dt(anscombe) anscombe_dt %>% pivot_longer( everything(), names_to = c(".value", "set"), names_pattern = "(.)(.)" )
library(tidyr) # Simplest case where column names are character data relig_income_dt <- lazy_dt(relig_income) relig_income_dt %>% pivot_longer(!religion, names_to = "income", values_to = "count") # Slightly more complex case where columns have common prefix, # and missing missings are structural so should be dropped. billboard_dt <- lazy_dt(billboard) billboard %>% pivot_longer( cols = starts_with("wk"), names_to = "week", names_prefix = "wk", values_to = "rank", values_drop_na = TRUE ) # Multiple variables stored in column names lazy_dt(who) %>% pivot_longer( cols = new_sp_m014:newrel_f65, names_to = c("diagnosis", "gender", "age"), names_pattern = "new_?(.*)_(.)(.*)", values_to = "count" ) # Multiple observations per row anscombe_dt <- lazy_dt(anscombe) anscombe_dt %>% pivot_longer( everything(), names_to = c(".value", "set"), names_pattern = "(.)(.)" )
This is a method for the tidyr pivot_wider() generic. It is translated to data.table::dcast().
## S3 method for class 'dtplyr_step' pivot_wider( data, id_cols = NULL, names_from = name, names_prefix = "", names_sep = "_", names_glue = NULL, names_sort = FALSE, names_repair = "check_unique", values_from = value, values_fill = NULL, values_fn = NULL, ... )
## S3 method for class 'dtplyr_step' pivot_wider( data, id_cols = NULL, names_from = name, names_prefix = "", names_sep = "_", names_glue = NULL, names_sort = FALSE, names_repair = "check_unique", values_from = value, values_fill = NULL, values_fn = NULL, ... )
data |
A |
id_cols |
< Defaults to all columns in |
names_from , values_from
|
< If |
names_prefix |
String added to the start of every variable name. This is
particularly useful if |
names_sep |
If |
names_glue |
Instead of |
names_sort |
Should the column names be sorted? If |
names_repair |
What happens if the output has invalid column names?
The default, |
values_fill |
Optionally, a (scalar) value that specifies what each
This can be a named list if you want to apply different fill values to different value columns. |
values_fn |
A function, the default is |
... |
Additional arguments passed on to methods. |
library(tidyr) fish_encounters_dt <- lazy_dt(fish_encounters) fish_encounters_dt fish_encounters_dt %>% pivot_wider(names_from = station, values_from = seen) # Fill in missing values fish_encounters_dt %>% pivot_wider(names_from = station, values_from = seen, values_fill = 0) # Generate column names from multiple variables us_rent_income_dt <- lazy_dt(us_rent_income) us_rent_income_dt us_rent_income_dt %>% pivot_wider(names_from = variable, values_from = c(estimate, moe)) # When there are multiple `names_from` or `values_from`, you can use # use `names_sep` or `names_glue` to control the output variable names us_rent_income_dt %>% pivot_wider( names_from = variable, names_sep = ".", values_from = c(estimate, moe) ) # Can perform aggregation with values_fn warpbreaks_dt <- lazy_dt(as_tibble(warpbreaks[c("wool", "tension", "breaks")])) warpbreaks_dt warpbreaks_dt %>% pivot_wider( names_from = wool, values_from = breaks, values_fn = mean )
library(tidyr) fish_encounters_dt <- lazy_dt(fish_encounters) fish_encounters_dt fish_encounters_dt %>% pivot_wider(names_from = station, values_from = seen) # Fill in missing values fish_encounters_dt %>% pivot_wider(names_from = station, values_from = seen, values_fill = 0) # Generate column names from multiple variables us_rent_income_dt <- lazy_dt(us_rent_income) us_rent_income_dt us_rent_income_dt %>% pivot_wider(names_from = variable, values_from = c(estimate, moe)) # When there are multiple `names_from` or `values_from`, you can use # use `names_sep` or `names_glue` to control the output variable names us_rent_income_dt %>% pivot_wider( names_from = variable, names_sep = ".", values_from = c(estimate, moe) ) # Can perform aggregation with values_fn warpbreaks_dt <- lazy_dt(as_tibble(warpbreaks[c("wool", "tension", "breaks")])) warpbreaks_dt warpbreaks_dt %>% pivot_wider( names_from = wool, values_from = breaks, values_fn = mean )
This is a method for the dplyr reframe()
generic. It is translated to
the j
argument of [.data.table
.
## S3 method for class 'dtplyr_step' reframe(.data, ..., .by = NULL)
## S3 method for class 'dtplyr_step' reframe(.data, ..., .by = NULL)
.data |
A |
... |
Name-value pairs of functions. The name will be the name of the variable in the result. The value can be a vector of any length. Unnamed data frame values add multiple columns from a single expression. |
.by |
< |
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) dt %>% reframe(qs = quantile(disp, c(0.25, 0.75)), prob = c(0.25, 0.75), .by = cyl) dt %>% group_by(cyl) %>% reframe(qs = quantile(disp, c(0.25, 0.75)), prob = c(0.25, 0.75))
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) dt %>% reframe(qs = quantile(disp, c(0.25, 0.75)), prob = c(0.25, 0.75), .by = cyl) dt %>% group_by(cyl) %>% reframe(qs = quantile(disp, c(0.25, 0.75)), prob = c(0.25, 0.75))
This is a method for the dplyr relocate()
generic. It is translated to
the j
argument of [.data.table
.
## S3 method for class 'dtplyr_step' relocate(.data, ..., .before = NULL, .after = NULL)
## S3 method for class 'dtplyr_step' relocate(.data, ..., .before = NULL, .after = NULL)
.data |
A |
... |
< |
.before , .after
|
< |
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(data.frame(x = 1, y = 2, z = 3)) dt %>% relocate(z) dt %>% relocate(y, .before = x) dt %>% relocate(y, .after = y)
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(data.frame(x = 1, y = 2, z = 3)) dt %>% relocate(z) dt %>% relocate(y, .before = x) dt %>% relocate(y, .after = y)
These are methods for the dplyr generics rename()
and rename_with()
.
They are both translated to data.table::setnames()
.
## S3 method for class 'dtplyr_step' rename(.data, ...) ## S3 method for class 'dtplyr_step' rename_with(.data, .fn, .cols = everything(), ...)
## S3 method for class 'dtplyr_step' rename(.data, ...) ## S3 method for class 'dtplyr_step' rename_with(.data, .fn, .cols = everything(), ...)
.data |
|
... |
For For |
.fn |
A function used to transform the selected |
.cols |
< |
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(data.frame(x = 1, y = 2, z = 3)) dt %>% rename(new_x = x, new_y = y) dt %>% rename_with(toupper)
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(data.frame(x = 1, y = 2, z = 3)) dt %>% rename(new_x = x, new_y = y) dt %>% rename_with(toupper)
This is a method for the tidyr replace_na()
generic. It is translated to
data.table::fcoalesce()
.
Note that unlike tidyr::replace_na()
, data.table::fcoalesce()
cannot
replace NULL
values in lists.
## S3 method for class 'dtplyr_step' replace_na(data, replace = list())
## S3 method for class 'dtplyr_step' replace_na(data, replace = list())
data |
A |
replace |
If If |
library(tidyr) # Replace NAs in a data frame dt <- lazy_dt(tibble(x = c(1, 2, NA), y = c("a", NA, "b"))) dt %>% replace_na(list(x = 0, y = "unknown")) # Replace NAs using `dplyr::mutate()` dt %>% dplyr::mutate(x = replace_na(x, 0))
library(tidyr) # Replace NAs in a data frame dt <- lazy_dt(tibble(x = c(1, 2, NA), y = c("a", NA, "b"))) dt %>% replace_na(list(x = 0, y = "unknown")) # Replace NAs using `dplyr::mutate()` dt %>% dplyr::mutate(x = replace_na(x, 0))
This is a method for the dplyr select()
generic. It is translated to
the j
argument of [.data.table
.
## S3 method for class 'dtplyr_step' select(.data, ...)
## S3 method for class 'dtplyr_step' select(.data, ...)
.data |
A `lazy_dt()`. |
... |
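<tidy-select> One or more unquoted expressions separated by commas. Variable names can be used as if they were positions in the data frame, so expressions like `x:y` can be used to select a range of variables. |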
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(data.frame(x1 = 1, x2 = 2, y1 = 3, y2 = 4)) dt %>% select(starts_with("x")) dt %>% select(ends_with("2")) dt %>% select(z1 = x1, z2 = x2)
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(data.frame(x1 = 1, x2 = 2, y1 = 3, y2 = 4)) dt %>% select(starts_with("x")) dt %>% select(ends_with("2")) dt %>% select(z1 = x1, z2 = x2)
This is a method for the tidyr::separate()
generic. It is translated to
data.table::tstrsplit()
in the j
argument of [.data.table
.
## S3 method for class 'dtplyr_step' separate( data, col, into, sep = "[^[:alnum:]]+", remove = TRUE, convert = FALSE, ... )
## S3 method for class 'dtplyr_step' separate( data, col, into, sep = "[^[:alnum:]]+", remove = TRUE, convert = FALSE, ... )
data |
A `lazy_dt()`. |
col |
Column name or position. This argument is passed by expression and supports quasiquotation (you can unquote column names or column positions). |
into |
Names of new variables to create as character vector.
Use `NA` to omit the variable in the output. |
sep |
Separator between columns. The default value is a regular expression that matches any sequence of non-alphanumeric values. |
remove |
If TRUE, remove the input column from the output data frame. |
convert |
If TRUE, will run type.convert() with as.is = TRUE on new columns. This is useful if the component columns are integer, numeric or logical. NB: this will cause string "NA"s to be converted to NAs. |
... |
Arguments passed on to methods |
library(tidyr) # If you want to split by any non-alphanumeric value (the default): df <- lazy_dt(data.frame(x = c(NA, "x.y", "x.z", "y.z")), "DT") df %>% separate(x, c("A", "B")) # If you just want the second variable: df %>% separate(x, c(NA, "B")) # Use regular expressions to separate on multiple characters: df <- lazy_dt(data.frame(x = c(NA, "x?y", "x.z", "y:z")), "DT") df %>% separate(x, c("A","B"), sep = "([.?:])") # convert = TRUE detects column classes: df <- lazy_dt(data.frame(x = c("x:1", "x:2", "y:4", "z", NA)), "DT") df %>% separate(x, c("key","value"), ":") %>% str df %>% separate(x, c("key","value"), ":", convert = TRUE) %>% str
library(tidyr) # If you want to split by any non-alphanumeric value (the default): df <- lazy_dt(data.frame(x = c(NA, "x.y", "x.z", "y.z")), "DT") df %>% separate(x, c("A", "B")) # If you just want the second variable: df %>% separate(x, c(NA, "B")) # Use regular expressions to separate on multiple characters: df <- lazy_dt(data.frame(x = c(NA, "x?y", "x.z", "y:z")), "DT") df %>% separate(x, c("A","B"), sep = "([.?:])") # convert = TRUE detects column classes: df <- lazy_dt(data.frame(x = c("x:1", "x:2", "y:4", "z", NA)), "DT") df %>% separate(x, c("key","value"), ":") %>% str df %>% separate(x, c("key","value"), ":", convert = TRUE) %>% str
These are methods for the dplyr slice()
, slice_head()
, slice_tail()
,
slice_min()
, slice_max()
and slice_sample()
generics. They are
translated to the i
argument of [.data.table
.
Unlike dplyr, slice()
(and slice()
alone) returns the same number of
rows per group, regardless of whether or not the indices appear in each
group.
## S3 method for class 'dtplyr_step' slice(.data, ..., .by = NULL) ## S3 method for class 'dtplyr_step' slice_head(.data, ..., n, prop, by = NULL) ## S3 method for class 'dtplyr_step' slice_tail(.data, ..., n, prop, by = NULL) ## S3 method for class 'dtplyr_step' slice_min(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE) ## S3 method for class 'dtplyr_step' slice_max(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE)
## S3 method for class 'dtplyr_step' slice(.data, ..., .by = NULL) ## S3 method for class 'dtplyr_step' slice_head(.data, ..., n, prop, by = NULL) ## S3 method for class 'dtplyr_step' slice_tail(.data, ..., n, prop, by = NULL) ## S3 method for class 'dtplyr_step' slice_min(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE) ## S3 method for class 'dtplyr_step' slice_max(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE)
.data |
A `lazy_dt()`. |
... |
For `slice()`: <data-masking> Integer row values. Provide either positive values to keep, or negative values to drop. The values provided must be either all positive or all negative. Indices beyond the number of rows in the input are silently ignored. For `slice_*()`, these arguments are passed on to methods. |
.by , by
|
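<tidy-select> Optionally, a selection of columns to group by for just this operation, functioning as an alternative to `group_by()`. For details and examples, see `?dplyr_by`. |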
n , prop
|
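Provide either `n`, the number of rows, or `prop`, the proportion of rows to select. If neither are supplied, `n = 1` will be used. If `n` is greater than the number of rows in the group (or `prop > 1`), the result will be silently truncated to the group size. `prop` will be rounded towards zero to generate an integer number of rows. A negative value of `n` or `prop` will be subtracted from the group size. For example, `n = -2` with a group of 5 rows will select 5 - 2 = 3 rows; `prop = -0.25` with 8 rows will select 8 * (1 - 0.25) = 6 rows. |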
order_by |
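<data-masking> Variable or function of variables to order by. To order by multiple variables, wrap them in a data frame or tibble. |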
with_ties |
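Should ties be kept together? The default, `TRUE`, may return more rows than you request. Use `FALSE` to ignore ties, and return the first `n` rows. |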
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) dt %>% slice(1, 5, 10) dt %>% slice(-(1:4)) # First and last rows based on existing order dt %>% slice_head(n = 5) dt %>% slice_tail(n = 5) # Rows with minimum and maximum values of a variable dt %>% slice_min(mpg, n = 5) dt %>% slice_max(mpg, n = 5) # slice_min() and slice_max() may return more rows than requested # in the presence of ties. Use with_ties = FALSE to suppress dt %>% slice_min(cyl, n = 1) dt %>% slice_min(cyl, n = 1, with_ties = FALSE) # slice_sample() allows you to randomly select with or without replacement dt %>% slice_sample(n = 5) dt %>% slice_sample(n = 5, replace = TRUE) # you can optionally weight by a variable - this code weights by the # physical weight of the cars, so heavy cars are more likely to get # selected dt %>% slice_sample(weight_by = wt, n = 5)
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) dt %>% slice(1, 5, 10) dt %>% slice(-(1:4)) # First and last rows based on existing order dt %>% slice_head(n = 5) dt %>% slice_tail(n = 5) # Rows with minimum and maximum values of a variable dt %>% slice_min(mpg, n = 5) dt %>% slice_max(mpg, n = 5) # slice_min() and slice_max() may return more rows than requested # in the presence of ties. Use with_ties = FALSE to suppress dt %>% slice_min(cyl, n = 1) dt %>% slice_min(cyl, n = 1, with_ties = FALSE) # slice_sample() allows you to randomly select with or without replacement dt %>% slice_sample(n = 5) dt %>% slice_sample(n = 5, replace = TRUE) # you can optionally weight by a variable - this code weights by the # physical weight of the cars, so heavy cars are more likely to get # selected dt %>% slice_sample(weight_by = wt, n = 5)
This is a method for the dplyr summarise()
generic. It is translated to
the j
argument of [.data.table
.
## S3 method for class 'dtplyr_step' summarise(.data, ..., .by = NULL, .groups = NULL)
## S3 method for class 'dtplyr_step' summarise(.data, ..., .by = NULL, .groups = NULL)
.data |
A `lazy_dt()`. |
... |
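<data-masking> Name-value pairs of summary functions. The name will be the name of the variable in the result. The value can be:
- A vector of length 1, e.g. `min(x)`, `n()`, or `sum(is.na(y))`.
- A data frame, to add multiple columns from a single expression.
Returning values with size 0 or >1 was
deprecated as of 1.1.0. Please use `reframe()` for this instead. |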
.by |
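<tidy-select> Optionally, a selection of columns to group by for just this operation, functioning as an alternative to `group_by()`. For details and examples, see `?dplyr_by`. |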
.groups |
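Grouping structure of the result.
- "drop_last": dropping the last level of grouping. This was the only supported option before version 1.0.0.
- "drop": All levels of grouping are dropped.
- "keep": Same grouping structure as `.data`.
When `.groups` is not specified, it defaults to "drop_last".
In addition, a message informs you of that choice, unless the result is ungrouped,
the option "dplyr.summarise.inform" is set to `FALSE`, or when `summarise()` is called from a function in a package. |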
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) dt %>% group_by(cyl) %>% summarise(vs = mean(vs)) dt %>% group_by(cyl) %>% summarise(across(disp:wt, mean))
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(mtcars) dt %>% group_by(cyl) %>% summarise(vs = mean(vs)) dt %>% group_by(cyl) %>% summarise(across(disp:wt, mean))
This is a method for the dplyr transmute()
generic. It is translated to
the j
argument of [.data.table
.
## S3 method for class 'dtplyr_step' transmute(.data, ...)
## S3 method for class 'dtplyr_step' transmute(.data, ...)
.data |
A `lazy_dt()`. |
... |
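<data-masking> Name-value pairs. The name gives the name of the column in the output. The value can be:
- A vector of length 1, which will be recycled to the correct length.
- A vector the same length as the current group (or the whole data frame if ungrouped).
- `NULL`, to remove the column.
- A data frame or tibble, to create multiple columns in the output.
|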
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(dplyr::starwars) dt %>% transmute(name, sh = paste0(species, "/", homeworld))
library(dplyr, warn.conflicts = FALSE) dt <- lazy_dt(dplyr::starwars) dt %>% transmute(name, sh = paste0(species, "/", homeworld))
This is a method for the tidyr unite()
generic.
## S3 method for class 'dtplyr_step' unite(data, col, ..., sep = "_", remove = TRUE, na.rm = FALSE)
## S3 method for class 'dtplyr_step' unite(data, col, ..., sep = "_", remove = TRUE, na.rm = FALSE)
data |
A `lazy_dt()`. |
col |
The name of the new column, as a string or symbol. This argument is passed by expression and supports
quasiquotation (you can unquote strings
and symbols). The name is captured from the expression with
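`rlang::ensym()` (note that this kind of interface where symbols do not represent actual objects is now discouraged in the tidyverse; we support it here for backward compatibility). |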
... |
<tidy-select> Columns to unite. |
sep |
Separator to use between values. |
remove |
If `TRUE`, remove input columns from the output data frame. |
na.rm |
If `TRUE`, missing values will be removed prior to uniting each value. |
library(tidyr) df <- lazy_dt(expand_grid(x = c("a", NA), y = c("b", NA))) df df %>% unite("z", x:y, remove = FALSE) # Separate is almost the complement of unite df %>% unite("xy", x:y) %>% separate(xy, c("x", "y")) # (but note `x` and `y` now contain "NA" not NA)
library(tidyr) df <- lazy_dt(expand_grid(x = c("a", NA), y = c("b", NA))) df df %>% unite("z", x:y, remove = FALSE) # Separate is almost the complement of unite df %>% unite("xy", x:y) %>% separate(xy, c("x", "y")) # (but note `x` and `y` now contain "NA" not NA)