One of my favorite posts is a comparison between the data.table package and the tidyverse’s dplyr package. Here is the link to that post. I used it while building my competence with data.table. In this post I expand on that idea by comparing common operations in dplyr, data.table, pandas, and polars. Hopefully it will be as useful to others as the dplyr versus data.table comparison was for me. This is not an exhaustive comparison; it is meant as a starting point for anyone who wants to use Python more.
Packages & Data
Code
library (tidyverse)
library (data.table)
# Simulated tibble: standard-normal x, x2 with sd = 3, and a binary y
# drawn with success probability 0.6 (1000 rows each).
r_data <- tibble(
  x = rnorm(1000),
  x2 = rnorm(1000, sd = 3),
  y = rbinom(1000, size = 1, prob = 0.6)
)
print(r_data)
Code
# data.table counterpart with the same three columns (drawn separately,
# so the values differ from r_data).
r_table <- data.table(
  x = rnorm(1000),
  x2 = rnorm(1000, sd = 3),
  y = rbinom(1000, size = 1, prob = 0.6)
)
print(r_table)
Code
import pandas as pd
import numpy as np
# Same three columns as the R examples, generated with NumPy.
# `data` is kept as a plain dict of arrays so it can be reused later.
data = dict(
    x=np.random.normal(loc=0, scale=1, size=1000),
    x2=np.random.normal(loc=0, scale=3, size=1000),
    y=np.random.binomial(n=1, p=0.6, size=1000),
)
# Build the pandas DataFrame from the dict
py_data = pd.DataFrame(data)
print(py_data)
Code
import polars as pl
# Build the polars DataFrame from the same dict of NumPy arrays
pl_data = pl.DataFrame(data=data)
print(pl_data)
Basic Operations
Filtering (Integers)
Code
# Keep rows where x exceeds 1, then show the first rows
head(filter(r_data, x > 1))
Code
# Boolean-mask row selection: rows where x exceeds 1
py_data.loc[py_data["x"] > 1].head()
Code
# Rows where x exceeds 1, using the expression method form of `>`
pl_data.filter(pl.col("x").gt(1)).head()
Filtering (Categorical)
Code
# Keep rows where the binary indicator y equals 1
head(filter(r_data, y == 1))
Code
# Rows where the binary indicator y equals 1
py_data.loc[py_data["y"].eq(1)].head()
Code
# Rows where the binary indicator y equals 1
pl_data.filter(pl.col("y").eq(1)).head()
Filtering Multiple Columns
Code
# Keep rows where y is 1 AND x2 is positive.
# The condition is x2 > 0 so that this example matches the data.table,
# pandas, and polars versions of the same filter.
r_data |>
  filter(
    y == 1 &
      x2 > 0
  ) |>
  head()
Code
# Row filter in the `i` slot: y is 1 AND x2 is positive
r_table[y == 1 & x2 > 0] |>
  head()
Code
# Combine two boolean masks element-wise: y is 1 AND x2 is positive
py_data.loc[py_data["y"].eq(1) & py_data["x2"].gt(0)].head()
Code
# Each condition is passed as its own argument,
# so a comma is used here instead of &
pl_data.filter(
    pl.col("y") == 1,
    pl.col("x2") > 0,
).head()
Sorting Rows
Code
# Order rows by y (ascending) and show the first rows
head(arrange(r_data, y))
Code
# Order rows by y inside the `i` slot, then show the first rows
r_table[order(y)] |>
  head()
Code
# Order rows by y (ascending) and show the first rows
py_data.sort_values("y").head()
Code
# Order rows by y, referring to the column by name
pl_data.sort("y").head()
Selecting Specific Columns
Code
# Keep only the y column
head(select(r_data, y))
Code
# Select the single column y by label
py_data["y" ].head()
# Alternative: `items` expects a list-like of labels, so pass a list
# py_data.filter(items = ["y"]).head()
Code
# Keep only the y column, referring to it by name
pl_data.select("y").head()
Selecting Multiple Columns
Code
# Keep the x and x2 columns
head(select(r_data, x, x2))
Code
# Column selection in the `j` slot; .() is data.table's alias for list()
r_table[, .(x, x2)] |>
  head()
Code
# Pick x and x2 with label-based indexing
py_data.loc[:, ["x", "x2"]].head()
# or
py_data.filter(items=["x", "x2"]).head()
Code
# Keep the x and x2 columns, referring to them by name
pl_data.select("x", "x2").head()
Selecting Using Regex
Code
# Select columns whose names start with "x".
# The pattern is anchored with ^ so it matches the data.table example
# (grep("^x", ...)); an unanchored "x" would also match names that merely
# contain an x.
r_data |>
  select(
    matches("^x")
  ) |>
  head()
Code
# Positions of the columns whose names start with "x"
cols <- grep(pattern = "^x", x = names(r_table))
# The .. prefix tells data.table to look up `cols` outside the table
r_table[, ..cols] |>
  head()
Code
# Select columns whose names start with "x".
# The pattern is anchored with ^ to match the data.table example; an
# unanchored "x" would also keep any column whose name merely contains an x.
py_data.filter(regex="^x").head()
Code
import polars.selectors as cs
# Regex selector: columns whose names start with "x"
pl_data.select(cs.matches("^x")).head()
Summarize Data
Code
# Mean of x as a one-row tibble
summarize(r_data, avg = mean(x))

# Sum of x as a one-row tibble
summarize(r_data, total = sum(x))
Code
# Mean of x, computed in the `j` slot
r_table[, list(avg = mean(x))]
# Sum of x, computed in the `j` slot
r_table[, list(total = sum(x))]
Code
# Mean of x
py_data["x"].agg("mean")
# Sum of x
py_data["x"].agg("sum")
Code
# Mean of x as a one-row DataFrame
pl_data.select(pl.col("x").mean())
# Sum of x as a one-row DataFrame
pl_data.select(pl.col("x").sum())
Adding/Updating/Deleting Columns
Code
# Add x_mult (the product of x and x2) and store the result back in r_data
r_data <- mutate(r_data, x_mult = x * x2)
head(r_data)
Code
# Add x_mult (the product of x and x2) to r_table by reference.
# The operator must be written `:=` with no space; `: =` does not parse.
r_table[, x_mult := x * x2]
head(r_table[, "x_mult"])
Code
# Add x_mult (the product of x and x2) as a new column on py_data
py_data["x_mult"] = py_data["x"].mul(py_data["x2"])
py_data["x_mult"].head()
Code
# with_columns() returns a new DataFrame rather than modifying pl_data,
# so the result is assigned back to keep x_mult for later steps.
pl_data = pl_data.with_columns(
    (pl.col("x") * pl.col("x2")).alias("x_mult")
)
pl_data.head()
Counting
Code
# Frequency of each distinct value of y
py_data.loc[:, "y"].value_counts()
Code
# Pull y out as a Series directly, then count each distinct value
pl_data.get_column("y").value_counts()
Group By
Code
# Mean of x within each level of y
summarize(group_by(r_data, y), avg = mean(x))
Code
# Mean of x within each level of y, using the `by` argument
r_table[, list(avg = mean(x)), by = y]
Code
# Mean of x within each level of y
py_data.groupby("y")["x"].agg("mean")
Code
# Mean of x within each level of y
pl_data.group_by("y").agg(pl.mean("x"))
Chain Expressions
Code
# Chain with the native pipe: group means of x, then keep the y == 1 group
r_data |>
  group_by(y) |>
  summarise(avg = mean(x)) |>
  filter(y == 1)
Code
# Chain by stacking brackets: group means of x, then keep the y == 1 group
r_table[, .(avg = mean(x)), by = y][y == 1]
Code
# Group means of x, with y turned back into a regular column
py_group = py_data.groupby("y")["x"].mean().reset_index()
# Keep the y == 1 group by value rather than by row position, so the
# result does not depend on which groups exist or how they are ordered.
py_group[py_group["y"] == 1]
Code
# Group means of x, then keep the y == 1 group
pl_group = pl_data.group_by("y").agg(pl.mean("x"))
pl_group.filter(pl.col("y").eq(1))
Pivot Data
Code
# Reshape to long format: every column except y becomes name/value pairs
head(pivot_longer(r_data, cols = -y))
Code
# Reshape to long format, keeping y as the identifier column
melt(r_table, id.vars = "y") |>
  head()
Code
# Reshape to long format, keeping y as the identifier column
pd.melt(
    py_data,
    id_vars=["y"],
    value_vars=["x", "x2", "x_mult"],
).head()
Code
# Reshape to long format, keeping y as the identifier column
pl_data.unpivot(index=["y"]).head()