One of my favorite posts is the comparison between data.table and the tidyverse’s dplyr packages. Here is the link to that post. I have used that when trying to build my competence in using data.table. Now I’m going to try and expand on that by creating this post that compares cases of using dplyr, data.table, pandas, and polars. Hopefully this can be as useful as the comparison between dplyr and data.table post was for me. This is not an extensive way of comparing them but just to get started for anyone that wants to use python more.
Packages & Data
Code
library (tidyverse)
library (data.table)
r_data <- tibble (
x = rnorm (n = 1000 ),
x2 = rnorm (n = 1000 , sd = 3 ),
y = rbinom (n = 1000 , size = 1 , prob = .6 )
)
print (r_data)
# A tibble: 1,000 × 3
x x2 y
<dbl> <dbl> <int>
1 0.678 -0.881 1
2 0.740 1.67 1
3 0.698 -1.54 1
4 -0.681 -1.69 0
5 2.12 0.254 0
6 -2.02 2.11 1
7 -0.777 -1.13 0
8 0.894 3.60 1
9 0.993 -0.821 0
10 -0.00111 -4.88 1
# ℹ 990 more rows
Code
r_table <- data.table (
x = rnorm (n = 1000 ),
x2 = rnorm (n = 1000 , sd = 3 ),
y = rbinom (n = 1000 , size = 1 , prob = .6 )
)
print (r_table)
x x2 y
<num> <num> <int>
1: 0.0478488 0.6959370 1
2: -0.1134272 1.5324302 1
3: -0.2430403 -1.3031583 1
4: 0.1240982 4.5723036 1
5: 0.3497546 -1.8382310 1
---
996: 0.3711398 0.9629948 1
997: -0.2156559 1.5034822 1
998: 3.2653217 -4.2318717 1
999: -1.3480967 -3.9285489 1
1000: -1.1387217 -3.5947204 1
Code
import pandas as pd
import numpy as np
data = {
'x' : np.random.normal(0 , scale= 1 , size= 1000 ),
'x2' : np.random.normal(0 , scale= 3 , size= 1000 ),
'y' : np.random.binomial(n= 1 , p= 0.6 , size= 1000 )
}
# Creating DataFrame
py_data = pd.DataFrame(data)
print (py_data)
x x2 y
0 0.135675 -0.972398 1
1 -0.913936 -1.200411 1
2 -1.301978 2.226628 1
3 -0.091329 4.175439 0
4 1.757646 1.702309 1
.. ... ... ..
995 -0.698766 -3.441828 0
996 -0.977856 -2.579932 1
997 0.927726 1.937473 1
998 -1.617165 2.029949 1
999 0.050764 1.700852 0
[1000 rows x 3 columns]
Code
import polars as pl
pl_data = pl.DataFrame(data)
print (pl_data)
shape: (1_000, 3)
┌───────────┬───────────┬─────┐
│ x ┆ x2 ┆ y │
│ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ i64 │
╞═══════════╪═══════════╪═════╡
│ 0.135675 ┆ -0.972398 ┆ 1 │
│ -0.913936 ┆ -1.200411 ┆ 1 │
│ -1.301978 ┆ 2.226628 ┆ 1 │
│ -0.091329 ┆ 4.175439 ┆ 0 │
│ 1.757646 ┆ 1.702309 ┆ 1 │
│ … ┆ … ┆ … │
│ -0.698766 ┆ -3.441828 ┆ 0 │
│ -0.977856 ┆ -2.579932 ┆ 1 │
│ 0.927726 ┆ 1.937473 ┆ 1 │
│ -1.617165 ┆ 2.029949 ┆ 1 │
│ 0.050764 ┆ 1.700852 ┆ 0 │
└───────────┴───────────┴─────┘
Basic Operations
Filtering (Integers)
Code
r_data |>
filter (
x > 1
) |>
head ()
# A tibble: 6 × 3
x x2 y
<dbl> <dbl> <int>
1 2.12 0.254 0
2 1.01 -2.21 0
3 1.28 -2.35 1
4 1.06 -7.83 1
5 1.56 -2.19 1
6 1.60 -1.68 1
Code
x x2 y
<num> <num> <int>
1: 1.284609 -1.08079020 1
2: 1.591378 0.02632543 1
3: 1.674949 -6.36352846 0
4: 1.635163 -1.44105381 0
5: 1.018293 -0.28489632 0
6: 1.020523 -2.94176344 1
Code
py_data[py_data["x" ] > 1 ].head()
x x2 y
4 1.757646 1.702309 1
22 2.337422 -0.872268 0
25 1.379007 -0.217651 1
34 1.775157 -3.427305 1
41 1.169759 1.420520 0
Code
pl_data.filter (pl.col('x' ) > 1 ).head()
shape: (5, 3)
f64
f64
i64
1.757646
1.702309
1
2.337422
-0.872268
0
1.379007
-0.217651
1
1.775157
-3.427305
1
1.169759
1.42052
0
Filtering (Categorical)
Code
r_data |>
filter (
y == 1
) |>
head ()
# A tibble: 6 × 3
x x2 y
<dbl> <dbl> <int>
1 0.678 -0.881 1
2 0.740 1.67 1
3 0.698 -1.54 1
4 -2.02 2.11 1
5 0.894 3.60 1
6 -0.00111 -4.88 1
Code
x x2 y
<num> <num> <int>
1: 0.0478488 0.695937 1
2: -0.1134272 1.532430 1
3: -0.2430403 -1.303158 1
4: 0.1240982 4.572304 1
5: 0.3497546 -1.838231 1
6: 1.2846087 -1.080790 1
Code
py_data[py_data["y" ] == 1 ].head()
x x2 y
0 0.135675 -0.972398 1
1 -0.913936 -1.200411 1
2 -1.301978 2.226628 1
4 1.757646 1.702309 1
6 -0.789002 2.008444 1
Code
pl_data.filter (pl.col('y' ) == 1 ).head()
shape: (5, 3)
f64
f64
i64
0.135675
-0.972398
1
-0.913936
-1.200411
1
-1.301978
2.226628
1
1.757646
1.702309
1
-0.789002
2.008444
1
Filtering Multiple Columns
Code
r_data |>
filter (
y == 1 &
x2 < 0
) |>
head ()
# A tibble: 6 × 3
x x2 y
<dbl> <dbl> <int>
1 0.678 -0.881 1
2 0.698 -1.54 1
3 -0.00111 -4.88 1
4 -1.65 -3.97 1
5 0.678 -0.0252 1
6 0.981 -0.0460 1
Code
head (
r_table[
y == 1 &
x2 > 0
]
)
x x2 y
<num> <num> <int>
1: 0.0478488 0.69593699 1
2: -0.1134272 1.53243024 1
3: 0.1240982 4.57230356 1
4: -0.3913492 3.27903295 1
5: 1.5913775 0.02632543 1
6: -1.1253344 0.47936905 1
Code
py_data[
(py_data["y" ] == 1 ) &
(py_data["x2" ] > 0 )
].head()
x x2 y
2 -1.301978 2.226628 1
4 1.757646 1.702309 1
6 -0.789002 2.008444 1
12 -1.044933 1.005732 1
13 -0.519737 2.415808 1
Code
pl_data.filter (pl.col('y' ) == 1 , pl.col('x2' ) > 0 ).head()
shape: (5, 3)
f64
f64
i64
-1.301978
2.226628
1
1.757646
1.702309
1
-0.789002
2.008444
1
-1.044933
1.005732
1
-0.519737
2.415808
1
Code
# uses a comma instead of using &
Sorting Rows
Code
r_data |>
arrange (y) |>
head ()
# A tibble: 6 × 3
x x2 y
<dbl> <dbl> <int>
1 -0.681 -1.69 0
2 2.12 0.254 0
3 -0.777 -1.13 0
4 0.993 -0.821 0
5 -1.57 -1.84 0
6 0.508 -1.16 0
Code
head (
r_table[order (y)]
)
x x2 y
<num> <num> <int>
1: 0.1528183 -0.08437265 0
2: -0.1938756 1.78788362 0
3: -2.4107423 6.63182147 0
4: -1.3682575 0.05525738 0
5: -2.1888937 -1.98375940 0
6: -0.5623052 0.05425300 0
Code
py_data.sort_values(by = "y" ).head()
x x2 y
980 -1.960833 -2.790498 0
11 -1.350877 1.116719 0
10 -0.574171 2.722374 0
5 -0.494218 3.842638 0
3 -0.091329 4.175439 0
Code
pl_data.sort(pl.col('y' )).head()
shape: (5, 3)
f64
f64
i64
-0.091329
4.175439
0
-0.494218
3.842638
0
-0.574171
2.722374
0
-1.350877
1.116719
0
0.322848
0.835612
0
Selecting Specific Columns
Code
r_data |>
select (
y
) |>
head ()
# A tibble: 6 × 1
y
<int>
1 1
2 1
3 1
4 0
5 0
6 1
Code
y
<int>
1: 1
2: 1
3: 1
4: 1
5: 1
6: 0
Code
0 1
1 1
2 1
3 0
4 1
Name: y, dtype: int64
Code
# py_data.filter(items = "y").head()
Code
pl_data.select(pl.col('y' )).head()
Selecting Multiple Columns
Code
r_data |>
select (x, x2) |>
head ()
# A tibble: 6 × 2
x x2
<dbl> <dbl>
1 0.678 -0.881
2 0.740 1.67
3 0.698 -1.54
4 -0.681 -1.69
5 2.12 0.254
6 -2.02 2.11
Code
head (
r_table[,list (x, x2)]
)
x x2
<num> <num>
1: 0.0478488 0.69593699
2: -0.1134272 1.53243024
3: -0.2430403 -1.30315830
4: 0.1240982 4.57230356
5: 0.3497546 -1.83823096
6: 0.1528183 -0.08437265
Code
py_data[["x" , "x2" ]].head()
x x2
0 0.135675 -0.972398
1 -0.913936 -1.200411
2 -1.301978 2.226628
3 -0.091329 4.175439
4 1.757646 1.702309
Code
# or
py_data.filter (items = ["x" , "x2" ]).head()
x x2
0 0.135675 -0.972398
1 -0.913936 -1.200411
2 -1.301978 2.226628
3 -0.091329 4.175439
4 1.757646 1.702309
Code
pl_data.select(pl.col('x' ), pl.col('x2' )).head()
shape: (5, 2)
f64
f64
0.135675
-0.972398
-0.913936
-1.200411
-1.301978
2.226628
-0.091329
4.175439
1.757646
1.702309
Selecting Using Regex
Code
r_data |>
select (
matches ("x" )
) |>
head ()
# A tibble: 6 × 2
x x2
<dbl> <dbl>
1 0.678 -0.881
2 0.740 1.67
3 0.698 -1.54
4 -0.681 -1.69
5 2.12 0.254
6 -2.02 2.11
Code
cols <- grep ("^x" , names (r_table))
head (
r_table[, ..cols]
)
x x2
<num> <num>
1: 0.0478488 0.69593699
2: -0.1134272 1.53243024
3: -0.2430403 -1.30315830
4: 0.1240982 4.57230356
5: 0.3497546 -1.83823096
6: 0.1528183 -0.08437265
Code
py_data.filter (regex = "x" ).head()
x x2
0 0.135675 -0.972398
1 -0.913936 -1.200411
2 -1.301978 2.226628
3 -0.091329 4.175439
4 1.757646 1.702309
Code
import polars.selectors as cs
pl_data.select(cs.starts_with('x' )).head()
shape: (5, 2)
f64
f64
0.135675
-0.972398
-0.913936
-1.200411
-1.301978
2.226628
-0.091329
4.175439
1.757646
1.702309
Summarize Data
Code
r_data |>
summarize (
avg = mean (x)
)
# A tibble: 1 × 1
avg
<dbl>
1 0.0406
Code
r_data |>
summarize (
total = sum (x)
)
# A tibble: 1 × 1
total
<dbl>
1 40.6
Code
r_table[, .(avg = mean (x))]
avg
<num>
1: -0.004438116
Code
r_table[, .(total = sum (x))]
Code
np.float64(-0.011356872534100343)
Code
np.float64(-11.356872534100342)
Code
pl_data.select(pl.mean('x' ))
Code
pl_data.select(pl.sum ('x' ))
Adding/Updating/Deleting Columns
Code
r_data <- r_data |>
mutate (
x_mult = x* x2
)
head (r_data)
# A tibble: 6 × 4
x x2 y x_mult
<dbl> <dbl> <int> <dbl>
1 0.678 -0.881 1 -0.597
2 0.740 1.67 1 1.23
3 0.698 -1.54 1 -1.07
4 -0.681 -1.69 0 1.15
5 2.12 0.254 0 0.537
6 -2.02 2.11 1 -4.27
Code
r_table[, x_mult : = x* x2]
head (r_table[, "x_mult" ])
x_mult
<num>
1: 0.03329975
2: -0.17381921
3: 0.31671995
4: 0.56741486
5: -0.64292978
6: -0.01289369
Code
py_data["x_mult" ] = py_data["x" ] * py_data["x2" ]
py_data["x_mult" ].head()
0 -0.131930
1 1.097099
2 -2.899022
3 -0.381339
4 2.992057
Name: x_mult, dtype: float64
Code
pl_data.with_columns((pl.col('x' ) * pl.col('x2' )).alias('x_mult' ))
shape: (1_000, 4)
f64
f64
i64
f64
0.135675
-0.972398
1
-0.13193
-0.913936
-1.200411
1
1.097099
-1.301978
2.226628
1
-2.899022
-0.091329
4.175439
0
-0.381339
1.757646
1.702309
1
2.992057
…
…
…
…
-0.698766
-3.441828
0
2.405032
-0.977856
-2.579932
1
2.522803
0.927726
1.937473
1
1.797444
-1.617165
2.029949
1
-3.282763
0.050764
1.700852
0
0.086342
Counting
Code
# A tibble: 2 × 2
y n
<int> <int>
1 0 387
2 1 613
Code
y N
<int> <int>
1: 1 600
2: 0 400
Code
py_data["y" ].value_counts()
y
1 609
0 391
Name: count, dtype: int64
Code
pl.Series(pl_data.select(pl.col('y' ))).value_counts()
Group By
Code
r_data |>
group_by (y) |>
summarize (
avg = mean (x)
)
# A tibble: 2 × 2
y avg
<int> <dbl>
1 0 0.0621
2 1 0.0270
Code
r_table[, .(avg = mean (x)), by = "y" ]
y avg
<int> <num>
1: 1 -0.009699608
2: 0 0.003454121
Code
py_data.groupby("y" )["x" ].mean()
y
0 -0.069581
1 0.026025
Name: x, dtype: float64
Code
pl_data.group_by('y' ).agg(pl.col('x' ).mean())
shape: (2, 2)
i64
f64
1
0.026025
0
-0.069581
Chain Expressions
Code
r_data |>
group_by (y) |>
summarize (
avg = mean (x)
) |>
filter (
y == 1
)
# A tibble: 1 × 2
y avg
<int> <dbl>
1 1 0.0270
Code
r_table[,
by = y,
.(avg = mean (x))
][
y == 1
]
y avg
<int> <num>
1: 1 -0.009699608
Code
py_group = py_data.groupby("y" )["x" ].mean().reset_index()
py_group.iloc[1 :, ]
Code
pl_group = pl_data.group_by('y' ).agg(pl.col('x' ).mean())
pl_group.filter (pl.col('y' ) == 1 )
Pivot Data
Code
r_data |>
pivot_longer (
- y
) |>
head ()
# A tibble: 6 × 3
y name value
<int> <chr> <dbl>
1 1 x 0.678
2 1 x2 -0.881
3 1 x_mult -0.597
4 1 x 0.740
5 1 x2 1.67
6 1 x_mult 1.23
Code
head (melt (r_table, id.vars = "y" ))
y variable value
<int> <fctr> <num>
1: 1 x 0.0478488
2: 1 x -0.1134272
3: 1 x -0.2430403
4: 1 x 0.1240982
5: 1 x 0.3497546
6: 0 x 0.1528183
Code
py_data.melt(id_vars = ['y' ], value_vars = ['x' , 'x2' , 'x_mult' ]).head()
y variable value
0 1 x 0.135675
1 1 x -0.913936
2 1 x -1.301978
3 0 x -0.091329
4 1 x 1.757646
Code
pl_data.unpivot(index = 'y' ).head()
shape: (5, 3)
i64
str
f64
1
"x"
0.135675
1
"x"
-0.913936
1
"x"
-1.301978
0
"x"
-0.091329
1
"x"
1.757646