An image of an incomplete puzzle. — Photo by Joseph Malone on Unsplash

One of my favorite posts is the comparison between data.table and the tidyverse’s dplyr packages. Here is the link to that post. I have used that when trying to build my competence in using data.table. Now I’m going to try and expand on that by creating this post that compares cases of using dplyr, data.table, pandas, and polars. Hopefully this can be as useful as the comparison between dplyr and data.table post was for me. This is not an extensive way of comparing them but just to get started for anyone that wants to use python more.

Packages & Data

Code

library(tidyverse)
library(data.table)

r_data <- tibble(
  x = rnorm(n = 1000),
  x2 = rnorm(n = 1000, sd = 3),
  y = rbinom(n = 1000, size = 1, prob = .6)
)

print(r_data)

Code

r_table <- data.table(
  x = rnorm(n = 1000),
  x2 = rnorm(n = 1000, sd = 3),
  y = rbinom(n = 1000, size = 1, prob = .6)
)

print(r_table)

Code

import pandas as pd
import numpy as np

data = {
    'x': np.random.normal(0, scale=1, size=1000),
    'x2': np.random.normal(0, scale=3, size=1000),
    'y': np.random.binomial(n=1, p=0.6, size=1000)
}

# Creating DataFrame
py_data = pd.DataFrame(data)

print(py_data)

Code

import polars as pl

pl_data = pl.DataFrame(data)

print(pl_data)

Basic Operations

Filtering (Integers)

Code

r_data |>
  filter(
    x > 1
  ) |>
  head()

Code

head(
  r_table[x > 1]
)

Code

py_data[py_data["x"] > 1].head()

Code

pl_data.filter(pl.col('x') > 1).head()

Filtering (Categorical)

Code

r_data |>
  filter(
    y == 1
  ) |>
  head()

Code

head(
  r_table[y == 1]
)

Code

py_data[py_data["y"] == 1].head()

Code

pl_data.filter(pl.col('y') == 1).head()

Filtering Multiple Columns

Code

r_data |>
  filter(
    y == 1 &
    x2 < 0
  ) |>
  head()

Code

head(
  r_table[
    y == 1 &
    x2 > 0
  ]
)

Code

py_data[
  (py_data["y"] == 1) & 
  (py_data["x2"] > 0)
    ].head()

Code

pl_data.filter(pl.col('y') == 1, pl.col('x2') > 0).head()
# uses a comma instead of using &

Sorting Rows

Code

r_data |> 
  arrange(y) |>
  head()

Code

head(
  r_table[order(y)]
)

Code

py_data.sort_values(by = "y").head()

Code

pl_data.sort(pl.col('y')).head()

Selecting Specific Columns

Code

r_data |>
  select(
    y
  ) |>
  head()

Code

head(
  r_table[,"y"]
)

Code

py_data["y"].head()

# py_data.filter(items = "y").head()

Code

pl_data.select(pl.col('y')).head()

Selecting Multiple Columns

Code

r_data |> 
  select(x, x2) |> 
  head()

Code

head(
  r_table[,list(x, x2)]
)

Code

py_data[["x", "x2"]].head()
# or
py_data.filter(items = ["x", "x2"]).head()

Code

pl_data.select(pl.col('x'), pl.col('x2')).head()

Selecting Using Regex

Code

r_data |>
  select(
    matches("x")
  ) |>
  head()

Code

cols <- grep("^x", names(r_table))

head(
  r_table[, ..cols]
)

Code

py_data.filter(regex = "x").head()

Code

import polars.selectors as cs

pl_data.select(cs.starts_with('x')).head()

Summarize Data

Code

r_data |>
  summarize(
    avg = mean(x)
  )

  r_data |>
  summarize(
    total = sum(x)
  )

Code

r_table[, .(avg = mean(x))]

r_table[, .(total = sum(x))]

Code

py_data["x"].mean()

py_data["x"].sum()

Code

pl_data.select(pl.mean('x'))

pl_data.select(pl.sum('x'))

Adding/Updating/Deleting Columns

Code

r_data <- r_data |>
  mutate(
    x_mult = x*x2
  )
head(r_data)

Code

r_table[, x_mult := x*x2]
head(r_table[, "x_mult"])

Code

py_data["x_mult"] = py_data["x"] * py_data["x2"]
py_data["x_mult"].head()

Code

pl_data.with_columns((pl.col('x') * pl.col('x2')).alias('x_mult'))

Counting

Code

r_data |> count(y)

Code

r_table[, .N, by = (y)]

Code

py_data["y"].value_counts()

Code

pl.Series(pl_data.select(pl.col('y'))).value_counts()

Group By

Code

r_data |>
  group_by(y) |>
  summarize(
    avg = mean(x)
  )

Code

r_table[, .(avg = mean(x)), by = "y"]

Code

py_data.groupby("y")["x"].mean()

Code

pl_data.group_by('y').agg(pl.col('x').mean())

Chain Expressions

Code

r_data |>
  group_by(y) |>
  summarize(
    avg = mean(x)
  ) |>
  filter(
    y == 1
  )

Code

r_table[, 
  by = y,
  .(avg = mean(x))
  ][
    y == 1
  ]

Code

py_group = py_data.groupby("y")["x"].mean().reset_index()

py_group.iloc[1:, ]

Code

pl_group = pl_data.group_by('y').agg(pl.col('x').mean())

pl_group.filter(pl.col('y') == 1)

Pivot Data

Code

r_data |>
  pivot_longer(
    -y
  ) |>
  head()

Code

head(melt(r_table, id.vars = "y"))

Code

py_data.melt(id_vars = ['y'], value_vars = ['x', 'x2', 'x_mult']).head()

Code

pl_data.unpivot(index = 'y').head()