Data Manipulation in R & Python

Using dplyr, data.table, pandas, and polars

data manipulation
dplyr
pandas
numpy
python
R
data.table
polars
Published

November 15, 2024

An image of an incomplete puzzle.

Photo by Joseph Malone on Unsplash

One of my favorite posts is the comparison between data.table and the tidyverse’s dplyr packages. Here is the link to that post. I have used that when trying to build my competence in using data.table. Now I’m going to try and expand on that by creating this post that compares cases of using dplyr, data.table, pandas, and polars. Hopefully this can be as useful as the comparison between dplyr and data.table post was for me. This is not an extensive way of comparing them but just to get started for anyone that wants to use python more.

Packages & Data

Code
library(tidyverse)
library(data.table)

r_data <- tibble(
  x = rnorm(n = 1000),
  x2 = rnorm(n = 1000, sd = 3),
  y = rbinom(n = 1000, size = 1, prob = .6)
)

print(r_data)
Code
r_table <- data.table(
  x = rnorm(n = 1000),
  x2 = rnorm(n = 1000, sd = 3),
  y = rbinom(n = 1000, size = 1, prob = .6)
)

print(r_table)
Code
import pandas as pd
import numpy as np

data = {
    'x': np.random.normal(0, scale=1, size=1000),
    'x2': np.random.normal(0, scale=3, size=1000),
    'y': np.random.binomial(n=1, p=0.6, size=1000)
}

# Creating DataFrame
py_data = pd.DataFrame(data)

print(py_data)
Code
import polars as pl

pl_data = pl.DataFrame(data)

print(pl_data)

Basic Operations

Filtering (Integers)

Code
r_data |>
  filter(
    x > 1
  ) |>
  head()
Code
head(
  r_table[x > 1]
)
Code
py_data[py_data["x"] > 1].head()
Code
pl_data.filter(pl.col('x') > 1).head()

Filtering (Categorical)

Code
r_data |>
  filter(
    y == 1
  ) |>
  head()
Code
head(
  r_table[y == 1]
)
Code
py_data[py_data["y"] == 1].head()
Code
pl_data.filter(pl.col('y') == 1).head()

Filtering Multiple Columns

Code
r_data |>
  filter(
    y == 1 &
    x2 < 0
  ) |>
  head()
Code
head(
  r_table[
    y == 1 &
    x2 > 0
  ]
)
Code
py_data[
  (py_data["y"] == 1) & 
  (py_data["x2"] > 0)
    ].head()
Code
pl_data.filter(pl.col('y') == 1, pl.col('x2') > 0).head()
# uses a comma instead of using &

Sorting Rows

Code
r_data |> 
  arrange(y) |>
  head()
Code
head(
  r_table[order(y)]
)
Code
py_data.sort_values(by = "y").head()
Code
pl_data.sort(pl.col('y')).head()

Selecting Specific Columns

Code
r_data |>
  select(
    y
  ) |>
  head()
Code
head(
  r_table[,"y"]
)
Code
py_data["y"].head()

# py_data.filter(items = "y").head()
Code
pl_data.select(pl.col('y')).head()

Selecting Multiple Columns

Code
r_data |> 
  select(x, x2) |> 
  head()
Code
head(
  r_table[,list(x, x2)]
)
Code
py_data[["x", "x2"]].head()
# or
py_data.filter(items = ["x", "x2"]).head()
Code
pl_data.select(pl.col('x'), pl.col('x2')).head()

Selecting Using Regex

Code
r_data |>
  select(
    matches("x")
  ) |>
  head()
Code
cols <- grep("^x", names(r_table))

head(
  r_table[, ..cols]
)
Code
py_data.filter(regex = "x").head()
Code
import polars.selectors as cs

pl_data.select(cs.starts_with('x')).head()

Summarize Data

Code
r_data |>
  summarize(
    avg = mean(x)
  )

  r_data |>
  summarize(
    total = sum(x)
  )
Code
r_table[, .(avg = mean(x))]

r_table[, .(total = sum(x))]
Code
py_data["x"].mean()

py_data["x"].sum()
Code
pl_data.select(pl.mean('x'))

pl_data.select(pl.sum('x'))

Adding/Updating/Deleting Columns

Code
r_data <- r_data |>
  mutate(
    x_mult = x*x2
  )
head(r_data)
Code
r_table[, x_mult := x*x2]
head(r_table[, "x_mult"])
Code
py_data["x_mult"] = py_data["x"] * py_data["x2"]
py_data["x_mult"].head()
Code
pl_data.with_columns((pl.col('x') * pl.col('x2')).alias('x_mult'))

Counting

Code
r_data |> count(y)
Code
r_table[, .N, by = (y)]
Code
py_data["y"].value_counts()
Code
pl.Series(pl_data.select(pl.col('y'))).value_counts()

Group By

Code
r_data |>
  group_by(y) |>
  summarize(
    avg = mean(x)
  )
Code
r_table[, .(avg = mean(x)), by = "y"]
Code
py_data.groupby("y")["x"].mean()
Code
pl_data.group_by('y').agg(pl.col('x').mean())

Chain Expressions

Code
r_data |>
  group_by(y) |>
  summarize(
    avg = mean(x)
  ) |>
  filter(
    y == 1
  )
Code
r_table[, 
  by = y,
  .(avg = mean(x))
  ][
    y == 1
  ]
Code
py_group = py_data.groupby("y")["x"].mean().reset_index()

py_group.iloc[1:, ]
Code
pl_group = pl_data.group_by('y').agg(pl.col('x').mean())

pl_group.filter(pl.col('y') == 1)

Pivot Data

Code
r_data |>
  pivot_longer(
    -y
  ) |>
  head()
Code
head(melt(r_table, id.vars = "y"))
Code
py_data.melt(id_vars = ['y'], value_vars = ['x', 'x2', 'x_mult']).head()
Code
pl_data.unpivot(index = 'y').head()