Introduction
Today we’ll explore structs in Polars, which allow you to store multiple fields as a single column. Think of structs as nested dictionaries or JSON objects within your DataFrame. They’re perfect for representing complex, hierarchical data while keeping your DataFrame organized.
import polars as pl
# Build a DataFrame whose "address" column is a struct: one dict per row,
# with every dict sharing the same keys (street, city, zip).
addresses = [
    {"street": "123 Main St", "city": "NYC", "zip": "10001"},
    {"street": "456 Oak Ave", "city": "LA", "zip": "90001"},
    {"street": "789 Pine Rd", "city": "Chicago", "zip": "60601"},
]
df = pl.DataFrame(
    {
        "user_id": [1, 2, 3],
        "name": ["Alice", "Bob", "Charlie"],
        "address": addresses,
    }
)
print(df)
shape: (3, 3)
┌─────────┬─────────┬─────────────────────────────┐
│ user_id ┆ name ┆ address │
│ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ struct[3] │
╞═════════╪═════════╪═════════════════════════════╡
│ 1 ┆ Alice ┆ {123 Main St,NYC,10001} │
│ 2 ┆ Bob ┆ {456 Oak Ave,LA,90001} │
│ 3 ┆ Charlie ┆ {789 Pine Rd,Chicago,60601} │
└─────────┴─────────┴─────────────────────────────┘
Creating Structs
From Python Dictionaries
The simplest way to create structs is from Python dictionaries:
df = pl.DataFrame({
"id": [1, 2],
"person": [
{"name": "Alice", "age": 30},
{"name": "Bob", "age": 25}
]
})
Using pl.struct()
You can create struct columns from existing columns:
df = pl.DataFrame(
    {
        "first_name": ["Alice", "Bob"],
        "last_name": ["Smith", "Jones"],
        "age": [30, 25],
    }
)

# Pack the three existing columns into a single struct column named "person"
person = pl.struct("first_name", "last_name", "age").alias("person")
df_with_struct = df.select(person)
print(df_with_struct)
shape: (2, 1)
┌─────────────────────────┐
│ person │
│ --- │
│ struct[3] │
╞═════════════════════════╡
│ {Alice,Smith,30} │
│ {Bob,Jones,25} │
└─────────────────────────┘
You can also create named structs with different field names:
df_renamed = df.select([
pl.struct(
first=pl.col("first_name"),
last=pl.col("last_name"),
years=pl.col("age")
).alias("person")
])
Accessing Struct Fields
Using .field()
Access individual fields from a struct column:
addresses = [
    {"street": "123 Main St", "city": "NYC"},
    {"street": "456 Oak Ave", "city": "LA"},
    {"street": "789 Pine Rd", "city": "Chicago"},
]
df = pl.DataFrame({"id": [1, 2, 3], "address": addresses})

# Pull individual struct fields out into their own top-level columns;
# the original "address" struct column is kept alongside them.
address = pl.col("address")
df_with_fields = df.with_columns(
    city=address.struct.field("city"),
    street=address.struct.field("street"),
)
print(df_with_fields)
shape: (3, 4)
┌─────┬──────────────────────────┬─────────┬──────────────┐
│ id ┆ address ┆ city ┆ street │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ struct[2] ┆ str ┆ str │
╞═════╪══════════════════════════╪═════════╪══════════════╡
│ 1 ┆ {123 Main St,NYC} ┆ NYC ┆ 123 Main St │
│ 2 ┆ {456 Oak Ave,LA} ┆ LA ┆ 456 Oak Ave │
│ 3 ┆ {789 Pine Rd,Chicago} ┆ Chicago ┆ 789 Pine Rd │
└─────┴──────────────────────────┴─────────┴──────────────┘
Using DataFrame.unnest()
Call unnest() on the DataFrame to replace a struct column with one separate column per field:
# Expand every field of the "address" struct into its own column; the struct
# column itself no longer appears in the result (see the output below).
df_unnested = df.unnest("address")
print(df_unnested)
shape: (3, 3)
┌─────┬──────────────┬─────────┐
│ id ┆ street ┆ city │
│ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str │
╞═════╪══════════════╪═════════╡
│ 1 ┆ 123 Main St ┆ NYC │
│ 2 ┆ 456 Oak Ave ┆ LA │
│ 3 ┆ 789 Pine Rd ┆ Chicago │
└─────┴──────────────┴─────────┘
Renaming Struct Fields
You can rename fields within a struct:
people = [
    {"first": "Alice", "last": "Smith"},
    {"first": "Bob", "last": "Jones"},
]
df = pl.DataFrame({"person": people})

# Rename struct fields: the new names are given as a list, one per field
renamed_person = pl.col("person").struct.rename_fields(["first_name", "last_name"])
df_renamed = df.with_columns(renamed_person)
print(df_renamed)
Working with Nested Structs
Structs can contain other structs, creating deeply nested data:
profiles = [
    {
        "name": {"first": "Alice", "last": "Smith"},
        "contact": {"email": "alice@example.com", "phone": "555-0001"},
    },
    {
        "name": {"first": "Bob", "last": "Jones"},
        "contact": {"email": "bob@example.com", "phone": "555-0002"},
    },
]
df = pl.DataFrame({"user_id": [1, 2], "profile": profiles})

# Access nested fields: step into the outer struct first, then the inner one
profile = pl.col("profile")
df_with_nested = df.with_columns(
    first_name=profile.struct.field("name").struct.field("first"),
    email=profile.struct.field("contact").struct.field("email"),
)
print(df_with_nested)
JSON to Struct
Polars can parse JSON strings into structs:
df = pl.DataFrame(
    {
        "id": [1, 2],
        "json_data": [
            '{"name": "Alice", "age": 30, "city": "NYC"}',
            '{"name": "Bob", "age": 25, "city": "LA"}',
        ],
    }
)

# Struct dtype that each JSON string is decoded into
parsed_dtype = pl.Struct({"name": pl.Utf8, "age": pl.Int64, "city": pl.Utf8})

# Parse JSON strings to structs
df_parsed = df.with_columns(
    parsed=pl.col("json_data").str.json_decode(parsed_dtype)
)
print(df_parsed)

# Extract specific fields from parsed JSON
parsed = pl.col("parsed")
df_extracted = df_parsed.with_columns(
    name=parsed.struct.field("name"),
    age=parsed.struct.field("age"),
)
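print(df_extracted)
Using .struct.with_fields()
Use .struct.with_fields() to update existing fields or add new ones without unpacking the struct; inside it, pl.field() refers to a field of the struct being modified: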
import polars as pl
df = pl.DataFrame({
"user_data": [
{"name": "alice", "score": 85, "active": True},
{"name": "bob", "score": 42, "active": False}
]
})
# Update score and add a new category field inside the struct
df_updated = df.with_columns(
pl.col("user_data").struct.with_fields(
# 1. Update an existing field
pl.field("score") + 5,
# 2. Add a brand new field inside the struct
is_passing = pl.field("score") >= 50,
# 3. Use logic from one field to update another
active = pl.when(pl.field("score") > 80).then(True).otherwise(pl.field("active"))
)
)
Combining Structs and Lists
You can have lists of structs or structs containing lists:
# List of structs: each user row holds a list of order structs
orders_per_user = [
    [
        {"order_id": 101, "amount": 50.0},
        {"order_id": 102, "amount": 75.0},
    ],
    [
        {"order_id": 201, "amount": 100.0},
    ],
]
df_list_of_structs = pl.DataFrame(
    {"user_id": [1, 2], "orders": orders_per_user}
)

# Explode the list (one row per order), then access the struct fields
order = pl.col("orders")
one_row_per_order = df_list_of_structs.explode("orders")
df_exploded = one_row_per_order.with_columns(
    order_id=order.struct.field("order_id"),
    amount=order.struct.field("amount"),
)
print(df_exploded)
Practice Exercise
Now it’s time to practice! Try solving this exercise using struct operations:
Scenario: You’re analyzing customer data from an e-commerce API that returns nested JSON data:
import polars as pl
customers = pl.DataFrame({
"customer_id": [1, 2, 3],
"profile": [
{
"name": {"first": "Alice", "last": "Johnson"},
"age": 28,
"address": {"city": "NYC", "state": "NY", "zip": "10001"}
},
{
"name": {"first": "Bob", "last": "Smith"},
"age": 35,
"address": {"city": "Los Angeles", "state": "CA", "zip": "90001"}
},
{
"name": {"first": "Charlie", "last": "Brown"},
"age": 42,
"address": {"city": "Chicago", "state": "IL", "zip": "60601"}
}
],
"orders": [
[{"order_id": 101, "total": 150.50}, {"order_id": 102, "total": 200.00}],
[{"order_id": 201, "total": 89.99}],
[{"order_id": 301, "total": 450.00}, {"order_id": 302, "total": 125.75}, {"order_id": 303, "total": 75.00}]
]
})
Tasks:
- Extract the first name and city from each customer into separate columns
- Create a new column full_name by combining first and last names
- Unnest the address information into separate columns (city, state, zip)
- Filter customers who are older than 30 years
- Explode the orders list and calculate the total amount spent per customer
Bonus Challenge: Create a new struct column called customer_summary that contains:
- full_name (first + last)
- location (city, state)
- order_count (number of orders)
- total_spent (sum of all order totals)
Click to see solutions
# Reusable expressions for the two nested structs inside "profile"
name_expr = pl.col("profile").struct.field("name")
address_expr = pl.col("profile").struct.field("address")

# Task 1: Extract first name and city
task1 = customers.with_columns(
    first_name=name_expr.struct.field("first"),
    city=address_expr.struct.field("city"),
)
print(task1)

# Task 2: Create full_name column
full_name_expr = name_expr.struct.field("first") + " " + name_expr.struct.field("last")
task2 = customers.with_columns(full_name_expr.alias("full_name"))
print(task2)

# Task 3: Unnest address information
# Lift the nested "address" struct to a top-level column first, then unnest it.
with_address = customers.with_columns(address=address_expr)
task3 = with_address.unnest("address")
print(task3)

# Task 4: Filter customers older than 30
is_over_30 = pl.col("profile").struct.field("age") > 30
task4 = customers.filter(is_over_30)
print(task4)
# Task 5: Explode orders and calculate total spent per customer
task5 = (
    # One row per order, with the order's struct fields as plain columns
    customers.explode("orders")
    .with_columns(
        pl.col("orders").struct.field("order_id").alias("order_id"),
        pl.col("orders").struct.field("total").alias("order_total"),
    )
    # group_by does not guarantee the order of the groups by default;
    # maintain_order=True keeps them in input order so the rows come out as
    # customer 1, 2, 3, matching the expected output shown for this task.
    .group_by("customer_id", maintain_order=True)
    .agg(
        pl.col("order_total").sum().alias("total_spent"),
        pl.col("order_id").count().alias("order_count"),
    )
)
print(task5)
# Bonus: Create customer_summary struct
# Step 1: one row per order, then aggregate back to one row per customer.
# "profile" is identical on every exploded row of a customer, so first() is
# enough to carry it through the aggregation.
per_customer = (
    customers.explode("orders")
    .with_columns(pl.col("orders").struct.field("total").alias("order_total"))
    # maintain_order=True keeps customers in input order; without it the
    # order of the groups (and therefore of the printed rows) is not guaranteed.
    .group_by("customer_id", maintain_order=True)
    .agg(
        pl.col("profile").first().alias("profile"),
        pl.col("order_total").sum().alias("total_spent"),
        pl.col("order_total").count().alias("order_count"),
    )
)

# Step 2: assemble the summary struct from the profile and the aggregates.
bonus_name = pl.col("profile").struct.field("name")
bonus_address = pl.col("profile").struct.field("address")
bonus = per_customer.with_columns(
    pl.struct(
        [
            (
                bonus_name.struct.field("first") + " " + bonus_name.struct.field("last")
            ).alias("full_name"),
            (
                bonus_address.struct.field("city")
                + ", "
                + bonus_address.struct.field("state")
            ).alias("location"),
            pl.col("order_count"),
            pl.col("total_spent"),
        ]
    ).alias("customer_summary")
)
print(bonus.select(["customer_id", "customer_summary"]))
Expected output for Task 5:
shape: (3, 3)
┌─────────────┬─────────────┬─────────────┐
│ customer_id ┆ total_spent ┆ order_count │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ u32 │
╞═════════════╪═════════════╪═════════════╡
│ 1 ┆ 350.5 ┆ 2 │
│ 2 ┆ 89.99 ┆ 1 │
│ 3 ┆ 650.75 ┆ 3 │
└─────────────┴─────────────┴─────────────┘