Skip to main content

Overview

The tif1 library supports two DataFrame backends:
  1. Pandas (default): Widely used, excellent compatibility, rich ecosystem
  2. Polars: High performance, 2-4x faster for large datasets, modern API
Both backends are selected through the same tif1 API, making it easy to switch between them. The DataFrames you get back follow the conventions of the chosen library.

Why Choose Polars?

Polars offers significant performance advantages:
  • 2-4x faster data loading and processing
  • Lower memory usage through efficient internal representation
  • Better parallelization using Rust-based execution engine
  • Lazy evaluation for query optimization
  • Modern API with method chaining

Setting the Backend

Per-Session

Specify the backend when loading a session:
import tif1

# Positional arguments shared by both calls: year, event name, session name
session_args = (2025, "Abu Dhabi Grand Prix", "Practice 1")

# Use pandas (default)
session_pandas = tif1.get_session(*session_args, lib="pandas")

# Use polars
session_polars = tif1.get_session(*session_args, lib="polars")

# Show which backend each session was created with
print(f"Pandas session: {session_pandas.lib}")
print(f"Polars session: {session_polars.lib}")

Global Configuration

Set the default backend in a .tif1rc file:
{
  "lib": "polars"
}
Or via environment variable:
# Make polars the default backend for everything started from this shell
export TIF1_LIB=polars
python your_script.py

Runtime Configuration

import tif1
from tif1.config import get_config

# Change the default backend at runtime, before creating any session
config = get_config()
config.set("lib", "polars")

# All sessions will now use polars by default
session = tif1.get_session(2025, "Monaco Grand Prix", "Race")
print(f"Backend: {session.lib}")

Performance Comparison

Compare backends with the same data:
import tif1
import time

# Clear cache for fair comparison
cache = tif1.get_cache()
cache.clear()

print("BACKEND PERFORMANCE COMPARISON")
print("=" * 60)

# Test with Polars
print("\n1. POLARS BACKEND")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump and is coarse on some platforms.
start = time.perf_counter()
session_polars = tif1.get_session(
    2025,
    "Abu Dhabi Grand Prix",
    "Practice 1",
    lib="polars",
)
laps_polars = session_polars.laps
polars_time = time.perf_counter() - start

print(f"   Load time: {polars_time:.2f}s")
print(f"   Total laps: {len(laps_polars)}")
print(f"   DataFrame type: {type(laps_polars).__name__}")

# Clear cache again so the pandas run starts from the same state
cache.clear()

# Test with Pandas
print("\n2. PANDAS BACKEND")
start = time.perf_counter()
session_pandas = tif1.get_session(
    2025,
    "Abu Dhabi Grand Prix",
    "Practice 1",
    lib="pandas",
)
laps_pandas = session_pandas.laps
pandas_time = time.perf_counter() - start

print(f"   Load time: {pandas_time:.2f}s")
print(f"   Total laps: {len(laps_pandas)}")
print(f"   DataFrame type: {type(laps_pandas).__name__}")

# Performance comparison
print("\n3. COMPARISON")
# Both timings must be positive for either ratio below to be defined
if polars_time > 0 and pandas_time > 0:
    speedup = pandas_time / polars_time
    if speedup >= 1:
        print(f"   Speedup: {speedup:.1f}x faster with Polars")
    else:
        # Do not claim a speedup when Polars was actually the slower run
        print(f"   Slowdown: {polars_time / pandas_time:.1f}x slower with Polars")
Typical results:
1. POLARS BACKEND
   Load time: 1.23s
   Total laps: 342
   DataFrame type: DataFrame

2. PANDAS BACKEND
   Load time: 2.87s
   Total laps: 342
   DataFrame type: DataFrame

3. COMPARISON
   Speedup: 2.3x faster with Polars

Working with Polars

Basic Operations

Polars DataFrames work similarly to pandas:
import tif1

session = tif1.get_session(2025, "Monaco Grand Prix", "Race", lib="polars")
laps = session.laps

# Shape and columns
print(f"Shape: {laps.shape}")
print(f"Columns: {laps.columns}")

# Head and tail
print(laps.head(10))
print(laps.tail(5))

# Select columns
wanted_columns = ["Driver", "LapTime", "Compound"]
subset = laps.select(wanted_columns)

# Filter rows: build each boolean mask first, then apply it
# NOTE(review): comparing against 90.0 assumes LapTime is numeric — confirm
is_fast = laps["LapTime"] < 90.0
is_ver = laps["Driver"] == "VER"
fast_laps = laps.filter(is_fast)
ver_laps = laps.filter(is_ver)

Polars-Specific Operations

Take advantage of Polars’ powerful API:
import polars as pl

# Group by and aggregate: one row per driver with the best lap time,
# the number of laps and the first compound seen for that driver
fastest_by_driver = (
    laps.group_by("Driver")
    .agg([
        pl.col("LapTime").min().alias("fastest_time"),
        pl.col("LapNumber").count().alias("lap_count"),
        pl.col("Compound").first().alias("tire_compound")
    ])
    # Polars places nulls first by default; without nulls_last=True a driver
    # with no timed lap would appear at the top of the "fastest" table.
    .sort("fastest_time", nulls_last=True)
)

print(fastest_by_driver.head(10))

# Method chaining: per-driver, per-stint averages and best laps
analysis = (
    laps
    # Rows without a lap time are dropped before aggregating
    .filter(pl.col("LapTime").is_not_null())
    .group_by(["Driver", "Stint"])
    .agg([
        pl.col("LapTime").mean().alias("avg_time"),
        pl.col("LapTime").min().alias("best_time"),
        pl.col("TyreLife").max().alias("stint_length")
    ])
    .sort(["Driver", "Stint"])
)

print(analysis)

Lazy Evaluation

Use lazy evaluation for complex queries:
import polars as pl

# Name the building blocks of the query up front
green_flag = pl.col("TrackStatus") == 1  # Green flag only
has_lap_time = pl.col("LapTime").is_not_null()
lap_time_stats = [
    pl.col("LapTime").min().alias("fastest"),
    pl.col("LapTime").mean().alias("average"),
]

# Convert to lazy frame
lazy_laps = laps.lazy()

# Build query
query = (
    lazy_laps
    .filter(green_flag)
    .filter(has_lap_time)
    .group_by("Driver")
    .agg(lap_time_stats)
    .sort("fastest")
)

# Execute query (optimized by Polars)
result = query.collect()
print(result)

Working with Pandas

Basic Operations

import tif1

session = tif1.get_session(2025, "Monaco Grand Prix", "Race", lib="pandas")
laps = session.laps

# Shape and columns
print(f"Shape: {laps.shape}")
print(f"Columns: {laps.columns}")

# Head and tail
print(laps.head(10))
print(laps.tail(5))

# Select columns
wanted_columns = ["Driver", "LapTime", "Compound"]
subset = laps[wanted_columns]

# Filter rows: build each boolean mask first, then index with it
# NOTE(review): comparing against 90.0 assumes LapTime is numeric — confirm
is_fast = laps["LapTime"] < 90.0
is_ver = laps["Driver"] == "VER"
fast_laps = laps[is_fast]
ver_laps = laps[is_ver]

Pandas-Specific Operations

import pandas as pd

# Group by and aggregate, naming the output columns directly
fastest_by_driver = (
    laps.groupby("Driver", observed=True)
    .agg(
        fastest_time=("LapTime", "min"),
        lap_count=("LapNumber", "count"),
        tire_compound=("Compound", "first"),
    )
    .sort_values("fastest_time")
)

print(fastest_by_driver.head(10))

# Complex filtering: combine named boolean masks
is_green_flag = laps["TrackStatus"] == 1
has_lap_time = laps["LapTime"].notna()
green_flag_laps = laps[is_green_flag & has_lap_time]

# Pivot tables: best lap time per driver (rows) and stint (columns)
pivot_options = {
    "values": "LapTime",
    "index": "Driver",
    "columns": "Stint",
    "aggfunc": "min",
}
pivot = pd.pivot_table(laps, **pivot_options)

print(pivot)

Converting Between Backends

Polars to Pandas

import tif1

# Load with Polars
session = tif1.get_session(2025, "Monaco Grand Prix", "Race", lib="polars")
laps_polars = session.laps

# __name__ is the bare class name, without the module it comes from
print(f"Polars type: {type(laps_polars).__name__}")

# Convert to pandas
# NOTE(review): polars documents to_pandas() as requiring pandas and pyarrow
# to be installed — confirm for the polars version in use
laps_pandas = laps_polars.to_pandas()

print(f"Pandas type: {type(laps_pandas).__name__}")
print(f"Shape: {laps_pandas.shape}")

Pandas to Polars

import polars as pl
import tif1

# Load with Pandas
session = tif1.get_session(2025, "Monaco Grand Prix", "Race", lib="pandas")
laps_pandas = session.laps

# __name__ is the bare class name, without the module it comes from
print(f"Pandas type: {type(laps_pandas).__name__}")

# Convert to polars
# NOTE(review): polars documents from_pandas() as requiring pandas and
# pyarrow to be installed — confirm for the polars version in use
laps_polars = pl.from_pandas(laps_pandas)

print(f"Polars type: {type(laps_polars).__name__}")
print(f"Shape: {laps_polars.shape}")

Backend-Agnostic Code

Write code that works with both backends:
import tif1

def analyze_session(year, event, session_type, lib="pandas"):
    """Load a session with the chosen backend and print its fastest laps.

    Args:
        year: Season year, forwarded to ``tif1.get_session``.
        event: Event name, e.g. ``"Monaco Grand Prix"``.
        session_type: Session name, e.g. ``"Race"``.
        lib: ``"polars"`` selects the polars aggregation path; any other
            value takes the pandas path.

    Returns:
        The minimum ``LapTime`` per driver, sorted ascending with drivers
        that have no lap time placed last. On the polars path this is a
        DataFrame with ``Driver`` and ``fastest`` columns; on the pandas
        path it is a DataFrame indexed by ``Driver`` with one ``fastest``
        column.
    """
    session = tif1.get_session(year, event, session_type, lib=lib)
    laps = session.laps

    # These operations work with both backends
    total_laps = len(laps)
    drivers = laps["Driver"].unique()

    print(f"Backend: {lib}")
    print(f"Total laps: {total_laps}")
    print(f"Drivers: {len(drivers)}")

    # Backend-specific aggregation
    if lib == "polars":
        # Imported lazily so the pandas path does not need polars installed
        import polars as pl
        fastest = (
            laps.group_by("Driver")
            .agg(pl.col("LapTime").min().alias("fastest"))
            # Polars sorts nulls first by default; pandas sort_values puts
            # NaN last. nulls_last=True keeps both backends in agreement so
            # head(5) really shows the five fastest drivers.
            .sort("fastest", nulls_last=True)
        )
    else:  # pandas
        fastest = (
            laps.groupby("Driver", observed=True)["LapTime"]
            .min()
            .sort_values()
            .to_frame("fastest")
        )

    print("\nFastest laps:")
    print(fastest.head(5))

    return fastest

# Test with both backends, separated by a rule for readability
separator = "=" * 60

print(separator)
analyze_session(2025, "Monaco Grand Prix", "Race", lib="pandas")

print("\n" + separator)
analyze_session(2025, "Monaco Grand Prix", "Race", lib="polars")

When to Use Each Backend

Use Pandas When:

  • You need maximum compatibility with existing pandas code
  • You’re using libraries that require pandas DataFrames (matplotlib, seaborn, etc.)
  • You’re working with small datasets (<10,000 rows)
  • You need pandas-specific features (MultiIndex, etc.)
  • You’re teaching or learning (pandas has more tutorials)

Use Polars When:

  • You need maximum performance
  • You’re working with large datasets (>10,000 rows)
  • You’re doing complex aggregations or transformations
  • Memory usage is a concern
  • You want modern, consistent API design
  • You’re starting a new project

Complete Comparison Example

import tif1
import time

def benchmark_backend(lib):
    """Time session loading and a per-driver aggregation for one backend.

    The cache is cleared first, then the "Abu Dhabi Grand Prix" 2025
    "Practice 1" session is loaded and its laps are aggregated per driver.
    Both backends run the same workload (min and mean of ``LapTime``, count
    of ``LapNumber``, sorted by the fastest lap) so the timings compare
    like with like.

    Args:
        lib: ``"polars"`` selects the polars aggregation path; any other
            value takes the pandas path.

    Returns:
        A ``(load_time, agg_time)`` tuple of elapsed seconds.
    """
    print(f"\nTesting {lib.upper()} backend:")
    print("-" * 60)

    # Clear cache
    cache = tif1.get_cache()
    cache.clear()

    # Load session; perf_counter() is the monotonic, high-resolution clock
    # intended for measuring short durations
    start = time.perf_counter()
    session = tif1.get_session(
        2025,
        "Abu Dhabi Grand Prix",
        "Practice 1",
        lib=lib,
    )
    laps = session.laps
    load_time = time.perf_counter() - start

    print(f"Load time: {load_time:.2f}s")
    print(f"Laps loaded: {len(laps)}")
    print(f"DataFrame type: {type(laps).__module__}.{type(laps).__name__}")

    # Benchmark aggregation
    start = time.perf_counter()
    if lib == "polars":
        # Imported lazily so the pandas path does not need polars installed
        import polars as pl
        result = (
            laps.group_by("Driver")
            .agg([
                pl.col("LapTime").min().alias("fastest"),
                pl.col("LapTime").mean().alias("average"),
                pl.col("LapNumber").count().alias("count")
            ])
            # Match pandas, which places missing values last when sorting
            .sort("fastest", nulls_last=True)
        )
    else:
        # Same three aggregates and the same final sort as the polars branch
        result = (
            laps.groupby("Driver", observed=True)
            .agg(
                fastest=("LapTime", "min"),
                average=("LapTime", "mean"),
                count=("LapNumber", "count"),
            )
            .sort_values("fastest")
        )
    agg_time = time.perf_counter() - start

    print(f"Aggregation time: {agg_time:.3f}s")

    return load_time, agg_time

print("BACKEND COMPARISON BENCHMARK")
print("=" * 60)

pandas_load, pandas_agg = benchmark_backend("pandas")
polars_load, polars_agg = benchmark_backend("polars")

print("\n" + "=" * 60)
print("RESULTS")
print("=" * 60)

# A very short run can measure as 0.0 s on a coarse clock; guard the division
# instead of crashing with ZeroDivisionError.
comparisons = (
    ("Load time", pandas_load, polars_load),
    ("Aggregation", pandas_agg, polars_agg),
)
for label, pandas_seconds, polars_seconds in comparisons:
    if polars_seconds > 0:
        print(f"{label} speedup: {pandas_seconds / polars_seconds:.2f}x")
    else:
        print(f"{label} speedup: n/a (polars time too small to measure)")

Next Steps