Quick Start

This guide walks through a complete v0.3.4 data pipeline: loading a CSV, running vectorized math, inspecting results, and writing to a relational database — all in native Rust speed.

Hello World

import pardox as px

# 1. Load data — parallel Rust CSV parser, automatic type inference
df = px.read_csv("sales_data.csv")
print(f"Loaded {df.shape[0]:,} rows × {df.shape[1]} columns")

# 2. Inspect columns
print(df.columns)
df.show(5)   # ASCII table preview

# 3. Cast and compute
df.cast("quantity", "Float64")
revenue_df = df.mul("price", "quantity")   # new DataFrame with 'result_mul'

# 4. Statistics
std_val = revenue_df.std("result_mul")
print(f"Revenue std dev: {std_val:,.2f}")

# 5. Write to PostgreSQL (COPY FROM STDIN auto-activated for > 10k rows)
from pardox.io import execute_sql
CONN = "postgresql://user:password@localhost:5432/mydb"
execute_sql(CONN, "CREATE TABLE IF NOT EXISTS sales (price FLOAT, quantity FLOAT)")
rows = df.to_sql(CONN, "sales", mode="append")
print(f"Written {rows:,} rows to PostgreSQL")

Step-by-Step Breakdown

1. Ingestion (`read_csv`)

PardoX spawns a Rust thread pool to parse chunks of the CSV file in parallel. No Python objects are created during ingestion — data flows directly into Rust-managed memory buffers.

df = px.read_csv("dataset.csv")

!!! tip “Type inference” The engine scans the first rows to classify each column as Int64, Float64, or Utf8 (string). You can override with an explicit schema:

```python
df = px.read_csv("data.csv", schema={"price": "Float64", "id": "Int64"})
```

2. Inspect (`show`, `shape`, `columns`, `dtypes`)

print(df.shape)      # (50000, 14)
print(df.columns)    # ['id', 'price', 'quantity', ...]
print(df.dtypes)     # {'id': 'Utf8', 'price': 'Float64', ...}
df.show(5)           # ASCII table, first 5 rows

3. Type Casting (`cast`)

When a column is inferred as Int64 but you need Float64 for arithmetic:

df.cast("quantity", "Float64")

Supported types: Int64, Float64, Utf8.

4. Vectorized Arithmetic

# Series operators (via Proxy / __getitem__)
df['total'] = df['price'] * df['quantity']
df['tax']   = df['total'] * 0.16

# DataFrame-level methods (return new DataFrame)
revenue_df    = df.mul("price", "quantity")   # result column: 'result_mul'
profit_df     = df.sub("revenue", "cost")     # result column: 'result_sub'

5. Native Math Methods

# Standard deviation (pure Rust, no NumPy)
std_val = revenue_df.std("result_mul")

# Min-Max normalization → new DataFrame with 'result_minmax' column
normed_df = df.min_max_scale("price")

# Sort by column (CPU or GPU)
sorted_df = df.sort_values("price", ascending=False)
sorted_df = df.sort_values("price", ascending=True, gpu=True)  # GPU Bitonic sort

6. Observer — Data Inspection & Export

# Value frequency table
state_counts = df.value_counts("state")
print(state_counts)   # {'TX': 6345, 'CA': 6301, ...}

# Unique values in a column
unique_cats = df.unique("category")

# Export full DataFrame to Python
records = df.to_dict()   # list of dicts — 50k records
json_str = df.to_json()  # JSON string "[{...}, ...]"

7. Database Write (Relational Conqueror)

from pardox.io import execute_sql, execute_mysql, read_sql

PG_CONN    = "postgresql://pardox:secret@localhost:5432/mydb"
MYSQL_CONN = "mysql://pardox:secret@localhost:3306/mydb"

# PostgreSQL — auto COPY FROM STDIN for > 10k rows
execute_sql(PG_CONN, "CREATE TABLE IF NOT EXISTS sales (price FLOAT, quantity FLOAT)")
rows = df.to_sql(PG_CONN, "sales", mode="append")
print(f"Postgres: {rows:,} rows written")

# MySQL — chunked batch INSERT (auto LOAD DATA if server allows)
execute_mysql(MYSQL_CONN, "CREATE TABLE IF NOT EXISTS sales (price DOUBLE, quantity DOUBLE)")
rows = df.to_mysql(MYSQL_CONN, "sales", mode="append")
print(f"MySQL: {rows:,} rows written")

# Read back from PostgreSQL
df_check = read_sql(PG_CONN, "SELECT COUNT(*) FROM sales")

8. Zero-Copy NumPy Integration

import numpy as np

# Direct pointer from Rust buffer — no data copy
arr = np.array(df["price"])
print(arr.dtype)   # float64
print(arr.mean())  # same value as df["price"].mean()

9. Persist to .prdx

# Save
df.to_prdx("sales_processed.prdx")

# Load (4.6 GB/s read throughput)
df2 = px.read_prdx("sales_processed.prdx")

!!! example “Performance benchmark” Loading 2 GB of data: CSV ~8s · Parquet ~3s · PRDX ~0.5s

Full Pipeline Example

import pardox as px
from pardox.io import execute_sql

CONN = "postgresql://pardox:secret@localhost:5432/analytics"

# Load
df = px.read_csv("sales_50k.csv")
df.cast("quantity", "Float64")

# Transform
df.fillna(0.0)
df['revenue'] = df['price'] * df['quantity']
df['tax']     = df['revenue'] * 0.08

# Analyze
print(f"Total revenue : ${df['revenue'].sum():,.2f}")
print(f"Avg ticket    : ${df['revenue'].mean():,.2f}")
print(f"Std deviation : {df['revenue'].std():,.2f}")

# Inspect
top_states = df.value_counts("state")
print("Top states:", list(top_states.items())[:5])

# Write
execute_sql(CONN, "DROP TABLE IF EXISTS sales_results")
execute_sql(CONN, (
    "CREATE TABLE sales_results "
    "(price FLOAT, quantity FLOAT, revenue FLOAT, tax FLOAT)"
))
rows = df.to_sql(CONN, "sales_results", mode="append")
print(f"\nWrote {rows:,} rows to PostgreSQL")

# Save locally
df.to_prdx("sales_results.prdx")
print("Saved to sales_results.prdx")