PardoX Python SDK Documentation

Installation

pip install pardox

Or install from source:

cd pardox_project
pip install -e .

Quick Start

import pardox as px

# Create DataFrame from dictionary
df = px.DataFrame([
    {"name": "Alice", "age": 30, "score": 85.5},
    {"name": "Bob", "age": 25, "score": 92.0},
    {"name": "Charlie", "age": 35, "score": 78.5}
])

print(df.head())

API Reference

DataFrame Creation

`px.DataFrame(data)`

Create a DataFrame from a list of dictionaries.

import pardox as px

# From list of dicts
df = px.DataFrame([
    {"name": "Alice", "age": 30},
    {"name": "Bob", "age": 25}
])

# From dict with lists
df = px.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [30, 25, 35]
})

I/O Operations

`px.read_csv(path, schema=None, **kwargs)`

Read a CSV file into a DataFrame.

import pardox as px

# Basic CSV reading
df = px.read_csv("data.csv")

# With schema specification
df = px.read_csv("data.csv", schema={
    "id": "Int64",
    "name": "Utf8",
    "amount": "Float64"
})

# With custom delimiter
df = px.read_csv("data.csv", delimiter=";")

`px.read_sql(conn_string, query)`

Execute a SQL query and return results as DataFrame.

import pardox as px

# PostgreSQL
df = px.read_sql(
    "postgresql://user:password@localhost:5432/mydb",
    "SELECT * FROM sales WHERE amount > 1000"
)

`px.read_prdx(path)`

Read a PardoX binary file (.prdx).

import pardox as px

df = px.read_prdx("data.prdx")

DataFrame Methods

`df.head(n=5)`

Return the first n rows.

# First 10 rows
print(df.head(10))

`df.tail(n=5)`

Return the last n rows.

# Last 5 rows
print(df.tail())

`df.shape`

Return the dimensions of the DataFrame (rows, columns).

rows, cols = df.shape
print(f"DataFrame has {rows} rows and {cols} columns")

`df.dtypes`

Return the data types of each column.

print(df.dtypes)
# Output: {'name': 'Utf8', 'age': 'Int64', 'score': 'Float64'}

`df.iloc[start:end]`

Slice rows by position.

# Rows 10-20
subset = df.iloc[10:20]

`df.show(n=10)`

Display an ASCII table representation.

df.show(10)

Column Operations

Column Selection

# Select single column (returns Series)
ages = df["age"]

# Select multiple columns
subset = df[["name", "score"]]

Column Assignment

# Add new column
df["new_col"] = df["age"] * 2

# Or use assign
df = df.assign(new_col=lambda x: x["age"] * 2)

Arithmetic Operations

`df.add(col_a, col_b)`

Add two columns.

# Create new column with sum
df = df.add("amount", "tax")
# Result stored in "result_math_add" column

`df.sub(col_a, col_b)`

Subtract columns.

df = df.sub("price", "discount")

`df.std(col)`

Calculate standard deviation.

std_value = df.std("score")

`df.min_max_scale(col)`

Normalize column to [0, 1] range.

df = df.min_max_scale("amount")

Filtering

Boolean Filtering

# Filter by condition
filtered = df[df["age"] > 25]

# Multiple conditions
filtered = df[(df["age"] > 25) & (df["score"] > 80)]

Aggregations

Available on Series objects:

# Column as Series
col = df["score"]

# Aggregations
total = col.sum()
average = col.mean()
maximum = col.max()
minimum = col.min()
count = col.count()
std_dev = col.std()

Sorting

`df.sort_values(by, ascending=True, gpu=False)`

Sort DataFrame by column values.

# Sort by name ascending
df_sorted = df.sort_values("name")

# Sort by amount descending
df_sorted = df.sort_values("amount", ascending=False)

# Use GPU for large datasets
df_sorted = df.sort_values("amount", gpu=True)

Joins

`df.join(other, on, how='inner')`

Join with another DataFrame.

# Inner join on client_id
result = df.join(
    other=clients_df,
    on="client_id"
)

# Left join
result = df.join(
    other=clients_df,
    on="client_id",
    how="left"
)

Data Cleaning

`df.fillna(value)`

Fill null values.

# Fill numeric columns with 0
df = df.fillna(0.0)

# Fill with specific value
df = df.fillna({"score": 50.0, "name": "Unknown"})

`df.round(decimals)`

Round numeric columns.

# Round to 2 decimal places
df = df.round(2)

Type Casting

`df.cast(column, target_type)`

Convert column to different type.

# Convert string to integer
df = df.cast("age", "Int64")

# Convert to float
df = df.cast("price", "Float64")

Export

`df.to_csv(path, **kwargs)`

Export to CSV.

df.to_csv("output.csv")

# With options
df.to_csv("output.csv", delimiter=";")

`df.to_prdx(path)`

Export to PardoX binary format.

df.to_prdx("data.prdx")

`df.to_sql(conn_string, table, mode='append', conflict_cols=None)`

Write to SQL table.

# Append to table
df.to_sql(
    "postgresql://user:pass@localhost/mydb",
    "sales",
    mode="append"
)

# Upsert with conflict handling
df.to_sql(
    "postgresql://user:pass@localhost/mydb",
    "sales",
    mode="upsert",
    conflict_cols=["id"]
)

Series API

The Series class represents a single column:

s = df["name"]

# Arithmetic
s2 = s + " (copy)"

# Comparisons (return boolean Series)
mask = s == "Alice"

# String operations
upper = s.upper()
lower = s.lower()

Database Support

PostgreSQL

import pardox as px

# Read from PostgreSQL
df = px.read_sql(
    "postgresql://user:password@host:5432/db",
    "SELECT * FROM table"
)

# Write to PostgreSQL
df.to_sql(
    "postgresql://user:password@host:5432/db",
    "table_name",
    mode="append"  # or "upsert"
)

MySQL

# Read from MySQL
df = px.read_mysql(
    "mysql://user:password@host:3306/db",
    "SELECT * FROM table"
)

# Write to MySQL
df.to_mysql(
    "mysql://user:password@host:3306/db",
    "table_name",
    mode="append"
)

SQL Server

# Read from SQL Server
df = px.read_sqlserver(
    "Server=host;Database=db;User Id=user;Password=pass;",
    "SELECT * FROM table"
)

# Write to SQL Server
df.to_sqlserver(
    "Server=host;Database=db;User Id=user;Password=pass;",
    "table_name"
)

MongoDB

# Read from MongoDB
df = px.read_mongodb(
    "mongodb://user:password@host:27017",
    "database.collection"
)

# Write to MongoDB
df.to_mongodb(
    "mongodb://user:password@host:27017",
    "database.collection",
    mode="append"  # or "replace"
)

Performance Tips

Use .prdx format for repeated reads - it’s much faster than CSV
Use GPU sorting for large datasets: df.sort_values(col, gpu=True)
Batch SQL writes: Use mode="append" for bulk inserts
Use fillna() before computations to handle missing values

Error Handling

import pardox as px

try:
    df = px.read_csv("nonexistent.csv")
except FileNotFoundError as e:
    print(f"File not found: {e}")
except Exception as e:
    print(f"Error: {e}")

Binary Format (.prdx)

PardoX’s native binary format provides:

~4.6 GB/s read throughput
Columnar storage (HyperBlock)
Automatic compression

# Write
df.to_prdx("data.prdx")

# Read (much faster than CSV)
df = px.read_prdx("data.prdx")