Series Overview:
Part 1: Fundamentals — Series, DataFrames, I/O, Basic Operations ✅
Part 2: Data Manipulation — Indexing, Filtering, GroupBy, Merge, Reshape ✅
Part 3 (This Post): Advanced Pandas — Performance, Time Series, ML, Interview Questions
1. MultiIndex — Hierarchical Indexing
A MultiIndex (or hierarchical index) lets you have multiple levels of indexing on rows or columns. Think of it as a 3D+ spreadsheet compressed into 2D.
Creating MultiIndex DataFrames
import pandas as pd
import numpy as np
# ── Method 1: From tuples ──────────────────────────────────────────────
idx = pd.MultiIndex.from_tuples(
[("IT", "Senior"), ("IT", "Junior"), ("HR", "Senior"), ("HR", "Junior")],
names=["dept", "level"]
)
df = pd.DataFrame({"salary": [80000, 55000, 70000, 45000], "count": [5, 8, 4, 6]}, index=idx)
# ── Method 2: From product (all combinations) ──────────────────────────
idx = pd.MultiIndex.from_product(
[["IT", "HR", "Finance"], ["Q1", "Q2", "Q3", "Q4"]],
names=["dept", "quarter"]
)
# ── Method 3: From GroupBy ─────────────────────────────────────────────
df = pd.DataFrame({
"dept": ["IT", "IT", "HR", "HR", "Finance"],
"level": ["Senior", "Junior", "Senior", "Junior", "Senior"],
"salary": [80000, 55000, 70000, 45000, 85000]
})
grouped = df.groupby(["dept", "level"]).agg(avg_salary=("salary", "mean"), count=("salary", "count"))
print(grouped)
avg_salary count
dept level
Finance Senior 85000.0 1
HR Junior 45000.0 1
Senior 70000.0 1
IT Junior 55000.0 1
Senior 80000.0 1
Indexing into MultiIndex
# Direct access
grouped.loc["IT"] # all IT rows
grouped.loc["IT", "Senior"] # IT Senior specifically
grouped.loc[("IT", "Senior")] # same as above
grouped.loc[("IT", "Senior"), "avg_salary"] # single value
# Cross-section — slice across one level
grouped.xs("Senior", level="level") # all Senior rows across all depts
grouped.xs("IT", level="dept") # same as grouped.loc["IT"]
# Slice — use slice() object for ranges
grouped.loc[(slice(None), "Junior"), :] # all Junior rows
# Or use pd.IndexSlice for clarity:
idx_slice = pd.IndexSlice
grouped.loc[idx_slice["IT":"HR", "Senior"], :]
Working with MultiIndex
# Reset to flat DataFrame
grouped.reset_index() # all index levels become columns
# Stack / Unstack — move index levels to/from columns
grouped.unstack("level") # 'level' index → columns
grouped.unstack() # default: last index level → columns
# Sort index
grouped.sort_index()
grouped.sort_index(level="dept", ascending=True)
# Check levels
grouped.index.names # ['dept', 'level']
grouped.index.get_level_values("dept") # only dept values
grouped.index.get_level_values(0) # by position
2. ⏰ Time Series — Complete Guide
Time series data is everywhere — stock prices, server logs, IoT sensor readings, sales data. Pandas has outstanding time series support.
DatetimeIndex — The Foundation
import pandas as pd
import numpy as np
# Create a time-series DataFrame
dates = pd.date_range("2024-01-01", periods=365, freq="D")
ts = pd.Series(np.random.randn(365).cumsum(), index=dates, name="value")
# DatetimeIndex allows date-based slicing
ts["2024-03"] # March 2024 only
ts["2024-Q1"] # Q1 2024 only
ts["2024-01-01":"2024-06-30"] # date range
# Access by partial date string
ts.loc["2024-05"] # all May rows
# Set a datetime column as index
df["date"] = pd.to_datetime(df["date"])
df = df.set_index("date")
df = df.sort_index() # always sort time series!
resample() — Aggregate by Time Period
resample() is like groupby() but for time periods.
# Generate hourly data
idx = pd.date_range("2024-01-01", periods=24*30, freq="h") # 30 days of hourly data
ts = pd.Series(np.random.uniform(10, 100, len(idx)), index=idx)
# ── Downsample: more granular → less granular ──────────────────────────
ts.resample("D").sum() # hourly → daily (sum)
ts.resample("D").mean() # daily average
ts.resample("D").ohlc() # open, high, low, close (finance!)
ts.resample("W").sum() # weekly
ts.resample("ME").sum() # monthly (month-end)
ts.resample("QE").mean() # quarterly
ts.resample("YE").sum() # yearly
# ── Upsample: less granular → more granular ────────────────────────────
monthly_ts = ts.resample("ME").sum()
daily_ts = monthly_ts.resample("D").ffill() # forward-fill the gaps
daily_ts = monthly_ts.resample("D").interpolate("linear") # interpolate
# ── Resample with multiple aggregations ───────────────────────────────
ts.resample("W").agg(["sum", "mean", "min", "max"])
# ── Custom period ─────────────────────────────────────────────────────
ts.resample("4h").sum() # every 4 hours
ts.resample("2W").mean() # every 2 weeks
# ── With label/closed control ─────────────────────────────────────────
ts.resample("ME", closed="left", label="left").sum()
rolling() — Rolling Window Operations
ts = pd.Series(np.random.uniform(50, 150, 100), name="price")
# ── Basic rolling ─────────────────────────────────────────────────────
ts.rolling(7).mean() # 7-day moving average
ts.rolling(30).mean() # 30-day moving average
ts.rolling(7).sum()
ts.rolling(7).std() # rolling volatility
ts.rolling(7).min()
ts.rolling(7).max()
# ── min_periods: compute even with fewer points ───────────────────────
ts.rolling(7, min_periods=1).mean() # don't require 7 data points
# ── Exponentially Weighted Moving Average (EWMA) ──────────────────────
ts.ewm(span=7).mean() # recent values weighted more
ts.ewm(span=30, adjust=False).mean()
# ── Rolling with apply ────────────────────────────────────────────────
ts.rolling(14).apply(lambda x: x[-1] / x[0] - 1) # 14-day return
# ── Rolling correlation between two series ────────────────────────────
ts2 = pd.Series(np.random.uniform(50, 150, 100))
ts.rolling(30).corr(ts2)
# ── Practical example: Bollinger Bands ────────────────────────────────
prices = pd.Series(np.random.uniform(100, 200, 200).cumsum() / 100 + 150)
window = 20
ma = prices.rolling(window).mean()
std = prices.rolling(window).std()
bollinger = pd.DataFrame({
"price": prices,
"ma_20": ma,
"upper_band": ma + (std * 2),
"lower_band": ma - (std * 2)
})
shift() — Lag and Lead Operations
ts = pd.Series([10, 20, 30, 40, 50], index=pd.date_range("2024-01", periods=5, freq="ME"))
ts.shift(1) # lag 1 period (previous value)
ts.shift(-1) # lead 1 period (next value)
ts.shift(3) # lag 3 periods
# ── Calculate returns using shift ─────────────────────────────────────
prices = pd.Series([100, 105, 103, 108, 112])
returns_pct = prices.pct_change() # % change from previous
returns_abs = prices.diff() # absolute change
returns_2day = prices.pct_change(periods=2) # 2-period returns
# ── Useful for ML lag features ────────────────────────────────────────
df = pd.DataFrame({"value": [10, 20, 30, 40, 50, 60]})
df["lag_1"] = df["value"].shift(1)
df["lag_2"] = df["value"].shift(2)
df["lag_3"] = df["value"].shift(3)
df["next_value"] = df["value"].shift(-1) # future value (target)
df["rolling_mean_3"] = df["value"].rolling(3).mean()
Time Zones
# Create time zone aware datetime
ts = pd.Series([1, 2, 3], index=pd.date_range("2024-01-01", periods=3, freq="D"))
# Localize (naive → aware)
ts_utc = ts.tz_localize("UTC")
# Convert to another timezone
ts_ist = ts_utc.tz_convert("Asia/Kolkata") # IST (India)
ts_est = ts_utc.tz_convert("US/Eastern")
# Remove timezone info
ts_naive = ts_ist.tz_localize(None)
Business Day Operations
from pandas.tseries.offsets import BDay, BMonthEnd, BQuarterEnd
# Business days only
bdays = pd.date_range("2024-01-01", periods=20, freq=BDay())
# Business month end
bme = pd.date_range("2024-01-01", periods=6, freq=BMonthEnd())
# Add business days
date = pd.Timestamp("2024-01-01")
date + BDay(5) # 5 business days later
# Check if business day
pd.Timestamp("2024-01-01").dayofweek < 5 # Monday=0, Sunday=6
3. ⚡ Performance Optimization
Profiling Your Code
import pandas as pd
import numpy as np
import time
# Method 1: time.time()
start = time.time()
result = df.groupby("dept")["salary"].mean()
elapsed = time.time() - start
print(f"Time: {elapsed:.4f} seconds")
# Method 2: %timeit (Jupyter)
# %timeit df.groupby("dept")["salary"].mean()
# Method 3: memory_profiler
# pip install memory_profiler
# %load_ext memory_profiler
# %memit df.groupby("dept")["salary"].mean()
# Check memory usage
df.memory_usage(deep=True) # bytes per column
df.memory_usage(deep=True).sum() / 1024**2 # total in MB
Memory Optimization Techniques
import pandas as pd
import numpy as np
# ── Start with a memory-heavy DataFrame ───────────────────────────────
df = pd.DataFrame({
"user_id": np.random.randint(1, 100000, 2_000_000),
"age": np.random.randint(18, 80, 2_000_000),
"score": np.random.uniform(0, 100, 2_000_000),
"status": np.random.choice(["active", "inactive", "pending"], 2_000_000),
"dept": np.random.choice(["IT", "HR", "Finance", "Marketing"], 2_000_000),
"salary": np.random.randint(30000, 150000, 2_000_000).astype(float),
})
def mem_mb(df):
    """Return the DataFrame's deep memory footprint in megabytes."""
    total_bytes = df.memory_usage(deep=True).sum()
    return total_bytes / 1024**2
print(f"Before: {mem_mb(df):.1f} MB")
# ── Optimization 1: Integer downcasting ───────────────────────────────
df["user_id"] = pd.to_numeric(df["user_id"], downcast="unsigned") # uint32 instead of int64
df["age"] = pd.to_numeric(df["age"], downcast="unsigned") # uint8 (0-255 is enough)
# ── Optimization 2: Float downcasting ────────────────────────────────
df["score"] = df["score"].astype("float32")
# ── Optimization 3: Categorical for low-cardinality strings ──────────
df["status"] = df["status"].astype("category")
df["dept"] = df["dept"].astype("category")
# ── Optimization 4: Integer salary (no decimal needed) ───────────────
df["salary"] = df["salary"].astype("int32")
print(f"After: {mem_mb(df):.1f} MB")
# Typical result: 150 MB → 35 MB (77% reduction!)
# ── Auto-optimize function ────────────────────────────────────────────
def optimize_dataframe(df):
    """Automatically downcast numeric types and convert low-cardinality
    strings to category.

    Columns are modified in place (matching the original behavior) and the
    DataFrame is returned for chaining.

    Fixes vs. the naive version:
      * guards against ZeroDivisionError on an empty DataFrame;
      * uses ``pd.api.types`` predicates so int8/int16/uint* and float16
        columns are handled, not just the literal "int64"/"int32" dtypes.
    """
    n_rows = len(df)
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == "object":
            # Convert to category if cardinality is low (< 50% unique);
            # skip the check entirely for an empty frame.
            if n_rows > 0 and df[col].nunique() / n_rows < 0.5:
                df[col] = df[col].astype("category")
        elif pd.api.types.is_integer_dtype(col_type):
            # downcast="integer" picks the smallest width that fits the data.
            df[col] = pd.to_numeric(df[col], downcast="integer")
        elif pd.api.types.is_float_dtype(col_type):
            df[col] = pd.to_numeric(df[col], downcast="float")
    return df
df_optimized = optimize_dataframe(df.copy())
print(f"Auto-optimized: {mem_mb(df_optimized):.1f} MB")
Speed Optimization: Avoid apply(), Use Vectorized Operations
import numpy as np
import timeit
df = pd.DataFrame({
"a": np.random.randint(1, 100, 1_000_000),
"b": np.random.randint(1, 100, 1_000_000),
})
# ❌ Slowest: apply() with Python function
# ~5-10 seconds for 1M rows
# df.apply(lambda row: row["a"] + row["b"], axis=1)
# 🟡 Faster: apply() on Series
# ~500ms
# df["a"].apply(lambda x: x * 2)
# ✅ Fast: Pandas vectorized
# ~10ms
result = df["a"] + df["b"]
# ✅ Fastest for complex logic: NumPy
result = np.where(df["a"] > df["b"], df["a"], df["b"])
# Speed comparison example:
print("apply (row):", end=" ")
%timeit df.apply(lambda row: row["a"] + row["b"], axis=1) # ~5s
print("vectorized:", end=" ")
%timeit df["a"] + df["b"] # ~5ms
print("numpy:", end=" ")
%timeit df["a"].values + df["b"].values # ~2ms
Speed Optimization: Use .values or .to_numpy() for Loops
# When you genuinely need to loop (rare), use .values to avoid Pandas overhead
# ❌ Slow: pandas overhead per iteration
for val in df["salary"]:
pass
# ✅ Fast: raw numpy array iteration
for val in df["salary"].values:
pass
# Even better: vectorize the whole thing!
eval() and query() for Large DataFrames
# For large DataFrames, eval() and query() use numexpr (C-based) under the hood
# They're faster than regular operations AND use less memory
df = pd.DataFrame(np.random.randn(1_000_000, 4), columns=["A", "B", "C", "D"])
# ❌ Standard (creates 3 intermediate arrays)
result = df["A"] + df["B"] + df["C"] + df["D"]
# ✅ eval() (one pass, less memory, faster for large DFs)
result = df.eval("A + B + C + D")
# Complex expressions
df.eval("E = A * B + C ** 2", inplace=True)
# query() also uses numexpr
df.query("A > 0 and B < 0 and C > 0.5")
Parallel Processing with Dask (for Large Data)
# When Pandas is too slow for your dataset size:
# pip install dask
import dask.dataframe as dd
# Read large file
ddf = dd.read_csv("huge_file.csv")
# Same Pandas-like API, but computed lazily
result = ddf.groupby("dept")["salary"].mean().compute()
4. 🔄 Pandas 2.x — What's New and Important
Copy-on-Write (CoW) — Biggest Change in Pandas 2.0
# In Pandas 2.0+, Copy-on-Write is introduced
# In Pandas 3.0, it becomes the default
# Before CoW: modifying a slice might or might not modify the original
# After CoW: slices are ALWAYS independent — no more SettingWithCopyWarning!
import pandas as pd
# Enable CoW (Pandas 2.x)
pd.options.mode.copy_on_write = True
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
subset = df["a"] # no copy yet (lazy)
subset.iloc[0] = 99 # NOW a copy is made — df["a"] is NOT modified
# With CoW, this is the new clean behavior:
# You always get a copy when you modify, never silent mutation
New String Dtype
# Old: object dtype (Python objects — slow, memory-heavy)
s_old = pd.Series(["hello", "world"])
print(s_old.dtype) # object
# New: string dtype (backed by Arrow or Pandas StringArray)
s_new = pd.Series(["hello", "world"], dtype="string") # or pd.StringDtype()
print(s_new.dtype) # string
# Benefits:
# - pd.NA instead of np.nan for missing values (semantically correct)
# - Faster string operations
# - Less memory
# Convert existing column
df["name"] = df["name"].astype("string")
# New nullable integer types (can have NA/null)
df["age"] = df["age"].astype("Int64") # capital I = nullable integer
# vs
df["age"] = df["age"].astype("int64") # lowercase = cannot have NA
Arrow-Backed DataFrames (Pandas 2.0+)
# pip install pyarrow
import pandas as pd
import pyarrow as pa
# Use ArrowDtype for significant speed and memory gains
df = pd.DataFrame({
"name": pd.array(["Alice", "Bob"], dtype=pd.ArrowDtype(pa.string())),
"salary": pd.array([50000, 60000], dtype=pd.ArrowDtype(pa.int32())),
})
# Or use backend="pyarrow" when reading
df = pd.read_csv("data.csv", dtype_backend="pyarrow")
df = pd.read_parquet("data.parquet", dtype_backend="pyarrow")
# Benefits:
# - Much faster string operations
# - Better null handling
# - Interoperability with Arrow ecosystem (Spark, Polars, etc.)
pd.convert_dtypes() — Smart Type Inference
df = pd.read_csv("data.csv") # everything is object/float64
# Automatically convert to the best nullable dtypes
df_better = df.convert_dtypes()
# With Arrow backend
df_arrow = df.convert_dtypes(dtype_backend="pyarrow")
5. 🤖 Pandas + Machine Learning
Feature Engineering with Pandas
import pandas as pd
import numpy as np
df = pd.DataFrame({
"age": [25, 30, 35, 28, 45],
"salary": [50000, 65000, 75000, 60000, 90000],
"dept": ["IT", "HR", "IT", "Finance", "IT"],
"hire_date": pd.to_datetime(["2020-01-15", "2018-06-01", "2015-09-22", "2021-03-10", "2012-11-30"]),
"gender": ["M", "F", "M", "F", "M"],
"target": [0, 1, 1, 0, 1]
})
today = pd.Timestamp("2024-01-01")
# ── Numerical features ─────────────────────────────────────────────────
df["tenure_years"] = ((today - df["hire_date"]).dt.days / 365.25).round(1)
df["salary_per_year"] = df["salary"] / df["tenure_years"]
df["log_salary"] = np.log1p(df["salary"]) # log transform (reduces skew)
df["salary_squared"] = df["salary"] ** 2 # polynomial feature
# ── Binning (discretization) ───────────────────────────────────────────
df["age_group"] = pd.cut(df["age"], bins=[0, 30, 40, 50, 100],
labels=["<30", "30-40", "40-50", "50+"])
df["salary_quantile"] = pd.qcut(df["salary"], q=4,
labels=["Q1", "Q2", "Q3", "Q4"])
# ── Encoding: One-Hot Encoding ─────────────────────────────────────────
dept_dummies = pd.get_dummies(df["dept"], prefix="dept", drop_first=True, dtype=int)
gender_dummy = pd.get_dummies(df["gender"], prefix="gender", drop_first=True, dtype=int)
df = pd.concat([df, dept_dummies, gender_dummy], axis=1)
# ── Encoding: Label Encoding (ordinal) ───────────────────────────────
level_order = {"Junior": 0, "Mid": 1, "Senior": 2, "Lead": 3}
df["level_encoded"] = df.get("level", pd.Series()).map(level_order)
# ── Encoding: Target Encoding (mean of target per category) ──────────
target_mean = df.groupby("dept")["target"].mean()
df["dept_target_enc"] = df["dept"].map(target_mean)
# ── Date features ─────────────────────────────────────────────────────
df["hire_month"] = df["hire_date"].dt.month
df["hire_quarter"] = df["hire_date"].dt.quarter
df["hire_weekday"] = df["hire_date"].dt.weekday
df["hire_year"] = df["hire_date"].dt.year
# ── Interaction features ───────────────────────────────────────────────
df["age_x_salary"] = df["age"] * df["salary"]
df["age_div_tenure"] = df["age"] / (df["tenure_years"] + 1)
Normalization and Scaling
# Min-Max Scaling (0 to 1)
def min_max_scale(series):
    """Scale a numeric Series into the [0, 1] range (min-max normalization).

    A constant series would divide by zero and produce NaN/inf in the naive
    formula; here it maps to 0.0 everywhere (index and name preserved).
    """
    lo, hi = series.min(), series.max()
    span = hi - lo
    if span == 0:
        # Constant input: pin every value to the bottom of the range.
        return series * 0.0
    return (series - lo) / span
df["salary_scaled"] = min_max_scale(df["salary"])
# Z-score Standardization
def z_score(series):
    """Standardize a Series: subtract the mean, divide by the (sample) std."""
    centered = series - series.mean()
    return centered / series.std()
df["salary_zscore"] = z_score(df["salary"])
# Using sklearn (most common in practice)
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = StandardScaler()
df["salary_std"] = scaler.fit_transform(df[["salary"]])
Preparing Data for scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# ── Select features and target ─────────────────────────────────────────
feature_cols = ["age", "tenure_years", "log_salary", "dept_HR", "dept_IT", "gender_M"]
feature_cols = [c for c in feature_cols if c in df.columns] # safely filter
X = df[feature_cols]
y = df["target"]
# Check for remaining NaN
print(f"NaN in X: {X.isnull().sum().sum()}")
X = X.fillna(X.median()) # fill remaining NaN
# ── Split ──────────────────────────────────────────────────────────────
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")
# ── Train ──────────────────────────────────────────────────────────────
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# ── Evaluate ───────────────────────────────────────────────────────────
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")
# ── Feature Importance back into Pandas ───────────────────────────────
feat_importance = pd.DataFrame({
"feature": feature_cols,
"importance": model.feature_importances_
}).sort_values("importance", ascending=False)
print("\nFeature Importance:")
print(feat_importance.to_string(index=False))
Pandas for EDA (Exploratory Data Analysis)
import pandas as pd
import numpy as np
def eda_report(df):
    """Generate a comprehensive EDA report for a DataFrame.

    Prints shape, per-column null/unique stats, numeric summary statistics,
    top categorical values, duplicate counts, and the strongest pairwise
    correlations. Returns the per-column overview DataFrame.

    NOTE(review): several lines divide by len(df) — an empty DataFrame would
    raise ZeroDivisionError; callers are expected to pass non-empty frames.
    """
    print("=" * 70)
    print("📊 EXPLORATORY DATA ANALYSIS REPORT")
    print("=" * 70)
    print(f"\n📐 Shape: {df.shape[0]:,} rows × {df.shape[1]} columns\n")
    # Column overview: dtype, null and uniqueness profile per column.
    col_info = pd.DataFrame({
        "dtype": df.dtypes,
        "non_null": df.notnull().sum(),
        "null_count": df.isnull().sum(),
        "null_pct": (df.isnull().sum() / len(df) * 100).round(2),
        "unique": df.nunique(),
        "unique_pct": (df.nunique() / len(df) * 100).round(2)
    })
    print("📋 Column Overview:")
    print(col_info.to_string())
    # Numeric stats: moments including skew/kurtosis for distribution shape.
    numeric_cols = df.select_dtypes(include="number").columns
    if len(numeric_cols) > 0:
        print("\n📈 Numeric Column Statistics:")
        stats = df[numeric_cols].agg(["mean", "std", "min", "max", "skew", "kurt"]).round(2)
        print(stats.to_string())
    # Categorical frequency (object + category dtypes).
    cat_cols = df.select_dtypes(include=["object", "category"]).columns
    for col in cat_cols[:5]:  # limit to first 5 columns to keep output short
        print(f"\n🔤 Top values in '{col}':")
        print(df[col].value_counts().head(5).to_string())
    # Duplicates across full rows.
    dup_count = df.duplicated().sum()
    print(f"\n🔁 Duplicate rows: {dup_count:,} ({dup_count/len(df)*100:.2f}%)")
    # Correlation (numeric only): absolute values, upper triangle only so
    # each pair appears once and self-correlations are excluded.
    if len(numeric_cols) > 1:
        print("\n📉 Correlation Matrix (Top pairs):")
        corr = df[numeric_cols].corr().abs()
        # Get upper triangle pairs
        pairs = (corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
                 .stack().sort_values(ascending=False))
        print(pairs.head(10).to_string())
    return col_info
# Usage
# eda_report(df)
6. 🏭 Real-World Production Pipelines
ETL Pipeline with Pandas
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from pathlib import Path
import logging
# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
logger = logging.getLogger(__name__)
class SalesPipeline:
    """Production-grade ETL pipeline for sales data.

    Flow: extract (CSV) → validate → transform → aggregate → load
    (SQL database + Parquet + Excel). Each stage logs progress via the
    module-level ``logger``.
    """

    def __init__(self, db_url: str, output_dir: str = "./output"):
        # SQLAlchemy engine used by load(); output_dir receives file exports.
        self.engine = create_engine(db_url)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)

    def extract(self, source_path: str) -> pd.DataFrame:
        """E — Extract data from CSV source."""
        logger.info(f"Extracting from {source_path}")
        df = pd.read_csv(
            source_path,
            # IDs are read as strings so leading zeros and formats survive.
            dtype={
                "order_id": str,
                "customer_id": str,
                "product_id": str,
                "quantity": int,
            },
            parse_dates=["order_date"],
            na_values=["", "NULL", "N/A", "null"],
            low_memory=False,
        )
        logger.info(f"Extracted {len(df):,} rows, {df.shape[1]} columns")
        return df

    def validate(self, df: pd.DataFrame) -> pd.DataFrame:
        """Validate data quality.

        Raises ValueError when a required column is missing; drops duplicate
        order_ids (keeping the first) and negative quantities, logging each
        issue as a warning.
        """
        original_len = len(df)
        issues = []
        # Check required columns
        required = ["order_id", "customer_id", "quantity", "order_date"]
        missing = [c for c in required if c not in df.columns]
        if missing:
            raise ValueError(f"Missing required columns: {missing}")
        # Check for duplicate order IDs
        dups = df["order_id"].duplicated().sum()
        if dups > 0:
            issues.append(f"{dups} duplicate order_ids")
            df = df.drop_duplicates(subset=["order_id"], keep="first")
        # Check for negative quantities
        neg = (df["quantity"] < 0).sum()
        if neg > 0:
            issues.append(f"{neg} negative quantities → removed")
            df = df[df["quantity"] >= 0]
        # Log issues
        for issue in issues:
            logger.warning(f"Data quality issue: {issue}")
        logger.info(f"Validation: {original_len:,} → {len(df):,} rows kept")
        return df

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """T — Transform and enrich data.

        NOTE(review): assumes columns ``unit_price`` and ``discount`` exist
        even though validate() does not require them — confirm upstream schema.
        """
        logger.info("Transforming data...")
        # Standardize strings
        df["customer_id"] = df["customer_id"].str.strip().str.upper()
        df["product_id"] = df["product_id"].str.strip().str.upper()
        # Handle missing values
        df["quantity"] = df["quantity"].fillna(0).astype(int)
        df["unit_price"] = df["unit_price"].fillna(df["unit_price"].median())
        # Calculate revenue (discount is treated as a fraction of gross)
        df["gross_revenue"] = df["quantity"] * df["unit_price"]
        df["discount_amount"] = df["gross_revenue"] * df["discount"].fillna(0)
        df["net_revenue"] = df["gross_revenue"] - df["discount_amount"]
        # Date features
        df["order_year"] = df["order_date"].dt.year
        df["order_month"] = df["order_date"].dt.month
        df["order_quarter"] = df["order_date"].dt.quarter
        df["order_weekday"] = df["order_date"].dt.day_name()
        df["is_weekend"] = df["order_date"].dt.weekday >= 5
        # Categorize net revenue into ordinal tiers
        df["revenue_tier"] = pd.cut(
            df["net_revenue"],
            bins=[0, 100, 500, 2000, float("inf")],
            labels=["Low", "Medium", "High", "Premium"]
        )
        logger.info(f"Transform complete. Total revenue: ${df['net_revenue'].sum():,.2f}")
        return df

    def aggregate(self, df: pd.DataFrame) -> dict:
        """Create summary aggregations keyed by output table name."""
        return {
            "daily_summary": df.groupby("order_date").agg(
                orders=("order_id", "count"),
                revenue=("net_revenue", "sum"),
                avg_order_value=("net_revenue", "mean")
            ).round(2),
            "product_summary": df.groupby("product_id").agg(
                orders=("order_id", "count"),
                units_sold=("quantity", "sum"),
                total_revenue=("net_revenue", "sum"),
                avg_price=("unit_price", "mean")
            ).sort_values("total_revenue", ascending=False).round(2),
            "customer_summary": df.groupby("customer_id").agg(
                orders=("order_id", "count"),
                total_spent=("net_revenue", "sum"),
                avg_order=("net_revenue", "mean"),
                first_order=("order_date", "min"),
                last_order=("order_date", "max")
            ).round(2),
        }

    def load(self, df: pd.DataFrame, aggregates: dict):
        """L — Load to database and files.

        Writes the fact table and each aggregate to SQL (replacing existing
        tables), plus a Parquet export and a multi-sheet Excel report.
        """
        logger.info("Loading data...")
        # Load to database (chunked multi-row inserts)
        df.to_sql("orders_fact", self.engine, if_exists="replace",
                  index=False, chunksize=1000, method="multi")
        for table_name, agg_df in aggregates.items():
            agg_df.to_sql(table_name, self.engine, if_exists="replace",
                          index=True, method="multi")
        # Export to Parquet (for data lake)
        df.to_parquet(self.output_dir / "orders_fact.parquet", index=False)
        # Export aggregates to Excel, one sheet per aggregate
        with pd.ExcelWriter(self.output_dir / "sales_report.xlsx") as writer:
            df.to_excel(writer, sheet_name="Raw", index=False)
            for name, agg_df in aggregates.items():
                agg_df.to_excel(writer, sheet_name=name.replace("_", " ").title())
        logger.info(f"Loaded {len(df):,} rows to database and files")

    def run(self, source_path: str):
        """Run the full ETL pipeline, re-raising any failure after logging."""
        logger.info("🚀 Starting ETL pipeline")
        try:
            df = self.extract(source_path)
            df = self.validate(df)
            df = self.transform(df)
            aggregates = self.aggregate(df)
            self.load(df, aggregates)
            logger.info("✅ Pipeline completed successfully")
        except Exception as e:
            logger.error(f"❌ Pipeline failed: {e}")
            raise
# Usage:
# pipeline = SalesPipeline(db_url="sqlite:///sales.db")
# pipeline.run("raw_orders.csv")
Configuration-Driven Data Cleaning
import pandas as pd
import json
# Configuration (can be loaded from JSON/YAML)
config = {
"columns": {
"keep": ["order_id", "customer_id", "product", "quantity", "price", "date"],
"rename": {"customer_id": "cust_id", "product": "product_name"},
"dtypes": {"order_id": "str", "cust_id": "str", "quantity": "int32", "price": "float32"},
"parse_dates": ["date"]
},
"cleaning": {
"drop_duplicates": ["order_id"],
"fillna": {"quantity": 0, "price": "median"},
"filter": "quantity > 0 and price > 0"
},
"derived": {
"revenue": "quantity * price",
"month": "date.dt.month"
}
}
def apply_config(df, config):
    """Apply a configuration dict to clean and transform a DataFrame.

    Supported sections (all optional — partial configs are fine):
      * ``columns``: ``keep``, ``rename``, ``dtypes``, ``parse_dates``
      * ``cleaning``: ``drop_duplicates``, ``fillna``, ``filter``

    Fixes vs. the naive version:
      * missing sections no longer raise KeyError (``.get`` with defaults);
      * ``astype(..., errors="ignore")`` is deprecated in pandas 2.x, so
        failed casts are caught explicitly and the column is left unchanged.
    """
    col_cfg = config.get("columns", {})
    clean_cfg = config.get("cleaning", {})
    # Keep only requested columns that actually exist.
    keep = [c for c in col_cfg.get("keep", df.columns) if c in df.columns]
    df = df[keep]
    # Rename
    df = df.rename(columns=col_cfg.get("rename", {}))
    # Parse dates; unparseable values become NaT rather than raising.
    for col in col_cfg.get("parse_dates", []):
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")
    # Set dtypes; keep the original dtype when the cast fails.
    for col, dtype in col_cfg.get("dtypes", {}).items():
        if col in df.columns:
            try:
                df[col] = df[col].astype(dtype)
            except (ValueError, TypeError):
                pass  # cast not possible for this data — leave as-is
    # Drop duplicates
    dup_cols = clean_cfg.get("drop_duplicates", [])
    if dup_cols:
        df = df.drop_duplicates(subset=dup_cols)
    # Fill NaN; "median"/"mean" are computed per column, anything else is a literal.
    for col, value in clean_cfg.get("fillna", {}).items():
        if col in df.columns:
            if value == "median":
                df[col] = df[col].fillna(df[col].median())
            elif value == "mean":
                df[col] = df[col].fillna(df[col].mean())
            else:
                df[col] = df[col].fillna(value)
    # Filter via query expression
    filter_expr = clean_cfg.get("filter")
    if filter_expr:
        df = df.query(filter_expr)
    return df.reset_index(drop=True)
7. 📊 Data Science Perspective
Statistical Analysis with Pandas
import pandas as pd
import numpy as np
df = pd.DataFrame({
"group": np.repeat(["A", "B", "C"], 100),
"value": np.concatenate([
np.random.normal(100, 15, 100),
np.random.normal(110, 12, 100),
np.random.normal(95, 20, 100)
])
})
# ── Descriptive stats ─────────────────────────────────────────────────
print(df.groupby("group")["value"].describe().round(2))
# ── Percentiles ────────────────────────────────────────────────────────
df.groupby("group")["value"].quantile([0.25, 0.5, 0.75, 0.90, 0.95, 0.99])
# ── Outlier detection: IQR method ─────────────────────────────────────
Q1 = df["value"].quantile(0.25)
Q3 = df["value"].quantile(0.75)
IQR = Q3 - Q1
outliers = df[(df["value"] < Q1 - 1.5 * IQR) | (df["value"] > Q3 + 1.5 * IQR)]
print(f"\nOutliers: {len(outliers)} ({len(outliers)/len(df)*100:.1f}%)")
# ── Outlier detection: Z-score method ────────────────────────────────
from scipy import stats
z_scores = np.abs(stats.zscore(df["value"]))
outliers_z = df[z_scores > 3]
# ── Correlation analysis ───────────────────────────────────────────────
numerical_df = pd.DataFrame(np.random.randn(100, 5), columns=["A", "B", "C", "D", "E"])
corr_matrix = numerical_df.corr()
# Find highly correlated pairs
def get_high_correlations(corr, threshold=0.5):
    """Return variable pairs whose |correlation| exceeds ``threshold``.

    Pairs come from the upper triangle of the correlation matrix (each pair
    once, no self-correlations). Results are sorted by correlation
    *magnitude* — the original sorted by signed value, which pushed strong
    negative correlations to the bottom even though they passed the
    absolute-value filter.
    """
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    pairs = upper.stack().reset_index()
    pairs.columns = ["var1", "var2", "correlation"]
    strong = pairs[pairs["correlation"].abs() > threshold]
    # Sort by |r| descending so -0.9 ranks above +0.6.
    return strong.sort_values("correlation", ascending=False, key=lambda s: s.abs())
high_corr = get_high_correlations(corr_matrix, 0.3)
print("\nHigh Correlations:")
print(high_corr.to_string(index=False))
# ── Distribution analysis ──────────────────────────────────────────────
print(f"\nSkewness: {df['value'].skew():.3f}") # > 0 = right skewed
print(f"Kurtosis: {df['value'].kurt():.3f}") # > 0 = heavy tails
Time Series Analysis for ML
import pandas as pd
import numpy as np
# Create time series
dates = pd.date_range("2020-01-01", periods=730, freq="D")
ts = pd.Series(
np.sin(np.arange(730) * 2 * np.pi / 365) * 50 + 200 +
np.random.randn(730) * 10,
index=dates,
name="demand"
)
df = ts.to_frame()
# ── Feature engineering for time series ML ────────────────────────────
# Lag features
for lag in [1, 7, 14, 30]:
df[f"lag_{lag}"] = df["demand"].shift(lag)
# Rolling features
for window in [7, 14, 30]:
df[f"rolling_mean_{window}"] = df["demand"].rolling(window).mean()
df[f"rolling_std_{window}"] = df["demand"].rolling(window).std()
df[f"rolling_max_{window}"] = df["demand"].rolling(window).max()
# Calendar features
df["day_of_week"] = df.index.dayofweek # 0=Mon, 6=Sun
df["day_of_month"] = df.index.day
df["month"] = df.index.month
df["quarter"] = df.index.quarter
df["is_weekend"] = df.index.dayofweek >= 5
df["week_of_year"] = df.index.isocalendar().week.astype(int)
# Cyclical encoding (avoids the "12 → 1" distance problem in months)
df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)
df["dow_sin"] = np.sin(2 * np.pi * df["day_of_week"] / 7)
df["dow_cos"] = np.cos(2 * np.pi * df["day_of_week"] / 7)
# Drop rows with NaN from lag features
df = df.dropna()
print(f"Feature matrix shape: {df.shape}")
print(f"Features: {df.columns.tolist()}")
8. ⚠️ Advanced Edge Cases and Errors
Error 1: FutureWarning: observed=False in GroupBy
df["dept"] = df["dept"].astype("category")
# ❌ Pandas 2.x warns about this
df.groupby("dept").sum() # FutureWarning: observed=False
# ✅ Always specify observed
df.groupby("dept", observed=True).sum() # only categories that exist
df.groupby("dept", observed=False).sum() # include all categories (even empty)
Error 2: KeyError in resample() — Non-datetime Index
# ❌ resample requires datetime index
df.resample("D").sum() # KeyError or TypeError if index isn't datetime
# ✅ Set datetime index first
df["date"] = pd.to_datetime(df["date"])
df = df.set_index("date").sort_index()
df.resample("D").sum() # works now
Error 3: Duplicate column labels from pd.concat(axis=1) — the concat itself succeeds silently, but later operations on the duplicated column (selection returns a DataFrame; inserts raise "cannot insert a, already exists")
# ❌ Both DataFrames have the same column names + same index
df1 = pd.DataFrame({"a": [1, 2]})
df2 = pd.DataFrame({"a": [3, 4]})
pd.concat([df1, df2], axis=1) # duplicate column "a"
# ✅ Rename or use ignore_index
pd.concat([df1.rename(columns={"a": "a1"}), df2.rename(columns={"a": "a2"})], axis=1)
# Or:
pd.concat([df1, df2], axis=0, ignore_index=True)
Error 4: ValueError when casting float64 containing NaN → int
# ❌ Float NaN can't be converted to int directly
s = pd.Series([1.0, 2.0, np.nan, 4.0])
s.astype(int) # ValueError: Cannot convert non-finite values (NA or inf) to integer
# ✅ Use nullable integer type
s.astype("Int64") # capital I — supports NA
# Output: 1, 2, <NA>, 4
# Or fill first
s.fillna(0).astype(int)
Error 5: Timezone-naive vs Timezone-aware Comparison
# ❌ Comparing naive and aware timestamps
ts_naive = pd.Timestamp("2024-01-01")
ts_aware = pd.Timestamp("2024-01-01", tz="UTC")
ts_naive < ts_aware # TypeError!
# ✅ Normalize both
ts_naive = ts_naive.tz_localize("UTC")
ts_naive < ts_aware # True — works now
9. 💡 Pro Developer Insights
Building a Clean Data Pipeline with pipe()
import pandas as pd
def standardize_columns(df):
    """Normalize column names: strip whitespace, lowercase, and collapse
    runs of spaces/hyphens into single underscores.

    Returns a NEW DataFrame — the input is left untouched, so the function
    is safe to reuse inside a ``pipe()`` chain (the original mutated the
    caller's frame by assigning to ``df.columns``).
    """
    df = df.copy()
    df.columns = df.columns.str.strip().str.lower().str.replace(r"[\s\-]+", "_", regex=True)
    return df
def drop_duplicates(df, subset=None):
    """Drop duplicate rows (optionally keyed on *subset* columns) and
    renumber the index from 0."""
    deduped = df.drop_duplicates(subset=subset)
    return deduped.reset_index(drop=True)
def fill_nulls(df, strategy="median"):
    """Fill NaNs in every numeric column with that column's median or mean.

    Parameters
    ----------
    df : DataFrame
    strategy : {"median", "mean"}

    Returns a NEW DataFrame (the original version mutated the input in
    place). Raises ValueError on an unknown strategy instead of silently
    leaving NaNs behind.
    """
    if strategy not in ("median", "mean"):
        raise ValueError(f"Unknown strategy: {strategy!r} (use 'median' or 'mean')")
    df = df.copy()
    for col in df.select_dtypes(include="number").columns:
        fill_value = df[col].median() if strategy == "median" else df[col].mean()
        df[col] = df[col].fillna(fill_value)
    return df
def remove_outliers_iqr(df, cols, factor=1.5):
    """Drop rows whose value in any of *cols* falls outside the IQR fence
    [Q1 - factor*IQR, Q3 + factor*IQR]; reset the index afterwards."""
    for column in cols:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        spread = q3 - q1
        lower = q1 - factor * spread
        upper = q3 + factor * spread
        df = df[df[column].between(lower, upper)]
    return df.reset_index(drop=True)
# ── Clean pipeline using pipe() ────────────────────────────────────────
cleaned_df = (
df
.pipe(standardize_columns)
.pipe(drop_duplicates, subset=["order_id"])
.pipe(fill_nulls, strategy="median")
.pipe(remove_outliers_iqr, cols=["salary", "age"])
)
print(f"Cleaned: {cleaned_df.shape}")
Testing Your Pandas Code
import pandas as pd
import pytest
def calculate_revenue(df):
    """Return a copy of *df* with a ``revenue`` column = quantity * price.

    The input DataFrame is never modified (``assign`` works on a copy).
    """
    return df.assign(revenue=df["quantity"] * df["price"])
# Unit tests for data transformation functions
class TestCalculateRevenue:
    """Pytest suite for calculate_revenue()."""

    def test_basic(self):
        # Two rows with simple quantities and prices.
        frame = pd.DataFrame({"quantity": [2, 3], "price": [10.0, 20.0]})
        out = calculate_revenue(frame)
        assert "revenue" in out.columns
        assert list(out["revenue"]) == [20.0, 60.0]

    def test_zero_quantity(self):
        # Zero quantity must produce zero revenue.
        frame = pd.DataFrame({"quantity": [0, 5], "price": [100.0, 100.0]})
        out = calculate_revenue(frame)
        assert out["revenue"].iloc[0] == 0.0

    def test_does_not_modify_original(self):
        # The function copies — the caller's frame keeps its columns.
        frame = pd.DataFrame({"quantity": [1], "price": [10.0]})
        before = list(frame.columns)
        calculate_revenue(frame)
        assert list(frame.columns) == before

    def test_shape_preserved(self):
        # Row count is unchanged by the transformation.
        frame = pd.DataFrame({"quantity": range(100), "price": [5.0] * 100})
        out = calculate_revenue(frame)
        assert len(out) == len(frame)
10. 📊 Pandas vs Alternatives — Complete Comparison
Feature | Pandas | Polars | Dask | Spark (PySpark) | Vaex |
|---|---|---|---|---|---|
Release | 2008 | 2021 | 2015 | 2014 | 2012 |
Language | Python/C | Rust | Python | Scala/Python | C++ |
API Style | Object-oriented | Lazy + Eager | Lazy | Lazy (RDD/DF) | Lazy |
Multi-threading | ❌ GIL-limited | ✅ Native | Partial | ✅ Full | ✅ |
Multi-node | ❌ | ❌ | ✅ Dask cluster | ✅ Spark cluster | ❌ |
Memory | Load all | Streaming | Larger-than-RAM | Cluster mem | Out-of-core |
Speed (10M rows) | Baseline | 5-20x faster | Similar | Overhead | 2-10x faster |
Ecosystem | Huge (10+ yrs) | Growing | Good | Huge | Small |
SQL support | Via SQLAlchemy | Native | Via Dask SQL | Native SQL | Limited |
Streaming | ❌ | ❌ | Partial | ✅ Spark Streaming | ❌ |
Learning Curve | Medium | Low | Medium | High | Medium |
Best for | <10GB, EDA | <100GB fast | 10GB-1TB | 1TB+ distributed | 1TB+ single machine |
When to Use What
# Use PANDAS when:
# - Dataset fits comfortably in RAM (<10GB)
# - Quick EDA and data exploration
# - Integration with scikit-learn, statsmodels, matplotlib
# - Team is familiar with it (huge knowledge base)
# Use POLARS when:
# - Speed is critical (reports, dashboards, APIs)
# - Data up to ~100GB on a single machine
# - CPU-bound operations (no GIL limitation)
# - Lazy evaluation needed
# Use DASK when:
# - Data doesn't fit in RAM
# - Want Pandas-like API
# - Single or small cluster setup
# Use SPARK when:
# - Petabyte-scale data
# - Distributed cluster infrastructure exists
# - Streaming data processing
# Use VAEX when:
# - Billions of rows on one machine
# - Memory-mapped operations
# - Visualization of large datasets
11. 🎤 Complete Interview Questions — All Levels
Basic Level (1-2 years experience)
Q1: What is the difference between df.copy() and df.view()?
.copy() creates an independent copy — changes won't affect the original. .view() shares memory — changes will affect the original. (Note: .view() exists only on Series, not DataFrame, and is deprecated in pandas 2.x.) In practice, always use .copy() when you want independence.
Q2: How do you convert a DataFrame column to datetime?
df["date"] = pd.to_datetime(df["date_string"], format="%Y-%m-%d", errors="coerce")
Q3: How do you remove duplicate rows?
df.drop_duplicates() # based on all columns
df.drop_duplicates(subset=["email"]) # based on specific column
df.drop_duplicates(keep="last") # keep last occurrence
df.drop_duplicates(keep=False) # drop ALL duplicates
Q4: What is df.info() used for?
Shows column names, count of non-null values, dtypes, and total memory usage. Essential first step in any data analysis to understand your dataset's structure and quality.
Q5: How do you get the top N rows by a column value?
df.nlargest(5, "salary") # top 5 by salary
df.nsmallest(5, "age") # bottom 5 by age
df.sort_values("salary", ascending=False).head(5) # same result
Q6: What does value_counts(normalize=True) do?
Returns proportions (0 to 1) instead of counts. Multiply by 100 for percentages.
Q7: How do you rename columns in Pandas?
df.rename(columns={"old_name": "new_name"})
df.columns = ["col1", "col2", "col3"] # replace all at once
df.columns = df.columns.str.lower() # transform all
Intermediate Level (2-4 years)
Q8: What is the difference between apply(), map(), and transform()?
Method | Works on | Returns | Use case |
|---|---|---|---|
map() | Series | Series (same length) | Element-wise on Series |
apply() | Series | Any shape | Custom function on Series |
apply() | DataFrame | Series or scalar | Apply along axis |
transform() | Groups | Same shape as input | Group stats back to original |
Q9: How does pd.merge() differ from pd.concat()?
merge() joins tables based on matching key columns/indices (like SQL JOIN). concat() simply stacks DataFrames vertically (rows) or horizontally (columns) without matching on keys.
Q10: What happens if you merge two DataFrames that both have a column called "date"?
Pandas adds suffixes (_x, _y) to disambiguate. Control this with the suffixes=("_left", "_right") parameter.
Q11: How would you calculate a 7-day rolling average for each group separately?
# Option 1: groupby + transform
df["rolling_avg"] = (
df.groupby("group")["value"]
.transform(lambda x: x.rolling(7, min_periods=1).mean())
)
# Option 2: resample per group
result = (
df.set_index("date")
.groupby("group")["value"]
.resample("D")
.sum()
.groupby("group")
.transform(lambda x: x.rolling(7, min_periods=1).mean())
)
Q12: How do you efficiently handle a CSV with 100M rows?
# Strategy 1: Read in chunks
results = []
for chunk in pd.read_csv("huge.csv", chunksize=500_000):
processed = chunk.groupby("category")["revenue"].sum()
results.append(processed)
final = pd.concat(results).groupby(level=0).sum()
# Strategy 2: Use Dask
import dask.dataframe as dd
ddf = dd.read_csv("huge.csv")
result = ddf.groupby("category")["revenue"].sum().compute()
# Strategy 3: Read only needed columns
df = pd.read_csv("huge.csv", usecols=["category", "revenue"])
Q13: What is the difference between pd.cut() and pd.qcut()?
# pd.cut() — fixed-width bins
pd.cut(df["age"], bins=[0, 18, 35, 60, 100], labels=["Teen", "Young", "Mid", "Senior"])
# Bins have equal WIDTH
# pd.qcut() — quantile-based bins
pd.qcut(df["salary"], q=4, labels=["Q1", "Q2", "Q3", "Q4"])
# Bins have equal NUMBER OF DATA POINTS (each quartile has ~25% of data)
Advanced Level (4+ years)
Q14: Explain Copy-on-Write in Pandas 2.0 and why it matters.
CoW changes how Pandas handles copies of DataFrames. Before CoW, slicing returned a view OR a copy (ambiguous). With CoW, all slices are logically independent — modifications create a copy only when needed. This eliminates SettingWithCopyWarning and makes behavior predictable. It also improves performance since copies are deferred until actually needed.
Q15: How would you implement a custom aggregation function in groupby?
def weighted_avg(group):
    """Weighted average of salary by headcount."""
    total_weight = group["headcount"].sum()
    weighted_sum = group["salary"].mul(group["headcount"]).sum()
    return weighted_sum / total_weight
df.groupby("dept").apply(weighted_avg)
# Or with agg (more efficient)
df.groupby("dept")["salary"].agg(
custom_pct=lambda x: (x > x.mean()).mean() * 100 # % above mean
)
Q16: How does Pandas store object dtype internally, and what are the implications?
The object dtype stores Python object pointers — each element is a Python string object in memory. This means no vectorized C operations are possible, high memory overhead (one Python object per element + 8 bytes for the pointer), and slow operations. The new string dtype (or Arrow string) stores data contiguously in memory, enabling SIMD operations and much lower overhead.
Q17: What is MultiIndex and when would you use it?
MultiIndex is a hierarchical index with multiple levels. Use it when data has natural hierarchical structure (dept → team → employee), for pivot table results, or for panel data (stock prices per date per ticker). It enables efficient slicing across any index level using .xs(), .loc, or pd.IndexSlice.
Q18: How do you detect and handle data drift in a Pandas pipeline?
def check_data_drift(reference_df, current_df, numeric_cols):
    """Compare distribution stats between reference and current data.

    For each column in *numeric_cols*, report reference/current means,
    percent change, a z-score of the mean shift, and a drift flag.
    """
    rows = []
    for col in numeric_cols:
        ref_mean = reference_df[col].mean()
        cur_mean = current_df[col].mean()
        ref_std = reference_df[col].std()
        delta = cur_mean - ref_mean
        # Guard against zero/undefined denominators.
        z_score = abs(delta) / ref_std if ref_std > 0 else 0
        pct_change = delta / ref_mean * 100 if ref_mean != 0 else 0
        rows.append({
            "column": col,
            "ref_mean": round(ref_mean, 2),
            "cur_mean": round(cur_mean, 2),
            "pct_change": round(pct_change, 2),
            "z_score": round(z_score, 2),
            "drift_detected": z_score > 3  # threshold
        })
    return pd.DataFrame(rows)
Scenario-Based Questions
Q19: You have a DataFrame with 5M rows and need to find the top 3 customers by revenue per product category. What's your approach?
# Step 1: GroupBy product category AND customer, aggregate revenue
grouped = df.groupby(["category", "customer_id"])["revenue"].sum().reset_index()
# Step 2: Rank within each category
grouped["rank"] = grouped.groupby("category")["revenue"].rank(
method="dense", ascending=False
)
# Step 3: Filter top 3
top3 = grouped[grouped["rank"] <= 3].sort_values(["category", "rank"])
print(top3)
# Alternative using nlargest per group (cleaner)
top3_alt = (
df.groupby(["category", "customer_id"])["revenue"].sum()
.groupby(level="category", group_keys=False)
.nlargest(3)
.reset_index()
)
Q20: Your pipeline breaks when a new CSV comes in with extra columns. How do you handle this?
# Contract for incoming CSVs: required columns and their target dtypes.
EXPECTED_SCHEMA = {
    "order_id": "str",
    "customer_id": "str",
    "amount": "float64",
    "date": "datetime64"
}
def safe_load(filepath):
    """Load a CSV, validate it against EXPECTED_SCHEMA, and coerce dtypes.

    Extra columns are silently dropped; missing required columns raise
    ValueError. Unparseable dates become NaT (errors="coerce").
    """
    df = pd.read_csv(filepath)
    # Fail fast when any required column is absent.
    missing = set(EXPECTED_SCHEMA) - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")
    # Keep only expected columns (ignore extras), in schema order.
    wanted = [name for name in EXPECTED_SCHEMA if name in df.columns]
    df = df[wanted]
    # Coerce each column to its declared dtype.
    for name, dtype in EXPECTED_SCHEMA.items():
        if dtype == "datetime64":
            df[name] = pd.to_datetime(df[name], errors="coerce")
        else:
            df[name] = df[name].astype(dtype)
    return df
Q21: How would you find pairs of customers who bought the same products?
# Self-join approach
orders = pd.DataFrame({
"customer": ["A", "A", "B", "B", "C"],
"product": ["X", "Y", "X", "Z", "Y"]
})
# Merge the table with itself on product
pairs = pd.merge(orders, orders, on="product", suffixes=("_1", "_2"))
# Remove self-pairs and duplicates
pairs = pairs[pairs["customer_1"] < pairs["customer_2"]]
# Count shared products
shared = pairs.groupby(["customer_1", "customer_2"]).size().reset_index(name="shared_products")
print(shared)
12. 🔚 Conclusion — The Complete Pandas Series
What You've Learned Across All 3 Parts
Part 1 — Fundamentals:
Series and DataFrame creation and attributes
All I/O operations: CSV, Excel, JSON, SQL, Parquet
Essential operations: selecting, adding, removing, renaming
Data types and memory optimization
Part 2 — Data Manipulation:
.loc and .iloc indexing
Boolean filtering: .query(), .isin(), .between()
Handling missing data: dropna(), fillna(), interpolate()
GroupBy: agg(), transform(), filter(), apply()
Merge and join (all types): pd.merge(), pd.concat(), .join()
Reshaping: pivot(), pivot_table(), melt(), stack()
String operations with .str
Part 3 — Advanced:
MultiIndex hierarchical indexing
Time series: resample(), rolling(), shift(), time zones
Performance optimization: memory, vectorization, eval/query
Pandas 2.x features: Copy-on-Write, Arrow backend
ML integration: feature engineering, encoding, scikit-learn prep
Production ETL pipelines
Complete interview preparation
🏁 Final Advice
Learn it by doing — take a real dataset and answer real questions with it.
Profile before optimizing — don't guess, measure.
Pandas is not a database — use SQL for 100M+ rows of filtering, then Pandas for analysis.
Vectorize everything — if you're writing a Python loop over rows, you're doing it wrong.
Read the changelog — Pandas releases meaningful updates; Pandas 2.x is a big deal.
Know when to leave Pandas — for 100GB+ data, Polars or Dask might serve you better.
Quick Reference Card
# ── Create ─────────────────────────────────────────────────────────────
pd.DataFrame({"a": [...], "b": [...]})
pd.read_csv("file.csv", usecols=[...], dtype={...}, parse_dates=[...])
# ── Explore ────────────────────────────────────────────────────────────
df.shape, df.dtypes, df.info(), df.describe(), df.isnull().sum()
# ── Select ─────────────────────────────────────────────────────────────
df["col"], df[["col1","col2"]], df.loc[row, col], df.iloc[0, 2]
# ── Filter ─────────────────────────────────────────────────────────────
df[df["a"] > 10], df.query("a > 10 and b == 'X'"), df[df["a"].isin([1,2,3])]
# ── Clean ──────────────────────────────────────────────────────────────
df.dropna(), df.fillna(0), df.drop_duplicates(), df.astype({"col": int})
# ── Transform ──────────────────────────────────────────────────────────
df["new"] = df["a"] * 2 # vectorized
df.assign(new=lambda x: x["a"] * 2)
np.where(condition, true_val, false_val)
# ── GroupBy ────────────────────────────────────────────────────────────
df.groupby("col").agg(sum=("val", "sum"), avg=("val", "mean"))
df.groupby("col")["val"].transform("mean")
# ── Join ───────────────────────────────────────────────────────────────
pd.merge(df1, df2, on="key", how="left")
pd.concat([df1, df2], ignore_index=True)
# ── Reshape ────────────────────────────────────────────────────────────
df.pivot_table(values="val", index="row", columns="col", aggfunc="sum")
pd.melt(df, id_vars=["id"], value_vars=["jan","feb"], var_name="month")
# ── Time Series ────────────────────────────────────────────────────────
df.resample("ME").sum()
df["val"].rolling(7).mean()
df["val"].shift(1)
# ── Export ─────────────────────────────────────────────────────────────
df.to_csv("out.csv", index=False)
df.to_parquet("out.parquet")
df.to_sql("table", engine, if_exists="replace", index=False)
🎯 You now have a complete, production-ready Pandas knowledge base. Bookmark all 3 parts, practice the examples, and you'll handle any data challenge that comes your way.
💬 Have questions? Drop them in the comments!
⭐ If this helped you, share it with your team — they'll thank you for it.