Example 3 — Preprocessing pipeline
Demonstrates smoothing, blank subtraction, and clustering using preprocess().
Full script: examples/scripts/04_preprocessing.py
Data: two growth phenotypes + blanks
import numpy as np
import pykinbiont
from pykinbiont import GrowthData, FitOptions, ModelSpec, LogLinModel, fit, preprocess
# "/path/to/KinBiont.jl" is a placeholder — presumably it should be replaced
# with the location of your KinBiont.jl checkout before running (confirm
# against the pykinbiont.configure documentation).
pykinbiont.configure("/path/to/KinBiont.jl")
def logistic(t, K=1.2, mu=0.5, N0=0.01):
    """Evaluate a logistic growth curve at time(s) ``t``.

    Computes ``K / (1 + ((K - N0) / N0) * exp(-mu * t))``. The curve equals
    ``N0`` at ``t = 0`` and tends to ``K`` as ``mu * t`` becomes large.

    Args:
        t: Time point(s): a scalar or any array-like (list, tuple, ndarray).
        K: Upper asymptote of the curve.
        mu: Rate coefficient in the exponent.
        N0: Curve value at ``t = 0``; must be non-zero (it is a divisor).

    Returns:
        The curve value(s), with the same shape as ``t``.
    """
    # Coerce up front so plain Python sequences are accepted as well:
    # ``-mu * [0, 1]`` would otherwise raise TypeError.
    t = np.asarray(t)
    return K / (1 + ((K - N0) / N0) * np.exp(-mu * t))
# Synthetic plate: 4 fast-growing wells, 4 slow-growing wells and 2 blank
# wells, all sampled at the same 50 evenly spaced time points on [0, 20].
rng = np.random.default_rng(7)  # fixed seed, so the noise is the same on every run
times = np.linspace(0, 20, 50)
n_points = len(times)

# The noise-free curves are deterministic, so evaluate each one once and
# add fresh noise per replicate.
fast_clean = logistic(times, K=1.1, mu=0.70)
slow_clean = logistic(times, K=0.9, mu=0.30)

# Keep the draws in this order (fast, slow, blank): every rng.normal call
# advances the generator, so reordering would change the generated data.
fast = [fast_clean + rng.normal(0, 0.015, n_points) for _ in range(4)]
slow = [slow_clean + rng.normal(0, 0.015, n_points) for _ in range(4)]
blank = [rng.normal(0.02, 0.003, n_points) for _ in range(2)]

# One row per well, in the same order as `labels` below.
curves = np.stack([*fast, *slow, *blank])

labels = [f"Fast_{i+1}" for i in range(4)]
labels += [f"Slow_{i+1}" for i in range(4)]
labels += [f"Blank_{i+1}" for i in range(2)]

data = GrowthData(times=times, curves=curves, labels=labels)
Smoothing only
# Request smoothing with the "rolling_avg" method and smooth_pt_avg=5
# (presumably the rolling-window size in points — confirm in the FitOptions docs).
opts_smooth = FitOptions(
    smooth=True,
    smooth_method="rolling_avg",
    smooth_pt_avg=5,
)

# preprocess() returns a new object; its curves are inspected here via .shape.
smoothed = preprocess(data, opts_smooth)
print(f"After smoothing: shape={smoothed.curves.shape}")
Blank subtraction
# Collapse the last two rows of the curve matrix into a single scalar blank
# level. In this example those rows are the two blank wells (assuming
# GrowthData keeps the row order it was given).
blank_rows = data.curves[-2:]
blank_mean = float(np.mean(blank_rows))

# Subtract that scalar and enable negative-value correction with the
# "thr_correction" method and a threshold of 0.001 (exact semantics of the
# correction: see the FitOptions documentation).
opts_blank = FitOptions(
    blank_subtraction=True,
    blank_value=blank_mean,
    correct_negatives=True,
    negative_method="thr_correction",
    negative_threshold=0.001,
)

subtracted = preprocess(data, opts_blank)
print(f"After blank subtraction: min={subtracted.curves.min():.4f}")
Clustering
# Ask preprocess() for 3 clusters; kmeans_seed is pinned so that repeated
# runs are intended to give the same assignment.
opts_cluster = FitOptions(cluster=True, n_clusters=3, kmeans_seed=0)
clustered = preprocess(data, opts_cluster)

# One line per well: its label and the cluster id it was assigned.
print("\nCluster assignments:")
for label, cid in zip(data.labels, clustered.clusters):
    print(f" {label:12s} → cluster {cid}")

# Summary values exposed on the clustered result.
print(f"\nWCSS: {clustered.wcss:.4f}")
print(f"Centroid matrix: {clustered.centroids.shape}")
Expected output: the fast growers fall in one cluster, the slow growers in another, and the blanks in a third.
Fit after preprocessing
Cluster first with preprocess(), then fit the growth wells only:
growth_labels = labels[:8]  # exclude blank wells (the last two labels)

# Step 1: cluster the growth wells only.
# kmeans_seed is pinned, matching the three-cluster example, so that the
# cluster ids joined onto the results table below are intended to be
# reproducible between runs.
opts_c = FitOptions(cluster=True, n_clusters=2, kmeans_seed=0)
clustered = preprocess(data[growth_labels], opts_c)
# label -> cluster id, with ids converted to plain Python ints.
assignments = dict(zip(clustered.labels, map(int, clustered.clusters)))

# Step 2: fit. Smoothing and blank subtraction are requested through the
# fit options here, so fit() is handed the raw (un-preprocessed) growth wells.
opts_fit = FitOptions(
    smooth=True,
    smooth_method="rolling_avg",
    smooth_pt_avg=5,
    blank_subtraction=True,
    blank_value=blank_mean,
    correct_negatives=True,
    negative_method="thr_correction",
    negative_threshold=0.001,
)
spec = ModelSpec(models=[LogLinModel()], params=[[]])
results = fit(data[growth_labels], spec, opts_fit)

# Attach each well's cluster id to its row of fit results via the label.
df = results.to_dataframe()
df["cluster"] = df["label"].map(assignments)
print(df[["label", "cluster", "best_model", "aic"]])