# Example 3 — Preprocessing pipeline

Demonstrates smoothing, blank subtraction, and clustering using `preprocess()`.

Full script: `examples/scripts/04_preprocessing.py`

## Data: two growth phenotypes + blanks

```python
import numpy as np
import pykinbiont
from pykinbiont import GrowthData, FitOptions, ModelSpec, LogLinModel, fit, preprocess

pykinbiont.configure("/path/to/KinBiont.jl")

def logistic(t, K=1.2, mu=0.5, N0=0.01):
    return K / (1 + ((K - N0) / N0) * np.exp(-mu * t))

rng = np.random.default_rng(7)
times = np.linspace(0, 20, 50)

fast = [logistic(times, K=1.1, mu=0.70) + rng.normal(0, 0.015, len(times)) for _ in range(4)]
slow = [logistic(times, K=0.9, mu=0.30) + rng.normal(0, 0.015, len(times)) for _ in range(4)]
blank = [rng.normal(0.02, 0.003, len(times)) for _ in range(2)]

curves = np.stack(fast + slow + blank)
labels = (
    [f"Fast_{i+1}" for i in range(4)]
    + [f"Slow_{i+1}" for i in range(4)]
    + [f"Blank_{i+1}" for i in range(2)]
)
data = GrowthData(curves=curves, times=times, labels=labels)
```

## Smoothing only

```python
opts_smooth = FitOptions(smooth=True, smooth_method="rolling_avg", smooth_pt_avg=5)
smoothed = preprocess(data, opts_smooth)
print(f"After smoothing: shape={smoothed.curves.shape}")
```

## Blank subtraction

```python
blank_mean = float(np.mean(data.curves[-2:]))
opts_blank = FitOptions(
    blank_subtraction=True,
    blank_value=blank_mean,
    correct_negatives=True,
    negative_method="thr_correction",
    negative_threshold=0.001,
)
subtracted = preprocess(data, opts_blank)
print(f"After blank subtraction: min={subtracted.curves.min():.4f}")
```

## Clustering

```python
opts_cluster = FitOptions(cluster=True, n_clusters=3, kmeans_seed=0)
clustered = preprocess(data, opts_cluster)

print("\nCluster assignments:")
for label, cid in zip(data.labels, clustered.clusters):
    print(f" {label:12s} → cluster {cid}")

print(f"\nWCSS: {clustered.wcss:.4f}")
print(f"Centroid matrix: {clustered.centroids.shape}")
```

Expected output — fast growers in one cluster, slow in another, blanks in a third.

## Fit after preprocessing

Cluster first with `preprocess()`, then fit the growth wells only:

```python
growth_labels = labels[:8]  # exclude blank wells

# Step 1: cluster
opts_c = FitOptions(cluster=True, n_clusters=2)
clustered = preprocess(data[growth_labels], opts_c)
assignments = dict(zip(clustered.labels, map(int, clustered.clusters)))

# Step 2: fit
opts_fit = FitOptions(
    smooth=True, smooth_method="rolling_avg", smooth_pt_avg=5,
    blank_subtraction=True, blank_value=blank_mean,
    correct_negatives=True, negative_method="thr_correction", negative_threshold=0.001,
)
spec = ModelSpec(models=[LogLinModel()], params=[[]])
results = fit(data[growth_labels], spec, opts_fit)

df = results.to_dataframe()
df["cluster"] = df["label"].map(assignments)
print(df[["label", "cluster", "best_model", "aic"]])
```