# Example 3 — Preprocessing pipeline

Demonstrates smoothing, blank subtraction, and clustering using `preprocess()`.

Full script: `examples/scripts/04_preprocessing.py`

## Data: two growth phenotypes + blanks

```python
import numpy as np
import pykinbiont
from pykinbiont import GrowthData, FitOptions, ModelSpec, LogLinModel, fit, preprocess

# Placeholder path: replace it with the location of KinBiont.jl on your machine.
pykinbiont.configure("/path/to/KinBiont.jl")

def logistic(t, K=1.2, mu=0.5, N0=0.01):
    """Evaluate a logistic growth curve at time(s) ``t``.

    Args:
        t: Time point, scalar or NumPy array.
        K: Plateau value the curve approaches as ``t`` grows (for ``mu > 0``).
        mu: Rate constant in the exponent.
        N0: Value of the curve at ``t = 0``; must be non-zero.

    Returns:
        The curve value(s), with the same shape as ``t``.
    """
    headroom_ratio = (K - N0) / N0
    decay = np.exp(-mu * t)
    return K / (1 + headroom_ratio * decay)

# Fixed seed so the synthetic noise is the same on every run.
rng = np.random.default_rng(7)
times = np.linspace(0, 20, 50)

# Noise is drawn in the order fast -> slow -> blank; keep that order so the
# seeded stream produces the same curves.
fast = [
    logistic(times, K=1.1, mu=0.70) + rng.normal(0, 0.015, len(times))
    for _ in range(4)
]
slow = [
    logistic(times, K=0.9, mu=0.30) + rng.normal(0, 0.015, len(times))
    for _ in range(4)
]
blank = [rng.normal(0.02, 0.003, len(times)) for _ in range(2)]

# One row per well: 4 fast, 4 slow, 2 blank, with labels in the same order.
curves = np.stack([*fast, *slow, *blank])
labels = [
    f"{group}_{i+1}"
    for group, count in (("Fast", 4), ("Slow", 4), ("Blank", 2))
    for i in range(count)
]
data = GrowthData(curves=curves, times=times, labels=labels)
```

## Smoothing only

```python
# Only the smoothing options are set here; no other FitOptions argument is passed.
opts_smooth = FitOptions(
    smooth=True,
    smooth_method="rolling_avg",
    smooth_pt_avg=5,
)
smoothed = preprocess(data, opts_smooth)
print(f"After smoothing: shape={smoothed.curves.shape}")
```

## Blank subtraction

```python
# The blank wells were stacked last when `curves` was built, so the final two
# rows are the blanks (assuming GrowthData keeps the row order it was given).
blank_rows = data.curves[-2:]
# Collapse every blank reading into a single scalar background value.
blank_mean = float(np.mean(blank_rows))

opts_blank = FitOptions(
    blank_subtraction=True,
    blank_value=blank_mean,
    correct_negatives=True,
    negative_method="thr_correction",
    negative_threshold=0.001,
)
subtracted = preprocess(data, opts_blank)
print(f"After blank subtraction: min={subtracted.curves.min():.4f}")
```

## Clustering

```python
# Three clusters are requested, one per group in the synthetic data.
# kmeans_seed is fixed, presumably so the k-means result is repeatable.
opts_cluster = FitOptions(cluster=True, n_clusters=3, kmeans_seed=0)
clustered = preprocess(data, opts_cluster)

print("\nCluster assignments:")
# Pair each cluster ID with the label carried by the clustered result itself,
# so the pairing does not rely on `data` and `clustered` sharing row order.
for label, cid in zip(clustered.labels, clustered.clusters):
    print(f"  {label:12s} → cluster {cid}")

print(f"\nWCSS: {clustered.wcss:.4f}")
print(f"Centroid matrix: {clustered.centroids.shape}")
```

Expected output: the fast growers share one cluster, the slow growers another, and the blanks a third.

## Fit after preprocessing

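Cluster the growth wells with `preprocess()`, then fit the same wells with `fit()`. The blank wells are excluded from both steps, and `blank_mean` is reused from the blank-subtraction section: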

```python
growth_labels = labels[:8]   # exclude blank wells
# Subset once and reuse it, so clustering and fitting see exactly the same wells.
growth_data = data[growth_labels]

# Step 1: cluster
# kmeans_seed=0 matches the seed used in the Clustering section, so the
# cluster column printed below should not vary between runs.
opts_c = FitOptions(cluster=True, n_clusters=2, kmeans_seed=0)
clustered = preprocess(growth_data, opts_c)
# Map each well label to its cluster ID as a plain Python int.
assignments = {
    label: int(cid)
    for label, cid in zip(clustered.labels, clustered.clusters)
}

# Step 2: fit
# `blank_mean` is the scalar computed in the blank-subtraction section.
opts_fit = FitOptions(
    smooth=True,
    smooth_method="rolling_avg",
    smooth_pt_avg=5,
    blank_subtraction=True,
    blank_value=blank_mean,
    correct_negatives=True,
    negative_method="thr_correction",
    negative_threshold=0.001,
)
spec    = ModelSpec(models=[LogLinModel()], params=[[]])
results = fit(growth_data, spec, opts_fit)

# Attach the cluster IDs from step 1 to the fit results by well label.
df = results.to_dataframe()
df["cluster"] = df["label"].map(assignments)
print(df[["label", "cluster", "best_model", "aic"]])
```