Quick Start Guide

Basic Usage

Functional API

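The simplest way to get started is to call the kmeans function, which returns the cluster centroids and a label for each data point: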

import numpy as np
from kmeans import kmeans

# Generate sample data
data = np.random.randn(1000, 2)

# Perform clustering
centroids, labels = kmeans(data, k=5)

print(f"Centroids:\n{centroids}")
print(f"Labels: {labels}")

Object-Oriented API

For a scikit-learn-compatible interface, use the KMeans class:

from kmeans import KMeans

# Create and fit the model
model = KMeans(n_clusters=5, max_iter=100)
model.fit(data)

# Access results
print(f"Centroids:\n{model.centroids_}")
print(f"Labels: {model.labels_}")

# Predict on new data
new_data = np.random.randn(100, 2)
predictions = model.predict(new_data)

Complete Example

import numpy as np
import matplotlib.pyplot as plt
from kmeans import KMeans

# Generate three clusters
np.random.seed(42)
cluster1 = np.random.randn(100, 2) + [0, 0]
cluster2 = np.random.randn(100, 2) + [5, 5]
cluster3 = np.random.randn(100, 2) + [10, 0]
data = np.vstack([cluster1, cluster2, cluster3])

# Fit k-means
kmeans_model = KMeans(n_clusters=3, max_iter=100, tol=1e-4)
kmeans_model.fit(data)

# Plot results
plt.scatter(data[:, 0], data[:, 1], c=kmeans_model.labels_, cmap='viridis')
plt.scatter(kmeans_model.centroids_[:, 0],
            kmeans_model.centroids_[:, 1],
            c='red', marker='x', s=200, linewidths=3)
plt.title('K-Means Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

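Working with Different Dimensions

The snippets in this section assume that np, kmeans, and KMeans have already been imported as shown in Basic Usage.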

1D Data

# 1D clustering
data_1d = np.random.randn(500)
centroids, labels = kmeans(data_1d, k=3)
print(f"Shape: {centroids.shape}")  # (3, 1)

High-Dimensional Data

# 10D clustering
data_10d = np.random.randn(1000, 10)
model = KMeans(n_clusters=5)
model.fit(data_10d)
print(f"Centroids shape: {model.centroids_.shape}")  # (5, 10)

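Integration with scikit-learn

Because KMeans exposes a scikit-learn-compatible interface, it can be used as a step in a Pipeline. In the example below, data is any 2D array, such as the one built in the Complete Example: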

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from kmeans import KMeans

# Create a pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=3))
])

# Fit the pipeline
pipeline.fit(data)

# Get labels
labels = pipeline.named_steps['kmeans'].labels_