Quick Start Guide
Basic Usage
Functional API
The simplest way to run k-means is to call the kmeans function, which returns the centroids and the labels:
import numpy as np
from kmeans import kmeans
# Generate sample data
data = np.random.randn(1000, 2)
# Perform clustering
centroids, labels = kmeans(data, k=5)
print(f"Centroids:\n{centroids}")
print(f"Labels: {labels}")
Object-Oriented API
For a scikit-learn-compatible interface, use the KMeans class:
from kmeans import KMeans
# Create and fit the model
model = KMeans(n_clusters=5, max_iter=100)
model.fit(data)
# Access results
print(f"Centroids:\n{model.centroids_}")
print(f"Labels: {model.labels_}")
# Predict on new data
new_data = np.random.randn(100, 2)
predictions = model.predict(new_data)
Complete Example
import numpy as np
import matplotlib.pyplot as plt
from kmeans import KMeans
# Generate three clusters
np.random.seed(42)
cluster1 = np.random.randn(100, 2) + [0, 0]
cluster2 = np.random.randn(100, 2) + [5, 5]
cluster3 = np.random.randn(100, 2) + [10, 0]
data = np.vstack([cluster1, cluster2, cluster3])
# Fit k-means
kmeans_model = KMeans(n_clusters=3, max_iter=100, tol=1e-4)
kmeans_model.fit(data)
# Plot results
plt.scatter(data[:, 0], data[:, 1], c=kmeans_model.labels_, cmap='viridis')
plt.scatter(kmeans_model.centroids_[:, 0],
kmeans_model.centroids_[:, 1],
c='red', marker='x', s=200, linewidths=3)
plt.title('K-Means Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
Working with Different Dimensions
1D Data
# 1D clustering
data_1d = np.random.randn(500)
centroids, labels = kmeans(data_1d, k=3)
print(f"Shape: {centroids.shape}") # (3, 1)
High-Dimensional Data
# 10D clustering
data_10d = np.random.randn(1000, 10)
model = KMeans(n_clusters=5)
model.fit(data_10d)
print(f"Centroids shape: {model.centroids_.shape}") # (5, 10)
Integration with scikit-learn
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from kmeans import KMeans
# Create a pipeline
pipeline = Pipeline([
('scaler', StandardScaler()),
('kmeans', KMeans(n_clusters=3))
])
# Fit the pipeline
pipeline.fit(data)
# Get labels
labels = pipeline.named_steps['kmeans'].labels_