K-means clustering with the fish dataset
# Step 1: Understand data
http://ww2.amstat.org/publications/jse/datasets/fishcatch.txt
https://drive.google.com/open?id=1P2YzTua5ZMEAdxnMbfDwv19VSKY4F8ZI
# Step 2: Load data
# Import modules
import pandas as pd
import numpy as np
# Load the fish data; "fish.csv" is resolved relative to the working directory.
# (String literals use plain ASCII quotes: typographic quotes are a SyntaxError.)
df = pd.read_csv("fish.csv")
# Species name of every row, kept aside to compare with the cluster labels.
y = df["Species"].values
type(y)  # notebook-style inspection; has no effect when run as a script
# Feature matrix: the six columns at positions 1-6.
# NOTE(review): presumably the numeric measurements - confirm against fish.csv.
X = df.iloc[:, 1:7].values
type(X)  # notebook-style inspection; has no effect when run as a script
# Step 3: Work with StandardScaler and KMeans
# import modules
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# Create scaler: scaler
# Standardizing puts every feature on a comparable scale before the
# distance-based clustering step.
scaler = StandardScaler()
# Create KMeans instance: kmeans
# No random_state is set, so the numbering of the clusters may differ
# between runs.
kmeans = KMeans(n_clusters=4)
# Create pipeline: pipeline (scaling is applied before clustering)
pipeline = make_pipeline(scaler, kmeans)
# Fit the pipeline to samples
pipeline.fit(X)
# Calculate the cluster labels: labels
labels = pipeline.predict(X)
# Create a DataFrame with labels and species as columns: df1
# (String literals use plain ASCII quotes: typographic quotes are a SyntaxError.)
df1 = pd.DataFrame({"labels": labels, "species": y})
# Create crosstab: ct (rows are cluster labels, columns are species)
ct = pd.crosstab(df1["labels"], df1["species"])
# Display ct
print(ct)
Tags: CLUSTERING, KMeans