Data Mining

K-Nearest Neighbor (KNN)

Used to classify new data points based on “distance” to known data

Find the K nearest neighbors, based on your distance metric

Let them all vote on the classification
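As a minimal sketch of the idea before applying it to movie data, here is KNN classification with scikit-learn's KNeighborsClassifier (the toy points and labels below are made up purely for illustration):

from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Two toy clusters: class 0 near the origin, class 1 near (8, 8)
X_train = np.array([[1, 1], [1, 2], [2, 1], [8, 8], [8, 9], [9, 8]])
y_train = np.array([0, 0, 0, 1, 1, 1])

# With K = 3, the three nearest training points vote on each prediction
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn.predict([[2, 2], [9, 9]])   # expected: array([0, 1])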

import pandas as pd
import numpy as np

# Load the MovieLens 100k ratings (tab-separated); keep only user, movie, and rating
r_cols = ["user_id", "movie_id", "rating"]
df = pd.read_csv("data/ml-100k/u.data", sep="\t", names=r_cols, usecols=range(3))
df.head()
   user_id  movie_id  rating
0        0        50       5
1        0       172       5
2        0       133       1
3      196       242       3
4      186       302       3
# For each movie: number of ratings (size) and mean rating
movieProperties = df.groupby("movie_id").agg({"rating": ["size", "mean"]})
movieProperties.head()
         rating          
           size      mean
movie_id                 
1           452  3.878319
2           131  3.206107
3            90  3.033333
4           209  3.550239
5            86  3.302326
# Build a 0-to-1 popularity score from the raw rating counts (min-max scaling)
movieNumRatings = pd.DataFrame(movieProperties["rating"]["size"])
movieNormalizedRatings = movieNumRatings.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
movieNormalizedRatings.head()
              size
movie_id          
1         0.773585
2         0.222985
3         0.152659
4         0.356775
5         0.145798
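To see what that lambda is doing, here is min-max scaling applied to a toy array (illustrative numbers, not from the dataset): the smallest value maps to 0 and the largest to 1.

x = np.array([10, 20, 40])
(x - np.min(x)) / (np.max(x) - np.min(x))   # array([0.        , 0.33333333, 1.        ])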
movieDict = {}
# u.item is pipe-delimited and Latin-1 encoded; fields 5-23 hold 19 binary genre flags
with open("data/ml-100k/u.item", encoding="ISO-8859-1") as f:
    for line in f:
        fields = line.rstrip('\n').split('|')
        movieID = int(fields[0])
        name = fields[1]
        genres = np.array([int(g) for g in fields[5:24]])
        movieDict[movieID] = (name,
                              genres,
                              movieNormalizedRatings.loc[movieID].get("size"),
                              movieProperties.loc[movieID].rating.get("mean"))
movieDict[1]
('Toy Story (1995)',
 array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 0.7735849056603774,
 3.8783185840707963)
from scipy import spatial

def ComputeDistance(a, b):
    # Combine two signals: cosine distance between the genre vectors,
    # plus the absolute difference in normalized popularity
    genresA = a[1]
    genresB = b[1]
    genreDistance = spatial.distance.cosine(genresA, genresB)
    popularityA = a[2]
    popularityB = b[2]
    popularityDistance = abs(popularityA - popularityB)
    return genreDistance + popularityDistance
ComputeDistance(movieDict[2], movieDict[4])
0.8004574042309892
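A quick sanity check (added here, not in the original run): a movie's distance to itself should be zero, since identical genre vectors have cosine distance 0 and the popularity difference vanishes.

ComputeDistance(movieDict[1], movieDict[1])   # 0.0, up to floating-point error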
import operator

def getNeighbors(movieID, K):
    # Brute force: measure the distance from movieID to every other movie,
    # sort by distance, and return the K closest movie IDs
    distances = []
    for movie in movieDict:
        if movie != movieID:
            dist = ComputeDistance(movieDict[movieID], movieDict[movie])
            distances.append((movie, dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(K):
        neighbors.append(distances[x][0])
    return neighbors
K = 10
avgRating = 0
# The 10 nearest neighbors of Toy Story (movie ID 1), and their average rating
neighbors = getNeighbors(1, K)
for neighbor in neighbors:
    avgRating += movieDict[neighbor][3]
    print(movieDict[neighbor][0] + " " + str(movieDict[neighbor][3]))
Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127
George of the Jungle (1997) 2.685185185185185
Beavis and Butt-head Do America (1996) 2.7884615384615383
Birdcage, The (1996) 3.4436860068259385
Home Alone (1990) 3.0875912408759123
Aladdin and the King of Thieves (1996) 2.8461538461538463
avgRating /= float(K)
avgRating
3.3445905900235564
For comparison, here is Toy Story's own entry again: its actual mean rating of 3.88 is reasonably close to the 3.34 predicted from its neighbors.

movieDict[1]
('Toy Story (1995)',
 array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 0.7735849056603774,
 3.8783185840707963)

Principal Component Analysis

PCA is a dimensionality reduction technique: it distills multi-dimensional data down to fewer dimensions by choosing new axes (the principal components) that preserve as much of the data's variance as possible.
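Before the scikit-learn example below, here is a minimal from-scratch sketch of the idea (my own illustrative code, not part of the original notebook): the principal components are the eigenvectors of the data's covariance matrix, ordered by decreasing eigenvalue, and projecting onto the top ones reduces the dimensionality.

import numpy as np

# Illustrative 2D data with one dominant direction of variance
rng = np.random.default_rng(0)
data = rng.normal(size=(100, 2)) @ np.array([[3.0, 0.0], [0.0, 0.5]])

centered = data - data.mean(axis=0)      # PCA operates on mean-centered data
cov = np.cov(centered, rowvar=False)     # 2x2 covariance matrix
eigvals, eigvecs = np.linalg.eigh(cov)   # eigh returns eigenvalues in ascending order
order = np.argsort(eigvals)[::-1]        # sort components by decreasing variance
components = eigvecs[:, order].T         # each row is one principal component
reduced = centered @ components[:1].T    # project onto the top component (2D -> 1D)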

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import pylab as pl
from itertools import cycle

iris = load_iris()

# 150 samples (flowers), each with 4 features (sepal/petal length and width)
numSamples, numFeatures = iris.data.shape
numSamples, numFeatures
(150, 4)
list(iris.target_names)
['setosa', 'versicolor', 'virginica']
# Project the 4D iris measurements down to their 2 principal components
X = iris.data
pca = PCA(n_components=2, whiten=True).fit(X)
X_pca = pca.transform(X)
pca.components_
array([[ 0.36138659, -0.08452251,  0.85667061,  0.3582892 ],
       [ 0.65658877,  0.73016143, -0.17337266, -0.07548102]])
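Each row of components_ is a unit-length direction in the original 4D feature space; this quick check is added here for illustration:

np.linalg.norm(pca.components_, axis=1)   # array([1., 1.])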
pca.explained_variance_ratio_
array([0.92461872, 0.05306648])
# Together the two components preserve about 98% of the variance
sum(pca.explained_variance_ratio_)
0.9776852063187949
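As an aside (not in the original notebook), scikit-learn's PCA also accepts a float for n_components, keeping however many components are needed to explain that fraction of the variance:

pca95 = PCA(n_components=0.95).fit(X)
pca95.n_components_   # 2 -- the first component alone explains only ~92.5%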
colors = cycle("rgb")
target_ids = range(len(iris.target_names))
pl.figure()
# Plot each species in its own color in the 2D principal-component space
for i, c, label in zip(target_ids, colors, iris.target_names):
    pl.scatter(X_pca[iris.target == i, 0], X_pca[iris.target == i, 1],
               c=c, label=label)
pl.legend()
pl.show()

[Scatter plot: iris samples projected onto the first two principal components, one color per species]