# -*- coding: utf-8 -*-
"""
Created on Mon Feb 27 15:45:28 2017
@author: varun
"""
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as ss
def plotting_points(points, p, n, predicted_outcome):
    """Draw the two groups of ``points`` on a new matplotlib figure.

    The first ``n`` rows of ``points`` are drawn as red circles and the
    remaining rows as blue circles, using column 0 as x and column 1 as y.

    ``p`` and ``predicted_outcome`` are accepted for the caller's
    convenience but are not used by this function.
    """
    plt.figure()
    # (row selection, matplotlib format string) for each group.
    groups = (
        (slice(None, n), "ro"),
        (slice(n, None), "bo"),
    )
    for rows, marker in groups:
        plt.plot(points[rows, 0], points[rows, 1], marker)
def distance(p1, p2):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    p1, p2 : array_like
        Coordinates of the two points. Both are converted with
        ``np.asarray`` and must have the same shape.

    Returns
    -------
    numpy floating scalar
        Square root of the sum of squared coordinate differences.

    Raises
    ------
    ValueError
        If ``p1`` and ``p2`` do not have the same shape.
    """
    p1 = np.asarray(p1)
    p2 = np.asarray(p2)
    if p1.shape != p2.shape:
        # Fail loudly: returning None here would only surface later as an
        # unrelated error in whatever consumes the distance.
        raise ValueError(
            "Distance cannot be found, both arrays have different dimensions: "
            f"{p1.shape} vs {p2.shape}"
        )
    return np.sqrt(np.sum(np.power(p2 - p1, 2)))
def majority_vote(votes):
    """Return the most frequent element of ``votes``.

    When several elements share the highest frequency, one of them is
    picked with ``random.choice``.

    Parameters
    ----------
    votes : iterable of hashable
        The votes to count.

    Returns
    -------
    object
        One of the elements of ``votes`` with the maximum count.

    Raises
    ------
    ValueError
        If ``votes`` is empty.
    """
    import random
    from collections import Counter

    vote_counts = Counter(votes)
    if not vote_counts:
        raise ValueError("majority_vote() requires at least one vote")
    max_count = max(vote_counts.values())
    # Keep every element that reaches the top count so ties are broken fairly.
    winners = [vote for vote, count in vote_counts.items() if count == max_count]
    return random.choice(winners)
def kNeareastNeighbor(p, points, k=5):
    """Return the indices of the ``k`` rows of ``points`` closest to ``p``.

    Closeness is the Euclidean distance between ``p`` and each row.

    Parameters
    ----------
    p : array_like
        The query point; its shape must equal the shape of one row of
        ``points``.
    points : array_like
        The candidate points, one per entry along the first axis.
    k : int, optional
        Number of indices to return (default 5). If ``k`` exceeds the number
        of points, all indices are returned.

    Returns
    -------
    numpy.ndarray
        Indices into ``points``, ordered from nearest to farthest.

    Raises
    ------
    ValueError
        If ``p`` does not have the shape of a single row of ``points``.
    """
    p = np.asarray(p)
    points = np.asarray(points)
    if points.ndim == 0 or points.shape[1:] != p.shape:
        # Without this check broadcasting could silently accept a mismatched p.
        raise ValueError(
            "p must have the shape of one row of points: "
            f"got {p.shape} for points of shape {points.shape}"
        )
    diff = points - p
    # Reduce over every axis except the first so each row yields one distance.
    row_axes = tuple(range(1, diff.ndim))
    distances = np.sqrt(np.sum(np.power(diff, 2), axis=row_axes))
    return np.argsort(distances)[:k]
def kNN_Predict_userdefined(p, points, outcomes, k=5):
    """Predict the outcome of ``p`` from its ``k`` nearest neighbours.

    The indices of the nearest rows of ``points`` are obtained from
    ``kNeareastNeighbor``; the matching entries of ``outcomes`` are then
    passed to ``majority_vote`` and its result is returned.
    """
    neighbor_indices = kNeareastNeighbor(p, points, k)
    neighbor_outcomes = outcomes[neighbor_indices]
    return majority_vote(neighbor_outcomes)
def generate_synthetic_data(n=50, random_state=None):
    """Generate two classes of synthetic 2-D points with their labels.

    The first ``n`` points are drawn from a normal distribution with mean 0
    and standard deviation 1 in each coordinate; the next ``n`` points use
    mean 1 and standard deviation 1.

    Parameters
    ----------
    n : int, optional
        Number of points per class (default 50).
    random_state : None, int or random generator, optional
        ``None`` (default) draws from SciPy's default random state, exactly
        as before this parameter existed. An integer seeds a fresh
        ``numpy.random.RandomState`` shared by both classes. Any other value
        is forwarded unchanged to ``rvs``.

    Returns
    -------
    points : numpy.ndarray, shape (2 * n, 2)
        Class-0 points followed by class-1 points.
    outcomes : numpy.ndarray, shape (2 * n,)
        ``0`` for the first ``n`` rows and ``1`` for the remaining ``n``.
    """
    rng = random_state
    if isinstance(rng, (int, np.integer)):
        # Share one generator between both draws: seeding each rvs() call
        # with the same integer would make class 1 an exact shifted copy of
        # class 0.
        rng = np.random.RandomState(rng)
    class_zero = ss.norm(0, 1).rvs((n, 2), random_state=rng)
    class_one = ss.norm(1, 1).rvs((n, 2), random_state=rng)
    points = np.concatenate((class_zero, class_one), axis=0)
    outcomes = np.concatenate((np.repeat(0, n), np.repeat(1, n)), axis=0)
    return points, outcomes
# --------------------------------------------------------------------------------------------------------------------------
# Order of execution
# Generate n points per class, classify one query point and plot the data.
n = 200
points, outcomes = generate_synthetic_data(n)
# NOTE(review): the original script used `p` without defining it; this query
# point is a placeholder -- replace it with the point you want to classify.
p = np.array([0.5, 0.5])
# The predictor defined in this file is kNN_Predict_userdefined; the name
# kNN_Predict used previously is not defined here.
predicted_outcome = kNN_Predict_userdefined(p, points, outcomes)
plotting_points(points, p, n, predicted_outcome)
"""
Created on Mon Feb 27 15:45:28 2017
@author: varun
"""
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as ss
def plotting_points(points, p, n, predicted_outcome):
    """Draw the two groups of ``points`` on a new matplotlib figure.

    The first ``n`` rows of ``points`` are drawn as red circles and the
    remaining rows as blue circles, using column 0 as x and column 1 as y.

    ``p`` and ``predicted_outcome`` are accepted for the caller's
    convenience but are not used by this function.
    """
    plt.figure()
    # (row selection, matplotlib format string) for each group.
    groups = (
        (slice(None, n), "ro"),
        (slice(n, None), "bo"),
    )
    for rows, marker in groups:
        plt.plot(points[rows, 0], points[rows, 1], marker)
def distance(p1, p2):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    p1, p2 : array_like
        Coordinates of the two points. Both are converted with
        ``np.asarray`` and must have the same shape.

    Returns
    -------
    numpy floating scalar
        Square root of the sum of squared coordinate differences.

    Raises
    ------
    ValueError
        If ``p1`` and ``p2`` do not have the same shape.
    """
    p1 = np.asarray(p1)
    p2 = np.asarray(p2)
    if p1.shape != p2.shape:
        # Fail loudly: returning None here would only surface later as an
        # unrelated error in whatever consumes the distance.
        raise ValueError(
            "Distance cannot be found, both arrays have different dimensions: "
            f"{p1.shape} vs {p2.shape}"
        )
    return np.sqrt(np.sum(np.power(p2 - p1, 2)))
def majority_vote(votes):
    """Return the most frequent element of ``votes``.

    When several elements share the highest frequency, one of them is
    picked with ``random.choice``.

    Parameters
    ----------
    votes : iterable of hashable
        The votes to count.

    Returns
    -------
    object
        One of the elements of ``votes`` with the maximum count.

    Raises
    ------
    ValueError
        If ``votes`` is empty.
    """
    import random
    from collections import Counter

    vote_counts = Counter(votes)
    if not vote_counts:
        raise ValueError("majority_vote() requires at least one vote")
    max_count = max(vote_counts.values())
    # Keep every element that reaches the top count so ties are broken fairly.
    winners = [vote for vote, count in vote_counts.items() if count == max_count]
    return random.choice(winners)
def kNeareastNeighbor(p, points, k=5):
    """Return the indices of the ``k`` rows of ``points`` closest to ``p``.

    Closeness is the Euclidean distance between ``p`` and each row.

    Parameters
    ----------
    p : array_like
        The query point; its shape must equal the shape of one row of
        ``points``.
    points : array_like
        The candidate points, one per entry along the first axis.
    k : int, optional
        Number of indices to return (default 5). If ``k`` exceeds the number
        of points, all indices are returned.

    Returns
    -------
    numpy.ndarray
        Indices into ``points``, ordered from nearest to farthest.

    Raises
    ------
    ValueError
        If ``p`` does not have the shape of a single row of ``points``.
    """
    p = np.asarray(p)
    points = np.asarray(points)
    if points.ndim == 0 or points.shape[1:] != p.shape:
        # Without this check broadcasting could silently accept a mismatched p.
        raise ValueError(
            "p must have the shape of one row of points: "
            f"got {p.shape} for points of shape {points.shape}"
        )
    diff = points - p
    # Reduce over every axis except the first so each row yields one distance.
    row_axes = tuple(range(1, diff.ndim))
    distances = np.sqrt(np.sum(np.power(diff, 2), axis=row_axes))
    return np.argsort(distances)[:k]
def kNN_Predict_userdefined(p, points, outcomes, k=5):
    """Predict the outcome of ``p`` from its ``k`` nearest neighbours.

    The indices of the nearest rows of ``points`` are obtained from
    ``kNeareastNeighbor``; the matching entries of ``outcomes`` are then
    passed to ``majority_vote`` and its result is returned.
    """
    neighbor_indices = kNeareastNeighbor(p, points, k)
    neighbor_outcomes = outcomes[neighbor_indices]
    return majority_vote(neighbor_outcomes)
def generate_synthetic_data(n=50, random_state=None):
    """Generate two classes of synthetic 2-D points with their labels.

    The first ``n`` points are drawn from a normal distribution with mean 0
    and standard deviation 1 in each coordinate; the next ``n`` points use
    mean 1 and standard deviation 1.

    Parameters
    ----------
    n : int, optional
        Number of points per class (default 50).
    random_state : None, int or random generator, optional
        ``None`` (default) draws from SciPy's default random state, exactly
        as before this parameter existed. An integer seeds a fresh
        ``numpy.random.RandomState`` shared by both classes. Any other value
        is forwarded unchanged to ``rvs``.

    Returns
    -------
    points : numpy.ndarray, shape (2 * n, 2)
        Class-0 points followed by class-1 points.
    outcomes : numpy.ndarray, shape (2 * n,)
        ``0`` for the first ``n`` rows and ``1`` for the remaining ``n``.
    """
    rng = random_state
    if isinstance(rng, (int, np.integer)):
        # Share one generator between both draws: seeding each rvs() call
        # with the same integer would make class 1 an exact shifted copy of
        # class 0.
        rng = np.random.RandomState(rng)
    class_zero = ss.norm(0, 1).rvs((n, 2), random_state=rng)
    class_one = ss.norm(1, 1).rvs((n, 2), random_state=rng)
    points = np.concatenate((class_zero, class_one), axis=0)
    outcomes = np.concatenate((np.repeat(0, n), np.repeat(1, n)), axis=0)
    return points, outcomes
# --------------------------------------------------------------------------------------------------------------------------
# Order of execution
# Generate n points per class, classify one query point and plot the data.
n = 200
points, outcomes = generate_synthetic_data(n)
# NOTE(review): the original script used `p` without defining it; this query
# point is a placeholder -- replace it with the point you want to classify.
p = np.array([0.5, 0.5])
# The predictor defined in this file is kNN_Predict_userdefined; the name
# kNN_Predict used previously is not defined here.
predicted_outcome = kNN_Predict_userdefined(p, points, outcomes)
plotting_points(points, p, n, predicted_outcome)