###############################################################################
# This file reads a dataset of IDs (column 1), binary outcomes (column 2) and
# variables (columns 3 to P+3). The file performs four operations:
# 1) Calculates the Mahalanobis distance between each pair of observations.
# 2) Uses the distances to identify the nearest neighbours for every
# observation and the associated distances.
# 3) Uses the nearest neighbour IDs to identify the nearest neighbour outcomes.
# 4) Calculates the weighted average outcome for the k nearest neighbours,
# where k ranges over a user-chosen list of neighbourhood sizes and may
# differ from K.
# The file exports a dataset of IDs (column 1), raw outcomes (column 2) and
# average outcomes for different size neighbourhoods (columns 3 to 3+S, where
# S is the number of neighbourhood sizes).
#
# Inputs:
# - dataset described above, default name is input.csv
# - choice of scalar K, the number of neighbours to be identified
# - choice of vector sizes, a list of the neighbourhood sizes for which
# average outcomes should be identified.
#
# Outputs:
# - dataset described above, default name is output.csv
#
# Dependencies: numpy, pandas
#
# Created by Robert Garlick and updated in November 2015. Most recent version
# is available at www.robgarlick.com/nonparametric_matching.py. Comments are
# welcome at robert.garlick@duke.edu.
#
# Created and tested in Python 2.7.6 for Linux.
###############################################################################
##### Set-up: import packages and declare macros. #####
# Import packages
import numpy as np
import pandas as dp
# Number of nearest neighbours to identify for every observation.
K = 1000
# Neighbourhood sizes for which weighted average outcomes will be computed.
sizes = [1, 10, 100, 1000]
nsizes = len(sizes)
##### Stage 0: load and reorganize data. #####
# Load main dataset. See notes in introduction for restrictions on data format.
data = np.loadtxt('input.csv', delimiter=',')
rownum, colnum = data.shape
# Split the data matrix into its three logical pieces, keeping each slice
# two-dimensional (N x 1 for IDs and outcomes, N x P for covariates).
idmat = data[:, 0:1]
ymat = data[:, 1:2]
xmat = data[:, 2:colnum]
##### Stages 1-2: calculate distances and identify neighbours. #####
# Whiten the covariates so that squared Euclidean distance in the transformed
# space equals the Mahalanobis distance in the original space: with the
# Cholesky factor L (Sigma = LL'), z = L^-1 x gives z'z = x' Sigma^-1 x.
covmat = np.cov(xmat.T)
covmat_r = np.linalg.cholesky(covmat)
xmat_dc = np.linalg.solve(covmat_r, xmat.T).T
# Placeholders: per-row neighbour indices/distances and the Stage 4
# prediction accumulator (filled later in the script).
neighbour_indices = []
neighbour_distances = []
predictions = []
# Row-by-row Mahalanobis distance calculation.
for row in range(rownum):
    # Progress report every 1000 rows.
    if row % 1000 == 0:
        print(row)
    # Squared distances from this observation to every observation, using the
    # decorrelated data and the (Qi-Qj)'(Qi-Qj) formula.
    deltas = xmat_dc - xmat_dc[row, :]
    sq_dist = (deltas ** 2).sum(1)
    # Exclude the observation from its own neighbour set.
    sq_dist[row] = np.inf
    # Unordered positions of the K closest neighbours ...
    nearest = np.argpartition(sq_dist, K)[:K]
    # ... then order those K neighbours from closest to farthest.
    order = np.argsort(sq_dist[nearest])
    nearest = nearest[order]
    neighbour_indices.append(nearest)
    neighbour_distances.append(sq_dist[nearest])
# Stack the per-row results into (rownum, K) arrays.
neighbour_indices = np.asarray(neighbour_indices)
neighbour_distances = np.asarray(neighbour_distances)
# Inverse-distance weights, computed elementwise; the +1 in the denominator
# guards against division by zero at zero distance.
neighbour_invdist = 1/(1+neighbour_distances)
# One array per observation: outcome | K neighbour indices | K inverse distances.
combined = np.hstack((ymat, neighbour_indices, neighbour_invdist))
##### Stage 3: identify nearest neighbours' outcomes. #####
# Wrap outcomes/neighbour-indices/inverse-distances in a data frame indexed by
# observation ID. Integer column labels: 0 = outcome, 1..K = neighbour row
# positions, K+1..2K = inverse distances.
# NOTE(review): idmat is an (N, 1) 2-D array; pandas versions differ in how a
# 2-D array is accepted as an index — confirm against the installed pandas.
mahaoutput = dp.DataFrame(data=combined, index=idmat)
# Outcomes alone, indexed by row position 0..N-1, so that neighbour row
# positions (which are 0-based row indices) can be merged against its index.
ymat2 = dp.DataFrame(data=ymat)
# Merge matrix of neighbours' outcomes onto main data frame K times. We merge
# on column numbers, without naming columns in the data frame. This makes the
# data frame a bit disorganized but avoids the need to name columns.
for k in range(K):
    # Column holding the k-th neighbour's row position.
    j = k+1
    # Destination column for the k-th neighbour's outcome (placed after the
    # K index columns and K inverse-distance columns).
    l = k+1+2*K
    mahaoutput = dp.merge(mahaoutput, ymat2, left_index=False, right_index=True, left_on=j, how='left')
    # Both frames contain integer column label 0, which the merge
    # disambiguates with suffixes as string labels '0_x'/'0_y'. Restore '0_x'
    # to the integer label 0 and move the merged outcome '0_y' to label l.
    # NOTE(review): relies on pandas stringifying integer labels when applying
    # merge suffixes — verify this on the pandas version in use.
    mahaoutput.rename(columns={'0_x':0}, inplace=True)
    mahaoutput.rename(columns={'0_y':l}, inplace=True)
# Back to a plain array for Stage 4: column 0 = own outcome, 1..K = neighbour
# positions, K+1..2K = inverse distances, 2K+1..3K = neighbour outcomes.
data2 = np.asarray(mahaoutput)
##### Stage 4: calculate nearest neighbours' weighted average outcomes. #####
# For each neighbourhood size s, compute the inverse-distance-weighted average
# outcome over each observation's s nearest neighbours. data2 column layout
# (from Stage 3): 0 = own outcome, 1..K = neighbour row positions,
# 1+K..2K = inverse distances, 1+2K..3K = neighbour outcomes.
# Vectorized over all observations at once, replacing a per-row Python loop
# with slicing and np.dot per row (same result, one NumPy pass per size).
prediction_cols = []
for s in sizes:
    # Inverse-distance weights and outcomes of the s nearest neighbours,
    # as (rownum, s) arrays. Neighbours are already sorted by distance, so
    # the first s columns are the s closest.
    weights = data2[:, 1+K:1+K+s]
    outcomes = data2[:, 1+2*K:1+2*K+s]
    # Weighted average outcome for every observation simultaneously.
    prediction_cols.append((weights * outcomes).sum(axis=1) / weights.sum(axis=1))
# N x nsizes matrix of predictions; column order matches `sizes`.
predictions = np.column_stack(prediction_cols)
##### Stage 5: Export predicted outcomes. #####
# Assemble the output matrix in one pass: IDs (column 0), observed outcomes
# (column 1), and one column of predicted outcomes per neighbourhood size.
# (np.hstack replaces two successive np.append calls, which each copied the
# array; a stray '~~' token after the final line — a syntax error that
# prevented the file from parsing — has been removed.)
output = np.hstack((idmat, ymat, predictions))
# Export IDs, outcomes, and predicted outcomes in CSV format.
np.savetxt('output.csv', output, delimiter=',')