Saturday, February 1, 2014

Loan default prediction - Beating the Benchmark!


Beating the zero benchmark in Kaggle's Loan default prediction competition. Comments are most welcome :)


"""

Beating the Benchmark :::::: Kaggle Loan Default Prediction Challenge.
__author__ : Abhishek

"""

import pandas as pd
import numpy as np
import cPickle
from sklearn import preprocessing
from sklearn.svm import LinearSVC
import  scipy.stats as stats
import sklearn.linear_model as lm

def testdata(filename):
 X = pd.read_table(filename, sep=',', warn_bad_lines=True, error_bad_lines=True)

 X = np.asarray(X.values, dtype = float)

 col_mean = stats.nanmean(X,axis=0)
 inds = np.where(np.isnan(X))
 X[inds]=np.take(col_mean,inds[1])
 data = np.asarray(X[:,1:-3], dtype = float)

 return data
 
def data(filename):
 X = pd.read_table(filename, sep=',', warn_bad_lines=True, error_bad_lines=True)

 X = np.asarray(X.values, dtype = float)

 col_mean = stats.nanmean(X,axis=0)
 inds = np.where(np.isnan(X))
 X[inds]=np.take(col_mean,inds[1])

 labels = np.asarray(X[:,-1], dtype = float)
 data = np.asarray(X[:,1:-4], dtype = float)
 return data, labels


def createSub(clf, traindata, labels, testdata):
 sub = 1

 labels = np.asarray(map(int,labels))

 niter = 10
 auc_list = []
 mean_auc = 0.0; itr = 0
 if sub == 1:
  xtrain = traindata#[train]
  xtest = testdata#[test]

  ytrain = labels#[train]
  predsorig = np.asarray([0] * testdata.shape[0]) #np.copy(ytest)

  labelsP = []

  for i in range(len(labels)):
   if labels[i] > 0:
    labelsP.append(1)
   else:
    labelsP.append(0)

  labelsP = np.asarray(labelsP)
  ytrainP = labelsP

  lsvc = LinearSVC(C=0.01, penalty="l1", dual=False, verbose = 2)
  lsvc.fit(xtrain, ytrainP)
  xtrainP = lsvc.transform(xtrain)
  xtestP =  lsvc.transform(xtest)

  clf.fit(xtrainP,ytrainP)
  predsP = clf.predict(xtestP)

  nztrain = np.where(ytrainP > 0)[0]
  nztest = np.where(predsP == 1)[0]

  nztrain0 = np.where(ytrainP == 0)[0]
  nztest0 = np.where(predsP == 0)[0]

  xtrainP = xtrain[nztrain]
  xtestP = xtest[nztest]

  ytrain0 = ytrain[nztrain0]
  ytrain1 = ytrain[nztrain]

  clf.fit(xtrainP,ytrain1)
  preds = clf.predict(xtestP)

  predsorig[nztest] = preds
  predsorig[nztest0] = 0

  np.savetxt('predictions.csv',predsorig ,delimiter = ',', fmt = '%d')

if __name__ == '__main__':
 filename = 'trainv2.csv'
 X_test = testdata('testv2.csv')

 X, labels = data(filename)
 
 clf = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001, 
                             C=1.0, fit_intercept=True, intercept_scaling=1.0, 
                             class_weight=None, random_state=None)

 X = preprocessing.scale(X) 
 X_test = preprocessing.scale(X_test)

 createSub(clf, X, labels, X_test)

4 comments:

  1. "beating the benchmark", huh? u gonna git a letter from my lawyer :P

    ReplyDelete
  2. nice work ....
    but when i try to replicate this on my machine...
    I get a Memory error in python , I'm running this on a 4GB VM
    I have tried increasing the resource by importing the resource module and setting limits as 4gb,
    but no luck....
    is this a python thing( which I don't believe it should be ) ,but if it's a machine issue
    isn't 4gb of ram enough

    ReplyDelete
  3. I have a mac osx with 8 gb ram and this code works fine. If you are facing memory problems, try to load the data in chunks using pandas.

    ReplyDelete