Tuesday, May 20, 2014

Beating the Benchmark @ KDD 2014

# -*- coding: utf-8 -*-

"""

Beating the benchmark @ KDD 2014

__author__ : Abhishek Thakur

"""

import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
import re

def clean(s):
    """Normalize free text to lowercase word tokens joined by single spaces.

    Every run of word characters (Unicode-aware) is extracted from *s*, the
    runs are joined with one space, and the result is lowercased.

    Args:
        s: The text to normalize. Non-string values (for example the float
            NaN that pandas produces for a missing cell) are accepted.

    Returns:
        The normalized string, or ``"no_text"`` when *s* is not a string.
    """
    # Only re.UNICODE is passed: combining it with re.LOCALE is rejected for
    # str patterns on Python 3, which made both the normal path and the
    # fallback raise.
    try:
        tokens = re.findall(r'\w+', s, flags=re.UNICODE)
    except TypeError:
        # re raises TypeError for non-string input such as NaN or None.
        return "no_text"
    return " ".join(tokens).lower()

# Load the competition CSV files from the current working directory.
# NOTE(review): donations and resources are read here but are not referenced
# again in this script.
donations = pd.read_csv('donations.csv')
projects = pd.read_csv('projects.csv')
outcomes = pd.read_csv('outcomes.csv')
resources = pd.read_csv('resources.csv')
sample = pd.read_csv('sampleSubmission.csv')
essays = pd.read_csv('essays.csv')


# Order every table by projectid so that rows can later be matched by
# position, then join essays with projects on projectid.
# NOTE(review): DataFrame.sort is the old pandas spelling; newer pandas
# releases provide sort_values instead — confirm the target pandas version.
essays = essays.sort('projectid')
projects = projects.sort('projectid')
sample = sample.sort('projectid')
ess_proj = pd.merge(essays, projects, on='projectid')
outcomes = outcomes.sort('projectid')


outcomes_arr = np.array(outcomes)


# Second column of outcomes; it is compared against 't' when fitting below.
# Presumably this is the is_exciting flag — TODO confirm against outcomes.csv.
labels = outcomes_arr[:,1]

# Normalize the essay text with clean() before vectorizing.
ess_proj['essay'] = ess_proj['essay'].apply(clean)

ess_proj_arr = np.array(ess_proj)

# Split on the last column of the merged table using string comparison with
# '2014-01-01': rows before that value are training rows, the rest are test
# rows. Presumably the last column is an ISO-formatted posting date — TODO
# confirm against projects.csv.
train_idx = np.where(ess_proj_arr[:,-1] < '2014-01-01')[0]
test_idx = np.where(ess_proj_arr[:,-1] >= '2014-01-01')[0]


traindata = ess_proj_arr[train_idx,:]
testdata = ess_proj_arr[test_idx,:]


# TF-IDF features: terms appearing in fewer than 3 documents are ignored and
# at most 1000 features are kept.
tfidf = TfidfVectorizer(min_df=3,  max_features=1000)

# The vocabulary is fitted on the training rows only. Column 5 of the merged
# table is used as the text — presumably the cleaned essay column; verify
# against the column order of essays.csv.
tfidf.fit(traindata[:,5])
tr = tfidf.transform(traindata[:,5])
ts = tfidf.transform(testdata[:,5])


# Fit a logistic regression on a boolean target (label == 't') and keep the
# probability of the second class for each test row.
# NOTE(review): labels holds one entry per row of outcomes, while tr holds one
# row per training row of ess_proj; this only works if both have the same
# length and the same projectid order — verify.
lr = linear_model.LogisticRegression()
lr.fit(tr, labels=='t')
preds =lr.predict_proba(ts)[:,1]


# Write the submission file.
# NOTE(review): preds is assigned by position, which assumes the sorted sample
# rows line up one-to-one with the test rows of ess_proj — verify.
sample['is_exciting'] = preds
sample.to_csv('predictions.csv', index = False)

Saturday, February 1, 2014

Loan default prediction - Beating the Benchmark!


This script beats the all-zeros benchmark in Kaggle's Loan Default Prediction competition. Comments are most welcome :)


"""

Beating the Benchmark :::::: Kaggle Loan Default Prediction Challenge.
__author__ : Abhishek

"""

import pandas as pd
import numpy as np
import cPickle
from sklearn import preprocessing
from sklearn.svm import LinearSVC
import  scipy.stats as stats
import sklearn.linear_model as lm

def testdata(filename):
 X = pd.read_table(filename, sep=',', warn_bad_lines=True, error_bad_lines=True)

 X = np.asarray(X.values, dtype = float)

 col_mean = stats.nanmean(X,axis=0)
 inds = np.where(np.isnan(X))
 X[inds]=np.take(col_mean,inds[1])
 data = np.asarray(X[:,1:-3], dtype = float)

 return data
 
def data(filename):
    """Load the training CSV as a mean-imputed feature matrix plus labels.

    The file is parsed as comma-separated values and converted to a float
    array. Every NaN cell (in any column, including the last one) is replaced
    by the mean of the non-NaN values in its column.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(data, labels)`` tuple: ``data`` is a 2-D float array containing
        every column except the first one and the last four; ``labels`` is a
        1-D float array holding the last column.
    """
    # read_csv's defaults already raise on malformed rows, which is what the
    # former warn_bad_lines/error_bad_lines flags requested; those keywords
    # no longer exist in current pandas.
    frame = pd.read_csv(filename)

    # np.array always copies, so the in-place imputation below never writes
    # into (possibly read-only) DataFrame storage.
    X = np.array(frame.values, dtype=float)

    # np.nanmean replaces scipy.stats.nanmean, which was removed from SciPy.
    col_mean = np.nanmean(X, axis=0)
    inds = np.where(np.isnan(X))
    # inds[1] holds the column index of each NaN cell.
    X[inds] = np.take(col_mean, inds[1])

    labels = np.asarray(X[:, -1], dtype=float)
    features = np.asarray(X[:, 1:-4], dtype=float)
    return features, labels


def _l1_selected_columns(model, X):
    """Return the columns of *X* whose fitted coefficients in *model* are non-negligible."""
    importances = np.abs(model.coef_)
    if importances.ndim > 1:
        # One row of coefficients per class: aggregate per feature.
        importances = importances.sum(axis=0)
    # 1e-5 is the cut-off scikit-learn documents for L1-penalised estimators.
    mask = importances >= 1e-5
    if not mask.any():
        raise ValueError("Invalid threshold: all features are discarded.")
    return X[:, mask]


def createSub(clf, traindata, labels, testdata):
    """Fit a two-stage model and write test predictions to 'predictions.csv'.

    Stage 1 binarises the labels (label > 0 versus not), selects features
    with an L1-penalised LinearSVC, fits *clf* on the selected features and
    predicts which test rows are positive.

    Stage 2 refits *clf* on the positive training rows only, using all
    original features (not the stage-1 selection) and the integer label
    values as targets, and predicts a value for each test row that stage 1
    flagged as positive. All other test rows keep the prediction 0.

    Args:
        clf: An estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refitted twice, so its fitted state after the call is stage 2's.
        traindata: 2-D NumPy array of training features.
        labels: Sequence of training labels; values are truncated to int.
        testdata: 2-D NumPy array of test features with the same columns as
            *traindata*.

    Returns:
        None. The integer predictions are written one per line to
        'predictions.csv' in the current working directory.

    Raises:
        ValueError: If the L1 selector discards every feature.
    """
    # astype(int) truncates like int() did, and unlike map() it also yields a
    # proper array on Python 3.
    labels = np.asarray(labels).astype(int)
    is_positive = (labels > 0).astype(int)

    # Stage 1: L1 feature selection followed by the binary classifier.
    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False, verbose = 2)
    lsvc.fit(traindata, is_positive)
    # Estimators no longer expose transform() for selection in current
    # scikit-learn, so the coefficient mask is applied explicitly.
    xtrain_selected = _l1_selected_columns(lsvc, traindata)
    xtest_selected = _l1_selected_columns(lsvc, testdata)

    clf.fit(xtrain_selected, is_positive)
    stage1_preds = clf.predict(xtest_selected)

    positive_train = np.where(is_positive > 0)[0]
    positive_test = np.where(stage1_preds == 1)[0]

    # Stage 2: predict the label value for rows flagged positive, training
    # on positive rows only and on the full feature set.
    clf.fit(traindata[positive_train], labels[positive_train])

    predsorig = np.zeros(testdata.shape[0], dtype=int)
    if positive_test.size:
        # Skip predict() when stage 1 flagged nothing; estimators reject
        # empty input.
        predsorig[positive_test] = clf.predict(testdata[positive_test])

    np.savetxt('predictions.csv', predsorig, delimiter=',', fmt='%d')

if __name__ == '__main__':
    # Script entry point: load both CSV files, standardise the features,
    # and write the predictions through createSub().
    filename = 'trainv2.csv'
    X_test = testdata('testv2.csv')

    X, labels = data(filename)

    # dual=False: the dual formulation is only implemented by the liblinear
    # solver, so dual=True fails wherever another solver is the default. The
    # primal form optimises the same L2-penalised objective.
    # NOTE(review): scikit-learn recommends dual=False when
    # n_samples > n_features — confirm that holds for trainv2.csv.
    clf = lm.LogisticRegression(penalty='l2', dual=False, tol=0.0001,
                                C=1.0, fit_intercept=True, intercept_scaling=1.0,
                                class_weight=None, random_state=None)

    # Standardise with statistics learned from the training set only, and
    # apply the same transform to the test set. Scaling each set with its own
    # mean and variance would put the two on different feature scales.
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    X_test = scaler.transform(X_test)

    createSub(clf, X, labels, X_test)