Tuesday, May 20, 2014

Beating the Benchmark @ KDD 2014

# -*- coding: utf-8 -*-


"""
__author__ : Abhishek Thakur
"""


import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
import re

def clean(s):
    """Normalize a piece of text into lower-case, space-separated words.

    Every run of word characters (``\\w+``) is extracted from ``s``; the runs
    are joined with single spaces and the result is lower-cased.

    Args:
        s: The text to normalize.  Values that ``re.findall`` cannot scan
            (anything that is not a string, e.g. ``None`` or a float) are
            treated as missing text.

    Returns:
        The normalized text, or the placeholder ``"no_text"`` when ``s`` is
        not a string.
    """
    try:
        # re.LOCALE is not combined with re.UNICODE here: Python 3 rejects
        # LOCALE for str patterns, which made every call fail.
        words = re.findall(r'\w+', s, flags=re.UNICODE)
    except TypeError:
        # Non-string input: fall back to the placeholder token instead of
        # aborting the whole column transformation.
        return "no_text"
    return " ".join(words).lower()

# Load the competition tables from the working directory.  `donations` and
# `resources` are loaded but never referenced again in this script.
donations = pd.read_csv('donations.csv')
projects = pd.read_csv('projects.csv')
outcomes = pd.read_csv('outcomes.csv')
resources = pd.read_csv('resources.csv')
sample = pd.read_csv('sampleSubmission.csv')
essays = pd.read_csv('essays.csv')

# Order every table by projectid: the label vector and the submission frame
# are matched to the feature rows purely by position further down.
# sort_values() replaces DataFrame.sort(), which newer pandas no longer has.
essays = essays.sort_values('projectid')
projects = projects.sort_values('projectid')
sample = sample.sort_values('projectid')
ess_proj = pd.merge(essays, projects, on='projectid')
outcomes = outcomes.sort_values('projectid')

outcomes_arr = np.array(outcomes)

# Second column of outcomes.csv; it is compared against 't' when fitting.
labels = outcomes_arr[:,1]

ess_proj['essay'] = ess_proj['essay'].apply(clean)

ess_proj_arr = np.array(ess_proj)

# Split rows on the last merged column using plain string comparison.
# NOTE(review): assumes that column holds ISO-formatted dates - confirm.
train_idx = np.where(ess_proj_arr[:,-1] < '2014-01-01')[0]
test_idx = np.where(ess_proj_arr[:,-1] >= '2014-01-01')[0]

traindata = ess_proj_arr[train_idx,:]
testdata = ess_proj_arr[test_idx,:]

tfidf = TfidfVectorizer(min_df=3,  max_features=1000)

# The vocabulary has to be learned before anything can be transformed:
# fit on the training text, then reuse the fitted vectorizer for the test text.
# NOTE(review): column 5 is presumably the cleaned essay text - confirm.
tr = tfidf.fit_transform(traindata[:,5])
ts = tfidf.transform(testdata[:,5])

lr = linear_model.LogisticRegression()
# NOTE(review): relies on `labels` having exactly one entry per training row,
# in the same projectid order - verify against outcomes.csv.
lr.fit(tr, labels=='t')
preds =lr.predict_proba(ts)[:,1]

# NOTE(review): relies on the sorted sample rows lining up with `testdata`.
sample['is_exciting'] = preds
sample.to_csv('predictions.csv', index = False)

Saturday, February 1, 2014

Loan default prediction - Beating the Benchmark!

Beating the zero benchmark in Kaggle's Loan default prediction competition. Comments are most welcome :)


"""
Beating the Benchmark :::::: Kaggle Loan Default Prediction Challenge.
__author__ : Abhishek
"""


import pandas as pd
import numpy as np
import cPickle
from sklearn import preprocessing
from sklearn.svm import LinearSVC
import  scipy.stats as stats
import sklearn.linear_model as lm

def testdata(filename):
    """Load a test CSV and return its feature matrix with NaNs mean-imputed.

    Args:
        filename: Path of a comma-separated file with a header row whose
            cells can all be converted to float.

    Returns:
        A 2-D float array holding every column except the first one and the
        last three.  Each NaN is replaced by the NaN-ignoring mean of its
        column (a column that is entirely NaN stays NaN).
    """
    # read_csv defaults already raise on malformed lines, which is what the
    # former explicit error_bad_lines/warn_bad_lines flags asked for.
    X = np.asarray(pd.read_csv(filename).values, dtype=float)

    # Apply the imputation: the column means used to be computed and then
    # thrown away, so NaNs leaked into the returned features.
    col_mean = np.nanmean(X, axis=0)
    X = np.where(np.isnan(X), col_mean, X)

    # Keep the feature columns only.
    # NOTE(review): column 0 is presumably a row id - confirm against the CSV.
    return X[:, 1:-3]
def data(filename):
    """Load a training CSV and return mean-imputed features plus labels.

    Args:
        filename: Path of a comma-separated file with a header row whose
            cells can all be converted to float.

    Returns:
        A ``(data, labels)`` tuple.  ``labels`` is the last column as a 1-D
        float array; ``data`` is a 2-D float array holding every column except
        the first one and the last four.  Each NaN is replaced by the
        NaN-ignoring mean of its column (a column that is entirely NaN stays
        NaN).
    """
    # read_csv defaults already raise on malformed lines, which is what the
    # former explicit error_bad_lines/warn_bad_lines flags asked for.
    X = np.asarray(pd.read_csv(filename).values, dtype=float)

    # Apply the imputation: the column means used to be computed and then
    # thrown away, so NaNs leaked into the returned features.
    col_mean = np.nanmean(X, axis=0)
    X = np.where(np.isnan(X), col_mean, X)

    labels = X[:, -1]
    # NOTE(review): column 0 is presumably a row id - confirm against the CSV.
    features = X[:, 1:-4]
    return features, labels

def createSub(clf, traindata, labels, testdata):
    """Train a two-stage model and write test predictions to predictions.csv.

    Stage one decides whether a row has a non-zero label: an L1-penalised
    LinearSVC selects features, and ``clf`` is fitted on those features
    against the binarised labels (1 when the label is > 0, else 0).
    Stage two refits ``clf`` on the training rows whose label is > 0, using
    all features and the original integer labels, and predicts a value for
    every test row that stage one flagged.  All other test rows get 0.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.  It is
            fitted in place, up to two times.
        traindata: 2-D array of training features.
        labels: Training labels, one per row of ``traindata``; truncated to
            integers.
        testdata: 2-D array of test features with the same columns as
            ``traindata``.

    Returns:
        None.  One integer prediction per test row is written to
        ``predictions.csv`` in the current directory.

    Raises:
        ValueError: If the L1 model discards every feature.
    """
    # np.asarray(map(...)) only works on Python 2, where map returns a list.
    labels = np.asarray(labels).astype(int)

    # Binary target for stage one (the loop that filled it had lost its body).
    is_positive = (labels > 0).astype(int)

    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False, verbose=2)
    lsvc.fit(traindata, is_positive)

    # Keep the features the L1 model did not zero out.
    # NOTE(review): this mirrors the 1e-5 default threshold that
    # estimator.transform() used for L1 models before scikit-learn removed
    # that method - confirm against the SelectFromModel documentation.
    keep = np.abs(lsvc.coef_).sum(axis=0) >= 1e-5
    if not keep.any():
        raise ValueError("Invalid threshold: all features are discarded.")

    # Stage one: which test rows are predicted to have a non-zero label?
    clf.fit(traindata[:, keep], is_positive)
    flagged = np.where(clf.predict(testdata[:, keep]) == 1)[0]

    predictions = np.zeros(testdata.shape[0], dtype=int)

    # Stage two: predict the actual value for the flagged rows only, learning
    # from the training rows that really have a non-zero label.
    if flagged.size:
        positive_rows = np.where(is_positive > 0)[0]
        clf.fit(traindata[positive_rows], labels[positive_rows])
        predictions[flagged] = clf.predict(testdata[flagged])

    np.savetxt('predictions.csv', predictions, delimiter=',', fmt='%d')

if __name__ == '__main__':
    # Read both CSV files from the working directory: test first, then train.
    filename = 'trainv2.csv'
    X_test = testdata('testv2.csv')
    X, labels = data(filename)

    # The train and test matrices are scaled independently of each other.
    X, X_test = preprocessing.scale(X), preprocessing.scale(X_test)

    # L2-penalised logistic regression in its dual formulation; every other
    # keyword is spelled out explicitly.
    clf = lm.LogisticRegression(
        penalty='l2',
        dual=True,
        tol=0.0001,
        C=1.0,
        fit_intercept=True,
        intercept_scaling=1.0,
        class_weight=None,
        random_state=None,
    )

    # Fits the models and writes predictions.csv.
    createSub(clf, X, labels, X_test)

Tuesday, December 10, 2013

Packing Santa's Sleigh (Python Code for MATLAB Benchmark)

 # This is a translation of the MATLAB benchmark code   
 # given by Kaggle in "Packing Santa's Sleigh" competition  
 # This file will give a score same as the MATLAB benchmark score  
 import numpy as np  
 import scipy as sp  
 import pandas as pd  
 def getData():
     """Read ../presents.csv and return its rows as a NumPy array.

     Returns:
         The array produced by ``np.asarray`` on the parsed table, one row per
         line of the file (header excluded).
     """
     # Single-argument print() calls run unchanged on Python 2 and Python 3.
     print("reading data using pandas")
     frame = pd.read_csv('../presents.csv')
     print("converting data to numpy array")
     return np.asarray(frame)
 def pack(data, width=1000, length=1000):
     """Place presents in input order, filling rows, then shelves, then layers.

     Presents are laid side by side along x.  When the next present would
     pass ``width`` a new row is started, advanced along y by the longest
     present of the row just finished.  When the next present would pass
     ``length`` a new shelf is started below the current one, lowered by the
     tallest present of the shelf just finished.  Finally all z values are
     shifted so that the lowest one equals 1.

     Args:
         data: 2-D array with one present per row, laid out as
             ``[id, x size, y size, z size]``.
         width: Usable extent along x (default 1000).
         length: Usable extent along y (default 1000).

     Returns:
         Float array of shape ``(len(data), 25)``: the present id followed by
         the x, y, z coordinates of its eight corners.
     """
     # Column groups of the output, as (x, y, z) triples for corners 1..8.
     x_low, x_high = [1, 7, 13, 19], [4, 10, 16, 22]
     y_low, y_high = [2, 5, 14, 17], [8, 11, 20, 23]
     z_top, z_bottom = [3, 6, 9, 12], [15, 18, 21, 24]

     presents = data[:, 1:]
     numPresents = data.shape[0]
     # Same text the Python 2 statement printed, valid on Python 3 as well.
     print("total presents :  %d" % numPresents)

     presentCoordinates = np.zeros((numPresents, 25))
     if numPresents == 0:
         # Nothing to place; np.min below would fail on an empty array.
         return presentCoordinates

     # Next free position.  z starts at -1 and decreases shelf by shelf.
     xs, ys, zs = 1, 1, -1

     # Running maxima replace the per-row / per-shelf lists, which were never
     # filled and made np.max raise on the first row overflow.
     row_max_len = 0
     shelf_max_height = 0

     for i in range(numPresents):
         # No room left in the row: start the next row.
         if xs + presents[i, 0] > width + 1:
             ys = ys + row_max_len
             xs = 1
             row_max_len = 0
         # No room left on the shelf: start the next shelf underneath.
         # The row maximum is deliberately left untouched here, exactly as
         # in the code this replaces.
         if ys + presents[i, 1] > length + 1:
             zs = zs - shelf_max_height
             xs = 1
             ys = 1
             shelf_max_height = 0

         presentCoordinates[i, 0] = data[i, 0]
         presentCoordinates[i, x_low] = xs
         presentCoordinates[i, x_high] = xs + presents[i, 0] - 1
         presentCoordinates[i, y_low] = ys
         presentCoordinates[i, y_high] = ys + presents[i, 1] - 1
         presentCoordinates[i, z_top] = zs
         presentCoordinates[i, z_bottom] = zs - presents[i, 2] + 1

         xs = xs + presents[i, 0]
         row_max_len = max(row_max_len, presents[i, 1])
         shelf_max_height = max(shelf_max_height, presents[i, 2])

         if i % 1000 == 0:
             print(i)

     # Every third column from index 3 on is a z value; lift them so the
     # lowest one becomes 1.
     zCoords = presentCoordinates[:, 3::3]
     presentCoordinates[:, 3::3] = zCoords - np.min(zCoords) + 1
     return presentCoordinates
 def saveCSV(predictions):
     """Write the packed coordinates to submission.csv in the working directory.

     Args:
         predictions: 2-D array with 25 columns per row: the present id
             followed by x, y, z for each of the eight corners.

     Returns:
         None.  The file has a header row (PresentID, x1, y1, z1, ..., z8),
         integer cells and no index column.
     """
     # The former re-read of ../presents.csv was never used, so it is gone;
     # writing the submission no longer depends on that file.
     header = ["PresentID"] + [
         axis + str(corner) for corner in range(1, 9) for axis in "xyz"
     ]
     submission = pd.DataFrame(predictions, columns=header, dtype=int)
     submission.to_csv('submission.csv', index=False)
 if __name__ == '__main__':
     # Load the presents, pack them, and write the submission file.
     data = getData()
     predictions = pack(data)
     # The packing result used to be discarded: saveCSV was defined but
     # never called, so no submission was ever written.
     saveCSV(predictions)

Thursday, December 5, 2013

Partly Sunny With a Chance of #Hashtags

Algorithm for the team (no_name):

The training data consisted of tweets and their locations. The variables to be predicted were S, W and K, which are explained as follows:

s = sentiment
w = when
k = kind
s1,"I can't tell"
s3,"Neutral / author is just sharing information"
s5,"Tweet not related to weather condition"  
w1,"current (same day) weather"
w2,"future (forecast)"
w3,"I can't tell"
w4,"past weather"
k7,"I can't tell"

Competition Details : http://www.kaggle.com/c/crowdflower-weather-twitter

For classification we treated S, W and K separately and created different models for each of them. The dataset was also preprocessed separately for the 3 variables.

Functions implemented:
  • Sanitization Function - Each tweet was sanitized prior to vectorization. The sanitization step converted all tweets to lower-case and replaced “cloudy” with “cloud”, “rainy” with “rain” and so on.
  • Sentiment Dictionary - A list of words for different sentiments constituted the sentiment dictionary.
  • Sentiment Scoring - we provided a score to each tweet if the tweet consisted of any words found in the sentiment dictionary.
  • Tense Detection - A tense detector was implemented based on regular expressions and it provided score for “past”, “present”, “future” and “not known” to every tweet in the dataset.
  • Frequent language detection - This function removed tweets for which language was not frequent (10 frequent languages were used).
  • Tokenization - A custom tokenization function for tweets was implemented using NLTK.
  • Stopwords - Stopwords like 'RT','@','#','link','google','facebook','yahoo','rt' , etc. were removed from the dataset.
  • Replace Two or More - Repetitions of characters in a word were removed. Eg. “hottttt” was replaced with “hot”.
  • Spelling Correction - Spelling correction was implemented based on Levenshtein Distance.
  • Weather Vocabulary - A weather vocabulary was made by crawling a few weather sites which scored the tweets as related to weather or not.
  • Category OneHot - The categorical variables like state and location were one hot encoded using this function.

Types of Data Used:
  • All tweets
  • Count Vectorization
  • TFIDF Vectorization
  • Word ngrams (1,2)
  • Char ngrams (1,6)
  • LDA on the data
  • Predicted values of S, W and K using Linear Regression and Ridge Regression

Classifiers Used:
  • Ridge Regression
  • Logistic Regression
  • SGD

  • The different types of data were trained with the classifiers listed above, and an ensemble was created from the different predictions.
  • We used approximately 10 different model-data combinations for creating the final ensemble.
  • The predictions for S and W were normalized between 0 and 1 in the end.

Our model gave a score of 0.1469 on the leaderboard.

In the end we did an average with Jack to end up at 4th position.

After this competition I ended up in the first page of Kaggle rankings: http://www.kaggle.com/users/5309/abhishek

Thursday, November 28, 2013

serially number all files

A very helpful script to serially number all files in a folder:

# Rename every .csv file in the current directory to 1.csv, 2.csv, ... in glob order.
# Looping over the glob (instead of parsing `ls` output and piping generated
# commands to bash) keeps names containing spaces, quotes or `$` intact.
n=1
for f in *.csv; do
  [ -e "$f" ] || continue   # the pattern stays literal when nothing matches
  mv -- "$f" "$n.csv"
  n=$((n + 1))
done

Friday, November 8, 2013

StumbleUpon Evergreen Classification Challenge

A few days back I finished Kaggle.com's (www.kaggle.com) StumbleUpon Evergreen Classification Challenge. StumbleUpon is a user-curated web content discovery engine that recommends relevant, high quality pages and media to its users, based on their interests.

The challenge: Your mission is to build a classifier which will evaluate a large set of URLs and label them as either evergreen or ephemeral. Can you out-class(ify) StumbleUpon?


My overall rank in this competition was 6th. I was one of the two people to maintain a top-10 position after the private leaderboard was revealed (http://www.kaggle.com/users/5309/abhishek).

Let's talk about the approach now.

My best Public score was 0.89447 which got 6th rank when the private data was revealed. I had 40+ submissions which would have got a Top 10 rank in the Private Leaderboard (best being 3rd).

Anyways, I tried to keep my model as simple as possible and there were only 3 classification models in my ensemble. My ensemble consisted of two Logistic Regression and a k-NN. I used python + sklearn throughout the competition. 
I divided the data into two parts :
#1 Boilerplate: I used the preprocessing.py by Triseklion for preprocessing the boilerplate. In TFIDFVectorizer, I used NLTK for stemming and tokenization. So, it was basically the same as the beat_bench.py that I had posted, except pre-processing and NLTK tokenizer.
#2 Raw Data: I used my own data cleaner for cleaning and tokenization and HTML cleaner of NLTK. preprocessing.py by Triseklion was not used here, as I had deployed my own pre-processing. I used the same TFIDFVectorizer as the one for Boilerplate data. 
The next step was SVD. The TF-IDF values obtained from both the data were passed through TruncatedSVD of scikit-learn. Both the SVDs used 120 components. 
SVD1 ---> Logistic Regression
SVD1 ---> k-NN Classifier
SVD2 ---> Logistic Regression
The final ensemble was a simple mean of these three models.

Things that did not work for me (or gave a lower score) : 
#1 Rapid Automatic Keyword Extraction (RAKE) on both Boilerplate and Raw Data.
#2 SVM (I thought it would but it didn't)
#3 Naive Bayes worked to a certain extent, but the results were not satisfactory.
#4 Use of Word Embeddings derived using neural network approach on Wikipedia Corpus.

I hope you liked my approach. I will soon be posting some code snippets(on request).