Tuesday, May 20, 2014

Beating the Benchmark @ KDD 2014

# -*- coding: utf-8 -*-

"""

Beating the benchmark @ KDD 2014

__author__ : Abhishek Thakur

"""

import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
import re

def clean(s):
    """Return the word tokens of ``s`` lower-cased and joined by single spaces.

    Punctuation and runs of whitespace are discarded (tokens are maximal
    ``\\w+`` matches).  Anything that is not text -- for example the float
    NaN that pandas uses for a missing essay, or ``None`` -- is mapped to
    the placeholder ``"no_text"`` so the vectorizer always receives a string.
    """
    try:
        # re.LOCALE cannot be combined with re.UNICODE for text patterns on
        # Python 3 (it raises ValueError), so only re.UNICODE is kept.
        words = re.findall(r'\w+', s, flags=re.UNICODE)
    except TypeError:
        # Raised by re.findall when ``s`` is not a string.
        return "no_text"
    return " ".join(words).lower()

# Load the competition tables.  Only essays, projects, outcomes and the sample
# submission are used below; donations and resources are read but not used
# by this benchmark.
donations = pd.read_csv('donations.csv')
projects = pd.read_csv('projects.csv')
outcomes = pd.read_csv('outcomes.csv')
resources = pd.read_csv('resources.csv')
sample = pd.read_csv('sampleSubmission.csv')
essays = pd.read_csv('essays.csv')


# Put every table in projectid order so rows can be matched by position.
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
essays = essays.sort_values('projectid')
projects = projects.sort_values('projectid')
sample = sample.sort_values('projectid')
ess_proj = pd.merge(essays, projects, on='projectid')
outcomes = outcomes.sort_values('projectid')


outcomes_arr = np.array(outcomes)


# NOTE(review): assumes column 1 of outcomes.csv is the target flag encoded
# as 't'/'f' (it is compared with 't' when fitting below) -- confirm.
labels = outcomes_arr[:,1]

ess_proj['essay'] = ess_proj['essay'].apply(clean)

ess_proj_arr = np.array(ess_proj)

# Split on the last merged column, compared as a string against a date.
# NOTE(review): presumably the last column is the project's posting date in
# ISO format, so string comparison orders it chronologically -- confirm.
train_idx = np.where(ess_proj_arr[:,-1] < '2014-01-01')[0]
test_idx = np.where(ess_proj_arr[:,-1] >= '2014-01-01')[0]


traindata = ess_proj_arr[train_idx,:]
testdata = ess_proj_arr[test_idx,:]


tfidf = TfidfVectorizer(min_df=3,  max_features=1000)

# NOTE(review): column 5 is assumed to be the cleaned 'essay' text -- confirm
# against the column order of essays.csv.
tfidf.fit(traindata[:,5])
tr = tfidf.transform(traindata[:,5])
ts = tfidf.transform(testdata[:,5])


# Labels are paired with training rows purely by position (both sorted by
# projectid).  NOTE(review): this presumes outcomes.csv contains exactly the
# projects that fall in the training split -- verify.
lr = linear_model.LogisticRegression()
lr.fit(tr, labels=='t')
preds =lr.predict_proba(ts)[:,1]


# The test rows and the sorted sample submission are likewise matched by
# position.
sample['is_exciting'] = preds
sample.to_csv('predictions.csv', index = False)

Saturday, February 1, 2014

Loan default prediction - Beating the Benchmark!


Beating the zero benchmark in Kaggle's Loan default prediction competition. Comments are most welcome :)


"""

Beating the Benchmark :::::: Kaggle Loan Default Prediction Challenge.
__author__ : Abhishek

"""

import pandas as pd
import numpy as np
import cPickle
from sklearn import preprocessing
from sklearn.svm import LinearSVC
import  scipy.stats as stats
import sklearn.linear_model as lm

def testdata(filename):
 X = pd.read_table(filename, sep=',', warn_bad_lines=True, error_bad_lines=True)

 X = np.asarray(X.values, dtype = float)

 col_mean = stats.nanmean(X,axis=0)
 inds = np.where(np.isnan(X))
 X[inds]=np.take(col_mean,inds[1])
 data = np.asarray(X[:,1:-3], dtype = float)

 return data
 
def data(filename):
    """Load the training CSV and return ``(features, labels)`` as float arrays.

    The whole file is read as numeric data and every NaN is replaced by the
    mean of the non-NaN values in its column.  The last column is returned
    as the labels; the features are the remaining columns with the first
    column and the three columns preceding the label removed.

    Parameters
    ----------
    filename : path of a comma-separated file with a header row.

    Returns
    -------
    (numpy.ndarray of shape (n_rows, n_columns - 5), numpy.ndarray of shape
    (n_rows,)), both dtype float.
    """
    # warn_bad_lines / error_bad_lines were removed from pandas; both were
    # passed at their default values, so plain read_csv behaves the same.
    frame = pd.read_csv(filename)

    # np.array copies, so the in-place imputation below never writes into
    # memory owned by the DataFrame.
    X = np.array(frame.values, dtype=float)

    # scipy.stats.nanmean was removed from SciPy; np.nanmean is equivalent.
    col_mean = np.nanmean(X, axis=0)
    inds = np.where(np.isnan(X))
    # inds[1] holds the column index of each NaN cell.
    X[inds] = np.take(col_mean, inds[1])

    labels = np.asarray(X[:, -1], dtype=float)
    features = np.asarray(X[:, 1:-4], dtype=float)
    return features, labels


def createSub(clf, traindata, labels, testdata):
    """Fit a two-stage model and write its predictions to 'predictions.csv'.

    Stage 1: the labels are binarised (label > 0 becomes 1), an L1-penalised
    LinearSVC selects features, and ``clf`` is fitted on the selected
    features to predict which test rows are positive.

    Stage 2: ``clf`` is refitted on the positive training rows only, using
    the full (unselected) feature set and the integer labels as classes,
    and predicts a value for the test rows flagged in stage 1.  Every other
    test row keeps the value 0.

    Parameters
    ----------
    clf : estimator with ``fit`` and ``predict``; it is refitted in place.
    traindata : 2-D array of training features.
    labels : 1-D sequence of numeric training labels (cast to int).
    testdata : 2-D array of test features with the same columns.

    Returns
    -------
    None.  One integer prediction per test row is saved, one per line.
    """
    # Imported here so the block does not depend on the file's import list.
    from sklearn.feature_selection import SelectFromModel

    # np.asarray(map(int, labels)) produces a 0-d object array on Python 3;
    # astype(int) truncates the same way on every version.
    ytrain = np.asarray(labels).astype(int)
    ytrainP = (ytrain > 0).astype(int)

    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False, verbose=2)
    lsvc.fit(traindata, ytrainP)
    # Estimators no longer expose transform(); SelectFromModel on the fitted
    # model is the documented replacement.
    selector = SelectFromModel(lsvc, prefit=True)

    clf.fit(selector.transform(traindata), ytrainP)
    predsP = clf.predict(selector.transform(testdata))

    nztrain = np.where(ytrainP > 0)[0]
    nztest = np.where(predsP == 1)[0]

    # Rows not flagged positive in stage 1 stay at 0.
    predsorig = np.zeros(testdata.shape[0], dtype=int)

    # Second stage deliberately uses all features, as the original did.
    clf.fit(traindata[nztrain], ytrain[nztrain])
    if nztest.size:
        # Guarded: predicting on zero rows raises inside scikit-learn.
        predsorig[nztest] = clf.predict(testdata[nztest])

    np.savetxt('predictions.csv', predsorig, delimiter=',', fmt='%d')

if __name__ == '__main__':
    # Train/test files of the loan-default competition.
    filename = 'trainv2.csv'
    X_test = testdata('testv2.csv')

    X, labels = data(filename)

    # dual=True is only implemented by the liblinear solver, which is no
    # longer scikit-learn's default; name it explicitly so construction and
    # fitting keep working.
    clf = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001,
                                C=1.0, fit_intercept=True, intercept_scaling=1.0,
                                class_weight=None, random_state=None,
                                solver='liblinear')

    # NOTE(review): train and test are standardised independently, each with
    # its own mean and variance.  Kept as in the original benchmark; consider
    # fitting one scaler on the training data and applying it to both.
    X = preprocessing.scale(X)
    X_test = preprocessing.scale(X_test)

    createSub(clf, X, labels, X_test)

Tuesday, December 10, 2013

Packing Santa's Sleigh (Python Code for MATLAB Benchmark)


 #  
 # This is a translation of the MATLAB benchmark code   
 # given by Kaggle in "Packing Santa's Sleigh" competition  
 #   
 # This file will give a score same as the MATLAB benchmark score  
 #  
 import numpy as np  
 import scipy as sp  
 import pandas as pd  
 def getData(path='../presents.csv'):
     """Read the presents CSV and return its rows as a NumPy array.

     Parameters
     ----------
     path : location of the comma-separated presents file; defaults to the
         path the original script used.

     Returns
     -------
     numpy.ndarray with one row per CSV data row.
     """
     # Single-argument print() calls behave the same on Python 2 and 3.
     print("reading data using pandas")
     frame = pd.read_csv(path)
     print("converting data to numpy array")
     return np.asarray(frame)
 def pack(data):
     """Place presents left-to-right in rows, rows into shelves, shelves in z.

     Presents are laid out in input order on a 1000 x 1000 base.  A present
     that would pass x = 1000 starts a new row; a present that would pass
     y = 1000 starts a new shelf below the current one.  After all presents
     are placed, the z coordinates are shifted so the lowest one equals 1.

     Parameters
     ----------
     data : 2-D array; column 0 is the present id and columns 1-3 are its
         extents along x, y and z.

     Returns
     -------
     numpy.ndarray of shape (n_presents, 25): the id followed by the
     (x, y, z) coordinates of the eight corners.
     """
     presents = data[:, 1:]
     numPresents = data.shape[0]
     # Same text the Python 2 print statement produced (two spaces included).
     print("total presents :  %d" % numPresents)

     presentCoordinates = np.zeros((numPresents, 25))
     if numPresents == 0:
         # Nothing to place; np.min below would raise on an empty array.
         return presentCoordinates

     # Width and length are 1000 units; height is not fixed for the packing.
     width = 1000
     length = 1000

     # Cursor of the next free position; z grows downwards while packing.
     xs = 1
     ys = 1
     zs = -1

     rowLengths = []     # y extents of presents since the last row break
     shelfHeights = []   # z extents of presents since the last shelf break

     for i in range(numPresents):
         dx = presents[i, 0]
         dy = presents[i, 1]
         dz = presents[i, 2]

         # No room left in the row: advance y past the deepest present.
         if xs + dx > width + 1:
             ys = ys + np.max(rowLengths)
             xs = 1
             rowLengths = []

         # No room left on the shelf: move down past the tallest present.
         # rowLengths is intentionally not reset here, as in the original.
         if ys + dy > length + 1:
             zs = zs - np.max(shelfHeights)
             xs = 1
             ys = 1
             shelfHeights = []

         presentCoordinates[i, 0] = data[i, 0]
         presentCoordinates[i, [1, 7, 13, 19]] = xs
         presentCoordinates[i, [4, 10, 16, 22]] = xs + dx - 1
         presentCoordinates[i, [2, 5, 14, 17]] = ys
         presentCoordinates[i, [8, 11, 20, 23]] = ys + dy - 1
         presentCoordinates[i, [3, 6, 9, 12]] = zs
         presentCoordinates[i, [15, 18, 21, 24]] = zs - dz + 1

         xs = xs + dx
         rowLengths.append(dy)
         shelfHeights.append(dz)

         # Progress indicator.
         if i % 1000 == 0:
             print(i)

     # Every third column starting at 3 is a z coordinate; shift them so the
     # minimum becomes 1.
     zCoords = presentCoordinates[:, 3::3]
     minZ = np.min(zCoords.ravel())
     presentCoordinates[:, 3::3] = zCoords - minZ + 1
     return presentCoordinates
 def saveCSV(predictions, path='submission.csv'):
     """Write the packed coordinates as an integer CSV submission.

     Parameters
     ----------
     predictions : 2-D array with 25 columns (present id followed by the
         x, y, z coordinates of eight corners).
     path : output file; defaults to the name the original script used.

     Returns
     -------
     None.  The file has a header row and no index column.
     """
     # The original re-read '../presents.csv' into an unused variable; that
     # needless I/O is dropped.
     header = "PresentID,x1,y1,z1,x2,y2,z2,x3,y3,z3,x4,y4,z4,x5,y5,z5,x6,y6,z6,x7,y7,z7,x8,y8,z8".split(',')
     # Cast explicitly so the integer conversion does not depend on how the
     # DataFrame constructor treats float input.
     values = np.asarray(predictions).astype(int)
     submission = pd.DataFrame(values, columns=header)
     submission.to_csv(path, index=False)
 if __name__ == '__main__':
     # Pipeline: load the presents, compute the layout, write the submission.
     presents_table = getData()
     layout = pack(presents_table)
     saveCSV(layout)

Thursday, December 5, 2013

Partly Sunny With a Chance of #Hashtags




Algorithm for the team (no_name):

The training data consisted of tweet and its location. The variables to be predicted were S, W and K which have been explained as follows:


s = sentiment
w = when
k = kind
============================================================
s1,"I can't tell"
s2,"Negative"
s3,"Neutral / author is just sharing information"
s4,"Positive"
s5,"Tweet not related to weather condition"  
w1,"current (same day) weather"
w2,"future (forecast)"
w3,"I can't tell"
w4,"past weather"
k1,"clouds"
k2,"cold"
k3,"dry"
k4,"hot"
k5,"humid"
k6,"hurricane"
k7,"I can't tell"
k8,"ice"
k9,"other"
k10,"rain"
k11,"snow"
k12,"storms"
k13,"sun"
k14,"tornado"
k15,"wind"

Competition Details : http://www.kaggle.com/c/crowdflower-weather-twitter



For classification we treated S, W and K separately and created different models for each of them. The dataset was also preprocessed separately for the 3 variables.


Functions implemented:
  • Sanitization Function - Each tweet was sanitized prior to vectorization. The sanitization step converted all tweets to lower-case and replaced “cloudy” with “cloud”, “rainy” with “rain” and so on.
  • Sentiment Dictionary - A list of words for different sentiments constituted the sentiment dictionary.
  • Sentiment Scoring - we provided a score to each tweet if the tweet consisted of any words found in the sentiment dictionary.
  • Tense Detection - A tense detector was implemented based on regular expressions and it provided score for “past”, “present”, “future” and “not known” to every tweet in the dataset.
  • Frequent language detection - This function removed tweets for which language was not frequent (10 frequent languages were used).
  • Tokenization - A custom tokenization function for tweets was implemented using NLTK.
  • Stopwords - Stopwords like 'RT','@','#','link','google','facebook','yahoo','rt' , etc. were removed from the dataset.
  • Replace Two or More - Repetitions of characters in a word were removed. Eg. “hottttt” was replaced with “hot”.
  • Spelling Correction - Spelling correction was implemented based on Levenshtein Distance.
  • Weather Vocabulary - A weather vocabulary was made by crawling a few weather sites which scored the tweets as related to weather or not.
  • Category OneHot - The categorical variables like state and location were one hot encoded using this function.


Types of Data Used:
  • All tweets
  • Count Vectorization
  • TFIDF Vectorization
  • Word ngrams (1,2)
  • Char ngrams (1,6)
  • LDA on the data
  • Predicted values of S, W and K using Linear Regression and Ridge Regression



Classifiers Used:
  • Ridge Regression
  • Logistic Regression
  • SGD


Model:
  • The different types of data were trained with both the classifiers and an ensemble was created from the different predictions.
  • We used approximately 10 different model-data combinations for creating the final ensemble.
  • The predictions for S and W were normalized between 0 and 1 in the end.




Our model gave a score of 0.1469 on the leaderboard.


In the end we did an average with Jack to end up at 4th position.

After this competition I ended up in the first page of Kaggle rankings: http://www.kaggle.com/users/5309/abhishek


Thursday, November 28, 2013

serially number all files

A very helpful script to serially number all files in a folder:

# Loop over the glob rather than piping generated "mv" text into bash, so file names containing spaces, quotes or $ are renamed safely.
a=1; for f in *.csv; do [ -e "$f" ] || continue; mv -- "$f" "$a.csv"; a=$((a+1)); done

Friday, November 8, 2013

StumbleUpon Evergreen Classification Challenge

A few days back I finished Kaggle.com's (www.kaggle.com) StumbleUpon Evergreen Classification Challenge. StumbleUpon is a user-curated web content discovery engine that recommends relevant, high quality pages and media to its users, based on their interests. 

The challenge: "Your mission is to build a classifier which will evaluate a large set of URLs and label them as either evergreen or ephemeral. Can you out-class(ify) StumbleUpon?"

(http://www.kaggle.com/c/stumbleupon)

My overall rank in this competition was 6th. I was one of only two people to maintain a top 10 position after the private leaderboard was revealed (http://www.kaggle.com/users/5309/abhishek).


Let's talk about the approach now.


My best Public score was 0.89447 which got 6th rank when the private data was revealed. I had 40+ submissions which would have got a Top 10 rank in the Private Leaderboard (best being 3rd).

Anyway, I tried to keep my model as simple as possible and there were only 3 classification models in my ensemble. My ensemble consisted of two Logistic Regression models and a k-NN classifier. I used python + sklearn throughout the competition. 
I divided the data into two parts :
#1 Boilerplate: I used the preprocessing.py by Triseklion for preprocessing the boilerplate. In TFIDFVectorizer, I used NLTK for stemming and tokenization. So, it was basically the same as the beat_bench.py that I had posted, except pre-processing and NLTK tokenizer.
#2 Raw Data: I used my own data cleaner for cleaning and tokenization and HTML cleaner of NLTK. preprocessing.py by Triseklion was not used here, as I had deployed my own pre-processing. I used the same TFIDFVectorizer as the one for Boilerplate data. 
The next step was SVD. The TF-IDF values obtained from both the data were passed through TruncatedSVD of scikit-learn. Both the SVDs used 120 components. 
SVD1 ---> Logistic Regression
SVD1 ---> k-NN Classifier
SVD2 ---> Logistic Regression
The final ensemble was a simple mean of these three models.

Things that did not work for me (or gave a lower score) : 
#1 Rapid Automatic Keyword Extraction (RAKE) on both Boilerplate and Raw Data.
#2 SVM (I thought it would but it didn't)
#3 Naive Bayes worked to a certain extent, but the results were not satisfactory.
#4 Use of Word Embeddings derived using neural network approach on Wikipedia Corpus.

I hope you liked my approach. I will soon be posting some code snippets (on request).