# -*- coding: utf-8 -*-
"""
Beating the benchmark @ KDD 2014
__author__ : Abhishek Thakur
"""
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
import re
def clean(s):
    """Normalize free text into lowercase word tokens joined by single spaces.

    Parameters
    ----------
    s : str
        Raw text. Anything the regex engine cannot scan as text (for
        example ``None`` or a float standing in for a missing value)
        is treated as absent text.

    Returns
    -------
    str
        The ``\\w+`` tokens of ``s``, lowercased and space-separated, or the
        placeholder ``"no_text"`` when ``s`` is not text.
    """
    try:
        # re.LOCALE is deliberately not combined with re.UNICODE: Python 3
        # rejects that combination (and LOCALE on str patterns) with
        # ValueError, which made both the call and its fallback fail.
        tokens = re.findall(r'\w+', s, flags=re.UNICODE)
    except TypeError:
        # Non-text input: fall back to the placeholder token.
        return "no_text"
    return " ".join(tokens).lower()
# ---------------------------------------------------------------------------
# Benchmark pipeline: TF-IDF over cleaned essay text + logistic regression.
# Reads the CSV tables from the working directory and writes
# 'predictions.csv'.
# ---------------------------------------------------------------------------

# NOTE(review): `donations` and `resources` are not used anywhere below; they
# are still loaded so these module-level names stay defined. Consider dropping
# them if nothing else needs them.
donations = pd.read_csv('donations.csv')
projects = pd.read_csv('projects.csv')
outcomes = pd.read_csv('outcomes.csv')
resources = pd.read_csv('resources.csv')
sample = pd.read_csv('sampleSubmission.csv')
essays = pd.read_csv('essays.csv')

# Every table is ordered by projectid because labels and predictions are
# matched to rows purely by position further down.
# DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
essays = essays.sort_values('projectid')
projects = projects.sort_values('projectid')
sample = sample.sort_values('projectid')
ess_proj = pd.merge(essays, projects, on='projectid')
outcomes = outcomes.sort_values('projectid')

outcomes_arr = np.array(outcomes)
# Column 1 of outcomes is the target; it is compared against 't' when fitting.
labels = outcomes_arr[:, 1]

ess_proj['essay'] = ess_proj['essay'].apply(clean)
ess_proj_arr = np.array(ess_proj)

# The last merged column is compared as a string against an ISO date to split
# train from test.
# NOTE(review): presumably this is the posting date from projects.csv - confirm.
split_col = ess_proj_arr[:, -1]
train_idx = np.where(split_col < '2014-01-01')[0]
test_idx = np.where(split_col >= '2014-01-01')[0]
traindata = ess_proj_arr[train_idx, :]
testdata = ess_proj_arr[test_idx, :]

# Labels are paired with training rows by position only, so fail loudly if the
# outcome rows do not correspond one-to-one, in order, to the training rows.
train_ids = ess_proj['projectid'].values[train_idx]
if not np.array_equal(train_ids, outcomes['projectid'].values):
    raise ValueError(
        "outcomes.csv rows do not line up with the training projects; "
        "labels would be assigned to the wrong essays"
    )

# NOTE(review): column 5 is assumed to be the cleaned 'essay' text - confirm
# against the merged column order.
tfidf = TfidfVectorizer(min_df=3, max_features=1000)
tfidf.fit(traindata[:, 5])
tr = tfidf.transform(traindata[:, 5])
ts = tfidf.transform(testdata[:, 5])

lr = linear_model.LogisticRegression()
lr.fit(tr, labels == 't')
# Keep the probability of the positive class (second column).
preds = lr.predict_proba(ts)[:, 1]

# `sample` was sorted by projectid above, the same order as the test rows.
sample['is_exciting'] = preds
sample.to_csv('predictions.csv', index=False)
"""
Beating the benchmark @ KDD 2014
__author__ : Abhishek Thakur
"""
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
import re
def clean(s):
    """Normalize free text into lowercase word tokens joined by single spaces.

    Parameters
    ----------
    s : str
        Raw text. Anything the regex engine cannot scan as text (for
        example ``None`` or a float standing in for a missing value)
        is treated as absent text.

    Returns
    -------
    str
        The ``\\w+`` tokens of ``s``, lowercased and space-separated, or the
        placeholder ``"no_text"`` when ``s`` is not text.
    """
    try:
        # re.LOCALE is deliberately not combined with re.UNICODE: Python 3
        # rejects that combination (and LOCALE on str patterns) with
        # ValueError, which made both the call and its fallback fail.
        tokens = re.findall(r'\w+', s, flags=re.UNICODE)
    except TypeError:
        # Non-text input: fall back to the placeholder token.
        return "no_text"
    return " ".join(tokens).lower()
# ---------------------------------------------------------------------------
# Benchmark pipeline: TF-IDF over cleaned essay text + logistic regression.
# Reads the CSV tables from the working directory and writes
# 'predictions.csv'.
# ---------------------------------------------------------------------------

# NOTE(review): `donations` and `resources` are not used anywhere below; they
# are still loaded so these module-level names stay defined. Consider dropping
# them if nothing else needs them.
donations = pd.read_csv('donations.csv')
projects = pd.read_csv('projects.csv')
outcomes = pd.read_csv('outcomes.csv')
resources = pd.read_csv('resources.csv')
sample = pd.read_csv('sampleSubmission.csv')
essays = pd.read_csv('essays.csv')

# Every table is ordered by projectid because labels and predictions are
# matched to rows purely by position further down.
# DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
essays = essays.sort_values('projectid')
projects = projects.sort_values('projectid')
sample = sample.sort_values('projectid')
ess_proj = pd.merge(essays, projects, on='projectid')
outcomes = outcomes.sort_values('projectid')

outcomes_arr = np.array(outcomes)
# Column 1 of outcomes is the target; it is compared against 't' when fitting.
labels = outcomes_arr[:, 1]

ess_proj['essay'] = ess_proj['essay'].apply(clean)
ess_proj_arr = np.array(ess_proj)

# The last merged column is compared as a string against an ISO date to split
# train from test.
# NOTE(review): presumably this is the posting date from projects.csv - confirm.
split_col = ess_proj_arr[:, -1]
train_idx = np.where(split_col < '2014-01-01')[0]
test_idx = np.where(split_col >= '2014-01-01')[0]
traindata = ess_proj_arr[train_idx, :]
testdata = ess_proj_arr[test_idx, :]

# Labels are paired with training rows by position only, so fail loudly if the
# outcome rows do not correspond one-to-one, in order, to the training rows.
train_ids = ess_proj['projectid'].values[train_idx]
if not np.array_equal(train_ids, outcomes['projectid'].values):
    raise ValueError(
        "outcomes.csv rows do not line up with the training projects; "
        "labels would be assigned to the wrong essays"
    )

# NOTE(review): column 5 is assumed to be the cleaned 'essay' text - confirm
# against the merged column order.
tfidf = TfidfVectorizer(min_df=3, max_features=1000)
tfidf.fit(traindata[:, 5])
tr = tfidf.transform(traindata[:, 5])
ts = tfidf.transform(testdata[:, 5])

lr = linear_model.LogisticRegression()
lr.fit(tr, labels == 't')
# Keep the probability of the positive class (second column).
preds = lr.predict_proba(ts)[:, 1]

# `sample` was sorted by projectid above, the same order as the test rows.
sample['is_exciting'] = preds
sample.to_csv('predictions.csv', index=False)
# Hi Abhishek,
# I enjoyed your blog posts! I'm in the process of creating a similar thing, and I'd love the opportunity to collaborate. Please let me know if you're interested at tristan@[domain name]. The domain name is @teamleada.com.
# Thanks!
# Tristan