# Import for Logistic Regression and SVM
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import PCA
from nltk.tokenize import TweetTokenizer, word_tokenize
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import log_loss
%matplotlib inline
# Read data
train_data = pd.read_csv("./train.csv")
train_data["label_num"] = train_data.handle.map({
"HillaryClinton": 0,
"realDonaldTrump": 1
})
test_data = pd.read_csv('./new_test.csv')
test_data["label_num"] = test_data.handle.map({
"HillaryClinton": 0,
"realDonaldTrump": 1
})
X_train, y_train = train_data["tweet"], train_data["label_num"]
X_test, y_test = test_data["tweet"], test_data["label_num"]
First, try logistic regression on tf-idf features with PCA for dimensionality reduction.
count_vec = CountVectorizer(
    decode_error='ignore', stop_words=stopwords.words("english"))
X_train_count = count_vec.fit_transform(X_train)
X_test_count = count_vec.transform(X_test)
tfidf_trans = TfidfTransformer()
X_train_tfidf = tfidf_trans.fit_transform(X_train_count)
X_test_tfidf = tfidf_trans.transform(X_test_count)
pca = PCA()
X_train_tfidf_rd = pca.fit_transform(X_train_tfidf.toarray())
X_test_tfidf_rd = pca.transform(X_test_tfidf.toarray())
lr = LogisticRegressionCV()
lr.fit(X_train_tfidf_rd, y_train)
logls = (log_loss(y_test, lr.predict_proba(X_test_tfidf_rd), labels=[0, 1]))
accuracy = sum(lr.predict(X_test_tfidf_rd) == y_test) * \
    1.0 / X_test_tfidf_rd.shape[0]
print('Accuracy is ' + str(accuracy) + ', log loss is ' + str(logls))
Accuracy is 0.914756025867, log loss is 0.234298274729
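A side note on the PCA step: PCA cannot consume a sparse matrix, which is why the tf-idf features are densified with .toarray() above. A sparse-friendly alternative (a sketch, not what produced the numbers reported here) is TruncatedSVD, which performs the same kind of linear dimensionality reduction directly on the sparse tf-idf matrix:
from sklearn.decomposition import TruncatedSVD

# n_components=300 is a hypothetical choice; the PCA above keeps all components by default
svd = TruncatedSVD(n_components=300)
X_train_svd = svd.fit_transform(X_train_tfidf)   # accepts the sparse matrix directly
X_test_svd = svd.transform(X_test_tfidf)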
Use a better tokenizer: the default CountVectorizer tokenizer splits URLs apart, while NLTK's TweetTokenizer keeps them intact.
built_in_tokenizer = count_vec.build_tokenizer()
tokens = built_in_tokenizer(X_train[0])
print (tokens)
['The', 'question', 'in', 'this', 'election', 'Who', 'can', 'put', 'the', 'plans', 'into', 'action', 'that', 'will', 'make', 'your', 'life', 'better', 'https', 'co', 'XreEY9OicG']
tknzr = TweetTokenizer()
tokens = tknzr.tokenize(X_train[0])
print (tokens)
[u'The', u'question', u'in', u'this', u'election', u':', u'Who', u'can', u'put', u'the', u'plans', u'into', u'action', u'that', u'will', u'make', u'your', u'life', u'better', u'?', u'https://t.co/XreEY9OicG']
Use tricks: lowercase each tweet and replace URL tokens with a <url> placeholder before vectorizing.
def tokenize(tweet):
    tknzr = TweetTokenizer()
    try:
        tweet = tweet.lower()
        tokens = tknzr.tokenize(tweet)
        # Replace every URL token with a single <url> placeholder
        tokens = map(lambda t: t if not t.startswith(
            'http') else '<url>', tokens)
        return tokens
    except:
        return 'NC'
count_vec = CountVectorizer(
    decode_error='ignore', tokenizer=tokenize, stop_words=stopwords.words("english"))
X_train_count = count_vec.fit_transform(X_train)
X_test_count = count_vec.transform(X_test)
tfidf_trans = TfidfTransformer()
X_train_tfidf = tfidf_trans.fit_transform(X_train_count)
X_test_tfidf = tfidf_trans.transform(X_test_count)
pca = PCA()
X_train_tfidf_rd = pca.fit_transform(X_train_tfidf.toarray())
X_test_tfidf_rd = pca.transform(X_test_tfidf.toarray())
lr = LogisticRegressionCV()
lr.fit(X_train_tfidf_rd, y_train)
logls = (log_loss(y_test, lr.predict_proba(X_test_tfidf_rd), labels=[0, 1]))
accuracy = sum(lr.predict(X_test_tfidf_rd) == y_test) * \
    1.0 / X_test_tfidf_rd.shape[0]
print('Accuracy is ' + str(accuracy) + ', log loss is ' + str(logls))
Accuracy is 0.943562610229, log loss is 0.15680548436
SVM with linear kernel
from sklearn.svm import SVC
clf = SVC(kernel='linear', probability=True)
clf.fit(X_train_tfidf_rd, y_train)
logls = (log_loss(y_test, clf.predict_proba(X_test_tfidf_rd), labels=[0, 1]))
accuracy = sum(clf.predict(X_test_tfidf_rd) == y_test) * \
    1.0 / X_test_tfidf_rd.shape[0]
print('Accuracy is ' + str(accuracy) + ', log loss is ' + str(logls))
Accuracy is 0.9482657260435038, log loss is 0.169479085788
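The accuracy above is computed by hand; sklearn's metrics give the same number plus per-class detail (a small sketch using the fitted SVM from above):
from sklearn.metrics import accuracy_score, classification_report

y_pred = clf.predict(X_test_tfidf_rd)
print(accuracy_score(y_test, y_pred))   # matches the manual accuracy computation
print(classification_report(y_test, y_pred,
                            target_names=['HillaryClinton', 'realDonaldTrump']))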
2-layer LSTM with dropout
# Import for LSTM
import pandas as pd
import numpy as np
import tensorflow as tf
from nltk.tokenize import TweetTokenizer
from tensorflow.contrib import rnn
from collections import Counter
import tflearn
import gensim
%matplotlib inline
train_data = pd.read_csv("./train.csv")
X_train = train_data["tweet"]
y_train = train_data.handle.map({"HillaryClinton": np.array(
    [1.0, 0]), "realDonaldTrump": np.array([0, 1.0])})
y_train = np.array(y_train.tolist(), dtype=float)
test_data = pd.read_csv('./new_test.csv')
X_test = test_data["tweet"]
y_test = test_data.handle.map({"HillaryClinton": np.array(
    [1.0, 0]), "realDonaldTrump": np.array([0, 1.0])})
y_test = np.array(y_test.tolist(), dtype=float)
def tokenize(tknzr, tweet):
    try:
        tweet = tweet.lower()
        tokens = tknzr.tokenize(tweet)
        # Replace every URL token with a single <url> placeholder
        tokens = map(lambda t: t if not t.startswith(
            'http') else '<url>', tokens)
        # tokens = filter(lambda t: t not in stopwords.words('english'), tokens)
        return tokens
    except:
        return 'NC'
tknzr = TweetTokenizer()
train_count = 0
X_train_tk = []
for i in range(X_train.shape[0]):
    token_tmp = tokenize(tknzr, X_train[i])
    tmp = []
    for word in token_tmp:
        if word[0] == '#' and word != '#':
            # Split hashtags into a <hashtag> marker plus the tag text
            tmp.append('<hashtag>')
            tmp.append(word[1:].lower())
        elif word[0] == '@' and word != '@':
            # Split mentions into a <user> marker plus the handle
            tmp.append('<user>')
            tmp.append(word[1:].lower())
        elif word[0].isdigit():
            tmp.append('<number>')
        else:
            tmp.append(word)
    train_count = max(train_count, len(tmp))
    X_train_tk.append(tmp)
print(X_train[20])
print(X_train_tk[20])
"You can go to https://t.co/tTgeqxNqYm to make sure you are registered. And I hope you all will.” —Hillary #NationalVoterRegistrationDay [u'"', u'you', u'can', u'go', u'to', '<url>', u'to', u'make', u'sure', u'you', u'are', u'registered', u'.', u'and', u'i', u'hope', u'you', u'all', u'will', u'.', u'\u201d', u'\u2014', u'hillary', '<hashtag>', u'nationalvoterregistrationday']
test_count = 0
X_test_tk = []
for i in range(X_test.shape[0]):
    token_tmp = tokenize(tknzr, X_test[i])
    tmp = []
    for word in token_tmp:
        if word[0] == '#' and word != '#':
            tmp.append('<hashtag>')
            tmp.append(word[1:].lower())
        elif word[0] == '@' and word != '@':
            tmp.append('<user>')
            tmp.append(word[1:].lower())
        elif word[0].isdigit():
            tmp.append('<number>')
        else:
            tmp.append(word)
    test_count = max(test_count, len(tmp))
    X_test_tk.append(tmp)
# Duplicate the last 66 test tweets so the test set size (1701 + 66 = 1767) is divisible by the batch size of 93
X_test_tk = X_test_tk + X_test_tk[-66:]
model = gensim.models.Word2Vec(
    X_train_tk + X_test_tk, min_count=1, size=25, iter=8)
# model = {}
# glove_data = 'data/glove.twitter.27B.25d.txt'
# f = open(glove_data)
# for line in f:
#     values = line.split()
#     word = values[0]
#     value = np.asarray(values[1:], dtype='float32')
#     model[word] = value
# f.close()
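A quick sanity check on the gensim Word2Vec model trained above (a hypothetical probe; '<url>' is guaranteed to be in the vocabulary because of the URL replacement during tokenization):
print(model['<url>'].shape)                 # (25,): one 25-dimensional vector per token
print(model.most_similar('<url>', topn=3))  # nearest tokens in the embedding space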
vocab_list = []
for word_list in X_train_tk:
    vocab_list += word_list
count = Counter(vocab_list)
vocab_dict = dict()
embedding_matrix = np.empty((0, 25), float)
for word in count:
    # Assign each training-vocabulary word an index and stack its word2vec vector
    vocab_dict[word] = len(vocab_dict)
    embedding_matrix = np.vstack((embedding_matrix, model[word]))
# Map each training tweet to a sequence of vocabulary indices and record its length
X_train_vec = []
X_train_length = []
for i in range(len(X_train_tk)):
    tmp = []
    for word in X_train_tk[i]:
        tmp.append(vocab_dict[word])
    tmp = np.array(tmp)
    X_train_vec.append(tmp)
    X_train_length.append(len(tmp))
X_train_vec = np.array(X_train_vec)
X_train_length = np.array(X_train_length)
X_test_vec = []
X_test_length = []
for i in range(len(X_test_tk)):
    tmp = []
    for word in X_test_tk[i]:
        # Skip words that never appear in the training vocabulary
        if word in vocab_dict:
            tmp.append(vocab_dict[word])
    tmp = np.array(tmp)
    X_test_vec.append(tmp)
    X_test_length.append(len(tmp))
X_test_vec = np.array(X_test_vec)
X_test_length = np.array(X_test_length)
X_train_pad = tflearn.data_utils.pad_sequences(X_train_vec)
X_test_pad = tflearn.data_utils.pad_sequences(
    X_test_vec, maxlen=X_train_pad.shape[1])
# Training Parameters
training_steps = 20
batch_size = 93
display_step = 50
embed_size = 25
# Network Parameters
num_input = 1
time_step = X_train_pad.shape[1]
num_hidden = 20
num_classes = 2
# tf Graph input
X = tf.placeholder(tf.int32, [None, time_step])
X_length = tf.placeholder(tf.int32, [None])
embedding = tf.Variable(embedding_matrix)
Y = tf.placeholder(tf.float16, [None, num_classes])
# Define weights
weights = {
    'out': tf.Variable(tf.random_normal([num_hidden, num_classes]))
}
biases = {
    'out': tf.Variable(tf.random_normal([num_classes]))
}
def RNN(x, x_length, weights, biases):
    batch_size_tmp = tf.shape(x)[0]
    # Trainable embedding table, looked up row by row for each example in the batch
    embedding = tf.get_variable('embedding', [len(vocab_dict), embed_size])
    embed = [tf.nn.embedding_lookup(embedding, row)
             for row in tf.split(x, batch_size)]
    embed = tf.reshape(embed, (batch_size_tmp, time_step, embed_size))
    # static_rnn expects a list of time_step tensors of shape (batch, embed_size)
    embed = tf.unstack(embed, time_step, 1)
    lstm_cell = rnn.BasicLSTMCell(num_hidden)
    cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=0.5)
    cell = rnn.MultiRNNCell([cell] * 1)
    outputs, states = rnn.static_rnn(
        cell, dtype=tf.float32, sequence_length=x_length, inputs=embed)
    # Stack the per-step outputs into (batch, time_step, num_hidden) and pick,
    # for each sequence, the output at its last non-padded time step
    outputs = tf.stack(outputs)
    outputs = tf.transpose(outputs, [1, 0, 2])
    index = tf.range(0, batch_size_tmp) * \
        X_train_pad.shape[1] + tf.reshape(x_length - 1, [batch_size_tmp])
    outputs = tf.gather(tf.reshape(outputs, [-1, num_hidden]), index)
    return tf.matmul(outputs, weights['out']) + biases['out']
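To make the last-output selection concrete, a tiny worked example with made-up numbers: row i * time_step + (length_i - 1) of the flattened outputs is the last non-padded step of sequence i.
# Hypothetical batch of 2 sequences, padded length 4, true lengths [2, 4]
lengths = np.array([2, 4])
padded_len = 4
index = np.arange(2) * padded_len + (lengths - 1)
print(index)   # [1 7]: rows 1 and 7 of the flattened (2 * 4, num_hidden) outputs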
logits = RNN(X, X_length, weights, biases)
prediction = tf.nn.softmax(logits)
tf.summary.histogram('logits', logits)
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer()
train_op = optimizer.minimize(loss_op)
tf.summary.scalar('loss', loss_op)
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
tf.summary.scalar('accuracy', accuracy)
init = tf.global_variables_initializer()
merged_summary = tf.summary.merge_all()
writer = tf.summary.FileWriter('./log/hw5')
/home/wjh/.local/lib/python2.7/site-packages/tensorflow/python/ops/gradients_impl.py:96: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory. "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
# Start training
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
predict_all = []
with tf.Session(config=config) as sess:
    writer.add_graph(sess.graph)
    # Run the initializer
    sess.run(init)
    for step in range(1, training_steps + 1):
        # Shuffle the training set and run one epoch of mini-batch updates
        indexes = list(range(X_train_pad.shape[0]))
        np.random.shuffle(indexes)
        for i in range(1, X_train_pad.shape[0] // batch_size + 1):
            batch_x, batch_y = X_train_pad[indexes[batch_size * (
                i - 1):batch_size * i]], y_train[indexes[batch_size * (i - 1):batch_size * i]]
            batch_x_length = X_train_length[indexes[batch_size *
                                                    (i - 1):batch_size * i]]
            batch_x_length = batch_x_length.reshape((-1))
            summary, _ = sess.run([merged_summary, train_op], feed_dict={
                X: batch_x, X_length: batch_x_length, Y: batch_y})
            writer.add_summary(summary, step)
        # Evaluate loss and accuracy over the training set after each epoch
        loss = []
        acc = []
        for i in range(1, X_train_pad.shape[0] // batch_size + 1):
            batch_x, batch_y = X_train_pad[batch_size * (
                i - 1):batch_size * i], y_train[batch_size * (i - 1):batch_size * i]
            batch_x_length = X_train_length[batch_size *
                                            (i - 1):batch_size * i]
            batch_x_length = batch_x_length.reshape((-1))
            loss_tmp, acc_tmp = sess.run([loss_op, accuracy], feed_dict={X: batch_x, X_length: batch_x_length,
                                                                         Y: batch_y})
            loss.append(loss_tmp)
            acc.append(acc_tmp)
        print("Step " + str(step) + ", Minibatch Loss= " +
              "{:.4f}".format(np.mean(loss[:1701])) + ", Training Accuracy= " +
              "{:.3f}".format(np.mean(acc[:1701])))
    # Collect class probabilities for the (padded) test set
    predict_all = np.empty((0, 2), float)
    for i in range(1, X_test_pad.shape[0] // batch_size + 1):
        batch_x, batch_y = X_test_pad[batch_size * (
            i - 1):batch_size * i], y_test[batch_size * (i - 1):batch_size * i]
        batch_x_length = X_test_length[batch_size * (i - 1):batch_size * i]
        batch_x_length = batch_x_length.reshape((-1))
        predict = sess.run(prediction, feed_dict={
            X: batch_x, X_length: batch_x_length, Y: batch_y})
        predict_all = np.vstack((predict_all, np.array(predict)))
    print("Optimization Finished!")
Step 1, Minibatch Loss= 0.2469, Training Accuracy= 0.934
Step 2, Minibatch Loss= 0.0680, Training Accuracy= 0.985
Step 3, Minibatch Loss= 0.0272, Training Accuracy= 0.995
Step 4, Minibatch Loss= 0.0169, Training Accuracy= 0.997
Step 5, Minibatch Loss= 0.0102, Training Accuracy= 0.999
Step 6, Minibatch Loss= 0.0065, Training Accuracy= 0.999
Step 7, Minibatch Loss= 0.0049, Training Accuracy= 0.999
Step 8, Minibatch Loss= 0.0147, Training Accuracy= 0.996
Step 9, Minibatch Loss= 0.0061, Training Accuracy= 0.999
Step 10, Minibatch Loss= 0.0038, Training Accuracy= 0.999
Step 11, Minibatch Loss= 0.0028, Training Accuracy= 0.999
Step 12, Minibatch Loss= 0.0024, Training Accuracy= 0.999
Step 13, Minibatch Loss= 0.0021, Training Accuracy= 1.000
Step 14, Minibatch Loss= 0.0022, Training Accuracy= 1.000
Step 15, Minibatch Loss= 0.0017, Training Accuracy= 1.000
Step 16, Minibatch Loss= 0.0018, Training Accuracy= 0.999
Step 17, Minibatch Loss= 0.0014, Training Accuracy= 1.000
Step 18, Minibatch Loss= 0.0013, Training Accuracy= 1.000
Step 19, Minibatch Loss= 0.0013, Training Accuracy= 1.000
Step 20, Minibatch Loss= 0.0013, Training Accuracy= 1.000
Optimization Finished!
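Before writing the submission file, a quick evaluation sketch for comparison with the earlier models (an addition; it assumes the first 1701 rows of predict_all line up with the original, unpadded test tweets):
from sklearn.metrics import accuracy_score, log_loss

probs = predict_all[:1701]                 # drop the 66 duplicated padding tweets
true_labels = np.argmax(y_test, axis=1)    # one-hot rows back to 0/1 class labels
pred_labels = np.argmax(probs, axis=1)
print('Accuracy is ' + str(accuracy_score(true_labels, pred_labels)) +
      ', log loss is ' + str(log_loss(true_labels, probs, labels=[0, 1])))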
with open('res.csv', 'w') as f:
    f.write('id,realDonaldTrump,HillaryClinton\n')
    for i in range(1701):
        # predict_all[:, 1] is P(realDonaldTrump), predict_all[:, 0] is P(HillaryClinton)
        f.write('{},{:0.6f},{:0.6f}\n'.format(
            i, predict_all[i, 1], predict_all[i, 0]))