HW5 Hillary or Donald?

Jiahui Wei, Yunzheng Zhang, Yusi Xiang

Classify tweets as written by Hillary Clinton or by Donald Trump, using three approaches:

  1. Logistic Regression
  2. SVM
  3. LSTM
In [1]:
# Import for Logistic Regression and SVM
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import PCA
from nltk.tokenize import TweetTokenizer, word_tokenize
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import log_loss
%matplotlib inline
In [2]:
# Read data
train_data = pd.read_csv("./train.csv")
train_data["label_num"] = train_data.handle.map({
    "HillaryClinton": 0,
    "realDonaldTrump": 1
})

test_data = pd.read_csv('./new_test.csv')
test_data["label_num"] = test_data.handle.map({
    "HillaryClinton": 0,
    "realDonaldTrump": 1
})

X_train, y_train = train_data["tweet"], train_data["label_num"]

X_test, y_test = test_data["tweet"], test_data["label_num"]

1. Logistic Regression

First attempt: logistic regression on TF-IDF-weighted word counts from the default CountVectorizer tokenizer, with PCA applied to the densified TF-IDF matrix.

In [3]:
# Bag-of-words counts with the default tokenizer; English stop words removed
count_vec = CountVectorizer(
    decode_error='ignore', stop_words=stopwords.words("english"))
X_train_count = count_vec.fit_transform(X_train)
X_test_count = count_vec.transform(X_test)
In [4]:
tfidf_trans = TfidfTransformer()
X_train_tfidf = tfidf_trans.fit_transform(X_train_count)
X_test_tfidf = tfidf_trans.transform(X_test_count)
In [5]:
# PCA (all components kept); it needs a dense array, hence .toarray() below
pca = PCA()
X_train_tfidf_rd = pca.fit_transform(X_train_tfidf.toarray())
X_test_tfidf_rd = pca.transform(X_test_tfidf.toarray())
In [6]:
lr = LogisticRegressionCV()
lr.fit(X_train_tfidf_rd, y_train)
logls = (log_loss(y_test, lr.predict_proba(X_test_tfidf_rd), labels=[0, 1]))
accuracy = sum(lr.predict(X_test_tfidf_rd) == y_test) * \
    1.0 / X_test_tfidf_rd.shape[0]

print 'Accuracy is ' + str(accuracy) + ', log loss is ' + str(logls)
Accuracy is 0.914756025867, log loss is 0.234298274729
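
Converting the sparse TF-IDF matrix to a dense array for PCA is feasible here but does not scale to larger vocabularies; TruncatedSVD operates on the sparse matrix directly. A minimal sketch (not used for the results in this notebook; n_components=300 is an arbitrary illustrative choice):

In [ ]:
from sklearn.decomposition import TruncatedSVD

# Reduce the sparse TF-IDF features without densifying them first
svd = TruncatedSVD(n_components=300)
X_train_tfidf_svd = svd.fit_transform(X_train_tfidf)
X_test_tfidf_svd = svd.transform(X_test_tfidf)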

Use a better tokenizer. The default CountVectorizer tokenizer breaks URLs into meaningless fragments ('https', 'co', ...), while NLTK's TweetTokenizer keeps URLs and punctuation intact:

In [7]:
built_in_tokenizer = count_vec.build_tokenizer()
tokens = built_in_tokenizer(X_train[0])
print (tokens)
['The', 'question', 'in', 'this', 'election', 'Who', 'can', 'put', 'the', 'plans', 'into', 'action', 'that', 'will', 'make', 'your', 'life', 'better', 'https', 'co', 'XreEY9OicG']
In [8]:
tknzr = TweetTokenizer()
tokens = tknzr.tokenize(X_train[0])
print (tokens)
[u'The', u'question', u'in', u'this', u'election', u':', u'Who', u'can', u'put', u'the', u'plans', u'into', u'action', u'that', u'will', u'make', u'your', u'life', u'better', u'?', u'https://t.co/XreEY9OicG']

Use Twitter-specific tricks: lowercase every tweet and collapse each URL into a single <url> token before vectorizing.

In [9]:
def tokenize(tweet):
    # Lowercase, tokenize with TweetTokenizer, and replace every URL
    # token with a <url> placeholder
    tknzr = TweetTokenizer()
    try:
        tweet = tweet.lower()
        tokens = tknzr.tokenize(tweet)
        tokens = map(lambda t: t if not t.startswith(
            'http') else '<url>', tokens)
        return tokens
    except:
        # Fallback for malformed rows
        return 'NC'
In [10]:
count_vec = CountVectorizer(
    decode_error='ignore', tokenizer=tokenize, stop_words=stopwords.words("english"))
X_train_count = count_vec.fit_transform(X_train)
X_test_count = count_vec.transform(X_test)
In [11]:
tfidf_trans = TfidfTransformer()
X_train_tfidf = tfidf_trans.fit_transform(X_train_count)
X_test_tfidf = tfidf_trans.transform(X_test_count)
In [12]:
pca = PCA()
X_train_tfidf_rd = pca.fit_transform(X_train_tfidf.toarray())
X_test_tfidf_rd = pca.transform(X_test_tfidf.toarray())
In [13]:
lr = LogisticRegressionCV()
lr.fit(X_train_tfidf_rd, y_train)
logls = (log_loss(y_test, lr.predict_proba(X_test_tfidf_rd), labels=[0, 1]))
accuracy = sum(lr.predict(X_test_tfidf_rd) == y_test) * \
    1.0 / X_test_tfidf_rd.shape[0]

print 'Accuracy is ' + str(accuracy) + ', log loss is ' + str(logls)
Accuracy is 0.943562610229, log loss is 0.15680548436

2. SVM

An SVM with a linear kernel, trained on the same TweetTokenizer + TF-IDF + PCA features as the second logistic regression model.

In [34]:
from sklearn.svm import SVC
clf = SVC(kernel='linear', probability=True)
clf.fit(X_train_tfidf_rd, y_train)
logls = (log_loss(y_test, clf.predict_proba(X_test_tfidf_rd), labels=[0, 1]))
accuracy = sum(clf.predict(X_test_tfidf_rd) == y_test) * \
    1.0 / X_test_tfidf_rd.shape[0]

print 'Accuracy is ' + str(accuracy) + ', log loss is ' + str(logls)
Accuracy is 0.9482657260435038, log loss is 0.169479085788
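
The penalty parameter C is left at its default above. A small grid search could be used to tune it (a sketch only, assuming a scikit-learn version that provides sklearn.model_selection; it was not run for the results reported here):

In [ ]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Try a few values of C for the linear-kernel SVM on the same features
param_grid = {'C': [0.1, 1, 10]}
grid = GridSearchCV(SVC(kernel='linear', probability=True),
                    param_grid, scoring='neg_log_loss', cv=3)
grid.fit(X_train_tfidf_rd, y_train)
print grid.best_params_, grid.best_score_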

3. LSTM

An LSTM with dropout on the cell outputs: tweets are tokenized, mapped to word indices, embedded, and the output at the last non-padded time step is fed to a softmax layer.

In [14]:
# Import for LSTM
import pandas as pd
import numpy as np
import tensorflow as tf
from nltk.tokenize import TweetTokenizer
from tensorflow.contrib import rnn
from collections import Counter
import tflearn
import gensim
%matplotlib inline
In [15]:
train_data = pd.read_csv("./train.csv")
X_train = train_data["tweet"]
# One-hot labels: HillaryClinton -> [1, 0], realDonaldTrump -> [0, 1]
y_train = train_data.handle.map({"HillaryClinton": np.array(
    [1.0, 0]), "realDonaldTrump": np.array([0, 1.0])})
y_train = np.array(y_train.tolist(), dtype=float)

test_data = pd.read_csv('./new_test.csv')
X_test = test_data["tweet"]
y_test = test_data.handle.map({"HillaryClinton": np.array(
    [1.0, 0]), "realDonaldTrump": np.array([0, 1.0])})
y_test = np.array(y_test.tolist(), dtype=float)
In [16]:
def tokenize(tknzr, tweet):
    try:
        tweet = tweet.lower()
        tokens = tknzr.tokenize(tweet)
        tokens = map(lambda t: t if not t.startswith(
            'http') else '<url>', tokens)
#         tokens = filter(lambda t: t not in stopwords.words('english'),tokens)
        return tokens
    except:
        return 'NC'
In [17]:
tknzr = TweetTokenizer()
train_count = 0   # length of the longest tokenized training tweet
X_train_tk = []
for i in range(X_train.shape[0]):
    token_tmp = tokenize(tknzr, X_train[i])
    tmp = []
    for word in token_tmp:
        # Hashtags and mentions become a marker token (<hashtag> / <user>)
        # followed by the bare text; tokens starting with a digit become <number>
        if word[0] == '#' and word != '#':
            tmp.append('<hashtag>')
            tmp.append(word[1:].lower())
        elif word[0] == '@' and word != '@':
            tmp.append('<user>')
            tmp.append(word[1:].lower())
        elif word[0].isdigit():
            tmp.append('<number>')
        else:
            tmp.append(word)

    train_count = max(train_count, len(tmp))
    X_train_tk.append(tmp)
In [18]:
print X_train[20]
print X_train_tk[20]
"You can go to https://t.co/tTgeqxNqYm to make sure you are registered. And I hope you all will.” —Hillary #NationalVoterRegistrationDay
[u'"', u'you', u'can', u'go', u'to', '<url>', u'to', u'make', u'sure', u'you', u'are', u'registered', u'.', u'and', u'i', u'hope', u'you', u'all', u'will', u'.', u'\u201d', u'\u2014', u'hillary', '<hashtag>', u'nationalvoterregistrationday']
In [19]:
test_count = 0
X_test_tk = []
for i in range(X_test.shape[0]):
    token_tmp = tokenize(tknzr, X_test[i])
    tmp = []
    for word in token_tmp:
        if word[0] == '#' and word != '#':
            tmp.append('<hashtag>')
            tmp.append(word[1:].lower())
        elif word[0] == '@' and word != '@':
            tmp.append('<user>')
            tmp.append(word[1:].lower())
        elif word[0].isdigit():
            tmp.append('<number>')
        else:
            tmp.append(word)

    test_count = max(test_count, len(tmp))
    X_test_tk.append(tmp)

# Duplicate the last 66 tweets so the test-set size (1701) becomes a multiple
# of the batch size (93) used below: 1701 + 66 = 19 * 93
X_test_tk = X_test_tk + X_test_tk[-66:]
In [20]:
# Train 25-dimensional word2vec embeddings on all tokenized tweets (train + test)
model = gensim.models.Word2Vec(
    X_train_tk + X_test_tk, min_count=1, size=25, iter=8)
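
A quick sanity check on the learned vectors (a sketch, assuming a gensim version where the model object itself supports item lookup and most_similar, as the cells below already do with model[word]):

In [ ]:
# Nearest neighbours of a few tokens in the embedding space
for probe in ['<url>', '<hashtag>', 'hillary']:
    if probe in model:
        print probe, [w for w, _ in model.most_similar(probe, topn=5)]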
In [3]:
# model = {}
# glove_data = 'data/glove.twitter.27B.25d.txt'
# f = open(glove_data)
# for line in f:
#     values = line.split()
#     word = values[0]
#     value = np.asarray(values[1:], dtype='float32')
#     model[word] = value
# f.close()
In [23]:
vocab_list = []
for word_list in X_train_tk:
    vocab_list += word_list
count = Counter(vocab_list)

# Assign each training-vocabulary word an integer id; row i of the embedding
# matrix holds the word2vec vector of the word with id i
vocab_dict = dict()
embedding_matrix = np.empty((0, 25), float)

for word in count:
    vocab_dict[word] = len(vocab_dict)
    embedding_matrix = np.vstack((embedding_matrix, model[word]))
In [24]:
# Map each tweet to a sequence of vocabulary ids, keeping its true length
X_train_vec = []
X_train_length = []
for i in range(len(X_train_tk)):
    tmp = []
    for word in X_train_tk[i]:
        tmp.append(vocab_dict[word])
    tmp = np.array(tmp)
    X_train_vec.append(tmp)
    X_train_length.append(len(tmp))
X_train_vec = np.array(X_train_vec)
X_train_length = np.array(X_train_length)

# Test tweets use the same ids; words outside the training vocabulary are dropped
X_test_vec = []
X_test_length = []
for i in range(len(X_test_tk)):
    tmp = []
    for word in X_test_tk[i]:
        if word in vocab_dict:
            tmp.append(vocab_dict[word])
    tmp = np.array(tmp)
    X_test_vec.append(tmp)
    X_test_length.append(len(tmp))
X_test_vec = np.array(X_test_vec)
X_test_length = np.array(X_test_length)
In [25]:
# Zero-pad every id sequence to the length of the longest training tweet;
# the unpadded lengths are fed to the RNN separately so padding can be ignored
X_train_pad = tflearn.data_utils.pad_sequences(X_train_vec)
X_test_pad = tflearn.data_utils.pad_sequences(
    X_test_vec, maxlen=X_train_pad.shape[1])
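
A toy illustration of the padding (assuming tflearn's pad_sequences defaults to padding with zeros at the end of each sequence, which is what the last-output indexing in the RNN below relies on):

In [ ]:
# Two short id sequences padded to a common length of 5
print tflearn.data_utils.pad_sequences([[3, 1, 4], [1, 5]], maxlen=5)
# expected (under the post-padding assumption):
# [[3 1 4 0 0]
#  [1 5 0 0 0]]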
In [26]:
# Training Parameters
training_steps = 20
batch_size = 93
display_step = 50
embed_size = 25

# Network Parameters
num_input = 1
time_step = X_train_pad.shape[1]
num_hidden = 20
num_classes = 2

# tf Graph input
X = tf.placeholder(tf.int32, [None, time_step])
X_length = tf.placeholder(tf.int32, [None])
# Pretrained word2vec rows (note: RNN() below creates its own trainable
# embedding with tf.get_variable, so this variable is not actually used)
embedding = tf.Variable(embedding_matrix)
Y = tf.placeholder(tf.float16, [None, num_classes])
In [27]:
# Define weights
weights = {
    'out': tf.Variable(tf.random_normal([num_hidden, num_classes]))
}
biases = {
    'out': tf.Variable(tf.random_normal([num_classes]))
}
In [28]:
def RNN(x, x_length, weights, biases):

    batch_size_tmp = tf.shape(x)[0]
    # Trainable embedding created here by get_variable (randomly initialized);
    # the lookup splits x into `batch_size` chunks, so every fed batch must
    # contain exactly that many tweets
    embedding = tf.get_variable('embedding', [len(vocab_dict), embed_size])
    embed = [tf.nn.embedding_lookup(embedding, row)
             for row in tf.split(x, batch_size)]
    embed = tf.reshape(embed, (batch_size_tmp, time_step, embed_size))
    embed = tf.unstack(embed, time_step, 1)

    # A single LSTM cell with dropout on its outputs (keep_prob is a constant,
    # so dropout stays active at evaluation time as well)
    lstm_cell = rnn.BasicLSTMCell(num_hidden)
    cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=0.5)
    cell = rnn.MultiRNNCell([cell] * 1)

    outputs, states = rnn.static_rnn(
        cell, dtype=tf.float32, sequence_length=x_length, inputs=embed)

    # outputs: list of [batch, hidden] tensors -> tensor of [batch, time, hidden]
    outputs = tf.stack(outputs)
    outputs = tf.transpose(outputs, [1, 0, 2])

    # Select the output at the last non-padded time step of each sequence
    index = tf.range(0, batch_size_tmp) * \
        X_train_pad.shape[1] + tf.reshape(x_length - 1, [batch_size_tmp])
    outputs = tf.gather(tf.reshape(outputs, [-1, num_hidden]), index)

    return tf.matmul(outputs, weights['out']) + biases['out']
In [29]:
logits = RNN(X, X_length, weights, biases)
prediction = tf.nn.softmax(logits)
tf.summary.histogram('logits', logits)

loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    logits=logits, labels=Y))

optimizer = tf.train.AdamOptimizer()
train_op = optimizer.minimize(loss_op)
tf.summary.scalar('loss', loss_op)

correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
tf.summary.scalar('accuracy', accuracy)

init = tf.global_variables_initializer()

merged_summary = tf.summary.merge_all()
writer = tf.summary.FileWriter('./log/hw5')
/home/wjh/.local/lib/python2.7/site-packages/tensorflow/python/ops/gradients_impl.py:96: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
In [30]:
# Start training
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

predict_all = []
with tf.Session(config=config) as sess:
    writer.add_graph(sess.graph)
    # Run the initializer
    sess.run(init)

    # One "step" is a full pass (epoch) over the shuffled training set
    for step in range(1, training_steps + 1):
        indexes = list(range(X_train_pad.shape[0]))
        np.random.shuffle(indexes)
        for i in range(1, X_train_pad.shape[0] // batch_size + 1):
            batch_x, batch_y = X_train_pad[indexes[batch_size * (
                i - 1):batch_size * i]], y_train[indexes[batch_size * (i - 1):batch_size * i]]
            batch_x_length = X_train_length[indexes[batch_size *
                                                    (i - 1):batch_size * i]]

            batch_x_length = batch_x_length.reshape((-1))
            summary, _ = sess.run([merged_summary, train_op], feed_dict={
                                  X: batch_x, X_length: batch_x_length, Y: batch_y})
            writer.add_summary(summary, step)

        # Evaluate loss and accuracy over the full training set
        loss = []
        acc = []
        for i in range(1, X_train_pad.shape[0] // batch_size + 1):
            batch_x, batch_y = X_train_pad[batch_size * (
                i - 1):batch_size * i], y_train[batch_size * (i - 1):batch_size * i]
            batch_x_length = X_train_length[batch_size *
                                            (i - 1):batch_size * i]

            batch_x_length = batch_x_length.reshape((-1))
            loss_tmp, acc_tmp = sess.run([loss_op, accuracy], feed_dict={X: batch_x, X_length: batch_x_length,
                                                                         Y: batch_y})
            loss.append(loss_tmp)
            acc.append(acc_tmp)

        print("Step " + str(step) + ", Minibatch Loss= " +
              "{:.4f}".format(np.mean(loss[:1701])) + ", Training Accuracy= " +
              "{:.3f}".format(np.mean(acc[:1701])))

        # Collect test-set probabilities (overwritten each epoch, so the final
        # predict_all holds the last epoch's predictions)
        predict_all = np.empty((0, 2), float)
        for i in range(1, X_test_pad.shape[0] // batch_size + 1):
            batch_x, batch_y = X_test_pad[batch_size * (
                i - 1):batch_size * i], y_test[batch_size * (i - 1):batch_size * i]
            batch_x_length = X_test_length[batch_size * (i - 1):batch_size * i]

            batch_x_length = batch_x_length.reshape((-1))
            predict = sess.run(prediction, feed_dict={
                               X: batch_x, X_length: batch_x_length, Y: batch_y})
            predict_all = np.vstack((predict_all, np.array(predict)))

    print("Optimization Finished!")
Step 1, Minibatch Loss= 0.2469, Training Accuracy= 0.934
Step 2, Minibatch Loss= 0.0680, Training Accuracy= 0.985
Step 3, Minibatch Loss= 0.0272, Training Accuracy= 0.995
Step 4, Minibatch Loss= 0.0169, Training Accuracy= 0.997
Step 5, Minibatch Loss= 0.0102, Training Accuracy= 0.999
Step 6, Minibatch Loss= 0.0065, Training Accuracy= 0.999
Step 7, Minibatch Loss= 0.0049, Training Accuracy= 0.999
Step 8, Minibatch Loss= 0.0147, Training Accuracy= 0.996
Step 9, Minibatch Loss= 0.0061, Training Accuracy= 0.999
Step 10, Minibatch Loss= 0.0038, Training Accuracy= 0.999
Step 11, Minibatch Loss= 0.0028, Training Accuracy= 0.999
Step 12, Minibatch Loss= 0.0024, Training Accuracy= 0.999
Step 13, Minibatch Loss= 0.0021, Training Accuracy= 1.000
Step 14, Minibatch Loss= 0.0022, Training Accuracy= 1.000
Step 15, Minibatch Loss= 0.0017, Training Accuracy= 1.000
Step 16, Minibatch Loss= 0.0018, Training Accuracy= 0.999
Step 17, Minibatch Loss= 0.0014, Training Accuracy= 1.000
Step 18, Minibatch Loss= 0.0013, Training Accuracy= 1.000
Step 19, Minibatch Loss= 0.0013, Training Accuracy= 1.000
Step 20, Minibatch Loss= 0.0013, Training Accuracy= 1.000
Optimization Finished!
In [ ]:
# Write the softmax probabilities for the 1701 original test tweets
# (column 1 of predict_all is the Trump probability, column 0 the Clinton one)
with open('res.csv', 'w') as f:
    f.write('id,realDonaldTrump,HillaryClinton\n')
    for i in range(1701):
        f.write('{},{:0.6f},{:0.6f}\n'.format(
            i, predict_all[i, 1], predict_all[i, 0]))
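
For comparison with the logistic regression and SVM results, the LSTM probabilities can be scored the same way. A sketch (it assumes predict_all holds the last epoch's probabilities and uses only the first 1701 rows, i.e. the original test tweets before the last 66 were duplicated to fill out the batches):

In [ ]:
from sklearn.metrics import log_loss

n_test = 1701  # number of original test tweets
pred = predict_all[:n_test]
true = np.argmax(y_test[:n_test], axis=1)  # 0 = Hillary, 1 = Trump

lstm_acc = np.mean(np.argmax(pred, axis=1) == true)
lstm_logls = log_loss(true, pred, labels=[0, 1])
print 'Accuracy is ' + str(lstm_acc) + ', log loss is ' + str(lstm_logls)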