import pandas as pd
import re
import numpy as np
import os
from gensim.models import word2vec
# Labelled training reviews (tab-separated).
data_t = pd.read_csv('labeledTrainData.tsv',sep='\t')
# Notebook output recorded here (presumably data_t.shape): (25000, 3)

# Obtain a word2vec model: reuse the cached 'mymodel' file when present,
# otherwise train one on the unlabelled reviews.
if not os.path.exists('mymodel'):
    if not os.path.exists('imdb_text'):
        # Build the plain-text training corpus once: one review per line,
        # lower-cased, keeping only runs of ASCII letters.
        data_un = pd.read_csv('unlabeledTrainData.tsv',header=0, delimiter="\t",quoting=3 )
        pat = re.compile(r'[A-Za-z]+')
        with open('imdb_text','a',encoding = 'utf-8') as f:
            # NOTE(review): the iterable was missing from the source line
            # ("for rev in"); `data_un.review` is assumed because the labelled
            # frame exposes a 'review' column -- confirm against the TSV header.
            for rev in data_un.review:
                words = [w.lower() for w in pat.findall(rev)]
                f.write(' '.join(words) + '\n')
        del data_un
    sentences = word2vec.Text8Corpus("imdb_text")  # load the corpus
    # 50-dimensional vectors, default window=5.
    # NOTE(review): the original comment called this a skip-gram model, but
    # gensim's documented default is sg=0 (CBOW); pass sg=1 if skip-gram was
    # intended. `size=` is the gensim < 4.0 spelling of `vector_size=`.
    model = word2vec.Word2Vec(sentences, size=50)
    # Persist the model so the `else` branch below can reuse it on later runs;
    # without this the existence check on 'mymodel' could never succeed.
    model.save('mymodel')
else:
    model = word2vec.Word2Vec.load('mymodel')
# Only the keyed vectors are needed from here on; drop the full model.
word_vectors = model.wv
del model
# Replace each review by the list of embedding vectors of its
# whitespace-separated tokens that are present in the word2vec vocabulary.
# NOTE(review): the left part of this call was missing from the source line;
# `data_t.review.apply(lambda ...)` is reconstructed from the surviving
# "x :[...])" fragment and the 'review' column deleted just below.
# NOTE(review): tokens here are neither lower-cased nor stripped of
# punctuation, unlike the corpus written to 'imdb_text' -- confirm whether
# that mismatch is intended.
data_t['vec'] = data_t.review.apply(lambda x :[word_vectors[w] for w in x.split() if w in word_vectors])
# The raw text and the embedding table are no longer needed; free them.
del data_t['review']
del word_vectors
import gc
gc.collect()
# Drop reviews for which no token was found in the vocabulary.
data_t = data_t[data_t['vec'].apply(lambda x:len(x)>0)]
data_t.sentiment.value_counts()
# Notebook output recorded for the statement above:
# 0    12499
# 1    12495
# Name: sentiment, dtype: int64
# Longest review, measured in in-vocabulary tokens.
maxlength = max(len(x) for x in data_t.vec)
maxlength
def pad(x, max_len=300, dim=50):
    """Return *x* truncated or zero-padded to a ``(max_len, dim)`` array.

    Args:
        x: Sequence of ``dim``-long vectors (list of arrays or a 2-D array).
        max_len: Number of rows in the result; extra rows of *x* are dropped.
        dim: Length of each vector.

    Returns:
        A new ``numpy.ndarray`` of shape ``(max_len, dim)``. Rows beyond
        ``len(x)`` are zero. The result is always an ndarray, including when
        *x* is longer than ``max_len`` or empty.
    """
    out = np.zeros((max_len, dim))
    n = min(len(x), max_len)
    if n:
        # Guarded because assigning an empty list into a (0, dim) slice
        # fails to broadcast.
        out[:n] = x[:n]
    return out
# Bring every review to the same fixed length by applying pad() to each
# list of word vectors.
data_t['vec'] = data_t.vec.apply(pad)
import tensorflow as tf
# --- Hyper-parameters ---
# Step size handed to the Adam optimizer.
learning_rate = 0.002
# Number of reviews per mini-batch.
batch_size = 100
# Size of one word vector (matches the word2vec `size=50` used above).
n_input = 50
# Time steps per review (matches the 300 rows produced by pad()).
n_steps = 300
# Number of units in the GRU cell.
n_hidden = 300
# Output classes (the sentiment label takes the values 0 and 1).
n_classes = 2
# --- Graph inputs ---
# Batch of padded reviews: [batch, n_steps, n_input].
x = tf.placeholder(tf.float32, [None, n_steps,n_input])
# Integer class label for each review in the batch.
y = tf.placeholder(tf.int64, [None])
# Keep probability fed to the dropout wrapper (0.5 while training, 1 for evaluation).
keep_prob = tf.placeholder("float")
def length(shuru):
    """Count the non-zero time steps of every sequence in a padded batch.

    A time step counts as used when at least one of its features is non-zero,
    so all-zero padding rows are not counted.

    Args:
        shuru: Rank-3 tensor indexed as [batch, step, feature]; in this file
            it is called with the placeholder shaped [None, n_steps, n_input].

    Returns:
        A tensor with one entry per batch element holding the number of
        non-zero steps, in the dtype of *shuru* (callers cast it to an
        integer type where needed).
    """
    # `axis` replaces the deprecated `reduction_indices` keyword.
    step_is_used = tf.sign(tf.reduce_max(tf.abs(shuru), axis=2))
    return tf.reduce_sum(step_is_used, axis=1)
# Single GRU layer with dropout applied to its outputs.
cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.GRUCell(n_hidden), output_keep_prob = keep_prob)
# Unroll the cell over the time axis; sequence_length gives dynamic_rnn the
# number of real (non-padding) steps of each review.
output, _ = tf.nn.dynamic_rnn(
    cell,
    x,
    dtype=tf.float32,
    sequence_length = length(x)
)
# Notebook output recorded here (presumably output.get_shape()):
# TensorShape([Dimension(None), Dimension(300), Dimension(300)])

# Select, for every review, the RNN output at its last real time step.
# `output` is flattened to [batch * n_steps, n_hidden], so row
# b * n_steps + (len_b - 1) is the last valid output of review b.
# The batch dimension is read from the tensor at run time rather than from the
# `batch_size` constant, so the graph also accepts batches of other sizes.
batch_rows = tf.shape(output)[0]
index = tf.range(0, batch_rows)*n_steps + (tf.cast(length(x),tf.int32) - 1)
flat = tf.reshape(output,[-1,int(output.get_shape()[2])])
last = tf.gather(flat,index)
# Linear read-out layer mapping the last GRU output to class logits.
weight = tf.Variable(tf.truncated_normal((n_hidden, n_classes), stddev=0.001))
bias = tf.Variable(tf.constant(0.1, shape=[n_classes]))
com_out = tf.matmul(last, weight) + bias
prediction = tf.nn.softmax(com_out)

# Mean sparse softmax cross-entropy over the batch.
per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=com_out)
cross_entropy = tf.reduce_mean(per_example_loss)

# Adam update with every available gradient clipped to an L2 norm of 5;
# variables without a gradient keep their None entry.
optimizer = tf.train.AdamOptimizer(learning_rate)
grads = [
    (grad if grad is None else tf.clip_by_norm(grad, 5), var)
    for grad, var in optimizer.compute_gradients(cross_entropy)
]
train_op = optimizer.apply_gradients(grads)

# Fraction of the batch whose arg-max prediction equals the label.
correct_pred = tf.equal(tf.argmax(prediction, 1), y)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
def generatebatch(X,Y,n_examples, batch_size):
    """Yield consecutive ``(X, Y)`` slices of ``batch_size`` items each.

    Only ``n_examples // batch_size`` full batches are produced; any
    remainder at the end of the data is skipped.
    """
    covered = (n_examples // batch_size) * batch_size
    for lo in range(0, covered, batch_size):
        hi = lo + batch_size
        # Emit one batch of inputs together with the matching labels.
        yield X[lo:hi], Y[lo:hi]
# Train for 10 epochs, reshuffling the data every epoch and writing a
# checkpoint after each one.
# NOTE(review): four statements in this section had lost their leading call
# (only ", feed_dict=...)" / ",'./lesson0',...)" remained). They are
# reconstructed as sess.run(train_op, ...), sess.run(accuracy, ...),
# sess.run(cross_entropy, ...) and saver.save(sess, ...); confirm against the
# original notebook.
sess = tf.Session()
init = tf.global_variables_initializer()
saver = tf.train.Saver()
# Variables must be initialised before the first training step; `init` was
# created but never run in the surviving source.
sess.run(init)
for step in range(10):
    # Fresh random order each epoch. This rebinds the module-level name
    # `index`; the graph already holds the earlier gather-index tensor, so the
    # model is unaffected.
    index = np.random.permutation(int(len(data_t.vec.values)))
    for batch_x,batch_y in generatebatch(data_t.vec.values[index],data_t.sentiment.values[index],len(data_t.vec.values),batch_size):
        # Stack the per-review arrays into one float32 tensor
        # (the original discarded the result of astype).
        batch_x = np.concatenate(batch_x).reshape(batch_size, n_steps, n_input).astype(np.float32)
        # One optimisation step with dropout enabled.
        sess.run(train_op, feed_dict={x: batch_x, y: batch_y,keep_prob: 0.5})
        # Evaluate accuracy and loss on the same batch with dropout disabled,
        # in a single forward pass.
        acc, loss = sess.run([accuracy, cross_entropy], feed_dict={x: batch_x, y: batch_y,keep_prob: 1})
    # One checkpoint per epoch, tagged with the epoch number.
    saver.save(sess,'./lesson0',global_step = step)
    # Reports the metrics of the last mini-batch of the epoch.
    print("Iter " + str(step) + ", Minibatch Loss= " + "{}".format(loss) + ", Training Accuracy= " + "{}".format(acc))
print("Optimization Finished!")
# Notebook output recorded from a run:
# Iter 0, Minibatch Loss= 0.3504045009613037, Training Accuracy= 0.8799999952316284
# Iter 1, Minibatch Loss= 0.2799288034439087, Training Accuracy= 0.8899999856948853
# Iter 2, Minibatch Loss= 0.25252586603164673, Training Accuracy= 0.8700000047683716
# Iter 3, Minibatch Loss= 0.2636661231517792, Training Accuracy= 0.9300000071525574