We build a simple neural network on the IMDB dataset that ships with Keras. IMDB is short for the Internet Movie Database; the dataset contains 50,000 movie reviews, split evenly into 25,000 training samples and 25,000 test samples.
import numpy as np
from keras.datasets import imdb

# Keep only the 10,000 most frequent words; rarer words are dropped.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
np.shape(train_data)
train_labels[1]

# Map word indices back to words to inspect a review. Indices are offset by 3
# because 0, 1 and 2 are reserved for "padding", "start of sequence" and "unknown".
word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
decoded_review
Convert each integer sequence into a binary vector using one-hot (strictly speaking, multi-hot) encoding: build a 10,000-dimensional vector with a 1 at every index whose word appears in the review and 0 everywhere else.
def vectorize_sequence(sequences, dimension=10000):
    # One row per review; set the columns of the words that occur to 1.
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results
x_train = vectorize_sequence(train_data)
x_test = vectorize_sequence(test_data)
x_train[0]
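As a quick sanity check, encode a toy "review" containing word indices 1, 3 and 5 (an illustrative snippet; the demo variable is not part of the original walkthrough):
demo = vectorize_sequence([[1, 3, 5]])  # illustrative toy input
print(demo.shape)           # (1, 10000)
print(demo[0, [1, 3, 5]])   # [1. 1. 1.]
print(demo[0].sum())        # 3.0 -- only those three positions are set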
# The labels are already 0/1; just cast them to float vectors.
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')
y_test
from keras import models
from keras import layers
model = models.Sequential()  ## the sequential model covers most simple architectures
model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))  ## 16 units; only the first layer needs input_shape
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))  ### binary classification, so the last layer is a single sigmoid unit
## Binary classification with 0/1 output, so the loss is 'binary_crossentropy'.
## For multi-class problems it would be 'categorical_crossentropy'.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
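The parameter counts that summary() prints can be verified by hand: a Dense layer has (inputs × units + units) parameters.
# first hidden layer: 10000 * 16 + 16 = 160,016 parameters
# second hidden layer:    16 * 16 + 16 =     272 parameters
# output layer:           16 *  1 +  1 =      17 parameters
# total:                                  160,305 parameters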
# Hold out the first 10,000 training samples as a validation set.
x_val = x_train[:10000]
partial_x_train = x_train[10000:]
y_val = y_train[:10000]
partial_y_train = y_train[10000:]
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val))
history_dict = history.history
history_dict.keys()
import matplotlib.pyplot as plt
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = range(1,len(loss_values)+1)
plt.plot(epochs,loss_values,'bo',label = "Training loss")
plt.plot(epochs, val_loss_values,'b',label = "Validation loss")
plt.title('Training and Validation loss')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()
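The same kind of plot works for accuracy (a sketch; depending on the Keras version the history keys are 'accuracy'/'val_accuracy' or, on older versions, 'acc'/'val_acc'):
plt.clf()  # clear the previous figure
acc_values = history_dict['accuracy']          # 'acc' on older Keras versions
val_acc_values = history_dict['val_accuracy']  # 'val_acc' on older Keras versions
plt.plot(epochs, acc_values, 'bo', label='Training accuracy')
plt.plot(epochs, val_acc_values, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()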
The validation loss bottoms out around epoch 4; after that the training loss keeps falling well below the validation loss, which means the model is overfitting. So we fix epochs = 4 and train a fresh model on the full training set.
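Instead of reading the epoch off the plot, one could let Keras stop automatically with the EarlyStopping callback (a minimal sketch, assuming a Keras version recent enough to support restore_best_weights; the retraining below keeps the manual approach):
from keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss',         # watch the validation loss
                           patience=2,                 # tolerate 2 epochs without improvement
                           restore_best_weights=True)  # roll back to the best epoch
# model.fit(partial_x_train, partial_y_train, epochs=20, batch_size=512,
#           validation_data=(x_val, y_val), callbacks=[early_stop])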
model = models.Sequential()
model.add(layers.Dense(16,activation='relu',input_shape = (10000,)))
model.add(layers.Dense(16,activation='relu'))
model.add(layers.Dense(1,activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(x_train,y_train,epochs = 4,batch_size=512)
results = model.evaluate(x_test, y_test)
results  # [test loss, test accuracy]
model.predict(x_test)
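predict returns the sigmoid probability that each review is positive; to get hard 0/1 labels, threshold at 0.5 (an illustrative snippet; probs and pred_labels are hypothetical names):
probs = model.predict(x_test)                # shape (25000, 1), values in (0, 1)
pred_labels = (probs > 0.5).astype('int32')  # 1 = positive review, 0 = negative
pred_labels[:10]                             # the first ten predicted classes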