2019-07-05

keras seq2seq 详细注释版

seq2seq

'''
references:
    https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
    https://blog.csdn.net/PIPIXIU/article/details/81016974
dataset:
    http://www.manythings.org/anki/
'''


from tensorflow.python.keras.layers import Input, LSTM, Dense
from tensorflow.python.keras.models import Model
import pandas as pd
import numpy as np


# n_input: one-hot dimension of the input; n_output: one-hot dimension of the output
def create_model(n_input, n_output, n_units):
    """Build the seq2seq training model and the two inference models.

    Args:
        n_input: one-hot dimension of the input (encoder) sequence.
        n_output: one-hot dimension of the output (decoder) sequence.
        n_units: number of units in each LSTM layer.

    Returns:
        A tuple ``(model, encoder_infer, decoder_infer)``. ``model`` maps
        ``[encoder_input, decoder_input]`` to the decoder's softmax sequence
        and is the one to train. The two inference models reuse the very
        same layer objects, so they share the trained weights.
    """
    # ---------- Training graph (decoder uses teacher forcing) ----------
    enc_in = Input(shape=(None, n_input))
    # return_state=True makes the LSTM also return its last h and c states.
    enc_lstm = LSTM(n_units, return_state=True)
    # Only the final states are kept; they initialise the decoder.
    enc_result = enc_lstm(enc_in)
    enc_state = [enc_result[1], enc_result[2]]

    dec_in = Input(shape=(None, n_output))
    # Training optimises over the whole output sequence, hence return_sequences.
    dec_lstm = LSTM(n_units, return_sequences=True, return_state=True)
    # Sequence output of the decoder, last axis of size n_units.
    dec_seq = dec_lstm(dec_in, initial_state=enc_state)[0]
    # Fully-connected layer + softmax over the output vocabulary.
    softmax_head = Dense(n_output, activation='softmax')
    dec_probs = softmax_head(dec_seq)

    # Training model: inputs are the encoder and decoder inputs, output is
    # the decoder's per-step distribution.
    model = Model([enc_in, dec_in], dec_probs)

    # ---------- Inference graph (prediction) ----------
    # Same layers (weights and biases unchanged), rewired so the decoder can
    # be driven one step at a time with its own previous output instead of
    # the ground truth. The encoder side needs no change.
    encoder_infer = Model(enc_in, enc_state)

    # The decoder's initial state becomes an explicit model input: on the
    # first call it is the encoder state, afterwards the state returned by
    # the previous decoder call.
    state_h_in = Input(shape=(n_units,))
    state_c_in = Input(shape=(n_units,))
    state_in = [state_h_in, state_c_in]
    step_seq, step_h, step_c = dec_lstm(dec_in, initial_state=state_in)

    # The decoder must hand back its states together with the prediction so
    # the caller can feed them into the next call.
    step_state = [step_h, step_c]
    step_probs = softmax_head(step_seq)

    # Inputs: [decoder_input, h, c]; outputs: [softmax output, h, c].
    decoder_infer = Model([dec_in] + state_in, [step_probs] + step_state)

    # decoder_infer is not convenient to train directly because its inputs
    # are hard to supply; `model` is the one used for fitting.
    return model, encoder_infer, decoder_infer


# source: (samples, max_input_seq_len, features)
def predict_chinese(source, encoder_inference, decoder_inference, n_steps, features,
                    char_to_index=None, index_to_char=None):
    """Greedily decode one source sequence with the inference models.

    The trained model is only used to obtain the weights; prediction runs the
    encoder once and then calls the decoder one step at a time, feeding each
    predicted character back in as the next input.

    Args:
        source: one-hot encoded input; the decoding loop works on a batch of
            one, so callers pass a single sample (e.g. ``x[i:i + 1]``).
        encoder_inference: object whose ``predict(source)`` returns the
            ``[h, c]`` state pair.
        decoder_inference: object whose ``predict([seq, h, c])`` returns
            ``(yhat, h, c)``.
        n_steps: maximum number of characters to generate.
        features: one-hot dimension of the target characters.
        char_to_index: optional ``{char: index}`` mapping for the target
            vocabulary. Defaults to the module-level ``target_dict``.
        index_to_char: optional ``{index: char}`` mapping for the target
            vocabulary. Defaults to the module-level ``target_dict_reverse``.

    Returns:
        The decoded string. If the terminator ``'\\n'`` is predicted it is
        included as the last character.
    """
    # Fall back to the vocabularies built by the training script so existing
    # five-argument calls keep working.
    if char_to_index is None:
        char_to_index = target_dict
    if index_to_char is None:
        index_to_char = target_dict_reverse

    def one_hot(index):
        step = np.zeros((1, 1, features))
        step[0, 0, index] = 1
        return step

    # Encode once; list() so the state can be concatenated below even if
    # predict() hands back a tuple.
    state = list(encoder_inference.predict(source))
    # The first decoder input is the start marker '\t'.
    predict_seq = one_hot(char_to_index['\t'])

    chars = []
    for _ in range(n_steps):
        # Feed the previous character together with the h and c states.
        yhat, h, c = decoder_inference.predict([predict_seq] + state)

        # Sample 0, last time step; pick the most probable character.
        char_index = int(np.argmax(yhat[0, -1, :]))
        char = index_to_char[char_index]
        chars.append(char)

        # Stop as soon as the terminator is produced.
        if char == '\n':
            break

        # This step's state and output become the next step's inputs.
        state = [h, c]
        predict_seq = one_hot(char_index)
    return ''.join(chars)


NUM_SAMPLES = 2000  # upper bound on the number of sentence pairs to load
BATCH_SIZE = 64
EPOCH = 200
N_UNITS = 256


if __name__ == '__main__':

    # ---------- Prepare the data ----------
    data_path = 'cmn.txt'

    # The file is tab-separated. Only the first two columns (source sentence,
    # target sentence) are read, so exports that carry an extra attribution
    # column still load. At most NUM_SAMPLES rows are kept.
    df = pd.read_csv(data_path, sep='\t', header=None, usecols=[0, 1]).iloc[:NUM_SAMPLES, :]
    df.columns = ['inputs', 'targets']
    # Wrap every target in '\t' ... '\n' as start / end markers.
    df['targets'] = df['targets'].apply(lambda x: '\t' + x + '\n')

    # Sentence lists.
    input_texts = df['inputs'].tolist()
    target_texts = df['targets'].tolist()
    # Character vocabularies (sorted so indices are reproducible).
    input_characters = sorted(set(''.join(input_texts)))
    target_characters = sorted(set(''.join(target_texts)))

    # The file may hold fewer than NUM_SAMPLES rows; size everything by what
    # was actually read.
    n_samples = len(input_texts)

    INPUT_SEQ_LENGTH = max(len(text) for text in input_texts)
    OUTPUT_SEQ_LENGTH = max(len(text) for text in target_texts)
    # Feature dimension == one-hot dimension == number of distinct characters.
    INPUT_FEATURE_LENGTH = len(input_characters)
    OUTPUT_FEATURE_LENGTH = len(target_characters)

    # Sequences are padded to fixed length with all-zero rows.
    # (samples, max_input_seq_len, features)
    encoder_input = np.zeros((n_samples, INPUT_SEQ_LENGTH, INPUT_FEATURE_LENGTH))
    # (samples, max_out_seq_len, features)
    decoder_input = np.zeros((n_samples, OUTPUT_SEQ_LENGTH, OUTPUT_FEATURE_LENGTH))
    # (samples, max_out_seq_len, features)
    decoder_output = np.zeros((n_samples, OUTPUT_SEQ_LENGTH, OUTPUT_FEATURE_LENGTH))

    # {char: index}
    input_dict = {char: index for index, char in enumerate(input_characters)}
    # {index: char}
    input_dict_reverse = {index: char for index, char in enumerate(input_characters)}
    # {char: index} -- read by predict_chinese as a module-level name
    target_dict = {char: index for index, char in enumerate(target_characters)}
    # {index: char} -- read by predict_chinese as a module-level name
    target_dict_reverse = {index: char for index, char in enumerate(target_characters)}

    for seq_index, seq in enumerate(input_texts):
        for char_index, char in enumerate(seq):
            encoder_input[seq_index, char_index, input_dict[char]] = 1

    for seq_index, seq in enumerate(target_texts):
        for char_index, char in enumerate(seq):
            decoder_input[seq_index, char_index, target_dict[char]] = 1.0
            # The expected output is the decoder input shifted one step left.
            if char_index > 0:
                decoder_output[seq_index, char_index - 1, target_dict[char]] = 1.0


    # ---------- Train ----------
    model_train, encoder_infer, decoder_infer = create_model(INPUT_FEATURE_LENGTH, OUTPUT_FEATURE_LENGTH, N_UNITS)
    model_train.compile(optimizer='rmsprop', loss='categorical_crossentropy')
    model_train.fit([encoder_input, decoder_input], decoder_output, batch_size=BATCH_SIZE, epochs=EPOCH, validation_split=0.2)


    # ---------- Try the inference models ----------
    # Clamp the upper bound so a short data file does not raise IndexError.
    for i in range(100, min(200, n_samples)):
        # i:i + 1 keeps the array three-dimensional: (1, max_input_seq_len, features)
        test = encoder_input[i:i + 1, :, :]
        out = predict_chinese(test, encoder_infer, decoder_infer, OUTPUT_SEQ_LENGTH, OUTPUT_FEATURE_LENGTH)
        print(input_texts[i])
        print(out)

2019-07-01

Program Development

Tensorflow模型构建通用流程

1. prediction

dropout

[seq2seq]

encoder + decoder

teacher forcing

training model + inference model

training model用于训练，获取encoder、decoder的weights、biases，其中decoder的input使用ground truth
inference model用于预测，保留training model的weights、biases，改变decoder的input使用前一时刻输出（sampling）

attention

2. loss

3. optimizer

tf.train.AdamOptimizer()

4. train_step

5. batch and epoch

2019-06-20

Program Development

Tensorflow Cheatsheet

定义OP、执行OP

# ---------- Define the ops ----------
m1 = tf.constant([[3, 3]])  # constant op
m2 = tf.constant([[2], [3]])  # constant op
product = tf.matmul(m1, m2)  # computation op

state = tf.Variable(0, name='counter')  # variable op
get_new_value = tf.add(state, 1)
update = tf.assign(state, get_new_value)  # op that assigns a value to the variable

# Variables have to be initialised before they are run, so define an
# initialisation op for them.
init = tf.global_variables_initializer()


# ---------- Run the ops ----------
with tf.Session() as sess:
    # A result is only produced (computed) when an op is passed to sess.run.
    matmul_result = sess.run(product)
    print(matmul_result)

    sess.run(init)
    print(sess.run(state))
    for _ in range(5):
        sess.run(update)
    print(sess.run(state))

feed

# Placeholders receive their values at run time through feed_dict.
input1 = tf.placeholder(tf.float32)
input2 = tf.placeholder(tf.float32)
output = tf.multiply(input1, input2)

with tf.Session() as sess:
    # The feed_dict keys are the placeholder objects defined above.
    res = sess.run(output, feed_dict={input1: [7.], input2: [2.]})
    print(res)

training通用流程

# `prediction` is the predicted output, defined earlier from the placeholder `x`.
# `y` is the ground-truth placeholder.

# Cost-function op.
loss = tf.reduce_mean(tf.square(y - prediction))
# Optimisation op: running it performs one gradient-descent iteration.
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(2000):
        # Each iteration updates the Variables used to compute `prediction`,
        # lowering the loss.
        sess.run(train_step, feed_dict={x: x_data, y: y_data})

    # Predicting only needs x to be fed.
    sess.run(prediction, feed_dict={x: x_data})

mini batch


batch_size = 100
# m is the number of training samples (defined elsewhere).
n_batch = m // batch_size

# ...

with tf.Session() as sess:
    # Variables must be initialised inside the session that trains them.
    sess.run(tf.global_variables_initializer())
    # Iterate over 2000 epochs.
    for epoch in range(2000):
        # Every epoch consists of n_batch batches.
        for batch in range(n_batch):
            # Fetch the next batch of batch_size samples.
            batch_x, batch_y = get_next_batch()
            sess.run(train_step, feed_dict={x: batch_x, y: batch_y})

dropout

# The keep probability is a placeholder, so it is supplied at run time.
keep_prob = tf.placeholder(tf.float32)

# Layer 1: affine transform -> ReLU -> dropout.
activation1 = tf.nn.relu(tf.matmul(W1, x) + b1)
layer1 = tf.nn.dropout(activation1, keep_prob=keep_prob)

# Layer 2: same pattern, fed by the dropped-out output of layer 1.
activation2 = tf.nn.relu(tf.matmul(W2, layer1) + b2)
layer2 = tf.nn.dropout(activation2, keep_prob=keep_prob)

初始化

随机初始化

W
- tf.Variable(tf.truncated_normal(shape, stddev=0.1))
b
- tf.Variable(tf.zeros(shape) + 0.1)

note：好的初始化系数、优化器等设置，可以在迭代次数一定的情形下，更好的逼近最优解，从而提升模型性能

计算准确率（OP）

# tf.argmax returns the index of the largest value along the given axis.
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(prediction, 1))
# Cast the booleans to floats and average them to get the accuracy.
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# ...

with tf.Session() as sess:

    # ...

    acc = sess.run(accuracy, feed_dict={x: test_x, y: test_y})
    print(acc)

tensorboard

# name_scope groups the ops created inside it under one name in the graph.
with tf.name_scope('input'):
    input1 = tf.placeholder(tf.float32, [nx, None], name='input1')
    input2 = tf.placeholder(tf.float32, [nx, None], name='input2')


# Record a scalar value under the tag 'name'.
tf.summary.scalar('name', var)

# ...

# Single op that evaluates every summary defined so far.
merged = tf.summary.merge_all()

with tf.Session() as sess:
    writer = tf.summary.FileWriter('dir', sess.graph)

    # Once per epoch: evaluate the summaries and log them against the epoch
    # number. Pass a feed_dict here if the summaries depend on placeholders.
    for epoch in range(2000):
        summary = sess.run(merged)
        writer.add_summary(summary, epoch)

tensorboard --logdir=dir
name_scope在定义图时进行设置（执行图则仅对定义图进行计算）

模型存取

模型保存

1
2
3

# Create the Saver object used to write the model out.
saver = tf.train.Saver()
# ...
# Save from the session `sess` (created in the elided part) to the given path.
saver.save(sess, 'file_path/file_name')

模型读取

1
2
3

# Create the Saver object used to read the model back.
saver = tf.train.Saver()
# ...
# Restore into the session `sess` (created in the elided part) from the given path.
saver.restore(sess, 'file_path/file_name')

2019-06-14

Machine Learning

NLP

词汇表征

vocabulary + one-hot representation
- 词汇表：V = [a, an, …, zulu, ]
- 表征：“an” = [0, 1, …, 0, 0]
featureized representation: word embedding
- 表征：[feature1, feature2, …]对应一个词（好处是可以计算各个词之间的距离，识别相似的词）（如何将词映射为一个特征向量？：是学来的）
- word embedding含义是，将各个词embed嵌入到n维特征空间中去
- 有点类似人类识别中，将图像encoding
- cosine similarity $sim(u,v) = \frac{u^{T}v}{\| u \|_{2} \| v \|_{2}}$

word embedding 学习

word2vec: skip gram

构造一个预测（或者映射）模型，输入一个词context（one-hot），预测对应到另一个词target（one-hot）

输入context、输出target的配对关系（间隔），是在一个句子中，在一定正负范围内，随机取的
而context的选择，则需要特别设计，避免频繁sample无意义的词

$o_{context} \overset{E}{\rightarrow} e_{context} = Eo_{context} \rightarrow softmax \rightarrow \hat{o}_{target}$

目标是通过训练这个映射关系，学习到embedding matrix $E = [e_{1}, e_{2}, e_{3}, …]$

$o_{context}$是context的one-hot表示
$E$的各列，为词汇表中各个词的embedding；各行则为embedding的各个特征维度
$Eo_{context}$相当于从$E$中提取context的embedding

词汇表较大时，skip gram中的softmax，分母求和，会有计算的大的问题

negative sampling

构造一个预测（或者映射）模型，输入一个词context（one-hot），预测对于一个词target是否存在对应（每个target一个二分类问题）

输入context、判断目标target的配对关系（间隔），是在一个句子中，在一定正负范围内，随机取的
而context、target的选择，则需要特别设计，避免频繁sample无意义的词

$o_{context} \overset{E}{\rightarrow} e_{context} = Eo_{context} \rightarrow \text{vocabulary\_size} \times \text{logistic\_regression} \rightarrow label$

使用vocabulary_size个二分类器，代替softmax，提升计算性能
训练数据每次构造，从句子中抽取一对context、target（positive sampling），再从词汇表中随机为context抽取k个target（negative sampling）
训练时，上述构造的k+1对context、target，训练k+1个分类器

2019-06-14

Machine Learning

seq2seq

基础模型

encoder（序列迭代输入） + decoder（迭代输出序列）

encoder甚至有可能并不是一个RNN，而是例如一个CNN，对图像进行encoding

beam search

翻译模型，选择最可能的句子

考虑输入$x$由encoder进行encoding，给入decoder
第一迭代，输出的$\hat{y}^{\langle 1 \rangle}$；第二迭代，输出的$\hat{y}^{\langle 2 \rangle}$…

为了得到全局最佳的句子输出，优化时不应将$\hat{y}^{\langle 1 \rangle}$、$\hat{y}^{\langle 2 \rangle}$、…分开单独考虑，而应该全局考虑。即优化使得given $x$，能得到使$P(\hat{y}^{\langle 1 \rangle},\hat{y}^{\langle 2 \rangle},…|x)$最大的最优句子

首先，翻译模型使用language model自动生成一条结果语句，每次词的选取是根据softmax的输出概率，随机sample的。这样无法保障，同一个输入，每次输出相同（预期是每次都输出最佳句子）
其次，若对softmax的输出，不再根据概率sample，而是直接选择概率最大的，即使用贪心算法，则如上文所述，可能得到的不是全局最佳句子。应该对$P(\hat{y}^{\langle 1 \rangle},\hat{y}^{\langle 2 \rangle},…|x)$整体考虑

为此，使用beam search做最优化的选择，流程如下：

对$\hat{y}^{\langle 1 \rangle}$，取top $B$个候选
对各$\hat{y}^{\langle 1 \rangle}$候选，获取$\hat{y}^{\langle 2 \rangle}$。整体考虑前两个输出联合，可以计算$P(\hat{y}^{\langle 1 \rangle}, \hat{y}^{\langle 2 \rangle} | x) = P(\hat{y}^{\langle 1 \rangle} | x)P(\hat{y}^{\langle 2 \rangle} | x, \hat{y}^{\langle 1 \rangle})$。对于所有$\hat{y}^{\langle 1 \rangle}$候选，所有$P(\hat{y}^{\langle 1 \rangle}, \hat{y}^{\langle 2 \rangle} | x)$计算结果中，取top $B$个候选
按2方式继续，直到EOS或预设的$T_{y}$

beam search的目标函数

$\text{arg }\max_{y}\prod_{t=1}^{T_{y}}P(\hat{y}^{\langle t \rangle} | x, \hat{y}^{\langle 1 \rangle}, \ldots, \hat{y}^{\langle t-1 \rangle})$

为避免各个P过小，造成误差和无法表示（过小溢出），对P取log

$\text{arg }\max_{y}\sum_{t=1}^{T_{y}}\log P(\hat{y}^{\langle t \rangle} | x, \hat{y}^{\langle 1 \rangle}, \ldots, \hat{y}^{\langle t-1 \rangle})$

但上述目标函数倾向于使用更短的输出句子，对此使用length normalization（$\alpha$为超参数）

$\text{arg }\max_{y}\frac{1}{T_{y}^{\alpha}}\sum_{t=1}^{T_{y}}\log P(\hat{y}^{\langle t \rangle} | x, \hat{y}^{\langle 1 \rangle}, \ldots, \hat{y}^{\langle t-1 \rangle})$

误差分析：RNN还是beam search有问题

对于同一输入句子，输出：
human-level：$y^{h}$
model：$\hat{y}$

将$y^{h}$依次输入decoder，能够最终计算出$P(y^{h}|x)$

$P(y^{h}|x) > P(\hat{y}|x)$：RNN能够计算出human-level更好，但却没选到，beam search有问题
$P(y^{h}|x) \le P(\hat{y}|x)$：RNN未能计算出human-level更好（反而给较差的$\hat{y}$分配了不低于$y^{h}$的概率），RNN有问题

（若使用了length normalization，则对比length normalization之后的P）

汇总所有labeled data的对比结果，看看RNN还是beam search有问题
若RNN有问题，则进一步bias variance分析；若beam search有问题，考虑增大$B$

attention模型

对于seq2seq模型，
encoder由$a^{\langle 0 \rangle} = 0$初始化，每次用$a^{\langle t'-1 \rangle}$、$x^{\langle t' \rangle}$计算$a^{\langle t' \rangle}$，最终将$a^{\langle T_{x} \rangle}$传给decoder
decoder由$a^{\langle T_{x} \rangle}$初始化，每次用$a^{\langle t-1 \rangle}$、$\hat{y}^{\langle t-1 \rangle}$计算$\hat{y}^{\langle t \rangle}$

计算的unit可以是naive RNN、GRU、LSTM

当输入序列过长时，decoder性能下降（记不住这么长的输入）
attention模型试图在decoder生成$\hat{y}^{\langle t \rangle}$时，引入encoder不同$t'$的$a^{\langle t' \rangle}$的考量

具体来讲，使用$s^{\langle t \rangle}$表示decoder的输出，以区分encoder的activation
$s^{\langle t \rangle}$的计算，除了输入$s^{\langle t-1 \rangle}$、$\hat{y}^{\langle t-1 \rangle}$，还增加了一个context $c^{\langle t \rangle} = \sum_{t'} \alpha^{\langle t, t' \rangle} a^{\langle t' \rangle}$，即对key各项的加权求和，并且这组权重对于不同的query（$s^{\langle t - 1 \rangle}$）是不同的

其中$t'$为encoder中$a$对应各时刻
$\alpha^{\langle t, t' \rangle}$即表示计算$t$时刻的$s^{\langle t \rangle}$时，应付出多少attention在$a^{\langle t' \rangle}$上
$\alpha^{\langle t, t' \rangle}$是归一化的，由$s^{\langle t-1 \rangle}$（query）和$a^{\langle t' \rangle}$（待weighted项）通过一个小NN-softmax算出（直接end-to-end计算）

ChenyuShuxin

晨雨舒心

keras seq2seq 详细注释版

Tensorflow模型构建通用流程

1. prediction

dropout

[seq2seq]

teacher forcing

attention

2. loss

3. optimizer

4. train_step

5. batch and epoch

Tensorflow Cheatsheet

定义OP、执行OP

feed

training通用流程

mini batch

dropout

初始化

计算准确率（OP）

tensorboard

模型存取

NLP

词汇表征

word embedding 学习

seq2seq

基础模型

beam search

attention模型