6. Limited-Memory Quasi-Newton (L-BFGS)

Newton's method requires computing the Hessian matrix and its inverse, which is expensive in both computation and memory; to simplify the computation and reduce memory usage, the L-BFGS algorithm is used for optimization instead.
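
The core of L-BFGS is the two-loop recursion described in reference [2]: instead of storing the full inverse Hessian approximation, only the m most recent update pairs are kept, and the search direction is rebuilt from them on the fly. Below is a minimal illustrative sketch, not the code used later in this article (training relies on SciPy's ready-made fmin_l_bfgs_b); the function name lbfgs_direction is made up here.

import numpy as np

def lbfgs_direction(g, S, Y):
    # S and Y hold the m most recent pairs s_k = x_{k+1} - x_k and
    # y_k = g_{k+1} - g_k, oldest first; g is the current gradient
    if not S:                 # no curvature pairs yet: steepest descent
        return -g
    rho = [1.0 / y.dot(s) for s, y in zip(S, Y)]
    q = g.copy()
    alpha = [0.0] * len(S)
    for i in range(len(S) - 1, -1, -1):          # newest pair to oldest
        alpha[i] = rho[i] * S[i].dot(q)
        q -= alpha[i] * Y[i]
    gamma = S[-1].dot(Y[-1]) / Y[-1].dot(Y[-1])  # scaling for H_k^0 = gamma * I
    r = gamma * q
    for i in range(len(S)):                      # oldest pair to newest
        beta = rho[i] * Y[i].dot(r)
        r += (alpha[i] - beta) * S[i]
    return -r                                    # quasi-Newton descent direction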

Maximum entropy model:

$$ \begin{align} p_w(y|x) &= \frac{\exp \left[ \sum_{i=1}^{n} w_i f_i(x,y) \right] }{ Z_w(x) } \\ &= \frac{ \exp \left[ \sum_{i=1}^{n} w_i f_i(x,y) \right] }{ \sum_{y'} \exp \left[ \sum_{i=1}^{n} w_i f_i(x,y') \right] } \end{align} $$
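
For concreteness, $p_w(y|x)$ is just a softmax over the per-label scores $\sum_i w_i f_i(x,y)$. A tiny numeric sketch with made-up scores for three labels:

import math

# hypothetical summed feature scores sum_i w_i * f_i(x, y) per label y
scores = {'B': 1.2, 'E': 0.3, 'S': -0.5}
Z = sum(math.exp(v) for v in scores.values())        # Z_w(x)
p = {y: math.exp(v) / Z for y, v in scores.items()}  # p_w(y|x)
print(p)                                             # the values sum to 1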

Objective function (negative log-likelihood) to minimize:

$$ \begin{align} \min_w f(w) &= -\sum_{x,y} \tilde{p}(x,y) \log p_w(y|x) \\ &= \sum_{x} \tilde{p}(x) \log Z_w(x) - \sum_{x,y} \tilde{p}(x,y) \sum_{i=1}^{n} w_i f_i(x,y) \end{align} $$

Gradient:
$$ \nabla f(w) = \left[\frac{\partial f(w)}{\partial w_1},\frac{\partial f(w)}{\partial w_2},\frac{\partial f(w)}{\partial w_3},\cdots \right]^T $$

$$ \frac{\partial f(w)}{\partial w_i} = \sum_{x,y} \tilde{p}(x) p_w(y|x) f_i(x,y) - \sum_{x,y} \tilde{p}(x,y) f_i(x,y) $$
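Given $f(w)$ and $\nabla f(w)$, SciPy's fmin_l_bfgs_b (the optimizer used in the training code below) needs nothing more than these two callables. A minimal sketch on a toy quadratic, standing in for the objective and gradient above, just to show the calling pattern:

import numpy as np
from scipy.optimize import fmin_l_bfgs_b

# toy objective f(w) = ||w - 1||^2 and its analytic gradient
def f(w):
    return np.sum((w - 1.0) ** 2)

def grad_f(w):
    return 2.0 * (w - 1.0)

w_opt, f_min, info = fmin_l_bfgs_b(func=f, x0=np.zeros(5), fprime=grad_f)
print(w_opt, f_min)  # converges to w = [1. 1. 1. 1. 1.], f = 0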

7. Implementation and Results

7.1 Implementation

First, convert the PKU corpus format, with an empty output line marking each line break in the original; this preprocessing makes the later statistics gathering easier.

# -*- coding: utf-8 -*-
import re
import numpy as np

def readfileBylines(filename):
    with open(filename, 'rt', encoding='utf8') as f:
        lines = f.readlines()
    return lines

lines = readfileBylines('./training/pku_training.utf8')

for line in lines:
    # strip the two-space word separators to recover the raw sentence
    sent, label = line.replace('  ', ''), []
    # tag every character of every word with B/M/E/S
    for word in re.split(r'\s{2}', line):
        I = len(word)
        for i in range(I):
            if I == 1: a = 'S'          # single-character word
            else:
                if i == 0: a = 'B'      # word beginning
                elif i == I-1: a = 'E'  # word end
                else: a = 'M'           # word middle
            label.append(a)
    # drop the trailing newline character and its label
    sent, label = np.asarray(list(sent))[:-1], np.asarray(label)[:-1]
    for s, l in zip(sent, label):
        print('%s\t%s' % (s, l))
    print('')

The converted format looks as follows: each input character and its label are separated by \t, and an empty line marks a line break in the original input. This output goes into memm_train_text.txt, which the training script below reads.

character/word\tlabel
以      B
及      E
近      S
千      S
名      S

Next, use this corpus together with the feature templates to generate and train the model. This takes a long time, roughly 8-12 hours on an i7-4790K. This is experimental code (CPython cannot run these loops on multiple threads, so it is painfully slow); for production use, porting it to C++ with OpenMP/OpenMPI would improve performance considerably.

# -*- coding: utf-8 -*-
import time, math, pickle
import numpy as np
from scipy.optimize import fmin_l_bfgs_b

def readfileBylines(filename):
    with open(filename, 'rt', encoding='utf8') as f:
        lines = f.readlines()
    return lines

def getTextSequences(lines):
    # parse the character\tlabel format back into per-sentence
    # dictionaries of the form {index: (char, label)}
    sequences, result = [[[],[]]], []
    for line in lines:
        if len(line.split('\t')) == 2:
            word,label =  line.split('\t')
            sequences[-1][0].append(word)
            sequences[-1][1].append(label.strip())
        else:
            sequences.append([[],[]])
    for i in range(len(sequences)):
        if len(sequences[i][0]) == len(sequences[i][1]) > 0:
            result.append( dict(enumerate(zip(*sequences[i]))) )
    return result

def statistics_transition_matrix(labelTypes, sequences):
    # count label bigrams, then normalize each row into p(label | prev_label);
    # the None row holds the sentence-initial label distribution
    matrix = {}
    for sequence in sequences:
        prev_label = None
        for i in range(len(sequence)):
            if prev_label not in matrix:
                matrix[prev_label] = {}
            if sequence[i][1] not in matrix[prev_label]:
                matrix[prev_label][sequence[i][1]] = 0
            matrix[prev_label][sequence[i][1]] += 1
            prev_label = sequence[i][1]
    for row in [None]+labelTypes:
        total = sum(matrix[row].values())
        for col in labelTypes:
            matrix[row][col] = matrix[row].get(col,0.0) / total
    return matrix

def generate_features(sequences):
    # Feature scanning templates: each element lists the relative offsets
    # (from the current character index) of the characters that form one feature.
    templates = [[-2],[-1],[0],[1],[2],
                 [-2,-1],[-1,0],[0,1],[1,2]]
    # X accumulates counts for the empirical p~(x), XY for p~(x,y)
    X, XY, features, total = {},{},{},0
    for sequence in sequences:
        for i in range(len(sequence)):
            for template in templates:
                value = [ sequence.get(i+pos, (None,None))[0] for pos in template]
                x  = (template, value)
                xy = (template, value, sequence[i][1])
                feature = (template, value, sequence[i][1])                
                key_x = str(x)
                key_xy = str(xy)
                key_f = str(feature)                
                if key_x not in X: X[key_x] = [x, 0]
                if key_xy not in XY: XY[key_xy] = [xy, 0]
                if key_f not in features: features[key_f] = feature
                X[key_x][1] += 1
                XY[key_xy][1] += 1
                total += 1
    features = list(features.values())
    weights = np.zeros((len(features)))
    featureHashTable = {}
    for i, feature in enumerate(features):
        featureHashTable[str(feature)] = (feature, i)
    X = dict([(k, (x, c/total) ) for k,(x,c) in X.items()])
    XY = dict([(k, (xy, c/total) ) for k,(xy,c) in XY.items()])
    return templates, features, featureHashTable, weights, X, XY

def model_probability(xy, featureHashTable, weights, labelTypes):
    # p_w(y|x): exponentiate the weight of the feature (x, y') for every
    # candidate label y' and normalize; a missing feature contributes exp(0) = 1
    allyx = {}
    for _y in labelTypes:
        key_f = str((xy[0],xy[1],_y))
        allyx[_y] = math.exp(weights[featureHashTable[ key_f ][1]]) if key_f in featureHashTable else 1.0
    return allyx[xy[2]] / sum(allyx.values())

def opt_fun(weights, features, featureHashTable, labelTypes, data_x, data_xy):
    # negative log-likelihood: -sum_{x,y} p~(x,y) * log p_w(y|x)
    return -1.0 * sum([p * math.log( model_probability(xy, featureHashTable, weights, labelTypes) ) for xy,p in data_xy.values()])

def grads_fun(weights, features, featureHashTable, labelTypes, data_x, data_xy):
    # gradient component per feature i: p~(x) * p_w(y|x) - p~(x,y)
    return np.asarray( [data_x[str((features[i][0], features[i][1]))][1] * model_probability(features[i], featureHashTable, weights, labelTypes) - data_xy[str(features[i])][1] for i in range( len(features) )] )

_startTime = time.time()
filename_ = './memm_train_text.txt'
defineLabelTypes_ = ['B','M','E','S']
sequences_ = getTextSequences(readfileBylines(filename_))
transition_matrix_ = statistics_transition_matrix(defineLabelTypes_, sequences_)
templates_, features_, featureHashTable_, weights_, data_X_, data_XY_ = generate_features(sequences_)

#opt_fun(weights_, features_, featureHashTable_, defineLabelTypes_, data_X_, data_XY_)
#grads_fun(weights_, features_, featureHashTable_, defineLabelTypes_, data_X_, data_XY_)

print( 'templates:', len(templates_))
print( 'features:', len(features_))

iter_n = 0
def callback(xk):
    # print the objective value after every L-BFGS iteration
    global iter_n
    v = opt_fun(xk, features_, featureHashTable_, defineLabelTypes_, data_X_, data_XY_)
    iter_n += 1
    print('iter:', iter_n, 'f:', v)
    
x,_,_ = fmin_l_bfgs_b(func=opt_fun, x0=weights_, fprime=grads_fun,
                       args=(features_, featureHashTable_, defineLabelTypes_, data_X_, data_XY_),
                       callback=callback, disp=1)

# pair each feature with its trained weight before persisting
features = [(features_[i], x[i]) for i in range(x.shape[0])]
with open('memm_model.pkl', 'wb') as f:
    pickle.dump((transition_matrix_, features), f)

_endTime = time.time()
print( 'execute time: %fs' % (_endTime - _startTime) )
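
Once training finishes, the pickled model can be inspected directly. A small sketch that loads the file written above and prints the transition matrix together with the ten features of largest absolute weight:

import pickle

with open('memm_model.pkl', 'rb') as f:
    transition_matrix, features = pickle.load(f)

print(transition_matrix)
# features is a list of ((template, value, label), weight) pairs
for feature, weight in sorted(features, key=lambda fw: -abs(fw[1]))[:10]:
    print(weight, feature)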

Now segment the PKU test corpus. The code here is very similar to the HMM segmenter. Note that the probability is computed as $ p(q_i | q_{i-1}, o) \approx p(q_i|q_{i-1}) \cdot p(q_i | o) $, and, exactly as in the HMM case, products of probabilities are replaced with sums of logarithms; since the logarithm is monotonically increasing, this does not change the result.
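
Written out, the quantity the Viterbi recursion below actually maximizes is the log of the path score:

$$ \log \prod_{i} p(q_i|q_{i-1}, o) \approx \sum_{i} \left[ \log p(q_i|q_{i-1}) + \log p(q_i|o) \right] $$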

# -*- coding: utf-8 -*-
import sys, pickle, math

# default arguments so the script can be run without a command line;
# sys.argv[3] selects the test corpus to segment
sys.argv.append('./gold/pku_training_words.utf8')
sys.argv.append('./training/pku_training.utf8')
sys.argv.append('./testing/pku_test.utf8')
sys.argv.append('train')

assert len(sys.argv) >= 4

test_filename = sys.argv[3]

with open(test_filename, 'rt', encoding='utf8') as f:
    test = f.readlines()

with open('memm_model.pkl', 'rb') as f:
    matrix, features = pickle.load(f)

templates = {}
feature2weight = {}   # str((template, value, label)) -> trained weight
labels = set()
for (template, value, label), proba in features:
    key = str(template)
    if key not in templates:
        templates[key] = template
    key = str((template, value, label))
    if key not in feature2weight:
        feature2weight[key] = proba
    labels.add(label)
templates = templates.values()

cache = {}
def memm_log_proba(x, y):
    # log p_w(y|x) for x = (character sequence, index); cached because the
    # same (x, y) pair is scored many times during Viterbi decoding
    key = str((x,y))
    if key in cache: return cache[key]
    sequence, index = dict(enumerate(x[0])), x[1]
    values = [(template, [sequence.get(index + pos, None) for pos in template]) for template in templates]
    ally = dict([ (_label, math.exp( sum([feature2weight.get(str((value[0],value[1], _label)), 0.0) for value in values]))) for _label in labels])
    log_proba = math.log( ally[y] / sum(ally.values()) )
    cache[key] = log_proba
    return log_proba
        
def model_log_proba(label, prev_label, word):
    # log p(q_i | q_{i-1}) + log p(q_i | o); the floor keeps log() away from 0
    proba = memm_log_proba(word, label) + math.log(max(sys.float_info.min, matrix[prev_label][label]))
    return proba

def viterbi(observation):
    # standard Viterbi decoding in log space: delta[t] holds the best path
    # score ending in each label at position t, psi[t] the argmax backpointers
    T = len(observation)
    delta = [None] * (T + 1)
    psi = [None] * (T + 1)
    delta[0] = dict([(i, model_log_proba(i, None, (observation, 0))) for i in labels])
    psi[0] = dict( [ (i, None) for i in labels ] )
    for t in range(1, len(observation)):
        Ot = observation,t
        delta[t] = dict([(j, max([delta[t-1][i] + model_log_proba(j, i, Ot) for i in labels])) for j in labels])
        psi[t] = dict([(j, max([(delta[t-1][i] + model_log_proba(j, i, Ot), i) for i in labels])[1]) for j in labels ])
    delta[T] = max( [ delta[T-1][i] for i in labels ] )
    psi[T] = max( [(delta[T-1][i], i) for i in labels  ] )[1]
    q = [None] * (T+1)
    q[T] = psi[T]
    for t in range(T-1, -1, -1):
        q[t] = psi[t][q[t+1]]
    return q[1:]

for sent in test:
    sequence = viterbi( list(sent) )
    segment = []
    # fold the B/M/E/S tags back into words; the trailing '\n' of each line
    # rides along as the last character, hence print(..., end='')
    for char, tag in zip(sent, sequence):
        if tag in ('B', 'S'):    # B and S both start a new word
            segment.append(char)
        elif tag in ('M', 'E'):  # M and E both extend the current word
            segment[-1] += char
        else:
            raise Exception('unexpected tag: %s' % tag)
    print('  '.join(segment), sep='', end='')
    #break

7.2 Comparing Segmentation Performance on the PKU Test Set

Better numbers could likely be obtained by tuning the feature templates, but since a single training run is so slow and this is only an experiment, the results below will have to do.

algorithm                P      R      F      OOV    OOV Recall  IV Recall
maximum matching         0.836  0.904  0.869  0.058  0.059       0.956
maximum probability      0.859  0.919  0.888  0.058  0.085       0.970
HMM                      0.804  0.789  0.796  0.058  0.364       0.815
Full Second Order HMM    0.824  0.799  0.811  0.058  0.360       0.825
MEMM                     0.909  0.891  0.900  0.058  0.383       0.923

References:
[1] Hang Li. 统计学习方法 (Statistical Learning Methods).
[2] Jorge Nocedal and Stephen J. Wright. Numerical Optimization, Second Edition, Section 7.2, "Limited-Memory Quasi-Newton Methods".
