自然语言:基于规则的分词算法

2022-01-21 00:09
92
0
添加收藏

 

实验一:基于规则的分词算法

from pyhanlp import *

def load_dictionary():
    """Load HanLP's mini core dictionary and return its words as a set.

    The dictionary path is derived from ``HanLP.Config.CoreDictionaryPath``
    by swapping its ``.txt`` suffix for ``.mini.txt``.

    Returns:
        set: the key set of the map returned by ``IOUtil.loadDictionary``.
    """
    IOUtil = JClass('com.hankcs.hanlp.corpus.io.IOUtil')
    # The replacement must keep the dot: the mini dictionary is expected to be
    # named "<core dictionary name>.mini.txt" (e.g.
    # CoreNatureDictionary.mini.txt). Replacing with 'mini.txt' would yield
    # "...CoreNatureDictionarymini.txt", a file name that is not shipped.
    path = HanLP.Config.CoreDictionaryPath.replace('.txt', '.mini.txt')
    dic = IOUtil.loadDictionary([path])
    # Only the words (keys) are needed for membership tests.
    return set(dic.keySet())

def fully_segment(text, dic):
    """Return every substring of *text* that is a dictionary word.

    Substrings are enumerated by start position, then by increasing length,
    so the result is ordered by (start index, length).

    Args:
        text: the string to scan.
        dic: a container of words supporting ``in`` (typically a set).

    Returns:
        list: all non-empty slices ``text[i:j]`` found in *dic*, duplicates
        included when the same word occurs at several positions.
    """
    length = len(text)
    return [
        text[i:j]
        for i in range(length)
        # j is an exclusive end index: start at i + 1 to skip empty slices
        # and run up to ``length`` so words ending on the last character
        # are not missed.
        for j in range(i + 1, length + 1)
        if text[i:j] in dic
    ]

def forward_segment(text, dic):
    """Segment *text* left to right using forward longest matching.

    At each position the longest dictionary word starting there is emitted;
    when no dictionary word starts at that position, the single character is
    emitted instead.

    Args:
        text: the string to segment.
        dic: a container of words supporting ``in`` (typically a set).

    Returns:
        list: the segments in order; joining them reproduces *text*.
    """
    tokens = []
    total = len(text)
    start = 0
    while start < total:
        # Every prefix of the remaining text that is a dictionary word.
        candidates = (text[start:end] for end in range(start + 1, total + 1))
        matches = [piece for piece in candidates if piece in dic]
        # Fall back to the single character when nothing matches.
        best = max(matches, key=len, default=text[start])
        tokens.append(best)
        start += len(best)
    return tokens

def backward_segment(text, dic):
    """Segment *text* right to left using backward longest matching.

    Working from the end of the string, the longest dictionary word (of at
    least two characters) ending at the current position is taken; when none
    exists, the single character at that position is taken instead.

    Args:
        text: the string to segment.
        dic: a container of words supporting ``in`` (typically a set).

    Returns:
        list: the segments in left-to-right order; joining them reproduces
        *text*.
    """
    pieces = []
    stop = len(text)  # exclusive end of the still-unsegmented prefix
    while stop > 0:
        # Start indices ascend, so the first hit is the longest match.
        candidates = (text[begin:stop] for begin in range(stop - 1))
        chosen = next(
            (piece for piece in candidates if piece in dic),
            text[stop - 1],
        )
        pieces.append(chosen)
        stop -= len(chosen)
    # Segments were collected back to front.
    pieces.reverse()
    return pieces

def count_single_char(word_list: list):
    """Return how many entries of *word_list* have length exactly 1."""
    singles = [word for word in word_list if len(word) == 1]
    return len(singles)

def bidirectional_segment(text, dic):
    """Pick the better of the forward and backward longest-match results.

    Selection rules, applied in order:
      1. the segmentation with fewer segments wins;
      2. on equal segment counts, the one with fewer single-character
         segments wins;
      3. if still tied, the backward result is returned.

    Args:
        text: the string to segment.
        dic: a container of words supporting ``in`` (typically a set).

    Returns:
        list: either the forward or the backward segmentation.
    """
    forward = forward_segment(text, dic)
    backward = backward_segment(text, dic)

    if len(forward) != len(backward):
        return forward if len(forward) < len(backward) else backward

    if count_single_char(forward) < count_single_char(backward):
        return forward
    return backward
 
# Demo: run the three longest-match segmenters over sample sentences and
# print each result, followed by a separator line per sentence.
dic = load_dictionary()
text = [
    '项目的研究',
    '商品和服务',
    '研究生命起源',
    '当下雨天地面积水',
    '结婚的和尚未结婚的',
    '欢迎新老师生前来就餐',
]

for sentence in text:
    for label, segmenter in (
        ('正向最长匹配:', forward_segment),
        ('逆向最长匹配:', backward_segment),
        ('双向最长匹配:', bidirectional_segment),
    ):
        print(f'{label}{segmenter(sentence, dic)}')
    print('---------------------------------------------')

全部评论