由中古音推导方音工具

工具说明

对于熟悉 Python 的读者,这里提供一个可以由中古音大致推导方音的工具;推导的结果最好由人工再次审核。

本项目的源码下载地址:下载地址,脚本代码在pythopnTools/InferDialect文件夹中。

data_dialect.txt 存放方言调查字表的数据,data_koxqim.txt 为中古音数据,koxqim_dialect.txt 为自动生成的中古音与方音对应数据,output.txt 为输出结果,执行 InferDialect.py 中的代码即可:

import re

# Scratch names that the later steps of this script rebind while parsing.
char = []
list_dialect = []
# character -> its surveyed dialect readings joined with '/'
dict_dialect = {}
# character -> the same readings as a list
dict_dialect_list = {}
# Middle Chinese form -> matching dialect readings joined with '/' (filled in step 1)
dict_koxqim_dialect = {}
# character -> inferred readings for characters absent from the survey (filled in step 2)
dict_out = {}

# Load the dialect survey table. Each useful line holds at least two
# whitespace-separated fields: the character, then one reading.
# `with` guarantees the file is closed even if parsing fails.
with open('data_dialect.txt', encoding='utf-8') as data_dialect:
    for line in data_dialect:
        char = line.split()
        if len(char) < 2:
            # Blank or incomplete line (e.g. a trailing newline at the end of
            # the file): skip it instead of crashing with IndexError.
            continue
        if char[0] in dict_dialect:
            # A character listed more than once accumulates all its readings.
            dict_dialect[char[0]] = dict_dialect[char[0]] + '/' + char[1]
        else:
            dict_dialect[char[0]] = char[1]

# Split the joined readings once so step 1 can test them one by one.
dict_dialect_list = {key: item.split('/') for key, item in dict_dialect.items()}

# 第一步 生成中古音与方音对应数据

def judge(koxqim, jyutping):
    """Return True when a dialect reading is a regular reflex of a Middle Chinese form.

    Both arguments are romanised strings; ``jyutping`` is expected to end in a
    tone digit (presumably Cantonese Jyutping, going by the name -- confirm
    against the data files).

    The decision is driven by the last letter of ``koxqim`` and by which of two
    initial groups it starts with:

    * ends in p/t/k: the reading must also end in p/t/k plus a tone;
      the first initial group pairs with tones 5/6, the second with 1/2/3.
    * ends in ``h``: the reading must not end in p/t/k; tones 5/6 versus 3.
    * ends in ``x``: the reading must not end in p/t/k; tones 5/6 versus 2.
    * any other ending: tone 4 versus tone 1.
    * ends in ``d`` and none of the above applied: tones 3 or 6 are accepted.

    Only the first rule whose gate matches is consulted; if its detailed check
    fails the result is False without trying later rules.
    """
    # Initials that pair with tones 4/5/6 and with tones 1/2/3 respectively.
    group_456 = '([bdzgmnljiy]|dr|zr|zj|zs|zsr|zsj|gh|nr|nj|ng).*'
    group_123 = '([ptckqsh]|tr|cr|cj|ph|th|sr|sj|thr|ch|chr|chj|kh).*'

    def source_is(pattern):
        return re.match(pattern, koxqim) is not None

    def reading_is(pattern):
        return re.match(pattern, jyutping) is not None

    # Stop-final forms.
    if source_is(r'.*[ptk]$') and reading_is(r'.*[ptk][1-6]$'):
        if source_is(group_456 + '[ptk]$') and reading_is(r'.*[ptk][56]$'):
            return True
        return source_is(group_123 + '[ptk]$') and reading_is(r'.*[ptk][123]$')

    # Forms marked with a final "h".
    if source_is(r'.*h$') and reading_is(r'.*[^ptk][356]$'):
        if source_is(group_456 + 'h$') and reading_is(r'.*[^ptk][56]$'):
            return True
        return source_is(group_123 + 'h$') and reading_is(r'.*[^ptk][3]$')

    # Forms marked with a final "x".
    if source_is(r'.*x$') and reading_is(r'.*[^ptk][256]$'):
        if source_is(group_456 + 'x$') and reading_is(r'.*[^ptk][56]$'):
            return True
        return source_is(group_123 + 'x$') and reading_is(r'.*[^ptk][2]$')

    # Unmarked forms (no p/t/k/h/x at the end).
    if source_is(r'.*[^ptkhx]$') and reading_is(r'.*[^ptk][14]$'):
        if source_is(group_456 + '[^ptkxh]$') and reading_is(r'.*[^ptkxh][4]$'):
            return True
        return source_is(group_123 + '[^ptkxh]$') and reading_is(r'.*[^ptkxh][1]$')

    # Fallback for forms ending in "d" whose reading has tone 3 or 6.
    return source_is(r'.*d$') and reading_is(r'.*[36]$')

# Read the Middle Chinese table: the first field of each line is the Middle
# Chinese form, the second is the character. For every character that also
# appears in the dialect survey, keep the readings accepted by judge() and
# file them under the Middle Chinese form.
with open('data_koxqim.txt', encoding='utf-8') as data_koxqim:
    for line in data_koxqim:
        char = line.split()
        if len(char) < 2:
            # Blank or incomplete line: nothing to pair up.
            continue
        if char[1] not in dict_dialect_list:
            # Character not surveyed. No entry is created here, which is what
            # lets step 2 tell "no data" apart from "data but no regular match".
            continue
        list_dialect = [i for i in dict_dialect_list[char[1]] if judge(char[0], i)]
        # Merge with readings gathered from earlier characters sharing this
        # form. dict.fromkeys removes duplicates while keeping first-seen
        # order, so the generated file is identical from run to run (a plain
        # set would reorder readings under string-hash randomisation).
        # filter(None, ...) drops the '' produced by splitting an empty entry.
        earlier = dict_koxqim_dialect.get(char[0], '').split('/')
        dict_koxqim_dialect[char[0]] = '/'.join(
            dict.fromkeys(filter(None, earlier + list_dialect))
        )

# Write the table only after the input was read successfully, one
# "<form>\t<readings>" line per Middle Chinese form.
with open('koxqim_dialect.txt', 'w', encoding='utf-8') as koxqim_dialect:
    for key, item in dict_koxqim_dialect.items():
        koxqim_dialect.write(key + '\t' + item + '\n')

# Step 2: infer readings for characters that are not in the dialect survey
# table. (If koxqim_dialect.txt was adjusted by hand, comment out step 1.)

# When step 1 did not run, the in-memory table is empty; fall back to the
# (possibly hand-edited) koxqim_dialect.txt so the edits are actually used.
if not dict_koxqim_dialect:
    try:
        with open('koxqim_dialect.txt', encoding='utf-8') as saved_table:
            for line in saved_table:
                # Lines look like "<form>\t<readings>"; readings may be empty.
                form, _, readings = line.rstrip('\n').partition('\t')
                if form:
                    dict_koxqim_dialect[form] = readings
    except FileNotFoundError:
        # No saved table either: every lookup below yields '_noData_'.
        pass

with open('data_koxqim.txt', encoding='utf-8') as data_koxqim:
    for line in data_koxqim:
        char = line.split()
        if len(char) < 2:
            # Blank or incomplete line.
            continue
        if char[1] in dict_dialect:
            # Surveyed characters are written out as-is below (flag 0).
            continue
        inferred = dict_koxqim_dialect.get(char[0])
        if inferred is None:
            # No surveyed character shares this Middle Chinese form. Only mark
            # the character when nothing has been inferred for it yet, so a
            # reading found through another form is not thrown away.
            dict_out.setdefault(char[1], '_noData_')
        elif char[1] in dict_out:
            dict_out[char[1]] = dict_out[char[1]] + '/' + inferred
        else:
            dict_out[char[1]] = inferred

# Output: surveyed readings are flagged 0, inferred readings are flagged 1.
with open('output.txt', 'w', encoding='utf-8') as out:
    for key, item in dict_dialect.items():
        out.write(key + '\t' + item + '\t' + '0' + '\n')
    for key, item in dict_out.items():
        out.write(key + '\t' + item + '\t' + '1' + '\n')

judge() 为中古音与具体方音对应的判断函数,可根据自己的需要修改。koxqim_dialect.txt 为第一步的生成结果,output.txt 为最终输出。值为「空」表示有中古音对应的方音数据,但不符合 judge() 设定的古今对应规律,故值为空;`_noData_`(见 output.txt)表示无中古音对应的方音数据。

results matching ""

    No results matching ""