Word2Vec 构建“人民的名义”词向量

https://radimrehurek.com/gensim/models/word2vec.html

语料:

http://www.k6k4.com/resource/detail/aaswfrdtx1605446850024


import jieba
from gensim.models import word2vec, Word2Vec

# Names of the novel's main characters. Each one is registered with jieba
# (tune=True) so the segmenter treats the full name as a single word rather
# than splitting it into individual characters.
CHARACTER_NAMES = (
    '沙瑞金',
    '田国富',
    '高育良',
    '侯亮平',
    '钟小艾',
    '陈岩石',
    '欧阳菁',
    '易学习',
    '王大路',
    '蔡成功',
    '孙连城',
    '季昌明',
    '丁义珍',
    '郑西坡',
    '赵东来',
    '高小琴',
    '赵瑞龙',
    '林华华',
    '陆亦可',
    '刘新建',
    '刘庆祝',
)

for character_name in CHARACTER_NAMES:
    jieba.suggest_freq(character_name, True)

# Read the raw novel text; every line of the file is a candidate sentence.
path = 'in_the_name_of_people.txt'
with open(path, 'r', encoding='utf-8') as f:
    raw_lines = f.readlines()

# Drop blank lines and very short lines. The length check is applied to the
# raw line (trailing newline and surrounding whitespace included), while the
# text handed to the tokenizer is the stripped version.
kept_texts = [raw.strip() for raw in raw_lines if len(raw) > 6 and raw.strip()]

# Segment each kept line with jieba and join the tokens with single spaces,
# producing one whitespace-tokenized sentence per output line.
lines = [' '.join(jieba.cut(text)) + '\n' for text in kept_texts]
print('total line count=>', len(lines))
# Sample output:
# total line count=> 2200

# Persist the tokenized corpus so it can be streamed back during training.
with open('train.txt', 'w', encoding='utf-8') as f:
    f.writelines(lines)

# Stream the tokenized corpus from disk: one sentence per line, tokens
# separated by whitespace.
sentences = word2vec.LineSentence('train.txt')

# Train 100-dimensional embeddings with a context window of 5.
# min_count=1 keeps every token, including words that occur only once.
# gensim 4.0 renamed the `size` keyword to `vector_size`; passing the wrong
# one raises TypeError (unexpected keyword argument), so try the current name
# first and fall back to the legacy name for gensim 3.x.
try:
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
except TypeError:
    model = Word2Vec(sentences, size=100, window=5, min_count=1, workers=4)

# Save the trained model to disk ...
model.save('word2vec.model')

# ... and load it back, so the queries that follow run against the
# persisted model file.
model = Word2Vec.load('word2vec.model')

# Look up the learned embedding for one character name and print it.
target_name = '侯亮平'
target_vector = model.wv[target_name]
print(target_vector)
# Sample output:
# [ 1.2608657  -0.31955615  1.0512999  -1.1955101  -0.54877764  0.8825323
#  -0.23593487  0.38839418  0.1108841  -0.5551717   0.7028596   1.6013402
#   0.26119116 -0.25903744 -0.52801746  1.8191248   0.20933713 -0.58041674
#  -1.2210642  -0.09129222  0.5809172  -0.16032848 -0.16483907  0.14679825
#  -0.7538626  -0.27101424  0.06765628  0.9971095  -1.4045182  -1.1781099
#   0.91175085 -0.8313674   2.1861002   0.17322211  0.44776583  0.42540348
#  -0.9376401   1.3471535  -0.7086678  -1.368492    0.14985737 -1.3741096
#   0.02821825  0.6822765   0.30579555 -0.15688124  1.0605482  -0.45804158
#  -0.27202618 -0.1547584   1.0001445   0.00995962  0.43815503  0.23531151
#  -0.02857795 -1.4710406   0.8649675   0.58276564  0.6302883  -0.65666556
#   1.9223623   0.27569762  0.53792554  0.33289945 -0.8774105   0.3597854
#  -0.3688891  -2.0895743   0.9067872  -0.5675777  -0.19521916  1.053807
#  -1.424331    0.18578833 -1.2480674  -0.8538316   0.5637747   0.73074526
#  -0.3430865  -0.12637     0.7318182   1.6930991  -0.43526727  0.8515712
#   0.69197536 -0.10268717 -1.2695224  -0.5271906   0.77203965 -1.3805364
#  -0.23319757 -0.28349143 -1.3337592   0.10251193  0.6908297   1.1695349
#  -1.1622112  -0.37568846 -1.5370712  -1.588368  ]

# Print the ten tokens whose vectors are most similar to the target's.
nearest_neighbours = model.wv.most_similar(target_name, topn=10)
print(nearest_neighbours)
# Sample output:
# [('李达康', 0.9998794794082642), ('祁同伟', 0.9998669624328613), ('这个', 0.999819278717041),
# ('易学习', 0.9998106956481934), ('得', 0.9997933506965637), ('汇报', 0.9997814297676086),
# ('季昌明', 0.999774694442749), ('地说', 0.9997525215148926), ('要', 0.9997483491897583),
# ('别', 0.9997408390045166)]
# NOTE(review): every score is ~0.9998 and function words rank alongside
# character names, so the vectors look barely differentiated — presumably the
# corpus is too small for the default training settings; verify before
# relying on these neighbours.





个人资料
时海
等级:8
文章:272篇
访问:16.0w
排名: 2
推荐圈子
上一篇: FastText 构建四大名著词向量
下一篇:AI比赛平台汇总
猜你感兴趣的圈子:
AI比赛交流圈
标签: freq、jieba、suggest、word2vec、lines、面试题
隐藏