from gensim.models import KeyedVectors
import jieba
import numpy as np

# Load a pretrained Word2Vec model stored in the binary word2vec format.
word2vec_model_path = 'path_to_pretrained_word2vec_model.bin'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)

def tokenize(text):
    # Segment Chinese text into words with jieba.
    return list(jieba.cut(text))

def sentence_similarity(sentence1, sentence2):
    tokens1 = tokenize(sentence1)
    tokens2 = tokenize(sentence2)

    # Keep only in-vocabulary words (gensim >= 4.0 uses key_to_index;
    # the older .vocab attribute was removed).
    tokens1 = [word for word in tokens1 if word in word2vec_model.key_to_index]
    tokens2 = [word for word in tokens2 if word in word2vec_model.key_to_index]

    # Average the word vectors to get one vector per sentence.
    vector1 = word2vec_model[tokens1].mean(axis=0)
    vector2 = word2vec_model[tokens2].mean(axis=0)

    # Cosine similarity between the two averaged sentence vectors.
    similarity = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
    return similarity

sentence1 = '我喜欢吃水果'      # "I like eating fruit"
sentence2 = '水果是我喜欢吃的'  # "Fruit is what I like to eat"
similarity_score = sentence_similarity(sentence1, sentence2)
print("Similarity between the two sentences:", similarity_score)
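Note that gensim's KeyedVectors also exposes n_similarity, which averages the vectors of two word lists and returns the cosine similarity between the means, so the manual averaging above can be collapsed into a single call. A minimal sketch, reusing the word2vec_model and tokenize defined above and assuming both token lists are non-empty after out-of-vocabulary filtering:

def sentence_similarity_n(sentence1, sentence2):
    # Filter to in-vocabulary words, as in sentence_similarity above.
    tokens1 = [w for w in tokenize(sentence1) if w in word2vec_model.key_to_index]
    tokens2 = [w for w in tokenize(sentence2) if w in word2vec_model.key_to_index]
    # n_similarity averages each list's word vectors and returns
    # the cosine similarity between the two mean vectors.
    return word2vec_model.n_similarity(tokens1, tokens2)

print("n_similarity:", sentence_similarity_n('我喜欢吃水果', '水果是我喜欢吃的'))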