import glob
import os
import random
import string
import unicodedata
from io import open

import torch
# Alphabet the model works with: ASCII letters plus a handful of punctuation
# characters that occur in names.
all_letters = string.ascii_letters + " .,;'-"

# One extra index past the alphabet is reserved for the end-of-sequence marker.
n_letters = len(all_letters) + 1  # Plus EOS marker
def findFiles(path):
    """Return the list of paths matching the glob pattern ``path``.

    The result is sorted because ``glob.glob`` yields matches in an order
    that depends on the file system; sorting keeps anything derived from
    the file order (e.g. category indices) reproducible across machines.
    """
    return sorted(glob.glob(path))
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` reduced to the characters of ``all_letters``.

    The string is NFD-normalized so accented characters decompose into a
    base character followed by combining marks; the marks (Unicode category
    ``Mn``) are dropped, as is any character not present in ``all_letters``.
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn' and c in all_letters
    )
# Read a file and split into lines
def readLines(filename):
    """Read the UTF-8 text file ``filename`` and return its lines as a list.

    Each line is stripped of surrounding whitespace and then passed through
    ``unicodeToAscii``.
    """
    with open(filename, encoding='utf-8') as some_file:
        return [unicodeToAscii(line.strip()) for line in some_file]
# Build the category_lines dictionary, a list of lines per category
category_lines = {}
all_categories = []
for filename in findFiles('data/names/*.txt'):
    # The category name is the file name without directory or extension.
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

# Fail early with download instructions when no data file matched the pattern.
if n_categories == 0:
    raise RuntimeError('Data not found. Make sure that you downloaded data '
                       'from https://download.pytorch.org/tutorial/data.zip and extract it to '
                       'the current directory.')
# Random item from a list
def randomChoice(l):
    """Return one element of the sequence ``l`` at a uniformly random index.

    An empty sequence makes ``random.randint`` raise ``ValueError``.
    """
    return l[random.randint(0, len(l) - 1)]
# Get a random category and random line from that category
def randomTrainingPair():
    """Return a ``(category, line)`` tuple.

    The category is drawn from ``all_categories`` and the line from that
    category's entry in ``category_lines``, both via ``randomChoice``.
    """
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    return category, line
对于每个时间步(即训练词中的每个字母),网络的输入是 (category, current letter, hidden state),输出是 (next letter, next hidden state)。
类别张量是大小为 <1 x n_categories> 的独热(one-hot)张量。在训练时,我们会在每个时间步将其馈送到网络——这是一种设计选择,它也可以作为初始隐藏状态的一部分,或通过其他策略引入。
# One-hot vector for category
def categoryTensor(category):
    """Return a ``1 x n_categories`` tensor with a 1 at ``category``'s index.

    The index is the position of ``category`` in ``all_categories``;
    ``list.index`` raises ``ValueError`` when the category is not listed.
    """
    li = all_categories.index(category)
    tensor = torch.zeros(1, n_categories)
    tensor[0][li] = 1
    return tensor
# One-hot matrix of first to last letters (not including EOS) for input
def inputTensor(line):
    """Return a ``len(line) x 1 x n_letters`` one-hot tensor for ``line``.

    Row ``i`` has a 1 at the position of ``line[i]`` in ``all_letters``.

    Raises:
        ValueError: if ``line`` contains a character that is not in
            ``all_letters``.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        letter_index = all_letters.find(letter)
        # ``str.find`` returns -1 for a missing character; used as an index
        # that would silently set the last slot, which is reserved for EOS.
        if letter_index < 0:
            raise ValueError(
                'character {!r} is not in all_letters'.format(letter))
        tensor[li][0][letter_index] = 1
    return tensor
# ``LongTensor`` of second letter to end (EOS) for target
def targetTensor(line):
    """Return a ``LongTensor`` of target letter indices for ``line``.

    The targets are the ``all_letters`` indices of every character after
    the first, followed by the EOS index ``n_letters - 1``.

    Raises:
        ValueError: if a character after the first is not in
            ``all_letters``.
    """
    letter_indexes = []
    for letter in line[1:]:
        letter_index = all_letters.find(letter)
        # ``str.find`` returns -1 for a missing character, which is not a
        # valid class index; fail here with a clear message instead.
        if letter_index < 0:
            raise ValueError(
                'character {!r} is not in all_letters'.format(letter))
        letter_indexes.append(letter_index)
    letter_indexes.append(n_letters - 1)  # EOS
    return torch.LongTensor(letter_indexes)
# Make category, input, and target tensors from a random category, line pair
def randomTrainingExample():
    """Return ``(category_tensor, input_line_tensor, target_line_tensor)``.

    A random ``(category, line)`` pair is drawn with ``randomTrainingPair``
    and converted with ``categoryTensor``, ``inputTensor`` and
    ``targetTensor`` respectively.
    """
    category, line = randomTrainingPair()
    category_tensor = categoryTensor(category)
    input_line_tensor = inputTensor(line)
    target_line_tensor = targetTensor(line)
    return category_tensor, input_line_tensor, target_line_tensor
# NOTE(review): this is an excerpt of a training step. ``rnn``, ``criterion``,
# ``hidden``, ``learning_rate`` and the three tensors are not defined in this
# excerpt -- presumably they are set up by the surrounding training code.
loss = torch.Tensor([0])  # you can also just simply use ``loss = 0``

# Feed the line to the network one step at a time, carrying the hidden state
# forward and summing the per-step loss.
for i in range(input_line_tensor.size(0)):
    output, hidden = rnn(category_tensor, input_line_tensor[i], hidden)
    l = criterion(output, target_line_tensor[i])
    loss += l

loss.backward()

# Manual gradient-descent update: p <- p - learning_rate * p.grad
for p in rnn.parameters():
    p.data.add_(p.grad.data, alpha=-learning_rate)
# Sample from a category and starting letter
def sample(category, start_letter='A'):
    """Generate a name for ``category`` that begins with ``start_letter``.

    Starting from ``start_letter``, the network is run repeatedly; at each
    step the highest-scoring output index is taken, the matching character
    is appended to the result and fed back in as the next input. Generation
    stops when the EOS index (``n_letters - 1``) is produced or after
    ``max_length`` steps.

    NOTE(review): ``rnn`` and ``max_length`` are not defined in this
    excerpt -- presumably module-level names set elsewhere in the file.

    Returns:
        The generated string, including ``start_letter``.
    """
    with torch.no_grad():  # no need to track history in sampling
        category_tensor = categoryTensor(category)
        input_tensor = inputTensor(start_letter)
        hidden = rnn.initHidden()

        output_name = start_letter

        for _ in range(max_length):
            output, hidden = rnn(category_tensor, input_tensor[0], hidden)
            _, topi = output.topk(1)
            # Convert to a plain int so it can be compared and used as a
            # string index directly.
            top_index = topi[0][0].item()
            if top_index == n_letters - 1:
                break
            letter = all_letters[top_index]
            output_name += letter
            input_tensor = inputTensor(letter)

        return output_name
# Get multiple samples from one category and multiple starting letters
def samples(category, start_letters='ABC'):
    """Print one ``sample`` result per character of ``start_letters``.

    Each character is used in turn as the starting letter for ``category``.
    """
    for start_letter in start_letters:
        print(sample(category, start_letter))