Table of Contents
WMT14 error
Transformer를 구현하던 도중 데이터셋을 Multi30k에서 WMT14로 바꾸려고 아래와 같이 코드를 짜고 있었는데,
def make_dataset(self):
    """Build the source/target fields for ``self.ext`` and load the WMT14 splits.

    When ``self.ext`` is ``('.de', '.en')`` or ``('.en', '.de')``, fresh
    ``Field`` objects are stored on ``self.source`` and ``self.target``; for
    any other value those attributes are left exactly as they were.

    Returns:
        The ``(train_data, valid_data, test_data)`` tuple produced by
        ``WMT14.splits``.
    """

    def build_field(tokenizer):
        # Both sides share every Field option except the tokenizer.
        return Field(
            tokenize=tokenizer,
            init_token=self.init_token,
            eos_token=self.eos_token,
            lower=True,
            batch_first=True,
        )

    if self.ext == ('.de', '.en'):
        self.source = build_field(self.tokenize_de)
        self.target = build_field(self.tokenize_en)
    elif self.ext == ('.en', '.de'):
        self.source = build_field(self.tokenize_en)
        self.target = build_field(self.tokenize_de)

    # This call takes the place of the earlier Multi30k.splits(...) call,
    # which was given the same exts/fields arguments.
    splits = WMT14.splits(exts=self.ext, fields=(self.source, self.target))
    train_data, valid_data, test_data = splits
    return train_data, valid_data, test_data
아래와 같은 에러가 났다.
FileNotFoundError: [Errno 2] No such file or directory: '.data/wmt14/train.tok.clean.bpe.32000.en'
Solve
그래서 아래와 같은 사이트를 참고해 봤지만 해결되지 않아서,
아예 torchtext의 WMT14.splits 대신 Hugging Face datasets의 load_dataset을 사용하도록 아래와 같이 바꿨더니 잘 작동한다.
def make_dataset(self):
    """Build the source/target fields and load WMT14 via ``load_dataset``.

    ``self.source`` and ``self.target`` are set to new ``Field`` objects whose
    tokenizers are chosen from ``self.ext``. The returned data comes straight
    from ``load_dataset`` and is not passed through those fields here.

    Returns:
        A ``(train_data, valid_data, test_data)`` tuple holding the
        ``'translation'`` column of the ``train``, ``validation`` and ``test``
        splits respectively.

    Raises:
        ValueError: If ``self.ext`` is neither ``('.de', '.en')`` nor
            ``('.en', '.de')``. Previously such a value was silently ignored,
            leaving ``self.source`` / ``self.target`` unset or stale.
    """
    # Pick the tokenizers first so an unsupported pair fails fast, before any
    # field is replaced or any dataset is loaded.
    if self.ext == ('.de', '.en'):
        source_tokenizer, target_tokenizer = self.tokenize_de, self.tokenize_en
    elif self.ext == ('.en', '.de'):
        source_tokenizer, target_tokenizer = self.tokenize_en, self.tokenize_de
    else:
        raise ValueError(
            f"unsupported language pair extensions: {self.ext!r}"
        )

    # Source and target differ only in their tokenizer.
    shared_options = dict(
        init_token=self.init_token,
        eos_token=self.eos_token,
        lower=True,
        batch_first=True,
    )
    self.source = Field(tokenize=source_tokenizer, **shared_options)
    self.target = Field(tokenize=target_tokenizer, **shared_options)

    # NOTE(review): the fields above are built for a de/en pair, but the
    # "fr-en" configuration is loaded here. Presumably "de-en" is what the
    # rest of the pipeline wants -- confirm against the code that consumes
    # these splits before changing it.
    corpus = load_dataset("wmt14", "fr-en")
    train_data = corpus["train"]['translation']
    valid_data = corpus["validation"]['translation']
    test_data = corpus["test"]['translation']
    return train_data, valid_data, test_data
하하……. 다들 즐코딩….