# Source code for sciquence.load_utils

import string

def load_txt(path):
    """Read a text file and return its lines.

    Parameters
    ----------
    path : str
        Path of the file to read. It is opened in text mode (``'r'``).

    Returns
    -------
    list of str
        The lines of the file as returned by ``readlines()``; line
        terminators are kept.
    """
    with open(path, mode='r') as handle:
        return handle.readlines()


def remove_punctuation(s):
    """Return ``s`` with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``.

    Parameters
    ----------
    s : str
        Input string.

    Returns
    -------
    str
        Copy of ``s`` without punctuation characters.
    """
    if hasattr(str, 'maketrans'):
        # Python 3: str.translate takes a single mapping argument; the third
        # argument of maketrans lists the characters to delete.
        return s.translate(str.maketrans('', '', string.punctuation))
    # Python 2 byte strings: translate(table, deletechars).
    return s.translate(None, string.punctuation)

def word2idx(path):
    """Convert a text file into sequences of integer word indices.

    Each non-blank line is lower-cased, stripped of punctuation with
    ``remove_punctuation`` and split on whitespace. Every distinct token
    receives an integer index in order of first appearance, starting at 2;
    indices 0 and 1 are reserved for the ``'START'`` and ``'END'`` entries.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    sentences : list of list of int
        One list of word indices per non-blank line. A line that contains
        only punctuation yields an empty list.
    word2idx : dict
        Mapping from token to its index, including ``'START'`` and ``'END'``.
    """
    vocab = {'START': 0, 'END': 1}
    sentences = []
    # Context manager so the file handle is closed deterministically.
    with open(path) as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            sentence = []
            for token in remove_punctuation(line.lower()).split():
                if token not in vocab:
                    # Indices are dense (0..len-1), so the next free index
                    # is always the current vocabulary size.
                    vocab[token] = len(vocab)
                sentence.append(vocab[token])
            sentences.append(sentence)
    return sentences, vocab