from sklearn import preprocessing
from keras.utils import to_categorical
label_encoder = preprocessing.LabelEncoder()
# the y that is being passed in is a pandas series of categorical data;
# fit_transform replaces every label with an integer class id
y = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Pin the one-hot width to the number of classes the encoder has seen.
# Without num_classes, to_categorical sizes each array from the largest id
# present in that array, so the train and test targets can come out with
# different widths when a split happens to miss the highest class id.
num_classes = len(label_encoder.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)
# prints the classes of y
print(label_encoder.classes_)
Preparing Text Input
Minimizing noise
To minimize noise in our text, we clean it by removing punctuation, numbers, and excess whitespace.
Convert text into suitable input format for model
Our model only understands numeric values, so we have to convert our textual input into numeric vectors.
We use pretrained word embeddings to create these vectors.
Creating vectors for each word provided by GloVe
from tqdm.notebook import tqdm
# Lookup table: word -> pretrained float32 vector.
embeddings_dict = {}
# Download glove.6B.300d.txt from the GloVe website and place it under ROOT.
with open(f"{ROOT}/glove.6B.300d.txt", "r", encoding="utf8") as glove_file:
    for line in tqdm(glove_file):
        # The first whitespace-separated field is the word; the remaining
        # fields are parsed as the components of its vector.
        records = line.split()
        word, vector_dim = records[0], np.asarray(records[1:], dtype="float32")
        embeddings_dict[word] = vector_dim
def create_word_embeddings(vocab_size, tokenizer, embeddings=None, embedding_dim=300):
    """Build an embedding matrix whose row ``i`` is the vector of the word with index ``i``.

    Args:
        vocab_size: Number of rows in the returned matrix.
        tokenizer: Object exposing a ``word_index`` mapping of word -> integer
            index.
        embeddings: Mapping of word -> vector. When omitted, the module-level
            ``embeddings_dict`` is used.
        embedding_dim: Number of columns in the matrix; must match the length
            of the vectors in ``embeddings``. Defaults to 300.

    Returns:
        A ``(vocab_size, embedding_dim)`` NumPy array. Rows of words that have
        no entry in ``embeddings`` are left as zeros.
    """
    if embeddings is None:
        embeddings = embeddings_dict
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, index in tokenizer.word_index.items():
        if index >= vocab_size:
            # word_index may hold more entries than the matrix has rows;
            # such words have no row, so skip them instead of raising
            # IndexError.
            continue
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    return embedding_matrix
# Embedding matrix for the training text, built from the vocabulary size and
# tokenizer stored in channel_name_train.
X_train_embedding = create_word_embeddings(
    vocab_size=channel_name_train["vocab_size"],
    tokenizer=channel_name_train["tokenizer"],
)
Preparing meta data
We don't need to do much to this data since it is already numerical data.
# Meta-data feature: "column1" is selected with a list so the result keeps a
# column axis. NOTE(review): assumes X_train is a pandas DataFrame — confirm.
X_train1 = X_train[["column1"]].values
Building the Model
def create_model(num_of_classes=2):
    """Build the two-input classifier: an LSTM text branch plus a dense meta-data branch.

    Reads the module-level ``maxlen`` (length of the text input) and
    ``X_train_embedding`` (pretrained embedding matrix).

    Args:
        num_of_classes: Number of units in the softmax output layer.
            Defaults to 2.

    Returns:
        An uncompiled Keras ``Model`` taking ``[text_input, meta_input]``.
    """
    input_1 = Input(shape=(maxlen,))
    input_2 = Input(shape=(1,))

    # Submodel #1
    # Take both embedding dimensions from the weight matrix itself so the
    # layer always matches the weights it is initialised with.
    vocab_size, embedding_dim = X_train_embedding.shape
    embedding_layer1 = Embedding(
        vocab_size,
        embedding_dim,
        weights=[X_train_embedding],
        trainable=False,  # keep the pretrained vectors frozen during training
    )(input_1)
    lstm_layer_1 = LSTM(32)(embedding_layer1)

    # Submodel #2
    dense_layer_1 = Dense(10, activation="relu")(input_2)
    dense_layer_2 = Dense(10, activation="relu")(dense_layer_1)

    # Concat submodel outputs together to produce input for the overall model
    concat_layer = Concatenate()([lstm_layer_1, dense_layer_2])

    # Overall model
    dense_layer_3 = Dense(10, activation="relu")(concat_layer)
    output = Dense(num_of_classes, activation="softmax")(dense_layer_3)

    model = Model(inputs=[input_1, input_2], outputs=output)
    return model