Multiinput Keras

Creating the Labels

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

# Encodes the categorical target as integer class ids.
label_encoder = preprocessing.LabelEncoder()

# the y that is being passed in is a pandas series of categorical data
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Pin the one-hot width to the total number of classes. Without num_classes,
# to_categorical infers the width from the largest id present in each split,
# so a split that happens to miss the highest class would come out narrower
# than the other one.
num_classes = len(label_encoder.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

# prints the classes of y
print(label_encoder.classes_)

Preparing Text Input

Minimizing noise

To minimize noise in our text, we process the text by removing punctuation, numbers, and excessive spacing.

Convert text into suitable input format for model

Our model only understands numeric values, so we have to convert our textual input into vectors.
We use pretrained word embeddings to create these vectors.

Creating vectors for each word provided by GloVe

import numpy as np
from tqdm.notebook import tqdm

# Maps each token in the GloVe file to its vector (a float32 NumPy array).
embeddings_dict = {}

# Download glove.6B.300d.txt from the GloVe website
# NOTE: ROOT is not defined in this snippet; it must already hold the path of
# the directory that contains the downloaded file.
with open(f"{ROOT}/glove.6B.300d.txt", "r", encoding="utf8") as glove_file:
  for line in tqdm(glove_file):
    records = line.split()
    if not records:
      # A blank line (e.g. a trailing newline) has no token to index on.
      continue
    # First field is the token, the remaining fields are its vector components.
    word, *components = records
    embeddings_dict[word] = np.asarray(components, dtype="float32")

Converting text to sequences

def convert_text_to_numeric(text_input, maxlen=200, num_words=5000, tokenizer=None):
  """Turn a collection of texts into padded integer sequences.

  Args:
    text_input: iterable of strings to encode.
    maxlen: length every sequence is padded (at the end) or cut to.
    num_words: vocabulary cap given to a newly created Tokenizer; ignored
      when `tokenizer` is supplied.
    tokenizer: an already fitted Tokenizer to reuse. Pass the one returned
      for the training text when encoding validation/test text so that all
      splits share the same word indices. When None, a new Tokenizer is
      created and fitted on `text_input`.

  Returns:
    dict with keys "numeric" (the padded sequences), "vocab_size"
    (len(tokenizer.word_index) + 1) and "tokenizer" (the tokenizer used).

  Note: `Tokenizer` and `pad_sequences` are expected to be imported from
  Keras by the surrounding code; this snippet does not import them.
  """
  if tokenizer is None:
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(text_input)

  encoded = tokenizer.texts_to_sequences(text_input)
  # + 1 so that the largest word index is still a valid row when vocab_size is
  # used as the size of an embedding matrix.
  vocab_size = len(tokenizer.word_index) + 1
  padded = pad_sequences(encoded, padding="post", maxlen=maxlen)

  return {
      "numeric": padded,
      "vocab_size": vocab_size,
      "tokenizer": tokenizer,
  }

# Store the raw text column under its own name instead of overwriting X_train:
# X_train is still needed later for its numeric column, and
# convert_text_to_numeric must receive a variable that already exists.
channel_name_train = list(X_train["column"])
channel_name_train = convert_text_to_numeric(channel_name_train)

Creating word embeddings

def create_word_embeddings(vocab_size, tokenizer, embedding_dim=300, embeddings=None):
  """Build an embedding matrix whose row i is the vector of the word with index i.

  Args:
    vocab_size: number of rows of the matrix; must be larger than every index
      in `tokenizer.word_index`.
    tokenizer: object exposing a `word_index` mapping of word -> integer index.
    embedding_dim: width of each vector; must match the vectors in the lookup
      table (300 for glove.6B.300d).
    embeddings: mapping of word -> vector. Defaults to the module-level
      `embeddings_dict`.

  Returns:
    A (vocab_size, embedding_dim) NumPy array. Rows of words that are missing
    from the lookup table stay all-zero.
  """
  if embeddings is None:
    embeddings = embeddings_dict

  embedding_matrix = np.zeros((vocab_size, embedding_dim))

  for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
      embedding_matrix[index] = embedding_vector

  return embedding_matrix

# Embedding matrix for the training vocabulary, built from the vocabulary size
# and tokenizer that convert_text_to_numeric returned for the text column.
X_train_embedding = create_word_embeddings(
    vocab_size=channel_name_train["vocab_size"],
    tokenizer=channel_name_train["tokenizer"],
)

Preparing meta data

We don't need to do much to this data since it is already numerical data.

# Select the numeric column with a list so the result stays 2-D (one column),
# then take the underlying array.
# NOTE(review): assumes X_train is still the pandas DataFrame produced by
# train_test_split at this point - confirm it has not been reassigned.
X_train1 = X_train[["column1"]].values

Building the Model

def create_model(maxlen=200, num_of_classes=2, embedding_matrix=None):
  """Build the two-input model: an LSTM over text plus a dense branch over one numeric feature.

  Args:
    maxlen: length of the padded text sequences; must equal the `maxlen` used
      when the text was converted to sequences.
    num_of_classes: width of the softmax output; must equal the number of
      columns of the one-hot labels.
    embedding_matrix: 2-D array of pretrained word vectors, one row per word
      index. Defaults to the module-level `X_train_embedding`.

  Returns:
    An uncompiled Keras Model taking [text_sequences, numeric_feature].

  Note: `Input`, `Embedding`, `LSTM`, `Dense`, `Concatenate` and `Model` are
  expected to be imported from Keras by the surrounding code.
  """
  if embedding_matrix is None:
    embedding_matrix = X_train_embedding
  # The matrix has one row per word index, so its shape supplies both the
  # vocabulary size and the vector width the Embedding layer needs.
  vocab_size, embedding_dim = embedding_matrix.shape

  input_1 = Input(shape=(maxlen,))
  input_2 = Input(shape=(1, ))

  # Submodel #1
  embedding_layer1 = Embedding(
      vocab_size,
      embedding_dim,
      weights=[embedding_matrix],
      trainable=False # keep the pretrained word vectors fixed during training
  )(input_1)
  lstm_layer_1 = LSTM(32)(embedding_layer1)

  # Submodel #2
  dense_layer_1 = Dense(10, activation="relu")(input_2)
  dense_layer_2 = Dense(10, activation="relu")(dense_layer_1)

  # Concat submodel outputs together to produce input for the overall model
  concat_layer = Concatenate()([lstm_layer_1, dense_layer_2])

  # Overall model
  dense_layer_3 = Dense(10, activation="relu")(concat_layer)

  output = Dense(num_of_classes, activation="softmax")(dense_layer_3)
  model = Model(inputs=[input_1, input_2], outputs=output)

  return model

Training the Model

# Build the model and compile it before fitting. The labels are one-hot
# encoded and the output layer is a softmax, hence categorical cross-entropy.
# The optimizer is a common default; adjust as needed.
model = create_model()
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

# The text input is the padded sequences stored under "numeric" in the dict
# returned by convert_text_to_numeric; the second input is the numeric column.
history = model.fit(
    x=[channel_name_train["numeric"], X_train1],
    y=y_train,
    batch_size=32,
    epochs=7,
    verbose=1,
    validation_split=0.2
)

Previous: Tensorflow | Next: Tensorflow Pretrained

Last updated 3 years ago

Was this helpful?