Multiinput Keras
Creating the Labels
from sklearn import preprocessing
from keras.utils import to_categorical
label_encoder = preprocessing.LabelEncoder()
# the y that is being passed in is a pandas series of categorical data
y = label_encoder.fit_transform(y) 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
# prints the classes of y 
print(label_encoder.classes_)
Preparing Text Input
Minimizing noise
- To minimize noise in our text, we clean it by removing punctuation, numbers, and excess whitespace.
Convert text into suitable input format for model
- Our model only understands numeric values, so we have to convert our textual input into vectors 
- We use pretrained word embeddings to create these vectors. 
Creating vectors for each word provided by GloVe
from tqdm.notebook import tqdm
embeddings_dict = {}
# Download glove.6B.300d.txt from the gloVe website
with open(f"{ROOT}/glove.6B.300d.txt", "r", encoding="utf8") as glove_file:
  for line in tqdm(glove_file):
    records = line.split()
    word = records[0]
    vector_dim = np.asarray(records[1:], dtype="float32")
    embeddings_dict[word] = vector_dim
Converting text to sequences
def convert_text_to_numeric(text_input, maxlen=200, num_words=5000, tokenizer=None):
  """Encode raw texts as fixed-length integer sequences.

  Args:
    text_input: Iterable of strings to encode.
    maxlen: Length every sequence is padded or truncated to.
    num_words: Vocabulary cap given to a newly created Tokenizer. Ignored
      when ``tokenizer`` is supplied.
    tokenizer: Optional, already-fitted Tokenizer. Pass the tokenizer that
      was returned for the training texts when encoding validation or test
      texts, so that every split shares the same word indices. When omitted,
      a new Tokenizer is created and fitted on ``text_input``.

  Returns:
    A dict with three keys:
      "numeric":    the post-padded sequences produced by pad_sequences,
      "vocab_size": ``len(tokenizer.word_index) + 1``,
      "tokenizer":  the Tokenizer that produced the sequences.
  """
  if tokenizer is None:
    # Only fit when we own the tokenizer; a caller-supplied one is assumed
    # to be fitted already and must not have its vocabulary altered.
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(text_input)

  sequences = tokenizer.texts_to_sequences(text_input)
  sequences = pad_sequences(sequences, padding="post", maxlen=maxlen)

  # word_index is 1-based (0 is the padding value), hence the +1. Note that
  # word_index holds every word seen during fitting, not just the top
  # ``num_words``, so this is the size of the full vocabulary.
  vocab_size = len(tokenizer.word_index) + 1

  return {
      "numeric": sequences,
      "vocab_size": vocab_size,
      "tokenizer": tokenizer,
  }
X_train = list(X_train["column"])
channel_name_train = convert_text_to_numeric(channel_name_train)
Creating word embeddings
def create_word_embeddings(vocab_size, tokenizer, embedding_dim=300):
  """Build an embedding matrix from the pretrained vectors in ``embeddings_dict``.

  Args:
    vocab_size: Number of rows in the matrix; must be larger than the
      highest index in ``tokenizer.word_index``.
    tokenizer: Object exposing a ``word_index`` mapping of word -> row index.
    embedding_dim: Width of each pretrained vector. Defaults to 300, which
      matches the ``glove.6B.300d.txt`` file loaded into ``embeddings_dict``;
      change it when a different vector file is used.

  Returns:
    A ``(vocab_size, embedding_dim)`` NumPy array. Row ``i`` holds the
    pretrained vector of the word whose index is ``i``; rows for words
    missing from ``embeddings_dict`` (and row 0) stay all zeros.
  """
  embedding_matrix = np.zeros((vocab_size, embedding_dim))
  for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dict.get(word)
    # Words without a pretrained vector keep their zero row.
    if embedding_vector is not None:
      embedding_matrix[index] = embedding_vector

  return embedding_matrix
X_train_embedding = create_word_embeddings(
    channel_name_train["vocab_size"],
    channel_name_train["tokenizer"]
)
Preparing metadata
We don't need to do much to this data since it is already numerical data.
X_train1 = X_train[["column1"]].values
Building the Model
def create_model(maxlen=200, num_of_classes=2):
  """Build the two-input classifier (text sequence + one numeric feature).

  Args:
    maxlen: Length of each padded text sequence fed to the first input.
      It must equal the ``maxlen`` used when padding the sequences
      (200 in ``convert_text_to_numeric``).
    num_of_classes: Number of units in the softmax output layer; should
      match the width of the one-hot encoded labels.

  Returns:
    An uncompiled Keras ``Model`` taking ``[text_sequences, numeric_feature]``.
  """
  input_1 = Input(shape=(maxlen,))
  input_2 = Input(shape=(1,))

  # Submodel #1: text branch.
  # Both embedding dimensions are read from the matrix itself so the layer
  # can never disagree with the weights it is initialised from.
  embedding_layer1 = Embedding(
      X_train_embedding.shape[0],
      X_train_embedding.shape[1],
      weights=[X_train_embedding],
      trainable=False,  # keep the pretrained word vectors frozen
  )(input_1)
  lstm_layer_1 = LSTM(32)(embedding_layer1)

  # Submodel #2: numeric (metadata) branch.
  dense_layer_1 = Dense(10, activation="relu")(input_2)
  dense_layer_2 = Dense(10, activation="relu")(dense_layer_1)

  # Concat submodel outputs together to produce input for the overall model
  concat_layer = Concatenate()([lstm_layer_1, dense_layer_2])

  # Overall model
  dense_layer_3 = Dense(10, activation="relu")(concat_layer)
  output = Dense(num_of_classes, activation="softmax")(dense_layer_3)

  model = Model(inputs=[input_1, input_2], outputs=output)
  return model
# Training the Model
history = model.fit(
    x=[X_train["numeric"], X_train1], 
    y=y_train, 
    batch_size=32, 
    epochs=7, 
    verbose=1, 
    validation_split=0.2
)
Last updated
Was this helpful?
