import numpy as np
import keras
from keras import layers
from sklearn.metrics import roc_auc_score, classification_report
import matplotlib.pyplot as plt

# Hyperparameters shared by the data pipeline and the LSTM classifier.
VOCAB_SIZE = 40_000  # keep only the VOCAB_SIZE most frequent words
MAX_LENGTH = 400  # at most MAX_LENGTH words of a comment are considered
EMBED_DIM = 64  # size of each word embedding vector
LSTM_DIM = 32  # units in each LSTM layer

def get_data(vocab_size):
  """Load the IMDB review dataset together with its vocabulary mappings.

  vocab_size is forwarded to load_data as num_words.

  Returns (x_train, y_train), (x_test, y_test), WI, REV_WI, where WI maps
  words to indices and REV_WI is the inverse (indices to words).
  """
  imdb = keras.datasets.imdb
  train_split, test_split = imdb.load_data(num_words=vocab_size)
  x_train, y_train = train_split
  x_test, y_test = test_split
  WI = imdb.get_word_index()  # word -> index
  REV_WI = {index: word for word, index in WI.items()}  # index -> word
  return (x_train, y_train), (x_test, y_test), WI, REV_WI

def to_wds(idx_list, REV_WI):
  """Decode idx_list into space-separated text using REV_WI.

  The first element of idx_list is skipped and every remaining index is
  shifted down by 3 before the lookup; indices with no entry in REV_WI are
  rendered as '?'.
  """
  decoded = []
  for token in idx_list[1:]:
    decoded.append(REV_WI.get(token - 3, '?'))
  return ' '.join(decoded)

def to_seq(text, WI, index_from=3, start_char=1, num_words=None):
  """Encode text as an integer sequence, mirroring what to_wds decodes.

  to_wds skips the first element of a sequence and subtracts 3 from every
  index before looking it up, so the inverse must prepend a start marker and
  add the same offset to each word index.

  Args:
    text: whitespace-separated text; words are lower-cased before the lookup.
    WI: mapping from (lower-case) word to its index.
    index_from: offset added to every word index (3 matches to_wds).
    start_char: marker placed at the front of the sequence; None omits it.
    num_words: when given, encoded indices >= num_words are dropped, so the
      result never exceeds a vocabulary of that size.

  Returns:
    A list of ints. Words missing from WI (and, when num_words is set, words
    beyond that vocabulary) are ignored.
  """
  seq = [] if start_char is None else [start_char]
  for raw_word in text.split():
    # Lower-case BEFORE the lookup so capitalised words are not discarded.
    rank = WI.get(raw_word.lower())
    if rank is None:
      continue
    token = rank + index_from
    if num_words is not None and token >= num_words:
      continue
    seq.append(token)
  return seq

# Load both IMDB splits plus the word<->index mappings, then print the split
# sizes and the first review of each split decoded back to text.
(x_train, y_train), (x_test, y_test), WI, REV_WI = get_data(VOCAB_SIZE)
print(f"Number of training samples: {len(y_train)}")
print(f"Number of test samples: {len(y_test)}")
# to_wds turns the integer sequence back into readable words
print(f"First training sample labeled {y_train[0]} is: {to_wds(x_train[0], REV_WI)}")
print(f"First test sample labeled {y_test[0]} is: {to_wds(x_test[0], REV_WI)}")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
17464789/17464789 ━━━━━━━━━━━━━━━━━━━━ 0s 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
1641221/1641221 ━━━━━━━━━━━━━━━━━━━━ 0s 0us/step
Number of training samples: 25000
Number of test samples: 25000
First training sample labeled 1 is: this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert redford's is an amazing actor and now the same being director norman's father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for retail and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also congratulations to the two little boy's that played the part's of norman and paul they were just brilliant children are often left out of the praising list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you think the whole story was so lovely because it was true and was someone's life after all that was shared with us all
First test sample labeled 0 is: please give this one a miss br br kristy swanson and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite lacklustre so all you madison fans give this a miss

def pad_sequences(x, maxlen=MAX_LENGTH):
  """Return x as fixed-length rows of maxlen entries.

  Thin wrapper around keras.utils.pad_sequences with MAX_LENGTH as the
  default length.
  """
  padded = keras.utils.pad_sequences(x, maxlen=maxlen)
  return padded

# Sanity check: after padding, every split should be (num_samples, MAX_LENGTH).
print(f"Shape of padded training inputs: {pad_sequences(x_train).shape}")
print(f"Shape of padded test inputs: {pad_sequences(x_test).shape}")

Shape of padded training inputs: (25000, 400)
Shape of padded test inputs: (25000, 400)

# Bidirectional LSTM Model

def Bidirectional_LSTM(VOCAB_SIZE, EMBED_DIM, LSTM_DIM):
  """Build (but do not compile) a two-layer bidirectional LSTM classifier.

  Args:
    VOCAB_SIZE: number of rows in the embedding table.
    EMBED_DIM: length of each embedding vector.
    LSTM_DIM: units per direction in each LSTM layer.

  Returns:
    A keras.Model mapping integer sequences to a single sigmoid output.
  """
  # Variable-length sequences of integer token ids
  token_ids = keras.Input(shape=(None,), dtype="int32")
  # Map each id to an EMBED_DIM-dimensional vector
  embedded = layers.Embedding(VOCAB_SIZE, EMBED_DIM)(token_ids)
  # The first BiLSTM returns the whole sequence so the second can consume it
  first_pass = layers.Bidirectional(layers.LSTM(LSTM_DIM, return_sequences=True))
  sequence_features = first_pass(embedded)
  # The second BiLSTM returns only its final output
  second_pass = layers.Bidirectional(layers.LSTM(LSTM_DIM))
  summary_vector = second_pass(sequence_features)
  # Single-unit sigmoid classifier head
  probability = layers.Dense(1, activation="sigmoid")(summary_vector)
  return keras.Model(token_ids, probability)

# Instantiate the classifier with the notebook-level hyperparameters.
model = Bidirectional_LSTM(VOCAB_SIZE, EMBED_DIM, LSTM_DIM)
# The model ends in a single sigmoid unit, hence binary cross-entropy.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

Model: "functional"

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer (InputLayer)        │ (None, None)           │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ embedding (Embedding)           │ (None, None, 64)       │     2,560,000 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ bidirectional (Bidirectional)   │ (None, None, 64)       │        24,832 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ bidirectional_1 (Bidirectional) │ (None, 64)             │        24,832 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ (None, 1)              │            65 │
└─────────────────────────────────┴────────────────────────┴───────────────┘

 Total params: 2,609,729 (9.96 MB)

 Trainable params: 2,609,729 (9.96 MB)

 Non-trainable params: 0 (0.00 B)

# train your model so that the validation accuracy is at least 0.85
batch_size=32
epochs=3
# fraction of the training data that fit() holds out for validation
validation_split=0.2
# Reviews are padded to a fixed length before being fed to the model.
model.fit(pad_sequences(x_train), y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=validation_split)

Epoch 1/3
625/625 ━━━━━━━━━━━━━━━━━━━━ 41s 51ms/step - accuracy: 0.7368 - loss: 0.4885 - val_accuracy: 0.8726 - val_loss: 0.3105
Epoch 2/3
625/625 ━━━━━━━━━━━━━━━━━━━━ 30s 47ms/step - accuracy: 0.9172 - loss: 0.2124 - val_accuracy: 0.8812 - val_loss: 0.3016
Epoch 3/3
625/625 ━━━━━━━━━━━━━━━━━━━━ 42s 49ms/step - accuracy: 0.9690 - loss: 0.0940 - val_accuracy: 0.8676 - val_loss: 0.4544

<keras.src.callbacks.history.History at 0x7ed9ee6fdcd0>

# Evaluate your model on the test samples.
# pred holds the model's sigmoid output for each padded test review.
pred = model.predict(pad_sequences(x_test))

# Display the Area Under the Receiver Operating Characteristic Curve (rounded to 4 decimal places)
print(f'roc_auc_score = {roc_auc_score(y_test, pred):0.4f}')

# Display your classification report with metrics rounded to 4 decimal places.
print("Classification report:")
# np.round converts the predicted scores into 0/1 labels for the report
print(classification_report(y_test, np.round(pred), digits=4))

782/782 ━━━━━━━━━━━━━━━━━━━━ 15s 18ms/step
roc_auc_score = 0.9329
Classification report:
              precision    recall  f1-score   support

           0     0.9026    0.7948    0.8453     12500
           1     0.8167    0.9142    0.8627     12500

    accuracy                         0.8545     25000
   macro avg     0.8597    0.8545    0.8540     25000
weighted avg     0.8597    0.8545    0.8540     25000

# Finetune the GPT-2 model on positive reviews
import os
import numpy as np
import tensorflow as tf

# NOTE(review): keras is already imported at the top of this file, and Keras
# picks its backend when it is first imported, so this assignment most likely
# has no effect here. Move it above the first `import keras` if a specific
# backend is required.
os.environ["KERAS_BACKEND"] = "jax"  # or "tensorflow" or "torch"

import keras_hub
import keras
import tensorflow as tf
import time

# Switch the global Keras dtype policy to mixed_float16 before the GPT-2 model
# is loaded. NOTE(review): presumably only affects layers created after this
# call - confirm the earlier LSTM classifier is unaffected.
keras.mixed_precision.set_global_policy("mixed_float16")


# Maximum sequence length is 1024

SEQUENCE_LENGTH = 512 # lower number may speed up training and generation

# Get preprocessor
# (loaded from the "gpt2_base_en" preset with the sequence length chosen above)
preprocessor = keras_hub.models.GPT2CausalLMPreprocessor.from_preset(
    "gpt2_base_en",
    sequence_length=SEQUENCE_LENGTH,)

# Get the GPT-2 causal language model (review_generator) from the same preset,
# with the preprocessor above attached to it
review_generator = keras_hub.models.GPT2CausalLM.from_preset(
    "gpt2_base_en", preprocessor=preprocessor)


# Pool the training and test splits, then keep only the reviews labeled 1.
x = np.concatenate([x_train, x_test]) # all inputs
y = np.concatenate([y_train, y_test]) # all labels
x_pos = x[y==1] # all positive inputs

# Decode each integer sequence back into text for GPT-2 fine-tuning
positive_comments = [to_wds(c, REV_WI) for c in x_pos] # all positive comments

print(f'Finetune GPT-2 on {len(positive_comments)} positive comments.')
# Show one randomly chosen positive comment
print(f'Sample comment:\n{positive_comments[np.random.randint(0, len(positive_comments))]}')

Finetune GPT-2 on 25000 positive comments.
Sample comment:
i saw this film at the 3rd adelaide international film festival at the palace cinemas and was totally switched onto it in the opening five minutes thanks goodness for a film that ignores all the rubbish we often see in australian films that seem to revolve around a race b gender and c class in favour of er dare i say jolly good cinema the producer a shy slightly eccentric chap called alex introduced his film made with a bunch of his mates near the town he spent much of his childhood apparently he's spent much of the last year traveling the world with the film mostly in europe the world the film creates is both brilliant and arty not least because of strange and disconcerting editing style the gothic characters and the surreal sense of time and place that draws viewers into its nightmarish realm br br the producer returned for the q a after wards someone asked him what his inspiration was he replied south australia hear hear another asked him what a in the life of ? entailed he replied that he drives an old ? that he has breakfast at the same table at the same restaurant that he's jolly well eaten at for the past 8 years and that he plays piano which helps him to think he doesn't drink booze and plays cricket once a week then the q and a session ended abruptly because of the next film screening so my thoughts are that for the next festival they need to extend the after film sessions

# Train your model review_generator

%%time

# Batch the raw review strings; review_generator was created with a
# preprocessor attached, so the text is fed to fit() as-is.
#
# take(500) is applied BEFORE cache(): tf.data only finalizes a cache once the
# cached dataset has been iterated to its end. With take() placed after
# cache(), the 1563-batch dataset is never exhausted, so the cache is
# discarded and rebuilt on every epoch.
train_ds = (
    tf.data.Dataset.from_tensor_slices(positive_comments)
    .batch(16)
    .take(500)  # train on the first 500 batches only
    .cache()
    .prefetch(tf.data.AUTOTUNE))

num_epochs = 3

# Linearly decaying learning rate.
# decay_steps is the total number of optimizer steps (batches * epochs), so
# the rate reaches end_learning_rate at the end of training.
learning_rate = keras.optimizers.schedules.PolynomialDecay(
    5e-5,
    decay_steps=train_ds.cardinality() * num_epochs,
    end_learning_rate=0.0,)

# The loss is configured for raw logits (from_logits=True).
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

review_generator.compile(
    optimizer=keras.optimizers.Adam(learning_rate),
    loss=loss,
    weighted_metrics=["accuracy"],
)

review_generator.fit(train_ds, epochs=num_epochs)

Epoch 1/3
500/500 ━━━━━━━━━━━━━━━━━━━━ 614s 1s/step - accuracy: 0.2742 - loss: 2.0084
Epoch 2/3
500/500 ━━━━━━━━━━━━━━━━━━━━ 503s 1s/step - accuracy: 0.2980 - loss: 1.8808
Epoch 3/3
500/500 ━━━━━━━━━━━━━━━━━━━━ 504s 1s/step - accuracy: 0.3063 - loss: 1.8431
CPU times: user 4min 39s, sys: 1min 12s, total: 5min 51s
Wall time: 27min 4s

<keras.src.callbacks.history.History at 0x7ed9e8b1b650>

%%time
k = 100 # number of reviews to generate
prompt = 'This movie' # input prompt to start review
# generate k comments, one generate() call per comment, all from the same prompt
output = [review_generator.generate(prompt, max_length=400) for _ in range(k)]

print(f'{len(output)} comments generated')  # prints the number of generated comments
print(f'Last 10 generated comments:') # prints heading and last 10 generated comments
for c in output[-10:]:
  print(c)

100 comments generated
Last 10 generated comments:
This movie is a great one to see if you like a good action thriller with lots of twists and turns and some great acting this movie is for you
This movie is a good example of how the director of this film could have created a movie that had a lot of action scenes that were not shown in the film but were shown in a movie and that had a very good plot that was well developed the movie had a lot to do with the fact that the film had a lot to do with the fact that there were many characters that were not in the movie that were actually in the movie the movie had to do with the fact that the movie had a lot of violence and that there were a lot of people that had been in the movie who were not in the movie and that they were not in the movie and that they had to kill the characters that were in the movie that were not in the movie and the story that was told in the movie had to do with that the movie had to do with the fact that there were a lot of people that were not in the movie the movie had to do with the fact that there were many people that were not in the movie and that they had to kill the characters that were in the movie that were not in the movie and that the movie had to do with the fact that there were many people that were not in the movie the movie had to do with the fact that there were a lot of people that were not in the movie and that the movie had to do with the fact that there were a lot of people that were not in the movie and that the movie had to do with the fact that there were many people that were not in the movie and that the movie had to do with the fact that there were a lot of people that were not in the movie and that the movie had to do with the fact that the movie had to do with the fact that there were a lot of people that were not in the movie the movie had to do with the fact that there were a lot of people that were not in the movie and that the movie had to do with the fact that there were a
This movie is so good and so funny that it makes you laugh and cry at the same time it also makes you feel good for the first time in years br br it was a good film that i enjoyed the first time and i will always remember the first time as an adult it was very good and i will always remember the second time as an adult and it was great
This movie is one of the best of the genre br br i was very impressed with this movie i have seen a few other movies and this time i liked this one more br br it's very funny and very good at the same time i think it's very interesting that the main character is so good and so intelligent that he is able to make his own decisions br br the story of this movie is very interesting and i think that it is very good br br i think that this movie will appeal to everyone
This movie is a classic it's a movie with a lot of great acting the story is simple but there are some good characters in this movie and the acting is good too this movie is a very well made movie and if you are looking for something to watch then this movie will do the job and it has a lot of fun the story of this movie is very simple and the story is well told it is about the love of a man who lost his father and how he is coping with the loss and how he can help the man who is still in the dark and the man who has been through a lot and is trying to make things right the movie is very good and the story is well told the story is very good and the movie is very well made the movie is very good and the movie is very well made
This movie was a good one i was surprised how well this movie was filmed and how well they managed to put the story to good use this movie was a must see for any horror fan br br i was very surprised by the acting and the direction the story is very well done the story lines are well done i think that the director was very clever with his use of the camera and the camera angles the movie is very well shot with a very good sense of style this movie is definitely recommended to anyone who likes horror
This movie was one of the first i saw that made me laugh i remember thinking to myself that this was a movie i would like to see more of i think that was one of the reasons it was so good i also remember thinking that this movie was very funny i think that was one of the reasons that this movie was so funny i also remember thinking to myself that this movie was really funny i was really surprised at how good the acting was in this movie br br this movie had a lot of action but i think that was because of the characters i really liked the character of kimmy kennedy i mean kimmy really had a great sense of humor and he was very funny br br the plot was really simple and very funny but it was very well done i think that the story was very well written and it was very well written the characters were very well developed and they had great chemistry and the acting was very well done br br it is a movie that will be very popular in the movie industry and it has to be seen to be believed i am not sure if it will be a hit in the movie industry but i think that it will be good in the movie industry it is very funny and it is very well written the characters were really developed and the story was well written i think that this movie was very good in the movie industry it is very funny and it was very well written and it was very well written br br i would also like to say that this movie was really good in my opinion it is very well acted and it has very good acting and there was a very good chemistry between the characters and they were very well developed and they were very well developed br br i would also like to say that this movie was very well written i think that the story was very well developed and it was very well written the characters were really developed and the story was well developed and it was very well written the acting was very well developed and the character was very well developed and the characters
This movie was a great one for me it has a good script and some great acting from the cast it's a very good movie with a good story and a great story line it's a good movie for kids and adults it's a good movie for adults and adults it's a good movie for everyone it's a good movie for kids and adults it's a good movie for everyone
This movie is not the first to use the phrase 'the story of two people' it's the first one that uses the word 'the movie' it has been used for many years in movies like the ? and in movies like the ? and the ? it's been used for years in movies like jane and the ? and in movies like the ? and ? it's not the first to use the words 'the story of two people' i'm not saying that it's a bad movie it's a good movie i like it because it is not the same as the ? movie and it doesn't have the same plot but it's not the same as jane the film has a lot of similarities and it has a lot of similarities in the movie but it's not a bad movie it's a good movie
This movie is a great movie and i recommend it to everyone i think it is a very good movie and the actors in this movie are very good the story is very well told i have seen this movie twice now and the movie still holds up well even today
CPU times: user 1min 16s, sys: 1.08 s, total: 1min 17s
Wall time: 1min 17s

# Convert text to sequences and predict
# WI covers the full IMDB vocabulary, but the classifier's embedding table
# only has VOCAB_SIZE rows. Drop any index the embedding cannot represent so
# no out-of-range index reaches the model.
output_seq = [[idx for idx in to_seq(c, WI) if idx < VOCAB_SIZE] for c in output]
pred_gen = model.predict(pad_sequences(output_seq), verbose=0)

# Sanity check on a few predictions
print("Sample probabilities (first 5):", pred_gen[:5].flatten())

# Summary stats
print(f"Mean probability of positive review: {np.mean(pred_gen):.4f}")
print(f"Standard deviation: {np.std(pred_gen):.4f}")

# Histogram
plt.hist(pred_gen, bins=10, color='blue', alpha=0.5)
plt.xlabel('Predicted Probability of Positive Review')
plt.ylabel('Frequency')
plt.title('Distribution of Predictions')
plt.show()

Sample probabilities (first 5): [0.9201316 0.4215097 0.99593   0.1610334 0.9906   ]
Mean probability of positive review: 0.7124
Standard deviation: 0.3595

# Peek at the first 30 token ids of the first five encoded generated reviews
for i, seq in enumerate(output_seq[:5]):
    print(f"Tokenized Review {i+1}: {seq[:30]}")

Tokenized Review 1: [17, 63, 2343, 1, 3317, 4, 1, 17, 2, 6, 3, 84, 250, 103, 10, 101, 1, 62, 2, 102, 23, 52, 70, 221, 10, 101, 42, 3, 52, 70]
Tokenized Review 2: [17, 6, 28, 4, 1, 88, 438, 99, 123, 90, 2, 28, 4, 1, 1526, 99, 123, 9, 6, 79, 28, 4, 1, 1526, 99, 123, 2, 1, 115, 4]
Tokenized Review 3: [17, 6, 28, 4, 1, 115, 4, 1, 288, 42, 1, 115, 4, 1, 288, 8, 58, 649, 204, 107, 9, 1450, 147, 2, 42, 1, 115, 4, 1, 288]
Tokenized Review 4: [17, 13, 35, 70, 221, 2, 70, 914, 12, 42, 251, 5, 261, 12, 11, 17, 59, 25, 4338, 35, 193, 206, 98, 144, 113, 2, 1, 113, 13, 49]
Tokenized Review 5: [17, 6, 3, 52, 49, 17, 9, 44, 3, 52, 49, 766, 2, 3, 52, 49, 113, 10, 101, 1, 17, 6, 63, 49, 1, 62, 6, 49, 18, 9]

# WI is the complete word index returned by get_word_index(); it is not
# limited to VOCAB_SIZE entries.
print(f"Vocabulary size: {len(WI)}")
print("Sample words:", list(WI.keys())[:10])

Vocabulary size: 88584
Sample words: ['fawn', 'tsukino', 'nunnery', 'sonja', 'vani', 'woods', 'spiders', 'hanging', 'woody', 'trawling']

Steven Lora - MSIT675 Project 3 - LSTM model for sentiment analysis

Import

Specify parameters

Get Data

Standardize Data

Create Bidirectional LSTM Model [5 Points]

Train model [5 Points]

Evaluate trained model [5 Points]

Finetune GPT-2 on positive reviews [5 Points]

Generate fake reviews [5 Points]

Predict probability that fake reviews are positive [5 Points]

Additional Struggles and Insights

Project Summary

Conclusion